feat: Phase 5 — OCR pipeline + document/news search

Docling-powered OCR pipeline: PDF → markdown → chunks → Ollama embed → Qdrant.
News embedding seeder for the news_embeddings collection.
Document and news semantic search API endpoints.

- embeddings/ocr-pipeline.ts: Docling convert → chunk → embed pipeline
- embeddings/seed-news.ts: Batch embed news_articles into Qdrant
- routes/documents.ts: POST /api/documents/process, GET /api/documents, GET /api/documents/:id
- routes/search.ts: GET /api/search/documents, GET /api/search/news endpoints
- sql/005-documents.sql: Add chunks_count, processed_at to documents table
- Ollama + nomic-embed-text installed on Erik (CPU mode)
- 89 products + 40 datasheet chunks + 33 news articles in Qdrant
2026-03-28 00:22:01 +13:00 · 2026-03-28 00:22:01 +13:00 · 122ca8444d
commit 122ca8444d
parent 0260d0b365
6 changed files with 761 additions and 0 deletions
--- a/packages/api/src/embeddings/ocr-pipeline.ts
+++ b/packages/api/src/embeddings/ocr-pipeline.ts
@ -0,0 +1,336 @@
+/**
+ * OCR Pipeline: PDF/document → Docling → chunks → Ollama embed → Qdrant
+ *
+ * Connects to the Docling REST API (Erik port 8100) for document conversion,
+ * then chunks markdown output and embeds into Qdrant collections.
+ *
+ * Collections:
+ *   - datasheet_chunks: Product datasheets (specs, diagrams, compliance)
+ *   - manual_chunks: Installation/configuration manuals
+ *
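+ * Run: npx tsx packages/api/src/embeddings/ocr-pipeline.ts [--url <pdf_url> [--title <title>] [--collection <name>]] [--dir <path>]
+ *      With no flags, the built-in seed document list is processed.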
+ */
+import { pool } from "../db/client";
+import { embed, upsertPoints, CollectionName } from "./client";
+import { randomUUID } from "crypto";
+
+/** Base URL of the Docling REST service; a local instance is used when DOCLING_URL is unset or empty. */
+const DOCLING_URL = process.env["DOCLING_URL"] || "http://localhost:8100";
+
+/** JSON body returned by Docling's `/convert` endpoint, as consumed by this module. */
+interface DoclingResult {
+  /** False when conversion failed; `error` is then surfaced to the caller. */
+  success: boolean;
+  /** Converted document text (this pipeline requests markdown). */
+  content: string;
+  /** Output format label; not read anywhere in this module. */
+  format: string;
+  /** Page count, or null — presumably when Docling cannot determine it (TODO confirm). */
+  pages: number | null;
+  /** Failure description accompanying `success: false`. */
+  error?: string;
+}
+
+/** Metadata stored next to each chunk vector. */
+interface DocumentChunkPayload {
+  /** UUID shared by every chunk of the same processed document. */
+  document_id: string;
+  source_url: string;
+  document_type: "datasheet" | "manual" | "whitepaper";
+  /** Zero-based position of the chunk within its document. */
+  chunk_index: number;
+  total_chunks: number;
+  title: string;
+  section_heading: string;
+  /** Chunk body text, heading excluded. */
+  text: string;
+  /** Filled with the document's total page count from Docling, not a per-chunk page number. */
+  page_estimate: number | null;
+  vendor: string;
+  product_slug: string;
+}
+
+/** One point upserted into a chunk collection: id, embedding vector and payload. */
+interface DocumentChunk {
+  id: string;
+  vector: number[];
+  payload: DocumentChunkPayload;
+}
+
+/**
+ * POST a document location to Docling's `/convert` endpoint and return the parsed reply.
+ *
+ * @param url    Document location, forwarded to Docling unchanged.
+ * @param format Output format requested from Docling (markdown by default).
+ * @returns The JSON response body, typed as DoclingResult.
+ * @throws Error when the HTTP status is not 2xx; the message carries status and body text.
+ *         The request is aborted after 120 s (large PDFs can take a while).
+ */
+async function convertDocument(url: string, format: "markdown" | "json" = "markdown"): Promise<DoclingResult> {
+  const requestInit = {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ url, format }),
+    signal: AbortSignal.timeout(120000),
+  };
+
+  const response = await fetch(`${DOCLING_URL}/convert`, requestInit);
+  if (response.ok) {
+    return (await response.json()) as DoclingResult;
+  }
+
+  const failureBody = await response.text();
+  throw new Error(`Docling convert failed: ${response.status} ${failureBody}`);
+}
+
+/**
+ * Chunk markdown into sections, with overlap between chunks of an oversized section.
+ *
+ * Strategy:
+ *   1. Split at level 1-3 headings (natural section boundaries).
+ *   2. A section whose body fits in maxChunkSize becomes one chunk.
+ *   3. A larger body is packed paragraph by paragraph; each follow-on chunk starts
+ *      with the last overlapSize characters of the previous one.
+ *
+ * Sections shorter than 20 characters (heading included) and sections with a heading
+ * but no body are dropped, so no empty text is sent to the embedder. A single
+ * paragraph longer than maxChunkSize is kept whole, so chunks can exceed the limit.
+ *
+ * @param markdown     Markdown text to chunk.
+ * @param maxChunkSize Target maximum chunk length in characters.
+ * @param overlapSize  Characters repeated from the previous chunk; 0 or less disables overlap.
+ * @returns Chunks in document order, each with its section heading ("Introduction" when
+ *          the text precedes any heading).
+ */
+function chunkMarkdown(
+  markdown: string,
+  maxChunkSize: number = 1500,
+  overlapSize: number = 200,
+): Array<{ heading: string; text: string }> {
+  const chunks: Array<{ heading: string; text: string }> = [];
+
+  // Zero-width split: every piece after the first begins on a heading line.
+  for (const section of markdown.split(/(?=^#{1,3}\s)/m)) {
+    const trimmed = section.trim();
+    if (trimmed.length < 20) continue;
+
+    // Extract heading
+    const headingMatch = trimmed.match(/^(#{1,3})\s+(.+)/);
+    const heading = headingMatch ? headingMatch[2].trim() : "Introduction";
+    const body = headingMatch ? trimmed.slice(headingMatch[0].length).trim() : trimmed;
+
+    // A heading followed directly by another heading has nothing to embed.
+    if (!body) continue;
+
+    if (body.length <= maxChunkSize) {
+      chunks.push({ heading, text: body });
+      continue;
+    }
+
+    // Split large sections by paragraphs
+    let currentChunk = "";
+    for (const para of body.split(/\n\n+/)) {
+      if (currentChunk.length > 0 && currentChunk.length + para.length > maxChunkSize) {
+        chunks.push({ heading, text: currentChunk.trim() });
+        // slice(-0) is slice(0) and would copy the whole previous chunk, so guard it.
+        const overlapText = overlapSize > 0 ? currentChunk.slice(-overlapSize) : "";
+        currentChunk = overlapText ? overlapText + "\n\n" + para : para;
+      } else {
+        currentChunk += (currentChunk ? "\n\n" : "") + para;
+      }
+    }
+
+    if (currentChunk.trim().length > 20) {
+      chunks.push({ heading, text: currentChunk.trim() });
+    }
+  }
+
+  return chunks;
+}
+
+/**
+ * Guess the document category from keywords in the URL and in the first 2000
+ * characters of the content (case-insensitive).
+ *
+ * Datasheet keywords are tested before manual keywords; anything else is "whitepaper".
+ */
+function classifyDocument(url: string, content: string): "datasheet" | "manual" | "whitepaper" {
+  const link = url.toLowerCase();
+  const head = content.slice(0, 2000).toLowerCase();
+  const mentions = (haystack: string, needle: string): boolean => haystack.includes(needle);
+
+  const looksLikeDatasheet =
+    mentions(link, "datasheet") || mentions(head, "datasheet") || mentions(head, "specifications");
+  if (looksLikeDatasheet) {
+    return "datasheet";
+  }
+
+  const looksLikeManual =
+    mentions(link, "manual") ||
+    mentions(link, "install") ||
+    mentions(head, "installation guide") ||
+    mentions(head, "user manual");
+  return looksLikeManual ? "manual" : "whitepaper";
+}
+
+/**
+ * Map a URL to a known vendor name by looking for vendor substrings in it.
+ * The first matching entry in table order wins; "Unknown" when nothing matches.
+ */
+function extractVendor(url: string): string {
+  const vendorTable: ReadonlyArray<readonly [RegExp, string]> = [
+    [/flexoptix/i, "Flexoptix"],
+    [/cisco/i, "Cisco"],
+    [/juniper/i, "Juniper"],
+    [/arista/i, "Arista"],
+    [/nokia/i, "Nokia"],
+    [/huawei/i, "Huawei"],
+    [/finisar|ii-vi|coherent/i, "II-VI/Coherent"],
+    [/innolight/i, "Innolight"],
+    [/broadcom/i, "Broadcom"],
+    [/intel/i, "Intel"],
+    [/fs\.com|fiberstore/i, "FS.com"],
+    [/10gtek/i, "10Gtek"],
+  ];
+
+  const haystack = url.toLowerCase();
+  const hit = vendorTable.find(([pattern]) => pattern.test(haystack));
+  return hit ? hit[1] : "Unknown";
+}
+
+/**
+ * Derive a lowercase, dash-separated slug from the last path segment of a URL.
+ *
+ * Any query string or fragment is dropped first, so "sheet.pdf?v=2" still has its
+ * extension recognised. A trailing .pdf/.docx/.doc/.xlsx extension is removed and
+ * every character other than letters, digits and "-" becomes "-".
+ *
+ * @param url URL or path of the document.
+ * @returns The slug, or "" when the URL ends in "/".
+ */
+function extractProductSlug(url: string): string {
+  const pathOnly = url.split(/[?#]/, 1)[0] ?? "";
+  const filename = pathOnly.split("/").pop() || "";
+  return filename.replace(/\.(pdf|docx|doc|xlsx)$/i, "").replace(/[^a-zA-Z0-9-]/g, "-").toLowerCase();
+}
+
+/**
+ * Run one document through the pipeline: Docling convert → chunk → embed → upsert,
+ * then record it in the `documents` table.
+ *
+ * @param url        Document location handed to Docling; also used to derive type, vendor and slug.
+ * @param collection Collection that receives the chunk points (defaults to "datasheet_chunks").
+ * @param title      Optional display title; defaults to the slug with dashes turned into spaces.
+ * @returns The generated document UUID and the number of chunk points upserted.
+ * @throws Error when Docling reports `success: false`. Errors from conversion, embedding
+ *         and upserting propagate. A failed `documents` insert is logged, not thrown,
+ *         because the vectors are already stored at that point.
+ */
+async function processDocument(
+  url: string,
+  collection: CollectionName = "datasheet_chunks",
+  title?: string,
+): Promise<{ documentId: string; chunksStored: number }> {
+  const documentId = randomUUID();
+
+  console.log(`  Converting: ${url}`);
+  const result = await convertDocument(url);
+
+  if (!result.success) {
+    throw new Error(`Conversion failed: ${result.error}`);
+  }
+
+  const markdown = result.content;
+  console.log(`  Converted: ${result.pages ?? "?"} pages, ${markdown.length} chars`);
+
+  const docType = classifyDocument(url, markdown);
+  const vendor = extractVendor(url);
+  const productSlug = extractProductSlug(url);
+  // A URL ending in "/" gives an empty slug; fall back so the title is never blank.
+  const docTitle = title || productSlug.replace(/-/g, " ") || "untitled";
+
+  // Chunk the markdown
+  const chunks = chunkMarkdown(markdown);
+  console.log(`  Chunked: ${chunks.length} chunks (type: ${docType})`);
+
+  if (chunks.length === 0) {
+    console.log("  Warning: No chunks produced, skipping");
+    return { documentId, chunksStored: 0 };
+  }
+
+  // Embed and store in batches; each batch embeds its chunks concurrently.
+  const BATCH_SIZE = 5;
+  let stored = 0;
+
+  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
+    const batch = chunks.slice(i, i + BATCH_SIZE);
+
+    const points: DocumentChunk[] = await Promise.all(
+      batch.map(async (chunk, idx) => {
+        // Title and heading are prepended so the vector carries document context.
+        const embeddingText = `${docTitle}. ${chunk.heading}. ${chunk.text}`;
+        const vector = await embed(embeddingText);
+
+        return {
+          id: randomUUID(),
+          vector,
+          payload: {
+            document_id: documentId,
+            source_url: url,
+            document_type: docType,
+            chunk_index: i + idx,
+            total_chunks: chunks.length,
+            title: docTitle,
+            section_heading: chunk.heading,
+            text: chunk.text,
+            page_estimate: result.pages,
+            vendor,
+            product_slug: productSlug,
+          },
+        };
+      }),
+    );
+
+    await upsertPoints(collection, points);
+    stored += points.length;
+    console.log(`  Embedded ${stored}/${chunks.length} chunks`);
+  }
+
+  // Record in documents table
+  try {
+    await pool.query(
+      `INSERT INTO documents (id, entity_type, doc_type, title, r2_key, source_url, page_count, chunks_count, ocr_status, processed_at)
+       VALUES ($1, 'transceiver', $2, $3, $4, $5, $6, $7, 'completed', NOW())
+       ON CONFLICT ON CONSTRAINT documents_pkey DO UPDATE
+         SET processed_at = NOW(), chunks_count = $7, ocr_status = 'completed'`,
+      [documentId, docType, docTitle, `ocr/${documentId}`, url, result.pages, chunks.length],
+    );
+  } catch (err) {
+    // Best-effort bookkeeping: keep going, but leave a trace instead of failing silently.
+    const reason = err instanceof Error ? err.message : String(err);
+    console.warn(`  Warning: could not record document ${documentId} in documents table: ${reason}`);
+  }
+
+  return { documentId, chunksStored: stored };
+}
+
+/** One entry of the built-in seed list: where to fetch it, what to call it, where to store it. */
+interface SeedDocument {
+  url: string;
+  title: string;
+  collection: CollectionName;
+}
+
+/** Documents processed, in this order, when the CLI is run without --url or --dir. */
+const SEED_DATASHEETS: Array<SeedDocument> = [
+  {
+    // Flexoptix product guide
+    title: "Flexoptix SFP Compatibility Guide",
+    url: "https://www.flexoptix.net/media/pdf/flexoptix-sfp-compatibility-guide.pdf",
+    collection: "datasheet_chunks",
+  },
+  {
+    // IEEE standard (publicly available)
+    title: "IEEE 802.3 Ethernet Standard",
+    url: "https://standards.ieee.org/content/dam/ieee-standards/standards/web/download/802.3-2022_downloads/802.3-2022.pdf",
+    collection: "manual_chunks",
+  },
+];
+
+/**
+ * CLI entry point. The mode is chosen by flag:
+ *   --url <pdf_url> [--title <title>] [--collection <name>]   process one document
+ *   --dir <path>                                               process every *.pdf in a directory
+ *   (no flag)                                                  process SEED_DATASHEETS
+ *
+ * Exits with status 1 when the Docling health check fails or a required flag value
+ * is missing. In --dir and seed mode a failing document is logged and skipped.
+ */
+async function main() {
+  const args = process.argv.slice(2);
+
+  // Value that follows `flag`, or undefined when the flag or its value is absent.
+  const flagValue = (flag: string): string | undefined => {
+    const at = args.indexOf(flag);
+    return at === -1 ? undefined : args[at + 1];
+  };
+
+  console.log("=== OCR Pipeline: Document → Chunks → Embeddings ===\n");
+
+  // Check Docling health
+  try {
+    const healthResp = await fetch(`${DOCLING_URL}/health`, { signal: AbortSignal.timeout(5000) });
+    if (!healthResp.ok) {
+      throw new Error(`health check returned HTTP ${healthResp.status}`);
+    }
+    const health = (await healthResp.json()) as { status: string };
+    console.log(`Docling API: ${health.status} at ${DOCLING_URL}`);
+  } catch (err) {
+    console.error(`Docling API not reachable at ${DOCLING_URL}: ${(err as Error).message}`);
+    console.error("Set DOCLING_URL env var or start Docling on Erik (port 8100)");
+    process.exit(1);
+  }
+
+  let totalDocs = 0;
+  let totalChunks = 0;
+
+  if (args.includes("--url")) {
+    // Process a single URL
+    const url = flagValue("--url");
+    const title = flagValue("--title");
+    const collection = (flagValue("--collection") ?? "datasheet_chunks") as CollectionName;
+
+    if (!url) {
+      console.error("Usage: --url <pdf_url> [--title <title>] [--collection <name>]");
+      process.exit(1);
+    }
+
+    const result = await processDocument(url, collection, title);
+    totalDocs = 1;
+    totalChunks = result.chunksStored;
+  } else if (args.includes("--dir")) {
+    // Process all PDFs in a directory
+    const dir = flagValue("--dir");
+    if (!dir) {
+      // Without this guard readdirSync(undefined) fails with an unhelpful TypeError.
+      console.error("Usage: --dir <path>");
+      process.exit(1);
+    }
+
+    const { readdirSync } = await import("fs");
+    const files = readdirSync(dir).filter((f) => f.toLowerCase().endsWith(".pdf"));
+
+    console.log(`Found ${files.length} PDFs in ${dir}\n`);
+
+    for (const file of files) {
+      // NOTE(review): the local path is sent to Docling as the `url`; this presumably only
+      // works when the Docling service can read the same filesystem — confirm.
+      const filePath = `${dir}/${file}`;
+      try {
+        const result = await processDocument(filePath, "datasheet_chunks");
+        totalDocs++;
+        totalChunks += result.chunksStored;
+      } catch (err) {
+        console.error(`  Failed: ${file} — ${(err as Error).message}`);
+      }
+    }
+  } else {
+    // Seed from known URLs
+    console.log(`Processing ${SEED_DATASHEETS.length} seed documents\n`);
+
+    for (const doc of SEED_DATASHEETS) {
+      try {
+        console.log(`\n[${doc.title}]`);
+        const result = await processDocument(doc.url, doc.collection, doc.title);
+        totalDocs++;
+        totalChunks += result.chunksStored;
+      } catch (err) {
+        console.error(`  Failed: ${doc.title} — ${(err as Error).message}`);
+      }
+    }
+  }
+
+  console.log(`\n=== Done: ${totalDocs} documents, ${totalChunks} chunks embedded ===`);
+  await pool.end();
+}
+
+// Anything that escapes main() is fatal: report it, release the pool, exit non-zero.
+main().catch((err) => {
+  console.error("Fatal:", err);
+  pool.end();
+  process.exit(1);
+});
--- a/packages/api/src/embeddings/seed-news.ts
+++ b/packages/api/src/embeddings/seed-news.ts
@ -0,0 +1,80 @@
+/**
+ * Seed news_embeddings collection from PostgreSQL news_articles table.
+ *
+ * Run: npx tsx packages/api/src/embeddings/seed-news.ts
+ */
+import { pool } from "../db/client";
+import { embed, upsertPoints } from "./client";
+
+/**
+ * Build the text that is embedded for one news article row.
+ *
+ * Uses, in order: title, "Source: …", "Category: …", summary and the first 500
+ * characters of full_text. Falsy fields are skipped; the rest are joined with ". ".
+ */
+function articleToText(row: Record<string, unknown>): string {
+  const segments: string[] = [];
+  const addIfPresent = (value: unknown, render: () => string): void => {
+    if (!value) return;
+    const rendered = render();
+    if (rendered) segments.push(rendered);
+  };
+
+  addIfPresent(row.title, () => `${row.title}`);
+  addIfPresent(row.source, () => `Source: ${row.source}`);
+  addIfPresent(row.category, () => `Category: ${row.category}`);
+  addIfPresent(row.summary, () => `${row.summary}`);
+  addIfPresent(row.full_text, () => `${String(row.full_text).slice(0, 500)}`);
+
+  return segments.join(". ");
+}
+
+/**
+ * Embed up to 500 of the most recently published news_articles rows and upsert
+ * them into the "news_embeddings" collection, 10 articles per batch.
+ *
+ * Closes the database pool before returning. Embedding or upsert errors propagate
+ * to the fatal handler below.
+ */
+async function main() {
+  console.log("=== Seeding news_embeddings ===\n");
+
+  // NULLS LAST: PostgreSQL sorts NULLs first under DESC, which would let undated
+  // rows take up the LIMIT window ahead of the newest dated articles.
+  const result = await pool.query(
+    `SELECT id, title, source_url, source, category, summary, full_text, published_at, relevance_score
+     FROM news_articles
+     ORDER BY published_at DESC NULLS LAST
+     LIMIT 500`,
+  );
+  const articles = result.rows;
+
+  console.log(`Found ${articles.length} news articles to embed\n`);
+
+  if (articles.length === 0) {
+    console.log("No articles found. Run the news scraper first.");
+    await pool.end();
+    return;
+  }
+
+  const BATCH_SIZE = 10;
+  let total = 0;
+
+  for (let i = 0; i < articles.length; i += BATCH_SIZE) {
+    const batch = articles.slice(i, i + BATCH_SIZE);
+
+    // Articles within a batch are embedded concurrently.
+    const points = await Promise.all(
+      batch.map(async (row) => {
+        const text = articleToText(row);
+        const vector = await embed(text);
+
+        return {
+          // The article's primary key doubles as the point id, so re-running overwrites in place.
+          id: String(row.id),
+          vector,
+          payload: {
+            title: row.title || "",
+            url: row.source_url || "",
+            source: row.source || "",
+            category: row.category || "",
+            summary: row.summary || "",
+            published_at: row.published_at ? new Date(row.published_at).toISOString() : "",
+            relevance_score: row.relevance_score || 0,
+            text,
+          },
+        };
+      }),
+    );
+
+    await upsertPoints("news_embeddings", points);
+    total += points.length;
+    console.log(`  Embedded ${total}/${articles.length} articles`);
+  }
+
+  console.log(`\n=== Done: ${total} articles embedded ===`);
+  await pool.end();
+}
+
+// Anything that escapes main() is fatal: report it, release the pool, exit non-zero.
+main().catch((err) => {
+  console.error("Fatal:", err);
+  pool.end();
+  process.exit(1);
+});
--- a/packages/api/src/index.ts
+++ b/packages/api/src/index.ts
@ -10,6 +10,7 @@ import { standardRouter } from "./routes/standards";
 import { healthRouter } from "./routes/health";
 import { hypeCycleRouter } from "./routes/hype-cycle";
 import { searchRouter } from "./routes/search";
+import { documentRouter } from "./routes/documents";

 const app = express();

@ -34,6 +35,7 @@ app.use("/api/standards", standardRouter);
 app.use("/api/health", healthRouter);
 app.use("/api/hype-cycle", hypeCycleRouter);
 app.use("/api/search", searchRouter);
+app.use("/api/documents", documentRouter);

 // Root
 app.get("/", (_req, res) => {
@ -53,7 +55,12 @@ app.get("/", (_req, res) => {
      "GET /api/hype-cycle/:tech",
      "GET /api/search?q=&collection=&limit=",
      "GET /api/search/products?q=&form_factor=&speed_gbps=&fiber_type=",
+      "GET /api/search/documents?q=&doc_type=&vendor=&collection=",
+      "GET /api/search/news?q=&source=",
      "GET /api/search/stats",
+      "POST /api/documents/process  {url, title?, doc_type?, vendor?, collection?}",
+      "GET /api/documents",
+      "GET /api/documents/:id",
    ],
  });
 });
--- a/packages/api/src/routes/documents.ts
+++ b/packages/api/src/routes/documents.ts
@ -0,0 +1,217 @@
+/**
+ * Document processing API routes (OCR Pipeline)
+ *
+ * POST /api/documents/process  — Submit a document URL for OCR + embedding
+ * GET  /api/documents          — List processed documents
+ * GET  /api/documents/:id      — Get document chunks
+ */
+import { Router, Request, Response } from "express";
+import { embed, upsertPoints, CollectionName } from "../embeddings/client";
+import { pool } from "../db/client";
+import { randomUUID } from "crypto";
+
+/** Router mounted by the API for the document processing endpoints defined below. */
+export const documentRouter = Router();
+
+/** Base URL of the Docling REST service; a local instance is used when DOCLING_URL is unset or empty. */
+const DOCLING_URL = process.env["DOCLING_URL"] || "http://localhost:8100";
+
+/** JSON body returned by Docling's `/convert` endpoint, as consumed by these routes. */
+interface DoclingResult {
+  /** False when conversion failed; `error` is then returned as the response detail. */
+  success: boolean;
+  /** Converted document text (these routes request markdown). */
+  content: string;
+  /** Output format label; not read anywhere in this module. */
+  format: string;
+  /** Page count, or null — presumably when Docling cannot determine it (TODO confirm). */
+  pages: number | null;
+  /** Failure description accompanying `success: false`. */
+  error?: string;
+}
+
+/**
+ * Chunk markdown into sections, with overlap between chunks of an oversized section.
+ *
+ * The text is split at level 1-3 headings. A section whose body fits in maxChunkSize
+ * becomes one chunk; a larger body is packed paragraph by paragraph, and each
+ * follow-on chunk starts with the last overlapSize characters of the previous one.
+ *
+ * Sections shorter than 20 characters (heading included) and sections with a heading
+ * but no body are dropped, so no empty text is sent to the embedder. A single
+ * paragraph longer than maxChunkSize is kept whole, so chunks can exceed the limit.
+ *
+ * @param markdown     Markdown text to chunk.
+ * @param maxChunkSize Target maximum chunk length in characters.
+ * @param overlapSize  Characters repeated from the previous chunk; 0 or less disables overlap.
+ * @returns Chunks in document order, each with its section heading ("Introduction" when
+ *          the text precedes any heading).
+ */
+function chunkMarkdown(
+  markdown: string,
+  maxChunkSize: number = 1500,
+  overlapSize: number = 200,
+): Array<{ heading: string; text: string }> {
+  const chunks: Array<{ heading: string; text: string }> = [];
+
+  // Zero-width split: every piece after the first begins on a heading line.
+  for (const section of markdown.split(/(?=^#{1,3}\s)/m)) {
+    const trimmed = section.trim();
+    if (trimmed.length < 20) continue;
+
+    const headingMatch = trimmed.match(/^(#{1,3})\s+(.+)/);
+    const heading = headingMatch ? headingMatch[2].trim() : "Introduction";
+    const body = headingMatch ? trimmed.slice(headingMatch[0].length).trim() : trimmed;
+
+    // A heading followed directly by another heading has nothing to embed.
+    if (!body) continue;
+
+    if (body.length <= maxChunkSize) {
+      chunks.push({ heading, text: body });
+      continue;
+    }
+
+    let currentChunk = "";
+    for (const para of body.split(/\n\n+/)) {
+      if (currentChunk.length > 0 && currentChunk.length + para.length > maxChunkSize) {
+        chunks.push({ heading, text: currentChunk.trim() });
+        // slice(-0) is slice(0) and would copy the whole previous chunk, so guard it.
+        const overlapText = overlapSize > 0 ? currentChunk.slice(-overlapSize) : "";
+        currentChunk = overlapText ? overlapText + "\n\n" + para : para;
+      } else {
+        currentChunk += (currentChunk ? "\n\n" : "") + para;
+      }
+    }
+
+    if (currentChunk.trim().length > 20) {
+      chunks.push({ heading, text: currentChunk.trim() });
+    }
+  }
+
+  return chunks;
+}
+
+// POST /api/documents/process — Convert a document via Docling, embed its chunks, record it.
+//
+// Body: { url, title?, doc_type?, vendor?, collection? }
+// Responses:
+//   400  url missing / not a string, or collection is not one of the two chunk collections
+//   502  Docling answered with a non-2xx status or `success: false`
+//   503  anything else that throws (Docling unreachable or timed out, embedding, upsert)
+//   200  summary of the stored document
+documentRouter.post("/process", async (req: Request, res: Response) => {
+  // req.body is undefined when no body was parsed; destructuring it would throw
+  // outside the try block below and leave the request without a response.
+  const { url, title, doc_type, vendor, collection } = (req.body ?? {}) as {
+    url?: string;
+    title?: string;
+    doc_type?: string;
+    vendor?: string;
+    collection?: string;
+  };
+
+  // The cast above is unchecked, so verify the type before calling string methods on it.
+  if (typeof url !== "string" || url.trim() === "") {
+    res.status(400).json({ success: false, error: "Missing 'url' in request body" });
+    return;
+  }
+
+  const targetCollection = (collection || "datasheet_chunks") as CollectionName;
+  if (!["datasheet_chunks", "manual_chunks"].includes(targetCollection)) {
+    res.status(400).json({ success: false, error: "collection must be 'datasheet_chunks' or 'manual_chunks'" });
+    return;
+  }
+
+  try {
+    // Convert via Docling.
+    // NOTE(review): `url` is caller-supplied and forwarded unchanged, so the Docling host
+    // will fetch whatever it names (SSRF surface) — consider an allow-list if this is exposed.
+    const docResp = await fetch(`${DOCLING_URL}/convert`, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({ url, format: "markdown" }),
+      signal: AbortSignal.timeout(120000),
+    });
+
+    if (!docResp.ok) {
+      res.status(502).json({ success: false, error: "Docling conversion failed", detail: await docResp.text() });
+      return;
+    }
+
+    const docResult = (await docResp.json()) as DoclingResult;
+    if (!docResult.success) {
+      res.status(502).json({ success: false, error: "Docling conversion failed", detail: docResult.error });
+      return;
+    }
+
+    const documentId = randomUUID();
+    // Default title: last path segment of the URL without its extension.
+    const docTitle =
+      (typeof title === "string" && title) || url.split("/").pop()?.replace(/\.[^.]+$/, "") || "untitled";
+    const docType = doc_type || "datasheet";
+    const docVendor = vendor || "Unknown";
+
+    // Chunk
+    const chunks = chunkMarkdown(docResult.content);
+
+    // Embed and store in batches; each batch embeds its chunks concurrently.
+    const BATCH_SIZE = 5;
+    let stored = 0;
+
+    for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
+      const batch = chunks.slice(i, i + BATCH_SIZE);
+      const points = await Promise.all(
+        batch.map(async (chunk, idx) => {
+          // Title and heading are prepended so the vector carries document context.
+          const embeddingText = `${docTitle}. ${chunk.heading}. ${chunk.text}`;
+          const vector = await embed(embeddingText);
+          return {
+            id: randomUUID(),
+            vector,
+            payload: {
+              document_id: documentId,
+              source_url: url,
+              document_type: docType,
+              chunk_index: i + idx,
+              total_chunks: chunks.length,
+              title: docTitle,
+              section_heading: chunk.heading,
+              text: chunk.text,
+              page_estimate: docResult.pages,
+              vendor: docVendor,
+              product_slug: docTitle.replace(/\s+/g, "-").toLowerCase(),
+            },
+          };
+        }),
+      );
+
+      await upsertPoints(targetCollection, points);
+      stored += points.length;
+    }
+
+    // Record in documents table (existing schema); only the first 50000 characters of
+    // the converted text are kept in ocr_text.
+    try {
+      await pool.query(
+        `INSERT INTO documents (id, entity_type, doc_type, title, r2_key, source_url, page_count, chunks_count, ocr_status, ocr_text, processed_at)
+         VALUES ($1, 'transceiver', $2, $3, $4, $5, $6, $7, 'completed', $8, NOW())
+         ON CONFLICT ON CONSTRAINT documents_pkey DO UPDATE
+           SET processed_at = NOW(), chunks_count = $7, ocr_status = 'completed'`,
+        [documentId, docType, docTitle, `ocr/${documentId}`, url, docResult.pages, chunks.length, docResult.content.slice(0, 50000)],
+      );
+    } catch (err) {
+      // Best-effort bookkeeping: the vectors are already stored, so still answer 200,
+      // but leave a trace instead of failing silently.
+      const reason = err instanceof Error ? err.message : String(err);
+      console.warn(`Could not record document ${documentId} in documents table: ${reason}`);
+    }
+
+    res.json({
+      success: true,
+      document_id: documentId,
+      title: docTitle,
+      pages: docResult.pages,
+      chunks: chunks.length,
+      collection: targetCollection,
+      markdown_length: docResult.content.length,
+    });
+  } catch (err) {
+    res.status(503).json({
+      success: false,
+      error: "Document processing failed",
+      detail: (err as Error).message,
+    });
+  }
+});
+
+// GET /api/documents — List processed documents (newest first, at most 100).
+//
+// Returns an empty list with a note when the documents table does not exist yet;
+// any other database failure is reported as 500 instead of an empty success.
+documentRouter.get("/", async (_req: Request, res: Response) => {
+  try {
+    const result = await pool.query(
+      `SELECT id, title, source_url, doc_type, entity_type, page_count, chunks_count, ocr_status, processed_at, created_at
+       FROM documents
+       ORDER BY COALESCE(processed_at, created_at) DESC
+       LIMIT 100`,
+    );
+
+    res.json({ success: true, documents: result.rows, count: result.rows.length });
+  } catch (err) {
+    // SQLSTATE 42P01 = undefined_table.
+    // Assumes `pool` is node-postgres, whose errors expose the SQLSTATE as `code` — TODO confirm.
+    if ((err as { code?: string } | null)?.code === "42P01") {
+      res.json({ success: true, documents: [], count: 0, note: "documents table not yet created" });
+      return;
+    }
+
+    console.error("GET /api/documents failed:", err);
+    res.status(500).json({ success: false, error: "Failed to list documents" });
+  }
+});
+
+// GET /api/documents/:id — Get the documents-table row for one document.
+//
+// 404 when no row matches, when :id is not a valid UUID, or when the table does not
+// exist yet; any other database failure is reported as 500 rather than "not found".
+documentRouter.get("/:id", async (req: Request, res: Response) => {
+  try {
+    const result = await pool.query(
+      `SELECT id, title, source_url, doc_type, entity_type, page_count, chunks_count, ocr_status, processed_at, created_at
+       FROM documents WHERE id = $1::uuid`,
+      [req.params.id],
+    );
+
+    if (result.rows.length === 0) {
+      res.status(404).json({ success: false, error: "Document not found" });
+      return;
+    }
+
+    res.json({ success: true, document: result.rows[0] });
+  } catch (err) {
+    // 22P02 = invalid_text_representation (the $1::uuid cast rejected :id);
+    // 42P01 = undefined_table.
+    // Assumes `pool` is node-postgres, whose errors expose the SQLSTATE as `code` — TODO confirm.
+    const sqlState = (err as { code?: string } | null)?.code;
+    if (sqlState === "22P02" || sqlState === "42P01") {
+      res.status(404).json({ success: false, error: "Document not found or table not created" });
+      return;
+    }
+
+    console.error("GET /api/documents/:id failed:", err);
+    res.status(500).json({ success: false, error: "Failed to load document" });
+  }
+});
--- a/packages/api/src/routes/search.ts
+++ b/packages/api/src/routes/search.ts
@ -3,6 +3,8 @@
 *
 * GET /api/search?q=<query>&collection=<col>&limit=<n>
 * GET /api/search/products?q=<query>&form_factor=&speed_gbps=&fiber_type=
+ * GET /api/search/documents?q=<query>&doc_type=&vendor=
+ * GET /api/search/news?q=<query>&source=
 */
 import { Router, Request, Response } from "express";
 import { semanticSearch, getCollectionInfo, CollectionName } from "../embeddings/client";
@ -123,6 +125,122 @@ searchRouter.get("/products", async (req: Request, res: Response) => {
  }
 });

+// GET /api/search/documents — Search datasheets and manuals.
+//
+// Query: q (required), limit (positive integer, default 10), doc_type, vendor,
+//        collection ("datasheet_chunks" by default, or "manual_chunks").
+// Matching chunks are grouped by document in the response.
+searchRouter.get("/documents", async (req: Request, res: Response) => {
+  const query = q("q", req);
+  // A missing, non-numeric, zero or negative limit falls back to 10 instead of
+  // reaching the vector search as NaN or a non-positive number.
+  const requestedLimit = Number.parseInt(q("limit", req) || "10", 10);
+  const limit = Number.isInteger(requestedLimit) && requestedLimit > 0 ? requestedLimit : 10;
+  const docType = q("doc_type", req);
+  const vendor = q("vendor", req);
+  const collection = (q("collection", req) || "datasheet_chunks") as CollectionName;
+
+  if (!query) {
+    res.status(400).json({ success: false, error: "Missing 'q' parameter" });
+    return;
+  }
+
+  if (!["datasheet_chunks", "manual_chunks"].includes(collection)) {
+    res.status(400).json({
+      success: false,
+      error: "collection must be 'datasheet_chunks' or 'manual_chunks'",
+    });
+    return;
+  }
+
+  // doc_type is lower-cased before matching; vendor is matched exactly as given.
+  const mustConditions: Array<Record<string, unknown>> = [];
+  if (docType) {
+    mustConditions.push({ key: "document_type", match: { value: docType.toLowerCase() } });
+  }
+  if (vendor) {
+    mustConditions.push({ key: "vendor", match: { value: vendor } });
+  }
+
+  const filter = mustConditions.length > 0 ? { must: mustConditions } : undefined;
+
+  try {
+    const results = await semanticSearch(collection, query, limit, filter);
+
+    // Group by document for cleaner output
+    const byDocument = new Map<string, { title: string; vendor: string; source_url: string; chunks: Array<{ score: number; heading: string; text: string; chunk_index: number }> }>();
+
+    for (const r of results) {
+      // Points without a document_id form a group of their own, keyed by point id.
+      const docId = String(r.payload.document_id || r.id);
+      if (!byDocument.has(docId)) {
+        byDocument.set(docId, {
+          title: String(r.payload.title || ""),
+          vendor: String(r.payload.vendor || ""),
+          source_url: String(r.payload.source_url || ""),
+          chunks: [],
+        });
+      }
+      byDocument.get(docId)!.chunks.push({
+        score: Math.round(r.score * 1000) / 1000, // three decimals
+        heading: String(r.payload.section_heading || ""),
+        text: String(r.payload.text || "").slice(0, 500),
+        chunk_index: Number(r.payload.chunk_index || 0),
+      });
+    }
+
+    res.json({
+      success: true,
+      query,
+      collection,
+      filters: { docType, vendor },
+      documents: Array.from(byDocument.values()),
+      totalChunks: results.length,
+    });
+  } catch (err) {
+    res.status(503).json({
+      success: false,
+      error: "Vector search unavailable",
+      detail: (err as Error).message,
+    });
+  }
+});
+
+// GET /api/search/news — Search news articles in the "news_embeddings" collection.
+//
+// Query: q (required), limit (positive integer, default 10), source (exact match).
+searchRouter.get("/news", async (req: Request, res: Response) => {
+  const query = q("q", req);
+  // A missing, non-numeric, zero or negative limit falls back to 10 instead of
+  // reaching the vector search as NaN or a non-positive number.
+  const requestedLimit = Number.parseInt(q("limit", req) || "10", 10);
+  const limit = Number.isInteger(requestedLimit) && requestedLimit > 0 ? requestedLimit : 10;
+  const source = q("source", req);
+
+  if (!query) {
+    res.status(400).json({ success: false, error: "Missing 'q' parameter" });
+    return;
+  }
+
+  const mustConditions: Array<Record<string, unknown>> = [];
+  if (source) {
+    mustConditions.push({ key: "source", match: { value: source } });
+  }
+
+  const filter = mustConditions.length > 0 ? { must: mustConditions } : undefined;
+
+  try {
+    const results = await semanticSearch("news_embeddings", query, limit, filter);
+    res.json({
+      success: true,
+      query,
+      filters: { source },
+      results: results.map((r) => ({
+        id: r.id,
+        score: Math.round(r.score * 1000) / 1000, // three decimals
+        title: r.payload.title,
+        url: r.payload.url,
+        source: r.payload.source,
+        summary: r.payload.summary,
+        published_at: r.payload.published_at,
+      })),
+      count: results.length,
+    });
+  } catch (err) {
+    res.status(503).json({
+      success: false,
+      error: "Vector search unavailable",
+      detail: (err as Error).message,
+    });
+  }
+});
+
 // GET /api/search/stats — Collection statistics
 searchRouter.get("/stats", async (_req: Request, res: Response) => {
  try {
--- a/sql/005-documents.sql
+++ b/sql/005-documents.sql
@ -0,0 +1,3 @@
+-- OCR pipeline bookkeeping columns on the existing documents table:
+--   chunks_count  chunk count written by the pipeline (0 by default)
+--   processed_at  timestamp written when a document is recorded as processed
+-- IF NOT EXISTS keeps the migration safe to re-run.
+ALTER TABLE documents
+  ADD COLUMN IF NOT EXISTS chunks_count INT DEFAULT 0,
+  ADD COLUMN IF NOT EXISTS processed_at TIMESTAMPTZ;