diff --git a/packages/api/src/embeddings/ocr-pipeline.ts b/packages/api/src/embeddings/ocr-pipeline.ts
new file mode 100644
index 0000000..ddcb710
--- /dev/null
+++ b/packages/api/src/embeddings/ocr-pipeline.ts
@@ -0,0 +1,336 @@
+/**
+ * OCR Pipeline: PDF/document → Docling → chunks → Ollama embed → Qdrant
+ *
+ * Connects to the Docling REST API (Erik port 8100) for document conversion,
+ * then chunks markdown output and embeds into Qdrant collections.
+ *
+ * Collections:
+ *   - datasheet_chunks: Product datasheets (specs, diagrams, compliance)
+ *   - manual_chunks: Installation/configuration manuals
+ *
+ * Run: npx tsx packages/api/src/embeddings/ocr-pipeline.ts [--url <url>] [--dir <dir>]
+ */
+import { pool } from "../db/client";
+import { embed, upsertPoints, CollectionName } from "./client";
+import { randomUUID } from "crypto";
+
+const DOCLING_URL = process.env.DOCLING_URL || "http://localhost:8100";
+
+interface DoclingResult {
+  success: boolean;
+  content: string;
+  format: string;
+  pages: number | null;
+  error?: string;
+}
+
+interface DocumentChunk {
+  id: string;
+  vector: number[];
+  payload: {
+    document_id: string;
+    source_url: string;
+    document_type: "datasheet" | "manual" | "whitepaper";
+    chunk_index: number;
+    total_chunks: number;
+    title: string;
+    section_heading: string;
+    text: string;
+    page_estimate: number | null;
+    vendor: string;
+    product_slug: string;
+  };
+}
+
+/** Convert a document via the Docling API (POST /convert, 2 min timeout); throws on a non-2xx response. */
+async function convertDocument(url: string, format: "markdown" | "json" = "markdown"): Promise<DoclingResult> {
+  const resp = await fetch(`${DOCLING_URL}/convert`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ url, format }),
+    signal: AbortSignal.timeout(120000), // 2 min for large PDFs
+  });
+
+  if (!resp.ok) {
+    throw new Error(`Docling convert failed: ${resp.status} ${await resp.text()}`);
+  }
+
+  return resp.json() as Promise<DoclingResult>;
+}
+
+/**
+ * Chunk markdown into overlapping sections.
+ *
+ * Strategy:
+ * 1. Split by #, ## and ### headings first (natural section boundaries)
+ * 2. If a section exceeds maxChunkSize, split by paragraphs
+ * 3. Apply overlap (repeat last overlapSize chars of previous chunk; 0 disables overlap)
+ */
+function chunkMarkdown(
+  markdown: string,
+  maxChunkSize: number = 1500,
+  overlapSize: number = 200,
+): Array<{ heading: string; text: string }> {
+  const sections = markdown.split(/(?=^#{1,3}\s)/m);
+  const chunks: Array<{ heading: string; text: string }> = [];
+
+  for (const section of sections) {
+    const trimmed = section.trim();
+    if (!trimmed || trimmed.length < 20) continue;
+
+    // Extract heading; text before the first heading is filed under "Introduction"
+    const headingMatch = trimmed.match(/^(#{1,3})\s+(.+)/);
+    const heading = headingMatch ? headingMatch[2].trim() : "Introduction";
+    const body = headingMatch ? trimmed.slice(headingMatch[0].length).trim() : trimmed;
+
+    if (body.length <= maxChunkSize) {
+      if (body) chunks.push({ heading, text: body }); // skip heading-only sections (empty text)
+    } else {
+      // Split large sections by paragraphs
+      const paragraphs = body.split(/\n\n+/);
+      let currentChunk = "";
+
+      for (const para of paragraphs) {
+        if (currentChunk.length + para.length > maxChunkSize && currentChunk.length > 0) {
+          chunks.push({ heading, text: currentChunk.trim() });
+          // Overlap: keep tail of previous chunk (slice(-0) would return the whole chunk)
+          const overlapText = overlapSize > 0 ? currentChunk.slice(-overlapSize) : "";
+          currentChunk = overlapText + "\n\n" + para;
+        } else {
+          currentChunk += (currentChunk ? "\n\n" : "") + para;
+        }
+      }
+
+      if (currentChunk.trim().length > 20) {
+        chunks.push({ heading, text: currentChunk.trim() });
+      }
+    }
+  }
+
+  return chunks;
+}
+
+/** Classify document type from the URL or the first 2000 chars of content; defaults to "whitepaper" */
+function classifyDocument(url: string, content: string): "datasheet" | "manual" | "whitepaper" {
+  const urlLower = url.toLowerCase();
+  const contentLower = content.slice(0, 2000).toLowerCase();
+
+  if (urlLower.includes("datasheet") || contentLower.includes("datasheet") || contentLower.includes("specifications")) {
+    return "datasheet";
+  }
+  if (urlLower.includes("manual") || urlLower.includes("install") || contentLower.includes("installation guide") || contentLower.includes("user manual")) {
+    return "manual";
+  }
+  return "whitepaper";
+}
+
+/** Extract vendor name from the URL (first matching pattern wins); returns "Unknown" when none match */
+function extractVendor(url: string): string {
+  const urlLower = url.toLowerCase();
+  const vendorPatterns: Array<[RegExp, string]> = [
+    [/flexoptix/i, "Flexoptix"],
+    [/cisco/i, "Cisco"],
+    [/juniper/i, "Juniper"],
+    [/arista/i, "Arista"],
+    [/nokia/i, "Nokia"],
+    [/huawei/i, "Huawei"],
+    [/finisar|ii-vi|coherent/i, "II-VI/Coherent"],
+    [/innolight/i, "Innolight"],
+    [/broadcom/i, "Broadcom"],
+    [/intel/i, "Intel"],
+    [/fs\.com|fiberstore/i, "FS.com"],
+    [/10gtek/i, "10Gtek"],
+  ];
+
+  for (const [pattern, name] of vendorPatterns) {
+    if (pattern.test(urlLower)) return name;
+  }
+  return "Unknown";
+}
+
+/** Extract product slug from URL: last path segment, known extension stripped, non [a-zA-Z0-9-] chars → "-", lower-cased */
+function extractProductSlug(url: string): string {
+  const filename = url.split("/").pop() || "";
+  return filename.replace(/\.(pdf|docx|doc|xlsx)$/i, "").replace(/[^a-zA-Z0-9-]/g, "-").toLowerCase();
+}
+
+/** Process a single document: convert → chunk → embed → store; throws when Docling reports failure */
+async function processDocument(
+  url: string,
+  collection: CollectionName = "datasheet_chunks",
+  title?: string,
+): Promise<{ documentId: string; chunksStored: number }> {
+  const documentId = randomUUID();
+
+  console.log(`  Converting: ${url}`);
+  const result = await convertDocument(url);
+
+  if (!result.success) {
+    throw new Error(`Conversion failed: ${result.error}`);
+  }
+
+  const markdown = result.content;
+  console.log(`  Converted: ${result.pages ?? "?"} pages, ${markdown.length} chars`);
+
+  const docType = classifyDocument(url, markdown);
+  const vendor = extractVendor(url);
+  const productSlug = extractProductSlug(url);
+  const docTitle = title || productSlug.replace(/-/g, " ");
+
+  // Chunk the markdown
+  const chunks = chunkMarkdown(markdown);
+  console.log(`  Chunked: ${chunks.length} chunks (type: ${docType})`);
+
+  if (chunks.length === 0) {
+    console.log("  Warning: No chunks produced, skipping");
+    return { documentId, chunksStored: 0 };
+  }
+
+  // Embed and store in batches (chunks within a batch are embedded concurrently)
+  const BATCH_SIZE = 5;
+  let stored = 0;
+
+  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
+    const batch = chunks.slice(i, i + BATCH_SIZE);
+
+    const points: DocumentChunk[] = await Promise.all(
+      batch.map(async (chunk, idx) => {
+        const chunkIndex = i + idx;
+        const embeddingText = `${docTitle}. ${chunk.heading}. ${chunk.text}`;
+        const vector = await embed(embeddingText);
+
+        return {
+          id: randomUUID(),
+          vector,
+          payload: {
+            document_id: documentId,
+            source_url: url,
+            document_type: docType,
+            chunk_index: chunkIndex,
+            total_chunks: chunks.length,
+            title: docTitle,
+            section_heading: chunk.heading,
+            text: chunk.text,
+            page_estimate: result.pages,
+            vendor,
+            product_slug: productSlug,
+          },
+        };
+      }),
+    );
+
+    await upsertPoints(collection, points);
+    stored += points.length;
+    console.log(`  Embedded ${stored}/${chunks.length} chunks`);
+  }
+
+  // Record in documents table (best-effort: the vectors are already stored at this point)
+  try {
+    await pool.query(
+      `INSERT INTO documents (id, entity_type, doc_type, title, r2_key, source_url, page_count, chunks_count, ocr_status, processed_at)
+       VALUES ($1, 'transceiver', $2, $3, $4, $5, $6, $7, 'completed', NOW())
+       ON CONFLICT ON CONSTRAINT documents_pkey DO UPDATE
+       SET processed_at = NOW(), chunks_count = $7, ocr_status = 'completed'`,
+      [documentId, docType, docTitle, `ocr/${documentId}`, url, result.pages, chunks.length],
+    );
+  } catch (err) {
+    console.warn(`  Warning: could not record document ${documentId} in documents table: ${(err as Error).message}`);
+  }
+
+  return { documentId, chunksStored: stored };
+}
+
+/** Known datasheet URLs to seed from */
+const SEED_DATASHEETS: Array<{ url: string; title: string; collection: CollectionName }> = [
+  // Flexoptix product guides
+  {
+    url: "https://www.flexoptix.net/media/pdf/flexoptix-sfp-compatibility-guide.pdf",
+    title: "Flexoptix SFP Compatibility Guide",
+    collection: "datasheet_chunks",
+  },
+  // IEEE standards (publicly available)
+  {
+    url: "https://standards.ieee.org/content/dam/ieee-standards/standards/web/download/802.3-2022_downloads/802.3-2022.pdf",
+    title: "IEEE 802.3 Ethernet Standard",
+    collection: "manual_chunks",
+  },
+];
+
+async function main() {
+  const args = process.argv.slice(2);
+
+  console.log("=== OCR Pipeline: Document → Chunks → Embeddings ===\n");
+
+  // Check Docling health
+  try {
+    const healthResp = await fetch(`${DOCLING_URL}/health`, { signal: AbortSignal.timeout(5000) });
+    const health = await healthResp.json() as { status: string };
+    console.log(`Docling API: ${health.status} at ${DOCLING_URL}`);
+  } catch (err) {
+    console.error(`Docling API not reachable at ${DOCLING_URL}: ${(err as Error).message}`);
+    console.error("Set DOCLING_URL env var or start Docling on Erik (port 8100)");
+    process.exit(1);
+  }
+
+  let totalDocs = 0;
+  let totalChunks = 0;
+
+  if (args.includes("--url")) {
+    // Process a single URL
+    const urlIdx = args.indexOf("--url") + 1;
+    const url = args[urlIdx];
+    const title = args.includes("--title") ? args[args.indexOf("--title") + 1] : undefined;
+    const collection = (args.includes("--collection") ? args[args.indexOf("--collection") + 1] : "datasheet_chunks") as CollectionName;
+
+    if (!url) {
+      console.error("Usage: --url <url> [--title <title>] [--collection <name>]");
+      process.exit(1);
+    }
+
+    const result = await processDocument(url, collection, title);
+    totalDocs = 1;
+    totalChunks = result.chunksStored;
+  } else if (args.includes("--dir")) {
+    // Process all PDFs in a directory. NOTE(review): the local path is sent to Docling as `url` — confirm Docling can read it.
+    const dirIdx = args.indexOf("--dir") + 1;
+    const dir = args[dirIdx];
+    const { readdirSync } = await import("fs");
+    const files = readdirSync(dir).filter((f) => f.toLowerCase().endsWith(".pdf"));
+
+    console.log(`Found ${files.length} PDFs in ${dir}\n`);
+
+    for (const file of files) {
+      const filePath = `${dir}/${file}`;
+      try {
+        const result = await processDocument(filePath, "datasheet_chunks");
+        totalDocs++;
+        totalChunks += result.chunksStored;
+      } catch (err) {
+        console.error(`  Failed: ${file} — ${(err as Error).message}`);
+      }
+    }
+  } else {
+    // Seed from known URLs
+    console.log(`Processing ${SEED_DATASHEETS.length} seed documents\n`);
+
+    for (const doc of SEED_DATASHEETS) {
+      try {
+        console.log(`\n[${doc.title}]`);
+        const result = await processDocument(doc.url, doc.collection, doc.title);
+        totalDocs++;
+        totalChunks += result.chunksStored;
+      } catch (err) {
+        console.error(`  Failed: ${doc.title} — ${(err as Error).message}`);
+      }
+    }
+  }
+
+  console.log(`\n=== Done: ${totalDocs} documents, ${totalChunks} chunks embedded ===`);
+  await pool.end();
+}
+
+main().catch((err) => {
+  console.error("Fatal:", err);
+  pool.end();
+  process.exit(1);
+});
diff --git a/packages/api/src/embeddings/seed-news.ts b/packages/api/src/embeddings/seed-news.ts
new file mode 100644
index 0000000..4ca25d4
--- /dev/null
+++ b/packages/api/src/embeddings/seed-news.ts
@@ -0,0 +1,80 @@
+/**
+ * Seed news_embeddings collection from PostgreSQL news_articles table.
+ *
+ * Run: npx tsx packages/api/src/embeddings/seed-news.ts
+ */
+import { pool } from "../db/client";
+import { embed, upsertPoints } from "./client";
+
+function articleToText(row: Record<string, unknown>): string {
+  const parts = [
+    row.title && `${row.title}`,
+    row.source && `Source: ${row.source}`,
+    row.category && `Category: ${row.category}`,
+    row.summary && `${row.summary}`,
+    row.full_text && `${String(row.full_text).slice(0, 500)}`,
+  ].filter(Boolean);
+
+  return parts.join(". ");
+}
+
+async function main() {
+  console.log("=== Seeding news_embeddings ===\n");
+
+  const result = await pool.query(
+    `SELECT id, title, source_url, source, category, summary, full_text, published_at, relevance_score
+     FROM news_articles
+     ORDER BY published_at DESC
+     LIMIT 500`,
+  );
+
+  console.log(`Found ${result.rows.length} news articles to embed\n`);
+
+  if (result.rows.length === 0) {
+    console.log("No articles found. Run the news scraper first.");
+    await pool.end();
+    return;
+  }
+
+  const BATCH_SIZE = 10;
+  let total = 0;
+
+  for (let i = 0; i < result.rows.length; i += BATCH_SIZE) {
+    const batch = result.rows.slice(i, i + BATCH_SIZE);
+
+    const points = await Promise.all(
+      batch.map(async (row) => {
+        const text = articleToText(row);
+        const vector = await embed(text);
+
+        return {
+          id: String(row.id),
+          vector,
+          payload: {
+            title: row.title || "",
+            url: row.source_url || "",
+            source: row.source || "",
+            category: row.category || "",
+            summary: row.summary || "",
+            published_at: row.published_at ? new Date(row.published_at).toISOString() : "",
+            relevance_score: row.relevance_score || 0,
+            text,
+          },
+        };
+      }),
+    );
+
+    await upsertPoints("news_embeddings", points);
+    total += points.length;
+    console.log(`  Embedded ${total}/${result.rows.length} articles`);
+  }
+
+  console.log(`\n=== Done: ${total} articles embedded ===`);
+  await pool.end();
+}
+
+main().catch((err) => {
+  console.error("Fatal:", err);
+  pool.end();
+  process.exit(1);
+});
diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts
index e7309ca..77838b5 100644
--- a/packages/api/src/index.ts
+++ b/packages/api/src/index.ts
@@ -10,6 +10,7 @@ import { standardRouter } from "./routes/standards";
 import { healthRouter } from "./routes/health";
 import { hypeCycleRouter } from "./routes/hype-cycle";
 import { searchRouter } from "./routes/search";
+import { documentRouter } from "./routes/documents";
 
 const app = express();
 
@@ -34,6 +35,7 @@ app.use("/api/standards", standardRouter);
 app.use("/api/health", healthRouter);
 app.use("/api/hype-cycle", hypeCycleRouter);
 app.use("/api/search", searchRouter);
+app.use("/api/documents", documentRouter);
 
 // Root
 app.get("/", (_req, res) => {
@@ -53,7 +55,12 @@ app.get("/", (_req, res) => {
       "GET /api/hype-cycle/:tech",
       "GET /api/search?q=&collection=&limit=",
       "GET /api/search/products?q=&form_factor=&speed_gbps=&fiber_type=",
+      "GET /api/search/documents?q=&doc_type=&vendor=&collection=",
+      "GET /api/search/news?q=&source=",
       "GET /api/search/stats",
+      "POST /api/documents/process {url, title?, doc_type?, vendor?, collection?}",
+      "GET /api/documents",
+      "GET /api/documents/:id",
     ],
   });
 });
diff --git a/packages/api/src/routes/documents.ts b/packages/api/src/routes/documents.ts
new file mode 100644
index 0000000..54c5aef
--- /dev/null
+++ b/packages/api/src/routes/documents.ts
@@ -0,0 +1,217 @@
+/**
+ * Document processing API routes (OCR Pipeline)
+ *
+ * POST /api/documents/process — Submit a document URL for OCR + embedding
+ * GET  /api/documents — List processed documents
+ * GET  /api/documents/:id — Get document chunks
+ */
+import { Router, Request, Response } from "express";
+import { embed, upsertPoints, CollectionName } from "../embeddings/client";
+import { pool } from "../db/client";
+import { randomUUID } from "crypto";
+
+export const documentRouter = Router();
+
+const DOCLING_URL = process.env.DOCLING_URL || "http://localhost:8100";
+
+interface DoclingResult {
+  success: boolean;
+  content: string;
+  format: string;
+  pages: number | null;
+  error?: string;
+}
+
+/** Chunk markdown into overlapping sections: split on #/##/### headings, then by paragraph when over maxChunkSize */
+function chunkMarkdown(
+  markdown: string,
+  maxChunkSize: number = 1500,
+  overlapSize: number = 200,
+): Array<{ heading: string; text: string }> {
+  const sections = markdown.split(/(?=^#{1,3}\s)/m);
+  const chunks: Array<{ heading: string; text: string }> = [];
+
+  for (const section of sections) {
+    const trimmed = section.trim();
+    if (!trimmed || trimmed.length < 20) continue;
+
+    const headingMatch = trimmed.match(/^(#{1,3})\s+(.+)/);
+    const heading = headingMatch ? headingMatch[2].trim() : "Introduction";
+    const body = headingMatch ? trimmed.slice(headingMatch[0].length).trim() : trimmed;
+
+    if (body.length <= maxChunkSize) {
+      if (body) chunks.push({ heading, text: body }); // skip heading-only sections (empty text)
+    } else {
+      const paragraphs = body.split(/\n\n+/);
+      let currentChunk = "";
+
+      for (const para of paragraphs) {
+        if (currentChunk.length + para.length > maxChunkSize && currentChunk.length > 0) {
+          chunks.push({ heading, text: currentChunk.trim() });
+          const overlapText = overlapSize > 0 ? currentChunk.slice(-overlapSize) : ""; // slice(-0) would keep everything
+          currentChunk = overlapText + "\n\n" + para;
+        } else {
+          currentChunk += (currentChunk ? "\n\n" : "") + para;
+        }
+      }
+
+      if (currentChunk.trim().length > 20) {
+        chunks.push({ heading, text: currentChunk.trim() });
+      }
+    }
+  }
+
+  return chunks;
+}
+
+// POST /api/documents/process — Process a document URL (convert → chunk → embed → store)
+documentRouter.post("/process", async (req: Request, res: Response) => {
+  const { url, title, doc_type, vendor, collection } = req.body as {
+    url?: string;
+    title?: string;
+    doc_type?: string;
+    vendor?: string;
+    collection?: string;
+  };
+
+  if (typeof url !== "string" || !url) {
+    res.status(400).json({ success: false, error: "Missing 'url' in request body" });
+    return;
+  }
+
+  const targetCollection = (collection || "datasheet_chunks") as CollectionName;
+  if (!["datasheet_chunks", "manual_chunks"].includes(targetCollection)) {
+    res.status(400).json({ success: false, error: "collection must be 'datasheet_chunks' or 'manual_chunks'" });
+    return;
+  }
+
+  try {
+    // Convert via Docling
+    const docResp = await fetch(`${DOCLING_URL}/convert`, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({ url, format: "markdown" }),
+      signal: AbortSignal.timeout(120000),
+    });
+
+    if (!docResp.ok) {
+      res.status(502).json({ success: false, error: "Docling conversion failed", detail: await docResp.text() });
+      return;
+    }
+
+    const docResult = (await docResp.json()) as DoclingResult;
+    if (!docResult.success) {
+      res.status(502).json({ success: false, error: "Docling conversion failed", detail: docResult.error });
+      return;
+    }
+
+    const documentId = randomUUID();
+    const docTitle = title || url.split("/").pop()?.replace(/\.[^.]+$/, "") || "untitled";
+    const docType = String(doc_type || "datasheet").toLowerCase(); // /api/search/documents filters on the lower-cased value
+    const docVendor = vendor || "Unknown";
+
+    // Chunk
+    const chunks = chunkMarkdown(docResult.content);
+
+    // Embed and store
+    const BATCH_SIZE = 5;
+    let stored = 0;
+
+    for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
+      const batch = chunks.slice(i, i + BATCH_SIZE);
+      const points = await Promise.all(
+        batch.map(async (chunk, idx) => {
+          const chunkIndex = i + idx;
+          const embeddingText = `${docTitle}. ${chunk.heading}. ${chunk.text}`;
+          const vector = await embed(embeddingText);
+          return {
+            id: randomUUID(),
+            vector,
+            payload: {
+              document_id: documentId,
+              source_url: url,
+              document_type: docType,
+              chunk_index: chunkIndex,
+              total_chunks: chunks.length,
+              title: docTitle,
+              section_heading: chunk.heading,
+              text: chunk.text,
+              page_estimate: docResult.pages,
+              vendor: docVendor,
+              product_slug: docTitle.replace(/\s+/g, "-").toLowerCase(),
+            },
+          };
+        }),
+      );
+
+      await upsertPoints(targetCollection, points);
+      stored += points.length;
+    }
+
+    // Record in documents table (existing schema); best-effort, the vectors are already stored
+    try {
+      await pool.query(
+        `INSERT INTO documents (id, entity_type, doc_type, title, r2_key, source_url, page_count, chunks_count, ocr_status, ocr_text, processed_at)
+         VALUES ($1, 'transceiver', $2, $3, $4, $5, $6, $7, 'completed', $8, NOW())
+         ON CONFLICT ON CONSTRAINT documents_pkey DO UPDATE
+         SET processed_at = NOW(), chunks_count = $7, ocr_status = 'completed'`,
+        [documentId, docType, docTitle, `ocr/${documentId}`, url, docResult.pages, chunks.length, docResult.content.slice(0, 50000)],
+      );
+    } catch (err) {
+      console.error(`Could not record document ${documentId} in documents table: ${(err as Error).message}`);
+    }
+
+    res.json({
+      success: true,
+      document_id: documentId,
+      title: docTitle,
+      pages: docResult.pages,
+      chunks: chunks.length,
+      collection: targetCollection,
+      markdown_length: docResult.content.length,
+    });
+  } catch (err) {
+    res.status(503).json({
+      success: false,
+      error: "Document processing failed",
+      detail: (err as Error).message,
+    });
+  }
+});
+
+// GET /api/documents — List processed documents (newest first, max 100)
+documentRouter.get("/", async (_req: Request, res: Response) => {
+  try {
+    const result = await pool.query(
+      `SELECT id, title, source_url, doc_type, entity_type, page_count, chunks_count, ocr_status, processed_at, created_at
+       FROM documents
+       ORDER BY COALESCE(processed_at, created_at) DESC
+       LIMIT 100`,
+    );
+
+    res.json({ success: true, documents: result.rows, count: result.rows.length });
+  } catch {
+    // Table may not exist — return empty
+    res.json({ success: true, documents: [], count: 0, note: "documents table not yet created" });
+  }
+});
+
+// GET /api/documents/:id — Get document details
+documentRouter.get("/:id", async (req: Request, res: Response) => {
+  try {
+    const result = await pool.query(
+      `SELECT id, title, source_url, doc_type, entity_type, page_count, chunks_count, ocr_status, processed_at, created_at
+       FROM documents WHERE id = $1::uuid`,
+      [req.params.id],
+    );
+
+    if (result.rows.length === 0) {
+      res.status(404).json({ success: false, error: "Document not found" });
+      return;
+    }
+
+    res.json({ success: true, document: result.rows[0] });
+  } catch {
+    res.status(404).json({ success: false, error: "Document not found or table not created" });
+  }
+});
diff --git a/packages/api/src/routes/search.ts b/packages/api/src/routes/search.ts
index 7107a6b..3a1caf6 100644
--- a/packages/api/src/routes/search.ts
+++ b/packages/api/src/routes/search.ts
@@ -3,6 +3,8 @@
  *
  * GET /api/search?q=<query>&collection=<col>&limit=<n>
  * GET /api/search/products?q=<query>&form_factor=&speed_gbps=&fiber_type=
+ * GET /api/search/documents?q=<query>&doc_type=&vendor=
+ * GET /api/search/news?q=<query>&source=
  */
 import { Router, Request, Response } from "express";
 import { semanticSearch, getCollectionInfo, CollectionName } from "../embeddings/client";
@@ -123,6 +125,122 @@ searchRouter.get("/products", async (req: Request, res: Response) => {
   }
 });
 
+// GET /api/search/documents — Search datasheets and manuals
+searchRouter.get("/documents", async (req: Request, res: Response) => {
+  const query = q("q", req);
+  const limit = Math.max(1, parseInt(q("limit", req) || "10", 10) || 10); // NaN/0 → 10, negatives → 1
+  const docType = q("doc_type", req);
+  const vendor = q("vendor", req);
+  const collection = (q("collection", req) || "datasheet_chunks") as CollectionName;
+
+  if (!query) {
+    res.status(400).json({ success: false, error: "Missing 'q' parameter" });
+    return;
+  }
+
+  if (!["datasheet_chunks", "manual_chunks"].includes(collection)) {
+    res.status(400).json({
+      success: false,
+      error: "collection must be 'datasheet_chunks' or 'manual_chunks'",
+    });
+    return;
+  }
+
+  const mustConditions: Array<Record<string, unknown>> = [];
+  if (docType) {
+    mustConditions.push({ key: "document_type", match: { value: docType.toLowerCase() } });
+  }
+  if (vendor) {
+    mustConditions.push({ key: "vendor", match: { value: vendor } });
+  }
+
+  const filter = mustConditions.length > 0 ? { must: mustConditions } : undefined;
+
+  try {
+    const results = await semanticSearch(collection, query, limit, filter);
+
+    // Group by document for cleaner output
+    const byDocument = new Map<string, { title: string; vendor: string; source_url: string; chunks: Array<{ score: number; heading: string; text: string; chunk_index: number }> }>();
+
+    for (const r of results) {
+      const docId = String(r.payload.document_id || r.id);
+      if (!byDocument.has(docId)) {
+        byDocument.set(docId, {
+          title: String(r.payload.title || ""),
+          vendor: String(r.payload.vendor || ""),
+          source_url: String(r.payload.source_url || ""),
+          chunks: [],
+        });
+      }
+      byDocument.get(docId)!.chunks.push({
+        score: Math.round(r.score * 1000) / 1000,
+        heading: String(r.payload.section_heading || ""),
+        text: String(r.payload.text || "").slice(0, 500),
+        chunk_index: Number(r.payload.chunk_index || 0),
+      });
+    }
+
+    res.json({
+      success: true,
+      query,
+      collection,
+      filters: { docType, vendor },
+      documents: Array.from(byDocument.values()),
+      totalChunks: results.length,
+    });
+  } catch (err) {
+    res.status(503).json({
+      success: false,
+      error: "Vector search unavailable",
+      detail: (err as Error).message,
+    });
+  }
+});
+
+// GET /api/search/news — Search news articles
+searchRouter.get("/news", async (req: Request, res: Response) => {
+  const query = q("q", req);
+  const limit = Math.max(1, parseInt(q("limit", req) || "10", 10) || 10); // NaN/0 → 10, negatives → 1
+  const source = q("source", req);
+
+  if (!query) {
+    res.status(400).json({ success: false, error: "Missing 'q' parameter" });
+    return;
+  }
+
+  const mustConditions: Array<Record<string, unknown>> = [];
+  if (source) {
+    mustConditions.push({ key: "source", match: { value: source } });
+  }
+
+  const filter = mustConditions.length > 0 ? { must: mustConditions } : undefined;
+
+  try {
+    const results = await semanticSearch("news_embeddings", query, limit, filter);
+    res.json({
+      success: true,
+      query,
+      filters: { source },
+      results: results.map((r) => ({
+        id: r.id,
+        score: Math.round(r.score * 1000) / 1000,
+        title: r.payload.title,
+        url: r.payload.url,
+        source: r.payload.source,
+        summary: r.payload.summary,
+        published_at: r.payload.published_at,
+      })),
+      count: results.length,
+    });
+  } catch (err) {
+    res.status(503).json({
+      success: false,
+      error: "Vector search unavailable",
+      detail: (err as Error).message,
+    });
+  }
+});
+
 // GET /api/search/stats — Collection statistics
 searchRouter.get("/stats", async (_req: Request, res: Response) => {
   try {
diff --git a/sql/005-documents.sql b/sql/005-documents.sql
new file mode 100644
index 0000000..b80c066
--- /dev/null
+++ b/sql/005-documents.sql
@@ -0,0 +1,3 @@
+-- Add OCR pipeline columns to existing documents table
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS chunks_count INT DEFAULT 0;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS processed_at TIMESTAMPTZ;