/**
 * Document processing API routes (OCR Pipeline)
 *
 * POST /api/documents/process — Submit a document URL for OCR + embedding
 * GET  /api/documents         — List processed documents
 * GET  /api/documents/:id     — Get a single document's details
 */
import { Router, Request, Response } from "express";
import { embed, upsertPoints, CollectionName } from "../embeddings/client";
import { pool } from "../db/client";
import { randomUUID } from "crypto";

export const documentRouter = Router();

const DOCLING_URL = process.env.DOCLING_URL || "http://localhost:8100";

/** Upper bound on how long we wait for the Docling conversion request. */
const DOCLING_TIMEOUT_MS = 120000;

/** Number of chunks embedded concurrently and upserted per batch. */
const EMBED_BATCH_SIZE = 5;

/** Maximum number of characters of converted markdown stored in `ocr_text`. */
const MAX_STORED_OCR_CHARS = 50000;

/** Sections / trailing chunks at or below this length are treated as noise. */
const MIN_CHUNK_LENGTH = 20;

/** Collections a client is allowed to target through this API. */
const ALLOWED_COLLECTIONS: readonly string[] = ["datasheet_chunks", "manual_chunks"];

// SQLSTATE codes used to tell "expected" query failures from real outages.
// NOTE(review): assumes the driver behind `pool` exposes the SQLSTATE as a
// string `code` property on thrown errors — confirm against ../db/client.
const PG_UNDEFINED_TABLE = "42P01";
const PG_INVALID_TEXT_REPRESENTATION = "22P02";

interface DoclingResult {
  success: boolean;
  content: string;
  format: string;
  pages: number | null;
  error?: string;
}

/** Extract a human-readable message from an arbitrary thrown value. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Return the string `code` property of a thrown value, if it has one. */
function pgErrorCode(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null || !("code" in err)) {
    return undefined;
  }
  const code = (err as { code?: unknown }).code;
  return typeof code === "string" ? code : undefined;
}

/**
 * Chunk markdown into overlapping sections.
 *
 * The input is split at level 1–3 headings. A section whose body fits in
 * `maxChunkSize` becomes one chunk; a longer body is packed paragraph by
 * paragraph, and each new chunk starts with the last `overlapSize` characters
 * of the previous one. The size limit is soft: a single paragraph longer than
 * `maxChunkSize` is kept whole, and the overlap prefix is not counted.
 *
 * @param markdown     Markdown text to split.
 * @param maxChunkSize Soft maximum chunk length in characters.
 * @param overlapSize  Characters of the previous chunk repeated at the start
 *                     of the next one; `0` (or less) disables overlap.
 * @returns Chunks in document order, each with its section heading
 *          ("Introduction" for text that has no heading).
 */
function chunkMarkdown(
  markdown: string,
  maxChunkSize: number = 1500,
  overlapSize: number = 200,
): Array<{ heading: string; text: string }> {
  const chunks: Array<{ heading: string; text: string }> = [];

  // The lookahead keeps each heading attached to the section it introduces.
  for (const section of markdown.split(/(?=^#{1,3}\s)/m)) {
    const trimmed = section.trim();
    if (trimmed.length < MIN_CHUNK_LENGTH) continue;

    const headingMatch = trimmed.match(/^(#{1,3})\s+(.+)/);
    const heading = headingMatch?.[2]?.trim() ?? "Introduction";
    const body = headingMatch ? trimmed.slice(headingMatch[0].length).trim() : trimmed;

    // A heading with nothing under it carries no content worth embedding.
    if (body.length === 0) continue;

    if (body.length <= maxChunkSize) {
      chunks.push({ heading, text: body });
      continue;
    }

    let current = "";
    for (const para of body.split(/\n\n+/)) {
      const wouldOverflow = current.length > 0 && current.length + para.length > maxChunkSize;
      if (!wouldOverflow) {
        current += (current ? "\n\n" : "") + para;
        continue;
      }

      chunks.push({ heading, text: current.trim() });
      // `slice(-0)` would return the whole string, so zero overlap must be
      // handled explicitly instead of being passed to slice().
      const overlap = overlapSize > 0 ? current.slice(-overlapSize) : "";
      current = overlap ? `${overlap}\n\n${para}` : para;
    }

    if (current.trim().length > MIN_CHUNK_LENGTH) {
      chunks.push({ heading, text: current.trim() });
    }
  }

  return chunks;
}

// POST /api/documents/process — Process a document URL
//
// Converts the document at `url` to markdown via Docling, chunks it, embeds
// each chunk, upserts the vectors into the requested collection, and records
// the document in the `documents` table (best effort).
documentRouter.post("/process", async (req: Request, res: Response) => {
  // req.body is undefined when no body parser ran or no body was sent;
  // destructuring it directly would throw outside the try block below.
  const { url, title, doc_type, vendor, collection } = (req.body ?? {}) as {
    url?: string;
    title?: string;
    doc_type?: string;
    vendor?: string;
    collection?: string;
  };

  if (typeof url !== "string" || !url) {
    res.status(400).json({ success: false, error: "Missing 'url' in request body" });
    return;
  }

  // Validate first, then narrow: the cast is only sound after the check.
  const requestedCollection = collection || "datasheet_chunks";
  if (!ALLOWED_COLLECTIONS.includes(requestedCollection)) {
    res.status(400).json({
      success: false,
      error: "collection must be 'datasheet_chunks' or 'manual_chunks'",
    });
    return;
  }
  const targetCollection = requestedCollection as CollectionName;

  try {
    // Convert via Docling
    const docResp = await fetch(`${DOCLING_URL}/convert`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ url, format: "markdown" }),
      signal: AbortSignal.timeout(DOCLING_TIMEOUT_MS),
    });

    if (!docResp.ok) {
      res.status(502).json({
        success: false,
        error: "Docling conversion failed",
        detail: await docResp.text(),
      });
      return;
    }

    const docResult = (await docResp.json()) as DoclingResult;
    // The JSON is unvalidated external input: also reject a "successful"
    // response that carries no markdown to chunk.
    if (!docResult.success || typeof docResult.content !== "string") {
      res.status(502).json({
        success: false,
        error: "Docling conversion failed",
        detail: docResult.error,
      });
      return;
    }

    const documentId = randomUUID();
    // Default title: last URL path segment without its file extension.
    const docTitle = title || url.split("/").pop()?.replace(/\.[^.]+$/, "") || "untitled";
    const docType = doc_type || "datasheet";
    const docVendor = vendor || "Unknown";
    const productSlug = docTitle.replace(/\s+/g, "-").toLowerCase();

    // Chunk
    const chunks = chunkMarkdown(docResult.content);

    // Embed and store, one small batch at a time
    for (let i = 0; i < chunks.length; i += EMBED_BATCH_SIZE) {
      const batch = chunks.slice(i, i + EMBED_BATCH_SIZE);
      const points = await Promise.all(
        batch.map(async (chunk, idx) => {
          // Title and heading are prepended so the vector carries document context.
          const embeddingText = `${docTitle}. ${chunk.heading}. ${chunk.text}`;
          const vector = await embed(embeddingText);
          return {
            id: randomUUID(),
            vector,
            payload: {
              document_id: documentId,
              source_url: url,
              document_type: docType,
              chunk_index: i + idx,
              total_chunks: chunks.length,
              title: docTitle,
              section_heading: chunk.heading,
              text: chunk.text,
              page_estimate: docResult.pages,
              vendor: docVendor,
              product_slug: productSlug,
            },
          };
        }),
      );
      await upsertPoints(targetCollection, points);
    }

    // Record in documents table (existing schema). This is deliberately best
    // effort — the vectors are already stored — but the failure is logged so
    // a missing row can be diagnosed.
    try {
      await pool.query(
        `INSERT INTO documents
           (id, entity_type, doc_type, title, r2_key, source_url, page_count,
            chunks_count, ocr_status, ocr_text, processed_at)
         VALUES ($1, 'transceiver', $2, $3, $4, $5, $6, $7, 'completed', $8, NOW())
         ON CONFLICT ON CONSTRAINT documents_pkey
         DO UPDATE SET processed_at = NOW(), chunks_count = $7, ocr_status = 'completed'`,
        [
          documentId,
          docType,
          docTitle,
          `ocr/${documentId}`,
          url,
          docResult.pages,
          chunks.length,
          docResult.content.slice(0, MAX_STORED_OCR_CHARS),
        ],
      );
    } catch (dbErr) {
      console.warn(
        `documents: failed to record document ${documentId} in documents table: ${errorMessage(dbErr)}`,
      );
    }

    res.json({
      success: true,
      document_id: documentId,
      title: docTitle,
      pages: docResult.pages,
      chunks: chunks.length,
      collection: targetCollection,
      markdown_length: docResult.content.length,
    });
  } catch (err) {
    res.status(503).json({
      success: false,
      error: "Document processing failed",
      detail: errorMessage(err),
    });
  }
});

// GET /api/documents — List processed documents (most recent first, max 100)
documentRouter.get("/", async (_req: Request, res: Response) => {
  try {
    const result = await pool.query(
      `SELECT id, title, source_url, doc_type, entity_type, page_count,
              chunks_count, ocr_status, processed_at, created_at
       FROM documents
       ORDER BY COALESCE(processed_at, created_at) DESC
       LIMIT 100`,
    );
    res.json({ success: true, documents: result.rows, count: result.rows.length });
  } catch (err) {
    // Only a missing table is an expected "nothing yet" state; anything else
    // (connection loss, permissions, ...) must not be reported as success.
    if (pgErrorCode(err) === PG_UNDEFINED_TABLE) {
      res.json({
        success: true,
        documents: [],
        count: 0,
        note: "documents table not yet created",
      });
      return;
    }
    console.error(`documents: failed to list documents: ${errorMessage(err)}`);
    res.status(500).json({
      success: false,
      error: "Failed to list documents",
      detail: errorMessage(err),
    });
  }
});

// GET /api/documents/:id — Get document details
documentRouter.get("/:id", async (req: Request, res: Response) => {
  try {
    const result = await pool.query(
      `SELECT id, title, source_url, doc_type, entity_type, page_count,
              chunks_count, ocr_status, processed_at, created_at
       FROM documents
       WHERE id = $1::uuid`,
      [req.params.id],
    );

    if (result.rows.length === 0) {
      res.status(404).json({ success: false, error: "Document not found" });
      return;
    }

    res.json({ success: true, document: result.rows[0] });
  } catch (err) {
    // A malformed UUID or a missing table both mean "no such document";
    // other failures are server-side problems and are reported as such.
    const code = pgErrorCode(err);
    if (code === PG_INVALID_TEXT_REPRESENTATION || code === PG_UNDEFINED_TABLE) {
      res.status(404).json({ success: false, error: "Document not found or table not created" });
      return;
    }
    console.error(`documents: failed to fetch document ${req.params.id}: ${errorMessage(err)}`);
    res.status(500).json({
      success: false,
      error: "Failed to fetch document",
      detail: errorMessage(err),
    });
  }
});