feat: Phase 5 — OCR pipeline + document/news search
Docling-powered OCR pipeline: PDF → markdown → chunks → Ollama embed → Qdrant.
News embedding seeder for the news_embeddings collection.
Document and news semantic search API endpoints.

- embeddings/ocr-pipeline.ts: Docling convert → chunk → embed pipeline
- embeddings/seed-news.ts: Batch embed news_articles into Qdrant
- routes/documents.ts: POST /api/documents/process, GET /api/documents
- routes/search.ts: GET /api/search/documents, GET /api/search/news endpoints
- sql/005-documents.sql: Add chunks_count, processed_at to documents table
- Ollama + nomic-embed-text installed on Erik (CPU mode)
- 89 products + 40 datasheet chunks + 33 news articles in Qdrant
This commit is contained in:
parent
0260d0b365
commit
122ca8444d
336
packages/api/src/embeddings/ocr-pipeline.ts
Normal file
336
packages/api/src/embeddings/ocr-pipeline.ts
Normal file
@ -0,0 +1,336 @@
|
||||
/**
 * OCR Pipeline: PDF/document → Docling → chunks → Ollama embed → Qdrant
 *
 * Connects to the Docling REST API (DOCLING_URL, default http://localhost:8100;
 * port 8100 on Erik) for document conversion, then chunks the markdown output
 * and embeds the chunks into Qdrant collections.
 *
 * Collections:
 * - datasheet_chunks: Product datasheets (specs, diagrams, compliance)
 * - manual_chunks: Installation/configuration manuals
 *
 * Run:
 *   npx tsx packages/api/src/embeddings/ocr-pipeline.ts --url <pdf_url> [--title <title>] [--collection <name>]
 *   npx tsx packages/api/src/embeddings/ocr-pipeline.ts --dir <path>
 *   npx tsx packages/api/src/embeddings/ocr-pipeline.ts
 *     (no flags: processes the built-in SEED_DATASHEETS list)
 */
|
||||
import { pool } from "../db/client";
|
||||
import { embed, upsertPoints, CollectionName } from "./client";
|
||||
import { randomUUID } from "crypto";
|
||||
|
||||
const DOCLING_URL = process.env.DOCLING_URL || "http://localhost:8100";
|
||||
|
||||
/** Shape of the JSON body this pipeline reads from the Docling `/convert` endpoint. */
interface DoclingResult {
  /** When false, the conversion is treated as failed and `error` is reported. */
  success: boolean;
  /** Converted document text; this pipeline requests and chunks it as markdown. */
  content: string;
  /** Output format reported by Docling. */
  format: string;
  /** Page count of the source document, or null when Docling does not report one. */
  pages: number | null;
  /** Failure description accompanying `success: false`. */
  error?: string;
}
|
||||
|
||||
/** One Qdrant point produced from a document chunk: id, embedding vector and metadata payload. */
interface DocumentChunk {
  /** Point id (a fresh random UUID per chunk). */
  id: string;
  /** Embedding of "<title>. <section heading>. <chunk text>". */
  vector: number[];
  payload: {
    /** Id shared by every chunk of the same processed document. */
    document_id: string;
    /** URL (or path) the document was converted from. */
    source_url: string;
    document_type: "datasheet" | "manual" | "whitepaper";
    /** Zero-based position of this chunk within the document. */
    chunk_index: number;
    /** Number of chunks the document was split into. */
    total_chunks: number;
    title: string;
    /** Heading of the markdown section the chunk came from. */
    section_heading: string;
    /** The chunk text itself. */
    text: string;
    /** Filled from the page count Docling reports for the whole document. */
    page_estimate: number | null;
    vendor: string;
    product_slug: string;
  };
}
|
||||
|
||||
/**
 * Ask the Docling service to convert the document at `url`.
 *
 * Sends `{ url, format }` as JSON to `${DOCLING_URL}/convert` and gives up after two minutes.
 *
 * @param url Location of the document to convert.
 * @param format Output format requested from Docling (defaults to markdown).
 * @returns The parsed JSON response body.
 * @throws Error carrying the HTTP status and response text when Docling answers with a non-2xx status.
 */
async function convertDocument(url: string, format: "markdown" | "json" = "markdown"): Promise<DoclingResult> {
  const endpoint = `${DOCLING_URL}/convert`;
  const payload = JSON.stringify({ url, format });

  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(120000), // large PDFs can take a while: allow 2 minutes
  });

  if (response.ok) {
    return response.json() as Promise<DoclingResult>;
  }

  const detail = await response.text();
  throw new Error(`Docling convert failed: ${response.status} ${detail}`);
}
|
||||
|
||||
/**
 * Chunk markdown into heading-scoped pieces suitable for embedding.
 *
 * Strategy:
 * 1. Split at level 1-3 headings (natural section boundaries). Text before the
 *    first heading is filed under the heading "Introduction".
 * 2. Sections whose body fits in `maxChunkSize` become a single chunk.
 * 3. Larger sections are packed paragraph by paragraph. A single paragraph that is
 *    itself larger than `maxChunkSize` (e.g. a long table) is first broken on line
 *    boundaries, and over-long lines are sliced.
 * 4. Each follow-up chunk of a large section starts with the last `overlapSize`
 *    characters of the previous chunk, so it may exceed `maxChunkSize` by up to
 *    `overlapSize + 2` characters.
 *
 * Sections shorter than 20 characters (heading included) and heading-only sections
 * are skipped.
 *
 * @param markdown Markdown text to split.
 * @param maxChunkSize Target maximum chunk length in characters.
 * @param overlapSize Characters of the previous chunk repeated at the start of the next; 0 disables overlap.
 * @returns Chunks in document order, each with its section heading.
 */
function chunkMarkdown(
  markdown: string,
  maxChunkSize: number = 1500,
  overlapSize: number = 200,
): Array<{ heading: string; text: string }> {
  const chunks: Array<{ heading: string; text: string }> = [];

  // Break one oversized paragraph into pieces of at most maxChunkSize characters,
  // preferring line boundaries and slicing only lines that are too long on their own.
  const splitOversized = (para: string): string[] => {
    if (maxChunkSize <= 0 || para.length <= maxChunkSize) return [para];

    const pieces: string[] = [];
    let piece = "";
    for (const line of para.split("\n")) {
      for (let start = 0; start < Math.max(line.length, 1); start += maxChunkSize) {
        const part = line.slice(start, start + maxChunkSize);
        if (piece.length > 0 && piece.length + 1 + part.length > maxChunkSize) {
          pieces.push(piece);
          piece = part;
        } else {
          piece = piece.length > 0 ? `${piece}\n${part}` : part;
        }
      }
    }
    if (piece.length > 0) pieces.push(piece);
    return pieces;
  };

  for (const section of markdown.split(/(?=^#{1,3}\s)/m)) {
    const trimmed = section.trim();
    if (trimmed.length < 20) continue;

    const headingMatch = trimmed.match(/^#{1,3}\s+(.+)/);
    const heading = headingMatch?.[1]?.trim() ?? "Introduction";
    const body = headingMatch ? trimmed.slice(headingMatch[0].length).trim() : trimmed;

    // A heading with nothing under it would otherwise be stored as an empty chunk.
    if (body.length === 0) continue;

    if (body.length <= maxChunkSize) {
      chunks.push({ heading, text: body });
      continue;
    }

    const pieces = body.split(/\n\n+/).flatMap(splitOversized);
    let currentChunk = "";

    for (const piece of pieces) {
      if (currentChunk.length > 0 && currentChunk.length + piece.length > maxChunkSize) {
        chunks.push({ heading, text: currentChunk.trim() });
        // slice(-0) is slice(0) and would copy the entire previous chunk, so guard zero overlap.
        const overlapText = overlapSize > 0 ? currentChunk.slice(-overlapSize) : "";
        currentChunk = overlapText ? `${overlapText}\n\n${piece}` : piece;
      } else {
        currentChunk += (currentChunk ? "\n\n" : "") + piece;
      }
    }

    // Keep the tail even when it is short; dropping it would lose document text.
    const tail = currentChunk.trim();
    if (tail.length > 0) {
      chunks.push({ heading, text: tail });
    }
  }

  return chunks;
}
|
||||
|
||||
/**
 * Classify a document as datasheet, manual or whitepaper.
 *
 * Explicit hints in the URL win over keywords found in the content, so a file such
 * as `.../user-manual.pdf` is not reclassified as a datasheet merely because its
 * first pages mention "specifications". Only the first 2000 characters of the
 * content are inspected. Matching is case-insensitive.
 *
 * @param url Source URL or path of the document.
 * @param content Converted document text.
 * @returns "datasheet" or "manual" when a hint matches, otherwise "whitepaper".
 */
function classifyDocument(url: string, content: string): "datasheet" | "manual" | "whitepaper" {
  const urlLower = url.toLowerCase();
  const contentLower = content.slice(0, 2000).toLowerCase();

  // 1. URL hints (strongest signal).
  if (urlLower.includes("datasheet")) return "datasheet";
  if (urlLower.includes("manual") || urlLower.includes("install")) return "manual";

  // 2. Content keywords.
  if (contentLower.includes("datasheet") || contentLower.includes("specifications")) {
    return "datasheet";
  }
  if (contentLower.includes("installation guide") || contentLower.includes("user manual")) {
    return "manual";
  }

  return "whitepaper";
}
|
||||
|
||||
/**
 * Guess the vendor from a document URL.
 *
 * Patterns are tried in order and the first match wins; matching is case-insensitive.
 * The short, ambiguous tokens ("intel", "fs.com") must not be embedded in a longer
 * word, so "artificial-intelligence.pdf" or "pdfs.com" are not mistaken for a vendor.
 *
 * @param url Source URL or path of the document.
 * @returns The vendor display name, or "Unknown" when no pattern matches.
 */
function extractVendor(url: string): string {
  const vendorPatterns: Array<[RegExp, string]> = [
    [/flexoptix/i, "Flexoptix"],
    [/cisco/i, "Cisco"],
    [/juniper/i, "Juniper"],
    [/arista/i, "Arista"],
    [/nokia/i, "Nokia"],
    [/huawei/i, "Huawei"],
    [/finisar|ii-vi|coherent/i, "II-VI/Coherent"],
    [/innolight/i, "Innolight"],
    [/broadcom/i, "Broadcom"],
    [/(?<![a-z])intel(?![a-z])/i, "Intel"],
    [/(?<![a-z0-9])fs\.com|fiberstore/i, "FS.com"],
    [/10gtek/i, "10Gtek"],
  ];

  for (const [pattern, name] of vendorPatterns) {
    if (pattern.test(url)) return name;
  }
  return "Unknown";
}
|
||||
|
||||
/**
 * Derive a product slug from the last path segment of a URL.
 *
 * Any query string or fragment is dropped first, so that
 * `.../SFP-10G.pdf?download=1` yields `sfp-10g` rather than `sfp-10g-pdf-download-1`.
 * A trailing pdf/docx/doc/xlsx extension is removed, every character other than
 * letters, digits and hyphens becomes a hyphen, and the result is lower-cased.
 *
 * @param url Source URL or path of the document.
 * @returns The slug; an empty string when the URL ends with a slash.
 */
function extractProductSlug(url: string): string {
  const withoutQuery = url.split(/[?#]/, 1)[0] ?? "";
  const filename = withoutQuery.split("/").pop() ?? "";
  return filename
    .replace(/\.(pdf|docx|doc|xlsx)$/i, "")
    .replace(/[^a-zA-Z0-9-]/g, "-")
    .toLowerCase();
}
|
||||
|
||||
/**
 * Process a single document: convert → chunk → embed → store.
 *
 * 1. Converts the document through Docling.
 * 2. Derives document type, vendor, product slug and title from the URL and content.
 * 3. Splits the markdown into chunks and embeds them five at a time.
 * 4. Upserts each batch into the given Qdrant collection.
 * 5. Records the document in the `documents` table. This step is best-effort: a
 *    failure is logged but does not fail the run, because the chunks are already stored.
 *
 * @param url Location of the document, forwarded to Docling.
 * @param collection Qdrant collection that receives the chunks.
 * @param title Optional title; defaults to the product slug with hyphens turned into spaces.
 * @returns The generated document id and the number of chunks stored.
 * @throws Error when Docling reports a failed conversion; errors from conversion,
 *   embedding and upserting propagate to the caller.
 */
async function processDocument(
  url: string,
  collection: CollectionName = "datasheet_chunks",
  title?: string,
): Promise<{ documentId: string; chunksStored: number }> {
  const documentId = randomUUID();

  console.log(` Converting: ${url}`);
  const result = await convertDocument(url);

  if (!result.success) {
    throw new Error(`Conversion failed: ${result.error}`);
  }

  const markdown = result.content;
  console.log(` Converted: ${result.pages ?? "?"} pages, ${markdown.length} chars`);

  const docType = classifyDocument(url, markdown);
  const vendor = extractVendor(url);
  const productSlug = extractProductSlug(url);
  const docTitle = title || productSlug.replace(/-/g, " ");

  // Chunk the markdown
  const chunks = chunkMarkdown(markdown);
  console.log(` Chunked: ${chunks.length} chunks (type: ${docType})`);

  if (chunks.length === 0) {
    console.log(" Warning: No chunks produced, skipping");
    return { documentId, chunksStored: 0 };
  }

  // Embed and store in batches; the chunks of one batch are embedded concurrently.
  const BATCH_SIZE = 5;
  let stored = 0;

  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
    const batch = chunks.slice(i, i + BATCH_SIZE);

    const points: DocumentChunk[] = await Promise.all(
      batch.map(async (chunk, idx) => {
        // Title and heading are prepended so the vector carries document context.
        const embeddingText = `${docTitle}. ${chunk.heading}. ${chunk.text}`;
        const vector = await embed(embeddingText);

        return {
          id: randomUUID(),
          vector,
          payload: {
            document_id: documentId,
            source_url: url,
            document_type: docType,
            chunk_index: i + idx,
            total_chunks: chunks.length,
            title: docTitle,
            section_heading: chunk.heading,
            text: chunk.text,
            // NOTE(review): this is the document's total page count, not a per-chunk page.
            page_estimate: result.pages,
            vendor,
            product_slug: productSlug,
          },
        };
      }),
    );

    await upsertPoints(collection, points);
    stored += points.length;
    console.log(` Embedded ${stored}/${chunks.length} chunks`);
  }

  // Record in documents table
  try {
    await pool.query(
      `INSERT INTO documents (id, entity_type, doc_type, title, r2_key, source_url, page_count, chunks_count, ocr_status, processed_at)
       VALUES ($1, 'transceiver', $2, $3, $4, $5, $6, $7, 'completed', NOW())
       ON CONFLICT ON CONSTRAINT documents_pkey DO UPDATE
       SET processed_at = NOW(), chunks_count = $7, ocr_status = 'completed'`,
      [documentId, docType, docTitle, `ocr/${documentId}`, url, result.pages, chunks.length],
    );
  } catch (err) {
    // Best-effort bookkeeping: keep going, but leave a trace instead of hiding the failure.
    console.warn(` Warning: could not record document ${documentId} in documents table: ${(err as Error).message}`);
  }

  return { documentId, chunksStored: stored };
}
|
||||
|
||||
/**
 * Built-in seed list: documents processed when the script is started
 * without `--url` or `--dir`. Each entry names its target collection.
 */
const SEED_DATASHEETS: Array<{ url: string; title: string; collection: CollectionName }> = [
  {
    // Flexoptix product guide
    title: "Flexoptix SFP Compatibility Guide",
    collection: "datasheet_chunks",
    url: "https://www.flexoptix.net/media/pdf/flexoptix-sfp-compatibility-guide.pdf",
  },
  {
    // IEEE standard (publicly available)
    title: "IEEE 802.3 Ethernet Standard",
    collection: "manual_chunks",
    url: "https://standards.ieee.org/content/dam/ieee-standards/standards/web/download/802.3-2022_downloads/802.3-2022.pdf",
  },
];
|
||||
|
||||
/**
 * CLI entry point.
 *
 * Verifies that Docling is reachable, then runs one of three modes:
 * - `--url <pdf_url> [--title <title>] [--collection <name>]`: process one document;
 * - `--dir <path>`: process every `.pdf` file in a directory into `datasheet_chunks`;
 * - no flag: process the built-in SEED_DATASHEETS list.
 *
 * In `--dir` and seed mode a failing document is logged and skipped. In `--url` mode
 * the failure propagates to the top-level handler. Exits with status 1 when Docling
 * is unreachable or a required flag value is missing.
 */
async function main() {
  const args = process.argv.slice(2);

  console.log("=== OCR Pipeline: Document → Chunks → Embeddings ===\n");

  // Check Docling health
  try {
    const healthResp = await fetch(`${DOCLING_URL}/health`, { signal: AbortSignal.timeout(5000) });
    // A reachable but failing service must not pass as healthy.
    if (!healthResp.ok) {
      throw new Error(`health check returned HTTP ${healthResp.status}`);
    }
    const health = (await healthResp.json()) as { status: string };
    console.log(`Docling API: ${health.status} at ${DOCLING_URL}`);
  } catch (err) {
    console.error(`Docling API not reachable at ${DOCLING_URL}: ${(err as Error).message}`);
    console.error("Set DOCLING_URL env var or start Docling on Erik (port 8100)");
    process.exit(1);
  }

  let totalDocs = 0;
  let totalChunks = 0;

  if (args.includes("--url")) {
    // Process a single URL
    const url = args[args.indexOf("--url") + 1];
    const title = args.includes("--title") ? args[args.indexOf("--title") + 1] : undefined;
    const collection = (args.includes("--collection") ? args[args.indexOf("--collection") + 1] : "datasheet_chunks") as CollectionName;

    if (!url) {
      console.error("Usage: --url <pdf_url> [--title <title>] [--collection <name>]");
      process.exit(1);
    }

    const result = await processDocument(url, collection, title);
    totalDocs = 1;
    totalChunks = result.chunksStored;
  } else if (args.includes("--dir")) {
    // Process all PDFs in a directory
    const dir = args[args.indexOf("--dir") + 1];

    // Without this guard readdirSync(undefined) fails with an unhelpful TypeError.
    if (!dir) {
      console.error("Usage: --dir <path>");
      process.exit(1);
    }

    const { readdirSync } = await import("fs");
    const files = readdirSync(dir).filter((f) => f.toLowerCase().endsWith(".pdf"));

    console.log(`Found ${files.length} PDFs in ${dir}\n`);

    for (const file of files) {
      // NOTE(review): this local path is forwarded to Docling as `url`; presumably the
      // Docling service must be able to read the same path — confirm.
      const filePath = `${dir}/${file}`;
      try {
        const result = await processDocument(filePath, "datasheet_chunks");
        totalDocs++;
        totalChunks += result.chunksStored;
      } catch (err) {
        console.error(` Failed: ${file} — ${(err as Error).message}`);
      }
    }
  } else {
    // Seed from known URLs
    console.log(`Processing ${SEED_DATASHEETS.length} seed documents\n`);

    for (const doc of SEED_DATASHEETS) {
      try {
        console.log(`\n[${doc.title}]`);
        const result = await processDocument(doc.url, doc.collection, doc.title);
        totalDocs++;
        totalChunks += result.chunksStored;
      } catch (err) {
        console.error(` Failed: ${doc.title} — ${(err as Error).message}`);
      }
    }
  }

  console.log(`\n=== Done: ${totalDocs} documents, ${totalChunks} chunks embedded ===`);
  await pool.end();
}

// Run the CLI; on any unhandled failure report it, release the pool and exit non-zero.
main().catch((err) => {
  console.error("Fatal:", err);
  pool.end();
  process.exit(1);
});
|
||||
80
packages/api/src/embeddings/seed-news.ts
Normal file
80
packages/api/src/embeddings/seed-news.ts
Normal file
@ -0,0 +1,80 @@
|
||||
/**
|
||||
* Seed news_embeddings collection from PostgreSQL news_articles table.
|
||||
*
|
||||
* Run: npx tsx packages/api/src/embeddings/seed-news.ts
|
||||
*/
|
||||
import { pool } from "../db/client";
|
||||
import { embed, upsertPoints } from "./client";
|
||||
|
||||
/**
 * Build the text that gets embedded for one news article row.
 *
 * Uses, in order: title, "Source: …", "Category: …", summary and the first
 * 500 characters of the full text. Missing or empty fields are left out and
 * the remaining parts are joined with ". ".
 */
function articleToText(row: Record<string, unknown>): string {
  const segments: string[] = [];
  const keep = (rendered: string): void => {
    // Mirrors the truthiness filter: never contribute an empty segment.
    if (rendered) segments.push(rendered);
  };

  if (row.title) keep(`${row.title}`);
  if (row.source) keep(`Source: ${row.source}`);
  if (row.category) keep(`Category: ${row.category}`);
  if (row.summary) keep(`${row.summary}`);
  if (row.full_text) keep(`${String(row.full_text).slice(0, 500)}`);

  return segments.join(". ");
}
|
||||
|
||||
/**
 * Embed up to 500 of the most recently published news articles into the
 * `news_embeddings` collection.
 *
 * Articles are read from `news_articles`, embedded ten at a time (concurrently
 * within a batch) and upserted with their metadata as payload. The point id is the
 * article id, so re-running the script overwrites existing points.
 */
async function main() {
  console.log("=== Seeding news_embeddings ===\n");

  // NULLS LAST: PostgreSQL sorts NULLs first under DESC, which would let undated
  // articles crowd the newest ones out of the LIMIT window.
  const result = await pool.query(
    `SELECT id, title, source_url, source, category, summary, full_text, published_at, relevance_score
     FROM news_articles
     ORDER BY published_at DESC NULLS LAST
     LIMIT 500`,
  );

  console.log(`Found ${result.rows.length} news articles to embed\n`);

  if (result.rows.length === 0) {
    console.log("No articles found. Run the news scraper first.");
    await pool.end();
    return;
  }

  const BATCH_SIZE = 10;
  let total = 0;

  for (let i = 0; i < result.rows.length; i += BATCH_SIZE) {
    const batch = result.rows.slice(i, i + BATCH_SIZE);

    const points = await Promise.all(
      batch.map(async (row) => {
        const text = articleToText(row);
        const vector = await embed(text);

        return {
          id: String(row.id),
          vector,
          payload: {
            title: row.title || "",
            url: row.source_url || "",
            source: row.source || "",
            category: row.category || "",
            summary: row.summary || "",
            published_at: row.published_at ? new Date(row.published_at).toISOString() : "",
            relevance_score: row.relevance_score || 0,
            text,
          },
        };
      }),
    );

    await upsertPoints("news_embeddings", points);
    total += points.length;
    console.log(` Embedded ${total}/${result.rows.length} articles`);
  }

  console.log(`\n=== Done: ${total} articles embedded ===`);
  await pool.end();
}

// Run the seeder; on any unhandled failure report it, release the pool and exit non-zero.
main().catch((err) => {
  console.error("Fatal:", err);
  pool.end();
  process.exit(1);
});
|
||||
@ -10,6 +10,7 @@ import { standardRouter } from "./routes/standards";
|
||||
import { healthRouter } from "./routes/health";
|
||||
import { hypeCycleRouter } from "./routes/hype-cycle";
|
||||
import { searchRouter } from "./routes/search";
|
||||
import { documentRouter } from "./routes/documents";
|
||||
|
||||
const app = express();
|
||||
|
||||
@ -34,6 +35,7 @@ app.use("/api/standards", standardRouter);
|
||||
app.use("/api/health", healthRouter);
|
||||
app.use("/api/hype-cycle", hypeCycleRouter);
|
||||
app.use("/api/search", searchRouter);
|
||||
app.use("/api/documents", documentRouter);
|
||||
|
||||
// Root
|
||||
app.get("/", (_req, res) => {
|
||||
@ -53,7 +55,12 @@ app.get("/", (_req, res) => {
|
||||
"GET /api/hype-cycle/:tech",
|
||||
"GET /api/search?q=&collection=&limit=",
|
||||
"GET /api/search/products?q=&form_factor=&speed_gbps=&fiber_type=",
|
||||
"GET /api/search/documents?q=&doc_type=&vendor=&collection=",
|
||||
"GET /api/search/news?q=&source=",
|
||||
"GET /api/search/stats",
|
||||
"POST /api/documents/process {url, title?, doc_type?, vendor?, collection?}",
|
||||
"GET /api/documents",
|
||||
"GET /api/documents/:id",
|
||||
],
|
||||
});
|
||||
});
|
||||
|
||||
217
packages/api/src/routes/documents.ts
Normal file
217
packages/api/src/routes/documents.ts
Normal file
@ -0,0 +1,217 @@
|
||||
/**
|
||||
* Document processing API routes (OCR Pipeline)
|
||||
*
|
||||
* POST /api/documents/process — Submit a document URL for OCR + embedding
|
||||
* GET /api/documents — List processed documents
|
||||
 * GET /api/documents/:id — Get a processed document's details
|
||||
*/
|
||||
import { Router, Request, Response } from "express";
|
||||
import { embed, upsertPoints, CollectionName } from "../embeddings/client";
|
||||
import { pool } from "../db/client";
|
||||
import { randomUUID } from "crypto";
|
||||
|
||||
export const documentRouter = Router();
|
||||
|
||||
const DOCLING_URL = process.env.DOCLING_URL || "http://localhost:8100";
|
||||
|
||||
/** Shape of the JSON body these routes read from the Docling `/convert` endpoint. */
interface DoclingResult {
  /** When false, the conversion is reported to the client as failed, with `error` as detail. */
  success: boolean;
  /** Converted document text; requested and chunked as markdown. */
  content: string;
  /** Output format reported by Docling. */
  format: string;
  /** Page count of the source document, or null when Docling does not report one. */
  pages: number | null;
  /** Failure description accompanying `success: false`. */
  error?: string;
}
|
||||
|
||||
/**
 * Chunk markdown into heading-scoped pieces suitable for embedding.
 *
 * Strategy:
 * 1. Split at level 1-3 headings. Text before the first heading is filed under
 *    the heading "Introduction".
 * 2. Sections whose body fits in `maxChunkSize` become a single chunk.
 * 3. Larger sections are packed paragraph by paragraph. A single paragraph that is
 *    itself larger than `maxChunkSize` (e.g. a long table) is first broken on line
 *    boundaries, and over-long lines are sliced.
 * 4. Each follow-up chunk of a large section starts with the last `overlapSize`
 *    characters of the previous chunk, so it may exceed `maxChunkSize` by up to
 *    `overlapSize + 2` characters.
 *
 * Sections shorter than 20 characters (heading included) and heading-only sections
 * are skipped.
 *
 * @param markdown Markdown text to split.
 * @param maxChunkSize Target maximum chunk length in characters.
 * @param overlapSize Characters of the previous chunk repeated at the start of the next; 0 disables overlap.
 * @returns Chunks in document order, each with its section heading.
 */
function chunkMarkdown(
  markdown: string,
  maxChunkSize: number = 1500,
  overlapSize: number = 200,
): Array<{ heading: string; text: string }> {
  const chunks: Array<{ heading: string; text: string }> = [];

  // Break one oversized paragraph into pieces of at most maxChunkSize characters,
  // preferring line boundaries and slicing only lines that are too long on their own.
  const splitOversized = (para: string): string[] => {
    if (maxChunkSize <= 0 || para.length <= maxChunkSize) return [para];

    const pieces: string[] = [];
    let piece = "";
    for (const line of para.split("\n")) {
      for (let start = 0; start < Math.max(line.length, 1); start += maxChunkSize) {
        const part = line.slice(start, start + maxChunkSize);
        if (piece.length > 0 && piece.length + 1 + part.length > maxChunkSize) {
          pieces.push(piece);
          piece = part;
        } else {
          piece = piece.length > 0 ? `${piece}\n${part}` : part;
        }
      }
    }
    if (piece.length > 0) pieces.push(piece);
    return pieces;
  };

  for (const section of markdown.split(/(?=^#{1,3}\s)/m)) {
    const trimmed = section.trim();
    if (trimmed.length < 20) continue;

    const headingMatch = trimmed.match(/^#{1,3}\s+(.+)/);
    const heading = headingMatch?.[1]?.trim() ?? "Introduction";
    const body = headingMatch ? trimmed.slice(headingMatch[0].length).trim() : trimmed;

    // A heading with nothing under it would otherwise be stored as an empty chunk.
    if (body.length === 0) continue;

    if (body.length <= maxChunkSize) {
      chunks.push({ heading, text: body });
      continue;
    }

    const pieces = body.split(/\n\n+/).flatMap(splitOversized);
    let currentChunk = "";

    for (const piece of pieces) {
      if (currentChunk.length > 0 && currentChunk.length + piece.length > maxChunkSize) {
        chunks.push({ heading, text: currentChunk.trim() });
        // slice(-0) is slice(0) and would copy the entire previous chunk, so guard zero overlap.
        const overlapText = overlapSize > 0 ? currentChunk.slice(-overlapSize) : "";
        currentChunk = overlapText ? `${overlapText}\n\n${piece}` : piece;
      } else {
        currentChunk += (currentChunk ? "\n\n" : "") + piece;
      }
    }

    // Keep the tail even when it is short; dropping it would lose document text.
    const tail = currentChunk.trim();
    if (tail.length > 0) {
      chunks.push({ heading, text: tail });
    }
  }

  return chunks;
}
|
||||
|
||||
/**
 * POST /api/documents/process — convert a document through Docling, chunk it,
 * embed the chunks and store them in Qdrant.
 *
 * Body: `{ url, title?, doc_type?, vendor?, collection? }`.
 * - `url` must be an absolute http(s) URL; it is forwarded to the Docling service.
 * - `collection` must be `datasheet_chunks` (default) or `manual_chunks`.
 * - `doc_type` defaults to "datasheet" and is stored lower-cased, matching the
 *   lower-cased `doc_type` filter used by GET /api/search/documents.
 *
 * Responses: 400 for invalid input, 502 when Docling fails, 503 for any other
 * processing error, otherwise 200 with the new document id and chunk count.
 * Recording the document in the `documents` table is best-effort: a failure is
 * logged and does not change the response.
 */
documentRouter.post("/process", async (req: Request, res: Response) => {
  // req.body is undefined when no JSON body was parsed; fall back to an empty object.
  const { url, title, doc_type, vendor, collection } = (req.body ?? {}) as {
    url?: string;
    title?: string;
    doc_type?: string;
    vendor?: string;
    collection?: string;
  };

  if (!url || typeof url !== "string") {
    res.status(400).json({ success: false, error: "Missing 'url' in request body" });
    return;
  }

  // The URL is client-supplied and handed to another service, so restrict it to http(s).
  // NOTE(review): this does not block internal hosts; add an allow-list if Docling can
  // reach private networks.
  let parsedUrl: URL | undefined;
  try {
    parsedUrl = new URL(url);
  } catch {
    parsedUrl = undefined;
  }
  if (!parsedUrl || !["http:", "https:"].includes(parsedUrl.protocol)) {
    res.status(400).json({ success: false, error: "'url' must be an absolute http(s) URL" });
    return;
  }

  const targetCollection = (collection || "datasheet_chunks") as CollectionName;
  if (!["datasheet_chunks", "manual_chunks"].includes(targetCollection)) {
    res.status(400).json({ success: false, error: "collection must be 'datasheet_chunks' or 'manual_chunks'" });
    return;
  }

  try {
    // Convert via Docling
    const docResp = await fetch(`${DOCLING_URL}/convert`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ url, format: "markdown" }),
      signal: AbortSignal.timeout(120000),
    });

    if (!docResp.ok) {
      res.status(502).json({ success: false, error: "Docling conversion failed", detail: await docResp.text() });
      return;
    }

    const docResult = (await docResp.json()) as DoclingResult;
    if (!docResult.success) {
      res.status(502).json({ success: false, error: "Docling conversion failed", detail: docResult.error });
      return;
    }

    const documentId = randomUUID();
    const docTitle =
      (typeof title === "string" && title) || url.split("/").pop()?.replace(/\.[^.]+$/, "") || "untitled";
    const docType = typeof doc_type === "string" && doc_type ? doc_type.toLowerCase() : "datasheet";
    const docVendor = (typeof vendor === "string" && vendor) || "Unknown";

    // Chunk
    const chunks = chunkMarkdown(docResult.content);

    // Embed and store; the chunks of one batch are embedded concurrently.
    const BATCH_SIZE = 5;
    let stored = 0;

    for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
      const batch = chunks.slice(i, i + BATCH_SIZE);
      const points = await Promise.all(
        batch.map(async (chunk, idx) => {
          const embeddingText = `${docTitle}. ${chunk.heading}. ${chunk.text}`;
          const vector = await embed(embeddingText);
          return {
            id: randomUUID(),
            vector,
            payload: {
              document_id: documentId,
              source_url: url,
              document_type: docType,
              chunk_index: i + idx,
              total_chunks: chunks.length,
              title: docTitle,
              section_heading: chunk.heading,
              text: chunk.text,
              page_estimate: docResult.pages,
              vendor: docVendor,
              product_slug: docTitle.replace(/\s+/g, "-").toLowerCase(),
            },
          };
        }),
      );

      await upsertPoints(targetCollection, points);
      stored += points.length;
    }

    // Record in documents table (existing schema); only the first 50000 characters
    // of the converted text are kept in ocr_text.
    try {
      await pool.query(
        `INSERT INTO documents (id, entity_type, doc_type, title, r2_key, source_url, page_count, chunks_count, ocr_status, ocr_text, processed_at)
         VALUES ($1, 'transceiver', $2, $3, $4, $5, $6, $7, 'completed', $8, NOW())
         ON CONFLICT ON CONSTRAINT documents_pkey DO UPDATE
         SET processed_at = NOW(), chunks_count = $7, ocr_status = 'completed'`,
        [documentId, docType, docTitle, `ocr/${documentId}`, url, docResult.pages, chunks.length, docResult.content.slice(0, 50000)],
      );
    } catch (err) {
      // Best-effort bookkeeping: the chunks are already stored, so only log the failure.
      console.warn(`Could not record document ${documentId} in documents table: ${(err as Error).message}`);
    }

    res.json({
      success: true,
      document_id: documentId,
      title: docTitle,
      pages: docResult.pages,
      chunks: chunks.length,
      collection: targetCollection,
      markdown_length: docResult.content.length,
    });
  } catch (err) {
    res.status(503).json({
      success: false,
      error: "Document processing failed",
      detail: (err as Error).message,
    });
  }
});
|
||||
|
||||
/**
 * GET /api/documents — list up to 100 processed documents, most recently
 * processed (or created) first.
 *
 * When the `documents` table or the OCR columns do not exist yet (PostgreSQL
 * error codes 42P01 undefined_table / 42703 undefined_column) an empty list is
 * returned with an explanatory note. Any other database failure is logged and
 * reported as 503 instead of being disguised as an empty result.
 */
documentRouter.get("/", async (_req: Request, res: Response) => {
  try {
    const result = await pool.query(
      `SELECT id, title, source_url, doc_type, entity_type, page_count, chunks_count, ocr_status, processed_at, created_at
       FROM documents
       ORDER BY COALESCE(processed_at, created_at) DESC
       LIMIT 100`,
    );

    res.json({ success: true, documents: result.rows, count: result.rows.length });
  } catch (err) {
    // NOTE(review): assumes the pool surfaces PostgreSQL SQLSTATE codes as `err.code` — confirm.
    const code = (err as { code?: string }).code;
    if (code === "42P01" || code === "42703") {
      // Schema not ready — return empty
      res.json({ success: true, documents: [], count: 0, note: "documents table not yet created" });
      return;
    }

    console.error("Failed to list documents:", err);
    res.status(503).json({
      success: false,
      error: "Failed to list documents",
      detail: (err as Error).message,
    });
  }
});
|
||||
|
||||
/**
 * GET /api/documents/:id — return the stored metadata of one processed document.
 *
 * Responds 404 when no row matches, when `:id` is not a valid UUID (PostgreSQL
 * error code 22P02) or when the table / OCR columns do not exist yet (42P01 / 42703).
 * Any other database failure is logged and reported as 503 rather than as "not found".
 */
documentRouter.get("/:id", async (req: Request, res: Response) => {
  try {
    const result = await pool.query(
      `SELECT id, title, source_url, doc_type, entity_type, page_count, chunks_count, ocr_status, processed_at, created_at
       FROM documents WHERE id = $1::uuid`,
      [req.params.id],
    );

    if (result.rows.length === 0) {
      res.status(404).json({ success: false, error: "Document not found" });
      return;
    }

    res.json({ success: true, document: result.rows[0] });
  } catch (err) {
    // NOTE(review): assumes the pool surfaces PostgreSQL SQLSTATE codes as `err.code` — confirm.
    const code = (err as { code?: string }).code;
    if (code === "22P02" || code === "42P01" || code === "42703") {
      res.status(404).json({ success: false, error: "Document not found or table not created" });
      return;
    }

    console.error("Failed to load document:", err);
    res.status(503).json({
      success: false,
      error: "Failed to load document",
      detail: (err as Error).message,
    });
  }
});
|
||||
@ -3,6 +3,8 @@
|
||||
*
|
||||
* GET /api/search?q=<query>&collection=<col>&limit=<n>
|
||||
* GET /api/search/products?q=<query>&form_factor=&speed_gbps=&fiber_type=
|
||||
 * GET /api/search/documents?q=<query>&doc_type=&vendor=&collection=&limit=<n>
|
||||
 * GET /api/search/news?q=<query>&source=&limit=<n>
|
||||
*/
|
||||
import { Router, Request, Response } from "express";
|
||||
import { semanticSearch, getCollectionInfo, CollectionName } from "../embeddings/client";
|
||||
@ -123,6 +125,122 @@ searchRouter.get("/products", async (req: Request, res: Response) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * GET /api/search/documents — semantic search over datasheet and manual chunks.
 *
 * Query parameters:
 * - `q` (required): search text;
 * - `limit`: maximum number of chunks, default 10, clamped to 1..100 (non-numeric values fall back to 10);
 * - `doc_type`: exact match on the chunk's `document_type` (compared lower-cased);
 * - `vendor`: exact match on the chunk's `vendor`;
 * - `collection`: `datasheet_chunks` (default) or `manual_chunks`.
 *
 * Matching chunks are grouped by document in the order the search returned them;
 * chunk text is truncated to 500 characters. Responds 400 for invalid input and
 * 503 when the vector search fails.
 */
searchRouter.get("/documents", async (req: Request, res: Response) => {
  const query = q("q", req);
  // Sanitize the client-supplied limit: explicit radix, no NaN, no negative or unbounded values.
  const parsedLimit = Number.parseInt(q("limit", req) || "10", 10);
  const limit = Number.isNaN(parsedLimit) ? 10 : Math.min(Math.max(parsedLimit, 1), 100);
  const docType = q("doc_type", req);
  const vendor = q("vendor", req);
  const collection = (q("collection", req) || "datasheet_chunks") as CollectionName;

  if (!query) {
    res.status(400).json({ success: false, error: "Missing 'q' parameter" });
    return;
  }

  if (!["datasheet_chunks", "manual_chunks"].includes(collection)) {
    res.status(400).json({
      success: false,
      error: "collection must be 'datasheet_chunks' or 'manual_chunks'",
    });
    return;
  }

  const mustConditions: Array<Record<string, unknown>> = [];
  if (docType) {
    mustConditions.push({ key: "document_type", match: { value: docType.toLowerCase() } });
  }
  if (vendor) {
    mustConditions.push({ key: "vendor", match: { value: vendor } });
  }

  const filter = mustConditions.length > 0 ? { must: mustConditions } : undefined;

  try {
    const results = await semanticSearch(collection, query, limit, filter);

    // Group by document for cleaner output
    type GroupedDocument = {
      title: string;
      vendor: string;
      source_url: string;
      chunks: Array<{ score: number; heading: string; text: string; chunk_index: number }>;
    };
    const byDocument = new Map<string, GroupedDocument>();

    for (const r of results) {
      // Points without a document_id are grouped under their own point id.
      const docId = String(r.payload.document_id || r.id);
      let group = byDocument.get(docId);
      if (!group) {
        group = {
          title: String(r.payload.title || ""),
          vendor: String(r.payload.vendor || ""),
          source_url: String(r.payload.source_url || ""),
          chunks: [],
        };
        byDocument.set(docId, group);
      }
      group.chunks.push({
        score: Math.round(r.score * 1000) / 1000,
        heading: String(r.payload.section_heading || ""),
        text: String(r.payload.text || "").slice(0, 500),
        chunk_index: Number(r.payload.chunk_index || 0),
      });
    }

    res.json({
      success: true,
      query,
      collection,
      filters: { docType, vendor },
      documents: Array.from(byDocument.values()),
      totalChunks: results.length,
    });
  } catch (err) {
    res.status(503).json({
      success: false,
      error: "Vector search unavailable",
      detail: (err as Error).message,
    });
  }
});
|
||||
|
||||
/**
 * GET /api/search/news — semantic search over the `news_embeddings` collection.
 *
 * Query parameters:
 * - `q` (required): search text;
 * - `limit`: maximum number of results, default 10, clamped to 1..100 (non-numeric values fall back to 10);
 * - `source`: exact match on the article's `source`.
 *
 * Responds 400 when `q` is missing and 503 when the vector search fails.
 */
searchRouter.get("/news", async (req: Request, res: Response) => {
  const query = q("q", req);
  // Sanitize the client-supplied limit: explicit radix, no NaN, no negative or unbounded values.
  const parsedLimit = Number.parseInt(q("limit", req) || "10", 10);
  const limit = Number.isNaN(parsedLimit) ? 10 : Math.min(Math.max(parsedLimit, 1), 100);
  const source = q("source", req);

  if (!query) {
    res.status(400).json({ success: false, error: "Missing 'q' parameter" });
    return;
  }

  const mustConditions: Array<Record<string, unknown>> = [];
  if (source) {
    mustConditions.push({ key: "source", match: { value: source } });
  }

  const filter = mustConditions.length > 0 ? { must: mustConditions } : undefined;

  try {
    const results = await semanticSearch("news_embeddings", query, limit, filter);
    res.json({
      success: true,
      query,
      filters: { source },
      results: results.map((r) => ({
        id: r.id,
        score: Math.round(r.score * 1000) / 1000,
        title: r.payload.title,
        url: r.payload.url,
        source: r.payload.source,
        summary: r.payload.summary,
        published_at: r.payload.published_at,
      })),
      count: results.length,
    });
  } catch (err) {
    res.status(503).json({
      success: false,
      error: "Vector search unavailable",
      detail: (err as Error).message,
    });
  }
});
|
||||
|
||||
// GET /api/search/stats — Collection statistics
|
||||
searchRouter.get("/stats", async (_req: Request, res: Response) => {
|
||||
try {
|
||||
|
||||
3
sql/005-documents.sql
Normal file
3
sql/005-documents.sql
Normal file
@ -0,0 +1,3 @@
|
||||
-- OCR pipeline bookkeeping columns for the existing documents table:
--   chunks_count  number of chunks (defaults to 0)
--   processed_at  processing timestamp
-- Both additions are guarded by IF NOT EXISTS, so the script can be re-run.
ALTER TABLE documents
    ADD COLUMN IF NOT EXISTS chunks_count INT DEFAULT 0,
    ADD COLUMN IF NOT EXISTS processed_at TIMESTAMPTZ;
|
||||
Loading…
x
Reference in New Issue
Block a user