Rene Fichtmueller 8bb3b586f3 feat: Phase 5 — OCR pipeline + document/news search
Docling-powered OCR pipeline: PDF → markdown → chunks → Ollama embed → Qdrant.
News embedding seeder for news_embeddings collection.
Document and news semantic search API endpoints.

- embeddings/ocr-pipeline.ts: Docling convert → chunk → embed pipeline
- embeddings/seed-news.ts: Batch embed news_articles into Qdrant
- routes/documents.ts: POST /api/documents/process, GET /api/documents
- routes/search.ts: GET /search/documents, GET /search/news endpoints
- sql/005-documents.sql: Add chunks_count, processed_at to documents table
- Ollama + nomic-embed-text installed on Erik (CPU mode)
- 89 products + 40 datasheet chunks + 33 news articles in Qdrant
2026-03-28 00:22:01 +13:00

81 lines
2.2 KiB
TypeScript

/**
* Seed news_embeddings collection from PostgreSQL news_articles table.
*
* Run: npx tsx packages/api/src/embeddings/seed-news.ts
*/
import { pool } from "../db/client";
import { embed, upsertPoints } from "./client";
/**
 * Build the text that is embedded for a single news article row.
 *
 * Fields are considered in a fixed order: title, source, category, summary,
 * and the first 500 characters of full_text. A field contributes only when
 * its value is truthy and its rendered text is non-empty; the contributing
 * pieces are joined with ". ".
 *
 * @param row - Article row keyed by column name.
 * @returns The joined text, or "" when no field contributes.
 */
function articleToText(row: Record<string, unknown>): string {
  const segments: string[] = [];

  // Empty renderings (e.g. a truthy value that stringifies to "") are dropped,
  // so they never produce a dangling ". " separator.
  const keep = (piece: string): void => {
    if (piece !== "") {
      segments.push(piece);
    }
  };

  if (row.title) {
    keep(`${row.title}`);
  }
  if (row.source) {
    keep(`Source: ${row.source}`);
  }
  if (row.category) {
    keep(`Category: ${row.category}`);
  }
  if (row.summary) {
    keep(`${row.summary}`);
  }
  if (row.full_text) {
    // Only a 500-character prefix of the article body is included.
    keep(String(row.full_text).slice(0, 500));
  }

  return segments.join(". ");
}
/**
 * Seed the news_embeddings collection from the news_articles table.
 *
 * Selects up to 500 articles ordered by published_at (newest first, rows
 * without a published_at last), converts each row to text with
 * articleToText, embeds that text, and upserts the resulting points in
 * batches of 10. Progress is logged after every batch.
 *
 * The database pool is closed before returning, both when no articles are
 * found and after a successful run. If the query, an embed call, or an
 * upsert rejects, the rejection propagates to the caller and the pool is
 * not closed here.
 */
async function main(): Promise<void> {
  console.log("=== Seeding news_embeddings ===\n");

  // PostgreSQL places NULLs first under DESC. NULLS LAST keeps rows without a
  // published_at from taking the top of the result and displacing the most
  // recent articles once the table holds more than LIMIT rows.
  const result = await pool.query(
    `SELECT id, title, source_url, source, category, summary, full_text, published_at, relevance_score
     FROM news_articles
     ORDER BY published_at DESC NULLS LAST
     LIMIT 500`,
  );

  console.log(`Found ${result.rows.length} news articles to embed\n`);

  if (result.rows.length === 0) {
    console.log("No articles found. Run the news scraper first.");
    await pool.end();
    return;
  }

  const BATCH_SIZE = 10;
  let total = 0;

  for (let i = 0; i < result.rows.length; i += BATCH_SIZE) {
    const batch = result.rows.slice(i, i + BATCH_SIZE);

    // All embed calls of one batch are started together; a single rejection
    // rejects the whole batch and aborts the run.
    const points = await Promise.all(
      batch.map(async (row) => {
        const text = articleToText(row);
        const vector = await embed(text);
        return {
          id: String(row.id),
          vector,
          // Falsy column values are replaced with "" (or 0 for the score).
          payload: {
            title: row.title || "",
            url: row.source_url || "",
            source: row.source || "",
            category: row.category || "",
            summary: row.summary || "",
            published_at: row.published_at ? new Date(row.published_at).toISOString() : "",
            relevance_score: row.relevance_score || 0,
            // The exact text that was embedded is stored alongside the vector.
            text,
          },
        };
      }),
    );

    await upsertPoints("news_embeddings", points);
    total += points.length;
    console.log(`  Embedded ${total}/${result.rows.length} articles`);
  }

  console.log(`\n=== Done: ${total} articles embedded ===`);
  await pool.end();
}
// Script entry point: run the seeder and, on any rejection, report it and
// exit with status 1.
const abortOnFailure = (reason: unknown): void => {
  console.error("Fatal:", reason);
  // The result of pool.end() is not awaited; process.exit(1) follows straight away.
  pool.end();
  process.exit(1);
};

main().catch(abortOnFailure);