Docling-powered OCR pipeline: PDF → Markdown → chunks → Ollama embeddings → Qdrant. News embedding seeder for the news_embeddings collection. Document and news semantic search API endpoints.
- embeddings/ocr-pipeline.ts: Docling convert → chunk → embed pipeline
- embeddings/seed-news.ts: Batch-embed news_articles into Qdrant
- routes/documents.ts: POST /api/documents/process, GET /api/documents
- routes/search.ts: GET /search/documents, GET /search/news endpoints
- sql/005-documents.sql: Add chunks_count, processed_at to documents table
- Ollama + nomic-embed-text installed on Erik (CPU mode)
- 89 products + 40 datasheet chunks + 33 news articles in Qdrant
81 lines
2.2 KiB
TypeScript
81 lines
2.2 KiB
TypeScript
/**
 * Seed the news_embeddings collection from the PostgreSQL news_articles table.
 *
 * Run: npx tsx packages/api/src/embeddings/seed-news.ts
 */
|
|
import { pool } from "../db/client";
|
|
import { embed, upsertPoints } from "./client";
|
|
|
|
/**
 * Build the text that gets embedded for one news article row.
 *
 * Joins, in order, the title, `Source: <source>`, `Category: <category>`,
 * the summary, and an excerpt of the full text with ". ". Falsy fields
 * (missing, null, empty string, ...) are skipped.
 *
 * The full-text excerpt is capped at 500 UTF-16 code units. If that cap would
 * fall between the two halves of a surrogate pair, the excerpt is shortened by
 * one unit so the result never ends in a lone high surrogate.
 *
 * @param row - A row keyed by column name; only `title`, `source`,
 *   `category`, `summary` and `full_text` are read.
 * @returns The joined text, or an empty string when every field is falsy.
 */
function articleToText(row: Record<string, unknown>): string {
  const maxFullTextUnits = 500;

  let excerpt = "";
  if (row.full_text) {
    const fullText = String(row.full_text);
    let end = maxFullTextUnits;

    if (fullText.length > end) {
      const lastKept = fullText.charCodeAt(end - 1);
      const firstDropped = fullText.charCodeAt(end);
      const splitsSurrogatePair =
        lastKept >= 0xd800 &&
        lastKept <= 0xdbff &&
        firstDropped >= 0xdc00 &&
        firstDropped <= 0xdfff;

      // Drop the dangling high surrogate rather than emit malformed text.
      if (splitsSurrogatePair) {
        end -= 1;
      }
    }

    excerpt = fullText.slice(0, end);
  }

  const parts = [
    row.title && `${row.title}`,
    row.source && `Source: ${row.source}`,
    row.category && `Category: ${row.category}`,
    row.summary && `${row.summary}`,
    excerpt,
  ].filter(Boolean);

  return parts.join(". ");
}
|
|
|
|
/**
 * Embed recent news articles and upsert them into `news_embeddings`.
 *
 * Reads up to 500 rows from `news_articles`, newest `published_at` first,
 * then processes them in batches of 10: every row in a batch is converted to
 * text with `articleToText` and embedded concurrently via `embed`, and the
 * resulting points are written with `upsertPoints`. Progress is logged after
 * each batch.
 *
 * The PostgreSQL pool is closed before a normal return (including the
 * "no articles" early exit). Errors from the query, `embed`, or
 * `upsertPoints` propagate to the caller without closing the pool here.
 */
async function main(): Promise<void> {
  console.log("=== Seeding news_embeddings ===\n");

  // NULLS LAST: PostgreSQL puts NULLs first under DESC by default, which would
  // let undated rows crowd the newest dated articles out of the LIMIT window.
  const result = await pool.query(
    `SELECT id, title, source_url, source, category, summary, full_text, published_at, relevance_score
     FROM news_articles
     ORDER BY published_at DESC NULLS LAST
     LIMIT 500`,
  );

  console.log(`Found ${result.rows.length} news articles to embed\n`);

  if (result.rows.length === 0) {
    console.log("No articles found. Run the news scraper first.");
    await pool.end();
    return;
  }

  const BATCH_SIZE = 10;
  let total = 0;

  for (let i = 0; i < result.rows.length; i += BATCH_SIZE) {
    const batch = result.rows.slice(i, i + BATCH_SIZE);

    // Embed the whole batch concurrently; one rejection fails the batch.
    const points = await Promise.all(
      batch.map(async (row) => {
        const text = articleToText(row);
        const vector = await embed(text);

        return {
          id: String(row.id),
          vector,
          payload: {
            title: row.title || "",
            url: row.source_url || "",
            source: row.source || "",
            category: row.category || "",
            summary: row.summary || "",
            published_at: row.published_at ? new Date(row.published_at).toISOString() : "",
            relevance_score: row.relevance_score || 0,
            // Keep the exact embedded text alongside the vector.
            text,
          },
        };
      }),
    );

    await upsertPoints("news_embeddings", points);
    total += points.length;
    console.log(`  Embedded ${total}/${result.rows.length} articles`);
  }

  console.log(`\n=== Done: ${total} articles embedded ===`);
  await pool.end();
}
|
|
|
|
// Script entry point: run the seeder and, on any failure, log it, shut the
// PostgreSQL pool down, and exit with a non-zero status.
main().catch(async (err: unknown) => {
  console.error("Fatal:", err);

  try {
    // Await the shutdown so it completes before the process is terminated;
    // previously the promise was left floating and exit() pre-empted it.
    await pool.end();
  } catch (closeErr: unknown) {
    console.error("Failed to close PostgreSQL pool:", closeErr);
  } finally {
    process.exit(1);
  }
});
|