/**
 * Seed news_embeddings collection from PostgreSQL news_articles table.
 *
 * Run: npx tsx packages/api/src/embeddings/seed-news.ts
 */
import { pool } from "../db/client";
import { embed, upsertPoints } from "./client";

/** Maximum number of `full_text` characters included in the text that gets embedded. */
const FULL_TEXT_LIMIT = 500;

/** Number of articles embedded concurrently and sent in a single `upsertPoints` call. */
const BATCH_SIZE = 10;

/**
 * Builds the text that is embedded for one article row.
 *
 * Falsy fields are skipped; the remaining parts are joined with ". " in the
 * order title, source, category, summary, truncated full text.
 *
 * @param row - A row selected from `news_articles`.
 * @returns The concatenated text, or an empty string when every field is falsy.
 */
function articleToText(row: Record<string, unknown>): string {
  const parts = [
    row.title && `${row.title}`,
    row.source && `Source: ${row.source}`,
    row.category && `Category: ${row.category}`,
    row.summary && `${row.summary}`,
    row.full_text && `${String(row.full_text).slice(0, FULL_TEXT_LIMIT)}`,
  ].filter(Boolean);
  return parts.join(". ");
}

/**
 * Reads up to 500 of the most recently published articles, embeds them in
 * batches, and upserts the resulting points into the `news_embeddings`
 * collection. The database pool is closed before this function settles,
 * whether it succeeds or throws.
 */
async function main(): Promise<void> {
  console.log("=== Seeding news_embeddings ===\n");

  try {
    // NULLS LAST: PostgreSQL sorts NULLs first under DESC, which would let
    // undated rows push the newest articles out of the LIMIT window.
    const result = await pool.query(
      `SELECT id, title, source_url, source, category, summary, full_text, published_at, relevance_score
       FROM news_articles
       ORDER BY published_at DESC NULLS LAST
       LIMIT 500`,
    );

    console.log(`Found ${result.rows.length} news articles to embed\n`);

    if (result.rows.length === 0) {
      console.log("No articles found. Run the news scraper first.");
      return;
    }

    let total = 0;

    for (let i = 0; i < result.rows.length; i += BATCH_SIZE) {
      const batch = result.rows.slice(i, i + BATCH_SIZE);

      // Embed the rows of one batch concurrently; batches run one after another.
      const points = await Promise.all(
        batch.map(async (row) => {
          const text = articleToText(row);
          const vector = await embed(text);
          return {
            id: String(row.id),
            vector,
            payload: {
              title: row.title ?? "",
              url: row.source_url ?? "",
              source: row.source ?? "",
              category: row.category ?? "",
              summary: row.summary ?? "",
              published_at: row.published_at
                ? new Date(row.published_at).toISOString()
                : "",
              relevance_score: row.relevance_score ?? 0,
              text,
            },
          };
        }),
      );

      await upsertPoints("news_embeddings", points);
      total += points.length;
      console.log(`  Embedded ${total}/${result.rows.length} articles`);
    }

    console.log(`\n=== Done: ${total} articles embedded ===`);
  } finally {
    // Close the pool exactly once on every path, and wait for it to finish,
    // so the fatal handler below never exits while shutdown is still pending.
    await pool.end();
  }
}

main().catch((err: unknown) => {
  console.error("Fatal:", err);
  // The pool has already been closed by main()'s finally block.
  process.exit(1);
});