Rene Fichtmueller 6d3e5cc04a feat: Phase 4 — Vector embeddings + semantic search
Ollama nomic-embed-text (768 dim) → Qdrant vector search pipeline.
Embeds all 89 transceivers with rich text representation and payload
filters (form_factor, speed_gbps, fiber_type, wdm_type).

- embeddings/client.ts: Ollama embed + Qdrant upsert/search
- embeddings/seed-products.ts: Batch seeder for product_embeddings
- routes/search.ts: GET /api/search, /search/products, /search/stats
- 6 Qdrant collections: products, datasheets, FAQs, manuals, troubleshooting, news
2026-03-28 00:05:29 +13:00

97 lines
3.2 KiB
TypeScript

/**
* Seed product_embeddings collection in Qdrant from PostgreSQL transceivers.
*
* Creates a rich text representation of each transceiver, embeds it via
* Ollama nomic-embed-text, and stores in Qdrant with payload filters.
*
* Run: npx tsx packages/api/src/embeddings/seed-products.ts
*/
import { pool } from "../db/client";
import { embed, upsertPoints } from "./client";
/**
 * Build the text that gets embedded for one transceiver row.
 *
 * Each populated column contributes one fragment (most with a short label
 * such as "Form factor: "), and the fragments are joined with ". ".
 * Columns whose value is falsy (null, undefined, "", 0, false) are skipped.
 *
 * @param row - A transceiver record keyed by column name.
 * @returns The joined description, or "" when no column is populated.
 */
function transceiverToText(row: Record<string, unknown>): string {
  // Ordered [column, renderer] pairs; array order is the fragment order.
  const renderers: Array<[string, (value: unknown) => string]> = [
    ["standard_name", (value) => `${value}`],
    ["form_factor", (value) => `Form factor: ${value}`],
    ["speed", (value) => `Speed: ${value}`],
    ["reach_label", (value) => `Reach: ${value}`],
    ["fiber_type", (value) => `Fiber: ${value}`],
    ["connector", (value) => `Connector: ${value}`],
    ["wavelengths", (value) => `Wavelengths: ${value}`],
    ["wdm_type", (value) => `WDM: ${value}`],
    ["category", (value) => `Category: ${value}`],
    // The flag itself is not printed, only a fixed marker when it is set.
    ["coherent", () => `Coherent optics`],
    ["power_consumption_w", (value) => `Power: ${value}W`],
    ["temp_range", (value) => `Temperature: ${value}`],
    ["vendor_name", (value) => `Vendor: ${value}`],
    ["description", (value) => `${value}`],
  ];

  const fragments: string[] = [];
  for (const [column, render] of renderers) {
    const value = row[column];
    if (!value) {
      continue;
    }
    const fragment = render(value);
    // Unlabelled fragments can stringify to "" (e.g. an empty array); drop those too.
    if (fragment) {
      fragments.push(fragment);
    }
  }
  return fragments.join(". ");
}
/**
 * Seed the "product_embeddings" collection.
 *
 * Reads every row of `transceivers` (left-joined with `vendors`, ordered by
 * `speed_gbps` descending), turns each row into text with
 * `transceiverToText`, embeds that text, and upserts the resulting points in
 * batches of 10. The embed calls of one batch are started together and
 * awaited with `Promise.all`; batches themselves run one after another.
 * Closes the pool once all batches have been written.
 */
async function main() {
  console.log("=== Seeding product_embeddings ===\n");

  // The SQL literal is kept flush-left so the query text stays byte-identical.
  const { rows } = await pool.query(
    `SELECT t.id, t.slug, t.standard_name, t.form_factor, t.speed, t.speed_gbps,
t.reach_label, t.reach_meters, t.fiber_type, t.connector,
t.wavelengths, t.wdm_type, t.coherent, t.power_consumption_w,
t.temp_range, t.category, t.notes as description,
v.name as vendor_name
FROM transceivers t
LEFT JOIN vendors v ON v.id = t.vendor_id
ORDER BY t.speed_gbps DESC`
  );
  console.log(`Found ${rows.length} transceivers to embed\n`);

  const chunkSize = 10;
  let embedded = 0;
  let offset = 0;

  while (offset < rows.length) {
    const chunk = rows.slice(offset, offset + chunkSize);

    // Start one embedding request per row of this chunk.
    const pending = chunk.map(async (row) => {
      const text = transceiverToText(row);
      const vector = await embed(text);

      // Missing columns fall back to "" / 0 / false in the stored payload.
      const payload = {
        slug: row.slug,
        standard_name: row.standard_name || "",
        form_factor: row.form_factor || "",
        speed: row.speed || "",
        speed_gbps: parseFloat(row.speed_gbps) || 0,
        reach_label: row.reach_label || "",
        reach_meters: row.reach_meters || 0,
        fiber_type: row.fiber_type || "",
        connector: row.connector || "",
        wdm_type: row.wdm_type || "",
        category: row.category || "",
        coherent: row.coherent || false,
        vendor: row.vendor_name || "",
        text,
      };

      return { id: row.id, vector, payload };
    });

    const points = await Promise.all(pending);
    await upsertPoints("product_embeddings", points);

    embedded += points.length;
    console.log(` Embedded ${embedded}/${rows.length} transceivers`);

    offset += chunkSize;
  }

  console.log(`\n=== Done: ${embedded} products embedded ===`);
  await pool.end();
}
// Run the seeder. On failure, log the error, let the pool finish closing,
// and only then exit with a non-zero status.
main().catch(async (err) => {
  console.error("Fatal:", err);
  try {
    // pool.end() is asynchronous; calling process.exit() without awaiting it
    // would terminate the process before the pool has closed.
    await pool.end();
  } catch (closeErr) {
    // E.g. the pool was already ended by main() before the failure surfaced.
    console.error("Failed to close pool:", closeErr);
  } finally {
    process.exit(1);
  }
});