Ollama nomic-embed-text (768-dim) → Qdrant vector search pipeline.

Embeds all 89 transceivers with a rich text representation and payload filters (form_factor, speed_gbps, fiber_type, wdm_type).

- embeddings/client.ts: Ollama embed + Qdrant upsert/search
- embeddings/seed-products.ts: Batch seeder for product_embeddings
- routes/search.ts: GET /api/search, /search/products, /search/stats
- 6 Qdrant collections: products, datasheets, FAQs, manuals, troubleshooting, news
97 lines
3.2 KiB
TypeScript
97 lines
3.2 KiB
TypeScript
/**
 * Seed the product_embeddings collection in Qdrant from the PostgreSQL
 * transceivers table.
 *
 * Creates a rich text representation of each transceiver, embeds it via
 * Ollama nomic-embed-text, and stores it in Qdrant with payload filters.
 *
 * Run: npx tsx packages/api/src/embeddings/seed-products.ts
 */
|
|
import { pool } from "../db/client";
|
|
import { embed, upsertPoints } from "./client";
|
|
|
|
/**
 * Build the sentence-style text that is embedded for one transceiver row.
 *
 * Every truthy field contributes one fragment, in a fixed order; falsy fields
 * (null, undefined, "", 0, false) are skipped. Fragments are joined with ". ".
 *
 * @param row - A transceiver record keyed by column name.
 * @returns The joined fragments, or "" when no field is truthy.
 */
function transceiverToText(row: Record<string, unknown>): string {
  // Table order is significant: it fixes the order of fragments in the output.
  const renderers: ReadonlyArray<readonly [string, (value: unknown) => string]> = [
    ["standard_name", (value) => `${value}`],
    ["form_factor", (value) => `Form factor: ${value}`],
    ["speed", (value) => `Speed: ${value}`],
    ["reach_label", (value) => `Reach: ${value}`],
    ["fiber_type", (value) => `Fiber: ${value}`],
    ["connector", (value) => `Connector: ${value}`],
    ["wavelengths", (value) => `Wavelengths: ${value}`],
    ["wdm_type", (value) => `WDM: ${value}`],
    ["category", (value) => `Category: ${value}`],
    // The flag itself is not printed; its presence adds a fixed phrase.
    ["coherent", () => `Coherent optics`],
    ["power_consumption_w", (value) => `Power: ${value}W`],
    ["temp_range", (value) => `Temperature: ${value}`],
    ["vendor_name", (value) => `Vendor: ${value}`],
    ["description", (value) => `${value}`],
  ];

  const fragments: string[] = [];
  for (const [field, render] of renderers) {
    const value = row[field];
    if (!value) {
      continue;
    }
    const fragment = render(value);
    // A rendering that comes out empty is dropped, just like a falsy field.
    if (fragment) {
      fragments.push(fragment);
    }
  }

  return fragments.join(". ");
}
|
|
|
|
/**
 * Seed the "product_embeddings" collection.
 *
 * Reads every transceiver (left-joined with its vendor name) from the pool,
 * turns each row into text with transceiverToText, embeds that text, and
 * upserts the resulting points in batches. Ends the pool once all batches
 * have been written.
 */
async function main() {
  console.log("=== Seeding product_embeddings ===\n");

  const { rows } = await pool.query(
    `SELECT t.id, t.slug, t.standard_name, t.form_factor, t.speed, t.speed_gbps,
            t.reach_label, t.reach_meters, t.fiber_type, t.connector,
            t.wavelengths, t.wdm_type, t.coherent, t.power_consumption_w,
            t.temp_range, t.category, t.notes as description,
            v.name as vendor_name
     FROM transceivers t
     LEFT JOIN vendors v ON v.id = t.vendor_id
     ORDER BY t.speed_gbps DESC`
  );
  const rowCount = rows.length;

  console.log(`Found ${rowCount} transceivers to embed\n`);

  const BATCH_SIZE = 10;
  let embedded = 0;
  let offset = 0;

  while (offset < rowCount) {
    const chunk = rows.slice(offset, offset + BATCH_SIZE);
    offset += BATCH_SIZE;

    // Rows within a chunk are embedded concurrently; chunks run one after another.
    const points = await Promise.all(
      chunk.map(async (row) => {
        const text = transceiverToText(row);
        const vector = await embed(text);

        // Missing columns fall back to "" / 0 / false in the stored payload.
        const payload = {
          slug: row.slug,
          standard_name: row.standard_name || "",
          form_factor: row.form_factor || "",
          speed: row.speed || "",
          speed_gbps: parseFloat(row.speed_gbps) || 0,
          reach_label: row.reach_label || "",
          reach_meters: row.reach_meters || 0,
          fiber_type: row.fiber_type || "",
          connector: row.connector || "",
          wdm_type: row.wdm_type || "",
          category: row.category || "",
          coherent: row.coherent || false,
          vendor: row.vendor_name || "",
          text,
        };

        return { id: row.id, vector, payload };
      })
    );

    await upsertPoints("product_embeddings", points);
    embedded += points.length;
    console.log(` Embedded ${embedded}/${rowCount} transceivers`);
  }

  console.log(`\n=== Done: ${embedded} products embedded ===`);
  await pool.end();
}
|
|
|
|
// Script entry point: run the seeder; on any failure log it, shut the pool
// down, and exit with a non-zero status.
main().catch(async (err) => {
  console.error("Fatal:", err);
  try {
    // Wait for pool.end() to settle before exiting; previously the returned
    // promise was dropped and process.exit(1) ran immediately after the call.
    await pool.end();
  } catch (closeErr) {
    // pool.end() can itself fail (for example if main already ended the pool);
    // report it, but still exit with the failure status below.
    console.error("Failed to close pool:", closeErr);
  } finally {
    process.exit(1);
  }
});
|