diff --git a/packages/api/src/embeddings/client.ts b/packages/api/src/embeddings/client.ts new file mode 100644 index 0000000..17bcb82 --- /dev/null +++ b/packages/api/src/embeddings/client.ts @@ -0,0 +1,151 @@ +/** + * Embedding + Qdrant client for vector search. + * + * Ollama nomic-embed-text (768 dim) → Qdrant collections. + * Supports: products, datasheets, FAQs, manuals, troubleshooting, news. + */ + +const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434"; +const QDRANT_URL = process.env.QDRANT_URL || "http://localhost:6333"; +const EMBED_MODEL = process.env.EMBED_MODEL || "nomic-embed-text"; + +export type CollectionName = + | "product_embeddings" + | "datasheet_chunks" + | "faq_embeddings" + | "manual_chunks" + | "troubleshooting_embeddings" + | "news_embeddings"; + +/** Generate embedding vector from text */ +export async function embed(text: string): Promise { + const resp = await fetch(`${OLLAMA_URL}/api/embeddings`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ model: EMBED_MODEL, prompt: text }), + signal: AbortSignal.timeout(30000), + }); + + if (!resp.ok) { + throw new Error(`Ollama embed failed: ${resp.status} ${await resp.text()}`); + } + + const data = await resp.json() as { embedding: number[] }; + return data.embedding; +} + +/** Batch embed multiple texts */ +export async function embedBatch(texts: ReadonlyArray): Promise { + const results: number[][] = []; + // Ollama doesn't support batch embedding natively, so we serialize + // with concurrency limit to avoid overloading + const CONCURRENCY = 3; + for (let i = 0; i < texts.length; i += CONCURRENCY) { + const batch = texts.slice(i, i + CONCURRENCY); + const embeddings = await Promise.all(batch.map((t) => embed(t))); + results.push(...embeddings); + } + return results; +} + +/** Upsert a point into Qdrant */ +export async function upsertPoint( + collection: CollectionName, + id: string, + vector: number[], + payload: Record, +): Promise { + const resp = await fetch(`${QDRANT_URL}/collections/${collection}/points`, { + method: "PUT", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + points: [{ id, vector, payload }], + }), + signal: AbortSignal.timeout(10000), + }); + + if (!resp.ok) { + throw new Error(`Qdrant upsert failed: ${resp.status} ${await resp.text()}`); + } +} + +/** Batch upsert points */ +export async function upsertPoints( + collection: CollectionName, + points: ReadonlyArray<{ id: string; vector: number[]; payload: Record }>, +): Promise { + if (points.length === 0) return; + + const resp = await fetch(`${QDRANT_URL}/collections/${collection}/points`, { + method: "PUT", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ points }), + signal: AbortSignal.timeout(30000), + }); + + if (!resp.ok) { + throw new Error(`Qdrant batch upsert failed: ${resp.status} ${await resp.text()}`); + } +} + +/** Search Qdrant with vector similarity + optional payload filter */ +export async function searchSimilar( + collection: CollectionName, + queryVector: number[], + limit: number = 10, + filter?: Record, +): Promise }>> { + const body: Record = { + vector: queryVector, + limit, + with_payload: true, + }; + + if (filter) { + body.filter = filter; + } + + const resp = await fetch(`${QDRANT_URL}/collections/${collection}/points/search`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + signal: AbortSignal.timeout(10000), + }); + + if (!resp.ok) { + throw new Error(`Qdrant search failed: ${resp.status} ${await resp.text()}`); + } + + const data = await resp.json() as { result: Array<{ id: string; score: number; payload: Record }> }; + return data.result; +} + +/** Semantic search: embed query text then search Qdrant */ +export async function semanticSearch( + collection: CollectionName, + query: string, + limit: number = 10, + filter?: Record, +): Promise }>> { + const vector = await embed(query); + return searchSimilar(collection, vector, limit, filter); +} + +/** Get collection info (point count, etc.) */ +export async function getCollectionInfo( + collection: CollectionName, +): Promise<{ pointsCount: number; vectorsCount: number }> { + const resp = await fetch(`${QDRANT_URL}/collections/${collection}`, { + signal: AbortSignal.timeout(5000), + }); + + if (!resp.ok) { + throw new Error(`Qdrant info failed: ${resp.status}`); + } + + const data = await resp.json() as { result: { points_count: number; vectors_count: number } }; + return { + pointsCount: data.result.points_count, + vectorsCount: data.result.vectors_count, + }; +} diff --git a/packages/api/src/embeddings/seed-products.ts b/packages/api/src/embeddings/seed-products.ts new file mode 100644 index 0000000..d3ed41d --- /dev/null +++ b/packages/api/src/embeddings/seed-products.ts @@ -0,0 +1,96 @@ +/** + * Seed product_embeddings collection in Qdrant from PostgreSQL transceivers. + * + * Creates a rich text representation of each transceiver, embeds it via + * Ollama nomic-embed-text, and stores in Qdrant with payload filters. + * + * Run: npx tsx packages/api/src/embeddings/seed-products.ts + */ +import { pool } from "../db/client"; +import { embed, upsertPoints } from "./client"; + +function transceiverToText(row: Record): string { + const parts = [ + row.standard_name && `${row.standard_name}`, + row.form_factor && `Form factor: ${row.form_factor}`, + row.speed && `Speed: ${row.speed}`, + row.reach_label && `Reach: ${row.reach_label}`, + row.fiber_type && `Fiber: ${row.fiber_type}`, + row.connector && `Connector: ${row.connector}`, + row.wavelengths && `Wavelengths: ${row.wavelengths}`, + row.wdm_type && `WDM: ${row.wdm_type}`, + row.category && `Category: ${row.category}`, + row.coherent && `Coherent optics`, + row.power_consumption_w && `Power: ${row.power_consumption_w}W`, + row.temp_range && `Temperature: ${row.temp_range}`, + row.vendor_name && `Vendor: ${row.vendor_name}`, + row.description && `${row.description}`, + ].filter(Boolean); + + return parts.join(". "); +} + +async function main() { + console.log("=== Seeding product_embeddings ===\n"); + + const result = await pool.query( + `SELECT t.id, t.slug, t.standard_name, t.form_factor, t.speed, t.speed_gbps, + t.reach_label, t.reach_meters, t.fiber_type, t.connector, + t.wavelengths, t.wdm_type, t.coherent, t.power_consumption_w, + t.temp_range, t.category, t.notes as description, + v.name as vendor_name + FROM transceivers t + LEFT JOIN vendors v ON v.id = t.vendor_id + ORDER BY t.speed_gbps DESC` + ); + + console.log(`Found ${result.rows.length} transceivers to embed\n`); + + const BATCH_SIZE = 10; + let total = 0; + + for (let i = 0; i < result.rows.length; i += BATCH_SIZE) { + const batch = result.rows.slice(i, i + BATCH_SIZE); + + const points = await Promise.all( + batch.map(async (row) => { + const text = transceiverToText(row); + const vector = await embed(text); + + return { + id: row.id, + vector, + payload: { + slug: row.slug, + standard_name: row.standard_name || "", + form_factor: row.form_factor || "", + speed: row.speed || "", + speed_gbps: parseFloat(row.speed_gbps) || 0, + reach_label: row.reach_label || "", + reach_meters: row.reach_meters || 0, + fiber_type: row.fiber_type || "", + connector: row.connector || "", + wdm_type: row.wdm_type || "", + category: row.category || "", + coherent: row.coherent || false, + vendor: row.vendor_name || "", + text, + }, + }; + }) + ); + + await upsertPoints("product_embeddings", points); + total += points.length; + console.log(` Embedded ${total}/${result.rows.length} transceivers`); + } + + console.log(`\n=== Done: ${total} products embedded ===`); + await pool.end(); +} + +main().catch((err) => { + console.error("Fatal:", err); + pool.end(); + process.exit(1); +}); diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts index 4594fd6..e7309ca 100644 --- a/packages/api/src/index.ts +++ b/packages/api/src/index.ts @@ -9,6 +9,7 @@ import { vendorRouter } from "./routes/vendors"; import { standardRouter } from "./routes/standards"; import { healthRouter } from "./routes/health"; import { hypeCycleRouter } from "./routes/hype-cycle"; +import { searchRouter } from "./routes/search"; const app = express(); @@ -32,6 +33,7 @@ app.use("/api/vendors", vendorRouter); app.use("/api/standards", standardRouter); app.use("/api/health", healthRouter); app.use("/api/hype-cycle", hypeCycleRouter); +app.use("/api/search", searchRouter); // Root app.get("/", (_req, res) => { @@ -49,6 +51,9 @@ app.get("/", (_req, res) => { "GET /api/health", "GET /api/hype-cycle", "GET /api/hype-cycle/:tech", + "GET /api/search?q=&collection=&limit=", + "GET /api/search/products?q=&form_factor=&speed_gbps=&fiber_type=", + "GET /api/search/stats", ], }); }); diff --git a/packages/api/src/routes/search.ts b/packages/api/src/routes/search.ts new file mode 100644 index 0000000..7107a6b --- /dev/null +++ b/packages/api/src/routes/search.ts @@ -0,0 +1,148 @@ +/** + * Semantic search API routes (Qdrant vector search) + * + * GET /api/search?q=&collection=&limit= + * GET /api/search/products?q=&form_factor=&speed_gbps=&fiber_type= + */ +import { Router, Request, Response } from "express"; +import { semanticSearch, getCollectionInfo, CollectionName } from "../embeddings/client"; + +export const searchRouter = Router(); + +const VALID_COLLECTIONS: CollectionName[] = [ + "product_embeddings", + "datasheet_chunks", + "faq_embeddings", + "manual_chunks", + "troubleshooting_embeddings", + "news_embeddings", +]; + +const q = (p: string, req: Request): string | undefined => + req.query[p] ? String(req.query[p]) : undefined; + +// GET /api/search — Generic semantic search across any collection +searchRouter.get("/", async (req: Request, res: Response) => { + const query = q("q", req); + const collection = (q("collection", req) || "product_embeddings") as CollectionName; + const limit = parseInt(q("limit", req) || "10"); + + if (!query) { + res.status(400).json({ success: false, error: "Missing 'q' parameter" }); + return; + } + + if (!VALID_COLLECTIONS.includes(collection)) { + res.status(400).json({ + success: false, + error: `Invalid collection. Valid: ${VALID_COLLECTIONS.join(", ")}`, + }); + return; + } + + try { + const results = await semanticSearch(collection, query, limit); + res.json({ + success: true, + query, + collection, + results: results.map((r) => ({ + id: r.id, + score: Math.round(r.score * 1000) / 1000, + ...r.payload, + })), + count: results.length, + }); + } catch (err) { + res.status(503).json({ + success: false, + error: "Vector search unavailable", + detail: (err as Error).message, + }); + } +}); + +// GET /api/search/products — Product-specific semantic search with filters +searchRouter.get("/products", async (req: Request, res: Response) => { + const query = q("q", req); + const limit = parseInt(q("limit", req) || "10"); + const formFactor = q("form_factor", req); + const speedGbps = q("speed_gbps", req); + const fiberType = q("fiber_type", req); + const wdmType = q("wdm_type", req); + + if (!query) { + res.status(400).json({ success: false, error: "Missing 'q' parameter" }); + return; + } + + // Build Qdrant payload filter + const mustConditions: Array> = []; + if (formFactor) { + mustConditions.push({ key: "form_factor", match: { value: formFactor.toUpperCase() } }); + } + if (speedGbps) { + mustConditions.push({ key: "speed_gbps", match: { value: parseFloat(speedGbps) } }); + } + if (fiberType) { + mustConditions.push({ key: "fiber_type", match: { value: fiberType.toUpperCase() } }); + } + if (wdmType) { + mustConditions.push({ key: "wdm_type", match: { value: wdmType.toUpperCase() } }); + } + + const filter = mustConditions.length > 0 ? { must: mustConditions } : undefined; + + try { + const results = await semanticSearch("product_embeddings", query, limit, filter); + res.json({ + success: true, + query, + filters: { formFactor, speedGbps, fiberType, wdmType }, + results: results.map((r) => ({ + id: r.id, + score: Math.round(r.score * 1000) / 1000, + slug: r.payload.slug, + standard_name: r.payload.standard_name, + form_factor: r.payload.form_factor, + speed: r.payload.speed, + reach: r.payload.reach_label, + fiber_type: r.payload.fiber_type, + connector: r.payload.connector, + category: r.payload.category, + vendor: r.payload.vendor, + })), + count: results.length, + }); + } catch (err) { + res.status(503).json({ + success: false, + error: "Vector search unavailable", + detail: (err as Error).message, + }); + } +}); + +// GET /api/search/stats — Collection statistics +searchRouter.get("/stats", async (_req: Request, res: Response) => { + try { + const stats = await Promise.all( + VALID_COLLECTIONS.map(async (col) => { + try { + const info = await getCollectionInfo(col); + return { collection: col, ...info }; + } catch { + return { collection: col, pointsCount: 0, vectorsCount: 0, error: "unavailable" }; + } + }) + ); + + res.json({ success: true, collections: stats }); + } catch (err) { + res.status(503).json({ + success: false, + error: "Qdrant unavailable", + detail: (err as Error).message, + }); + } +});