transceiver-db/packages/api/src/embeddings/seed-knowledge-base.ts
Rene Fichtmueller 4b452ab49e feat(scrapers+mcp): ATGBICS + ProLabs scrapers, MCP HTTP/SSE server
Scrapers:
- atgbics.ts: PlaywrightCrawler for UK vendor ATGBICS (Shopify store),
  scrapes SFP/SFP+/SFP28/QSFP+/QSFP28/QSFP-DD in GBP, max 50 pages/run
- prolabs.ts: HttpCrawler for ProLabs (Legrand subsidiary), USD pricing,
  category-driven crawl with reach/fiber/speed detection
- Both registered in scheduler (every 8h, staggered) and index.ts CLI

MCP HTTP Server:
- packages/mcp-server/src/http-server.ts: Express + SSEServerTransport
- Exposes all 12 TIP tools via GET /sse + POST /message
- Bearer token auth (MCP_SECRET env), CORS-configurable
- GET /health → { status: "ok", tools: 12 }
- Port: MCP_HTTP_PORT (default 3201)

SQL + tools:
- sql/006-009: seed scripts for whitebox switches, vendors, assets
- switch-docs.ts: MCP tool for switch documentation queries
2026-03-29 02:26:45 +08:00

100 lines
3.1 KiB
TypeScript

/**
* Seed FAQ and troubleshooting embeddings in Qdrant from knowledge_base.
*
* Run: npx tsx packages/api/src/embeddings/seed-knowledge-base.ts
*/
import { pool } from "../db/client";
import { embed, upsertPoints, type CollectionName } from "./client";
/**
 * Flattens a knowledge_base row into the single text string that gets embedded.
 *
 * The question and answer are always emitted. The subcategory and the
 * form-factor / speed lists are appended only when they carry content, so an
 * empty list no longer produces a dangling "Form factors: " label.
 *
 * @param row - Row to render; reads `question`, `answer`, `subcategory`,
 *   `applies_to_form_factors` and `applies_to_speeds`.
 * @returns The populated parts joined with ". ".
 */
function kbToText(row: Record<string, unknown>): string {
  // Render a list column as "a, b, c". Anything that is not an array (null,
  // undefined, or an unexpected scalar) renders as "" and is therefore skipped
  // instead of throwing on a missing `.join`.
  const joinList = (value: unknown): string =>
    Array.isArray(value) ? value.join(", ") : "";

  const formFactors = joinList(row.applies_to_form_factors);
  const speeds = joinList(row.applies_to_speeds);

  const parts = [
    `Q: ${row.question}`,
    `A: ${row.answer}`,
    row.subcategory && `Topic: ${row.subcategory}`,
    formFactors && `Form factors: ${formFactors}`,
    speeds && `Speeds: ${speeds}`,
  ].filter(Boolean); // drop the falsy placeholders left by absent fields

  return parts.join(". ");
}
/**
 * Picks the embedding collection for a knowledge_base category.
 *
 * @param category - The row's category value.
 * @returns "troubleshooting_embeddings" for the "troubleshooting" and
 *   "known_issue" categories, "faq_embeddings" for every other value.
 */
function collectionForCategory(category: string): CollectionName {
  switch (category) {
    case "troubleshooting":
    case "known_issue":
      return "troubleshooting_embeddings";
    default:
      return "faq_embeddings";
  }
}
/**
 * Script entry point: loads all knowledge_base rows, embeds them in small
 * chunks, and upserts the resulting points into the collection chosen by
 * collectionForCategory. Progress is logged after every chunk and the
 * database pool is closed once all rows have been processed.
 */
async function main(): Promise<void> {
  console.log("=== Seeding knowledge_base embeddings ===\n");

  const queryResult = await pool.query(
    `SELECT id, category, subcategory, question, answer,
applies_to_form_factors, applies_to_speeds, severity, tags
FROM knowledge_base
ORDER BY category, created_at`
  );
  const entries = queryResult.rows;
  const total = entries.length;
  console.log(`Found ${total} knowledge base entries\n`);

  // Rows per chunk; every row of a chunk that targets the same collection is
  // embedded concurrently.
  const chunkSize = 5;
  let faqCount = 0;
  let troubleCount = 0;

  for (let offset = 0; offset < total; offset += chunkSize) {
    const chunk = entries.slice(offset, offset + chunkSize);

    // Bucket the chunk's rows by destination collection.
    const grouped = new Map<CollectionName, typeof chunk>();
    for (const entry of chunk) {
      const target = collectionForCategory(entry.category as string);
      const bucket = grouped.get(target);
      if (bucket) {
        bucket.push(entry);
      } else {
        grouped.set(target, [entry]);
      }
    }

    for (const [collection, members] of grouped) {
      const pending = members.map(async (entry) => {
        const text = kbToText(entry);
        const vector = await embed(text);
        return {
          id: entry.id,
          vector,
          payload: {
            question: entry.question || "",
            answer: entry.answer || "",
            category: entry.category || "",
            subcategory: entry.subcategory || "",
            // symptom / cause / solution mirror question / subcategory / answer.
            symptom: entry.question || "",
            cause: entry.subcategory || "",
            solution: entry.answer || "",
            severity: entry.severity || "info",
            form_factors: entry.applies_to_form_factors || [],
            speeds: entry.applies_to_speeds || [],
            tags: entry.tags || [],
            text,
          },
        };
      });
      const points = await Promise.all(pending);
      await upsertPoints(collection, points);

      if (collection === "faq_embeddings") {
        faqCount += points.length;
      } else {
        troubleCount += points.length;
      }
    }

    // The last chunk may be short, so clamp the progress figure to the total.
    const processed = Math.min(offset + chunkSize, total);
    console.log(` Embedded ${processed}/${total} entries (FAQ: ${faqCount}, Troubleshooting: ${troubleCount})`);
  }

  console.log(`\n=== Done: ${faqCount} FAQ + ${troubleCount} troubleshooting embedded ===`);
  await pool.end();
}
// Run the seeder. On failure, log the error, attempt to close the pool, and
// exit with a non-zero status.
main().catch(async (err: unknown) => {
  console.error("Fatal:", err);
  try {
    // Await the shutdown: process.exit() ends the process immediately, so a
    // pool.end() that is merely started would never get to finish.
    await pool.end();
  } catch (closeErr: unknown) {
    // Closing can itself fail (for example if the pool was already ended);
    // report it but still exit with the failure status below.
    console.error("Failed to close pool:", closeErr);
  }
  process.exit(1);
});