/**
 * Vendor Discovery Crawler — Intelligent transceiver catalog spider.
 *
 * Architecture:
 *   vendor catalog URL
 *     → PlaywrightCrawler (Crawlee) — renders JS, handles bot-detection
 *     → page type detection (product vs. listing)
 *     → LLM extraction (core.ts scrapeWithLLM)
 *     → spec physical validation (spec-validator.ts)
 *     → DB persist (db.ts findOrCreateScrapedTransceiver)
 *     → training data (training-data-writer.ts)
 *
 * Each vendor config defines catalog entry points and optional blocklist patterns.
 * The crawler respects rate limits and uses stealth patches to avoid blocking.
 *
 * Run standalone:
 *   tsx packages/scraper/src/crawler-llm/vendor-discovery-crawler.ts [vendor-slug] [--dry-run] [--verbose]
 *
 * Or import and call discoverVendorCatalog() from the scheduler.
 * Scheduler: 8 vendors daily, one per slot at
 * 20:00/22:00/00:00/02:00/04:00/06:00/08:00/10:00 UTC (slots are 2h apart).
 * NOTE(review): this header previously said "3h stagger", which contradicts the
 * listed slots — confirm against the scheduler configuration.
 */

import { PlaywrightCrawler, RequestQueue, Configuration, type Log } from "crawlee";
import { pool, ensureVendor, findOrCreateScrapedTransceiver } from "../utils/db";
import { scrapeWithLLM } from "./core";
import { validateTransceiverSpec, combineValidations, type ExtractedSpec } from "./spec-validator";
import {
  writeExtractionRecord,
  writeDiscoveryRecord,
  finalFlush,
  type CrawlExtraction,
} from "./training-data-writer";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import { createHash } from "crypto";

// ─────────────────────────────────────────────────────────────────────────────
// Vendor catalog registry
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Crawl configuration for a single vendor catalog.
 *
 * `blockPatterns` / `allowPatterns` are evaluated with `RegExp.prototype.test`
 * against the full link URL, so they must not carry the `g` or `y` flag
 * (those make `test` stateful via `lastIndex`).
 */
export interface VendorCatalogConfig {
  /** Short identifier; used for CLI selection, log prefixes and queue names. */
  slug: string;
  /** Display name; also the name passed to `ensureVendor`. */
  name: string;
  /** Vendor home URL; its hostname defines the "same domain" crawl boundary. */
  website: string;
  catalogUrls: string[]; // entry points for the spider
  blockPatterns?: RegExp[]; // URL patterns to skip
  allowPatterns?: RegExp[]; // only follow these URL patterns (if set)
  maxPages?: number; // safety cap (default 200)
  maxDepth?: number; // link-follow depth (default 3)
  delayMs?: number; // polite crawl delay (default 1500)
  // The three fields below are vendor metadata; nothing in the visible part of
  // this file reads them.
  marketStatus?: "Mainstream" | "Growth" | "Emerging" | "Legacy" | "EOL";
  category?: "DataCenter" | "Telecom" | "Industrial" | "Enterprise";
  domSupport?: boolean;
}

/** Vendor catalog registry — add new vendors here */
export const VENDOR_CATALOG_REGISTRY: VendorCatalogConfig[] = [
  {
    slug: "cisco-tmg",
    name: "Cisco",
    website: "https://www.cisco.com",
    catalogUrls: [
      "https://www.cisco.com/c/en/us/products/interfaces-modules/transceiver-modules/index.html",
    ],
    allowPatterns: [/\/transceiver-modules\//, /\/products\/interfaces-modules\//],
    blockPatterns: [/\/support\//, /\/community\//, /signin/, /login/],
    maxPages: 300,
    maxDepth: 4,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
  {
    slug: "juniper",
    name: "Juniper Networks",
    website: "https://www.juniper.net",
    catalogUrls: [
      "https://www.juniper.net/us/en/products/routers/routing-transports/optical-transceiver-modules.html",
    ],
    allowPatterns: [/\/transceiver/, /\/optical/, /\/sfp/, /\/qsfp/],
    blockPatterns: [/\/support\//, /\/community\//, /login/],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
  {
    slug: "arista",
    name: "Arista Networks",
    website: "https://www.arista.com",
    catalogUrls: [
      "https://www.arista.com/en/products/transceivers-cables",
    ],
    blockPatterns: [/\/support\//, /login/],
    maxPages: 150,
    maxDepth: 3,
    delayMs: 1500,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
  {
    slug: "fs-com",
    name: "FS.com",
    website: "https://www.fs.com",
    catalogUrls: [
      "https://www.fs.com/c/fiber-optic-transceivers-3013",
    ],
    blockPatterns: [/\/account/, /\/cart/, /\/checkout/, /login/],
    maxPages: 500,
    maxDepth: 4,
    delayMs: 1000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
  {
    slug: "flexoptix",
    name: "Flexoptix",
    website: "https://www.flexoptix.net",
    catalogUrls: [
      "https://www.flexoptix.net/en/optical-transceivers.html",
    ],
    blockPatterns: [/\/account/, /\/cart/, /\/checkout/, /login/],
    maxPages: 400,
    maxDepth: 3,
    delayMs: 1200,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
  {
    slug: "nokia",
    name: "Nokia",
    website: "https://www.nokia.com",
    catalogUrls: [
      "https://www.nokia.com/networks/products/optical-interfaces/transceiver-modules/",
    ],
    blockPatterns: [/\/support\//, /login/, /\/community\//],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "Telecom",
    domSupport: true,
  },
  {
    slug: "huawei",
    name: "Huawei",
    website: "https://e.huawei.com",
    catalogUrls: [
      "https://e.huawei.com/en/products/optical-transmission/transceiver-modules",
    ],
    blockPatterns: [/\/support\//, /login/],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2500,
    marketStatus: "Mainstream",
    category: "Telecom",
    domSupport: true,
  },
  {
    slug: "ii-vi",
    name: "II-VI / Coherent",
    website: "https://www.coherent.com",
    catalogUrls: [
      "https://www.coherent.com/networking/transceivers",
    ],
    blockPatterns: [/login/, /\/account/],
    maxPages: 150,
    maxDepth: 3,
    delayMs: 1500,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
];

// ─────────────────────────────────────────────────────────────────────────────
// State tracking
// ─────────────────────────────────────────────────────────────────────────────

/** Per-vendor counters accumulated during one crawl and returned to the caller. */
interface CrawlStats {
  /** Requests whose handler actually ran (capped at `maxPages`). */
  pagesVisited: number;
  /** Pages the LLM extraction flagged as product pages. */
  productPagesFound: number;
  /** Product pages whose extraction passed the LLM-side validation. */
  extractionsSucceeded: number;
  /** LLM errors, extractions that failed validation, and failed requests. */
  extractionsFailed: number;
  /** Extractions whose combined spec validation passed. */
  validationPassed: number;
  /** Extractions whose combined spec validation did not pass. */
  validationFailed: number;
  /** Successful `findOrCreateScrapedTransceiver` calls. */
  dbInserted: number;
  /** Training pairs reported as written by `writeExtractionRecord`. */
  trainingPairsWritten: number;
}

// ─────────────────────────────────────────────────────────────────────────────
// HTML cleaning
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Reduces rendered HTML to a single line of visible text.
 *
 * Removes `<script>` and `<style>` elements together with their contents,
 * removes HTML comments, replaces every remaining tag with a space, collapses
 * whitespace runs and trims the result. HTML entities are left undecoded.
 *
 * This is a text-extraction helper for building snippets, not a sanitizer:
 * do not rely on it to make untrusted HTML safe.
 *
 * @param html Raw (rendered) page HTML.
 * @returns Whitespace-normalized text content.
 */
function cleanHtml(html: string): string {
  return html
    // Drop executable/style payloads entirely — their bodies are not page text.
    .replace(/<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi, "")
    .replace(/<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>/gi, "")
    // Drop comments before generic tag stripping so a `>` inside a comment
    // cannot end the match early and leak comment text.
    .replace(/<!--[\s\S]*?-->/g, "")
    // Any remaining tag becomes a separator so adjacent text nodes don't fuse.
    .replace(/<[^>]+>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

// ─────────────────────────────────────────────────────────────────────────────
// URL filtering
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Decides whether a discovered link should be enqueued for the given vendor.
 *
 * A link is followed only when all of the following hold:
 *  1. it parses as an http(s) URL;
 *  2. its host is the vendor's site host (leading `www.` ignored) or a
 *     subdomain of it — an exact/suffix match, not a substring match, so
 *     look-alike hosts such as `home.huawei.com` (for `e.huawei.com`) or
 *     `cisco.com.evil.net` are rejected;
 *  3. it matches none of `config.blockPatterns`;
 *  4. if `config.allowPatterns` is non-empty, it matches at least one of them.
 *
 * @param url    Absolute URL of the candidate link.
 * @param config Vendor configuration supplying the domain and patterns.
 * @returns `true` when the link should be crawled.
 */
function shouldFollowUrl(url: string, config: VendorCatalogConfig): boolean {
  let candidate: URL;
  let vendorHost: string;
  try {
    candidate = new URL(url);
    vendorHost = new URL(config.website).hostname.replace(/^www\./, "");
  } catch {
    // Unparseable link or vendor website: never follow.
    return false;
  }

  // mailto:, javascript:, tel:, ftp: … cannot be rendered by the crawler.
  if (candidate.protocol !== "http:" && candidate.protocol !== "https:") return false;

  // Must be same domain (exact host or a true subdomain).
  const host = candidate.hostname;
  if (host !== vendorHost && !host.endsWith(`.${vendorHost}`)) return false;

  // Block patterns
  if (config.blockPatterns?.some((re) => re.test(url))) return false;

  // Allow patterns (if defined, URL must match at least one)
  if (config.allowPatterns && config.allowPatterns.length > 0) {
    return config.allowPatterns.some((re) => re.test(url));
  }

  return true;
}

// ─────────────────────────────────────────────────────────────────────────────
// Main crawl function
// ─────────────────────────────────────────────────────────────────────────────

/** Returns a length-capped, log-friendly message for any thrown value. */
function describeError(err: unknown, maxLength = 80): string {
  const message = err instanceof Error ? err.message : String(err);
  return message.slice(0, maxLength);
}

/**
 * Crawls one vendor catalog, extracts transceiver specs from product pages and
 * records the results.
 *
 * For every visited page the handler waits `delayMs`, runs `scrapeWithLLM` on
 * the rendered HTML and, for product pages that pass extraction validation,
 * runs the spec validation, optionally persists the part to the DB and writes
 * a training record. Links are then followed up to `maxDepth`, filtered by
 * `shouldFollowUrl`.
 *
 * @param config  Vendor crawl configuration.
 * @param options `dryRun` skips the `findOrCreateScrapedTransceiver` write only;
 *                `ensureVendor` and the training-data writers are still called.
 *                NOTE(review): confirm that is the intended meaning of dry-run.
 *                `verbose` enables per-page logging prefixed with the vendor slug.
 * @returns Counters describing what the crawl visited, extracted and stored.
 * @throws Whatever `ensureVendor`, `RequestQueue.open` or `crawler.run()` reject
 *         with; per-page LLM and DB errors are caught, logged and counted.
 */
export async function discoverVendorCatalog(
  config: VendorCatalogConfig,
  options: { dryRun?: boolean; verbose?: boolean } = {}
): Promise<CrawlStats> {
  const stats: CrawlStats = {
    pagesVisited: 0,
    productPagesFound: 0,
    extractionsSucceeded: 0,
    extractionsFailed: 0,
    validationPassed: 0,
    validationFailed: 0,
    dbInserted: 0,
    trainingPairsWritten: 0,
  };

  const maxPages = config.maxPages ?? 200;
  const maxDepth = config.maxDepth ?? 3;
  const delayMs = config.delayMs ?? 1500;

  const log = (...args: unknown[]): void => {
    if (options.verbose) console.log(`[${config.slug}]`, ...args);
  };

  // Ensure vendor exists in DB
  const vendorId = await ensureVendor(config.name, "distributor", config.website, undefined);
  log(`Vendor ID: ${vendorId}`);

  // A timestamped queue name gives every run a fresh queue.
  const requestQueue = await RequestQueue.open(`vendor-${config.slug}-${Date.now()}`);
  for (const url of config.catalogUrls) {
    await requestQueue.addRequest({ url, userData: { depth: 0 } });
  }

  const crawleeConfig = makeCrawleeConfig(`vendor-discovery-${config.slug}`);
  // URLs already visited or already enqueued during this run.
  const seenUrls = new Set<string>();

  const crawler = new PlaywrightCrawler(
    {
      requestQueue,
      maxRequestsPerCrawl: maxPages,
      maxConcurrency: 1, // polite single-thread crawl
      navigationTimeoutSecs: 30,
      requestHandlerTimeoutSecs: 60,

      async requestHandler({ request, page }) {
        if (stats.pagesVisited >= maxPages) return;
        stats.pagesVisited++;
        seenUrls.add(request.url);

        log(`[${stats.pagesVisited}/${maxPages}] ${request.url}`);

        // Polite delay
        await new Promise<void>((resolve) => setTimeout(resolve, delayMs));

        // Get rendered HTML
        const html = await page.content();

        // Run LLM extraction (with page type detection)
        let llmResult: Awaited<ReturnType<typeof scrapeWithLLM>> | null = null;
        try {
          llmResult = await scrapeWithLLM(html, request.url, {
            vendorSlug: config.slug,
            skipPageDetection: false,
          });
        } catch (err) {
          // A failed extraction must not stop link discovery for this page.
          stats.extractionsFailed++;
          log(`LLM error: ${describeError(err)}`);
        }

        // Process product pages
        if (llmResult?.extraction.is_product_page) {
          stats.productPagesFound++;
          const ext = llmResult.extraction;

          if (llmResult.validation_passed) {
            stats.extractionsSucceeded++;

            // Build spec for physical validation
            const spec: ExtractedSpec = {
              part_number: ext.part_number ?? undefined,
              form_factor: ext.form_factor ?? undefined,
              speed_gbps: ext.speed_gbps ?? undefined,
              fiber_type: undefined, // not in stock extraction — derive later
            };

            // Spec plausibility check
            const specResult = validateTransceiverSpec(spec);
            const combined = combineValidations(specResult, ext.confidence);

            if (combined.passed) {
              stats.validationPassed++;
            } else {
              stats.validationFailed++;
            }

            // Persist to DB (even if spec validation has warnings — just low tier)
            if (!options.dryRun && ext.part_number && combined.adjusted_confidence >= 0.5) {
              try {
                await findOrCreateScrapedTransceiver({
                  partNumber: ext.part_number,
                  vendorId,
                  productUrl: request.url,
                  formFactor: ext.form_factor ?? undefined,
                  speedGbps: ext.speed_gbps ?? undefined,
                  speed: ext.speed_gbps ? `${ext.speed_gbps}G` : undefined,
                });
                stats.dbInserted++;
              } catch (dbErr) {
                log(`DB error: ${describeError(dbErr)}`);
              }
            }

            // Write training data. The text snippet is only needed here, so the
            // HTML is cleaned lazily instead of on every visited page.
            const crawlExt: CrawlExtraction = {
              url: request.url,
              vendor_slug: config.slug,
              vendor_name: config.name,
              spec,
              validation: combined,
              raw_html_snippet: cleanHtml(html).slice(0, 2000),
              crawled_at: new Date().toISOString(),
            };
            const writeResult = writeExtractionRecord(crawlExt);
            if (writeResult.written) {
              stats.trainingPairsWritten += writeResult.pairs;
            }
          } else {
            stats.extractionsFailed++;
            log(`Extraction failed validation: ${llmResult.validation_errors.join("; ")}`);
          }
        }

        // Discover more URLs at current depth
        const currentDepth = (request.userData?.depth as number | undefined) ?? 0;

        if (currentDepth < maxDepth) {
          const links = await page.evaluate(() =>
            Array.from(document.querySelectorAll("a[href]"))
              .map((a) => (a as HTMLAnchorElement).href)
              .filter(Boolean)
          );

          for (const link of links) {
            if (seenUrls.has(link)) continue;
            if (!shouldFollowUrl(link, config)) continue;
            if (stats.pagesVisited >= maxPages) break;

            seenUrls.add(link);
            await requestQueue.addRequest({
              url: link,
              userData: { depth: currentDepth + 1 },
            });
          }
        }
      },

      // Only the request URL and the crawler logger are needed here, so the
      // context is typed structurally.
      failedRequestHandler({ request, log: crawleeLog }: { request: { url: string }; log: Log }) {
        stats.extractionsFailed++;
        crawleeLog.error(`Failed: ${request.url}`);
      },
    },
    crawleeConfig
  );

  await crawler.run();

  // Write discovery record + final flush
  writeDiscoveryRecord(config.slug, config.name, config.catalogUrls[0], stats.productPagesFound);
  finalFlush(config.slug);

  console.log(`\n=== ${config.name} Discovery Complete ===`);
  console.log(` Pages visited: ${stats.pagesVisited}`);
  console.log(` Product pages: ${stats.productPagesFound}`);
  console.log(` Extractions OK: ${stats.extractionsSucceeded}`);
  console.log(` Spec valid: ${stats.validationPassed}`);
  console.log(` DB inserts: ${stats.dbInserted}`);
  console.log(` Training pairs: ${stats.trainingPairsWritten}\n`);

  return stats;
}

// ─────────────────────────────────────────────────────────────────────────────
// Batch runner — crawl multiple vendors in sequence
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Crawls several vendors one after another.
 *
 * A failure in one vendor's crawl is logged and does not stop the batch.
 *
 * @param vendorSlugs Slugs to crawl; when omitted, every registry entry is
 *                    crawled. Slugs not present in the registry are reported
 *                    with a warning and skipped.
 * @param options     Forwarded unchanged to `discoverVendorCatalog`.
 */
export async function runVendorDiscoveryBatch(
  vendorSlugs?: string[],
  options: { dryRun?: boolean; verbose?: boolean } = {}
): Promise<void> {
  const targets = vendorSlugs
    ? VENDOR_CATALOG_REGISTRY.filter((v) => vendorSlugs.includes(v.slug))
    : VENDOR_CATALOG_REGISTRY;

  if (vendorSlugs) {
    // Surface typos instead of silently crawling fewer vendors than requested.
    const knownSlugs = new Set(VENDOR_CATALOG_REGISTRY.map((v) => v.slug));
    const unknownSlugs = vendorSlugs.filter((slug) => !knownSlugs.has(slug));
    if (unknownSlugs.length > 0) {
      console.warn(`Ignoring unknown vendor slug(s): ${unknownSlugs.join(", ")}`);
    }
  }

  console.log(`Starting vendor discovery for ${targets.length} vendor(s)...`);

  for (const config of targets) {
    try {
      await discoverVendorCatalog(config, options);
    } catch (err) {
      console.error(`[${config.slug}] Fatal crawl error:`, describeError(err, Infinity));
    }
  }

  console.log("Vendor discovery batch complete.");
}

// ─────────────────────────────────────────────────────────────────────────────
// Standalone execution
// ─────────────────────────────────────────────────────────────────────────────

if (require.main === module) {
  const cliArgs = process.argv.slice(2);
  // optional: specific vendor slug — the first argument that is not a --flag,
  // so `--dry-run` / `--verbose` may appear in any position.
  const target = cliArgs.find((arg) => !arg.startsWith("--"));
  const dryRun = cliArgs.includes("--dry-run");
  const verbose = cliArgs.includes("--verbose");

  const run = async (): Promise<void> => {
    if (target) {
      const config = VENDOR_CATALOG_REGISTRY.find((v) => v.slug === target);
      if (!config) {
        console.error(`Unknown vendor slug: ${target}`);
        console.log("Available:", VENDOR_CATALOG_REGISTRY.map((v) => v.slug).join(", "));
        process.exit(1);
      }
      await discoverVendorCatalog(config, { dryRun, verbose });
    } else {
      await runVendorDiscoveryBatch(undefined, { dryRun, verbose });
    }
  };

  const main = async (): Promise<void> => {
    let failed = false;
    try {
      await run();
    } catch (err) {
      console.error("Fatal:", err);
      failed = true;
    }
    // Close the pool exactly once and wait for it before deciding the exit
    // code, whether or not the crawl itself succeeded.
    try {
      await pool.end();
    } catch (err) {
      console.error("Failed to close DB pool:", err);
      failed = true;
    }
    if (failed) process.exit(1);
  };

  void main();
}