transceiver-db/packages/scraper/src/crawler-llm/vendor-discovery-crawler.ts

475 lines
18 KiB
TypeScript

/**
* Vendor Discovery Crawler — Intelligent transceiver catalog spider.
*
* Architecture:
* vendor catalog URL
* → PlaywrightCrawler (Crawlee) — renders JS, handles bot-detection
* → page type detection (product vs. listing)
* → LLM extraction (core.ts scrapeWithLLM)
* → spec physical validation (spec-validator.ts)
* → DB persist (db.ts findOrCreateScrapedTransceiver)
* → training data (training-data-writer.ts)
*
* Each vendor config defines catalog entry points and optional blocklist patterns.
* The crawler respects rate limits and uses stealth patches to avoid blocking.
*
* Run standalone:
* tsx packages/scraper/src/crawler-llm/vendor-discovery-crawler.ts
*
* Or import and call discoverVendorCatalog() from the scheduler.
* Scheduler: 8 vendors daily, 2h stagger (20:00/22:00/00:00/02:00/04:00/06:00/08:00/10:00 UTC).
*/
import { PlaywrightCrawler, RequestQueue, Configuration, type Log } from "crawlee";
import { pool, ensureVendor, findOrCreateScrapedTransceiver } from "../utils/db";
import { scrapeWithLLM } from "./core";
import { validateTransceiverSpec, combineValidations, type ExtractedSpec } from "./spec-validator";
import {
writeExtractionRecord,
writeDiscoveryRecord,
finalFlush,
type CrawlExtraction,
} from "./training-data-writer";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import { createHash } from "crypto";
// ─────────────────────────────────────────────────────────────────────────────
// Vendor catalog registry
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Static description of one vendor catalog and the limits applied while crawling it.
 *
 * Optional limits fall back to the defaults noted on each field; those defaults
 * are applied by the crawl function, not by this type.
 */
export interface VendorCatalogConfig {
  /** Short identifier; used for log prefixes, queue/config names and training records. */
  slug: string;

  /** Display name; passed to the vendor lookup and printed in the run summary. */
  name: string;

  /** Vendor home URL; its hostname is the basis of the same-site link filter. */
  website: string;

  /** Entry points for the spider; each is enqueued at depth 0. */
  catalogUrls: string[];

  /** URL patterns to skip; a link matching any of these is never followed. */
  blockPatterns?: RegExp[];

  /** If set and non-empty, only links matching at least one pattern are followed. */
  allowPatterns?: RegExp[];

  /** Safety cap on pages handled per run (default 200). */
  maxPages?: number;

  /** Link-follow depth measured from the entry points (default 3). */
  maxDepth?: number;

  /** Polite delay in milliseconds applied on every handled page (default 1500). */
  delayMs?: number;

  /** Vendor metadata; not read by the crawl code in this file. */
  marketStatus?: "Mainstream" | "Growth" | "Emerging" | "Legacy" | "EOL";

  /** Vendor metadata; not read by the crawl code in this file. */
  category?: "DataCenter" | "Telecom" | "Industrial" | "Enterprise";

  /** Vendor metadata; not read by the crawl code in this file. */
  domSupport?: boolean;
}
/**
 * Vendor catalog registry — add new vendors here.
 *
 * One entry per vendor. The batch runner walks this list in order, and the
 * standalone runner looks entries up by `slug`.
 */
export const VENDOR_CATALOG_REGISTRY: VendorCatalogConfig[] = [
  // ── Cisco ────────────────────────────────────────────────────────────────
  {
    slug: "cisco-tmg",
    name: "Cisco",
    website: "https://www.cisco.com",
    catalogUrls: ["https://www.cisco.com/c/en/us/products/interfaces-modules/transceiver-modules/index.html"],
    allowPatterns: [
      /\/transceiver-modules\//,
      /\/products\/interfaces-modules\//,
    ],
    blockPatterns: [
      /\/support\//,
      /\/community\//,
      /signin/,
      /login/,
    ],
    maxPages: 300,
    maxDepth: 4,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── Juniper Networks ─────────────────────────────────────────────────────
  {
    slug: "juniper",
    name: "Juniper Networks",
    website: "https://www.juniper.net",
    catalogUrls: ["https://www.juniper.net/us/en/products/routers/routing-transports/optical-transceiver-modules.html"],
    allowPatterns: [
      /\/transceiver/,
      /\/optical/,
      /\/sfp/,
      /\/qsfp/,
    ],
    blockPatterns: [
      /\/support\//,
      /\/community\//,
      /login/,
    ],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── Arista Networks ──────────────────────────────────────────────────────
  {
    slug: "arista",
    name: "Arista Networks",
    website: "https://www.arista.com",
    catalogUrls: ["https://www.arista.com/en/products/transceivers-cables"],
    blockPatterns: [
      /\/support\//,
      /login/,
    ],
    maxPages: 150,
    maxDepth: 3,
    delayMs: 1500,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── FS.com ───────────────────────────────────────────────────────────────
  {
    slug: "fs-com",
    name: "FS.com",
    website: "https://www.fs.com",
    catalogUrls: ["https://www.fs.com/c/fiber-optic-transceivers-3013"],
    blockPatterns: [
      /\/account/,
      /\/cart/,
      /\/checkout/,
      /login/,
    ],
    maxPages: 500,
    maxDepth: 4,
    delayMs: 1000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── Flexoptix ────────────────────────────────────────────────────────────
  {
    slug: "flexoptix",
    name: "Flexoptix",
    website: "https://www.flexoptix.net",
    catalogUrls: ["https://www.flexoptix.net/en/optical-transceivers.html"],
    blockPatterns: [
      /\/account/,
      /\/cart/,
      /\/checkout/,
      /login/,
    ],
    maxPages: 400,
    maxDepth: 3,
    delayMs: 1200,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── Nokia ────────────────────────────────────────────────────────────────
  {
    slug: "nokia",
    name: "Nokia",
    website: "https://www.nokia.com",
    catalogUrls: ["https://www.nokia.com/networks/products/optical-interfaces/transceiver-modules/"],
    blockPatterns: [
      /\/support\//,
      /login/,
      /\/community\//,
    ],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "Telecom",
    domSupport: true,
  },

  // ── Huawei ───────────────────────────────────────────────────────────────
  {
    slug: "huawei",
    name: "Huawei",
    website: "https://e.huawei.com",
    catalogUrls: ["https://e.huawei.com/en/products/optical-transmission/transceiver-modules"],
    blockPatterns: [
      /\/support\//,
      /login/,
    ],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2500,
    marketStatus: "Mainstream",
    category: "Telecom",
    domSupport: true,
  },

  // ── II-VI / Coherent ─────────────────────────────────────────────────────
  {
    slug: "ii-vi",
    name: "II-VI / Coherent",
    website: "https://www.coherent.com",
    catalogUrls: ["https://www.coherent.com/networking/transceivers"],
    blockPatterns: [
      /login/,
      /\/account/,
    ],
    maxPages: 150,
    maxDepth: 3,
    delayMs: 1500,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
];
// ─────────────────────────────────────────────────────────────────────────────
// State tracking
// ─────────────────────────────────────────────────────────────────────────────
/** Counters accumulated over a single vendor crawl and returned to the caller. */
interface CrawlStats {
  /** Requests that entered the page handler (bounded by the page cap). */
  pagesVisited: number;

  /** Pages whose LLM extraction reported `is_product_page`. */
  productPagesFound: number;

  /** Product pages whose LLM result reported `validation_passed`. */
  extractionsSucceeded: number;

  /**
   * Failures of any kind: the LLM call threw, a product-page extraction failed
   * validation, or the crawler gave up on a request.
   */
  extractionsFailed: number;

  /** Extractions whose combined spec/confidence validation passed. */
  validationPassed: number;

  /** Extractions whose combined spec/confidence validation did not pass. */
  validationFailed: number;

  /** Successful find-or-create calls against the database. */
  dbInserted: number;

  /** Sum of the pair counts reported by the training-data writer. */
  trainingPairsWritten: number;
}
// ─────────────────────────────────────────────────────────────────────────────
// HTML cleaning
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Reduce an HTML document to a single line of plain text.
 *
 * Script blocks, style blocks and HTML comments are removed together with
 * their contents; every remaining tag becomes a space so adjacent text nodes
 * do not run together; whitespace is then collapsed and trimmed.
 *
 * This is regex-based stripping, not a parser: entities are left encoded.
 *
 * @param html Raw (rendered) page markup.
 * @returns Whitespace-normalised text content.
 */
function cleanHtml(html: string): string {
  // Order matters: drop whole script/style/comment bodies before the generic
  // tag pass, otherwise their contents would survive as text.
  const noScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "");
  const noStyles = noScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
  const noComments = noStyles.replace(/<!--[\s\S]*?-->/g, "");
  const tagsAsSpaces = noComments.replace(/<[^>]+>/g, " ");
  const singleSpaced = tagsAsSpaces.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
// ─────────────────────────────────────────────────────────────────────────────
// URL filtering
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Decide whether a discovered link belongs to the vendor's site and is
 * permitted by the vendor's block/allow patterns.
 *
 * A link is followed only if all of the following hold:
 *  1. it parses as an http(s) URL;
 *  2. its hostname is the vendor's hostname (leading `www.` ignored) or a
 *     subdomain of it;
 *  3. it matches none of `config.blockPatterns`;
 *  4. when `config.allowPatterns` is non-empty, it matches at least one of them.
 *
 * @param url    Absolute URL of the candidate link.
 * @param config Vendor configuration supplying `website` and the patterns.
 * @returns `true` if the crawler should enqueue the link.
 */
function shouldFollowUrl(url: string, config: VendorCatalogConfig): boolean {
  let candidate: URL;
  let vendorDomain: string;
  try {
    candidate = new URL(url);
    // Anchor the strip: only a leading "www." is a prefix, not any occurrence.
    vendorDomain = new URL(config.website).hostname.replace(/^www\./, "");
  } catch {
    return false;
  }

  // The browser crawler can only navigate web pages; skip mailto:, ftp:, etc.
  if (candidate.protocol !== "http:" && candidate.protocol !== "https:") return false;

  // Compare on label boundaries. A substring test would accept look-alike
  // hosts such as "evilcisco.com" or "cisco.com.attacker.net".
  const host = candidate.hostname;
  const sameSite = host === vendorDomain || host.endsWith(`.${vendorDomain}`);
  if (!sameSite) return false;

  // Block patterns always win over allow patterns.
  if (config.blockPatterns?.some((re) => re.test(url))) return false;

  // Allow patterns (if defined, URL must match at least one).
  const allow = config.allowPatterns;
  if (allow && allow.length > 0) {
    return allow.some((re) => re.test(url));
  }
  return true;
}
// ─────────────────────────────────────────────────────────────────────────────
// Main crawl function
// ─────────────────────────────────────────────────────────────────────────────
/** Best-effort message for a caught value, which is not guaranteed to be an `Error`. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Crawl one vendor's catalog, extract transceiver specs from product pages and
 * record the results.
 *
 * For every page handled (single-threaded, with a polite delay):
 *  1. the rendered HTML is passed to `scrapeWithLLM` with page-type detection on;
 *  2. product pages that pass LLM validation are checked with
 *     `validateTransceiverSpec` / `combineValidations`;
 *  3. unless `dryRun` is set, extractions with a part number and an adjusted
 *     confidence of at least 0.5 are persisted via `findOrCreateScrapedTransceiver`;
 *  4. a training record is written for every LLM-validated product page;
 *  5. same-site links allowed by `shouldFollowUrl` are enqueued one level deeper,
 *     up to `config.maxDepth`.
 *
 * After the crawl a discovery record is written, the training writer is
 * flushed, and a summary is printed to stdout.
 *
 * The per-run request queue is dropped when the crawl ends (whether it
 * succeeded or threw) so repeated scheduler runs do not accumulate queues.
 *
 * @param config  Vendor to crawl (entry points, limits, URL patterns).
 * @param options `dryRun` skips DB writes; `verbose` enables per-page logging.
 * @returns Counters describing what the crawl did.
 * @throws Propagates failures from the vendor lookup, queue setup and `crawler.run()`.
 */
export async function discoverVendorCatalog(
  config: VendorCatalogConfig,
  options: { dryRun?: boolean; verbose?: boolean } = {}
): Promise<CrawlStats> {
  const stats: CrawlStats = {
    pagesVisited: 0,
    productPagesFound: 0,
    extractionsSucceeded: 0,
    extractionsFailed: 0,
    validationPassed: 0,
    validationFailed: 0,
    dbInserted: 0,
    trainingPairsWritten: 0,
  };
  const maxPages = config.maxPages ?? 200;
  const maxDepth = config.maxDepth ?? 3;
  const delayMs = config.delayMs ?? 1500;
  const log = (...args: unknown[]): void => {
    if (options.verbose) console.log(`[${config.slug}]`, ...args);
  };

  // Ensure vendor exists in DB.
  // NOTE(review): every vendor is registered with type "distributor", including
  // equipment makers — confirm this is the intended vendor type.
  const vendorId = await ensureVendor(config.name, "distributor", config.website, undefined);
  log(`Vendor ID: ${vendorId}`);

  // The timestamp makes the queue name unique per run, so a previous run's
  // queue is never reused.
  const requestQueue = await RequestQueue.open(`vendor-${config.slug}-${Date.now()}`);
  try {
    for (const url of config.catalogUrls) {
      await requestQueue.addRequest({ url, userData: { depth: 0 } });
    }
    const crawleeConfig = makeCrawleeConfig(`vendor-discovery-${config.slug}`);
    const seenUrls = new Set<string>();

    const crawler = new PlaywrightCrawler(
      {
        requestQueue,
        maxRequestsPerCrawl: maxPages,
        maxConcurrency: 1, // polite single-thread crawl
        navigationTimeoutSecs: 30,
        requestHandlerTimeoutSecs: 60,

        async requestHandler({ request, page }) {
          if (stats.pagesVisited >= maxPages) return;
          stats.pagesVisited++;
          seenUrls.add(request.url);
          log(`[${stats.pagesVisited}/${maxPages}] ${request.url}`);

          // Polite delay
          await new Promise((r) => setTimeout(r, delayMs));

          // Get rendered HTML
          const html = await page.content();

          // Run LLM extraction (with page type detection)
          let llmResult: Awaited<ReturnType<typeof scrapeWithLLM>> | null = null;
          try {
            llmResult = await scrapeWithLLM(html, request.url, {
              vendorSlug: config.slug,
              skipPageDetection: false,
            });
          } catch (err) {
            stats.extractionsFailed++;
            log(`LLM error: ${errorMessage(err).slice(0, 80)}`);
          }

          // Process product pages
          if (llmResult?.extraction.is_product_page) {
            stats.productPagesFound++;
            const ext = llmResult.extraction;
            if (llmResult.validation_passed) {
              stats.extractionsSucceeded++;

              // Build spec for physical validation
              const spec: ExtractedSpec = {
                part_number: ext.part_number ?? undefined,
                form_factor: ext.form_factor ?? undefined,
                speed_gbps: ext.speed_gbps ?? undefined,
                fiber_type: undefined, // not in stock extraction — derive later
              };

              // Spec plausibility check
              const specResult = validateTransceiverSpec(spec);
              const combined = combineValidations(specResult, ext.confidence);
              if (combined.passed) {
                stats.validationPassed++;
              } else {
                stats.validationFailed++;
              }

              // Persist to DB (even if spec validation has warnings — just low tier)
              if (!options.dryRun && ext.part_number && combined.adjusted_confidence >= 0.5) {
                try {
                  await findOrCreateScrapedTransceiver({
                    partNumber: ext.part_number,
                    vendorId,
                    productUrl: request.url,
                    formFactor: ext.form_factor ?? undefined,
                    speedGbps: ext.speed_gbps ?? undefined,
                    speed: ext.speed_gbps ? `${ext.speed_gbps}G` : undefined,
                  });
                  stats.dbInserted++;
                } catch (dbErr) {
                  // A DB failure must not abort the crawl; log it and move on.
                  log(`DB error: ${errorMessage(dbErr).slice(0, 80)}`);
                }
              }

              // Write training data. The text snippet is only needed here, so
              // the HTML is cleaned lazily rather than on every page.
              const crawlExt: CrawlExtraction = {
                url: request.url,
                vendor_slug: config.slug,
                vendor_name: config.name,
                spec,
                validation: combined,
                raw_html_snippet: cleanHtml(html).slice(0, 2000),
                crawled_at: new Date().toISOString(),
              };
              const writeResult = writeExtractionRecord(crawlExt);
              if (writeResult.written) {
                stats.trainingPairsWritten += writeResult.pairs;
              }
            } else {
              stats.extractionsFailed++;
              log(`Extraction failed validation: ${llmResult.validation_errors.join("; ")}`);
            }
          }

          // Discover more URLs at current depth. Skipped once the page cap is
          // reached: nothing enqueued after that point would be handled.
          const rawDepth: unknown = request.userData?.depth;
          const currentDepth = typeof rawDepth === "number" ? rawDepth : 0;
          if (currentDepth < maxDepth && stats.pagesVisited < maxPages) {
            const links = await page.evaluate(() =>
              Array.from(document.querySelectorAll("a[href]"))
                .map((a) => (a as HTMLAnchorElement).href)
                .filter(Boolean)
            );
            for (const link of links) {
              if (seenUrls.has(link)) continue;
              if (!shouldFollowUrl(link, config)) continue;
              seenUrls.add(link);
              await requestQueue.addRequest({
                url: link,
                userData: { depth: currentDepth + 1 },
              });
            }
          }
        },

        // Called by the crawler once a request is given up on.
        failedRequestHandler({ request, log: crawleeLog }) {
          stats.extractionsFailed++;
          crawleeLog.error(`Failed: ${request.url}`);
        },
      },
      crawleeConfig
    );

    await crawler.run();
  } finally {
    // The queue name is unique per run and never reopened, so leaving it in
    // storage only accumulates stale queues. Cleanup is best-effort: a failure
    // here must not mask the crawl's own outcome.
    try {
      await requestQueue.drop();
    } catch (dropErr) {
      log(`Queue cleanup failed: ${errorMessage(dropErr).slice(0, 80)}`);
    }
  }

  // Write discovery record + final flush
  writeDiscoveryRecord(config.slug, config.name, config.catalogUrls[0], stats.productPagesFound);
  finalFlush(config.slug);

  console.log(`\n=== ${config.name} Discovery Complete ===`);
  console.log(` Pages visited: ${stats.pagesVisited}`);
  console.log(` Product pages: ${stats.productPagesFound}`);
  console.log(` Extractions OK: ${stats.extractionsSucceeded}`);
  console.log(` Spec valid: ${stats.validationPassed}`);
  console.log(` DB inserts: ${stats.dbInserted}`);
  console.log(` Training pairs: ${stats.trainingPairsWritten}\n`);
  return stats;
}
// ─────────────────────────────────────────────────────────────────────────────
// Batch runner — crawl multiple vendors in sequence
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Crawl several vendors one after another.
 *
 * A failure in one vendor's crawl is logged and does not stop the remaining
 * vendors from running.
 *
 * @param vendorSlugs Slugs to crawl, in registry order. When omitted, every
 *                    vendor in the registry is crawled. Slugs that match no
 *                    registry entry are reported and skipped.
 * @param options     Passed through to each vendor crawl (`dryRun`, `verbose`).
 */
export async function runVendorDiscoveryBatch(
  vendorSlugs?: string[],
  options: { dryRun?: boolean; verbose?: boolean } = {}
): Promise<void> {
  let targets = VENDOR_CATALOG_REGISTRY;
  if (vendorSlugs) {
    const requested = new Set(vendorSlugs);
    targets = VENDOR_CATALOG_REGISTRY.filter((v) => requested.has(v.slug));

    // A mistyped slug would otherwise drop a vendor from the batch silently;
    // the standalone runner already reports unknown slugs, so do the same here.
    const known = new Set(VENDOR_CATALOG_REGISTRY.map((v) => v.slug));
    const unknown = vendorSlugs.filter((slug) => !known.has(slug));
    if (unknown.length > 0) {
      console.warn(`Unknown vendor slug(s) ignored: ${unknown.join(", ")}`);
    }
  }

  console.log(`Starting vendor discovery for ${targets.length} vendor(s)...`);
  for (const config of targets) {
    try {
      await discoverVendorCatalog(config, options);
    } catch (err) {
      // The caught value is not guaranteed to be an Error instance.
      const message = err instanceof Error ? err.message : String(err);
      console.error(`[${config.slug}] Fatal crawl error:`, message);
    }
  }
  console.log("Vendor discovery batch complete.");
}
// ─────────────────────────────────────────────────────────────────────────────
// Standalone execution
// ─────────────────────────────────────────────────────────────────────────────
// Runs only when this file is executed directly, not when it is imported.
//
// Usage: tsx vendor-discovery-crawler.ts [vendor-slug] [--dry-run] [--verbose]
if (require.main === module) {
  const cliArgs = process.argv.slice(2);
  // The slug is the first argument that is not a flag. Taking argv[2] blindly
  // would treat `--dry-run` as a vendor slug when no slug is given.
  const target = cliArgs.find((arg) => !arg.startsWith("--")); // optional: specific vendor slug
  const dryRun = cliArgs.includes("--dry-run");
  const verbose = cliArgs.includes("--verbose");

  const run = async (): Promise<void> => {
    if (target) {
      const config = VENDOR_CATALOG_REGISTRY.find((v) => v.slug === target);
      if (!config) {
        console.error(`Unknown vendor slug: ${target}`);
        console.log("Available:", VENDOR_CATALOG_REGISTRY.map((v) => v.slug).join(", "));
        process.exit(1);
      }
      await discoverVendorCatalog(config, { dryRun, verbose });
    } else {
      await runVendorDiscoveryBatch(undefined, { dryRun, verbose });
    }
  };

  run()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      // Wait for the pool to close before exiting. This handler also runs when
      // the pool.end() above rejected, so a second failure is ignored.
      try {
        await pool.end();
      } catch {
        // Nothing further can be done; fall through to the non-zero exit.
      }
      process.exit(1);
    });
}