475 lines
18 KiB
TypeScript
475 lines
18 KiB
TypeScript
/**
 * Vendor Discovery Crawler — Intelligent transceiver catalog spider.
 *
 * Architecture:
 *   vendor catalog URL
 *     → PlaywrightCrawler (Crawlee) — renders JS, handles bot-detection
 *     → page type detection (product vs. listing)
 *     → LLM extraction (core.ts scrapeWithLLM)
 *     → spec physical validation (spec-validator.ts)
 *     → DB persist (db.ts findOrCreateScrapedTransceiver)
 *     → training data (training-data-writer.ts)
 *
 * Each vendor config defines catalog entry points and optional allow/block URL patterns.
 * The crawler respects rate limits and uses stealth patches to avoid blocking.
 *
 * Run standalone:
 *   tsx packages/scraper/src/crawler-llm/vendor-discovery-crawler.ts
 *
 * Or import and call discoverVendorCatalog() from the scheduler.
 * Scheduler: 8 vendors daily, 2h stagger (20:00/22:00/00:00/02:00/04:00/06:00/08:00/10:00 UTC).
 */
|
|
|
|
import { PlaywrightCrawler, RequestQueue, Configuration, type Log } from "crawlee";
|
|
import { pool, ensureVendor, findOrCreateScrapedTransceiver } from "../utils/db";
|
|
import { scrapeWithLLM } from "./core";
|
|
import { validateTransceiverSpec, combineValidations, type ExtractedSpec } from "./spec-validator";
|
|
import {
|
|
writeExtractionRecord,
|
|
writeDiscoveryRecord,
|
|
finalFlush,
|
|
type CrawlExtraction,
|
|
} from "./training-data-writer";
|
|
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
|
import { createHash } from "crypto";
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Vendor catalog registry
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Crawl settings for one vendor's transceiver catalog.
 *
 * Only `slug`, `name`, `website` and `catalogUrls` are required; the numeric
 * limits fall back to defaults inside `discoverVendorCatalog`.
 */
export interface VendorCatalogConfig {
  /** Short vendor identifier; used for log prefixes, queue names and CLI lookup. */
  slug: string;

  /** Display name; passed to `ensureVendor` and printed in the crawl summary. */
  name: string;

  /** Vendor home page; its hostname defines the domain the spider stays on. */
  website: string;

  /** Entry points for the spider. */
  catalogUrls: string[];

  /** URL patterns to skip. */
  blockPatterns?: RegExp[];

  /** Only follow URLs matching at least one of these patterns (if set). */
  allowPatterns?: RegExp[];

  /** Safety cap on pages per crawl (default 200). */
  maxPages?: number;

  /** Link-follow depth (default 3). */
  maxDepth?: number;

  /** Polite crawl delay in milliseconds (default 1500). */
  delayMs?: number;

  /** Vendor metadata; not read by this module. */
  marketStatus?: "Mainstream" | "Growth" | "Emerging" | "Legacy" | "EOL";

  /** Vendor metadata; not read by this module. */
  category?: "DataCenter" | "Telecom" | "Industrial" | "Enterprise";

  /** Vendor metadata; not read by this module. */
  domSupport?: boolean;
}
|
|
|
|
/**
 * Vendor catalog registry — add new vendors here.
 *
 * One entry per vendor; `runVendorDiscoveryBatch` crawls them in array order
 * and the standalone CLI looks entries up by `slug`.
 */
export const VENDOR_CATALOG_REGISTRY: VendorCatalogConfig[] = [
  // Cisco — restricted to transceiver / interface-module sections via allowPatterns.
  {
    slug: "cisco-tmg",
    name: "Cisco",
    website: "https://www.cisco.com",
    catalogUrls: [
      "https://www.cisco.com/c/en/us/products/interfaces-modules/transceiver-modules/index.html",
    ],
    allowPatterns: [/\/transceiver-modules\//, /\/products\/interfaces-modules\//],
    blockPatterns: [/\/support\//, /\/community\//, /signin/, /login/],
    maxPages: 300,
    maxDepth: 4,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // Juniper Networks — allowPatterns keep the spider on optics-related paths.
  {
    slug: "juniper",
    name: "Juniper Networks",
    website: "https://www.juniper.net",
    catalogUrls: [
      "https://www.juniper.net/us/en/products/routers/routing-transports/optical-transceiver-modules.html",
    ],
    allowPatterns: [/\/transceiver/, /\/optical/, /\/sfp/, /\/qsfp/],
    blockPatterns: [/\/support\//, /\/community\//, /login/],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // Arista Networks
  {
    slug: "arista",
    name: "Arista Networks",
    website: "https://www.arista.com",
    catalogUrls: ["https://www.arista.com/en/products/transceivers-cables"],
    blockPatterns: [/\/support\//, /login/],
    maxPages: 150,
    maxDepth: 3,
    delayMs: 1500,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // FS.com — largest page budget in the registry.
  {
    slug: "fs-com",
    name: "FS.com",
    website: "https://www.fs.com",
    catalogUrls: ["https://www.fs.com/c/fiber-optic-transceivers-3013"],
    blockPatterns: [/\/account/, /\/cart/, /\/checkout/, /login/],
    maxPages: 500,
    maxDepth: 4,
    delayMs: 1000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // Flexoptix
  {
    slug: "flexoptix",
    name: "Flexoptix",
    website: "https://www.flexoptix.net",
    catalogUrls: ["https://www.flexoptix.net/en/optical-transceivers.html"],
    blockPatterns: [/\/account/, /\/cart/, /\/checkout/, /login/],
    maxPages: 400,
    maxDepth: 3,
    delayMs: 1200,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // Nokia
  {
    slug: "nokia",
    name: "Nokia",
    website: "https://www.nokia.com",
    catalogUrls: [
      "https://www.nokia.com/networks/products/optical-interfaces/transceiver-modules/",
    ],
    blockPatterns: [/\/support\//, /login/, /\/community\//],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "Telecom",
    domSupport: true,
  },

  // Huawei — enterprise site lives on the e.huawei.com subdomain.
  {
    slug: "huawei",
    name: "Huawei",
    website: "https://e.huawei.com",
    catalogUrls: [
      "https://e.huawei.com/en/products/optical-transmission/transceiver-modules",
    ],
    blockPatterns: [/\/support\//, /login/],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2500,
    marketStatus: "Mainstream",
    category: "Telecom",
    domSupport: true,
  },

  // II-VI / Coherent
  {
    slug: "ii-vi",
    name: "II-VI / Coherent",
    website: "https://www.coherent.com",
    catalogUrls: ["https://www.coherent.com/networking/transceivers"],
    blockPatterns: [/login/, /\/account/],
    maxPages: 150,
    maxDepth: 3,
    delayMs: 1500,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
];
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// State tracking
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/** Counters accumulated over a single `discoverVendorCatalog` run. */
interface CrawlStats {
  /** Requests whose handler ran (bounded by the page cap). */
  pagesVisited: number;

  /** Pages the LLM extraction flagged as product pages. */
  productPagesFound: number;

  /** Product-page extractions whose LLM-side validation passed. */
  extractionsSucceeded: number;

  /** LLM call errors, extractions that failed validation, and requests that failed outright. */
  extractionsFailed: number;

  /** Extractions whose combined spec/confidence validation passed. */
  validationPassed: number;

  /** Extractions whose combined spec/confidence validation did not pass. */
  validationFailed: number;

  /** Successful `findOrCreateScrapedTransceiver` calls. */
  dbInserted: number;

  /** Sum of the pair counts reported by the training-data writer. */
  trainingPairsWritten: number;
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// HTML cleaning
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Reduce an HTML document to whitespace-normalised plain text.
 *
 * Rules run in order: `<script>` blocks, `<style>` blocks and HTML comments
 * are removed entirely, every remaining tag becomes a single space, and runs
 * of whitespace collapse to one space. The result is trimmed.
 *
 * @param html Raw (rendered) page markup.
 * @returns The visible text with markup stripped.
 */
function cleanHtml(html: string): string {
  // Order matters: script/style bodies must go before the generic tag rule,
  // otherwise their contents would survive as text.
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/<script[^>]*>[\s\S]*?<\/script>/gi, ""],
    [/<style[^>]*>[\s\S]*?<\/style>/gi, ""],
    [/<!--[\s\S]*?-->/g, ""],
    [/<[^>]+>/g, " "],
    [/\s+/g, " "],
  ];

  let text = html;
  for (const [pattern, replacement] of rules) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// URL filtering
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Decide whether the spider may enqueue `url` for the given vendor.
 *
 * A URL is followed only when all of the following hold:
 *  1. it parses and uses http/https;
 *  2. its host is the vendor's domain (from `config.website`, minus a leading
 *     `www.`) or a subdomain of it;
 *  3. it matches none of `config.blockPatterns`;
 *  4. if `config.allowPatterns` is non-empty, it matches at least one of them.
 *
 * @param url    Absolute URL discovered on a page.
 * @param config Vendor configuration supplying the domain and URL patterns.
 * @returns `true` when the URL should be crawled.
 */
function shouldFollowUrl(url: string, config: VendorCatalogConfig): boolean {
  let host: string;
  let vendorDomain: string;
  try {
    const parsed = new URL(url);
    // mailto:, javascript:, ftp: etc. cannot be rendered by the browser crawler.
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") return false;
    host = parsed.hostname;
    // Strip only a *leading* "www." so hosts such as "awww.example.com" are untouched.
    vendorDomain = new URL(config.website).hostname.replace(/^www\./, "");
  } catch {
    return false;
  }

  // Exact host or true subdomain. A substring test would also accept look-alikes
  // such as "evilcisco.com" or "cisco.com.evil.net".
  if (!vendorDomain) return false;
  if (host !== vendorDomain && !host.endsWith(`.${vendorDomain}`)) return false;

  // Patterns carrying the g/y flag keep `lastIndex` between calls; reset it so
  // the same URL always yields the same answer.
  const matches = (re: RegExp): boolean => {
    re.lastIndex = 0;
    return re.test(url);
  };

  // Block patterns
  if (config.blockPatterns?.some((re) => matches(re))) return false;

  // Allow patterns (if defined, URL must match at least one)
  if (config.allowPatterns && config.allowPatterns.length > 0) {
    return config.allowPatterns.some((re) => matches(re));
  }

  return true;
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Main crawl function
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/** Short, log-friendly message for any thrown value (not only `Error` instances). */
function describeError(err: unknown, maxLen = 80): string {
  const message = err instanceof Error ? err.message : String(err);
  return message.slice(0, maxLen);
}

/**
 * Crawl one vendor's catalog, extract transceiver specs from product pages,
 * persist them, and emit training data.
 *
 * Per page: wait `delayMs`, take the rendered HTML, run `scrapeWithLLM`, and —
 * for product pages whose extraction passed validation — run the spec
 * plausibility check, store the part (unless `dryRun`, and only when a part
 * number exists and adjusted confidence is at least 0.5), and write an
 * extraction record. Same-domain links passing `shouldFollowUrl` are enqueued
 * until `maxDepth` / `maxPages` is reached.
 *
 * @param config  Vendor entry points, URL filters and crawl limits.
 * @param options `dryRun` skips DB writes; `verbose` enables per-page logging.
 * @returns Counters describing what the crawl visited, extracted and stored.
 * @throws Whatever `ensureVendor`, queue setup or `crawler.run()` throws; the
 *         training-data flush and queue cleanup still run in that case.
 */
export async function discoverVendorCatalog(
  config: VendorCatalogConfig,
  options: { dryRun?: boolean; verbose?: boolean } = {}
): Promise<CrawlStats> {
  const stats: CrawlStats = {
    pagesVisited: 0,
    productPagesFound: 0,
    extractionsSucceeded: 0,
    extractionsFailed: 0,
    validationPassed: 0,
    validationFailed: 0,
    dbInserted: 0,
    trainingPairsWritten: 0,
  };

  const maxPages = config.maxPages ?? 200;
  const maxDepth = config.maxDepth ?? 3;
  const delayMs = config.delayMs ?? 1500;
  const log = (...args: unknown[]): void => {
    if (options.verbose) console.log(`[${config.slug}]`, ...args);
  };

  // Ensure vendor exists in DB
  const vendorId = await ensureVendor(config.name, "distributor", config.website, undefined);
  log(`Vendor ID: ${vendorId}`);

  // A uniquely named queue per run; named queues persist, so it is dropped below.
  const requestQueue = await RequestQueue.open(`vendor-${config.slug}-${Date.now()}`);

  try {
    for (const url of config.catalogUrls) {
      await requestQueue.addRequest({ url, userData: { depth: 0 } });
    }

    const crawleeConfig = makeCrawleeConfig(`vendor-discovery-${config.slug}`);
    const seenUrls = new Set<string>();

    const crawler = new PlaywrightCrawler(
      {
        requestQueue,
        maxRequestsPerCrawl: maxPages,
        maxConcurrency: 1, // polite single-thread crawl
        navigationTimeoutSecs: 30,
        requestHandlerTimeoutSecs: 60,

        async requestHandler({ request, page }) {
          if (stats.pagesVisited >= maxPages) return;
          stats.pagesVisited++;
          seenUrls.add(request.url);

          log(`[${stats.pagesVisited}/${maxPages}] ${request.url}`);

          // Polite delay
          await new Promise((r) => setTimeout(r, delayMs));

          // Get rendered HTML; only a short cleaned snippet is kept for training data.
          const html = await page.content();
          const cleanedText = cleanHtml(html).slice(0, 2000);

          // Run LLM extraction (with page type detection)
          let llmResult: Awaited<ReturnType<typeof scrapeWithLLM>> | null = null;
          try {
            llmResult = await scrapeWithLLM(html, request.url, {
              vendorSlug: config.slug,
              skipPageDetection: false,
            });
          } catch (err) {
            // An LLM failure must not abort the crawl; link discovery below still runs.
            stats.extractionsFailed++;
            log(`LLM error: ${describeError(err)}`);
          }

          // Process product pages
          if (llmResult?.extraction.is_product_page) {
            stats.productPagesFound++;

            const ext = llmResult.extraction;
            if (llmResult.validation_passed) {
              stats.extractionsSucceeded++;

              // Build spec for physical validation
              const spec: ExtractedSpec = {
                part_number: ext.part_number ?? undefined,
                form_factor: ext.form_factor ?? undefined,
                speed_gbps: ext.speed_gbps ?? undefined,
                fiber_type: undefined, // not in stock extraction — derive later
              };

              // Spec plausibility check
              const specResult = validateTransceiverSpec(spec);
              const combined = combineValidations(specResult, ext.confidence);

              if (combined.passed) {
                stats.validationPassed++;
              } else {
                stats.validationFailed++;
              }

              // Persist to DB (even if spec validation has warnings — just low tier)
              if (!options.dryRun && ext.part_number && combined.adjusted_confidence >= 0.5) {
                try {
                  await findOrCreateScrapedTransceiver({
                    partNumber: ext.part_number,
                    vendorId,
                    productUrl: request.url,
                    formFactor: ext.form_factor ?? undefined,
                    speedGbps: ext.speed_gbps ?? undefined,
                    speed: ext.speed_gbps ? `${ext.speed_gbps}G` : undefined,
                  });
                  stats.dbInserted++;
                } catch (dbErr) {
                  log(`DB error: ${describeError(dbErr)}`);
                }
              }

              // Write training data
              const crawlExt: CrawlExtraction = {
                url: request.url,
                vendor_slug: config.slug,
                vendor_name: config.name,
                spec,
                validation: combined,
                raw_html_snippet: cleanedText,
                crawled_at: new Date().toISOString(),
              };

              const writeResult = writeExtractionRecord(crawlExt);
              if (writeResult.written) {
                stats.trainingPairsWritten += writeResult.pairs;
              }
            } else {
              stats.extractionsFailed++;
              log(`Extraction failed validation: ${llmResult.validation_errors.join("; ")}`);
            }
          }

          // Discover more URLs while below the depth limit
          const depthValue: unknown = request.userData?.depth;
          const currentDepth = typeof depthValue === "number" ? depthValue : 0;

          if (currentDepth < maxDepth) {
            const links = await page.evaluate(() =>
              Array.from(document.querySelectorAll("a[href]"))
                .map((a) => (a as HTMLAnchorElement).href)
                .filter(Boolean)
            );

            for (const link of links) {
              if (seenUrls.has(link)) continue;
              if (!shouldFollowUrl(link, config)) continue;
              if (stats.pagesVisited >= maxPages) break;

              seenUrls.add(link);
              await requestQueue.addRequest({
                url: link,
                userData: { depth: currentDepth + 1 },
              });
            }
          }
        },

        // Called once a request has exhausted its retries.
        failedRequestHandler({ request, log: crawleeLog }) {
          stats.extractionsFailed++;
          crawleeLog.error(`Failed: ${request.url}`);
        },
      },
      crawleeConfig
    );

    await crawler.run();

    // Write discovery record (successful crawls only)
    writeDiscoveryRecord(config.slug, config.name, config.catalogUrls[0], stats.productPagesFound);
  } finally {
    try {
      // Flush even when the crawl threw, so records already collected are not lost.
      finalFlush(config.slug);
    } finally {
      // Remove the per-run queue so persisted queues do not accumulate across runs.
      await requestQueue.drop().catch((dropErr: unknown) => {
        log(`Queue cleanup error: ${describeError(dropErr)}`);
      });
    }
  }

  console.log(`\n=== ${config.name} Discovery Complete ===`);
  console.log(` Pages visited: ${stats.pagesVisited}`);
  console.log(` Product pages: ${stats.productPagesFound}`);
  console.log(` Extractions OK: ${stats.extractionsSucceeded}`);
  console.log(` Spec valid: ${stats.validationPassed}`);
  console.log(` DB inserts: ${stats.dbInserted}`);
  console.log(` Training pairs: ${stats.trainingPairsWritten}\n`);

  return stats;
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Batch runner — crawl multiple vendors in sequence
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Crawl several vendors one after another.
 *
 * A failure in one vendor's crawl is logged and does not stop the remaining
 * vendors. Requested slugs that are not in `VENDOR_CATALOG_REGISTRY` are
 * reported with a warning and skipped.
 *
 * @param vendorSlugs Slugs to crawl, in registry order; omit to crawl every registered vendor.
 * @param options     Forwarded unchanged to `discoverVendorCatalog`.
 */
export async function runVendorDiscoveryBatch(
  vendorSlugs?: string[],
  options: { dryRun?: boolean; verbose?: boolean } = {}
): Promise<void> {
  let targets = VENDOR_CATALOG_REGISTRY;

  if (vendorSlugs) {
    const requested = new Set(vendorSlugs);
    targets = VENDOR_CATALOG_REGISTRY.filter((v) => requested.has(v.slug));

    // Surface typos instead of silently crawling fewer vendors than asked for.
    const known = new Set(VENDOR_CATALOG_REGISTRY.map((v) => v.slug));
    const unknown = vendorSlugs.filter((slug) => !known.has(slug));
    if (unknown.length > 0) {
      console.warn(`Ignoring unknown vendor slug(s): ${unknown.join(", ")}`);
    }
  }

  console.log(`Starting vendor discovery for ${targets.length} vendor(s)...`);

  for (const config of targets) {
    try {
      await discoverVendorCatalog(config, options);
    } catch (err) {
      // Keep going: one vendor's failure should not block the rest of the batch.
      const message = err instanceof Error ? err.message : String(err);
      console.error(`[${config.slug}] Fatal crawl error:`, message);
    }
  }

  console.log("Vendor discovery batch complete.");
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
// Standalone execution
|
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
// CLI entry point: `tsx vendor-discovery-crawler.ts [vendor-slug] [--dry-run] [--verbose]`.
// With a slug, only that vendor is crawled; without one, the whole registry is.
if (require.main === module) {
  const cliArgs = process.argv.slice(2);
  // The slug is the first non-flag argument, so `--dry-run` / `--verbose` given
  // without a slug are not mistaken for a vendor name.
  const target = cliArgs.find((arg) => !arg.startsWith("--")); // optional: specific vendor slug
  const dryRun = cliArgs.includes("--dry-run");
  const verbose = cliArgs.includes("--verbose");

  /** Runs the requested crawl and resolves to the process exit code. */
  const run = async (): Promise<number> => {
    if (!target) {
      await runVendorDiscoveryBatch(undefined, { dryRun, verbose });
      return 0;
    }

    const config = VENDOR_CATALOG_REGISTRY.find((v) => v.slug === target);
    if (!config) {
      console.error(`Unknown vendor slug: ${target}`);
      console.log("Available:", VENDOR_CATALOG_REGISTRY.map((v) => v.slug).join(", "));
      return 1;
    }

    await discoverVendorCatalog(config, { dryRun, verbose });
    return 0;
  };

  void (async () => {
    let exitCode = 0;

    try {
      exitCode = await run();
    } catch (err) {
      console.error("Fatal:", err);
      exitCode = 1;
    }

    // Close the DB pool exactly once, and wait for it, before any forced exit.
    try {
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      exitCode = 1;
    }

    // Success exits naturally, as before; failures force a non-zero exit.
    if (exitCode !== 0) process.exit(exitCode);
  })();
}
|