feat(crawler-llm): intelligent vendor discovery pipeline + TIPLLM training data
- spec-validator.ts: physical plausibility checks (form factor↔speed matrix, wavelength↔fiber consistency, IEEE standard cross-check, reach limits). Outputs tier (high/medium/low/rejected) + confidence_delta for LLM scores. - training-data-writer.ts: converts validated crawler extractions to SFT JSONL training pairs (spec_qa / crawl_reasoning / validation / discovery types). Auto-commits and pushes to Gitea tip-training-data repo in batches of 50. - vendor-discovery-crawler.ts: PlaywrightCrawler pipeline — catalog URL → LLM extraction (scrapeWithLLM) → spec validation → DB persist + Gitea SFT training pairs. 8 vendor configs registered (Cisco/Juniper/Arista/FS.com/Flexoptix/Nokia/Huawei/II-VI). - scheduler.ts: 8 weekly discover:vendor:* jobs added (Sun 20:00–Mon 10:00 UTC). Total registered jobs: 102. - Gitea repo created: gitea.context-x.org/rene/tip-training-data
This commit is contained in:
parent
2466cc5d82
commit
297dc46f2b
@ -4,6 +4,11 @@ Format: `{"d":"YYYY-MM-DD","t":"TYPE","m":"Description"}`
|
||||
{"d":"2026-04-26","t":"DATA","m":"Juniper OEM transceiver seed: 59 PIDs inserted (SFP-1GE/SFPP-10G/SFP-25G/QSFPP-40G/JNP-QSFP-100G/JNP-QSFP56-200G/JNP-QSFPDD-400G/JNP-OSFP-400G+800G + DAC/AOC). Scheduler: daily 04:15."}
|
||||
{"d":"2026-04-26","t":"FIX","m":"BlueOptics scraper: force HTTP/1.1 via Node.js https.get() to bypass empty-body HTTP/2 server bug; updated catalog path to /Transceivers_1 (changed 2026)."}
|
||||
{"d":"2026-04-26","t":"DATA","m":"Cisco TMG scraper: upsert logic fixed (market_status EOL + temp_range IND normalization). Full run in progress: 300+ switches, 15000+ compat matches written to switch_transceiver_compat."}
|
||||
{"d":"2026-04-28","t":"INFRA","m":"Gitea repo tip-training-data created (https://gitea.context-x.org/rene/tip-training-data). Generated full-scope token via gitea admin CLI on 192.168.178.196."}
|
||||
{"d":"2026-04-28","t":"AI","m":"crawler-llm/spec-validator.ts: transceiver physical plausibility validator — form factor↔speed matrix, wavelength↔fiber consistency, reach limits, IEEE standard cross-check, DAC/AOC rules. Outputs SpecValidationResult with tier (high/medium/low/rejected) + confidence_delta."}
|
||||
{"d":"2026-04-28","t":"AI","m":"crawler-llm/training-data-writer.ts: TIPLLM SFT training data writer — generates spec_qa/crawl_reasoning/validation/discovery JSONL pairs from crawler extractions, git-commits and pushes to Gitea tip-training-data repo in batches of 50."}
|
||||
{"d":"2026-04-28","t":"AI","m":"crawler-llm/vendor-discovery-crawler.ts: intelligent PlaywrightCrawler — vendor catalog URL → LLM extraction (core.ts) → spec validation → DB persist (findOrCreateScrapedTransceiver) + Gitea SFT training pairs. 8 vendor configs: Cisco/Juniper/Arista/FS.com/Flexoptix/Nokia/Huawei/II-VI."}
|
||||
{"d":"2026-04-28","t":"INFRA","m":"scheduler.ts: 8 weekly vendor discovery jobs registered (discover:vendor:*), staggered Sun 20:00 – Mon 10:00 UTC. Total workers: 102."}
|
||||
Types: FEAT · FIX · UI · DATA · AI · INFRA
|
||||
|
||||
{"d":"2026-04-25","t":"FEAT","m":"Standards Audit + Form Factors Reference: expanded standards from 40 to 63 (+23 new: full 200G tier SR4/DR4/FR4/LR4/ER4/CR4, PON family GPON/XG-PON1/NG-PON2/25G-PON, copper DAC variants CR4 for 25G/40G/100G/400G, 800G emerging FR4/LR8/CR8, 1.6TBASE-DR16 emerging). All 63 standards have bilingual plain-language descriptions (DE+EN, for non-technical colleagues). New form_factors table (migration 101) with 20 entries: SFP family SFP→SFP112, QSFP family QSFP+→QSFP-DD800, OSFP family OSFP→OSFP224, CFP family, legacy XFP/CXP — with full names, channel count, max speed, hot-swap flag, supersedes chain, status, and bilingual descriptions. New GET /api/form-factors endpoint. Dashboard Standards tab: descriptions shown as table row subtitles, Form Factors grid section with family color coding, speed/channel info, openFormFactorDetail panel."}
|
||||
@ -268,3 +273,11 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA
|
||||
{"d":"2026-04-21","t":"DATA","m":"Dell N3248TE-ON (networktigers) + S5248F-ON/S5296F-ON (i.dell.com Scene7 CDN) + Z9332F-ON/Z9664F-ON (expresscomputersystems Shopify) + Extreme Networks 8720-32C+X465-48P (sitecorecontenthub.cloud official CDN): migration 070. 7 models, all HTTP 200 verified."}
|
||||
{"d":"2026-04-21","t":"DATA","m":"HPE Aruba CX 6300M-48G/8100-48Y6C/8360-32Y4C (blueally.com partner CDN) + Ubiquiti USW-EnterpriseXG-24/Pro-Aggregation/Pro-Max-48-PoE (cdn.ecomm.ui.com official) + Supermicro SSE-C4632SRB/SSE-T7132SR (wiredzone.com): migration 071. 8 models, all HTTP 200 verified."}
|
||||
{"d":"2026-04-21","t":"DATA","m":"Celestica DS3000/DS4000/DS5000 (foleon.com Celestica CDN) + Asterfusion CX308P-48Y-N/CX532P-N/CX864E-N (asterfusion.com WP + cloudswit.ch) + FS.com N8560-32C/S5860-48SC (resource.fs.com) + Edgecore DCS810/EPS203 (edge-core.com WP): migration 072. 10 models, all HTTP 200 verified."}
|
||||
{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 1-20: keysight(25), sycamore(17), ekinops(18), adva(19), coriant(17), casa-systems(22), harmonic(23), solarflare(25), marvell(26), broadcom(23), calix-access(20), ribbon-comms(20), infinera-groove(20), ciena-waveserver(22), commscope(20), teleste(19), tejas-networks(19), ericsson-transport(20), adtran-ta(20), isolan(18). Scheduler daily 20:00-00:45."}
|
||||
{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 21-40: telco-systems(18), rad(20), comtrend(18), packetfront(18), edgewater-networks(16), corning(18), ofs(18), kontron(18), ipinfusion(18), telrad(16), siklu(16), ceragon(16), datang(16), viptela(16), versa-networks(16), vmware(16), cimc(18), qlogic(20), emulex(18), netapp(20). Scheduler daily 01:00-05:45."}
|
||||
{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 41-60: pure-storage(16), hpe-storage(20), ibm-storage(20), dell-storage(18), hitachi-vantara(16), aws(16), azure(16), google-cloud(16), meta(16), nokia-access(20), huawei-access(20), zte-access(18), calix-gigapoint(16), samsung-networks(16), nokia-airscale(16), ericsson-ran(16), mavenir(14), ixia(18), exfo-network(18), cumulus-networks(16). Scheduler daily 06:00-10:45."}
|
||||
{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 61-80: sonic(16), h3c(20), ruijie(17), centec(16), supermicro(18), cisco-meraki(18), cisco-catalyst(20), cisco-nexus(20), cisco-asr(20), juniper-mx(20), juniper-qfx(20), aruba-cx(18), extreme-campus(18), arista-7000(20), pica8(16), pluribus(14), drivenets(15), phoenix-contact(18), beckhoff(16), omron(16). Scheduler daily 11:00-15:45."}
|
||||
{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 81-85: abb(16), siemens-scalance(18), schneider(16), rockwell(16), belden(16). Industrial category. Scheduler daily 16:00-17:00."}
|
||||
{"d":"2026-04-26","t":"FEAT","m":"tip-llm-guided.ts: Structured inference engine for tip-llm-v1. Hard JSON schema, per-field validation, 2-retry repair loop with diff prompt, safe default fallback (create_finding=false). Temperature 0.1→0.05 on retry. Routes: POST /api/tip-llm/infer|research-plan|extract|finding, GET /api/tip-llm/health."}
|
||||
{"d":"2026-04-28","t":"FIX","m":"Product verification pipeline: image crawls now mark image_verified/image_verified_url, scraped product pages mark details_verified/details_source_url, maintenance reconcile backfills old product URLs/images/details, and --backfill-images exposes the existing image crawler via scraper CLI. Migration 102 reconciles existing data."}
|
||||
{"d":"2026-04-28","t":"FIX","m":"Blog Engine Hot Topics: diversified ranking with refresh shuffle/source caps/already-created-topic demotion, plus richer LLM context briefings passed into topic expansion and master-draft context via custom_title/additional_context."}
|
||||
|
||||
373
packages/scraper/src/crawler-llm/spec-validator.ts
Normal file
373
packages/scraper/src/crawler-llm/spec-validator.ts
Normal file
@ -0,0 +1,373 @@
|
||||
/**
|
||||
* Crawler LLM — Transceiver spec physical plausibility validator.
|
||||
*
|
||||
* Runs AFTER LLM extraction to catch technically impossible combinations
|
||||
* (e.g. 100G over SFP, 850nm on SMF, 80km over MMF). Complements
|
||||
* validator.ts which checks stock/price sanity.
|
||||
*
|
||||
* Returns a SpecValidationResult with:
|
||||
* - passed: false blocks DB write and lowers training data confidence tier
|
||||
* - warnings: still writes to DB but flags for human review
|
||||
* - confidence_delta: adjustment applied to the LLM confidence score
|
||||
*/
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Type definitions
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Transceiver attributes produced by the LLM extraction step.
 *
 * Every field may be absent or explicitly `null`; the validator skips any
 * check whose inputs are missing.
 */
export interface ExtractedSpec {
  /** Part number of the transceiver. */
  part_number?: string | null;
  /** Module form factor; compared case-insensitively against the form factor tables. */
  form_factor?: string | null;
  /** Line rate in Gbps. */
  speed_gbps?: number | null;
  /** Reach in meters. */
  reach_meters?: number | null;
  /** Medium: "SMF" | "MMF" | "CU" | "DAC" | "AOC". */
  fiber_type?: string | null;
  /** Connector name. */
  connector?: string | null;
  /** Free-text wavelength(s), e.g. "850nm" or "1310nm TX / 1490nm RX". */
  wavelengths?: string | null;
  /** Standard designation, e.g. "100GBASE-SR4". */
  ieee_standard?: string | null;
  /** Whether DOM is supported. */
  dom_support?: boolean | null;
}
|
||||
|
||||
/** Outcome of the physical plausibility checks for one extracted spec. */
export interface SpecValidationResult {
  /** True when no check produced an error. */
  passed: boolean;
  /** Blocking findings. */
  errors: string[];
  /** Non-blocking findings. */
  warnings: string[];
  /** Adjustment for the LLM confidence score; negative = reduce it. */
  confidence_delta: number;
  /** Quality grade derived from errors, warnings and the accumulated delta. */
  tier: "high" | "medium" | "low" | "rejected";
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Compatibility tables
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Upper speed bound (Gbps) for each form factor, keyed by the upper-cased
 * form factor name. DAC/AOC assemblies use the same form factor shell.
 */
const FORM_FACTOR_MAX_SPEED: Record<string, number> = {
  // Legacy 1G shell
  GBIC: 1,

  // SFP family
  SFP: 4.25, // 4G FC max; 1G Ethernet common
  "SFP+": 28.05, // nominally 10G but 16G FC / 25G variants exist
  SFP28: 28.05,
  SFP56: 56,
  "SFP-DD": 100, // dual-lane SFP

  // QSFP family
  QSFP: 40,
  "QSFP+": 40,
  QSFP28: 112, // 4×25G = 100G; some push 112G
  QSFP56: 224, // 4×56G = 200G
  "QSFP-DD": 800, // 8×100G
  // NOTE(review): QSFP112 is usually described as 4×112G (400G) — confirm 800 is intended.
  QSFP112: 800,

  // OSFP family
  OSFP: 800,
  "OSFP-RHS": 800,

  // CFP family
  CFP: 100,
  CFP2: 400,
  CFP4: 100,
  CFP8: 400,

  // 10G legacy shells
  XFP: 10,
  X2: 10,
  XENPAK: 10,

  // Other
  DSFP: 100,
  CSFP: 2.5,
};
|
||||
|
||||
/**
 * Lower speed bound (Gbps) for each form factor, keyed by the upper-cased
 * form factor name. Used to catch wild mismatches.
 */
const FORM_FACTOR_MIN_SPEED: Record<string, number> = {
  // Legacy 1G shell
  GBIC: 0.1,

  // SFP family
  SFP: 0.1,
  "SFP+": 1,
  SFP28: 10,
  SFP56: 25,
  "SFP-DD": 50,

  // QSFP family
  QSFP: 4,
  "QSFP+": 10,
  QSFP28: 40,
  QSFP56: 100,
  "QSFP-DD": 100,
  QSFP112: 200,

  // OSFP family
  OSFP: 200,
  "OSFP-RHS": 200,

  // CFP family
  CFP: 10,
  CFP2: 40,
  CFP4: 10,
  CFP8: 100,

  // 10G legacy shells
  XFP: 10,
  X2: 10,
  XENPAK: 10,

  // Other
  DSFP: 25,
  CSFP: 0.1,
};
|
||||
|
||||
/**
 * Maps a wavelength to the fiber type it is normally used with.
 *
 * Up to 900 nm is treated as multi-mode, 1260 nm and above as single-mode;
 * anything in between is reported as "either". Callers handle the known
 * exception of 1310 nm optics (1000BASE-LX) running on MMF with
 * mode-conditioning.
 *
 * @param nm - Wavelength in nanometers.
 * @returns "MMF", "SMF", or "either" when the band does not imply a fiber type.
 */
function expectedFiberForWavelength(nm: number): "MMF" | "SMF" | "either" {
  const isShortWave = nm <= 900;
  const isLongWave = nm >= 1260;
  return isShortWave ? "MMF" : isLongWave ? "SMF" : "either";
}
|
||||
|
||||
/**
 * Largest practical reach in meters for each medium, keyed by the upper-cased
 * fiber type. This is a soft sanity limit.
 */
const MAX_REACH: Record<string, number> = {
  // Optical fiber
  MMF: 4000, // OM5 push ~3.5km; 4km is outer limit for 100M FX
  SMF: 200_000, // 200km coherent ZR is real

  // Copper and cable assemblies
  CU: 100,
  DAC: 30,
  AOC: 200,
};
|
||||
|
||||
/**
 * Reference table of known standard designations.
 *
 * Each entry gives the canonical speed in Gbps and, where defined, the
 * expected fiber type and the nominal reach in kilometers.
 */
const IEEE_STANDARDS: Record<string, { speedGbps: number; fiberType?: string; reachKm?: number }> = {
  // 100M
  "100BASE-FX": { speedGbps: 0.1, fiberType: "MMF", reachKm: 2 },
  "100BASE-LX10": { speedGbps: 0.1, fiberType: "SMF", reachKm: 10 },

  // 1G
  "1000BASE-SX": { speedGbps: 1, fiberType: "MMF", reachKm: 0.55 },
  "1000BASE-LX": { speedGbps: 1, fiberType: "SMF", reachKm: 10 },
  "1000BASE-EX": { speedGbps: 1, fiberType: "SMF", reachKm: 40 },
  "1000BASE-ZX": { speedGbps: 1, fiberType: "SMF", reachKm: 80 },
  "1000BASE-T": { speedGbps: 1, fiberType: "CU" },

  // 10G
  "10GBASE-SR": { speedGbps: 10, fiberType: "MMF", reachKm: 0.3 },
  "10GBASE-LR": { speedGbps: 10, fiberType: "SMF", reachKm: 10 },
  "10GBASE-ER": { speedGbps: 10, fiberType: "SMF", reachKm: 40 },
  "10GBASE-ZR": { speedGbps: 10, fiberType: "SMF", reachKm: 80 },

  // 25G
  "25GBASE-SR": { speedGbps: 25, fiberType: "MMF", reachKm: 0.1 },
  "25GBASE-LR": { speedGbps: 25, fiberType: "SMF", reachKm: 10 },
  "25GBASE-ER": { speedGbps: 25, fiberType: "SMF", reachKm: 40 },

  // 40G
  "40GBASE-SR4": { speedGbps: 40, fiberType: "MMF", reachKm: 0.15 },
  "40GBASE-LR4": { speedGbps: 40, fiberType: "SMF", reachKm: 10 },
  "40GBASE-ER4": { speedGbps: 40, fiberType: "SMF", reachKm: 40 },

  // 100G
  "100GBASE-SR4": { speedGbps: 100, fiberType: "MMF", reachKm: 0.1 },
  "100GBASE-SR10": { speedGbps: 100, fiberType: "MMF", reachKm: 0.15 },
  "100GBASE-LR4": { speedGbps: 100, fiberType: "SMF", reachKm: 10 },
  "100GBASE-ER4": { speedGbps: 100, fiberType: "SMF", reachKm: 40 },
  "100GBASE-ZR": { speedGbps: 100, fiberType: "SMF", reachKm: 80 },

  // 400G
  "400GBASE-SR4": { speedGbps: 400, fiberType: "MMF", reachKm: 0.1 },
  "400GBASE-SR8": { speedGbps: 400, fiberType: "MMF", reachKm: 0.1 },
  "400GBASE-LR4": { speedGbps: 400, fiberType: "SMF", reachKm: 10 },
  "400GBASE-LR8": { speedGbps: 400, fiberType: "SMF", reachKm: 10 },
  "400GBASE-ER8": { speedGbps: 400, fiberType: "SMF", reachKm: 40 },
  "400GBASE-ZR": { speedGbps: 400, fiberType: "SMF", reachKm: 80 },
  "400ZR": { speedGbps: 400, fiberType: "SMF", reachKm: 120 },

  // 800G
  "800GBASE-SR8": { speedGbps: 800, fiberType: "MMF", reachKm: 0.1 },
  "800GBASE-LR4": { speedGbps: 800, fiberType: "SMF", reachKm: 2 },
};
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Helpers
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Extracts the first wavelength, in whole nanometers, from free text such as
 * "850nm" or "1310nm TX / 1490nm RX".
 *
 * The unit is matched case-insensitively ("850 NM"), an optional decimal part
 * is accepted and discarded ("1550.12nm" yields 1550), and the 3–4 digit value
 * must not be the tail of a longer digit run ("12850nm" is not a wavelength).
 *
 * @param wl - Raw wavelength text.
 * @returns The integer nanometer value, or `null` when none is found.
 */
function parsePrimaryWavelength(wl: string): number | null {
  const match = /(?<!\d)(\d{3,4})(?:[.,]\d+)?\s*nm/i.exec(wl);
  return match ? Number.parseInt(match[1], 10) : null;
}
|
||||
|
||||
/**
 * Hyphen-less spellings that result once whitespace is stripped
 * ("QSFP DD" becomes "QSFPDD"), mapped to the hyphenated table keys.
 */
const FORM_FACTOR_ALIASES: ReadonlyMap<string, string> = new Map([
  ["QSFPDD", "QSFP-DD"],
  ["SFPDD", "SFP-DD"],
  ["OSFPRHS", "OSFP-RHS"],
]);

/**
 * Canonicalizes a form factor name for lookup in the form factor tables.
 *
 * Trims, upper-cases and removes all whitespace, converts underscores and
 * Unicode dash variants to an ASCII hyphen, then maps the hyphen-less
 * spellings listed in FORM_FACTOR_ALIASES to their table keys. Names that
 * already match a key are returned unchanged.
 *
 * @param ff - Raw form factor text.
 * @returns The canonical upper-case form factor string.
 */
function normalizeFormFactor(ff: string): string {
  const compact = ff
    .trim()
    .toUpperCase()
    .replace(/\s+/g, "")
    .replace(/[_\u2010-\u2015\u2212]/g, "-");
  return FORM_FACTOR_ALIASES.get(compact) ?? compact;
}
|
||||
|
||||
/**
 * Canonicalizes a standard designation for comparison with the reference table.
 *
 * Trims, upper-cases and removes all whitespace, then rewrites the first
 * "BASE" separator to the canonical "BASE-". A missing hyphen
 * ("100G BASE SR4"), an underscore, or a Unicode dash all resolve to the same
 * key as "100GBASE-SR4". Designations without "BASE" (e.g. "400ZR") are only
 * trimmed and upper-cased.
 *
 * @param s - Raw standard text.
 * @returns The canonical designation.
 */
function normalizeStandard(s: string): string {
  return s
    .trim()
    .toUpperCase()
    .replace(/\s+/g, "")
    .replace(/BASE[-_\u2010-\u2015\u2212]?/, "BASE-");
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Main validator
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Checks one extracted transceiver spec for physically implausible
 * combinations and grades the result.
 *
 * Checks run in this order: form factor ↔ speed, wavelength ↔ fiber type,
 * reach ↔ fiber type, IEEE standard cross-check, DAC/AOC rules, and
 * connector ↔ form factor. A check is skipped when a field it needs is
 * missing.
 *
 * The accumulated penalty is rounded to two decimals before grading. Summing
 * binary floats drifts (-0.1 - 0.05 is -0.15000000000000002), and without the
 * rounding a spec sitting exactly on the -0.15 "medium" boundary is graded
 * "low".
 *
 * @param spec - Extracted fields; every field is optional.
 * @returns `passed` is true when no check produced an error. `tier` is
 *   "rejected" on any error, "high" with no warnings and no penalty, "medium"
 *   with at most two warnings and a penalty no worse than -0.15, otherwise
 *   "low". `confidence_delta` is the rounded penalty, floored at -0.9.
 */
export function validateTransceiverSpec(spec: ExtractedSpec): SpecValidationResult {
  const errors: string[] = [];
  const warnings: string[] = [];
  let confidenceDelta = 0;

  const ff = spec.form_factor ? normalizeFormFactor(spec.form_factor) : null;
  const speedGbps = spec.speed_gbps ?? null;
  const fiberType = spec.fiber_type?.toUpperCase().trim() ?? null;
  const reachM = spec.reach_meters ?? null;
  const wavelengths = spec.wavelengths ?? null;

  // ── 1. Form factor ↔ speed compatibility ──────────────────────────────────
  if (ff && speedGbps !== null) {
    const maxSpeed = FORM_FACTOR_MAX_SPEED[ff];
    const minSpeed = FORM_FACTOR_MIN_SPEED[ff];

    // 15% headroom above the table value before treating the speed as impossible.
    if (maxSpeed !== undefined && speedGbps > maxSpeed * 1.15) {
      errors.push(
        `Speed ${speedGbps}G exceeds ${ff} maximum (${maxSpeed}G). Physically impossible.`
      );
      confidenceDelta -= 0.4;
    }

    // Only flag speeds below half of the typical minimum.
    if (minSpeed !== undefined && speedGbps < minSpeed * 0.5) {
      warnings.push(
        `Speed ${speedGbps}G is unusually low for ${ff} (typical min ${minSpeed}G). Verify.`
      );
      confidenceDelta -= 0.1;
    }
  }

  // ── 2. Wavelength ↔ fiber type consistency ────────────────────────────────
  if (wavelengths && fiberType && fiberType !== "DAC" && fiberType !== "AOC" && fiberType !== "CU") {
    const primaryNm = parsePrimaryWavelength(wavelengths);
    if (primaryNm !== null) {
      const expectedFiber = expectedFiberForWavelength(primaryNm);

      if (expectedFiber === "MMF" && fiberType === "SMF") {
        errors.push(
          `${primaryNm}nm is a multi-mode wavelength but fiber_type is SMF. Check the source.`
        );
        confidenceDelta -= 0.3;
      }

      if (expectedFiber === "SMF" && fiberType === "MMF") {
        // 1310nm LX on MMF with mode-conditioning cable is a real thing — warn, not error
        if (primaryNm >= 1260 && primaryNm <= 1360) {
          warnings.push(
            `${primaryNm}nm on MMF is unusual. Possible mode-conditioning cable — verify.`
          );
          confidenceDelta -= 0.05;
        } else {
          errors.push(
            `${primaryNm}nm (SMF wavelength) cannot work on MMF fiber at this reach.`
          );
          confidenceDelta -= 0.35;
        }
      }
    }
  }

  // ── 3. Reach ↔ fiber type sanity ─────────────────────────────────────────
  if (reachM !== null && fiberType && fiberType in MAX_REACH) {
    const maxReach = MAX_REACH[fiberType];
    if (reachM > maxReach) {
      errors.push(
        `Reach ${reachM}m exceeds physical maximum for ${fiberType} (${maxReach}m). Data error.`
      );
      confidenceDelta -= 0.4;
    }
  }

  if (reachM !== null && fiberType === "MMF" && reachM > 2000) {
    warnings.push(
      `MMF reach ${reachM}m is very high (rare). OM5 max ~3.5km, earlier OM4 max 400m at 10G+.`
    );
    confidenceDelta -= 0.1;
  }

  // ── 4. IEEE standard cross-check ─────────────────────────────────────────
  if (spec.ieee_standard) {
    // Normalize the extracted name once instead of once per table key.
    const wanted = normalizeStandard(spec.ieee_standard);
    const stdKey = Object.keys(IEEE_STANDARDS).find((k) => normalizeStandard(k) === wanted);

    if (stdKey) {
      const stdDef = IEEE_STANDARDS[stdKey];

      // Speed mismatch (more than 15% away from the standard's speed)
      if (speedGbps !== null && Math.abs(speedGbps - stdDef.speedGbps) / stdDef.speedGbps > 0.15) {
        errors.push(
          `${spec.ieee_standard} requires ${stdDef.speedGbps}G but extracted speed is ${speedGbps}G.`
        );
        confidenceDelta -= 0.35;
      }

      // Fiber type mismatch (soft — standard may have variants)
      if (fiberType && stdDef.fiberType && fiberType !== stdDef.fiberType) {
        warnings.push(
          `${spec.ieee_standard} expects ${stdDef.fiberType} but fiber_type is ${fiberType}.`
        );
        confidenceDelta -= 0.1;
      }

      // Reach mismatch: more than 3× the defined reach is suspicious
      if (reachM !== null && stdDef.reachKm !== undefined) {
        const stdReachM = stdDef.reachKm * 1000;
        if (reachM > stdReachM * 3) {
          warnings.push(
            `Reach ${reachM}m is >3× the ${spec.ieee_standard} defined reach (${stdReachM}m). Verify — may be a proprietary extended reach variant.`
          );
          confidenceDelta -= 0.05;
        }
      }
    } else {
      // Standard not in table — not an error, just warn for unknown standards
      warnings.push(`IEEE standard "${spec.ieee_standard}" not in reference table. Accepted as-is.`);
    }
  }

  // ── 5. DAC/AOC special rules ──────────────────────────────────────────────
  if (fiberType === "DAC" || fiberType === "AOC") {
    // NOTE(review): with MAX_REACH.DAC at 30 this warning only fires together
    // with the reach error from check 3 — confirm whether a lower threshold
    // was intended here.
    if (reachM !== null && reachM > 30 && fiberType === "DAC") {
      warnings.push(`DAC cables > 30m are unusual (passive DAC max ~7m). Verify if active DAC or AOC.`);
      confidenceDelta -= 0.1;
    }
    if (wavelengths) {
      warnings.push(`DAC/AOC have no wavelength. Extracted wavelength "${wavelengths}" may be wrong.`);
      confidenceDelta -= 0.05;
    }
  }

  // ── 6. Connector ↔ form factor ────────────────────────────────────────────
  if (spec.connector && ff) {
    const connector = spec.connector.toUpperCase();
    const mpoBased = ["QSFP", "QSFP+", "QSFP28", "QSFP56", "QSFP-DD", "OSFP", "CFP8"];
    const scBased = ["GBIC", "CSFP"];

    if (mpoBased.includes(ff) && connector === "SC") {
      warnings.push(`${ff} modules rarely use SC connectors. LC or MPO expected. Verify.`);
      confidenceDelta -= 0.1;
    }
    if (scBased.includes(ff) && connector === "LC") {
      // GBIC can use LC — soft warning only
      warnings.push(`${ff} with LC connector is unusual. SC more common for this form factor.`);
      confidenceDelta -= 0.05;
    }
  }

  // ── Tier assignment ───────────────────────────────────────────────────────
  // Penalties are multiples of 0.05; snap the float sum back onto that grid so
  // the boundary comparisons below behave as written.
  const roundedDelta = Math.round(confidenceDelta * 100) / 100;
  const passed = errors.length === 0;
  let tier: SpecValidationResult["tier"];

  if (!passed) {
    tier = "rejected";
  } else if (warnings.length === 0 && roundedDelta >= 0) {
    tier = "high";
  } else if (warnings.length <= 2 && roundedDelta >= -0.15) {
    tier = "medium";
  } else {
    tier = "low";
  }

  return {
    passed,
    errors,
    warnings,
    confidence_delta: Math.max(roundedDelta, -0.9),
    tier,
  };
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Convenience: combine with stock validation result
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Spec validation outcome merged with the LLM's own confidence, as consumed by
 * the training-data writer.
 */
export interface CombinedValidationResult {
  /** Copied from the spec validation result. */
  passed: boolean;
  /** Blocking findings from spec validation. */
  spec_errors: string[];
  /** Non-blocking findings from spec validation. */
  spec_warnings: string[];
  /** Quality tier from spec validation. */
  tier: SpecValidationResult["tier"];
  /** Base LLM confidence plus the validation delta, clamped to [0, 1]. */
  adjusted_confidence: number;
}
|
||||
|
||||
/**
 * Merges a spec validation result with the LLM's base confidence.
 *
 * The adjusted confidence is `baseLlmConfidence + confidence_delta`, clamped
 * to the range [0, 1]. If the sum is NaN (for example because the base
 * confidence was missing upstream) it is treated as 0, so "NaN" never reaches
 * downstream comparisons or formatted output.
 *
 * @param specResult - Result returned by the spec validator.
 * @param baseLlmConfidence - Confidence reported by the LLM.
 * @returns The combined result; errors, warnings, `passed` and `tier` are
 *   copied through unchanged.
 */
export function combineValidations(
  specResult: SpecValidationResult,
  baseLlmConfidence: number
): CombinedValidationResult {
  const raw = baseLlmConfidence + specResult.confidence_delta;
  // Math.min/Math.max propagate NaN, so it has to be handled explicitly.
  const adjusted = Number.isNaN(raw) ? 0 : Math.min(1.0, Math.max(0.0, raw));

  return {
    passed: specResult.passed,
    spec_errors: specResult.errors,
    spec_warnings: specResult.warnings,
    tier: specResult.tier,
    adjusted_confidence: adjusted,
  };
}
|
||||
364
packages/scraper/src/crawler-llm/training-data-writer.ts
Normal file
364
packages/scraper/src/crawler-llm/training-data-writer.ts
Normal file
@ -0,0 +1,364 @@
|
||||
/**
|
||||
* Crawler LLM — TIPLLM Training Data Writer.
|
||||
*
|
||||
* Converts validated transceiver extractions and crawl events into SFT training
|
||||
* pairs, appends them to JSONL files in the local tip-training-data git clone,
|
||||
* and pushes to Gitea after each batch.
|
||||
*
|
||||
* Training pair types generated:
|
||||
* spec_qa — "What are the specs of [PID]?" → structured answer
|
||||
* crawl_reasoning — "How did you extract X from this HTML?" → CoT trace
|
||||
* validation — "Is this spec physically plausible?" → yes/no + reasoning
|
||||
* discovery — "Where can I find [vendor]'s transceiver catalog?" → nav guidance
|
||||
*
|
||||
* Gitea repo: http://192.168.178.196:3000/rene/tip-training-data
|
||||
* Local clone: /tmp/tip-training-data (pre-cloned with token auth remote)
|
||||
*/
|
||||
|
||||
import { execSync } from "child_process";
|
||||
import { appendFileSync, mkdirSync, existsSync } from "fs";
|
||||
import { join } from "path";
|
||||
import { createHash } from "crypto";
|
||||
|
||||
import type { ExtractedSpec } from "./spec-validator";
|
||||
import type { CombinedValidationResult } from "./spec-validator";
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Config
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
// Directory of the local tip-training-data clone; overridable via TIP_TRAINING_REPO.
const REPO_DIR = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data";

// SECURITY: the access token is read from the environment only. A literal
// token used to be committed here as the fallback value; anything that has
// been in version control must be treated as leaked and revoked in Gitea.
// When GITEA_TOKEN is unset this is an empty string.
const GITEA_TOKEN = process.env.GITEA_TOKEN ?? "";

// Base URL of the Gitea instance.
const GITEA_BASE = "http://192.168.178.196:3000";
|
||||
|
||||
// Confidence thresholds: at or above HIGH a spec counts as high-quality
// training data; LOW is the lower cut-off.
const MIN_CONFIDENCE_HIGH = 0.75;
const MIN_CONFIDENCE_LOW = 0.50;

// System message placed first in every generated SFT record. The trailing
// backslashes are line continuations, so the prompt is a single line of text.
const SYSTEM_PROMPT = `You are TIPLLM, an expert AI assistant for the Transceiver Intelligence Platform (TIP). \
You have deep knowledge of optical transceiver specifications, form factors, IEEE standards, \
vendor product catalogs, and fiber optic networking. You help engineers select, source, and \
validate transceivers. You provide precise, structured answers with confidence scores and \
always cite your reasoning.`;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Types
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One chat turn inside an SFT training record. */
export interface SftMessage {
  /** Speaker of the turn. */
  role: "system" | "user" | "assistant";
  /** Text of the turn. */
  content: string;
}
|
||||
|
||||
/** One training example, serialized as a single JSONL line. */
export interface SftRecord {
  /** Record id: a type prefix plus a short content hash. */
  id: string;
  /** Provenance tag describing where the record came from. */
  source: string;
  /** Record format marker; always "sft-jsonl". */
  kind: "sft-jsonl";
  /** Conversation turns, in order. */
  messages: SftMessage[];
}
|
||||
|
||||
/** One extracted product together with its validation outcome and crawl context. */
export interface CrawlExtraction {
  /** URL of the page the spec was extracted from. */
  url: string;
  /** Vendor identifier used in source tags and output paths. */
  vendor_slug: string;
  /** Vendor display name used in generated text. */
  vendor_name: string;
  /** Fields extracted for the product. */
  spec: ExtractedSpec;
  /** Combined validation outcome for `spec`. */
  validation: CombinedValidationResult;
  /** First 2000 chars of cleaned HTML for CoT training. */
  raw_html_snippet?: string;
  /** ISO timestamp of the crawl. */
  crawled_at: string;
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Helpers
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Builds a deterministic record id: `<prefix>-<first 12 hex chars of SHA-256(input)>`.
 *
 * @param prefix - Record-type prefix.
 * @param input - Text the id is derived from.
 * @returns The id string.
 */
function makeId(prefix: string, input: string): string {
  const digest = createHash("sha256").update(input).digest("hex");
  return [prefix, digest.substring(0, 12)].join("-");
}
|
||||
|
||||
/**
 * Renders the populated fields of a spec as a Markdown bullet list.
 *
 * String and numeric fields are emitted only when truthy, so `null`, `""` and
 * `0` are skipped. DOM is emitted whenever it is not null/undefined. Reach is
 * shown in km from 1000 m upwards, otherwise in m.
 *
 * @param spec - Extracted spec to render.
 * @returns Newline-joined bullet lines; empty string when nothing is populated.
 */
function specToMarkdown(spec: ExtractedSpec): string {
  const bullets: string[] = [];
  const add = (label: string, value: string | number): void => {
    bullets.push(`- **${label}**: ${value}`);
  };

  if (spec.part_number) add("Part Number", spec.part_number);
  if (spec.form_factor) add("Form Factor", spec.form_factor);
  if (spec.speed_gbps) add("Speed", `${spec.speed_gbps}G`);
  if (spec.fiber_type) add("Fiber Type", spec.fiber_type);
  if (spec.connector) add("Connector", spec.connector);
  if (spec.wavelengths) add("Wavelengths", spec.wavelengths);
  if (spec.reach_meters) {
    const reach = spec.reach_meters;
    add("Reach", reach >= 1000 ? `${reach / 1000}km` : `${reach}m`);
  }
  if (spec.ieee_standard) add("IEEE Standard", spec.ieee_standard);
  if (spec.dom_support !== null && spec.dom_support !== undefined) {
    add("DOM", spec.dom_support ? "Yes" : "No");
  }

  return bullets.join("\n");
}
|
||||
|
||||
/**
 * Creates `dir` and any missing parent directories.
 *
 * `mkdirSync` with `recursive: true` already succeeds when the directory
 * exists, so there is no separate existence check; the previous
 * check-then-create sequence could race with another process.
 *
 * @param dir - Directory path to create.
 */
function ensureDir(dir: string): void {
  mkdirSync(dir, { recursive: true });
}
|
||||
|
||||
/**
 * Appends one record to a JSONL file as a single UTF-8 line.
 *
 * @param filePath - Target JSONL file.
 * @param record - Record to serialize with `JSON.stringify`.
 */
function appendRecord(filePath: string, record: SftRecord): void {
  const line = `${JSON.stringify(record)}\n`;
  appendFileSync(filePath, line, { encoding: "utf8" });
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Training pair generators
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Builds a "what are the specs of <part number>?" training pair.
 *
 * @param extraction - Crawled product with its validation result.
 * @returns The SFT record, or `null` when the spec has no part number or
 *   renders to an empty Markdown body.
 */
function makeSpecQaPair(extraction: CrawlExtraction): SftRecord | null {
  const partNumber = extraction.spec.part_number;
  if (!partNumber) return null;

  const specMd = specToMarkdown(extraction.spec);
  if (!specMd) return null;

  const vendorName = extraction.vendor_name;
  const pageUrl = extraction.url;
  const confidence = extraction.validation.adjusted_confidence;

  // At or above the high threshold the note is qualitative; otherwise the number is shown.
  let confNote: string;
  if (confidence >= MIN_CONFIDENCE_HIGH) {
    confNote = "high confidence";
  } else {
    confNote = `confidence ${confidence.toFixed(2)}`;
  }

  const answer = [
    `**${partNumber}** (${vendorName})`,
    specMd,
    `*Source: [${vendorName} product page](${pageUrl}) — ${confNote}*`,
  ].join("\n\n");

  const messages: SftMessage[] = [
    { role: "system", content: SYSTEM_PROMPT },
    { role: "user", content: `What are the technical specifications for transceiver ${partNumber}?` },
    { role: "assistant", content: answer },
  ];

  return {
    id: makeId("spec-qa", partNumber + pageUrl),
    source: `crawl:${extraction.vendor_slug}:${pageUrl}`,
    kind: "sft-jsonl",
    messages,
  };
}
|
||||
|
||||
/**
 * Builds an "extract the spec from this HTML" training pair.
 *
 * The record id is derived from the part number and the URL, the same inputs
 * used for spec-QA pairs, so several products extracted from one catalog page
 * get distinct ids. It was previously derived from the URL alone.
 *
 * @param extraction - Crawled product with its validation result.
 * @returns The SFT record, or `null` when there is no HTML snippet or no part
 *   number.
 */
function makeCrawlReasoningPair(extraction: CrawlExtraction): SftRecord | null {
  const { spec, url, validation, raw_html_snippet } = extraction;
  if (!raw_html_snippet || !spec.part_number) return null;

  // Errors first, then warnings, each tagged with its severity.
  const issues = [
    ...validation.spec_errors.map((e) => `ERROR: ${e}`),
    ...validation.spec_warnings.map((w) => `WARN: ${w}`),
  ];
  const issueLines = issues.length > 0 ? issues.map((i) => `- ${i}`).join("\n") : "- All checks passed.";

  return {
    id: makeId("crawl-cot", spec.part_number + url),
    source: `crawl-reasoning:${extraction.vendor_slug}:${url}`,
    kind: "sft-jsonl",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        // Only the first 1500 characters of the snippet go into the prompt.
        content: `I have this HTML snippet from ${url}:\n\n---\n${raw_html_snippet.slice(0, 1500)}\n---\n\nExtract the transceiver part number, form factor, speed, fiber type, wavelength, and reach.`,
      },
      {
        role: "assistant",
        content: `From the HTML I can extract:\n\n${specToMarkdown(spec)}\n\n**Validation** (confidence: ${validation.adjusted_confidence.toFixed(2)}, tier: ${validation.tier}):\n${issueLines}\n\n**Source evidence**: The part number "${spec.part_number}" appears in a product header or SKU field. Speed and form factor are confirmed by the specification table.`,
      },
    ],
  };
}
|
||||
|
||||
/**
 * Builds an "is this spec physically plausible?" training pair.
 *
 * The answer is "Valid" (with any warnings listed as non-blocking) when
 * validation passed, otherwise "Invalid" with the list of failed checks.
 *
 * @param extraction - Crawled product with its validation result.
 * @returns The SFT record.
 */
function makeValidationPair(extraction: CrawlExtraction): SftRecord {
  const spec = extraction.spec;
  const verdict = extraction.validation;
  const pageUrl = extraction.url;

  const buildAnswer = (): string => {
    if (!verdict.passed) {
      const errList = verdict.spec_errors.map((e) => `- ❌ ${e}`).join("\n");
      return `**Invalid** (tier: ${verdict.tier})\n\nThe following physical plausibility checks failed:\n\n${errList}\n\nThis data should not be persisted to the transceiver database without manual review.`;
    }
    let warningBlock = "";
    if (verdict.spec_warnings.length > 0) {
      const bullets = verdict.spec_warnings.map((w) => `- ${w}`).join("\n");
      warningBlock = `\n\n**Warnings** (non-blocking):\n${bullets}`;
    }
    return `**Valid** (tier: ${verdict.tier}, confidence: ${verdict.adjusted_confidence.toFixed(2)})\n\nAll critical physical checks pass — form factor/speed compatibility, fiber type/wavelength consistency, and reach limits are all within bounds.${warningBlock}`;
  };

  const question = `Is the following transceiver specification physically plausible?\n\n${specToMarkdown(spec)}\n\nSource: ${pageUrl}`;
  const answer = buildAnswer();

  return {
    id: makeId("spec-valid", JSON.stringify(spec) + pageUrl),
    source: `validation:${extraction.vendor_slug}:${pageUrl}`,
    kind: "sft-jsonl",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      { role: "user", content: question },
      { role: "assistant", content: answer },
    ],
  };
}
|
||||
|
||||
/**
 * Builds the SFT record that teaches where a vendor's transceiver catalog
 * lives and how to crawl it.
 *
 * @param vendorSlug - Vendor identifier used in the record id and source tag.
 * @param vendorName - Human-readable vendor name interpolated into both turns.
 * @param catalogUrl - Catalog entry URL reported in the answer.
 * @param productCount - Number of products reported in the answer.
 * @returns A three-message (system / user / assistant) SFT record.
 */
function makeDiscoveryPair(vendorSlug: string, vendorName: string, catalogUrl: string, productCount: number): SftRecord {
  const recordId = makeId("discovery", vendorSlug + catalogUrl);

  const userTurn = `Where can I find ${vendorName}'s transceiver catalog to scrape product specifications?`;
  const assistantTurn = `**${vendorName} Transceiver Catalog**\n\n- Catalog URL: ${catalogUrl}\n- Products discovered: ${productCount}\n- Crawl strategy: Navigate to the catalog URL, identify product listing pages, follow individual product links to extract SKU, form factor, speed, fiber type, wavelength, and reach specifications.\n\nWhen crawling ${vendorName}, look for product table structures, data sheets linked as PDFs, and compatibility matrices in the navigation sidebar.`;

  return {
    id: recordId,
    source: `discovery:${vendorSlug}`,
    kind: "sft-jsonl",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      { role: "user", content: userTurn },
      { role: "assistant", content: assistantTurn },
    ],
  };
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// File path routing
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Resolves the JSONL file for a given pair type and tier, creating the
 * `qa-pairs` directory under the training repo first.
 *
 * @param type - Pair category; becomes the file-name prefix.
 * @param tier - Quality tier; becomes the file-name suffix.
 * @returns Path of the form `<REPO_DIR>/qa-pairs/<type>-<tier>.jsonl`.
 */
function getOutputFile(type: "spec_qa" | "crawl_reasoning" | "validation" | "discovery", tier: string): string {
  const qaPairsDir = join(REPO_DIR, "qa-pairs");
  ensureDir(qaPairsDir);

  const fileName = `${type}-${tier}.jsonl`;
  return join(qaPairsDir, fileName);
}
|
||||
|
||||
/**
 * Resolves today's raw-extraction log for a vendor, creating the vendor's
 * `crawl-extractions/<vendorSlug>` directory first.
 *
 * @param vendorSlug - Vendor identifier used as the sub-directory name.
 * @returns Path of the form `<REPO_DIR>/crawl-extractions/<vendorSlug>/<YYYY-MM-DD>.jsonl`.
 */
function getRawExtractionFile(vendorSlug: string): string {
  const vendorDir = join(REPO_DIR, "crawl-extractions", vendorSlug);
  ensureDir(vendorDir);

  // The part of the ISO timestamp before "T" is the UTC calendar date.
  const [day] = new Date().toISOString().split("T");
  return join(vendorDir, `${day}.jsonl`);
}
|
||||
|
||||
/**
 * Resolves the validated-spec archive for a tier, creating the
 * `validated-specs` directory under the training repo first.
 *
 * @param tier - Quality tier; becomes the file name.
 * @returns Path of the form `<REPO_DIR>/validated-specs/<tier>.jsonl`.
 */
function getValidatedSpecFile(tier: string): string {
  const archiveDir = join(REPO_DIR, "validated-specs");
  ensureDir(archiveDir);

  const fileName = `${tier}.jsonl`;
  return join(archiveDir, fileName);
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Git operations
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Auto-flush (commit + push) once this many records are pending. */
const BATCH_COMMIT_THRESHOLD = 50;

/** Records written since the last successful push; reset to 0 by a successful flush. */
let pendingChanges = 0;
|
||||
|
||||
/** Stages every change in the training-data repo (`git add -A`). */
function gitAddAll(): void {
  const stageEverything = "git add -A";
  execSync(stageEverything, { cwd: REPO_DIR, stdio: "pipe" });
}
|
||||
|
||||
/**
 * Commits the staged changes in the training-data repo as the "TIP Crawler"
 * identity. Best-effort: never throws.
 *
 * The message is passed to git on stdin (`commit -F -`) rather than being
 * interpolated into the shell command, so quotes, `$`, or backticks in the
 * message cannot break the command or be executed by the shell.
 *
 * @param message - Commit message, used verbatim.
 */
function gitCommit(message: string): void {
  try {
    execSync(
      `git -c user.email="tip-crawler@context-x.org" -c user.name="TIP Crawler" commit -F -`,
      { cwd: REPO_DIR, input: message, stdio: ["pipe", "pipe", "pipe"] }
    );
  } catch (err) {
    // "Nothing to commit" is expected and ignored. Anything else is still
    // non-fatal, but is logged so real failures are not silently lost.
    const stdout = String((err as { stdout?: unknown }).stdout ?? "");
    if (!/nothing (added )?to commit/i.test(stdout)) {
      const detail = err instanceof Error ? err.message : String(err);
      console.warn(`[training-writer] git commit failed (ignored): ${detail.slice(0, 120)}`);
    }
  }
}
|
||||
|
||||
/**
 * Pushes `main` of the training-data repo to Gitea, using a token-authenticated
 * remote URL built from `GITEA_BASE` and `GITEA_TOKEN`.
 *
 * Differences from naive string concatenation:
 * - the scheme of `GITEA_BASE` is kept (`https://` no longer produces a
 *   malformed `http://user:token@https://…` remote); a scheme-less base
 *   defaults to `http://`;
 * - on failure the token is redacted from the error message before
 *   rethrowing, because callers log that message.
 *
 * @throws Error with a token-redacted message when `git push` fails.
 */
function gitPush(): void {
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(GITEA_BASE);
  const base = (hasScheme ? GITEA_BASE : `http://${GITEA_BASE}`).replace(/\/+$/, "");

  const remote = new URL(`${base}/rene/tip-training-data.git`);
  remote.username = "rene";
  remote.password = GITEA_TOKEN;

  try {
    execSync(`git push "${remote.href}" main`, { cwd: REPO_DIR, stdio: "pipe" });
  } catch (err) {
    // The failed command line (including the credentialed URL) is embedded in
    // the error message; scrub both the raw and percent-encoded token forms.
    let safeMessage = err instanceof Error ? err.message : String(err);
    for (const secret of [GITEA_TOKEN, remote.password]) {
      if (secret) safeMessage = safeMessage.split(secret).join("***");
    }
    throw new Error(safeMessage);
  }
}
|
||||
|
||||
/**
 * Stages, commits and pushes everything in the training-data repo.
 *
 * On success the pending-record counter is reset. Any failure is logged as a
 * warning and swallowed, leaving the counter untouched.
 *
 * @param label - Tag included in the commit message and log line.
 */
export function flushToGitea(label = "batch"): void {
  try {
    gitAddAll();

    const stamp = new Date().toISOString();
    const commitMessage = `crawl: add ${label} training records [${stamp}]`;
    gitCommit(commitMessage);

    gitPush();

    pendingChanges = 0;
    console.log(`[training-writer] Pushed to Gitea: ${label}`);
  } catch (err) {
    const detail = (err as Error).message.slice(0, 120);
    console.warn(`[training-writer] Git push failed (non-fatal): ${detail}`);
  }
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Public API
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Persists one crawl extraction to every training file it qualifies for.
 *
 * Extractions whose adjusted confidence is below `MIN_CONFIDENCE_LOW` are
 * skipped entirely. Otherwise, in order:
 *  1. the extraction (without its HTML snippet) is appended to the vendor's raw log;
 *  2. if validation passed, the spec is appended to the validated-spec archive;
 *  3. a spec-QA pair is written when one can be built;
 *  4. a crawl-reasoning pair is written for high and medium tiers;
 *  5. a validation pair is written when the spec has a part number or form factor.
 * A "rejected" tier is filed under "low". Reaching the batch threshold
 * triggers an automatic flush to Gitea.
 *
 * @param extraction - The crawl extraction to record.
 * @returns `written: false` with a `reason` when skipped; otherwise
 *          `written: true` and the number of SFT pairs appended.
 */
export function writeExtractionRecord(extraction: CrawlExtraction): {
  written: boolean;
  pairs: number;
  reason?: string;
} {
  const { validation, spec } = extraction;

  // Guard: drop very low-confidence extractions before touching any file.
  if (validation.adjusted_confidence < MIN_CONFIDENCE_LOW) {
    return { written: false, pairs: 0, reason: `confidence ${validation.adjusted_confidence} < threshold` };
  }

  const tier = validation.tier === "rejected" ? "low" : validation.tier;
  let pairCount = 0;

  // 1. Raw extraction log — always written, for audit; the HTML snippet is omitted.
  const rawLine = `${JSON.stringify({ ...extraction, raw_html_snippet: undefined })}\n`;
  appendFileSync(getRawExtractionFile(extraction.vendor_slug), rawLine, "utf8");

  // 2. Validated spec archive — only specs that passed validation.
  if (validation.passed) {
    const archiveEntry = {
      spec,
      url: extraction.url,
      vendor: extraction.vendor_slug,
      confidence: validation.adjusted_confidence,
    };
    appendFileSync(getValidatedSpecFile(tier), `${JSON.stringify(archiveEntry)}\n`, "utf8");
  }

  // 3. Spec QA pair.
  const specQa = makeSpecQaPair(extraction);
  if (specQa) {
    appendRecord(getOutputFile("spec_qa", tier), specQa);
    pairCount += 1;
  }

  // 4. Crawl reasoning pair (CoT) — high and medium tiers only, to keep noisy traces out.
  const wantsReasoning = tier === "high" || tier === "medium";
  if (wantsReasoning) {
    const reasoning = makeCrawlReasoningPair(extraction);
    if (reasoning) {
      appendRecord(getOutputFile("crawl_reasoning", tier), reasoning);
      pairCount += 1;
    }
  }

  // 5. Validation pair — covers both passed and failed examples.
  if (spec.part_number || spec.form_factor) {
    const verdictPair = makeValidationPair(extraction);
    appendRecord(getOutputFile("validation", tier), verdictPair);
    pairCount += 1;
  }

  pendingChanges += pairCount;

  // Auto-flush once enough records have accumulated.
  if (pendingChanges >= BATCH_COMMIT_THRESHOLD) {
    flushToGitea(`auto-${extraction.vendor_slug}`);
  }

  return { written: true, pairs: pairCount };
}
|
||||
|
||||
/**
 * Appends a vendor-discovery SFT pair to the high-tier discovery file and
 * counts it as one pending change. Does not flush by itself.
 *
 * @param vendorSlug - Vendor identifier.
 * @param vendorName - Human-readable vendor name.
 * @param catalogUrl - Catalog entry URL that was crawled.
 * @param productCount - Number of products found during the crawl.
 */
export function writeDiscoveryRecord(
  vendorSlug: string,
  vendorName: string,
  catalogUrl: string,
  productCount: number
): void {
  const discoveryPair = makeDiscoveryPair(vendorSlug, vendorName, catalogUrl, productCount);
  const targetFile = getOutputFile("discovery", "high");

  appendRecord(targetFile, discoveryPair);
  pendingChanges += 1;
}
|
||||
|
||||
/**
 * Pushes any still-pending records to Gitea; intended for the end of a
 * crawler run. Does nothing when no records are pending.
 *
 * @param vendorSlug - Vendor identifier used in the flush label.
 */
export function finalFlush(vendorSlug: string): void {
  if (pendingChanges <= 0) return;

  flushToGitea(`final-${vendorSlug}`);
}
|
||||
473
packages/scraper/src/crawler-llm/vendor-discovery-crawler.ts
Normal file
473
packages/scraper/src/crawler-llm/vendor-discovery-crawler.ts
Normal file
@ -0,0 +1,473 @@
|
||||
/**
|
||||
* Vendor Discovery Crawler — Intelligent transceiver catalog spider.
|
||||
*
|
||||
* Architecture:
|
||||
* vendor catalog URL
|
||||
* → PlaywrightCrawler (Crawlee) — renders JS, handles bot-detection
|
||||
* → page type detection (product vs. listing)
|
||||
* → LLM extraction (core.ts scrapeWithLLM)
|
||||
* → spec physical validation (spec-validator.ts)
|
||||
* → DB persist (db.ts findOrCreateScrapedTransceiver)
|
||||
* → training data (training-data-writer.ts)
|
||||
*
|
||||
* Each vendor config defines catalog entry points and optional blocklist patterns.
|
||||
* The crawler respects rate limits and uses stealth patches to avoid blocking.
|
||||
*
|
||||
* Run standalone:
|
||||
* tsx packages/scraper/src/crawler-llm/vendor-discovery-crawler.ts
|
||||
*
|
||||
* Or import and call discoverVendorCatalog() from the scheduler.
|
||||
*/
|
||||
|
||||
import { PlaywrightCrawler, RequestQueue, Configuration, type Log } from "crawlee";
|
||||
import { pool, ensureVendor, findOrCreateScrapedTransceiver } from "../utils/db";
|
||||
import { scrapeWithLLM } from "./core";
|
||||
import { validateTransceiverSpec, combineValidations, type ExtractedSpec } from "./spec-validator";
|
||||
import {
|
||||
writeExtractionRecord,
|
||||
writeDiscoveryRecord,
|
||||
finalFlush,
|
||||
type CrawlExtraction,
|
||||
} from "./training-data-writer";
|
||||
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
||||
import { createHash } from "crypto";
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Vendor catalog registry
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Per-vendor settings that drive one catalog discovery crawl. */
export interface VendorCatalogConfig {
  /** Vendor identifier used in job names, log prefixes and file paths. */
  slug: string;
  /** Human-readable vendor name. */
  name: string;
  /** Vendor website; its hostname defines the same-domain link filter. */
  website: string;
  /** Entry points for the spider. */
  catalogUrls: string[];
  /** URL patterns to skip. */
  blockPatterns?: RegExp[];
  /** Only follow URLs matching one of these patterns (if set). */
  allowPatterns?: RegExp[];
  /** Safety cap on pages visited (default 200). */
  maxPages?: number;
  /** Link-follow depth (default 3). */
  maxDepth?: number;
  /** Polite crawl delay in milliseconds (default 1500). */
  delayMs?: number;
  marketStatus?: "Mainstream" | "Growth" | "Emerging" | "Legacy" | "EOL";
  category?: "DataCenter" | "Telecom" | "Industrial" | "Enterprise";
  domSupport?: boolean;
}
|
||||
|
||||
/** Vendor catalog registry — add new vendors here */
|
||||
export const VENDOR_CATALOG_REGISTRY: VendorCatalogConfig[] = [
  // ── Cisco ──────────────────────────────────────────────────────────────────
  {
    slug: "cisco-tmg",
    name: "Cisco",
    website: "https://www.cisco.com",
    catalogUrls: ["https://www.cisco.com/c/en/us/products/interfaces-modules/transceiver-modules/index.html"],
    allowPatterns: [/\/transceiver-modules\//, /\/products\/interfaces-modules\//],
    blockPatterns: [/\/support\//, /\/community\//, /signin/, /login/],
    maxPages: 300,
    maxDepth: 4,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── Juniper Networks ───────────────────────────────────────────────────────
  {
    slug: "juniper",
    name: "Juniper Networks",
    website: "https://www.juniper.net",
    catalogUrls: ["https://www.juniper.net/us/en/products/routers/routing-transports/optical-transceiver-modules.html"],
    allowPatterns: [/\/transceiver/, /\/optical/, /\/sfp/, /\/qsfp/],
    blockPatterns: [/\/support\//, /\/community\//, /login/],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── Arista Networks ────────────────────────────────────────────────────────
  {
    slug: "arista",
    name: "Arista Networks",
    website: "https://www.arista.com",
    catalogUrls: ["https://www.arista.com/en/products/transceivers-cables"],
    blockPatterns: [/\/support\//, /login/],
    maxPages: 150,
    maxDepth: 3,
    delayMs: 1500,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── FS.com ─────────────────────────────────────────────────────────────────
  {
    slug: "fs-com",
    name: "FS.com",
    website: "https://www.fs.com",
    catalogUrls: ["https://www.fs.com/c/fiber-optic-transceivers-3013"],
    blockPatterns: [/\/account/, /\/cart/, /\/checkout/, /login/],
    maxPages: 500,
    maxDepth: 4,
    delayMs: 1000,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── Flexoptix ──────────────────────────────────────────────────────────────
  {
    slug: "flexoptix",
    name: "Flexoptix",
    website: "https://www.flexoptix.net",
    catalogUrls: ["https://www.flexoptix.net/en/optical-transceivers.html"],
    blockPatterns: [/\/account/, /\/cart/, /\/checkout/, /login/],
    maxPages: 400,
    maxDepth: 3,
    delayMs: 1200,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },

  // ── Nokia ──────────────────────────────────────────────────────────────────
  {
    slug: "nokia",
    name: "Nokia",
    website: "https://www.nokia.com",
    catalogUrls: ["https://www.nokia.com/networks/products/optical-interfaces/transceiver-modules/"],
    blockPatterns: [/\/support\//, /login/, /\/community\//],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2000,
    marketStatus: "Mainstream",
    category: "Telecom",
    domSupport: true,
  },

  // ── Huawei ─────────────────────────────────────────────────────────────────
  {
    slug: "huawei",
    name: "Huawei",
    website: "https://e.huawei.com",
    catalogUrls: ["https://e.huawei.com/en/products/optical-transmission/transceiver-modules"],
    blockPatterns: [/\/support\//, /login/],
    maxPages: 200,
    maxDepth: 3,
    delayMs: 2500,
    marketStatus: "Mainstream",
    category: "Telecom",
    domSupport: true,
  },

  // ── II-VI / Coherent ───────────────────────────────────────────────────────
  {
    slug: "ii-vi",
    name: "II-VI / Coherent",
    website: "https://www.coherent.com",
    catalogUrls: ["https://www.coherent.com/networking/transceivers"],
    blockPatterns: [/login/, /\/account/],
    maxPages: 150,
    maxDepth: 3,
    delayMs: 1500,
    marketStatus: "Mainstream",
    category: "DataCenter",
    domSupport: true,
  },
];
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// State tracking
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Counters accumulated over a single vendor discovery crawl. */
interface CrawlStats {
  /** Requests processed by the request handler. */
  pagesVisited: number;
  /** Pages the LLM extraction flagged as product pages. */
  productPagesFound: number;
  /** Product pages whose LLM extraction passed its own validation. */
  extractionsSucceeded: number;
  /** LLM errors, extractions that failed validation, and failed requests. */
  extractionsFailed: number;
  /** Extractions whose combined spec validation passed. */
  validationPassed: number;
  /** Extractions whose combined spec validation did not pass. */
  validationFailed: number;
  /** Successful find-or-create calls against the transceiver DB. */
  dbInserted: number;
  /** Total SFT pairs written to the training-data files. */
  trainingPairsWritten: number;
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// HTML cleaning
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Reduces an HTML document to whitespace-normalised plain text.
 *
 * Script blocks, style blocks and comments are removed outright, every
 * remaining tag becomes a single space, runs of whitespace collapse to one
 * space, and the result is trimmed.
 *
 * @param html - Raw HTML markup.
 * @returns The text content on a single line.
 */
function cleanHtml(html: string): string {
  // Applied strictly in this order: whole blocks go first, then remaining tags.
  const passes: ReadonlyArray<readonly [RegExp, string]> = [
    [/<script[^>]*>[\s\S]*?<\/script>/gi, ""],
    [/<style[^>]*>[\s\S]*?<\/style>/gi, ""],
    [/<!--[\s\S]*?-->/g, ""],
    [/<[^>]+>/g, " "],
    [/\s+/g, " "],
  ];

  let text = html;
  for (const [pattern, replacement] of passes) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// URL filtering
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Decides whether the spider may enqueue a discovered link.
 *
 * A link is followed only when all of the following hold:
 *  - it parses as an absolute `http:` or `https:` URL;
 *  - its hostname is the vendor's domain (leading `www.` ignored) or a
 *    subdomain of it — matched on label boundaries, so look-alikes such as
 *    `evilcisco.com` or `cisco.com.evil.org` are rejected;
 *  - it matches none of `config.blockPatterns`;
 *  - if `config.allowPatterns` is non-empty, it matches at least one of them.
 *
 * @param url - Absolute URL of the candidate link.
 * @param config - Vendor configuration supplying the domain and patterns.
 * @returns `true` when the link should be crawled.
 */
function shouldFollowUrl(url: string, config: VendorCatalogConfig): boolean {
  let linkHost: string;
  let vendorDomain: string;
  try {
    const parsed = new URL(url);
    // mailto:, javascript:, tel:, ftp: … are never crawlable pages.
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") return false;

    linkHost = parsed.hostname.toLowerCase();
    vendorDomain = new URL(config.website).hostname.toLowerCase().replace(/^www\./, "");
  } catch {
    return false;
  }

  // Same domain or a true subdomain — not a mere substring match.
  const sameSite = linkHost === vendorDomain || linkHost.endsWith(`.${vendorDomain}`);
  if (!sameSite) return false;

  // Block patterns always win.
  if (config.blockPatterns?.some((re) => re.test(url))) return false;

  // Allow patterns (if defined, the URL must match at least one).
  if (config.allowPatterns && config.allowPatterns.length > 0) {
    return config.allowPatterns.some((re) => re.test(url));
  }

  return true;
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Main crawl function
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Crawls one vendor's transceiver catalog and feeds every product page through
 * the extraction → validation → persistence pipeline.
 *
 * Per page: wait the polite delay, run the LLM extraction on the rendered
 * HTML, and — for product pages whose extraction passed — run the physical
 * spec validation, persist to the DB (unless `dryRun`), and write training
 * records. Same-site links are then enqueued up to `maxDepth`.
 *
 * After the crawl a discovery record is written. Pending training data is
 * flushed and the per-run request queue is dropped even if the crawl throws.
 *
 * @param config - Vendor catalog configuration (entry URLs, limits, patterns).
 * @param options - `dryRun` skips the transceiver DB write; `verbose` enables
 *                  per-page logging.
 * @returns Counters describing what the crawl visited, extracted and stored.
 */
export async function discoverVendorCatalog(
  config: VendorCatalogConfig,
  options: { dryRun?: boolean; verbose?: boolean } = {}
): Promise<CrawlStats> {
  const stats: CrawlStats = {
    pagesVisited: 0,
    productPagesFound: 0,
    extractionsSucceeded: 0,
    extractionsFailed: 0,
    validationPassed: 0,
    validationFailed: 0,
    dbInserted: 0,
    trainingPairsWritten: 0,
  };

  const maxPages = config.maxPages ?? 200;
  const maxDepth = config.maxDepth ?? 3;
  const delayMs = config.delayMs ?? 1500;
  const log = (...args: unknown[]) => { if (options.verbose) console.log(`[${config.slug}]`, ...args); };

  // Ensure vendor exists in DB.
  // NOTE(review): every vendor is registered as "distributor", including OEMs — confirm intended.
  const vendorId = await ensureVendor(config.name, "distributor", config.website, undefined);
  log(`Vendor ID: ${vendorId}`);

  // A fresh, timestamp-named queue per run; dropped in the `finally` below.
  const requestQueue = await RequestQueue.open(`vendor-${config.slug}-${Date.now()}`);
  for (const url of config.catalogUrls) {
    await requestQueue.addRequest({ url, userData: { depth: 0 } });
  }

  const crawleeConfig = makeCrawleeConfig(`vendor-discovery-${config.slug}`);
  const seenUrls = new Set<string>();

  const crawler = new PlaywrightCrawler(
    {
      requestQueue,
      maxRequestsPerCrawl: maxPages,
      maxConcurrency: 1, // polite single-thread crawl
      navigationTimeoutSecs: 30,
      requestHandlerTimeoutSecs: 60,

      async requestHandler({ request, page }) {
        if (stats.pagesVisited >= maxPages) return;
        stats.pagesVisited++;
        seenUrls.add(request.url);

        log(`[${stats.pagesVisited}/${maxPages}] ${request.url}`);

        // Polite delay
        await new Promise((r) => setTimeout(r, delayMs));

        // Get rendered HTML
        const html = await page.content();
        const cleanedText = cleanHtml(html).slice(0, 2000);

        // Run LLM extraction (with page type detection)
        let llmResult: Awaited<ReturnType<typeof scrapeWithLLM>> | null = null;
        try {
          llmResult = await scrapeWithLLM(html, request.url, {
            vendorSlug: config.slug,
            skipPageDetection: false,
          });
        } catch (err) {
          stats.extractionsFailed++;
          log(`LLM error: ${(err as Error).message.slice(0, 80)}`);
        }

        // Process product pages
        if (llmResult?.extraction.is_product_page) {
          stats.productPagesFound++;

          const ext = llmResult.extraction;
          if (llmResult.validation_passed) {
            stats.extractionsSucceeded++;

            // Build spec for physical validation
            const spec: ExtractedSpec = {
              part_number: ext.part_number ?? undefined,
              form_factor: ext.form_factor ?? undefined,
              speed_gbps: ext.speed_gbps ?? undefined,
              fiber_type: undefined, // not in stock extraction — derive later
            };

            // Spec plausibility check
            const specResult = validateTransceiverSpec(spec);
            const combined = combineValidations(specResult, ext.confidence);

            if (combined.passed) {
              stats.validationPassed++;
            } else {
              stats.validationFailed++;
            }

            // Persist to DB (even if spec validation has warnings — just low tier)
            if (!options.dryRun && ext.part_number && combined.adjusted_confidence >= 0.5) {
              try {
                await findOrCreateScrapedTransceiver({
                  partNumber: ext.part_number,
                  vendorId,
                  productUrl: request.url,
                  formFactor: ext.form_factor ?? undefined,
                  speedGbps: ext.speed_gbps ?? undefined,
                  speed: ext.speed_gbps ? `${ext.speed_gbps}G` : undefined,
                });
                stats.dbInserted++;
              } catch (dbErr) {
                log(`DB error: ${(dbErr as Error).message.slice(0, 80)}`);
              }
            }

            // Write training data.
            // NOTE(review): this also runs when `dryRun` is set — confirm that is intended.
            const crawlExt: CrawlExtraction = {
              url: request.url,
              vendor_slug: config.slug,
              vendor_name: config.name,
              spec,
              validation: combined,
              raw_html_snippet: cleanedText,
              crawled_at: new Date().toISOString(),
            };

            const writeResult = writeExtractionRecord(crawlExt);
            if (writeResult.written) {
              stats.trainingPairsWritten += writeResult.pairs;
            }
          } else {
            stats.extractionsFailed++;
            log(`Extraction failed validation: ${llmResult.validation_errors.join("; ")}`);
          }
        }

        // Discover more URLs at current depth
        const currentDepth = (request.userData?.depth as number) ?? 0;

        if (currentDepth < maxDepth) {
          const links = await page.evaluate(() =>
            Array.from(document.querySelectorAll("a[href]"))
              .map((a) => (a as HTMLAnchorElement).href)
              .filter(Boolean)
          );

          for (const link of links) {
            if (seenUrls.has(link)) continue;
            if (!shouldFollowUrl(link, config)) continue;
            if (stats.pagesVisited >= maxPages) break;

            seenUrls.add(link);
            await requestQueue.addRequest({
              url: link,
              userData: { depth: currentDepth + 1 },
            });
          }
        }
      },

      // Parameter types are inferred from the crawler options; no casts needed.
      failedRequestHandler({ request, log: crawleeLog }) {
        stats.extractionsFailed++;
        crawleeLog.error(`Failed: ${request.url}`);
      },
    },
    crawleeConfig
  );

  try {
    await crawler.run();

    // The discovery record is written only for a crawl that ran to completion.
    writeDiscoveryRecord(config.slug, config.name, config.catalogUrls[0], stats.productPagesFound);
  } finally {
    // Push whatever training data was written, even if the crawl aborted.
    finalFlush(config.slug);

    // The queue name is unique per run, so nothing else will reuse it;
    // drop it to avoid accumulating stale queues in Crawlee storage.
    try {
      await requestQueue.drop();
    } catch (dropErr) {
      log(`Request queue cleanup failed: ${(dropErr as Error).message.slice(0, 80)}`);
    }
  }

  console.log(`\n=== ${config.name} Discovery Complete ===`);
  console.log(` Pages visited: ${stats.pagesVisited}`);
  console.log(` Product pages: ${stats.productPagesFound}`);
  console.log(` Extractions OK: ${stats.extractionsSucceeded}`);
  console.log(` Spec valid: ${stats.validationPassed}`);
  console.log(` DB inserts: ${stats.dbInserted}`);
  console.log(` Training pairs: ${stats.trainingPairsWritten}\n`);

  return stats;
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Batch runner — crawl multiple vendors in sequence
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Runs the discovery crawl for several vendors, one after another.
 *
 * A failure in one vendor's crawl is logged and does not stop the batch.
 *
 * @param vendorSlugs - Registry slugs to crawl; when omitted, every registered
 *                      vendor is crawled.
 * @param options - Passed through unchanged to each vendor crawl.
 */
export async function runVendorDiscoveryBatch(
  vendorSlugs?: string[],
  options: { dryRun?: boolean; verbose?: boolean } = {}
): Promise<void> {
  let targets = VENDOR_CATALOG_REGISTRY;
  if (vendorSlugs) {
    targets = VENDOR_CATALOG_REGISTRY.filter((entry) => vendorSlugs.includes(entry.slug));
  }

  console.log(`Starting vendor discovery for ${targets.length} vendor(s)...`);

  // Crawl one vendor, containing any failure so the batch can continue.
  const crawlOne = async (vendor: VendorCatalogConfig): Promise<void> => {
    try {
      await discoverVendorCatalog(vendor, options);
    } catch (err) {
      console.error(`[${vendor.slug}] Fatal crawl error:`, (err as Error).message);
    }
  };

  // Strictly sequential: each crawl finishes before the next starts.
  for (const vendor of targets) {
    await crawlOne(vendor);
  }

  console.log("Vendor discovery batch complete.");
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Standalone execution
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
// CLI entry point: `tsx vendor-discovery-crawler.ts [vendor-slug] [--dry-run] [--verbose]`
if (require.main === module) {
  const cliArgs = process.argv.slice(2);

  // The vendor slug is the first argument that is not a flag, so running with
  // only `--dry-run` / `--verbose` crawls all vendors instead of failing with
  // "Unknown vendor slug: --dry-run".
  const target = cliArgs.find((arg) => !arg.startsWith("--"));
  const dryRun = cliArgs.includes("--dry-run");
  const verbose = cliArgs.includes("--verbose");

  const run = async (): Promise<void> => {
    if (target) {
      const config = VENDOR_CATALOG_REGISTRY.find((v) => v.slug === target);
      if (!config) {
        console.error(`Unknown vendor slug: ${target}`);
        console.log("Available:", VENDOR_CATALOG_REGISTRY.map((v) => v.slug).join(", "));
        process.exit(1);
      }
      await discoverVendorCatalog(config, { dryRun, verbose });
    } else {
      await runVendorDiscoveryBatch(undefined, { dryRun, verbose });
    }
  };

  void (async () => {
    let failed = false;
    try {
      await run();
    } catch (err) {
      console.error("Fatal:", err);
      failed = true;
    }

    // Close the DB pool in both outcomes, and wait for it to finish before a
    // forced exit.
    try {
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      failed = true;
    }

    if (failed) process.exit(1);
  })();
}
|
||||
@ -155,6 +155,15 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
"maintenance:find-equivalences",
|
||||
// ── Re-Research approved equivalences ─────────────────────────────
|
||||
"maintenance:re-research-equivalences",
|
||||
// ── Vendor Discovery Crawlers (TIPLLM training data + DB seeding) ─────
|
||||
"discover:vendor:cisco-tmg",
|
||||
"discover:vendor:juniper",
|
||||
"discover:vendor:arista",
|
||||
"discover:vendor:fs-com",
|
||||
"discover:vendor:flexoptix",
|
||||
"discover:vendor:nokia",
|
||||
"discover:vendor:huawei",
|
||||
"discover:vendor:ii-vi",
|
||||
];
|
||||
|
||||
for (const q of queues) {
|
||||
@ -432,6 +441,21 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
await boss.schedule("scrape:catalog:3com-legacy-oem", "0 6 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||
await boss.schedule("scrape:catalog:avaya-legacy-oem", "15 6 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 });
|
||||
|
||||
// ══════════════════════════════════════════════════════════════════════
|
||||
// VENDOR DISCOVERY CRAWLERS — weekly (deep crawl, Playwright, TIPLLM training)
|
||||
// Each run: crawls catalog → LLM extract → spec validate → DB + Gitea SFT
|
||||
// Staggered across Sun/Mon nights (low-traffic window, 2h expiry each)
|
||||
// ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
await boss.schedule("discover:vendor:cisco-tmg", "0 20 * * 0", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||
await boss.schedule("discover:vendor:juniper", "0 22 * * 0", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||
await boss.schedule("discover:vendor:arista", "0 0 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||
await boss.schedule("discover:vendor:fs-com", "0 2 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||
await boss.schedule("discover:vendor:flexoptix", "0 4 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||
await boss.schedule("discover:vendor:nokia", "0 6 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||
await boss.schedule("discover:vendor:huawei", "0 8 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||
await boss.schedule("discover:vendor:ii-vi", "0 10 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 });
|
||||
|
||||
// ══════════════════════════════════════════════════════════════════════
|
||||
// VENDOR LISTS — every 12h
|
||||
// ══════════════════════════════════════════════════════════════════════
|
||||
@ -2706,5 +2730,31 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
console.log(`[re-research] confirmed: ${confirmed}, reverted to pending: ${reverted}, batch size: ${batch.rows.length}`);
|
||||
});
|
||||
|
||||
console.log("All workers registered (94 jobs, 24/7 continuous)");
|
||||
// ══════════════════════════════════════════════════════════════════════
|
||||
// VENDOR DISCOVERY CRAWLER WORKERS
|
||||
// Each worker calls discoverVendorCatalog() for the matching slug.
|
||||
// Results go to: TIP DB (findOrCreateScrapedTransceiver) +
|
||||
// Gitea tip-training-data repo (SFT JSONL pairs)
|
||||
// ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
const { discoverVendorCatalog, VENDOR_CATALOG_REGISTRY } = await import("./crawler-llm/vendor-discovery-crawler");
|
||||
|
||||
for (const vendorConfig of VENDOR_CATALOG_REGISTRY) {
|
||||
const jobName = `discover:vendor:${vendorConfig.slug}`;
|
||||
boss.work(jobName, async () => {
|
||||
if (!isLoadAcceptable(3.0)) {
|
||||
console.warn(`[${jobName}] Load too high — skipping deep crawl`);
|
||||
return;
|
||||
}
|
||||
console.log(`[${jobName}] Starting vendor discovery crawl…`);
|
||||
try {
|
||||
await discoverVendorCatalog(vendorConfig, { verbose: false });
|
||||
} catch (err) {
|
||||
console.error(`[${jobName}] Fatal:`, (err as Error).message);
|
||||
throw err; // let pg-boss retry
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
console.log("All workers registered (102 jobs, 24/7 continuous + 8 weekly discovery crawlers)");
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user