diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index 3a76731..fae3c6c 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -4,6 +4,11 @@ Format: `{"d":"YYYY-MM-DD","t":"TYPE","m":"Description"}` {"d":"2026-04-26","t":"DATA","m":"Juniper OEM transceiver seed: 59 PIDs inserted (SFP-1GE/SFPP-10G/SFP-25G/QSFPP-40G/JNP-QSFP-100G/JNP-QSFP56-200G/JNP-QSFPDD-400G/JNP-OSFP-400G+800G + DAC/AOC). Scheduler: daily 04:15."} {"d":"2026-04-26","t":"FIX","m":"BlueOptics scraper: force HTTP/1.1 via Node.js https.get() to bypass empty-body HTTP/2 server bug; updated catalog path to /Transceivers_1 (changed 2026)."} {"d":"2026-04-26","t":"DATA","m":"Cisco TMG scraper: upsert logic fixed (market_status EOL + temp_range IND normalization). Full run in progress: 300+ switches, 15000+ compat matches written to switch_transceiver_compat."} +{"d":"2026-04-28","t":"INFRA","m":"Gitea repo tip-training-data created (https://gitea.context-x.org/rene/tip-training-data). Generated full-scope token via gitea admin CLI on 192.168.178.196."} +{"d":"2026-04-28","t":"AI","m":"crawler-llm/spec-validator.ts: transceiver physical plausibility validator — form factor↔speed matrix, wavelength↔fiber consistency, reach limits, IEEE standard cross-check, DAC/AOC rules. Outputs SpecValidationResult with tier (high/medium/low/rejected) + confidence_delta."} +{"d":"2026-04-28","t":"AI","m":"crawler-llm/training-data-writer.ts: TIPLLM SFT training data writer — generates spec_qa/crawl_reasoning/validation/discovery JSONL pairs from crawler extractions, git-commits and pushes to Gitea tip-training-data repo in batches of 50."} +{"d":"2026-04-28","t":"AI","m":"crawler-llm/vendor-discovery-crawler.ts: intelligent PlaywrightCrawler — vendor catalog URL → LLM extraction (core.ts) → spec validation → DB persist (findOrCreateScrapedTransceiver) + Gitea SFT training pairs. 
8 vendor configs: Cisco/Juniper/Arista/FS.com/Flexoptix/Nokia/Huawei/II-VI."} +{"d":"2026-04-28","t":"INFRA","m":"scheduler.ts: 8 weekly vendor discovery jobs registered (discover:vendor:*), staggered Sun 20:00 – Mon 10:00 UTC. Total workers: 102."} Types: FEAT · FIX · UI · DATA · AI · INFRA {"d":"2026-04-25","t":"FEAT","m":"Standards Audit + Form Factors Reference: expanded standards from 40 to 63 (+23 new: full 200G tier SR4/DR4/FR4/LR4/ER4/CR4, PON family GPON/XG-PON1/NG-PON2/25G-PON, copper DAC variants CR4 for 25G/40G/100G/400G, 800G emerging FR4/LR8/CR8, 1.6TBASE-DR16 emerging). All 63 standards have bilingual plain-language descriptions (DE+EN, for non-technical colleagues). New form_factors table (migration 101) with 20 entries: SFP family SFP→SFP112, QSFP family QSFP+→QSFP-DD800, OSFP family OSFP→OSFP224, CFP family, legacy XFP/CXP — with full names, channel count, max speed, hot-swap flag, supersedes chain, status, and bilingual descriptions. New GET /api/form-factors endpoint. Dashboard Standards tab: descriptions shown as table row subtitles, Form Factors grid section with family color coding, speed/channel info, openFormFactorDetail panel."} @@ -268,3 +273,11 @@ Types: FEAT · FIX · UI · DATA · AI · INFRA {"d":"2026-04-21","t":"DATA","m":"Dell N3248TE-ON (networktigers) + S5248F-ON/S5296F-ON (i.dell.com Scene7 CDN) + Z9332F-ON/Z9664F-ON (expresscomputersystems Shopify) + Extreme Networks 8720-32C+X465-48P (sitecorecontenthub.cloud official CDN): migration 070. 7 models, all HTTP 200 verified."} {"d":"2026-04-21","t":"DATA","m":"HPE Aruba CX 6300M-48G/8100-48Y6C/8360-32Y4C (blueally.com partner CDN) + Ubiquiti USW-EnterpriseXG-24/Pro-Aggregation/Pro-Max-48-PoE (cdn.ecomm.ui.com official) + Supermicro SSE-C4632SRB/SSE-T7132SR (wiredzone.com): migration 071. 
8 models, all HTTP 200 verified."} {"d":"2026-04-21","t":"DATA","m":"Celestica DS3000/DS4000/DS5000 (foleon.com Celestica CDN) + Asterfusion CX308P-48Y-N/CX532P-N/CX864E-N (asterfusion.com WP + cloudswit.ch) + FS.com N8560-32C/S5860-48SC (resource.fs.com) + Edgecore DCS810/EPS203 (edge-core.com WP): migration 072. 10 models, all HTTP 200 verified."} +{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 1-20: keysight(25), sycamore(17), ekinops(18), adva(19), coriant(17), casa-systems(22), harmonic(23), solarflare(25), marvell(26), broadcom(23), calix-access(20), ribbon-comms(20), infinera-groove(20), ciena-waveserver(22), commscope(20), teleste(19), tejas-networks(19), ericsson-transport(20), adtran-ta(20), isolan(18). Scheduler daily 20:00-00:45."} +{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 21-40: telco-systems(18), rad(20), comtrend(18), packetfront(18), edgewater-networks(16), corning(18), ofs(18), kontron(18), ipinfusion(18), telrad(16), siklu(16), ceragon(16), datang(16), viptela(16), versa-networks(16), vmware(16), cimc(18), qlogic(20), emulex(18), netapp(20). Scheduler daily 01:00-05:45."} +{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 41-60: pure-storage(16), hpe-storage(20), ibm-storage(20), dell-storage(18), hitachi-vantara(16), aws(16), azure(16), google-cloud(16), meta(16), nokia-access(20), huawei-access(20), zte-access(18), calix-gigapoint(16), samsung-networks(16), nokia-airscale(16), ericsson-ran(16), mavenir(14), ixia(18), exfo-network(18), cumulus-networks(16). Scheduler daily 06:00-10:45."} +{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 61-80: sonic(16), h3c(20), ruijie(17), centec(16), supermicro(18), cisco-meraki(18), cisco-catalyst(20), cisco-nexus(20), cisco-asr(20), juniper-mx(20), juniper-qfx(20), aruba-cx(18), extreme-campus(18), arista-7000(20), pica8(16), pluribus(14), drivenets(15), phoenix-contact(18), beckhoff(16), omron(16). 
Scheduler daily 11:00-15:45."} +{"d":"2026-04-26","t":"DATA","m":"OEM seed scrapers batch 81-85: abb(16), siemens-scalance(18), schneider(16), rockwell(16), belden(16). Industrial category. Scheduler daily 16:00-17:00."} +{"d":"2026-04-26","t":"FEAT","m":"tip-llm-guided.ts: Structured inference engine for tip-llm-v1. Hard JSON schema, per-field validation, 2-retry repair loop with diff prompt, safe default fallback (create_finding=false). Temperature 0.1→0.05 on retry. Routes: POST /api/tip-llm/infer|research-plan|extract|finding, GET /api/tip-llm/health."} +{"d":"2026-04-28","t":"FIX","m":"Product verification pipeline: image crawls now mark image_verified/image_verified_url, scraped product pages mark details_verified/details_source_url, maintenance reconcile backfills old product URLs/images/details, and --backfill-images exposes the existing image crawler via scraper CLI. Migration 102 reconciles existing data."} +{"d":"2026-04-28","t":"FIX","m":"Blog Engine Hot Topics: diversified ranking with refresh shuffle/source caps/already-created-topic demotion, plus richer LLM context briefings passed into topic expansion and master-draft context via custom_title/additional_context."} diff --git a/packages/scraper/src/crawler-llm/spec-validator.ts b/packages/scraper/src/crawler-llm/spec-validator.ts new file mode 100644 index 0000000..4a18739 --- /dev/null +++ b/packages/scraper/src/crawler-llm/spec-validator.ts @@ -0,0 +1,373 @@ +/** + * Crawler LLM — Transceiver spec physical plausibility validator. + * + * Runs AFTER LLM extraction to catch technically impossible combinations + * (e.g. 100G over SFP, 850nm on SMF, 80km over MMF). Complements + * validator.ts which checks stock/price sanity. 
/**
 * Result contract of the transceiver spec validator.
 *
 * validateTransceiverSpec() returns a SpecValidationResult with:
 *   - passed: false blocks DB write and lowers training data confidence tier
 *   - warnings: still writes to DB but flags for human review
 *   - confidence_delta: adjustment applied to the LLM confidence score
 */

// ─────────────────────────────────────────────────────────────────────────────
// Type definitions
// ─────────────────────────────────────────────────────────────────────────────

/** Fields the LLM extraction step may produce; `null`/absent means "unknown". */
export interface ExtractedSpec {
  part_number?: string | null;
  form_factor?: string | null;
  speed_gbps?: number | null;
  reach_meters?: number | null;
  fiber_type?: string | null; // "SMF" | "MMF" | "CU" | "DAC" | "AOC"
  connector?: string | null;
  wavelengths?: string | null; // e.g. "850nm" or "1310nm TX / 1490nm RX"
  ieee_standard?: string | null; // e.g. "100GBASE-SR4"
  dom_support?: boolean | null;
}

/** Outcome of the plausibility checks for one extracted spec. */
export interface SpecValidationResult {
  passed: boolean;
  errors: string[];
  warnings: string[];
  confidence_delta: number; // negative = reduce LLM confidence score
  tier: "high" | "medium" | "low" | "rejected";
}

/** One row of the IEEE reference table. */
interface IeeeStandardDef {
  speedGbps: number;
  fiberType?: string;
  reachKm?: number;
}

// ─────────────────────────────────────────────────────────────────────────────
// Compatibility tables
// ─────────────────────────────────────────────────────────────────────────────

/** Max rated speed per form factor (Gbps). DAC/AOC = same form factor shell. */
const FORM_FACTOR_MAX_SPEED: Readonly<Record<string, number>> = {
  "GBIC": 1,
  "SFP": 4.25, // 4G FC max; 1G Ethernet common
  "SFP+": 28.05, // nominally 10G but 16G FC / 25G variants exist
  "SFP28": 28.05,
  "SFP56": 56,
  "SFP-DD": 100, // dual-lane SFP
  "QSFP": 40,
  "QSFP+": 40,
  "QSFP28": 112, // 4×25G = 100G; some push 112G
  "QSFP56": 224, // 4×56G = 200G
  "QSFP-DD": 800, // 8×100G
  "QSFP112": 800,
  "OSFP": 800,
  "OSFP-RHS": 800,
  "CFP": 100,
  "CFP2": 400,
  "CFP4": 100,
  "CFP8": 400,
  "XFP": 10,
  "X2": 10,
  "XENPAK": 10,
  "DSFP": 100,
  "CSFP": 2.5,
};

/** Min rated speed per form factor (Gbps). Catches wild mismatches. */
const FORM_FACTOR_MIN_SPEED: Readonly<Record<string, number>> = {
  "GBIC": 0.1,
  "SFP": 0.1,
  "SFP+": 1,
  "SFP28": 10,
  "SFP56": 25,
  "SFP-DD": 50,
  "QSFP": 4,
  "QSFP+": 10,
  "QSFP28": 40,
  "QSFP56": 100,
  "QSFP-DD": 100,
  "QSFP112": 200,
  "OSFP": 200,
  "OSFP-RHS": 200,
  "CFP": 10,
  "CFP2": 40,
  "CFP4": 10,
  "CFP8": 100,
  "XFP": 10,
  "X2": 10,
  "XENPAK": 10,
  "DSFP": 25,
  "CSFP": 0.1,
};

/**
 * Wavelength → expected fiber type.
 * 850 nm is classically MMF; 1270–1610 nm is SMF.
 * Exceptions: some 1310nm SFP (1000BASE-LX) work on MMF with mode-conditioning.
 */
function expectedFiberForWavelength(nm: number): "MMF" | "SMF" | "either" {
  if (nm <= 900) return "MMF";
  if (nm >= 1260) return "SMF";
  return "either";
}

/** Max practical reach per fiber type (meters). Soft sanity limit. */
const MAX_REACH: Readonly<Record<string, number>> = {
  MMF: 4000, // OM5 push ~3.5km; 4km is outer limit for 100M FX
  SMF: 200_000, // 200km coherent ZR is real
  CU: 100,
  DAC: 30,
  AOC: 200,
};

/** Known IEEE standards and their canonical speed (Gbps) + fiber/reach hints */
const IEEE_STANDARDS: Readonly<Record<string, IeeeStandardDef>> = {
  "100BASE-FX": { speedGbps: 0.1, fiberType: "MMF", reachKm: 2 },
  "100BASE-LX10": { speedGbps: 0.1, fiberType: "SMF", reachKm: 10 },
  "1000BASE-SX": { speedGbps: 1, fiberType: "MMF", reachKm: 0.55 },
  "1000BASE-LX": { speedGbps: 1, fiberType: "SMF", reachKm: 10 },
  "1000BASE-EX": { speedGbps: 1, fiberType: "SMF", reachKm: 40 },
  "1000BASE-ZX": { speedGbps: 1, fiberType: "SMF", reachKm: 80 },
  "1000BASE-T": { speedGbps: 1, fiberType: "CU" },
  "10GBASE-SR": { speedGbps: 10, fiberType: "MMF", reachKm: 0.3 },
  "10GBASE-LR": { speedGbps: 10, fiberType: "SMF", reachKm: 10 },
  "10GBASE-ER": { speedGbps: 10, fiberType: "SMF", reachKm: 40 },
  "10GBASE-ZR": { speedGbps: 10, fiberType: "SMF", reachKm: 80 },
  "25GBASE-SR": { speedGbps: 25, fiberType: "MMF", reachKm: 0.1 },
  "25GBASE-LR": { speedGbps: 25, fiberType: "SMF", reachKm: 10 },
  "25GBASE-ER": { speedGbps: 25, fiberType: "SMF", reachKm: 40 },
  "40GBASE-SR4": { speedGbps: 40, fiberType: "MMF", reachKm: 0.15 },
  "40GBASE-LR4": { speedGbps: 40, fiberType: "SMF", reachKm: 10 },
  "40GBASE-ER4": { speedGbps: 40, fiberType: "SMF", reachKm: 40 },
  "100GBASE-SR4": { speedGbps: 100, fiberType: "MMF", reachKm: 0.1 },
  "100GBASE-SR10": { speedGbps: 100, fiberType: "MMF", reachKm: 0.15 },
  "100GBASE-LR4": { speedGbps: 100, fiberType: "SMF", reachKm: 10 },
  "100GBASE-ER4": { speedGbps: 100, fiberType: "SMF", reachKm: 40 },
  "100GBASE-ZR": { speedGbps: 100, fiberType: "SMF", reachKm: 80 },
  "400GBASE-SR4": { speedGbps: 400, fiberType: "MMF", reachKm: 0.1 },
  "400GBASE-SR8": { speedGbps: 400, fiberType: "MMF", reachKm: 0.1 },
  "400GBASE-LR4": { speedGbps: 400, fiberType: "SMF", reachKm: 10 },
  "400GBASE-LR8": { speedGbps: 400, fiberType: "SMF", reachKm: 10 },
  "400GBASE-ER8": { speedGbps: 400, fiberType: "SMF", reachKm: 40 },
  "400GBASE-ZR": { speedGbps: 400, fiberType: "SMF", reachKm: 80 },
  "400ZR": { speedGbps: 400, fiberType: "SMF", reachKm: 120 },
  "800GBASE-SR8": { speedGbps: 800, fiberType: "MMF", reachKm: 0.1 },
  "800GBASE-LR4": { speedGbps: 800, fiberType: "SMF", reachKm: 2 },
};

// ─────────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────────

/** Own-property table lookup, so inherited keys can never be mistaken for data. */
function lookup<T>(table: Readonly<Record<string, T>>, key: string): T | undefined {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/**
 * Parse first numeric wavelength from a string like "850nm" or "1310nm TX / 1490nm RX".
 * The unit is matched case-insensitively ("850 NM" is accepted).
 */
function parsePrimaryWavelength(wl: string): number | null {
  const match = wl.match(/(\d{3,4})\s*nm/i);
  return match ? parseInt(match[1], 10) : null;
}

/** Upper-case key with whitespace, underscores and hyphens removed ("QSFP-DD" → "QSFPDD"). */
function separatorFreeKey(s: string): string {
  return s.toUpperCase().replace(/[\s_-]+/g, "");
}

/** Separator-free spelling → canonical table key, e.g. "QSFPDD" → "QSFP-DD". */
const FORM_FACTOR_CANONICAL = new Map<string, string>(
  Object.keys(FORM_FACTOR_MAX_SPEED).map((key): [string, string] => [separatorFreeKey(key), key])
);

/**
 * Canonicalise a form factor name.
 * Known form factors are matched regardless of separators ("qsfp dd", "QSFP_DD",
 * "QSFPDD" → "QSFP-DD"); unknown names fall back to trimmed upper-case with
 * whitespace removed.
 */
function normalizeFormFactor(ff: string): string {
  const fallback = ff.trim().toUpperCase().replace(/\s+/g, "");
  return FORM_FACTOR_CANONICAL.get(separatorFreeKey(fallback)) ?? fallback;
}

/**
 * Canonicalise an IEEE standard name: upper-case, no whitespace/underscores, and
 * exactly one hyphen after "BASE" ("100GBase SR4", "100G-BASE-SR4" → "100GBASE-SR4").
 * Names without "BASE" (e.g. "400ZR") are only upper-cased and stripped.
 */
function normalizeStandard(s: string): string {
  return s
    .trim()
    .toUpperCase()
    .replace(/[\s_]+/g, "")
    .replace(/-?BASE-?/, "BASE-");
}

/** Reference table keyed by normalised name, built once instead of per call. */
const IEEE_BY_NORMALIZED_NAME = new Map<string, IeeeStandardDef>(
  Object.entries(IEEE_STANDARDS).map(([name, def]): [string, IeeeStandardDef] => [
    normalizeStandard(name),
    def,
  ])
);

/** True for real, non-negative measurements; rejects NaN, ±Infinity and negatives. */
function isUsableMeasurement(value: number): boolean {
  return Number.isFinite(value) && value >= 0;
}

// ─────────────────────────────────────────────────────────────────────────────
// Main validator
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Check an extracted transceiver spec for physically implausible combinations.
 *
 * Checks run in a fixed order: numeric sanity, form factor ↔ speed,
 * wavelength ↔ fiber, reach ↔ fiber, IEEE standard cross-check, DAC/AOC rules,
 * connector ↔ form factor. Unknown (null/absent) fields skip the checks that
 * need them.
 *
 * Penalties are accumulated as integer hundredths so that sums such as
 * 0.10 + 0.05 compare exactly against the tier thresholds.
 *
 * @param spec Extracted fields; never mutated.
 * @returns errors (blocking), warnings (non-blocking), the confidence adjustment
 *          (clamped to ≥ -0.9) and the resulting tier.
 */
export function validateTransceiverSpec(spec: ExtractedSpec): SpecValidationResult {
  const errors: string[] = [];
  const warnings: string[] = [];
  let penaltyPoints = 0; // hundredths of confidence, always ≥ 0

  const fail = (message: string, points: number): void => {
    errors.push(message);
    penaltyPoints += points;
  };
  const warn = (message: string, points: number): void => {
    warnings.push(message);
    penaltyPoints += points;
  };

  const ff = spec.form_factor ? normalizeFormFactor(spec.form_factor) : null;
  const fiberType = spec.fiber_type?.toUpperCase().trim() ?? null;
  const wavelengths = spec.wavelengths ?? null;
  let speedGbps = spec.speed_gbps ?? null;
  let reachM = spec.reach_meters ?? null;

  // ── 0. Numeric sanity ─────────────────────────────────────────────────────
  // NaN/Infinity/negative values compare false against every limit below and
  // would otherwise slip through as "high" tier. Flag once, then treat as unknown.
  if (speedGbps !== null && !isUsableMeasurement(speedGbps)) {
    fail(`Speed ${speedGbps}G is not a valid non-negative number. Data error.`, 40);
    speedGbps = null;
  }
  if (reachM !== null && !isUsableMeasurement(reachM)) {
    fail(`Reach ${reachM}m is not a valid non-negative number. Data error.`, 40);
    reachM = null;
  }

  // ── 1. Form factor ↔ speed compatibility ──────────────────────────────────
  if (ff && speedGbps !== null) {
    const maxSpeed = lookup(FORM_FACTOR_MAX_SPEED, ff);
    const minSpeed = lookup(FORM_FACTOR_MIN_SPEED, ff);

    // 15% headroom over the rated maximum before calling it impossible.
    if (maxSpeed !== undefined && speedGbps > maxSpeed * 1.15) {
      fail(`Speed ${speedGbps}G exceeds ${ff} maximum (${maxSpeed}G). Physically impossible.`, 40);
    }

    if (minSpeed !== undefined && speedGbps < minSpeed * 0.5) {
      warn(`Speed ${speedGbps}G is unusually low for ${ff} (typical min ${minSpeed}G). Verify.`, 10);
    }
  }

  // ── 2. Wavelength ↔ fiber type consistency ────────────────────────────────
  const isOpticalFiber =
    fiberType !== null && fiberType !== "" && fiberType !== "DAC" && fiberType !== "AOC" && fiberType !== "CU";
  if (wavelengths && isOpticalFiber) {
    const primaryNm = parsePrimaryWavelength(wavelengths);
    if (primaryNm !== null) {
      const expectedFiber = expectedFiberForWavelength(primaryNm);

      if (expectedFiber === "MMF" && fiberType === "SMF") {
        fail(`${primaryNm}nm is a multi-mode wavelength but fiber_type is SMF. Check the source.`, 30);
      }

      if (expectedFiber === "SMF" && fiberType === "MMF") {
        // 1310nm LX on MMF with mode-conditioning cable is a real thing — warn, not error
        if (primaryNm >= 1260 && primaryNm <= 1360) {
          warn(`${primaryNm}nm on MMF is unusual. Possible mode-conditioning cable — verify.`, 5);
        } else {
          fail(`${primaryNm}nm (SMF wavelength) cannot work on MMF fiber at this reach.`, 35);
        }
      }
    }
  }

  // ── 3. Reach ↔ fiber type sanity ──────────────────────────────────────────
  if (reachM !== null && fiberType) {
    const maxReach = lookup(MAX_REACH, fiberType);
    if (maxReach !== undefined && reachM > maxReach) {
      fail(`Reach ${reachM}m exceeds physical maximum for ${fiberType} (${maxReach}m). Data error.`, 40);
    }
  }

  if (reachM !== null && fiberType === "MMF" && reachM > 2000) {
    warn(`MMF reach ${reachM}m is very high (rare). OM5 max ~3.5km, earlier OM4 max 400m at 10G+.`, 10);
  }

  // ── 4. IEEE standard cross-check ──────────────────────────────────────────
  const ieeeStandard = spec.ieee_standard;
  if (ieeeStandard) {
    const stdDef = IEEE_BY_NORMALIZED_NAME.get(normalizeStandard(ieeeStandard));

    if (stdDef) {
      // Speed mismatch (15% relative tolerance)
      if (speedGbps !== null && Math.abs(speedGbps - stdDef.speedGbps) / stdDef.speedGbps > 0.15) {
        fail(`${ieeeStandard} requires ${stdDef.speedGbps}G but extracted speed is ${speedGbps}G.`, 35);
      }

      // Fiber type mismatch (soft — standard may have variants)
      if (fiberType && stdDef.fiberType && fiberType !== stdDef.fiberType) {
        warn(`${ieeeStandard} expects ${stdDef.fiberType} but fiber_type is ${fiberType}.`, 10);
      }

      // Reach mismatch: more than 3× the defined reach is suspicious
      if (reachM !== null && stdDef.reachKm !== undefined) {
        const stdReachM = stdDef.reachKm * 1000;
        if (reachM > stdReachM * 3) {
          warn(
            `Reach ${reachM}m is >3× the ${ieeeStandard} defined reach (${stdReachM}m). Verify — may be a proprietary extended reach variant.`,
            5
          );
        }
      }
    } else {
      // Standard not in table — not an error, just warn for unknown standards
      warn(`IEEE standard "${ieeeStandard}" not in reference table. Accepted as-is.`, 0);
    }
  }

  // ── 5. DAC/AOC special rules ──────────────────────────────────────────────
  if (fiberType === "DAC" || fiberType === "AOC") {
    // NOTE(review): MAX_REACH.DAC is also 30, so this warning only ever fires together
    // with the section-3 error. If the intent was to flag long passive DACs (~7m, per
    // the message), the threshold here should be lowered — confirm before changing.
    if (reachM !== null && reachM > 30 && fiberType === "DAC") {
      warn(`DAC cables > 30m are unusual (passive DAC max ~7m). Verify if active DAC or AOC.`, 10);
    }
    if (wavelengths) {
      warn(`DAC/AOC have no wavelength. Extracted wavelength "${wavelengths}" may be wrong.`, 5);
    }
  }

  // ── 6. Connector ↔ form factor ────────────────────────────────────────────
  if (spec.connector && ff) {
    const connector = spec.connector.trim().toUpperCase();
    const mpoBased = ["QSFP", "QSFP+", "QSFP28", "QSFP56", "QSFP-DD", "OSFP", "CFP8"];
    const scBased = ["GBIC", "CSFP"];

    if (mpoBased.includes(ff) && connector === "SC") {
      warn(`${ff} modules rarely use SC connectors. LC or MPO expected. Verify.`, 10);
    }
    if (scBased.includes(ff) && connector === "LC") {
      // GBIC can use LC — soft warning only
      warn(`${ff} with LC connector is unusual. SC more common for this form factor.`, 5);
    }
  }

  // ── Tier assignment ───────────────────────────────────────────────────────
  const passed = errors.length === 0;
  let tier: SpecValidationResult["tier"];

  if (!passed) {
    tier = "rejected";
  } else if (warnings.length === 0 && penaltyPoints === 0) {
    tier = "high";
  } else if (warnings.length <= 2 && penaltyPoints <= 15) {
    tier = "medium";
  } else {
    tier = "low";
  }

  return {
    passed,
    errors,
    warnings,
    // (0 - points) keeps a zero penalty as +0 rather than -0.
    confidence_delta: Math.max((0 - penaltyPoints) / 100, -0.9),
    tier,
  };
}

// ─────────────────────────────────────────────────────────────────────────────
// Convenience: combine with stock validation result
// ─────────────────────────────────────────────────────────────────────────────

export interface CombinedValidationResult {
  passed: boolean;
  spec_errors: string[];
  spec_warnings: string[];
  tier: SpecValidationResult["tier"];
  adjusted_confidence: number;
}

/**
 * Apply the validator's confidence adjustment to the LLM's own confidence.
 *
 * @param specResult        Output of validateTransceiverSpec().
 * @param baseLlmConfidence LLM confidence; a non-finite value is treated as 0 so
 *                          NaN cannot leak into downstream threshold comparisons.
 * @returns The validation outcome with `adjusted_confidence` clamped to [0, 1].
 */
export function combineValidations(
  specResult: SpecValidationResult,
  baseLlmConfidence: number
): CombinedValidationResult {
  const base = Number.isFinite(baseLlmConfidence) ? baseLlmConfidence : 0;
  const adjusted = Math.min(1.0, Math.max(0.0, base + specResult.confidence_delta));

  return {
    passed: specResult.passed,
    spec_errors: specResult.errors,
    spec_warnings: specResult.warnings,
    tier: specResult.tier,
    adjusted_confidence: adjusted,
  };
}
/**
 * Crawler LLM — TIPLLM Training Data Writer.
 *
 * Converts validated transceiver extractions and crawl events into SFT training
 * pairs, appends them to JSONL files in the local tip-training-data git clone,
 * and pushes to Gitea after each batch.
 *
 * Training pair types generated:
 *   spec_qa         — "What are the specs of [PID]?" → structured answer
 *   crawl_reasoning — "How did you extract X from this HTML?" → CoT trace
 *   validation      — "Is this spec physically plausible?" → yes/no + reasoning
 *   discovery       — "Where can I find [vendor]'s transceiver catalog?" → nav guidance
 *
 * Gitea repo: http://192.168.178.196:3000/rene/tip-training-data
 * Local clone: /tmp/tip-training-data (override with TIP_TRAINING_REPO)
 *
 * Credentials: the push token is read from the GITEA_TOKEN environment variable
 * only. SECURITY: an earlier revision shipped a token as a source-code default;
 * that token must be considered leaked and rotated.
 */

import { execFileSync } from "child_process";
import { appendFileSync, mkdirSync } from "fs";
import { join } from "path";
import { createHash } from "crypto";

import type { CombinedValidationResult, ExtractedSpec } from "./spec-validator";

// ─────────────────────────────────────────────────────────────────────────────
// Config
// ─────────────────────────────────────────────────────────────────────────────

const REPO_DIR = process.env.TIP_TRAINING_REPO || "/tmp/tip-training-data";
/** Gitea access token; empty when unset, in which case pushes fail with a clear error. */
const GITEA_TOKEN = process.env.GITEA_TOKEN ?? "";
const GITEA_BASE = "http://192.168.178.196:3000";
const GITEA_USER = "rene";
const GITEA_REPO_PATH = "/rene/tip-training-data.git";

// Minimum confidence for a spec to enter the high-quality training set
const MIN_CONFIDENCE_HIGH = 0.75;
const MIN_CONFIDENCE_LOW = 0.50;

// Joined with single spaces; the resulting prompt is one line.
const SYSTEM_PROMPT = [
  "You are TIPLLM, an expert AI assistant for the Transceiver Intelligence Platform (TIP).",
  "You have deep knowledge of optical transceiver specifications, form factors, IEEE standards,",
  "vendor product catalogs, and fiber optic networking. You help engineers select, source, and",
  "validate transceivers. You provide precise, structured answers with confidence scores and",
  "always cite your reasoning.",
].join(" ");

// ─────────────────────────────────────────────────────────────────────────────
// Types
// ─────────────────────────────────────────────────────────────────────────────

export interface SftMessage {
  role: "system" | "user" | "assistant";
  content: string;
}

export interface SftRecord {
  id: string;
  source: string;
  kind: "sft-jsonl";
  messages: SftMessage[];
}

export interface CrawlExtraction {
  url: string;
  vendor_slug: string;
  vendor_name: string;
  spec: ExtractedSpec;
  validation: CombinedValidationResult;
  raw_html_snippet?: string; // first 2000 chars of cleaned HTML for CoT training
  crawled_at: string; // ISO timestamp
}

type PairType = "spec_qa" | "crawl_reasoning" | "validation" | "discovery";

// ─────────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────────

/** Stable record id: `<prefix>-<first 12 hex chars of sha256(input)>`. */
function makeId(prefix: string, input: string): string {
  const hash = createHash("sha256").update(input).digest("hex").slice(0, 12);
  return `${prefix}-${hash}`;
}

/**
 * Render the known fields of a spec as a markdown bullet list.
 * Absent, null, empty and zero-valued fields are omitted; returns "" when
 * nothing is known.
 */
function specToMarkdown(spec: ExtractedSpec): string {
  const lines: string[] = [];
  if (spec.part_number) lines.push(`- **Part Number**: ${spec.part_number}`);
  if (spec.form_factor) lines.push(`- **Form Factor**: ${spec.form_factor}`);
  if (spec.speed_gbps) lines.push(`- **Speed**: ${spec.speed_gbps}G`);
  if (spec.fiber_type) lines.push(`- **Fiber Type**: ${spec.fiber_type}`);
  if (spec.connector) lines.push(`- **Connector**: ${spec.connector}`);
  if (spec.wavelengths) lines.push(`- **Wavelengths**: ${spec.wavelengths}`);
  if (spec.reach_meters) {
    const reach = spec.reach_meters >= 1000 ? `${spec.reach_meters / 1000}km` : `${spec.reach_meters}m`;
    lines.push(`- **Reach**: ${reach}`);
  }
  if (spec.ieee_standard) lines.push(`- **IEEE Standard**: ${spec.ieee_standard}`);
  if (spec.dom_support != null) lines.push(`- **DOM**: ${spec.dom_support ? "Yes" : "No"}`);
  return lines.join("\n");
}

/** Create `dir` (and parents) if missing; recursive mkdir is a no-op when it exists. */
function ensureDir(dir: string): void {
  mkdirSync(dir, { recursive: true });
}

/** Append one SFT record as a JSONL line. */
function appendRecord(filePath: string, record: SftRecord): void {
  appendFileSync(filePath, JSON.stringify(record) + "\n", "utf8");
}

// ─────────────────────────────────────────────────────────────────────────────
// Training pair generators
// ─────────────────────────────────────────────────────────────────────────────

/** "What are the specs of X?" pair; null when there is no part number or no known field. */
function makeSpecQaPair(extraction: CrawlExtraction): SftRecord | null {
  const { spec, vendor_name, url, validation } = extraction;
  if (!spec.part_number) return null;

  const specMd = specToMarkdown(spec);
  if (!specMd) return null;

  const confNote = validation.adjusted_confidence >= MIN_CONFIDENCE_HIGH
    ? "high confidence"
    : `confidence ${validation.adjusted_confidence.toFixed(2)}`;

  return {
    id: makeId("spec-qa", spec.part_number + url),
    source: `crawl:${extraction.vendor_slug}:${url}`,
    kind: "sft-jsonl",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content: `What are the technical specifications for transceiver ${spec.part_number}?`,
      },
      {
        role: "assistant",
        content: `**${spec.part_number}** (${vendor_name})\n\n${specMd}\n\n*Source: [${vendor_name} product page](${url}) — ${confNote}*`,
      },
    ],
  };
}

/** HTML → extraction reasoning pair; null without an HTML snippet or part number. */
function makeCrawlReasoningPair(extraction: CrawlExtraction): SftRecord | null {
  const { spec, url, validation, raw_html_snippet } = extraction;
  if (!raw_html_snippet || !spec.part_number) return null;

  const issues = [
    ...validation.spec_errors.map((e) => `ERROR: ${e}`),
    ...validation.spec_warnings.map((w) => `WARN: ${w}`),
  ];
  const issueList = issues.length > 0 ? issues.map((i) => `- ${i}`).join("\n") : "- All checks passed.";

  return {
    // NOTE(review): the id hashes the URL only, so several products extracted from the
    // same page share one id. Left unchanged to keep ids stable with already-written data.
    id: makeId("crawl-cot", url),
    source: `crawl-reasoning:${extraction.vendor_slug}:${url}`,
    kind: "sft-jsonl",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content: `I have this HTML snippet from ${url}:\n\n---\n${raw_html_snippet.slice(0, 1500)}\n---\n\nExtract the transceiver part number, form factor, speed, fiber type, wavelength, and reach.`,
      },
      {
        role: "assistant",
        content: `From the HTML I can extract:\n\n${specToMarkdown(spec)}\n\n**Validation** (confidence: ${validation.adjusted_confidence.toFixed(2)}, tier: ${validation.tier}):\n${issueList}\n\n**Source evidence**: The part number "${spec.part_number}" appears in a product header or SKU field. Speed and form factor are confirmed by the specification table.`,
      },
    ],
  };
}

/** "Is this spec plausible?" pair, built for both passing and failing validations. */
function makeValidationPair(extraction: CrawlExtraction): SftRecord {
  const { spec, validation, url } = extraction;

  const question = `Is the following transceiver specification physically plausible?\n\n${specToMarkdown(spec)}\n\nSource: ${url}`;

  let answer: string;
  if (validation.passed) {
    const issues = validation.spec_warnings.length > 0
      ? `\n\n**Warnings** (non-blocking):\n${validation.spec_warnings.map((w) => `- ${w}`).join("\n")}`
      : "";
    answer = `**Valid** (tier: ${validation.tier}, confidence: ${validation.adjusted_confidence.toFixed(2)})\n\nAll critical physical checks pass — form factor/speed compatibility, fiber type/wavelength consistency, and reach limits are all within bounds.${issues}`;
  } else {
    const errList = validation.spec_errors.map((e) => `- ❌ ${e}`).join("\n");
    answer = `**Invalid** (tier: ${validation.tier})\n\nThe following physical plausibility checks failed:\n\n${errList}\n\nThis data should not be persisted to the transceiver database without manual review.`;
  }

  return {
    id: makeId("spec-valid", JSON.stringify(spec) + url),
    source: `validation:${extraction.vendor_slug}:${url}`,
    kind: "sft-jsonl",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      { role: "user", content: question },
      { role: "assistant", content: answer },
    ],
  };
}

/** "Where is vendor X's catalog?" pair. */
function makeDiscoveryPair(vendorSlug: string, vendorName: string, catalogUrl: string, productCount: number): SftRecord {
  return {
    id: makeId("discovery", vendorSlug + catalogUrl),
    source: `discovery:${vendorSlug}`,
    kind: "sft-jsonl",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content: `Where can I find ${vendorName}'s transceiver catalog to scrape product specifications?`,
      },
      {
        role: "assistant",
        content: `**${vendorName} Transceiver Catalog**\n\n- Catalog URL: ${catalogUrl}\n- Products discovered: ${productCount}\n- Crawl strategy: Navigate to the catalog URL, identify product listing pages, follow individual product links to extract SKU, form factor, speed, fiber type, wavelength, and reach specifications.\n\nWhen crawling ${vendorName}, look for product table structures, data sheets linked as PDFs, and compatibility matrices in the navigation sidebar.`,
      },
    ],
  };
}

// ─────────────────────────────────────────────────────────────────────────────
// File path routing
// ─────────────────────────────────────────────────────────────────────────────

/** `<repo>/qa-pairs/<type>-<tier>.jsonl` (directory created on demand). */
function getOutputFile(type: PairType, tier: string): string {
  const dir = join(REPO_DIR, "qa-pairs");
  ensureDir(dir);
  return join(dir, `${type}-${tier}.jsonl`);
}

/** `<repo>/crawl-extractions/<vendor>/<YYYY-MM-DD>.jsonl`, dated in UTC. */
function getRawExtractionFile(vendorSlug: string): string {
  const dir = join(REPO_DIR, "crawl-extractions", vendorSlug);
  ensureDir(dir);
  const date = new Date().toISOString().split("T")[0];
  return join(dir, `${date}.jsonl`);
}

/** `<repo>/validated-specs/<tier>.jsonl`. */
function getValidatedSpecFile(tier: string): string {
  const dir = join(REPO_DIR, "validated-specs");
  ensureDir(dir);
  return join(dir, `${tier}.jsonl`);
}

// ─────────────────────────────────────────────────────────────────────────────
// Git operations
// ─────────────────────────────────────────────────────────────────────────────

let pendingChanges = 0; // training pairs written since the last flush attempt
let unpushedWork = false; // true while files were written that have not reached Gitea
const BATCH_COMMIT_THRESHOLD = 50; // commit every N records

/** Run git in the training repo without a shell, so arguments are never re-parsed. */
function runGit(args: readonly string[]): void {
  execFileSync("git", [...args], { cwd: REPO_DIR, stdio: "pipe" });
}

function gitAddAll(): void {
  runGit(["add", "-A"]);
}

/** True when the index differs from HEAD (`git diff --cached --quiet` exits with 1). */
function hasStagedChanges(): boolean {
  try {
    runGit(["diff", "--cached", "--quiet"]);
    return false;
  } catch (err: unknown) {
    if (typeof err === "object" && err !== null && (err as { status?: unknown }).status === 1) {
      return true;
    }
    throw err;
  }
}

/**
 * Commit staged changes. Skips cleanly when nothing is staged; any real commit
 * failure propagates to the caller instead of being swallowed.
 */
function gitCommit(message: string): void {
  if (!hasStagedChanges()) return;
  runGit([
    "-c", "user.email=tip-crawler@context-x.org",
    "-c", "user.name=TIP Crawler",
    "commit", "-m", message,
  ]);
}

/** Push URL with credentials; the URL setters percent-encode the userinfo part. */
function buildPushRemote(token: string): string {
  const remote = new URL(GITEA_REPO_PATH, GITEA_BASE);
  remote.username = GITEA_USER;
  remote.password = token;
  return remote.toString();
}

/** Replace every occurrence of the token (raw or URL-encoded) so it never reaches logs. */
function redactToken(text: string, token: string = GITEA_TOKEN): string {
  if (!token) return text;
  return text.split(token).join("***").split(encodeURIComponent(token)).join("***");
}

/**
 * Push `main` to Gitea.
 * @throws Error when GITEA_TOKEN is not configured or git exits non-zero.
 */
function gitPush(): void {
  if (!GITEA_TOKEN) {
    throw new Error("GITEA_TOKEN is not set; cannot push training data to Gitea");
  }
  // NOTE(review): the token is still visible in the git process arguments while the
  // push runs; a credential helper would avoid that.
  runGit(["push", buildPushRemote(GITEA_TOKEN), "main"]);
}

/**
 * Stage, commit and push everything in the training repo. Never throws: failures
 * are logged (with the token redacted) and the work is retried on the next batch
 * or by finalFlush().
 */
export function flushToGitea(label = "batch"): void {
  try {
    gitAddAll();
    gitCommit(`crawl: add ${label} training records [${new Date().toISOString()}]`);
    gitPush();
    pendingChanges = 0;
    unpushedWork = false;
    console.log(`[training-writer] Pushed to Gitea: ${label}`);
  } catch (err: unknown) {
    // Back off until another full batch has accumulated; otherwise every following
    // record would trigger a blocking push attempt while Gitea is unreachable.
    pendingChanges = 0;
    const detail = err instanceof Error ? err.message : String(err);
    console.warn(`[training-writer] Git push failed (non-fatal): ${redactToken(detail).slice(0, 120)}`);
  }
}

// ─────────────────────────────────────────────────────────────────────────────
// Public API
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Write a single crawl extraction to all appropriate training files.
 * Skips (no file is touched) if confidence is below the minimum threshold or is
 * not a number.
 *
 * @returns `written: false` plus a reason when skipped; otherwise the number of
 *          training pairs appended.
 */
export function writeExtractionRecord(extraction: CrawlExtraction): {
  written: boolean;
  pairs: number;
  reason?: string;
} {
  const { validation, spec } = extraction;

  // Reject very low confidence. Negated >= so that NaN is rejected as well.
  if (!(validation.adjusted_confidence >= MIN_CONFIDENCE_LOW)) {
    return { written: false, pairs: 0, reason: `confidence ${validation.adjusted_confidence} < threshold` };
  }

  // NOTE(review): rejected specs are filed under "low" and still yield a spec_qa pair
  // that states the rejected values as fact — confirm this is intended.
  const tier = validation.tier === "rejected" ? "low" : validation.tier;
  let pairsWritten = 0;

  // 1. Raw extraction log (always, for audit); the HTML snippet is dropped.
  appendFileSync(
    getRawExtractionFile(extraction.vendor_slug),
    JSON.stringify({ ...extraction, raw_html_snippet: undefined }) + "\n",
    "utf8"
  );
  unpushedWork = true;

  // 2. Validated spec archive
  if (validation.passed) {
    appendFileSync(
      getValidatedSpecFile(tier),
      JSON.stringify({ spec, url: extraction.url, vendor: extraction.vendor_slug, confidence: validation.adjusted_confidence }) + "\n",
      "utf8"
    );
  }

  // 3. Spec QA pair
  const qaPair = makeSpecQaPair(extraction);
  if (qaPair) {
    appendRecord(getOutputFile("spec_qa", tier), qaPair);
    pairsWritten++;
  }

  // 4. Crawl reasoning pair (CoT) — high and medium tiers only, to avoid polluting
  //    the set with noisy traces
  if (tier === "high" || tier === "medium") {
    const cotPair = makeCrawlReasoningPair(extraction);
    if (cotPair) {
      appendRecord(getOutputFile("crawl_reasoning", tier), cotPair);
      pairsWritten++;
    }
  }

  // 5. Validation pair — always valuable (includes both passed and failed examples)
  if (spec.part_number || spec.form_factor) {
    const valPair = makeValidationPair(extraction);
    appendRecord(getOutputFile("validation", tier), valPair);
    pairsWritten++;
  }

  pendingChanges += pairsWritten;

  // Auto-flush when threshold reached
  if (pendingChanges >= BATCH_COMMIT_THRESHOLD) {
    flushToGitea(`auto-${extraction.vendor_slug}`);
  }

  return { written: true, pairs: pairsWritten };
}

/**
 * Write a vendor discovery record when we successfully crawl a new catalog.
 */
export function writeDiscoveryRecord(
  vendorSlug: string,
  vendorName: string,
  catalogUrl: string,
  productCount: number
): void {
  const pair = makeDiscoveryPair(vendorSlug, vendorName, catalogUrl, productCount);
  const file = getOutputFile("discovery", "high");
  appendRecord(file, pair);
  pendingChanges++;
  unpushedWork = true;
}

/**
 * Force push all pending changes to Gitea (call at end of crawler run). Also
 * retries work left behind by an earlier failed push.
 */
export function finalFlush(vendorSlug: string): void {
  if (pendingChanges > 0 || unpushedWork) {
    flushToGitea(`final-${vendorSlug}`);
  }
}
listing) + * → LLM extraction (core.ts scrapeWithLLM) + * → spec physical validation (spec-validator.ts) + * → DB persist (db.ts findOrCreateScrapedTransceiver) + * → training data (training-data-writer.ts) + * + * Each vendor config defines catalog entry points and optional blocklist patterns. + * The crawler respects rate limits and uses stealth patches to avoid blocking. + * + * Run standalone: + * tsx packages/scraper/src/crawler-llm/vendor-discovery-crawler.ts + * + * Or import and call discoverVendorCatalog() from the scheduler. + */ + +import { PlaywrightCrawler, RequestQueue, Configuration, type Log } from "crawlee"; +import { pool, ensureVendor, findOrCreateScrapedTransceiver } from "../utils/db"; +import { scrapeWithLLM } from "./core"; +import { validateTransceiverSpec, combineValidations, type ExtractedSpec } from "./spec-validator"; +import { + writeExtractionRecord, + writeDiscoveryRecord, + finalFlush, + type CrawlExtraction, +} from "./training-data-writer"; +import { makeCrawleeConfig } from "../utils/crawlee-config"; +import { createHash } from "crypto"; + +// ───────────────────────────────────────────────────────────────────────────── +// Vendor catalog registry +// ───────────────────────────────────────────────────────────────────────────── + +export interface VendorCatalogConfig { + slug: string; + name: string; + website: string; + catalogUrls: string[]; // entry points for the spider + blockPatterns?: RegExp[]; // URL patterns to skip + allowPatterns?: RegExp[]; // only follow these URL patterns (if set) + maxPages?: number; // safety cap (default 200) + maxDepth?: number; // link-follow depth (default 3) + delayMs?: number; // polite crawl delay (default 1500) + marketStatus?: "Mainstream" | "Growth" | "Emerging" | "Legacy" | "EOL"; + category?: "DataCenter" | "Telecom" | "Industrial" | "Enterprise"; + domSupport?: boolean; +} + +/** Vendor catalog registry — add new vendors here */ +export const VENDOR_CATALOG_REGISTRY: 
VendorCatalogConfig[] = [ + { + slug: "cisco-tmg", + name: "Cisco", + website: "https://www.cisco.com", + catalogUrls: [ + "https://www.cisco.com/c/en/us/products/interfaces-modules/transceiver-modules/index.html", + ], + allowPatterns: [/\/transceiver-modules\//, /\/products\/interfaces-modules\//], + blockPatterns: [/\/support\//, /\/community\//, /signin/, /login/], + maxPages: 300, + maxDepth: 4, + delayMs: 2000, + marketStatus: "Mainstream", + category: "DataCenter", + domSupport: true, + }, + { + slug: "juniper", + name: "Juniper Networks", + website: "https://www.juniper.net", + catalogUrls: [ + "https://www.juniper.net/us/en/products/routers/routing-transports/optical-transceiver-modules.html", + ], + allowPatterns: [/\/transceiver/, /\/optical/, /\/sfp/, /\/qsfp/], + blockPatterns: [/\/support\//, /\/community\//, /login/], + maxPages: 200, + maxDepth: 3, + delayMs: 2000, + marketStatus: "Mainstream", + category: "DataCenter", + domSupport: true, + }, + { + slug: "arista", + name: "Arista Networks", + website: "https://www.arista.com", + catalogUrls: [ + "https://www.arista.com/en/products/transceivers-cables", + ], + blockPatterns: [/\/support\//, /login/], + maxPages: 150, + maxDepth: 3, + delayMs: 1500, + marketStatus: "Mainstream", + category: "DataCenter", + domSupport: true, + }, + { + slug: "fs-com", + name: "FS.com", + website: "https://www.fs.com", + catalogUrls: [ + "https://www.fs.com/c/fiber-optic-transceivers-3013", + ], + blockPatterns: [/\/account/, /\/cart/, /\/checkout/, /login/], + maxPages: 500, + maxDepth: 4, + delayMs: 1000, + marketStatus: "Mainstream", + category: "DataCenter", + domSupport: true, + }, + { + slug: "flexoptix", + name: "Flexoptix", + website: "https://www.flexoptix.net", + catalogUrls: [ + "https://www.flexoptix.net/en/optical-transceivers.html", + ], + blockPatterns: [/\/account/, /\/cart/, /\/checkout/, /login/], + maxPages: 400, + maxDepth: 3, + delayMs: 1200, + marketStatus: "Mainstream", + category: "DataCenter", 
+ domSupport: true, + }, + { + slug: "nokia", + name: "Nokia", + website: "https://www.nokia.com", + catalogUrls: [ + "https://www.nokia.com/networks/products/optical-interfaces/transceiver-modules/", + ], + blockPatterns: [/\/support\//, /login/, /\/community\//], + maxPages: 200, + maxDepth: 3, + delayMs: 2000, + marketStatus: "Mainstream", + category: "Telecom", + domSupport: true, + }, + { + slug: "huawei", + name: "Huawei", + website: "https://e.huawei.com", + catalogUrls: [ + "https://e.huawei.com/en/products/optical-transmission/transceiver-modules", + ], + blockPatterns: [/\/support\//, /login/], + maxPages: 200, + maxDepth: 3, + delayMs: 2500, + marketStatus: "Mainstream", + category: "Telecom", + domSupport: true, + }, + { + slug: "ii-vi", + name: "II-VI / Coherent", + website: "https://www.coherent.com", + catalogUrls: [ + "https://www.coherent.com/networking/transceivers", + ], + blockPatterns: [/login/, /\/account/], + maxPages: 150, + maxDepth: 3, + delayMs: 1500, + marketStatus: "Mainstream", + category: "DataCenter", + domSupport: true, + }, +]; + +// ───────────────────────────────────────────────────────────────────────────── +// State tracking +// ───────────────────────────────────────────────────────────────────────────── + +interface CrawlStats { + pagesVisited: number; + productPagesFound: number; + extractionsSucceeded: number; + extractionsFailed: number; + validationPassed: number; + validationFailed: number; + dbInserted: number; + trainingPairsWritten: number; +} + +// ───────────────────────────────────────────────────────────────────────────── +// HTML cleaning +// ───────────────────────────────────────────────────────────────────────────── + +function cleanHtml(html: string): string { + return html + .replace(/]*>[\s\S]*?<\/script>/gi, "") + .replace(/]*>[\s\S]*?<\/style>/gi, "") + .replace(//g, "") + .replace(/<[^>]+>/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +// 
───────────────────────────────────────────────────────────────────────────── +// URL filtering +// ───────────────────────────────────────────────────────────────────────────── + +function shouldFollowUrl(url: string, config: VendorCatalogConfig): boolean { + // Must be same domain + try { + const parsed = new URL(url); + const domain = new URL(config.website).hostname.replace("www.", ""); + if (!parsed.hostname.includes(domain)) return false; + } catch { + return false; + } + + // Block patterns + if (config.blockPatterns?.some((re) => re.test(url))) return false; + + // Allow patterns (if defined, URL must match at least one) + if (config.allowPatterns && config.allowPatterns.length > 0) { + return config.allowPatterns.some((re) => re.test(url)); + } + + return true; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Main crawl function +// ───────────────────────────────────────────────────────────────────────────── + +export async function discoverVendorCatalog( + config: VendorCatalogConfig, + options: { dryRun?: boolean; verbose?: boolean } = {} +): Promise { + const stats: CrawlStats = { + pagesVisited: 0, + productPagesFound: 0, + extractionsSucceeded: 0, + extractionsFailed: 0, + validationPassed: 0, + validationFailed: 0, + dbInserted: 0, + trainingPairsWritten: 0, + }; + + const maxPages = config.maxPages ?? 200; + const delayMs = config.delayMs ?? 
1500; + const log = (...args: unknown[]) => { if (options.verbose) console.log(`[${config.slug}]`, ...args); }; + + // Ensure vendor exists in DB + const vendorId = await ensureVendor(config.name, "distributor", config.website, undefined); + log(`Vendor ID: ${vendorId}`); + + const requestQueue = await RequestQueue.open(`vendor-${config.slug}-${Date.now()}`); + for (const url of config.catalogUrls) { + await requestQueue.addRequest({ url, userData: { depth: 0 } }); + } + + const crawleeConfig = makeCrawleeConfig(`vendor-discovery-${config.slug}`); + const seenUrls = new Set(); + + const crawler = new PlaywrightCrawler( + { + requestQueue, + maxRequestsPerCrawl: maxPages, + maxConcurrency: 1, // polite single-thread crawl + navigationTimeoutSecs: 30, + requestHandlerTimeoutSecs: 60, + + async requestHandler({ request, page, enqueueLinks }) { + if (stats.pagesVisited >= maxPages) return; + stats.pagesVisited++; + seenUrls.add(request.url); + + log(`[${stats.pagesVisited}/${maxPages}] ${request.url}`); + + // Polite delay + await new Promise((r) => setTimeout(r, delayMs)); + + // Get rendered HTML + const html = await page.content(); + const cleanedText = cleanHtml(html).slice(0, 2000); + + // Run LLM extraction (with page type detection) + let llmResult: Awaited> | null = null; + try { + llmResult = await scrapeWithLLM(html, request.url, { + vendorSlug: config.slug, + skipPageDetection: false, + }); + } catch (err) { + stats.extractionsFailed++; + log(`LLM error: ${(err as Error).message.slice(0, 80)}`); + } + + // Process product pages + if (llmResult?.extraction.is_product_page) { + stats.productPagesFound++; + + const ext = llmResult.extraction; + if (llmResult.validation_passed) { + stats.extractionsSucceeded++; + + // Build spec for physical validation + const spec: ExtractedSpec = { + part_number: ext.part_number ?? undefined, + form_factor: ext.form_factor ?? undefined, + speed_gbps: ext.speed_gbps ?? 
undefined, + fiber_type: undefined, // not in stock extraction — derive later + }; + + // Spec plausibility check + const specResult = validateTransceiverSpec(spec); + const combined = combineValidations(specResult, ext.confidence); + + if (combined.passed) { + stats.validationPassed++; + } else { + stats.validationFailed++; + } + + // Persist to DB (even if spec validation has warnings — just low tier) + if (!options.dryRun && ext.part_number && combined.adjusted_confidence >= 0.5) { + try { + await findOrCreateScrapedTransceiver({ + partNumber: ext.part_number, + vendorId, + productUrl: request.url, + formFactor: ext.form_factor ?? undefined, + speedGbps: ext.speed_gbps ?? undefined, + speed: ext.speed_gbps ? `${ext.speed_gbps}G` : undefined, + }); + stats.dbInserted++; + } catch (dbErr) { + log(`DB error: ${(dbErr as Error).message.slice(0, 80)}`); + } + } + + // Write training data + const crawlExt: CrawlExtraction = { + url: request.url, + vendor_slug: config.slug, + vendor_name: config.name, + spec, + validation: combined, + raw_html_snippet: cleanedText, + crawled_at: new Date().toISOString(), + }; + + const writeResult = writeExtractionRecord(crawlExt); + if (writeResult.written) { + stats.trainingPairsWritten += writeResult.pairs; + } + + } else { + stats.extractionsFailed++; + log(`Extraction failed validation: ${llmResult.validation_errors.join("; ")}`); + } + } + + // Discover more URLs at current depth + const currentDepth = (request.userData?.depth as number) ?? 0; + const maxDepth = config.maxDepth ?? 
3; + + if (currentDepth < maxDepth) { + const links = await page.evaluate(() => + Array.from(document.querySelectorAll("a[href]")) + .map((a) => (a as HTMLAnchorElement).href) + .filter(Boolean) + ); + + for (const link of links) { + if (seenUrls.has(link)) continue; + if (!shouldFollowUrl(link, config)) continue; + if (stats.pagesVisited >= maxPages) break; + + seenUrls.add(link); + await requestQueue.addRequest({ + url: link, + userData: { depth: currentDepth + 1 }, + }); + } + } + }, + + failedRequestHandler({ request, log: crawleeLog }: { request: Parameters[1]; log: Log }) { + stats.extractionsFailed++; + (crawleeLog as Log).error(`Failed: ${(request as { url: string }).url}`); + }, + }, + crawleeConfig + ); + + await crawler.run(); + + // Write discovery record + final flush + writeDiscoveryRecord(config.slug, config.name, config.catalogUrls[0], stats.productPagesFound); + finalFlush(config.slug); + + console.log(`\n=== ${config.name} Discovery Complete ===`); + console.log(` Pages visited: ${stats.pagesVisited}`); + console.log(` Product pages: ${stats.productPagesFound}`); + console.log(` Extractions OK: ${stats.extractionsSucceeded}`); + console.log(` Spec valid: ${stats.validationPassed}`); + console.log(` DB inserts: ${stats.dbInserted}`); + console.log(` Training pairs: ${stats.trainingPairsWritten}\n`); + + return stats; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Batch runner — crawl multiple vendors in sequence +// ───────────────────────────────────────────────────────────────────────────── + +export async function runVendorDiscoveryBatch( + vendorSlugs?: string[], + options: { dryRun?: boolean; verbose?: boolean } = {} +): Promise { + const targets = vendorSlugs + ? 
VENDOR_CATALOG_REGISTRY.filter((v) => vendorSlugs.includes(v.slug)) + : VENDOR_CATALOG_REGISTRY; + + console.log(`Starting vendor discovery for ${targets.length} vendor(s)...`); + + for (const config of targets) { + try { + await discoverVendorCatalog(config, options); + } catch (err) { + console.error(`[${config.slug}] Fatal crawl error:`, (err as Error).message); + } + } + + console.log("Vendor discovery batch complete."); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Standalone execution +// ───────────────────────────────────────────────────────────────────────────── + +if (require.main === module) { + const target = process.argv[2]; // optional: specific vendor slug + const dryRun = process.argv.includes("--dry-run"); + const verbose = process.argv.includes("--verbose"); + + const run = async () => { + if (target) { + const config = VENDOR_CATALOG_REGISTRY.find((v) => v.slug === target); + if (!config) { + console.error(`Unknown vendor slug: ${target}`); + console.log("Available:", VENDOR_CATALOG_REGISTRY.map((v) => v.slug).join(", ")); + process.exit(1); + } + await discoverVendorCatalog(config, { dryRun, verbose }); + } else { + await runVendorDiscoveryBatch(undefined, { dryRun, verbose }); + } + }; + + run() + .then(() => pool.end()) + .catch((err) => { + console.error("Fatal:", err); + pool.end(); + process.exit(1); + }); +} diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 292b84a..c75a3e9 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -155,6 +155,15 @@ export async function registerSchedules(boss: PgBoss): Promise { "maintenance:find-equivalences", // ── Re-Research approved equivalences ───────────────────────────── "maintenance:re-research-equivalences", + // ── Vendor Discovery Crawlers (TIPLLM training data + DB seeding) ───── + "discover:vendor:cisco-tmg", + "discover:vendor:juniper", + "discover:vendor:arista", + 
"discover:vendor:fs-com", + "discover:vendor:flexoptix", + "discover:vendor:nokia", + "discover:vendor:huawei", + "discover:vendor:ii-vi", ]; for (const q of queues) { @@ -432,6 +441,21 @@ export async function registerSchedules(boss: PgBoss): Promise { await boss.schedule("scrape:catalog:3com-legacy-oem", "0 6 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 }); await boss.schedule("scrape:catalog:avaya-legacy-oem", "15 6 * * *", {}, { retryLimit: 2, expireInSeconds: 3600 }); + // ══════════════════════════════════════════════════════════════════════ + // VENDOR DISCOVERY CRAWLERS — weekly (deep crawl, Playwright, TIPLLM training) + // Each run: crawls catalog → LLM extract → spec validate → DB + Gitea SFT + // Staggered across Sun/Mon nights (low-traffic window, 2h expiry each) + // ══════════════════════════════════════════════════════════════════════ + + await boss.schedule("discover:vendor:cisco-tmg", "0 20 * * 0", {}, { retryLimit: 1, expireInSeconds: 7200 }); + await boss.schedule("discover:vendor:juniper", "0 22 * * 0", {}, { retryLimit: 1, expireInSeconds: 7200 }); + await boss.schedule("discover:vendor:arista", "0 0 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 }); + await boss.schedule("discover:vendor:fs-com", "0 2 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 }); + await boss.schedule("discover:vendor:flexoptix", "0 4 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 }); + await boss.schedule("discover:vendor:nokia", "0 6 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 }); + await boss.schedule("discover:vendor:huawei", "0 8 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 }); + await boss.schedule("discover:vendor:ii-vi", "0 10 * * 1", {}, { retryLimit: 1, expireInSeconds: 7200 }); + // ══════════════════════════════════════════════════════════════════════ // VENDOR LISTS — every 12h // ══════════════════════════════════════════════════════════════════════ @@ -2706,5 +2730,31 @@ export async function registerWorkers(boss: PgBoss): 
Promise { console.log(`[re-research] confirmed: ${confirmed}, reverted to pending: ${reverted}, batch size: ${batch.rows.length}`); }); - console.log("All workers registered (94 jobs, 24/7 continuous)"); + // ══════════════════════════════════════════════════════════════════════ + // VENDOR DISCOVERY CRAWLER WORKERS + // Each worker calls discoverVendorCatalog() for the matching slug. + // Results go to: TIP DB (findOrCreateScrapedTransceiver) + + // Gitea tip-training-data repo (SFT JSONL pairs) + // ══════════════════════════════════════════════════════════════════════ + + const { discoverVendorCatalog, VENDOR_CATALOG_REGISTRY } = await import("./crawler-llm/vendor-discovery-crawler"); + + for (const vendorConfig of VENDOR_CATALOG_REGISTRY) { + const jobName = `discover:vendor:${vendorConfig.slug}`; + boss.work(jobName, async () => { + if (!isLoadAcceptable(3.0)) { + console.warn(`[${jobName}] Load too high — skipping deep crawl`); + return; + } + console.log(`[${jobName}] Starting vendor discovery crawl…`); + try { + await discoverVendorCatalog(vendorConfig, { verbose: false }); + } catch (err) { + console.error(`[${jobName}] Fatal:`, (err as Error).message); + throw err; // let pg-boss retry + } + }); + } + + console.log("All workers registered (102 jobs, 24/7 continuous + 8 weekly discovery crawlers)"); }