Rene Fichtmueller 297dc46f2b feat(crawler-llm): intelligent vendor discovery pipeline + TIPLLM training data
- spec-validator.ts: physical plausibility checks (form factor↔speed matrix,
  wavelength↔fiber consistency, IEEE standard cross-check, reach limits).
  Outputs tier (high/medium/low/rejected) + confidence_delta for LLM scores.

- training-data-writer.ts: converts validated crawler extractions to SFT JSONL
  training pairs (spec_qa / crawl_reasoning / validation / discovery types).
  Auto-commits and pushes to Gitea tip-training-data repo in batches of 50.

- vendor-discovery-crawler.ts: PlaywrightCrawler pipeline — catalog URL →
  LLM extraction (scrapeWithLLM) → spec validation → DB persist +
  Gitea SFT training pairs. 8 vendor configs registered
  (Cisco/Juniper/Arista/FS.com/Flexoptix/Nokia/Huawei/II-VI).

- scheduler.ts: 8 weekly discover:vendor:* jobs added (Sun 20:00–Mon 10:00 UTC).
  Total registered jobs: 102.

- Gitea repo created: gitea.context-x.org/rene/tip-training-data
2026-04-28 23:46:34 +02:00

374 lines
16 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Crawler LLM — Transceiver spec physical plausibility validator.
*
* Runs AFTER LLM extraction to catch technically impossible combinations
* (e.g. 100G over SFP, 850nm on SMF, 80km over MMF). Complements
* validator.ts which checks stock/price sanity.
*
* Returns a SpecValidationResult with:
* - passed: false blocks DB write and lowers training data confidence tier
* - warnings: still writes to DB but flags for human review
* - confidence_delta: adjustment applied to the LLM confidence score
*/
// ─────────────────────────────────────────────────────────────────────────────
// Type definitions
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Transceiver attributes produced by the LLM extraction step.
 * Every field may be omitted or explicitly `null` when the source page did
 * not provide it.
 */
export interface ExtractedSpec {
  /** Vendor part number. */
  part_number?: string | null;
  /** Module form factor name. */
  form_factor?: string | null;
  /** Line rate in Gbps. */
  speed_gbps?: number | null;
  /** Rated reach in meters. */
  reach_meters?: number | null;
  /** Medium type: "SMF" | "MMF" | "CU" | "DAC" | "AOC". */
  fiber_type?: string | null;
  /** Optical/electrical connector name. */
  connector?: string | null;
  /** Free-text wavelength(s), e.g. "850nm" or "1310nm TX / 1490nm RX". */
  wavelengths?: string | null;
  /** IEEE PHY designation, e.g. "100GBASE-SR4". */
  ieee_standard?: string | null;
  /** Whether the module supports DOM. */
  dom_support?: boolean | null;
}
/** Outcome of the physical plausibility check for one extracted spec. */
export interface SpecValidationResult {
  /** `true` only when no errors were recorded. */
  passed: boolean;
  /** Hard failures (implausible combinations). */
  errors: string[];
  /** Soft findings that should be reviewed. */
  warnings: string[];
  /** Adjustment for the LLM confidence score; negative values reduce it. */
  confidence_delta: number;
  /** Quality tier derived from the errors, warnings and confidence delta. */
  tier: "high" | "medium" | "low" | "rejected";
}
// ─────────────────────────────────────────────────────────────────────────────
// Compatibility tables
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Highest rated speed (Gbps) for each form factor.
 * DAC/AOC assemblies use the same form factor shell, so they share these limits.
 */
const FORM_FACTOR_MAX_SPEED: Record<string, number> = {
  // GBIC / SFP family
  "GBIC": 1,
  "SFP": 4.25, // 4G Fibre Channel ceiling; 1G Ethernet is the common case
  "SFP+": 28.05, // nominally 10G, but 16G FC / 25G variants exist
  "SFP28": 28.05,
  "SFP56": 56,
  "SFP-DD": 100, // dual-lane SFP

  // QSFP family
  "QSFP": 40,
  "QSFP+": 40,
  "QSFP28": 112, // 4×25G = 100G; some parts push 112G
  "QSFP56": 224, // 4×56G = 200G
  "QSFP-DD": 800, // 8×100G
  "QSFP112": 800,

  // OSFP family
  "OSFP": 800,
  "OSFP-RHS": 800,

  // CFP family
  "CFP": 100,
  "CFP2": 400,
  "CFP4": 100,
  "CFP8": 400,

  // 10G-era form factors
  "XFP": 10,
  "X2": 10,
  "XENPAK": 10,

  // Other
  "DSFP": 100,
  "CSFP": 2.5,
};
/**
 * Lowest typical rated speed (Gbps) for each form factor.
 * Used to flag wildly mismatched speed/form-factor pairs.
 */
const FORM_FACTOR_MIN_SPEED: Record<string, number> = {
  // GBIC / SFP family
  "GBIC": 0.1,
  "SFP": 0.1,
  "SFP+": 1,
  "SFP28": 10,
  "SFP56": 25,
  "SFP-DD": 50,

  // QSFP family
  "QSFP": 4,
  "QSFP+": 10,
  "QSFP28": 40,
  "QSFP56": 100,
  "QSFP-DD": 100,
  "QSFP112": 200,

  // OSFP family
  "OSFP": 200,
  "OSFP-RHS": 200,

  // CFP family
  "CFP": 10,
  "CFP2": 40,
  "CFP4": 10,
  "CFP8": 100,

  // 10G-era form factors
  "XFP": 10,
  "X2": 10,
  "XENPAK": 10,

  // Other
  "DSFP": 25,
  "CSFP": 0.1,
};
/**
 * Map a wavelength to the fiber type it is normally used with.
 *
 * 850 nm is classically MMF; the 1270–1610 nm range is SMF. The cut-offs
 * applied here are: up to 900 nm → "MMF", 1260 nm and above → "SMF",
 * anything in between → "either".
 *
 * Exception handled by the caller: some 1310 nm SFPs (1000BASE-LX) work on
 * MMF with a mode-conditioning cable.
 *
 * @param nm Wavelength in nanometres.
 * @returns The expected fiber type, or "either" when the wavelength does not
 *          fall in one of the two bands.
 */
function expectedFiberForWavelength(nm: number): "MMF" | "SMF" | "either" {
  const multimodeMaxNm = 900;
  const singlemodeMinNm = 1260;
  return nm <= multimodeMaxNm ? "MMF" : nm >= singlemodeMinNm ? "SMF" : "either";
}
/**
 * Largest practical reach, in meters, for each medium type.
 * This is a soft sanity ceiling, not a per-standard limit.
 */
const MAX_REACH: Record<string, number> = {
  "MMF": 4000, // OM5 pushes ~3.5 km; 4 km is the outer limit for 100M FX
  "SMF": 200000, // 200 km coherent ZR is real
  "CU": 100,
  "DAC": 30,
  "AOC": 200,
};
/**
 * Reference table of known PHY designations.
 * Each entry gives the canonical speed (Gbps) and, where defined, the
 * expected fiber type and the nominal reach in kilometres.
 */
const IEEE_STANDARDS: Record<
  string,
  { speedGbps: number; fiberType?: string; reachKm?: number }
> = {
  // 100M
  "100BASE-FX": { speedGbps: 0.1, fiberType: "MMF", reachKm: 2 },
  "100BASE-LX10": { speedGbps: 0.1, fiberType: "SMF", reachKm: 10 },

  // 1G
  "1000BASE-SX": { speedGbps: 1, fiberType: "MMF", reachKm: 0.55 },
  "1000BASE-LX": { speedGbps: 1, fiberType: "SMF", reachKm: 10 },
  "1000BASE-EX": { speedGbps: 1, fiberType: "SMF", reachKm: 40 },
  "1000BASE-ZX": { speedGbps: 1, fiberType: "SMF", reachKm: 80 },
  "1000BASE-T": { speedGbps: 1, fiberType: "CU" },

  // 10G
  "10GBASE-SR": { speedGbps: 10, fiberType: "MMF", reachKm: 0.3 },
  "10GBASE-LR": { speedGbps: 10, fiberType: "SMF", reachKm: 10 },
  "10GBASE-ER": { speedGbps: 10, fiberType: "SMF", reachKm: 40 },
  "10GBASE-ZR": { speedGbps: 10, fiberType: "SMF", reachKm: 80 },

  // 25G
  "25GBASE-SR": { speedGbps: 25, fiberType: "MMF", reachKm: 0.1 },
  "25GBASE-LR": { speedGbps: 25, fiberType: "SMF", reachKm: 10 },
  "25GBASE-ER": { speedGbps: 25, fiberType: "SMF", reachKm: 40 },

  // 40G
  "40GBASE-SR4": { speedGbps: 40, fiberType: "MMF", reachKm: 0.15 },
  "40GBASE-LR4": { speedGbps: 40, fiberType: "SMF", reachKm: 10 },
  "40GBASE-ER4": { speedGbps: 40, fiberType: "SMF", reachKm: 40 },

  // 100G
  "100GBASE-SR4": { speedGbps: 100, fiberType: "MMF", reachKm: 0.1 },
  "100GBASE-SR10": { speedGbps: 100, fiberType: "MMF", reachKm: 0.15 },
  "100GBASE-LR4": { speedGbps: 100, fiberType: "SMF", reachKm: 10 },
  "100GBASE-ER4": { speedGbps: 100, fiberType: "SMF", reachKm: 40 },
  "100GBASE-ZR": { speedGbps: 100, fiberType: "SMF", reachKm: 80 },

  // 400G
  "400GBASE-SR4": { speedGbps: 400, fiberType: "MMF", reachKm: 0.1 },
  "400GBASE-SR8": { speedGbps: 400, fiberType: "MMF", reachKm: 0.1 },
  "400GBASE-LR4": { speedGbps: 400, fiberType: "SMF", reachKm: 10 },
  "400GBASE-LR8": { speedGbps: 400, fiberType: "SMF", reachKm: 10 },
  "400GBASE-ER8": { speedGbps: 400, fiberType: "SMF", reachKm: 40 },
  "400GBASE-ZR": { speedGbps: 400, fiberType: "SMF", reachKm: 80 },
  "400ZR": { speedGbps: 400, fiberType: "SMF", reachKm: 120 },

  // 800G
  "800GBASE-SR8": { speedGbps: 800, fiberType: "MMF", reachKm: 0.1 },
  "800GBASE-LR4": { speedGbps: 800, fiberType: "SMF", reachKm: 2 },
};
// ─────────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Extract the first wavelength, as whole nanometres, from free text such as
 * "850nm" or "1310nm TX / 1490nm RX".
 *
 * The unit is matched case-insensitively ("850NM") and a decimal fraction
 * before the unit is tolerated ("1550.12nm" → 1550), so such values are not
 * silently skipped.
 *
 * @param wl Free-text wavelength description.
 * @returns The integer part of the first 3–4 digit wavelength followed by
 *          "nm", or `null` when no such value is present.
 */
function parsePrimaryWavelength(wl: string): number | null {
  const match = /(\d{3,4})(?:\.\d+)?\s*nm/i.exec(wl);
  const digits = match?.[1];
  return digits === undefined ? null : parseInt(digits, 10);
}
/**
 * Canonicalize a form factor name for table lookups: upper-case it and drop
 * every whitespace character (leading, trailing and interior).
 */
function normalizeFormFactor(ff: string): string {
  const upper = ff.toUpperCase();
  // Splitting on whitespace runs and re-joining removes all whitespace,
  // including any at the edges.
  return upper.split(/\s+/).join("");
}
/**
 * Canonicalize a PHY/standard designation for comparison: trim, upper-case,
 * remove all whitespace, and fold Unicode dash look-alikes into ASCII "-".
 *
 * Scraped vendor pages often contain typographic dashes (non-breaking hyphen,
 * en dash, minus sign) that look identical to "-" but would not compare equal
 * to the ASCII keys of the reference table.
 *
 * @param s Raw designation, e.g. " 100gbase–sr4 ".
 * @returns The canonical form, e.g. "100GBASE-SR4".
 */
function normalizeStandard(s: string): string {
  return s
    .trim()
    .toUpperCase()
    .replace(/\s+/g, "")
    // U+2010..U+2015 (hyphen, non-breaking hyphen, figure/en/em dash,
    // horizontal bar) and U+2212 (minus sign).
    .replace(/[\u2010-\u2015\u2212]/g, "-");
}
// ─────────────────────────────────────────────────────────────────────────────
// Main validator
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Check an extracted transceiver spec for physically implausible combinations.
 *
 * Six independent checks run in order: form factor ↔ speed, wavelength ↔ fiber
 * type, reach ↔ fiber type, IEEE standard cross-check, DAC/AOC rules and
 * connector ↔ form factor. Each finding is recorded as an error or a warning
 * and subtracts a fixed penalty from the confidence delta. Checks whose inputs
 * are missing (null/undefined/empty) are skipped.
 *
 * Tiering:
 * - any error → "rejected" (`passed` is false)
 * - no warnings and delta ≥ 0 → "high"
 * - at most two warnings and delta ≥ -0.15 → "medium"
 * - otherwise → "low"
 *
 * @param spec Fields extracted for one transceiver.
 * @returns Errors, warnings, the confidence delta (rounded to hundredths and
 *          floored at -0.9) and the resulting tier.
 */
export function validateTransceiverSpec(spec: ExtractedSpec): SpecValidationResult {
  const errors: string[] = [];
  const warnings: string[] = [];
  let confidenceDelta = 0;

  const ff = spec.form_factor ? normalizeFormFactor(spec.form_factor) : null;
  const speedGbps = spec.speed_gbps ?? null;
  const fiberType = spec.fiber_type?.toUpperCase().trim() ?? null;
  const reachM = spec.reach_meters ?? null;
  const wavelengths = spec.wavelengths ?? null;

  // ── 1. Form factor ↔ speed compatibility ──────────────────────────────────
  if (ff && speedGbps !== null) {
    const maxSpeed = FORM_FACTOR_MAX_SPEED[ff];
    const minSpeed = FORM_FACTOR_MIN_SPEED[ff];
    // 15% headroom over the table maximum before calling it impossible.
    if (maxSpeed !== undefined && speedGbps > maxSpeed * 1.15) {
      errors.push(
        `Speed ${speedGbps}G exceeds ${ff} maximum (${maxSpeed}G). Physically impossible.`
      );
      confidenceDelta -= 0.4;
    }
    // Only flag speeds below half of the typical minimum.
    if (minSpeed !== undefined && speedGbps < minSpeed * 0.5) {
      warnings.push(
        `Speed ${speedGbps}G is unusually low for ${ff} (typical min ${minSpeed}G). Verify.`
      );
      confidenceDelta -= 0.1;
    }
  }

  // ── 2. Wavelength ↔ fiber type consistency ────────────────────────────────
  if (
    wavelengths &&
    fiberType &&
    fiberType !== "DAC" &&
    fiberType !== "AOC" &&
    fiberType !== "CU"
  ) {
    const primaryNm = parsePrimaryWavelength(wavelengths);
    if (primaryNm !== null) {
      const expectedFiber = expectedFiberForWavelength(primaryNm);
      if (expectedFiber === "MMF" && fiberType === "SMF") {
        errors.push(
          `${primaryNm}nm is a multi-mode wavelength but fiber_type is SMF. Check the source.`
        );
        confidenceDelta -= 0.3;
      }
      if (expectedFiber === "SMF" && fiberType === "MMF") {
        // 1310nm LX on MMF with a mode-conditioning cable is a real thing — warn, not error
        if (primaryNm >= 1260 && primaryNm <= 1360) {
          warnings.push(
            `${primaryNm}nm on MMF is unusual. Possible mode-conditioning cable — verify.`
          );
          confidenceDelta -= 0.05;
        } else {
          errors.push(
            `${primaryNm}nm (SMF wavelength) cannot work on MMF fiber at this reach.`
          );
          confidenceDelta -= 0.35;
        }
      }
    }
  }

  // ── 3. Reach ↔ fiber type sanity ─────────────────────────────────────────
  // Fiber types without a MAX_REACH entry are not checked.
  const maxReach = fiberType ? MAX_REACH[fiberType] : undefined;
  if (reachM !== null && maxReach !== undefined && reachM > maxReach) {
    errors.push(
      `Reach ${reachM}m exceeds physical maximum for ${fiberType} (${maxReach}m). Data error.`
    );
    confidenceDelta -= 0.4;
  }
  if (reachM !== null && fiberType === "MMF" && reachM > 2000) {
    warnings.push(
      `MMF reach ${reachM}m is very high (rare). OM5 max ~3.5km, earlier OM4 max 400m at 10G+.`
    );
    confidenceDelta -= 0.1;
  }

  // ── 4. IEEE standard cross-check ─────────────────────────────────────────
  const ieeeStandard = spec.ieee_standard;
  if (ieeeStandard) {
    // Normalize the extracted designation once, then compare against each key.
    const wanted = normalizeStandard(ieeeStandard);
    const stdKey = Object.keys(IEEE_STANDARDS).find(
      (k) => normalizeStandard(k) === wanted
    );
    const stdDef = stdKey !== undefined ? IEEE_STANDARDS[stdKey] : undefined;
    if (stdDef) {
      // Speed mismatch: more than 15% away from the standard's rate.
      if (
        speedGbps !== null &&
        Math.abs(speedGbps - stdDef.speedGbps) / stdDef.speedGbps > 0.15
      ) {
        errors.push(
          `${ieeeStandard} requires ${stdDef.speedGbps}G but extracted speed is ${speedGbps}G.`
        );
        confidenceDelta -= 0.35;
      }
      // Fiber type mismatch (soft — standard may have variants)
      if (fiberType && stdDef.fiberType && fiberType !== stdDef.fiberType) {
        warnings.push(
          `${ieeeStandard} expects ${stdDef.fiberType} but fiber_type is ${fiberType}.`
        );
        confidenceDelta -= 0.1;
      }
      // Reach mismatch: more than 3× the defined reach is suspicious
      if (reachM !== null && stdDef.reachKm !== undefined) {
        const stdReachM = stdDef.reachKm * 1000;
        if (reachM > stdReachM * 3) {
          warnings.push(
            `Reach ${reachM}m is >3× the ${ieeeStandard} defined reach (${stdReachM}m). Verify — may be a proprietary extended reach variant.`
          );
          confidenceDelta -= 0.05;
        }
      }
    } else {
      // Standard not in table — not an error, just warn for unknown standards
      warnings.push(`IEEE standard "${ieeeStandard}" not in reference table. Accepted as-is.`);
    }
  }

  // ── 5. DAC/AOC special rules ──────────────────────────────────────────────
  if (fiberType === "DAC" || fiberType === "AOC") {
    // NOTE: a DAC reach above MAX_REACH.DAC has already produced an error in
    // section 3; this warning adds review context on top of it.
    if (reachM !== null && reachM > 30 && fiberType === "DAC") {
      warnings.push(`DAC cables > 30m are unusual (passive DAC max ~7m). Verify if active DAC or AOC.`);
      confidenceDelta -= 0.1;
    }
    if (wavelengths) {
      warnings.push(`DAC/AOC have no wavelength. Extracted wavelength "${wavelengths}" may be wrong.`);
      confidenceDelta -= 0.05;
    }
  }

  // ── 6. Connector ↔ form factor ────────────────────────────────────────────
  if (spec.connector && ff) {
    // Trim as well as upper-case so " lc " is treated like "LC".
    const connector = spec.connector.toUpperCase().trim();
    const mpoBased = ["QSFP", "QSFP+", "QSFP28", "QSFP56", "QSFP-DD", "OSFP", "CFP8"];
    const scBased = ["GBIC", "CSFP"];
    if (mpoBased.includes(ff) && connector === "SC") {
      warnings.push(`${ff} modules rarely use SC connectors. LC or MPO expected. Verify.`);
      confidenceDelta -= 0.1;
    }
    if (scBased.includes(ff) && connector === "LC") {
      // GBIC can use LC — soft warning only
      warnings.push(`${ff} with LC connector is unusual. SC more common for this form factor.`);
      confidenceDelta -= 0.05;
    }
  }

  // ── Tier assignment ───────────────────────────────────────────────────────
  // Every penalty is a multiple of 0.05, but binary floating point accumulates
  // error (-0.1 - 0.05 === -0.15000000000000002). Round to hundredths so the
  // tier thresholds and the reported delta see the intended value.
  const roundedDelta = Math.round(confidenceDelta * 100) / 100;

  const passed = errors.length === 0;
  let tier: SpecValidationResult["tier"];
  if (!passed) {
    tier = "rejected";
  } else if (warnings.length === 0 && roundedDelta >= 0) {
    tier = "high";
  } else if (warnings.length <= 2 && roundedDelta >= -0.15) {
    tier = "medium";
  } else {
    tier = "low";
  }

  return {
    passed,
    errors,
    warnings,
    confidence_delta: Math.max(roundedDelta, -0.9),
    tier,
  };
}
// ─────────────────────────────────────────────────────────────────────────────
// Convenience: apply the spec validation result to the base LLM confidence
// ─────────────────────────────────────────────────────────────────────────────
/** Spec validation outcome merged with the adjusted LLM confidence. */
export interface CombinedValidationResult {
  /** Whether the spec validation passed. */
  passed: boolean;
  /** Errors reported by the spec validation. */
  spec_errors: string[];
  /** Warnings reported by the spec validation. */
  spec_warnings: string[];
  /** Tier assigned by the spec validation. */
  tier: SpecValidationResult["tier"];
  /** LLM confidence after applying the spec confidence delta. */
  adjusted_confidence: number;
}
/**
 * Apply a spec validation result to a base LLM confidence score.
 *
 * @param specResult Result of the spec plausibility validation.
 * @param baseLlmConfidence Confidence reported for the extraction.
 * @returns The spec result's pass flag, errors, warnings and tier, plus the
 *          base confidence shifted by `confidence_delta` and clamped to [0, 1].
 */
export function combineValidations(
  specResult: SpecValidationResult,
  baseLlmConfidence: number
): CombinedValidationResult {
  const { passed, errors, warnings, tier } = specResult;

  // Shift first, then clamp: floor at 0, cap at 1.
  const shifted = baseLlmConfidence + specResult.confidence_delta;
  const floored = Math.max(0.0, shifted);
  const adjusted_confidence = Math.min(1.0, floored);

  return {
    passed,
    spec_errors: errors,
    spec_warnings: warnings,
    tier,
    adjusted_confidence,
  };
}