/**
 * Crawler LLM — Rule-based validator.
 *
 * Runs AFTER the LLM extraction to catch hallucinations and obvious errors.
 * The LLM is good at structure; this catches range violations and nonsense.
 */

import type { StockExtractionResult } from "./stock-schema";

/** Outcome of a rule-based validation pass over one extraction. */
export interface ValidationResult {
  /** True when no errors were recorded; warnings never affect this flag. */
  passed: boolean;
  /** Suspicious-but-tolerated findings. */
  warnings: string[];
  /** Findings that make the extraction fail validation. */
  errors: string[];
}

/**
 * Expected price ranges per speed class (USD/EUR), as `[min, max]`.
 * Rough but effective. Keys have the form `${speedGbps}G`.
 */
const PRICE_RANGES: Record<string, readonly [number, number]> = {
  "1G": [10, 500],
  "10G": [20, 2000],
  "25G": [30, 2000],
  "40G": [50, 3000],
  "100G": [80, 15000],
  "200G": [200, 20000],
  "400G": [200, 50000],
  "800G": [500, 80000],
};

/** Form factors accepted without a warning. */
const VALID_FORM_FACTORS = new Set<string>([
  "SFP", "SFP+", "SFP28", "SFP56", "SFP-DD",
  "QSFP", "QSFP+", "QSFP28", "QSFP56", "QSFP-DD", "QSFP112",
  "OSFP", "OSFP-RHS",
  "CFP", "CFP2", "CFP4", "CFP8",
  "XFP", "X2", "XENPAK", "DSFP", "CSFP",
]);

/** Currency codes accepted alongside a non-null price. */
const VALID_CURRENCIES = new Set<string>(["USD", "EUR", "GBP", "CNY"]);

/** Strips whitespace, hyphens and underscores and upper-cases a part number. */
function normalizePartNumber(partNumber: string): string {
  return partNumber.replace(/[\s\-_]/g, "").toUpperCase();
}

/**
 * Applies range and sanity rules to a single LLM extraction.
 *
 * Comparisons are written so that NaN values are rejected instead of silently
 * passing (every `<` / `>` comparison against NaN is false).
 *
 * @param result - The extraction to validate.
 * @param speedGbps - Optional link speed; when it maps to a known entry in
 *   `PRICE_RANGES`, a warning is added if the price is below 10% of the
 *   range minimum or above 10x the range maximum.
 * @returns Collected errors and warnings; `passed` is true iff there are no errors.
 */
export function validateStockExtraction(
  result: StockExtractionResult,
  speedGbps?: number
): ValidationResult {
  const errors: string[] = [];
  const warnings: string[] = [];

  // Not a product page — nothing else is worth checking; caller should discard.
  if (!result.is_product_page) {
    return { passed: false, errors: ["Not a product page"], warnings: [] };
  }

  // Confidence too low. `!(x >= t)` instead of `x < t` so NaN is rejected too.
  if (!(result.confidence >= 0.5)) {
    errors.push(`Confidence ${result.confidence} below threshold 0.5`);
  }

  // Price validation
  if (result.price !== null) {
    if (Number.isNaN(result.price)) {
      errors.push(`Price ${result.price} is not a number`);
    } else if (result.price <= 0) {
      errors.push(`Price ${result.price} is not positive`);
    }
    if (result.price > 500_000) {
      errors.push(`Price ${result.price} exceeds maximum sanity limit`);
    }
    if (!result.currency || !VALID_CURRENCIES.has(result.currency)) {
      errors.push(`Invalid currency: ${result.currency}`);
    }

    // Speed-class price range check (deliberately loose: 0.1x min .. 10x max)
    if (speedGbps) {
      const speedKey = `${speedGbps}G`;
      const range = PRICE_RANGES[speedKey];
      if (range && (result.price < range[0] * 0.1 || result.price > range[1] * 10)) {
        warnings.push(
          `Price ${result.price} ${result.currency} looks unusual for ${speedKey} (expected ${range[0]}–${range[1]})`
        );
      }
    }
  }

  // Stock quantity sanity
  if (result.stock_quantity !== null) {
    if (Number.isNaN(result.stock_quantity)) {
      errors.push(`Stock quantity ${result.stock_quantity} is not a number`);
    } else if (result.stock_quantity < 0) {
      errors.push(`Stock quantity ${result.stock_quantity} is negative`);
    }
    if (result.stock_quantity > 100_000) {
      warnings.push(`Stock quantity ${result.stock_quantity} unusually high — verify`);
    }
  }

  // Lead time sanity
  if (result.lead_time_days !== null) {
    if (Number.isNaN(result.lead_time_days)) {
      errors.push(`Lead time ${result.lead_time_days} is not a number`);
    } else if (result.lead_time_days < 0) {
      errors.push(`Lead time ${result.lead_time_days} is negative`);
    }
    if (result.lead_time_days > 730) {
      warnings.push(`Lead time ${result.lead_time_days} days (>2 years) — verify`);
    }
  }

  // MOQ sanity (NaN-safe: anything that is not >= 1 is rejected)
  if (result.moq !== null && !(result.moq >= 1)) {
    errors.push(`MOQ ${result.moq} must be at least 1`);
  }

  // Form factor check
  if (result.form_factor && !VALID_FORM_FACTORS.has(result.form_factor)) {
    warnings.push(`Unknown form factor: ${result.form_factor}`);
  }

  // Price break consistency
  for (const pb of result.price_breaks) {
    // NaN-safe: qty must be >= 1 and price must be > 0.
    if (!(pb.qty >= 1) || !(pb.price > 0)) {
      errors.push(`Invalid price break: qty=${pb.qty} price=${pb.price}`);
    }
    // Only flagged when the break price is more than double the unit price.
    if (result.price && pb.price > result.price * 2) {
      warnings.push(`Price break ${pb.qty}x=${pb.price} higher than unit price — unusual`);
    }
  }

  // Incoming ETA must parse as a date. Whether it lies in the future is NOT checked.
  if (result.incoming_eta) {
    const eta = new Date(result.incoming_eta);
    if (Number.isNaN(eta.getTime())) {
      errors.push(`Invalid incoming_eta date: ${result.incoming_eta}`);
    }
  }

  return {
    passed: errors.length === 0,
    errors,
    warnings,
  };
}

/**
 * Cross-source comparison: do two extractions agree within tolerance?
 *
 * @param a - First extraction.
 * @param b - Second extraction.
 * @param priceTolerance - Maximum allowed relative price difference
 *   (default 0.10, i.e. 10%), measured against the larger absolute price.
 * @returns True when both have a price in the same currency, the prices are
 *   within tolerance, and the part numbers match after normalization (the
 *   part-number check is skipped unless both are present).
 */
export function crossValidate(
  a: StockExtractionResult,
  b: StockExtractionResult,
  priceTolerance = 0.10 // 10% price difference allowed
): boolean {
  if (a.price === null || b.price === null) return false;

  // Both in same currency
  if (a.currency !== b.currency) return false;

  // Price within tolerance. Scale by the larger magnitude so negative values
  // cannot yield a negative ratio, and avoid 0/0 when both prices are zero.
  const scale = Math.max(Math.abs(a.price), Math.abs(b.price));
  const diff = scale === 0 ? 0 : Math.abs(a.price - b.price) / scale;
  if (Number.isNaN(diff) || diff > priceTolerance) return false;

  // Part numbers match (if both present)
  if (a.part_number && b.part_number) {
    if (normalizePartNumber(a.part_number) !== normalizePartNumber(b.part_number)) {
      return false;
    }
  }

  return true;
}