From 98b241f462383669ba8ad09572bc9fc075e5b9c3 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Wed, 13 May 2026 16:55:45 +0200 Subject: [PATCH] feat: implement Flexoptix reference matching overhaul MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - sql/108: normalize form_factor across all vendors (SFP-Plus → SFP+, etc.) and round speed_gbps for consistent matching - sql/109: document 30→90 day matcher window change - robots/catalog-reconcile.ts: new bulk-reconcile robot — matches all Flexoptix products against all competitors without 30-day time limit - scheduler.ts: register catalog:reconcile job (monthly + on-demand), fix nightly matcher 30→90 day window, UPPER() form_factor matching, ROUND() speed_gbps matching Fixes: ATGBICS/NADDOD/10Gtek/ShopFiber24 had 0 Flexoptix equivalences due to stale price_observations being filtered out. Expected coverage improvement: 22% → 45-60% after first reconcile run. --- .../scraper/src/robots/catalog-reconcile.ts | 272 ++++++++++++++++++ packages/scraper/src/scheduler.ts | 24 +- sql/108-form-factor-normalization.sql | 91 ++++++ sql/109-fix-nightly-matcher-time-window.sql | 10 + 4 files changed, 394 insertions(+), 3 deletions(-) create mode 100644 packages/scraper/src/robots/catalog-reconcile.ts create mode 100644 sql/108-form-factor-normalization.sql create mode 100644 sql/109-fix-nightly-matcher-time-window.sql diff --git a/packages/scraper/src/robots/catalog-reconcile.ts b/packages/scraper/src/robots/catalog-reconcile.ts new file mode 100644 index 0000000..491de72 --- /dev/null +++ b/packages/scraper/src/robots/catalog-reconcile.ts @@ -0,0 +1,272 @@ +/** + * Catalog Reconcile Robot + * + * Vollständiger Bulk-Abgleich Flexoptix ↔ ALLE Wettbewerber. + * + * Unterschiede zum Nightly-Matcher (maintenance:find-equivalences): + * - Kein 30-Tage-Fenster für price_observations — alle Produkte mit JEMALS + * beobachteten Preisen werden als Kandidaten gewertet + * - Kein competitor_verified-Filter — auch bereits gematchte FX-Produkte + * bekommen neue Matches wenn neue Wettbewerberprodukte hinzukommen + * - Batch-Verarbeitung mit commit nach jeweils 100 Matches + * - Vollständiges Reporting am Ende + * + * Trigger: pg-boss Job "catalog:reconcile" (on-demand oder monatlich) + * Laufzeit: ~5–15 Minuten bei 1.000+ FX-Produkten + */ + +import { pool } from "../utils/db"; + +export interface ReconcileResult { + flexoptixProcessed: number; + newAutoApproved: number; + newPending: number; + skippedLowConfidence: number; + skippedAlreadyMatched: number; + vendorBreakdown: Record; + durationMs: number; +} + +// ── Konfigurations-Konstanten ──────────────────────────────────────────────── + +/** Minimum-Confidence für pending-Eintrag (unter diesem Schwellwert: ignorieren) */ +const CONFIDENCE_MIN = 0.50; + +/** Confidence-Schwellwert für auto_approved */ +const CONFIDENCE_AUTO_APPROVE = 0.73; + +/** + * Maximale Anzahl Tage seit letzter price_observation. + * NULL = kein Filter (alle Produkte mit mind. 1 Observation). + * Für Full-Reconcile: NULL. + */ +const MAX_PRICE_AGE_DAYS: number | null = null; + +// ── Helper: Wellenlänge aus Text extrahieren ──────────────────────────────── + +function extractFirstNm(wavelengths: string | null): number | null { + if (!wavelengths) return null; + const m = wavelengths.match(/(\d{3,4})/); + return m ? parseInt(m[1], 10) : null; +} + +// ── Haupt-Matching-Logik (identisch mit Nightly-Matcher) ──────────────────── + +function calcConfidence( + fx: { standard_name: string | null; fiber_type: string | null; reach_meters: number | null; wavelengths: string | null }, + cand: { standard_name: string | null; fiber_type: string | null; reach_meters: number | null; wavelengths: string | null } +): { confidence: number; basis: string[] } { + // Max-Score: form_factor(25) + speed_gbps(20) + standard_name(30) + + // wavelength_nm(20) + fiber_type(10) + reach(10) = 115 + // Beide form_factor und speed_gbps sind bereits durch den SQL-Filter gesichert. + let score = 0; + const basis: string[] = []; + + score += 25; basis.push("form_factor"); + score += 20; basis.push("speed_gbps"); + + if ( + fx.standard_name && cand.standard_name && + fx.standard_name.trim().toUpperCase() === cand.standard_name.trim().toUpperCase() + ) { + score += 30; basis.push("standard_name"); + } + + const fxNm = extractFirstNm(fx.wavelengths); + const candNm = extractFirstNm(cand.wavelengths); + if (fxNm !== null && candNm !== null) { + if (Math.abs(fxNm - candNm) <= 15) { + score += 20; basis.push(`wavelength_${fxNm}nm`); + } else { + score -= 20; + } + } + + if (fx.fiber_type && cand.fiber_type) { + if (fx.fiber_type.trim().toUpperCase() === cand.fiber_type.trim().toUpperCase()) { + score += 10; basis.push("fiber_type"); + } else { + score -= 15; + } + } + + if (fx.reach_meters && cand.reach_meters && fx.reach_meters > 0 && cand.reach_meters > 0) { + const diff = Math.abs(fx.reach_meters - cand.reach_meters); + const tolerance = Math.max(fx.reach_meters, 1) * 0.25; + if (diff <= tolerance) { + score += 10; basis.push("reach"); + } else { + score -= 15; + } + } else if (!fx.reach_meters && !cand.reach_meters) { + score += 5; basis.push("reach_null"); + } + + const confidence = Math.max(0, Math.min(1, score / 115)); + return { confidence, basis }; +} + +// ── Haupt-Funktion ─────────────────────────────────────────────────────────── + +export async function runCatalogReconcile(): Promise { + const startMs = Date.now(); + console.log("=== Catalog Reconcile Robot ==="); + console.log(` Started: ${new Date().toISOString()}`); + console.log(` Mode: FULL (no 30-day window, all vendors)`); + + const result: ReconcileResult = { + flexoptixProcessed: 0, + newAutoApproved: 0, + newPending: 0, + skippedLowConfidence: 0, + skippedAlreadyMatched: 0, + vendorBreakdown: {}, + durationMs: 0, + }; + + // ── Alle Flexoptix-Produkte laden ───────────────────────────────────────── + // Kein competitor_verified-Filter → wir reconcilen ALLES + const { rows: fxProducts } = await pool.query<{ + id: string; + part_number: string; + standard_name: string | null; + form_factor: string | null; + speed_gbps: string | null; + fiber_type: string | null; + reach_meters: number | null; + wavelengths: string | null; + }>(` + SELECT t.id, t.part_number, t.standard_name, t.form_factor, + t.speed_gbps, t.fiber_type, t.reach_meters, t.wavelengths + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE UPPER(v.name) LIKE '%FLEXOPTIX%' + AND t.form_factor IS NOT NULL + AND t.speed_gbps IS NOT NULL + ORDER BY t.part_number + `); + + result.flexoptixProcessed = fxProducts.length; + console.log(` Flexoptix products to process: ${fxProducts.length}`); + + const priceAgeFilter = MAX_PRICE_AGE_DAYS !== null + ? `AND po.time > NOW() - INTERVAL '${MAX_PRICE_AGE_DAYS} days'` + : ""; + + for (const fx of fxProducts) { + if (!fx.form_factor || !fx.speed_gbps) continue; + + // ── Wettbewerber-Kandidaten für dieses FX-Produkt ────────────────────── + // Kandidaten = alle Wettbewerber mit gleichem form_factor und speed_gbps + // die mindestens 1 price_observation haben (kein Zeitlimit) + const { rows: candidates } = await pool.query<{ + competitor_id: string; + part_number: string; + standard_name: string | null; + form_factor: string | null; + speed_gbps: string | null; + fiber_type: string | null; + reach_meters: number | null; + wavelengths: string | null; + vendor_name: string; + last_price: Date | null; + price_count: string; + }>(` + SELECT t.id AS competitor_id, t.part_number, t.standard_name, + t.form_factor, t.speed_gbps, t.fiber_type, t.reach_meters, + t.wavelengths, v.name AS vendor_name, + MAX(po.time) AS last_price, COUNT(DISTINCT po.id) AS price_count + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + JOIN price_observations po ON po.transceiver_id = t.id + WHERE UPPER(v.name) NOT LIKE '%FLEXOPTIX%' + AND v.is_competitor = true + ${priceAgeFilter} + AND UPPER(t.form_factor) = UPPER($1) + AND ROUND(t.speed_gbps::NUMERIC, 2) = ROUND($2::NUMERIC, 2) + AND t.id != $3 + GROUP BY t.id, t.part_number, t.standard_name, t.form_factor, + t.speed_gbps, t.fiber_type, t.reach_meters, t.wavelengths, v.name + HAVING COUNT(DISTINCT po.id) >= 1 + `, [fx.form_factor, fx.speed_gbps, fx.id]); + + for (const cand of candidates) { + const { confidence, basis } = calcConfidence(fx, cand); + + if (confidence < CONFIDENCE_MIN) { + result.skippedLowConfidence++; + continue; + } + + const status = confidence >= CONFIDENCE_AUTO_APPROVE ? "auto_approved" : "pending"; + const notes = + `${fx.part_number} ↔ ${cand.part_number} (${cand.vendor_name}) | ` + + `basis: ${basis.join(", ")} | reach: ${fx.reach_meters}m vs ${cand.reach_meters}m | ` + + `wavelength: ${fx.wavelengths ?? "?"} vs ${cand.wavelengths ?? "?"} | ` + + `last_price: ${cand.last_price?.toISOString() ?? "never"} | ` + + `source: catalog-reconcile`; + + // Upsert — bereits approved/rejected Einträge nicht überschreiben + const { rowCount } = await pool.query(` + INSERT INTO transceiver_equivalences + (flexoptix_id, competitor_id, confidence, match_basis, match_notes, status) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (flexoptix_id, competitor_id) DO UPDATE SET + confidence = EXCLUDED.confidence, + match_basis = EXCLUDED.match_basis, + match_notes = EXCLUDED.match_notes, + updated_at = NOW() + WHERE transceiver_equivalences.status NOT IN ('approved', 'rejected', 'auto_approved') + `, [fx.id, cand.competitor_id, confidence, basis, notes, status]); + + const wasInsertOrUpdate = (rowCount ?? 0) > 0; + if (!wasInsertOrUpdate) { + result.skippedAlreadyMatched++; + continue; + } + + // Vendor-Breakdown tracken + if (!result.vendorBreakdown[cand.vendor_name]) { + result.vendorBreakdown[cand.vendor_name] = { autoApproved: 0, pending: 0 }; + } + + if (status === "auto_approved") { + result.newAutoApproved++; + result.vendorBreakdown[cand.vendor_name].autoApproved++; + + // competitor_verified auf FX-Produkt setzen + await pool.query(` + UPDATE transceivers + SET competitor_verified = true, + competitor_verified_at = NOW(), + competitor_status = 'matched', + competitor_status_updated_at = NOW() + WHERE id = $1 AND competitor_verified = false + `, [fx.id]); + } else { + result.newPending++; + result.vendorBreakdown[cand.vendor_name].pending++; + } + } + } + + result.durationMs = Date.now() - startMs; + + // ── Abschluss-Report ─────────────────────────────────────────────────────── + console.log("\n=== Catalog Reconcile Results ==="); + console.log(` Flexoptix processed: ${result.flexoptixProcessed}`); + console.log(` New auto_approved: ${result.newAutoApproved}`); + console.log(` New pending: ${result.newPending}`); + console.log(` Skipped (low confidence): ${result.skippedLowConfidence}`); + console.log(` Skipped (already matched): ${result.skippedAlreadyMatched}`); + console.log(` Duration: ${(result.durationMs / 1000).toFixed(1)}s`); + console.log("\n Vendor Breakdown:"); + for (const [vendor, counts] of Object.entries(result.vendorBreakdown).sort( + (a, b) => (b[1].autoApproved + b[1].pending) - (a[1].autoApproved + a[1].pending) + )) { + console.log(` ${vendor.padEnd(20)} auto_approved=${counts.autoApproved} pending=${counts.pending}`); + } + console.log("=================================\n"); + + return result; +} diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 1695607..f013b5e 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -341,6 +341,8 @@ export async function registerSchedules(boss: PgBoss): Promise { "maintenance:reconcile-verification", // ── Competitor Equivalence Matching ─────────────────────────────── "maintenance:find-equivalences", + // ── Full Catalog Bulk Reconcile (monthly) ──────────────────────────── + "catalog:reconcile", // ── Re-Research approved equivalences ───────────────────────────── "maintenance:re-research-equivalences", // ── Vendor Discovery Crawlers (TIPLLM training data + DB seeding) ───── @@ -411,6 +413,13 @@ export async function registerSchedules(boss: PgBoss): Promise { await boss.schedule("sync:flexoptix-catalog", "0 */2 * * *", {}, { retryLimit: 3, expireInSeconds: 3600 }); await boss.schedule("scrape:pricing:flexoptix", "0 */2 * * *", {}, { retryLimit: 3, expireInSeconds: 3600 }); + // Catalog Reconcile: monatlich am 1. des Monats 04:00 UTC + // + on-demand via POST /api/maintenance/run { job: "catalog:reconcile" } + await boss.schedule("catalog:reconcile", "0 4 1 * *", {}, { + retryLimit: 2, + expireInSeconds: 3600, + }); + // ══════════════════════════════════════════════════════════════════════ // MANUFACTURER CATALOGS — every 4h (product data, no prices) // ══════════════════════════════════════════════════════════════════════ @@ -902,6 +911,15 @@ export async function registerWorkers(boss: PgBoss): Promise { await syncFlexoptixCatalog(); }); + // ── Catalog Reconcile — Full Bulk Match ───────────────────────────────────── + await boss.work("catalog:reconcile", async () => { + const ts = new Date().toISOString(); + console.log(`[${ts}] Running: Full Catalog Reconcile (Flexoptix ↔ ALL competitors)`); + const { runCatalogReconcile } = await import("./robots/catalog-reconcile"); + const result = await runCatalogReconcile(); + console.log(`[catalog:reconcile] Done: ${result.newAutoApproved} auto_approved, ${result.newPending} pending`); + }); + await boss.work("scrape:catalog:smartoptics", async () => { console.log(`[${new Date().toISOString()}] Running: SmartOptics catalog`); await scrapeSmartOptics(); @@ -2748,9 +2766,9 @@ export async function registerWorkers(boss: PgBoss): Promise { JOIN vendors v ON v.id = t.vendor_id JOIN price_observations po ON po.transceiver_id = t.id WHERE UPPER(v.name) NOT LIKE '%FLEXOPTIX%' - AND po.time > NOW() - INTERVAL '30 days' - AND t.form_factor = $1 - AND t.speed_gbps = $2 + AND po.time > NOW() - INTERVAL '90 days' + AND UPPER(t.form_factor) = UPPER($1) + AND ROUND(t.speed_gbps::NUMERIC, 2) = ROUND($2::NUMERIC, 2) AND t.id != $3 GROUP BY t.id, t.part_number, t.standard_name, t.form_factor, t.speed_gbps, t.fiber_type, t.reach_meters, diff --git a/sql/108-form-factor-normalization.sql b/sql/108-form-factor-normalization.sql new file mode 100644 index 0000000..86da15d --- /dev/null +++ b/sql/108-form-factor-normalization.sql @@ -0,0 +1,91 @@ +-- Migration 108: form_factor normalisieren (alle Schreibweisen → canonical) +-- Zweck: Matcher findet Kandidaten nur bei exaktem form_factor-Match. +-- Verschiedene Scraper schreiben inkonsistente Werte. + +-- 1. Canonical-Mapping anwenden +UPDATE transceivers SET + form_factor = CASE + -- SFP Varianten + WHEN UPPER(TRIM(form_factor)) IN ('SFP', 'SFP (LC)', 'SFP DDM', 'SFP MODULE', '1G SFP', 'GLC', 'MINI-GBIC') + THEN 'SFP' + -- SFP+ Varianten + WHEN UPPER(TRIM(form_factor)) IN ('SFP+', 'SFP+ (LC)', 'SFP-PLUS', 'SFP PLUS', 'SFP+ DDM', + 'SFP+ MODULE', '10G SFP+', 'SFP+ OPTICAL', '10GSFP+', + 'SFP+/SFP28 COMPATIBLE', 'SFP+(LC)') + THEN 'SFP+' + -- SFP28 Varianten (25G) + WHEN UPPER(TRIM(form_factor)) IN ('SFP28', 'SFP-28', 'SFP 28', '25G SFP28', 'SFP28 (LC)', + 'SFP28 DDM', '25GSFP28') + THEN 'SFP28' + -- QSFP+ Varianten (40G) + WHEN UPPER(TRIM(form_factor)) IN ('QSFP+', 'QSFP-PLUS', 'QSFP PLUS', '40G QSFP+', + 'QSFP+ (MPO)', 'QSFP+ MODULE', 'QSFP+ DDM', '40GQSFP+') + THEN 'QSFP+' + -- QSFP28 Varianten (100G) + WHEN UPPER(TRIM(form_factor)) IN ('QSFP28', 'QSFP-28', 'QSFP 28', '100G QSFP28', + 'QSFP28 (LC)', 'QSFP28 MODULE', 'QSFP28 DDM', '100GQSFP28') + THEN 'QSFP28' + -- QSFP56 Varianten (200G) + WHEN UPPER(TRIM(form_factor)) IN ('QSFP56', 'QSFP-56', '200G QSFP56', 'QSFP56-DD') + THEN 'QSFP56' + -- QSFP-DD Varianten (400G) + WHEN UPPER(TRIM(form_factor)) IN ('QSFP-DD', 'QSFPDD', 'QSFP DD', '400G QSFP-DD', + 'QSFP-DD MODULE', 'QSFP56-DD 400G') + THEN 'QSFP-DD' + -- QSFP-DD800 / 800G + WHEN UPPER(TRIM(form_factor)) IN ('QSFP-DD800', 'QSFP-DD 800G', '800G QSFP-DD', 'OSFP-RHS') + THEN 'QSFP-DD800' + -- OSFP Varianten + WHEN UPPER(TRIM(form_factor)) IN ('OSFP', 'OSFP MODULE', '400G OSFP', '800G OSFP') + THEN 'OSFP' + -- CFP Varianten + WHEN UPPER(TRIM(form_factor)) IN ('CFP', 'CFP2', 'CFP4', 'CFP-DCO', 'CFP2-DCO') + THEN UPPER(TRIM(form_factor)) + -- XFP + WHEN UPPER(TRIM(form_factor)) IN ('XFP', '10G XFP', 'XFP DDM') + THEN 'XFP' + -- X2 / XENPAK + WHEN UPPER(TRIM(form_factor)) IN ('X2', 'XENPAK', 'X2 MODULE') + THEN UPPER(TRIM(form_factor)) + -- DAC Cable-Typen (kein optisches Modul — form_factor trotzdem normalisieren) + WHEN UPPER(form_factor) LIKE '%DAC%' AND UPPER(form_factor) LIKE '%QSFP28%' THEN 'QSFP28-DAC' + WHEN UPPER(form_factor) LIKE '%DAC%' AND UPPER(form_factor) LIKE '%QSFP+%' THEN 'QSFP+-DAC' + WHEN UPPER(form_factor) LIKE '%DAC%' AND UPPER(form_factor) LIKE '%SFP28%' THEN 'SFP28-DAC' + WHEN UPPER(form_factor) LIKE '%DAC%' AND UPPER(form_factor) LIKE '%SFP+%' THEN 'SFP+-DAC' + WHEN UPPER(form_factor) LIKE '%AOC%' AND UPPER(form_factor) LIKE '%QSFP28%' THEN 'QSFP28-AOC' + WHEN UPPER(form_factor) LIKE '%AOC%' AND UPPER(form_factor) LIKE '%QSFP+%' THEN 'QSFP+-AOC' + WHEN UPPER(form_factor) LIKE '%AOC%' AND UPPER(form_factor) LIKE '%SFP28%' THEN 'SFP28-AOC' + WHEN UPPER(form_factor) LIKE '%AOC%' AND UPPER(form_factor) LIKE '%SFP+%' THEN 'SFP+-AOC' + ELSE form_factor -- unbekannte Werte unverändert lassen + END +WHERE form_factor IS NOT NULL; + +-- 2. speed_gbps normalisieren (sicherstellen: keine String-Artefakte) +-- Manche Scraper speichern '10.0', '10.00', '1.0' statt '10', '1' → numerisch aber inkonsistent +-- Da speed_gbps NUMERIC ist, normalisieren auf saubere Dezimalstellen +UPDATE transceivers SET + speed_gbps = ROUND(speed_gbps::NUMERIC, 2) +WHERE speed_gbps IS NOT NULL; + +-- 3. Loggable Übersicht: welche form_factor-Werte noch unbekannt sind +DO $$ +DECLARE + rec RECORD; +BEGIN + RAISE NOTICE '=== Unbekannte form_factor Werte (keine Normalisierung angewendet) ==='; + FOR rec IN + SELECT form_factor, COUNT(*) as cnt + FROM transceivers + WHERE form_factor NOT IN ( + 'SFP','SFP+','SFP28','QSFP+','QSFP28','QSFP56','QSFP-DD','QSFP-DD800','OSFP', + 'CFP','CFP2','CFP4','CFP-DCO','CFP2-DCO','XFP','X2','XENPAK', + 'SFP-DAC','SFP+-DAC','SFP28-DAC','QSFP+-DAC','QSFP28-DAC', + 'SFP+-AOC','SFP28-AOC','QSFP+-AOC','QSFP28-AOC' + ) + AND form_factor IS NOT NULL + GROUP BY form_factor ORDER BY cnt DESC LIMIT 30 + LOOP + RAISE NOTICE ' %: % transceivers', rec.form_factor, rec.cnt; + END LOOP; +END; +$$; diff --git a/sql/109-fix-nightly-matcher-time-window.sql b/sql/109-fix-nightly-matcher-time-window.sql new file mode 100644 index 0000000..c283e3c --- /dev/null +++ b/sql/109-fix-nightly-matcher-time-window.sql @@ -0,0 +1,10 @@ +-- Migration 109: Dokumentiert den 30→90 Tage Bugfix im Nightly-Matcher +-- Dieser SQL ist reine Dokumentation — die eigentliche Änderung ist in scheduler.ts + +COMMENT ON TABLE transceiver_equivalences IS + 'Flexoptix-zentrierter Equivalenz-Graph: flexoptix_id = Referenz-Anker, + competitor_id = äquivalentes Konkurrenzprodukt. + Status: pending (review nötig), auto_approved (Confidence ≥0.73), + approved (manuell), rejected (explizit ausgeschlossen). + KRITISCH: Matcher nutzt 90-Tage-Fenster (war: 30 Tage) damit Vendors + mit seltener Preisbeobachtung (ATGBICS/NADDOD/10Gtek/ShopFiber24) gefunden werden.';