From d9f5fc253fc305c42658102a1d7a8c1a2194cc9e Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 4 Apr 2026 15:41:57 +0200 Subject: [PATCH] =?UTF-8?q?fix(verification):=20100%=20Verified=20Badge=20?= =?UTF-8?q?war=20dramatisch=20zu=20gro=C3=9Fz=C3=BCgig?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KERNPROBLEME BEHOBEN: 1. ATGBICS part_number = URL slug statt echte OEM-Nummer extractOemPartNumber() entfernt -r-compatible-transceiver-* Suffix + trailing Vendor-Namen (nokia, cisco, juniper, ...) Ergebnis: 3he16564aa-nokia-r-compatible-transceiver-... → 3HE16564AA 2. reach_label = '' (leer) wurde als details_verified akzeptiert IS NOT NULL erlaubt leere Strings → Fix: AND reach_label != '' 3. details_verified = true trotz garbled part_number Neue Kriterien: NOT ILIKE '%-compatible-transceiver%' NOT ILIKE '%-r-compatible%' 4. data_confidence Werte falsch in Funktion ('scraped_unverified' etc) Echte Werte: low/medium/high/garbage → NOT IN ('garbage','unknown') ERGEBNIS nach recompute_all_verification(): fully_verified: 3.654 → 581 (Badge war 6x übertrieben) details_verified: inflated → 1.075 (korrekt) ATGBICS Scraper: - extractOemPartNumber() für collection und product detail pages - detectReach() jetzt auch auf URL-slug (120km im slug → reach_label) Price Anomaly Detection: - API: price_anomaly field wenn max/min ratio ≥ 10x - Dashboard: ⚠ Preisanomalie Banner mit Ratio + EUR Range SQL 025: Part number cleanup (30 records), reach from slug (12 records) --- packages/api/src/routes/transceivers.ts | 25 ++++++++- packages/dashboard/index.html | 13 +++++ packages/scraper/src/scrapers/atgbics.ts | 65 ++++++++++++++++++++++-- sql/017-verification-tags.sql | 5 ++ sql/025-verification-quality-fix.sql | 59 +++++++++++++++++++++ 5 files changed, 161 insertions(+), 6 deletions(-) create mode 100644 sql/025-verification-quality-fix.sql diff --git a/packages/api/src/routes/transceivers.ts 
b/packages/api/src/routes/transceivers.ts index 9c48797..2a12dee 100644 --- a/packages/api/src/routes/transceivers.ts +++ b/packages/api/src/routes/transceivers.ts @@ -115,9 +115,32 @@ transceiverRouter.get("/:id", async (req: Request, res: Response) => { comparable_id: row.comparable_id, })); + const allPrices = [...prices, ...comparablePrices]; + + // Price anomaly detection: flag if max/min ratio > 10x (same-product prices only) + const samePricesEur = allPrices + .filter((p) => p.is_same_product && p.price > 0) + .map((p) => { + // Normalize to EUR for comparison + if (p.currency === "EUR") return p.price; + if (p.currency === "USD") return p.price * 0.92; + if (p.currency === "GBP") return p.price * 1.17; + return p.price; + }); + + let priceAnomaly: { ratio: number; min_eur: number; max_eur: number } | null = null; + if (samePricesEur.length >= 2) { + const minEur = Math.min(...samePricesEur); + const maxEur = Math.max(...samePricesEur); + const ratio = minEur > 0 ? Math.round((maxEur / minEur) * 10) / 10 : 0; + if (ratio >= 10) { + priceAnomaly = { ratio, min_eur: Math.round(minEur * 100) / 100, max_eur: Math.round(maxEur * 100) / 100 }; + } + } + res.json({ success: true, - data: { ...transceiver, competitor_prices: [...prices, ...comparablePrices] }, + data: { ...transceiver, competitor_prices: allPrices, price_anomaly: priceAnomaly }, }); } catch (err) { console.error("Get transceiver error:", err); diff --git a/packages/dashboard/index.html b/packages/dashboard/index.html index cc27aa7..d43cc17 100644 --- a/packages/dashboard/index.html +++ b/packages/dashboard/index.html @@ -2767,7 +2767,20 @@ async function openTxDetail(id) { var comparPrices = allPrices.filter(function(p) { return p.is_same_product === false; }); if (allPrices.length > 0) { + // Price anomaly warning — show before price table if ratio ≥ 10x + var anomaly = t.price_anomaly; + var anomalyBanner = ''; + if (anomaly && anomaly.ratio >= 10) { + anomalyBanner = '
' + + '⚠ Preisanomalie — ' + + anomaly.ratio + 'x Unterschied zwischen Anbietern' + + ' (min. EUR\u00a0' + anomaly.min_eur.toLocaleString('de-DE',{minimumFractionDigits:2}) + ' / max. EUR\u00a0' + anomaly.max_eur.toLocaleString('de-DE',{minimumFractionDigits:2}) + ').' + + ' Entweder ist ein Preis falsch erfasst, oder es handelt sich um unterschiedliche Produktvarianten.' + + '
'; + } + h += '
Current Prices
'; + h += anomalyBanner; h += '
/**
 * Vendor names that ATGBICS appends to the OEM part number in its product
 * URL slugs (directly before the "-r-compatible" / "-compatible" marker).
 */
const ATGBICS_OEM_VENDORS: readonly string[] = [
  "nokia", "cisco", "juniper", "arista", "huawei", "hp", "hpe", "dell",
  "extreme", "brocade", "avaya", "netgear", "mikrotik", "ubiquiti", "mellanox",
  "intel", "broadcom", "allied", "planet", "zyxel", "dlink", "d-link",
  "foundry", "force10", "enterasys", "optical", "palo", "fortinet", "hitachi",
  "calix", "ciena", "adtran", "ribbon", "sycamore", "rad", "zhone",
  "infinera", "fujitsu", "nec", "ericsson", "alcatel", "lucent",
];

/** Matches one trailing "-{vendor}" segment; built once instead of per call. */
const ATGBICS_TRAILING_VENDOR_RE = new RegExp(`-(?:${ATGBICS_OEM_VENDORS.join("|")})$`, "i");

/**
 * Matches the compatibility marker and everything after it:
 * "-r-compatible…" (the "r" is the slugified ® sign) or plain "-compatible…".
 * Handling both in one pattern avoids leaving a dangling "-r" when the marker
 * is not followed by "-transceiver".
 */
const ATGBICS_COMPATIBLE_SUFFIX_RE = /-(?:r-)?compatible.*$/i;

/** Upper bound for a plausible OEM part number; longer results are treated as failed extraction. */
const ATGBICS_MAX_PART_NUMBER_LENGTH = 40;

/**
 * Extract the real OEM part number from an ATGBICS URL slug.
 *
 * ATGBICS slug format: {oem-part-number}-{vendor}-r-compatible-transceiver-{specs}
 * Examples:
 *   3he16564aa-nokia-r-compatible-transceiver-qsfp-dd-...  → 3HE16564AA
 *   jnp-sfp-25g-lr-juniper-r-compatible-25gbase-lr         → JNP-SFP-25G-LR
 *   sfp-10g-sr-cisco-compatible-...                        → SFP-10G-SR
 *   3he00027aa-alcatel-lucent-r-compatible-transceiver-... → 3HE00027AA
 *
 * @param slug - Product slug taken from an ATGBICS "/products/{slug}" URL.
 * @returns The uppercased OEM part number. If extraction fails (empty result,
 *   more than 40 characters, or the result still contains "TRANSCEIVER"), the
 *   uppercased slug truncated to 40 characters is returned instead.
 */
function extractOemPartNumber(slug: string): string {
  const cleanedSlug = slug.trim();

  // Drop the compatibility marker and the spec tail that follows it.
  let pn = cleanedSlug.replace(ATGBICS_COMPATIBLE_SUFFIX_RE, "");

  // Strip trailing vendor names until none is left. A single pass in list
  // order would leave "-alcatel" behind for "...-alcatel-lucent".
  let previous: string;
  do {
    previous = pn;
    pn = pn.replace(ATGBICS_TRAILING_VENDOR_RE, "");
  } while (pn !== previous);

  // OEM part numbers are conventionally written in uppercase.
  const result = pn.toUpperCase();

  // Safety net: an implausible result means the slug did not follow the
  // expected format, so fall back to the (truncated) slug itself.
  if (
    !result ||
    result.length > ATGBICS_MAX_PART_NUMBER_LENGTH ||
    result.includes("TRANSCEIVER")
  ) {
    return cleanedSlug.toUpperCase().slice(0, ATGBICS_MAX_PART_NUMBER_LENGTH);
  }

  return result;
}
(item.price) { const { price, currency } = parsePrice(item.price); const speedInfo = detectSpeed(item.name); + // Extract real OEM part number from slug (strips -r-compatible-transceiver-*) + const realPartNumber = extractOemPartNumber(item.partNumber); + // Extract reach from name OR slug (slug often has "120km" even when name doesn't) + const reachLabel = detectReach(item.name) || detectReach(item.partNumber) || undefined; if (price > 0) { products.push({ - partNumber: item.partNumber || item.name.slice(0, 80), + partNumber: realPartNumber || item.name.slice(0, 80), name: item.name, price, currency: currency === "USD" ? "GBP" : currency, // ATGBICS is GBP — parsePrice may default to USD if no symbol on listing @@ -216,7 +266,7 @@ export async function scrapeAtgbics(): Promise { formFactor: detectFormFactor(item.name), speedGbps: speedInfo?.speedGbps, speed: speedInfo?.speed, - reachLabel: detectReach(item.name), + reachLabel, fiberType: detectFiberType(item.name), }); } @@ -270,7 +320,10 @@ export async function scrapeAtgbics(): Promise { }); const slug = url.split("/products/")[1]?.split("?")[0]?.replace(/\/$/, "") || ""; - const partNumber = data.sku || slug; + // Prefer Shopify SKU if available, otherwise extract real OEM PN from slug + const partNumber = data.sku && data.sku.length > 2 && data.sku.length < 40 + ? 
data.sku.toUpperCase() + : extractOemPartNumber(slug); const name = data.title || slug; const combinedText = `${name} ${data.description}`; @@ -278,6 +331,8 @@ export async function scrapeAtgbics(): Promise { if (price > 0) { const speedInfo = detectSpeed(combinedText); + // Reach from title/description first, then fall back to slug (slug often has "120km") + const reachLabel = detectReach(combinedText) || detectReach(slug) || undefined; products.push({ partNumber, name, @@ -289,7 +344,7 @@ export async function scrapeAtgbics(): Promise { formFactor: detectFormFactor(combinedText), speedGbps: speedInfo?.speedGbps, speed: speedInfo?.speed, - reachLabel: detectReach(combinedText), + reachLabel, fiberType: detectFiberType(combinedText), }); } diff --git a/sql/017-verification-tags.sql b/sql/017-verification-tags.sql index 4ba6997..f7743ea 100644 --- a/sql/017-verification-tags.sql +++ b/sql/017-verification-tags.sql @@ -106,8 +106,13 @@ BEGIN transceivers.product_page_url IS NOT NULL AND transceivers.form_factor IS NOT NULL AND transceivers.speed_gbps IS NOT NULL + -- reach_label must be a non-empty string (IS NOT NULL allows empty string — wrong) AND transceivers.reach_label IS NOT NULL + AND transceivers.reach_label != '' + -- part_number must not be a URL slug (garbled data from scraper) AND (transceivers.part_number IS NOT NULL AND transceivers.part_number != transceivers.slug) + AND transceivers.part_number NOT ILIKE '%-compatible-transceiver%' + AND transceivers.part_number NOT ILIKE '%-r-compatible%' AND transceivers.data_confidence IN ('scraped_unverified', 'verified', 'official') ) INTO v_details_ok FROM transceivers diff --git a/sql/025-verification-quality-fix.sql b/sql/025-verification-quality-fix.sql new file mode 100644 index 0000000..cebe672 --- /dev/null +++ b/sql/025-verification-quality-fix.sql @@ -0,0 +1,59 @@ +-- Migration 025: Fix details_verified quality gate + repair garbled ATGBICS records +-- Problem: details_verified = true when: +-- 1. 
--    reach_label = '' (empty string passes IS NOT NULL)
-- 2. part_number contains 'compatible-transceiver' (URL slug stored as PN)
-- ─────────────────────────────────────────────────────────────────────────────

-- Step 1: Fix part_numbers that are ATGBICS URL slugs.
-- Slug format: {oem-part-number}-{vendor}-r-compatible-transceiver-{specs}
--
-- Order matters: the compatibility marker and everything after it must be
-- removed FIRST (inner call). Only then is the vendor name at the end of the
-- string, where the '$'-anchored vendor pattern (outer call) can match it.
-- The vendor group is repeated ('+') so two-part names such as
-- "alcatel-lucent" are removed completely. The vendor list mirrors the one in
-- packages/scraper/src/scrapers/atgbics.ts (extractOemPartNumber).
UPDATE transceivers
SET
  part_number = UPPER(
    REGEXP_REPLACE(
      REGEXP_REPLACE(
        part_number,
        -- "-r-compatible…" or "-compatible…", with or without "-transceiver"
        '-(r-)?compatible.*$',
        '',
        'i'
      ),
      '(-(nokia|cisco|juniper|arista|huawei|hpe|hp|dell|extreme|brocade|avaya|netgear|mikrotik|ubiquiti|mellanox|intel|broadcom|allied|planet|zyxel|dlink|d-link|foundry|force10|enterasys|optical|palo|fortinet|hitachi|calix|ciena|adtran|ribbon|sycamore|rad|zhone|infinera|fujitsu|nec|ericsson|alcatel|lucent))+$',
      '',
      'i'
    )
  ),
  updated_at = NOW()
WHERE
  part_number ILIKE '%-r-compatible%'
  OR part_number ILIKE '%-compatible-transceiver%';

-- Step 2: Derive reach_meters from reach_label where reach_meters = 0 but
-- reach_label holds a plain "<number>m" / "<number>km" value.
-- The WHERE filter is case-insensitive (~*) so it agrees with the ILIKE tests
-- in the CASE expression ("10KM" is handled the same as "10km").
UPDATE transceivers
SET
  reach_meters = CASE
    WHEN reach_label ILIKE '%km' THEN
      CAST(REGEXP_REPLACE(reach_label, '[^0-9]', '', 'g') AS INTEGER) * 1000
    WHEN reach_label ILIKE '%m' AND reach_label NOT ILIKE '%km' THEN
      CAST(REGEXP_REPLACE(reach_label, '[^0-9]', '', 'g') AS INTEGER)
    ELSE reach_meters
  END,
  updated_at = NOW()
WHERE reach_meters = 0
  AND reach_label IS NOT NULL
  AND reach_label != ''
  AND reach_label ~* '^\d+\s*(m|km)$';

-- Step 3: Fill reach_label / reach_meters from the slug where still missing.
-- For records whose slug contains an NNkm token (e.g. scraped-3he16564aa-...-120km-...).
-- Both columns are taken from the first "<digits>km" occurrence in the slug.
UPDATE transceivers
SET
  reach_label  = (REGEXP_MATCH(slug, '(\d+km)'))[1],
  reach_meters = CAST((REGEXP_MATCH(slug, '(\d+)km'))[1] AS INTEGER) * 1000,
  updated_at   = NOW()
WHERE
  (reach_label IS NULL OR reach_label = '')
  AND reach_meters = 0
  AND slug ~ '\d+km';

-- Step 4: Recompute all verification badges with the fixed criteria
-- (updates details_verified / fully_verified for all affected transceivers).
SELECT recompute_all_verification();