From c2421c03a3fed260e6f0558ace56dc15a07309ca Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 9 May 2026 17:24:06 +0200 Subject: [PATCH] fix: harden shopfiber24 reach parsing --- packages/scraper/src/scrapers/fiber24.ts | 20 ++++++++++++- sync/CURRENT.md | 20 ++++++++++++- ...2026-05-09-shopfiber24-parser-hardening.md | 30 +++++++++++++++++++ 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 sync/history/2026-05-09-shopfiber24-parser-hardening.md diff --git a/packages/scraper/src/scrapers/fiber24.ts b/packages/scraper/src/scrapers/fiber24.ts index 35b9d9d..5adee5c 100644 --- a/packages/scraper/src/scrapers/fiber24.ts +++ b/packages/scraper/src/scrapers/fiber24.ts @@ -49,7 +49,8 @@ function sleep(ms: number): Promise { function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } { const lower = text.toLowerCase(); - if (lower.includes("400g") || lower.includes("qsfp-dd800") || lower.includes("800g")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; + if (lower.includes("800g") || lower.includes("qsfp-dd800")) return { formFactor: lower.includes("osfp") ? "OSFP" : "QSFP-DD", speed: "800G", speedGbps: 800 }; + if (lower.includes("400g")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; if (lower.includes("osfp") && !lower.includes("qsfp")) return { formFactor: "OSFP", speed: "400G", speedGbps: 400 }; if (lower.includes("qsfp-dd") || lower.includes("qsfpdd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; if (lower.includes("qsfp28") || lower.includes("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; @@ -63,6 +64,23 @@ function detectFormFactor(text: string): { formFactor: string; speed: string; sp } function detectReach(text: string): { label: string; meters: number } | undefined { + // Variable-length products such as "1 - 30 m" must not be marked as one + // deterministic cable reach. They need variant-level extraction first. + if (/\b\d+(?:[.,]\d+)?\s*(?:-|–|to|bis)\s*\d+(?:[.,]\d+)?\s*(?:m|km)\b/i.test(text)) { + return undefined; + } + + const explicit = text.match(/\b(\d+(?:[.,]\d+)?)\s*(km|m)\b/i); + if (explicit) { + const value = parseFloat(explicit[1].replace(",", ".")); + const unit = explicit[2].toLowerCase(); + if (Number.isFinite(value) && value > 0) { + const meters = unit === "km" ? Math.round(value * 1000) : Math.round(value); + const label = unit === "km" ? `${explicit[1].replace(",", ".")}km` : `${explicit[1].replace(",", ".")}m`; + return { label, meters }; + } + } + const patterns: [RegExp, string, number][] = [ [/\b80\s*km\b/i, "80km", 80000], [/\b40\s*km\b/i, "40km", 40000], [/\b20\s*km\b/i, "20km", 20000], [/\b15\s*km\b/i, "15km", 15000], diff --git a/sync/CURRENT.md b/sync/CURRENT.md index f506b75..052523b 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,27 @@ # Current TIP Sync State -Updated: 2026-05-09 15:15 UTC +Updated: 2026-05-09 15:23 UTC ## Newest Work +- ShopFiber24 parser hardening for deterministic cable/detail verification on 2026-05-09: + - root cause: + - ShopFiber24 contains variable-length AOC/DAC products such as `1 - 30 m` + - those must not be interpreted as one fixed `30m` reach and marked detail-verified + - the scraper also treated `800G` / `QSFP-DD800` product text as `400G` + - code hardened: + - `packages/scraper/src/scrapers/fiber24.ts` + - detects `800G` as `800G` / `800Gbps` + - parses explicit single `m/km` reach values generically + - refuses variable ranges like `1 - 30 m`, `1 to 30 m`, `1 bis 30 m` + - verification: + - `npm run build -w packages/scraper` passed locally + - deployment: + - not deployed yet because Erik SSH currently refuses connections + - truth: + - future ShopFiber24 passes should only mark product details verified when reach is deterministic + - variable cable-family rows need variant-level extraction instead of broad approval + - FiberMall source-title optical detail backfill on 2026-05-09: - precheck: - `69` FiberMall rows had price + image + source URL but lacked detail verification diff --git a/sync/history/2026-05-09-shopfiber24-parser-hardening.md b/sync/history/2026-05-09-shopfiber24-parser-hardening.md new file mode 100644 index 0000000..879c4e8 --- /dev/null +++ b/sync/history/2026-05-09-shopfiber24-parser-hardening.md @@ -0,0 +1,30 @@ +# ShopFiber24 Parser Hardening - 2026-05-09 + +## Context + +ShopFiber24 has many near-complete rows where prices, images and source URLs exist. Several remaining products are variable-length cable families, so they must not be treated as deterministic variants. + +## Root Cause + +- Variable cable ranges such as `1 - 30 m` can be misread by simple reach patterns as a fixed `30m` reach +- `800G` / `QSFP-DD800` product text was classified as `400G` + +## Code Hardened + +- `packages/scraper/src/scrapers/fiber24.ts` + - detects `800G` as `800G` / `800Gbps` + - parses explicit single `m/km` reach values generically + - refuses variable ranges like `1 - 30 m`, `1 to 30 m`, `1 bis 30 m` + +## Verification + +- `npm run build -w packages/scraper` passed locally + +## Deployment + +Not deployed yet because Erik SSH was refusing connections. This should be synced to `/opt/tip` and rebuilt once Erik is reachable. + +## Truth Policy + +Future ShopFiber24 passes should only mark product details verified when reach is deterministic. Variable cable-family rows need variant-level extraction and must not be blindly approved. +