fix: harden shopfiber24 reach parsing

This commit is contained in:
Rene Fichtmueller 2026-05-09 17:24:06 +02:00
parent 6d8d7874d3
commit c2421c03a3
3 changed files with 68 additions and 2 deletions

View File

@ -49,7 +49,8 @@ function sleep(ms: number): Promise<void> {
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } { function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
const lower = text.toLowerCase(); const lower = text.toLowerCase();
if (lower.includes("400g") || lower.includes("qsfp-dd800") || lower.includes("800g")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; if (lower.includes("800g") || lower.includes("qsfp-dd800")) return { formFactor: lower.includes("osfp") ? "OSFP" : "QSFP-DD", speed: "800G", speedGbps: 800 };
if (lower.includes("400g")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
if (lower.includes("osfp") && !lower.includes("qsfp")) return { formFactor: "OSFP", speed: "400G", speedGbps: 400 }; if (lower.includes("osfp") && !lower.includes("qsfp")) return { formFactor: "OSFP", speed: "400G", speedGbps: 400 };
if (lower.includes("qsfp-dd") || lower.includes("qsfpdd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; if (lower.includes("qsfp-dd") || lower.includes("qsfpdd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
if (lower.includes("qsfp28") || lower.includes("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; if (lower.includes("qsfp28") || lower.includes("100g")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
@ -63,6 +64,23 @@ function detectFormFactor(text: string): { formFactor: string; speed: string; sp
} }
function detectReach(text: string): { label: string; meters: number } | undefined { function detectReach(text: string): { label: string; meters: number } | undefined {
// Variable-length products such as "1 - 30 m" must not be marked as one
// deterministic cable reach. They need variant-level extraction first.
if (/\b\d+(?:[.,]\d+)?\s*(?:-||to|bis)\s*\d+(?:[.,]\d+)?\s*(?:m|km)\b/i.test(text)) {
return undefined;
}
const explicit = text.match(/\b(\d+(?:[.,]\d+)?)\s*(km|m)\b/i);
if (explicit) {
const value = parseFloat(explicit[1].replace(",", "."));
const unit = explicit[2].toLowerCase();
if (Number.isFinite(value) && value > 0) {
const meters = unit === "km" ? Math.round(value * 1000) : Math.round(value);
const label = unit === "km" ? `${explicit[1].replace(",", ".")}km` : `${explicit[1].replace(",", ".")}m`;
return { label, meters };
}
}
const patterns: [RegExp, string, number][] = [ const patterns: [RegExp, string, number][] = [
[/\b80\s*km\b/i, "80km", 80000], [/\b40\s*km\b/i, "40km", 40000], [/\b80\s*km\b/i, "80km", 80000], [/\b40\s*km\b/i, "40km", 40000],
[/\b20\s*km\b/i, "20km", 20000], [/\b15\s*km\b/i, "15km", 15000], [/\b20\s*km\b/i, "20km", 20000], [/\b15\s*km\b/i, "15km", 15000],

View File

@ -1,9 +1,27 @@
# Current TIP Sync State # Current TIP Sync State
Updated: 2026-05-09 15:15 UTC Updated: 2026-05-09 15:23 UTC
## Newest Work ## Newest Work
- ShopFiber24 parser hardening for deterministic cable/detail verification on 2026-05-09:
- root cause:
- ShopFiber24 contains variable-length AOC/DAC products such as `1 - 30 m`
- those must not be interpreted as one fixed `30m` reach and marked detail-verified
- the scraper also treated `800G` / `QSFP-DD800` product text as `400G`
- code hardened:
- `packages/scraper/src/scrapers/fiber24.ts`
- classifies `800G` / `QSFP-DD800` product text as speed `800G` (`speedGbps: 800`) instead of `400G`
- parses any explicit single reach value given in `m` or `km` (decimal comma or point accepted) instead of relying only on a fixed pattern list
- refuses variable ranges like `1 - 30 m`, `1 to 30 m`, `1 bis 30 m`
- verification:
- `npm run build -w packages/scraper` passed locally
- deployment:
- not deployed yet because SSH connections to Erik are currently being refused
- truth:
- future ShopFiber24 passes should only mark product details verified when reach is deterministic
- variable cable-family rows need variant-level extraction instead of broad approval
- FiberMall source-title optical detail backfill on 2026-05-09: - FiberMall source-title optical detail backfill on 2026-05-09:
- precheck: - precheck:
- `69` FiberMall rows had price + image + source URL but lacked detail verification - `69` FiberMall rows had price + image + source URL but lacked detail verification

View File

@ -0,0 +1,30 @@
# ShopFiber24 Parser Hardening - 2026-05-09
## Context
ShopFiber24 has many near-complete rows where prices, images and source URLs exist. Several remaining products are variable-length cable families, so they must not be treated as deterministic variants.
## Root Cause
- Variable cable ranges such as `1 - 30 m` can be misread by simple reach patterns as a fixed `30m` reach
- `800G` / `QSFP-DD800` product text was classified as `400G`
## Code Hardened
- `packages/scraper/src/scrapers/fiber24.ts`
- classifies `800G` / `QSFP-DD800` product text as speed `800G` (`speedGbps: 800`) instead of `400G`
- parses any explicit single reach value given in `m` or `km` (decimal comma or point accepted) instead of relying only on a fixed pattern list
- refuses variable ranges like `1 - 30 m`, `1 to 30 m`, `1 bis 30 m`
## Verification
- `npm run build -w packages/scraper` passed locally
## Deployment
Not deployed yet because SSH connections to Erik were being refused. The change should be synced to `/opt/tip` and rebuilt once Erik is reachable.
## Truth Policy
Future ShopFiber24 passes should only mark product details verified when reach is deterministic. Variable cable-family rows need variant-level extraction and must not be blindly approved.