From c25300199a7806b6e2ffd8d85fdd1737e8b1d937 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 9 May 2026 16:41:18 +0200 Subject: [PATCH] fix: harden atgbics wavelength semantics --- packages/scraper/src/scrapers/atgbics.ts | 15 ++++- sync/CURRENT.md | 38 +++++++++++- ...n-rerun-and-copper-wavelength-semantics.md | 62 +++++++++++++++++++ 3 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 sync/history/2026-05-09-atgbics-json-rerun-and-copper-wavelength-semantics.md diff --git a/packages/scraper/src/scrapers/atgbics.ts b/packages/scraper/src/scrapers/atgbics.ts index 434e9f4..1f6baea 100644 --- a/packages/scraper/src/scrapers/atgbics.ts +++ b/packages/scraper/src/scrapers/atgbics.ts @@ -132,8 +132,21 @@ function detectFiber(text: string): string { } function detectWavelength(text: string): string { + if (/copper|dac|twinax|base-t|rj.?45/i.test(text)) return "N/A"; + const m = text.match(/(\d{3,4})\s*nm/i); - return m ? m[1] : ""; + if (m) { + const nm = m[1]; + if (nm === "1311") return "1310"; + return nm; + } + + // Use protocol-family evidence only when the optical code is explicit. + // This avoids treating arbitrary product-number digits as wavelengths. 
+ if (/\bCWDM4\b/i.test(text)) return "1271,1291,1311,1331"; + if (/\b(?:SR|SR4|SR8|SRBD|VR|VR4|ESR4|CSR4)\b/i.test(text)) return "850"; + if (/\b(?:DR|DR4|DR8|FR|FR4|FR8|LR|LR4|ER|ER4|PSM4|2DR4|2FR4)\b/i.test(text)) return "1310"; + return ""; } /** diff --git a/sync/CURRENT.md b/sync/CURRENT.md index f3858b5..46ff08b 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,45 @@ # Current TIP Sync State -Updated: 2026-05-09 14:28 UTC +Updated: 2026-05-09 14:39 UTC ## Newest Work +- ATGBICS safe JSON rerun + Copper wavelength semantics on 2026-05-09: + - code hardened: + - `packages/scraper/src/scrapers/atgbics.ts` + - detects `N/A` wavelength for Copper/DAC/Twinax/Base-T/RJ45 products + - detects safe optical protocol-family wavelengths: + - CWDM4 => `1271,1291,1311,1331` + - SR/SR4/SR8/SRBD/VR/VR4/ESR4/CSR4 => `850` + - DR/FR/LR/ER/PSM family => `1310` + - deployment: + - synced patched ATGBICS scraper source to active `/opt/tip` + - `pnpm -C packages/scraper build` passed on Erik + - runtime: + - ran one light ATGBICS Shopify `products.json` pass with `nice -n 10` + - no Playwright/browser crawler + - processed `7946` products + - price updates `61` + - image observations/updates `7943` + - observation: + - ATGBICS verification counters did not move because remaining highspeed wavelength gaps are mostly product rows whose source keys are cable/coherent/variant cases not solved by the current lightweight parser + - sample remaining rows include QSFP-DD ZR/C-band/coherent products and Copper/DAC rows + - DB truth correction: + - Copper/DAC products do not have an optical wavelength and should not be counted as missing optical wavelength + - set empty Copper `wavelengths` to `N/A` for `1044` rows + - highspeed missing-wavelength count changed: + - before Copper correction: `1908` + - after Copper correction: `1360` + - highspeed Copper missing: `0` + - remaining optical/non-Copper highspeed missing: `1220` + - health: + - public TIP health after run/update: 
`healthy` + - load status `ok` + - memory used `14%` + - truth: + - the ATGBICS JSON run was safe and confirmed current prices/images, but did not materially improve ATGBICS technical completeness yet + - next ATGBICS work should be a targeted parser for product URL slug classes: `ZR`, `DCO`, `C-band`, `LAN-WDM`, `CR8`, `breakout`, and OSFP/QSFP-DD cable form-factor correction + - DB-only highspeed wavelength evidence backfill on 2026-05-09: - purpose: - improve product-level technical completeness and future 1:1 comparison quality without running a browser crawler on Erik diff --git a/sync/history/2026-05-09-atgbics-json-rerun-and-copper-wavelength-semantics.md b/sync/history/2026-05-09-atgbics-json-rerun-and-copper-wavelength-semantics.md new file mode 100644 index 0000000..1fe27eb --- /dev/null +++ b/sync/history/2026-05-09-atgbics-json-rerun-and-copper-wavelength-semantics.md @@ -0,0 +1,62 @@ +# ATGBICS JSON Rerun And Copper Wavelength Semantics + +Date: 2026-05-09 +Actor: Codex +Scope: ATGBICS scraper and highspeed wavelength completeness +Mode: Erik-safe Shopify JSON, no browser crawler + +## Code Change + +Patched `packages/scraper/src/scrapers/atgbics.ts`: + +- Copper/DAC/Twinax/Base-T/RJ45 products now produce `wavelengths=N/A` +- CWDM4 now produces `1271,1291,1311,1331` +- SR/SR4/SR8/SRBD/VR/VR4/ESR4/CSR4 family now produces `850` +- DR/FR/LR/ER/PSM family now produces `1310` + +The scraper source was synced to active `/opt/tip` and `pnpm -C packages/scraper build` passed on Erik. 
+ +## Runtime + +Ran one light ATGBICS Shopify `products.json` pass: + +- products processed: `7946` +- price updates: `61` +- image observations/updates: `7943` +- no Playwright/browser crawler +- command used `nice -n 10` + +## Result + +ATGBICS counters stayed effectively unchanged: + +- total: `8269` +- price verified: `8241` +- image verified: `8257` +- details verified: `7435` +- fully verified: `7428` +- highspeed missing wavelengths: `663` + +Reason: remaining ATGBICS highspeed wavelength gaps include many Cable/Copper and coherent/ZR/DCO/C-band variants. These need targeted classification and parser work rather than another broad JSON pass. + +## Copper Truth Correction + +Copper/DAC products do not have optical wavelengths. Empty Copper `wavelengths` were set to `N/A`: + +- rows updated: `1044` +- highspeed missing-wavelength count before Copper correction: `1908` +- highspeed missing-wavelength count after Copper correction: `1360` +- highspeed Copper missing after correction: `0` +- remaining optical/non-Copper highspeed missing: `1220` + +## Health + +Public TIP health after run/update: + +- status: `healthy` +- load status: `ok` +- memory used: `14%` + +## Training Note + +TIPLLM should not count Copper/DAC/Twinax products as missing optical wavelengths. Use `N/A` for wavelength semantics. For ATGBICS, another broad Shopify JSON pass is low-risk but low-yield; next useful work is targeted parsing of URL/title classes such as `ZR`, `DCO`, `C-band`, `LAN-WDM`, `CR8`, `breakout`, and cable form-factor correction.