crawl: add vendor detail verifier learning

This commit is contained in:
Rene Fichtmueller 2026-05-09 18:22:31 +02:00
parent 5567c449ab
commit b979358ff0

View File

@@ -30,3 +30,5 @@
{"event":"db_evidence_backfill","observed_at":"2026-05-09T15:53:00Z","actor":"codex-naddod-adapter-and-fs-final-detail-closure","profile":"erik-safe-db-plus-source-search","wave":"naddod-fs-near-complete-closure","vendor":"NADDOD+FS.COM","summary":"Closed NADDOD and FS.COM near-complete detail queues: classified three NADDOD rows as adapter/converter products and corrected FS SKU 110529 to QDD-LR4-400G 400GBASE-LR4 QSFP-DD evidence.","input":{"precheck":{"naddod_near_complete":3,"fscom_near_complete":1},"source_evidence":["NADDOD product pages/products search snippets identify 100GBASE-S25, 40GBASE-S10, and MAM1Q00A-QSA28-S as QSFP/SFP adapter converter modules.","Official FS source for SKU 110529 identifies QDD-LR4-400G, 400GBASE-LR4 QSFP-DD, SMF, 10km, CWDM4 1271/1291/1311/1331nm, Duplex LC."]},"decision":{"rules":["Adapter/converter products get Adapter / Converter class and N/A optical reach/fiber/wavelength semantics.","Do not use adapter/converter rows as optical equivalence evidence.","FS SKU 110529 should be normalized to QDD-LR4-400G instead of opaque FS-110529."],"runtime_policy":"DB-only update after source lookup; no crawler wave; no external AI."},"outcome":{"updated":{"detail_rows":4,"fully_verified_promoted":3},"postcheck":{"naddod_near_complete":0,"fscom_near_complete":0},"global_after":{"total":17647,"details_verified":11652,"fully_verified":10375},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":13}},"truth_policy":"Adapter/converter rows are product records but not optical transceiver equivalents; FS SKU 110529 is LR4, not a generic opaque product.","safety_notes":["No external AI was used.","No browser crawler was started.","FS direct page body was WAF-challenged, so official indexed FS source evidence was used instead."]} 
{"event":"db_evidence_backfill","observed_at":"2026-05-09T15:53:00Z","actor":"codex-naddod-adapter-and-fs-final-detail-closure","profile":"erik-safe-db-plus-source-search","wave":"naddod-fs-near-complete-closure","vendor":"NADDOD+FS.COM","summary":"Closed NADDOD and FS.COM near-complete detail queues: classified three NADDOD rows as adapter/converter products and corrected FS SKU 110529 to QDD-LR4-400G 400GBASE-LR4 QSFP-DD evidence.","input":{"precheck":{"naddod_near_complete":3,"fscom_near_complete":1},"source_evidence":["NADDOD product pages/products search snippets identify 100GBASE-S25, 40GBASE-S10, and MAM1Q00A-QSA28-S as QSFP/SFP adapter converter modules.","Official FS source for SKU 110529 identifies QDD-LR4-400G, 400GBASE-LR4 QSFP-DD, SMF, 10km, CWDM4 1271/1291/1311/1331nm, Duplex LC."]},"decision":{"rules":["Adapter/converter products get Adapter / Converter class and N/A optical reach/fiber/wavelength semantics.","Do not use adapter/converter rows as optical equivalence evidence.","FS SKU 110529 should be normalized to QDD-LR4-400G instead of opaque FS-110529."],"runtime_policy":"DB-only update after source lookup; no crawler wave; no external AI."},"outcome":{"updated":{"detail_rows":4,"fully_verified_promoted":3},"postcheck":{"naddod_near_complete":0,"fscom_near_complete":0},"global_after":{"total":17647,"details_verified":11652,"fully_verified":10375},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":13}},"truth_policy":"Adapter/converter rows are product records but not optical transceiver equivalents; FS SKU 110529 is LR4, not a generic opaque product.","safety_notes":["No external AI was used.","No browser crawler was started.","FS direct page body was WAF-challenged, so official indexed FS source evidence was used instead."]}
{"event":"db_evidence_backfill","observed_at":"2026-05-09T16:00:00Z","actor":"codex-atgbics-explicit-url-evidence-backfill","profile":"erik-safe-db-only","wave":"atgbics-url-evidence-closure","vendor":"ATGBICS","summary":"Backfilled 346 ATGBICS rows from explicit product URL evidence for distance, media and wavelength. Promoted 346 additional rows to fully_verified.","input":{"precheck":{"atgbics_near_complete_missing_details":485,"safe_url_reach_media_rows":346,"url_evidence":["m/km distance","smf/mmf/copper/dac/base-t/rj45 media","nm wavelength where optical"]}},"decision":{"rules":["Only accept rows where product URL contains explicit distance and media evidence.","Extract reach from m/km URL token, not from unrelated product-number digits.","Extract wavelength from nm URL token when optical; use N/A for copper/dac/base-t/rj45.","Correct form factor and speed only from clear protocol terms in the URL."],"runtime_policy":"DB-only update; no crawler wave; no external AI."},"outcome":{"updated":{"atgbics_detail_rows":346,"fully_verified_promoted":346},"postcheck":{"atgbics_near_complete_missing_details":139},"global_after":{"total":17647,"details_verified":11998,"fully_verified":10721},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":13}},"truth_policy":"Remaining ATGBICS rows need product-page parsing or special handling because they do not expose simple m/km plus media URL evidence.","safety_notes":["No external AI was used.","No browser crawler was started.","Erik public health stayed healthy."]}
{"event":"db_evidence_backfill","observed_at":"2026-05-09T16:00:00Z","actor":"codex-atgbics-explicit-url-evidence-backfill","profile":"erik-safe-db-only","wave":"atgbics-url-evidence-closure","vendor":"ATGBICS","summary":"Backfilled 346 ATGBICS rows from explicit product URL evidence for distance, media and wavelength. Promoted 346 additional rows to fully_verified.","input":{"precheck":{"atgbics_near_complete_missing_details":485,"safe_url_reach_media_rows":346,"url_evidence":["m/km distance","smf/mmf/copper/dac/base-t/rj45 media","nm wavelength where optical"]}},"decision":{"rules":["Only accept rows where product URL contains explicit distance and media evidence.","Extract reach from m/km URL token, not from unrelated product-number digits.","Extract wavelength from nm URL token when optical; use N/A for copper/dac/base-t/rj45.","Correct form factor and speed only from clear protocol terms in the URL."],"runtime_policy":"DB-only update; no crawler wave; no external AI."},"outcome":{"updated":{"atgbics_detail_rows":346,"fully_verified_promoted":346},"postcheck":{"atgbics_near_complete_missing_details":139},"global_after":{"total":17647,"details_verified":11998,"fully_verified":10721},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":13}},"truth_policy":"Remaining ATGBICS rows need product-page parsing or special handling because they do not expose simple m/km plus media URL evidence.","safety_notes":["No external AI was used.","No browser crawler was started.","Erik public health stayed healthy."]}
{"event":"db_evidence_backfill","observed_at":"2026-05-09T16:05:00Z","actor":"codex-atgbics-deterministic-special-case-backfill","profile":"erik-safe-db-only","wave":"atgbics-special-case-closure","vendor":"ATGBICS","summary":"Backfilled 32 ATGBICS special-case rows using deterministic protocol/product-class rules. Promoted 32 additional rows to fully_verified.","input":{"precheck":{"atgbics_near_complete_missing_details":139,"safe_special_case_rows":32,"patterns":["loopback","10GBASE-T/RJ45","LRM","BX60/BXD-60/BXU-60","CWDM 10G 60km","CSR"]}},"decision":{"rules":["Loopback/test modules are non-optical test products with N/A reach/fiber/wavelength semantics.","10GBASE-T/RJ45 SFP+ uses 30m Copper with N/A wavelength.","LRM uses 220m MMF at 1310nm.","BX60 uses 60km SMF with directional BiDi wavelength evidence.","CWDM 10G 60 uses 60km SMF and source wavelength.","CSR uses 400m MMF at 850nm."],"runtime_policy":"DB-only update; no crawler wave; no external AI."},"outcome":{"updated":{"atgbics_detail_rows":32,"fully_verified_promoted":32},"postcheck":{"atgbics_near_complete_missing_details":107},"global_after":{"total":17647,"details_verified":12030,"fully_verified":10753},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":12}},"truth_policy":"Remaining ATGBICS rows need detail-page extraction because the URL slug no longer carries enough reach evidence.","safety_notes":["No external AI was used.","No browser crawler was started.","Erik public health stayed healthy."]}
{"event":"db_evidence_backfill","observed_at":"2026-05-09T16:05:00Z","actor":"codex-atgbics-deterministic-special-case-backfill","profile":"erik-safe-db-only","wave":"atgbics-special-case-closure","vendor":"ATGBICS","summary":"Backfilled 32 ATGBICS special-case rows using deterministic protocol/product-class rules. Promoted 32 additional rows to fully_verified.","input":{"precheck":{"atgbics_near_complete_missing_details":139,"safe_special_case_rows":32,"patterns":["loopback","10GBASE-T/RJ45","LRM","BX60/BXD-60/BXU-60","CWDM 10G 60km","CSR"]}},"decision":{"rules":["Loopback/test modules are non-optical test products with N/A reach/fiber/wavelength semantics.","10GBASE-T/RJ45 SFP+ uses 30m Copper with N/A wavelength.","LRM uses 220m MMF at 1310nm.","BX60 uses 60km SMF with directional BiDi wavelength evidence.","CWDM 10G 60 uses 60km SMF and source wavelength.","CSR uses 400m MMF at 850nm."],"runtime_policy":"DB-only update; no crawler wave; no external AI."},"outcome":{"updated":{"atgbics_detail_rows":32,"fully_verified_promoted":32},"postcheck":{"atgbics_near_complete_missing_details":107},"global_after":{"total":17647,"details_verified":12030,"fully_verified":10753},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":12}},"truth_policy":"Remaining ATGBICS rows need detail-page extraction because the URL slug no longer carries enough reach evidence.","safety_notes":["No external AI was used.","No browser crawler was started.","Erik public health stayed healthy."]}
{"event":"targeted_detail_verifier","observed_at":"2026-05-09T16:12:00Z","actor":"codex-atgbics-product-js-detail-verifier","profile":"erik-safe-lightweight-fetch","wave":"atgbics-product-js-closure","vendor":"ATGBICS","summary":"Added and ran a lightweight Shopify product.js verifier for ATGBICS near-complete rows. Closed the ATGBICS near-complete detail queue.","code":{"repo_path":"packages/scraper/src/scrapers/atgbics-detail-pages.ts","script":"scrape:atgbics:details","fetch_mode":"one product.js JSON endpoint per product page","browser_used":false},"input":{"precheck":{"atgbics_near_complete_missing_details":107},"source_evidence":["Shopify product.js title","Shopify product.js description","Shopify tags such as Max Data Rate, Max Distance, Cable Type, Wavelength, Interface, Product Category"]},"decision":{"rules":["Prefer structured Shopify tags but fall back to title/body when a tag is unparseable or says N/A.","Never accept distance without media evidence.","Loopback/test products use N/A reach/fiber/wavelength semantics.","Do not use external AI; parser rules only."],"bug_learned":"Max Distance_N/A tags can hide real reach in product title/description; parser must use parsed tag value, not tag presence, as the gate.","runtime_policy":"Lightweight source fetch; no browser crawler; paced requests."},"outcome":{"runs":[{"fetched":107,"updated":97,"skipped":10,"promoted":97},{"fetched":10,"updated":10,"skipped":0,"promoted":10}],"postcheck":{"atgbics_near_complete_missing_details":0},"global_after":{"total":17647,"details_verified":12137,"fully_verified":10860},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":13}},"truth_policy":"ATGBICS product pages are accepted as source evidence only when detail fields can be parsed from tags/title/body; unparseable rows remain skipped until parser evidence is improved.","safety_notes":["No external AI was used.","No Playwright/browser crawler was started.","SSH refused intermittently; retries were paused to protect Erik."]}
{"event":"targeted_detail_verifier","observed_at":"2026-05-09T16:20:00Z","actor":"codex-shopfiber24-fibermall-detail-verifier","profile":"erik-safe-lightweight-fetch","wave":"shopfiber24-fibermall-near-complete-closure","vendor":"ShopFiber24+FiberMall","summary":"Added and ran a lightweight static detail verifier for the remaining ShopFiber24 and FiberMall near-complete rows. Closed the global near-complete queue.","code":{"repo_path":"packages/scraper/src/scrapers/shopfiber24-fibermall-detail-pages.ts","script":"scrape:vendors:details","fetch_mode":"one static HTML page per product URL","browser_used":false},"input":{"precheck":{"fibermall_near_complete_missing_details":24,"shopfiber24_near_complete_missing_details":92,"global_near_complete_missing_details":116},"source_evidence":["FiberMall Schema.org Product JSON-LD name/description/mpn","ShopFiber24 title/meta description/OG metadata","source URL and existing DB price/image/competitor evidence"]},"decision":{"rules":["FiberMall JSON-LD Product blocks are primary source evidence for name, MPN, media, speed, reach, wavelength, connector and image provenance.","ShopFiber24 static title/meta text is accepted when it contains deterministic protocol/reach/media evidence.","Variable AOC/DAC/category pages must be classified as product families with Variant reach, not a fake fixed meter value.","Switches, media converters, muxes and adapters must be classified as their product class, not as optical equivalents.","100G DWDM DCO rows without a normal reach should be classified as Coherent DWDM with line-system-dependent reach.","10GBASE-T copper SFP+ rows use RJ45 Copper 30m standard semantics when source identifies 10G Base-T copper but omits distance."],"runtime_policy":"Lightweight source fetch; no browser crawler; no external AI."},"outcome":{"runs":[{"fetched":116,"updated":112,"skipped":4,"promoted":112},{"fetched":4,"updated":4,"skipped":0,"promoted":4}],"postcheck":{"fibermall_near_complete_missing_details":0,"shopfiber24_near_complete_missing_details":0,"global_near_complete_missing_details":0},"global_after":{"total":17647,"details_verified":12253,"fully_verified":10976},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":12}},"truth_policy":"Closing a queue does not mean inventing optics. Families/accessories/converters are verified as the product they actually are, and should not be used as 1:1 optical competitor equivalents unless a separate equivalence rule allows it.","safety_notes":["No external AI was used.","No Playwright/browser crawler was started.","Erik public health stayed healthy.","SSH refused intermittently; retries were delayed."]}