crawl: add global verification robot learning

This commit is contained in:
Rene Fichtmueller 2026-05-09 20:19:44 +02:00
parent 79651febbe
commit d5361413bc

View File

@@ -33,3 +33,4 @@
{"event":"targeted_detail_verifier","observed_at":"2026-05-09T16:12:00Z","actor":"codex-atgbics-product-js-detail-verifier","profile":"erik-safe-lightweight-fetch","wave":"atgbics-product-js-closure","vendor":"ATGBICS","summary":"Added and ran a lightweight Shopify product.js verifier for ATGBICS near-complete rows. Closed the ATGBICS near-complete detail queue.","code":{"repo_path":"packages/scraper/src/scrapers/atgbics-detail-pages.ts","script":"scrape:atgbics:details","fetch_mode":"one product.js JSON endpoint per product page","browser_used":false},"input":{"precheck":{"atgbics_near_complete_missing_details":107},"source_evidence":["Shopify product.js title","Shopify product.js description","Shopify tags such as Max Data Rate, Max Distance, Cable Type, Wavelength, Interface, Product Category"]},"decision":{"rules":["Prefer structured Shopify tags but fall back to title/body when a tag is unparseable or says N/A.","Never accept distance without media evidence.","Loopback/test products use N/A reach/fiber/wavelength semantics.","Do not use external AI; parser rules only."],"bug_learned":"Max Distance_N/A tags can hide real reach in product title/description; parser must use parsed tag value, not tag presence, as the gate.","runtime_policy":"Lightweight source fetch; no browser crawler; paced requests."},"outcome":{"runs":[{"fetched":107,"updated":97,"skipped":10,"promoted":97},{"fetched":10,"updated":10,"skipped":0,"promoted":10}],"postcheck":{"atgbics_near_complete_missing_details":0},"global_after":{"total":17647,"details_verified":12137,"fully_verified":10860},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":13}},"truth_policy":"ATGBICS product pages are accepted as source evidence only when detail fields can be parsed from tags/title/body; unparseable rows remain skipped until parser evidence is improved.","safety_notes":["No external AI was used.","No Playwright/browser crawler was started.","SSH refused intermittently; retries were paused to protect Erik."]}
{"event":"targeted_detail_verifier","observed_at":"2026-05-09T16:20:00Z","actor":"codex-shopfiber24-fibermall-detail-verifier","profile":"erik-safe-lightweight-fetch","wave":"shopfiber24-fibermall-near-complete-closure","vendor":"ShopFiber24+FiberMall","summary":"Added and ran a lightweight static detail verifier for the remaining ShopFiber24 and FiberMall near-complete rows. Closed the global near-complete queue.","code":{"repo_path":"packages/scraper/src/scrapers/shopfiber24-fibermall-detail-pages.ts","script":"scrape:vendors:details","fetch_mode":"one static HTML page per product URL","browser_used":false},"input":{"precheck":{"fibermall_near_complete_missing_details":24,"shopfiber24_near_complete_missing_details":92,"global_near_complete_missing_details":116},"source_evidence":["FiberMall Schema.org Product JSON-LD name/description/mpn","ShopFiber24 title/meta description/OG metadata","source URL and existing DB price/image/competitor evidence"]},"decision":{"rules":["FiberMall JSON-LD Product blocks are primary source evidence for name, MPN, media, speed, reach, wavelength, connector and image provenance.","ShopFiber24 static title/meta text is accepted when it contains deterministic protocol/reach/media evidence.","Variable AOC/DAC/category pages must be classified as product families with Variant reach, not a fake fixed meter value.","Switches, media converters, muxes and adapters must be classified as their product class, not as optical equivalents.","100G DWDM DCO rows without a normal reach should be classified as Coherent DWDM with line-system-dependent reach.","10GBASE-T copper SFP+ rows use RJ45 Copper 30m standard semantics when source identifies 10G Base-T copper but omits distance."],"runtime_policy":"Lightweight source fetch; no browser crawler; no external AI."},"outcome":{"runs":[{"fetched":116,"updated":112,"skipped":4,"promoted":112},{"fetched":4,"updated":4,"skipped":0,"promoted":4}],"postcheck":{"fibermall_near_complete_missing_details":0,"shopfiber24_near_complete_missing_details":0,"global_near_complete_missing_details":0},"global_after":{"total":17647,"details_verified":12253,"fully_verified":10976},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":12}},"truth_policy":"Closing a queue does not mean inventing optics. Families/accessories/converters are verified as the product they actually are, and should not be used as 1:1 optical competitor equivalents unless a separate equivalence rule allows it.","safety_notes":["No external AI was used.","No Playwright/browser crawler was started.","Erik public health stayed healthy.","SSH refused intermittently; retries were delayed."]}
{"event":"targeted_detail_verifier_followup","observed_at":"2026-05-09T16:25:00Z","actor":"codex-atgbics-product-js-detail-verifier","profile":"erik-safe-lightweight-fetch","wave":"atgbics-aoc-late-price-followup","vendor":"ATGBICS","summary":"A concurrent price-verification process exposed 23 new ATGBICS AOC near-complete rows after the first closure. Re-ran the ATGBICS product.js verifier and returned the global near-complete queue to zero.","input":{"precheck":{"new_atgbics_near_complete_missing_details":23,"reason":"price_verified increased from 11557 to 11582 after the first queue closure"}},"decision":{"rules":["Use the already-deployed ATGBICS product.js verifier for newly exposed rows.","AOC product URLs/titles with explicit m length, MMF and 850nm are deterministic source evidence.","Pause between SSH retries when Erik refuses connections."],"runtime_policy":"Lightweight source fetch; no browser crawler; no external AI."},"outcome":{"runs":[{"fetched":23,"updated":23,"skipped":0,"promoted":23}],"postcheck":{"global_near_complete_missing_details":0},"global_after":{"total":17647,"price_verified":11582,"details_verified":12276,"fully_verified":11001},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":12}},"truth_policy":"Queue closure must be rechecked after concurrent crawler/price jobs because newly price-verified rows can enter the near-complete set after a previous zero state.","safety_notes":["No external AI was used.","No Playwright/browser crawler was started.","Erik public health stayed healthy."]}
{"event":"targeted_scraper_repair","observed_at":"2026-05-09T18:16:00Z","actor":"codex-gaotek-catalog-image-detail-repair","profile":"erik-safe-lightweight-fetch","wave":"global-verification-continuation","vendor":"GAO Tek+OEM catalog vendors+Flexoptix","summary":"Repaired GAO Tek product-card extraction, excluded category URLs from active product verification, added catalog-derived detail verification, and fixed Flexoptix image backfill case sensitivity.","code":{"changed_paths":["packages/scraper/src/scrapers/gaotek.ts","packages/scraper/src/utils/verify-catalog-details.ts","packages/scraper/src/utils/backfill-images.ts","packages/scraper/src/scheduler.ts","packages/api/src/routes/health.ts","packages/api/src/db/queries.ts","packages/scraper/package.json"],"browser_used":false,"external_ai_used":false},"input":{"precheck":{"active_total":17647,"price_verified":11582,"image_verified":11966,"details_verified":12277,"competitor_verified":11492,"fully_verified":11001,"top_blockers":["GAO Tek","Juniper Networks","Cisco Systems","Ascent Optics","Eoptolink","10Gtek","Flexoptix","FS.COM"]}},"decision":{"rules":["GAO Tek live category uses Woodmart .wd-product.product-grid-item cards; old WooCommerce li.product selector returns zero cards and causes noisy fallback records.","Fallback extraction must only accept real https://gaotek.com/product/... URLs, never category URLs.","GAO public catalog has no prices; do not fake price verification for quote-only/no-public-price products.","Catalog-derived details can be promoted only when normalized part_number, form_factor, speed_gbps, reach_label and fiber_type are complete and a vendor website/docs/datasheet source URL exists.","Category URLs must be excluded from active verification counters and search results so non-products do not block product verification.","Flexoptix image backfill must match vendor name case-insensitively because DB has Flexoptix, not FLEXOPTIX."],"runtime_policy":"Use TIP deterministic scrapers/robots only; no external AI; keep Erik safe with sequential lightweight fetches."},"outcome":{"gaotek":{"pages_fetched":20,"products_per_page":24,"unique_products":480,"category_artifacts_reset":6,"prices_found":0},"catalog_details_verifier":{"details_verified":4340,"fully_verified_earned":0},"image_backfill":{"expanded_vendor_backfill_updated":48,"flexoptix_case_fix_updated":12,"flexoptix_remaining_no_image":110},"jobs":{"pi_fetch_wave":["scrape:pricing:gaotek","scrape:catalog:juniper-oem","scrape:catalog:juniper-mx-oem","scrape:catalog:juniper-qfx-oem","scrape:catalog:cisco-nexus-oem","scrape:catalog:cisco-catalyst-oem","scrape:catalog:cisco-asr-oem","scrape:pricing:ascentoptics","scrape:catalog:eoptolink","scrape:pricing:flexoptix","scrape:vendors:flexoptix-supported","scrape:catalog:arista-oem"],"reconcile_completed":true,"matcher_completed":true},"global_after":{"active_total":17714,"price_verified":11582,"image_verified":12194,"details_verified":16684,"competitor_verified":11492,"fully_verified":11052},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":13}},"truth_policy":"Details verification may be source-backed by official catalog/documentation roots when product rows already have complete normalized specs; price verification still requires an actual observed public/current price and must remain false for quote-only products.","safety_notes":["No external AI was used; TIPLLM/Ollama endpoint was unavailable earlier and no fallback AI was used.","No Playwright wave was started.","Erik load stayed ok and memory stayed about 13%.","pg-boss training writer on Erik is not inside a git checkout, so this JSONL entry was written manually to the TIP training pool."]}