crawl: add crawlee integration and equivalence revalidation records

This commit is contained in:
Rene Fichtmueller 2026-05-09 14:14:58 +02:00
parent 4d9a11c4cd
commit dfdd1d96a3

View File

@@ -9,3 +9,5 @@
{"event":"crawler_result","observed_at":"2026-05-09T09:02:00Z","actor":"codex-fscom-db-detail-batch-3","profile":"erik-safe-playwright-db-detail-only","wave":"manual-targeted-enrichment","vendor":"FS.COM","summary":"Ran a third targeted FS.com DB-detail-only batch. It completed 90 requests with 0 final failures, added fresh price and stock observations, and moved most remaining HTML product URLs into verified detail coverage.","input":{"user_request":"Finish TIP today with perfect transceiver, competitor and switch database; continue FS.com/Fiberstore verification safely.","precheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":220,"price_image_details_verified":220,"fully_verified":137,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":187,"html_product_like_rows":299,"html_complete":218}},"decision":{"runtime_policy":"Use another capped DB-detail-only batch because remaining product URLs still produced verified evidence. Increase cap from 80 to 90 but keep Playwright concurrency 1, nice -n 10 and no category discovery.","truth_policy":"Do not mark pages with no useful text as details verified. Record no-text cases for retry/source-review instead."},"outcome":{"batch":{"target":90,"scraped":89,"failed":0,"new_price_observations":21,"new_stock_observations":24,"specs_verified":47,"retry_histogram":"70 first try, 13 one retry, 5 two retries, 2 three retries","notable_warning":"One FS.com page returned no useful text and was not counted as scraped."},"postcheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":260,"price_image_details_verified":260,"fully_verified":199,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":124,"html_product_like_rows":299,"html_complete":258,"html_image_verified":282,"no_url_rows":76,"category_rows":4},"tip_health":{"status":"healthy","transceiver_count":17647,"vendor_count":478,"switch_count":680,"price_verified":11557,"image_verified":10636,"details_verified":9816,"fully_verified":8516,"load_status":"ok","memory_used_pct":13}},"safety_notes":["No external AI was used for TIP crawler planning or extraction feedback.","Erik remained stable; TIP health stayed OK after the run.","Next best action is a smaller DB-detail-only closure batch for the remaining FS.com HTML product URLs, then a separate source-discovery task for no-url rows."]}
{"event":"crawler_result","observed_at":"2026-05-09T09:15:00Z","actor":"codex-fscom-db-detail-batch-4","profile":"erik-safe-playwright-db-detail-only","wave":"manual-targeted-enrichment-closure","vendor":"FS.COM","summary":"Ran a smaller FS.com closure batch after the productive HTML detail waves. It completed all 42 queued pages cleanly, but details_verified did not increase, showing that the DB-detail-only approach is now exhausted for the remaining gaps.","input":{"user_request":"Finish FS.com/Fiberstore as far as possible without manual validation and keep all robot experience in TIPLLM training pool.","precheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":260,"price_image_details_verified":260,"fully_verified":199,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":124,"html_product_like_rows":299,"html_complete":258}},"decision":{"runtime_policy":"Run one smaller closure batch because only 42 eligible DB product URLs remained. Keep concurrency 1, nice -n 10, no category discovery.","stop_condition":"If a clean closure batch produces no details_verified movement, stop repeating the same Playwright detail-only job and switch to source-discovery/classification or parser work for the remaining gaps."},"outcome":{"batch":{"target":42,"scraped":42,"failed":0,"new_price_observations":5,"new_stock_observations":3,"specs_verified":25,"retry_histogram":"42 first try, 0 retries"},"postcheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":260,"price_image_details_verified":260,"fully_verified":205,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":123,"html_product_like_rows":299,"html_complete":258,"html_image_verified":282,"no_url_rows":76,"category_rows":4},"tip_health":{"status":"healthy","transceiver_count":17647,"vendor_count":478,"switch_count":680,"price_verified":11557,"image_verified":10636,"details_verified":9816,"fully_verified":8522,"load_status":"ok","memory_used_pct":13}},"safety_notes":["No external AI was used for TIP crawler planning or extraction feedback.","Erik remained healthy after the closure batch.","Do not claim FS.com is complete: 258 of 299 HTML product-like rows are complete; 76 no-url rows need source discovery; 4 category rows need classification."]}
{"event":"crawler_result","observed_at":"2026-05-09T11:58:00Z","actor":"codex-fscom-url-discovery-and-reconcile","profile":"erik-safe-playwright-url-discovery","wave":"manual-targeted-url-discovery","vendor":"FS.COM","summary":"Added and ran FS.com URL-discovery mode for historical FS-<id> rows without product URLs, then reran DB-detail reconciliation with H1-derived detail evidence. This converted almost all FS.com no-url rows into source-linked product rows and substantially increased image/detail coverage.","input":{"user_request":"Continue and finish FS.com/Fiberstore; all crawler/scraper/robot work must go to TIPLLM training pool. Operator also asked whether Crawlee repos can help; use that evaluation to improve safe crawler architecture.","precheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":260,"price_image_details_verified":260,"fully_verified":205,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":123,"html_product_like_rows":299,"html_complete":258}},"decision":{"code_changes":["Added FS_URL_DISCOVERY_ONLY mode that maps existing FS-<numeric-id> rows without product_page_url to https://www.fs.com/de/products/<id>.html and writes evidence onto the existing DB row instead of creating duplicates.","DB_DETAIL_ONLY rows now carry targetTransceiverId so targeted reconciliation updates the original row.","Image verification is marked for target rows when a current FS.com product image is seen.","Detail verification now accepts deterministic source evidence from H1/part/spec text, not only classical spec tables, when fiber/reach/wavelength/connector can be inferred."],"runtime_policy":"Run URL discovery in safe bounded batches on Erik with Playwright concurrency 1, nice -n 10, no category crawl, and post-run DB measurement."},"outcome":{"url_discovery_pilot":{"target":20,"scraped":19,"failed":0,"new_price_observations":19,"new_stock_observations":17,"specs_verified":3,"post_missing_url":57},"url_discovery_full":{"target":56,"scraped":55,"failed":1,"failed_url":"https://www.fs.com/de/products/229461.html","new_price_observations":55,"new_stock_observations":43,"specs_verified":55},"reconciliation":{"target":57,"scraped":55,"failed":0,"new_price_observations":41,"new_stock_observations":40,"specs_verified":55,"notable_no_text":["https://www.fs.com/de/products/110529.html","https://www.fs.com/de/products/167147.html"]},"postcheck":{"total_fs_rows":383,"price_verified":379,"image_verified":374,"details_verified":373,"price_image_details_verified":373,"fully_verified":205,"missing_url":2,"missing_image_url":9,"missing_reach_label":4,"missing_fiber_type":9,"html_product_like_rows":373,"html_image_verified":372,"html_details_verified":371,"html_complete":371,"no_url_rows":2,"category_rows":4},"tip_health":{"status":"healthy","transceiver_count":17647,"vendor_count":478,"switch_count":680,"price_verified":11557,"image_verified":10711,"details_verified":9929,"fully_verified":8526,"load_status":"ok","memory_used_pct":13}},"safety_notes":["No external AI was used for TIP crawler planning or extraction feedback.","Erik remained healthy; Crawlee AutoscaledPool stayed at concurrency 1 and reported no CPU/memory/event-loop overload.","Do not claim FS.com is complete: 371 of 373 HTML product-like rows are complete, but 2 no-url rows, 4 category rows, and 9 image/fiber gaps remain."]}
{"event":"crawler_runtime_integration","observed_at":"2026-05-09T12:12:00Z","actor":"codex-crawlee-binding","profile":"crawlee-core-plus-python-side-worker","wave":"runtime-integration","vendor":"all","summary":"Installed and bound Crawlee runtime work into TIP. TypeScript Crawlee remains the production core; Crawlee-Python was added as an optional isolated evidence worker for Pi/Proxmox/residential nodes.","input":{"user_request":"Install, use and bind apify/crawlee and crawlee-python for TIP.","repos_evaluated":["apify/crawlee","apify/crawlee-python","hiteshchoudhary/crawlee-project"]},"decision":{"code_changes":["Added package scripts for FS.com DB-detail and URL-discovery Crawlee modes.","Added packages/crawlee-python optional worker package.","Added scripts/setup-crawlee-python-worker.sh.","Added docs/TIP_CRAWLEE_RUNTIME.md.","Added package-local .gitignore so Python build artifacts are not committed."],"runtime_policy":"Production writes remain in the TypeScript TIP scraper. Crawlee-Python writes JSONL evidence only and must not directly mutate TIP DB."},"outcome":{"tip_commits":["60531b6 feat: add crawlee python worker integration","49f0871 chore: ignore crawlee python build artifacts"],"python_smoke_test":{"venv":"/tmp/tip-crawlee-python-venv","package":"crawlee==1.6.3","url":"https://crawlee.dev","result":"JSONL evidence output succeeded"}},"safety_notes":["Use low concurrency on Erik.","Move heavier experiments to Pi/Proxmox/residential workers.","No external AI was used."]}
{"event":"equivalence_revalidation_result","observed_at":"2026-05-09T12:16:00Z","actor":"codex-immediate-full-equivalence-revalidation","profile":"erik-safe-db-only","wave":"manual-immediate-full-active-research","vendor":"all-active-equivalence-matches","summary":"Revalidated all active approved/auto-approved TIP equivalence matches immediately with deterministic 1:1 DB evidence rules. No crawler wave was started.","input":{"user_request":"Verify and validate everything open in TIP immediately and check whether products truly match 1:1.","precheck":{"pending":0,"approved":1986,"auto_approved":32080,"rejected":148367,"due_research":0,"active_matches":34066,"no_recent_price_gaps":0,"hard_technical_mismatches":0,"missing_critical_evidence":0}},"decision":{"criteria":["recent competitor price evidence","form factor exact","speed exact","fiber type exact","reach ratio >= 0.85","primary wavelength within 15nm"],"runtime_policy":"Mark all active matches due immediately and process with existing PgBoss maintenance:re-research-equivalences worker. Keep work DB-only to protect Erik."},"outcome":{"marked_due":34066,"jobs_queued":18,"jobs_completed":18,"due_research_after":0,"active_researched_today":34066,"rejected_in_immediate_pass":0,"final_queue":{"pending":0,"approved":1986,"auto_approved":32080,"rejected":148367},"verification_counters":{"competitor_verified":11470,"fully_verified":9135,"price_verified":11557,"image_verified":10711,"details_verified":9929,"total":17647},"tip_health":{"status":"healthy","load_status":"ok","memory_used_pct":13}},"truth_policy":"Manual equivalence queue is empty and active matches were just rechecked, but not every product row in TIP is fully complete; remaining work is vendor-specific product source enrichment.","safety_notes":["No external AI was used.","No broad browser crawler wave was started.","Continue targeted vendor crawlers or offload heavy discovery to Pi/Proxmox workers."]}