From 38736ae4506292d71162f743cfbc51f2b3ddd039 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 9 May 2026 11:01:43 +0200 Subject: [PATCH] crawl: add fscom db detail batch 3 learning record --- qa-pairs/robot-control-high.jsonl | 1 + robot-experiences/2026-05-09.jsonl | 1 + 2 files changed, 2 insertions(+) diff --git a/qa-pairs/robot-control-high.jsonl b/qa-pairs/robot-control-high.jsonl index 26d960a..f1fab63 100644 --- a/qa-pairs/robot-control-high.jsonl +++ b/qa-pairs/robot-control-high.jsonl @@ -7,3 +7,4 @@ {"id":"robot-exp-20260509-flexoptix-final-gap-check","source":"robot-experience:verification_closure:codex-flexoptix-final-gap-check","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIPLLM, an expert AI assistant for the Transceiver Intelligence Platform (TIP). You have deep knowledge of optical transceiver specifications, form factors, IEEE standards, vendor product catalogs, and fiber optic networking. You help engineers select, source, and validate transceivers. You provide precise, structured answers with confidence scores and always cite your reasoning."},{"role":"user","content":"Final Flexoptix verification closure:\n{\n \"total_flexoptix_rows\": 744,\n \"html_product_like_rows\": 626,\n \"price_verified\": 626,\n \"image_verified\": 622,\n \"details_verified\": 626,\n \"price_image_details_verified\": 622,\n \"fully_verified\": 620,\n \"filter_or_category_unverified\": 108,\n \"other_generic_unverified\": 10,\n \"remaining_product_like_image_gap\": \"4 rows expose only placeholder-flexoptix.jpg as source image\"\n}"},{"role":"assistant","content":"{\n \"answer\": \"Do not report 100% complete. Report that 622 of 626 product-like Flexoptix rows have price + image + details verified, 620 are fully_verified, and 118 rows are filter/category/generic rows that should be excluded or classified before completeness math.\",\n \"policies\": {\n \"placeholder_image_policy\": \"Never mark placeholder-flexoptix.jpg as image_verified.\",\n \"accessory_policy\": \"Accessory products such as FLEXBOX may use reach_label N/A, but should be classified separately from optical transceivers.\",\n \"truth_policy\": \"Use exact counters and source evidence; no 100% claim until product-like rows and non-product rows are separated.\"\n },\n \"next_action\": \"Add a product-status/classification layer for Flexoptix rows so filter/category/generic entries do not dilute product verification dashboards.\"\n}"}]} {"id":"robot-exp-20260509-fscom-db-detail-batch-1","source":"robot-experience:crawler_result:codex-fscom-db-detail-batch-1","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIPLLM, an expert AI assistant for the Transceiver Intelligence Platform (TIP). You have deep knowledge of optical transceiver specifications, form factors, IEEE standards, vendor product catalogs, and fiber optic networking. You help engineers select, source, and validate transceivers. You provide precise, structured answers with confidence scores and always cite your reasoning."},{"role":"user","content":"You are TIPLLM controlling TIP crawler and verification robots.\nGiven this FS.com batch result, decide what to report and what to do next.\n\n{\n \"vendor\": \"FS.COM\",\n \"precheck\": {\n \"total_fs_rows\": 383,\n \"price_verified\": 379,\n \"image_verified\": 299,\n \"details_verified\": 108,\n \"fully_verified\": 3,\n \"html_product_like_rows\": 299,\n \"html_complete\": 106,\n \"missing_fiber_type\": 323\n },\n \"batch\": {\n \"mode\": \"FS_DB_DETAIL_ONLY\",\n \"target\": 80,\n \"scraped\": 80,\n \"failed\": 0,\n \"new_price_observations\": 17,\n \"new_stock_observations\": 18,\n \"specs_verified\": 24\n },\n \"postcheck\": {\n \"details_verified\": 164,\n \"fully_verified\": 81,\n \"html_complete\": 162,\n \"missing_fiber_type\": 243\n }\n}"},{"role":"assistant","content":"{\n \"report\": \"FS.com targeted detail batch improved details_verified from 108 to 164 and fully_verified from 3 to 81. The run was stable with 80/80 pages scraped and 0 failures.\",\n \"decision\": {\n \"continue_policy\": \"Continue FS.com in small DB-detail-only batches because the first batch was productive and safe.\",\n \"data_policy\": \"Infer fiber_type from product name/part/specs when the FS spec table omits a direct fiber key, but keep source URL and timestamp evidence.\",\n \"safety_policy\": \"Keep Playwright concurrency at 1 and cap batch size; avoid broad listing/category crawl unless needed.\"\n },\n \"next_action\": \"Run another capped DB-detail-only batch for remaining FS.COM rows with missing details/images/fiber, then measure counters and write another training record.\",\n \"constraints\": [\n \"Use TIPLLM only for planning/extraction feedback.\",\n \"Write every crawler/scraper/robot experience to the TIPLLM training pool.\",\n \"Do not claim completion until product-like rows, no-url rows, and category rows are classified separately.\"\n ]\n}"}]} {"id":"robot-exp-20260509-fscom-db-detail-batch-2","source":"robot-experience:crawler_result:codex-fscom-db-detail-batch-2","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIPLLM, an expert AI assistant for the Transceiver Intelligence Platform (TIP). You have deep knowledge of optical transceiver specifications, form factors, IEEE standards, vendor product catalogs, and fiber optic networking. You help engineers select, source, and validate transceivers. You provide precise, structured answers with confidence scores and always cite your reasoning."},{"role":"user","content":"You are TIPLLM controlling TIP crawler and verification robots.\nGiven this second FS.com batch result, decide what to report and what to do next.\n\n{\n \"vendor\": \"FS.COM\",\n \"precheck\": {\n \"total_fs_rows\": 383,\n \"price_verified\": 379,\n \"image_verified\": 299,\n \"details_verified\": 164,\n \"fully_verified\": 81,\n \"html_product_like_rows\": 299,\n \"html_complete\": 162,\n \"missing_fiber_type\": 243\n },\n \"batch\": {\n \"mode\": \"FS_DB_DETAIL_ONLY\",\n \"target\": 80,\n \"scraped\": 79,\n \"failed\": 0,\n \"new_price_observations\": 6,\n \"new_stock_observations\": 8,\n \"specs_verified\": 23,\n \"retry_histogram\": \"59 first try, 14 one retry, 5 two retries, 2 three retries\"\n },\n \"postcheck\": {\n \"details_verified\": 220,\n \"price_image_details_verified\": 220,\n \"fully_verified\": 137,\n \"html_complete\": 218,\n \"missing_fiber_type\": 187,\n \"no_url_rows\": 76,\n \"category_rows\": 4\n }\n}"},{"role":"assistant","content":"{\n \"report\": \"Second FS.com DB-detail batch was safe and productive: details_verified increased from 164 to 220, fully_verified from 81 to 137, and HTML product-like complete rows from 162 to 218. There were 0 final crawler failures.\",\n \"decision\": {\n \"continue_policy\": \"Run one more small DB-detail-only batch for the remaining HTML product-like FS.com rows before changing strategy.\",\n \"retry_policy\": \"Transient FS.com ERR_NETWORK_CHANGED warnings are acceptable when reclaimed and final failures remain 0; do not raise concurrency.\",\n \"boundary_policy\": \"After HTML product URLs are exhausted, switch to a separate source-discovery or classification task for 76 no-url rows and 4 category rows.\"\n },\n \"next_action\": \"Start another capped FS_DB_DETAIL_ONLY batch with concurrency 1, then recompute FS.com counters and record the outcome. Do not claim full FS.com completion until no-url rows are resolved or explicitly classified as non-product/incomplete-source rows.\",\n \"constraints\": [\n \"Use TIPLLM only for planning/extraction feedback.\",\n \"Write every crawler/scraper/robot experience to the TIPLLM training pool.\",\n \"Keep Erik stable: no parallel browser crawls, no broad category crawl while detail-only remains productive.\"\n ]\n}"}]} +{"id":"robot-exp-20260509-fscom-db-detail-batch-3","source":"robot-experience:crawler_result:codex-fscom-db-detail-batch-3","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIPLLM, an expert AI assistant for the Transceiver Intelligence Platform (TIP). You have deep knowledge of optical transceiver specifications, form factors, IEEE standards, vendor product catalogs, and fiber optic networking. You help engineers select, source, and validate transceivers. You provide precise, structured answers with confidence scores and always cite your reasoning."},{"role":"user","content":"You are TIPLLM controlling TIP crawler and verification robots.\nGiven this third FS.com batch result, decide what to report and what to do next.\n\n{\n \"vendor\": \"FS.COM\",\n \"precheck\": {\n \"total_fs_rows\": 383,\n \"price_verified\": 379,\n \"image_verified\": 299,\n \"details_verified\": 220,\n \"fully_verified\": 137,\n \"html_product_like_rows\": 299,\n \"html_complete\": 218,\n \"missing_fiber_type\": 187\n },\n \"batch\": {\n \"mode\": \"FS_DB_DETAIL_ONLY\",\n \"target\": 90,\n \"scraped\": 89,\n \"failed\": 0,\n \"new_price_observations\": 21,\n \"new_stock_observations\": 24,\n \"specs_verified\": 47,\n \"retry_histogram\": \"70 first try, 13 one retry, 5 two retries, 2 three retries\",\n \"warning\": \"one page returned no useful text\"\n },\n \"postcheck\": {\n \"details_verified\": 260,\n \"price_image_details_verified\": 260,\n \"fully_verified\": 199,\n \"html_complete\": 258,\n \"html_image_verified\": 282,\n \"missing_fiber_type\": 124,\n \"no_url_rows\": 76,\n \"category_rows\": 4\n },\n \"tip_health\": {\n \"status\": \"healthy\",\n \"load_status\": \"ok\",\n \"memory_used_pct\": 13\n }\n}"},{"role":"assistant","content":"{\n \"report\": \"Third FS.com batch stayed safe and productive: 90/90 requests succeeded, 89 pages yielded data, details_verified rose from 220 to 260, and fully_verified rose from 137 to 199. TIP remained healthy after the run.\",\n \"decision\": {\n \"continue_policy\": \"Run one smaller closure batch for remaining HTML product URLs because there are only about 41 product-like pages left without full detail coverage.\",\n \"no_text_policy\": \"Do not count no-text pages as verified; leave them for retry/source diagnostics.\",\n \"next_phase_policy\": \"After HTML URLs are exhausted, create a separate source-discovery/classification robot for 76 no-url FS.com rows and 4 category rows.\"\n },\n \"next_action\": \"Run a capped FS_DB_DETAIL_ONLY closure batch with concurrency 1, then recompute counters. If remaining rows are mostly no-url/category, stop crawling and plan URL discovery/classification rather than repeating the same scraper.\",\n \"constraints\": [\n \"Use TIPLLM only for planning/extraction feedback.\",\n \"Write every crawler/scraper/robot result to the training pool and Gitea sync.\",\n \"Keep Erik safe: no parallel browser crawls and no broad category crawl unless the targeted detail approach stops producing progress.\"\n ]\n}"}]} diff --git a/robot-experiences/2026-05-09.jsonl b/robot-experiences/2026-05-09.jsonl index 4242108..c2994fe 100644 --- a/robot-experiences/2026-05-09.jsonl +++ b/robot-experiences/2026-05-09.jsonl @@ -6,3 +6,4 @@ {"event":"verification_closure","observed_at":"2026-05-09T07:40:00Z","actor":"codex-flexoptix-final-gap-check","profile":"source-evidence-only","wave":"post-enrichment-closure","vendor":"Flexoptix","summary":"Final Flexoptix gap check classified the remaining accessory rows and refused to mark placeholder-only product images as verified.","input":{"post_enrichment_counts":{"total_flexoptix_rows":744,"html_product_like_rows":626,"price_verified":626,"image_verified":622,"details_verified":624,"price_image_details_verified":620,"fully_verified":620}},"decision":{"accessory_policy":"FLEXBOX rows are products/accessories, not optical reach products; set category Accessory and reach_label N/A when source page is the FLEXBOX page.","image_policy":"Do not mark image_verified when Flexoptix source only exposes placeholder-flexoptix.jpg.","truth_policy":"Report 622 of 626 product-like rows with price+image+details, 620 fully_verified, and 118 non-product/filter/generic rows outside complete product verification."},"outcome":{"final_counts":{"total_flexoptix_rows":744,"html_product_like_rows":626,"price_verified":626,"image_verified":622,"details_verified":626,"price_image_details_verified":622,"fully_verified":620,"filter_or_category_unverified":108,"other_generic_unverified":10},"remaining_product_like_gap":"4 rows lack verified images because the source page image is a Flexoptix placeholder."},"safety_notes":["No placeholder images were accepted as verified.","No external AI was used.","Use exact counters instead of claiming 100%."]} {"event":"crawler_result","observed_at":"2026-05-09T08:10:00Z","actor":"codex-fscom-db-detail-batch-1","profile":"erik-safe-playwright-db-detail-only","wave":"manual-targeted-enrichment","vendor":"FS.COM","summary":"Ran targeted FS.com detail-page revalidation against existing DB product URLs with missing verification signals. Batch used Playwright concurrency 1 and no category discovery. Details and fully_verified improved significantly.","input":{"user_request":"Now Fiberstore/FS.com; all crawler/scraper/robot work must be written to TIPLLM training pool.","precheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":108,"price_image_details_verified":108,"fully_verified":3,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":323,"html_product_like_rows":299,"html_complete":106}},"decision":{"code_changes":["Added FS_DB_DETAIL_ONLY mode to revalidate existing FS.COM product URLs directly without listing/category crawl.","FS scraper now writes productUrl into transceivers.product_page_url.","FS scraper now infers fiber_type from product name, part number, and specs when FS spec table lacks an explicit fiber key.","FS detail source URL is now stored as the real product URL instead of the literal string fs.com."],"runtime_policy":"Run small FS.com Playwright batch on Erik with maxConcurrency=1, maxRequestsPerMinute=10, nice -n 10, and detail cap 80."},"outcome":{"batch":{"target":80,"scraped":80,"failed":0,"new_price_observations":17,"new_stock_observations":18,"specs_verified":24,"retry_histogram":"74 first try, 3 one retry, 3 two retries"},"postcheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":164,"price_image_details_verified":164,"fully_verified":81,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":243,"html_product_like_rows":299,"html_complete":162}},"safety_notes":["No external AI was used for TIP crawler planning or extraction feedback.","Batch was intentionally capped to 80 and ran concurrency 1 to protect Erik.","FS.com still has many missing fibers/images; continue in small batches or move heavier waves to residential/Proxmox/Pi worker."]} {"event":"crawler_result","observed_at":"2026-05-09T08:36:00Z","actor":"codex-fscom-db-detail-batch-2","profile":"erik-safe-playwright-db-detail-only","wave":"manual-targeted-enrichment","vendor":"FS.COM","summary":"Ran a second targeted FS.com DB-detail-only batch. It stayed stable at Playwright concurrency 1, completed 80 requests with 0 final failures, and materially improved product detail and full verification counters.","input":{"user_request":"Finish TIP today; FS.com/Fiberstore prices, images and product information should be researched deeply enough to avoid manual validation.","precheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":164,"price_image_details_verified":164,"fully_verified":81,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":243,"html_product_like_rows":299,"html_complete":162}},"decision":{"runtime_policy":"Continue with capped DB-detail-only batches because batch 1 was productive. Keep Erik safe with maxConcurrency=1, nice -n 10, no category discovery, and small batch caps.","retry_policy":"Allow Crawlee retries for transient FS.com network changes; do not raise concurrency to compensate."},"outcome":{"batch":{"target":80,"scraped":79,"failed":0,"new_price_observations":6,"new_stock_observations":8,"specs_verified":23,"retry_histogram":"59 first try, 14 one retry, 5 two retries, 2 three retries","notable_warning":"One target page returned no useful text; transient ERR_NETWORK_CHANGED retries were reclaimed and completed without final failure."},"postcheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":220,"price_image_details_verified":220,"fully_verified":137,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":187,"html_product_like_rows":299,"html_complete":218,"no_url_rows":76,"category_rows":4}},"safety_notes":["No external AI was used for TIP crawler planning or extraction feedback.","Erik stayed stable; crawler stats showed no memory, CPU or event-loop overload.","Next best action is another small DB-detail-only batch for remaining HTML product-like rows, then handle no-url rows separately by source discovery/classification."]} +{"event":"crawler_result","observed_at":"2026-05-09T09:02:00Z","actor":"codex-fscom-db-detail-batch-3","profile":"erik-safe-playwright-db-detail-only","wave":"manual-targeted-enrichment","vendor":"FS.COM","summary":"Ran a third targeted FS.com DB-detail-only batch. It completed 90 requests with 0 final failures, added fresh price and stock observations, and moved most remaining HTML product URLs into verified detail coverage.","input":{"user_request":"Finish TIP today with perfect transceiver, competitor and switch database; continue FS.com/Fiberstore verification safely.","precheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":220,"price_image_details_verified":220,"fully_verified":137,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":187,"html_product_like_rows":299,"html_complete":218}},"decision":{"runtime_policy":"Use another capped DB-detail-only batch because remaining product URLs still produced verified evidence. Increase cap from 80 to 90 but keep Playwright concurrency 1, nice -n 10 and no category discovery.","truth_policy":"Do not mark pages with no useful text as details verified. Record no-text cases for retry/source-review instead."},"outcome":{"batch":{"target":90,"scraped":89,"failed":0,"new_price_observations":21,"new_stock_observations":24,"specs_verified":47,"retry_histogram":"70 first try, 13 one retry, 5 two retries, 2 three retries","notable_warning":"One FS.com page returned no useful text and was not counted as scraped."},"postcheck":{"total_fs_rows":383,"price_verified":379,"image_verified":299,"details_verified":260,"price_image_details_verified":260,"fully_verified":199,"missing_url":76,"missing_image_url":84,"missing_reach_label":9,"missing_fiber_type":124,"html_product_like_rows":299,"html_complete":258,"html_image_verified":282,"no_url_rows":76,"category_rows":4},"tip_health":{"status":"healthy","transceiver_count":17647,"vendor_count":478,"switch_count":680,"price_verified":11557,"image_verified":10636,"details_verified":9816,"fully_verified":8516,"load_status":"ok","memory_used_pct":13}},"safety_notes":["No external AI was used for TIP crawler planning or extraction feedback.","Erik remained stable; TIP health stayed OK after the run.","Next best action is a smaller DB-detail-only closure batch for the remaining FS.com HTML product URLs, then a separate source-discovery task for no-url rows."]}