diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 9b37d92..9089fa3 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -2653,7 +2653,7 @@ export async function registerWorkers(boss: PgBoss): Promise { competitor_status = 'needs_research', competitor_status_updated_at = NOW() WHERE competitor_verified = true - AND COALESCE(competitor_status, 'matched') != 'no_valid_match' + AND COALESCE(competitor_status, 'matched') NOT IN ('no_valid_match', 'ambiguous') AND NOT EXISTS ( SELECT 1 FROM price_observations po JOIN vendors v ON po.source_vendor_id = v.id diff --git a/packages/scraper/src/utils/quarantine-non-transceivers.ts b/packages/scraper/src/utils/quarantine-non-transceivers.ts index 218035f..5a625d3 100644 --- a/packages/scraper/src/utils/quarantine-non-transceivers.ts +++ b/packages/scraper/src/utils/quarantine-non-transceivers.ts @@ -40,6 +40,21 @@ async function quarantine(): Promise { OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)' OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/') OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com')) + OR ( + v.name = 'GBICS' + AND ( + t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)' + OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)' + ) + ) + OR ( + v.name = 'T&S Communication' + AND ( + COALESCE(t.product_page_url, '') !~* '/products/' + OR COALESCE(t.product_page_url, '') ~* '(\\.pdf$|/products/fiber-optic-transceivers/?$)' + OR t.part_number ~* '(^Transceiver$|Product-Brochure|^[0-9]+G(/[0-9]+G(/[0-9]+G)?)?-Transceiver$)' + ) + ) OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$' OR t.category IN ( 'Accessory', @@ -67,6 +82,21 @@ async function quarantine(): Promise { OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)' OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/') OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com')) + OR ( + v.name = 'GBICS' + AND ( + t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)' + OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)' + ) + ) + OR ( + v.name = 'T&S Communication' + AND ( + COALESCE(t.product_page_url, '') !~* '/products/' + OR COALESCE(t.product_page_url, '') ~* '(\\.pdf$|/products/fiber-optic-transceivers/?$)' + OR t.part_number ~* '(^Transceiver$|Product-Brochure|^[0-9]+G(/[0-9]+G(/[0-9]+G)?)?-Transceiver$)' + ) + ) OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$' OR t.category IN ( 'Accessory', diff --git a/sync/CURRENT.md b/sync/CURRENT.md index 2cd663e..108d515 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,42 @@ # Current TIP Sync State -Updated: 2026-05-09 22:05 UTC +Updated: 2026-05-09 22:15 UTC ## Newest Work +- TIP verification continuation on 2026-05-09: + - expanded deterministic non-transceiver quarantine for GBICS and T&S Communication artifacts + - live quarantine result: + - `93` additional artifacts moved out of the active transceiver base + - `verify:quarantine:non-transceivers` evidence count: `93` + - current vendor gaps after cleanup: + - GBICS: `88` active rows, `17` missing price, `17` missing image, `17` missing details + - T&S Communication: `36` active rows, `36` missing price, `36` missing image, `6` missing details + - 10Gtek: `175` active rows, `126` missing price, `131` missing image, `25` missing details + - fixed `maintenance:reconcile-verification`: + - preserve explicit `competitor_status IN ('no_valid_match', 'ambiguous')` + - do not reset deliberate research outcomes back to `needs_research` + - deployed to Erik: + - `packages/scraper/src/scheduler.ts` + - `packages/scraper/src/utils/quarantine-non-transceivers.ts` + - remote build passed + - `tip-scraper-daemon` restarted after pg-boss queue was empty + - restored competitor states after previous reconcile regression: + - dry-run candidates: `615` + - apply wrote `74` `no_valid_match`, `541` `ambiguous`, `74` `fully_verified_earned` + - fresh reconcile test completed successfully after the fix + - live health after reconcile test: + - active products: `17212` + - price verified: `11414` + - image verified: `12016` + - details verified: `16702` + - fully verified: `10449` + - competitor status: `matched=10775`, `no_valid_match=74`, `ambiguous=556`, `needs_research=5807` + - fully product-verified rows still in competitor `needs_research`: `0` + - TIPLLM training pool updated with: + - reconcile must preserve explicit competitor research states + - GBICS/T&S artifact quarantine rules + - MAGATAMA multi-LLM training lane expansion on 2026-05-09: - added first-class training lanes for: - `pulso_llm` diff --git a/sync/history/2026-05-09-tip-open-competitor-and-product-gap-probing.md b/sync/history/2026-05-09-tip-open-competitor-and-product-gap-probing.md index b541bf2..d7b4d79 100644 --- a/sync/history/2026-05-09-tip-open-competitor-and-product-gap-probing.md +++ b/sync/history/2026-05-09-tip-open-competitor-and-product-gap-probing.md @@ -91,3 +91,48 @@ Interpretation: - Build alias/dedupe resolvers for GBICS, T&S, and 10Gtek similar to FS.com numeric SKU alias cleanup. - Repair GBICS/T&S price selectors against current page HTML. - For OEM vendors such as Cisco/Juniper, add explicit non-public-price states instead of pretending OEM list prices exist. + +## Follow-Up: Artifact Quarantine + Reconcile Guard + +GBICS/T&S artifact quarantine was expanded and applied live. + +Result: + +- `93` additional non-transceiver artifacts quarantined with `artifact_quarantine` evidence +- GBICS active base moved from `135` rows with `64/64/64` price/image/details gaps to `88` rows with `17/17/17` gaps +- T&S Communication active base moved from `82` rows with `82/82/49` price/image/details gaps to `36` rows with `36/36/6` gaps +- 10Gtek current active gap remains `175` rows with `126/131/25` price/image/details gaps + +Important regression found: + +- `maintenance:reconcile-verification` reset explicit competitor research states back to `needs_research` +- fixed scheduler guard so reconcile preserves both `no_valid_match` and `ambiguous` +- deployed to Erik, built successfully, and restarted `tip-scraper-daemon` only after pg-boss active queue was empty + +Restore run: + +- dry-run found `615` fully product-verified rows that had fallen back into competitor `needs_research` +- apply wrote: + - `74` `no_valid_match` + - `541` `ambiguous` + - `74` newly/restored `fully_verified` +- live evidence ledger now includes: + - `74` `competitor_no_match` from `verify:open-competitor-status` + - `555` `competitor_ambiguous` from `verify:open-competitor-status` + - `93` `artifact_quarantine` from `verify:quarantine:non-transceivers` + - `109` `artifact_quarantine` from `verify:fs:sku-aliases` + +Verification after a fresh reconcile test: + +- health status: healthy +- active products: `17212` +- price verified: `11414` +- image verified: `12016` +- details verified: `16702` +- fully verified: `10449` +- competitor status: + - `matched=10775` + - `no_valid_match=74` + - `ambiguous=556` + - `needs_research=5807` +- fully product-verified rows still in competitor `needs_research`: `0` diff --git a/training-data/tip-llm-capabilities-v1.jsonl b/training-data/tip-llm-capabilities-v1.jsonl index 01ebf5a..96f669e 100644 --- a/training-data/tip-llm-capabilities-v1.jsonl +++ b/training-data/tip-llm-capabilities-v1.jsonl @@ -45,3 +45,5 @@ {"id":"tip-20260509-open-competitor-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Close research queues with explicit evidence-backed states, not fake green checks."},{"role":"user","content":"What should TIP do with products that have price, image, and details verified but still sit in competitor needs_research?"},{"role":"assistant","content":"Run an open competitor status resolver. If no strict source-backed candidate and no active equivalence hint exists, set `competitor_status=no_valid_match`, set competitor_verified true, write `competitor_no_match` evidence, and recalculate fully_verified. If candidates or equivalence hints exist but no safe deterministic 1:1 match can be proven, set `competitor_status=ambiguous` and write `competitor_ambiguous` evidence. Do not leave fully populated rows in endless `needs_research`, and do not auto-match when the candidate set is broad or technically risky."}]} {"id":"tip-20260509-store-scraper-gap-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Treat scraper runs that find products but do not improve active verification as diagnostic evidence."},{"role":"user","content":"GBICS and T&S scrapers found many products but active TIP verification counts did not improve. What does that mean?"},{"role":"assistant","content":"It means the crawler is not simply dead; the remaining problem is likely row identity alignment, alias/dedupe, or price/image selector drift. For GBICS, discovery found 758 products and zero prices, while active gaps stayed at 64 price/image/details rows. For T&S, discovery found 109 unique products and zero prices, while active gaps stayed at 82 price/image and 49 details rows. Next steps are to repair price selectors and build alias/dedupe resolvers that map scraped product names/URLs to existing active rows before claiming verification progress."}]} {"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]} +{"id":"tip-20260509-reconcile-preserve-explicit-competitor-states-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Preserve evidence-backed verification states during maintenance jobs."},{"role":"user","content":"A maintenance reconcile resets competitor_verified rows with no recent non-Flexoptix price. What states must it preserve?"},{"role":"assistant","content":"Preserve explicit research outcomes. `competitor_status=no_valid_match` and `competitor_status=ambiguous` are deliberate evidence-backed states and must not be reset to `needs_research` by a generic price freshness reconcile. Reconcile may reset stale `matched` rows when their required evidence disappears, but it must keep no-valid-match and ambiguous rows intact, then verify that fully product-verified rows left in `needs_research` stay at zero."}]} +{"id":"tip-20260509-gbics-ts-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active transceiver counts honest by quarantining vendor navigation, category, PDF, cable, and infrastructure artifacts."},{"role":"user","content":"What did the GBICS and T&S cleanup teach the crawler robots?"},{"role":"assistant","content":"GBICS rows such as category labels, patch leads, active optical cables, direct-attach cables, Choose By pages, cart pages, vendor pages, NVIDIA/ConnectX/Cisco/WatchGuard/HP pages, network adapters, media converters, AOC/DAC and MTP/MPO cable pages are not active transceiver SKUs. T&S rows with non-/products URLs, PDFs, generic fiber-optic-transceivers family URLs, Product-Brochure names, or broad 100G/200G/400G-Transceiver family labels are artifacts. Quarantine these outside the active verification base, clear verification flags, and write `artifact_quarantine` evidence before recalculating health."}]}