fix: preserve explicit competitor states in reconcile
This commit is contained in:
parent
3926a1ef90
commit
0d4bcb6924
@ -2653,7 +2653,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
competitor_status = 'needs_research',
|
competitor_status = 'needs_research',
|
||||||
competitor_status_updated_at = NOW()
|
competitor_status_updated_at = NOW()
|
||||||
WHERE competitor_verified = true
|
WHERE competitor_verified = true
|
||||||
AND COALESCE(competitor_status, 'matched') != 'no_valid_match'
|
AND COALESCE(competitor_status, 'matched') NOT IN ('no_valid_match', 'ambiguous')
|
||||||
AND NOT EXISTS (
|
AND NOT EXISTS (
|
||||||
SELECT 1 FROM price_observations po
|
SELECT 1 FROM price_observations po
|
||||||
JOIN vendors v ON po.source_vendor_id = v.id
|
JOIN vendors v ON po.source_vendor_id = v.id
|
||||||
|
|||||||
@ -40,6 +40,21 @@ async function quarantine(): Promise<void> {
|
|||||||
OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)'
|
OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)'
|
||||||
OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/')
|
OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/')
|
||||||
OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com'))
|
OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com'))
|
||||||
|
OR (
|
||||||
|
v.name = 'GBICS'
|
||||||
|
AND (
|
||||||
|
t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)'
|
||||||
|
OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
OR (
|
||||||
|
v.name = 'T&S Communication'
|
||||||
|
AND (
|
||||||
|
COALESCE(t.product_page_url, '') !~* '/products/'
|
||||||
|
OR COALESCE(t.product_page_url, '') ~* '(\\.pdf$|/products/fiber-optic-transceivers/?$)'
|
||||||
|
OR t.part_number ~* '(^Transceiver$|Product-Brochure|^[0-9]+G(/[0-9]+G(/[0-9]+G)?)?-Transceiver$)'
|
||||||
|
)
|
||||||
|
)
|
||||||
OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$'
|
OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$'
|
||||||
OR t.category IN (
|
OR t.category IN (
|
||||||
'Accessory',
|
'Accessory',
|
||||||
@ -67,6 +82,21 @@ async function quarantine(): Promise<void> {
|
|||||||
OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)'
|
OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)'
|
||||||
OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/')
|
OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/')
|
||||||
OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com'))
|
OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com'))
|
||||||
|
OR (
|
||||||
|
v.name = 'GBICS'
|
||||||
|
AND (
|
||||||
|
t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)'
|
||||||
|
OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
OR (
|
||||||
|
v.name = 'T&S Communication'
|
||||||
|
AND (
|
||||||
|
COALESCE(t.product_page_url, '') !~* '/products/'
|
||||||
|
OR COALESCE(t.product_page_url, '') ~* '(\\.pdf$|/products/fiber-optic-transceivers/?$)'
|
||||||
|
OR t.part_number ~* '(^Transceiver$|Product-Brochure|^[0-9]+G(/[0-9]+G(/[0-9]+G)?)?-Transceiver$)'
|
||||||
|
)
|
||||||
|
)
|
||||||
OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$'
|
OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$'
|
||||||
OR t.category IN (
|
OR t.category IN (
|
||||||
'Accessory',
|
'Accessory',
|
||||||
|
|||||||
@ -1,9 +1,42 @@
|
|||||||
# Current TIP Sync State
|
# Current TIP Sync State
|
||||||
|
|
||||||
Updated: 2026-05-09 22:05 UTC
|
Updated: 2026-05-09 22:15 UTC
|
||||||
|
|
||||||
## Newest Work
|
## Newest Work
|
||||||
|
|
||||||
|
- TIP verification continuation on 2026-05-09:
|
||||||
|
- expanded deterministic non-transceiver quarantine for GBICS and T&S Communication artifacts
|
||||||
|
- live quarantine result:
|
||||||
|
- `93` additional artifacts moved out of the active transceiver base
|
||||||
|
- `verify:quarantine:non-transceivers` evidence count: `93`
|
||||||
|
- current vendor gaps after cleanup:
|
||||||
|
- GBICS: `88` active rows, `17` missing price, `17` missing image, `17` missing details
|
||||||
|
- T&S Communication: `36` active rows, `36` missing price, `36` missing image, `6` missing details
|
||||||
|
- 10Gtek: `175` active rows, `126` missing price, `131` missing image, `25` missing details
|
||||||
|
- fixed `maintenance:reconcile-verification`:
|
||||||
|
- preserve explicit `competitor_status IN ('no_valid_match', 'ambiguous')`
|
||||||
|
- do not reset deliberate research outcomes back to `needs_research`
|
||||||
|
- deployed to Erik:
|
||||||
|
- `packages/scraper/src/scheduler.ts`
|
||||||
|
- `packages/scraper/src/utils/quarantine-non-transceivers.ts`
|
||||||
|
- remote build passed
|
||||||
|
- `tip-scraper-daemon` restarted only after the pg-boss active queue was empty
|
||||||
|
- restored competitor states after previous reconcile regression:
|
||||||
|
- dry-run candidates: `615`
|
||||||
|
- apply wrote `74` `no_valid_match`, `541` `ambiguous`, `74` `fully_verified_earned`
|
||||||
|
- fresh reconcile test completed successfully after the fix
|
||||||
|
- live health after reconcile test:
|
||||||
|
- active products: `17212`
|
||||||
|
- price verified: `11414`
|
||||||
|
- image verified: `12016`
|
||||||
|
- details verified: `16702`
|
||||||
|
- fully verified: `10449`
|
||||||
|
- competitor status: `matched=10775`, `no_valid_match=74`, `ambiguous=556`, `needs_research=5807`
|
||||||
|
- fully product-verified rows still in competitor `needs_research`: `0`
|
||||||
|
- TIPLLM training pool updated with:
|
||||||
|
- reconcile must preserve explicit competitor research states
|
||||||
|
- GBICS/T&S artifact quarantine rules
|
||||||
|
|
||||||
- MAGATAMA multi-LLM training lane expansion on 2026-05-09:
|
- MAGATAMA multi-LLM training lane expansion on 2026-05-09:
|
||||||
- added first-class training lanes for:
|
- added first-class training lanes for:
|
||||||
- `pulso_llm`
|
- `pulso_llm`
|
||||||
|
|||||||
@ -91,3 +91,48 @@ Interpretation:
|
|||||||
- Build alias/dedupe resolvers for GBICS, T&S, and 10Gtek similar to FS.com numeric SKU alias cleanup.
|
- Build alias/dedupe resolvers for GBICS, T&S, and 10Gtek similar to FS.com numeric SKU alias cleanup.
|
||||||
- Repair GBICS/T&S price selectors against current page HTML.
|
- Repair GBICS/T&S price selectors against current page HTML.
|
||||||
- For OEM vendors such as Cisco/Juniper, add explicit non-public-price states instead of pretending OEM list prices exist.
|
- For OEM vendors such as Cisco/Juniper, add explicit non-public-price states instead of pretending OEM list prices exist.
|
||||||
|
|
||||||
|
## Follow-Up: Artifact Quarantine + Reconcile Guard
|
||||||
|
|
||||||
|
GBICS/T&S artifact quarantine was expanded and applied live.
|
||||||
|
|
||||||
|
Result:
|
||||||
|
|
||||||
|
- `93` additional non-transceiver artifacts quarantined with `artifact_quarantine` evidence
|
||||||
|
- GBICS active base moved from `135` rows with `64/64/64` price/image/details gaps to `88` rows with `17/17/17` gaps
|
||||||
|
- T&S Communication active base moved from `82` rows with `82/82/49` price/image/details gaps to `36` rows with `36/36/6` gaps
|
||||||
|
- 10Gtek current active gap remains `175` rows with `126/131/25` price/image/details gaps
|
||||||
|
|
||||||
|
Important regression found:
|
||||||
|
|
||||||
|
- `maintenance:reconcile-verification` reset explicit competitor research states back to `needs_research`
|
||||||
|
- fixed scheduler guard so reconcile preserves both `no_valid_match` and `ambiguous`
|
||||||
|
- deployed to Erik, built successfully, and restarted `tip-scraper-daemon` only after pg-boss active queue was empty
|
||||||
|
|
||||||
|
Restore run:
|
||||||
|
|
||||||
|
- dry-run found `615` fully product-verified rows that had fallen back into competitor `needs_research`
|
||||||
|
- apply wrote:
|
||||||
|
- `74` `no_valid_match`
|
||||||
|
- `541` `ambiguous`
|
||||||
|
- `74` newly earned or restored `fully_verified`
|
||||||
|
- live evidence ledger now includes:
|
||||||
|
- `74` `competitor_no_match` from `verify:open-competitor-status`
|
||||||
|
- `555` `competitor_ambiguous` from `verify:open-competitor-status`
|
||||||
|
- `93` `artifact_quarantine` from `verify:quarantine:non-transceivers`
|
||||||
|
- `109` `artifact_quarantine` from `verify:fs:sku-aliases`
|
||||||
|
|
||||||
|
Verification after a fresh reconcile test:
|
||||||
|
|
||||||
|
- health status: healthy
|
||||||
|
- active products: `17212`
|
||||||
|
- price verified: `11414`
|
||||||
|
- image verified: `12016`
|
||||||
|
- details verified: `16702`
|
||||||
|
- fully verified: `10449`
|
||||||
|
- competitor status:
|
||||||
|
- `matched=10775`
|
||||||
|
- `no_valid_match=74`
|
||||||
|
- `ambiguous=556`
|
||||||
|
- `needs_research=5807`
|
||||||
|
- fully product-verified rows still in competitor `needs_research`: `0`
|
||||||
|
|||||||
@ -45,3 +45,5 @@
|
|||||||
{"id":"tip-20260509-open-competitor-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Close research queues with explicit evidence-backed states, not fake green checks."},{"role":"user","content":"What should TIP do with products that have price, image, and details verified but still sit in competitor needs_research?"},{"role":"assistant","content":"Run an open competitor status resolver. If no strict source-backed candidate and no active equivalence hint exists, set `competitor_status=no_valid_match`, set competitor_verified true, write `competitor_no_match` evidence, and recalculate fully_verified. If candidates or equivalence hints exist but no safe deterministic 1:1 match can be proven, set `competitor_status=ambiguous` and write `competitor_ambiguous` evidence. Do not leave fully populated rows in endless `needs_research`, and do not auto-match when the candidate set is broad or technically risky."}]}
|
{"id":"tip-20260509-open-competitor-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Close research queues with explicit evidence-backed states, not fake green checks."},{"role":"user","content":"What should TIP do with products that have price, image, and details verified but still sit in competitor needs_research?"},{"role":"assistant","content":"Run an open competitor status resolver. If no strict source-backed candidate and no active equivalence hint exists, set `competitor_status=no_valid_match`, set competitor_verified true, write `competitor_no_match` evidence, and recalculate fully_verified. If candidates or equivalence hints exist but no safe deterministic 1:1 match can be proven, set `competitor_status=ambiguous` and write `competitor_ambiguous` evidence. Do not leave fully populated rows in endless `needs_research`, and do not auto-match when the candidate set is broad or technically risky."}]}
|
||||||
{"id":"tip-20260509-store-scraper-gap-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Treat scraper runs that find products but do not improve active verification as diagnostic evidence."},{"role":"user","content":"GBICS and T&S scrapers found many products but active TIP verification counts did not improve. What does that mean?"},{"role":"assistant","content":"It means the crawler is not simply dead; the remaining problem is likely row identity alignment, alias/dedupe, or price/image selector drift. For GBICS, discovery found 758 products and zero prices, while active gaps stayed at 64 price/image/details rows. For T&S, discovery found 109 unique products and zero prices, while active gaps stayed at 82 price/image and 49 details rows. Next steps are to repair price selectors and build alias/dedupe resolvers that map scraped product names/URLs to existing active rows before claiming verification progress."}]}
|
{"id":"tip-20260509-store-scraper-gap-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Treat scraper runs that find products but do not improve active verification as diagnostic evidence."},{"role":"user","content":"GBICS and T&S scrapers found many products but active TIP verification counts did not improve. What does that mean?"},{"role":"assistant","content":"It means the crawler is not simply dead; the remaining problem is likely row identity alignment, alias/dedupe, or price/image selector drift. For GBICS, discovery found 758 products and zero prices, while active gaps stayed at 64 price/image/details rows. For T&S, discovery found 109 unique products and zero prices, while active gaps stayed at 82 price/image and 49 details rows. Next steps are to repair price selectors and build alias/dedupe resolvers that map scraped product names/URLs to existing active rows before claiming verification progress."}]}
|
||||||
{"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]}
|
{"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]}
|
||||||
|
{"id":"tip-20260509-reconcile-preserve-explicit-competitor-states-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Preserve evidence-backed verification states during maintenance jobs."},{"role":"user","content":"A maintenance reconcile resets competitor_verified rows with no recent non-Flexoptix price. What states must it preserve?"},{"role":"assistant","content":"Preserve explicit research outcomes. `competitor_status=no_valid_match` and `competitor_status=ambiguous` are deliberate evidence-backed states and must not be reset to `needs_research` by a generic price freshness reconcile. Reconcile may reset stale `matched` rows when their required evidence disappears, but it must keep no-valid-match and ambiguous rows intact, then verify that fully product-verified rows left in `needs_research` stay at zero."}]}
|
||||||
|
{"id":"tip-20260509-gbics-ts-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active transceiver counts honest by quarantining vendor navigation, category, PDF, cable, and infrastructure artifacts."},{"role":"user","content":"What did the GBICS and T&S cleanup teach the crawler robots?"},{"role":"assistant","content":"GBICS rows such as category labels, patch leads, active optical cables, direct-attach cables, Choose By pages, cart pages, vendor pages, NVIDIA/ConnectX/Cisco/WatchGuard/HP pages, network adapters, media converters, AOC/DAC and MTP/MPO cable pages are not active transceiver SKUs. T&S rows with non-/products URLs, PDFs, generic fiber-optic-transceivers family URLs, Product-Brochure names, or broad 100G/200G/400G-Transceiver family labels are artifacts. Quarantine these outside the active verification base, clear verification flags, and write `artifact_quarantine` evidence before recalculating health."}]}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user