diff --git a/packages/scraper/package.json b/packages/scraper/package.json index 9584f9d..4e3f1a2 100644 --- a/packages/scraper/package.json +++ b/packages/scraper/package.json @@ -16,6 +16,7 @@ "verify:catalog:details": "tsx src/utils/verify-catalog-details.ts", "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", + "verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts", "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", "scrape:optcore": "tsx src/scrapers/optcore.ts", "scrape:news": "tsx src/scrapers/news.ts", diff --git a/packages/scraper/src/utils/resolve-no-valid-competitor.ts b/packages/scraper/src/utils/resolve-no-valid-competitor.ts new file mode 100644 index 0000000..cd3a1a8 --- /dev/null +++ b/packages/scraper/src/utils/resolve-no-valid-competitor.ts @@ -0,0 +1,200 @@ +/** + * No-valid-competitor resolver. + * + * This is intentionally conservative. It only resolves products that already + * have price/image/details evidence and where TIP cannot find any technically + * strict, source-backed competitor candidate. + * + * Default is dry-run. Set NO_VALID_MATCH_APPLY=1 to write status/evidence. + */ +import { pool, checkAndSetFullyVerified, recordVerificationEvidence } from "./db"; +import { logger } from "./logger"; + +const EXCLUDED_CATEGORIES = [ + "NonTransceiver", + "Accessory", + "Adapter / Converter", + "Switch / Media Converter", + "Switch / Network Infrastructure", + "NIC / Adapter", + "Mux / Passive Optical", + "Product Family", + "Loopback / Test Module", +]; + +type Candidate = { + id: string; + part_number: string; + vendor_name: string; + form_factor: string; + speed_gbps: string; + reach_meters: string | null; + fiber_type: string | null; + candidate_count: string; + pending_equivalence_count: string; +}; + +function applyMode(): boolean { + return process.env.NO_VALID_MATCH_APPLY === "1"; +} + +async function resolveNoValidCompetitors(): Promise { + const limit = Math.max(1, parseInt(process.env.NO_VALID_MATCH_LIMIT || "500", 10)); + const vendorFilter = process.env.NO_VALID_MATCH_VENDOR || "Flexoptix"; + const apply = applyMode(); + + logger.info("=== No-valid-competitor resolver ===", { limit, vendorFilter, apply }); + + const result = await pool.query(` + WITH active AS ( + SELECT t.*, v.name AS vendor_name + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE COALESCE(t.data_confidence, 'unknown') != 'garbage' + AND COALESCE(t.product_page_url, '') NOT LIKE '%/category/%' + AND COALESCE(t.category, '') <> ALL($1::text[]) + ), + ready AS ( + SELECT a.* + FROM active a + WHERE a.price_verified = true + AND a.vendor_name ILIKE $2 + AND a.image_verified = true + AND a.details_verified = true + AND COALESCE(a.competitor_verified, false) = false + AND COALESCE(a.competitor_status, 'needs_research') IN ('unknown', 'needs_research', 'ambiguous') + AND a.form_factor IS NOT NULL + AND a.form_factor != '' + AND a.speed_gbps IS NOT NULL + AND a.speed_gbps > 0 + AND a.reach_meters IS NOT NULL + AND a.reach_meters > 0 + AND a.fiber_type IS NOT NULL + AND a.fiber_type != '' + ), + scored AS ( + SELECT + r.id, + r.part_number, + r.vendor_name, + r.form_factor, + r.speed_gbps, + r.reach_meters, + r.fiber_type, + ( + SELECT COUNT(*) + FROM active c + WHERE c.id != r.id + AND c.vendor_id != r.vendor_id + AND c.price_verified = true + AND c.image_verified = true + AND c.details_verified = true + AND c.form_factor = r.form_factor + AND c.speed_gbps = r.speed_gbps + AND UPPER(COALESCE(c.fiber_type, '')) = UPPER(COALESCE(r.fiber_type, '')) + AND c.reach_meters IS NOT NULL + AND c.reach_meters > 0 + AND ABS(c.reach_meters - r.reach_meters) <= GREATEST(25, r.reach_meters * 0.05) + AND ( + COALESCE(NULLIF(regexp_replace(COALESCE(c.wavelengths, ''), '^.*?(\\d{3,4}).*$', '\\1'), COALESCE(c.wavelengths, '')), '') = '' + OR COALESCE(NULLIF(regexp_replace(COALESCE(r.wavelengths, ''), '^.*?(\\d{3,4}).*$', '\\1'), COALESCE(r.wavelengths, '')), '') = '' + OR ABS( + NULLIF(regexp_replace(COALESCE(c.wavelengths, ''), '^.*?(\\d{3,4}).*$', '\\1'), COALESCE(c.wavelengths, ''))::int + - NULLIF(regexp_replace(COALESCE(r.wavelengths, ''), '^.*?(\\d{3,4}).*$', '\\1'), COALESCE(r.wavelengths, ''))::int + ) <= 15 + ) + ) AS candidate_count, + ( + SELECT COUNT(*) + FROM transceiver_equivalences eq + WHERE eq.flexoptix_id = r.id + AND eq.status IN ('pending', 'approved', 'auto_approved') + AND eq.confidence >= 0.50 + ) AS pending_equivalence_count + FROM ready r + ) + SELECT * + FROM scored + WHERE candidate_count = 0 + AND pending_equivalence_count = 0 + ORDER BY vendor_name, part_number + LIMIT $3 + `, [EXCLUDED_CATEGORIES, `%${vendorFilter}%`, limit]); + + logger.info("No-valid-match candidates", { count: result.rowCount ?? 0 }); + + if (!apply) { + for (const row of result.rows.slice(0, 30)) { + logger.info("dry-run candidate", { + vendor: row.vendor_name, + partNumber: row.part_number, + formFactor: row.form_factor, + speedGbps: row.speed_gbps, + reachMeters: row.reach_meters, + fiberType: row.fiber_type, + }); + } + return; + } + + let updated = 0; + let fullyVerifiedEarned = 0; + for (const row of result.rows) { + const reason = [ + "Strict deterministic no-valid-match resolver found no source-backed competitor candidate", + `same form_factor=${row.form_factor}`, + `speed_gbps=${row.speed_gbps}`, + `fiber_type=${row.fiber_type}`, + `reach tolerance=max(25m,5%) around ${row.reach_meters}m`, + "and no pending/approved equivalence above confidence 0.50", + ].join("; "); + + const update = await pool.query(` + UPDATE transceivers + SET competitor_verified = true, + competitor_verified_at = NOW(), + competitor_status = 'no_valid_match', + competitor_status_updated_at = NOW(), + no_match_verified_at = NOW(), + no_match_reason = $2, + updated_at = NOW() + WHERE id = $1 + AND COALESCE(competitor_verified, false) = false + RETURNING id + `, [row.id, reason]); + + if ((update.rowCount ?? 0) === 0) continue; + + await recordVerificationEvidence({ + transceiverId: row.id, + verificationType: "competitor_no_match", + evidenceValue: { + reason, + vendor: row.vendor_name, + partNumber: row.part_number, + candidateCount: Number(row.candidate_count), + pendingEquivalenceCount: Number(row.pending_equivalence_count), + }, + robotName: "verify:resolve-no-valid-competitor", + confidence: 1, + }); + + if (await checkAndSetFullyVerified(row.id)) fullyVerifiedEarned++; + updated++; + } + + logger.info("No-valid-competitor resolver complete", { + updated, + fully_verified_earned: fullyVerifiedEarned, + }); +} + +if (require.main === module) { + resolveNoValidCompetitors() + .then(() => pool.end()) + .catch((err) => { + logger.error("No-valid-competitor resolver failed", { error: (err as Error).message }); + pool.end(); + process.exit(1); + }); +} diff --git a/sync/CURRENT.md b/sync/CURRENT.md index 48994ef..ec9024d 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,48 @@ # Current TIP Sync State -Updated: 2026-05-09 21:00 UTC +Updated: 2026-05-09 21:13 UTC ## Newest Work +- TIP no-valid-competitor resolver on 2026-05-09: + - added `packages/scraper/src/utils/resolve-no-valid-competitor.ts` + - script: `pnpm -C packages/scraper run verify:no-valid-competitor` + - default mode is dry-run + - apply mode requires `NO_VALID_MATCH_APPLY=1` + - default vendor scope is `NO_VALID_MATCH_VENDOR=Flexoptix` + - purpose: + - close products that already have price, image, and details evidence + - only resolve competitor verification when there is no strict source-backed 1:1 competitor candidate + - avoid fake competitor matches for uncommon Flexoptix products + - conservative gates: + - active transceiver only; excludes known artifact/non-transceiver categories + - source-backed `price_verified`, `image_verified`, and `details_verified` required + - same-vendor candidates ignored; only other vendors count + - strict candidate match requires same form factor, same speed, same fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it + - no pending/approved equivalence above confidence `0.50` + - live Erik run: + - dry-run with Flexoptix scope found `73` no-valid-match candidates + - apply run updated `73` + - `73` additional products earned `fully_verified` + - evidence ledger wrote `73` `competitor_no_match` records + - live health after run: + - active products: `17414` + - price verified: `11523` + - image verified: `12125` + - details verified: `16814` + - fully verified: `10831` + - active competitor status: + - `matched=11158` + - `no_valid_match=73` + - `ambiguous=0` + - `needs_research=6183` + - operational note: + - `tip-scraper-daemon` was not restarted during active pricing jobs + - restart daemon only after active pg-boss jobs finish, then run reconcile/matcher again + - TIPLLM training pool: + - appended deterministic no-valid-match resolver lessons + - JSONL must remain valid after every append + - TIP verification truth model on 2026-05-09: - implemented migration `sql/103-verification-evidence-and-competitor-status.sql` - adds `transceivers.competitor_status` diff --git a/sync/history/2026-05-09-tip-no-valid-match-resolver.md b/sync/history/2026-05-09-tip-no-valid-match-resolver.md new file mode 100644 index 0000000..3967a16 --- /dev/null +++ b/sync/history/2026-05-09-tip-no-valid-match-resolver.md @@ -0,0 +1,69 @@ +# 2026-05-09 — TIP No-Valid-Match Resolver + +## Operator Requirement + +- Continue TIP verification until source-backed work is exhausted. +- Do not invent competitor matches just to turn products green. +- Use deterministic TIP robots/scrapers only. +- Keep Erik safe; do not restart crawler daemons while active jobs are running. +- Write crawler/scraper/robot learnings into the TIPLLM training pool. + +## Change + +Added `packages/scraper/src/utils/resolve-no-valid-competitor.ts` and package script: + +```bash +pnpm -C packages/scraper run verify:no-valid-competitor +``` + +The resolver is dry-run by default. Applying changes requires: + +```bash +NO_VALID_MATCH_APPLY=1 pnpm -C packages/scraper run verify:no-valid-competitor +``` + +The default vendor scope is Flexoptix: + +```bash +NO_VALID_MATCH_VENDOR=Flexoptix +``` + +## Rules + +A product can be resolved as `competitor_status='no_valid_match'` only when: + +- the product is active and not in a known artifact category +- price, image, and details are already verified +- competitor verification is still unresolved +- no other vendor has a source-backed strict candidate with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it +- no pending/approved equivalence above confidence `0.50` already exists + +The robot writes `competitor_no_match` evidence and then calls `checkAndSetFullyVerified`. + +## Live Result + +On Erik: + +- dry-run Flexoptix scope found `73` candidates +- apply run updated `73` +- `73` products earned `fully_verified` +- evidence ledger now has `73` `competitor_no_match` records + +Live health after the run: + +- active products: `17414` +- price verified: `11523` +- image verified: `12125` +- details verified: `16814` +- fully verified: `10831` +- competitor status: + - `matched=11158` + - `no_valid_match=73` + - `ambiguous=0` + - `needs_research=6183` + +## Operational Notes + +- `tip-scraper-daemon` was not restarted because pricing jobs were active. +- Restart only after active pg-boss jobs complete, then queue reconcile and matcher. +- This is the correct pattern for rare products: explicit no-match evidence beats false 1:1 matches. diff --git a/training-data/tip-llm-capabilities-v1.jsonl b/training-data/tip-llm-capabilities-v1.jsonl index 56e2520..0dcaa6b 100644 --- a/training-data/tip-llm-capabilities-v1.jsonl +++ b/training-data/tip-llm-capabilities-v1.jsonl @@ -38,3 +38,5 @@ {"id":"tip-20260509-flex-fs-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Report verification status precisely and separate product-data completeness from competitor-match completeness."},{"role":"user","content":"After the 2026-05-09 cleanup, what is the correct interpretation of Flexoptix and FS.com verification status?"},{"role":"assistant","content":"For active Flexoptix and FS.com products, price, image, and details are complete after removing category, redirect, cable, support, and non-FS artifacts from the active base. Remaining not-fully-verified rows are due to competitor_verified=false, not missing product price/image/details. Do not say the whole vendor is 100% fully verified until 1:1 competitor equivalence exists or the absence of a valid competitor is explicitly represented as a verified no-match state."}]} {"id":"tip-20260509-competitor-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Model verification as an evidence-backed state machine, not a single green checkbox."},{"role":"user","content":"How should TIP represent competitor verification when no valid 1:1 competitor product exists?"},{"role":"assistant","content":"Use a distinct competitor_status value. `matched` means a source-backed 1:1 competitor exists. `no_valid_match` means research verified that no valid public 1:1 competitor is available; this can still resolve the competitor requirement without inventing a match. `ambiguous` means candidates exist but are technically risky or below approval threshold. `needs_research` means the robot has not resolved the state yet. Keep the reason and timestamp in no_match_reason/no_match_verified_at and write a `competitor_no_match` evidence record."}]} {"id":"tip-20260509-evidence-ledger-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Every verification decision should be auditable."},{"role":"user","content":"What should TIP write when a robot marks price, image, details, competitor match, no-match, or artifact quarantine as verified?"},{"role":"assistant","content":"Write an append-only row into transceiver_verification_evidence. Required fields: transceiver_id, verification_type, source_url when available, source_vendor_id when available, evidence_value JSON, evidence_hash, robot_name, confidence, and created_at. For artifact quarantine, record the rule reason and clear verification flags. For price/image/details, store the exact source-backed value. For competitor match/no-match, store the match basis or no-match reason. This lets TIP explain every green badge and every deliberate non-green state."}]} +{"id":"tip-20260509-no-valid-competitor-resolver-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Prefer conservative deterministic verification over fake equivalence."},{"role":"user","content":"When may TIP mark a product's competitor requirement as resolved even though no 1:1 competitor product exists?"},{"role":"assistant","content":"Only use `competitor_status=no_valid_match` after deterministic research proves the absence of a strict source-backed competitor candidate. Require price_verified, image_verified, and details_verified first. Exclude artifacts and non-transceiver categories. Search other vendors for candidates with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it. Also require no pending or approved equivalence above confidence 0.50. When these gates pass, set competitor_verified true, store no_match_reason/no_match_verified_at, write a `competitor_no_match` evidence record, then recalculate fully_verified."}]} +{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}