feat: add no-valid competitor resolver

This commit is contained in:
Rene Fichtmueller 2026-05-09 23:16:04 +02:00
parent 650de6ba9a
commit 79a57a5ac6
5 changed files with 312 additions and 1 deletions

View File

@ -16,6 +16,7 @@
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts", "verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
"scrape:optcore": "tsx src/scrapers/optcore.ts", "scrape:optcore": "tsx src/scrapers/optcore.ts",
"scrape:news": "tsx src/scrapers/news.ts", "scrape:news": "tsx src/scrapers/news.ts",

View File

@ -0,0 +1,200 @@
/**
* No-valid-competitor resolver.
*
* This is intentionally conservative. It only resolves products that already
* have price/image/details evidence and where TIP cannot find any technically
* strict, source-backed competitor candidate.
*
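* Default is dry-run. Set NO_VALID_MATCH_APPLY=1 to write status/evidence.
* NO_VALID_MATCH_VENDOR (default "Flexoptix") selects the vendor by
* case-insensitive substring match, and NO_VALID_MATCH_LIMIT (default 500)
* caps how many products a single run selects.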
*/
import { pool, checkAndSetFullyVerified, recordVerificationEvidence } from "./db";
import { logger } from "./logger";
/**
 * Category labels that remove a row from the resolver's "active" set.
 * The list is bound as a text[] parameter and compared against
 * `transceivers.category` in the selection query.
 */
const EXCLUDED_CATEGORIES: string[] = [
  "NonTransceiver",
  "Accessory",
  "Adapter / Converter",
  "Switch / Media Converter",
  "Switch / Network Infrastructure",
  "NIC / Adapter",
  "Mux / Passive Optical",
  "Product Family",
  "Loopback / Test Module",
];
/**
 * Row shape selected by the resolver query: one product for which no strict
 * competitor candidate and no qualifying equivalence was counted.
 *
 * Every column is declared as a string; the two counters are converted with
 * Number() before they are written into the evidence record.
 * NOTE(review): presumably the driver returns numeric/bigint columns as text,
 * which would explain the string typing — confirm against the column types.
 */
type Candidate = {
// Primary key of the transceiver row; used to target the UPDATE and evidence.
id: string;
part_number: string;
// Vendor display name joined in from the vendors table.
vendor_name: string;
form_factor: string;
speed_gbps: string;
// Declared nullable, although the selection query requires a positive value.
reach_meters: string | null;
// Declared nullable, although the selection query requires a non-empty value.
fiber_type: string | null;
// Count of strict competitor candidates from other vendors (query keeps only 0).
candidate_count: string;
// Count of pending/approved equivalences at confidence >= 0.50 (query keeps only 0).
pending_equivalence_count: string;
};
/**
 * Reports whether the resolver may write to the database.
 *
 * @returns true only when NO_VALID_MATCH_APPLY is exactly the string "1";
 *          any other value (or an unset variable) keeps the run a dry-run.
 */
function applyMode(): boolean {
  const flag = process.env.NO_VALID_MATCH_APPLY;
  if (flag === "1") {
    return true;
  }
  return false;
}
/** Number of products selected per run when NO_VALID_MATCH_LIMIT is unset or empty. */
const DEFAULT_NO_VALID_MATCH_LIMIT = 500;

/** Maximum number of candidates echoed to the log in dry-run mode. */
const DRY_RUN_PREVIEW_ROWS = 30;

/**
 * Parses the NO_VALID_MATCH_LIMIT environment value.
 *
 * @param raw the raw environment value, possibly undefined
 * @returns the parsed limit, clamped to at least 1; the default when unset/empty
 * @throws Error when the value is set but does not start with an integer, so a
 *         typo fails fast instead of reaching Postgres as `LIMIT NaN`
 */
function parseNoValidMatchLimit(raw: string | undefined): number {
  if (raw === undefined || raw === "") return DEFAULT_NO_VALID_MATCH_LIMIT;
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed)) {
    throw new Error(`NO_VALID_MATCH_LIMIT must be an integer, got "${raw}"`);
  }
  return Math.max(1, parsed);
}

/**
 * Finds products whose competitor requirement can be closed as
 * `no_valid_match` and, in apply mode, writes that status plus evidence.
 *
 * Environment:
 * - NO_VALID_MATCH_LIMIT  maximum rows selected (default 500, minimum 1)
 * - NO_VALID_MATCH_VENDOR case-insensitive vendor-name substring (default "Flexoptix")
 * - NO_VALID_MATCH_APPLY  "1" to write; anything else is a dry-run
 *
 * Dry-run logs the total count and up to 30 candidates, then returns without
 * writing. Apply mode processes rows one at a time: UPDATE the transceiver,
 * record a `competitor_no_match` evidence row, then re-evaluate fully_verified.
 *
 * @throws Error when NO_VALID_MATCH_LIMIT is not an integer; errors from the
 *         database or evidence helpers propagate to the caller.
 */
async function resolveNoValidCompetitors(): Promise<void> {
  const limit = parseNoValidMatchLimit(process.env.NO_VALID_MATCH_LIMIT);
  const vendorFilter = process.env.NO_VALID_MATCH_VENDOR || "Flexoptix";
  const apply = applyMode();
  logger.info("=== No-valid-competitor resolver ===", { limit, vendorFilter, apply });

  // Selection query, in three CTE stages:
  //   active - rows that are not 'garbage' confidence, not category-page URLs,
  //            and not in an excluded category ($1).
  //   ready  - active rows of the filtered vendor ($2) with price/image/details
  //            verified, competitor still unresolved, and form factor, speed,
  //            reach and fiber type all populated.
  //   scored - per ready row: how many other-vendor active rows match strictly
  //            (same form factor and speed, case-insensitive fiber type, reach
  //            within max(25, 5%), wavelength within 15 or not extractable on
  //            either side), and how many equivalences are pending/approved at
  //            confidence >= 0.50.
  // Only rows where both counts are zero are returned, capped by $3.
  //
  // NOTE(review): the wavelength gate uses NULLIF(extracted, original); a
  // wavelengths value that consists solely of the digits appears to extract to
  // itself and would therefore be treated as "no wavelength" — confirm intended.
  // NOTE(review): equivalences are looked up by eq.flexoptix_id only; with a
  // non-Flexoptix NO_VALID_MATCH_VENDOR this may not see equivalences where the
  // product sits on the other side — verify against the table schema.
  const result = await pool.query<Candidate>(`
WITH active AS (
SELECT t.*, v.name AS vendor_name
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE COALESCE(t.data_confidence, 'unknown') != 'garbage'
AND COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'
AND COALESCE(t.category, '') <> ALL($1::text[])
),
ready AS (
SELECT a.*
FROM active a
WHERE a.price_verified = true
AND a.vendor_name ILIKE $2
AND a.image_verified = true
AND a.details_verified = true
AND COALESCE(a.competitor_verified, false) = false
AND COALESCE(a.competitor_status, 'needs_research') IN ('unknown', 'needs_research', 'ambiguous')
AND a.form_factor IS NOT NULL
AND a.form_factor != ''
AND a.speed_gbps IS NOT NULL
AND a.speed_gbps > 0
AND a.reach_meters IS NOT NULL
AND a.reach_meters > 0
AND a.fiber_type IS NOT NULL
AND a.fiber_type != ''
),
scored AS (
SELECT
r.id,
r.part_number,
r.vendor_name,
r.form_factor,
r.speed_gbps,
r.reach_meters,
r.fiber_type,
(
SELECT COUNT(*)
FROM active c
WHERE c.id != r.id
AND c.vendor_id != r.vendor_id
AND c.price_verified = true
AND c.image_verified = true
AND c.details_verified = true
AND c.form_factor = r.form_factor
AND c.speed_gbps = r.speed_gbps
AND UPPER(COALESCE(c.fiber_type, '')) = UPPER(COALESCE(r.fiber_type, ''))
AND c.reach_meters IS NOT NULL
AND c.reach_meters > 0
AND ABS(c.reach_meters - r.reach_meters) <= GREATEST(25, r.reach_meters * 0.05)
AND (
COALESCE(NULLIF(regexp_replace(COALESCE(c.wavelengths, ''), '^.*?(\\d{3,4}).*$', '\\1'), COALESCE(c.wavelengths, '')), '') = ''
OR COALESCE(NULLIF(regexp_replace(COALESCE(r.wavelengths, ''), '^.*?(\\d{3,4}).*$', '\\1'), COALESCE(r.wavelengths, '')), '') = ''
OR ABS(
NULLIF(regexp_replace(COALESCE(c.wavelengths, ''), '^.*?(\\d{3,4}).*$', '\\1'), COALESCE(c.wavelengths, ''))::int
- NULLIF(regexp_replace(COALESCE(r.wavelengths, ''), '^.*?(\\d{3,4}).*$', '\\1'), COALESCE(r.wavelengths, ''))::int
) <= 15
)
) AS candidate_count,
(
SELECT COUNT(*)
FROM transceiver_equivalences eq
WHERE eq.flexoptix_id = r.id
AND eq.status IN ('pending', 'approved', 'auto_approved')
AND eq.confidence >= 0.50
) AS pending_equivalence_count
FROM ready r
)
SELECT *
FROM scored
WHERE candidate_count = 0
AND pending_equivalence_count = 0
ORDER BY vendor_name, part_number
LIMIT $3
`, [EXCLUDED_CATEGORIES, `%${vendorFilter}%`, limit]);
  logger.info("No-valid-match candidates", { count: result.rowCount ?? 0 });

  if (!apply) {
    // Dry-run: show a bounded preview so operators can sanity-check normalization.
    for (const row of result.rows.slice(0, DRY_RUN_PREVIEW_ROWS)) {
      logger.info("dry-run candidate", {
        vendor: row.vendor_name,
        partNumber: row.part_number,
        formFactor: row.form_factor,
        speedGbps: row.speed_gbps,
        reachMeters: row.reach_meters,
        fiberType: row.fiber_type,
      });
    }
    return;
  }

  let updated = 0;
  let fullyVerifiedEarned = 0;
  for (const row of result.rows) {
    // Human-readable justification stored on the row and in the evidence ledger.
    const reason = [
      "Strict deterministic no-valid-match resolver found no source-backed competitor candidate",
      `same form_factor=${row.form_factor}`,
      `speed_gbps=${row.speed_gbps}`,
      `fiber_type=${row.fiber_type}`,
      `reach tolerance=max(25m,5%) around ${row.reach_meters}m`,
      "and no pending/approved equivalence above confidence 0.50",
    ].join("; ");

    // The WHERE guard re-checks competitor_verified so a row resolved by another
    // process between selection and update is left untouched.
    const update = await pool.query(`
UPDATE transceivers
SET competitor_verified = true,
competitor_verified_at = NOW(),
competitor_status = 'no_valid_match',
competitor_status_updated_at = NOW(),
no_match_verified_at = NOW(),
no_match_reason = $2,
updated_at = NOW()
WHERE id = $1
AND COALESCE(competitor_verified, false) = false
RETURNING id
`, [row.id, reason]);
    // Nothing was updated, so no evidence is written and nothing is counted.
    if ((update.rowCount ?? 0) === 0) continue;

    await recordVerificationEvidence({
      transceiverId: row.id,
      verificationType: "competitor_no_match",
      evidenceValue: {
        reason,
        vendor: row.vendor_name,
        partNumber: row.part_number,
        candidateCount: Number(row.candidate_count),
        pendingEquivalenceCount: Number(row.pending_equivalence_count),
      },
      robotName: "verify:resolve-no-valid-competitor",
      confidence: 1,
    });
    if (await checkAndSetFullyVerified(row.id)) fullyVerifiedEarned++;
    updated++;
  }

  logger.info("No-valid-competitor resolver complete", {
    updated,
    fully_verified_earned: fullyVerifiedEarned,
  });
}
// CLI entry point: run the resolver only when this file is executed directly.
// The pool is closed exactly once and awaited before the process exits, so a
// failure while closing cannot trigger a second pool.end() call, and a failed
// run still terminates with exit code 1.
if (require.main === module) {
  void (async () => {
    let exitCode = 0;
    try {
      await resolveNoValidCompetitors();
    } catch (err: unknown) {
      exitCode = 1;
      logger.error("No-valid-competitor resolver failed", {
        // Rejections are not guaranteed to be Error instances; narrow before reading.
        error: err instanceof Error ? err.message : String(err),
      });
    } finally {
      try {
        await pool.end();
      } catch (closeErr: unknown) {
        exitCode = 1;
        logger.error("No-valid-competitor resolver failed to close the database pool", {
          error: closeErr instanceof Error ? closeErr.message : String(closeErr),
        });
      }
    }
    if (exitCode !== 0) process.exit(exitCode);
  })();
}

View File

@ -1,9 +1,48 @@
# Current TIP Sync State # Current TIP Sync State
Updated: 2026-05-09 21:00 UTC Updated: 2026-05-09 21:13 UTC
## Newest Work ## Newest Work
- TIP no-valid-competitor resolver on 2026-05-09:
- added `packages/scraper/src/utils/resolve-no-valid-competitor.ts`
- script: `pnpm -C packages/scraper run verify:no-valid-competitor`
- default mode is dry-run
- apply mode requires `NO_VALID_MATCH_APPLY=1`
- default vendor scope is `NO_VALID_MATCH_VENDOR=Flexoptix`
- purpose:
- close products that already have price, image, and details evidence
- only resolve competitor verification when there is no strict source-backed 1:1 competitor candidate
- avoid fake competitor matches for uncommon Flexoptix products
- conservative gates:
- active transceiver only; excludes known artifact/non-transceiver categories
- source-backed `price_verified`, `image_verified`, and `details_verified` required
- same-vendor candidates ignored; only other vendors count
- strict candidate match requires same form factor, same speed, same fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it
- no pending/approved equivalence above confidence `0.50`
- live Erik run:
- dry-run with Flexoptix scope found `73` no-valid-match candidates
- apply run updated `73`
- `73` additional products earned `fully_verified`
- evidence ledger wrote `73` `competitor_no_match` records
- live health after run:
- active products: `17414`
- price verified: `11523`
- image verified: `12125`
- details verified: `16814`
- fully verified: `10831`
- active competitor status:
- `matched=11158`
- `no_valid_match=73`
- `ambiguous=0`
- `needs_research=6183`
- operational note:
- `tip-scraper-daemon` was not restarted during active pricing jobs
- restart daemon only after active pg-boss jobs finish, then run reconcile/matcher again
- TIPLLM training pool:
- appended deterministic no-valid-match resolver lessons
- JSONL must remain valid after every append
- TIP verification truth model on 2026-05-09: - TIP verification truth model on 2026-05-09:
- implemented migration `sql/103-verification-evidence-and-competitor-status.sql` - implemented migration `sql/103-verification-evidence-and-competitor-status.sql`
- adds `transceivers.competitor_status` - adds `transceivers.competitor_status`

View File

@ -0,0 +1,69 @@
# 2026-05-09 — TIP No-Valid-Match Resolver
## Operator Requirement
- Continue TIP verification until source-backed work is exhausted.
- Do not invent competitor matches just to turn products green.
- Use deterministic TIP robots/scrapers only.
- Keep Erik safe; do not restart crawler daemons while active jobs are running.
- Write crawler/scraper/robot learnings into the TIPLLM training pool.
## Change
Added `packages/scraper/src/utils/resolve-no-valid-competitor.ts` and package script:
```bash
pnpm -C packages/scraper run verify:no-valid-competitor
```
The resolver is dry-run by default. Applying changes requires:
```bash
NO_VALID_MATCH_APPLY=1 pnpm -C packages/scraper run verify:no-valid-competitor
```
The default vendor scope is Flexoptix:
```bash
NO_VALID_MATCH_VENDOR=Flexoptix
```
## Rules
A product can be resolved as `competitor_status='no_valid_match'` only when:
- the product is active and not in a known artifact category
- price, image, and details are already verified
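- competitor verification is still unresolved (`competitor_verified` is not true and `competitor_status` is `unknown`, `needs_research`, or `ambiguous`)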
- no other vendor has a source-backed strict candidate with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it
- no pending/approved equivalence above confidence `0.50` already exists
The robot writes `competitor_no_match` evidence and then calls `checkAndSetFullyVerified`.
## Live Result
On Erik:
- dry-run Flexoptix scope found `73` candidates
- apply run updated `73`
- `73` products earned `fully_verified`
- evidence ledger now has `73` `competitor_no_match` records
Live health after the run:
- active products: `17414`
- price verified: `11523`
- image verified: `12125`
- details verified: `16814`
- fully verified: `10831`
- competitor status:
- `matched=11158`
- `no_valid_match=73`
- `ambiguous=0`
- `needs_research=6183`
## Operational Notes
- `tip-scraper-daemon` was not restarted because pricing jobs were active.
- Restart only after active pg-boss jobs complete, then queue reconcile and matcher.
- This is the correct pattern for rare products: explicit no-match evidence beats false 1:1 matches.

View File

@ -38,3 +38,5 @@
{"id":"tip-20260509-flex-fs-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Report verification status precisely and separate product-data completeness from competitor-match completeness."},{"role":"user","content":"After the 2026-05-09 cleanup, what is the correct interpretation of Flexoptix and FS.com verification status?"},{"role":"assistant","content":"For active Flexoptix and FS.com products, price, image, and details are complete after removing category, redirect, cable, support, and non-FS artifacts from the active base. Remaining not-fully-verified rows are due to competitor_verified=false, not missing product price/image/details. Do not say the whole vendor is 100% fully verified until 1:1 competitor equivalence exists or the absence of a valid competitor is explicitly represented as a verified no-match state."}]} {"id":"tip-20260509-flex-fs-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Report verification status precisely and separate product-data completeness from competitor-match completeness."},{"role":"user","content":"After the 2026-05-09 cleanup, what is the correct interpretation of Flexoptix and FS.com verification status?"},{"role":"assistant","content":"For active Flexoptix and FS.com products, price, image, and details are complete after removing category, redirect, cable, support, and non-FS artifacts from the active base. Remaining not-fully-verified rows are due to competitor_verified=false, not missing product price/image/details. Do not say the whole vendor is 100% fully verified until 1:1 competitor equivalence exists or the absence of a valid competitor is explicitly represented as a verified no-match state."}]}
{"id":"tip-20260509-competitor-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Model verification as an evidence-backed state machine, not a single green checkbox."},{"role":"user","content":"How should TIP represent competitor verification when no valid 1:1 competitor product exists?"},{"role":"assistant","content":"Use a distinct competitor_status value. `matched` means a source-backed 1:1 competitor exists. `no_valid_match` means research verified that no valid public 1:1 competitor is available; this can still resolve the competitor requirement without inventing a match. `ambiguous` means candidates exist but are technically risky or below approval threshold. `needs_research` means the robot has not resolved the state yet. Keep the reason and timestamp in no_match_reason/no_match_verified_at and write a `competitor_no_match` evidence record."}]} {"id":"tip-20260509-competitor-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Model verification as an evidence-backed state machine, not a single green checkbox."},{"role":"user","content":"How should TIP represent competitor verification when no valid 1:1 competitor product exists?"},{"role":"assistant","content":"Use a distinct competitor_status value. `matched` means a source-backed 1:1 competitor exists. `no_valid_match` means research verified that no valid public 1:1 competitor is available; this can still resolve the competitor requirement without inventing a match. `ambiguous` means candidates exist but are technically risky or below approval threshold. `needs_research` means the robot has not resolved the state yet. Keep the reason and timestamp in no_match_reason/no_match_verified_at and write a `competitor_no_match` evidence record."}]}
{"id":"tip-20260509-evidence-ledger-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Every verification decision should be auditable."},{"role":"user","content":"What should TIP write when a robot marks price, image, details, competitor match, no-match, or artifact quarantine as verified?"},{"role":"assistant","content":"Write an append-only row into transceiver_verification_evidence. Required fields: transceiver_id, verification_type, source_url when available, source_vendor_id when available, evidence_value JSON, evidence_hash, robot_name, confidence, and created_at. For artifact quarantine, record the rule reason and clear verification flags. For price/image/details, store the exact source-backed value. For competitor match/no-match, store the match basis or no-match reason. This lets TIP explain every green badge and every deliberate non-green state."}]} {"id":"tip-20260509-evidence-ledger-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Every verification decision should be auditable."},{"role":"user","content":"What should TIP write when a robot marks price, image, details, competitor match, no-match, or artifact quarantine as verified?"},{"role":"assistant","content":"Write an append-only row into transceiver_verification_evidence. Required fields: transceiver_id, verification_type, source_url when available, source_vendor_id when available, evidence_value JSON, evidence_hash, robot_name, confidence, and created_at. For artifact quarantine, record the rule reason and clear verification flags. For price/image/details, store the exact source-backed value. For competitor match/no-match, store the match basis or no-match reason. This lets TIP explain every green badge and every deliberate non-green state."}]}
{"id":"tip-20260509-no-valid-competitor-resolver-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Prefer conservative deterministic verification over fake equivalence."},{"role":"user","content":"When may TIP mark a product's competitor requirement as resolved even though no 1:1 competitor product exists?"},{"role":"assistant","content":"Only use `competitor_status=no_valid_match` after deterministic research proves the absence of a strict source-backed competitor candidate. Require price_verified, image_verified, and details_verified first. Exclude artifacts and non-transceiver categories. Search other vendors for candidates with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it. Also require no pending or approved equivalence above confidence 0.50. When these gates pass, set competitor_verified true, store no_match_reason/no_match_verified_at, write a `competitor_no_match` evidence record, then recalculate fully_verified."}]}
{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}