feat: add no-valid competitor resolver
This commit is contained in:
parent
650de6ba9a
commit
79a57a5ac6
@ -16,6 +16,7 @@
|
||||
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
||||
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
|
||||
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
|
||||
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
|
||||
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
||||
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
||||
"scrape:news": "tsx src/scrapers/news.ts",
|
||||
|
||||
200
packages/scraper/src/utils/resolve-no-valid-competitor.ts
Normal file
200
packages/scraper/src/utils/resolve-no-valid-competitor.ts
Normal file
@ -0,0 +1,200 @@
|
||||
/**
|
||||
* No-valid-competitor resolver.
|
||||
*
|
||||
* This is intentionally conservative. It only resolves products that already
|
||||
* have price/image/details evidence and where TIP cannot find any technically
|
||||
* strict, source-backed competitor candidate.
|
||||
*
|
||||
* Default is dry-run. Set NO_VALID_MATCH_APPLY=1 to write status/evidence.
|
||||
*/
|
||||
import { pool, checkAndSetFullyVerified, recordVerificationEvidence } from "./db";
|
||||
import { logger } from "./logger";
|
||||
|
||||
/**
 * Category labels that are never considered by this resolver.
 *
 * The list is bound as a `text[]` parameter to the selection query, which drops
 * any transceiver row whose `category` equals one of these values — both as a
 * product to resolve and as a potential competitor candidate.
 */
const EXCLUDED_CATEGORIES: readonly string[] = [
  "NonTransceiver",
  "Accessory",
  "Adapter / Converter",
  "Switch / Media Converter",
  "Switch / Network Infrastructure",
  "NIC / Adapter",
  "Mux / Passive Optical",
  "Product Family",
  "Loopback / Test Module",
];
|
||||
|
||||
/**
 * One row returned by the resolver's selection query: a product for which no
 * strict competitor candidate and no qualifying equivalence was found.
 *
 * Numeric-looking fields are typed as strings and converted with `Number()`
 * where a number is needed.
 * NOTE(review): presumably this mirrors how the database driver returns
 * numeric/bigint columns — confirm against the actual column types.
 */
type Candidate = {
  /** Primary key of the transceiver row; used to target the UPDATE. */
  id: string;
  /** Vendor part number, used for logging and evidence. */
  part_number: string;
  /** Name of the owning vendor (joined from `vendors`). */
  vendor_name: string;
  /** Form factor; the query only selects rows where it is non-empty. */
  form_factor: string;
  /** Speed in Gbps; the query only selects rows where it is > 0. */
  speed_gbps: string;
  /** Reach in meters; the query only selects rows where it is > 0. */
  reach_meters: string | null;
  /** Fiber type; the query only selects rows where it is non-empty. */
  fiber_type: string | null;
  /** Number of strict competitor candidates found (always 0 for returned rows). */
  candidate_count: string;
  /** Number of qualifying equivalences found (always 0 for returned rows). */
  pending_equivalence_count: string;
};
|
||||
|
||||
/**
 * Reports whether the resolver may write to the database.
 *
 * The gate is deliberately strict: only the exact value `"1"` in
 * `NO_VALID_MATCH_APPLY` enables apply mode. Anything else (unset, `"0"`,
 * `"true"`, padded values) keeps the run in dry-run mode.
 *
 * @returns `true` when `NO_VALID_MATCH_APPLY` is exactly `"1"`, otherwise `false`.
 */
function applyMode(): boolean {
  return process.env.NO_VALID_MATCH_APPLY === "1";
}
|
||||
|
||||
/**
 * Parses the row limit for one resolver run.
 *
 * Unset, empty, or non-numeric input falls back to `fallback` instead of
 * producing `NaN` (which would otherwise be bound to the SQL `LIMIT`).
 * Numeric input is clamped to a minimum of 1.
 */
function parseNoValidMatchLimit(raw: string | undefined, fallback = 500): number {
  const parsed = Number.parseInt(raw ?? "", 10);
  return Number.isFinite(parsed) ? Math.max(1, parsed) : fallback;
}

/** Builds the human-readable reason stored on the row and in the evidence record. */
function buildNoMatchReason(row: Candidate): string {
  return [
    "Strict deterministic no-valid-match resolver found no source-backed competitor candidate",
    `same form_factor=${row.form_factor}`,
    `speed_gbps=${row.speed_gbps}`,
    `fiber_type=${row.fiber_type}`,
    `reach tolerance=max(25m,5%) around ${row.reach_meters}m`,
    "and no pending/approved equivalence above confidence 0.50",
  ].join("; ");
}

/**
 * Finds products that have no strict competitor candidate and, in apply mode,
 * marks their competitor requirement as resolved with `no_valid_match`.
 *
 * Environment:
 * - `NO_VALID_MATCH_LIMIT`  — max rows per run (default 500, minimum 1).
 * - `NO_VALID_MATCH_VENDOR` — vendor name fragment, matched case-insensitively
 *   as a substring (default `Flexoptix`).
 * - `NO_VALID_MATCH_APPLY`  — must be `1` to write; otherwise dry-run.
 *
 * Selection (single SQL query):
 * - `active`: rows not flagged `garbage`, not category-page URLs, and not in
 *   an excluded category.
 * - `ready`: active rows of the requested vendor with price/image/details
 *   verified, competitor still unresolved, and form factor, speed, reach and
 *   fiber type all populated.
 * - `scored`: per ready row, counts (a) other-vendor active rows that are fully
 *   source-verified and match on form factor, speed, fiber type, reach within
 *   max(25, 5%) and — when both sides expose one — a wavelength within 15 of
 *   each other; and (b) equivalences with status pending/approved/auto_approved
 *   and confidence >= 0.50.
 * - Only rows where both counts are zero are returned.
 *
 * In dry-run mode the first 30 rows are logged and nothing is written.
 * In apply mode each row is updated, an evidence record is written, and
 * `checkAndSetFullyVerified` is invoked.
 *
 * NOTE(review): the UPDATE, the evidence write and the fully-verified check are
 * separate statements, not one transaction; a failure between them leaves the
 * row updated without evidence. Confirm whether that is acceptable.
 * NOTE(review): the equivalence gate only looks at `eq.flexoptix_id`; for a
 * non-Flexoptix vendor scope it may not see existing equivalences — confirm
 * against the `transceiver_equivalences` schema.
 *
 * @throws Propagates any error from the database or evidence helpers; the run
 *         stops at the first failing row.
 */
async function resolveNoValidCompetitors(): Promise<void> {
  const limit = parseNoValidMatchLimit(process.env.NO_VALID_MATCH_LIMIT);
  const vendorFilter = process.env.NO_VALID_MATCH_VENDOR || "Flexoptix";
  const apply = applyMode();

  logger.info("=== No-valid-competitor resolver ===", { limit, vendorFilter, apply });

  // Wavelength handling: `substring(x from '\d{3,4}')` yields the first 3-4
  // digit run or NULL when there is none. The previous NULLIF/regexp_replace
  // form also yielded NULL when the whole value was just the number (e.g.
  // "1310"), which silently disabled the wavelength comparison for such rows.
  const result = await pool.query<Candidate>(`
    WITH active AS (
      SELECT t.*, v.name AS vendor_name
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE COALESCE(t.data_confidence, 'unknown') != 'garbage'
        AND COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'
        AND COALESCE(t.category, '') <> ALL($1::text[])
    ),
    ready AS (
      SELECT a.*
      FROM active a
      WHERE a.price_verified = true
        AND a.vendor_name ILIKE $2
        AND a.image_verified = true
        AND a.details_verified = true
        AND COALESCE(a.competitor_verified, false) = false
        AND COALESCE(a.competitor_status, 'needs_research') IN ('unknown', 'needs_research', 'ambiguous')
        AND a.form_factor IS NOT NULL
        AND a.form_factor != ''
        AND a.speed_gbps IS NOT NULL
        AND a.speed_gbps > 0
        AND a.reach_meters IS NOT NULL
        AND a.reach_meters > 0
        AND a.fiber_type IS NOT NULL
        AND a.fiber_type != ''
    ),
    scored AS (
      SELECT
        r.id,
        r.part_number,
        r.vendor_name,
        r.form_factor,
        r.speed_gbps,
        r.reach_meters,
        r.fiber_type,
        (
          SELECT COUNT(*)
          FROM active c
          WHERE c.id != r.id
            AND c.vendor_id != r.vendor_id
            AND c.price_verified = true
            AND c.image_verified = true
            AND c.details_verified = true
            AND c.form_factor = r.form_factor
            AND c.speed_gbps = r.speed_gbps
            AND UPPER(COALESCE(c.fiber_type, '')) = UPPER(COALESCE(r.fiber_type, ''))
            AND c.reach_meters IS NOT NULL
            AND c.reach_meters > 0
            AND ABS(c.reach_meters - r.reach_meters) <= GREATEST(25, r.reach_meters * 0.05)
            AND (
              substring(c.wavelengths from '\\d{3,4}') IS NULL
              OR substring(r.wavelengths from '\\d{3,4}') IS NULL
              OR ABS(
                substring(c.wavelengths from '\\d{3,4}')::int
                - substring(r.wavelengths from '\\d{3,4}')::int
              ) <= 15
            )
        ) AS candidate_count,
        (
          SELECT COUNT(*)
          FROM transceiver_equivalences eq
          WHERE eq.flexoptix_id = r.id
            AND eq.status IN ('pending', 'approved', 'auto_approved')
            AND eq.confidence >= 0.50
        ) AS pending_equivalence_count
      FROM ready r
    )
    SELECT *
    FROM scored
    WHERE candidate_count = 0
      AND pending_equivalence_count = 0
    ORDER BY vendor_name, part_number
    LIMIT $3
  `, [EXCLUDED_CATEGORIES, `%${vendorFilter}%`, limit]);

  logger.info("No-valid-match candidates", { count: result.rowCount ?? 0 });

  if (!apply) {
    // Dry-run: show a bounded sample so the log stays readable.
    for (const row of result.rows.slice(0, 30)) {
      logger.info("dry-run candidate", {
        vendor: row.vendor_name,
        partNumber: row.part_number,
        formFactor: row.form_factor,
        speedGbps: row.speed_gbps,
        reachMeters: row.reach_meters,
        fiberType: row.fiber_type,
      });
    }
    return;
  }

  let updated = 0;
  let fullyVerifiedEarned = 0;

  for (const row of result.rows) {
    const reason = buildNoMatchReason(row);

    // The competitor_verified guard makes the write a no-op if another process
    // resolved this row between the SELECT above and this UPDATE.
    const update = await pool.query(`
      UPDATE transceivers
      SET competitor_verified = true,
          competitor_verified_at = NOW(),
          competitor_status = 'no_valid_match',
          competitor_status_updated_at = NOW(),
          no_match_verified_at = NOW(),
          no_match_reason = $2,
          updated_at = NOW()
      WHERE id = $1
        AND COALESCE(competitor_verified, false) = false
      RETURNING id
    `, [row.id, reason]);

    if ((update.rowCount ?? 0) === 0) continue;

    await recordVerificationEvidence({
      transceiverId: row.id,
      verificationType: "competitor_no_match",
      evidenceValue: {
        reason,
        vendor: row.vendor_name,
        partNumber: row.part_number,
        candidateCount: Number(row.candidate_count),
        pendingEquivalenceCount: Number(row.pending_equivalence_count),
      },
      robotName: "verify:resolve-no-valid-competitor",
      confidence: 1,
    });

    if (await checkAndSetFullyVerified(row.id)) fullyVerifiedEarned++;
    updated++;
  }

  logger.info("No-valid-competitor resolver complete", {
    updated,
    fully_verified_earned: fullyVerifiedEarned,
  });
}
|
||||
|
||||
// Script entry point: runs the resolver only when this file is executed
// directly (not when it is imported), then closes the database pool.
if (require.main === module) {
  resolveNoValidCompetitors()
    .then(() => pool.end())
    .catch((err: unknown) => {
      // Rejections are not guaranteed to be Error instances; narrow before
      // reading `.message` so the log never contains `undefined`.
      logger.error("No-valid-competitor resolver failed", {
        error: err instanceof Error ? err.message : String(err),
      });
      // Best-effort pool shutdown: wait for it so connections are actually
      // closed, ignore a secondary failure (e.g. the pool was already ended),
      // and always exit non-zero afterwards.
      return pool
        .end()
        .catch(() => undefined)
        .finally(() => process.exit(1));
    });
}
|
||||
@ -1,9 +1,48 @@
|
||||
# Current TIP Sync State
|
||||
|
||||
Updated: 2026-05-09 21:00 UTC
|
||||
Updated: 2026-05-09 21:13 UTC
|
||||
|
||||
## Newest Work
|
||||
|
||||
- TIP no-valid-competitor resolver on 2026-05-09:
|
||||
- added `packages/scraper/src/utils/resolve-no-valid-competitor.ts`
|
||||
- script: `pnpm -C packages/scraper run verify:no-valid-competitor`
|
||||
- default mode is dry-run
|
||||
- apply mode requires `NO_VALID_MATCH_APPLY=1`
|
||||
- default vendor scope is `NO_VALID_MATCH_VENDOR=Flexoptix`
|
||||
- purpose:
|
||||
- close products that already have price, image, and details evidence
|
||||
- only resolve competitor verification when there is no strict source-backed 1:1 competitor candidate
|
||||
- avoid fake competitor matches for uncommon Flexoptix products
|
||||
- conservative gates:
|
||||
- active transceiver only; excludes known artifact/non-transceiver categories
|
||||
- source-backed `price_verified`, `image_verified`, and `details_verified` required
|
||||
- same-vendor candidates ignored; only other vendors count
|
||||
- strict candidate match requires same form factor, same speed, same fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it
|
||||
- no pending, approved, or auto-approved equivalence at or above confidence `0.50`
|
||||
- live Erik run:
|
||||
- dry-run with Flexoptix scope found `73` no-valid-match candidates
|
||||
- apply run updated `73`
|
||||
- `73` additional products earned `fully_verified`
|
||||
- evidence ledger wrote `73` `competitor_no_match` records
|
||||
- live health after run:
|
||||
- active products: `17414`
|
||||
- price verified: `11523`
|
||||
- image verified: `12125`
|
||||
- details verified: `16814`
|
||||
- fully verified: `10831`
|
||||
- active competitor status:
|
||||
- `matched=11158`
|
||||
- `no_valid_match=73`
|
||||
- `ambiguous=0`
|
||||
- `needs_research=6183`
|
||||
- operational note:
|
||||
- `tip-scraper-daemon` was not restarted during active pricing jobs
|
||||
- restart daemon only after active pg-boss jobs finish, then run reconcile/matcher again
|
||||
- TIPLLM training pool:
|
||||
- appended deterministic no-valid-match resolver lessons
|
||||
- JSONL must remain valid after every append
|
||||
|
||||
- TIP verification truth model on 2026-05-09:
|
||||
- implemented migration `sql/103-verification-evidence-and-competitor-status.sql`
|
||||
- adds `transceivers.competitor_status`
|
||||
|
||||
69
sync/history/2026-05-09-tip-no-valid-match-resolver.md
Normal file
69
sync/history/2026-05-09-tip-no-valid-match-resolver.md
Normal file
@ -0,0 +1,69 @@
|
||||
# 2026-05-09 — TIP No-Valid-Match Resolver
|
||||
|
||||
## Operator Requirement
|
||||
|
||||
- Continue TIP verification until source-backed work is exhausted.
|
||||
- Do not invent competitor matches just to turn products green.
|
||||
- Use deterministic TIP robots/scrapers only.
|
||||
- Keep Erik safe; do not restart crawler daemons while active jobs are running.
|
||||
- Write crawler/scraper/robot learnings into the TIPLLM training pool.
|
||||
|
||||
## Change
|
||||
|
||||
Added `packages/scraper/src/utils/resolve-no-valid-competitor.ts` and package script:
|
||||
|
||||
```bash
|
||||
pnpm -C packages/scraper run verify:no-valid-competitor
|
||||
```
|
||||
|
||||
The resolver is dry-run by default. Applying changes requires:
|
||||
|
||||
```bash
|
||||
NO_VALID_MATCH_APPLY=1 pnpm -C packages/scraper run verify:no-valid-competitor
|
||||
```
|
||||
|
||||
The default vendor scope is Flexoptix:
|
||||
|
||||
```bash
|
||||
NO_VALID_MATCH_VENDOR=Flexoptix
|
||||
```
|
||||
|
||||
## Rules
|
||||
|
||||
A product can be resolved as `competitor_status='no_valid_match'` only when:
|
||||
|
||||
- the product is active and not in a known artifact category
|
||||
- price, image, and details are already verified, and form factor, speed, reach, and fiber type are all populated
|
||||
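- competitor verification is still unresolved (`competitor_verified` is not true and `competitor_status` is `unknown`, `needs_research`, or `ambiguous`)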
|
||||
- no other vendor has a source-backed strict candidate with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it
|
||||
- no pending, approved, or auto-approved equivalence at or above confidence `0.50` already exists
|
||||
|
||||
The robot writes `competitor_no_match` evidence and then calls `checkAndSetFullyVerified`.
|
||||
|
||||
## Live Result
|
||||
|
||||
On Erik:
|
||||
|
||||
- dry-run Flexoptix scope found `73` candidates
|
||||
- apply run updated `73`
|
||||
- `73` products earned `fully_verified`
|
||||
- evidence ledger now has `73` `competitor_no_match` records
|
||||
|
||||
Live health after the run:
|
||||
|
||||
- active products: `17414`
|
||||
- price verified: `11523`
|
||||
- image verified: `12125`
|
||||
- details verified: `16814`
|
||||
- fully verified: `10831`
|
||||
- competitor status:
|
||||
- `matched=11158`
|
||||
- `no_valid_match=73`
|
||||
- `ambiguous=0`
|
||||
- `needs_research=6183`
|
||||
|
||||
## Operational Notes
|
||||
|
||||
- `tip-scraper-daemon` was not restarted because pricing jobs were active.
|
||||
- Restart only after active pg-boss jobs complete, then queue reconcile and matcher.
|
||||
- This is the correct pattern for rare products: explicit no-match evidence beats false 1:1 matches.
|
||||
@ -38,3 +38,5 @@
|
||||
{"id":"tip-20260509-flex-fs-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Report verification status precisely and separate product-data completeness from competitor-match completeness."},{"role":"user","content":"After the 2026-05-09 cleanup, what is the correct interpretation of Flexoptix and FS.com verification status?"},{"role":"assistant","content":"For active Flexoptix and FS.com products, price, image, and details are complete after removing category, redirect, cable, support, and non-FS artifacts from the active base. Remaining not-fully-verified rows are due to competitor_verified=false, not missing product price/image/details. Do not say the whole vendor is 100% fully verified until 1:1 competitor equivalence exists or the absence of a valid competitor is explicitly represented as a verified no-match state."}]}
|
||||
{"id":"tip-20260509-competitor-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Model verification as an evidence-backed state machine, not a single green checkbox."},{"role":"user","content":"How should TIP represent competitor verification when no valid 1:1 competitor product exists?"},{"role":"assistant","content":"Use a distinct competitor_status value. `matched` means a source-backed 1:1 competitor exists. `no_valid_match` means research verified that no valid public 1:1 competitor is available; this can still resolve the competitor requirement without inventing a match. `ambiguous` means candidates exist but are technically risky or below approval threshold. `needs_research` means the robot has not resolved the state yet. Keep the reason and timestamp in no_match_reason/no_match_verified_at and write a `competitor_no_match` evidence record."}]}
|
||||
{"id":"tip-20260509-evidence-ledger-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Every verification decision should be auditable."},{"role":"user","content":"What should TIP write when a robot marks price, image, details, competitor match, no-match, or artifact quarantine as verified?"},{"role":"assistant","content":"Write an append-only row into transceiver_verification_evidence. Required fields: transceiver_id, verification_type, source_url when available, source_vendor_id when available, evidence_value JSON, evidence_hash, robot_name, confidence, and created_at. For artifact quarantine, record the rule reason and clear verification flags. For price/image/details, store the exact source-backed value. For competitor match/no-match, store the match basis or no-match reason. This lets TIP explain every green badge and every deliberate non-green state."}]}
|
||||
{"id":"tip-20260509-no-valid-competitor-resolver-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Prefer conservative deterministic verification over fake equivalence."},{"role":"user","content":"When may TIP mark a product's competitor requirement as resolved even though no 1:1 competitor product exists?"},{"role":"assistant","content":"Only use `competitor_status=no_valid_match` after deterministic research proves the absence of a strict source-backed competitor candidate. Require price_verified, image_verified, and details_verified first. Exclude artifacts and non-transceiver categories. Search other vendors for candidates with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it. Also require no pending or approved equivalence above confidence 0.50. When these gates pass, set competitor_verified true, store no_match_reason/no_match_verified_at, write a `competitor_no_match` evidence record, then recalculate fully_verified."}]}
|
||||
{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user