From fb9db56617086dba8d5df3329cbebd8e0c00eb73 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 9 May 2026 23:35:01 +0200 Subject: [PATCH] fix: quarantine fs numeric sku aliases --- packages/scraper/package.json | 1 + .../src/utils/quarantine-fs-sku-aliases.ts | 145 ++++++++++++++++++ sync/CURRENT.md | 44 +++++- .../2026-05-09-tip-fs-sku-alias-quarantine.md | 76 +++++++++ training-data/tip-llm-capabilities-v1.jsonl | 1 + 5 files changed, 266 insertions(+), 1 deletion(-) create mode 100644 packages/scraper/src/utils/quarantine-fs-sku-aliases.ts create mode 100644 sync/history/2026-05-09-tip-fs-sku-alias-quarantine.md diff --git a/packages/scraper/package.json b/packages/scraper/package.json index 4e3f1a2..709d0fd 100644 --- a/packages/scraper/package.json +++ b/packages/scraper/package.json @@ -16,6 +16,7 @@ "verify:catalog:details": "tsx src/utils/verify-catalog-details.ts", "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", + "verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts", "verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts", "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", "scrape:optcore": "tsx src/scrapers/optcore.ts", diff --git a/packages/scraper/src/utils/quarantine-fs-sku-aliases.ts b/packages/scraper/src/utils/quarantine-fs-sku-aliases.ts new file mode 100644 index 0000000..c7d41b1 --- /dev/null +++ b/packages/scraper/src/utils/quarantine-fs-sku-aliases.ts @@ -0,0 +1,145 @@ +/** + * FS.com numeric SKU alias quarantine. + * + * FS.com pages expose both a marketplace SKU (FS-123456) and a real optical + * product P/N (for example OSFP-DR8-1.6T-FL). Older scraper passes created + * duplicate active rows for the numeric SKU. Those rows distort equivalence + * research because the real product row already exists for the same URL. + * + * Default is dry-run. 
Set FS_SKU_ALIAS_APPLY=1 to write quarantine/evidence.
+ */
+import { pool, recordVerificationEvidence } from "./db";
+import { logger } from "./logger";
+
+/** One numeric-SKU alias row paired with the canonical row selected for the same normalized URL. */
+type AliasCandidate = {
+  alias_id: string;
+  alias_part_number: string;
+  alias_url: string;
+  canonical_id: string;
+  canonical_part_number: string;
+  canonical_url: string;
+};
+
+/** Candidate cap used when FS_SKU_ALIAS_LIMIT is unset, empty, or not a number. */
+const DEFAULT_LIMIT = 500;
+
+/** Returns true only when FS_SKU_ALIAS_APPLY is exactly "1"; any other value keeps the run dry. */
+function applyMode(): boolean {
+  return process.env.FS_SKU_ALIAS_APPLY === "1";
+}
+
+/**
+ * Reads the candidate cap from FS_SKU_ALIAS_LIMIT.
+ *
+ * Unset/empty values use DEFAULT_LIMIT and values below 1 are clamped to 1.
+ * A non-numeric value also falls back to DEFAULT_LIMIT: Math.max(1, NaN) is
+ * NaN, which would otherwise be bound to `LIMIT $1` and fail the query.
+ */
+function candidateLimit(): number {
+  const parsed = Number.parseInt(process.env.FS_SKU_ALIAS_LIMIT || String(DEFAULT_LIMIT), 10);
+  if (Number.isNaN(parsed)) return DEFAULT_LIMIT;
+  return Math.max(1, parsed);
+}
+
+/**
+ * Finds FS.COM rows whose part number is a numeric marketplace SKU
+ * (`^FS-[0-9]+$`) and whose normalized product URL is shared with a
+ * non-numeric row that has price, image, and details verified.
+ *
+ * Dry-run (default): logs the candidate count and up to 40 sample pairs.
+ * Apply mode: for each candidate, writes an `artifact_quarantine` evidence
+ * record, then sets the alias row to category 'NonTransceiver', clears its
+ * verification flags, resets competitor status, and appends the reason to
+ * `notes` unless it is already present.
+ *
+ * Rejects if any query or evidence write fails; rows processed before the
+ * failure stay quarantined (no transaction wraps the loop).
+ */
+async function quarantineFsSkuAliases(): Promise<void> {
+  const limit = candidateLimit();
+  const apply = applyMode();
+
+  logger.info("=== FS.com numeric SKU alias quarantine ===", { limit, apply });
+
+  // norm_url strips the query string and folds the doubled /de/de/ locale
+  // prefix into /de/ so alias and canonical rows for one page compare equal.
+  // numeric_alias skips rows that are already 'NonTransceiver': without that
+  // filter a second apply run re-selects every quarantined alias and writes
+  // duplicate evidence even though the guarded UPDATE below changes nothing.
+  const result = await pool.query<AliasCandidate>(`
+    WITH fs AS (
+      SELECT
+        t.*,
+        regexp_replace(
+          regexp_replace(COALESCE(t.product_page_url, ''), '\\?.*$', ''),
+          '^https://www\\.fs\\.com/de/de/',
+          'https://www.fs.com/de/'
+        ) AS norm_url
+      FROM transceivers t
+      JOIN vendors v ON v.id = t.vendor_id
+      WHERE v.name = 'FS.COM'
+        AND COALESCE(t.product_page_url, '') ~* 'fs\\.com/.*/products/[0-9]+\\.html'
+    ),
+    numeric_alias AS (
+      SELECT *
+      FROM fs
+      WHERE part_number ~ '^FS-[0-9]+$'
+        AND COALESCE(data_confidence, 'unknown') != 'garbage'
+        AND COALESCE(category, '') != 'NonTransceiver'
+    ),
+    canonical AS (
+      SELECT *
+      FROM fs
+      WHERE part_number !~ '^FS-[0-9]+$'
+        AND COALESCE(data_confidence, 'unknown') != 'garbage'
+        AND price_verified = true
+        AND image_verified = true
+        AND details_verified = true
+    )
+    SELECT DISTINCT ON (n.id)
+      n.id AS alias_id,
+      n.part_number AS alias_part_number,
+      n.product_page_url AS alias_url,
+      c.id AS canonical_id,
+      c.part_number AS canonical_part_number,
+      c.product_page_url AS canonical_url
+    FROM numeric_alias n
+    JOIN canonical c ON c.norm_url = n.norm_url
+    ORDER BY n.id, c.fully_verified DESC, c.updated_at DESC
+    LIMIT $1
+  `, [limit]);
+
+  logger.info("FS.com numeric SKU alias candidates", { count: result.rowCount ?? 0 });
+
+  if (!apply) {
+    // Dry-run: show a bounded sample so large candidate sets do not flood the log.
+    for (const row of result.rows.slice(0, 40)) {
+      logger.info("dry-run alias", {
+        alias: row.alias_part_number,
+        canonical: row.canonical_part_number,
+        url: row.alias_url,
+      });
+    }
+    return;
+  }
+
+  let updated = 0;
+  for (const row of result.rows) {
+    const reason = [
+      "FS.com numeric marketplace SKU alias",
+      `${row.alias_part_number} duplicates canonical product P/N ${row.canonical_part_number}`,
+      "for the same normalized FS product URL",
+    ].join("; ");
+
+    // Evidence is written before the UPDATE so a quarantined row never exists
+    // without a record pointing at its canonical counterpart.
+    await recordVerificationEvidence({
+      transceiverId: row.alias_id,
+      verificationType: "artifact_quarantine",
+      sourceUrl: row.alias_url,
+      evidenceValue: {
+        reason,
+        aliasPartNumber: row.alias_part_number,
+        canonicalTransceiverId: row.canonical_id,
+        canonicalPartNumber: row.canonical_part_number,
+        canonicalUrl: row.canonical_url,
+      },
+      robotName: "verify:fs:sku-aliases",
+      confidence: 1,
+    });
+
+    // The category guard makes the UPDATE a no-op if the row was quarantined
+    // between the candidate query and this statement; RETURNING lets us count
+    // only rows that actually changed.
+    const update = await pool.query(`
+      UPDATE transceivers
+      SET category = 'NonTransceiver',
+          price_verified = false,
+          image_verified = false,
+          details_verified = false,
+          competitor_verified = false,
+          fully_verified = false,
+          competitor_status = 'needs_research',
+          competitor_status_updated_at = NOW(),
+          notes = CASE
+            WHEN COALESCE(notes, '') ILIKE '%' || $2::text || '%' THEN notes
+            ELSE CONCAT_WS(E'\\n', NULLIF(notes, ''), $2::text)
+          END,
+          updated_at = NOW()
+      WHERE id = $1
+        AND COALESCE(category, '') != 'NonTransceiver'
+      RETURNING id
+    `, [row.alias_id, reason]);
+
+    if ((update.rowCount ?? 0) > 0) updated++;
+  }
+
+  logger.info("FS.com numeric SKU alias quarantine complete", { updated });
+}
+
+// CLI entry point: run once, close the pool, and exit non-zero on failure.
+if (require.main === module) {
+  quarantineFsSkuAliases()
+    .then(() => pool.end())
+    .catch(async (err) => {
+      logger.error("FS.com numeric SKU alias quarantine failed", { error: (err as Error).message });
+      try {
+        // Await the shutdown so it actually runs before the process exits.
+        await pool.end();
+      } catch {
+        // Best-effort cleanup: the original failure is already logged and the
+        // exit code below reports it (the pool may also already be ended).
+      }
+      process.exit(1);
+    });
+}
diff --git a/sync/CURRENT.md b/sync/CURRENT.md index a57882c..8133503 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,51 @@ # Current TIP Sync State -Updated: 2026-05-09 21:24 UTC +Updated: 2026-05-09 21:33 UTC ## Newest Work +- TIP FS.com SKU alias cleanup on 2026-05-09: + - added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts` + - script: `pnpm -C packages/scraper run verify:fs:sku-aliases` + - default mode is dry-run + - apply mode requires `FS_SKU_ALIAS_APPLY=1` + - purpose: + - remove duplicate active FS.com numeric SKU rows such as `FS-380881` when the same FS URL already has the real product P/N row such as `OSFP-DR8-1.6T-FL` + - prevent numeric FS SKU aliases from becoming false competitor or no-match candidates + - safe gates: + - alias must match `^FS-[0-9]+$` + - same normalized FS product URL must have a non-numeric canonical product row + - canonical row must already have price, image, and details verified + - live Erik run: + - dry-run found `109` candidates + - apply quarantined `109` + - evidence ledger wrote `109` `artifact_quarantine` records from `verify:fs:sku-aliases` + - active numeric-SKU duplicates with canonical product row after run: `0` + - specific user-reported FS.com 1.6T case: + - numeric shadow rows `FS-380881` and `FS-380883` were duplicate aliases + - canonical rows remain active: + - `OSFP-DR8-1.6T-FL` + - `OSFP-2FR4-1.6T-FL` + - this preserves both the 500m DR8 and the 2km FR4 product instead of treating the numeric SKU as a separate transceiver + - live health after reconcile/matcher: + - active products: `17305` + - price verified: `11414` + - image verified: 
`12016` + - details verified: `16705` + - fully verified: `10448` + - active competitor status: + - `matched=10775` + - `no_valid_match=73` + - `ambiguous=192` + - `needs_research=6265` + - fully populated product rows still needing competitor research: + - `Flexoptix=359` + - `FS.COM=4` + - `ATGBICS=2` + - FS.com and Flexoptix no-valid-match dry-runs now both return `0`; remaining cases need real candidate research/normalization, not no-match closure + - TIPLLM training pool: + - appended lesson for FS.com numeric SKU alias quarantine + - TIP no-valid-competitor resolver on 2026-05-09: - added `packages/scraper/src/utils/resolve-no-valid-competitor.ts` - script: `pnpm -C packages/scraper run verify:no-valid-competitor` diff --git a/sync/history/2026-05-09-tip-fs-sku-alias-quarantine.md b/sync/history/2026-05-09-tip-fs-sku-alias-quarantine.md new file mode 100644 index 0000000..c8e985c --- /dev/null +++ b/sync/history/2026-05-09-tip-fs-sku-alias-quarantine.md @@ -0,0 +1,76 @@ +# 2026-05-09 — TIP FS.com Numeric SKU Alias Quarantine + +## Problem + +FS.com pages expose two identifiers: + +- marketplace SKU, for example `FS-380881` +- real optical product P/N, for example `OSFP-DR8-1.6T-FL` + +Older scraper passes created active duplicate rows for both. This polluted equivalence research because numeric SKU rows looked like separate transceivers. + +The user-reported 1.6T case confirmed the issue: + +- `FS-380881` is the numeric SKU alias for `OSFP-DR8-1.6T-FL` (500m) +- `FS-380883` is the numeric SKU alias for `OSFP-2FR4-1.6T-FL` (2km) +- both real product P/N rows must remain in TIP +- the numeric aliases must not be treated as independent products + +## Change + +Added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`. 
+ +Script: + +```bash +pnpm -C packages/scraper run verify:fs:sku-aliases +``` + +Apply mode: + +```bash +FS_SKU_ALIAS_APPLY=1 pnpm -C packages/scraper run verify:fs:sku-aliases +``` + +## Safety Gates + +A row is quarantined only when: + +- vendor is `FS.COM` +- part number matches `^FS-[0-9]+$` +- the same normalized FS product URL has a canonical non-numeric product P/N row +- the canonical row already has price, image, and details verified + +The robot sets the numeric alias row to `category='NonTransceiver'`, clears verification flags, and writes `artifact_quarantine` evidence. + +## Live Result + +On Erik: + +- dry-run found `109` candidates +- apply quarantined `109` +- evidence ledger wrote `109` `artifact_quarantine` records +- active numeric-SKU duplicates with canonical product row after run: `0` + +Post-run reconcile and matcher completed. + +Live health after cleanup: + +- active products: `17305` +- price verified: `11414` +- image verified: `12016` +- details verified: `16705` +- fully verified: `10448` +- competitor status: + - `matched=10775` + - `no_valid_match=73` + - `ambiguous=192` + - `needs_research=6265` + +Fully populated products still needing competitor research: + +- `Flexoptix=359` +- `FS.COM=4` +- `ATGBICS=2` + +FS.com and Flexoptix no-valid-match dry-runs now both return `0`, so the remaining rows need true candidate research or normalization, not blind no-match closure. diff --git a/training-data/tip-llm-capabilities-v1.jsonl b/training-data/tip-llm-capabilities-v1.jsonl index e026586..d2c1dd9 100644 --- a/training-data/tip-llm-capabilities-v1.jsonl +++ b/training-data/tip-llm-capabilities-v1.jsonl @@ -41,3 +41,4 @@ {"id":"tip-20260509-no-valid-competitor-resolver-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. 
Prefer conservative deterministic verification over fake equivalence."},{"role":"user","content":"When may TIP mark a product's competitor requirement as resolved even though no 1:1 competitor product exists?"},{"role":"assistant","content":"Only use `competitor_status=no_valid_match` after deterministic research proves the absence of a strict source-backed competitor candidate. Require price_verified, image_verified, and details_verified first. Exclude artifacts and non-transceiver categories. Search other vendors for candidates with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it. Also require no pending or approved equivalence above confidence 0.50. When these gates pass, set competitor_verified true, store no_match_reason/no_match_verified_at, write a `competitor_no_match` evidence record, then recalculate fully_verified."}]} {"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. 
After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]} {"id":"tip-20260509-ambiguous-equivalence-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Ambiguity is a valid safety outcome, not a failure."},{"role":"user","content":"What should TIP do when the matcher finds possible competitor candidates but cannot prove a safe 1:1 equivalence?"},{"role":"assistant","content":"Set `competitor_status=ambiguous` and keep the product out of fully_verified. Do not choose the cheapest or visually similar candidate. Ambiguous means there is enough evidence to reject `no_valid_match`, but not enough evidence to approve `matched`. Typical causes include breakout vs non-breakout, AOC/DAC vs optical module, reach mismatch, MMF vs SMF mismatch, wavelength mismatch, connector/channel mismatch, or too many candidate products in the same broad class. Ambiguous rows need targeted vendor research or stricter extractor normalization."}]} +{"id":"tip-20260509-fs-sku-alias-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Distinguish vendor marketplace SKUs from real optical product part numbers."},{"role":"user","content":"How should TIP handle FS.com rows like FS-380881 when the same product URL also has OSFP-DR8-1.6T-FL?"},{"role":"assistant","content":"Treat the numeric `FS-#####` row as a marketplace SKU alias, not as an independent transceiver, when the same normalized FS.com product URL already has a canonical non-numeric product P/N row with price, image, and details verified. Quarantine the numeric alias from the active transceiver base, clear verification flags, and write `artifact_quarantine` evidence that points to the canonical row. Keep the real product P/N active. 
For the 1.6T FS.com case, preserve both `OSFP-DR8-1.6T-FL` (500m) and `OSFP-2FR4-1.6T-FL` (2km), and do not classify `FS-380881` or `FS-380883` as separate products or no-valid-match products."}]}