fix: quarantine fs numeric sku aliases

This commit is contained in:
Rene Fichtmueller 2026-05-09 23:35:01 +02:00
parent 7b8e229cf0
commit fb9db56617
5 changed files with 266 additions and 1 deletions

View File

@ -16,6 +16,7 @@
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts",
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
"scrape:optcore": "tsx src/scrapers/optcore.ts",

View File

@ -0,0 +1,145 @@
/**
* FS.com numeric SKU alias quarantine.
*
* FS.com pages expose both a marketplace SKU (FS-123456) and a real optical
* product P/N (for example OSFP-DR8-1.6T-FL). Older scraper passes created
* duplicate active rows for the numeric SKU. Those rows distort equivalence
* research because the real product row already exists for the same URL.
*
* Default is dry-run. Set FS_SKU_ALIAS_APPLY=1 to write quarantine/evidence.
* FS_SKU_ALIAS_LIMIT caps the number of alias candidates per run (default 500).
*/
import { pool, recordVerificationEvidence } from "./db";
import { logger } from "./logger";
/**
 * One row returned by the alias-candidate query: a numeric-SKU row ("alias")
 * paired with the product P/N row ("canonical") that shares the same
 * normalized FS.com product URL.
 */
type AliasCandidate = {
// id of the transceivers row whose part_number matches ^FS-[0-9]+$
alias_id: string;
// the numeric marketplace SKU stored as that row's part_number
alias_part_number: string;
// product_page_url of the alias row (un-normalized, as stored)
alias_url: string;
// id of the matching transceivers row that carries the real product P/N
canonical_id: string;
// part_number of the canonical row
canonical_part_number: string;
// product_page_url of the canonical row (un-normalized, as stored)
canonical_url: string;
};
/**
 * Reports whether the run should write to the database.
 *
 * Only the exact value "1" in FS_SKU_ALIAS_APPLY enables apply mode; any
 * other value, or an unset variable, leaves the run in dry-run mode.
 */
function applyMode(): boolean {
  const flag = process.env.FS_SKU_ALIAS_APPLY;
  return flag === "1";
}
/**
 * Finds FS.COM rows whose part number is only the numeric marketplace SKU
 * (`FS-<digits>`) and which share a normalized product URL with a verified
 * row carrying a non-numeric part number, then (in apply mode) quarantines
 * the numeric rows.
 *
 * Environment:
 * - FS_SKU_ALIAS_LIMIT: maximum number of alias candidates per run. Defaults
 *   to 500; values below 1 are raised to 1; non-numeric values fall back to 500.
 * - FS_SKU_ALIAS_APPLY: see applyMode(). Without it the function only logs
 *   up to 40 candidates and returns without writing.
 *
 * In apply mode, for every candidate it first records an
 * `artifact_quarantine` evidence entry pointing at the canonical row and then
 * updates the alias row (category, verification flags, competitor status,
 * notes). Rows are processed sequentially; the two writes are not wrapped in
 * a transaction here.
 *
 * @returns resolves when the dry-run listing or all updates have finished.
 * @throws whatever the query or evidence helper rejects with.
 */
async function quarantineFsSkuAliases(): Promise<void> {
  // parseInt yields NaN for non-numeric input and Math.max(1, NaN) is NaN,
  // which would be bound to LIMIT and fail the query. Fall back to the default.
  const parsedLimit = Number.parseInt(process.env.FS_SKU_ALIAS_LIMIT || "500", 10);
  const limit = Number.isNaN(parsedLimit) ? 500 : Math.max(1, parsedLimit);
  const apply = applyMode();

  logger.info("=== FS.com numeric SKU alias quarantine ===", { limit, apply });

  // norm_url strips the query string and collapses the doubled /de/de/ locale
  // prefix so alias and canonical rows for the same page compare equal.
  // numeric_alias skips rows already in category 'NonTransceiver': the UPDATE
  // below would not touch them, but evidence is written before the UPDATE, so
  // selecting them would add duplicate evidence on every re-run and consume
  // the LIMIT budget.
  const result = await pool.query<AliasCandidate>(`
    WITH fs AS (
      SELECT
        t.*,
        regexp_replace(
          regexp_replace(COALESCE(t.product_page_url, ''), '\\?.*$', ''),
          '^https://www\\.fs\\.com/de/de/',
          'https://www.fs.com/de/'
        ) AS norm_url
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE v.name = 'FS.COM'
        AND COALESCE(t.product_page_url, '') ~* 'fs\\.com/.*/products/[0-9]+\\.html'
    ),
    numeric_alias AS (
      SELECT *
      FROM fs
      WHERE part_number ~ '^FS-[0-9]+$'
        AND COALESCE(data_confidence, 'unknown') != 'garbage'
        AND COALESCE(category, '') != 'NonTransceiver'
    ),
    canonical AS (
      SELECT *
      FROM fs
      WHERE part_number !~ '^FS-[0-9]+$'
        AND COALESCE(data_confidence, 'unknown') != 'garbage'
        AND price_verified = true
        AND image_verified = true
        AND details_verified = true
    )
    SELECT DISTINCT ON (n.id)
      n.id AS alias_id,
      n.part_number AS alias_part_number,
      n.product_page_url AS alias_url,
      c.id AS canonical_id,
      c.part_number AS canonical_part_number,
      c.product_page_url AS canonical_url
    FROM numeric_alias n
    JOIN canonical c ON c.norm_url = n.norm_url
    ORDER BY n.id, c.fully_verified DESC, c.updated_at DESC
    LIMIT $1
  `, [limit]);

  logger.info("FS.com numeric SKU alias candidates", { count: result.rowCount ?? 0 });

  if (!apply) {
    // Dry-run: show a bounded sample so the log stays readable.
    for (const row of result.rows.slice(0, 40)) {
      logger.info("dry-run alias", {
        alias: row.alias_part_number,
        canonical: row.canonical_part_number,
        url: row.alias_url,
      });
    }
    return;
  }

  let updated = 0;
  for (const row of result.rows) {
    const reason = [
      "FS.com numeric marketplace SKU alias",
      `${row.alias_part_number} duplicates canonical product P/N ${row.canonical_part_number}`,
      "for the same normalized FS product URL",
    ].join("; ");

    // Evidence goes first so a quarantined row always has a ledger entry.
    await recordVerificationEvidence({
      transceiverId: row.alias_id,
      verificationType: "artifact_quarantine",
      sourceUrl: row.alias_url,
      evidenceValue: {
        reason,
        aliasPartNumber: row.alias_part_number,
        canonicalTransceiverId: row.canonical_id,
        canonicalPartNumber: row.canonical_part_number,
        canonicalUrl: row.canonical_url,
      },
      robotName: "verify:fs:sku-aliases",
      confidence: 1,
    });

    // The category guard stays as a second line of defence in case another
    // run quarantined the row between the SELECT and this UPDATE. The notes
    // CASE appends the reason only when it is not already present.
    const update = await pool.query(`
      UPDATE transceivers
      SET category = 'NonTransceiver',
          price_verified = false,
          image_verified = false,
          details_verified = false,
          competitor_verified = false,
          fully_verified = false,
          competitor_status = 'needs_research',
          competitor_status_updated_at = NOW(),
          notes = CASE
            WHEN COALESCE(notes, '') ILIKE '%' || $2::text || '%' THEN notes
            ELSE CONCAT_WS(E'\\n', NULLIF(notes, ''), $2::text)
          END,
          updated_at = NOW()
      WHERE id = $1
        AND COALESCE(category, '') != 'NonTransceiver'
      RETURNING id
    `, [row.alias_id, reason]);

    if ((update.rowCount ?? 0) > 0) updated++;
  }

  logger.info("FS.com numeric SKU alias quarantine complete", { updated });
}
// CLI entry point: runs only when this file is executed directly, not when
// it is imported by another module.
if (require.main === module) {
  quarantineFsSkuAliases()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      // Rejections are not guaranteed to be Error instances; narrow before
      // reading .message so the log never carries `undefined`.
      const message = err instanceof Error ? err.message : String(err);
      logger.error("FS.com numeric SKU alias quarantine failed", { error: message });
      // Wait for the pool to shut down before forcing the exit. A failure
      // here (for example the pool was already ended above) is ignored
      // because the process is exiting with a non-zero status anyway.
      await pool.end().catch(() => undefined);
      process.exit(1);
    });
}

View File

@ -1,9 +1,51 @@
# Current TIP Sync State
Updated: 2026-05-09 21:24 UTC
Updated: 2026-05-09 21:33 UTC
## Newest Work
- TIP FS.com SKU alias cleanup on 2026-05-09:
- added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`
- script: `pnpm -C packages/scraper run verify:fs:sku-aliases`
- default mode is dry-run
- apply mode requires `FS_SKU_ALIAS_APPLY=1`
- purpose:
- remove duplicate active FS.com numeric SKU rows such as `FS-380881` when the same FS URL already has the real product P/N row such as `OSFP-DR8-1.6T-FL`
- prevent numeric FS SKU aliases from becoming false competitor or no-match candidates
- safe gates:
- alias must match `^FS-[0-9]+$`
- same normalized FS product URL must have a non-numeric canonical product row
- canonical row must already have price, image, and details verified
- live Erik run:
- dry-run found `109` candidates
- apply quarantined `109`
- evidence ledger wrote `109` `artifact_quarantine` records from `verify:fs:sku-aliases`
- active numeric-SKU duplicates with canonical product row after run: `0`
- specific user-reported FS.com 1.6T case:
- numeric shadow rows `FS-380881` and `FS-380883` were duplicate aliases
- canonical rows remain active:
- `OSFP-DR8-1.6T-FL`
- `OSFP-2FR4-1.6T-FL`
- this preserves both the 500m DR8 and the 2km FR4 product instead of treating the numeric SKU as a separate transceiver
- live health after reconcile/matcher:
- active products: `17305`
- price verified: `11414`
- image verified: `12016`
- details verified: `16705`
- fully verified: `10448`
- active competitor status:
- `matched=10775`
- `no_valid_match=73`
- `ambiguous=192`
- `needs_research=6265`
- fully populated product rows still needing competitor research:
- `Flexoptix=359`
- `FS.COM=4`
- `ATGBICS=2`
- FS.com and Flexoptix no-valid-match dry-runs now both return `0`; remaining cases need real candidate research/normalization, not no-match closure
- TIPLLM training pool:
- appended lesson for FS.com numeric SKU alias quarantine
- TIP no-valid-competitor resolver on 2026-05-09:
- added `packages/scraper/src/utils/resolve-no-valid-competitor.ts`
- script: `pnpm -C packages/scraper run verify:no-valid-competitor`

View File

@ -0,0 +1,76 @@
# 2026-05-09 — TIP FS.com Numeric SKU Alias Quarantine
## Problem
FS.com pages expose two identifiers:
- marketplace SKU, for example `FS-380881`
- real optical product P/N, for example `OSFP-DR8-1.6T-FL`
Older scraper passes created active duplicate rows for both. This polluted equivalence research because numeric SKU rows looked like separate transceivers.
The user-reported 1.6T case confirmed the issue:
- `FS-380881` is the numeric SKU alias for `OSFP-DR8-1.6T-FL` (500m)
- `FS-380883` is the numeric SKU alias for `OSFP-2FR4-1.6T-FL` (2km)
- both real product P/N rows must remain in TIP
- the numeric aliases must not be treated as independent products
## Change
Added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`.
Script:
```bash
pnpm -C packages/scraper run verify:fs:sku-aliases
```
Apply mode:
```bash
FS_SKU_ALIAS_APPLY=1 pnpm -C packages/scraper run verify:fs:sku-aliases
```
## Safety Gates
A row is quarantined only when:
- vendor is `FS.COM`
- part number matches `^FS-[0-9]+$`
- the same normalized FS product URL has a canonical non-numeric product P/N row
- the canonical row already has price, image, and details verified
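The robot sets the numeric alias row to `category='NonTransceiver'`, clears all verification flags (price, image, details, competitor, fully verified), resets `competitor_status` to `needs_research`, appends the quarantine reason to `notes`, and writes `artifact_quarantine` evidence that points to the canonical row.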
## Live Result
On Erik:
- dry-run found `109` candidates
- apply quarantined `109`
- evidence ledger wrote `109` `artifact_quarantine` records
- active numeric-SKU duplicates with canonical product row after run: `0`
Post-run reconcile and matcher completed.
Live health after cleanup:
- active products: `17305`
- price verified: `11414`
- image verified: `12016`
- details verified: `16705`
- fully verified: `10448`
- competitor status:
- `matched=10775`
- `no_valid_match=73`
- `ambiguous=192`
- `needs_research=6265`
Fully populated products still needing competitor research:
- `Flexoptix=359`
- `FS.COM=4`
- `ATGBICS=2`
FS.com and Flexoptix no-valid-match dry-runs now both return `0`, so the remaining rows need true candidate research or normalization, not blind no-match closure.

View File

@ -41,3 +41,4 @@
{"id":"tip-20260509-no-valid-competitor-resolver-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Prefer conservative deterministic verification over fake equivalence."},{"role":"user","content":"When may TIP mark a product's competitor requirement as resolved even though no 1:1 competitor product exists?"},{"role":"assistant","content":"Only use `competitor_status=no_valid_match` after deterministic research proves the absence of a strict source-backed competitor candidate. Require price_verified, image_verified, and details_verified first. Exclude artifacts and non-transceiver categories. Search other vendors for candidates with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it. Also require no pending or approved equivalence above confidence 0.50. When these gates pass, set competitor_verified true, store no_match_reason/no_match_verified_at, write a `competitor_no_match` evidence record, then recalculate fully_verified."}]}
{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}
{"id":"tip-20260509-ambiguous-equivalence-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Ambiguity is a valid safety outcome, not a failure."},{"role":"user","content":"What should TIP do when the matcher finds possible competitor candidates but cannot prove a safe 1:1 equivalence?"},{"role":"assistant","content":"Set `competitor_status=ambiguous` and keep the product out of fully_verified. Do not choose the cheapest or visually similar candidate. Ambiguous means there is enough evidence to reject `no_valid_match`, but not enough evidence to approve `matched`. Typical causes include breakout vs non-breakout, AOC/DAC vs optical module, reach mismatch, MMF vs SMF mismatch, wavelength mismatch, connector/channel mismatch, or too many candidate products in the same broad class. Ambiguous rows need targeted vendor research or stricter extractor normalization."}]}
{"id":"tip-20260509-fs-sku-alias-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Distinguish vendor marketplace SKUs from real optical product part numbers."},{"role":"user","content":"How should TIP handle FS.com rows like FS-380881 when the same product URL also has OSFP-DR8-1.6T-FL?"},{"role":"assistant","content":"Treat the numeric `FS-#####` row as a marketplace SKU alias, not as an independent transceiver, when the same normalized FS.com product URL already has a canonical non-numeric product P/N row with price, image, and details verified. Quarantine the numeric alias from the active transceiver base, clear verification flags, and write `artifact_quarantine` evidence that points to the canonical row. Keep the real product P/N active. For the 1.6T FS.com case, preserve both `OSFP-DR8-1.6T-FL` (500m) and `OSFP-2FR4-1.6T-FL` (2km), and do not classify `FS-380881` or `FS-380883` as separate products or no-valid-match products."}]}