fix: quarantine fs numeric sku aliases
This commit is contained in:
parent
7b8e229cf0
commit
fb9db56617
@ -16,6 +16,7 @@
|
||||
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
||||
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
|
||||
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
|
||||
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts",
|
||||
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
|
||||
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
||||
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
||||
|
||||
145
packages/scraper/src/utils/quarantine-fs-sku-aliases.ts
Normal file
145
packages/scraper/src/utils/quarantine-fs-sku-aliases.ts
Normal file
@ -0,0 +1,145 @@
|
||||
/**
|
||||
* FS.com numeric SKU alias quarantine.
|
||||
*
|
||||
* FS.com pages expose both a marketplace SKU (FS-123456) and a real optical
|
||||
* product P/N (for example OSFP-DR8-1.6T-FL). Older scraper passes created
|
||||
* duplicate active rows for the numeric SKU. Those rows distort equivalence
|
||||
* research because the real product row already exists for the same URL.
|
||||
*
|
||||
* Default is dry-run (logs at most 40 candidates). Set FS_SKU_ALIAS_APPLY=1 to write quarantine/evidence; FS_SKU_ALIAS_LIMIT caps the rows selected per run (default 500).
|
||||
*/
|
||||
import { pool, recordVerificationEvidence } from "./db";
|
||||
import { logger } from "./logger";
|
||||
|
||||
/**
 * One row returned by the alias-selection query in quarantineFsSkuAliases:
 * a numeric-SKU row paired with the canonical row chosen for the same
 * normalized FS product URL.
 *
 * Kept as a `type` alias (object literal type) because it is passed as the
 * row type argument of `pool.query<AliasCandidate>`.
 */
type AliasCandidate = {
|
||||
/** `id` of the numeric-SKU row; this is the row that gets quarantined. */
alias_id: string;
|
||||
/** Part number of the alias row; the query only selects values matching `^FS-[0-9]+$`. */
alias_part_number: string;
|
||||
/** `product_page_url` of the alias row as stored (not the normalized form used for the join). */
alias_url: string;
|
||||
/** `id` of the canonical row that shares the alias row's normalized URL. */
canonical_id: string;
|
||||
/** Part number of the canonical row; the query requires it NOT to match `^FS-[0-9]+$`. */
canonical_part_number: string;
|
||||
/** `product_page_url` of the canonical row as stored. */
canonical_url: string;
|
||||
};
|
||||
|
||||
/**
 * Reports whether the script should write changes.
 *
 * Only the exact string "1" in FS_SKU_ALIAS_APPLY enables apply mode; an
 * unset variable or any other value leaves the script in dry-run.
 */
function applyMode(): boolean {
  const { FS_SKU_ALIAS_APPLY: applyFlag } = process.env;
  return applyFlag === "1";
}
|
||||
|
||||
/**
 * Quarantines FS.com rows whose part number is a numeric marketplace SKU
 * (`FS-<digits>`) when another FS.com row with a non-numeric part number and
 * price/image/details all verified exists for the same normalized product URL.
 *
 * Environment:
 * - FS_SKU_ALIAS_LIMIT: maximum number of alias rows selected per run
 *   (default 500, minimum 1; unparsable values fall back to the default).
 * - FS_SKU_ALIAS_APPLY: see applyMode(); anything but "1" is a dry-run.
 *
 * Dry-run: logs the candidate count and up to 40 candidates, writes nothing.
 * Apply: for each candidate, calls recordVerificationEvidence with an
 * `artifact_quarantine` entry pointing at the canonical row, then updates the
 * alias row (category 'NonTransceiver', verification flags cleared,
 * competitor_status 'needs_research', reason appended to notes).
 *
 * Rejects with whatever pool.query / recordVerificationEvidence reject with;
 * the evidence write and the UPDATE are not wrapped in a transaction.
 */
async function quarantineFsSkuAliases(): Promise<void> {
  const defaultLimit = 500;
  const dryRunPreviewRows = 40;

  // parseInt yields NaN for non-numeric input and Math.max(1, NaN) is NaN,
  // which would be bound to LIMIT $1. Fall back to the default instead.
  const parsedLimit = Number.parseInt(process.env.FS_SKU_ALIAS_LIMIT || String(defaultLimit), 10);
  const limit = Number.isNaN(parsedLimit) ? defaultLimit : Math.max(1, parsedLimit);
  const apply = applyMode();

  logger.info("=== FS.com numeric SKU alias quarantine ===", { limit, apply });

  // norm_url strips the query string and collapses the duplicated /de/de/
  // locale prefix so alias and canonical rows for one product page compare equal.
  // numeric_alias skips rows that are already 'NonTransceiver': the UPDATE below
  // would not touch them, so selecting them only inflated the candidate count,
  // consumed the LIMIT window and produced repeat evidence calls on re-runs.
  const result = await pool.query<AliasCandidate>(`
    WITH fs AS (
      SELECT
        t.*,
        regexp_replace(
          regexp_replace(COALESCE(t.product_page_url, ''), '\\?.*$', ''),
          '^https://www\\.fs\\.com/de/de/',
          'https://www.fs.com/de/'
        ) AS norm_url
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE v.name = 'FS.COM'
        AND COALESCE(t.product_page_url, '') ~* 'fs\\.com/.*/products/[0-9]+\\.html'
    ),
    numeric_alias AS (
      SELECT *
      FROM fs
      WHERE part_number ~ '^FS-[0-9]+$'
        AND COALESCE(data_confidence, 'unknown') != 'garbage'
        AND COALESCE(category, '') != 'NonTransceiver'
    ),
    canonical AS (
      SELECT *
      FROM fs
      WHERE part_number !~ '^FS-[0-9]+$'
        AND COALESCE(data_confidence, 'unknown') != 'garbage'
        AND price_verified = true
        AND image_verified = true
        AND details_verified = true
    )
    SELECT DISTINCT ON (n.id)
      n.id AS alias_id,
      n.part_number AS alias_part_number,
      n.product_page_url AS alias_url,
      c.id AS canonical_id,
      c.part_number AS canonical_part_number,
      c.product_page_url AS canonical_url
    FROM numeric_alias n
    JOIN canonical c ON c.norm_url = n.norm_url
    ORDER BY n.id, c.fully_verified DESC, c.updated_at DESC
    LIMIT $1
  `, [limit]);

  logger.info("FS.com numeric SKU alias candidates", { count: result.rowCount ?? 0 });

  if (!apply) {
    // Dry-run: show a bounded preview and leave the database untouched.
    for (const row of result.rows.slice(0, dryRunPreviewRows)) {
      logger.info("dry-run alias", {
        alias: row.alias_part_number,
        canonical: row.canonical_part_number,
        url: row.alias_url,
      });
    }
    return;
  }

  let updated = 0;
  for (const row of result.rows) {
    const reason = [
      "FS.com numeric marketplace SKU alias",
      `${row.alias_part_number} duplicates canonical product P/N ${row.canonical_part_number}`,
      "for the same normalized FS product URL",
    ].join("; ");

    await recordVerificationEvidence({
      transceiverId: row.alias_id,
      verificationType: "artifact_quarantine",
      sourceUrl: row.alias_url,
      evidenceValue: {
        reason,
        aliasPartNumber: row.alias_part_number,
        canonicalTransceiverId: row.canonical_id,
        canonicalPartNumber: row.canonical_part_number,
        canonicalUrl: row.canonical_url,
      },
      robotName: "verify:fs:sku-aliases",
      confidence: 1,
    });

    // The category guard keeps the UPDATE a no-op if the row was quarantined
    // between the SELECT and now; the notes CASE avoids appending the same
    // reason twice.
    const update = await pool.query(`
      UPDATE transceivers
      SET category = 'NonTransceiver',
          price_verified = false,
          image_verified = false,
          details_verified = false,
          competitor_verified = false,
          fully_verified = false,
          competitor_status = 'needs_research',
          competitor_status_updated_at = NOW(),
          notes = CASE
            WHEN COALESCE(notes, '') ILIKE '%' || $2::text || '%' THEN notes
            ELSE CONCAT_WS(E'\\n', NULLIF(notes, ''), $2::text)
          END,
          updated_at = NOW()
      WHERE id = $1
        AND COALESCE(category, '') != 'NonTransceiver'
      RETURNING id
    `, [row.alias_id, reason]);

    if ((update.rowCount ?? 0) > 0) updated++;
  }

  logger.info("FS.com numeric SKU alias quarantine complete", { updated });
}
|
||||
|
||||
// Run only when this file is executed directly, not when it is imported.
if (require.main === module) {
  quarantineFsSkuAliases()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      // Rejections are not guaranteed to be Error instances; narrow instead of casting.
      const message = err instanceof Error ? err.message : String(err);
      logger.error("FS.com numeric SKU alias quarantine failed", { error: message });
      try {
        // Wait for the pool to close before exiting instead of leaving a
        // floating call that process.exit() cuts short.
        await pool.end();
      } catch {
        // Best-effort shutdown: the pool may already be ended or broken.
      }
      process.exit(1);
    });
}
|
||||
@ -1,9 +1,51 @@
|
||||
# Current TIP Sync State
|
||||
|
||||
Updated: 2026-05-09 21:24 UTC
|
||||
Updated: 2026-05-09 21:33 UTC
|
||||
|
||||
## Newest Work
|
||||
|
||||
- TIP FS.com SKU alias cleanup on 2026-05-09:
|
||||
- added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`
|
||||
- script: `pnpm -C packages/scraper run verify:fs:sku-aliases`
|
||||
- default mode is dry-run
|
||||
- apply mode requires `FS_SKU_ALIAS_APPLY=1`
|
||||
- purpose:
|
||||
- remove duplicate active FS.com numeric SKU rows such as `FS-380881` when the same FS URL already has the real product P/N row such as `OSFP-DR8-1.6T-FL`
|
||||
- prevent numeric FS SKU aliases from becoming false competitor or no-match candidates
|
||||
- safe gates:
|
||||
- alias must match `^FS-[0-9]+$`
|
||||
- same normalized FS product URL must have a non-numeric canonical product row
|
||||
- canonical row must already have price, image, and details verified
|
||||
- live Erik run:
|
||||
- dry-run found `109` candidates
|
||||
- apply quarantined `109`
|
||||
- evidence ledger wrote `109` `artifact_quarantine` records from `verify:fs:sku-aliases`
|
||||
- active numeric-SKU duplicates with canonical product row after run: `0`
|
||||
- specific user-reported FS.com 1.6T case:
|
||||
- numeric shadow rows `FS-380881` and `FS-380883` were duplicate aliases
|
||||
- canonical rows remain active:
|
||||
- `OSFP-DR8-1.6T-FL`
|
||||
- `OSFP-2FR4-1.6T-FL`
|
||||
- this preserves both the 500m DR8 and the 2km FR4 product instead of treating the numeric SKU as a separate transceiver
|
||||
- live health after reconcile/matcher:
|
||||
- active products: `17305`
|
||||
- price verified: `11414`
|
||||
- image verified: `12016`
|
||||
- details verified: `16705`
|
||||
- fully verified: `10448`
|
||||
- active competitor status:
|
||||
- `matched=10775`
|
||||
- `no_valid_match=73`
|
||||
- `ambiguous=192`
|
||||
- `needs_research=6265`
|
||||
- fully populated product rows still needing competitor research:
|
||||
- `Flexoptix=359`
|
||||
- `FS.COM=4`
|
||||
- `ATGBICS=2`
|
||||
- FS.com and Flexoptix no-valid-match dry-runs now both return `0`; remaining cases need real candidate research/normalization, not no-match closure
|
||||
- TIPLLM training pool:
|
||||
- appended lesson for FS.com numeric SKU alias quarantine
|
||||
|
||||
- TIP no-valid-competitor resolver on 2026-05-09:
|
||||
- added `packages/scraper/src/utils/resolve-no-valid-competitor.ts`
|
||||
- script: `pnpm -C packages/scraper run verify:no-valid-competitor`
|
||||
|
||||
76
sync/history/2026-05-09-tip-fs-sku-alias-quarantine.md
Normal file
76
sync/history/2026-05-09-tip-fs-sku-alias-quarantine.md
Normal file
@ -0,0 +1,76 @@
|
||||
# 2026-05-09 — TIP FS.com Numeric SKU Alias Quarantine
|
||||
|
||||
## Problem
|
||||
|
||||
FS.com pages expose two identifiers:
|
||||
|
||||
- marketplace SKU, for example `FS-380881`
|
||||
- real optical product P/N, for example `OSFP-DR8-1.6T-FL`
|
||||
|
||||
Older scraper passes created active duplicate rows for both. This polluted equivalence research because numeric SKU rows looked like separate transceivers.
|
||||
|
||||
The user-reported 1.6T case confirmed the issue:
|
||||
|
||||
- `FS-380881` is the numeric SKU alias for `OSFP-DR8-1.6T-FL` (500m)
|
||||
- `FS-380883` is the numeric SKU alias for `OSFP-2FR4-1.6T-FL` (2km)
|
||||
- both real product P/N rows must remain in TIP
|
||||
- the numeric aliases must not be treated as independent products
|
||||
|
||||
## Change
|
||||
|
||||
Added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`.
|
||||
|
||||
Script:
|
||||
|
||||
```bash
|
||||
pnpm -C packages/scraper run verify:fs:sku-aliases
|
||||
```
|
||||
|
||||
Apply mode:
|
||||
|
||||
```bash
|
||||
FS_SKU_ALIAS_APPLY=1 pnpm -C packages/scraper run verify:fs:sku-aliases
|
||||
```
|
||||
|
||||
## Safety Gates
|
||||
|
||||
A row is quarantined only when:
|
||||
|
||||
- vendor is `FS.COM`
|
||||
- part number matches `^FS-[0-9]+$`
|
||||
- the same normalized FS product URL has a canonical non-numeric product P/N row
|
||||
- the canonical row already has price, image, and details verified
|
||||
|
||||
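The robot sets the numeric alias row to `category='NonTransceiver'`, clears the verification flags, resets `competitor_status` to `needs_research`, appends the quarantine reason to `notes`, and writes `artifact_quarantine` evidence that points to the canonical row.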
|
||||
|
||||
## Live Result
|
||||
|
||||
On Erik:
|
||||
|
||||
- dry-run found `109` candidates
|
||||
- apply quarantined `109`
|
||||
- evidence ledger wrote `109` `artifact_quarantine` records
|
||||
- active numeric-SKU duplicates with canonical product row after run: `0`
|
||||
|
||||
Post-run reconcile and matcher completed.
|
||||
|
||||
Live health after cleanup:
|
||||
|
||||
- active products: `17305`
|
||||
- price verified: `11414`
|
||||
- image verified: `12016`
|
||||
- details verified: `16705`
|
||||
- fully verified: `10448`
|
||||
- competitor status:
|
||||
- `matched=10775`
|
||||
- `no_valid_match=73`
|
||||
- `ambiguous=192`
|
||||
- `needs_research=6265`
|
||||
|
||||
Fully populated products still needing competitor research:
|
||||
|
||||
- `Flexoptix=359`
|
||||
- `FS.COM=4`
|
||||
- `ATGBICS=2`
|
||||
|
||||
FS.com and Flexoptix no-valid-match dry-runs now both return `0`, so the remaining rows need true candidate research or normalization, not blind no-match closure.
|
||||
@ -41,3 +41,4 @@
|
||||
{"id":"tip-20260509-no-valid-competitor-resolver-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Prefer conservative deterministic verification over fake equivalence."},{"role":"user","content":"When may TIP mark a product's competitor requirement as resolved even though no 1:1 competitor product exists?"},{"role":"assistant","content":"Only use `competitor_status=no_valid_match` after deterministic research proves the absence of a strict source-backed competitor candidate. Require price_verified, image_verified, and details_verified first. Exclude artifacts and non-transceiver categories. Search other vendors for candidates with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it. Also require no pending or approved equivalence above confidence 0.50. When these gates pass, set competitor_verified true, store no_match_reason/no_match_verified_at, write a `competitor_no_match` evidence record, then recalculate fully_verified."}]}
|
||||
{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}
|
||||
{"id":"tip-20260509-ambiguous-equivalence-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Ambiguity is a valid safety outcome, not a failure."},{"role":"user","content":"What should TIP do when the matcher finds possible competitor candidates but cannot prove a safe 1:1 equivalence?"},{"role":"assistant","content":"Set `competitor_status=ambiguous` and keep the product out of fully_verified. Do not choose the cheapest or visually similar candidate. Ambiguous means there is enough evidence to reject `no_valid_match`, but not enough evidence to approve `matched`. Typical causes include breakout vs non-breakout, AOC/DAC vs optical module, reach mismatch, MMF vs SMF mismatch, wavelength mismatch, connector/channel mismatch, or too many candidate products in the same broad class. Ambiguous rows need targeted vendor research or stricter extractor normalization."}]}
|
||||
{"id":"tip-20260509-fs-sku-alias-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Distinguish vendor marketplace SKUs from real optical product part numbers."},{"role":"user","content":"How should TIP handle FS.com rows like FS-380881 when the same product URL also has OSFP-DR8-1.6T-FL?"},{"role":"assistant","content":"Treat the numeric `FS-#####` row as a marketplace SKU alias, not as an independent transceiver, when the same normalized FS.com product URL already has a canonical non-numeric product P/N row with price, image, and details verified. Quarantine the numeric alias from the active transceiver base, clear verification flags, and write `artifact_quarantine` evidence that points to the canonical row. Keep the real product P/N active. For the 1.6T FS.com case, preserve both `OSFP-DR8-1.6T-FL` (500m) and `OSFP-2FR4-1.6T-FL` (2km), and do not classify `FS-380881` or `FS-380883` as separate products or no-valid-match products."}]}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user