fix: quarantine fs numeric sku aliases
This commit is contained in:
parent
7b8e229cf0
commit
fb9db56617
@ -16,6 +16,7 @@
|
|||||||
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
|
||||||
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
|
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
|
||||||
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
|
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
|
||||||
|
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts",
|
||||||
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
|
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",
|
||||||
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
"scrape:cisco": "tsx src/scrapers/cisco-tmg.ts",
|
||||||
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
"scrape:optcore": "tsx src/scrapers/optcore.ts",
|
||||||
|
|||||||
145
packages/scraper/src/utils/quarantine-fs-sku-aliases.ts
Normal file
145
packages/scraper/src/utils/quarantine-fs-sku-aliases.ts
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
/**
|
||||||
|
* FS.com numeric SKU alias quarantine.
|
||||||
|
*
|
||||||
|
* FS.com pages expose both a marketplace SKU (FS-123456) and a real optical
|
||||||
|
* product P/N (for example OSFP-DR8-1.6T-FL). Older scraper passes created
|
||||||
|
* duplicate active rows for the numeric SKU. Those rows distort equivalence
|
||||||
|
* research because the real product row already exists for the same URL.
|
||||||
|
*
|
||||||
|
* Default is dry-run. Set FS_SKU_ALIAS_APPLY=1 to write quarantine/evidence.
|
||||||
|
*/
|
||||||
|
import { pool, recordVerificationEvidence } from "./db";
|
||||||
|
import { logger } from "./logger";
|
||||||
|
|
||||||
|
/**
 * One row produced by the alias-detection query: a numeric `FS-<digits>` row
 * paired with the canonical product row that shares its normalized FS.com
 * product URL.
 *
 * Kept as a `type` alias (not an `interface`) so it can be passed as the row
 * type argument of `pool.query<...>()`.
 */
type AliasCandidate = {
  /** `transceivers.id` of the numeric SKU alias row (the row to quarantine). */
  alias_id: string;
  /** Part number of the alias row; the query only selects values matching `^FS-[0-9]+$`. */
  alias_part_number: string;
  /** `product_page_url` stored on the alias row (not normalized). */
  alias_url: string;
  /** `transceivers.id` of the canonical product row chosen for this alias. */
  canonical_id: string;
  /** Part number of the canonical row, e.g. a real optical product P/N. */
  canonical_part_number: string;
  /** `product_page_url` stored on the canonical row (not normalized). */
  canonical_url: string;
};
|
||||||
|
|
||||||
|
/**
 * Reports whether the script should write changes.
 *
 * @returns `true` only when the `FS_SKU_ALIAS_APPLY` environment variable is
 *   exactly the string `"1"`; any other value (or no value) means dry-run.
 */
function applyMode(): boolean {
  return process.env.FS_SKU_ALIAS_APPLY === "1";
}
|
||||||
|
|
||||||
|
/** Row limit used when FS_SKU_ALIAS_LIMIT is unset, empty, or not an integer. */
const DEFAULT_ALIAS_LIMIT = 500;

/**
 * Converts the raw FS_SKU_ALIAS_LIMIT value into a usable positive row limit.
 *
 * @param raw - The environment variable value, if any.
 * @returns The parsed integer clamped to at least 1, or the default when the
 *   value is missing, empty, or does not start with an integer (which would
 *   otherwise turn into `NaN` and be bound to the SQL `LIMIT`).
 */
function resolveAliasLimit(raw: string | undefined): number {
  const parsed = Number.parseInt(raw || String(DEFAULT_ALIAS_LIMIT), 10);
  if (Number.isNaN(parsed)) return DEFAULT_ALIAS_LIMIT;
  return Math.max(1, parsed);
}

/**
 * Finds FS.COM rows whose part number is a numeric `FS-<digits>` SKU and whose
 * normalized product URL is shared with a canonical (non-numeric P/N) row that
 * already has price, image and details verified, then quarantines them.
 *
 * Dry-run (default): logs the candidate count and up to 40 candidates, writes
 * nothing. Apply mode (`FS_SKU_ALIAS_APPLY=1`): for every candidate, records
 * an `artifact_quarantine` evidence entry and then moves the alias row to
 * `category = 'NonTransceiver'` with all verification flags cleared.
 *
 * Rows that are already in `category = 'NonTransceiver'` are not selected, so
 * re-runs do not write evidence again for them and they do not use up the
 * `LIMIT` window ahead of aliases that still need processing.
 *
 * @returns Resolves once all candidates have been logged or processed.
 *   Rejections from `pool.query` or `recordVerificationEvidence` propagate.
 */
async function quarantineFsSkuAliases(): Promise<void> {
  const limit = resolveAliasLimit(process.env.FS_SKU_ALIAS_LIMIT);
  const apply = applyMode();

  logger.info("=== FS.com numeric SKU alias quarantine ===", { limit, apply });

  // norm_url strips the query string and collapses the doubled "/de/de/"
  // locale prefix so alias and canonical rows for the same page compare equal.
  // DISTINCT ON (n.id) keeps one canonical row per alias, preferring fully
  // verified and then most recently updated canonical rows.
  const result = await pool.query<AliasCandidate>(`
    WITH fs AS (
      SELECT
        t.*,
        regexp_replace(
          regexp_replace(COALESCE(t.product_page_url, ''), '\\?.*$', ''),
          '^https://www\\.fs\\.com/de/de/',
          'https://www.fs.com/de/'
        ) AS norm_url
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE v.name = 'FS.COM'
        AND COALESCE(t.product_page_url, '') ~* 'fs\\.com/.*/products/[0-9]+\\.html'
    ),
    numeric_alias AS (
      SELECT *
      FROM fs
      WHERE part_number ~ '^FS-[0-9]+$'
        AND COALESCE(data_confidence, 'unknown') != 'garbage'
        AND COALESCE(category, '') != 'NonTransceiver'
    ),
    canonical AS (
      SELECT *
      FROM fs
      WHERE part_number !~ '^FS-[0-9]+$'
        AND COALESCE(data_confidence, 'unknown') != 'garbage'
        AND price_verified = true
        AND image_verified = true
        AND details_verified = true
    )
    SELECT DISTINCT ON (n.id)
      n.id AS alias_id,
      n.part_number AS alias_part_number,
      n.product_page_url AS alias_url,
      c.id AS canonical_id,
      c.part_number AS canonical_part_number,
      c.product_page_url AS canonical_url
    FROM numeric_alias n
    JOIN canonical c ON c.norm_url = n.norm_url
    ORDER BY n.id, c.fully_verified DESC, c.updated_at DESC
    LIMIT $1
  `, [limit]);

  logger.info("FS.com numeric SKU alias candidates", { count: result.rowCount ?? 0 });

  if (!apply) {
    // Dry-run: show a bounded sample so large candidate sets do not flood the log.
    for (const row of result.rows.slice(0, 40)) {
      logger.info("dry-run alias", {
        alias: row.alias_part_number,
        canonical: row.canonical_part_number,
        url: row.alias_url,
      });
    }
    return;
  }

  let updated = 0;
  for (const row of result.rows) {
    const reason = [
      "FS.com numeric marketplace SKU alias",
      `${row.alias_part_number} duplicates canonical product P/N ${row.canonical_part_number}`,
      "for the same normalized FS product URL",
    ].join("; ");

    // Evidence is written before the row is changed, so the ledger entry
    // exists even if the UPDATE below fails.
    await recordVerificationEvidence({
      transceiverId: row.alias_id,
      verificationType: "artifact_quarantine",
      sourceUrl: row.alias_url,
      evidenceValue: {
        reason,
        aliasPartNumber: row.alias_part_number,
        canonicalTransceiverId: row.canonical_id,
        canonicalPartNumber: row.canonical_part_number,
        canonicalUrl: row.canonical_url,
      },
      robotName: "verify:fs:sku-aliases",
      confidence: 1,
    });

    // The category guard keeps the UPDATE a no-op if the row was quarantined
    // between selection and now; the notes CASE avoids appending the same
    // reason text twice.
    const update = await pool.query(`
      UPDATE transceivers
      SET category = 'NonTransceiver',
          price_verified = false,
          image_verified = false,
          details_verified = false,
          competitor_verified = false,
          fully_verified = false,
          competitor_status = 'needs_research',
          competitor_status_updated_at = NOW(),
          notes = CASE
            WHEN COALESCE(notes, '') ILIKE '%' || $2::text || '%' THEN notes
            ELSE CONCAT_WS(E'\\n', NULLIF(notes, ''), $2::text)
          END,
          updated_at = NOW()
      WHERE id = $1
        AND COALESCE(category, '') != 'NonTransceiver'
      RETURNING id
    `, [row.alias_id, reason]);

    if ((update.rowCount ?? 0) > 0) updated++;
  }

  logger.info("FS.com numeric SKU alias quarantine complete", { updated });
}
|
||||||
|
|
||||||
|
// Script entry point: runs only when this file is executed directly, not when
// it is imported.
if (require.main === module) {
  void (async () => {
    let failed = false;

    try {
      await quarantineFsSkuAliases();
    } catch (err: unknown) {
      failed = true;
      logger.error("FS.com numeric SKU alias quarantine failed", {
        error: err instanceof Error ? err.message : String(err),
      });
    }

    // Close the pool exactly once and wait for it, so the failure path does
    // not call process.exit() while the shutdown is still in flight.
    try {
      await pool.end();
    } catch (err: unknown) {
      failed = true;
      logger.error("FS.com numeric SKU alias quarantine failed", {
        error: err instanceof Error ? err.message : String(err),
      });
    }

    // On success the process is left to exit on its own once the pool is closed.
    if (failed) process.exit(1);
  })();
}
|
||||||
@ -1,9 +1,51 @@
|
|||||||
# Current TIP Sync State
|
# Current TIP Sync State
|
||||||
|
|
||||||
Updated: 2026-05-09 21:24 UTC
|
Updated: 2026-05-09 21:33 UTC
|
||||||
|
|
||||||
## Newest Work
|
## Newest Work
|
||||||
|
|
||||||
|
- TIP FS.com SKU alias cleanup on 2026-05-09:
|
||||||
|
- added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`
|
||||||
|
- script: `pnpm -C packages/scraper run verify:fs:sku-aliases`
|
||||||
|
- default mode is dry-run
|
||||||
|
- apply mode requires `FS_SKU_ALIAS_APPLY=1`
|
||||||
|
- purpose:
|
||||||
|
- remove duplicate active FS.com numeric SKU rows such as `FS-380881` when the same FS URL already has the real product P/N row such as `OSFP-DR8-1.6T-FL`
|
||||||
|
- prevent numeric FS SKU aliases from becoming false competitor or no-match candidates
|
||||||
|
- safe gates:
|
||||||
|
- alias must match `^FS-[0-9]+$`
|
||||||
|
- same normalized FS product URL must have a non-numeric canonical product row
|
||||||
|
- canonical row must already have price, image, and details verified
|
||||||
|
- live Erik run:
|
||||||
|
- dry-run found `109` candidates
|
||||||
|
- apply quarantined `109`
|
||||||
|
- evidence ledger wrote `109` `artifact_quarantine` records from `verify:fs:sku-aliases`
|
||||||
|
- active numeric-SKU duplicates with canonical product row after run: `0`
|
||||||
|
- specific user-reported FS.com 1.6T case:
|
||||||
|
- numeric shadow rows `FS-380881` and `FS-380883` were duplicate aliases
|
||||||
|
- canonical rows remain active:
|
||||||
|
- `OSFP-DR8-1.6T-FL`
|
||||||
|
- `OSFP-2FR4-1.6T-FL`
|
||||||
|
- this preserves both the 500m DR8 and the 2km FR4 product instead of treating the numeric SKU as a separate transceiver
|
||||||
|
- live health after reconcile/matcher:
|
||||||
|
- active products: `17305`
|
||||||
|
- price verified: `11414`
|
||||||
|
- image verified: `12016`
|
||||||
|
- details verified: `16705`
|
||||||
|
- fully verified: `10448`
|
||||||
|
- active competitor status:
|
||||||
|
- `matched=10775`
|
||||||
|
- `no_valid_match=73`
|
||||||
|
- `ambiguous=192`
|
||||||
|
- `needs_research=6265`
|
||||||
|
- fully populated product rows still needing competitor research:
|
||||||
|
- `Flexoptix=359`
|
||||||
|
- `FS.COM=4`
|
||||||
|
- `ATGBICS=2`
|
||||||
|
- FS.com and Flexoptix no-valid-match dry-runs now both return `0`; remaining cases need real candidate research/normalization, not no-match closure
|
||||||
|
- TIPLLM training pool:
|
||||||
|
- appended lesson for FS.com numeric SKU alias quarantine
|
||||||
|
|
||||||
- TIP no-valid-competitor resolver on 2026-05-09:
|
- TIP no-valid-competitor resolver on 2026-05-09:
|
||||||
- added `packages/scraper/src/utils/resolve-no-valid-competitor.ts`
|
- added `packages/scraper/src/utils/resolve-no-valid-competitor.ts`
|
||||||
- script: `pnpm -C packages/scraper run verify:no-valid-competitor`
|
- script: `pnpm -C packages/scraper run verify:no-valid-competitor`
|
||||||
|
|||||||
76
sync/history/2026-05-09-tip-fs-sku-alias-quarantine.md
Normal file
76
sync/history/2026-05-09-tip-fs-sku-alias-quarantine.md
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
# 2026-05-09 — TIP FS.com Numeric SKU Alias Quarantine
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
FS.com pages expose two identifiers:
|
||||||
|
|
||||||
|
- marketplace SKU, for example `FS-380881`
|
||||||
|
- real optical product P/N, for example `OSFP-DR8-1.6T-FL`
|
||||||
|
|
||||||
|
Older scraper passes created active duplicate rows for both. This polluted equivalence research because numeric SKU rows looked like separate transceivers.
|
||||||
|
|
||||||
|
The user-reported 1.6T case confirmed the issue:
|
||||||
|
|
||||||
|
- `FS-380881` is the numeric SKU alias for `OSFP-DR8-1.6T-FL` (500m)
|
||||||
|
- `FS-380883` is the numeric SKU alias for `OSFP-2FR4-1.6T-FL` (2km)
|
||||||
|
- both real product P/N rows must remain in TIP
|
||||||
|
- the numeric aliases must not be treated as independent products
|
||||||
|
|
||||||
|
## Change
|
||||||
|
|
||||||
|
Added `packages/scraper/src/utils/quarantine-fs-sku-aliases.ts`.
|
||||||
|
|
||||||
|
Script:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pnpm -C packages/scraper run verify:fs:sku-aliases
|
||||||
|
```
|
||||||
|
|
||||||
|
Apply mode:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
FS_SKU_ALIAS_APPLY=1 pnpm -C packages/scraper run verify:fs:sku-aliases
|
||||||
|
```
|
||||||
|
|
||||||
|
## Safety Gates
|
||||||
|
|
||||||
|
A row is quarantined only when:
|
||||||
|
|
||||||
|
- vendor is `FS.COM`
|
||||||
|
- part number matches `^FS-[0-9]+$`
|
||||||
|
- the same normalized FS product URL has a canonical non-numeric product P/N row
|
||||||
|
- the canonical row already has price, image, and details verified
|
||||||
|
|
||||||
|
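The robot sets the numeric alias row to `category='NonTransceiver'`, clears all verification flags (price, image, details, competitor, fully verified), resets `competitor_status` to `needs_research`, appends the quarantine reason to `notes`, and writes `artifact_quarantine` evidence.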
|
||||||
|
|
||||||
|
## Live Result
|
||||||
|
|
||||||
|
On Erik:
|
||||||
|
|
||||||
|
- dry-run found `109` candidates
|
||||||
|
- apply quarantined `109`
|
||||||
|
- evidence ledger wrote `109` `artifact_quarantine` records
|
||||||
|
- active numeric-SKU duplicates with canonical product row after run: `0`
|
||||||
|
|
||||||
|
Post-run reconcile and matcher completed.
|
||||||
|
|
||||||
|
Live health after cleanup:
|
||||||
|
|
||||||
|
- active products: `17305`
|
||||||
|
- price verified: `11414`
|
||||||
|
- image verified: `12016`
|
||||||
|
- details verified: `16705`
|
||||||
|
- fully verified: `10448`
|
||||||
|
- competitor status:
|
||||||
|
- `matched=10775`
|
||||||
|
- `no_valid_match=73`
|
||||||
|
- `ambiguous=192`
|
||||||
|
- `needs_research=6265`
|
||||||
|
|
||||||
|
Fully populated products still needing competitor research:
|
||||||
|
|
||||||
|
- `Flexoptix=359`
|
||||||
|
- `FS.COM=4`
|
||||||
|
- `ATGBICS=2`
|
||||||
|
|
||||||
|
FS.com and Flexoptix no-valid-match dry-runs now both return `0`, so the remaining rows need true candidate research or normalization, not blind no-match closure.
|
||||||
@ -41,3 +41,4 @@
|
|||||||
{"id":"tip-20260509-no-valid-competitor-resolver-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Prefer conservative deterministic verification over fake equivalence."},{"role":"user","content":"When may TIP mark a product's competitor requirement as resolved even though no 1:1 competitor product exists?"},{"role":"assistant","content":"Only use `competitor_status=no_valid_match` after deterministic research proves the absence of a strict source-backed competitor candidate. Require price_verified, image_verified, and details_verified first. Exclude artifacts and non-transceiver categories. Search other vendors for candidates with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it. Also require no pending or approved equivalence above confidence 0.50. When these gates pass, set competitor_verified true, store no_match_reason/no_match_verified_at, write a `competitor_no_match` evidence record, then recalculate fully_verified."}]}
|
{"id":"tip-20260509-no-valid-competitor-resolver-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Prefer conservative deterministic verification over fake equivalence."},{"role":"user","content":"When may TIP mark a product's competitor requirement as resolved even though no 1:1 competitor product exists?"},{"role":"assistant","content":"Only use `competitor_status=no_valid_match` after deterministic research proves the absence of a strict source-backed competitor candidate. Require price_verified, image_verified, and details_verified first. Exclude artifacts and non-transceiver categories. Search other vendors for candidates with the same form factor, speed, fiber, reach within max(25m, 5%), and compatible wavelength when both sides expose it. Also require no pending or approved equivalence above confidence 0.50. When these gates pass, set competitor_verified true, store no_match_reason/no_match_verified_at, write a `competitor_no_match` evidence record, then recalculate fully_verified."}]}
|
||||||
{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}
|
{"id":"tip-20260509-no-valid-competitor-resolver-002","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Operate TIP crawlers safely on Erik and make every robot run reversible until evidence is checked."},{"role":"user","content":"What is the safe runbook for the no-valid-competitor resolver?"},{"role":"assistant","content":"Run dry-run first with a narrow vendor scope, for example `NO_VALID_MATCH_VENDOR=Flexoptix pnpm -C packages/scraper run verify:no-valid-competitor`. Inspect candidates for normalization mistakes, especially speed, reach, form factor, fiber, wavelength, cable/AOC/breakout artifacts, and category pages. Only apply with `NO_VALID_MATCH_APPLY=1` after the dry-run looks clean. Do not restart scraper daemons while pg-boss jobs are active. After apply, check `/api/health`, evidence counts, and protected review stats, then document the result in `sync/`."}]}
|
||||||
{"id":"tip-20260509-ambiguous-equivalence-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Ambiguity is a valid safety outcome, not a failure."},{"role":"user","content":"What should TIP do when the matcher finds possible competitor candidates but cannot prove a safe 1:1 equivalence?"},{"role":"assistant","content":"Set `competitor_status=ambiguous` and keep the product out of fully_verified. Do not choose the cheapest or visually similar candidate. Ambiguous means there is enough evidence to reject `no_valid_match`, but not enough evidence to approve `matched`. Typical causes include breakout vs non-breakout, AOC/DAC vs optical module, reach mismatch, MMF vs SMF mismatch, wavelength mismatch, connector/channel mismatch, or too many candidate products in the same broad class. Ambiguous rows need targeted vendor research or stricter extractor normalization."}]}
|
{"id":"tip-20260509-ambiguous-equivalence-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Ambiguity is a valid safety outcome, not a failure."},{"role":"user","content":"What should TIP do when the matcher finds possible competitor candidates but cannot prove a safe 1:1 equivalence?"},{"role":"assistant","content":"Set `competitor_status=ambiguous` and keep the product out of fully_verified. Do not choose the cheapest or visually similar candidate. Ambiguous means there is enough evidence to reject `no_valid_match`, but not enough evidence to approve `matched`. Typical causes include breakout vs non-breakout, AOC/DAC vs optical module, reach mismatch, MMF vs SMF mismatch, wavelength mismatch, connector/channel mismatch, or too many candidate products in the same broad class. Ambiguous rows need targeted vendor research or stricter extractor normalization."}]}
|
||||||
|
{"id":"tip-20260509-fs-sku-alias-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Distinguish vendor marketplace SKUs from real optical product part numbers."},{"role":"user","content":"How should TIP handle FS.com rows like FS-380881 when the same product URL also has OSFP-DR8-1.6T-FL?"},{"role":"assistant","content":"Treat the numeric `FS-#####` row as a marketplace SKU alias, not as an independent transceiver, when the same normalized FS.com product URL already has a canonical non-numeric product P/N row with price, image, and details verified. Quarantine the numeric alias from the active transceiver base, clear verification flags, and write `artifact_quarantine` evidence that points to the canonical row. Keep the real product P/N active. For the 1.6T FS.com case, preserve both `OSFP-DR8-1.6T-FL` (500m) and `OSFP-2FR4-1.6T-FL` (2km), and do not classify `FS-380881` or `FS-380883` as separate products or no-valid-match products."}]}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user