feat: add targeted product page asset verifier

This commit is contained in:
Rene Fichtmueller 2026-05-10 00:31:33 +02:00
parent 0d4bcb6924
commit adb2661fac
6 changed files with 452 additions and 0 deletions

View File

@ -18,6 +18,7 @@
"scrape:sfpcables": "tsx src/scrapers/sfpcables.ts", "scrape:sfpcables": "tsx src/scrapers/sfpcables.ts",
"verify:catalog:details": "tsx src/utils/verify-catalog-details.ts", "verify:catalog:details": "tsx src/utils/verify-catalog-details.ts",
"verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts",
"verify:product-page-assets": "tsx src/utils/verify-product-page-assets.ts",
"verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts",
"verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts", "verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts",
"verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts", "verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts",

View File

@ -44,7 +44,9 @@ async function quarantine(): Promise<void> {
v.name = 'GBICS' v.name = 'GBICS'
AND ( AND (
t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)' t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)'
OR t.part_number ~* '(^[0-9]+G (QSFP|OSFP|SFP)|I-Temp Tranceivers|Gigabit Ethernet$)'
OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)' OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)'
OR COALESCE(t.product_page_url, '') ~* '(100g-qsfp28/?$|100g-qsfp28-i-temp|10gb-industrial-temp|1g-sfp-i-temp|200g-qsfp56/?$|25g-sfp28-i-temp|400g-qsfp-dd/?$|400g-qsfp112/?$|40g-qsfp-i-temp|i-temp-tranceivers|osfp-[0-9]+g-gigabit|qsfp.*gigabit|400g-qsfp112-2)'
) )
) )
OR ( OR (
@ -86,7 +88,9 @@ async function quarantine(): Promise<void> {
v.name = 'GBICS' v.name = 'GBICS'
AND ( AND (
t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)' t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)'
OR t.part_number ~* '(^[0-9]+G (QSFP|OSFP|SFP)|I-Temp Tranceivers|Gigabit Ethernet$)'
OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)' OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)'
OR COALESCE(t.product_page_url, '') ~* '(100g-qsfp28/?$|100g-qsfp28-i-temp|10gb-industrial-temp|1g-sfp-i-temp|200g-qsfp56/?$|25g-sfp28-i-temp|400g-qsfp-dd/?$|400g-qsfp112/?$|40g-qsfp-i-temp|i-temp-tranceivers|osfp-[0-9]+g-gigabit|qsfp.*gigabit|400g-qsfp112-2)'
) )
) )
OR ( OR (

View File

@ -0,0 +1,368 @@
import { pool, markDetailsVerified, markImageVerified } from "./db";
// Category labels that disqualify a row from verification. The list is bound
// as a query parameter in main(), where rows whose `category` equals any of
// these values are filtered out (`COALESCE(t.category, '') <> ALL($2)`).
const EXCLUDED_CATEGORIES = [
"NonTransceiver",
"Accessory",
"Adapter / Converter",
"Switch / Media Converter",
"Switch / Network Infrastructure",
"NIC / Adapter",
"Mux / Passive Optical",
"Product Family",
"Loopback / Test Module",
];
// Request headers passed to fetch() by fetchHtml() for every product page.
// They present a desktop Chrome user agent that accepts HTML and prefers
// English content.
const HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
};
/**
 * One row returned by the candidate query in main(). Property names match the
 * quoted column aliases of that SELECT.
 */
type Candidate = {
// transceivers.id
id: string;
// vendors.name of the joined vendor row
vendorName: string;
// transceivers.part_number
partNumber: string;
// transceivers.product_page_url; the query only selects http(s) URLs
productUrl: string;
// transceivers.form_factor
formFactor: string | null;
// transceivers.speed_gbps
speedGbps: number | null;
// transceivers.reach_label
reachLabel: string | null;
// transceivers.fiber_type
fiberType: string | null;
// transceivers.image_verified
imageVerified: boolean | null;
// transceivers.details_verified
detailsVerified: boolean | null;
};
/**
 * Attributes derived from text by inferDetails(). Every field is optional
 * because each one is set only when its pattern matches. The same shape is
 * reused for the update set produced by mergeDetails() and written by
 * applyDetailUpdates().
 */
type InferredDetails = {
// Canonical form-factor label, e.g. "QSFP28" or "SFP+"
formFactor?: string;
// Display label for the line rate, e.g. "100G" or "32G FC"
speed?: string;
// Numeric line rate paired with `speed`, e.g. 100 or 2.5
speedGbps?: number;
// Human-readable reach, e.g. "10km" or "300m"
reachLabel?: string;
// Reach expressed in meters, paired with `reachLabel`
reachMeters?: number;
// "Copper", "SMF" or "MMF" as assigned by inferDetails()
fiberType?: string;
// Digits of the first "<number>nm" token found, e.g. "1310"
wavelengths?: string;
};
/**
 * Returns a promise that settles once a timer of `ms` milliseconds fires.
 * Used to pace consecutive page requests.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Decodes the small set of HTML character references that show up in
 * attribute values and titles: `&quot;`, the apostrophe forms (`&#39;`,
 * `&#039;`, `&#x27;`, `&apos;`), `&lt;`, `&gt;` and `&amp;`.
 *
 * @param value Raw text taken from HTML markup.
 * @returns The text with those references replaced by their characters.
 */
function decodeEntities(value: string): string {
  return value
    .replace(/&quot;/g, '"')
    .replace(/&(?:#0*39|#[xX]0*27|apos);/g, "'")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    // `&amp;` must be decoded last: doing it first turns an escaped
    // reference such as `&amp;lt;` into `&lt;`, which the later steps would
    // then decode a second time.
    .replace(/&amp;/g, "&");
}
/**
 * Resolves a possibly relative, entity-encoded `src` against `baseUrl`.
 *
 * @returns The absolute URL string, or "" when the value cannot be parsed.
 */
function absolutizeUrl(src: string, baseUrl: string): string {
  let resolved = "";
  try {
    const decoded = decodeEntities(src);
    resolved = new URL(decoded, baseUrl).toString();
  } catch {
    // Unparseable input is reported as an empty string rather than thrown.
    resolved = "";
  }
  return resolved;
}
/**
 * Reduces an HTML fragment to plain text: script and style elements are
 * removed with their contents, remaining tags are replaced by spaces, and
 * whitespace runs are collapsed before trimming.
 */
function stripHtml(html: string): string {
  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, " ");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  const collapsed = withoutTags.replace(/\s+/g, " ");
  return collapsed.trim();
}
/**
 * Extracts the page title, preferring the Open Graph `og:title` meta tag and
 * falling back to the `<title>` element.
 *
 * The attribute value is matched up to the same quote character that opened
 * it, so a double-quoted value may contain apostrophes (and vice versa)
 * without being cut short.
 *
 * @param html Full page markup.
 * @returns The entity-decoded title, or "" when neither source is present.
 */
function extractTitle(html: string): string {
  // Group 1 is the opening quote, group 2 the attribute value. Both attribute
  // orders are tried because the order of `property` and `content` varies.
  const ogPatterns = [
    /<meta[^>]+property=["']og:title["'][^>]+content=(["'])((?:(?!\1)[\s\S])+)\1[^>]*>/i,
    /<meta[^>]+content=(["'])((?:(?!\1)[\s\S])+)\1[^>]+property=["']og:title["'][^>]*>/i,
  ];
  for (const pattern of ogPatterns) {
    const og = html.match(pattern);
    if (og) return decodeEntities(og[2]);
  }
  const title = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  return title ? decodeEntities(stripHtml(title[1])) : "";
}
/**
 * Picks the most likely product image URL from a page.
 *
 * Sources are tried in order: social meta tags (`og:image`, then
 * `twitter:image`), an `<img id="image">` element, and finally any `<img>`
 * whose URL or alt text mentions a transceiver-related keyword. Each
 * candidate must also pass isLikelyProductImage().
 *
 * @param html Full page markup.
 * @param baseUrl URL of the page, used to resolve relative sources.
 * @returns An absolute image URL, or undefined when nothing qualifies.
 */
function extractImageUrl(html: string, baseUrl: string): string | undefined {
  // Returns the captured source of the first pattern that matches at all.
  const firstCapture = (patterns: RegExp[]): string | undefined => {
    for (const pattern of patterns) {
      const hit = html.match(pattern);
      if (hit) return hit[1];
    }
    return undefined;
  };

  const metaSrc = firstCapture([
    /<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["'][^>]*>/i,
    /<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:image["'][^>]*>/i,
    /<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"']+)["'][^>]*>/i,
    /<meta[^>]+content=["']([^"']+)["'][^>]+name=["']twitter:image["'][^>]*>/i,
  ]);
  if (metaSrc !== undefined) {
    const candidate = absolutizeUrl(metaSrc, baseUrl);
    if (isLikelyProductImage(candidate)) return candidate;
  }

  const idSrc = firstCapture([
    /<img[^>]+id=["']image["'][^>]+src=["']([^"']+)["'][^>]*>/i,
    /<img[^>]+src=["']([^"']+)["'][^>]+id=["']image["'][^>]*>/i,
  ]);
  if (idSrc !== undefined) {
    const candidate = absolutizeUrl(idSrc, baseUrl);
    if (isLikelyProductImage(candidate)) return candidate;
  }

  // Last resort: scan every <img> tag in document order.
  for (const tagMatch of html.matchAll(/<img\b[^>]*\bsrc=["']([^"']+)["'][^>]*>/gi)) {
    const candidate = absolutizeUrl(tagMatch[1], baseUrl);
    if (!isLikelyProductImage(candidate)) continue;
    const altText = (tagMatch[0].match(/\balt=["']([^"']*)["']/i)?.[1] || "").toLowerCase();
    const searchable = `${candidate} ${altText}`.toLowerCase();
    if (/transceiver|optical|sfp|qsfp|osfp|xfp|module|base-/i.test(searchable)) return candidate;
  }
  return undefined;
}
/**
 * Heuristic filter for product photos. A URL qualifies when it is an absolute
 * http(s) URL, ends in a raster image extension (optionally followed by a
 * query string), contains none of the site-chrome keywords (logos, icons,
 * banners, ...), and contains at least one product-related keyword.
 * Matching is case-insensitive.
 */
function isLikelyProductImage(url: string): boolean {
  const normalized = url.toLowerCase();
  const isAbsoluteHttp = /^https?:\/\//.test(normalized);
  const hasImageExtension = /\.(jpg|jpeg|png|webp)(\?|$)/.test(normalized);
  const isSiteChrome = /(logo|country|icon|sprite|banner|menu|payment|placeholder|no-image|company|featured-|solution|fofc|mdc|mmc)/.test(normalized);
  const mentionsProduct = /(catalog\/product|\/uploads\/image\/|product|transceiver|sfp|qsfp|osfp|xfp)/.test(normalized);
  return isAbsoluteHttp && hasImageExtension && !isSiteChrome && mentionsProduct;
}
/**
 * Infers transceiver attributes from free text such as a part number, a
 * product URL and a page title.
 *
 * Each attribute is decided by the first pattern that matches in a fixed
 * priority order (not by position in the text); attributes with no match are
 * left unset. All matching is case-insensitive.
 *
 * @param text Text to inspect.
 * @returns The attributes that could be inferred; possibly an empty object.
 */
function inferDetails(text: string): InferredDetails {
  const details: InferredDetails = {};

  // Most specific names first so that e.g. "QSFP28" is not reported as "QSFP+".
  const formFactors: Array<[RegExp, string]> = [
    [/\bqsfp-dd\b/i, "QSFP-DD"],
    [/\bqsfp112\b/i, "QSFP112"],
    [/\bosfp112\b/i, "OSFP112"],
    [/\bosfp\b/i, "OSFP"],
    [/\bqsfp56\b/i, "QSFP56"],
    [/\bqsfp28\b/i, "QSFP28"],
    [/\bqsfp\+|\bqsfp\b/i, "QSFP+"],
    [/\bsfp56\b/i, "SFP56"],
    [/\bsfp28\b/i, "SFP28"],
    // URL slugs spell the plus sign out ("sfp-plus"), so accept a space, a
    // hyphen or nothing between the two words.
    [/\bsfp\+|\bsfp[ -]?plus\b/i, "SFP+"],
    [/\bxfp\b/i, "XFP"],
    [/\bsfp\b/i, "SFP"],
  ];
  for (const [regex, formFactor] of formFactors) {
    if (regex.test(text)) {
      details.formFactor = formFactor;
      break;
    }
  }

  const speedPatterns: Array<[RegExp, string, number]> = [
    [/\b800\s*g(?:base)?\b/i, "800G", 800],
    [/\b400\s*g(?:base)?\b/i, "400G", 400],
    [/\b200\s*g(?:base)?\b/i, "200G", 200],
    [/\b128\s*gfc\b/i, "128G FC", 128],
    [/\b100\s*g(?:base)?\b/i, "100G", 100],
    [/\b50\s*g(?:base)?\b/i, "50G", 50],
    [/\b40\s*g(?:base)?\b/i, "40G", 40],
    [/\b32\s*gfc\b/i, "32G FC", 32],
    [/\b16\s*gfc\b/i, "16G FC", 16],
    [/\b10\s*g(?:base)?\b/i, "10G", 10],
    // 1.25G is also written "1-25g" in URL slugs.
    [/\b1[.-]?25\s*g(?:base)?\b|\b1000base\b|\b1\s*g(?:base)?\b/i, "1G", 1],
    // "25G" must not be the tail of "1.25G"/"1-25G", but a plain separator
    // hyphen as in "SFP-25G-SR" is fine: reject only a preceding dot or digit,
    // or a standalone single digit followed by a hyphen.
    [/(?<![.\d])(?<!\b\d-)25\s*g(?:base)?\b/i, "25G", 25],
    // The separator is mandatory here; without it this pattern would also
    // match a bare "25G".
    [/\b2[.-]5\s*g(?:base)?\b/i, "2.5G", 2.5],
  ];
  for (const [regex, speed, speedGbps] of speedPatterns) {
    if (regex.test(text)) {
      details.speed = speed;
      details.speedGbps = speedGbps;
      break;
    }
  }

  // An explicit distance wins; otherwise fall back to the nominal reach
  // assigned to the optical interface suffix.
  const kmMatch = text.match(/\b(160|120|100|80|60|40|30|20|10|3|2|1(?:\.4)?)\s*km\b/i);
  const mMatch = text.match(/\b(550|500|400|300|150|100|80|30)\s*m(?:eters?)?\b/i);
  if (kmMatch) {
    const km = parseFloat(kmMatch[1]);
    details.reachLabel = `${kmMatch[1]}km`;
    details.reachMeters = Math.round(km * 1000);
  } else if (mMatch) {
    const meters = parseInt(mMatch[1], 10);
    details.reachLabel = `${meters}m`;
    details.reachMeters = meters;
  } else if (/\bfr4\b/i.test(text)) {
    details.reachLabel = "2km";
    details.reachMeters = 2000;
  } else if (/\bdr\d*\b/i.test(text)) {
    details.reachLabel = "500m";
    details.reachMeters = 500;
  } else if (/\blr4?\b/i.test(text)) {
    details.reachLabel = "10km";
    details.reachMeters = 10000;
  } else if (/\ber4?\b/i.test(text)) {
    details.reachLabel = "40km";
    details.reachMeters = 40000;
  } else if (/\bzr4?\b/i.test(text)) {
    details.reachLabel = "80km";
    details.reachMeters = 80000;
  } else if (/\b(?:sr\d*|sw4)\b/i.test(text)) {
    // Both alternatives are bounded on the right so that words merely
    // starting with "sr" (e.g. "SRX") do not count as short-reach optics.
    details.reachLabel = "300m";
    details.reachMeters = 300;
  }

  // Interface suffixes may carry a lane count (LR4, DR8, SR4, ...), so the
  // fiber-type patterns allow trailing digits just like the reach patterns.
  if (/base-?t|rj-?45|cat ?6|cat ?5|copper/i.test(text)) {
    details.fiberType = "Copper";
  } else if (/single.?mode|\bsmf\b|\b(?:lr|er|zr|fr|dr)\d*\b|cwdm|dwdm|bidi|psm/i.test(text)) {
    details.fiberType = "SMF";
  } else if (/multi.?mode|\bmmf\b|\b(?:sr|sw)\d*\b/i.test(text)) {
    details.fiberType = "MMF";
  }

  // Only the first "<3-4 digits>nm" token is recorded.
  const wavelength = text.match(/\b(\d{3,4})\s*nm\b/i);
  if (wavelength) details.wavelengths = wavelength[1];
  return details;
}
/**
 * Decides which inferred attributes should be written for a row.
 *
 * - Form factor: filled when missing, or corrected when the stored value is
 *   "SFP+" but a QSFP-family form factor was inferred.
 * - Speed: filled when missing/zero, or raised when the inferred rate is higher.
 * - Reach and fiber type: filled only when the row has none.
 * - Wavelength: always forwarded when inferred. `Candidate` does not carry the
 *   stored value, so the decision is left to applyDetailUpdates(), whose SQL
 *   only fills an empty column (`COALESCE(NULLIF(wavelengths, ''), ...)`).
 *
 * @param existing Current values of the row.
 * @param inferred Attributes produced by inferDetails().
 * @returns The subset of attributes to apply; empty when nothing should change.
 */
function mergeDetails(existing: Candidate, inferred: InferredDetails): InferredDetails {
  const updates: InferredDetails = {};
  if (inferred.formFactor && (!existing.formFactor || existing.formFactor === "SFP+" && inferred.formFactor.startsWith("QSFP"))) {
    updates.formFactor = inferred.formFactor;
  }
  if (inferred.speedGbps && (!existing.speedGbps || inferred.speedGbps > existing.speedGbps)) {
    updates.speed = inferred.speed;
    updates.speedGbps = inferred.speedGbps;
  }
  if (inferred.reachLabel && !existing.reachLabel) {
    updates.reachLabel = inferred.reachLabel;
    updates.reachMeters = inferred.reachMeters;
  }
  if (inferred.fiberType && !existing.fiberType) {
    updates.fiberType = inferred.fiberType;
  }
  // Previously dropped here, which left the wavelength branch of
  // applyDetailUpdates() unreachable.
  if (inferred.wavelengths) {
    updates.wavelengths = inferred.wavelengths;
  }
  return updates;
}
/**
 * Downloads a page body as text using the shared request headers.
 *
 * @param url Page to fetch.
 * @returns The response body.
 * @throws Error with message `HTTP <status>` for non-2xx responses; the
 *   request is also aborted after 30 seconds.
 */
async function fetchHtml(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status}`);
}
/**
 * Writes the given attribute updates to one `transceivers` row.
 *
 * Only truthy fields produce a SET clause. `wavelengths` is written through
 * COALESCE/NULLIF so that a non-empty stored value is kept. When nothing is
 * set, no query is issued; otherwise `updated_at` is bumped as well.
 *
 * @param id Primary key of the row (bound as `$1`).
 * @param updates Attributes to write.
 */
async function applyDetailUpdates(id: string, updates: InferredDetails): Promise<void> {
  const assignments: string[] = [];
  const params: unknown[] = [id];

  // Registers a value and returns its positional placeholder ($2, $3, ...).
  const bind = (value: unknown): string => {
    params.push(value);
    return `$${params.length}`;
  };

  const plainColumns: Array<[string, unknown]> = [
    ["form_factor", updates.formFactor],
    ["speed_gbps", updates.speedGbps],
    ["speed", updates.speed],
    ["reach_meters", updates.reachMeters],
    ["reach_label", updates.reachLabel],
    ["fiber_type", updates.fiberType],
  ];
  for (const [column, value] of plainColumns) {
    if (!value) continue;
    assignments.push(`${column} = ${bind(value)}`);
  }
  if (updates.wavelengths) {
    assignments.push(`wavelengths = COALESCE(NULLIF(wavelengths, ''), ${bind(updates.wavelengths)})`);
  }

  if (assignments.length === 0) return;
  assignments.push("updated_at = NOW()");
  await pool.query(`UPDATE transceivers SET ${assignments.join(", ")} WHERE id = $1`, params);
}
/**
 * Entry point of the product-page asset verifier.
 *
 * Selects rows of the chosen vendors that have an http(s) product page URL,
 * are not in an excluded category, and still lack a verified image, verified
 * details, a speed, or a fiber type. Each page is fetched once; an image URL
 * is extracted from the markup and details are inferred from part number,
 * URL and title only.
 *
 * Environment:
 * - `PRODUCT_ASSET_VENDOR`: comma-separated vendor names (defaults to a
 *   built-in list).
 * - `PRODUCT_ASSET_LIMIT`: maximum number of rows (default 120; values that
 *   are not a non-negative integer fall back to the default).
 * - `PRODUCT_ASSET_APPLY`: set to "1" to write; anything else is a dry run
 *   that only logs the proposed changes.
 */
async function main(): Promise<void> {
  const defaultLimit = 120;
  const vendorFilter = (process.env["PRODUCT_ASSET_VENDOR"] || "").trim();
  const parsedLimit = parseInt(process.env["PRODUCT_ASSET_LIMIT"] || String(defaultLimit), 10);
  // parseInt yields NaN for non-numeric input; binding NaN (or a negative
  // number) to LIMIT would make the query fail.
  const limit = Number.isInteger(parsedLimit) && parsedLimit >= 0 ? parsedLimit : defaultLimit;
  if (limit !== parsedLimit) {
    console.warn(`PRODUCT_ASSET_LIMIT is not a non-negative integer; using ${defaultLimit}`);
  }
  const apply = process.env["PRODUCT_ASSET_APPLY"] === "1";
  const vendorNames = vendorFilter
    ? vendorFilter.split(",").map((v) => v.trim()).filter(Boolean)
    : ["T&S Communication", "10Gtek", "GBICS"];

  const candidates = await pool.query<Candidate>(
    `SELECT t.id,
            v.name AS "vendorName",
            t.part_number AS "partNumber",
            t.product_page_url AS "productUrl",
            t.form_factor AS "formFactor",
            t.speed_gbps AS "speedGbps",
            t.reach_label AS "reachLabel",
            t.fiber_type AS "fiberType",
            t.image_verified AS "imageVerified",
            t.details_verified AS "detailsVerified"
     FROM transceivers t
     JOIN vendors v ON v.id = t.vendor_id
     WHERE v.name = ANY($1)
       AND COALESCE(t.category, '') <> ALL($2)
       AND COALESCE(t.product_page_url, '') ~* '^https?://'
       AND (
         COALESCE(t.image_verified, false) = false
         OR COALESCE(t.details_verified, false) = false
         OR t.speed_gbps IS NULL
         OR t.speed_gbps = 0
         OR COALESCE(t.fiber_type, '') = ''
       )
     ORDER BY v.name, t.part_number
     LIMIT $3`,
    [vendorNames, EXCLUDED_CATEGORIES, limit],
  );

  let images = 0;
  let details = 0;
  let skipped = 0;
  let errors = 0;
  console.log("=== Product page asset verifier ===", { vendorNames, limit, apply, count: candidates.rows.length });

  for (const row of candidates.rows) {
    try {
      const html = await fetchHtml(row.productUrl);
      const title = extractTitle(html);
      // Body text is deliberately left out of the inference input.
      const text = `${row.partNumber} ${row.productUrl} ${title}`;
      const imageUrl = extractImageUrl(html, row.productUrl);
      const inferred = inferDetails(text);
      const updates = mergeDetails(row, inferred);
      if (!apply) {
        console.log("dry-run product asset", {
          vendor: row.vendorName,
          partNumber: row.partNumber,
          imageUrl,
          updates,
        });
      } else {
        await applyDetailUpdates(row.id, updates);
        if (imageUrl && !row.imageVerified) {
          const changed = await markImageVerified(row.id, imageUrl);
          if (changed) images++;
        }
        // NOTE(review): details are marked verified for every fetched row,
        // including rows where nothing was inferred — confirm this is intended.
        const marked = await markDetailsVerified({ transceiverId: row.id, sourceUrl: row.productUrl });
        if (marked) details++;
      }
      if (!imageUrl && Object.keys(updates).length === 0) skipped++;
    } catch (err) {
      errors++;
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`asset verifier error for ${row.vendorName} ${row.partNumber}: ${reason}`);
    } finally {
      // Pace requests after failures too, so a struggling or rate-limiting
      // host is not hit back-to-back.
      await sleep(750);
    }
  }
  console.log("Product page asset verifier complete", { images, details, skipped, errors, apply });
}
// Run main() only when this file is executed directly, not when it is imported.
if (require.main === module) {
  // On success the pool is closed; on any failure the error is logged, a pool
  // shutdown is requested, and the process exits with status 1.
  const closePool = () => pool.end();
  const onFatal = (err: unknown): void => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };
  main().then(closePool).catch(onFatal);
}

View File

@ -37,6 +37,37 @@ Updated: 2026-05-09 22:15 UTC
- reconcile must preserve explicit competitor research states - reconcile must preserve explicit competitor research states
- GBICS/T&S artifact quarantine rules - GBICS/T&S artifact quarantine rules
- TIP product-page asset verifier on 2026-05-09:
- added `verify:product-page-assets`
- deterministic scope:
- only existing `product_page_url` rows
- vendor-limited batches via `PRODUCT_ASSET_VENDOR`
- dry-run by default, apply only with `PRODUCT_ASSET_APPLY=1`
- extracts images from source-backed product image tags/meta only
- infers details only from part number, product URL, and title to avoid navigation pollution
- remote build passed on Erik
- live verifier results:
- GBICS extra quarantine: `17` additional category/family artifacts
- T&S Communication asset apply: `36` images, `36` details closed after a second DR8 reach pass
- 10Gtek/SFPcables asset apply: `5` images, `10` details improved on rows with existing product URLs
- current vendor gaps:
- GBICS: `71` active rows, `0` missing price, `0` missing image, `0` missing details
- T&S Communication: `36` active rows, `36` missing price, `0` missing image, `0` missing details
- 10Gtek: `175` active rows, `126` missing price, `126` missing image, `20` missing details
- interpretation:
- T&S is now product-data complete but public-price blocked; pages expose no real public price (`price: 0.00` / quote-only behavior)
- 10Gtek remaining gaps are mostly rows without reliable product URLs/price sources and need alias/source discovery rather than blind image guessing
- live health after this pass:
- active products: `17195`
- price verified: `11414`
- image verified: `12057`
- details verified: `16713`
- fully verified: `10459`
- competitor status: `matched=10775`, `no_valid_match=74`, `ambiguous=556`, `needs_research=5790`
- TIPLLM training pool updated with:
- product-page asset verifier dry-run/apply pattern
- T&S quote-only public-price rule
- MAGATAMA multi-LLM training lane expansion on 2026-05-09: - MAGATAMA multi-LLM training lane expansion on 2026-05-09:
- added first-class training lanes for: - added first-class training lanes for:
- `pulso_llm` - `pulso_llm`

View File

@ -136,3 +136,49 @@ Verification after a fresh reconcile test:
- `ambiguous=556` - `ambiguous=556`
- `needs_research=5807` - `needs_research=5807`
- fully product-verified rows still in competitor `needs_research`: `0` - fully product-verified rows still in competitor `needs_research`: `0`
## Follow-Up: Product Page Asset Verifier
Added `packages/scraper/src/utils/verify-product-page-assets.ts` and script `verify:product-page-assets`.
Safety design:
- vendor-limited with `PRODUCT_ASSET_VENDOR`
- batch-limited with `PRODUCT_ASSET_LIMIT`
- dry-run by default
- apply only with `PRODUCT_ASSET_APPLY=1`
- uses existing `product_page_url` rows only
- writes through existing `markImageVerified` / `markDetailsVerified` helpers so the evidence ledger remains canonical
- infers details only from part number, product URL, and title; full body text was intentionally excluded after a dry-run showed T&S navigation links polluting speed inference
Live runs:
- extra GBICS quarantine:
- `17` category/family artifacts removed from the active base
- GBICS is now product-data complete in the active base: `71` active rows, `0/0/0` price/image/details gaps
- T&S Communication:
- product-page asset verifier applied
- `36` images verified
- `36` details verified after adding DR8/SR8 reach inference
- current active gaps: `36` active rows, `36` missing price, `0` missing image, `0` missing details
- interpretation: T&S is public-price blocked; pages expose no deterministic price, only quote/no-price or `price: 0.00`
- 10Gtek / SFPcables:
- dry-run caught `1.25G` / `2.5G` vs `25G` regex risk; regex was hardened before apply
- `5` images verified
- `10` details verified/improved
- current active gaps: `175` active rows, `126` missing price, `126` missing image, `20` missing details
- interpretation: remaining 10Gtek rows need URL/source alias discovery, not blind verification
Live health after this pass:
- health: healthy
- active products: `17195`
- price verified: `11414`
- image verified: `12057`
- details verified: `16713`
- fully verified: `10459`
- competitor status:
- `matched=10775`
- `no_valid_match=74`
- `ambiguous=556`
- `needs_research=5790`

View File

@ -47,3 +47,5 @@
{"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]} {"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]}
{"id":"tip-20260509-reconcile-preserve-explicit-competitor-states-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Preserve evidence-backed verification states during maintenance jobs."},{"role":"user","content":"A maintenance reconcile resets competitor_verified rows with no recent non-Flexoptix price. What states must it preserve?"},{"role":"assistant","content":"Preserve explicit research outcomes. `competitor_status=no_valid_match` and `competitor_status=ambiguous` are deliberate evidence-backed states and must not be reset to `needs_research` by a generic price freshness reconcile. Reconcile may reset stale `matched` rows when their required evidence disappears, but it must keep no-valid-match and ambiguous rows intact, then verify that fully product-verified rows left in `needs_research` stay at zero."}]} {"id":"tip-20260509-reconcile-preserve-explicit-competitor-states-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Preserve evidence-backed verification states during maintenance jobs."},{"role":"user","content":"A maintenance reconcile resets competitor_verified rows with no recent non-Flexoptix price. What states must it preserve?"},{"role":"assistant","content":"Preserve explicit research outcomes. `competitor_status=no_valid_match` and `competitor_status=ambiguous` are deliberate evidence-backed states and must not be reset to `needs_research` by a generic price freshness reconcile. Reconcile may reset stale `matched` rows when their required evidence disappears, but it must keep no-valid-match and ambiguous rows intact, then verify that fully product-verified rows left in `needs_research` stay at zero."}]}
{"id":"tip-20260509-gbics-ts-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active transceiver counts honest by quarantining vendor navigation, category, PDF, cable, and infrastructure artifacts."},{"role":"user","content":"What did the GBICS and T&S cleanup teach the crawler robots?"},{"role":"assistant","content":"GBICS rows such as category labels, patch leads, active optical cables, direct-attach cables, Choose By pages, cart pages, vendor pages, NVIDIA/ConnectX/Cisco/WatchGuard/HP pages, network adapters, media converters, AOC/DAC and MTP/MPO cable pages are not active transceiver SKUs. T&S rows with non-/products URLs, PDFs, generic fiber-optic-transceivers family URLs, Product-Brochure names, or broad 100G/200G/400G-Transceiver family labels are artifacts. Quarantine these outside the active verification base, clear verification flags, and write `artifact_quarantine` evidence before recalculating health."}]} {"id":"tip-20260509-gbics-ts-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active transceiver counts honest by quarantining vendor navigation, category, PDF, cable, and infrastructure artifacts."},{"role":"user","content":"What did the GBICS and T&S cleanup teach the crawler robots?"},{"role":"assistant","content":"GBICS rows such as category labels, patch leads, active optical cables, direct-attach cables, Choose By pages, cart pages, vendor pages, NVIDIA/ConnectX/Cisco/WatchGuard/HP pages, network adapters, media converters, AOC/DAC and MTP/MPO cable pages are not active transceiver SKUs. T&S rows with non-/products URLs, PDFs, generic fiber-optic-transceivers family URLs, Product-Brochure names, or broad 100G/200G/400G-Transceiver family labels are artifacts. 
Quarantine these outside the active verification base, clear verification flags, and write `artifact_quarantine` evidence before recalculating health."}]}
{"id":"tip-20260509-product-page-asset-verifier-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Design deterministic product-page verifiers that improve evidence without broad crawling or invented values."},{"role":"user","content":"How should TIP safely backfill images and details from existing vendor product URLs?"},{"role":"assistant","content":"Use a targeted product-page asset verifier. Select active rows with an existing product_page_url and missing image/details; fetch one page per row with low concurrency; extract only likely product images from og:image, twitter:image, Magento product image IDs, or filtered product image URLs; infer details only from part number, product URL, and page title, not from full body navigation. Always dry-run first to catch navigation pollution such as T&S 800G menu links or 1.25G being misread as 25G. Apply through `markImageVerified` and `markDetailsVerified` so evidence is recorded."}]}
{"id":"tip-20260509-quote-only-tscom-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Public-price verification requires a real public price."},{"role":"user","content":"T&S Communication product pages have valid images and details but expose no real public prices, only no price or price 0.00. What should TIP do?"},{"role":"assistant","content":"Do not mark price_verified. Treat T&S as image/detail-verifiable but public-price-missing until a real price observation exists. `price: 0.00`, quote/request flows, missing amount fields, or SEO filenames containing `price` are not price evidence. Keep image_verified/details_verified true when source-backed, keep price_verified false, and document the vendor as quote-only/public-price-unavailable rather than fabricating a market price."}]}