diff --git a/packages/scraper/package.json b/packages/scraper/package.json index 606f386..28adc05 100644 --- a/packages/scraper/package.json +++ b/packages/scraper/package.json @@ -18,6 +18,7 @@ "scrape:sfpcables": "tsx src/scrapers/sfpcables.ts", "verify:catalog:details": "tsx src/utils/verify-catalog-details.ts", "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", + "verify:product-page-assets": "tsx src/utils/verify-product-page-assets.ts", "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", "verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts", "verify:no-valid-competitor": "tsx src/utils/resolve-no-valid-competitor.ts", diff --git a/packages/scraper/src/utils/quarantine-non-transceivers.ts b/packages/scraper/src/utils/quarantine-non-transceivers.ts index 5a625d3..186e782 100644 --- a/packages/scraper/src/utils/quarantine-non-transceivers.ts +++ b/packages/scraper/src/utils/quarantine-non-transceivers.ts @@ -44,7 +44,9 @@ async function quarantine(): Promise { v.name = 'GBICS' AND ( t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)' + OR t.part_number ~* '(^[0-9]+G (QSFP|OSFP|SFP)|I-Temp Tranceivers|Gigabit Ethernet$)' OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)' + OR COALESCE(t.product_page_url, '') ~* '(100g-qsfp28/?$|100g-qsfp28-i-temp|10gb-industrial-temp|1g-sfp-i-temp|200g-qsfp56/?$|25g-sfp28-i-temp|400g-qsfp-dd/?$|400g-qsfp112/?$|40g-qsfp-i-temp|i-temp-tranceivers|osfp-[0-9]+g-gigabit|qsfp.*gigabit|400g-qsfp112-2)' ) ) OR ( @@ -86,7 +88,9 @@ async function quarantine(): Promise { v.name = 'GBICS' AND ( t.part_number ~* '(Cables?|Patch Leads?|Choose By|Cart with|Vendor|Speed|Network Adapters|Media Converters|All NVIDIA|ConnectX|Cisco Systems|Cisco Meraki|Cisco Viptela|Alcatel-Lucent|Brocade|WatchGuard|HP Procurve|AOC|DAC)' + OR t.part_number ~* '(^[0-9]+G (QSFP|OSFP|SFP)|I-Temp Tranceivers|Gigabit Ethernet$)' OR COALESCE(t.product_page_url, '') ~* '(active-optical-cables|direct-attach|patch-cables|patch-leads|choose-by|custom-transceivers|network-adapters|media-converters|cart\\.php|nvidia-2|connectx|cisco|watchguard|hp-procurve|aoc-dac|mtp-mpo-cables)' + OR COALESCE(t.product_page_url, '') ~* '(100g-qsfp28/?$|100g-qsfp28-i-temp|10gb-industrial-temp|1g-sfp-i-temp|200g-qsfp56/?$|25g-sfp28-i-temp|400g-qsfp-dd/?$|400g-qsfp112/?$|40g-qsfp-i-temp|i-temp-tranceivers|osfp-[0-9]+g-gigabit|qsfp.*gigabit|400g-qsfp112-2)' ) ) OR ( diff --git a/packages/scraper/src/utils/verify-product-page-assets.ts b/packages/scraper/src/utils/verify-product-page-assets.ts new file mode 100644 index 0000000..67b25df --- /dev/null +++ b/packages/scraper/src/utils/verify-product-page-assets.ts @@ -0,0 +1,368 @@ +import { pool, markDetailsVerified, markImageVerified } from "./db"; + +const EXCLUDED_CATEGORIES = [ + "NonTransceiver", + "Accessory", + "Adapter / Converter", + "Switch / Media Converter", + "Switch / Network Infrastructure", + "NIC / Adapter", + "Mux / Passive Optical", + "Product Family", + "Loopback / Test Module", +]; + +const HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", +}; + +type Candidate = { + id: string; + vendorName: string; + partNumber: string; + productUrl: string; + formFactor: string | null; + speedGbps: number | null; + reachLabel: string | null; + fiberType: string | null; + imageVerified: boolean | null; + detailsVerified: boolean | null; +}; + +type InferredDetails = { + formFactor?: string; + speed?: string; + speedGbps?: number; + reachLabel?: string; + reachMeters?: number; + fiberType?: string; + wavelengths?: string; +}; + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function decodeEntities(value: string): string { + return value + .replace(/&/g, "&") + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/</g, "<") + .replace(/>/g, ">"); +} + +function absolutizeUrl(src: string, baseUrl: string): string { + try { + return new URL(decodeEntities(src), baseUrl).toString(); + } catch { + return ""; + } +} + +function stripHtml(html: string): string { + return html + .replace(//gi, " ") + .replace(//gi, " ") + .replace(/<[^>]+>/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function extractTitle(html: string): string { + const og = html.match(/]+property=["']og:title["'][^>]+content=["']([^"']+)["'][^>]*>/i) + || html.match(/]+content=["']([^"']+)["'][^>]+property=["']og:title["'][^>]*>/i); + if (og) return decodeEntities(og[1]); + const title = html.match(/]*>([\s\S]*?)<\/title>/i); + return title ? decodeEntities(stripHtml(title[1])) : ""; +} + +function extractImageUrl(html: string, baseUrl: string): string | undefined { + const meta = html.match(/]+property=["']og:image["'][^>]+content=["']([^"']+)["'][^>]*>/i) + || html.match(/]+content=["']([^"']+)["'][^>]+property=["']og:image["'][^>]*>/i) + || html.match(/]+name=["']twitter:image["'][^>]+content=["']([^"']+)["'][^>]*>/i) + || html.match(/]+content=["']([^"']+)["'][^>]+name=["']twitter:image["'][^>]*>/i); + if (meta) { + const url = absolutizeUrl(meta[1], baseUrl); + if (isLikelyProductImage(url)) return url; + } + + const idImage = html.match(/]+id=["']image["'][^>]+src=["']([^"']+)["'][^>]*>/i) + || html.match(/]+src=["']([^"']+)["'][^>]+id=["']image["'][^>]*>/i); + if (idImage) { + const url = absolutizeUrl(idImage[1], baseUrl); + if (isLikelyProductImage(url)) return url; + } + + const imgRegex = /]*\bsrc=["']([^"']+)["'][^>]*>/gi; + let match: RegExpExecArray | null; + while ((match = imgRegex.exec(html)) !== null) { + const tag = match[0]; + const url = absolutizeUrl(match[1], baseUrl); + const alt = (tag.match(/\balt=["']([^"']*)["']/i)?.[1] || "").toLowerCase(); + const haystack = `${url} ${alt}`.toLowerCase(); + if (!isLikelyProductImage(url)) continue; + if (/transceiver|optical|sfp|qsfp|osfp|xfp|module|base-/i.test(haystack)) return url; + } + + return undefined; +} + +function isLikelyProductImage(url: string): boolean { + const lower = url.toLowerCase(); + if (!lower || !/^https?:\/\//.test(lower)) return false; + if (!/\.(jpg|jpeg|png|webp)(\?|$)/.test(lower)) return false; + if (/(logo|country|icon|sprite|banner|menu|payment|placeholder|no-image|company|featured-|solution|fofc|mdc|mmc)/.test(lower)) { + return false; + } + return /(catalog\/product|\/uploads\/image\/|product|transceiver|sfp|qsfp|osfp|xfp)/.test(lower); +} + +function inferDetails(text: string): InferredDetails { + const lower = text.toLowerCase(); + const details: InferredDetails = {}; + + const formFactors: Array<[RegExp, string]> = [ + [/\bqsfp-dd\b/i, "QSFP-DD"], + [/\bqsfp112\b/i, "QSFP112"], + [/\bosfp112\b/i, "OSFP112"], + [/\bosfp\b/i, "OSFP"], + [/\bqsfp56\b/i, "QSFP56"], + [/\bqsfp28\b/i, "QSFP28"], + [/\bqsfp\+|\bqsfp\b/i, "QSFP+"], + [/\bsfp56\b/i, "SFP56"], + [/\bsfp28\b/i, "SFP28"], + [/\bsfp\+|\bsfp plus\b/i, "SFP+"], + [/\bxfp\b/i, "XFP"], + [/\bsfp\b/i, "SFP"], + ]; + for (const [regex, formFactor] of formFactors) { + if (regex.test(text)) { + details.formFactor = formFactor; + break; + } + } + + const speedPatterns: Array<[RegExp, string, number]> = [ + [/\b800\s*g(?:base)?\b/i, "800G", 800], + [/\b400\s*g(?:base)?\b/i, "400G", 400], + [/\b200\s*g(?:base)?\b/i, "200G", 200], + [/\b128\s*gfc\b/i, "128G FC", 128], + [/\b100\s*g(?:base)?\b/i, "100G", 100], + [/\b50\s*g(?:base)?\b/i, "50G", 50], + [/\b40\s*g(?:base)?\b/i, "40G", 40], + [/\b32\s*gfc\b/i, "32G FC", 32], + [/\b16\s*gfc\b/i, "16G FC", 16], + [/\b10\s*g(?:base)?\b/i, "10G", 10], + [/\b1\.?25\s*g(?:base)?\b|\b1000base\b|\b1\s*g(?:base)?\b/i, "1G", 1], + [/(^|[^.\d-])25\s*g(?:base)?\b/i, "25G", 25], + [/\b2[.-]?5\s*g(?:base)?\b/i, "2.5G", 2.5], + ]; + for (const [regex, speed, speedGbps] of speedPatterns) { + if (regex.test(text)) { + details.speed = speed; + details.speedGbps = speedGbps; + break; + } + } + + const kmMatch = text.match(/\b(160|120|100|80|60|40|30|20|10|3|2|1(?:\.4)?)\s*km\b/i); + const mMatch = text.match(/\b(550|500|400|300|150|100|80|30)\s*m(?:eters?)?\b/i); + if (kmMatch) { + const km = parseFloat(kmMatch[1]); + details.reachLabel = `${kmMatch[1]}km`; + details.reachMeters = Math.round(km * 1000); + } else if (mMatch) { + const meters = parseInt(mMatch[1], 10); + details.reachLabel = `${meters}m`; + details.reachMeters = meters; + } else if (/\bfr4\b/i.test(text)) { + details.reachLabel = "2km"; + details.reachMeters = 2000; + } else if (/\bdr\d*\b/i.test(text)) { + details.reachLabel = "500m"; + details.reachMeters = 500; + } else if (/\blr4?\b/i.test(text)) { + details.reachLabel = "10km"; + details.reachMeters = 10000; + } else if (/\ber4?\b/i.test(text)) { + details.reachLabel = "40km"; + details.reachMeters = 40000; + } else if (/\bzr4?\b/i.test(text)) { + details.reachLabel = "80km"; + details.reachMeters = 80000; + } else if (/\bsr\d*|\bsw4\b/i.test(text)) { + details.reachLabel = "300m"; + details.reachMeters = 300; + } + + if (/base-?t|rj-?45|cat ?6|cat ?5|copper/i.test(text)) { + details.fiberType = "Copper"; + } else if (/single.?mode|\bsmf\b|\blr\b|\ber\b|\bzr\b|\bfr\b|\bdr\b|cwdm|dwdm|bidi|psm/i.test(lower)) { + details.fiberType = "SMF"; + } else if (/multi.?mode|\bmmf\b|\bsr\b|\bsw\b/i.test(lower)) { + details.fiberType = "MMF"; + } + + const wavelength = text.match(/\b(\d{3,4})\s*nm\b/i); + if (wavelength) details.wavelengths = wavelength[1]; + + return details; +} + +function mergeDetails(existing: Candidate, inferred: InferredDetails): InferredDetails { + const updates: InferredDetails = {}; + if (inferred.formFactor && (!existing.formFactor || existing.formFactor === "SFP+" && inferred.formFactor.startsWith("QSFP"))) { + updates.formFactor = inferred.formFactor; + } + if (inferred.speedGbps && (!existing.speedGbps || inferred.speedGbps > existing.speedGbps)) { + updates.speed = inferred.speed; + updates.speedGbps = inferred.speedGbps; + } + if (inferred.reachLabel && !existing.reachLabel) { + updates.reachLabel = inferred.reachLabel; + updates.reachMeters = inferred.reachMeters; + } + if (inferred.fiberType && !existing.fiberType) { + updates.fiberType = inferred.fiberType; + } + return updates; +} + +async function fetchHtml(url: string): Promise { + const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); + if (!resp.ok) throw new Error(`HTTP ${resp.status}`); + return resp.text(); +} + +async function applyDetailUpdates(id: string, updates: InferredDetails): Promise { + const sets: string[] = []; + const values: unknown[] = [id]; + let idx = 2; + + if (updates.formFactor) { + sets.push(`form_factor = $${idx++}`); + values.push(updates.formFactor); + } + if (updates.speedGbps) { + sets.push(`speed_gbps = $${idx++}`); + values.push(updates.speedGbps); + } + if (updates.speed) { + sets.push(`speed = $${idx++}`); + values.push(updates.speed); + } + if (updates.reachMeters) { + sets.push(`reach_meters = $${idx++}`); + values.push(updates.reachMeters); + } + if (updates.reachLabel) { + sets.push(`reach_label = $${idx++}`); + values.push(updates.reachLabel); + } + if (updates.fiberType) { + sets.push(`fiber_type = $${idx++}`); + values.push(updates.fiberType); + } + if (updates.wavelengths) { + sets.push(`wavelengths = COALESCE(NULLIF(wavelengths, ''), $${idx++})`); + values.push(updates.wavelengths); + } + + if (sets.length === 0) return; + sets.push("updated_at = NOW()"); + await pool.query(`UPDATE transceivers SET ${sets.join(", ")} WHERE id = $1`, values); +} + +async function main(): Promise { + const vendorFilter = (process.env["PRODUCT_ASSET_VENDOR"] || "").trim(); + const limit = parseInt(process.env["PRODUCT_ASSET_LIMIT"] || "120", 10); + const apply = process.env["PRODUCT_ASSET_APPLY"] === "1"; + + const vendorNames = vendorFilter + ? vendorFilter.split(",").map((v) => v.trim()).filter(Boolean) + : ["T&S Communication", "10Gtek", "GBICS"]; + + const candidates = await pool.query( + `SELECT t.id, + v.name AS "vendorName", + t.part_number AS "partNumber", + t.product_page_url AS "productUrl", + t.form_factor AS "formFactor", + t.speed_gbps AS "speedGbps", + t.reach_label AS "reachLabel", + t.fiber_type AS "fiberType", + t.image_verified AS "imageVerified", + t.details_verified AS "detailsVerified" + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE v.name = ANY($1) + AND COALESCE(t.category, '') <> ALL($2) + AND COALESCE(t.product_page_url, '') ~* '^https?://' + AND ( + COALESCE(t.image_verified, false) = false + OR COALESCE(t.details_verified, false) = false + OR t.speed_gbps IS NULL + OR t.speed_gbps = 0 + OR COALESCE(t.fiber_type, '') = '' + ) + ORDER BY v.name, t.part_number + LIMIT $3`, + [vendorNames, EXCLUDED_CATEGORIES, limit], + ); + + let images = 0; + let details = 0; + let skipped = 0; + let errors = 0; + + console.log("=== Product page asset verifier ===", { vendorNames, limit, apply, count: candidates.rows.length }); + + for (const row of candidates.rows) { + try { + const html = await fetchHtml(row.productUrl); + const title = extractTitle(html); + const text = `${row.partNumber} ${row.productUrl} ${title}`; + const imageUrl = extractImageUrl(html, row.productUrl); + const inferred = inferDetails(text); + const updates = mergeDetails(row, inferred); + + if (!apply) { + console.log("dry-run product asset", { + vendor: row.vendorName, + partNumber: row.partNumber, + imageUrl, + updates, + }); + } else { + await applyDetailUpdates(row.id, updates); + if (imageUrl && !row.imageVerified) { + const changed = await markImageVerified(row.id, imageUrl); + if (changed) images++; + } + const marked = await markDetailsVerified({ transceiverId: row.id, sourceUrl: row.productUrl }); + if (marked) details++; + } + + if (!imageUrl && Object.keys(updates).length === 0) skipped++; + await sleep(750); + } catch (err) { + errors++; + console.warn(`asset verifier error for ${row.vendorName} ${row.partNumber}: ${(err as Error).message}`); + } + } + + console.log("Product page asset verifier complete", { images, details, skipped, errors, apply }); +} + +if (require.main === module) { + main() + .then(() => pool.end()) + .catch((err) => { + console.error("Fatal:", err); + pool.end(); + process.exit(1); + }); +} diff --git a/sync/CURRENT.md b/sync/CURRENT.md index 108d515..48d9659 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -37,6 +37,37 @@ Updated: 2026-05-09 22:15 UTC - reconcile must preserve explicit competitor research states - GBICS/T&S artifact quarantine rules +- TIP product-page asset verifier on 2026-05-09: + - added `verify:product-page-assets` + - deterministic scope: + - only existing `product_page_url` rows + - vendor-limited batches via `PRODUCT_ASSET_VENDOR` + - dry-run by default, apply only with `PRODUCT_ASSET_APPLY=1` + - extracts images from source-backed product image tags/meta only + - infers details only from part number, product URL, and title to avoid navigation pollution + - remote build passed on Erik + - live verifier results: + - GBICS extra quarantine: `17` additional category/family artifacts + - T&S Communication asset apply: `36` images, `36` details closed after a second DR8 reach pass + - 10Gtek/SFPcables asset apply: `5` images, `10` details improved on rows with existing product URLs + - current vendor gaps: + - GBICS: `71` active rows, `0` missing price, `0` missing image, `0` missing details + - T&S Communication: `36` active rows, `36` missing price, `0` missing image, `0` missing details + - 10Gtek: `175` active rows, `126` missing price, `126` missing image, `20` missing details + - interpretation: + - T&S is now product-data complete but public-price blocked; pages expose no real public price (`price: 0.00` / quote-only behavior) + - 10Gtek remaining gaps are mostly rows without reliable product URLs/price sources and need alias/source discovery rather than blind image guessing + - live health after this pass: + - active products: `17195` + - price verified: `11414` + - image verified: `12057` + - details verified: `16713` + - fully verified: `10459` + - competitor status: `matched=10775`, `no_valid_match=74`, `ambiguous=556`, `needs_research=5790` + - TIPLLM training pool updated with: + - product-page asset verifier dry-run/apply pattern + - T&S quote-only public-price rule + - MAGATAMA multi-LLM training lane expansion on 2026-05-09: - added first-class training lanes for: - `pulso_llm` diff --git a/sync/history/2026-05-09-tip-open-competitor-and-product-gap-probing.md b/sync/history/2026-05-09-tip-open-competitor-and-product-gap-probing.md index d7b4d79..41d4c27 100644 --- a/sync/history/2026-05-09-tip-open-competitor-and-product-gap-probing.md +++ b/sync/history/2026-05-09-tip-open-competitor-and-product-gap-probing.md @@ -136,3 +136,49 @@ Verification after a fresh reconcile test: - `ambiguous=556` - `needs_research=5807` - fully product-verified rows still in competitor `needs_research`: `0` + +## Follow-Up: Product Page Asset Verifier + +Added `packages/scraper/src/utils/verify-product-page-assets.ts` and script `verify:product-page-assets`. + +Safety design: + +- vendor-limited with `PRODUCT_ASSET_VENDOR` +- batch-limited with `PRODUCT_ASSET_LIMIT` +- dry-run by default +- apply only with `PRODUCT_ASSET_APPLY=1` +- uses existing `product_page_url` rows only +- writes through existing `markImageVerified` / `markDetailsVerified` helpers so the evidence ledger remains canonical +- infers details only from part number, product URL, and title; full body text was intentionally excluded after a dry-run showed T&S navigation links polluting speed inference + +Live runs: + +- extra GBICS quarantine: + - `17` category/family artifacts removed from the active base + - GBICS is now product-data complete in the active base: `71` active rows, `0/0/0` price/image/details gaps +- T&S Communication: + - product-page asset verifier applied + - `36` images verified + - `36` details verified after adding DR8/SR8 reach inference + - current active gaps: `36` active rows, `36` missing price, `0` missing image, `0` missing details + - interpretation: T&S is public-price blocked; pages expose no deterministic price, only quote/no-price or `price: 0.00` +- 10Gtek / SFPcables: + - dry-run caught `1.25G` / `2.5G` vs `25G` regex risk; regex was hardened before apply + - `5` images verified + - `10` details verified/improved + - current active gaps: `175` active rows, `126` missing price, `126` missing image, `20` missing details + - interpretation: remaining 10Gtek rows need URL/source alias discovery, not blind verification + +Live health after this pass: + +- health: healthy +- active products: `17195` +- price verified: `11414` +- image verified: `12057` +- details verified: `16713` +- fully verified: `10459` +- competitor status: + - `matched=10775` + - `no_valid_match=74` + - `ambiguous=556` + - `needs_research=5790` diff --git a/training-data/tip-llm-capabilities-v1.jsonl b/training-data/tip-llm-capabilities-v1.jsonl index 96f669e..195c2d2 100644 --- a/training-data/tip-llm-capabilities-v1.jsonl +++ b/training-data/tip-llm-capabilities-v1.jsonl @@ -47,3 +47,5 @@ {"id":"tip-20260509-quote-only-oem-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent prices for OEM or quote-only sources."},{"role":"user","content":"Large OEM vendors such as Cisco and Juniper have many rows without public prices. Should TIP mark those prices verified?"},{"role":"assistant","content":"No. OEM catalog rows can be detail-verified when public documentation provides technical specs, but price_verified requires a real public price observation or a deliberately modeled non-public-price state. Do not scrape estimates, reseller guesses, or competitor prices into OEM price_verified. If the source is quote-only or no public price exists, record that as a separate state and keep price_verified false until the data model supports it."}]} {"id":"tip-20260509-reconcile-preserve-explicit-competitor-states-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Preserve evidence-backed verification states during maintenance jobs."},{"role":"user","content":"A maintenance reconcile resets competitor_verified rows with no recent non-Flexoptix price. What states must it preserve?"},{"role":"assistant","content":"Preserve explicit research outcomes. `competitor_status=no_valid_match` and `competitor_status=ambiguous` are deliberate evidence-backed states and must not be reset to `needs_research` by a generic price freshness reconcile. Reconcile may reset stale `matched` rows when their required evidence disappears, but it must keep no-valid-match and ambiguous rows intact, then verify that fully product-verified rows left in `needs_research` stay at zero."}]} {"id":"tip-20260509-gbics-ts-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active transceiver counts honest by quarantining vendor navigation, category, PDF, cable, and infrastructure artifacts."},{"role":"user","content":"What did the GBICS and T&S cleanup teach the crawler robots?"},{"role":"assistant","content":"GBICS rows such as category labels, patch leads, active optical cables, direct-attach cables, Choose By pages, cart pages, vendor pages, NVIDIA/ConnectX/Cisco/WatchGuard/HP pages, network adapters, media converters, AOC/DAC and MTP/MPO cable pages are not active transceiver SKUs. T&S rows with non-/products URLs, PDFs, generic fiber-optic-transceivers family URLs, Product-Brochure names, or broad 100G/200G/400G-Transceiver family labels are artifacts. Quarantine these outside the active verification base, clear verification flags, and write `artifact_quarantine` evidence before recalculating health."}]} +{"id":"tip-20260509-product-page-asset-verifier-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Design deterministic product-page verifiers that improve evidence without broad crawling or invented values."},{"role":"user","content":"How should TIP safely backfill images and details from existing vendor product URLs?"},{"role":"assistant","content":"Use a targeted product-page asset verifier. Select active rows with an existing product_page_url and missing image/details; fetch one page per row with low concurrency; extract only likely product images from og:image, twitter:image, Magento product image IDs, or filtered product image URLs; infer details only from part number, product URL, and page title, not from full body navigation. Always dry-run first to catch navigation pollution such as T&S 800G menu links or 1.25G being misread as 25G. Apply through `markImageVerified` and `markDetailsVerified` so evidence is recorded."}]} +{"id":"tip-20260509-quote-only-tscom-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Public-price verification requires a real public price."},{"role":"user","content":"T&S Communication product pages have valid images and details but expose no real public prices, only no price or price 0.00. What should TIP do?"},{"role":"assistant","content":"Do not mark price_verified. Treat T&S as image/detail-verifiable but public-price-missing until a real price observation exists. `price: 0.00`, quote/request flows, missing amount fields, or SEO filenames containing `price` are not price evidence. Keep image_verified/details_verified true when source-backed, keep price_verified false, and document the vendor as quote-only/public-price-unavailable rather than fabricating a market price."}]}