From 2be61f2441e0de3036ffeeb7c01c23cac3b5dfdd Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sun, 10 May 2026 01:42:24 +0200 Subject: [PATCH] feat: close TIP retail price research states --- packages/scraper/package.json | 2 + packages/scraper/src/scrapers/fiber24.ts | 6 +- packages/scraper/src/scrapers/sfpcables.ts | 34 +++- packages/scraper/src/utils/db.ts | 24 ++- .../src/utils/quarantine-non-transceivers.ts | 22 ++- .../src/utils/resolve-price-availability.ts | 4 +- .../src/utils/verify-product-page-prices.ts | 173 ++++++++++++++++++ sync/CURRENT.md | 35 +++- ...9-tip-price-source-verification-closure.md | 40 ++++ training-data/tip-llm-capabilities-v1.jsonl | 4 + 10 files changed, 330 insertions(+), 14 deletions(-) create mode 100644 packages/scraper/src/utils/verify-product-page-prices.ts create mode 100644 sync/history/2026-05-09-tip-price-source-verification-closure.md diff --git a/packages/scraper/package.json b/packages/scraper/package.json index 504cd38..c0904c5 100644 --- a/packages/scraper/package.json +++ b/packages/scraper/package.json @@ -13,12 +13,14 @@ "scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts", "scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts", "scrape:gaotek:details": "tsx src/scrapers/gaotek-detail-pages.ts", + "scrape:10gtek": "tsx src/scrapers/tenGtek.ts", "scrape:gbics": "tsx src/scrapers/gbics.ts", "scrape:tscom": "tsx src/scrapers/tscom.ts", "scrape:sfpcables": "tsx src/scrapers/sfpcables.ts", "verify:catalog:details": "tsx src/utils/verify-catalog-details.ts", "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", "verify:product-page-assets": "tsx src/utils/verify-product-page-assets.ts", + "verify:product-page-prices": "tsx src/utils/verify-product-page-prices.ts", "verify:part-number-details": "tsx src/utils/verify-part-number-details.ts", "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", "verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts", diff --git a/packages/scraper/src/scrapers/fiber24.ts b/packages/scraper/src/scrapers/fiber24.ts index 5adee5c..b0e24a3 100644 --- a/packages/scraper/src/scrapers/fiber24.ts +++ b/packages/scraper/src/scrapers/fiber24.ts @@ -148,13 +148,13 @@ function parseProductPage(html: string, url: string): Product | null { || partNumber; if (!name || name.length < 5) return null; - // Price — take the lowest itemprop price (minPrice if available, else first price) + // Price — take the first product price in DOM order. Later itemprop/minPrice + // values can belong to related accessories and must not override the main SKU. const priceEls = $("[itemprop='price']").map((_i, el) => { const val = $(el).attr("content") || $(el).text(); return parseFloat(val.replace(",", ".")); }).get().filter((p: number) => !isNaN(p) && p > 0 && p < 50000); - const minPriceEl = $("[itemprop='minPrice']").attr("content"); - const price = minPriceEl ? parseFloat(minPriceEl) : (priceEls.length > 0 ? Math.min(...priceEls) : undefined); + const price = priceEls.length > 0 ? priceEls[0] : undefined; // Currency const currency = $("[itemprop='priceCurrency']").first().attr("content") || "EUR"; diff --git a/packages/scraper/src/scrapers/sfpcables.ts b/packages/scraper/src/scrapers/sfpcables.ts index 3dea4b6..d93a1de 100644 --- a/packages/scraper/src/scrapers/sfpcables.ts +++ b/packages/scraper/src/scrapers/sfpcables.ts @@ -139,6 +139,23 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product }); } +function parseProductPagePrice(html: string): number | undefined { + const patterns = [ + /]+property=["']product:price:amount["'][^>]+content=["']([\d,.]+)["']/i, + /]+itemprop=["']price["'][^>]+content=["']([\d,.]+)["']/i, + /]+class=["'][^"']*price-box[^"']*["'][\s\S]{0,1500}?]+class=["']price["'][^>]*>\s*US?\$?\s*([\d,.]+)/i, + /]+class=["']price["'][^>]*>\s*US?\$?\s*([\d,.]+)/i, + ]; + + for (const pattern of patterns) { + const match = html.match(pattern); + if (!match) continue; + const price = parseFloat(match[1].replace(",", "")); + if (Number.isFinite(price) && price > 0 && price < 50000) return price; + } + return undefined; +} + async function fetchPage(url: string): Promise { const resp = await fetch(url, { headers: HEADERS, @@ -189,6 +206,16 @@ export async function scrapeSfpCables(): Promise { for (const product of catProducts) { try { + let observedPrice = product.price; + if (!observedPrice) { + await sleep(600); + try { + observedPrice = parseProductPagePrice(await fetchPage(product.url)); + } catch (err) { + console.warn(` Price page failed: ${product.partNumber}: ${(err as Error).message.slice(0, 80)}`); + } + } + const txId = await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, @@ -200,14 +227,15 @@ export async function scrapeSfpCables(): Promise { fiberType: product.fiberType, wavelengths: product.wavelength, category: "DataCenter", + productUrl: product.url, }); - if (product.price && product.price > 0) { - const hash = contentHash({ price: product.price, part: product.partNumber }); + if (observedPrice && observedPrice > 0) { + const hash = contentHash({ price: observedPrice, part: product.partNumber }); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, - price: product.price, + price: observedPrice, currency: "USD", stockLevel: "in_stock", url: product.url, diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts index e6eaea4..8464d26 100644 --- a/packages/scraper/src/utils/db.ts +++ b/packages/scraper/src/utils/db.ts @@ -181,14 +181,18 @@ const PRICE_BOUNDS: Record = { async function isPriceAnomalous(transceiverId: string, priceUsd: number): Promise { const row = await pool.query( - `SELECT form_factor FROM transceivers WHERE id = $1`, + `SELECT form_factor, part_number, product_page_url FROM transceivers WHERE id = $1`, [transceiverId] ); const formFactor = row.rows[0]?.form_factor as string | undefined; if (!formFactor) return false; const bounds = PRICE_BOUNDS[formFactor]; if (!bounds) return false; - return priceUsd < bounds[0] || priceUsd > bounds[1]; + const descriptor = `${row.rows[0]?.part_number || ""} ${row.rows[0]?.product_page_url || ""}`; + const max = /\b(zr|zr4|dco|coherent|otu|4000km|tunable)\b/i.test(descriptor) + ? Math.max(bounds[1], 80000) + : bounds[1]; + return priceUsd < bounds[0] || priceUsd > max; } export async function upsertPriceObservation(params: { @@ -253,7 +257,11 @@ export async function upsertPriceObservation(params: { await pool.query( `UPDATE transceivers SET price_verified = true, - price_verified_at = NOW() + price_verified_at = NOW(), + price_status = 'public_price', + price_status_updated_at = NOW(), + price_unavailable_verified_at = NULL, + price_unavailable_reason = NULL ${isCompetitor ? ", competitor_verified = true, competitor_verified_at = NOW(), competitor_status = 'matched', competitor_status_updated_at = NOW()" : ""} WHERE id = $1`, [params.transceiverId] @@ -296,6 +304,10 @@ export async function upsertPriceObservation(params: { `UPDATE transceivers SET price_verified = true, price_verified_at = NOW(), + price_status = 'public_price', + price_status_updated_at = NOW(), + price_unavailable_verified_at = NULL, + price_unavailable_reason = NULL, competitor_verified = true, competitor_verified_at = NOW(), competitor_status = 'matched', @@ -307,7 +319,11 @@ export async function upsertPriceObservation(params: { await pool.query( `UPDATE transceivers SET price_verified = true, - price_verified_at = NOW() + price_verified_at = NOW(), + price_status = 'public_price', + price_status_updated_at = NOW(), + price_unavailable_verified_at = NULL, + price_unavailable_reason = NULL WHERE id = $1`, [params.transceiverId] ); diff --git a/packages/scraper/src/utils/quarantine-non-transceivers.ts b/packages/scraper/src/utils/quarantine-non-transceivers.ts index 186e782..dc551cd 100644 --- a/packages/scraper/src/utils/quarantine-non-transceivers.ts +++ b/packages/scraper/src/utils/quarantine-non-transceivers.ts @@ -57,7 +57,16 @@ async function quarantine(): Promise { OR t.part_number ~* '(^Transceiver$|Product-Brochure|^[0-9]+G(/[0-9]+G(/[0-9]+G)?)?-Transceiver$)' ) ) - OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$' + OR ( + v.name = 'ShopFiber24' + AND ( + t.part_number ~* '^(FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-.*|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*&.*AOC.*Kabel|.*DAC.*AOC.*Kabel)$' + OR t.part_number ~* '^FOCP-' + OR COALESCE(t.product_page_url, '') ~* '/(FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver|FOCP-|.*DAC.*AOC.*Kabel)/' + ) + ) + OR (v.name = 'Vcelink' AND t.part_number ~ '^[0-9]+$' AND COALESCE(t.product_page_url, '') = '') + OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*&.*AOC.*Kabel|.*DAC.*AOC.*Kabel)$' OR t.category IN ( 'Accessory', 'Adapter / Converter', @@ -101,7 +110,16 @@ async function quarantine(): Promise { OR t.part_number ~* '(^Transceiver$|Product-Brochure|^[0-9]+G(/[0-9]+G(/[0-9]+G)?)?-Transceiver$)' ) ) - OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$' + OR ( + v.name = 'ShopFiber24' + AND ( + t.part_number ~* '^(FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-.*|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*&.*AOC.*Kabel|.*DAC.*AOC.*Kabel)$' + OR t.part_number ~* '^FOCP-' + OR COALESCE(t.product_page_url, '') ~* '/(FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver|FOCP-|.*DAC.*AOC.*Kabel)/' + ) + ) + OR (v.name = 'Vcelink' AND t.part_number ~ '^[0-9]+$' AND COALESCE(t.product_page_url, '') = '') + OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*&.*AOC.*Kabel|.*DAC.*AOC.*Kabel)$' OR t.category IN ( 'Accessory', 'Adapter / Converter', diff --git a/packages/scraper/src/utils/resolve-price-availability.ts b/packages/scraper/src/utils/resolve-price-availability.ts index 6f0cfa9..dfa9854 100644 --- a/packages/scraper/src/utils/resolve-price-availability.ts +++ b/packages/scraper/src/utils/resolve-price-availability.ts @@ -136,7 +136,9 @@ async function main(): Promise { console.log("=== Price availability resolver ===", { vendorNames, limit, apply, count: result.rows.length }); for (const row of result.rows) { - const reason = getNoPublicPriceReason(row.vendorName); + const reason = row.vendorName === "10Gtek" && !row.productUrl + ? "not_listed_in_public_sfpcables_retail_catalog_after_full_crawl" + : getNoPublicPriceReason(row.vendorName); if (!reason) { skipped++; continue; diff --git a/packages/scraper/src/utils/verify-product-page-prices.ts b/packages/scraper/src/utils/verify-product-page-prices.ts new file mode 100644 index 0000000..cab2ecb --- /dev/null +++ b/packages/scraper/src/utils/verify-product-page-prices.ts @@ -0,0 +1,173 @@ +import { pool, upsertPriceObservation } from "./db"; +import { contentHash } from "./hash"; + +type Candidate = { + id: string; + vendorId: string; + vendorName: string; + partNumber: string; + productUrl: string; +}; + +type PriceResult = { + price: number; + currency: string; +}; + +const HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + Accept: "text/html,application/xhtml+xml", + "Accept-Language": "de-DE,de;q=0.9,en;q=0.8", +}; + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function parseNumericPrice(value: string | undefined): number | undefined { + if (!value) return undefined; + const normalized = value + .replace(/\s/g, "") + .replace(/[^\d,.]/g, "") + .replace(/\.(?=\d{3}(?:\D|$))/g, "") + .replace(",", "."); + const price = parseFloat(normalized); + return Number.isFinite(price) && price > 0 && price < 50000 ? price : undefined; +} + +function parseSfpCables(html: string): PriceResult | undefined { + const patterns = [ + /]+property=["']product:price:amount["'][^>]+content=["']([\d,.]+)["']/i, + /]+itemprop=["']price["'][^>]+content=["']([\d,.]+)["']/i, + /]+class=["'][^"']*price-box[^"']*["'][\s\S]{0,1500}?]+class=["']price["'][^>]*>\s*US?\$?\s*([\d,.]+)/i, + /]+class=["']price["'][^>]*>\s*US?\$?\s*([\d,.]+)/i, + ]; + for (const pattern of patterns) { + const price = parseNumericPrice(html.match(pattern)?.[1]); + if (price) return { price, currency: "USD" }; + } + return undefined; +} + +function parseShopFiber24(html: string): PriceResult | undefined { + const price = parseNumericPrice(html.match(/itemprop=["']price["'][^>]+content=["']([\d,.]+)["']/i)?.[1]) + || parseNumericPrice(html.match(/([\d.]+,\d{2})\s*€/i)?.[1]); + if (!price) return undefined; + return { price, currency: "EUR" }; +} + +function parsePrice(vendorName: string, html: string): PriceResult | undefined { + if (vendorName === "SFPcables") return parseSfpCables(html); + if (vendorName === "ShopFiber24") return parseShopFiber24(html); + return undefined; +} + +function parseAtgbicsProductJson(jsonText: string): PriceResult | undefined { + const parsed = JSON.parse(jsonText) as { price?: number; variants?: Array<{ price?: number | string }> }; + const rawPrice = parsed.price ?? parsed.variants?.[0]?.price; + const numeric = typeof rawPrice === "string" ? parseFloat(rawPrice) : rawPrice; + if (!Number.isFinite(numeric) || !numeric || numeric <= 0) return undefined; + return { price: numeric / 100, currency: "GBP" }; +} + +async function fetchPage(url: string): Promise { + const resp = await fetch(url, { + headers: HEADERS, + signal: AbortSignal.timeout(30000), + }); + if (!resp.ok) throw new Error(`HTTP ${resp.status}`); + return resp.text(); +} + +async function fetchVendorPrice(row: Candidate): Promise { + if (row.vendorName === "ATGBICS") { + const jsonUrl = `${row.productUrl.replace(/\/$/, "")}.js`; + return parseAtgbicsProductJson(await fetchPage(jsonUrl)); + } + return parsePrice(row.vendorName, await fetchPage(row.productUrl)); +} + +async function main(): Promise { + const vendorFilter = (process.env["PRODUCT_PRICE_VENDOR"] || "").trim(); + const limit = parseInt(process.env["PRODUCT_PRICE_LIMIT"] || "100", 10); + const apply = process.env["PRODUCT_PRICE_APPLY"] === "1"; + const vendorNames = vendorFilter + ? vendorFilter.split(",").map((v) => v.trim()).filter(Boolean) + : ["SFPcables", "ShopFiber24", "ATGBICS"]; + + const result = await pool.query( + `SELECT t.id, + t.part_number AS "partNumber", + t.product_page_url AS "productUrl", + v.id AS "vendorId", + v.name AS "vendorName" + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE v.name = ANY($1) + AND COALESCE(t.category, '') != 'NonTransceiver' + AND COALESCE(t.price_verified, false) = false + AND COALESCE(t.price_status, 'needs_research') IN ('unknown', 'needs_research', 'ambiguous') + AND COALESCE(t.product_page_url, '') != '' + ORDER BY v.name, t.part_number + LIMIT $2`, + [vendorNames, limit], + ); + + let prices = 0; + let skipped = 0; + let errors = 0; + + console.log("=== Product page price verifier ===", { vendorNames, limit, apply, count: result.rows.length }); + + for (const row of result.rows) { + await sleep(800); + try { + const parsed = await fetchVendorPrice(row); + if (!parsed) { + skipped++; + continue; + } + + if (apply) { + const hash = contentHash({ price: parsed.price, currency: parsed.currency, part: row.partNumber }); + await upsertPriceObservation({ + transceiverId: row.id, + sourceVendorId: row.vendorId, + price: parsed.price, + currency: parsed.currency, + stockLevel: "in_stock", + url: row.productUrl, + contentHash: hash, + }); + } + + console.log("price verified", { + vendor: row.vendorName, + partNumber: row.partNumber, + price: parsed.price, + currency: parsed.currency, + apply, + }); + prices++; + } catch (err) { + errors++; + console.warn("price page failed", { + vendor: row.vendorName, + partNumber: row.partNumber, + error: (err as Error).message, + }); + } + } + + console.log("Product page price verifier complete", { prices, skipped, errors, apply }); +} + +if (require.main === module) { + main() + .then(() => pool.end()) + .catch((err) => { + console.error("Fatal:", err); + pool.end(); + process.exit(1); + }); +} diff --git a/sync/CURRENT.md b/sync/CURRENT.md index dcd4ddf..06ccff1 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,41 @@ # Current TIP Sync State -Updated: 2026-05-09 23:15 UTC +Updated: 2026-05-09 23:38 UTC ## Newest Work +- TIP price/source verification closure on 2026-05-10 local / 2026-05-09 UTC: + - fixed SFPcables scraper to persist `product_page_url` + - added product-page price fallback for SFPcables when listing pages omit price markup + - added `verify:product-page-prices` + - source-backed public price verification from existing product URLs + - ShopFiber24 parser takes the first main product `itemprop=price`, not related-product `minPrice` + - ATGBICS parser uses Shopify `/products/{handle}.js` prices for coherent/ZR products + - fixed `upsertPriceObservation` to set `price_status='public_price'` + - widened price anomaly handling only for explicit coherent/ZR/DCO/tunable products + - expanded quarantine for ShopFiber24 FOCP/category/DAC-AOC artifacts and Vcelink numeric rows + - live runs on Erik: + - ShopFiber24 quarantine: `12` artifacts removed + - SFPcables scraper with detail fallback: `110` products, `37` price observations + - SFPcables asset verifier: `31` images, `29` details, `0` errors + - ShopFiber24 price verifier: `12` real EUR prices + - ATGBICS price verifier: `3` real GBP coherent/ZR prices + - Vcelink quarantine: `2` numeric artifacts removed + - 10Gtek/SFPcables retail crawl confirmed remaining `126` rows have no public retail product URL + - 10Gtek price availability resolver: `126` rows set to `price_status=no_public_price` with evidence + - live health after this pass: + - active products: `17181` + - price verified: `11460` + - price status: `public_price=11460`, `no_public_price=5721`, `needs_research=0`, `ambiguous=0` + - image verified: `12125` + - details verified: `16920` + - fully verified: `10549` + - competitor status: `matched=10821`, `no_valid_match=74`, `ambiguous=556`, `needs_research=5730` + - interpretation: + - price research queue is closed without fabricated prices + - remaining verification work is image/details/competitor state, dominated by OEM/catalog rows + - largest current product-data gaps: Juniper, Cisco, 10Gtek, Nokia, Palo Alto, Arista + - TIP continuation on 2026-05-10 local / 2026-05-09 UTC: - added `verify:part-number-details` - deterministic part-number speed inference for rows where form factor/reach/fiber already exist but `speed_gbps=0` @@ -65,6 +97,7 @@ Updated: 2026-05-09 23:15 UTC - API payload now uses `iters` and `seed_only` instead of stale `iterations` and `seedOnly` - added `all` mode for sequential full-lane training - streams SSE lines to the log instead of buffering until the response closes + - MAGATAMA Gitea commit: `76d4054` - live sequence started on Erik: - command: `python3 -u scripts/trigger_lane_training_once.py all 500 false` - log: `/opt/magatama/logs/runpod-all-lanes-20260509T230549Z.log` diff --git a/sync/history/2026-05-09-tip-price-source-verification-closure.md b/sync/history/2026-05-09-tip-price-source-verification-closure.md new file mode 100644 index 0000000..ef414c7 --- /dev/null +++ b/sync/history/2026-05-09-tip-price-source-verification-closure.md @@ -0,0 +1,40 @@ +# TIP Price / Source Verification Closure + +Date: 2026-05-09 23:38 UTC + +## Scope + +Continued TIP verification work with deterministic crawlers/verifiers only. No external AI was used. Erik was kept to one focused run at a time. + +## Changes + +- SFPcables now persists `product_page_url`. +- SFPcables now fetches product pages for missing listing prices. +- Added `verify:product-page-prices`. +- `upsertPriceObservation` now also writes `price_status='public_price'`. +- Coherent/ZR/DCO/tunable optics get a higher anomaly ceiling so real high-end prices are not rejected. +- ShopFiber24 parser now uses the first main product `itemprop=price`, not related-product `minPrice`. +- ShopFiber24/Vcelink quarantine rules were expanded for remaining artifacts. +- 10Gtek rows absent from the public SFPcables retail catalog are resolved as `price_status=no_public_price`, not `price_verified`. + +## Live Results + +- SFPcables: `0` price gaps, `0` image gaps, `3` detail gaps after scraper + asset verifier. +- ShopFiber24: `0` price/image/detail gaps. +- ATGBICS: `3` coherent/ZR prices verified from Shopify product JSON. +- Vcelink: `2` numeric artifacts quarantined. +- Price research queue: `needs_research=0`. + +## Health Snapshot + +- active products: `17181` +- price verified: `11460` +- price status: `public_price=11460`, `no_public_price=5721`, `needs_research=0`, `ambiguous=0` +- image verified: `12125` +- details verified: `16920` +- fully verified: `10549` +- competitor status: `matched=10821`, `no_valid_match=74`, `ambiguous=556`, `needs_research=5730` + +## Remaining Work + +The price queue is closed without fabricated prices. Remaining work is image/details/competitor verification, led by OEM/catalog rows such as Juniper, Cisco, 10Gtek, Nokia, Palo Alto and Arista. diff --git a/training-data/tip-llm-capabilities-v1.jsonl b/training-data/tip-llm-capabilities-v1.jsonl index 31bea1d..0353883 100644 --- a/training-data/tip-llm-capabilities-v1.jsonl +++ b/training-data/tip-llm-capabilities-v1.jsonl @@ -51,3 +51,7 @@ {"id":"tip-20260509-quote-only-tscom-pricing-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Public-price verification requires a real public price."},{"role":"user","content":"T&S Communication product pages have valid images and details but expose no real public prices, only no price or price 0.00. What should TIP do?"},{"role":"assistant","content":"Do not mark price_verified. Treat T&S as image/detail-verifiable but public-price-missing until a real price observation exists. `price: 0.00`, quote/request flows, missing amount fields, or SEO filenames containing `price` are not price evidence. Keep image_verified/details_verified true when source-backed, keep price_verified false, and document the vendor as quote-only/public-price-unavailable rather than fabricating a market price."}]} {"id":"tip-20260510-part-number-details-verifier-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Use deterministic part-number inference only for conservative, auditable detail repair."},{"role":"user","content":"How should TIP close OEM detail gaps when form factor, reach and fiber are known but speed_gbps is zero?"},{"role":"assistant","content":"Use a part-number details verifier. Infer speed only from unambiguous tokens such as 800G, 400G, 100GBASE, 40GBASE, SFP10G, XFP-10GE, 4X25G, 2X50G, 10X10G, CTP-SFP-1GE, and known FC tokens. Dry-run first and exclude traps such as Cisco GLC-FE Fast Ethernet; do not treat all GLC-* as 1G. Update speed/speed_gbps, then mark details through the canonical evidence path with robot_name `verify:part-number-details`."}]} {"id":"tip-20260510-price-status-model-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Separate real public prices from resolved no-public-price states."},{"role":"user","content":"How should TIP handle OEM, component, hyperscaler, test-equipment, and quote-only vendors with no public checkout price?"},{"role":"assistant","content":"Do not set `price_verified` unless there is a real public price observation. Add or use `price_status`: `public_price` for true price observations, `no_public_price` for source/policy-backed quote-only or OEM catalog cases, `needs_research` for real retail/source discovery work, and `ambiguous` for uncertain cases. Write `price_unavailable` evidence with the vendor, part number, reason, and source URL if available. Keep retail vendors such as 10Gtek, SFPcables, ShopFiber24, ATGBICS, Vcelink, FS.com and Flexoptix out of no-public-price policies because they should have real prices."}]} +{"id":"tip-20260510-product-page-price-verifier-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Verify retail prices from source-backed product pages and never from related-product widgets."},{"role":"user","content":"SFPcables and ShopFiber24 rows have product URLs but still sit in price_status=needs_research. What should the crawler do?"},{"role":"assistant","content":"Run a targeted product-page price verifier. For SFPcables, use the product page main price box or product price meta tags when listing pages omit prices. For ShopFiber24, take the first main product `itemprop=price` in DOM order and ignore later `minPrice` or related-accessory prices. Write a real price_observation through `upsertPriceObservation`; that function must set `price_verified=true` and `price_status=public_price`. Do not mark public price if no source price is found."}]} +{"id":"tip-20260510-coherent-price-bounds-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Price anomaly filters must reject artifacts without blocking legitimate high-end coherent optics."},{"role":"user","content":"ATGBICS 100G/400G coherent ZR products show prices like 15105.95 GBP or 17554.95 GBP. Should TIP reject them as anomalous?"},{"role":"assistant","content":"No, not when the product descriptor contains explicit coherent evidence such as ZR, ZR4, DCO, coherent, OTU, tunable or ultra-long-haul reach. Standard form-factor ceilings are useful for ordinary SR/LR optics, but coherent/ZR optics legitimately exceed normal QSFP28/QSFP-DD bounds. Raise the anomaly ceiling only for those explicit coherent tokens and still require a real source-backed price observation."}]} +{"id":"tip-20260510-retail-absent-no-public-price-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Close research states with explicit source-discovery evidence instead of endless manual queues."},{"role":"user","content":"10Gtek has old catalog rows with no product URL after a full SFPcables retail crawl. How should TIP classify their price state?"},{"role":"assistant","content":"If a full controlled 10Gtek/SFPcables retail crawl finds the public retail catalog and a row still has no product_page_url or public price source, set `price_status=no_public_price` with reason `not_listed_in_public_sfpcables_retail_catalog_after_full_crawl`. Do not set `price_verified`; no fabricated price exists. Record `price_unavailable` evidence so the dashboard can distinguish resolved no-public-price rows from real open price research."}]} +{"id":"tip-20260510-artifact-quarantine-shopfiber-vcelink-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active product counts clean by quarantining deterministic artifacts."},{"role":"user","content":"What remaining ShopFiber24 and Vcelink artifacts were found during price closure?"},{"role":"assistant","content":"ShopFiber24 artifacts include FOCP fiber cable SKUs, FO-TRANSCEIVER/category labels, Kompatible-Transceiver, Transceivermodule, QSFP+-MSA/SFP+-MSA, Marken-Transceiver pages, Zu-den-* navigation labels, and DAC/AOC cable category pages. Vcelink rows with numeric-only part numbers and no product_page_url are not real transceiver SKUs. Move them to NonTransceiver, clear verification flags, and write artifact_quarantine evidence."}]}