From 1af4f090f7920d847164bd2708ca5adc0442c53b Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 9 May 2026 22:16:29 +0200 Subject: [PATCH] fix: harden TIP verification cleanup --- packages/api/src/db/queries.ts | 1 + packages/api/src/routes/health.ts | 11 + packages/scraper/package.json | 3 + .../scraper/src/robots/verification-robots.ts | 4 + packages/scraper/src/scheduler.ts | 11 + packages/scraper/src/scrapers/ascentoptics.ts | 6 + .../src/scrapers/gaotek-detail-pages.ts | 191 ++++++++++++++++++ packages/scraper/src/scrapers/gaotek.ts | 8 + packages/scraper/src/scrapers/tenGtek.ts | 12 ++ .../src/utils/normalize-product-urls.ts | 51 +++++ .../src/utils/quarantine-non-transceivers.ts | 99 +++++++++ sync/CURRENT.md | 59 +++++- ...-artifact-cleanup-and-vendor-completion.md | 86 ++++++++ training-data/tip-llm-capabilities-v1.jsonl | 4 + 14 files changed, 545 insertions(+), 1 deletion(-) create mode 100644 packages/scraper/src/scrapers/gaotek-detail-pages.ts create mode 100644 packages/scraper/src/utils/normalize-product-urls.ts create mode 100644 packages/scraper/src/utils/quarantine-non-transceivers.ts create mode 100644 sync/history/2026-05-09-tip-verification-artifact-cleanup-and-vendor-completion.md diff --git a/packages/api/src/db/queries.ts b/packages/api/src/db/queries.ts index 3194a26..c507ce9 100644 --- a/packages/api/src/db/queries.ts +++ b/packages/api/src/db/queries.ts @@ -22,6 +22,7 @@ export async function searchTransceivers(params: SearchParams) { const conditions: string[] = [ `COALESCE(t.data_confidence, 'unknown') != 'garbage'`, `COALESCE(t.product_page_url, '') NOT LIKE '%/category/%'`, + `COALESCE(t.category, '') NOT IN ('NonTransceiver', 'Accessory', 'Adapter / Converter', 'Switch / Media Converter', 'Switch / Network Infrastructure', 'NIC / Adapter', 'Mux / Passive Optical', 'Product Family', 'Loopback / Test Module')`, ]; const values: any[] = []; let idx = 1; diff --git a/packages/api/src/routes/health.ts 
b/packages/api/src/routes/health.ts index 05b26d6..818a9d7 100644 --- a/packages/api/src/routes/health.ts +++ b/packages/api/src/routes/health.ts @@ -24,6 +24,17 @@ healthRouter.get("/", async (_req: Request, res: Response) => { FROM transceivers WHERE COALESCE(data_confidence, 'unknown') != 'garbage' AND COALESCE(product_page_url, '') NOT LIKE '%/category/%' + AND COALESCE(category, '') NOT IN ( + 'NonTransceiver', + 'Accessory', + 'Adapter / Converter', + 'Switch / Media Converter', + 'Switch / Network Infrastructure', + 'NIC / Adapter', + 'Mux / Passive Optical', + 'Product Family', + 'Loopback / Test Module' + ) `).catch(() => ({ rows: [{}] })); const v = verStats.rows[0] || {}; diff --git a/packages/scraper/package.json b/packages/scraper/package.json index 07fd9b2..9584f9d 100644 --- a/packages/scraper/package.json +++ b/packages/scraper/package.json @@ -12,7 +12,10 @@ "scrape:fs:url-discovery": "FS_URL_DISCOVERY_ONLY=1 TIP_FORCE_REVALIDATE=1 tsx src/scrapers/fs-com.ts", "scrape:atgbics:details": "tsx src/scrapers/atgbics-detail-pages.ts", "scrape:vendors:details": "tsx src/scrapers/shopfiber24-fibermall-detail-pages.ts", + "scrape:gaotek:details": "tsx src/scrapers/gaotek-detail-pages.ts", "verify:catalog:details": "tsx src/utils/verify-catalog-details.ts", + "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", + "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", "scrape:cisco": "tsx src/scrapers/cisco-tmg.ts", "scrape:optcore": "tsx src/scrapers/optcore.ts", "scrape:news": "tsx src/scrapers/news.ts", diff --git a/packages/scraper/src/robots/verification-robots.ts b/packages/scraper/src/robots/verification-robots.ts index 0eda3d4..ac067ee 100644 --- a/packages/scraper/src/robots/verification-robots.ts +++ b/packages/scraper/src/robots/verification-robots.ts @@ -157,6 +157,8 @@ export async function getVerificationStatus(): Promise<{ summary: Record(` @@ -171,6 +173,8 @@ export async function 
getVerificationStatus(): Promise<{ summary: Record 0 ORDER BY diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 439adb1..02f68ff 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -2622,6 +2622,17 @@ export async function registerWorkers(boss: PgBoss): Promise { WHERE product_page_url IS NOT NULL AND product_page_url != '' AND product_page_url NOT LIKE '%/category/%' + AND COALESCE(category, '') NOT IN ( + 'NonTransceiver', + 'Accessory', + 'Adapter / Converter', + 'Switch / Media Converter', + 'Switch / Network Infrastructure', + 'NIC / Adapter', + 'Mux / Passive Optical', + 'Product Family', + 'Loopback / Test Module' + ) AND form_factor IS NOT NULL AND speed_gbps IS NOT NULL AND part_number IS NOT NULL diff --git a/packages/scraper/src/scrapers/ascentoptics.ts b/packages/scraper/src/scrapers/ascentoptics.ts index 2f93e2b..b8f7b63 100644 --- a/packages/scraper/src/scrapers/ascentoptics.ts +++ b/packages/scraper/src/scrapers/ascentoptics.ts @@ -148,6 +148,12 @@ function parseProductTable( // Skip header rows and non-product rows if (!rawPart || rawPart.length < 3 || /part\s*no|description/i.test(rawPart)) return; if (rawPart.length > 80) return; + if ( + rawPart.startsWith("--") || + /^(Distance|Optical-Transceivers|Coherent-Transceivers|Fiber-Channel-Transceivers|LPO-Transceivers|Liquid-Cooling-Transceivers|PON-Transceivers|XGSPON)$/i.test(rawPart) || + /Transceivers$/i.test(rawPart) || + /^(QSFP112|QSFP112-DD|QSFP-DD800|QSFP28-DD|QSFP56-DD|SFP-DD|SFP112|SFP56-DD|OSFP224|OSFP-XD)$/i.test(rawPart) + ) return; const url = (() => { const a = partNumberCell.find("a[href]").first().attr("href") ?? 
/** HTTP request headers sent with every GAO Tek product-page fetch. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-GAOTekDetailVerifier/1.0)",
  Accept: "text/html,application/xhtml+xml",
  "Accept-Language": "en-US,en;q=0.9",
};

/** Columns selected for each transceiver row processed by this verifier. */
interface Row {
  id: string;
  part_number: string;
  product_page_url: string;
}

/** Resolves after `ms` milliseconds; used to pace consecutive page fetches. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Downloads `url` with the verifier headers and returns the response body as text.
 *
 * @throws Error with message `HTTP <status>` when the response is not OK.
 *   The request is aborted after 25 000 ms via `AbortSignal.timeout`.
 */
async function fetchHtml(url: string): Promise<string> {
  const resp = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(25000),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
  return resp.text();
}

/** Collapses every whitespace run to a single space and trims both ends. */
function compact(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Derives the line rate (and a default form factor) from free text.
 *
 * Rates are tested from fastest to slowest and the first hit wins. Every
 * pattern carries a `(?<![\d.])` guard so that a rate is never read out of the
 * tail of a longer number: without it "1.25G", "3.125G" and "4.25G" all
 * contain "25G" and were reported as 25G / SFP28.
 *
 * @returns `{ speed: "Unknown", speedGbps: 0 }` (no `formFactor` key) when no
 *   known rate is mentioned.
 */
function parseSpeed(text: string): { speed: string; speedGbps: number; formFactor?: string } {
  const s = text.toUpperCase();
  if (/(?<![\d.])100\s*G/.test(s)) {
    return { speed: "100G", speedGbps: 100, formFactor: /CFP/.test(s) ? "CFP" : "QSFP28" };
  }
  if (/(?<![\d.])40\s*G/.test(s)) return { speed: "40G", speedGbps: 40, formFactor: "QSFP+" };
  if (/(?<![\d.])32\s*G/.test(s)) return { speed: "32G", speedGbps: 32, formFactor: "SFP+" };
  if (/(?<![\d.])25\s*G/.test(s)) return { speed: "25G", speedGbps: 25, formFactor: "SFP28" };
  if (/(?<![\d.])(?:10\s*G|11\.3\s*GB)/.test(s)) {
    return { speed: "10G", speedGbps: 10, formFactor: /XFP/.test(s) ? "XFP" : "SFP+" };
  }
  if (/(?<![\d.])3\.125\s*G/.test(s)) return { speed: "3.125G", speedGbps: 3.125, formFactor: "SFP" };
  if (/(?<![\d.])2\.5\s*G/.test(s)) return { speed: "2.5G", speedGbps: 2.5, formFactor: "SFP" };
  if (/(?<![\d.])(?:1\.25\s*G|1G|1000BASE)/.test(s)) return { speed: "1G", speedGbps: 1, formFactor: "SFP" };
  return { speed: "Unknown", speedGbps: 0 };
}

/**
 * Reach-code fallbacks, in the priority order used when no explicit distance
 * is present. A code only counts as a standalone token (optionally followed by
 * a lane digit such as "LR4"): it must not be preceded or followed by another
 * letter, so "TRANSCEIVER"/"FIBER" no longer read as "ER" and "LRM" is not
 * mistaken for "LR".
 */
const REACH_CODE_FALLBACKS: ReadonlyArray<{ pattern: RegExp; label: string; meters: number }> = [
  { pattern: /(?<![A-Z])ZR\d*(?![A-Z])/, label: "80km", meters: 80000 },
  { pattern: /(?<![A-Z])ER\d*(?![A-Z])/, label: "40km", meters: 40000 },
  { pattern: /(?<![A-Z])LR\d*(?![A-Z])/, label: "10km", meters: 10000 },
  { pattern: /(?<![A-Z])FR\d*(?![A-Z])/, label: "2km", meters: 2000 },
  { pattern: /(?<![A-Z])DR\d*(?![A-Z])/, label: "500m", meters: 500 },
  { pattern: /(?<![A-Z])SR\d*(?![A-Z])/, label: "100m", meters: 100 },
];

/**
 * Determines the reach of a module from free text.
 *
 * Explicit distances ("300m", "10 km") take precedence; when several are
 * present the largest one is returned. Only if none is found are the reach
 * codes (ZR/ER/LR/FR/DR/SR) consulted.
 *
 * @returns the reach label and its length in meters, or `null` when the text
 *   contains neither a positive distance nor a standalone reach code.
 */
function parseReach(text: string): { label: string; meters: number } | null {
  const distances = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*(km|m)\b/gi)]
    .map((m) => ({ value: Number(m[1]), unit: m[2].toLowerCase() }))
    .filter((d) => d.value > 0)
    .map((d) => (d.unit === "km" ? Math.round(d.value * 1000) : Math.round(d.value)));

  if (distances.length > 0) {
    const maxMeters = Math.max(...distances);
    return {
      label: maxMeters >= 1000 ? `${Math.round(maxMeters / 1000)}km` : `${maxMeters}m`,
      meters: maxMeters,
    };
  }

  const s = text.toUpperCase();
  const hit = REACH_CODE_FALLBACKS.find((candidate) => candidate.pattern.test(s));
  return hit ? { label: hit.label, meters: hit.meters } : null;
}

/**
 * Classifies the medium as "SMF", "MMF" or "Copper" from free text.
 * Single-mode hints are checked first, then multi-mode, then copper.
 *
 * @returns the medium, or an empty string when no hint matches.
 */
function parseFiber(text: string): string {
  if (/single[- ]?mode|\bSMF\b|9\s*(?:µm|um)|1310\s*nm|1550\s*nm|CWDM|DWDM|BIDI/i.test(text)) return "SMF";
  if (/multi[- ]?mode|\bMMF\b|850\s*nm|OM3|OM4/i.test(text)) return "MMF";
  if (/copper|RJ-?45|1000BASE-T|10GBASE-T|DAC|twinax/i.test(text)) return "Copper";
  return "";
}

/**
 * Lists the distinct 3–4 digit wavelengths (values followed by "nm") in order
 * of first appearance, capped at eight and joined with "/".
 *
 * @returns e.g. "1310/1550", or an empty string when none are found.
 */
function parseWavelengths(text: string): string {
  const found = [...text.matchAll(/\b(\d{3,4})\s*nm\b/gi)].map((m) => m[1]);
  return [...new Set(found)].slice(0, 8).join("/");
}
/**
 * Builds the text the spec parsers run on: the product title (first
 * `.product_title` / `h1`), the `og:description` or `description` meta content,
 * and the text of the WooCommerce summary/tab containers, whitespace-compacted.
 */
function extractText(html: string): string {
  const $ = cheerio.load(html);
  const title = $(".product_title, h1").first().text();
  const meta =
    $("meta[property='og:description']").attr("content") ||
    $("meta[name='description']").attr("content") ||
    "";
  const body = $(
    ".woocommerce-product-details__short-description, .summary, .woocommerce-Tabs-panel, .product-tabs-wrapper",
  ).text();
  return compact(`${title} ${meta} ${body}`);
}

/**
 * Loads up to `limit` GAO Tek rows that are not yet details-verified, are not
 * categorised as 'NonTransceiver', and whose URL is a gaotek.com product page.
 * Rows with a verified image come first, then by part number.
 */
async function loadRows(limit: number): Promise<Row[]> {
  const result = await pool.query(`
    SELECT t.id, t.part_number, t.product_page_url
    FROM transceivers t
    JOIN vendors v ON v.id = t.vendor_id
    WHERE v.name = 'GAO Tek'
      AND COALESCE(t.details_verified, false) = false
      AND COALESCE(t.category, '') != 'NonTransceiver'
      AND t.product_page_url LIKE 'https://gaotek.com/product/%'
    ORDER BY t.image_verified DESC, t.part_number
    LIMIT $1
  `, [limit]);
  return result.rows;
}

/**
 * Fetches each pending GAO Tek product page, parses reach, fiber type, speed
 * and wavelengths from it, fills only the columns that are still empty, and
 * then marks the row as details-verified.
 *
 * Rows whose page does not yield reach, fiber type and a positive speed are
 * skipped untouched. A failure on one row is logged and counted; it does not
 * stop the run. Requests are spaced 250 ms apart.
 *
 * The batch size comes from `GAOTEK_DETAIL_LIMIT` (default 500, minimum 1).
 */
export async function verifyGaoTekDetailPages(): Promise<void> {
  // A non-numeric value used to yield NaN (Math.max(1, NaN) is NaN) and was
  // passed straight to `LIMIT $1`; fall back to the default instead.
  const requested = Number.parseInt(process.env.GAOTEK_DETAIL_LIMIT ?? "", 10);
  const limit = Number.isNaN(requested) ? 500 : Math.max(1, requested);
  const rows = await loadRows(limit);
  logger.info("=== GAO Tek detail verifier ===", { rows: rows.length, limit });

  let updated = 0;
  let verified = 0;
  let skipped = 0;
  let errors = 0;

  for (const row of rows) {
    try {
      const html = await fetchHtml(row.product_page_url);
      const text = extractText(html);
      const reach = parseReach(text);
      const fiber = parseFiber(text);
      const speed = parseSpeed(text);
      const wavelengths = parseWavelengths(text);

      // Incomplete pages are never written: all three specs must be present.
      if (!reach || !fiber || speed.speedGbps <= 0) {
        skipped++;
        await sleep(250);
        continue;
      }

      // Each CASE keeps an existing value and only fills blanks; form_factor is
      // additionally replaced when it currently holds 'SFP+'.
      await pool.query(`
        UPDATE transceivers
        SET form_factor = CASE WHEN form_factor IS NULL OR form_factor = '' OR form_factor = 'SFP+' THEN COALESCE($2, form_factor) ELSE form_factor END,
            speed = CASE WHEN speed IS NULL OR speed = '' OR speed = 'Unknown' THEN $3 ELSE speed END,
            speed_gbps = CASE WHEN speed_gbps IS NULL OR speed_gbps = 0 THEN $4 ELSE speed_gbps END,
            reach_label = CASE WHEN reach_label IS NULL OR reach_label = '' THEN $5 ELSE reach_label END,
            reach_meters = CASE WHEN reach_meters IS NULL OR reach_meters = 0 THEN $6 ELSE reach_meters END,
            fiber_type = CASE WHEN fiber_type IS NULL OR fiber_type = '' THEN $7 ELSE fiber_type END,
            wavelengths = CASE WHEN wavelengths IS NULL OR wavelengths = '' THEN NULLIF($8, '') ELSE wavelengths END,
            updated_at = NOW()
        WHERE id = $1
      `, [
        row.id,
        speed.formFactor || null,
        speed.speed,
        speed.speedGbps,
        reach.label,
        reach.meters,
        fiber,
        wavelengths,
      ]);
      updated++;

      if (await markDetailsVerified({ transceiverId: row.id, sourceUrl: row.product_page_url })) {
        verified++;
      }
    } catch (err: unknown) {
      errors++;
      logger.warn("GAO Tek detail page failed", {
        partNumber: row.part_number,
        url: row.product_page_url,
        error: err instanceof Error ? err.message : String(err),
      });
    }
    await sleep(250);
  }

  logger.info("GAO Tek detail verifier complete", { updated, verified, skipped, errors });
}

// CLI entry point: run once, close the pool, exit non-zero on failure.
if (require.main === module) {
  verifyGaoTekDetailPages()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      logger.error("GAO Tek detail verifier failed", {
        error: err instanceof Error ? err.message : String(err),
      });
      try {
        // Wait for the pool to close before exiting so shutdown is not cut short.
        await pool.end();
      } catch {
        // Best effort: the pool may already be closed or unusable at this point.
      }
      process.exit(1);
    });
}
const titleEl = $(el).find(".wd-entities-title a, .woocommerce-loop-product__title, h2, h3, .product-title, .product-name").first(); const name = titleEl.text().trim(); if (!name || name.length < 5) return; + if (isNonTransceiverProduct(name)) return; const linkEl = $(el).find("a.wd-product-img-link[href], .wd-entities-title a[href], a[href]").first(); const href = linkEl.attr("href") || ""; @@ -192,6 +197,7 @@ function parseProductList(html: string): Product[] { if ( name.length < 8 || name.length > 200 || !isProductUrl(url) || + isNonTransceiverProduct(`${name} ${url}`) || !/sfp|qsfp|xfp|transceiver|optic/i.test(name) ) return; @@ -332,6 +338,8 @@ async function quarantineGaoTekCategoryArtifacts(vendorId: string): Promise { @@ -48,6 +49,7 @@ function sleep(ms: number): Promise { function detectReach(text: string): { label: string; meters: number } | undefined { const patterns: [RegExp, string, number][] = [ + [/\b1\s*-\s*4\s*km\b/i, "4km", 4000], [/\b80\s*km\b/i, "80km", 80000], [/\b40\s*km\b/i, "40km", 40000], [/\b20\s*km\b/i, "20km", 20000], @@ -56,6 +58,9 @@ function detectReach(text: string): { label: string; meters: number } | undefine [/\b500\s*m\b/i, "500m", 500], [/\b300\s*m\b/i, "300m", 300], [/\b100\s*m\b/i, "100m", 100], + [/\b80\s*meters?\b/i, "80m", 80], + [/\b40\s*meters?\b/i, "40m", 40], + [/\b30\s*meters?\b/i, "30m", 30], [/\bLR4\b/, "10km", 10000], [/\bLR\b/, "10km", 10000], [/\bER\b/, "40km", 40000], @@ -142,6 +147,10 @@ function parseProductsFromPage(html: string, cat: typeof CATEGORIES[number]): Pr // Detect reach and fiber type from product title const reach = detectReach(rawTitle); const fiber = detectFiber(rawTitle); + const imageMatch = block.match(/]+(?:src|data-src)="([^"]+)"/i); + const imageUrl = imageMatch?.[1] + ? (imageMatch[1].startsWith("http") ? 
imageMatch[1] : `${BASE}${imageMatch[1]}`) + : undefined; products.push({ partNumber, @@ -155,6 +164,7 @@ function parseProductsFromPage(html: string, cat: typeof CATEGORIES[number]): Pr reachLabel: reach?.label, reachMeters: reach?.meters, fiberType: fiber || undefined, + imageUrl, }); } @@ -239,6 +249,7 @@ export async function scrape10Gtek(): Promise { const txId = await findOrCreateScrapedTransceiver({ partNumber: product.partNumber, vendorId, + productUrl: product.url, formFactor: product.formFactor, speedGbps: product.speedGbps, speed: product.speed, @@ -246,6 +257,7 @@ export async function scrape10Gtek(): Promise { reachLabel: product.reachLabel, fiberType: product.fiberType, category: "DataCenter", + imageUrl: product.imageUrl, }); if (product.price && product.price > 0) { diff --git a/packages/scraper/src/utils/normalize-product-urls.ts b/packages/scraper/src/utils/normalize-product-urls.ts new file mode 100644 index 0000000..d7d7797 --- /dev/null +++ b/packages/scraper/src/utils/normalize-product-urls.ts @@ -0,0 +1,51 @@ +/** + * Product URL Normalizer + * + * Repairs malformed source URLs created by older scrapers, so follow-up image + * and detail fetchers can reach the actual product pages. 
/**
 * Origins that older scrapers prepended twice (e.g.
 * `https://www.mouser.dehttps://www.mouser.de/...`), with the log key under
 * which the number of repaired rows is reported.
 */
const DUPLICATED_ORIGINS = [
  { origin: "https://www.mouser.de", metric: "mouser_de_fixed" },
  { origin: "https://www.mouser.com", metric: "mouser_com_fixed" },
] as const;

/** Escapes regular-expression metacharacters so `literal` matches itself. */
function escapeRegex(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** Escapes LIKE wildcards (`%`, `_`) and the backslash escape character. */
function escapeLike(literal: string): string {
  return literal.replace(/[\\%_]/g, "\\$&");
}

/**
 * Rewrites every `product_page_url` that starts with `origin` written twice
 * so that it starts with `origin` once, and bumps `updated_at`.
 *
 * @returns the number of rows that were repaired.
 */
async function collapseDuplicatedOrigin(origin: string): Promise<number> {
  const doubled = `${origin}${origin}`;
  const result = await pool.query(
    `
    UPDATE transceivers
    SET product_page_url = regexp_replace(product_page_url, $1::text, $2::text),
        updated_at = NOW()
    WHERE product_page_url LIKE $3::text
    RETURNING id
  `,
    [`^${escapeRegex(doubled)}`, origin, `${escapeLike(doubled)}%`],
  );
  return result.rowCount ?? 0;
}

/**
 * Repairs malformed product URLs for every entry in `DUPLICATED_ORIGINS`, one
 * after the other, and logs how many rows were fixed per origin.
 */
async function normalizeProductUrls(): Promise<void> {
  logger.info("=== Product URL Normalizer ===");

  const fixed: Record<string, number> = {};
  for (const { origin, metric } of DUPLICATED_ORIGINS) {
    fixed[metric] = await collapseDuplicatedOrigin(origin);
  }

  logger.info("Product URL normalizer complete", fixed);
}

// CLI entry point: run once, close the pool, exit non-zero on failure.
if (require.main === module) {
  normalizeProductUrls()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      logger.error("Product URL normalizer failed", {
        error: err instanceof Error ? err.message : String(err),
      });
      try {
        // Wait for the pool to close before exiting so shutdown is not cut short.
        await pool.end();
      } catch {
        // Best effort: the pool may already be closed or unusable at this point.
      }
      process.exit(1);
    });
}
/**
 * SQL predicate for rows that are artifacts regardless of vendor keywords:
 * Flexoptix filter/redirect URLs, Ascent Optics family/category rows, known
 * category/support/cable URL shapes, FS.COM rows without an fs.com URL,
 * placeholder part numbers, and rows already in a non-transceiver category.
 *
 * The quarantine statement uses this predicate in both of its AND-ed groups;
 * it is defined once so the two copies cannot drift apart.
 */
const ARTIFACT_ROW_PREDICATE = `
      (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~ '^https://www\\.flexoptix\\.net/en/transceiver/?\\?')
      OR (
        v.name = 'Ascent Optics'
        AND (
          t.part_number LIKE '--%'
          OR t.part_number ~* '(^Distance$|^Optical-Transceivers$|^Coherent-Transceivers$|^Fiber-Channel-Transceivers$|^LPO-Transceivers$|^Liquid-Cooling-Transceivers$|^PON-Transceivers$|^XGSPON$|Transceivers$|^[A-Z0-9+-]+-DD$|^QSFP112$|^QSFP112-DD$|^QSFP-DD800$|^OSFP224$|^OSFP-XD$)'
          OR COALESCE(t.product_page_url, '') ~* '/category/'
        )
      )
      OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)'
      OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/')
      OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com'))
      OR t.part_number ~* '^(N/A|Change|wurde|Distance|FO-TRANSCEIVER|Kompatible-Transceiver|Transceivermodule|Zu-den-|QSFP\\+-MSA|SFP\\+-MSA|Marken-Transceiver.*|.*DAC.*AOC.*Kabel)$'
      OR t.category IN (
        'Accessory',
        'Adapter / Converter',
        'Switch / Media Converter',
        'Switch / Network Infrastructure',
        'NIC / Adapter',
        'Mux / Passive Optical',
        'Product Family',
        'Loopback / Test Module'
      )
`;

/**
 * Re-categorises non-transceiver rows as 'NonTransceiver' and clears their
 * price/image/details/competitor/fully verified flags and timestamps.
 *
 * A row is updated when all of the following hold:
 *  - its vendor is 'GAO Tek', or it matches the artifact predicate;
 *  - its part number or product URL contains one of the non-transceiver
 *    keywords (switch, tester, mux, converter, ...), or it matches the
 *    artifact predicate;
 *  - its category is not already 'NonTransceiver'.
 *
 * In effect: every artifact row, plus GAO Tek rows that carry a keyword.
 * The number of updated rows is logged.
 */
async function quarantine(): Promise<void> {
  logger.info("=== Non-Transceiver Quarantine ===");

  const result = await pool.query(`
    UPDATE transceivers t
    SET category = 'NonTransceiver',
        price_verified = false,
        image_verified = false,
        details_verified = false,
        competitor_verified = false,
        fully_verified = false,
        price_verified_at = NULL,
        image_verified_at = NULL,
        details_verified_at = NULL,
        competitor_verified_at = NULL,
        fully_verified_at = NULL,
        updated_at = NOW()
    FROM vendors v
    WHERE v.id = t.vendor_id
      AND (
        v.name = 'GAO Tek'
        OR (${ARTIFACT_ROW_PREDICATE})
      )
      AND (
        t.part_number ~* '(switch|walkie|radio|tester|meter|microscope|amplifier|oadm|demux|demultiplexer|multiplexer|mux|platform|transponder|transport[ -]system|solution|converter|adapter|cleaning|cassette|attenuator|telephone|sensor)'
        OR COALESCE(t.product_page_url, '') ~* '(switch|walkie|radio|tester|meter|microscope|amplifier|oadm|demux|demultiplexer|multiplexer|mux|platform|transponder|transport-system|solution|converter|adapter|cleaning|cassette|attenuator|telephone|sensor)'
        OR (${ARTIFACT_ROW_PREDICATE})
      )
      AND COALESCE(t.category, '') != 'NonTransceiver'
    RETURNING t.id
  `);

  logger.info("Non-transceiver quarantine complete", {
    quarantined: result.rowCount ?? 0,
  });
}

// CLI entry point: run once, close the pool, exit non-zero on failure.
if (require.main === module) {
  quarantine()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      logger.error("Non-transceiver quarantine failed", {
        error: err instanceof Error ? err.message : String(err),
      });
      try {
        // Wait for the pool to close before exiting so shutdown is not cut short.
        await pool.end();
      } catch {
        // Best effort: the pool may already be closed or unusable at this point.
      }
      process.exit(1);
    });
}
URL pass quarantined 103 artifacts + - Ascent/Flex/FS/Arista/ShopFiber/Coherent cleanup quarantined 68 + 38 + 6 additional artifacts + - GAO detail verifier: + - 245 GAO product pages examined + - 181 rows updated and details verified + - 64 skipped because source text still lacked complete deterministic specs + - Mouser URL normalizer: + - 388 malformed `mouser.de` URLs repaired + - 10Gtek scraper: + - 50 product pages parsed via sfpcables.com + - URL/image propagation repaired for future verification + - Ascent scraper: + - 237 genuine product rows kept after parser hardening + - category/family rows no longer re-enter active verification + - FS.com DB detail run: + - 1 remaining detail page scraped + - 1 price observation and 1 spec verification written + - reconcile completed + - equivalence matcher completed at `2026-05-09 20:11:39 UTC` + - latest live TIP health: + - status `healthy` + - load status `ok` + - memory used `13%` + - active total `17,405` + - `price_verified=11,523` + - `image_verified=12,125` + - `details_verified=16,810` + - `fully_verified=10,758` + - vendor truth after cleanup: + - active Flexoptix products now have price/image/details complete; remaining `not_full=280` is competitor-match only + - active FS.com products now have price/image/details complete; remaining `not_full=74` is competitor-match only + - GAO Tek remains quote-only/no public prices: 433 active rows still blocked by missing public price/competitor evidence + - Juniper/Cisco/Eoptolink/Ascent/OEM families remain the largest open blockers because public price/image evidence is not available for many rows + - TIPLLM training pool: + - appended deterministic lessons to `training-data/tip-llm-capabilities-v1.jsonl` + - JSONL validated locally + - TIP global verification continuation on 2026-05-09: - operator requirement: - continue until all possible product data is searched, found, verified, and source-backed diff --git 
a/sync/history/2026-05-09-tip-verification-artifact-cleanup-and-vendor-completion.md b/sync/history/2026-05-09-tip-verification-artifact-cleanup-and-vendor-completion.md new file mode 100644 index 0000000..5c0f2b9 --- /dev/null +++ b/sync/history/2026-05-09-tip-verification-artifact-cleanup-and-vendor-completion.md @@ -0,0 +1,86 @@ +# TIP Verification Artifact Cleanup And Vendor Completion — 2026-05-09 + +## Scope + +- Continue TIP verification with deterministic robots only. +- Keep Erik safe by avoiding broad parallel crawl waves. +- Do not use external AI; TIPLLM training receives the lessons, not runtime inference. +- Sync all learnings into Gitea for Claude/Codex handoff. + +## Implemented + +- Added `verify:quarantine:non-transceivers`. + - Excludes obvious non-transceiver artifacts from active product verification. + - Clears price/image/details/competitor/fully flags on those rows. + - Covers GAO, Ascent, FS.com, Flexoptix, Arista, ShopFiber24, and Coherent artifact patterns. +- Added `verify:normalize:product-urls`. + - Repairs duplicated Mouser URL prefixes. +- Added `scrape:gaotek:details`. + - Lightweight fetch+cheerio verifier for GAO product pages. +- Hardened Ascent parser. + - Skips category/family rows before they enter the database. +- Repaired 10Gtek/SFPcables scraper. + - Passes product URL and image URL into the common verification path. + - Adds deterministic reach parsing for common meter/range text. +- Hardened scheduler reconcile. + - Does not promote excluded non-transceiver categories into `details_verified`. + +## Live Runs + +- Non-transceiver cleanup: + - 121 artifacts quarantined. + - 103 Flexoptix filter URL artifacts quarantined. + - 68 Ascent/category artifacts quarantined. + - 38 FS/Flex/Arista/ShopFiber/Coherent artifacts quarantined. + - 6 final FS/Flex redirect/no-source artifacts quarantined. +- GAO detail verifier: + - 245 product pages inspected. + - 181 rows updated and details verified. 
+ - 64 skipped because the source still lacked complete deterministic specs. +- Mouser URL normalizer: + - 388 malformed `mouser.de` URLs repaired. +- 10Gtek/SFPcables: + - 50 products parsed after URL/image propagation fix. +- Ascent: + - 237 genuine products kept after category filtering. +- FS.com: + - 1 remaining DB detail page scraped. + - 1 price observation and 1 spec verification written. +- Reconcile completed. +- Equivalence matcher completed at `2026-05-09 20:11:39 UTC`. + +## Final Observed State + +- TIP health: healthy. +- Load: ok. +- Memory used: 13%. +- Active total: 17,405. +- Price verified: 11,523. +- Image verified: 12,125. +- Details verified: 16,810. +- Fully verified: 10,758. + +## Vendor Truth + +- Flexoptix: + - Active products have price/image/details complete. + - Remaining not-full rows are competitor-match only. +- FS.com: + - Active products have price/image/details complete. + - Remaining not-full rows are competitor-match only. +- GAO Tek: + - Quote-only/no public prices in crawled catalog. + - Prices were not fabricated. +- OEM-heavy vendors: + - Juniper, Cisco, Eoptolink, Ascent and similar vendors remain blocked mostly by missing public price/image/competitor evidence. + +## Training Pool + +- Appended four TIPLLM lessons to `training-data/tip-llm-capabilities-v1.jsonl`. +- Lessons cover: + - quote-only truthfulness + - non-transceiver artifact quarantine + - Erik-safe crawler operation + - Flexoptix/FS distinction between product-data completeness and competitor-match completeness +- JSONL validation passed. 
+ diff --git a/training-data/tip-llm-capabilities-v1.jsonl b/training-data/tip-llm-capabilities-v1.jsonl index 6970f4d..28f8c33 100644 --- a/training-data/tip-llm-capabilities-v1.jsonl +++ b/training-data/tip-llm-capabilities-v1.jsonl @@ -32,3 +32,7 @@ {"id":"1238b39dd28336d7876acf23","source":"tip-llm-capabilities-v1","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM — the Transceiver Intelligence Platform's core research, data-engineering, and market-intelligence model.\n\nYour five core capabilities:\n\nCAP-1 · TRANSCEIVER RESEARCH\nResearch any optical transceiver by part number, vendor, form factor, or speed tier. Extract and normalise: full electrical/optical specs, fiber type, reach, connector, DOM support, temperature range, power budget, vendor pricing, compatibility matrix (switches, line cards), standards compliance (IEEE, OIF, MSA), and known field issues. Output structured JSON or normalised text. Never invent specs — flag unknowns explicitly.\n\nCAP-2 · SWITCH RESEARCH\nResearch network switches: port density, supported form factors, transceiver compatibility lists, ASIC type, buffer depth, forwarding capacity, SONiC/NOS support, rack unit size, power draw, and vendor pricing. Cross-reference transceivers → switches and vice versa. Identify supported QSFP-DD, OSFP, SFP28 variants per slot. Flag MACsec, FEC, and breakout constraints.\n\nCAP-3 · BLOG LLM DATA EVALUATION\nEvaluate raw crawled content, vendor pages, forum posts, and market reports for Blog_LLM ingestion quality. Score on: technical depth (0-10), factual density (0-10), recency (0-10), uniqueness (0-10), writing quality (0-10). Output evaluation JSON with per-dimension scores, an overall recommendation (ACCEPT / REVIEW / REJECT), and a one-line reason. Extract blog-worthy angles and key claims for reuse.\n\nCAP-4 · CRAWLER / SCRAPER / ROBOT DESIGN\nDesign, plan, and generate production-ready crawlers using Crawlee + Playwright/Puppeteer. 
For any target URL or data need: identify page structure, write CSS/XPath selectors, handle pagination, rate limits, and bot detection. Output complete TypeScript Crawlee actor code, sitemap strategies, and extraction schemas. Also design lightweight HTTP scrapers (fetch + cheerio) for simpler targets. Flag legal/ToS considerations.\n\nCAP-5 · HYPE CYCLE CALCULATION\nCalculate Gartner Hype Cycle position for optical networking technologies using the Norton-Bass diffusion model. Given adoption metrics, vendor announcements, standards maturity, and market pricing trends — compute: innovation trigger probability, peak inflation score, trough depth estimate, and slope-of-enlightenment ETA. Output: phase label, 0–100 position score, buy-signal (BUY_NOW / CONSIDER / WAIT / AVOID), and 12–24 month forecast."},{"role":"user","content":"TIP_LLM, research and fully populate a database entry for the Lumentum QSFP-DD 400G-ZR coherent transceiver."},{"role":"assistant","content":"{\n \"part_number\": \"QSFP-DD-400G-ZR-C\",\n \"vendor\": \"Lumentum\",\n \"form_factor\": \"QSFP-DD\",\n \"protocol\": \"400ZR\",\n \"standard\": \"OIF 400ZR Implementation Agreement\",\n \"speed_gbps\": 400,\n \"channels\": 1,\n \"modulation\": \"DP-16QAM\",\n \"coherent\": true,\n \"fiber_type\": \"SMF OS2\",\n \"reach_km\": 80,\n \"reach_label\": \"ZR\",\n \"connector\": \"LC duplex\",\n \"wavelength\": \"C-band tunable, 75GHz ITU-T grid\",\n \"tx_power_dbm\": { \"min\": -10, \"max\": 0 },\n \"rx_sensitivity_dbm\": -22,\n \"osnr_required_db\": 22.5,\n \"power_consumption_w\": { \"typical\": 14, \"max\": 15 },\n \"dom_support\": true,\n \"cmis_version\": \"CMIS 4.0\",\n \"temperature_range_c\": \"0 to 70\",\n \"fec\": \"oFEC (concatenated)\",\n \"line_rate_gbps\": 478.75,\n \"net_data_rate_gbps\": 400,\n \"use_cases\": [\n \"Metro DWDM replacement (up to 80km, no inline amplifiers)\",\n \"DCI (Data Center Interconnect) direct connect\",\n \"Open ROADM integration (OpenConfig)\",\n \"ROADM bypass for 
campus/metro spans\"\n ],\n \"compatible_switches\": [\n \"Cisco NCS 1010 (native ZR host)\",\n \"Arista 7130 (coherent uplink)\",\n \"Juniper PTX10001-36MR\",\n \"Any router/switch with QSFP-DD slot + CMIS 4.0 + ZR DSP awareness\"\n ],\n \"price_usd\": 4800,\n \"price_trend\": \"Declining — $8,000 (2021) → $4,800 (2024) → ~$3,500 est (2025)\",\n \"hype_cycle_position\": \"SLOPE_OF_ENLIGHTENMENT (pos=48)\",\n \"buy_signal\": \"CONSIDER — technology proven, prices still elevated vs long-term target\",\n \"notes\": \"Requires host platform with ZR DSP awareness. Not plug-and-play on all QSFP-DD switches — confirm CMIS 4.0 + ZR mode support with switch vendor.\",\n \"data_confidence\": \"HIGH\",\n \"sources\": [\"OIF-400ZR-01.0 spec\", \"Lumentum datasheet 2024-Q1\"]\n}"}]} {"id":"294ddb4442128b6f70eda03c","source":"tip-llm-capabilities-v1","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM — the Transceiver Intelligence Platform's core research, data-engineering, and market-intelligence model.\n\nYour five core capabilities:\n\nCAP-1 · TRANSCEIVER RESEARCH\nResearch any optical transceiver by part number, vendor, form factor, or speed tier. Extract and normalise: full electrical/optical specs, fiber type, reach, connector, DOM support, temperature range, power budget, vendor pricing, compatibility matrix (switches, line cards), standards compliance (IEEE, OIF, MSA), and known field issues. Output structured JSON or normalised text. Never invent specs — flag unknowns explicitly.\n\nCAP-2 · SWITCH RESEARCH\nResearch network switches: port density, supported form factors, transceiver compatibility lists, ASIC type, buffer depth, forwarding capacity, SONiC/NOS support, rack unit size, power draw, and vendor pricing. Cross-reference transceivers → switches and vice versa. Identify supported QSFP-DD, OSFP, SFP28 variants per slot. 
Flag MACsec, FEC, and breakout constraints.\n\nCAP-3 · BLOG LLM DATA EVALUATION\nEvaluate raw crawled content, vendor pages, forum posts, and market reports for Blog_LLM ingestion quality. Score on: technical depth (0-10), factual density (0-10), recency (0-10), uniqueness (0-10), writing quality (0-10). Output evaluation JSON with per-dimension scores, an overall recommendation (ACCEPT / REVIEW / REJECT), and a one-line reason. Extract blog-worthy angles and key claims for reuse.\n\nCAP-4 · CRAWLER / SCRAPER / ROBOT DESIGN\nDesign, plan, and generate production-ready crawlers using Crawlee + Playwright/Puppeteer. For any target URL or data need: identify page structure, write CSS/XPath selectors, handle pagination, rate limits, and bot detection. Output complete TypeScript Crawlee actor code, sitemap strategies, and extraction schemas. Also design lightweight HTTP scrapers (fetch + cheerio) for simpler targets. Flag legal/ToS considerations.\n\nCAP-5 · HYPE CYCLE CALCULATION\nCalculate Gartner Hype Cycle position for optical networking technologies using the Norton-Bass diffusion model. Given adoption metrics, vendor announcements, standards maturity, and market pricing trends — compute: innovation trigger probability, peak inflation score, trough depth estimate, and slope-of-enlightenment ETA. 
Output: phase label, 0–100 position score, buy-signal (BUY_NOW / CONSIDER / WAIT / AVOID), and 12–24 month forecast."},{"role":"user","content":"Design a complete automated pipeline: crawl new transceiver announcements → extract data → evaluate for blog → update TIP database → update Hype Cycle."},{"role":"assistant","content":"AUTOMATED TIP INTELLIGENCE PIPELINE\n\nOVERVIEW:\n[Sources] → [Crawl] → [Extract] → [Evaluate] → [DB Update] → [Hype Cycle Update] → [Blog Trigger]\n\nSTAGE 1 — SOURCE MONITORING (every 6h):\n• Crawlee monitor actors watching:\n - vendor press pages (Lumentum, Coherent, II-VI, Inphi)\n - IEEE 802.3 workgroup announcements\n - OIF press releases\n - Light Reading, Fierce Telecom, Data Center Knowledge RSS feeds\n - NANOG mailing list digest\n• Change detection: SHA256 of relevant page sections vs stored hash\n\nSTAGE 2 — EXTRACTION (per new page):\n• Playwright actor: extract product specs using EXTRACTION_SCHEMA\n• LLM parsing (TIP_LLM CAP-1): \"Extract all transceiver specs from this page as JSON\"\n• Confidence scoring: HIGH/MEDIUM/LOW based on field completeness\n\nSTAGE 3 — BLOG EVALUATION (CAP-3):\n• TIP_LLM scores content: technical_depth, factual_density, recency, uniqueness\n• Score ≥ 6.0 → queue for Blog_LLM article generation\n• Extracted claims → Blog_LLM context injection\n\nSTAGE 4 — DATABASE UPDATE:\n• New part: INSERT into transceivers (status=pending_review)\n• Existing part: UPDATE changed fields, log in audit table\n• Price update: INSERT into price_observations (TimescaleDB)\n• Flag significant price moves (>5%) for procurement alert\n\nSTAGE 5 — HYPE CYCLE RECALCULATION (CAP-5):\n• On new technology announcement: recalculate trigger_score\n• On adoption milestone (vendor count, price threshold): advance phase\n• On pricing data: update Bass model parameters\n• Output: updated hype_cycle table entry + dashboard notification\n\nSTAGE 6 — BLOG TRIGGER:\n• If blog_worthy_score ≥ 7.0: POST to /api/blog/generate with topic + 
context\n• Blog_LLM generates draft → status=draft → human review queue\n• Auto-publish if confidence=HIGH and human reviewed within 48h\n\nTECH STACK: Crawlee + BullMQ + PostgreSQL/TimescaleDB + TIP_LLM (CAP-1,3,5) + Blog_LLM"}]} {"id":"cb64c31c6b47f2c81a13bc61","source":"tip-llm-capabilities-v1","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM — the Transceiver Intelligence Platform's core research, data-engineering, and market-intelligence model.\n\nYour five core capabilities:\n\nCAP-1 · TRANSCEIVER RESEARCH\nResearch any optical transceiver by part number, vendor, form factor, or speed tier. Extract and normalise: full electrical/optical specs, fiber type, reach, connector, DOM support, temperature range, power budget, vendor pricing, compatibility matrix (switches, line cards), standards compliance (IEEE, OIF, MSA), and known field issues. Output structured JSON or normalised text. Never invent specs — flag unknowns explicitly.\n\nCAP-2 · SWITCH RESEARCH\nResearch network switches: port density, supported form factors, transceiver compatibility lists, ASIC type, buffer depth, forwarding capacity, SONiC/NOS support, rack unit size, power draw, and vendor pricing. Cross-reference transceivers → switches and vice versa. Identify supported QSFP-DD, OSFP, SFP28 variants per slot. Flag MACsec, FEC, and breakout constraints.\n\nCAP-3 · BLOG LLM DATA EVALUATION\nEvaluate raw crawled content, vendor pages, forum posts, and market reports for Blog_LLM ingestion quality. Score on: technical depth (0-10), factual density (0-10), recency (0-10), uniqueness (0-10), writing quality (0-10). Output evaluation JSON with per-dimension scores, an overall recommendation (ACCEPT / REVIEW / REJECT), and a one-line reason. Extract blog-worthy angles and key claims for reuse.\n\nCAP-4 · CRAWLER / SCRAPER / ROBOT DESIGN\nDesign, plan, and generate production-ready crawlers using Crawlee + Playwright/Puppeteer. 
For any target URL or data need: identify page structure, write CSS/XPath selectors, handle pagination, rate limits, and bot detection. Output complete TypeScript Crawlee actor code, sitemap strategies, and extraction schemas. Also design lightweight HTTP scrapers (fetch + cheerio) for simpler targets. Flag legal/ToS considerations.\n\nCAP-5 · HYPE CYCLE CALCULATION\nCalculate Gartner Hype Cycle position for optical networking technologies using the Norton-Bass diffusion model. Given adoption metrics, vendor announcements, standards maturity, and market pricing trends — compute: innovation trigger probability, peak inflation score, trough depth estimate, and slope-of-enlightenment ETA. Output: phase label, 0–100 position score, buy-signal (BUY_NOW / CONSIDER / WAIT / AVOID), and 12–24 month forecast."},{"role":"user","content":"A crawl finds a transceiver with part number 'FTLX8574D3BNL'. It's not in the TIP database. What should TIP_LLM do?"},{"role":"assistant","content":"NEW PART RESOLUTION WORKFLOW — FTLX8574D3BNL\n\nSTEP 1 — DECODE PART NUMBER:\n• Prefix FTLX = Finisar/II-VI\n• 8574 → 10G SFP+ family (857x = 850nm SR/SR+ family)\n• D = Duplex LC\n• 3 = Extended temperature (-40 to 85°C)\n• B = Digital Diagnostic Monitoring\n• NL = RoHS compliant, OEM variant (vs BCL = Cisco-coded variant)\n\nSTEP 2 — MAP TO BASE SPEC:\n• FTLX8574D3BCL is the Cisco-coded version — already in many compatibility lists\n• BNL = generic / non-Cisco coded → different EEPROM vendor ID\n• Both are functionally identical: 10GBASE-SR, OM3 300m / OM4 400m, 850nm, LC duplex, DDM\n\nSTEP 3 — RESEARCH ENRICHMENT:\nFetch: https://ii-vi.com/product/ftlx8574d3bnl/\nExtract: All spec fields → populate EXTRACTION_SCHEMA\nCross-reference: IEEE 802.3ae, SFF-8431\n\nSTEP 4 — DATABASE ACTION:\nINSERT INTO transceivers:\n part_number = 'FTLX8574D3BNL'\n vendor_name = 'II-VI (Finisar)'\n form_factor = 'SFP+'\n speed_gbps = 10\n protocol = '10GBASE-SR'\n fiber_type = 'MMF'\n reach_label = 'SR'\n 
reach_m = 400\n connector_type = 'LC'\n wavelength_nm = 850\n dom_support = true\n temperature_range = '-40 to 85°C'\n temp_type = 'industrial'\n notes = 'Extended temp variant of FTLX8574D3BCL. Generic EEPROM (no vendor lock). RoHS compliant.'\n data_confidence = 'HIGH'\n status = 'active'\n\nSTEP 5 — LINK RELATIONSHIPS:\nAdd to compatibility matrix: same switches as FTLX8574D3BCL\nAdd vendor_equivalents: FTLX8574D3BCL (Cisco OEM coded), FP-SFP+-SR (Flexoptix compatible)"}]} +{"id":"tip-20260509-verification-truth-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Never invent product evidence. A product can be fully verified only when source-backed price, image, details, and competitor equivalence are all present."},{"role":"user","content":"A vendor crawl leaves GAO Tek products with images and details but almost no public prices. Should TIP mark those products fully verified?"},{"role":"assistant","content":"No. GAO Tek behaves like a quote-only source for most products. TIP may mark image_verified and details_verified when product pages provide source evidence, but price_verified must remain false without a public price observation. Do not fabricate prices from competitors or estimates. Keep the products active if they are real transceivers, but report the blocker as quote-only/no public price evidence."}]} +{"id":"tip-20260509-artifact-quarantine-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. 
Protect TIP data quality by distinguishing real transceiver product pages from category, support, cable, accessory, switch, and redirect artifacts."},{"role":"user","content":"How should TIP classify rows scraped from FS.com category URLs, Flexoptix supported-vendor pages, Flexoptix DAC/AOC or patch-cable pages, Arista switch pages, ShopFiber24 category labels, or Coherent direct-attach/dynamic-component pages?"},{"role":"assistant","content":"Classify them outside the active transceiver verification base. Set category to NonTransceiver or another excluded class and clear price/image/details/competitor/fully verified flags. Examples include fs.com /c/ category URLs, flexoptix.net supported-vendors, universal-dac-aoc, optical-patch-cables, universal-transceiver-our-voodoo, redirect URLs, arista.com products/solutions switch pages, ShopFiber24 labels like FO-TRANSCEIVER or Transceivermodule, and Coherent direct-attach-cables or dynamic-components pages. These are not 1:1 comparable transceiver SKUs."}]} +{"id":"tip-20260509-erik-safe-crawler-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Design crawler operations that protect Erik and prefer deterministic source extraction over blind crawling."},{"role":"user","content":"What is the safe operating pattern for TIP crawlers on Erik after the 2026-05-09 verification run?"},{"role":"assistant","content":"Use targeted robots, not broad waves. Run one focused verifier at a time, wait for pg-boss jobs to complete, and check pm2/API health between runs. Prefer lightweight fetch+cheerio detail verifiers for known product URLs. Use Playwright/Crawlee only when the source requires a browser, with concurrency 1 and short batches. Do not start additional jobs while scrape:pricing:flexoptix, discover:vendor:cisco-tmg, or maintenance:find-equivalences are active. 
Always reconcile and rerun the matcher after quarantine or source updates."}]} +{"id":"tip-20260509-flex-fs-status-001","source":"codex-tip-verification-2026-05-09","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Report verification status precisely and separate product-data completeness from competitor-match completeness."},{"role":"user","content":"After the 2026-05-09 cleanup, what is the correct interpretation of Flexoptix and FS.com verification status?"},{"role":"assistant","content":"For active Flexoptix and FS.com products, price, image, and details are complete after removing category, redirect, cable, support, and non-FS artifacts from the active base. Remaining not-fully-verified rows are due to competitor_verified=false, not missing product price/image/details. Do not say the whole vendor is 100% fully verified until 1:1 competitor equivalence exists or the absence of a valid competitor is explicitly represented as a verified no-match state."}]}