diff --git a/packages/scraper/package.json b/packages/scraper/package.json index c0904c5..85c7648 100644 --- a/packages/scraper/package.json +++ b/packages/scraper/package.json @@ -21,6 +21,7 @@ "verify:quarantine:non-transceivers": "tsx src/utils/quarantine-non-transceivers.ts", "verify:product-page-assets": "tsx src/utils/verify-product-page-assets.ts", "verify:product-page-prices": "tsx src/utils/verify-product-page-prices.ts", + "verify:10gtek:datasheets": "tsx src/utils/verify-10gtek-datasheets.ts", "verify:part-number-details": "tsx src/utils/verify-part-number-details.ts", "verify:normalize:product-urls": "tsx src/utils/normalize-product-urls.ts", "verify:fs:sku-aliases": "tsx src/utils/quarantine-fs-sku-aliases.ts", diff --git a/packages/scraper/src/utils/quarantine-non-transceivers.ts b/packages/scraper/src/utils/quarantine-non-transceivers.ts index dc551cd..8f8c599 100644 --- a/packages/scraper/src/utils/quarantine-non-transceivers.ts +++ b/packages/scraper/src/utils/quarantine-non-transceivers.ts @@ -38,8 +38,10 @@ async function quarantine(): Promise { ) ) OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)' + OR t.part_number ~* '(AOC|DAC|DACBL|XDAC|DIRECT[ -]?ATTACH|TWINAX|BREAKOUT)' + OR COALESCE(t.product_page_url, '') ~* '(aoc|dac|direct-attach|active-optical|twinax|breakout)' OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/') - OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com')) + OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com' OR COALESCE(t.product_page_url, '') ~* 'fs\\.com/c/')) OR ( v.name = 'GBICS' AND ( @@ -91,8 +93,10 @@ async function quarantine(): Promise 
// -----------------------------------------------------------------------------
// Patch residue (preserved verbatim, not part of this module).
// The source lines replaced here also carried the tail of the second
// quarantine-non-transceivers.ts hunk and this file's own patch header.
// -----------------------------------------------------------------------------
// { ) ) OR COALESCE(t.product_page_url, '') ~* '(/c/[a-z0-9-]+-[0-9]+|supported-vendors|universal-dac-aoc|optical-patch-cables|universal-transceiver-our-voodoo|flexoptix\\.net/(en/)?transceiver/?$|direct-attach-cables|dynamic-components|arista\\.com/en/(products|solutions)/)'
// + OR t.part_number ~* '(AOC|DAC|DACBL|XDAC|DIRECT[ -]?ATTACH|TWINAX|BREAKOUT)'
// + OR COALESCE(t.product_page_url, '') ~* '(aoc|dac|direct-attach|active-optical|twinax|breakout)'
//   OR (v.name = 'Flexoptix' AND COALESCE(t.product_page_url, '') ~* '/stores/store/redirect/')
// - OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com'))
// + OR (v.name = 'FS.COM' AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.product_page_url, '') !~* 'fs\\.com' OR COALESCE(t.product_page_url, '') ~* 'fs\\.com/c/'))
//   OR ( v.name = 'GBICS' AND (
// diff --git a/packages/scraper/src/utils/verify-10gtek-datasheets.ts b/packages/scraper/src/utils/verify-10gtek-datasheets.ts new file mode 100644 index 0000000..87b8fda --- /dev/null +++ b/packages/scraper/src/utils/verify-10gtek-datasheets.ts @@ -0,0 +1,169 @@
// -----------------------------------------------------------------------------

import { pool, markDetailsVerified, recordVerificationEvidence } from "./db";

/** Page on the 10Gtek site whose HTML is scanned for PDF datasheet links. */
const TEN_GTEK_TRANSCEIVERS_URL = "https://www.10gtek.com/transceivers";

/** Request headers sent when fetching the datasheet index page. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml",
};

/** Row limit used when TEN_GTEK_DATASHEET_LIMIT is unset or empty. */
const DEFAULT_LIMIT = 500;

/** Normalized keys shorter than this are too ambiguous to match on. */
const MIN_KEY_LENGTH = 4;

/** One row returned by the candidate query in {@link main}. */
type Candidate = {
  id: string;
  partNumber: string;
  productUrl: string | null;
  formFactor: string | null;
  speedGbps: string | number | null;
  reachLabel: string | null;
  fiberType: string | null;
};

/** A PDF link found on the index page, with its normalized match key. */
type PdfLink = { url: string; key: string; label: string };

/**
 * Reduces a part number or file name to an upper-case alphanumeric key.
 *
 * Literal `%20` sequences become spaces, surrounding whitespace is dropped,
 * a trailing `.pdf` / `.htm` / `.html` extension is removed, and every
 * remaining non-alphanumeric character is stripped.
 *
 * @param value Raw part number or file name.
 * @returns The comparison key (may be empty).
 */
function normalize(value: string): string {
  return value
    .replace(/%20/g, " ")
    // Trim BEFORE stripping the extension; otherwise "X.pdf " keeps "PDF" in the key.
    .trim()
    .toUpperCase()
    .replace(/\.(PDF|HTML?)$/i, "")
    .replace(/[^A-Z0-9]+/g, "");
}

/**
 * Percent-decodes a string, returning it unchanged when it contains a
 * malformed escape sequence (`decodeURIComponent` throws in that case).
 */
function decodeUrl(url: string): string {
  try {
    return decodeURIComponent(url);
  } catch {
    return url;
  }
}

/**
 * Resolves an href from the index page to an absolute, serialized URL.
 *
 * Every href goes through the WHATWG URL parser with the index page as base,
 * so absolute, protocol-relative, root-relative and `../`-relative links are
 * all handled and all come back in the same encoded form.
 *
 * @param href Link target as written in the page.
 * @returns Absolute URL string.
 * @throws TypeError if the href cannot be parsed as a URL.
 */
function absoluteUrl(href: string): string {
  return new URL(href, TEN_GTEK_TRANSCEIVERS_URL).toString();
}

/**
 * Extracts the distinct PDF links from the index page HTML.
 *
 * @param html Raw HTML of the index page.
 * @returns One entry per distinct absolute URL, in document order. `label` is
 *   the decoded file name without `.pdf`; `key` is `normalize(label)`.
 */
function extractPdfUrls(html: string): PdfLink[] {
  const links: PdfLink[] = [];
  const seen = new Set<string>();
  // Also accepts a query string or fragment after ".pdf" (e.g. "x.pdf?v=2").
  const hrefPattern = /href\s*=\s*["']([^"']+?\.pdf(?:[?#][^"']*)?)["']/gi;

  for (const match of html.matchAll(hrefPattern)) {
    const rawHref = (match[1] ?? "").replace(/&amp;/gi, "&").trim();

    // Resolve the href as written. Decoding first would turn %23 / %3F into
    // a fragment / query delimiter and change which resource the URL names.
    let url: string;
    try {
      url = absoluteUrl(rawHref);
    } catch {
      continue; // One malformed link must not abort the whole run.
    }
    if (seen.has(url)) continue;
    seen.add(url);

    const lastSegment = new URL(url).pathname.split("/").pop() ?? "";
    // If ".pdf" only appears in the query, fall back to the tail of the full URL.
    const fileName = /\.pdf$/i.test(lastSegment) ? lastSegment : url.split("/").pop() || url;
    const label = decodeUrl(fileName).replace(/\.pdf$/i, "").trim();
    links.push({ url, key: normalize(label), label });
  }
  return links;
}

/**
 * Picks the datasheet whose key best matches a part number.
 *
 * An exact key match always wins. Otherwise a datasheet qualifies when one
 * key contains the other, and the qualifying datasheet whose key length is
 * closest to the part number's is chosen. Ties go to the earliest link, so
 * the result does not depend on anything but the inputs.
 *
 * @param partNumber Part number to look up.
 * @param pdfs Links produced by {@link extractPdfUrls}.
 * @returns The best match, or `undefined` when the part number's key is
 *   shorter than {@link MIN_KEY_LENGTH} or nothing qualifies.
 */
function bestMatch(partNumber: string, pdfs: readonly PdfLink[]): PdfLink | undefined {
  const key = normalize(partNumber);
  if (key.length < MIN_KEY_LENGTH) return undefined;

  let closest: PdfLink | undefined;
  let closestDelta = Number.POSITIVE_INFINITY;
  for (const pdf of pdfs) {
    if (pdf.key === key) return pdf;
    if (pdf.key.length < MIN_KEY_LENGTH) continue;
    if (!pdf.key.includes(key) && !key.includes(pdf.key)) continue;

    const delta = Math.abs(pdf.key.length - key.length);
    if (delta < closestDelta) {
      closest = pdf;
      closestDelta = delta;
    }
  }
  return closest;
}

/**
 * Parses the TEN_GTEK_DATASHEET_LIMIT value.
 *
 * @param raw Environment value, possibly undefined.
 * @returns {@link DEFAULT_LIMIT} when unset or blank, otherwise the integer.
 * @throws Error when the value is not a non-negative integer, so a typo fails
 *   here instead of reaching the database as `NaN`.
 */
function parseLimit(raw: string | undefined): number {
  if (raw === undefined || raw.trim() === "") return DEFAULT_LIMIT;
  const value = Number(raw);
  if (!Number.isInteger(value) || value < 0) {
    throw new Error(`TEN_GTEK_DATASHEET_LIMIT must be a non-negative integer, got "${raw}"`);
  }
  return value;
}

/**
 * Fetches the 10Gtek index page, matches its PDF links against 10Gtek rows
 * that lack a product URL or verified details, and logs every match.
 *
 * Nothing is written unless TEN_GTEK_DATASHEET_APPLY is "1". In apply mode
 * each matched row gets the PDF URL written into `product_page_url` and
 * `details_source_url` (only where those are empty), is passed to
 * `markDetailsVerified` when form factor, speed, reach and fiber type are all
 * present, and gets an evidence record via `recordVerificationEvidence`.
 *
 * @throws Error when the index page responds with a non-2xx status or the
 *   limit is invalid. Database and network errors propagate unchanged.
 */
async function main(): Promise<void> {
  const limit = parseLimit(process.env["TEN_GTEK_DATASHEET_LIMIT"]);
  const apply = process.env["TEN_GTEK_DATASHEET_APPLY"] === "1";

  const resp = await fetch(TEN_GTEK_TRANSCEIVERS_URL, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${TEN_GTEK_TRANSCEIVERS_URL}`);

  const pdfs = extractPdfUrls(await resp.text());
  // NOTE(review): assumes `pool.query` accepts a row type argument — confirm against ./db.
  const candidates = await pool.query<Candidate>(
    `SELECT t.id,
            t.part_number AS "partNumber",
            t.product_page_url AS "productUrl",
            t.form_factor AS "formFactor",
            t.speed_gbps AS "speedGbps",
            t.reach_label AS "reachLabel",
            t.fiber_type AS "fiberType"
     FROM transceivers t
     JOIN vendors v ON v.id = t.vendor_id
     WHERE v.name = '10Gtek'
       AND COALESCE(t.category, '') != 'NonTransceiver'
       AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.details_verified, false) = false)
     ORDER BY t.part_number
     LIMIT $1`,
    [limit],
  );

  let matched = 0;
  let urls = 0;
  let details = 0;
  let skipped = 0;

  console.log("=== 10Gtek datasheet verifier ===", {
    pdfs: pdfs.length,
    candidates: candidates.rowCount ?? 0,
    limit,
    apply,
  });

  for (const row of candidates.rows) {
    const pdf = bestMatch(row.partNumber, pdfs);
    if (!pdf) {
      skipped++;
      continue;
    }
    matched++;

    // Details are only marked when every spec field is already populated.
    const canMarkDetails = Boolean(row.formFactor && Number(row.speedGbps || 0) > 0 && row.reachLabel && row.fiberType);
    console.log("datasheet match", {
      partNumber: row.partNumber,
      pdf: pdf.label,
      url: pdf.url,
      canMarkDetails,
      apply,
    });

    if (!apply) continue;

    // COALESCE(NULLIF(col, ''), $2) keeps any non-empty value already stored.
    const updated = await pool.query(
      `UPDATE transceivers
       SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
           details_source_url = COALESCE(NULLIF(details_source_url, ''), $2),
           updated_at = NOW()
       WHERE id = $1
       RETURNING id`,
      [row.id, pdf.url],
    );
    // Counts rows touched by the UPDATE, even where the existing URL was kept.
    if ((updated.rowCount ?? 0) > 0) urls++;

    if (canMarkDetails && await markDetailsVerified({ transceiverId: row.id, sourceUrl: pdf.url })) {
      details++;
    }

    await recordVerificationEvidence({
      transceiverId: row.id,
      verificationType: "details",
      sourceUrl: pdf.url,
      evidenceValue: {
        partNumber: row.partNumber,
        datasheet: pdf.label,
        matchedBy: "10gtek_transceivers_pdf_index",
        canMarkDetails,
      },
      robotName: "verify:10gtek-datasheets",
      confidence: canMarkDetails ? 0.95 : 0.8,
    });
  }

  console.log("10Gtek datasheet verifier complete", { matched, urls, details, skipped, apply });
}

if (require.main === module) {
  void main()
    .catch((err: unknown) => {
      console.error("Fatal:", err);
      // Set the exit code instead of calling process.exit() so the pool can close first.
      process.exitCode = 1;
    })
    .finally(() => pool.end())
    .catch((err: unknown) => {
      console.error("Fatal: could not close the database pool:", err);
      process.exitCode = 1;
    });
}

// -----------------------------------------------------------------------------
// Patch residue (preserved verbatim, not part of this module).
// Start of the sync/CURRENT.md diff; its last line continues on the next
// source line.
// -----------------------------------------------------------------------------
// diff --git a/sync/CURRENT.md b/sync/CURRENT.md index 6f10387..ad1be73 100644 --- a/sync/CURRENT.md +++ b/sync/CURRENT.md @@ -1,9 +1,35 @@
//   # Current TIP Sync State
// -Updated: 2026-05-10 02:58 UTC
// +Updated: 2026-05-10 07:38 UTC
//   ## Newest Work
// +- TIP active-base cleanup continuation on 2026-05-10 UTC:
// + - fixed FS.com category leakage:
// + - new FS.com `/c/` category/landing rows quarantined
// + - price status returned to `needs_research=0`
// + - removed DAC/AOC/Breakout/Twinax/direct-attach cable rows from active transceiver verification base:
// + - first pass quarantined `879`
// + - second embedded-SKU pass quarantined `57`
// + - remaining active DAC/AOC-like rows: `0`
// + - added `verify:10gtek:datasheets`
// + - fetches `https://www.10gtek.com/transceivers`
// + - extracts official PDF datasheet URLs
// + - matches existing 10Gtek part numbers deterministically
// + - live apply matched `9` 10Gtek rows, wrote `9` datasheet URLs and `9` details evidence records
// + - rebuilt scraper package on Erik and restarted `tip-scraper-daemon` after confirming pg-boss queue was empty
// + - live health after cleanup:
// + - active products: `16245`
// + - price verified: `10851`
// + - price status: `public_price=10851`, `no_public_price=5394`, `needs_research=0`, `ambiguous=0`
// + - image verified: `11606`
// + - details verified: `16004`
// + - fully verified: `10599`
// + - competitor status: `matched=10838`, `ambiguous=5`, `needs_research=5402`
// + - interpretation:
// + - TIP active base is now much cleaner: cable/breakout products are no longer counted as transceiver-module verification debt
// + - remaining largest gaps are real OEM/catalog image availability and competitor state, not public price research
// +
//   - MAGATAMA all-lane RunPod training
completion on 2026-05-10: - RunPod training/adoption is now verified end-to-end for all five active MAGATAMA LLM lanes: - `magatamallm`: active `magatama-coder:latest`, model version `magatama-coder-r2`, dataset `1375 train / 153 eval / 1528 total` diff --git a/sync/history/2026-05-10-tip-active-base-cable-cleanup.md b/sync/history/2026-05-10-tip-active-base-cable-cleanup.md new file mode 100644 index 0000000..633d373 --- /dev/null +++ b/sync/history/2026-05-10-tip-active-base-cable-cleanup.md @@ -0,0 +1,36 @@ +# TIP Active Base Cable Cleanup + +Date: 2026-05-10 07:38 UTC + +## Scope + +Continued TIP verification after the price queue was closed. Focus: remove DAC/AOC/Breakout/Twinax/direct-attach cable products from the active transceiver-module verification base and add a 10Gtek datasheet source verifier. + +## Live Work + +- FS.com `/c/` category/landing rows were quarantined; price `needs_research` returned to `0`. +- DAC/AOC/Breakout/Twinax/direct-attach quarantine: + - first pass quarantined `879` + - embedded-SKU pass quarantined `57` + - remaining active DAC/AOC-like rows: `0` +- Added `verify:10gtek:datasheets`. + - source: `https://www.10gtek.com/transceivers` + - extracted `58` official PDF links + - matched `9` existing 10Gtek rows + - wrote `9` datasheet URLs and `9` details evidence records +- Scraper package built on Erik. +- `tip-scraper-daemon` restarted after pg-boss queue was empty. + +## Health Snapshot + +- active products: `16245` +- price verified: `10851` +- price status: `public_price=10851`, `no_public_price=5394`, `needs_research=0`, `ambiguous=0` +- image verified: `11606` +- details verified: `16004` +- fully verified: `10599` +- competitor status: `matched=10838`, `ambiguous=5`, `needs_research=5402` + +## Interpretation + +The active product base is cleaner and no longer counts cable/breakout rows as transceiver-module verification debt. 
Remaining gaps are dominated by OEM/catalog image availability and competitor state, not public price research. diff --git a/training-data/tip-llm-capabilities-v1.jsonl b/training-data/tip-llm-capabilities-v1.jsonl index 61422e9..ecca2bc 100644 --- a/training-data/tip-llm-capabilities-v1.jsonl +++ b/training-data/tip-llm-capabilities-v1.jsonl @@ -56,3 +56,5 @@ {"id":"tip-20260510-retail-absent-no-public-price-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Close research states with explicit source-discovery evidence instead of endless manual queues."},{"role":"user","content":"10Gtek has old catalog rows with no product URL after a full SFPcables retail crawl. How should TIP classify their price state?"},{"role":"assistant","content":"If a full controlled 10Gtek/SFPcables retail crawl finds the public retail catalog and a row still has no product_page_url or public price source, set `price_status=no_public_price` with reason `not_listed_in_public_sfpcables_retail_catalog_after_full_crawl`. Do not set `price_verified`; no fabricated price exists. Record `price_unavailable` evidence so the dashboard can distinguish resolved no-public-price rows from real open price research."}]} {"id":"tip-20260510-artifact-quarantine-shopfiber-vcelink-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Keep active product counts clean by quarantining deterministic artifacts."},{"role":"user","content":"What remaining ShopFiber24 and Vcelink artifacts were found during price closure?"},{"role":"assistant","content":"ShopFiber24 artifacts include FOCP fiber cable SKUs, FO-TRANSCEIVER/category labels, Kompatible-Transceiver, Transceivermodule, QSFP+-MSA/SFP+-MSA, Marken-Transceiver pages, Zu-den-* navigation labels, and DAC/AOC cable category pages. 
Vcelink rows with numeric-only part numbers and no product_page_url are not real transceiver SKUs. Move them to NonTransceiver, clear verification flags, and write artifact_quarantine evidence."}]} {"id":"tip-20260510-oem-image-detail-probing-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Official OEM product pages may verify details without providing usable product images."},{"role":"user","content":"Cisco and Juniper still have image/detail gaps after price closure. Should TIP invent or reuse generic product images?"},{"role":"assistant","content":"No. Use official product pages and only mark images when a concrete product image URL is extracted. Cisco pages may expose official DOCX/PDF rendition images and can be marked image_verified when the URL is source-backed. Some Cisco pages return HTTP 403. Juniper product pages in sampled batches often expose no useful product image URL; keep image_verified=false rather than using a generic transceiver picture. Treat DAC/AOC/cable-like OEM rows as a separate modeling/quarantine decision, not as optical module image gaps."}]} +{"id":"tip-20260510-cable-breakout-quarantine-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Do not compare cable/breakout products as if they were standalone optical transceiver modules."},{"role":"user","content":"How should TIP handle DAC, AOC, Twinax, direct-attach and breakout rows in the active transceiver verification base?"},{"role":"assistant","content":"Move them out of the active transceiver-module base unless TIP intentionally models a separate cable/AOC product class. Deterministic indicators include part numbers or URLs containing DAC, AOC, DACBL, XDAC, Direct Attach, Twinax, Breakout or active-optical. These rows should not block transceiver-module completion and must not be used for 1:1 optical module competitor matching. 
Quarantine them with artifact_quarantine evidence and clear price/image/details/competitor/fully verified flags."}]} +{"id":"tip-20260510-10gtek-datasheet-verifier-001","source":"codex-tip-verification-2026-05-10","kind":"sft-jsonl","messages":[{"role":"system","content":"You are TIP_LLM. Use official datasheet indexes for source-backed detail evidence when retail pages are absent."},{"role":"user","content":"10Gtek has active rows with no SFPcables retail URL but the main 10gtek.com site exposes PDF datasheets. What should the robot do?"},{"role":"assistant","content":"Fetch the official 10Gtek transceiver page, extract PDF datasheet links, normalize PDF filenames and part numbers, and only match when the normalized filename and part number contain each other or are exact. Write the PDF URL to product_page_url/details_source_url and mark details only if form_factor, speed_gbps, reach_label and fiber_type are already present. Record details evidence with robot_name `verify:10gtek-datasheets`. Do not infer prices from PDFs."}]}