import { pool, markDetailsVerified, recordVerificationEvidence } from "./db";

/** Page whose HTML is scanned for datasheet PDF links. */
const TEN_GTEK_TRANSCEIVERS_URL = "https://www.10gtek.com/transceivers";

/** Request headers sent when fetching the transceivers page. */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml",
};

/** Row limit used when TEN_GTEK_DATASHEET_LIMIT is unset or empty. */
const DEFAULT_LIMIT = 500;

/** Upper bound on how long the fatal-error path waits for the pool to close. */
const POOL_CLOSE_TIMEOUT_MS = 5000;

/**
 * Selects 10Gtek rows that either have no product page URL or are not yet
 * marked details-verified. Segments are joined with single spaces so the
 * resulting statement text is one line.
 */
const CANDIDATE_QUERY = [
  `SELECT t.id, t.part_number AS "partNumber", t.product_page_url AS "productUrl",`,
  `t.form_factor AS "formFactor", t.speed_gbps AS "speedGbps",`,
  `t.reach_label AS "reachLabel", t.fiber_type AS "fiberType"`,
  `FROM transceivers t JOIN vendors v ON v.id = t.vendor_id`,
  `WHERE v.name = '10Gtek' AND COALESCE(t.category, '') != 'NonTransceiver'`,
  `AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.details_verified, false) = false)`,
  `ORDER BY t.part_number LIMIT $1`,
].join(" ");

/**
 * Fills product_page_url / details_source_url with the datasheet URL, but only
 * where the existing column value is NULL or empty.
 */
const APPLY_URL_QUERY = [
  `UPDATE transceivers SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),`,
  `details_source_url = COALESCE(NULLIF(details_source_url, ''), $2),`,
  `updated_at = NOW() WHERE id = $1 RETURNING id`,
].join(" ");

/** Shape of one row returned by CANDIDATE_QUERY. */
type Candidate = {
  id: string;
  partNumber: string;
  productUrl: string | null;
  formFactor: string | null;
  speedGbps: string | number | null;
  reachLabel: string | null;
  fiberType: string | null;
};

/** A PDF link found on the page: absolute URL, normalized match key, display label. */
type PdfLink = { url: string; key: string; label: string };

/**
 * Reduces a part number or file name to an uppercase alphanumeric key so that
 * differences in case, spacing and punctuation do not prevent a match.
 */
function normalize(value: string): string {
  return (
    value
      .toUpperCase()
      // Turn an encoded space into a real one first, so the "20" digits are
      // not kept as part of the key when non-alphanumerics are stripped.
      .replace(/%20/g, " ")
      .replace(/\.(PDF|HTML?)$/i, "")
      .replace(/[^A-Z0-9]+/g, "")
  );
}

/** Percent-decodes `url`; returns it unchanged if decoding throws. */
function decodeUrl(url: string): string {
  try {
    return decodeURIComponent(url);
  } catch {
    return url;
  }
}

/**
 * Resolves an href against the transceivers page URL.
 *
 * Hrefs that already carry an http(s) scheme are returned as-is. A single
 * leading "../" is rewritten to "/" before resolution, so such links resolve
 * from the site root.
 */
function absoluteUrl(href: string): string {
  // Require the full scheme: a bare "http" prefix would also match relative
  // paths such as "httpdocs/file.pdf" and return them unresolved.
  if (/^https?:\/\//i.test(href)) return href;
  return new URL(href.replace(/^\.\.\//, "/"), TEN_GTEK_TRANSCEIVERS_URL).toString();
}

/**
 * Collects every distinct `href="....pdf"` link in `html`, in document order.
 *
 * @param html Raw HTML of the index page.
 * @returns One entry per unique absolute URL; `label` is the decoded file name
 *   without its ".pdf" suffix and `key` is the normalized label.
 */
function extractPdfUrls(html: string): Array<{ url: string; key: string; label: string }> {
  const links: PdfLink[] = [];
  const seen = new Set<string>();
  const hrefPattern = /href=["']([^"']+\.pdf)["']/gi;
  let match: RegExpExecArray | null;
  while ((match = hrefPattern.exec(html)) !== null) {
    const url = absoluteUrl(decodeUrl(match[1]).trim());
    if (seen.has(url)) continue;
    seen.add(url);
    const fileName = url.split("/").pop() || url;
    const label = decodeUrl(fileName).replace(/\.pdf$/i, "").trim();
    links.push({ url, key: normalize(label), label });
  }
  return links;
}

/**
 * Picks the datasheet for a part number.
 *
 * An exact normalized-key match wins. Otherwise the first PDF (in index order)
 * whose key contains the part key, or is contained in it, is returned. Keys
 * shorter than four characters are never matched, on either side.
 *
 * @returns The matching link, or `undefined` when nothing matches.
 */
function bestMatch(partNumber: string, pdfs: Array<{ url: string; key: string; label: string }>) {
  const key = normalize(partNumber);
  if (key.length < 4) return undefined;

  const exact = pdfs.find((pdf) => pdf.key === key);
  if (exact) return exact;

  return pdfs.find((pdf) => {
    if (pdf.key.length < 4) return false;
    return pdf.key.includes(key) || key.includes(pdf.key);
  });
}

/**
 * Reads the candidate row limit from TEN_GTEK_DATASHEET_LIMIT.
 *
 * @throws Error when the value does not parse to a non-negative integer, so a
 *   bad setting is reported before any network or database work starts.
 */
function readLimit(): number {
  const raw = process.env["TEN_GTEK_DATASHEET_LIMIT"] || String(DEFAULT_LIMIT);
  const limit = parseInt(raw, 10);
  if (!Number.isInteger(limit) || limit < 0) {
    throw new Error(`Invalid TEN_GTEK_DATASHEET_LIMIT: ${JSON.stringify(raw)}`);
  }
  return limit;
}

/**
 * Matches 10Gtek transceiver rows against the PDF datasheets linked from the
 * vendor's transceivers page.
 *
 * Every match is logged. Nothing is written unless TEN_GTEK_DATASHEET_APPLY is
 * "1"; in that case the datasheet URL is stored where the row has none, the row
 * is passed to markDetailsVerified when form factor, a positive speed, reach
 * and fiber type are all present, and verification evidence is recorded.
 *
 * @throws Error on an invalid limit or a non-2xx response from the index page.
 */
async function main(): Promise<void> {
  const limit = readLimit();
  const apply = process.env["TEN_GTEK_DATASHEET_APPLY"] === "1";

  const resp = await fetch(TEN_GTEK_TRANSCEIVERS_URL, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${TEN_GTEK_TRANSCEIVERS_URL}`);
  const pdfs = extractPdfUrls(await resp.text());

  const candidates = await pool.query(CANDIDATE_QUERY, [limit]);
  const rows: Candidate[] = candidates.rows;

  let matched = 0;
  let urls = 0;
  let details = 0;
  let skipped = 0;

  console.log("=== 10Gtek datasheet verifier ===", {
    pdfs: pdfs.length,
    candidates: candidates.rowCount ?? 0,
    limit,
    apply,
  });

  for (const row of rows) {
    const pdf = bestMatch(row.partNumber, pdfs);
    if (!pdf) {
      skipped++;
      continue;
    }
    matched++;

    // Only rows with all four core attributes populated are eligible to be
    // marked details-verified.
    const canMarkDetails = Boolean(
      row.formFactor && Number(row.speedGbps || 0) > 0 && row.reachLabel && row.fiberType,
    );
    console.log("datasheet match", {
      partNumber: row.partNumber,
      pdf: pdf.label,
      url: pdf.url,
      canMarkDetails,
      apply,
    });
    if (!apply) continue;

    const updated = await pool.query(APPLY_URL_QUERY, [row.id, pdf.url]);
    if ((updated.rowCount ?? 0) > 0) urls++;

    // markDetailsVerified's result is treated as "did it mark the row";
    // its exact return contract lives in ./db.
    if (canMarkDetails && (await markDetailsVerified({ transceiverId: row.id, sourceUrl: pdf.url }))) {
      details++;
    }

    await recordVerificationEvidence({
      transceiverId: row.id,
      verificationType: "details",
      sourceUrl: pdf.url,
      evidenceValue: {
        partNumber: row.partNumber,
        datasheet: pdf.label,
        matchedBy: "10gtek_transceivers_pdf_index",
        canMarkDetails,
      },
      robotName: "verify:10gtek-datasheets",
      confidence: canMarkDetails ? 0.95 : 0.8,
    });
  }

  console.log("10Gtek datasheet verifier complete", { matched, urls, details, skipped, apply });
}

if (require.main === module) {
  main()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      try {
        // Give the pool a bounded chance to close before exiting, instead of
        // calling process.exit while the close is still in flight.
        await Promise.race([
          pool.end(),
          new Promise<void>((resolve) => setTimeout(resolve, POOL_CLOSE_TIMEOUT_MS)),
        ]);
      } catch {
        // Best effort: the process is already exiting with a failure code.
      }
      process.exit(1);
    });
}