transceiver-db/packages/scraper/src/utils/verify-10gtek-datasheets.ts
2026-05-10 09:41:59 +02:00

170 lines
4.9 KiB
TypeScript

import { pool, markDetailsVerified, recordVerificationEvidence } from "./db";
/** Index page whose HTML is scanned for links to datasheet PDFs. */
const TEN_GTEK_TRANSCEIVERS_URL = "https://www.10gtek.com/transceivers";

/**
 * Request headers sent when fetching the index page: a desktop Chrome
 * User-Agent string and an Accept value asking for HTML.
 */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml",
};
/**
 * Row shape produced by the candidate SELECT in `main`; each field is a
 * camelCase alias of a `transceivers` column (`part_number`,
 * `product_page_url`, `form_factor`, `speed_gbps`, `reach_label`,
 * `fiber_type`).
 *
 * NOTE(review): kept as a `type` alias rather than an `interface`; the alias
 * is passed as the row type argument to `pool.query`, which presumably needs
 * an implicitly indexable object type — confirm before converting.
 */
type Candidate = {
id: string;
partNumber: string;
// Selected by the query but not read by the code visible in this file.
productUrl: string | null;
formFactor: string | null;
// Coerced with Number() before use; presumably the driver may hand numeric
// columns back as strings — TODO confirm against the column type.
speedGbps: string | number | null;
reachLabel: string | null;
fiberType: string | null;
};
/**
 * Reduces a part number or file name to a comparison key made only of the
 * characters A-Z and 0-9.
 *
 * Steps, in order: upper-case the input, turn literal "%20" sequences into
 * spaces, drop one trailing ".PDF", ".HTM" or ".HTML" extension, then remove
 * every remaining run of non-alphanumeric characters.
 *
 * @param value Raw part number, file name or label.
 * @returns The alphanumeric key; may be empty.
 */
function normalize(value: string): string {
  const upperCased = value.toUpperCase();
  // Must happen before the final strip, otherwise "%20" would leave a stray "20" in the key.
  const withSpaces = upperCased.replace(/%20/g, " ");
  const withoutExtension = withSpaces.replace(/\.(PDF|HTML?)$/i, "");
  return withoutExtension.replace(/[^A-Z0-9]+/g, "");
}
/**
 * Percent-decodes a URL or URL fragment without ever throwing.
 *
 * @param url Possibly percent-encoded text.
 * @returns The decoded text, or the input unchanged when it contains a
 *   malformed escape sequence that `decodeURIComponent` rejects.
 */
function decodeUrl(url: string): string {
  let decoded = url;
  try {
    decoded = decodeURIComponent(url);
  } catch {
    // Malformed escape sequence: fall through with the raw input.
  }
  return decoded;
}
/**
 * Turns an href found on the transceiver index page into an absolute URL.
 *
 * An href that already carries an explicit `http://` or `https://` scheme is
 * returned untouched. Anything else is resolved against
 * `TEN_GTEK_TRANSCEIVERS_URL`, after rewriting one leading "../" to "/" so
 * the path is taken from the site root.
 *
 * @param href Link target as it appears in the page (already trimmed).
 * @returns The absolute URL as a string.
 * @throws TypeError if the `URL` constructor rejects the href.
 */
function absoluteUrl(href: string): string {
  // Require the full scheme separator: a bare "http" prefix check would also
  // accept relative paths such as "httpdocs/sheet.pdf" and return them
  // unresolved.
  if (/^https?:\/\//.test(href)) return href;
  const rootRelative = href.replace(/^\.\.\//, "/");
  return new URL(rootRelative, TEN_GTEK_TRANSCEIVERS_URL).toString();
}
/**
 * Collects every distinct PDF link from a chunk of HTML.
 *
 * Only `href` attributes whose quoted value ends in ".pdf" (any case) are
 * considered. Each href is percent-decoded, trimmed and made absolute; the
 * first occurrence of a given absolute URL wins and later duplicates are
 * dropped, so the result keeps page order.
 *
 * @param html Raw HTML of the index page.
 * @returns One entry per unique URL: the URL itself, a human-readable `label`
 *   (last path segment, decoded, without the ".pdf" suffix) and the
 *   normalized `key` used for matching.
 */
function extractPdfUrls(html: string): Array<{ url: string; key: string; label: string }> {
  // A Map keeps insertion order, giving de-duplication and ordering in one structure.
  const byUrl = new Map<string, { url: string; key: string; label: string }>();
  for (const hit of html.matchAll(/href=["']([^"']+\.pdf)["']/gi)) {
    const url = absoluteUrl(decodeUrl(hit[1]).trim());
    if (byUrl.has(url)) continue;
    const fileName = url.split("/").pop() || url;
    const label = decodeUrl(fileName).replace(/\.pdf$/i, "").trim();
    byUrl.set(url, { url, key: normalize(label), label });
  }
  return [...byUrl.values()];
}
/**
 * Picks the datasheet PDF that best corresponds to a part number.
 *
 * Matching is done on normalized keys. Keys shorter than four characters, on
 * either side, are never matched because they are too ambiguous.
 *
 * Ranking, best first:
 * 1. a PDF whose key equals the part key (first such PDF in list order);
 * 2. a PDF whose key contains the whole part key (file named after the part
 *    plus extra text);
 * 3. a PDF whose key is contained in the part key (file named after a
 *    shorter base part number).
 * Within tiers 2 and 3 the PDF whose key length is closest to the part key
 * wins; remaining ties keep list order. Previously the first substring hit in
 * list order was returned, so the result depended on page order whenever
 * several PDFs were substring-compatible.
 *
 * @param partNumber Part number to look up.
 * @param pdfs Candidate PDFs, as produced by `extractPdfUrls`.
 * @returns The chosen PDF entry, or `undefined` when nothing matches.
 */
function bestMatch(partNumber: string, pdfs: Array<{ url: string; key: string; label: string }>) {
  const key = normalize(partNumber);
  if (key.length < 4) return undefined;

  let best: { url: string; key: string; label: string } | undefined;
  let bestTier = Number.POSITIVE_INFINITY;
  let bestGap = Number.POSITIVE_INFINITY;

  for (const pdf of pdfs) {
    // An exact key match always beats any fuzzy candidate seen so far.
    if (pdf.key === key) return pdf;
    if (pdf.key.length < 4) continue;

    let tier: number;
    if (pdf.key.includes(key)) {
      tier = 0;
    } else if (key.includes(pdf.key)) {
      tier = 1;
    } else {
      continue;
    }

    const gap = Math.abs(pdf.key.length - key.length);
    // Strict comparisons keep the earliest candidate on ties.
    if (tier < bestTier || (tier === bestTier && gap < bestGap)) {
      best = pdf;
      bestTier = tier;
      bestGap = gap;
    }
  }

  return best;
}
/**
 * Matches 10Gtek transceiver rows against the datasheet PDFs linked from the
 * vendor's transceiver index page and, when enabled, writes the results back.
 *
 * Environment:
 * - `TEN_GTEK_DATASHEET_LIMIT`: maximum number of candidate rows to load
 *   (defaults to 500 when unset or empty; must parse to a non-negative
 *   integer).
 * - `TEN_GTEK_DATASHEET_APPLY`: database writes happen only when this is
 *   exactly "1"; any other value is a dry run that only logs the matches.
 *
 * Candidates are 10Gtek rows, excluding category 'NonTransceiver', that have
 * no product page URL or are not yet marked details-verified.
 *
 * @throws Error when the limit is invalid or the index page answers with a
 *   non-OK status. Failures from `fetch` and the database helpers propagate.
 */
async function main(): Promise<void> {
  const rawLimit = process.env["TEN_GTEK_DATASHEET_LIMIT"];
  const limit = parseInt(rawLimit || "500", 10);
  // Fail fast: a NaN or negative value would otherwise reach `LIMIT $1` and
  // surface as an opaque database error only after the page has been fetched.
  if (!Number.isSafeInteger(limit) || limit < 0) {
    throw new Error(
      `TEN_GTEK_DATASHEET_LIMIT must be a non-negative integer, got ${JSON.stringify(rawLimit)}`,
    );
  }
  const apply = process.env["TEN_GTEK_DATASHEET_APPLY"] === "1";

  const resp = await fetch(TEN_GTEK_TRANSCEIVERS_URL, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${TEN_GTEK_TRANSCEIVERS_URL}`);
  const pdfs = extractPdfUrls(await resp.text());

  const candidates = await pool.query<Candidate>(
    `SELECT t.id,
t.part_number AS "partNumber",
t.product_page_url AS "productUrl",
t.form_factor AS "formFactor",
t.speed_gbps AS "speedGbps",
t.reach_label AS "reachLabel",
t.fiber_type AS "fiberType"
FROM transceivers t
JOIN vendors v ON v.id = t.vendor_id
WHERE v.name = '10Gtek'
AND COALESCE(t.category, '') != 'NonTransceiver'
AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.details_verified, false) = false)
ORDER BY t.part_number
LIMIT $1`,
    [limit],
  );

  // Run statistics reported in the final log line.
  let matched = 0; // rows for which a datasheet PDF was found
  let urls = 0; // rows touched by the URL-filling UPDATE
  let details = 0; // rows for which markDetailsVerified returned truthy
  let skipped = 0; // rows without any matching PDF

  console.log("=== 10Gtek datasheet verifier ===", {
    pdfs: pdfs.length,
    candidates: candidates.rowCount ?? 0,
    limit,
    apply,
  });

  for (const row of candidates.rows) {
    const pdf = bestMatch(row.partNumber, pdfs);
    if (!pdf) {
      skipped++;
      continue;
    }
    matched++;

    // Details may only be marked verified when all four spec fields are
    // already populated (and the speed is a positive number).
    const canMarkDetails = Boolean(row.formFactor && Number(row.speedGbps || 0) > 0 && row.reachLabel && row.fiberType);
    console.log("datasheet match", {
      partNumber: row.partNumber,
      pdf: pdf.label,
      url: pdf.url,
      canMarkDetails,
      apply,
    });

    // Dry run: report the match but leave the database untouched.
    if (!apply) continue;

    // Fill the URL columns only where they are currently NULL or empty;
    // existing values are kept. `updated_at` is bumped either way.
    const updated = await pool.query(
      `UPDATE transceivers
SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
details_source_url = COALESCE(NULLIF(details_source_url, ''), $2),
updated_at = NOW()
WHERE id = $1
RETURNING id`,
      [row.id, pdf.url],
    );
    if ((updated.rowCount ?? 0) > 0) urls++;

    if (canMarkDetails && await markDetailsVerified({ transceiverId: row.id, sourceUrl: pdf.url })) {
      details++;
    }

    // Evidence is recorded for every applied match, with a lower confidence
    // when the row's spec fields were incomplete.
    await recordVerificationEvidence({
      transceiverId: row.id,
      verificationType: "details",
      sourceUrl: pdf.url,
      evidenceValue: {
        partNumber: row.partNumber,
        datasheet: pdf.label,
        matchedBy: "10gtek_transceivers_pdf_index",
        canMarkDetails,
      },
      robotName: "verify:10gtek-datasheets",
      confidence: canMarkDetails ? 0.95 : 0.8,
    });
  }

  console.log("10Gtek datasheet verifier complete", { matched, urls, details, skipped, apply });
}
// Run only when this file is executed directly, not when it is imported.
if (require.main === module) {
  void (async () => {
    try {
      await main();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      // Not awaited: the process exits immediately with a failure code.
      pool.end();
      process.exit(1);
    }
  })();
}