// 170 lines · 4.9 KiB · TypeScript
import { pool, markDetailsVerified, recordVerificationEvidence } from "./db";
|
|
|
|
/** Index page on 10gtek.com that is fetched and scanned for datasheet PDF links. */
const TEN_GTEK_TRANSCEIVERS_URL = "https://www.10gtek.com/transceivers";

/**
 * Request headers sent with the index-page fetch: a desktop Chrome
 * User-Agent string and an Accept list restricted to HTML/XHTML.
 */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Accept": "text/html,application/xhtml+xml",
};
|
|
|
|
/**
 * One transceiver row as selected by the candidate query in `main`.
 * Each field is the camelCase alias of a `transceivers` column.
 */
type Candidate = {
  /** `t.id` */
  id: string;
  /** `t.part_number`; the value matched against datasheet file names. */
  partNumber: string;
  /** `t.product_page_url` */
  productUrl: string | null;
  /** `t.form_factor` */
  formFactor: string | null;
  /**
   * `t.speed_gbps`. Typed as string or number and coerced with `Number()`
   * before use — presumably because the driver may return numerics as text.
   */
  speedGbps: string | number | null;
  /** `t.reach_label` */
  reachLabel: string | null;
  /** `t.fiber_type` */
  fiberType: string | null;
};
|
|
|
|
/**
 * Reduces a part number or file name to a comparison key.
 *
 * The value is upper-cased, literal `%20` sequences become spaces, one
 * trailing `.PDF` / `.HTM` / `.HTML` extension is dropped, and finally every
 * character outside `A-Z` / `0-9` is removed.
 *
 * @param value Raw part number or file name.
 * @returns The alphanumeric, upper-case key (may be empty).
 */
function normalize(value: string): string {
  const upper = value.toUpperCase();
  const withSpaces = upper.replace(/%20/g, " ");
  const withoutExtension = withSpaces.replace(/\.(PDF|HTML?)$/i, "");
  const alphanumericOnly = withoutExtension.replace(/[^A-Z0-9]+/g, "");
  return alphanumericOnly;
}
|
|
|
|
/**
 * Percent-decodes a string on a best-effort basis.
 *
 * @param url Text that may contain percent-escapes.
 * @returns The decoded text, or the input unchanged when
 *   `decodeURIComponent` rejects it as malformed.
 */
function decodeUrl(url: string): string {
  let decoded = url;
  try {
    decoded = decodeURIComponent(url);
  } catch {
    // Malformed escape sequence: keep the raw input.
  }
  return decoded;
}
|
|
|
|
/**
 * Turns an href found on the index page into an absolute URL.
 *
 * Hrefs that already carry an `http://` or `https://` scheme are returned
 * untouched. Everything else is resolved against `base`; a single leading
 * `../` is rewritten to `/` first, so such links are treated as
 * site-root-relative.
 *
 * The scheme test is explicit on purpose: a bare `startsWith("http")` would
 * also accept relative paths such as `httpdocs/file.pdf` and return them
 * unresolved.
 *
 * @param href Link target as it appears in the HTML.
 * @param base URL that relative hrefs are resolved against; defaults to the
 *   transceiver index page.
 * @returns The absolute URL as a string.
 * @throws TypeError when a non-absolute `href` cannot be parsed against `base`.
 */
function absoluteUrl(href: string, base: string = TEN_GTEK_TRANSCEIVERS_URL): string {
  if (/^https?:\/\//.test(href)) return href;
  const rootRelative = href.replace(/^\.\.\//, "/");
  return new URL(rootRelative, base).toString();
}
|
|
|
|
/**
 * Collects the distinct PDF links found in `href="…"` / `href='…'`
 * attributes of an HTML document.
 *
 * Each href is resolved with `absoluteUrl` and canonicalised through the
 * `URL` parser *without* being percent-decoded first: decoding before
 * resolution would leave literal spaces in absolute links and would turn
 * escaped reserved characters (`%23`, `%3F`, …) into URL syntax. Only the
 * final path segment is decoded, to build the human-readable label.
 * Hrefs the URL parser rejects are skipped instead of aborting the scan.
 *
 * @param html Raw HTML of the index page.
 * @returns One entry per distinct URL, in document order:
 *   `url` (absolute), `label` (decoded file name without `.pdf`) and
 *   `key` (`normalize(label)`).
 */
function extractPdfUrls(html: string): Array<{ url: string; key: string; label: string }> {
  const found: Array<{ url: string; key: string; label: string }> = [];
  const seen = new Set<string>();

  for (const match of html.matchAll(/href=["']([^"']+\.pdf)["']/gi)) {
    const href = (match[1] ?? "").trim();

    let url: string;
    try {
      // Round-trip through URL so equivalent spellings dedupe to one entry.
      url = new URL(absoluteUrl(href)).toString();
    } catch {
      continue; // Unparseable href: ignore this link only.
    }

    if (seen.has(url)) continue;
    seen.add(url);

    const fileName = url.split("/").pop() || url;
    const label = decodeUrl(fileName).replace(/\.pdf$/i, "").trim();
    found.push({ url, key: normalize(label), label });
  }

  return found;
}
|
|
|
|
/**
 * Picks the datasheet entry that best corresponds to a part number.
 *
 * Matching is done on normalized keys:
 * 1. The first entry whose key equals the part-number key wins outright.
 * 2. Otherwise, among entries where one key contains the other, the entry
 *    whose key length is closest to the part-number key is returned (first
 *    one on ties). Taking the closest rather than the first substring hit
 *    stops a short, generic datasheet name from shadowing a more specific one
 *    that appears later on the page.
 *
 * Keys shorter than four characters never match, on either side.
 *
 * @param partNumber Raw part number of the transceiver.
 * @param pdfs Datasheet entries whose `key` is already normalized.
 * @returns The chosen entry, or `undefined` when nothing matches.
 */
function bestMatch(partNumber: string, pdfs: Array<{ url: string; key: string; label: string }>) {
  const key = normalize(partNumber);
  if (key.length < 4) return undefined;

  let closest: (typeof pdfs)[number] | undefined;
  let closestGap = Number.POSITIVE_INFINITY;

  for (const pdf of pdfs) {
    if (pdf.key === key) return pdf;
    if (pdf.key.length < 4) continue;
    if (!pdf.key.includes(key) && !key.includes(pdf.key)) continue;

    const gap = Math.abs(pdf.key.length - key.length);
    if (gap < closestGap) {
      closest = pdf;
      closestGap = gap;
    }
  }

  return closest;
}
|
|
|
|
/**
 * Matches 10Gtek transceiver rows against the datasheet PDFs linked from the
 * vendor's transceiver index page and, when enabled, records each match.
 *
 * Environment:
 * - `TEN_GTEK_DATASHEET_LIMIT` — maximum number of candidate rows to load
 *   (default 500). Values that are not a non-negative integer are ignored
 *   with a warning, because they would otherwise reach `LIMIT $1` as `NaN`
 *   or a negative number and fail the query.
 * - `TEN_GTEK_DATASHEET_APPLY` — database writes happen only when this is
 *   exactly `"1"`; otherwise matches are only logged.
 *
 * @throws Error when the index page responds with a non-OK status. Fetch,
 *   query and helper failures propagate unchanged.
 */
async function main(): Promise<void> {
  const defaultLimit = 500;
  const rawLimit = process.env["TEN_GTEK_DATASHEET_LIMIT"] || String(defaultLimit);
  const parsedLimit = Number.parseInt(rawLimit, 10);
  const limitIsValid = Number.isInteger(parsedLimit) && parsedLimit >= 0;
  if (!limitIsValid) {
    console.warn("Ignoring invalid TEN_GTEK_DATASHEET_LIMIT", { value: rawLimit, using: defaultLimit });
  }
  const limit = limitIsValid ? parsedLimit : defaultLimit;
  const apply = process.env["TEN_GTEK_DATASHEET_APPLY"] === "1";

  const resp = await fetch(TEN_GTEK_TRANSCEIVERS_URL, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${TEN_GTEK_TRANSCEIVERS_URL}`);

  const pdfs = extractPdfUrls(await resp.text());

  // Candidates: 10Gtek rows (excluding the 'NonTransceiver' category) that
  // lack a product page URL or have not had their details verified yet.
  const candidates = await pool.query<Candidate>(
    `SELECT t.id,
            t.part_number AS "partNumber",
            t.product_page_url AS "productUrl",
            t.form_factor AS "formFactor",
            t.speed_gbps AS "speedGbps",
            t.reach_label AS "reachLabel",
            t.fiber_type AS "fiberType"
       FROM transceivers t
       JOIN vendors v ON v.id = t.vendor_id
      WHERE v.name = '10Gtek'
        AND COALESCE(t.category, '') != 'NonTransceiver'
        AND (COALESCE(t.product_page_url, '') = '' OR COALESCE(t.details_verified, false) = false)
      ORDER BY t.part_number
      LIMIT $1`,
    [limit],
  );

  let matched = 0; // rows for which a datasheet was found
  let urls = 0; // rows touched by the URL update
  let details = 0; // rows for which markDetailsVerified returned truthy
  let skipped = 0; // rows without any datasheet match

  console.log("=== 10Gtek datasheet verifier ===", {
    pdfs: pdfs.length,
    candidates: candidates.rowCount ?? 0,
    limit,
    apply,
  });

  for (const row of candidates.rows) {
    const pdf = bestMatch(row.partNumber, pdfs);
    if (!pdf) {
      skipped++;
      continue;
    }
    matched++;

    // Details are only marked verified when every descriptive field is
    // present and the speed parses to a positive number.
    const canMarkDetails = Boolean(
      row.formFactor && Number(row.speedGbps || 0) > 0 && row.reachLabel && row.fiberType,
    );
    console.log("datasheet match", {
      partNumber: row.partNumber,
      pdf: pdf.label,
      url: pdf.url,
      canMarkDetails,
      apply,
    });

    // Dry run: report the match but leave the database untouched.
    if (!apply) continue;

    // Existing non-empty URLs are preserved; only blanks are filled in.
    const updated = await pool.query(
      `UPDATE transceivers
          SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
              details_source_url = COALESCE(NULLIF(details_source_url, ''), $2),
              updated_at = NOW()
        WHERE id = $1
        RETURNING id`,
      [row.id, pdf.url],
    );
    if ((updated.rowCount ?? 0) > 0) urls++;

    if (canMarkDetails && (await markDetailsVerified({ transceiverId: row.id, sourceUrl: pdf.url }))) {
      details++;
    }

    // NOTE(review): substring (non-exact) matches from bestMatch get the same
    // confidence as exact ones here — confirm that is intended.
    await recordVerificationEvidence({
      transceiverId: row.id,
      verificationType: "details",
      sourceUrl: pdf.url,
      evidenceValue: {
        partNumber: row.partNumber,
        datasheet: pdf.label,
        matchedBy: "10gtek_transceivers_pdf_index",
        canMarkDetails,
      },
      robotName: "verify:10gtek-datasheets",
      confidence: canMarkDetails ? 0.95 : 0.8,
    });
  }

  console.log("10Gtek datasheet verifier complete", { matched, urls, details, skipped, apply });
}
|
|
|
|
// Script entry point: run the verifier only when this file is executed
// directly, not when it is loaded as a module by other code.
if (require.main === module) {
  void (async () => {
    try {
      await main();
      await pool.end();
    } catch (err) {
      // Report the failure, start closing the pool, and exit non-zero.
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}
|