From f64dbf7b6bd549392e50a232e9d8089b28e76174 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 9 May 2026 11:15:36 +0200 Subject: [PATCH] fix: add fscom targeted detail verification mode --- packages/scraper/src/scrapers/fs-com.ts | 65 ++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index 72532c2..6031e11 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -72,6 +72,7 @@ const MAX_DETAIL_PAGES_PER_RUN = parseInt(process.env["FS_MAX_DETAIL_PAGES_PER_R const STOCK_FRESH_HOURS = parseInt(process.env["FS_STOCK_FRESH_HOURS"] ?? "12", 10); const FORCE_REVALIDATE = process.env["TIP_FORCE_REVALIDATE"] === "1"; const ONLY_MISSING_IMAGES = process.env["FS_ONLY_MISSING_IMAGES"] === "1"; +const DB_DETAIL_ONLY = process.env["FS_DB_DETAIL_ONLY"] === "1"; const PROXY_URLS = (process.env["PROXY_URLS"] ?? "") .split(",") @@ -241,8 +242,18 @@ function detectSpeed(text: string): { speed: string; speedGbps: number } | undef } function detectReach(text: string): string | undefined { - const m = text.match(/(\d+)\s*(m|km)\b/i); - return m ? `${m[1]}${m[2].toLowerCase()}` : undefined; + const m = text.match(/(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\s*(m|km)\b/i); + return m ? `${m[1].replace(/,/g, "")}${m[2].toLowerCase()}` : undefined; +} + +function detectFiberType(text: string): string | undefined { + if (/active\s+optical|\baoc\b/i.test(text)) return "AOC"; + if (/copper|dac|twinax|direct\s+attach|rj-?45|base-t/i.test(text)) return "Copper"; + if (/single.?mode|\bsmf\b|os2|cwdm|dwdm|\bcw-|^cw-|dw-|bidi|\blx\b|\blr\d*\b|\ber\d*\b|\bzr\d*\b|\bdr\d*\b|\bfr\d*\b|\bpsm\d*\b/i.test(text)) { + return "SMF"; + } + if (/multi.?mode|\bmmf\b|om[1-5]|\bsx\b|\bsr\d*\b/i.test(text)) return "MMF"; + return undefined; } // ── Types ────────────────────────────────────────────────────────────────────── @@ -777,8 +788,45 @@ export async function scrapeFs(): Promise { console.log(`Vendor ID: ${vendorId}`); // ── Phase 1: Discover product URLs ───────────────────────────────────────── - console.log("\n[Phase 1] Collecting product URLs from category listing pages…"); - const productMap = await collectProductUrls(proxyConfiguration); + let productMap: Map; + if (DB_DETAIL_ONLY) { + console.log("\n[Phase 1] DB detail-only mode — using existing FS.COM product URLs with missing verification signals…"); + const dbRows = await pool.query( + ` + SELECT t.part_number, t.product_page_url + FROM transceivers t + JOIN vendors v ON v.id = t.vendor_id + WHERE v.name = 'FS.COM' + AND t.product_page_url IS NOT NULL + AND t.product_page_url != '' + AND t.product_page_url LIKE '%/products/%' + AND ( + COALESCE(t.price_verified, false) = false + OR COALESCE(t.image_verified, false) = false + OR COALESCE(t.details_verified, false) = false + OR COALESCE(t.fiber_type, '') = '' + OR COALESCE(t.reach_label, '') = '' + ) + ORDER BY + COALESCE(t.price_verified, false) DESC, + COALESCE(t.image_verified, false) DESC, + COALESCE(t.details_verified, false) ASC, + t.part_number + LIMIT $1 + `, + [MAX_DETAIL_PAGES_PER_RUN] + ); + productMap = new Map( + dbRows.rows.map((row) => { + const url = normalizeFsProductUrl(row.product_page_url as string); + const partNumber = row.part_number as string; + return [url, { url, name: partNumber, partNumber }]; + }) + ); + } else { + console.log("\n[Phase 1] Collecting product URLs from category listing pages…"); + productMap = await collectProductUrls(proxyConfiguration); + } if (productMap.size === 0) { console.warn("[Phase 1] No products discovered — check selectors or proxy."); @@ -860,16 +908,19 @@ export async function scrapeFs(): Promise { const speedInfo = detectSpeed(detail.name); const reach = detectReach(detail.name); const parsed = parseSpecTable(detail.specs); + const textForInference = `${detail.name} ${detail.partNumber} ${Object.values(detail.specs).join(" ")}`; + const fiberType = parsed.fiberType ?? detectFiberType(textForInference); const transceiverId = await findOrCreateScrapedTransceiver({ partNumber: detail.partNumber, vendorId, + productUrl: detail.url, formFactor: ff, speedGbps: speedInfo?.speedGbps, speed: speedInfo?.speed, reachLabel: reach ?? parsed.reachLabel, reachMeters: parsed.reachMeters, - fiberType: parsed.fiberType, + fiberType, wavelengths: parsed.wavelengths, imageUrl: detail.imageUrl, category: "DataCenter", @@ -922,7 +973,7 @@ export async function scrapeFs(): Promise { if (Object.keys(detail.specs).length > 0) { const updated = await updateVerifiedSpecs({ transceiverId, - fiberType: parsed.fiberType, + fiberType, connector: parsed.connector, wavelengths: parsed.wavelengths, reachMeters: parsed.reachMeters, @@ -933,7 +984,7 @@ export async function scrapeFs(): Promise { domSupport: parsed.domSupport, imageUrl: detail.imageUrl, datasheetUrl: detail.datasheetUrl, - source: "fs.com", + source: detail.url, }); if (updated) specsUpdated++; }