/**
 * ProLabs Scraper — Catalog-only (no public pricing)
 *
 * ProLabs (an Amphenol company) uses a B2B quote model — prices require a
 * sales contact or reseller account and are NOT shown on the public website.
 * The schema.org markup consistently shows price=0.00.
 *
 * Approach: sitemap-driven fetch scraper (curl-compatible headers).
 * CloudFront allows regular HTTP requests; only Playwright/browser automation
 * gets blocked via TLS fingerprinting.
 *
 * Collects: part numbers, product names, form factors, specs (no prices).
 * Writes: `transceivers` catalog entries via findOrCreateScrapedTransceiver.
 *
 * SKU format examples: "SFP-10G-SR-PR", "Q28-100G-LR4-PR", "Q-4X10G-LR-PR"
 */

import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";

const BASE = "https://www.prolabs.com";
const SITEMAP_INDEX = `${BASE}/sitemap.xml`;
const RATE_LIMIT_MS = 300; // ~3 req/sec — polite crawl

const HEADERS = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.5",
  "Accept-Encoding": "gzip, deflate, br",
};

/**
 * Keywords that mark a title, SKU or URL as transceiver-related.
 * Form-factor names may carry a numeric suffix (SFP28, QSFP28, CFP2, CFP4);
 * a plain `\b(sfp|…)\b` would reject those because no word boundary exists
 * between the letters and the digits.
 */
const TRANSCEIVER_KEYWORDS =
  /\b(?:sfp|qsfp|xfp|osfp|cfp|cxp)\d*\b|\b(?:transceiver|fiber.optic|fibre.optic)\b/i;

/**
 * Reach heuristics, tested in order; the first matching pattern wins.
 * Explicit distances come before optic-type suffixes (LR, ER, SR, …).
 */
const REACH_PATTERNS: readonly (readonly [RegExp, string, number])[] = [
  [/\b120\s*km\b/i, "120km", 120000],
  [/\b80\s*km\b/i, "80km", 80000],
  [/\b40\s*km\b/i, "40km", 40000],
  [/\b20\s*km\b/i, "20km", 20000],
  [/\b10\s*km\b/i, "10km", 10000],
  [/\b2\s*km\b/i, "2km", 2000],
  [/\b550\s*m\b/i, "550m", 550],
  [/\b500\s*m\b/i, "500m", 500],
  [/\b300\s*m\b/i, "300m", 300],
  [/\b100\s*m\b/i, "100m", 100],
  [/\bLR4\b/i, "10km", 10000],
  // Plain word-boundary match so "LR" is also found at the start of the text
  // or after punctuation such as " (LR)".
  [/\bLR\b/i, "10km", 10000],
  [/\bER4?\b/i, "40km", 40000],
  [/\bZR4?\b/i, "80km", 80000],
  [/\bSR4?\b/i, "300m", 300],
  [/\bDR4?\b/i, "500m", 500],
  [/\bFR4?\b/i, "2km", 2000],
];

/** URL paths (category, support, accessory pages, …) that are never product pages. */
const NON_PRODUCT_PATH =
  /^\/(search|contact|sitemap|sitemap|downloads|articles|industry|support|why-|prolabs-dense|prolabs-test|360-virtual|videos|multi-coded|case-studies|white-papers|faqs|built-for-ai|what-is-new|about|associations|careers|multi-source|legacy|rma|tech-support|warranty|wintune|edfamux|privacy|newsletter|where-to-buy|media-converter|multiservice|eon-omp|multiplexers|patch-cables|cassettes|adapter|server|desktop|network|a-v-cables|power-adapters|usb-cables|c-lc|c-sc|c-fc|c-st|c-mtp|c-mpo|c-mt|power-|usb-)/;

/** Form factor and nominal speed derived from free text. */
interface FormFactorInfo {
  formFactor: string;
  speed: string;
  speedGbps: number;
}

/** Catalog fields extracted from a single product page. */
interface ScrapedProduct extends FormFactorInfo {
  partNumber: string;
  name: string;
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelength?: string;
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * GET `url` with the shared HEADERS and return the response body.
 *
 * @returns the body text, or `null` on a non-2xx status or any thrown error
 *          (deliberately best-effort: callers treat `null` as "skip").
 */
async function fetchText(url: string): Promise<string | null> {
  try {
    const res = await fetch(url, { headers: HEADERS });
    if (!res.ok) return null;
    return await res.text();
  } catch {
    return null;
  }
}

/** Match an explicit XFP / CFP-family name in already lower-cased text. */
function matchXfpOrCfp(lower: string): FormFactorInfo | undefined {
  if (lower.includes("xfp")) return { formFactor: "XFP", speed: "10G", speedGbps: 10 };
  if (lower.includes("cfp2")) return { formFactor: "CFP2", speed: "100G", speedGbps: 100 };
  if (lower.includes("cfp4")) return { formFactor: "CFP4", speed: "100G", speedGbps: 100 };
  if (lower.includes("cfp")) return { formFactor: "CFP", speed: "100G", speedGbps: 100 };
  return undefined;
}

/**
 * Guess form factor and nominal speed from product text (title, H1, SKU).
 *
 * Checks run from most to least specific; when nothing matches the result
 * defaults to SFP+ / 10G.
 *
 * @param text free text describing the product
 * @returns the detected form factor with its speed label and Gbps value
 */
function detectFormFactor(text: string): FormFactorInfo {
  const lower = text.toLowerCase();
  const has = (needle: string): boolean => lower.includes(needle);

  if (has("osfp") && has("1600g")) return { formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 };
  if (has("osfp") && has("800g")) return { formFactor: "OSFP", speed: "800G", speedGbps: 800 };
  if (has("qsfp-dd") || has("qsfp dd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };

  // When no SFP-family name appears, an explicit XFP/CFP name must win over
  // the speed-only heuristics below; otherwise "XFP 10GBASE-LR" hits the
  // "10g" test and is labelled SFP+, and "CFP2 100GBASE-LR4" becomes QSFP28.
  if (!has("sfp")) {
    const explicit = matchXfpOrCfp(lower);
    if (explicit) return explicit;
  }

  if (has("qsfp28") || has("100gbase")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if ((has("qsfp+") || has("qsfp plus")) && !has("28")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (has("sfp28") || has("25gbase") || has("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (has("sfp+") || has("10gbase") || has("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };

  const xfpOrCfp = matchXfpOrCfp(lower);
  if (xfpOrCfp) return xfpOrCfp;

  if (has("1000base") || has("1gbase")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };
  if (has("sfp") && !has("qsfp")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("qsfp")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
}

/**
 * Guess the reach from product text.
 *
 * @returns label and distance in meters of the first matching pattern in
 *          REACH_PATTERNS, or `undefined` when none matches
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  for (const [rx, label, meters] of REACH_PATTERNS) {
    if (rx.test(text)) return { label, meters };
  }
  return undefined;
}

/**
 * Guess the medium from product text.
 *
 * @returns "SMF", "MMF", "Copper", or "" when no keyword matches
 *          (single-mode keywords are tested first)
 */
function detectFiber(text: string): string {
  if (/single.?mode|smf|\blx\b|\blr\b|\ber\b|\bzr\b|bidi|cwdm|dwdm/i.test(text)) return "SMF";
  if (/multi.?mode|mmf|\bsx\b|\bsr\b/i.test(text)) return "MMF";
  if (/copper|dac|twinax|rj.?45|base-t|cat[56x]/i.test(text)) return "Copper";
  return "";
}

/** Return the first 3–4 digit number followed by "nm" (digits only), or "". */
function detectWavelength(text: string): string {
  const m = text.match(/(\d{3,4})\s*nm/i);
  return m?.[1] ?? "";
}

/**
 * Derive a fallback part number from a ProLabs product URL slug.
 *
 * URL pattern: /vendor-sku-c-description or /sku-description — the "-c-"
 * separator splits the SKU part from the compatibility description.
 * Everything before the first "-c-" is kept (including any vendor prefix),
 * upper-cased, with runs of dashes collapsed, e.g.
 * "/extreme-sfp-10gbase-zr-100-i-ex-c-some-description"
 *   → "EXTREME-SFP-10GBASE-ZR-100-I-EX".
 */
function skuFromSlug(slug: string): string {
  const [base = ""] = slug.replace(/^\//, "").split("-c-");
  return base.toUpperCase().replace(/-+/g, "-");
}

/** Download and parse the sitemap index to collect all sitemap URLs. */
async function fetchSitemapUrls(): Promise<string[]> {
  const index = await fetchText(SITEMAP_INDEX);
  if (!index) {
    console.warn(" Could not fetch sitemap index");
    return [];
  }
  // Anchor on the opening <loc> tag: without it the capture starts right
  // after the "<" and yields "loc>https://…", which is not a fetchable URL.
  const sitemapUrls = [...index.matchAll(/<loc>([^<]+sitemap[^<]+\.xml)<\/loc>/gi)].map((m) =>
    (m[1] ?? "").trim()
  );
  console.log(` Found ${sitemapUrls.length} sitemaps`);
  return sitemapUrls;
}

/**
 * Download a single sitemap and extract candidate product URLs.
 *
 * @returns prolabs.com URLs from `<loc>` entries, minus category, search,
 *          support, memory, cart and order pages; `[]` if the fetch fails
 */
async function fetchProductUrlsFromSitemap(sitemapUrl: string): Promise<string[]> {
  const xml = await fetchText(sitemapUrl);
  if (!xml) return [];

  return [...xml.matchAll(/<loc>([^<]+prolabs\.com\/[^<]+)<\/loc>/gi)]
    .map((m) => (m[1] ?? "").trim())
    .filter((u) => {
      // Filter out category pages, search, contact, etc.
      const path = u.replace(/https?:\/\/[^/]+/, "");
      return (
        !NON_PRODUCT_PATH.test(path) &&
        !path.includes("memory") &&
        !path.includes("/cart") &&
        !path.includes("/order")
      );
    });
}

/** Strip tags from H1 inner HTML and decode hex and `&amp;` entities. */
function cleanHeading(innerHtml: string): string {
  return innerHtml
    .replace(/<[^>]+>/g, "")
    .replace(/&#x[0-9A-F]+;/gi, (e) => String.fromCharCode(parseInt(e.slice(3, -1), 16)))
    .replace(/&amp;/g, "&")
    .trim();
}

/**
 * Scrape a ProLabs product page for part number and specs.
 *
 * @returns the extracted fields, or `null` when the page cannot be fetched,
 *          carries no transceiver keyword and no "-PR" marker, or has no
 *          usable name (shorter than 5 characters)
 */
async function scrapeProductPage(url: string): Promise<ScrapedProduct | null> {
  const html = await fetchText(url);
  if (!html) return null;

  // Extract title from the <title> tag, dropping a trailing " | Site name".
  const titleM = html.match(/<title>([^<]+)<\/title>/i);
  const pageTitle = (titleM?.[1] ?? "").replace(/\s*\|\s*[^|]+$/, "").trim();

  // Extract H1 (canonical product name)
  const h1M = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  const h1 = cleanHeading(h1M?.[1] ?? "");

  // Extract SKU from nopCommerce structure
  const skuM = html.match(/id="sku-\d+"[^>]*>([^<]+)<\/span>/i);
  const pageSku = (skuM?.[1] ?? "").trim();

  // If no transceiver keywords found, skip
  const combined = [pageTitle, h1, pageSku].join(" ");
  if (!TRANSCEIVER_KEYWORDS.test(combined) && !combined.includes("-PR")) return null;

  // Use page SKU if available, otherwise derive from URL slug
  const slug = url.replace(/https?:\/\/[^/]+/, "");
  const partNumber = pageSku || skuFromSlug(slug);
  const name = h1 || pageTitle;

  if (!name || name.length < 5) return null;

  const { formFactor, speed, speedGbps } = detectFormFactor(combined);
  const reach = detectReach(combined);

  return {
    partNumber,
    name,
    formFactor,
    speed,
    speedGbps,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(combined),
    wavelength: detectWavelength(combined),
  };
}

/**
 * Crawl the ProLabs sitemaps, scrape every transceiver-related product page
 * and write one catalog entry per page via findOrCreateScrapedTransceiver.
 *
 * Requests are made sequentially with RATE_LIMIT_MS between them. A failure
 * on one page is logged and does not stop the run.
 */
export async function scrapeProLabs(): Promise<void> {
  console.log("=== ProLabs Catalog Scraper (fetch-based, no Playwright) ===\n");
  console.log("Note: ProLabs uses B2B quote model — catalog data only, no public prices.\n");

  const vendorId = await ensureVendor(
    "ProLabs",
    "compatible",
    "https://www.prolabs.com",
    "https://www.prolabs.com/transceivers"
  );

  let totalProcessed = 0;
  let totalNew = 0;
  let skipped = 0;

  // Step 1: Collect product URLs from sitemaps
  console.log("Step 1: Collecting sitemaps...");
  const sitemapUrls = await fetchSitemapUrls();

  const productUrls: string[] = [];
  let sitemapCount = 0;
  for (const smUrl of sitemapUrls) {
    const urls = await fetchProductUrlsFromSitemap(smUrl);
    productUrls.push(...urls);
    sitemapCount++;
    if (sitemapCount % 3 === 0) {
      console.log(` Processed ${sitemapCount}/${sitemapUrls.length} sitemaps, ${productUrls.length} product URLs so far`);
    }
    await sleep(RATE_LIMIT_MS);
  }

  // Deduplicate
  const uniqueUrls = [...new Set(productUrls)];
  console.log(`\nStep 2: Found ${uniqueUrls.length} unique product URLs\n`);

  // Step 3: Scrape each product page
  // Limit to transceiver-related URLs first (filter by keyword in slug)
  const transceiverUrls = uniqueUrls.filter((u) => {
    const path = u.toLowerCase();
    return TRANSCEIVER_KEYWORDS.test(path) || path.includes("-pr") || path.includes("transceiver");
  });

  console.log(`Step 3: Scraping ${transceiverUrls.length} transceiver-related pages...\n`);

  for (const url of transceiverUrls) {
    try {
      const product = await scrapeProductPage(url);
      totalProcessed++;

      if (!product) {
        skipped++;
      } else {
        await findOrCreateScrapedTransceiver({
          partNumber: product.partNumber.slice(0, 80),
          vendorId,
          formFactor: product.formFactor,
          speedGbps: product.speedGbps,
          speed: product.speed,
          reachMeters: product.reachMeters,
          reachLabel: product.reachLabel,
          fiberType: product.fiberType,
          wavelengths: product.wavelength,
          category: "DataCenter",
        });
        totalNew++;

        if (totalNew % 50 === 0) {
          console.log(` ${totalNew} catalog entries written (${totalProcessed} pages processed)`);
        }
      }
    } catch (err: unknown) {
      // Narrow before reading `.message`: a rejection is not guaranteed to be an Error.
      const message = err instanceof Error ? err.message : String(err);
      console.warn(` Error [${url.slice(-60)}]: ${message.slice(0, 80)}`);
    } finally {
      // Keep the crawl polite even when this iteration failed.
      await sleep(RATE_LIMIT_MS);
    }
  }

  console.log(`\n=== ProLabs Catalog Complete ===`);
  console.log(` Pages processed: ${totalProcessed}`);
  console.log(` Catalog entries written: ${totalNew}`);
  console.log(` Skipped (non-transceiver): ${skipped}`);
  console.log(` Note: No prices (B2B quote model)`);
}

// Run as a script: scrape, then close the DB pool; exit non-zero on a fatal error.
if (require.main === module) {
  scrapeProLabs()
    .then(() => pool.end())
    .catch((err) => {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    });
}