/**
 * ProLabs Scraper — Enterprise-grade compatible optics (Legrand subsidiary)
 *
 * prolabs.com — CloudFront WAF aggressively blocks datacenter IPs.
 * Uses PlaywrightCrawler with Firefox for anti-detection.
 *
 * KNOWN ISSUE: CloudFront blocks all requests from IONOS/datacenter IPs
 * (HTTP 403 "Request blocked"). This scraper works correctly from
 * residential IPs. Solutions:
 *   1. Set PROXY_URL env var to a residential/rotating proxy
 *   2. Run from a residential IP (e.g. home server)
 *   3. Route through WireGuard with internet breakout at home
 *
 * Products listed under /products/networking/fiber-optics/ category pages.
 * Pagination via ?page=N. Rate limited: maxConcurrency 1, 10 req/min.
 *
 * SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR"
 */
import { PlaywrightCrawler, ProxyConfiguration, RequestQueue } from "crawlee";
import { firefox } from "playwright";
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";

const BASE = "https://www.prolabs.com";
const MAX_PAGES = 100;
const PROXY_URL = process.env.PROXY_URL || "";

/**
 * Category listing pages to crawl, each with the form factor / speed assumed
 * for products whose SKU does not say otherwise (see `inferFromSku`).
 * The last entry is the generic parent listing and doubles as the fallback
 * category when a request cannot be matched to any other entry.
 */
const CATEGORIES = [
  { path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 },
  { path: "/products/networking/fiber-optics/sfp-plus-modules", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
  { path: "/products/networking/fiber-optics/sfp28-modules", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
  { path: "/products/networking/fiber-optics/qsfp-plus-modules", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
  { path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
  { path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
  { path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
  { path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
];

type Category = (typeof CATEGORIES)[number];

/** One product as parsed from a listing page, before it is written to the DB. */
interface Product {
  partNumber: string;
  name: string;
  url: string;
  price?: number;
  stockStatus?: string;
  formFactor: string;
  speed: string;
  speedGbps: number;
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelength?: string;
}

/* ------------------------------------------------------------------ */
/* Helper / detection functions                                        */
/* ------------------------------------------------------------------ */

/**
 * Reach patterns in priority order: explicit distances first, then the
 * optic-type suffixes (LR/ER/ZR/SR/DR/FR) mapped to a nominal reach.
 * Suffix patterns are case-sensitive on purpose; distance patterns are not.
 */
const REACH_PATTERNS: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/\b120\s*km\b/i, "120km", 120000],
  [/\b80\s*km\b/i, "80km", 80000],
  [/\b40\s*km\b/i, "40km", 40000],
  [/\b20\s*km\b/i, "20km", 20000],
  [/\b10\s*km\b/i, "10km", 10000],
  [/\b2\s*km\b/i, "2km", 2000],
  [/\b550\s*m\b/i, "550m", 550],
  [/\b500\s*m\b/i, "500m", 500],
  [/\b400\s*m\b/i, "400m", 400],
  [/\b300\s*m\b/i, "300m", 300],
  [/\b150\s*m\b/i, "150m", 150],
  [/\b100\s*m\b/i, "100m", 100],
  [/\b30\s*m\b/i, "30m", 30],
  [/\bLR4\b/, "10km", 10000],
  [/\bLR\b/, "10km", 10000],
  [/\bER4?\b/, "40km", 40000],
  [/\bZR4?\b/, "80km", 80000],
  [/\bSR4?\b/, "300m", 300],
  [/\bDR4?\b/, "500m", 500],
  [/\bFR4?\b/, "2km", 2000],
];

/**
 * Finds the first reach pattern that matches `text`.
 *
 * @param text Product name and/or part number.
 * @returns The reach label and its length in meters, or `undefined` when nothing matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  for (const [regex, label, meters] of REACH_PATTERNS) {
    if (regex.test(text)) return { label, meters };
  }
  return undefined;
}

/**
 * Classifies the medium as "SMF", "MMF" or "Copper" from keywords in `text`.
 *
 * The short optic-type tokens (lx/lr/er/zr, sx/sr) must not be adjacent to
 * another letter. Lookarounds are used so that a token at the very start or
 * end of the string (e.g. "SFP-10G-LR") still matches.
 *
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  if (/single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i.test(text)) return "SMF";
  if (/multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i.test(text)) return "MMF";
  if (/copper|dac|twinax|rj.?45|base-t|cat[56x]/i.test(text)) return "Copper";
  return "";
}

/**
 * Extracts the first 3–4 digit number followed by "nm" from `text`.
 *
 * @returns The digits only (e.g. "1310"), or "" when no wavelength is present.
 */
function detectWavelength(text: string): string {
  const match = text.match(/(\d{3,4})\s*nm/i);
  return match?.[1] ?? "";
}

/**
 * Infers form factor and speed from a SKU prefix, falling back to the
 * category defaults when the prefix is not recognised.
 *
 * @param sku Part number as scraped (any case).
 * @param cat Category the product was found under; supplies the fallback values.
 */
function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
  formFactor: string;
  speed: string;
  speedGbps: number;
} {
  const upper = sku.toUpperCase();
  if (/^QDD[-_]|QSFP.DD/i.test(upper)) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (/^Q28[-_]|QSFP28/i.test(upper)) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if (/^Q[-_]4X|^Q[-_]/i.test(upper) && !/28/i.test(upper.slice(0, 5)))
    return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (/^SFP28[-_]|SFP-25/i.test(upper)) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (/^S[-_]/i.test(upper) && !/sfp/i.test(upper.slice(1, 4)))
    return { formFactor: "SFP", speed: "1G", speedGbps: 1 };
  return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps };
}

/**
 * Maps free-form stock text, or a schema.org availability value such as
 * "https://schema.org/InStock", onto the stock levels stored in the DB.
 *
 * The text is reduced to lowercase letters only, so "In Stock", "in_stock"
 * and "InStock" compare equal. Negative states are tested first because
 * "unavailable" contains "available" and "not in stock" contains "in stock".
 *
 * @param raw Scraped stock text; missing or empty yields "on_request".
 */
function normalizeStockLevel(
  raw?: string
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
  if (!raw) return "on_request";
  const compact = raw.toLowerCase().replace(/[^a-z]/g, "");
  if (/outofstock|notinstock|soldout|backorder|unavailable|notavailable/.test(compact)) return "out_of_stock";
  if (/lowstock|limited/.test(compact)) return "low_stock";
  if (/instock|available/.test(compact)) return "in_stock";
  return "on_request";
}

/**
 * Parses a scraped price string into a number.
 *
 * Takes the first amount that follows a "$" when there is one, otherwise the
 * first number in the text, and removes thousands separators. Taking a single
 * amount keeps "Was $20.00 Now $15.00" from being read as one number.
 *
 * @returns The price, or `undefined` when it is missing or outside (0, 100000).
 */
function parsePrice(raw: string): number | undefined {
  const amount = raw.match(/\$\s*(\d[\d,]*(?:\.\d+)?)/)?.[1] ?? raw.match(/\d[\d,]*(?:\.\d+)?/)?.[0];
  if (!amount) return undefined;
  const parsed = parseFloat(amount.replace(/,/g, ""));
  return parsed > 0 && parsed < 100000 ? parsed : undefined;
}

/* ------------------------------------------------------------------ */
/* Main scraper                                                        */
/* ------------------------------------------------------------------ */

/**
 * Crawls every ProLabs category listing, page by page, and writes each newly
 * seen product and its price observation to the database.
 *
 * Pages that CloudFront blocks are counted and skipped, not retried. Database
 * errors are logged per product and do not abort the crawl.
 */
export async function scrapeProLabs(): Promise<void> {
  console.log("=== ProLabs Scraper Starting (PlaywrightCrawler + Firefox) ===\n");
  if (PROXY_URL) {
    // Mask the password part of the proxy URL before logging it.
    console.log(`Using proxy: ${PROXY_URL.replace(/:[^:@]+@/, ":***@")}`);
  } else {
    console.log("WARNING: No PROXY_URL set. CloudFront WAF blocks datacenter IPs.");
    console.log("Set PROXY_URL env var for residential proxy if running from VPS.\n");
  }

  const vendorId = await ensureVendor(
    "ProLabs",
    "compatible",
    "https://www.prolabs.com",
    "https://www.prolabs.com/products/networking/fiber-optics"
  );

  let totalProducts = 0;
  let priceUpdates = 0;
  let blockedPages = 0;
  // Dedup keys of every product handled so far, shared across all pages.
  const seenUrls = new Set<string>();
  // Map URL -> category metadata
  const urlToCat = new Map<string, Category>();

  const requestQueue = await RequestQueue.open();
  for (const cat of CATEGORIES) {
    const url = `${BASE}${cat.path}`;
    urlToCat.set(url, cat);
    await requestQueue.addRequest({ url, userData: { page: 1, catPath: cat.path } });
  }

  const crawler = new PlaywrightCrawler({
    requestQueue,
    maxConcurrency: 1,
    maxRequestsPerMinute: 10,
    requestHandlerTimeoutSecs: 120,
    navigationTimeoutSecs: 60,
    maxRequestRetries: 2,
    headless: true,
    // Override default blockedStatusCodes (normally [401, 403, 429]).
    // We allow 403 so our handler can inspect the page — CloudFront may
    // serve a JS challenge that resolves, or we can log the block gracefully.
    sessionPoolOptions: {
      blockedStatusCodes: [401, 429],
    },
    browserPoolOptions: {
      useFingerprints: false,
    },
    launchContext: {
      launcher: firefox,
      launchOptions: {
        firefoxUserPrefs: {
          "toolkit.telemetry.enabled": false,
          "privacy.trackingprotection.enabled": false,
        },
      },
    },
    ...(PROXY_URL
      ? { proxyConfiguration: new ProxyConfiguration({ proxyUrls: [PROXY_URL] }) }
      : {}),
    preNavigationHooks: [
      async ({ page }, goToOptions) => {
        // Realistic viewport
        await page.setViewportSize({ width: 1920, height: 1080 });
        // Override webdriver detection
        await page.addInitScript(() => {
          Object.defineProperty(navigator, "webdriver", { get: () => false });
        });
        if (goToOptions) {
          goToOptions.waitUntil = "load";
        }
      },
    ],

    async requestHandler({ page, request, log }) {
      const currentPage: number = request.userData?.page ?? 1;
      const catPath: string = request.userData?.catPath ?? "";
      // Resolve the category by URL, then by path, then fall back to the generic listing.
      const cat =
        urlToCat.get(request.url) ??
        CATEGORIES.find((c) => catPath === c.path) ??
        CATEGORIES[CATEGORIES.length - 1];
      urlToCat.set(request.url, cat);

      log.info(`[${cat.formFactor} ${cat.speed}] Page ${currentPage}: ${request.url}`);

      // Give JS challenges time to resolve
      await page.waitForTimeout(8000);

      // Check what we actually got
      const pageTitle = await page.title();
      const bodyText = await page.evaluate(() => document.body?.innerText?.slice(0, 500) || "");
      log.info(` Title: "${pageTitle}"`);

      // Detect CloudFront WAF block
      if (
        bodyText.includes("Request blocked") ||
        bodyText.includes("Access Denied") ||
        bodyText.includes("403 ERROR") ||
        pageTitle.includes("ERROR")
      ) {
        blockedPages++;
        log.warning(` CloudFront WAF blocked this page (${blockedPages} total blocked)`);
        if (blockedPages >= 3 && totalProducts === 0) {
          log.warning(` Multiple blocks detected — likely IP-level block. Consider using PROXY_URL.`);
        }
        return;
      }

      // Extract products via page.evaluate.
      // This callback runs in the browser, so it must stay self-contained:
      // no references to module-level helpers and no named inner functions.
      const productData = await page.evaluate(() => {
        const results: Array<{
          name: string;
          href: string;
          price: string;
          stock: string;
          partNumber: string;
        }> = [];

        // Strategy 1: Product card links
        const productLinks = document.querySelectorAll(
          'a[href*="/products/"], .product-card a, .product-item a, [class*="product"] a[href], .product-list a, .category-products a, [data-product] a'
        );
        for (const link of productLinks) {
          const el = link as HTMLAnchorElement;
          const name = el.textContent?.trim() || "";
          const href = el.getAttribute("href") || "";
          if (!name || name.length < 5 || name.length > 200 || !href) continue;
          if (/category|filter|sort|breadcrumb|login|cart|account/i.test(href) && !/products\//i.test(href)) continue;

          const container =
            el.closest('[class*="product"]') ||
            el.closest('[class*="item"]') ||
            el.closest('[class*="card"]') ||
            el.closest("li") ||
            el.parentElement?.parentElement?.parentElement;

          let price = "";
          let stock = "";
          let pn = "";
          if (container) {
            const priceEl = container.querySelector(
              '[class*="price"], [class*="Price"], [data-price], .price'
            );
            price = priceEl?.textContent?.trim() || "";
            if (!price) {
              const containerText = container.textContent || "";
              const priceMatch = containerText.match(/\$\s*[\d,]+\.?\d{0,2}/);
              if (priceMatch) price = priceMatch[0];
            }
            const stockEl = container.querySelector(
              '[class*="stock"], [class*="Stock"], [class*="avail"], [class*="Avail"]'
            );
            stock = stockEl?.textContent?.trim() || "";
            const skuEl = container.querySelector(
              '[class*="sku"], [class*="SKU"], [class*="part"], [class*="Part"], [class*="model"]'
            );
            pn = skuEl?.textContent?.trim() || "";
          }
          if (!pn) {
            // Drop the query/fragment first so ".html" really is at the end when stripped.
            pn = href.replace(/[?#].*$/, "").split("/").pop()?.replace(/\.html?$/, "") || "";
          }
          if (name && href.includes("/products/")) {
            results.push({ name, href, price, stock, partNumber: pn });
          }
        }

        // Strategy 2: Scan deeper for anchors with product URLs
        if (results.length === 0) {
          const allAnchors = document.querySelectorAll("a[href*='/products/']");
          for (const el of allAnchors) {
            const anchor = el as HTMLAnchorElement;
            const href = anchor.getAttribute("href") || "";
            const name = anchor.textContent?.trim() || "";
            if (!name || name.length < 5) continue;

            // Walk up to four ancestors looking for a "$" amount.
            let parent: Element | null = anchor;
            let price = "";
            for (let i = 0; i < 4 && parent; i++) {
              parent = parent.parentElement;
              if (parent) {
                const text = parent.textContent || "";
                const m = text.match(/\$\s*[\d,]+\.?\d{0,2}/);
                if (m) {
                  price = m[0];
                  break;
                }
              }
            }
            const pn = href.replace(/[?#].*$/, "").split("/").pop()?.replace(/\.html?$/, "") || "";
            results.push({ name, href, price, stock: "", partNumber: pn });
          }
        }

        // Strategy 3: JSON-LD structured data
        const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
        for (const script of ldScripts) {
          try {
            const data = JSON.parse(script.textContent || "");
            const entries = data.itemListElement || (Array.isArray(data) ? data : [data]);
            for (const entry of entries) {
              // ItemList entries are commonly ListItem wrappers with the product under `item`.
              const node = entry && entry.item && typeof entry.item === "object" ? entry.item : entry;
              if (!node || typeof node !== "object") continue;
              if (node["@type"] === "Product" || node.offers) {
                const name = typeof node.name === "string" ? node.name : "";
                const href = typeof node.url === "string" ? node.url : "";
                // `offers` may be a single Offer or an array of them.
                const offers = (Array.isArray(node.offers) ? node.offers[0] : node.offers) || {};
                const price = offers.price ? `$${offers.price}` : "";
                const stock = typeof offers.availability === "string" ? offers.availability : "";
                const pn = String(node.sku || node.mpn || href.split("/").pop() || "");
                if (name) results.push({ name, href, price, stock, partNumber: pn });
              }
            }
          } catch {
            /* ignore parse errors */
          }
        }

        return results;
      });

      log.info(` Raw items extracted: ${productData.length}`);

      // Process extracted products, dropping anything already seen on this or
      // an earlier page. The key is registered as soon as a product is
      // accepted, so repeated anchors for one product on the same page are
      // written only once.
      let foundCount = 0;
      const newProducts: Product[] = [];
      for (const item of productData) {
        if (!item.name) continue;
        const partNumber = (item.partNumber || item.name).slice(0, 80).trim();
        const name = item.name.slice(0, 200).trim();
        const url = item.href.startsWith("http") ? item.href : `${BASE}${item.href}`;
        const price = item.price ? parsePrice(item.price) : undefined;
        foundCount++;

        // Items without a link would all share the bare BASE url; key those by part number.
        const dedupKey = item.href ? url : `sku:${partNumber}`;
        if (seenUrls.has(dedupKey)) continue;
        seenUrls.add(dedupKey);

        const combined = name + " " + partNumber;
        const reach = detectReach(combined);
        const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);
        newProducts.push({
          partNumber,
          name,
          url,
          price,
          stockStatus: item.stock || undefined,
          formFactor,
          speed,
          speedGbps,
          reachLabel: reach?.label,
          reachMeters: reach?.meters,
          fiberType: detectFiber(combined),
          wavelength: detectWavelength(combined),
        });
      }

      log.info(` Parsed: ${foundCount} found, ${newProducts.length} new`);

      // Write to database
      for (const product of newProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });

          if (product.price && product.price > 0) {
            const hash = contentHash({
              price: product.price,
              part: product.partNumber,
              stock: product.stockStatus ?? "",
            });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: normalizeStockLevel(product.stockStatus),
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          log.warning(` DB error [${product.partNumber}]: ${message.slice(0, 80)}`);
        }
      }

      // Check for next page
      const hasNext = await page.evaluate((currentPageNum: number) => {
        const nextLink = document.querySelector('a[rel="next"], link[rel="next"]');
        if (nextLink) return true;

        const nextNum = currentPageNum + 1;
        const paginationLinks = document.querySelectorAll('a[href*="page="], .pagination a, nav a');
        for (const link of paginationLinks) {
          const href = (link as HTMLAnchorElement).getAttribute("href") || "";
          if (href.includes(`page=${nextNum}`)) return true;
          const text = link.textContent?.trim() || "";
          if (text === String(nextNum) || text.toLowerCase() === "next" || text === "\u203a" || text === "\u00bb")
            return true;
        }
        return false;
      }, currentPage);

      // Stop paginating once a page yields nothing new; this guards against
      // listings that keep serving the same products for any ?page=N.
      if (hasNext && currentPage < MAX_PAGES && newProducts.length > 0) {
        const nextPageNum = currentPage + 1;
        const nextUrl = `${BASE}${catPath}?page=${nextPageNum}`;
        urlToCat.set(nextUrl, cat);
        await requestQueue.addRequest({
          url: nextUrl,
          userData: { page: nextPageNum, catPath },
        });
        log.info(` Enqueued next page: ${nextPageNum}`);
      }
    },

    async failedRequestHandler({ request, log }) {
      log.error(`Request failed after retries: ${request.url}`);
    },
  });

  await crawler.run();

  console.log(`\n=== ProLabs Complete ===`);
  console.log(` Products processed: ${totalProducts}`);
  console.log(` Price updates: ${priceUpdates}`);
  console.log(` Pages blocked by WAF: ${blockedPages}`);
  if (blockedPages > 0 && totalProducts === 0) {
    console.log(`\n All pages blocked by CloudFront WAF (datacenter IP detected).`);
    console.log(` Fix: Set PROXY_URL=http://user:pass@proxy:port in .env`);
  }
}

// Run directly (not when imported): scrape, then close the DB pool.
if (require.main === module) {
  scrapeProLabs()
    .then(() => pool.end())
    .catch((err) => {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    });
}