From 6febb9c88e68aac7f50024018e73fa78c44cd845 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 11 Apr 2026 02:57:13 +0200 Subject: [PATCH] refactor(prolabs): replace Playwright+Firefox with fetch-based catalog scraper ProLabs uses B2B quote model - prices require reseller account and are not shown publicly (schema.org always shows price=0.00). Fighting CloudFront WAF with Firefox automation is pointless. New approach: - Sitemap-driven: downloads all 14 sitemaps to collect product URLs - fetch-based: curl-compatible HTTP requests bypass CloudFront TLS detection - catalog-only: writes part numbers + specs to transceivers table - Rate-limited: 300ms between requests (~3 req/sec) - No proxy needed: Pi nodes no longer consumed for ProLabs --- packages/scraper/src/scrapers/prolabs.ts | 631 ++++++++--------------- 1 file changed, 218 insertions(+), 413 deletions(-) diff --git a/packages/scraper/src/scrapers/prolabs.ts b/packages/scraper/src/scrapers/prolabs.ts index bf71d44..d358b00 100644 --- a/packages/scraper/src/scrapers/prolabs.ts +++ b/packages/scraper/src/scrapers/prolabs.ts @@ -1,59 +1,66 @@ /** - * ProLabs Scraper — Enterprise-grade compatible optics (Legrand subsidiary) + * ProLabs Scraper — Catalog-only (no public pricing) * - * prolabs.com — CloudFront WAF aggressively blocks datacenter IPs. - * Uses PlaywrightCrawler with Firefox for anti-detection. + * ProLabs (an Amphenol company) uses a B2B quote model — prices require a + * sales contact or reseller account and are NOT shown on the public website. + * The schema.org markup consistently shows price=0.00. * - * KNOWN ISSUE: CloudFront blocks all requests from IONOS/datacenter IPs - * (HTTP 403 "Request blocked"). This scraper works correctly from - * residential IPs. Solutions: - * 1. Set PROXY_URL env var to a residential/rotating proxy - * 2. Run from a residential IP (e.g. home server) - * 3. Route through WireGuard with internet breakout at home + * Approach: sitemap-driven fetch scraper (curl-compatible headers). + * CloudFront allows regular HTTP requests; only Playwright/browser automation + * gets blocked via TLS fingerprinting. * - * Products listed under /products/networking/fiber-optics/ category pages. - * Pagination via ?page=N. Rate limited: maxConcurrency 1, 10 req/min. + * Collects: part numbers, product names, form factors, specs (no prices). + * Writes: `transceivers` catalog entries via findOrCreateScrapedTransceiver. * - * SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR" + * SKU format examples: "SFP-10G-SR-PR", "Q28-100G-LR4-PR", "Q-4X10G-LR-PR" */ -import { PlaywrightCrawler, RequestQueue } from "crawlee"; -import { firefox } from "playwright"; -import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; -import { contentHash } from "../utils/hash"; +import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db"; const BASE = "https://www.prolabs.com"; -const MAX_PAGES = 100; -const PROXY_URL = process.env.PROXY_URL || ""; +const SITEMAP_INDEX = `${BASE}/sitemap.xml`; +const RATE_LIMIT_MS = 300; // ~3 req/sec — polite crawl -const CATEGORIES = [ - { path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 }, - { path: "/products/networking/fiber-optics/sfp-plus-modules", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, - { path: "/products/networking/fiber-optics/sfp28-modules", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, - { path: "/products/networking/fiber-optics/qsfp-plus-modules", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, - { path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, - { path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, - { path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, - { path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, -]; +const HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", +}; -interface Product { - partNumber: string; - name: string; - url: string; - price?: number; - stockStatus?: string; - formFactor: string; - speed: string; - speedGbps: number; - reachLabel?: string; - reachMeters?: number; - fiberType?: string; - wavelength?: string; +const TRANSCEIVER_KEYWORDS = /\b(sfp|qsfp|xfp|osfp|cfp|cxp|transceiver|fiber.optic|fibre.optic)\b/i; + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); } -/* ------------------------------------------------------------------ */ -/* Helper / detection functions (unchanged from original) */ -/* ------------------------------------------------------------------ */ +async function fetchText(url: string): Promise { + try { + const res = await fetch(url, { headers: HEADERS }); + if (!res.ok) return null; + return await res.text(); + } catch { + return null; + } +} + +function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } { + const lower = text.toLowerCase(); + if (lower.includes("osfp") && lower.includes("1600g")) return { formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 }; + if (lower.includes("osfp") && lower.includes("800g")) return { formFactor: "OSFP", speed: "800G", speedGbps: 800 }; + if (lower.includes("qsfp-dd") || lower.includes("qsfp dd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; + if (lower.includes("qsfp28") || lower.includes("100gbase")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; + if ((lower.includes("qsfp+") || lower.includes("qsfp plus")) && !lower.includes("28")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; + if (lower.includes("sfp28") || lower.includes("25gbase") || lower.includes("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 }; + if (lower.includes("sfp+") || lower.includes("10gbase") || lower.includes("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; + if (lower.includes("xfp")) return { formFactor: "XFP", speed: "10G", speedGbps: 10 }; + if (lower.includes("cfp2")) return { formFactor: "CFP2", speed: "100G", speedGbps: 100 }; + if (lower.includes("cfp4")) return { formFactor: "CFP4", speed: "100G", speedGbps: 100 }; + if (lower.includes("cfp")) return { formFactor: "CFP", speed: "100G", speedGbps: 100 }; + if (lower.includes("1000base") || lower.includes("1gbase")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; + if (lower.includes("sfp") && !lower.includes("qsfp")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; + if (lower.includes("qsfp")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; + return { formFactor: "SFP+", speed: "10G", speedGbps: 10 }; +} function detectReach(text: string): { label: string; meters: number } | undefined { const patterns: [RegExp, string, number][] = [ @@ -65,413 +72,211 @@ function detectReach(text: string): { label: string; meters: number } | undefine [/\b2\s*km\b/i, "2km", 2000], [/\b550\s*m\b/i, "550m", 550], [/\b500\s*m\b/i, "500m", 500], - [/\b400\s*m\b/i, "400m", 400], [/\b300\s*m\b/i, "300m", 300], - [/\b150\s*m\b/i, "150m", 150], [/\b100\s*m\b/i, "100m", 100], - [/\b30\s*m\b/i, "30m", 30], - [/\bLR4\b/, "10km", 10000], - [/\bLR\b/, "10km", 10000], - [/\bER4?\b/, "40km", 40000], - [/\bZR4?\b/, "80km", 80000], - [/\bSR4?\b/, "300m", 300], - [/\bDR4?\b/, "500m", 500], - [/\bFR4?\b/, "2km", 2000], + [/\bLR4\b/i, "10km", 10000], [/\b[^Z]LR\b/i, "10km", 10000], + [/\bER4?\b/i, "40km", 40000], [/\bZR4?\b/i, "80km", 80000], + [/\bSR4?\b/i, "300m", 300], [/\bDR4?\b/i, "500m", 500], + [/\bFR4?\b/i, "2km", 2000], ]; - for (const [regex, label, meters] of patterns) { - if (regex.test(text)) return { label, meters }; + for (const [rx, label, meters] of patterns) { + if (rx.test(text)) return { label, meters }; } return undefined; } function detectFiber(text: string): string { - if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; - if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; + if (/single.?mode|smf|\blx\b|\blr\b|\ber\b|\bzr\b|bidi|cwdm|dwdm/i.test(text)) return "SMF"; + if (/multi.?mode|mmf|\bsx\b|\bsr\b/i.test(text)) return "MMF"; if (/copper|dac|twinax|rj.?45|base-t|cat[56x]/i.test(text)) return "Copper"; return ""; } function detectWavelength(text: string): string { - const match = text.match(/(\d{3,4})\s*nm/i); - return match ? match[1] : ""; + const m = text.match(/(\d{3,4})\s*nm/i); + return m ? m[1] : ""; } -function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): { +/** Extract SKU from ProLabs product URL slug */ +function skuFromSlug(slug: string): string { + // URL pattern: /vendor-sku-c-description or /sku-description + // The -c- separator splits SKU from compatibility desc + const parts = slug.replace(/^\//, "").split("-c-"); + const base = parts[0]; + // ProLabs appends vendor code at end — extract the actual part number + // e.g. "extreme-sfp-10gbase-zr-100-i-ex" → "SFP-10GBASE-ZR-100-I-EX-C" + return base.toUpperCase().replace(/-+/g, "-"); +} + +/** Download and parse sitemap index to collect all sitemap URLs */ +async function fetchSitemapUrls(): Promise { + const index = await fetchText(SITEMAP_INDEX); + if (!index) { + console.warn(" Could not fetch sitemap index"); + return []; + } + + const sitemapUrls = [...index.matchAll(/([^<]+sitemap[^<]+\.xml)<\/loc>/gi)] + .map((m) => m[1].trim()); + + console.log(` Found ${sitemapUrls.length} sitemaps`); + return sitemapUrls; +} + +/** Download a single sitemap and extract product URLs */ +async function fetchProductUrlsFromSitemap(sitemapUrl: string): Promise { + const xml = await fetchText(sitemapUrl); + if (!xml) return []; + + return [...xml.matchAll(/([^<]+prolabs\.com\/[^<]+)<\/loc>/gi)] + .map((m) => m[1].trim()) + .filter((u) => { + // Filter out category pages, search, contact, etc. + const path = u.replace(/https?:\/\/[^/]+/, ""); + return !path.match(/^\/(search|contact|sitemap|sitemap|downloads|articles|industry|support|why-|prolabs-dense|prolabs-test|360-virtual|videos|multi-coded|case-studies|white-papers|faqs|built-for-ai|what-is-new|about|associations|careers|multi-source|legacy|rma|tech-support|warranty|wintune|edfamux|privacy|newsletter|where-to-buy|media-converter|multiservice|eon-omp|multiplexers|patch-cables|cassettes|adapter|server|desktop|network|a-v-cables|power-adapters|usb-cables|c-lc|c-sc|c-fc|c-st|c-mtp|c-mpo|c-mt|power-|usb-)/) + && !path.includes("memory") + && !path.includes("/cart") + && !path.includes("/order"); + }); +} + +/** Scrape a ProLabs product page for part number and specs */ +async function scrapeProductPage(url: string): Promise<{ + partNumber: string; + name: string; formFactor: string; speed: string; speedGbps: number; -} { - const upper = sku.toUpperCase(); - if (/^QDD[-_]|QSFP.DD/i.test(upper)) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }; - if (/^Q28[-_]|QSFP28/i.test(upper)) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 }; - if (/^Q[-_]4X|^Q[-_]/i.test(upper) && !/28/i.test(upper.slice(0, 5))) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 }; - if (/^SFP28[-_]|SFP-25/i.test(upper)) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 }; - if (/^S[-_]/i.test(upper) && !/sfp/i.test(upper.slice(1, 4))) return { formFactor: "SFP", speed: "1G", speedGbps: 1 }; - return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps }; -} + reachLabel?: string; + reachMeters?: number; + fiberType?: string; + wavelength?: string; +} | null> { + const html = await fetchText(url); + if (!html) return null; -function normalizeStockLevel( - raw?: string -): "in_stock" | "low_stock" | "out_of_stock" | "on_request" { - if (!raw) return "on_request"; - const lower = raw.toLowerCase(); - if (lower.includes("in stock") || lower.includes("available")) return "in_stock"; - if (lower.includes("out of stock") || lower.includes("backordered")) return "out_of_stock"; - if (lower.includes("low stock") || lower.includes("limited")) return "low_stock"; - return "on_request"; -} + // Extract title from tag + const titleM = html.match(/<title>([^<]+)<\/title>/i); + const pageTitle = titleM ? titleM[1].replace(/\s*\|\s*[^|]+$/, "").trim() : ""; -/* ------------------------------------------------------------------ */ -/* Main scraper */ -/* ------------------------------------------------------------------ */ + // Extract H1 (canonical product name) + const h1M = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i); + const h1 = h1M ? h1M[1].replace(/<[^>]+>/g, "").replace(/&#x[0-9A-F]+;/gi, (e) => + String.fromCharCode(parseInt(e.slice(3, -1), 16))).replace(/&/g, "&").trim() : ""; + + // Extract SKU from nopCommerce structure + const skuM = html.match(/id="sku-\d+"[^>]*>([^<]+)<\/span>/i); + const pageSku = skuM ? skuM[1].trim() : ""; + + // If no transceiver keywords found, skip + const combined = [pageTitle, h1, pageSku].join(" "); + if (!TRANSCEIVER_KEYWORDS.test(combined) && !combined.includes("-PR")) return null; + + // Use page SKU if available, otherwise derive from URL slug + const slug = url.replace(/https?:\/\/[^/]+/, ""); + const partNumber = pageSku || skuFromSlug(slug); + const name = h1 || pageTitle; + + if (!name || name.length < 5) return null; + + const { formFactor, speed, speedGbps } = detectFormFactor(combined); + const reach = detectReach(combined); + + return { + partNumber, + name, + formFactor, + speed, + speedGbps, + reachLabel: reach?.label, + reachMeters: reach?.meters, + fiberType: detectFiber(combined), + wavelength: detectWavelength(combined), + }; +} export async function scrapeProLabs(): Promise<void> { - console.log("=== ProLabs Scraper Starting (PlaywrightCrawler + Firefox) ===\n"); - - if (PROXY_URL) { - console.log(`Using proxy: ${PROXY_URL.replace(/:[^:@]+@/, ":***@")}`); - } else { - console.log("WARNING: No PROXY_URL set. CloudFront WAF blocks datacenter IPs."); - console.log("Set PROXY_URL env var for residential proxy if running from VPS.\n"); - } + console.log("=== ProLabs Catalog Scraper (fetch-based, no Playwright) ===\n"); + console.log("Note: ProLabs uses B2B quote model — catalog data only, no public prices.\n"); const vendorId = await ensureVendor( "ProLabs", "compatible", "https://www.prolabs.com", - "https://www.prolabs.com/products/networking/fiber-optics" + "https://www.prolabs.com/transceivers" ); - let totalProducts = 0; - let priceUpdates = 0; - let blockedPages = 0; - const seenUrls = new Set<string>(); + let totalProcessed = 0; + let totalNew = 0; + let skipped = 0; - // Map URL -> category metadata - const urlToCat = new Map<string, typeof CATEGORIES[number]>(); + // Step 1: Collect product URLs from sitemaps + console.log("Step 1: Collecting sitemaps..."); + const sitemapUrls = await fetchSitemapUrls(); - const requestQueue = await RequestQueue.open(); - - for (const cat of CATEGORIES) { - const url = `${BASE}${cat.path}`; - urlToCat.set(url, cat); - await requestQueue.addRequest({ url, userData: { page: 1, catPath: cat.path } }); + const productUrls: string[] = []; + let sitemapCount = 0; + for (const smUrl of sitemapUrls) { + const urls = await fetchProductUrlsFromSitemap(smUrl); + productUrls.push(...urls); + sitemapCount++; + if (sitemapCount % 3 === 0) { + console.log(` Processed ${sitemapCount}/${sitemapUrls.length} sitemaps, ${productUrls.length} product URLs so far`); + } + await sleep(RATE_LIMIT_MS); } - const crawler = new PlaywrightCrawler({ - requestQueue, - maxConcurrency: 1, - maxRequestsPerMinute: 10, - requestHandlerTimeoutSecs: 120, - navigationTimeoutSecs: 60, - maxRequestRetries: 2, - headless: true, - // Override default blockedStatusCodes (normally [401, 403, 429]). - // We allow 403 so our handler can inspect the page — CloudFront may - // serve a JS challenge that resolves, or we can log the block gracefully. - sessionPoolOptions: { - blockedStatusCodes: [401, 429], - }, - browserPoolOptions: { - useFingerprints: false, - }, - launchContext: { - launcher: firefox, - launchOptions: { - firefoxUserPrefs: { - "toolkit.telemetry.enabled": false, - "privacy.trackingprotection.enabled": false, - }, - }, - }, - ...(PROXY_URL ? { - proxyConfiguration: new (require("crawlee").ProxyConfiguration)({ - proxyUrls: [PROXY_URL], - }), - } : {}), - preNavigationHooks: [ - async ({ page }, goToOptions) => { - // Realistic viewport - await page.setViewportSize({ width: 1920, height: 1080 }); + // Deduplicate + const uniqueUrls = [...new Set(productUrls)]; + console.log(`\nStep 2: Found ${uniqueUrls.length} unique product URLs\n`); - // Override webdriver detection - await page.addInitScript(() => { - Object.defineProperty(navigator, "webdriver", { get: () => false }); - }); - - if (goToOptions) { - goToOptions.waitUntil = "load"; - } - }, - ], - - async requestHandler({ page, request, log }) { - const currentPage: number = request.userData?.page ?? 1; - const catPath: string = request.userData?.catPath ?? ""; - - const cat = urlToCat.get(request.url) ?? - CATEGORIES.find((c) => catPath === c.path) ?? - CATEGORIES[CATEGORIES.length - 1]; - urlToCat.set(request.url, cat); - - log.info(`[${cat.formFactor} ${cat.speed}] Page ${currentPage}: ${request.url}`); - - // Give JS challenges time to resolve - await page.waitForTimeout(8000); - - // Check what we actually got - const pageTitle = await page.title(); - const bodyText = await page.evaluate(() => document.body?.innerText?.slice(0, 500) || ""); - log.info(` Title: "${pageTitle}"`); - - // Detect CloudFront WAF block - if (bodyText.includes("Request blocked") || - bodyText.includes("Access Denied") || - bodyText.includes("403 ERROR") || - pageTitle.includes("ERROR")) { - blockedPages++; - log.warning(` CloudFront WAF blocked this page (${blockedPages} total blocked)`); - if (blockedPages >= 3 && totalProducts === 0) { - log.warning(` Multiple blocks detected — likely IP-level block. Consider using PROXY_URL.`); - } - return; - } - - // Extract products via page.evaluate - const productData = await page.evaluate(() => { - const results: Array<{ - name: string; - href: string; - price: string; - stock: string; - partNumber: string; - }> = []; - - // Strategy 1: Product card links - const productLinks = document.querySelectorAll( - 'a[href*="/products/"], .product-card a, .product-item a, [class*="product"] a[href], .product-list a, .category-products a, [data-product] a' - ); - - for (const link of productLinks) { - const el = link as HTMLAnchorElement; - const name = el.textContent?.trim() || ""; - const href = el.getAttribute("href") || ""; - - if (!name || name.length < 5 || name.length > 200 || !href) continue; - if (/category|filter|sort|breadcrumb|login|cart|account/i.test(href) && !/products\//i.test(href)) continue; - - const container = - el.closest('[class*="product"]') || - el.closest('[class*="item"]') || - el.closest('[class*="card"]') || - el.closest("li") || - el.parentElement?.parentElement?.parentElement; - - let price = ""; - let stock = ""; - let pn = ""; - - if (container) { - const priceEl = container.querySelector( - '[class*="price"], [class*="Price"], [data-price], .price' - ); - price = priceEl?.textContent?.trim() || ""; - if (!price) { - const containerText = container.textContent || ""; - const priceMatch = containerText.match(/\$\s*[\d,]+\.?\d{0,2}/); - if (priceMatch) price = priceMatch[0]; - } - - const stockEl = container.querySelector( - '[class*="stock"], [class*="Stock"], [class*="avail"], [class*="Avail"]' - ); - stock = stockEl?.textContent?.trim() || ""; - - const skuEl = container.querySelector( - '[class*="sku"], [class*="SKU"], [class*="part"], [class*="Part"], [class*="model"]' - ); - pn = skuEl?.textContent?.trim() || ""; - } - - if (!pn) { - pn = href.split("/").pop()?.replace(/\.html?$/, "")?.replace(/#.*$/, "") || ""; - } - - if (name && href.includes("/products/")) { - results.push({ name, href, price, stock, partNumber: pn }); - } - } - - // Strategy 2: Scan deeper for anchors with product URLs - if (results.length === 0) { - const allAnchors = document.querySelectorAll("a[href*='/products/']"); - for (const el of allAnchors) { - const anchor = el as HTMLAnchorElement; - const href = anchor.getAttribute("href") || ""; - const name = anchor.textContent?.trim() || ""; - if (!name || name.length < 5) continue; - - let parent: Element | null = anchor; - let price = ""; - for (let i = 0; i < 4 && parent; i++) { - parent = parent.parentElement; - if (parent) { - const text = parent.textContent || ""; - const m = text.match(/\$\s*[\d,]+\.?\d{0,2}/); - if (m) { price = m[0]; break; } - } - } - - const pn = href.split("/").pop()?.replace(/\.html?$/, "") || ""; - results.push({ name, href, price, stock: "", partNumber: pn }); - } - } - - // Strategy 3: JSON-LD structured data - const ldScripts = document.querySelectorAll('script[type="application/ld+json"]'); - for (const script of ldScripts) { - try { - const data = JSON.parse(script.textContent || ""); - const items = data.itemListElement || (Array.isArray(data) ? data : [data]); - for (const item of items) { - if (item["@type"] === "Product" || item.offers) { - const name = item.name || ""; - const href = item.url || ""; - const offers = item.offers || {}; - const price = offers.price ? `$${offers.price}` : ""; - const stock = offers.availability || ""; - const pn = item.sku || item.mpn || href.split("/").pop() || ""; - if (name) results.push({ name, href, price, stock, partNumber: pn }); - } - } - } catch { /* ignore parse errors */ } - } - - return results; - }); - - log.info(` Raw items extracted: ${productData.length}`); - - // Process extracted products - const pageProducts: Product[] = []; - - for (const item of productData) { - if (!item.name) continue; - - const partNumber = (item.partNumber || item.name).slice(0, 80).trim(); - const name = item.name.slice(0, 200).trim(); - const url = item.href.startsWith("http") ? item.href : `${BASE}${item.href}`; - - let price: number | undefined; - if (item.price) { - const cleaned = item.price.replace(/[^\d.,]/g, "").replace(",", ""); - const parsed = parseFloat(cleaned); - if (parsed > 0 && parsed < 100000) price = parsed; - } - - const combined = name + " " + partNumber; - const reach = detectReach(combined); - const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat); - - pageProducts.push({ - partNumber, name, url, price, - stockStatus: item.stock || undefined, - formFactor, speed, speedGbps, - reachLabel: reach?.label, - reachMeters: reach?.meters, - fiberType: detectFiber(combined), - wavelength: detectWavelength(combined), - }); - } - - // Deduplicate against global set - const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url)); - for (const p of newProducts) seenUrls.add(p.url); - - log.info(` Parsed: ${pageProducts.length} found, ${newProducts.length} new`); - - // Write to database - for (const product of newProducts) { - try { - const txId = await findOrCreateScrapedTransceiver({ - partNumber: product.partNumber, - vendorId, - formFactor: product.formFactor, - speedGbps: product.speedGbps, - speed: product.speed, - reachMeters: product.reachMeters, - reachLabel: product.reachLabel, - fiberType: product.fiberType, - wavelengths: product.wavelength, - category: "DataCenter", - }); - - if (product.price && product.price > 0) { - const hash = contentHash({ - price: product.price, - part: product.partNumber, - stock: product.stockStatus ?? "", - }); - const updated = await upsertPriceObservation({ - transceiverId: txId, - sourceVendorId: vendorId, - price: product.price, - currency: "USD", - stockLevel: normalizeStockLevel(product.stockStatus), - url: product.url, - contentHash: hash, - }); - if (updated) priceUpdates++; - } - - totalProducts++; - } catch (err) { - log.warning(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`); - } - } - - // Check for next page - const hasNext = await page.evaluate((currentPageNum: number) => { - const nextLink = document.querySelector('a[rel="next"], link[rel="next"]'); - if (nextLink) return true; - const nextNum = currentPageNum + 1; - const paginationLinks = document.querySelectorAll('a[href*="page="], .pagination a, nav a'); - for (const link of paginationLinks) { - const href = (link as HTMLAnchorElement).getAttribute("href") || ""; - if (href.includes(`page=${nextNum}`)) return true; - const text = link.textContent?.trim() || ""; - if (text === String(nextNum) || text.toLowerCase() === "next" || text === "\u203a" || text === "\u00bb") return true; - } - return false; - }, currentPage); - - if (hasNext && currentPage < MAX_PAGES && newProducts.length > 0) { - const nextPageNum = currentPage + 1; - const nextUrl = `${BASE}${catPath}?page=${nextPageNum}`; - urlToCat.set(nextUrl, cat); - await requestQueue.addRequest({ - url: nextUrl, - userData: { page: nextPageNum, catPath }, - }); - log.info(` Enqueued next page: ${nextPageNum}`); - } - }, - - async failedRequestHandler({ request, log }) { - log.error(`Request failed after retries: ${request.url}`); - }, + // Step 3: Scrape each product page + // Limit to transceiver-related URLs first (filter by keyword in slug) + const transceiverUrls = uniqueUrls.filter((u) => { + const path = u.toLowerCase(); + return TRANSCEIVER_KEYWORDS.test(path) || path.includes("-pr") || path.includes("transceiver"); }); - await crawler.run(); + console.log(`Step 3: Scraping ${transceiverUrls.length} transceiver-related pages...\n`); - console.log(`\n=== ProLabs Complete ===`); - console.log(` Products processed: ${totalProducts}`); - console.log(` Price updates: ${priceUpdates}`); - console.log(` Pages blocked by WAF: ${blockedPages}`); - if (blockedPages > 0 && totalProducts === 0) { - console.log(`\n All pages blocked by CloudFront WAF (datacenter IP detected).`); - console.log(` Fix: Set PROXY_URL=http://user:pass@proxy:port in .env`); + for (const url of transceiverUrls) { + try { + const product = await scrapeProductPage(url); + totalProcessed++; + + if (!product) { + skipped++; + } else { + await findOrCreateScrapedTransceiver({ + partNumber: product.partNumber.slice(0, 80), + vendorId, + formFactor: product.formFactor, + speedGbps: product.speedGbps, + speed: product.speed, + reachMeters: product.reachMeters, + reachLabel: product.reachLabel, + fiberType: product.fiberType, + wavelengths: product.wavelength, + category: "DataCenter", + }); + totalNew++; + + if (totalNew % 50 === 0) { + console.log(` ${totalNew} catalog entries written (${totalProcessed} pages processed)`); + } + } + + await sleep(RATE_LIMIT_MS); + } catch (err) { + console.warn(` Error [${url.slice(-60)}]: ${(err as Error).message.slice(0, 80)}`); + } } + + console.log(`\n=== ProLabs Catalog Complete ===`); + console.log(` Pages processed: ${totalProcessed}`); + console.log(` Catalog entries written: ${totalNew}`); + console.log(` Skipped (non-transceiver): ${skipped}`); + console.log(` Note: No prices (B2B quote model)`); } if (require.main === module) {