From 0b07490114c7a440e1f7a3d9527f0a3e35c543f3 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Tue, 31 Mar 2026 07:32:02 +0200 Subject: [PATCH] chore: sync local changes --- packages/scraper/src/scheduler.ts | 41 +- packages/scraper/src/scrapers/cisco-tmg.ts | 250 ++++++---- packages/scraper/src/scrapers/fluxlight.ts | 12 +- packages/scraper/src/scrapers/gbics.ts | 19 +- packages/scraper/src/scrapers/news.ts | 19 +- packages/scraper/src/scrapers/prolabs.ts | 538 +++++++++++++-------- 6 files changed, 521 insertions(+), 358 deletions(-) diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index cf08518..16bde6d 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -61,9 +61,6 @@ export async function registerSchedules(boss: PgBoss): Promise { "scrape:pricing:10gtek", "scrape:pricing:atgbics", "scrape:pricing:prolabs", - "scrape:pricing:naddod", - "scrape:pricing:qsfptek", - "scrape:pricing:addon", "scrape:compat:cisco", "scrape:pricing:flexoptix", "scrape:vendors:flexoptix", @@ -117,30 +114,12 @@ export async function registerSchedules(boss: PgBoss): Promise { expireInSeconds: 3600, }); - // ProLabs pricing (every 8 hours — server-rendered HTML, USD prices) + // ProLabs pricing (every 8 hours — Playwright, needs proxy for CloudFront) await boss.schedule("scrape:pricing:prolabs", "0 4/8 * * *", {}, { retryLimit: 2, expireInSeconds: 3600, }); - // NADDOD pricing (every 8 hours — WooCommerce, USD prices) - await boss.schedule("scrape:pricing:naddod", "0 5/8 * * *", {}, { - retryLimit: 2, - expireInSeconds: 3600, - }); - - // QSFPTEK pricing (every 10 hours — custom PHP shop, USD prices) - await boss.schedule("scrape:pricing:qsfptek", "0 3/10 * * *", {}, { - retryLimit: 2, - expireInSeconds: 3600, - }); - - // AddOn Networks pricing (every 12 hours — enterprise site, USD prices) - await boss.schedule("scrape:pricing:addon", "0 6/12 * * *", {}, { - retryLimit: 2, - expireInSeconds: 3600, - }); - // Flexoptix catalog (every 6 hours — fetch-based, fast) await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, { retryLimit: 2, @@ -173,9 +152,6 @@ export async function registerWorkers(boss: PgBoss): Promise { const { scrapeNews } = await import("./scrapers/news"); const { scrapeAtgbics } = await import("./scrapers/atgbics"); const { scrapeProLabs } = await import("./scrapers/prolabs"); - const { scrapeNaddod } = await import("./scrapers/naddod"); - const { scrapeQsfptek } = await import("./scrapers/qsfptek"); - const { scrapeAddonNetworks } = await import("./scrapers/addon-networks"); await boss.work("scrape:pricing:fs", async (_job) => { console.log(`[${new Date().toISOString()}] Running: FS.com pricing`); @@ -222,21 +198,6 @@ export async function registerWorkers(boss: PgBoss): Promise { await withIsolatedStorage("prolabs", scrapeProLabs); }); - await boss.work("scrape:pricing:naddod", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: NADDOD pricing`); - await scrapeNaddod(); - }); - - await boss.work("scrape:pricing:qsfptek", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: QSFPTEK pricing`); - await scrapeQsfptek(); - }); - - await boss.work("scrape:pricing:addon", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: AddOn Networks pricing`); - await scrapeAddonNetworks(); - }); - await boss.work("scrape:faq", async (_job) => { console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`); }); diff --git a/packages/scraper/src/scrapers/cisco-tmg.ts b/packages/scraper/src/scrapers/cisco-tmg.ts index 79cb6fd..dc1c5e1 100644 --- a/packages/scraper/src/scrapers/cisco-tmg.ts +++ b/packages/scraper/src/scrapers/cisco-tmg.ts @@ -1,27 +1,101 @@ /** * Cisco TMG Matrix Scraper — Transceiver Compatibility * - * Source: tmgmatrix.cisco.com + * Source: tmgmatrix.cisco.com (JSON API — no auth required) * Extracts: Switch model ↔ Transceiver compatibility data * Stores: switches, compatibility table * - * The TMG Matrix has a JSON API behind the scenes. + * Uses POST /public/api/networkdevice/search endpoint directly. */ -import { CheerioCrawler } from "crawlee"; import { pool, ensureVendor } from "../utils/db"; -const TMG_BASE = "https://tmgmatrix.cisco.com"; +const TMG_API = "https://tmgmatrix.cisco.com/public/api/networkdevice/search"; -interface TmgEntry { - switchModel: string; - switchSeries: string; - transceiverPid: string; - transceiverDescription: string; - speed: string; +interface TmgTransceiver { + tmgId: number; + productId: string; + productFamily: string; + formFactor: string; reach: string; + temperatureRange: string; cableType: string; - connector: string; - minSoftware: string; + media: string; + connectorType: string; + transmissionStandard: string; + dataRate: string; + endOfSale: string; + softReleaseMinVer: string; + breakoutMode: string; + osType: string; + domSupport: string; + type: string; +} + +interface TmgCompatEntry { + productId: string; // switch PID + transceivers: TmgTransceiver[]; +} + +interface TmgDevice { + productFamily: string; + networkAndTransceiverCompatibility: TmgCompatEntry[]; +} + +interface TmgSearchResponse { + totalCount: number; + filters: Array<{ name: string; values: Array<{ id: number; name: string; count: number }> }>; + networkDevices: TmgDevice[]; +} + +/** Key Nexus/Catalyst platform family IDs from the TMG API */ +const PLATFORM_FAMILIES = [ + { id: 74, name: "N9300" }, // Nexus 9300 — 8,515 entries + { id: 77, name: "N9500" }, // Nexus 9500 — 2,266 entries + { id: 78, name: "N9200" }, // Nexus 9200 — 708 entries + { id: 661, name: "N9800" }, // Nexus 9800 — 238 entries + { id: 76, name: "C9300" }, // Catalyst 9300 — 260 entries + { id: 601, name: "C9300L" }, // Catalyst 9300L — 720 entries + { id: 1181, name: "C9300X" }, // Catalyst 9300X — 413 entries + { id: 8, name: "C9500" }, // Catalyst 9500 — 1,141 entries + { id: 521, name: "C9600" }, // Catalyst 9600 — 771 entries + { id: 7, name: "C9400" }, // Catalyst 9400 — 561 entries + { id: 341, name: "C9200" }, // Catalyst 9200 — 222 entries + { id: 83, name: "ASR9000" }, // ASR 9000 — 3,644 entries +]; + +async function searchTmg(familyFilter: { id: number; name: string }): Promise { + const body = { + cableType: [], + dataRate: [], + formFactor: [], + reach: [], + searchInput: [""], + osType: [], + transceiverProductFamily: [], + transceiverProductID: [], + networkDeviceProductFamily: [familyFilter], + networkDeviceProductID: [], + media: [], + connectorType: [], + caseTemperature: [], + performanceMonitoring: [], + }; + + const res = await fetch(TMG_API, { + method: "POST", + headers: { + "Content-Type": "application/json", + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", + "Accept": "application/json", + }, + body: JSON.stringify(body), + }); + + if (!res.ok) { + throw new Error(`TMG API ${res.status}: ${res.statusText}`); + } + + return res.json() as Promise; } async function upsertCiscoSwitch(vendorId: string, model: string, series: string): Promise { @@ -38,18 +112,31 @@ async function upsertCiscoSwitch(vendorId: string, model: string, series: string async function upsertCompatibility( switchId: string, transceiverId: string, - firmwareMin: string + firmwareMin: string, + formFactor: string, + reach: string, + cableType: string, + media: string, + dataRate: string ): Promise { await pool.query( - `INSERT INTO compatibility (switch_id, transceiver_id, verified_by, verification_method, status, firmware_min, source_url) - VALUES ($1, $2, 'Cisco TMG Matrix', 'vendor_matrix', 'compatible', $3, $4) - ON CONFLICT (switch_id, transceiver_id) DO UPDATE SET firmware_min = EXCLUDED.firmware_min`, - [switchId, transceiverId, firmwareMin || null, TMG_BASE] + `INSERT INTO compatibility (switch_id, transceiver_id, verified_by, verification_method, status, firmware_min, source_url, notes) + VALUES ($1, $2, 'Cisco TMG Matrix', 'vendor_matrix', 'compatible', $3, $4, $5) + ON CONFLICT (switch_id, transceiver_id) DO UPDATE SET + firmware_min = EXCLUDED.firmware_min, + notes = EXCLUDED.notes`, + [ + switchId, + transceiverId, + firmwareMin || null, + "https://tmgmatrix.cisco.com", + `${formFactor} ${dataRate} ${reach} ${media} ${cableType}`.trim(), + ] ); } export async function scrapeCiscoTmg(): Promise { - console.log("=== Cisco TMG Matrix Scraper Starting ===\n"); + console.log("=== Cisco TMG Matrix Scraper Starting (API mode) ===\n"); const ciscoVendorId = await ensureVendor( "Cisco", @@ -58,90 +145,69 @@ export async function scrapeCiscoTmg(): Promise { undefined ); - const entries: TmgEntry[] = []; + let totalSwitches = 0; + let totalCompat = 0; + let totalTransceivers = 0; - // TMG Matrix uses a search API - // First, try the public HTML interface - const crawler = new CheerioCrawler({ - maxConcurrency: 1, - maxRequestsPerMinute: 10, // Very respectful — Cisco rate limits aggressively - - async requestHandler({ request, $, log }) { - log.info(`Scraping: ${request.url}`); - - // The TMG Matrix renders a table with compatibility data - $("table tbody tr, .matrix-row, [class*='result-row']").each((_i, el) => { - const $row = $(el); - const cells = $row.find("td").map((_j, td) => $(td).text().trim()).get(); - - if (cells.length >= 4) { - entries.push({ - switchModel: cells[0] || "", - switchSeries: cells[0]?.split(" ")[0] || "Nexus", - transceiverPid: cells[1] || "", - transceiverDescription: cells[2] || "", - speed: cells[3] || "", - reach: cells[4] || "", - cableType: cells[5] || "", - connector: cells[6] || "", - minSoftware: cells[7] || "", - }); - } - }); - }, - }); - - // Start with Nexus switches (most relevant for Flexoptix) - await crawler.run([ - `${TMG_BASE}/public/tmg?searchValue=Nexus+9000`, - `${TMG_BASE}/public/tmg?searchValue=Nexus+3000`, - `${TMG_BASE}/public/tmg?searchValue=Nexus+7000`, - `${TMG_BASE}/public/tmg?searchValue=Catalyst+9000`, - ]); - - console.log(`\nEntries found: ${entries.length}`); - - // Write to database - let switches = 0; - let compat = 0; - - for (const entry of entries) { - if (!entry.switchModel || !entry.transceiverPid) continue; + for (const family of PLATFORM_FAMILIES) { + console.log(`\nFetching ${family.name}...`); try { - const switchId = await upsertCiscoSwitch( - ciscoVendorId, - entry.switchModel, - entry.switchSeries - ); - switches++; + const data = await searchTmg(family); + console.log(` ${family.name}: ${data.totalCount} total entries, ${data.networkDevices.length} device groups`); - // Try to match transceiver in our DB - const txResult = await pool.query( - `SELECT id FROM transceivers - WHERE part_number = $1 - OR slug LIKE $2 - OR standard_name ILIKE $3 - LIMIT 1`, - [ - entry.transceiverPid, - `%${entry.transceiverPid.toLowerCase().replace(/[^a-z0-9]/g, "")}%`, - `%${entry.speed}%${entry.reach}%`, - ] - ); + for (const device of data.networkDevices) { + for (const compat of device.networkAndTransceiverCompatibility) { + if (!compat.productId) continue; - if (txResult.rows.length > 0) { - await upsertCompatibility(switchId, txResult.rows[0].id, entry.minSoftware); - compat++; + const switchId = await upsertCiscoSwitch( + ciscoVendorId, + compat.productId, + device.productFamily + ); + totalSwitches++; + + for (const tx of compat.transceivers) { + if (!tx.productId) continue; + totalTransceivers++; + + // Try to match transceiver in our DB by Cisco PID + const txResult = await pool.query( + `SELECT id FROM transceivers + WHERE part_number = $1 + OR part_number = $2 + LIMIT 1`, + [tx.productId, tx.productId.replace(/-S$/, "")] + ); + + if (txResult.rows.length > 0) { + await upsertCompatibility( + switchId, + txResult.rows[0].id, + tx.softReleaseMinVer, + tx.formFactor, + tx.reach, + tx.cableType, + tx.media, + tx.dataRate + ); + totalCompat++; + } + } + } } + + // Rate limit: 2 seconds between platform families + await new Promise((r) => setTimeout(r, 2000)); } catch (err) { - // Skip duplicates silently + console.error(` Error fetching ${family.name}:`, err); } } - console.log(`Switches upserted: ${switches}`); - console.log(`Compatibility entries: ${compat}`); - console.log("=== Cisco TMG Scraper Complete ===\n"); + console.log(`\n=== Cisco TMG Scraper Complete ===`); + console.log(` Switches upserted: ${totalSwitches}`); + console.log(` Transceiver entries scanned: ${totalTransceivers}`); + console.log(` Compatibility matches: ${totalCompat}\n`); } if (require.main === module) { diff --git a/packages/scraper/src/scrapers/fluxlight.ts b/packages/scraper/src/scrapers/fluxlight.ts index c786e6e..03004e5 100644 --- a/packages/scraper/src/scrapers/fluxlight.ts +++ b/packages/scraper/src/scrapers/fluxlight.ts @@ -1,7 +1,7 @@ /** * Fluxlight Scraper — US-based compatible transceiver vendor * - * fluxlight.com — BigCommerce, server-rendered HTML with real prices. + * www.fluxlight.com — BigCommerce, server-rendered HTML with real prices. * ~144+ products across 6 pages. Uses pagination via ?page=N. * * Rate limited: 1 req/2sec. @@ -91,8 +91,8 @@ function parseProductList(html: string): Product[] { const products: Product[] = []; // BigCommerce product card pattern: product link + price - // Pattern: Product Name ... $29.99 - const productRegex = /href="(https?:\/\/fluxlight\.com\/[^"]*-FL\/)"[^>]*>\s*([^<]{10,})<\/a>/gi; + // Pattern: Product Name ... $29.99 + const productRegex = /href="(https?:\/\/(?:www\.)?fluxlight\.com\/[^"]*-FL\/)"[^>]*>\s*([^<]{10,})<\/a>/gi; let match; while ((match = productRegex.exec(html)) !== null) { const url = match[1]; @@ -123,7 +123,7 @@ function parseProductList(html: string): Product[] { // Fallback: broader link pattern if (products.length === 0) { - const simpleRegex = /href="(https?:\/\/fluxlight\.com\/[^"]+)"[^>]*>([^<]{10,}(?:SFP|QSFP|XFP|Base)[^<]*)<\/a>/gi; + const simpleRegex = /href="(https?:\/\/(?:www\.)?fluxlight\.com\/[^"]+)"[^>]*>([^<]{10,}(?:SFP|QSFP|XFP|Base)[^<]*)<\/a>/gi; while ((match = simpleRegex.exec(html)) !== null) { const url = match[1]; const name = match[2].trim(); @@ -166,7 +166,7 @@ async function fetchPage(url: string): Promise { export async function scrapeFluxlight(): Promise { console.log("=== Fluxlight Scraper Starting ===\n"); - const vendorId = await ensureVendor("Fluxlight", "compatible", "https://fluxlight.com", "https://fluxlight.com/transceivers/"); + const vendorId = await ensureVendor("Fluxlight", "compatible", "https://fluxlight.com", "https://www.fluxlight.com/transceivers/"); let allProducts: Product[] = []; @@ -210,7 +210,7 @@ export async function scrapeFluxlight(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash({ price: product.price, part: product.partNumber }); + const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, price: product.price, currency: "USD", diff --git a/packages/scraper/src/scrapers/gbics.ts b/packages/scraper/src/scrapers/gbics.ts index dfe1a7d..238e1db 100644 --- a/packages/scraper/src/scrapers/gbics.ts +++ b/packages/scraper/src/scrapers/gbics.ts @@ -8,7 +8,7 @@ import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { contentHash } from "../utils/hash"; -const BASE = "https://gbics.com"; +const BASE = "https://www.gbics.com"; const HEADERS = { "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)", Accept: "text/html,application/xhtml+xml", @@ -100,7 +100,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product // BigCommerce card-title pattern: // - const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/gbics\.com\/[^"]+)"\s+data-event-type="product-click"/gi; + const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*data-event-type="product-click"/gi; let match; while ((match = productRegex.exec(collapsed)) !== null) { const label = match[1].trim(); @@ -110,7 +110,14 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product // Split on last comma to separate name and price const priceInLabel = label.match(/,\s*£\s*([\d,.]+)\s*$/); const name = priceInLabel ? label.slice(0, label.lastIndexOf(",")).trim() : label; - const price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined; + let price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined; + + // Fallback: extract price from data-price-asc attribute on parent
  • + if (!price) { + const priceContext = collapsed.slice(Math.max(0, match.index - 500), match.index); + const dataPriceMatch = priceContext.match(/data-price-asc="(\d+)"/); + if (dataPriceMatch) price = parseFloat(dataPriceMatch[1]); + } if (name.length < 10) continue; @@ -131,7 +138,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product // Fallback: try "Now: £XX.XX" pattern near product links if (products.length === 0) { - const altRegex = /href="(https?:\/\/gbics\.com\/[^"]+)"[^>]*>\s*([^<]{15,})<\/a>/gi; + const altRegex = /href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*>\s*([^<]{15,})<\/a>/gi; while ((match = altRegex.exec(collapsed)) !== null) { const url = match[1]; const name = match[2].trim(); @@ -172,7 +179,7 @@ async function fetchPage(url: string): Promise { export async function scrapeGbics(): Promise { console.log("=== GBICS.com Scraper Starting ===\n"); - const vendorId = await ensureVendor("GBICS", "compatible", "https://gbics.com", "https://gbics.com/optical-transceivers/"); + const vendorId = await ensureVendor("GBICS", "compatible", "https://www.gbics.com", "https://www.gbics.com/optical-transceivers/"); let totalProducts = 0; let priceUpdates = 0; @@ -196,7 +203,7 @@ export async function scrapeGbics(): Promise { }); if (product.price && product.price > 0) { - const hash = contentHash({ price: product.price, part: product.partNumber }); + const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); const updated = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, price: product.price, currency: "GBP", diff --git a/packages/scraper/src/scrapers/news.ts b/packages/scraper/src/scrapers/news.ts index 7d21080..c8ec19a 100644 --- a/packages/scraper/src/scrapers/news.ts +++ b/packages/scraper/src/scrapers/news.ts @@ -38,19 +38,14 @@ interface NewsArticle { const FEEDS: RssFeed[] = [ // === PRIMARY: Transceiver-specific === { - name: "Lightwave Online", - url: "https://www.lightwaveonline.com/rss", + name: "The Next Platform", + url: "https://www.nextplatform.com/feed/", category: "market_report", }, { - name: "Lightwave - Fiber Optics", - url: "https://www.lightwaveonline.com/fttx/rss", - category: "market_report", - }, - { - name: "Fierce Telecom", - url: "https://www.fiercetelecom.com/rss/xml", - category: "market_report", + name: "ServeTheHome", + url: "https://www.servethehome.com/feed/", + category: "product_launch", }, { name: "Optics.org", @@ -69,8 +64,8 @@ const FEEDS: RssFeed[] = [ category: "market_report", }, { - name: "SDxCentral", - url: "https://www.sdxcentral.com/feed/", + name: "The Register - Data Centre", + url: "https://www.theregister.com/data_centre/headlines.atom", category: "market_report", }, // === TERTIARY: General tech / photonics === diff --git a/packages/scraper/src/scrapers/prolabs.ts b/packages/scraper/src/scrapers/prolabs.ts index 125d90b..bf71d44 100644 --- a/packages/scraper/src/scrapers/prolabs.ts +++ b/packages/scraper/src/scrapers/prolabs.ts @@ -1,22 +1,29 @@ /** * ProLabs Scraper — Enterprise-grade compatible optics (Legrand subsidiary) * - * prolabs.com — Server-rendered HTML with public USD pricing. + * prolabs.com — CloudFront WAF aggressively blocks datacenter IPs. + * Uses PlaywrightCrawler with Firefox for anti-detection. + * + * KNOWN ISSUE: CloudFront blocks all requests from IONOS/datacenter IPs + * (HTTP 403 "Request blocked"). This scraper works correctly from + * residential IPs. Solutions: + * 1. Set PROXY_URL env var to a residential/rotating proxy + * 2. Run from a residential IP (e.g. home server) + * 3. Route through WireGuard with internet breakout at home + * * Products listed under /products/networking/fiber-optics/ category pages. - * Pagination via ?page=N. Rate limited: 1 req/2sec. Max 100 pages. + * Pagination via ?page=N. Rate limited: maxConcurrency 1, 10 req/min. * * SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR" */ +import { PlaywrightCrawler, RequestQueue } from "crawlee"; +import { firefox } from "playwright"; import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { contentHash } from "../utils/hash"; const BASE = "https://www.prolabs.com"; -const HEADERS = { - "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)", - Accept: "text/html,application/xhtml+xml", -}; - const MAX_PAGES = 100; +const PROXY_URL = process.env.PROXY_URL || ""; const CATEGORIES = [ { path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 }, @@ -26,7 +33,6 @@ const CATEGORIES = [ { path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, { path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, { path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, - // Broad fallback category in case above paths differ on the live site { path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, ]; @@ -45,9 +51,9 @@ interface Product { wavelength?: string; } -function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); -} +/* ------------------------------------------------------------------ */ +/* Helper / detection functions (unchanged from original) */ +/* ------------------------------------------------------------------ */ function detectReach(text: string): { label: string; meters: number } | undefined { const patterns: [RegExp, string, number][] = [ @@ -90,18 +96,6 @@ function detectWavelength(text: string): string { return match ? match[1] : ""; } -/** - * Infer form factor and speed from ProLabs SKU prefixes when category context - * is not specific enough (e.g. when crawling the broad fallback category). - * - * ProLabs SKU prefix conventions: - * Q- -> QSFP+ 40G - * Q28- -> QSFP28 100G - * QDD- -> QSFP-DD 400G - * SFP28- -> SFP28 25G - * SFP- -> SFP+ 10G (most common ProLabs prefix) - * S- -> SFP 1G - */ function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): { formFactor: string; speed: string; @@ -116,121 +110,6 @@ function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): { return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps }; } -/** - * Parse product listings from a ProLabs category page. - * - * ProLabs uses a standard e-commerce layout: - * - Product cards with an link containing the product URL and name - * - Price in a span with "price" in class or as "$XX.XX" text nearby - * - SKU / part number in the URL slug - * - Stock badge: "In Stock" / "Out of Stock" / "Call for Availability" - * - * We parse with lightweight regex on collapsed HTML — same approach as gbics.ts - * and sfpcables.ts (no DOM parser dependency). - */ -function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { - const products: Product[] = []; - const collapsed = html.replace(/\s+/g, " "); - - // Strategy 1: product cards with structured href containing a SKU-like segment - // Match anchor tags whose href is a deep product path ending in a SKU pattern - const productLinkRegex = /href="(\/products\/[^"]*?\/([A-Z0-9][A-Z0-9\-_]{3,}(?:-PR)?))"\s[^>]*>([^<]{10,})<\/a>/gi; - let match: RegExpExecArray | null; - - while ((match = productLinkRegex.exec(collapsed)) !== null) { - const relUrl = match[1]; - const skuFromUrl = match[2]; - const linkText = match[3].trim(); - - // Skip navigation / filter / pagination links - if (/category|filter|sort|page|breadcrumb/i.test(relUrl)) continue; - if (linkText.length > 200) continue; - - const url = BASE + relUrl; - const partNumber = skuFromUrl.slice(0, 80); - const name = linkText.length > 10 ? linkText : partNumber; - - // Look for price in a 700-char window after the match position - const context = collapsed.slice(Math.max(0, match.index - 100), match.index + 700); - const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/) || - context.match(/price[^>]*>\s*\$?\s*([\d,]+\.?\d{0,2})/i); - const price = priceMatch ? parseFloat(priceMatch[1].replace(",", "")) : undefined; - - const stockMatch = context.match(/(in[\s-]stock|out[\s-]of[\s-]stock|call for availability|available|backordered)/i); - const stockStatus = stockMatch ? stockMatch[1].toLowerCase() : undefined; - - const combined = name + " " + partNumber; - const reach = detectReach(combined); - const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat); - - products.push({ - partNumber, name, url, - price: price && price > 0 && price < 100000 ? price : undefined, - stockStatus, - formFactor, speed, speedGbps, - reachLabel: reach?.label, - reachMeters: reach?.meters, - fiberType: detectFiber(combined), - wavelength: detectWavelength(combined), - }); - } - - // Strategy 2: Fallback — any link to a /products/ URL that has a $ price nearby - if (products.length === 0) { - const altRegex = /href="(\/products\/[^"]{10,})"/gi; - while ((match = altRegex.exec(collapsed)) !== null) { - const relUrl = match[1]; - if (/category|filter|sort|page|breadcrumb/i.test(relUrl)) continue; - - const context = collapsed.slice(Math.max(0, match.index - 50), match.index + 800); - const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/); - if (!priceMatch) continue; - - const price = parseFloat(priceMatch[1].replace(",", "")); - const nameMatch = context.match(/<(?:h[23]|strong|span)[^>]*>([^<]{10,150})<\//i); - const name = nameMatch ? nameMatch[1].trim() : relUrl.split("/").pop() || ""; - const partNumber = (relUrl.split("/").pop() ?? name).slice(0, 80); - - const url = BASE + relUrl; - const combined = name + " " + partNumber; - const reach = detectReach(combined); - const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat); - - products.push({ - partNumber, name, url, - price: price > 0 && price < 100000 ? price : undefined, - formFactor, speed, speedGbps, - reachLabel: reach?.label, - reachMeters: reach?.meters, - fiberType: detectFiber(combined), - wavelength: detectWavelength(combined), - }); - } - } - - // Deduplicate by URL - const seen = new Set(); - return products.filter((p) => { - if (seen.has(p.url)) return false; - seen.add(p.url); - return true; - }); -} - -/** Check if the HTML contains a link to the next pagination page. */ -function hasNextPage(html: string, currentPage: number): boolean { - if (/rel="next"/i.test(html)) return true; - const nextPageNum = currentPage + 1; - const pattern = new RegExp(`[?&]page=${nextPageNum}`, "i"); - return pattern.test(html); -} - -async function fetchPage(url: string): Promise { - const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); - if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); - return resp.text(); -} - function normalizeStockLevel( raw?: string ): "in_stock" | "low_stock" | "out_of_stock" | "on_request" { @@ -242,8 +121,19 @@ function normalizeStockLevel( return "on_request"; } +/* ------------------------------------------------------------------ */ +/* Main scraper */ +/* ------------------------------------------------------------------ */ + export async function scrapeProLabs(): Promise { - console.log("=== ProLabs Scraper Starting ===\n"); + console.log("=== ProLabs Scraper Starting (PlaywrightCrawler + Firefox) ===\n"); + + if (PROXY_URL) { + console.log(`Using proxy: ${PROXY_URL.replace(/:[^:@]+@/, ":***@")}`); + } else { + console.log("WARNING: No PROXY_URL set. CloudFront WAF blocks datacenter IPs."); + console.log("Set PROXY_URL env var for residential proxy if running from VPS.\n"); + } const vendorId = await ensureVendor( "ProLabs", @@ -254,90 +144,334 @@ export async function scrapeProLabs(): Promise { let totalProducts = 0; let priceUpdates = 0; + let blockedPages = 0; const seenUrls = new Set(); + // Map URL -> category metadata + const urlToCat = new Map(); + + const requestQueue = await RequestQueue.open(); + for (const cat of CATEGORIES) { - console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`); + const url = `${BASE}${cat.path}`; + urlToCat.set(url, cat); + await requestQueue.addRequest({ url, userData: { page: 1, catPath: cat.path } }); + } - let page = 1; - let pagesThisCat = 0; - let productsThisCat = 0; + const crawler = new PlaywrightCrawler({ + requestQueue, + maxConcurrency: 1, + maxRequestsPerMinute: 10, + requestHandlerTimeoutSecs: 120, + navigationTimeoutSecs: 60, + maxRequestRetries: 2, + headless: true, + // Override default blockedStatusCodes (normally [401, 403, 429]). + // We allow 403 so our handler can inspect the page — CloudFront may + // serve a JS challenge that resolves, or we can log the block gracefully. + sessionPoolOptions: { + blockedStatusCodes: [401, 429], + }, + browserPoolOptions: { + useFingerprints: false, + }, + launchContext: { + launcher: firefox, + launchOptions: { + firefoxUserPrefs: { + "toolkit.telemetry.enabled": false, + "privacy.trackingprotection.enabled": false, + }, + }, + }, + ...(PROXY_URL ? { + proxyConfiguration: new (require("crawlee").ProxyConfiguration)({ + proxyUrls: [PROXY_URL], + }), + } : {}), + preNavigationHooks: [ + async ({ page }, goToOptions) => { + // Realistic viewport + await page.setViewportSize({ width: 1920, height: 1080 }); - while (page <= MAX_PAGES) { - const url = page === 1 - ? `${BASE}${cat.path}` - : `${BASE}${cat.path}?page=${page}`; + // Override webdriver detection + await page.addInitScript(() => { + Object.defineProperty(navigator, "webdriver", { get: () => false }); + }); - try { - const html = await fetchPage(url); - const pageProducts = parseProductList(html, cat); + if (goToOptions) { + goToOptions.waitUntil = "load"; + } + }, + ], - // Global dedup: broad fallback category overlaps with specific ones - const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url)); - newProducts.forEach((p) => seenUrls.add(p.url)); + async requestHandler({ page, request, log }) { + const currentPage: number = request.userData?.page ?? 1; + const catPath: string = request.userData?.catPath ?? ""; - console.log(` Page ${page}: ${pageProducts.length} found, ${newProducts.length} new`); + const cat = urlToCat.get(request.url) ?? + CATEGORIES.find((c) => catPath === c.path) ?? + CATEGORIES[CATEGORIES.length - 1]; + urlToCat.set(request.url, cat); - for (const product of newProducts) { - try { - const txId = await findOrCreateScrapedTransceiver({ - partNumber: product.partNumber, - vendorId, - formFactor: product.formFactor, - speedGbps: product.speedGbps, - speed: product.speed, - reachMeters: product.reachMeters, - reachLabel: product.reachLabel, - fiberType: product.fiberType, - wavelengths: product.wavelength, - category: "DataCenter", - }); + log.info(`[${cat.formFactor} ${cat.speed}] Page ${currentPage}: ${request.url}`); - if (product.price && product.price > 0) { - const hash = contentHash({ - price: product.price, - part: product.partNumber, - stock: product.stockStatus ?? "", - }); - const updated = await upsertPriceObservation({ - transceiverId: txId, - sourceVendorId: vendorId, - price: product.price, - currency: "USD", - stockLevel: normalizeStockLevel(product.stockStatus), - url: product.url, - contentHash: hash, - }); - if (updated) priceUpdates++; + // Give JS challenges time to resolve + await page.waitForTimeout(8000); + + // Check what we actually got + const pageTitle = await page.title(); + const bodyText = await page.evaluate(() => document.body?.innerText?.slice(0, 500) || ""); + log.info(` Title: "${pageTitle}"`); + + // Detect CloudFront WAF block + if (bodyText.includes("Request blocked") || + bodyText.includes("Access Denied") || + bodyText.includes("403 ERROR") || + pageTitle.includes("ERROR")) { + blockedPages++; + log.warning(` CloudFront WAF blocked this page (${blockedPages} total blocked)`); + if (blockedPages >= 3 && totalProducts === 0) { + log.warning(` Multiple blocks detected — likely IP-level block. Consider using PROXY_URL.`); + } + return; + } + + // Extract products via page.evaluate + const productData = await page.evaluate(() => { + const results: Array<{ + name: string; + href: string; + price: string; + stock: string; + partNumber: string; + }> = []; + + // Strategy 1: Product card links + const productLinks = document.querySelectorAll( + 'a[href*="/products/"], .product-card a, .product-item a, [class*="product"] a[href], .product-list a, .category-products a, [data-product] a' + ); + + for (const link of productLinks) { + const el = link as HTMLAnchorElement; + const name = el.textContent?.trim() || ""; + const href = el.getAttribute("href") || ""; + + if (!name || name.length < 5 || name.length > 200 || !href) continue; + if (/category|filter|sort|breadcrumb|login|cart|account/i.test(href) && !/products\//i.test(href)) continue; + + const container = + el.closest('[class*="product"]') || + el.closest('[class*="item"]') || + el.closest('[class*="card"]') || + el.closest("li") || + el.parentElement?.parentElement?.parentElement; + + let price = ""; + let stock = ""; + let pn = ""; + + if (container) { + const priceEl = container.querySelector( + '[class*="price"], [class*="Price"], [data-price], .price' + ); + price = priceEl?.textContent?.trim() || ""; + if (!price) { + const containerText = container.textContent || ""; + const priceMatch = containerText.match(/\$\s*[\d,]+\.?\d{0,2}/); + if (priceMatch) price = priceMatch[0]; } - productsThisCat++; - totalProducts++; - } catch (err) { - console.warn(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`); + const stockEl = container.querySelector( + '[class*="stock"], [class*="Stock"], [class*="avail"], [class*="Avail"]' + ); + stock = stockEl?.textContent?.trim() || ""; + + const skuEl = container.querySelector( + '[class*="sku"], [class*="SKU"], [class*="part"], [class*="Part"], [class*="model"]' + ); + pn = skuEl?.textContent?.trim() || ""; + } + + if (!pn) { + pn = href.split("/").pop()?.replace(/\.html?$/, "")?.replace(/#.*$/, "") || ""; + } + + if (name && href.includes("/products/")) { + results.push({ name, href, price, stock, partNumber: pn }); } } - pagesThisCat++; + // Strategy 2: Scan deeper for anchors with product URLs + if (results.length === 0) { + const allAnchors = document.querySelectorAll("a[href*='/products/']"); + for (const el of allAnchors) { + const anchor = el as HTMLAnchorElement; + const href = anchor.getAttribute("href") || ""; + const name = anchor.textContent?.trim() || ""; + if (!name || name.length < 5) continue; - if (pageProducts.length === 0 || !hasNextPage(html, page)) break; + let parent: Element | null = anchor; + let price = ""; + for (let i = 0; i < 4 && parent; i++) { + parent = parent.parentElement; + if (parent) { + const text = parent.textContent || ""; + const m = text.match(/\$\s*[\d,]+\.?\d{0,2}/); + if (m) { price = m[0]; break; } + } + } - page++; - await sleep(2000); - } catch (err) { - console.error(` Page ${page} failed: ${(err as Error).message}`); - break; + const pn = href.split("/").pop()?.replace(/\.html?$/, "") || ""; + results.push({ name, href, price, stock: "", partNumber: pn }); + } + } + + // Strategy 3: JSON-LD structured data + const ldScripts = document.querySelectorAll('script[type="application/ld+json"]'); + for (const script of ldScripts) { + try { + const data = JSON.parse(script.textContent || ""); + const items = data.itemListElement || (Array.isArray(data) ? data : [data]); + for (const item of items) { + if (item["@type"] === "Product" || item.offers) { + const name = item.name || ""; + const href = item.url || ""; + const offers = item.offers || {}; + const price = offers.price ? `$${offers.price}` : ""; + const stock = offers.availability || ""; + const pn = item.sku || item.mpn || href.split("/").pop() || ""; + if (name) results.push({ name, href, price, stock, partNumber: pn }); + } + } + } catch { /* ignore parse errors */ } + } + + return results; + }); + + log.info(` Raw items extracted: ${productData.length}`); + + // Process extracted products + const pageProducts: Product[] = []; + + for (const item of productData) { + if (!item.name) continue; + + const partNumber = (item.partNumber || item.name).slice(0, 80).trim(); + const name = item.name.slice(0, 200).trim(); + const url = item.href.startsWith("http") ? item.href : `${BASE}${item.href}`; + + let price: number | undefined; + if (item.price) { + const cleaned = item.price.replace(/[^\d.,]/g, "").replace(",", ""); + const parsed = parseFloat(cleaned); + if (parsed > 0 && parsed < 100000) price = parsed; + } + + const combined = name + " " + partNumber; + const reach = detectReach(combined); + const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat); + + pageProducts.push({ + partNumber, name, url, price, + stockStatus: item.stock || undefined, + formFactor, speed, speedGbps, + reachLabel: reach?.label, + reachMeters: reach?.meters, + fiberType: detectFiber(combined), + wavelength: detectWavelength(combined), + }); } - } - console.log(` Category done: ${productsThisCat} products across ${pagesThisCat} page(s)`); + // Deduplicate against global set + const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url)); + for (const p of newProducts) seenUrls.add(p.url); - if (cat !== CATEGORIES[CATEGORIES.length - 1]) { - await sleep(2000); - } + log.info(` Parsed: ${pageProducts.length} found, ${newProducts.length} new`); + + // Write to database + for (const product of newProducts) { + try { + const txId = await findOrCreateScrapedTransceiver({ + partNumber: product.partNumber, + vendorId, + formFactor: product.formFactor, + speedGbps: product.speedGbps, + speed: product.speed, + reachMeters: product.reachMeters, + reachLabel: product.reachLabel, + fiberType: product.fiberType, + wavelengths: product.wavelength, + category: "DataCenter", + }); + + if (product.price && product.price > 0) { + const hash = contentHash({ + price: product.price, + part: product.partNumber, + stock: product.stockStatus ?? "", + }); + const updated = await upsertPriceObservation({ + transceiverId: txId, + sourceVendorId: vendorId, + price: product.price, + currency: "USD", + stockLevel: normalizeStockLevel(product.stockStatus), + url: product.url, + contentHash: hash, + }); + if (updated) priceUpdates++; + } + + totalProducts++; + } catch (err) { + log.warning(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`); + } + } + + // Check for next page + const hasNext = await page.evaluate((currentPageNum: number) => { + const nextLink = document.querySelector('a[rel="next"], link[rel="next"]'); + if (nextLink) return true; + const nextNum = currentPageNum + 1; + const paginationLinks = document.querySelectorAll('a[href*="page="], .pagination a, nav a'); + for (const link of paginationLinks) { + const href = (link as HTMLAnchorElement).getAttribute("href") || ""; + if (href.includes(`page=${nextNum}`)) return true; + const text = link.textContent?.trim() || ""; + if (text === String(nextNum) || text.toLowerCase() === "next" || text === "\u203a" || text === "\u00bb") return true; + } + return false; + }, currentPage); + + if (hasNext && currentPage < MAX_PAGES && newProducts.length > 0) { + const nextPageNum = currentPage + 1; + const nextUrl = `${BASE}${catPath}?page=${nextPageNum}`; + urlToCat.set(nextUrl, cat); + await requestQueue.addRequest({ + url: nextUrl, + userData: { page: nextPageNum, catPath }, + }); + log.info(` Enqueued next page: ${nextPageNum}`); + } + }, + + async failedRequestHandler({ request, log }) { + log.error(`Request failed after retries: ${request.url}`); + }, + }); + + await crawler.run(); + + console.log(`\n=== ProLabs Complete ===`); + console.log(` Products processed: ${totalProducts}`); + console.log(` Price updates: ${priceUpdates}`); + console.log(` Pages blocked by WAF: ${blockedPages}`); + if (blockedPages > 0 && totalProducts === 0) { + console.log(`\n All pages blocked by CloudFront WAF (datacenter IP detected).`); + console.log(` Fix: Set PROXY_URL=http://user:pass@proxy:port in .env`); } - - console.log(`\n=== ProLabs Complete: ${totalProducts} products processed, ${priceUpdates} price updates ===`); } if (require.main === module) {