refactor(prolabs): replace Playwright+Firefox with fetch-based catalog scraper
ProLabs uses a B2B quote model: prices require a reseller account and are not shown publicly (the schema.org markup always shows price=0.00), so fighting the CloudFront WAF with Firefox automation is pointless. New approach: (1) Sitemap-driven: downloads all 14 sitemaps to collect product URLs. (2) Fetch-based: curl-compatible HTTP requests bypass CloudFront TLS detection. (3) Catalog-only: writes part numbers + specs to the transceivers table. (4) Rate-limited: 300ms between requests (~3 req/sec). (5) No proxy needed: Pi nodes are no longer consumed for ProLabs.
This commit is contained in:
parent
7af5b32b3f
commit
6febb9c88e
@ -1,59 +1,66 @@
|
||||
/**
|
||||
* ProLabs Scraper — Enterprise-grade compatible optics (Legrand subsidiary)
|
||||
* ProLabs Scraper — Catalog-only (no public pricing)
|
||||
*
|
||||
* prolabs.com — CloudFront WAF aggressively blocks datacenter IPs.
|
||||
* Uses PlaywrightCrawler with Firefox for anti-detection.
|
||||
* ProLabs (an Amphenol company) uses a B2B quote model — prices require a
|
||||
* sales contact or reseller account and are NOT shown on the public website.
|
||||
* The schema.org markup consistently shows price=0.00.
|
||||
*
|
||||
* KNOWN ISSUE: CloudFront blocks all requests from IONOS/datacenter IPs
|
||||
* (HTTP 403 "Request blocked"). This scraper works correctly from
|
||||
* residential IPs. Solutions:
|
||||
* 1. Set PROXY_URL env var to a residential/rotating proxy
|
||||
* 2. Run from a residential IP (e.g. home server)
|
||||
* 3. Route through WireGuard with internet breakout at home
|
||||
* Approach: sitemap-driven fetch scraper (curl-compatible headers).
|
||||
* CloudFront allows regular HTTP requests; only Playwright/browser automation
|
||||
* gets blocked via TLS fingerprinting.
|
||||
*
|
||||
* Products listed under /products/networking/fiber-optics/ category pages.
|
||||
* Pagination via ?page=N. Rate limited: maxConcurrency 1, 10 req/min.
|
||||
* Collects: part numbers, product names, form factors, specs (no prices).
|
||||
* Writes: `transceivers` catalog entries via findOrCreateScrapedTransceiver.
|
||||
*
|
||||
* SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR"
|
||||
* SKU format examples: "SFP-10G-SR-PR", "Q28-100G-LR4-PR", "Q-4X10G-LR-PR"
|
||||
*/
|
||||
import { PlaywrightCrawler, RequestQueue } from "crawlee";
|
||||
import { firefox } from "playwright";
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";
|
||||
|
||||
const BASE = "https://www.prolabs.com";
|
||||
const MAX_PAGES = 100;
|
||||
const PROXY_URL = process.env.PROXY_URL || "";
|
||||
const SITEMAP_INDEX = `${BASE}/sitemap.xml`;
|
||||
const RATE_LIMIT_MS = 300; // ~3 req/sec — polite crawl
|
||||
|
||||
const CATEGORIES = [
|
||||
{ path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
||||
{ path: "/products/networking/fiber-optics/sfp-plus-modules", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
{ path: "/products/networking/fiber-optics/sfp28-modules", formFactor: "SFP28", speed: "25G", speedGbps: 25 },
|
||||
{ path: "/products/networking/fiber-optics/qsfp-plus-modules", formFactor: "QSFP+", speed: "40G", speedGbps: 40 },
|
||||
{ path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
||||
{ path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
||||
{ path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
||||
{ path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
];
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
|
||||
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
};
|
||||
|
||||
interface Product {
|
||||
partNumber: string;
|
||||
name: string;
|
||||
url: string;
|
||||
price?: number;
|
||||
stockStatus?: string;
|
||||
formFactor: string;
|
||||
speed: string;
|
||||
speedGbps: number;
|
||||
reachLabel?: string;
|
||||
reachMeters?: number;
|
||||
fiberType?: string;
|
||||
wavelength?: string;
|
||||
const TRANSCEIVER_KEYWORDS = /\b(sfp|qsfp|xfp|osfp|cfp|cxp|transceiver|fiber.optic|fibre.optic)\b/i;
|
||||
|
||||
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`.
 * @returns A promise that settles (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Helper / detection functions (unchanged from original) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
/**
 * Fetch a URL with the shared request headers and return the response body as text.
 *
 * Best-effort by design: a non-2xx status, a network error or a timeout all
 * yield `null` instead of throwing, so callers can simply skip the URL.
 * Each failure is logged as a warning so blocked or failing requests stay visible.
 *
 * @param url - Absolute URL to download.
 * @returns The body text, or `null` when the request did not succeed.
 */
async function fetchText(url: string): Promise<string | null> {
  // Abort stalled requests: the crawl is sequential, so one hung connection
  // would otherwise block every URL queued behind it.
  const timeoutMs = 30000;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const res = await fetch(url, { headers: HEADERS, signal: controller.signal });
    if (!res.ok) {
      console.warn(` HTTP ${res.status} for ${url}`);
      return null;
    }
    // Reading the body stays inside the try so the timeout also covers it.
    return await res.text();
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(` Fetch failed for ${url}: ${reason.slice(0, 80)}`);
    return null;
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Guess form factor and nominal speed from free text (product name, title, SKU).
 *
 * Matching is case-insensitive and runs in two phases:
 *  1. explicit form-factor tokens ("qsfp28", "xfp", "cfp2", ...);
 *  2. speed-derived guesses ("100gbase", "25g", "10g", "1000base"), used only
 *     when no explicit token was found.
 *
 * Explicit tokens must win: XFP and CFP product names almost always also carry
 * a "10G"/"100GBASE" speed marker, which would otherwise classify them as
 * SFP+ or QSFP28.
 *
 * @param text - Text to inspect.
 * @returns The detected form factor with its speed label and speed in Gbps;
 *          falls back to SFP+ / 10G when nothing matches.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  const lower = text.toLowerCase();
  const has = (needle: string): boolean => lower.includes(needle);

  // --- Phase 1: explicit form-factor tokens ---
  if (has("osfp") && has("1600g")) return { formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 };
  if (has("osfp") && has("800g")) return { formFactor: "OSFP", speed: "800G", speedGbps: 800 };
  if (has("qsfp-dd") || has("qsfp dd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (has("qsfp28")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if ((has("qsfp+") || has("qsfp plus")) && !has("28")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (has("sfp28")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  // "sfp+" is a substring of "qsfp+", so only accept it when not preceded by "q".
  if (/(?<!q)sfp\+/.test(lower)) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("xfp")) return { formFactor: "XFP", speed: "10G", speedGbps: 10 };
  if (has("cfp2")) return { formFactor: "CFP2", speed: "100G", speedGbps: 100 };
  if (has("cfp4")) return { formFactor: "CFP4", speed: "100G", speedGbps: 100 };
  if (has("cfp")) return { formFactor: "CFP", speed: "100G", speedGbps: 100 };

  // --- Phase 2: speed-derived guesses ---
  if (has("100gbase")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if (has("25gbase") || has("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (has("10gbase") || has("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("1000base") || has("1gbase")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };

  // --- Last resort: bare family names ---
  if (has("sfp") && !has("qsfp")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("qsfp")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
}
|
||||
|
||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||
const patterns: [RegExp, string, number][] = [
|
||||
@ -65,413 +72,211 @@ function detectReach(text: string): { label: string; meters: number } | undefine
|
||||
[/\b2\s*km\b/i, "2km", 2000],
|
||||
[/\b550\s*m\b/i, "550m", 550],
|
||||
[/\b500\s*m\b/i, "500m", 500],
|
||||
[/\b400\s*m\b/i, "400m", 400],
|
||||
[/\b300\s*m\b/i, "300m", 300],
|
||||
[/\b150\s*m\b/i, "150m", 150],
|
||||
[/\b100\s*m\b/i, "100m", 100],
|
||||
[/\b30\s*m\b/i, "30m", 30],
|
||||
[/\bLR4\b/, "10km", 10000],
|
||||
[/\bLR\b/, "10km", 10000],
|
||||
[/\bER4?\b/, "40km", 40000],
|
||||
[/\bZR4?\b/, "80km", 80000],
|
||||
[/\bSR4?\b/, "300m", 300],
|
||||
[/\bDR4?\b/, "500m", 500],
|
||||
[/\bFR4?\b/, "2km", 2000],
|
||||
[/\bLR4\b/i, "10km", 10000], [/\b[^Z]LR\b/i, "10km", 10000],
|
||||
[/\bER4?\b/i, "40km", 40000], [/\bZR4?\b/i, "80km", 80000],
|
||||
[/\bSR4?\b/i, "300m", 300], [/\bDR4?\b/i, "500m", 500],
|
||||
[/\bFR4?\b/i, "2km", 2000],
|
||||
];
|
||||
for (const [regex, label, meters] of patterns) {
|
||||
if (regex.test(text)) return { label, meters };
|
||||
for (const [rx, label, meters] of patterns) {
|
||||
if (rx.test(text)) return { label, meters };
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
 * Guess the transmission medium from free text (product name, title, SKU).
 *
 * Reach codes (LX/LR/ER/ZR for single-mode, SX/SR for multi-mode) are matched
 * as standalone tokens: they must not be glued to another letter, but may sit
 * at the start or end of the text and may carry a lane-count digit ("LR4",
 * "SR4"). Letter-only lookarounds are used because `\b` would reject "LR4".
 *
 * @param text - Text to inspect (matched case-insensitively).
 * @returns "SMF", "MMF", "Copper", or "" when no medium can be inferred.
 */
function detectFiber(text: string): string {
  if (/single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i.test(text)) return "SMF";
  if (/multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i.test(text)) return "MMF";
  if (/copper|dac|twinax|rj.?45|base-t|cat[56x]/i.test(text)) return "Copper";
  return "";
}
|
||||
|
||||
/**
 * Extract the first wavelength mentioned in the text.
 *
 * Looks for a 3–4 digit number followed (optionally after whitespace) by "nm",
 * case-insensitively.
 *
 * @param text - Text to inspect.
 * @returns The digits of the first match (e.g. "1310"), or "" when none is found.
 */
function detectWavelength(text: string): string {
  const m = /(\d{3,4})\s*nm/i.exec(text);
  return m?.[1] ?? "";
}
|
||||
|
||||
function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
|
||||
/**
 * Derive a fallback part number from a ProLabs product URL path.
 *
 * Slugs look like `/<sku>-c-<compatibility description>` or `/<sku>-<description>`.
 * Everything before the first `-c-` separator is kept, upper-cased, and runs of
 * dashes are collapsed, e.g.
 * `/extreme-sfp-10gbase-zr-100-i-ex-c-...` → `EXTREME-SFP-10GBASE-ZR-100-I-EX`.
 * No vendor prefix is stripped and no suffix is appended.
 *
 * Query strings, fragments, trailing slashes and leading/trailing dashes are
 * removed so they cannot leak into the part number.
 *
 * @param slug - URL path (with or without the leading slash).
 * @returns The upper-cased slug portion, or "" for an empty slug.
 */
function skuFromSlug(slug: string): string {
  const path = slug.split(/[?#]/, 1)[0] ?? "";
  const trimmed = path.replace(/^\//, "").replace(/\/+$/, "");
  // The -c- separator splits the SKU from the compatibility description.
  const base = trimmed.split("-c-", 1)[0] ?? "";
  return base.toUpperCase().replace(/-+/g, "-").replace(/^-|-$/g, "");
}
|
||||
|
||||
/**
 * Download the sitemap index and return the URLs of the child sitemaps it lists.
 *
 * Only `<loc>` entries whose URL contains "sitemap" followed by at least one
 * more character and ends in ".xml" are returned. Whitespace around the URL
 * inside `<loc>` is tolerated, and XML-escaped ampersands are decoded so the
 * URLs can be requested as-is.
 *
 * @returns The child sitemap URLs; an empty array (after a warning) when the
 *          index cannot be fetched.
 */
async function fetchSitemapUrls(): Promise<string[]> {
  const index = await fetchText(SITEMAP_INDEX);
  if (!index) {
    console.warn(" Could not fetch sitemap index");
    return [];
  }

  const locPattern = /<loc>\s*([^<\s]+sitemap[^<\s]+\.xml)\s*<\/loc>/gi;
  const sitemapUrls = Array.from(index.matchAll(locPattern), (m) =>
    (m[1] ?? "").replace(/&amp;/g, "&"),
  );

  console.log(` Found ${sitemapUrls.length} sitemaps`);
  return sitemapUrls;
}
|
||||
|
||||
/**
 * Path prefixes (directly after the leading "/") that are excluded from the
 * crawl: category pages, search, contact, and similar non-product pages.
 * Matched case-sensitively.
 */
const NON_PRODUCT_PATH_PREFIXES: readonly string[] = [
  "search", "contact", "sitemap", "downloads", "articles", "industry", "support",
  "why-", "prolabs-dense", "prolabs-test", "360-virtual", "videos", "multi-coded",
  "case-studies", "white-papers", "faqs", "built-for-ai", "what-is-new", "about",
  "associations", "careers", "multi-source", "legacy", "rma", "tech-support",
  "warranty", "wintune", "edfamux", "privacy", "newsletter", "where-to-buy",
  "media-converter", "multiservice", "eon-omp", "multiplexers", "patch-cables",
  "cassettes", "adapter", "server", "desktop", "network", "a-v-cables",
  "power-adapters", "usb-cables", "c-lc", "c-sc", "c-fc", "c-st", "c-mtp",
  "c-mpo", "c-mt", "power-", "usb-",
];

/**
 * Download a single sitemap and return the candidate product URLs it lists.
 *
 * Every `<loc>` pointing at prolabs.com is considered. URLs are trimmed and
 * XML-escaped ampersands are decoded. A URL is dropped when its path starts
 * with one of the excluded prefixes, or contains "memory", "/cart" or "/order".
 *
 * @param sitemapUrl - URL of the sitemap XML document.
 * @returns The remaining URLs; an empty array when the sitemap cannot be fetched.
 */
async function fetchProductUrlsFromSitemap(sitemapUrl: string): Promise<string[]> {
  const xml = await fetchText(sitemapUrl);
  if (!xml) return [];

  const locPattern = /<loc>([^<]+prolabs\.com\/[^<]+)<\/loc>/gi;
  const urls = Array.from(xml.matchAll(locPattern), (m) =>
    (m[1] ?? "").trim().replace(/&amp;/g, "&"),
  );

  return urls.filter((u) => {
    const path = u.replace(/https?:\/\/[^/]+/, "");
    if (NON_PRODUCT_PATH_PREFIXES.some((prefix) => path.startsWith(`/${prefix}`))) return false;
    return !path.includes("memory") && !path.includes("/cart") && !path.includes("/order");
  });
}
|
||||
|
||||
/** Scrape a ProLabs product page for part number and specs */
|
||||
async function scrapeProductPage(url: string): Promise<{
|
||||
partNumber: string;
|
||||
name: string;
|
||||
formFactor: string;
|
||||
speed: string;
|
||||
speedGbps: number;
|
||||
} {
|
||||
const upper = sku.toUpperCase();
|
||||
if (/^QDD[-_]|QSFP.DD/i.test(upper)) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
|
||||
if (/^Q28[-_]|QSFP28/i.test(upper)) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
|
||||
if (/^Q[-_]4X|^Q[-_]/i.test(upper) && !/28/i.test(upper.slice(0, 5))) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
|
||||
if (/^SFP28[-_]|SFP-25/i.test(upper)) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
|
||||
if (/^S[-_]/i.test(upper) && !/sfp/i.test(upper.slice(1, 4))) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };
|
||||
return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps };
|
||||
}
|
||||
reachLabel?: string;
|
||||
reachMeters?: number;
|
||||
fiberType?: string;
|
||||
wavelength?: string;
|
||||
} | null> {
|
||||
const html = await fetchText(url);
|
||||
if (!html) return null;
|
||||
|
||||
function normalizeStockLevel(
|
||||
raw?: string
|
||||
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
|
||||
if (!raw) return "on_request";
|
||||
const lower = raw.toLowerCase();
|
||||
if (lower.includes("in stock") || lower.includes("available")) return "in_stock";
|
||||
if (lower.includes("out of stock") || lower.includes("backordered")) return "out_of_stock";
|
||||
if (lower.includes("low stock") || lower.includes("limited")) return "low_stock";
|
||||
return "on_request";
|
||||
}
|
||||
// Extract title from <title> tag
|
||||
const titleM = html.match(/<title>([^<]+)<\/title>/i);
|
||||
const pageTitle = titleM ? titleM[1].replace(/\s*\|\s*[^|]+$/, "").trim() : "";
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Main scraper */
|
||||
/* ------------------------------------------------------------------ */
|
||||
// Extract H1 (canonical product name)
|
||||
const h1M = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
|
||||
const h1 = h1M ? h1M[1].replace(/<[^>]+>/g, "").replace(/&#x[0-9A-F]+;/gi, (e) =>
|
||||
String.fromCharCode(parseInt(e.slice(3, -1), 16))).replace(/&/g, "&").trim() : "";
|
||||
|
||||
// Extract SKU from nopCommerce structure
|
||||
const skuM = html.match(/id="sku-\d+"[^>]*>([^<]+)<\/span>/i);
|
||||
const pageSku = skuM ? skuM[1].trim() : "";
|
||||
|
||||
// If no transceiver keywords found, skip
|
||||
const combined = [pageTitle, h1, pageSku].join(" ");
|
||||
if (!TRANSCEIVER_KEYWORDS.test(combined) && !combined.includes("-PR")) return null;
|
||||
|
||||
// Use page SKU if available, otherwise derive from URL slug
|
||||
const slug = url.replace(/https?:\/\/[^/]+/, "");
|
||||
const partNumber = pageSku || skuFromSlug(slug);
|
||||
const name = h1 || pageTitle;
|
||||
|
||||
if (!name || name.length < 5) return null;
|
||||
|
||||
const { formFactor, speed, speedGbps } = detectFormFactor(combined);
|
||||
const reach = detectReach(combined);
|
||||
|
||||
return {
|
||||
partNumber,
|
||||
name,
|
||||
formFactor,
|
||||
speed,
|
||||
speedGbps,
|
||||
reachLabel: reach?.label,
|
||||
reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(combined),
|
||||
wavelength: detectWavelength(combined),
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapeProLabs(): Promise<void> {
|
||||
console.log("=== ProLabs Scraper Starting (PlaywrightCrawler + Firefox) ===\n");
|
||||
|
||||
if (PROXY_URL) {
|
||||
console.log(`Using proxy: ${PROXY_URL.replace(/:[^:@]+@/, ":***@")}`);
|
||||
} else {
|
||||
console.log("WARNING: No PROXY_URL set. CloudFront WAF blocks datacenter IPs.");
|
||||
console.log("Set PROXY_URL env var for residential proxy if running from VPS.\n");
|
||||
}
|
||||
console.log("=== ProLabs Catalog Scraper (fetch-based, no Playwright) ===\n");
|
||||
console.log("Note: ProLabs uses B2B quote model — catalog data only, no public prices.\n");
|
||||
|
||||
const vendorId = await ensureVendor(
|
||||
"ProLabs",
|
||||
"compatible",
|
||||
"https://www.prolabs.com",
|
||||
"https://www.prolabs.com/products/networking/fiber-optics"
|
||||
"https://www.prolabs.com/transceivers"
|
||||
);
|
||||
|
||||
let totalProducts = 0;
|
||||
let priceUpdates = 0;
|
||||
let blockedPages = 0;
|
||||
const seenUrls = new Set<string>();
|
||||
let totalProcessed = 0;
|
||||
let totalNew = 0;
|
||||
let skipped = 0;
|
||||
|
||||
// Map URL -> category metadata
|
||||
const urlToCat = new Map<string, typeof CATEGORIES[number]>();
|
||||
// Step 1: Collect product URLs from sitemaps
|
||||
console.log("Step 1: Collecting sitemaps...");
|
||||
const sitemapUrls = await fetchSitemapUrls();
|
||||
|
||||
const requestQueue = await RequestQueue.open();
|
||||
|
||||
for (const cat of CATEGORIES) {
|
||||
const url = `${BASE}${cat.path}`;
|
||||
urlToCat.set(url, cat);
|
||||
await requestQueue.addRequest({ url, userData: { page: 1, catPath: cat.path } });
|
||||
const productUrls: string[] = [];
|
||||
let sitemapCount = 0;
|
||||
for (const smUrl of sitemapUrls) {
|
||||
const urls = await fetchProductUrlsFromSitemap(smUrl);
|
||||
productUrls.push(...urls);
|
||||
sitemapCount++;
|
||||
if (sitemapCount % 3 === 0) {
|
||||
console.log(` Processed ${sitemapCount}/${sitemapUrls.length} sitemaps, ${productUrls.length} product URLs so far`);
|
||||
}
|
||||
await sleep(RATE_LIMIT_MS);
|
||||
}
|
||||
|
||||
const crawler = new PlaywrightCrawler({
|
||||
requestQueue,
|
||||
maxConcurrency: 1,
|
||||
maxRequestsPerMinute: 10,
|
||||
requestHandlerTimeoutSecs: 120,
|
||||
navigationTimeoutSecs: 60,
|
||||
maxRequestRetries: 2,
|
||||
headless: true,
|
||||
// Override default blockedStatusCodes (normally [401, 403, 429]).
|
||||
// We allow 403 so our handler can inspect the page — CloudFront may
|
||||
// serve a JS challenge that resolves, or we can log the block gracefully.
|
||||
sessionPoolOptions: {
|
||||
blockedStatusCodes: [401, 429],
|
||||
},
|
||||
browserPoolOptions: {
|
||||
useFingerprints: false,
|
||||
},
|
||||
launchContext: {
|
||||
launcher: firefox,
|
||||
launchOptions: {
|
||||
firefoxUserPrefs: {
|
||||
"toolkit.telemetry.enabled": false,
|
||||
"privacy.trackingprotection.enabled": false,
|
||||
},
|
||||
},
|
||||
},
|
||||
...(PROXY_URL ? {
|
||||
proxyConfiguration: new (require("crawlee").ProxyConfiguration)({
|
||||
proxyUrls: [PROXY_URL],
|
||||
}),
|
||||
} : {}),
|
||||
preNavigationHooks: [
|
||||
async ({ page }, goToOptions) => {
|
||||
// Realistic viewport
|
||||
await page.setViewportSize({ width: 1920, height: 1080 });
|
||||
// Deduplicate
|
||||
const uniqueUrls = [...new Set(productUrls)];
|
||||
console.log(`\nStep 2: Found ${uniqueUrls.length} unique product URLs\n`);
|
||||
|
||||
// Override webdriver detection
|
||||
await page.addInitScript(() => {
|
||||
Object.defineProperty(navigator, "webdriver", { get: () => false });
|
||||
});
|
||||
|
||||
if (goToOptions) {
|
||||
goToOptions.waitUntil = "load";
|
||||
}
|
||||
},
|
||||
],
|
||||
|
||||
async requestHandler({ page, request, log }) {
|
||||
const currentPage: number = request.userData?.page ?? 1;
|
||||
const catPath: string = request.userData?.catPath ?? "";
|
||||
|
||||
const cat = urlToCat.get(request.url) ??
|
||||
CATEGORIES.find((c) => catPath === c.path) ??
|
||||
CATEGORIES[CATEGORIES.length - 1];
|
||||
urlToCat.set(request.url, cat);
|
||||
|
||||
log.info(`[${cat.formFactor} ${cat.speed}] Page ${currentPage}: ${request.url}`);
|
||||
|
||||
// Give JS challenges time to resolve
|
||||
await page.waitForTimeout(8000);
|
||||
|
||||
// Check what we actually got
|
||||
const pageTitle = await page.title();
|
||||
const bodyText = await page.evaluate(() => document.body?.innerText?.slice(0, 500) || "");
|
||||
log.info(` Title: "${pageTitle}"`);
|
||||
|
||||
// Detect CloudFront WAF block
|
||||
if (bodyText.includes("Request blocked") ||
|
||||
bodyText.includes("Access Denied") ||
|
||||
bodyText.includes("403 ERROR") ||
|
||||
pageTitle.includes("ERROR")) {
|
||||
blockedPages++;
|
||||
log.warning(` CloudFront WAF blocked this page (${blockedPages} total blocked)`);
|
||||
if (blockedPages >= 3 && totalProducts === 0) {
|
||||
log.warning(` Multiple blocks detected — likely IP-level block. Consider using PROXY_URL.`);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract products via page.evaluate
|
||||
const productData = await page.evaluate(() => {
|
||||
const results: Array<{
|
||||
name: string;
|
||||
href: string;
|
||||
price: string;
|
||||
stock: string;
|
||||
partNumber: string;
|
||||
}> = [];
|
||||
|
||||
// Strategy 1: Product card links
|
||||
const productLinks = document.querySelectorAll(
|
||||
'a[href*="/products/"], .product-card a, .product-item a, [class*="product"] a[href], .product-list a, .category-products a, [data-product] a'
|
||||
);
|
||||
|
||||
for (const link of productLinks) {
|
||||
const el = link as HTMLAnchorElement;
|
||||
const name = el.textContent?.trim() || "";
|
||||
const href = el.getAttribute("href") || "";
|
||||
|
||||
if (!name || name.length < 5 || name.length > 200 || !href) continue;
|
||||
if (/category|filter|sort|breadcrumb|login|cart|account/i.test(href) && !/products\//i.test(href)) continue;
|
||||
|
||||
const container =
|
||||
el.closest('[class*="product"]') ||
|
||||
el.closest('[class*="item"]') ||
|
||||
el.closest('[class*="card"]') ||
|
||||
el.closest("li") ||
|
||||
el.parentElement?.parentElement?.parentElement;
|
||||
|
||||
let price = "";
|
||||
let stock = "";
|
||||
let pn = "";
|
||||
|
||||
if (container) {
|
||||
const priceEl = container.querySelector(
|
||||
'[class*="price"], [class*="Price"], [data-price], .price'
|
||||
);
|
||||
price = priceEl?.textContent?.trim() || "";
|
||||
if (!price) {
|
||||
const containerText = container.textContent || "";
|
||||
const priceMatch = containerText.match(/\$\s*[\d,]+\.?\d{0,2}/);
|
||||
if (priceMatch) price = priceMatch[0];
|
||||
}
|
||||
|
||||
const stockEl = container.querySelector(
|
||||
'[class*="stock"], [class*="Stock"], [class*="avail"], [class*="Avail"]'
|
||||
);
|
||||
stock = stockEl?.textContent?.trim() || "";
|
||||
|
||||
const skuEl = container.querySelector(
|
||||
'[class*="sku"], [class*="SKU"], [class*="part"], [class*="Part"], [class*="model"]'
|
||||
);
|
||||
pn = skuEl?.textContent?.trim() || "";
|
||||
}
|
||||
|
||||
if (!pn) {
|
||||
pn = href.split("/").pop()?.replace(/\.html?$/, "")?.replace(/#.*$/, "") || "";
|
||||
}
|
||||
|
||||
if (name && href.includes("/products/")) {
|
||||
results.push({ name, href, price, stock, partNumber: pn });
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 2: Scan deeper for anchors with product URLs
|
||||
if (results.length === 0) {
|
||||
const allAnchors = document.querySelectorAll("a[href*='/products/']");
|
||||
for (const el of allAnchors) {
|
||||
const anchor = el as HTMLAnchorElement;
|
||||
const href = anchor.getAttribute("href") || "";
|
||||
const name = anchor.textContent?.trim() || "";
|
||||
if (!name || name.length < 5) continue;
|
||||
|
||||
let parent: Element | null = anchor;
|
||||
let price = "";
|
||||
for (let i = 0; i < 4 && parent; i++) {
|
||||
parent = parent.parentElement;
|
||||
if (parent) {
|
||||
const text = parent.textContent || "";
|
||||
const m = text.match(/\$\s*[\d,]+\.?\d{0,2}/);
|
||||
if (m) { price = m[0]; break; }
|
||||
}
|
||||
}
|
||||
|
||||
const pn = href.split("/").pop()?.replace(/\.html?$/, "") || "";
|
||||
results.push({ name, href, price, stock: "", partNumber: pn });
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 3: JSON-LD structured data
|
||||
const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
|
||||
for (const script of ldScripts) {
|
||||
try {
|
||||
const data = JSON.parse(script.textContent || "");
|
||||
const items = data.itemListElement || (Array.isArray(data) ? data : [data]);
|
||||
for (const item of items) {
|
||||
if (item["@type"] === "Product" || item.offers) {
|
||||
const name = item.name || "";
|
||||
const href = item.url || "";
|
||||
const offers = item.offers || {};
|
||||
const price = offers.price ? `$${offers.price}` : "";
|
||||
const stock = offers.availability || "";
|
||||
const pn = item.sku || item.mpn || href.split("/").pop() || "";
|
||||
if (name) results.push({ name, href, price, stock, partNumber: pn });
|
||||
}
|
||||
}
|
||||
} catch { /* ignore parse errors */ }
|
||||
}
|
||||
|
||||
return results;
|
||||
});
|
||||
|
||||
log.info(` Raw items extracted: ${productData.length}`);
|
||||
|
||||
// Process extracted products
|
||||
const pageProducts: Product[] = [];
|
||||
|
||||
for (const item of productData) {
|
||||
if (!item.name) continue;
|
||||
|
||||
const partNumber = (item.partNumber || item.name).slice(0, 80).trim();
|
||||
const name = item.name.slice(0, 200).trim();
|
||||
const url = item.href.startsWith("http") ? item.href : `${BASE}${item.href}`;
|
||||
|
||||
let price: number | undefined;
|
||||
if (item.price) {
|
||||
const cleaned = item.price.replace(/[^\d.,]/g, "").replace(",", "");
|
||||
const parsed = parseFloat(cleaned);
|
||||
if (parsed > 0 && parsed < 100000) price = parsed;
|
||||
}
|
||||
|
||||
const combined = name + " " + partNumber;
|
||||
const reach = detectReach(combined);
|
||||
const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);
|
||||
|
||||
pageProducts.push({
|
||||
partNumber, name, url, price,
|
||||
stockStatus: item.stock || undefined,
|
||||
formFactor, speed, speedGbps,
|
||||
reachLabel: reach?.label,
|
||||
reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(combined),
|
||||
wavelength: detectWavelength(combined),
|
||||
});
|
||||
}
|
||||
|
||||
// Deduplicate against global set
|
||||
const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url));
|
||||
for (const p of newProducts) seenUrls.add(p.url);
|
||||
|
||||
log.info(` Parsed: ${pageProducts.length} found, ${newProducts.length} new`);
|
||||
|
||||
// Write to database
|
||||
for (const product of newProducts) {
|
||||
try {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
reachMeters: product.reachMeters,
|
||||
reachLabel: product.reachLabel,
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: "DataCenter",
|
||||
});
|
||||
|
||||
if (product.price && product.price > 0) {
|
||||
const hash = contentHash({
|
||||
price: product.price,
|
||||
part: product.partNumber,
|
||||
stock: product.stockStatus ?? "",
|
||||
});
|
||||
const updated = await upsertPriceObservation({
|
||||
transceiverId: txId,
|
||||
sourceVendorId: vendorId,
|
||||
price: product.price,
|
||||
currency: "USD",
|
||||
stockLevel: normalizeStockLevel(product.stockStatus),
|
||||
url: product.url,
|
||||
contentHash: hash,
|
||||
});
|
||||
if (updated) priceUpdates++;
|
||||
}
|
||||
|
||||
totalProducts++;
|
||||
} catch (err) {
|
||||
log.warning(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Check for next page
|
||||
const hasNext = await page.evaluate((currentPageNum: number) => {
|
||||
const nextLink = document.querySelector('a[rel="next"], link[rel="next"]');
|
||||
if (nextLink) return true;
|
||||
const nextNum = currentPageNum + 1;
|
||||
const paginationLinks = document.querySelectorAll('a[href*="page="], .pagination a, nav a');
|
||||
for (const link of paginationLinks) {
|
||||
const href = (link as HTMLAnchorElement).getAttribute("href") || "";
|
||||
if (href.includes(`page=${nextNum}`)) return true;
|
||||
const text = link.textContent?.trim() || "";
|
||||
if (text === String(nextNum) || text.toLowerCase() === "next" || text === "\u203a" || text === "\u00bb") return true;
|
||||
}
|
||||
return false;
|
||||
}, currentPage);
|
||||
|
||||
if (hasNext && currentPage < MAX_PAGES && newProducts.length > 0) {
|
||||
const nextPageNum = currentPage + 1;
|
||||
const nextUrl = `${BASE}${catPath}?page=${nextPageNum}`;
|
||||
urlToCat.set(nextUrl, cat);
|
||||
await requestQueue.addRequest({
|
||||
url: nextUrl,
|
||||
userData: { page: nextPageNum, catPath },
|
||||
});
|
||||
log.info(` Enqueued next page: ${nextPageNum}`);
|
||||
}
|
||||
},
|
||||
|
||||
async failedRequestHandler({ request, log }) {
|
||||
log.error(`Request failed after retries: ${request.url}`);
|
||||
},
|
||||
// Step 3: Scrape each product page
|
||||
// Limit to transceiver-related URLs first (filter by keyword in slug)
|
||||
const transceiverUrls = uniqueUrls.filter((u) => {
|
||||
const path = u.toLowerCase();
|
||||
return TRANSCEIVER_KEYWORDS.test(path) || path.includes("-pr") || path.includes("transceiver");
|
||||
});
|
||||
|
||||
await crawler.run();
|
||||
console.log(`Step 3: Scraping ${transceiverUrls.length} transceiver-related pages...\n`);
|
||||
|
||||
console.log(`\n=== ProLabs Complete ===`);
|
||||
console.log(` Products processed: ${totalProducts}`);
|
||||
console.log(` Price updates: ${priceUpdates}`);
|
||||
console.log(` Pages blocked by WAF: ${blockedPages}`);
|
||||
if (blockedPages > 0 && totalProducts === 0) {
|
||||
console.log(`\n All pages blocked by CloudFront WAF (datacenter IP detected).`);
|
||||
console.log(` Fix: Set PROXY_URL=http://user:pass@proxy:port in .env`);
|
||||
for (const url of transceiverUrls) {
|
||||
try {
|
||||
const product = await scrapeProductPage(url);
|
||||
totalProcessed++;
|
||||
|
||||
if (!product) {
|
||||
skipped++;
|
||||
} else {
|
||||
await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber.slice(0, 80),
|
||||
vendorId,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
reachMeters: product.reachMeters,
|
||||
reachLabel: product.reachLabel,
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: "DataCenter",
|
||||
});
|
||||
totalNew++;
|
||||
|
||||
if (totalNew % 50 === 0) {
|
||||
console.log(` ${totalNew} catalog entries written (${totalProcessed} pages processed)`);
|
||||
}
|
||||
}
|
||||
|
||||
await sleep(RATE_LIMIT_MS);
|
||||
} catch (err) {
|
||||
console.warn(` Error [${url.slice(-60)}]: ${(err as Error).message.slice(0, 80)}`);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n=== ProLabs Catalog Complete ===`);
|
||||
console.log(` Pages processed: ${totalProcessed}`);
|
||||
console.log(` Catalog entries written: ${totalNew}`);
|
||||
console.log(` Skipped (non-transceiver): ${skipped}`);
|
||||
console.log(` Note: No prices (B2B quote model)`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user