ProLabs uses a B2B quote model: prices require a reseller account and are not shown publicly (the schema.org markup always shows price=0.00). Fighting the CloudFront WAF with Firefox automation is pointless.

New approach:
- Sitemap-driven: downloads all 14 sitemaps to collect product URLs
- Fetch-based: curl-compatible HTTP requests bypass CloudFront TLS detection
- Catalog-only: writes part numbers + specs to the transceivers table
- Rate-limited: 300 ms between requests (~3 req/sec)
- No proxy needed: Pi nodes are no longer consumed for ProLabs
291 lines
11 KiB
TypeScript
291 lines
11 KiB
TypeScript
/**
 * ProLabs Scraper — Catalog-only (no public pricing)
 *
 * ProLabs (an Amphenol company) uses a B2B quote model — prices require a
 * sales contact or reseller account and are NOT shown on the public website.
 * The schema.org markup consistently shows price=0.00.
 *
 * Approach: sitemap-driven fetch scraper (curl-compatible headers).
 * CloudFront allows regular HTTP requests; only Playwright/browser automation
 * gets blocked via TLS fingerprinting.
 *
 * Collects: part numbers, product names, form factors, specs (no prices).
 * Writes: `transceivers` catalog entries via findOrCreateScrapedTransceiver.
 *
 * SKU format examples: "SFP-10G-SR-PR", "Q28-100G-LR4-PR", "Q-4X10G-LR-PR"
 */
import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";
|
|
|
|
/** Origin of the ProLabs site; used to build the sitemap index URL. */
const BASE = "https://www.prolabs.com";

/** URL of the sitemap index that lists the individual sitemaps. */
const SITEMAP_INDEX = `${BASE}/sitemap.xml`;

/** Pause between consecutive requests: ~3 req/sec — polite crawl. */
const RATE_LIMIT_MS = 300;

/** Browser-like request headers sent with every fetch. */
const HEADERS: Readonly<Record<string, string>> = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.5",
  "Accept-Encoding": "gzip, deflate, br",
};

/**
 * Matches text that names a pluggable form factor or the generic words
 * "transceiver(s)" / "fiber optic" / "fibre optic".
 *
 * The optional digits after the form-factor token matter: without them the
 * trailing `\b` can never match inside "SFP28", "QSFP28", "QSFP56", "CFP2" or
 * "CFP4", because a letter followed by a digit is not a word boundary.
 * The pattern deliberately has no `g` flag so `.test()` stays stateless.
 */
const TRANSCEIVER_KEYWORDS =
  /\b(?:(?:sfp|qsfp|xfp|osfp|cfp|cxp)\d*|transceivers?|fiber.optic|fibre.optic)\b/i;
/**
 * Pause the calling async function without blocking the event loop.
 *
 * @param ms Delay in milliseconds, handed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/** Upper bound for one request (headers + body) before it is aborted. */
const FETCH_TIMEOUT_MS = 30000;

/**
 * GET a URL with the scraper's browser-like headers and return the body text.
 *
 * Best-effort by design: any failure yields `null` so one bad page cannot stop
 * the crawl. Unlike a silent swallow, each failure is logged with its cause so
 * blocked or broken URLs are visible in the run output.
 *
 * @param url Absolute URL to download.
 * @returns The response body, or `null` on a non-2xx status, network error,
 *          or when the request exceeds `FETCH_TIMEOUT_MS`.
 */
async function fetchText(url: string): Promise<string | null> {
  // Without an abort signal a stalled connection would hang the whole crawl.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

  try {
    const res = await fetch(url, { headers: HEADERS, signal: controller.signal });
    if (!res.ok) {
      console.warn(` HTTP ${res.status} for ${url}`);
      return null;
    }
    // Still inside the timeout window: the body download can stall as well.
    return await res.text();
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(` Fetch failed for ${url}: ${reason}`);
    return null;
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Guess form factor and nominal line rate from free text (title / name / SKU).
 *
 * Matching is case-insensitive substring search; the first rule that hits
 * wins. Explicit form-factor tokens (OSFP, QSFP-DD, QSFP28, CFP*, XFP) are
 * tested before the speed-only hints ("100gbase", "25g", "10g"). Otherwise an
 * XFP described as "10GBASE-LR XFP" would be reported as SFP+, and a
 * "100GBASE-LR4 CFP2" as QSFP28.
 *
 * @param text Any descriptive text for the product.
 * @returns The detected form factor, a speed label, and the speed in Gbps.
 *          Falls back to SFP+ / 10G when nothing is recognised.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  const lower = text.toLowerCase();
  const has = (needle: string): boolean => lower.includes(needle);

  // OSFP is handled as a family: "osfp" contains "sfp", so letting it fall
  // through would end up in the generic SFP+ rule near the bottom.
  if (has("osfp")) {
    if (has("1600g") || has("1.6t")) return { formFactor: "OSFP224", speed: "1.6T", speedGbps: 1600 };
    if (has("800g")) return { formFactor: "OSFP", speed: "800G", speedGbps: 800 };
    // No higher rate named: fall back to 400G for OSFP.
    return { formFactor: "OSFP", speed: "400G", speedGbps: 400 };
  }

  if (has("qsfp-dd") || has("qsfp dd")) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (has("qsfp28")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };

  // Unambiguous legacy form factors; longer tokens first so "cfp2" is not
  // swallowed by "cfp".
  if (has("cfp2")) return { formFactor: "CFP2", speed: "100G", speedGbps: 100 };
  if (has("cfp4")) return { formFactor: "CFP4", speed: "100G", speedGbps: 100 };
  if (has("cfp")) return { formFactor: "CFP", speed: "100G", speedGbps: 100 };
  if (has("xfp")) return { formFactor: "XFP", speed: "10G", speedGbps: 10 };

  // From here on the rules are speed hints rather than explicit form factors.
  if (has("100gbase")) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if ((has("qsfp+") || has("qsfp plus")) && !has("28")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (has("sfp28") || has("25gbase") || has("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (has("sfp+") || has("10gbase") || has("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("1000base") || has("1gbase")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };

  // Bare family names with no speed information.
  if (has("sfp") && !has("qsfp")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (has("qsfp")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };

  return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
}
/**
 * Ordered reach rules: [pattern, label, meters]. Explicit distances come
 * first so a stated "100m" beats the nominal reach implied by a suffix such as
 * "SR". Built once at module load instead of on every call; none of the
 * patterns carries the `g` flag, so `.test()` keeps no state between calls.
 */
const REACH_PATTERNS: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/\b120\s*km\b/i, "120km", 120000],
  [/\b80\s*km\b/i, "80km", 80000],
  [/\b40\s*km\b/i, "40km", 40000],
  [/\b20\s*km\b/i, "20km", 20000],
  [/\b10\s*km\b/i, "10km", 10000],
  [/\b2\s*km\b/i, "2km", 2000],
  [/\b550\s*m\b/i, "550m", 550],
  [/\b500\s*m\b/i, "500m", 500],
  [/\b300\s*m\b/i, "300m", 300],
  [/\b100\s*m\b/i, "100m", 100],
  [/\bLR4\b/i, "10km", 10000],
  // Plain word-bounded "LR". The former /\b[^Z]LR\b/ required an extra leading
  // character, so "LR" at the start of the text or after "+ " never matched,
  // while an unrelated token such as "XLR" did.
  [/\bLR\b/i, "10km", 10000],
  [/\bER4?\b/i, "40km", 40000],
  [/\bZR4?\b/i, "80km", 80000],
  [/\bSR4?\b/i, "300m", 300],
  [/\bDR4?\b/i, "500m", 500],
  [/\bFR4?\b/i, "2km", 2000],
];

/**
 * Derive the optical reach from free text.
 *
 * @param text Any descriptive text for the product.
 * @returns The label and distance in meters of the first matching rule, or
 *          `undefined` when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  for (const [pattern, label, meters] of REACH_PATTERNS) {
    if (pattern.test(text)) return { label, meters };
  }
  return undefined;
}
/**
 * Classify the transmission medium from free text.
 *
 * Rules are tried in order:
 *  1. An explicit "single-mode"/"SMF" or "multi-mode"/"MMF" statement. Checked
 *     first so that e.g. "SR BiDi multimode" is not overridden by the
 *     `bidi` heuristic below.
 *  2. Reach-code heuristics. LX/LR/ER/ZR/DR/FR imply SMF, SX/SR imply MMF. An
 *     optional lane-count digit (4 or 8) is accepted so "LR4", "SR4" and
 *     "DR4" classify the same way as their base codes.
 *  3. Copper keywords.
 *
 * @param text Any descriptive text for the product.
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  if (/single.?mode|smf/i.test(text)) return "SMF";
  if (/multi.?mode|mmf/i.test(text)) return "MMF";

  if (/\b(?:lx|lr[48]?|er[48]?|zr[48]?|dr[48]?|fr[48]?)\b|bidi|cwdm|dwdm/i.test(text)) return "SMF";
  if (/\b(?:sx|sr[48]?)\b/i.test(text)) return "MMF";

  if (/copper|dac|twinax|rj.?45|base-t|cat[56x]/i.test(text)) return "Copper";
  return "";
}
/**
 * Extract the first wavelength (a number followed by "nm") from free text.
 *
 * The number is 3–4 digits with an optional decimal part, so values such as
 * "1550.12nm" are kept whole. The lookbehind stops the match from starting in
 * the middle of a longer number ("12345nm" is rejected rather than read as
 * "2345"), while still allowing a letter prefix as in "TX1310nm".
 *
 * @param text Any descriptive text for the product.
 * @returns The wavelength digits without the unit, or "" when none is found.
 */
function detectWavelength(text: string): string {
  const match = /(?<![\d.])(\d{3,4}(?:\.\d+)?)\s*nm\b/i.exec(text);
  return match?.[1] ?? "";
}
/**
 * Derive a fallback part number from a ProLabs product URL path.
 *
 * URL pattern: `/vendor-sku-c-description` or `/sku-description`; the first
 * `-c-` separates the SKU part from the compatibility description. Everything
 * before that separator is upper-cased and runs of dashes are collapsed, e.g.
 * "/extreme-sfp-10gbase-zr-100-i-ex-c-extreme-compatible" →
 * "EXTREME-SFP-10GBASE-ZR-100-I-EX". Any vendor prefix is kept as-is.
 *
 * A query string, a fragment, and leading or trailing slashes are dropped so
 * they cannot leak into the part number.
 *
 * @param slug URL path of the product page (with or without a leading "/").
 * @returns The upper-cased SKU candidate; "" for an empty path.
 */
function skuFromSlug(slug: string): string {
  const pathOnly = slug.split(/[?#]/, 1)[0] ?? "";
  const trimmed = pathOnly.replace(/^\/+|\/+$/g, "");
  const [base = ""] = trimmed.split("-c-");
  return base.toUpperCase().replace(/-+/g, "-");
}
/**
 * Download the sitemap index and collect the URLs of the sitemaps it lists.
 *
 * Only `<loc>` entries whose URL contains "sitemap" and ends in ".xml" are
 * kept. Whitespace or line breaks around the URL inside `<loc>` are tolerated;
 * previously a trailing newline before `</loc>` made the entry invisible even
 * though the value was trimmed afterwards. Duplicates are removed while
 * keeping first-seen order.
 *
 * @returns The sitemap URLs, or an empty array when the index cannot be
 *          fetched (a warning is logged in that case).
 */
async function fetchSitemapUrls(): Promise<string[]> {
  const index = await fetchText(SITEMAP_INDEX);
  if (!index) {
    console.warn(" Could not fetch sitemap index");
    return [];
  }

  const found = new Set<string>();
  for (const match of index.matchAll(/<loc>\s*([^<\s]+sitemap[^<\s]+\.xml)\s*<\/loc>/gi)) {
    const loc = match[1];
    if (loc) found.add(loc);
  }

  const sitemapUrls = [...found];
  console.log(` Found ${sitemapUrls.length} sitemaps`);
  return sitemapUrls;
}
/**
 * Path prefixes of pages that are not optics product pages (category pages,
 * search, contact, support content, accessories, …). Compiled once at module
 * load instead of once per URL.
 */
const NON_PRODUCT_PATH =
  /^\/(search|contact|sitemap|downloads|articles|industry|support|why-|prolabs-dense|prolabs-test|360-virtual|videos|multi-coded|case-studies|white-papers|faqs|built-for-ai|what-is-new|about|associations|careers|multi-source|legacy|rma|tech-support|warranty|wintune|edfamux|privacy|newsletter|where-to-buy|media-converter|multiservice|eon-omp|multiplexers|patch-cables|cassettes|adapter|server|desktop|network|a-v-cables|power-adapters|usb-cables|c-lc|c-sc|c-fc|c-st|c-mtp|c-mpo|c-mt|power-|usb-)/;

/**
 * Download one sitemap and return the prolabs.com URLs that may be product
 * pages.
 *
 * A URL is dropped when its path starts with one of the `NON_PRODUCT_PATH`
 * prefixes or contains "memory", "/cart" or "/order". `&amp;` inside `<loc>`
 * is decoded, because XML sitemaps must entity-escape ampersands and the
 * escaped form is not a fetchable URL.
 *
 * @param sitemapUrl Absolute URL of a single sitemap file.
 * @returns Candidate product URLs in document order (possibly with
 *          duplicates); an empty array when the sitemap cannot be fetched.
 */
async function fetchProductUrlsFromSitemap(sitemapUrl: string): Promise<string[]> {
  const xml = await fetchText(sitemapUrl);
  if (!xml) return [];

  const candidates: string[] = [];
  for (const match of xml.matchAll(/<loc>\s*([^<\s]+prolabs\.com\/[^<\s]+)\s*<\/loc>/gi)) {
    const raw = match[1];
    if (!raw) continue;

    const url = raw.replace(/&amp;/g, "&");
    // Strip scheme + host so the filters below only look at the path.
    const path = url.replace(/^https?:\/\/[^/]+/, "");

    if (NON_PRODUCT_PATH.test(path)) continue;
    if (path.includes("memory") || path.includes("/cart") || path.includes("/order")) continue;

    candidates.push(url);
  }
  return candidates;
}
/** Catalog fields extracted from a single product page. */
interface ScrapedProduct {
  partNumber: string;
  name: string;
  formFactor: string;
  speed: string;
  speedGbps: number;
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelength?: string;
}

/**
 * Decode the HTML character references that show up in titles and headings:
 * numeric (hex and decimal) plus the common named ones. `&amp;` is handled
 * last so already-escaped text such as "&amp;lt;" is decoded exactly once.
 */
function decodeHtmlEntities(text: string): string {
  // fromCodePoint (not fromCharCode) so code points above U+FFFF survive;
  // out-of-range values are left untouched instead of throwing.
  const toChar = (entity: string, code: number): string =>
    Number.isInteger(code) && code >= 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity;

  return text
    .replace(/&#x([0-9a-f]+);/gi, (entity: string, hex: string) => toChar(entity, parseInt(hex, 16)))
    .replace(/&#(\d+);/g, (entity: string, dec: string) => toChar(entity, parseInt(dec, 10)))
    .replace(/&nbsp;/gi, " ")
    .replace(/&quot;/gi, '"')
    .replace(/&apos;/gi, "'")
    .replace(/&lt;/gi, "<")
    .replace(/&gt;/gi, ">")
    .replace(/&amp;/gi, "&");
}

/** Decode entities, collapse runs of whitespace, and trim. */
function cleanHtmlText(fragment: string): string {
  return decodeHtmlEntities(fragment).replace(/\s+/g, " ").trim();
}

/**
 * Scrape a ProLabs product page for part number and specs.
 *
 * Reads the `<title>`, the first `<h1>` and the `id="sku-N"` span, then runs
 * the detect* heuristics over the three values joined together.
 *
 * @param url Absolute URL of the product page.
 * @returns The extracted fields, or `null` when the page cannot be fetched,
 *          shows no transceiver keyword and no "-PR" marker, or has no usable
 *          name (shorter than 5 characters).
 */
async function scrapeProductPage(url: string): Promise<ScrapedProduct | null> {
  const html = await fetchText(url);
  if (!html) return null;

  // <title>, with the trailing " | Site name" segment removed.
  const titleMatch = /<title>([^<]+)<\/title>/i.exec(html);
  const pageTitle = titleMatch?.[1] ? cleanHtmlText(titleMatch[1].replace(/\s*\|\s*[^|]+$/, "")) : "";

  // <h1> (canonical product name): strip nested tags before decoding so an
  // escaped "&lt;" in the text is not mistaken for markup.
  const h1Match = /<h1[^>]*>([\s\S]*?)<\/h1>/i.exec(html);
  const h1 = h1Match?.[1] ? cleanHtmlText(h1Match[1].replace(/<[^>]+>/g, "")) : "";

  // SKU from the nopCommerce product template.
  const skuMatch = /id="sku-\d+"[^>]*>([^<]+)<\/span>/i.exec(html);
  const pageSku = skuMatch?.[1] ? cleanHtmlText(skuMatch[1]) : "";

  // If no transceiver keywords found, skip.
  const combined = [pageTitle, h1, pageSku].join(" ");
  if (!TRANSCEIVER_KEYWORDS.test(combined) && !combined.includes("-PR")) return null;

  // Use page SKU if available, otherwise derive from URL slug.
  const slug = url.replace(/https?:\/\/[^/]+/, "");
  const partNumber = pageSku || skuFromSlug(slug);
  const name = h1 || pageTitle;
  if (!name || name.length < 5) return null;

  const { formFactor, speed, speedGbps } = detectFormFactor(combined);
  const reach = detectReach(combined);

  return {
    partNumber,
    name,
    formFactor,
    speed,
    speedGbps,
    reachLabel: reach?.label,
    reachMeters: reach?.meters,
    fiberType: detectFiber(combined),
    wavelength: detectWavelength(combined),
  };
}
/**
 * Run the full ProLabs catalog crawl.
 *
 * Steps:
 *  1. Ensure the "ProLabs" vendor row exists.
 *  2. Collect candidate product URLs from every sitemap, de-duplicated.
 *  3. Keep URLs that look transceiver-related, scrape each page and write the
 *     result through `findOrCreateScrapedTransceiver`.
 *
 * Requests are spaced by `RATE_LIMIT_MS`. The pause sits in a `finally` block
 * so it also applies after a failed write; previously an error skipped the
 * sleep and the next request went out immediately. A failure on one page is
 * logged and counted, never fatal.
 *
 * Note: pages that could not be fetched are reported by `scrapeProductPage` as
 * `null` and therefore land in the "Skipped" counter as well.
 */
export async function scrapeProLabs(): Promise<void> {
  console.log("=== ProLabs Catalog Scraper (fetch-based, no Playwright) ===\n");
  console.log("Note: ProLabs uses B2B quote model — catalog data only, no public prices.\n");

  const vendorId = await ensureVendor(
    "ProLabs",
    "compatible",
    "https://www.prolabs.com",
    "https://www.prolabs.com/transceivers"
  );

  let totalProcessed = 0;
  let totalNew = 0;
  let skipped = 0;
  let failed = 0;

  // Step 1: Collect product URLs from sitemaps
  console.log("Step 1: Collecting sitemaps...");
  const sitemapUrls = await fetchSitemapUrls();

  // A Set de-duplicates as we go and avoids push(...urls), which passes every
  // URL as a call argument and can overflow the stack on very large sitemaps.
  const uniqueUrls = new Set<string>();
  let sitemapCount = 0;
  for (const smUrl of sitemapUrls) {
    for (const productUrl of await fetchProductUrlsFromSitemap(smUrl)) {
      uniqueUrls.add(productUrl);
    }
    sitemapCount++;
    if (sitemapCount % 3 === 0) {
      console.log(` Processed ${sitemapCount}/${sitemapUrls.length} sitemaps, ${uniqueUrls.size} product URLs so far`);
    }
    await sleep(RATE_LIMIT_MS);
  }

  console.log(`\nStep 2: Found ${uniqueUrls.size} unique product URLs\n`);

  // Step 3: Scrape each product page
  // Limit to transceiver-related URLs first (filter by keyword in slug)
  const transceiverUrls = [...uniqueUrls].filter((u) => {
    const path = u.toLowerCase();
    return TRANSCEIVER_KEYWORDS.test(path) || path.includes("-pr") || path.includes("transceiver");
  });

  console.log(`Step 3: Scraping ${transceiverUrls.length} transceiver-related pages...\n`);

  for (const url of transceiverUrls) {
    try {
      const product = await scrapeProductPage(url);
      totalProcessed++;

      if (!product) {
        skipped++;
        continue; // the finally block below still applies the rate limit
      }

      await findOrCreateScrapedTransceiver({
        partNumber: product.partNumber.slice(0, 80),
        vendorId,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
        reachMeters: product.reachMeters,
        reachLabel: product.reachLabel,
        fiberType: product.fiberType,
        wavelengths: product.wavelength,
        category: "DataCenter",
      });
      totalNew++;

      if (totalNew % 50 === 0) {
        console.log(` ${totalNew} catalog entries written (${totalProcessed} pages processed)`);
      }
    } catch (err: unknown) {
      failed++;
      // Narrow instead of casting: a non-Error throw must not crash the handler.
      const message = err instanceof Error ? err.message : String(err);
      console.warn(` Error [${url.slice(-60)}]: ${message.slice(0, 80)}`);
    } finally {
      await sleep(RATE_LIMIT_MS);
    }
  }

  console.log(`\n=== ProLabs Catalog Complete ===`);
  console.log(` Pages processed: ${totalProcessed}`);
  console.log(` Catalog entries written: ${totalNew}`);
  console.log(` Skipped (non-transceiver): ${skipped}`);
  console.log(` Errors: ${failed}`);
  console.log(` Note: No prices (B2B quote model)`);
}
// Entry point when this file is executed directly (not imported as a module).
if (require.main === module) {
  void (async () => {
    let exitCode = 0;

    try {
      await scrapeProLabs();
    } catch (err: unknown) {
      console.error("Fatal:", err);
      exitCode = 1;
    }

    // Always close the pool, and wait for it: calling process.exit() right
    // after an un-awaited pool.end() can terminate before it has finished.
    try {
      await pool.end();
    } catch (err: unknown) {
      console.error("Failed to close DB pool:", err);
      exitCode = 1;
    }

    // On success the process is left to exit on its own, as before.
    if (exitCode !== 0) process.exit(exitCode);
  })();
}