Rene Fichtmueller aa977abc97 feat(v0.2.0): Sales Intelligence Engine — Phase 0+A
New API routes:
- GET /api/finder — Switch→Flexoptix transceiver finder with FlexBox coding
- GET /api/competitor-alerts — Competitor intelligence (price changes, new products, stock)
- GET /api/forecast/:technology — Sales forecast 3/9/12/18 months + buy/wait/hold signal
- POST /api/transport/plan — Transport system planner (city→city BOM with fiber providers)

New MCP tools:
- find_flexoptix_for_switch — Customer switch → Flexoptix products
- get_competitor_alerts — Competitor monitoring
- plan_transport — Network transport planning
- forecast_sales — Volume/revenue prediction
- generate_blog — Enhanced blog generation

New DB tables (migration 013):
- competitor_alerts, price_changes, flexoptix_product_map
- sales_forecasts, fiber_providers, fiber_routes, cities
- generated_datasheets, blog_series
- Views: v_price_coverage, v_image_coverage, v_switch_flexoptix_finder

Seed data (migration 014):
- 25 European cities with IX/DC locations + coordinates
- 15 fiber providers (euNetworks, Telia, DTAG, Colt, Zayo, etc.)
- 16 fiber routes with pricing (Germany focus)

Infrastructure:
- Scraper scheduler: 2h Flexoptix, 4h FS.com/Optcore (was 6-8h)
- Change detector for competitor price/stock monitoring
- Image downloader utility with coverage tracking
2026-03-31 08:51:22 +02:00

486 lines
18 KiB
TypeScript

/**
* ProLabs Scraper — Enterprise-grade compatible optics (Legrand subsidiary)
*
* prolabs.com — CloudFront WAF aggressively blocks datacenter IPs.
* Uses PlaywrightCrawler with Firefox for anti-detection.
*
* KNOWN ISSUE: CloudFront blocks all requests from IONOS/datacenter IPs
* (HTTP 403 "Request blocked"). This scraper works correctly from
* residential IPs. Solutions:
* 1. Set PROXY_URL env var to a residential/rotating proxy
* 2. Run from a residential IP (e.g. home server)
* 3. Route through WireGuard with internet breakout at home
*
* Products listed under /products/networking/fiber-optics/ category pages.
* Pagination via ?page=N. Rate limited: maxConcurrency 1, 10 req/min.
*
* SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR"
*/
import { PlaywrightCrawler, RequestQueue } from "crawlee";
import { firefox } from "playwright";
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash";
/** Origin that every relative product / category path is resolved against. */
const BASE = "https://www.prolabs.com";

/** Hard ceiling on how many `?page=N` listing pages are followed per category. */
const MAX_PAGES = 100;

/** Optional proxy URL taken from the environment; empty string means "no proxy". */
const PROXY_URL = process.env.PROXY_URL ?? "";

/**
 * Category listing pages to crawl, each with the form factor / speed that is
 * assumed for its products when the SKU itself gives no better hint.
 * The last entry is the un-filtered fiber-optics root listing.
 */
const CATEGORIES = (() => {
  const root = "/products/networking/fiber-optics";
  // Every speed label in this table is simply "<Gbps>G", so derive it.
  const entry = (slug: string, formFactor: string, speedGbps: number) => ({
    path: slug ? `${root}/${slug}` : root,
    formFactor,
    speed: `${speedGbps}G`,
    speedGbps,
  });
  return [
    entry("sfp-modules", "SFP", 1),
    entry("sfp-plus-modules", "SFP+", 10),
    entry("sfp28-modules", "SFP28", 25),
    entry("qsfp-plus-modules", "QSFP+", 40),
    entry("qsfp28-modules", "QSFP28", 100),
    entry("qsfp-dd-modules", "QSFP-DD", 400),
    entry("coherent-modules", "QSFP-DD", 400),
    entry("", "SFP+", 10),
  ];
})();
/**
 * One product as parsed from a listing page, before it is written to the DB.
 * Built in the crawler's request handler from the raw in-page extraction.
 */
interface Product {
// SKU / part number: text of a SKU-like element, else the last URL segment,
// else the product name (truncated to 80 characters).
partNumber: string;
// Link text or JSON-LD name, truncated to 200 characters.
name: string;
// Absolute product URL (relative hrefs are prefixed with BASE).
url: string;
// Parsed list price; only set when the value is > 0 and < 100000.
price?: number;
// Raw availability text as scraped; normalised only when a price is stored.
stockStatus?: string;
// Form factor / speed inferred from the SKU, falling back to the category.
formFactor: string;
speed: string;
speedGbps: number;
// Reach as detected by detectReach(); both unset when nothing matched.
reachLabel?: string;
reachMeters?: number;
// Result of detectFiber(): "SMF", "MMF", "Copper" or "".
fiberType?: string;
// Result of detectWavelength(): the digits preceding "nm", or "".
wavelength?: string;
}
/* ------------------------------------------------------------------ */
/* Helper / detection functions (unchanged from original) */
/* ------------------------------------------------------------------ */
/**
 * Derive the optical reach from free text (product name and/or SKU).
 *
 * Explicit distances ("10 km", "300m") are tried first, longest first; if none
 * is present the standard reach codes (LR4, LR, ER, ZR, SR, DR, FR) are used.
 * Distance patterns are case-insensitive, reach codes must be upper-case.
 *
 * @param text Text to scan.
 * @returns The first matching `{ label, meters }` pair, or `undefined`.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const KM = 1000;
  // Order is significant: the first rule whose pattern matches wins.
  const rules: ReadonlyArray<{ re: RegExp; label: string; meters: number }> = [
    { re: /\b120\s*km\b/i, label: "120km", meters: 120 * KM },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80 * KM },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40 * KM },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20 * KM },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10 * KM },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2 * KM },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { re: /\b400\s*m\b/i, label: "400m", meters: 400 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b150\s*m\b/i, label: "150m", meters: 150 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    { re: /\b30\s*m\b/i, label: "30m", meters: 30 },
    { re: /\bLR4\b/, label: "10km", meters: 10 * KM },
    { re: /\bLR\b/, label: "10km", meters: 10 * KM },
    { re: /\bER4?\b/, label: "40km", meters: 40 * KM },
    { re: /\bZR4?\b/, label: "80km", meters: 80 * KM },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2 * KM },
  ];
  const hit = rules.find(({ re }) => re.test(text));
  return hit ? { label: hit.label, meters: hit.meters } : undefined;
}
/**
 * Classify the transmission medium from free text (product name and/or SKU).
 *
 * Checks are ordered: single-mode indicators first, then multi-mode, then
 * copper. Matching is case-insensitive.
 *
 * The two-letter reach codes (LX/LR/ER/ZR, SX/SR) must not be glued to other
 * letters, but may sit at the very start or end of the text, so
 * "10GBASE-LR" and "SR optic" are recognised as well as "SFP-10G-LR-PR".
 *
 * @param text Text to scan.
 * @returns "SMF", "MMF", "Copper", or "" when nothing matched.
 */
function detectFiber(text: string): string {
  // (?:^|[^a-z]) / (?:[^a-z]|$): a non-letter neighbour OR a string boundary.
  // With the `i` flag, [^a-z] also excludes upper-case letters.
  const singleMode =
    /single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i;
  const copper = /copper|dac|twinax|rj.?45|base-t|cat[56x]/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
/**
 * Pull the first wavelength mentioned in the text.
 *
 * @param text Text to scan for a 3–4 digit number followed by "nm"
 *             (optional whitespace in between, case-insensitive).
 * @returns The digits only (e.g. "1310"), or "" when no wavelength is present.
 */
function detectWavelength(text: string): string {
  const hit = /(\d{3,4})\s*nm/i.exec(text);
  return hit?.[1] ?? "";
}
/**
 * Infer form factor and line rate from a part number, falling back to the
 * defaults of the category the product was listed under.
 *
 * Rules are evaluated top to bottom on the upper-cased SKU; the first match
 * wins:
 *   QDD-… / QSFP?DD      → QSFP-DD 400G
 *   Q28-… / QSFP28       → QSFP28 100G
 *   Q-…  (no "28" in the first five characters) → QSFP+ 40G
 *   SFP28-… / SFP-25     → SFP28 25G
 *   SFP-10G…             → SFP+ 10G
 *   S-…                  → SFP 1G
 *
 * @param sku Part number as scraped (any case).
 * @param cat Category defaults; any object carrying the three fields works,
 *            so entries of CATEGORIES can be passed unchanged.
 * @returns The inferred `{ formFactor, speed, speedGbps }`.
 */
function inferFromSku(
  sku: string,
  cat: { formFactor: string; speed: string; speedGbps: number }
): {
  formFactor: string;
  speed: string;
  speedGbps: number;
} {
  const upper = sku.toUpperCase();
  if (/^QDD[-_]|QSFP.DD/.test(upper)) return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (/^Q28[-_]|QSFP28/.test(upper)) return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  // "^Q[-_]" already covers the "Q-4X…" breakout SKUs.
  if (/^Q[-_]/.test(upper) && !upper.slice(0, 5).includes("28")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (/^SFP28[-_]|SFP-25/.test(upper)) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  // 10G SKUs such as "SFP-10G-SR-PR" are SFP+ regardless of which listing
  // page they were found on.
  if (/^SFP[-_]10G/.test(upper)) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (/^S[-_]/.test(upper)) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };
  return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps };
}
/**
 * Map a scraped availability string onto the stock-level vocabulary used for
 * price observations.
 *
 * The input may be human-readable text ("In Stock", "Out of stock") or a
 * schema.org availability URL taken from JSON-LD
 * ("https://schema.org/InStock"). Whitespace, hyphens and underscores are
 * ignored so both spellings hit the same markers.
 *
 * Negative markers are tested first: "unavailable" / "not available" contain
 * the word "available" and would otherwise be reported as in stock.
 *
 * @param raw Availability text, if any was found.
 * @returns "out_of_stock", "in_stock", "low_stock", or "on_request" when the
 *          text is missing, empty, or matches no marker.
 */
function normalizeStockLevel(
  raw?: string
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
  if (!raw) return "on_request";
  const compact = raw.toLowerCase().replace(/[\s_-]+/g, "");
  const mentions = (...markers: string[]): boolean =>
    markers.some((marker) => compact.includes(marker));

  if (mentions("outofstock", "soldout", "backorder", "unavailable", "notavailable", "notinstock")) {
    return "out_of_stock";
  }
  if (mentions("instock", "available")) return "in_stock";
  if (mentions("lowstock", "limited")) return "low_stock";
  return "on_request";
}
/* ------------------------------------------------------------------ */
/* Main scraper */
/* ------------------------------------------------------------------ */
/**
 * Crawl every listing in CATEGORIES on prolabs.com and persist the result.
 *
 * Runs a single-concurrency PlaywrightCrawler on Firefox (10 requests/minute,
 * optional proxy from PROXY_URL). For each listing page the handler
 *   1. waits 8 s so a JS challenge can resolve, then bails out if the page
 *      looks like a CloudFront block;
 *   2. extracts product candidates inside the browser (product links, a
 *      fallback anchor scan, and JSON-LD);
 *   3. derives form factor / speed / reach / fiber / wavelength from the name
 *      and part number;
 *   4. collapses duplicates (within the page and against earlier pages) and
 *      writes each remaining product via findOrCreateScrapedTransceiver(),
 *      plus upsertPriceObservation() when a price was parsed;
 *   5. enqueues `?page=N+1` while a pagination hint is present, the page
 *      yielded at least one new product and MAX_PAGES is not reached.
 *
 * Per-product database errors are logged and skipped. A summary (products,
 * price updates, blocked pages) is printed when the crawl ends.
 *
 * @returns Resolves once `crawler.run()` has finished.
 */
export async function scrapeProLabs(): Promise<void> {
  console.log("=== ProLabs Scraper Starting (PlaywrightCrawler + Firefox) ===\n");
  if (PROXY_URL) {
    // Mask the password part of user:pass@host before logging.
    console.log(`Using proxy: ${PROXY_URL.replace(/:[^:@]+@/, ":***@")}`);
  } else {
    console.log("WARNING: No PROXY_URL set. CloudFront WAF blocks datacenter IPs.");
    console.log("Set PROXY_URL env var for residential proxy if running from VPS.\n");
  }

  const vendorId = await ensureVendor(
    "ProLabs",
    "compatible",
    "https://www.prolabs.com",
    "https://www.prolabs.com/products/networking/fiber-optics"
  );

  let totalProducts = 0;
  let priceUpdates = 0;
  let blockedPages = 0;

  // Dedupe keys of every product already handed to the database in this run.
  // The key is the absolute product URL, or a SKU-based key for items that
  // carry no URL at all (possible for JSON-LD entries).
  const seenUrls = new Set<string>();

  // Map URL -> category metadata
  const urlToCat = new Map<string, typeof CATEGORIES[number]>();

  const requestQueue = await RequestQueue.open();
  for (const cat of CATEGORIES) {
    const url = `${BASE}${cat.path}`;
    urlToCat.set(url, cat);
    await requestQueue.addRequest({ url, userData: { page: 1, catPath: cat.path } });
  }

  const crawler = new PlaywrightCrawler({
    requestQueue,
    maxConcurrency: 1,
    maxRequestsPerMinute: 10,
    requestHandlerTimeoutSecs: 120,
    navigationTimeoutSecs: 60,
    maxRequestRetries: 2,
    headless: true,
    // Override default blockedStatusCodes (normally [401, 403, 429]).
    // We allow 403 so our handler can inspect the page — CloudFront may
    // serve a JS challenge that resolves, or we can log the block gracefully.
    sessionPoolOptions: {
      blockedStatusCodes: [401, 429],
    },
    browserPoolOptions: {
      useFingerprints: false,
    },
    launchContext: {
      launcher: firefox,
      launchOptions: {
        firefoxUserPrefs: {
          "toolkit.telemetry.enabled": false,
          "privacy.trackingprotection.enabled": false,
        },
      },
    },
    ...(PROXY_URL ? {
      proxyConfiguration: new (require("crawlee").ProxyConfiguration)({
        proxyUrls: [PROXY_URL],
      }),
    } : {}),
    preNavigationHooks: [
      async ({ page }, goToOptions) => {
        // Realistic viewport
        await page.setViewportSize({ width: 1920, height: 1080 });
        // Override webdriver detection
        await page.addInitScript(() => {
          Object.defineProperty(navigator, "webdriver", { get: () => false });
        });
        if (goToOptions) {
          goToOptions.waitUntil = "load";
        }
      },
    ],
    async requestHandler({ page, request, log }) {
      const currentPage: number = request.userData?.page ?? 1;
      const catPath: string = request.userData?.catPath ?? "";
      // Resolve the category: exact URL first, then the path carried in
      // userData, finally the catch-all (last) category.
      const cat = urlToCat.get(request.url) ??
        CATEGORIES.find((c) => catPath === c.path) ??
        CATEGORIES[CATEGORIES.length - 1];
      urlToCat.set(request.url, cat);
      log.info(`[${cat.formFactor} ${cat.speed}] Page ${currentPage}: ${request.url}`);

      // Give JS challenges time to resolve
      await page.waitForTimeout(8000);

      // Check what we actually got
      const pageTitle = await page.title();
      const bodyText = await page.evaluate(() => document.body?.innerText?.slice(0, 500) || "");
      log.info(` Title: "${pageTitle}"`);

      // Detect CloudFront WAF block
      if (bodyText.includes("Request blocked") ||
        bodyText.includes("Access Denied") ||
        bodyText.includes("403 ERROR") ||
        pageTitle.includes("ERROR")) {
        blockedPages++;
        log.warning(` CloudFront WAF blocked this page (${blockedPages} total blocked)`);
        if (blockedPages >= 3 && totalProducts === 0) {
          log.warning(` Multiple blocks detected — likely IP-level block. Consider using PROXY_URL.`);
        }
        return;
      }

      // Extract products via page.evaluate. The callback runs inside the
      // browser, so it must not reference anything from this module.
      const productData = await page.evaluate(() => {
        const results: Array<{
          name: string;
          href: string;
          price: string;
          stock: string;
          partNumber: string;
        }> = [];

        // Strategy 1: Product card links
        const productLinks = document.querySelectorAll(
          'a[href*="/products/"], .product-card a, .product-item a, [class*="product"] a[href], .product-list a, .category-products a, [data-product] a'
        );
        for (const link of productLinks) {
          const el = link as HTMLAnchorElement;
          const name = el.textContent?.trim() || "";
          const href = el.getAttribute("href") || "";
          if (!name || name.length < 5 || name.length > 200 || !href) continue;
          if (/category|filter|sort|breadcrumb|login|cart|account/i.test(href) && !/products\//i.test(href)) continue;
          // Nearest ancestor that plausibly wraps the whole product card.
          const container =
            el.closest('[class*="product"]') ||
            el.closest('[class*="item"]') ||
            el.closest('[class*="card"]') ||
            el.closest("li") ||
            el.parentElement?.parentElement?.parentElement;
          let price = "";
          let stock = "";
          let pn = "";
          if (container) {
            const priceEl = container.querySelector(
              '[class*="price"], [class*="Price"], [data-price], .price'
            );
            price = priceEl?.textContent?.trim() || "";
            if (!price) {
              const containerText = container.textContent || "";
              const priceMatch = containerText.match(/\$\s*[\d,]+\.?\d{0,2}/);
              if (priceMatch) price = priceMatch[0];
            }
            const stockEl = container.querySelector(
              '[class*="stock"], [class*="Stock"], [class*="avail"], [class*="Avail"]'
            );
            stock = stockEl?.textContent?.trim() || "";
            const skuEl = container.querySelector(
              '[class*="sku"], [class*="SKU"], [class*="part"], [class*="Part"], [class*="model"]'
            );
            pn = skuEl?.textContent?.trim() || "";
          }
          if (!pn) {
            // Fall back to the last URL segment as the part number.
            pn = href.split("/").pop()?.replace(/\.html?$/, "")?.replace(/#.*$/, "") || "";
          }
          if (name && href.includes("/products/")) {
            results.push({ name, href, price, stock, partNumber: pn });
          }
        }

        // Strategy 2: Scan deeper for anchors with product URLs
        if (results.length === 0) {
          const allAnchors = document.querySelectorAll("a[href*='/products/']");
          for (const el of allAnchors) {
            const anchor = el as HTMLAnchorElement;
            const href = anchor.getAttribute("href") || "";
            const name = anchor.textContent?.trim() || "";
            if (!name || name.length < 5) continue;
            // Walk up to four ancestors looking for a "$…" price in their text.
            let parent: Element | null = anchor;
            let price = "";
            for (let i = 0; i < 4 && parent; i++) {
              parent = parent.parentElement;
              if (parent) {
                const text = parent.textContent || "";
                const m = text.match(/\$\s*[\d,]+\.?\d{0,2}/);
                if (m) { price = m[0]; break; }
              }
            }
            const pn = href.split("/").pop()?.replace(/\.html?$/, "") || "";
            results.push({ name, href, price, stock: "", partNumber: pn });
          }
        }

        // Strategy 3: JSON-LD structured data
        const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
        for (const script of ldScripts) {
          try {
            const data = JSON.parse(script.textContent || "");
            const items = data.itemListElement || (Array.isArray(data) ? data : [data]);
            for (const item of items) {
              if (item["@type"] === "Product" || item.offers) {
                const name = item.name || "";
                const href = item.url || "";
                const offers = item.offers || {};
                const price = offers.price ? `$${offers.price}` : "";
                const stock = offers.availability || "";
                const pn = item.sku || item.mpn || href.split("/").pop() || "";
                if (name) results.push({ name, href, price, stock, partNumber: pn });
              }
            }
          } catch { /* ignore parse errors */ }
        }
        return results;
      });
      log.info(` Raw items extracted: ${productData.length}`);

      // Process extracted products
      const pageProducts: Product[] = [];
      // Not-yet-stored products of THIS page, one per dedupe key. The same
      // product is usually reachable through several anchors and again via
      // JSON-LD; only one copy should reach the database.
      const freshByKey = new Map<string, Product>();
      for (const item of productData) {
        if (!item.name) continue;
        const partNumber = (item.partNumber || item.name).slice(0, 80).trim();
        const name = item.name.slice(0, 200).trim();
        const url = item.href.startsWith("http") ? item.href : `${BASE}${item.href}`;

        let price: number | undefined;
        if (item.price) {
          // Keep digits and the decimal point only: strips the currency
          // symbol and EVERY thousands separator ("$1,234,567.00").
          const parsed = parseFloat(item.price.replace(/[^\d.]/g, ""));
          if (parsed > 0 && parsed < 100000) price = parsed;
        }

        const combined = name + " " + partNumber;
        const reach = detectReach(combined);
        const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);
        const product: Product = {
          partNumber, name, url, price,
          stockStatus: item.stock || undefined,
          formFactor, speed, speedGbps,
          reachLabel: reach?.label,
          reachMeters: reach?.meters,
          fiberType: detectFiber(combined),
          wavelength: detectWavelength(combined),
        };
        pageProducts.push(product);

        // Items without any href would all share BASE as URL; key those by
        // part number so they do not shadow each other.
        const dedupeKey = item.href ? url : `${BASE}#${partNumber}`;
        if (seenUrls.has(dedupeKey)) continue;
        const earlier = freshByKey.get(dedupeKey);
        // Keep the first copy, unless a later copy is the only one with a price.
        if (!earlier || (earlier.price === undefined && price !== undefined)) {
          freshByKey.set(dedupeKey, product);
        }
      }

      // Deduplicate against global set
      const newProducts = [...freshByKey.values()];
      for (const key of freshByKey.keys()) seenUrls.add(key);
      log.info(` Parsed: ${pageProducts.length} found, ${newProducts.length} new`);

      // Write to database
      for (const product of newProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });
          if (product.price && product.price > 0) {
            const hash = contentHash({
              price: product.price,
              part: product.partNumber,
              stock: product.stockStatus ?? "",
            });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: normalizeStockLevel(product.stockStatus),
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          // A thrown value is not guaranteed to be an Error instance.
          const message = err instanceof Error ? err.message : String(err);
          log.warning(` DB error [${product.partNumber}]: ${message.slice(0, 80)}`);
        }
      }

      // Check for next page
      const hasNext = await page.evaluate((currentPageNum: number) => {
        const nextLink = document.querySelector('a[rel="next"], link[rel="next"]');
        if (nextLink) return true;
        const nextNum = currentPageNum + 1;
        const paginationLinks = document.querySelectorAll('a[href*="page="], .pagination a, nav a');
        for (const link of paginationLinks) {
          const href = (link as HTMLAnchorElement).getAttribute("href") || "";
          if (href.includes(`page=${nextNum}`)) return true;
          const text = link.textContent?.trim() || "";
          if (text === String(nextNum) || text.toLowerCase() === "next" || text === "\u203a" || text === "\u00bb") return true;
        }
        return false;
      }, currentPage);

      // Stop paginating as soon as a page contributes nothing new.
      if (hasNext && currentPage < MAX_PAGES && newProducts.length > 0) {
        const nextPageNum = currentPage + 1;
        const nextUrl = `${BASE}${catPath}?page=${nextPageNum}`;
        urlToCat.set(nextUrl, cat);
        await requestQueue.addRequest({
          url: nextUrl,
          userData: { page: nextPageNum, catPath },
        });
        log.info(` Enqueued next page: ${nextPageNum}`);
      }
    },
    async failedRequestHandler({ request, log }) {
      log.error(`Request failed after retries: ${request.url}`);
    },
  });

  await crawler.run();

  console.log(`\n=== ProLabs Complete ===`);
  console.log(` Products processed: ${totalProducts}`);
  console.log(` Price updates: ${priceUpdates}`);
  console.log(` Pages blocked by WAF: ${blockedPages}`);
  if (blockedPages > 0 && totalProducts === 0) {
    console.log(`\n All pages blocked by CloudFront WAF (datacenter IP detected).`);
    console.log(` Fix: Set PROXY_URL=http://user:pass@proxy:port in .env`);
  }
}
// CLI entry point: when this file is executed directly (not imported), run
// the scraper once and close the shared DB pool afterwards. Any failure is
// reported and turned into exit code 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeProLabs();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}