chore: sync local changes

This commit is contained in:
Rene Fichtmueller 2026-03-31 07:32:02 +02:00
parent fa2d88096f
commit 8757fc8bf0
6 changed files with 521 additions and 358 deletions

View File

@ -61,9 +61,6 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
"scrape:pricing:10gtek", "scrape:pricing:10gtek",
"scrape:pricing:atgbics", "scrape:pricing:atgbics",
"scrape:pricing:prolabs", "scrape:pricing:prolabs",
"scrape:pricing:naddod",
"scrape:pricing:qsfptek",
"scrape:pricing:addon",
"scrape:compat:cisco", "scrape:compat:cisco",
"scrape:pricing:flexoptix", "scrape:pricing:flexoptix",
"scrape:vendors:flexoptix", "scrape:vendors:flexoptix",
@ -117,30 +114,12 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
expireInSeconds: 3600, expireInSeconds: 3600,
}); });
// ProLabs pricing (every 8 hours — server-rendered HTML, USD prices) // ProLabs pricing (every 8 hours — Playwright, needs proxy for CloudFront)
await boss.schedule("scrape:pricing:prolabs", "0 4/8 * * *", {}, { await boss.schedule("scrape:pricing:prolabs", "0 4/8 * * *", {}, {
retryLimit: 2, retryLimit: 2,
expireInSeconds: 3600, expireInSeconds: 3600,
}); });
// NADDOD pricing (every 8 hours — WooCommerce, USD prices)
await boss.schedule("scrape:pricing:naddod", "0 5/8 * * *", {}, {
retryLimit: 2,
expireInSeconds: 3600,
});
// QSFPTEK pricing (every 10 hours — custom PHP shop, USD prices)
await boss.schedule("scrape:pricing:qsfptek", "0 3/10 * * *", {}, {
retryLimit: 2,
expireInSeconds: 3600,
});
// AddOn Networks pricing (every 12 hours — enterprise site, USD prices)
await boss.schedule("scrape:pricing:addon", "0 6/12 * * *", {}, {
retryLimit: 2,
expireInSeconds: 3600,
});
// Flexoptix catalog (every 6 hours — fetch-based, fast) // Flexoptix catalog (every 6 hours — fetch-based, fast)
await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, { await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, {
retryLimit: 2, retryLimit: 2,
@ -173,9 +152,6 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
const { scrapeNews } = await import("./scrapers/news"); const { scrapeNews } = await import("./scrapers/news");
const { scrapeAtgbics } = await import("./scrapers/atgbics"); const { scrapeAtgbics } = await import("./scrapers/atgbics");
const { scrapeProLabs } = await import("./scrapers/prolabs"); const { scrapeProLabs } = await import("./scrapers/prolabs");
const { scrapeNaddod } = await import("./scrapers/naddod");
const { scrapeQsfptek } = await import("./scrapers/qsfptek");
const { scrapeAddonNetworks } = await import("./scrapers/addon-networks");
await boss.work("scrape:pricing:fs", async (_job) => { await boss.work("scrape:pricing:fs", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`); console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
@ -222,21 +198,6 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await withIsolatedStorage("prolabs", scrapeProLabs); await withIsolatedStorage("prolabs", scrapeProLabs);
}); });
await boss.work("scrape:pricing:naddod", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: NADDOD pricing`);
await scrapeNaddod();
});
await boss.work("scrape:pricing:qsfptek", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: QSFPTEK pricing`);
await scrapeQsfptek();
});
await boss.work("scrape:pricing:addon", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: AddOn Networks pricing`);
await scrapeAddonNetworks();
});
await boss.work("scrape:faq", async (_job) => { await boss.work("scrape:faq", async (_job) => {
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`); console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
}); });

View File

@ -1,27 +1,101 @@
/** /**
* Cisco TMG Matrix Scraper Transceiver Compatibility * Cisco TMG Matrix Scraper Transceiver Compatibility
* *
* Source: tmgmatrix.cisco.com * Source: tmgmatrix.cisco.com (JSON API no auth required)
* Extracts: Switch model Transceiver compatibility data * Extracts: Switch model Transceiver compatibility data
* Stores: switches, compatibility table * Stores: switches, compatibility table
* *
* The TMG Matrix has a JSON API behind the scenes. * Uses POST /public/api/networkdevice/search endpoint directly.
*/ */
import { CheerioCrawler } from "crawlee";
import { pool, ensureVendor } from "../utils/db"; import { pool, ensureVendor } from "../utils/db";
const TMG_BASE = "https://tmgmatrix.cisco.com"; const TMG_API = "https://tmgmatrix.cisco.com/public/api/networkdevice/search";
interface TmgEntry { interface TmgTransceiver {
switchModel: string; tmgId: number;
switchSeries: string; productId: string;
transceiverPid: string; productFamily: string;
transceiverDescription: string; formFactor: string;
speed: string;
reach: string; reach: string;
temperatureRange: string;
cableType: string; cableType: string;
connector: string; media: string;
minSoftware: string; connectorType: string;
transmissionStandard: string;
dataRate: string;
endOfSale: string;
softReleaseMinVer: string;
breakoutMode: string;
osType: string;
domSupport: string;
type: string;
}
/** One network-device PID together with the transceiver records the TMG API lists for it. */
interface TmgCompatEntry {
productId: string; // switch PID
// Transceiver records reported for this device PID.
transceivers: TmgTransceiver[];
}
/** A device group in the search response: a product family plus its per-PID compatibility entries. */
interface TmgDevice {
// Family name of the group; passed as the `series` argument when switches are upserted.
productFamily: string;
// One entry per device PID in this family, each with its own transceiver list.
networkAndTransceiverCompatibility: TmgCompatEntry[];
}
/** Shape of the JSON body this scraper expects back from the TMG search endpoint. */
interface TmgSearchResponse {
// Logged by the scraper as "total entries"; exact counting semantics are defined by the API — TODO confirm.
totalCount: number;
// Filter facets returned alongside the results; not read anywhere in the visible code.
filters: Array<{ name: string; values: Array<{ id: number; name: string; count: number }> }>;
// Device groups; each carries the per-switch compatibility lists.
networkDevices: TmgDevice[];
}
/**
 * Platform families queried by the scraper, one search request per entry.
 * Covers Nexus, Catalyst and ASR 9000 families; `id` is the family ID used by
 * the TMG API, and each object is sent as-is in the `networkDeviceProductFamily`
 * filter of the search request body.
 * NOTE(review): the trailing entry counts are presumably a snapshot taken when
 * this list was written and will drift as Cisco updates the matrix.
 */
const PLATFORM_FAMILIES = [
{ id: 74, name: "N9300" }, // Nexus 9300 — 8,515 entries
{ id: 77, name: "N9500" }, // Nexus 9500 — 2,266 entries
{ id: 78, name: "N9200" }, // Nexus 9200 — 708 entries
{ id: 661, name: "N9800" }, // Nexus 9800 — 238 entries
{ id: 76, name: "C9300" }, // Catalyst 9300 — 260 entries
{ id: 601, name: "C9300L" }, // Catalyst 9300L — 720 entries
{ id: 1181, name: "C9300X" }, // Catalyst 9300X — 413 entries
{ id: 8, name: "C9500" }, // Catalyst 9500 — 1,141 entries
{ id: 521, name: "C9600" }, // Catalyst 9600 — 771 entries
{ id: 7, name: "C9400" }, // Catalyst 9400 — 561 entries
{ id: 341, name: "C9200" }, // Catalyst 9200 — 222 entries
{ id: 83, name: "ASR9000" }, // ASR 9000 — 3,644 entries
];
/**
 * Query the Cisco TMG Matrix search API for a single network-device product family.
 *
 * Sends a POST to `TMG_API` with every filter left empty except
 * `networkDeviceProductFamily`, which carries the given family.
 *
 * @param familyFilter - Family `{ id, name }` pair, forwarded verbatim in the request body.
 * @param timeoutMs - Abort the request after this many milliseconds (default 30000),
 *   so a stalled connection cannot hang the whole scrape run.
 * @returns The parsed search response.
 * @throws Error when the API answers with a non-2xx status, when the request
 *   times out or fails at the network level (rejection from `fetch`), or when
 *   the body is not an object containing a `networkDevices` array.
 */
async function searchTmg(
  familyFilter: { id: number; name: string },
  timeoutMs = 30000
): Promise<TmgSearchResponse> {
  const body = {
    cableType: [],
    dataRate: [],
    formFactor: [],
    reach: [],
    searchInput: [""],
    osType: [],
    transceiverProductFamily: [],
    transceiverProductID: [],
    networkDeviceProductFamily: [familyFilter],
    networkDeviceProductID: [],
    media: [],
    connectorType: [],
    caseTemperature: [],
    performanceMonitoring: [],
  };

  const res = await fetch(TMG_API, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      "Accept": "application/json",
    },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(timeoutMs),
  });

  if (!res.ok) {
    throw new Error(`TMG API ${res.status}: ${res.statusText}`);
  }

  // Callers dereference `networkDevices` immediately, so verify the one field
  // they rely on here and fail with a descriptive message instead of a bare
  // TypeError further down the call chain.
  const payload: unknown = await res.json();
  if (
    typeof payload !== "object" ||
    payload === null ||
    !Array.isArray((payload as { networkDevices?: unknown }).networkDevices)
  ) {
    throw new Error(
      `TMG API returned an unexpected payload for family ${familyFilter.name} (missing networkDevices array)`
    );
  }

  return payload as TmgSearchResponse;
}
async function upsertCiscoSwitch(vendorId: string, model: string, series: string): Promise<string> { async function upsertCiscoSwitch(vendorId: string, model: string, series: string): Promise<string> {
@ -38,18 +112,31 @@ async function upsertCiscoSwitch(vendorId: string, model: string, series: string
async function upsertCompatibility( async function upsertCompatibility(
switchId: string, switchId: string,
transceiverId: string, transceiverId: string,
firmwareMin: string firmwareMin: string,
formFactor: string,
reach: string,
cableType: string,
media: string,
dataRate: string
): Promise<void> { ): Promise<void> {
await pool.query( await pool.query(
`INSERT INTO compatibility (switch_id, transceiver_id, verified_by, verification_method, status, firmware_min, source_url) `INSERT INTO compatibility (switch_id, transceiver_id, verified_by, verification_method, status, firmware_min, source_url, notes)
VALUES ($1, $2, 'Cisco TMG Matrix', 'vendor_matrix', 'compatible', $3, $4) VALUES ($1, $2, 'Cisco TMG Matrix', 'vendor_matrix', 'compatible', $3, $4, $5)
ON CONFLICT (switch_id, transceiver_id) DO UPDATE SET firmware_min = EXCLUDED.firmware_min`, ON CONFLICT (switch_id, transceiver_id) DO UPDATE SET
[switchId, transceiverId, firmwareMin || null, TMG_BASE] firmware_min = EXCLUDED.firmware_min,
notes = EXCLUDED.notes`,
[
switchId,
transceiverId,
firmwareMin || null,
"https://tmgmatrix.cisco.com",
`${formFactor} ${dataRate} ${reach} ${media} ${cableType}`.trim(),
]
); );
} }
export async function scrapeCiscoTmg(): Promise<void> { export async function scrapeCiscoTmg(): Promise<void> {
console.log("=== Cisco TMG Matrix Scraper Starting ===\n"); console.log("=== Cisco TMG Matrix Scraper Starting (API mode) ===\n");
const ciscoVendorId = await ensureVendor( const ciscoVendorId = await ensureVendor(
"Cisco", "Cisco",
@ -58,90 +145,69 @@ export async function scrapeCiscoTmg(): Promise<void> {
undefined undefined
); );
const entries: TmgEntry[] = []; let totalSwitches = 0;
let totalCompat = 0;
let totalTransceivers = 0;
// TMG Matrix uses a search API for (const family of PLATFORM_FAMILIES) {
// First, try the public HTML interface console.log(`\nFetching ${family.name}...`);
const crawler = new CheerioCrawler({
maxConcurrency: 1,
maxRequestsPerMinute: 10, // Very respectful — Cisco rate limits aggressively
async requestHandler({ request, $, log }) {
log.info(`Scraping: ${request.url}`);
// The TMG Matrix renders a table with compatibility data
$("table tbody tr, .matrix-row, [class*='result-row']").each((_i, el) => {
const $row = $(el);
const cells = $row.find("td").map((_j, td) => $(td).text().trim()).get();
if (cells.length >= 4) {
entries.push({
switchModel: cells[0] || "",
switchSeries: cells[0]?.split(" ")[0] || "Nexus",
transceiverPid: cells[1] || "",
transceiverDescription: cells[2] || "",
speed: cells[3] || "",
reach: cells[4] || "",
cableType: cells[5] || "",
connector: cells[6] || "",
minSoftware: cells[7] || "",
});
}
});
},
});
// Start with Nexus switches (most relevant for Flexoptix)
await crawler.run([
`${TMG_BASE}/public/tmg?searchValue=Nexus+9000`,
`${TMG_BASE}/public/tmg?searchValue=Nexus+3000`,
`${TMG_BASE}/public/tmg?searchValue=Nexus+7000`,
`${TMG_BASE}/public/tmg?searchValue=Catalyst+9000`,
]);
console.log(`\nEntries found: ${entries.length}`);
// Write to database
let switches = 0;
let compat = 0;
for (const entry of entries) {
if (!entry.switchModel || !entry.transceiverPid) continue;
try { try {
const switchId = await upsertCiscoSwitch( const data = await searchTmg(family);
ciscoVendorId, console.log(` ${family.name}: ${data.totalCount} total entries, ${data.networkDevices.length} device groups`);
entry.switchModel,
entry.switchSeries
);
switches++;
// Try to match transceiver in our DB for (const device of data.networkDevices) {
const txResult = await pool.query( for (const compat of device.networkAndTransceiverCompatibility) {
`SELECT id FROM transceivers if (!compat.productId) continue;
WHERE part_number = $1
OR slug LIKE $2
OR standard_name ILIKE $3
LIMIT 1`,
[
entry.transceiverPid,
`%${entry.transceiverPid.toLowerCase().replace(/[^a-z0-9]/g, "")}%`,
`%${entry.speed}%${entry.reach}%`,
]
);
if (txResult.rows.length > 0) { const switchId = await upsertCiscoSwitch(
await upsertCompatibility(switchId, txResult.rows[0].id, entry.minSoftware); ciscoVendorId,
compat++; compat.productId,
device.productFamily
);
totalSwitches++;
for (const tx of compat.transceivers) {
if (!tx.productId) continue;
totalTransceivers++;
// Try to match transceiver in our DB by Cisco PID
const txResult = await pool.query(
`SELECT id FROM transceivers
WHERE part_number = $1
OR part_number = $2
LIMIT 1`,
[tx.productId, tx.productId.replace(/-S$/, "")]
);
if (txResult.rows.length > 0) {
await upsertCompatibility(
switchId,
txResult.rows[0].id,
tx.softReleaseMinVer,
tx.formFactor,
tx.reach,
tx.cableType,
tx.media,
tx.dataRate
);
totalCompat++;
}
}
}
} }
// Rate limit: 2 seconds between platform families
await new Promise((r) => setTimeout(r, 2000));
} catch (err) { } catch (err) {
// Skip duplicates silently console.error(` Error fetching ${family.name}:`, err);
} }
} }
console.log(`Switches upserted: ${switches}`); console.log(`\n=== Cisco TMG Scraper Complete ===`);
console.log(`Compatibility entries: ${compat}`); console.log(` Switches upserted: ${totalSwitches}`);
console.log("=== Cisco TMG Scraper Complete ===\n"); console.log(` Transceiver entries scanned: ${totalTransceivers}`);
console.log(` Compatibility matches: ${totalCompat}\n`);
} }
if (require.main === module) { if (require.main === module) {

View File

@ -1,7 +1,7 @@
/** /**
* Fluxlight Scraper US-based compatible transceiver vendor * Fluxlight Scraper US-based compatible transceiver vendor
* *
* fluxlight.com BigCommerce, server-rendered HTML with real prices. * www.fluxlight.com BigCommerce, server-rendered HTML with real prices.
* ~144+ products across 6 pages. Uses pagination via ?page=N. * ~144+ products across 6 pages. Uses pagination via ?page=N.
* *
* Rate limited: 1 req/2sec. * Rate limited: 1 req/2sec.
@ -91,8 +91,8 @@ function parseProductList(html: string): Product[] {
const products: Product[] = []; const products: Product[] = [];
// BigCommerce product card pattern: product link + price // BigCommerce product card pattern: product link + price
// Pattern: <a href="https://fluxlight.com/PARTNUM-FL/">Product Name</a> ... $29.99 // Pattern: <a href="https://www.fluxlight.com/PARTNUM-FL/">Product Name</a> ... $29.99
const productRegex = /href="(https?:\/\/fluxlight\.com\/[^"]*-FL\/)"[^>]*>\s*([^<]{10,})<\/a>/gi; const productRegex = /href="(https?:\/\/(?:www\.)?fluxlight\.com\/[^"]*-FL\/)"[^>]*>\s*([^<]{10,})<\/a>/gi;
let match; let match;
while ((match = productRegex.exec(html)) !== null) { while ((match = productRegex.exec(html)) !== null) {
const url = match[1]; const url = match[1];
@ -123,7 +123,7 @@ function parseProductList(html: string): Product[] {
// Fallback: broader link pattern // Fallback: broader link pattern
if (products.length === 0) { if (products.length === 0) {
const simpleRegex = /href="(https?:\/\/fluxlight\.com\/[^"]+)"[^>]*>([^<]{10,}(?:SFP|QSFP|XFP|Base)[^<]*)<\/a>/gi; const simpleRegex = /href="(https?:\/\/(?:www\.)?fluxlight\.com\/[^"]+)"[^>]*>([^<]{10,}(?:SFP|QSFP|XFP|Base)[^<]*)<\/a>/gi;
while ((match = simpleRegex.exec(html)) !== null) { while ((match = simpleRegex.exec(html)) !== null) {
const url = match[1]; const url = match[1];
const name = match[2].trim(); const name = match[2].trim();
@ -166,7 +166,7 @@ async function fetchPage(url: string): Promise<string> {
export async function scrapeFluxlight(): Promise<void> { export async function scrapeFluxlight(): Promise<void> {
console.log("=== Fluxlight Scraper Starting ===\n"); console.log("=== Fluxlight Scraper Starting ===\n");
const vendorId = await ensureVendor("Fluxlight", "compatible", "https://fluxlight.com", "https://fluxlight.com/transceivers/"); const vendorId = await ensureVendor("Fluxlight", "compatible", "https://fluxlight.com", "https://www.fluxlight.com/transceivers/");
let allProducts: Product[] = []; let allProducts: Product[] = [];
@ -210,7 +210,7 @@ export async function scrapeFluxlight(): Promise<void> {
}); });
if (product.price && product.price > 0) { if (product.price && product.price > 0) {
const hash = contentHash({ price: product.price, part: product.partNumber }); const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
const updated = await upsertPriceObservation({ const updated = await upsertPriceObservation({
transceiverId: txId, sourceVendorId: vendorId, transceiverId: txId, sourceVendorId: vendorId,
price: product.price, currency: "USD", price: product.price, currency: "USD",

View File

@ -8,7 +8,7 @@
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash"; import { contentHash } from "../utils/hash";
const BASE = "https://gbics.com"; const BASE = "https://www.gbics.com";
const HEADERS = { const HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)", "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
Accept: "text/html,application/xhtml+xml", Accept: "text/html,application/xhtml+xml",
@ -100,7 +100,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
// BigCommerce card-title pattern: // BigCommerce card-title pattern:
// <a aria-label="Product Name, £XX.XX" href="URL" data-event-type="product-click"> // <a aria-label="Product Name, £XX.XX" href="URL" data-event-type="product-click">
const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/gbics\.com\/[^"]+)"\s+data-event-type="product-click"/gi; const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*data-event-type="product-click"/gi;
let match; let match;
while ((match = productRegex.exec(collapsed)) !== null) { while ((match = productRegex.exec(collapsed)) !== null) {
const label = match[1].trim(); const label = match[1].trim();
@ -110,7 +110,14 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
// Split on last comma to separate name and price // Split on last comma to separate name and price
const priceInLabel = label.match(/,\s*£\s*([\d,.]+)\s*$/); const priceInLabel = label.match(/,\s*£\s*([\d,.]+)\s*$/);
const name = priceInLabel ? label.slice(0, label.lastIndexOf(",")).trim() : label; const name = priceInLabel ? label.slice(0, label.lastIndexOf(",")).trim() : label;
const price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined; let price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined;
// Fallback: extract price from data-price-asc attribute on parent <li>
if (!price) {
const priceContext = collapsed.slice(Math.max(0, match.index - 500), match.index);
const dataPriceMatch = priceContext.match(/data-price-asc="(\d+)"/);
if (dataPriceMatch) price = parseFloat(dataPriceMatch[1]);
}
if (name.length < 10) continue; if (name.length < 10) continue;
@ -131,7 +138,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
// Fallback: try "Now: £XX.XX" pattern near product links // Fallback: try "Now: £XX.XX" pattern near product links
if (products.length === 0) { if (products.length === 0) {
const altRegex = /href="(https?:\/\/gbics\.com\/[^"]+)"[^>]*>\s*([^<]{15,})<\/a>/gi; const altRegex = /href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*>\s*([^<]{15,})<\/a>/gi;
while ((match = altRegex.exec(collapsed)) !== null) { while ((match = altRegex.exec(collapsed)) !== null) {
const url = match[1]; const url = match[1];
const name = match[2].trim(); const name = match[2].trim();
@ -172,7 +179,7 @@ async function fetchPage(url: string): Promise<string> {
export async function scrapeGbics(): Promise<void> { export async function scrapeGbics(): Promise<void> {
console.log("=== GBICS.com Scraper Starting ===\n"); console.log("=== GBICS.com Scraper Starting ===\n");
const vendorId = await ensureVendor("GBICS", "compatible", "https://gbics.com", "https://gbics.com/optical-transceivers/"); const vendorId = await ensureVendor("GBICS", "compatible", "https://www.gbics.com", "https://www.gbics.com/optical-transceivers/");
let totalProducts = 0; let totalProducts = 0;
let priceUpdates = 0; let priceUpdates = 0;
@ -196,7 +203,7 @@ export async function scrapeGbics(): Promise<void> {
}); });
if (product.price && product.price > 0) { if (product.price && product.price > 0) {
const hash = contentHash({ price: product.price, part: product.partNumber }); const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
const updated = await upsertPriceObservation({ const updated = await upsertPriceObservation({
transceiverId: txId, sourceVendorId: vendorId, transceiverId: txId, sourceVendorId: vendorId,
price: product.price, currency: "GBP", price: product.price, currency: "GBP",

View File

@ -38,19 +38,14 @@ interface NewsArticle {
const FEEDS: RssFeed[] = [ const FEEDS: RssFeed[] = [
// === PRIMARY: Transceiver-specific === // === PRIMARY: Transceiver-specific ===
{ {
name: "Lightwave Online", name: "The Next Platform",
url: "https://www.lightwaveonline.com/rss", url: "https://www.nextplatform.com/feed/",
category: "market_report", category: "market_report",
}, },
{ {
name: "Lightwave - Fiber Optics", name: "ServeTheHome",
url: "https://www.lightwaveonline.com/fttx/rss", url: "https://www.servethehome.com/feed/",
category: "market_report", category: "product_launch",
},
{
name: "Fierce Telecom",
url: "https://www.fiercetelecom.com/rss/xml",
category: "market_report",
}, },
{ {
name: "Optics.org", name: "Optics.org",
@ -69,8 +64,8 @@ const FEEDS: RssFeed[] = [
category: "market_report", category: "market_report",
}, },
{ {
name: "SDxCentral", name: "The Register - Data Centre",
url: "https://www.sdxcentral.com/feed/", url: "https://www.theregister.com/data_centre/headlines.atom",
category: "market_report", category: "market_report",
}, },
// === TERTIARY: General tech / photonics === // === TERTIARY: General tech / photonics ===

View File

@ -1,22 +1,29 @@
/** /**
* ProLabs Scraper Enterprise-grade compatible optics (Legrand subsidiary) * ProLabs Scraper Enterprise-grade compatible optics (Legrand subsidiary)
* *
* prolabs.com Server-rendered HTML with public USD pricing. * prolabs.com CloudFront WAF aggressively blocks datacenter IPs.
* Uses PlaywrightCrawler with Firefox for anti-detection.
*
* KNOWN ISSUE: CloudFront blocks all requests from IONOS/datacenter IPs
* (HTTP 403 "Request blocked"). This scraper works correctly from
* residential IPs. Solutions:
* 1. Set PROXY_URL env var to a residential/rotating proxy
* 2. Run from a residential IP (e.g. home server)
* 3. Route through WireGuard with internet breakout at home
*
* Products listed under /products/networking/fiber-optics/ category pages. * Products listed under /products/networking/fiber-optics/ category pages.
* Pagination via ?page=N. Rate limited: 1 req/2sec. Max 100 pages. * Pagination via ?page=N. Rate limited: maxConcurrency 1, 10 req/min.
* *
* SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR" * SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR"
*/ */
import { PlaywrightCrawler, RequestQueue } from "crawlee";
import { firefox } from "playwright";
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
import { contentHash } from "../utils/hash"; import { contentHash } from "../utils/hash";
const BASE = "https://www.prolabs.com"; const BASE = "https://www.prolabs.com";
const HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
Accept: "text/html,application/xhtml+xml",
};
const MAX_PAGES = 100; const MAX_PAGES = 100;
const PROXY_URL = process.env.PROXY_URL || "";
const CATEGORIES = [ const CATEGORIES = [
{ path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 }, { path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 },
@ -26,7 +33,6 @@ const CATEGORIES = [
{ path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, { path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
{ path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, { path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
{ path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, { path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
// Broad fallback category in case above paths differ on the live site
{ path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, { path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
]; ];
@ -45,9 +51,9 @@ interface Product {
wavelength?: string; wavelength?: string;
} }
function sleep(ms: number): Promise<void> { /* ------------------------------------------------------------------ */
return new Promise((resolve) => setTimeout(resolve, ms)); /* Helper / detection functions (unchanged from original) */
} /* ------------------------------------------------------------------ */
function detectReach(text: string): { label: string; meters: number } | undefined { function detectReach(text: string): { label: string; meters: number } | undefined {
const patterns: [RegExp, string, number][] = [ const patterns: [RegExp, string, number][] = [
@ -90,18 +96,6 @@ function detectWavelength(text: string): string {
return match ? match[1] : ""; return match ? match[1] : "";
} }
/**
* Infer form factor and speed from ProLabs SKU prefixes when category context
* is not specific enough (e.g. when crawling the broad fallback category).
*
* ProLabs SKU prefix conventions:
* Q- -> QSFP+ 40G
* Q28- -> QSFP28 100G
* QDD- -> QSFP-DD 400G
* SFP28- -> SFP28 25G
* SFP- -> SFP+ 10G (most common ProLabs prefix)
* S- -> SFP 1G
*/
function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): { function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
formFactor: string; formFactor: string;
speed: string; speed: string;
@ -116,121 +110,6 @@ function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps }; return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps };
} }
/**
* Parse product listings from a ProLabs category page.
*
* ProLabs uses a standard e-commerce layout:
* - Product cards with an <a> link containing the product URL and name
* - Price in a span with "price" in class or as "$XX.XX" text nearby
* - SKU / part number in the URL slug
* - Stock badge: "In Stock" / "Out of Stock" / "Call for Availability"
*
* We parse with lightweight regex on collapsed HTML same approach as gbics.ts
* and sfpcables.ts (no DOM parser dependency).
*/
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
const products: Product[] = [];
const collapsed = html.replace(/\s+/g, " ");
// Strategy 1: product cards with structured href containing a SKU-like segment
// Match anchor tags whose href is a deep product path ending in a SKU pattern
const productLinkRegex = /href="(\/products\/[^"]*?\/([A-Z0-9][A-Z0-9\-_]{3,}(?:-PR)?))"\s[^>]*>([^<]{10,})<\/a>/gi;
let match: RegExpExecArray | null;
while ((match = productLinkRegex.exec(collapsed)) !== null) {
const relUrl = match[1];
const skuFromUrl = match[2];
const linkText = match[3].trim();
// Skip navigation / filter / pagination links
if (/category|filter|sort|page|breadcrumb/i.test(relUrl)) continue;
if (linkText.length > 200) continue;
const url = BASE + relUrl;
const partNumber = skuFromUrl.slice(0, 80);
const name = linkText.length > 10 ? linkText : partNumber;
// Look for price in a 700-char window after the match position
const context = collapsed.slice(Math.max(0, match.index - 100), match.index + 700);
const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/) ||
context.match(/price[^>]*>\s*\$?\s*([\d,]+\.?\d{0,2})/i);
const price = priceMatch ? parseFloat(priceMatch[1].replace(",", "")) : undefined;
const stockMatch = context.match(/(in[\s-]stock|out[\s-]of[\s-]stock|call for availability|available|backordered)/i);
const stockStatus = stockMatch ? stockMatch[1].toLowerCase() : undefined;
const combined = name + " " + partNumber;
const reach = detectReach(combined);
const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);
products.push({
partNumber, name, url,
price: price && price > 0 && price < 100000 ? price : undefined,
stockStatus,
formFactor, speed, speedGbps,
reachLabel: reach?.label,
reachMeters: reach?.meters,
fiberType: detectFiber(combined),
wavelength: detectWavelength(combined),
});
}
// Strategy 2: Fallback — any link to a /products/ URL that has a $ price nearby
if (products.length === 0) {
const altRegex = /href="(\/products\/[^"]{10,})"/gi;
while ((match = altRegex.exec(collapsed)) !== null) {
const relUrl = match[1];
if (/category|filter|sort|page|breadcrumb/i.test(relUrl)) continue;
const context = collapsed.slice(Math.max(0, match.index - 50), match.index + 800);
const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/);
if (!priceMatch) continue;
const price = parseFloat(priceMatch[1].replace(",", ""));
const nameMatch = context.match(/<(?:h[23]|strong|span)[^>]*>([^<]{10,150})<\//i);
const name = nameMatch ? nameMatch[1].trim() : relUrl.split("/").pop() || "";
const partNumber = (relUrl.split("/").pop() ?? name).slice(0, 80);
const url = BASE + relUrl;
const combined = name + " " + partNumber;
const reach = detectReach(combined);
const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);
products.push({
partNumber, name, url,
price: price > 0 && price < 100000 ? price : undefined,
formFactor, speed, speedGbps,
reachLabel: reach?.label,
reachMeters: reach?.meters,
fiberType: detectFiber(combined),
wavelength: detectWavelength(combined),
});
}
}
// Deduplicate by URL
const seen = new Set<string>();
return products.filter((p) => {
if (seen.has(p.url)) return false;
seen.add(p.url);
return true;
});
}
/** Check if the HTML contains a link to the next pagination page. */
function hasNextPage(html: string, currentPage: number): boolean {
if (/rel="next"/i.test(html)) return true;
const nextPageNum = currentPage + 1;
const pattern = new RegExp(`[?&]page=${nextPageNum}`, "i");
return pattern.test(html);
}
async function fetchPage(url: string): Promise<string> {
const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
return resp.text();
}
function normalizeStockLevel( function normalizeStockLevel(
raw?: string raw?: string
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" { ): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
@ -242,8 +121,19 @@ function normalizeStockLevel(
return "on_request"; return "on_request";
} }
/* ------------------------------------------------------------------ */
/* Main scraper */
/* ------------------------------------------------------------------ */
export async function scrapeProLabs(): Promise<void> { export async function scrapeProLabs(): Promise<void> {
console.log("=== ProLabs Scraper Starting ===\n"); console.log("=== ProLabs Scraper Starting (PlaywrightCrawler + Firefox) ===\n");
if (PROXY_URL) {
console.log(`Using proxy: ${PROXY_URL.replace(/:[^:@]+@/, ":***@")}`);
} else {
console.log("WARNING: No PROXY_URL set. CloudFront WAF blocks datacenter IPs.");
console.log("Set PROXY_URL env var for residential proxy if running from VPS.\n");
}
const vendorId = await ensureVendor( const vendorId = await ensureVendor(
"ProLabs", "ProLabs",
@ -254,90 +144,334 @@ export async function scrapeProLabs(): Promise<void> {
let totalProducts = 0; let totalProducts = 0;
let priceUpdates = 0; let priceUpdates = 0;
let blockedPages = 0;
const seenUrls = new Set<string>(); const seenUrls = new Set<string>();
// Map URL -> category metadata
const urlToCat = new Map<string, typeof CATEGORIES[number]>();
const requestQueue = await RequestQueue.open();
for (const cat of CATEGORIES) { for (const cat of CATEGORIES) {
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`); const url = `${BASE}${cat.path}`;
urlToCat.set(url, cat);
await requestQueue.addRequest({ url, userData: { page: 1, catPath: cat.path } });
}
let page = 1; const crawler = new PlaywrightCrawler({
let pagesThisCat = 0; requestQueue,
let productsThisCat = 0; maxConcurrency: 1,
maxRequestsPerMinute: 10,
requestHandlerTimeoutSecs: 120,
navigationTimeoutSecs: 60,
maxRequestRetries: 2,
headless: true,
// Override default blockedStatusCodes (normally [401, 403, 429]).
// We allow 403 so our handler can inspect the page — CloudFront may
// serve a JS challenge that resolves, or we can log the block gracefully.
sessionPoolOptions: {
blockedStatusCodes: [401, 429],
},
browserPoolOptions: {
useFingerprints: false,
},
launchContext: {
launcher: firefox,
launchOptions: {
firefoxUserPrefs: {
"toolkit.telemetry.enabled": false,
"privacy.trackingprotection.enabled": false,
},
},
},
...(PROXY_URL ? {
proxyConfiguration: new (require("crawlee").ProxyConfiguration)({
proxyUrls: [PROXY_URL],
}),
} : {}),
preNavigationHooks: [
async ({ page }, goToOptions) => {
// Realistic viewport
await page.setViewportSize({ width: 1920, height: 1080 });
while (page <= MAX_PAGES) { // Override webdriver detection
const url = page === 1 await page.addInitScript(() => {
? `${BASE}${cat.path}` Object.defineProperty(navigator, "webdriver", { get: () => false });
: `${BASE}${cat.path}?page=${page}`; });
try { if (goToOptions) {
const html = await fetchPage(url); goToOptions.waitUntil = "load";
const pageProducts = parseProductList(html, cat); }
},
],
// Global dedup: broad fallback category overlaps with specific ones async requestHandler({ page, request, log }) {
const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url)); const currentPage: number = request.userData?.page ?? 1;
newProducts.forEach((p) => seenUrls.add(p.url)); const catPath: string = request.userData?.catPath ?? "";
console.log(` Page ${page}: ${pageProducts.length} found, ${newProducts.length} new`); const cat = urlToCat.get(request.url) ??
CATEGORIES.find((c) => catPath === c.path) ??
CATEGORIES[CATEGORIES.length - 1];
urlToCat.set(request.url, cat);
for (const product of newProducts) { log.info(`[${cat.formFactor} ${cat.speed}] Page ${currentPage}: ${request.url}`);
try {
const txId = await findOrCreateScrapedTransceiver({
partNumber: product.partNumber,
vendorId,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
reachMeters: product.reachMeters,
reachLabel: product.reachLabel,
fiberType: product.fiberType,
wavelengths: product.wavelength,
category: "DataCenter",
});
if (product.price && product.price > 0) { // Give JS challenges time to resolve
const hash = contentHash({ await page.waitForTimeout(8000);
price: product.price,
part: product.partNumber, // Check what we actually got
stock: product.stockStatus ?? "", const pageTitle = await page.title();
}); const bodyText = await page.evaluate(() => document.body?.innerText?.slice(0, 500) || "");
const updated = await upsertPriceObservation({ log.info(` Title: "${pageTitle}"`);
transceiverId: txId,
sourceVendorId: vendorId, // Detect CloudFront WAF block
price: product.price, if (bodyText.includes("Request blocked") ||
currency: "USD", bodyText.includes("Access Denied") ||
stockLevel: normalizeStockLevel(product.stockStatus), bodyText.includes("403 ERROR") ||
url: product.url, pageTitle.includes("ERROR")) {
contentHash: hash, blockedPages++;
}); log.warning(` CloudFront WAF blocked this page (${blockedPages} total blocked)`);
if (updated) priceUpdates++; if (blockedPages >= 3 && totalProducts === 0) {
log.warning(` Multiple blocks detected — likely IP-level block. Consider using PROXY_URL.`);
}
return;
}
// Extract products via page.evaluate
const productData = await page.evaluate(() => {
const results: Array<{
name: string;
href: string;
price: string;
stock: string;
partNumber: string;
}> = [];
// Strategy 1: Product card links
const productLinks = document.querySelectorAll(
'a[href*="/products/"], .product-card a, .product-item a, [class*="product"] a[href], .product-list a, .category-products a, [data-product] a'
);
for (const link of productLinks) {
const el = link as HTMLAnchorElement;
const name = el.textContent?.trim() || "";
const href = el.getAttribute("href") || "";
if (!name || name.length < 5 || name.length > 200 || !href) continue;
if (/category|filter|sort|breadcrumb|login|cart|account/i.test(href) && !/products\//i.test(href)) continue;
const container =
el.closest('[class*="product"]') ||
el.closest('[class*="item"]') ||
el.closest('[class*="card"]') ||
el.closest("li") ||
el.parentElement?.parentElement?.parentElement;
let price = "";
let stock = "";
let pn = "";
if (container) {
const priceEl = container.querySelector(
'[class*="price"], [class*="Price"], [data-price], .price'
);
price = priceEl?.textContent?.trim() || "";
if (!price) {
const containerText = container.textContent || "";
const priceMatch = containerText.match(/\$\s*[\d,]+\.?\d{0,2}/);
if (priceMatch) price = priceMatch[0];
} }
productsThisCat++; const stockEl = container.querySelector(
totalProducts++; '[class*="stock"], [class*="Stock"], [class*="avail"], [class*="Avail"]'
} catch (err) { );
console.warn(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`); stock = stockEl?.textContent?.trim() || "";
const skuEl = container.querySelector(
'[class*="sku"], [class*="SKU"], [class*="part"], [class*="Part"], [class*="model"]'
);
pn = skuEl?.textContent?.trim() || "";
}
if (!pn) {
pn = href.split("/").pop()?.replace(/\.html?$/, "")?.replace(/#.*$/, "") || "";
}
if (name && href.includes("/products/")) {
results.push({ name, href, price, stock, partNumber: pn });
} }
} }
pagesThisCat++; // Strategy 2: Scan deeper for anchors with product URLs
if (results.length === 0) {
const allAnchors = document.querySelectorAll("a[href*='/products/']");
for (const el of allAnchors) {
const anchor = el as HTMLAnchorElement;
const href = anchor.getAttribute("href") || "";
const name = anchor.textContent?.trim() || "";
if (!name || name.length < 5) continue;
if (pageProducts.length === 0 || !hasNextPage(html, page)) break; let parent: Element | null = anchor;
let price = "";
for (let i = 0; i < 4 && parent; i++) {
parent = parent.parentElement;
if (parent) {
const text = parent.textContent || "";
const m = text.match(/\$\s*[\d,]+\.?\d{0,2}/);
if (m) { price = m[0]; break; }
}
}
page++; const pn = href.split("/").pop()?.replace(/\.html?$/, "") || "";
await sleep(2000); results.push({ name, href, price, stock: "", partNumber: pn });
} catch (err) { }
console.error(` Page ${page} failed: ${(err as Error).message}`); }
break;
// Strategy 3: JSON-LD structured data
const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
for (const script of ldScripts) {
try {
const data = JSON.parse(script.textContent || "");
const items = data.itemListElement || (Array.isArray(data) ? data : [data]);
for (const item of items) {
if (item["@type"] === "Product" || item.offers) {
const name = item.name || "";
const href = item.url || "";
const offers = item.offers || {};
const price = offers.price ? `$${offers.price}` : "";
const stock = offers.availability || "";
const pn = item.sku || item.mpn || href.split("/").pop() || "";
if (name) results.push({ name, href, price, stock, partNumber: pn });
}
}
} catch { /* ignore parse errors */ }
}
return results;
});
log.info(` Raw items extracted: ${productData.length}`);
// Process extracted products
const pageProducts: Product[] = [];
for (const item of productData) {
if (!item.name) continue;
const partNumber = (item.partNumber || item.name).slice(0, 80).trim();
const name = item.name.slice(0, 200).trim();
const url = item.href.startsWith("http") ? item.href : `${BASE}${item.href}`;
let price: number | undefined;
if (item.price) {
const cleaned = item.price.replace(/[^\d.,]/g, "").replace(",", "");
const parsed = parseFloat(cleaned);
if (parsed > 0 && parsed < 100000) price = parsed;
}
const combined = name + " " + partNumber;
const reach = detectReach(combined);
const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);
pageProducts.push({
partNumber, name, url, price,
stockStatus: item.stock || undefined,
formFactor, speed, speedGbps,
reachLabel: reach?.label,
reachMeters: reach?.meters,
fiberType: detectFiber(combined),
wavelength: detectWavelength(combined),
});
} }
}
console.log(` Category done: ${productsThisCat} products across ${pagesThisCat} page(s)`); // Deduplicate against global set
const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url));
for (const p of newProducts) seenUrls.add(p.url);
if (cat !== CATEGORIES[CATEGORIES.length - 1]) { log.info(` Parsed: ${pageProducts.length} found, ${newProducts.length} new`);
await sleep(2000);
} // Write to database
for (const product of newProducts) {
try {
const txId = await findOrCreateScrapedTransceiver({
partNumber: product.partNumber,
vendorId,
formFactor: product.formFactor,
speedGbps: product.speedGbps,
speed: product.speed,
reachMeters: product.reachMeters,
reachLabel: product.reachLabel,
fiberType: product.fiberType,
wavelengths: product.wavelength,
category: "DataCenter",
});
if (product.price && product.price > 0) {
const hash = contentHash({
price: product.price,
part: product.partNumber,
stock: product.stockStatus ?? "",
});
const updated = await upsertPriceObservation({
transceiverId: txId,
sourceVendorId: vendorId,
price: product.price,
currency: "USD",
stockLevel: normalizeStockLevel(product.stockStatus),
url: product.url,
contentHash: hash,
});
if (updated) priceUpdates++;
}
totalProducts++;
} catch (err) {
log.warning(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
}
}
// Check for next page
const hasNext = await page.evaluate((currentPageNum: number) => {
const nextLink = document.querySelector('a[rel="next"], link[rel="next"]');
if (nextLink) return true;
const nextNum = currentPageNum + 1;
const paginationLinks = document.querySelectorAll('a[href*="page="], .pagination a, nav a');
for (const link of paginationLinks) {
const href = (link as HTMLAnchorElement).getAttribute("href") || "";
if (href.includes(`page=${nextNum}`)) return true;
const text = link.textContent?.trim() || "";
if (text === String(nextNum) || text.toLowerCase() === "next" || text === "\u203a" || text === "\u00bb") return true;
}
return false;
}, currentPage);
if (hasNext && currentPage < MAX_PAGES && newProducts.length > 0) {
const nextPageNum = currentPage + 1;
const nextUrl = `${BASE}${catPath}?page=${nextPageNum}`;
urlToCat.set(nextUrl, cat);
await requestQueue.addRequest({
url: nextUrl,
userData: { page: nextPageNum, catPath },
});
log.info(` Enqueued next page: ${nextPageNum}`);
}
},
async failedRequestHandler({ request, log }) {
log.error(`Request failed after retries: ${request.url}`);
},
});
await crawler.run();
console.log(`\n=== ProLabs Complete ===`);
console.log(` Products processed: ${totalProducts}`);
console.log(` Price updates: ${priceUpdates}`);
console.log(` Pages blocked by WAF: ${blockedPages}`);
if (blockedPages > 0 && totalProducts === 0) {
console.log(`\n All pages blocked by CloudFront WAF (datacenter IP detected).`);
console.log(` Fix: Set PROXY_URL=http://user:pass@proxy:port in .env`);
} }
console.log(`\n=== ProLabs Complete: ${totalProducts} products processed, ${priceUpdates} price updates ===`);
} }
if (require.main === module) { if (require.main === module) {