chore: sync local changes
This commit is contained in:
parent
2348238888
commit
0b07490114
@ -61,9 +61,6 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
"scrape:pricing:10gtek",
|
||||
"scrape:pricing:atgbics",
|
||||
"scrape:pricing:prolabs",
|
||||
"scrape:pricing:naddod",
|
||||
"scrape:pricing:qsfptek",
|
||||
"scrape:pricing:addon",
|
||||
"scrape:compat:cisco",
|
||||
"scrape:pricing:flexoptix",
|
||||
"scrape:vendors:flexoptix",
|
||||
@ -117,30 +114,12 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// ProLabs pricing (every 8 hours — server-rendered HTML, USD prices)
|
||||
// ProLabs pricing (every 8 hours — Playwright, needs proxy for CloudFront)
|
||||
await boss.schedule("scrape:pricing:prolabs", "0 4/8 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// NADDOD pricing (every 8 hours — WooCommerce, USD prices)
|
||||
await boss.schedule("scrape:pricing:naddod", "0 5/8 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// QSFPTEK pricing (every 10 hours — custom PHP shop, USD prices)
|
||||
await boss.schedule("scrape:pricing:qsfptek", "0 3/10 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// AddOn Networks pricing (every 12 hours — enterprise site, USD prices)
|
||||
await boss.schedule("scrape:pricing:addon", "0 6/12 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// Flexoptix catalog (every 6 hours — fetch-based, fast)
|
||||
await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
@ -173,9 +152,6 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
const { scrapeNews } = await import("./scrapers/news");
|
||||
const { scrapeAtgbics } = await import("./scrapers/atgbics");
|
||||
const { scrapeProLabs } = await import("./scrapers/prolabs");
|
||||
const { scrapeNaddod } = await import("./scrapers/naddod");
|
||||
const { scrapeQsfptek } = await import("./scrapers/qsfptek");
|
||||
const { scrapeAddonNetworks } = await import("./scrapers/addon-networks");
|
||||
|
||||
await boss.work("scrape:pricing:fs", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
||||
@ -222,21 +198,6 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
await withIsolatedStorage("prolabs", scrapeProLabs);
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:naddod", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: NADDOD pricing`);
|
||||
await scrapeNaddod();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:qsfptek", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: QSFPTEK pricing`);
|
||||
await scrapeQsfptek();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:addon", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: AddOn Networks pricing`);
|
||||
await scrapeAddonNetworks();
|
||||
});
|
||||
|
||||
await boss.work("scrape:faq", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
|
||||
});
|
||||
|
||||
@ -1,27 +1,101 @@
|
||||
/**
|
||||
* Cisco TMG Matrix Scraper — Transceiver Compatibility
|
||||
*
|
||||
* Source: tmgmatrix.cisco.com
|
||||
* Source: tmgmatrix.cisco.com (JSON API — no auth required)
|
||||
* Extracts: Switch model ↔ Transceiver compatibility data
|
||||
* Stores: switches, compatibility table
|
||||
*
|
||||
* The TMG Matrix has a JSON API behind the scenes.
|
||||
* Uses POST /public/api/networkdevice/search endpoint directly.
|
||||
*/
|
||||
import { CheerioCrawler } from "crawlee";
|
||||
import { pool, ensureVendor } from "../utils/db";
|
||||
|
||||
const TMG_BASE = "https://tmgmatrix.cisco.com";
|
||||
const TMG_API = "https://tmgmatrix.cisco.com/public/api/networkdevice/search";
|
||||
|
||||
interface TmgEntry {
|
||||
switchModel: string;
|
||||
switchSeries: string;
|
||||
transceiverPid: string;
|
||||
transceiverDescription: string;
|
||||
speed: string;
|
||||
interface TmgTransceiver {
|
||||
tmgId: number;
|
||||
productId: string;
|
||||
productFamily: string;
|
||||
formFactor: string;
|
||||
reach: string;
|
||||
temperatureRange: string;
|
||||
cableType: string;
|
||||
connector: string;
|
||||
minSoftware: string;
|
||||
media: string;
|
||||
connectorType: string;
|
||||
transmissionStandard: string;
|
||||
dataRate: string;
|
||||
endOfSale: string;
|
||||
softReleaseMinVer: string;
|
||||
breakoutMode: string;
|
||||
osType: string;
|
||||
domSupport: string;
|
||||
type: string;
|
||||
}
|
||||
|
||||
/** One switch product ID together with the transceivers listed for it. */
interface TmgCompatEntry {
  /** Switch PID. */
  productId: string;
  /** Transceiver records listed under this switch PID. */
  transceivers: Array<TmgTransceiver>;
}
|
||||
|
||||
/** A device group from the search response: one product family and its per-switch entries. */
interface TmgDevice {
  /** Product family label of the device group. */
  productFamily: string;
  /** Per-switch compatibility entries belonging to this family. */
  networkAndTransceiverCompatibility: Array<TmgCompatEntry>;
}
|
||||
|
||||
/** Body returned by the TMG network-device search endpoint. */
interface TmgSearchResponse {
  /** Total entry count reported by the API. */
  totalCount: number;
  /** Named filter groups, each with id/name/count values (presumably the search facets — confirm against the API). */
  filters: {
    name: string;
    values: { id: number; name: string; count: number }[];
  }[];
  /** Device groups matching the search. */
  networkDevices: TmgDevice[];
}
|
||||
|
||||
/**
 * Platform families queried from the TMG API, as `{ id, name }` filter objects.
 * The trailing comments record the entry count noted for each family.
 */
const PLATFORM_FAMILIES: Array<{ id: number; name: string }> = [
  // Nexus
  { id: 74, name: "N9300" }, // Nexus 9300 — 8,515 entries
  { id: 77, name: "N9500" }, // Nexus 9500 — 2,266 entries
  { id: 78, name: "N9200" }, // Nexus 9200 — 708 entries
  { id: 661, name: "N9800" }, // Nexus 9800 — 238 entries
  // Catalyst
  { id: 76, name: "C9300" }, // Catalyst 9300 — 260 entries
  { id: 601, name: "C9300L" }, // Catalyst 9300L — 720 entries
  { id: 1181, name: "C9300X" }, // Catalyst 9300X — 413 entries
  { id: 8, name: "C9500" }, // Catalyst 9500 — 1,141 entries
  { id: 521, name: "C9600" }, // Catalyst 9600 — 771 entries
  { id: 7, name: "C9400" }, // Catalyst 9400 — 561 entries
  { id: 341, name: "C9200" }, // Catalyst 9200 — 222 entries
  // ASR
  { id: 83, name: "ASR9000" }, // ASR 9000 — 3,644 entries
];
|
||||
|
||||
/**
 * Query the Cisco TMG Matrix search API for one network-device product family.
 *
 * Sends a JSON POST to `TMG_API` with every filter left empty except
 * `networkDeviceProductFamily`, which carries `familyFilter`.
 *
 * @param familyFilter - Platform family `{ id, name }` to search for.
 * @returns The parsed response body.
 * @throws Error when the request times out (30 s), the HTTP status is not OK,
 *         or the body does not contain a `networkDevices` array.
 */
async function searchTmg(familyFilter: { id: number; name: string }): Promise<TmgSearchResponse> {
  const body = {
    cableType: [],
    dataRate: [],
    formFactor: [],
    reach: [],
    searchInput: [""],
    osType: [],
    transceiverProductFamily: [],
    transceiverProductID: [],
    networkDeviceProductFamily: [familyFilter],
    networkDeviceProductID: [],
    media: [],
    connectorType: [],
    caseTemperature: [],
    performanceMonitoring: [],
  };

  const res = await fetch(TMG_API, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      "Accept": "application/json",
    },
    body: JSON.stringify(body),
    // Without a deadline a stalled connection would block the whole scrape run.
    signal: AbortSignal.timeout(30000),
  });

  if (!res.ok) {
    throw new Error(`TMG API ${res.status}: ${res.statusText}`);
  }

  // Validate the one field callers iterate over before trusting the cast,
  // so an unexpected payload fails here with a clear message.
  const payload: unknown = await res.json();
  const devices =
    typeof payload === "object" && payload !== null
      ? (payload as { networkDevices?: unknown }).networkDevices
      : undefined;
  if (!Array.isArray(devices)) {
    throw new Error("TMG API returned an unexpected response shape (no networkDevices array)");
  }

  return payload as TmgSearchResponse;
}
|
||||
|
||||
async function upsertCiscoSwitch(vendorId: string, model: string, series: string): Promise<string> {
|
||||
@ -38,18 +112,31 @@ async function upsertCiscoSwitch(vendorId: string, model: string, series: string
|
||||
async function upsertCompatibility(
|
||||
switchId: string,
|
||||
transceiverId: string,
|
||||
firmwareMin: string
|
||||
firmwareMin: string,
|
||||
formFactor: string,
|
||||
reach: string,
|
||||
cableType: string,
|
||||
media: string,
|
||||
dataRate: string
|
||||
): Promise<void> {
|
||||
await pool.query(
|
||||
`INSERT INTO compatibility (switch_id, transceiver_id, verified_by, verification_method, status, firmware_min, source_url)
|
||||
VALUES ($1, $2, 'Cisco TMG Matrix', 'vendor_matrix', 'compatible', $3, $4)
|
||||
ON CONFLICT (switch_id, transceiver_id) DO UPDATE SET firmware_min = EXCLUDED.firmware_min`,
|
||||
[switchId, transceiverId, firmwareMin || null, TMG_BASE]
|
||||
`INSERT INTO compatibility (switch_id, transceiver_id, verified_by, verification_method, status, firmware_min, source_url, notes)
|
||||
VALUES ($1, $2, 'Cisco TMG Matrix', 'vendor_matrix', 'compatible', $3, $4, $5)
|
||||
ON CONFLICT (switch_id, transceiver_id) DO UPDATE SET
|
||||
firmware_min = EXCLUDED.firmware_min,
|
||||
notes = EXCLUDED.notes`,
|
||||
[
|
||||
switchId,
|
||||
transceiverId,
|
||||
firmwareMin || null,
|
||||
"https://tmgmatrix.cisco.com",
|
||||
`${formFactor} ${dataRate} ${reach} ${media} ${cableType}`.trim(),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
export async function scrapeCiscoTmg(): Promise<void> {
|
||||
console.log("=== Cisco TMG Matrix Scraper Starting ===\n");
|
||||
console.log("=== Cisco TMG Matrix Scraper Starting (API mode) ===\n");
|
||||
|
||||
const ciscoVendorId = await ensureVendor(
|
||||
"Cisco",
|
||||
@ -58,90 +145,69 @@ export async function scrapeCiscoTmg(): Promise<void> {
|
||||
undefined
|
||||
);
|
||||
|
||||
const entries: TmgEntry[] = [];
|
||||
let totalSwitches = 0;
|
||||
let totalCompat = 0;
|
||||
let totalTransceivers = 0;
|
||||
|
||||
// TMG Matrix uses a search API
|
||||
// First, try the public HTML interface
|
||||
const crawler = new CheerioCrawler({
|
||||
maxConcurrency: 1,
|
||||
maxRequestsPerMinute: 10, // Very respectful — Cisco rate limits aggressively
|
||||
|
||||
async requestHandler({ request, $, log }) {
|
||||
log.info(`Scraping: ${request.url}`);
|
||||
|
||||
// The TMG Matrix renders a table with compatibility data
|
||||
$("table tbody tr, .matrix-row, [class*='result-row']").each((_i, el) => {
|
||||
const $row = $(el);
|
||||
const cells = $row.find("td").map((_j, td) => $(td).text().trim()).get();
|
||||
|
||||
if (cells.length >= 4) {
|
||||
entries.push({
|
||||
switchModel: cells[0] || "",
|
||||
switchSeries: cells[0]?.split(" ")[0] || "Nexus",
|
||||
transceiverPid: cells[1] || "",
|
||||
transceiverDescription: cells[2] || "",
|
||||
speed: cells[3] || "",
|
||||
reach: cells[4] || "",
|
||||
cableType: cells[5] || "",
|
||||
connector: cells[6] || "",
|
||||
minSoftware: cells[7] || "",
|
||||
});
|
||||
}
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
// Start with Nexus switches (most relevant for Flexoptix)
|
||||
await crawler.run([
|
||||
`${TMG_BASE}/public/tmg?searchValue=Nexus+9000`,
|
||||
`${TMG_BASE}/public/tmg?searchValue=Nexus+3000`,
|
||||
`${TMG_BASE}/public/tmg?searchValue=Nexus+7000`,
|
||||
`${TMG_BASE}/public/tmg?searchValue=Catalyst+9000`,
|
||||
]);
|
||||
|
||||
console.log(`\nEntries found: ${entries.length}`);
|
||||
|
||||
// Write to database
|
||||
let switches = 0;
|
||||
let compat = 0;
|
||||
|
||||
for (const entry of entries) {
|
||||
if (!entry.switchModel || !entry.transceiverPid) continue;
|
||||
for (const family of PLATFORM_FAMILIES) {
|
||||
console.log(`\nFetching ${family.name}...`);
|
||||
|
||||
try {
|
||||
const switchId = await upsertCiscoSwitch(
|
||||
ciscoVendorId,
|
||||
entry.switchModel,
|
||||
entry.switchSeries
|
||||
);
|
||||
switches++;
|
||||
const data = await searchTmg(family);
|
||||
console.log(` ${family.name}: ${data.totalCount} total entries, ${data.networkDevices.length} device groups`);
|
||||
|
||||
// Try to match transceiver in our DB
|
||||
const txResult = await pool.query(
|
||||
`SELECT id FROM transceivers
|
||||
WHERE part_number = $1
|
||||
OR slug LIKE $2
|
||||
OR standard_name ILIKE $3
|
||||
LIMIT 1`,
|
||||
[
|
||||
entry.transceiverPid,
|
||||
`%${entry.transceiverPid.toLowerCase().replace(/[^a-z0-9]/g, "")}%`,
|
||||
`%${entry.speed}%${entry.reach}%`,
|
||||
]
|
||||
);
|
||||
for (const device of data.networkDevices) {
|
||||
for (const compat of device.networkAndTransceiverCompatibility) {
|
||||
if (!compat.productId) continue;
|
||||
|
||||
if (txResult.rows.length > 0) {
|
||||
await upsertCompatibility(switchId, txResult.rows[0].id, entry.minSoftware);
|
||||
compat++;
|
||||
const switchId = await upsertCiscoSwitch(
|
||||
ciscoVendorId,
|
||||
compat.productId,
|
||||
device.productFamily
|
||||
);
|
||||
totalSwitches++;
|
||||
|
||||
for (const tx of compat.transceivers) {
|
||||
if (!tx.productId) continue;
|
||||
totalTransceivers++;
|
||||
|
||||
// Try to match transceiver in our DB by Cisco PID
|
||||
const txResult = await pool.query(
|
||||
`SELECT id FROM transceivers
|
||||
WHERE part_number = $1
|
||||
OR part_number = $2
|
||||
LIMIT 1`,
|
||||
[tx.productId, tx.productId.replace(/-S$/, "")]
|
||||
);
|
||||
|
||||
if (txResult.rows.length > 0) {
|
||||
await upsertCompatibility(
|
||||
switchId,
|
||||
txResult.rows[0].id,
|
||||
tx.softReleaseMinVer,
|
||||
tx.formFactor,
|
||||
tx.reach,
|
||||
tx.cableType,
|
||||
tx.media,
|
||||
tx.dataRate
|
||||
);
|
||||
totalCompat++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Rate limit: 2 seconds between platform families
|
||||
await new Promise((r) => setTimeout(r, 2000));
|
||||
} catch (err) {
|
||||
// Skip duplicates silently
|
||||
console.error(` Error fetching ${family.name}:`, err);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Switches upserted: ${switches}`);
|
||||
console.log(`Compatibility entries: ${compat}`);
|
||||
console.log("=== Cisco TMG Scraper Complete ===\n");
|
||||
console.log(`\n=== Cisco TMG Scraper Complete ===`);
|
||||
console.log(` Switches upserted: ${totalSwitches}`);
|
||||
console.log(` Transceiver entries scanned: ${totalTransceivers}`);
|
||||
console.log(` Compatibility matches: ${totalCompat}\n`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
/**
|
||||
* Fluxlight Scraper — US-based compatible transceiver vendor
|
||||
*
|
||||
* fluxlight.com — BigCommerce, server-rendered HTML with real prices.
|
||||
* www.fluxlight.com — BigCommerce, server-rendered HTML with real prices.
|
||||
* ~144+ products across 6 pages. Uses pagination via ?page=N.
|
||||
*
|
||||
* Rate limited: 1 req/2sec.
|
||||
@ -91,8 +91,8 @@ function parseProductList(html: string): Product[] {
|
||||
const products: Product[] = [];
|
||||
|
||||
// BigCommerce product card pattern: product link + price
|
||||
// Pattern: <a href="https://fluxlight.com/PARTNUM-FL/">Product Name</a> ... $29.99
|
||||
const productRegex = /href="(https?:\/\/fluxlight\.com\/[^"]*-FL\/)"[^>]*>\s*([^<]{10,})<\/a>/gi;
|
||||
// Pattern: <a href="https://www.fluxlight.com/PARTNUM-FL/">Product Name</a> ... $29.99
|
||||
const productRegex = /href="(https?:\/\/(?:www\.)?fluxlight\.com\/[^"]*-FL\/)"[^>]*>\s*([^<]{10,})<\/a>/gi;
|
||||
let match;
|
||||
while ((match = productRegex.exec(html)) !== null) {
|
||||
const url = match[1];
|
||||
@ -123,7 +123,7 @@ function parseProductList(html: string): Product[] {
|
||||
|
||||
// Fallback: broader link pattern
|
||||
if (products.length === 0) {
|
||||
const simpleRegex = /href="(https?:\/\/fluxlight\.com\/[^"]+)"[^>]*>([^<]{10,}(?:SFP|QSFP|XFP|Base)[^<]*)<\/a>/gi;
|
||||
const simpleRegex = /href="(https?:\/\/(?:www\.)?fluxlight\.com\/[^"]+)"[^>]*>([^<]{10,}(?:SFP|QSFP|XFP|Base)[^<]*)<\/a>/gi;
|
||||
while ((match = simpleRegex.exec(html)) !== null) {
|
||||
const url = match[1];
|
||||
const name = match[2].trim();
|
||||
@ -166,7 +166,7 @@ async function fetchPage(url: string): Promise<string> {
|
||||
export async function scrapeFluxlight(): Promise<void> {
|
||||
console.log("=== Fluxlight Scraper Starting ===\n");
|
||||
|
||||
const vendorId = await ensureVendor("Fluxlight", "compatible", "https://fluxlight.com", "https://fluxlight.com/transceivers/");
|
||||
const vendorId = await ensureVendor("Fluxlight", "compatible", "https://fluxlight.com", "https://www.fluxlight.com/transceivers/");
|
||||
|
||||
let allProducts: Product[] = [];
|
||||
|
||||
@ -210,7 +210,7 @@ export async function scrapeFluxlight(): Promise<void> {
|
||||
});
|
||||
|
||||
if (product.price && product.price > 0) {
|
||||
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
||||
const updated = await upsertPriceObservation({
|
||||
transceiverId: txId, sourceVendorId: vendorId,
|
||||
price: product.price, currency: "USD",
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE = "https://gbics.com";
|
||||
const BASE = "https://www.gbics.com";
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
@ -100,7 +100,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
||||
|
||||
// BigCommerce card-title pattern:
|
||||
// <a aria-label="Product Name, £XX.XX" href="URL" data-event-type="product-click">
|
||||
const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/gbics\.com\/[^"]+)"\s+data-event-type="product-click"/gi;
|
||||
const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*data-event-type="product-click"/gi;
|
||||
let match;
|
||||
while ((match = productRegex.exec(collapsed)) !== null) {
|
||||
const label = match[1].trim();
|
||||
@ -110,7 +110,14 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
||||
// Split on last comma to separate name and price
|
||||
const priceInLabel = label.match(/,\s*£\s*([\d,.]+)\s*$/);
|
||||
const name = priceInLabel ? label.slice(0, label.lastIndexOf(",")).trim() : label;
|
||||
const price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined;
|
||||
let price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined;
|
||||
|
||||
// Fallback: extract price from data-price-asc attribute on parent <li>
|
||||
if (!price) {
|
||||
const priceContext = collapsed.slice(Math.max(0, match.index - 500), match.index);
|
||||
const dataPriceMatch = priceContext.match(/data-price-asc="(\d+)"/);
|
||||
if (dataPriceMatch) price = parseFloat(dataPriceMatch[1]);
|
||||
}
|
||||
|
||||
if (name.length < 10) continue;
|
||||
|
||||
@ -131,7 +138,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
||||
|
||||
// Fallback: try "Now: £XX.XX" pattern near product links
|
||||
if (products.length === 0) {
|
||||
const altRegex = /href="(https?:\/\/gbics\.com\/[^"]+)"[^>]*>\s*([^<]{15,})<\/a>/gi;
|
||||
const altRegex = /href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*>\s*([^<]{15,})<\/a>/gi;
|
||||
while ((match = altRegex.exec(collapsed)) !== null) {
|
||||
const url = match[1];
|
||||
const name = match[2].trim();
|
||||
@ -172,7 +179,7 @@ async function fetchPage(url: string): Promise<string> {
|
||||
export async function scrapeGbics(): Promise<void> {
|
||||
console.log("=== GBICS.com Scraper Starting ===\n");
|
||||
|
||||
const vendorId = await ensureVendor("GBICS", "compatible", "https://gbics.com", "https://gbics.com/optical-transceivers/");
|
||||
const vendorId = await ensureVendor("GBICS", "compatible", "https://www.gbics.com", "https://www.gbics.com/optical-transceivers/");
|
||||
|
||||
let totalProducts = 0;
|
||||
let priceUpdates = 0;
|
||||
@ -196,7 +203,7 @@ export async function scrapeGbics(): Promise<void> {
|
||||
});
|
||||
|
||||
if (product.price && product.price > 0) {
|
||||
const hash = contentHash({ price: product.price, part: product.partNumber });
|
||||
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
||||
const updated = await upsertPriceObservation({
|
||||
transceiverId: txId, sourceVendorId: vendorId,
|
||||
price: product.price, currency: "GBP",
|
||||
|
||||
@ -38,19 +38,14 @@ interface NewsArticle {
|
||||
const FEEDS: RssFeed[] = [
|
||||
// === PRIMARY: Transceiver-specific ===
|
||||
{
|
||||
name: "Lightwave Online",
|
||||
url: "https://www.lightwaveonline.com/rss",
|
||||
name: "The Next Platform",
|
||||
url: "https://www.nextplatform.com/feed/",
|
||||
category: "market_report",
|
||||
},
|
||||
{
|
||||
name: "Lightwave - Fiber Optics",
|
||||
url: "https://www.lightwaveonline.com/fttx/rss",
|
||||
category: "market_report",
|
||||
},
|
||||
{
|
||||
name: "Fierce Telecom",
|
||||
url: "https://www.fiercetelecom.com/rss/xml",
|
||||
category: "market_report",
|
||||
name: "ServeTheHome",
|
||||
url: "https://www.servethehome.com/feed/",
|
||||
category: "product_launch",
|
||||
},
|
||||
{
|
||||
name: "Optics.org",
|
||||
@ -69,8 +64,8 @@ const FEEDS: RssFeed[] = [
|
||||
category: "market_report",
|
||||
},
|
||||
{
|
||||
name: "SDxCentral",
|
||||
url: "https://www.sdxcentral.com/feed/",
|
||||
name: "The Register - Data Centre",
|
||||
url: "https://www.theregister.com/data_centre/headlines.atom",
|
||||
category: "market_report",
|
||||
},
|
||||
// === TERTIARY: General tech / photonics ===
|
||||
|
||||
@ -1,22 +1,29 @@
|
||||
/**
|
||||
* ProLabs Scraper — Enterprise-grade compatible optics (Legrand subsidiary)
|
||||
*
|
||||
* prolabs.com — Server-rendered HTML with public USD pricing.
|
||||
* prolabs.com — CloudFront WAF aggressively blocks datacenter IPs.
|
||||
* Uses PlaywrightCrawler with Firefox for anti-detection.
|
||||
*
|
||||
* KNOWN ISSUE: CloudFront blocks all requests from IONOS/datacenter IPs
|
||||
* (HTTP 403 "Request blocked"). This scraper works correctly from
|
||||
* residential IPs. Solutions:
|
||||
* 1. Set PROXY_URL env var to a residential/rotating proxy
|
||||
* 2. Run from a residential IP (e.g. home server)
|
||||
* 3. Route through WireGuard with internet breakout at home
|
||||
*
|
||||
* Products listed under /products/networking/fiber-optics/ category pages.
|
||||
* Pagination via ?page=N. Rate limited: 1 req/2sec. Max 100 pages.
|
||||
* Pagination via ?page=N. Rate limited: maxConcurrency 1, 10 req/min.
|
||||
*
|
||||
* SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR"
|
||||
*/
|
||||
import { PlaywrightCrawler, RequestQueue } from "crawlee";
|
||||
import { firefox } from "playwright";
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
const BASE = "https://www.prolabs.com";
|
||||
const HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
};
|
||||
|
||||
const MAX_PAGES = 100;
|
||||
const PROXY_URL = process.env.PROXY_URL || "";
|
||||
|
||||
const CATEGORIES = [
|
||||
{ path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
||||
@ -26,7 +33,6 @@ const CATEGORIES = [
|
||||
{ path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
||||
{ path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
||||
{ path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
||||
// Broad fallback category in case above paths differ on the live site
|
||||
{ path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||
];
|
||||
|
||||
@ -45,9 +51,9 @@ interface Product {
|
||||
wavelength?: string;
|
||||
}
|
||||
|
||||
/**
 * Pause for a fixed delay.
 *
 * @param ms - Delay in milliseconds.
 * @returns A promise that resolves once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Helper / detection functions (unchanged from original) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||
const patterns: [RegExp, string, number][] = [
|
||||
@ -90,18 +96,6 @@ function detectWavelength(text: string): string {
|
||||
return match ? match[1] : "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Infer form factor and speed from ProLabs SKU prefixes when category context
|
||||
* is not specific enough (e.g. when crawling the broad fallback category).
|
||||
*
|
||||
* ProLabs SKU prefix conventions:
|
||||
* Q- -> QSFP+ 40G
|
||||
* Q28- -> QSFP28 100G
|
||||
* QDD- -> QSFP-DD 400G
|
||||
* SFP28- -> SFP28 25G
|
||||
* SFP- -> SFP+ 10G (most common ProLabs prefix)
|
||||
* S- -> SFP 1G
|
||||
*/
|
||||
function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
|
||||
formFactor: string;
|
||||
speed: string;
|
||||
@ -116,121 +110,6 @@ function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
|
||||
return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps };
|
||||
}
|
||||
|
||||
/**
 * Parse product listings from a ProLabs category page.
 *
 * Works on whitespace-collapsed HTML with regexes (no DOM parser), in two passes:
 *   1. Anchors whose href is a `/products/...` path ending in a SKU-like
 *      segment and whose link text is at least 10 characters long.
 *   2. Fallback, used only when pass 1 found nothing: any `/products/` href
 *      that has a "$" price in the surrounding text.
 *
 * Results are de-duplicated by absolute URL; the first occurrence wins.
 *
 * @param html - Raw HTML of one category listing page.
 * @param cat - Category the page was fetched for; passed to `inferFromSku`
 *              together with the SKU to decide form factor and speed.
 * @returns Products found on the page. `price` is left undefined unless it
 *          parsed to a number strictly between 0 and 100000.
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const collapsed = html.replace(/\s+/g, " ");

  // Strip ALL thousands separators. replace(",", "") would remove only the
  // first one, turning "1,234,567.00" into 1234 and slipping past the range guard.
  const toAmount = (raw: string): number => parseFloat(raw.replace(/,/g, ""));

  // Strategy 1: product cards with structured href containing a SKU-like segment.
  // Match anchor tags whose href is a deep product path ending in a SKU pattern.
  const productLinkRegex = /href="(\/products\/[^"]*?\/([A-Z0-9][A-Z0-9\-_]{3,}(?:-PR)?))"\s[^>]*>([^<]{10,})<\/a>/gi;
  let match: RegExpExecArray | null;

  while ((match = productLinkRegex.exec(collapsed)) !== null) {
    const relUrl = match[1];
    const skuFromUrl = match[2];
    const linkText = match[3].trim();

    // Skip navigation / filter / pagination links
    if (/category|filter|sort|page|breadcrumb/i.test(relUrl)) continue;
    if (linkText.length > 200) continue;

    const url = BASE + relUrl;
    const partNumber = skuFromUrl.slice(0, 80);
    // The regex guarantees 10+ raw characters, but trimming can shorten the text.
    const name = linkText.length > 10 ? linkText : partNumber;

    // Look for price and stock text in a window that starts 100 chars before
    // the anchor and extends 700 chars past its start.
    const context = collapsed.slice(Math.max(0, match.index - 100), match.index + 700);
    const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/) ||
      context.match(/price[^>]*>\s*\$?\s*([\d,]+\.?\d{0,2})/i);
    const price = priceMatch ? toAmount(priceMatch[1]) : undefined;

    const stockMatch = context.match(/(in[\s-]stock|out[\s-]of[\s-]stock|call for availability|available|backordered)/i);
    const stockStatus = stockMatch ? stockMatch[1].toLowerCase() : undefined;

    const combined = name + " " + partNumber;
    const reach = detectReach(combined);
    const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);

    products.push({
      partNumber, name, url,
      // NaN and 0 are falsy, so unparseable prices fall through to undefined.
      price: price && price > 0 && price < 100000 ? price : undefined,
      stockStatus,
      formFactor, speed, speedGbps,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(combined),
      wavelength: detectWavelength(combined),
    });
  }

  // Strategy 2: Fallback — any link to a /products/ URL that has a $ price nearby
  if (products.length === 0) {
    const altRegex = /href="(\/products\/[^"]{10,})"/gi;
    while ((match = altRegex.exec(collapsed)) !== null) {
      const relUrl = match[1];
      if (/category|filter|sort|page|breadcrumb/i.test(relUrl)) continue;

      // Window: 50 chars before the href through 800 chars after its start.
      const context = collapsed.slice(Math.max(0, match.index - 50), match.index + 800);
      const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/);
      if (!priceMatch) continue;

      const price = toAmount(priceMatch[1]);
      const nameMatch = context.match(/<(?:h[23]|strong|span)[^>]*>([^<]{10,150})<\//i);
      // Fall back to the last URL segment when no heading-like element is nearby.
      const name = nameMatch ? nameMatch[1].trim() : (relUrl.split("/").pop() || "");
      const partNumber = (relUrl.split("/").pop() ?? name).slice(0, 80);

      const url = BASE + relUrl;
      const combined = name + " " + partNumber;
      const reach = detectReach(combined);
      const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);

      products.push({
        partNumber, name, url,
        price: price > 0 && price < 100000 ? price : undefined,
        formFactor, speed, speedGbps,
        reachLabel: reach?.label,
        reachMeters: reach?.meters,
        fiberType: detectFiber(combined),
        wavelength: detectWavelength(combined),
      });
    }
  }

  // Deduplicate by URL
  const seen = new Set<string>();
  return products.filter((p) => {
    if (seen.has(p.url)) return false;
    seen.add(p.url);
    return true;
  });
}
|
||||
|
||||
/**
 * Check whether a listing page's HTML links to the next pagination page.
 *
 * Returns true when the HTML contains `rel="next"`, or a `page=<currentPage + 1>`
 * query parameter introduced by `?`, `&`, or the HTML-escaped `&amp;`.
 *
 * @param html - Raw HTML of the current listing page.
 * @param currentPage - 1-based number of the page that `html` represents.
 * @returns True if a link to the following page was found.
 */
function hasNextPage(html: string, currentPage: number): boolean {
  if (/rel="next"/i.test(html)) return true;
  const nextPageNum = currentPage + 1;
  // (?!\d) keeps "page=2" from matching "page=20"; the &amp; alternative
  // covers query separators that are entity-escaped inside href attributes.
  const pattern = new RegExp(`(?:[?&]|&amp;)page=${nextPageNum}(?!\\d)`, "i");
  return pattern.test(html);
}
|
||||
|
||||
/**
 * Download a page as text using the module's request headers.
 *
 * @param url - Absolute URL to fetch.
 * @returns The response body as a string.
 * @throws Error with the HTTP status when the response is not OK; the request
 *         is aborted after 30 seconds.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    signal: AbortSignal.timeout(30000),
    headers: HEADERS,
  });

  if (response.ok) {
    return response.text();
  }

  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
||||
|
||||
function normalizeStockLevel(
|
||||
raw?: string
|
||||
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
|
||||
@ -242,8 +121,19 @@ function normalizeStockLevel(
|
||||
return "on_request";
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Main scraper */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
export async function scrapeProLabs(): Promise<void> {
|
||||
console.log("=== ProLabs Scraper Starting ===\n");
|
||||
console.log("=== ProLabs Scraper Starting (PlaywrightCrawler + Firefox) ===\n");
|
||||
|
||||
if (PROXY_URL) {
|
||||
console.log(`Using proxy: ${PROXY_URL.replace(/:[^:@]+@/, ":***@")}`);
|
||||
} else {
|
||||
console.log("WARNING: No PROXY_URL set. CloudFront WAF blocks datacenter IPs.");
|
||||
console.log("Set PROXY_URL env var for residential proxy if running from VPS.\n");
|
||||
}
|
||||
|
||||
const vendorId = await ensureVendor(
|
||||
"ProLabs",
|
||||
@ -254,90 +144,334 @@ export async function scrapeProLabs(): Promise<void> {
|
||||
|
||||
let totalProducts = 0;
|
||||
let priceUpdates = 0;
|
||||
let blockedPages = 0;
|
||||
const seenUrls = new Set<string>();
|
||||
|
||||
// Map URL -> category metadata
|
||||
const urlToCat = new Map<string, typeof CATEGORIES[number]>();
|
||||
|
||||
const requestQueue = await RequestQueue.open();
|
||||
|
||||
for (const cat of CATEGORIES) {
|
||||
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);
|
||||
const url = `${BASE}${cat.path}`;
|
||||
urlToCat.set(url, cat);
|
||||
await requestQueue.addRequest({ url, userData: { page: 1, catPath: cat.path } });
|
||||
}
|
||||
|
||||
let page = 1;
|
||||
let pagesThisCat = 0;
|
||||
let productsThisCat = 0;
|
||||
const crawler = new PlaywrightCrawler({
|
||||
requestQueue,
|
||||
maxConcurrency: 1,
|
||||
maxRequestsPerMinute: 10,
|
||||
requestHandlerTimeoutSecs: 120,
|
||||
navigationTimeoutSecs: 60,
|
||||
maxRequestRetries: 2,
|
||||
headless: true,
|
||||
// Override default blockedStatusCodes (normally [401, 403, 429]).
|
||||
// We allow 403 so our handler can inspect the page — CloudFront may
|
||||
// serve a JS challenge that resolves, or we can log the block gracefully.
|
||||
sessionPoolOptions: {
|
||||
blockedStatusCodes: [401, 429],
|
||||
},
|
||||
browserPoolOptions: {
|
||||
useFingerprints: false,
|
||||
},
|
||||
launchContext: {
|
||||
launcher: firefox,
|
||||
launchOptions: {
|
||||
firefoxUserPrefs: {
|
||||
"toolkit.telemetry.enabled": false,
|
||||
"privacy.trackingprotection.enabled": false,
|
||||
},
|
||||
},
|
||||
},
|
||||
...(PROXY_URL ? {
|
||||
proxyConfiguration: new (require("crawlee").ProxyConfiguration)({
|
||||
proxyUrls: [PROXY_URL],
|
||||
}),
|
||||
} : {}),
|
||||
preNavigationHooks: [
|
||||
async ({ page }, goToOptions) => {
|
||||
// Realistic viewport
|
||||
await page.setViewportSize({ width: 1920, height: 1080 });
|
||||
|
||||
while (page <= MAX_PAGES) {
|
||||
const url = page === 1
|
||||
? `${BASE}${cat.path}`
|
||||
: `${BASE}${cat.path}?page=${page}`;
|
||||
// Override webdriver detection
|
||||
await page.addInitScript(() => {
|
||||
Object.defineProperty(navigator, "webdriver", { get: () => false });
|
||||
});
|
||||
|
||||
try {
|
||||
const html = await fetchPage(url);
|
||||
const pageProducts = parseProductList(html, cat);
|
||||
if (goToOptions) {
|
||||
goToOptions.waitUntil = "load";
|
||||
}
|
||||
},
|
||||
],
|
||||
|
||||
// Global dedup: broad fallback category overlaps with specific ones
|
||||
const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url));
|
||||
newProducts.forEach((p) => seenUrls.add(p.url));
|
||||
async requestHandler({ page, request, log }) {
|
||||
const currentPage: number = request.userData?.page ?? 1;
|
||||
const catPath: string = request.userData?.catPath ?? "";
|
||||
|
||||
console.log(` Page ${page}: ${pageProducts.length} found, ${newProducts.length} new`);
|
||||
const cat = urlToCat.get(request.url) ??
|
||||
CATEGORIES.find((c) => catPath === c.path) ??
|
||||
CATEGORIES[CATEGORIES.length - 1];
|
||||
urlToCat.set(request.url, cat);
|
||||
|
||||
for (const product of newProducts) {
|
||||
try {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
reachMeters: product.reachMeters,
|
||||
reachLabel: product.reachLabel,
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: "DataCenter",
|
||||
});
|
||||
log.info(`[${cat.formFactor} ${cat.speed}] Page ${currentPage}: ${request.url}`);
|
||||
|
||||
if (product.price && product.price > 0) {
|
||||
const hash = contentHash({
|
||||
price: product.price,
|
||||
part: product.partNumber,
|
||||
stock: product.stockStatus ?? "",
|
||||
});
|
||||
const updated = await upsertPriceObservation({
|
||||
transceiverId: txId,
|
||||
sourceVendorId: vendorId,
|
||||
price: product.price,
|
||||
currency: "USD",
|
||||
stockLevel: normalizeStockLevel(product.stockStatus),
|
||||
url: product.url,
|
||||
contentHash: hash,
|
||||
});
|
||||
if (updated) priceUpdates++;
|
||||
// Give JS challenges time to resolve
|
||||
await page.waitForTimeout(8000);
|
||||
|
||||
// Check what we actually got
|
||||
const pageTitle = await page.title();
|
||||
const bodyText = await page.evaluate(() => document.body?.innerText?.slice(0, 500) || "");
|
||||
log.info(` Title: "${pageTitle}"`);
|
||||
|
||||
// Detect CloudFront WAF block
|
||||
if (bodyText.includes("Request blocked") ||
|
||||
bodyText.includes("Access Denied") ||
|
||||
bodyText.includes("403 ERROR") ||
|
||||
pageTitle.includes("ERROR")) {
|
||||
blockedPages++;
|
||||
log.warning(` CloudFront WAF blocked this page (${blockedPages} total blocked)`);
|
||||
if (blockedPages >= 3 && totalProducts === 0) {
|
||||
log.warning(` Multiple blocks detected — likely IP-level block. Consider using PROXY_URL.`);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract products via page.evaluate
|
||||
const productData = await page.evaluate(() => {
|
||||
const results: Array<{
|
||||
name: string;
|
||||
href: string;
|
||||
price: string;
|
||||
stock: string;
|
||||
partNumber: string;
|
||||
}> = [];
|
||||
|
||||
// Strategy 1: Product card links
|
||||
const productLinks = document.querySelectorAll(
|
||||
'a[href*="/products/"], .product-card a, .product-item a, [class*="product"] a[href], .product-list a, .category-products a, [data-product] a'
|
||||
);
|
||||
|
||||
for (const link of productLinks) {
|
||||
const el = link as HTMLAnchorElement;
|
||||
const name = el.textContent?.trim() || "";
|
||||
const href = el.getAttribute("href") || "";
|
||||
|
||||
if (!name || name.length < 5 || name.length > 200 || !href) continue;
|
||||
if (/category|filter|sort|breadcrumb|login|cart|account/i.test(href) && !/products\//i.test(href)) continue;
|
||||
|
||||
const container =
|
||||
el.closest('[class*="product"]') ||
|
||||
el.closest('[class*="item"]') ||
|
||||
el.closest('[class*="card"]') ||
|
||||
el.closest("li") ||
|
||||
el.parentElement?.parentElement?.parentElement;
|
||||
|
||||
let price = "";
|
||||
let stock = "";
|
||||
let pn = "";
|
||||
|
||||
if (container) {
|
||||
const priceEl = container.querySelector(
|
||||
'[class*="price"], [class*="Price"], [data-price], .price'
|
||||
);
|
||||
price = priceEl?.textContent?.trim() || "";
|
||||
if (!price) {
|
||||
const containerText = container.textContent || "";
|
||||
const priceMatch = containerText.match(/\$\s*[\d,]+\.?\d{0,2}/);
|
||||
if (priceMatch) price = priceMatch[0];
|
||||
}
|
||||
|
||||
productsThisCat++;
|
||||
totalProducts++;
|
||||
} catch (err) {
|
||||
console.warn(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
|
||||
const stockEl = container.querySelector(
|
||||
'[class*="stock"], [class*="Stock"], [class*="avail"], [class*="Avail"]'
|
||||
);
|
||||
stock = stockEl?.textContent?.trim() || "";
|
||||
|
||||
const skuEl = container.querySelector(
|
||||
'[class*="sku"], [class*="SKU"], [class*="part"], [class*="Part"], [class*="model"]'
|
||||
);
|
||||
pn = skuEl?.textContent?.trim() || "";
|
||||
}
|
||||
|
||||
if (!pn) {
|
||||
pn = href.split("/").pop()?.replace(/\.html?$/, "")?.replace(/#.*$/, "") || "";
|
||||
}
|
||||
|
||||
if (name && href.includes("/products/")) {
|
||||
results.push({ name, href, price, stock, partNumber: pn });
|
||||
}
|
||||
}
|
||||
|
||||
pagesThisCat++;
|
||||
// Strategy 2: Scan deeper for anchors with product URLs
|
||||
if (results.length === 0) {
|
||||
const allAnchors = document.querySelectorAll("a[href*='/products/']");
|
||||
for (const el of allAnchors) {
|
||||
const anchor = el as HTMLAnchorElement;
|
||||
const href = anchor.getAttribute("href") || "";
|
||||
const name = anchor.textContent?.trim() || "";
|
||||
if (!name || name.length < 5) continue;
|
||||
|
||||
if (pageProducts.length === 0 || !hasNextPage(html, page)) break;
|
||||
let parent: Element | null = anchor;
|
||||
let price = "";
|
||||
for (let i = 0; i < 4 && parent; i++) {
|
||||
parent = parent.parentElement;
|
||||
if (parent) {
|
||||
const text = parent.textContent || "";
|
||||
const m = text.match(/\$\s*[\d,]+\.?\d{0,2}/);
|
||||
if (m) { price = m[0]; break; }
|
||||
}
|
||||
}
|
||||
|
||||
page++;
|
||||
await sleep(2000);
|
||||
} catch (err) {
|
||||
console.error(` Page ${page} failed: ${(err as Error).message}`);
|
||||
break;
|
||||
const pn = href.split("/").pop()?.replace(/\.html?$/, "") || "";
|
||||
results.push({ name, href, price, stock: "", partNumber: pn });
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 3: JSON-LD structured data
|
||||
const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
|
||||
for (const script of ldScripts) {
|
||||
try {
|
||||
const data = JSON.parse(script.textContent || "");
|
||||
const items = data.itemListElement || (Array.isArray(data) ? data : [data]);
|
||||
for (const item of items) {
|
||||
if (item["@type"] === "Product" || item.offers) {
|
||||
const name = item.name || "";
|
||||
const href = item.url || "";
|
||||
const offers = item.offers || {};
|
||||
const price = offers.price ? `$${offers.price}` : "";
|
||||
const stock = offers.availability || "";
|
||||
const pn = item.sku || item.mpn || href.split("/").pop() || "";
|
||||
if (name) results.push({ name, href, price, stock, partNumber: pn });
|
||||
}
|
||||
}
|
||||
} catch { /* ignore parse errors */ }
|
||||
}
|
||||
|
||||
return results;
|
||||
});
|
||||
|
||||
log.info(` Raw items extracted: ${productData.length}`);
|
||||
|
||||
// Process extracted products
|
||||
const pageProducts: Product[] = [];
|
||||
|
||||
for (const item of productData) {
|
||||
if (!item.name) continue;
|
||||
|
||||
const partNumber = (item.partNumber || item.name).slice(0, 80).trim();
|
||||
const name = item.name.slice(0, 200).trim();
|
||||
const url = item.href.startsWith("http") ? item.href : `${BASE}${item.href}`;
|
||||
|
||||
let price: number | undefined;
|
||||
if (item.price) {
|
||||
const cleaned = item.price.replace(/[^\d.,]/g, "").replace(",", "");
|
||||
const parsed = parseFloat(cleaned);
|
||||
if (parsed > 0 && parsed < 100000) price = parsed;
|
||||
}
|
||||
|
||||
const combined = name + " " + partNumber;
|
||||
const reach = detectReach(combined);
|
||||
const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);
|
||||
|
||||
pageProducts.push({
|
||||
partNumber, name, url, price,
|
||||
stockStatus: item.stock || undefined,
|
||||
formFactor, speed, speedGbps,
|
||||
reachLabel: reach?.label,
|
||||
reachMeters: reach?.meters,
|
||||
fiberType: detectFiber(combined),
|
||||
wavelength: detectWavelength(combined),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
console.log(` Category done: ${productsThisCat} products across ${pagesThisCat} page(s)`);
|
||||
// Deduplicate against global set
|
||||
const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url));
|
||||
for (const p of newProducts) seenUrls.add(p.url);
|
||||
|
||||
if (cat !== CATEGORIES[CATEGORIES.length - 1]) {
|
||||
await sleep(2000);
|
||||
}
|
||||
log.info(` Parsed: ${pageProducts.length} found, ${newProducts.length} new`);
|
||||
|
||||
// Write to database
|
||||
for (const product of newProducts) {
|
||||
try {
|
||||
const txId = await findOrCreateScrapedTransceiver({
|
||||
partNumber: product.partNumber,
|
||||
vendorId,
|
||||
formFactor: product.formFactor,
|
||||
speedGbps: product.speedGbps,
|
||||
speed: product.speed,
|
||||
reachMeters: product.reachMeters,
|
||||
reachLabel: product.reachLabel,
|
||||
fiberType: product.fiberType,
|
||||
wavelengths: product.wavelength,
|
||||
category: "DataCenter",
|
||||
});
|
||||
|
||||
if (product.price && product.price > 0) {
|
||||
const hash = contentHash({
|
||||
price: product.price,
|
||||
part: product.partNumber,
|
||||
stock: product.stockStatus ?? "",
|
||||
});
|
||||
const updated = await upsertPriceObservation({
|
||||
transceiverId: txId,
|
||||
sourceVendorId: vendorId,
|
||||
price: product.price,
|
||||
currency: "USD",
|
||||
stockLevel: normalizeStockLevel(product.stockStatus),
|
||||
url: product.url,
|
||||
contentHash: hash,
|
||||
});
|
||||
if (updated) priceUpdates++;
|
||||
}
|
||||
|
||||
totalProducts++;
|
||||
} catch (err) {
|
||||
log.warning(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Check for next page
|
||||
const hasNext = await page.evaluate((currentPageNum: number) => {
|
||||
const nextLink = document.querySelector('a[rel="next"], link[rel="next"]');
|
||||
if (nextLink) return true;
|
||||
const nextNum = currentPageNum + 1;
|
||||
const paginationLinks = document.querySelectorAll('a[href*="page="], .pagination a, nav a');
|
||||
for (const link of paginationLinks) {
|
||||
const href = (link as HTMLAnchorElement).getAttribute("href") || "";
|
||||
if (href.includes(`page=${nextNum}`)) return true;
|
||||
const text = link.textContent?.trim() || "";
|
||||
if (text === String(nextNum) || text.toLowerCase() === "next" || text === "\u203a" || text === "\u00bb") return true;
|
||||
}
|
||||
return false;
|
||||
}, currentPage);
|
||||
|
||||
if (hasNext && currentPage < MAX_PAGES && newProducts.length > 0) {
|
||||
const nextPageNum = currentPage + 1;
|
||||
const nextUrl = `${BASE}${catPath}?page=${nextPageNum}`;
|
||||
urlToCat.set(nextUrl, cat);
|
||||
await requestQueue.addRequest({
|
||||
url: nextUrl,
|
||||
userData: { page: nextPageNum, catPath },
|
||||
});
|
||||
log.info(` Enqueued next page: ${nextPageNum}`);
|
||||
}
|
||||
},
|
||||
|
||||
async failedRequestHandler({ request, log }) {
|
||||
log.error(`Request failed after retries: ${request.url}`);
|
||||
},
|
||||
});
|
||||
|
||||
await crawler.run();
|
||||
|
||||
console.log(`\n=== ProLabs Complete ===`);
|
||||
console.log(` Products processed: ${totalProducts}`);
|
||||
console.log(` Price updates: ${priceUpdates}`);
|
||||
console.log(` Pages blocked by WAF: ${blockedPages}`);
|
||||
if (blockedPages > 0 && totalProducts === 0) {
|
||||
console.log(`\n All pages blocked by CloudFront WAF (datacenter IP detected).`);
|
||||
console.log(` Fix: Set PROXY_URL=http://user:pass@proxy:port in .env`);
|
||||
}
|
||||
|
||||
console.log(`\n=== ProLabs Complete: ${totalProducts} products processed, ${priceUpdates} price updates ===`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user