chore: sync local changes
This commit is contained in:
parent
fa2d88096f
commit
8757fc8bf0
@ -61,9 +61,6 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
|||||||
"scrape:pricing:10gtek",
|
"scrape:pricing:10gtek",
|
||||||
"scrape:pricing:atgbics",
|
"scrape:pricing:atgbics",
|
||||||
"scrape:pricing:prolabs",
|
"scrape:pricing:prolabs",
|
||||||
"scrape:pricing:naddod",
|
|
||||||
"scrape:pricing:qsfptek",
|
|
||||||
"scrape:pricing:addon",
|
|
||||||
"scrape:compat:cisco",
|
"scrape:compat:cisco",
|
||||||
"scrape:pricing:flexoptix",
|
"scrape:pricing:flexoptix",
|
||||||
"scrape:vendors:flexoptix",
|
"scrape:vendors:flexoptix",
|
||||||
@ -117,30 +114,12 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
|||||||
expireInSeconds: 3600,
|
expireInSeconds: 3600,
|
||||||
});
|
});
|
||||||
|
|
||||||
// ProLabs pricing (every 8 hours — server-rendered HTML, USD prices)
|
// ProLabs pricing (every 8 hours — Playwright, needs proxy for CloudFront)
|
||||||
await boss.schedule("scrape:pricing:prolabs", "0 4/8 * * *", {}, {
|
await boss.schedule("scrape:pricing:prolabs", "0 4/8 * * *", {}, {
|
||||||
retryLimit: 2,
|
retryLimit: 2,
|
||||||
expireInSeconds: 3600,
|
expireInSeconds: 3600,
|
||||||
});
|
});
|
||||||
|
|
||||||
// NADDOD pricing (every 8 hours — WooCommerce, USD prices)
|
|
||||||
await boss.schedule("scrape:pricing:naddod", "0 5/8 * * *", {}, {
|
|
||||||
retryLimit: 2,
|
|
||||||
expireInSeconds: 3600,
|
|
||||||
});
|
|
||||||
|
|
||||||
// QSFPTEK pricing (every 10 hours — custom PHP shop, USD prices)
|
|
||||||
await boss.schedule("scrape:pricing:qsfptek", "0 3/10 * * *", {}, {
|
|
||||||
retryLimit: 2,
|
|
||||||
expireInSeconds: 3600,
|
|
||||||
});
|
|
||||||
|
|
||||||
// AddOn Networks pricing (every 12 hours — enterprise site, USD prices)
|
|
||||||
await boss.schedule("scrape:pricing:addon", "0 6/12 * * *", {}, {
|
|
||||||
retryLimit: 2,
|
|
||||||
expireInSeconds: 3600,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Flexoptix catalog (every 6 hours — fetch-based, fast)
|
// Flexoptix catalog (every 6 hours — fetch-based, fast)
|
||||||
await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, {
|
await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, {
|
||||||
retryLimit: 2,
|
retryLimit: 2,
|
||||||
@ -173,9 +152,6 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
const { scrapeNews } = await import("./scrapers/news");
|
const { scrapeNews } = await import("./scrapers/news");
|
||||||
const { scrapeAtgbics } = await import("./scrapers/atgbics");
|
const { scrapeAtgbics } = await import("./scrapers/atgbics");
|
||||||
const { scrapeProLabs } = await import("./scrapers/prolabs");
|
const { scrapeProLabs } = await import("./scrapers/prolabs");
|
||||||
const { scrapeNaddod } = await import("./scrapers/naddod");
|
|
||||||
const { scrapeQsfptek } = await import("./scrapers/qsfptek");
|
|
||||||
const { scrapeAddonNetworks } = await import("./scrapers/addon-networks");
|
|
||||||
|
|
||||||
await boss.work("scrape:pricing:fs", async (_job) => {
|
await boss.work("scrape:pricing:fs", async (_job) => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
||||||
@ -222,21 +198,6 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
await withIsolatedStorage("prolabs", scrapeProLabs);
|
await withIsolatedStorage("prolabs", scrapeProLabs);
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:pricing:naddod", async (_job) => {
|
|
||||||
console.log(`[${new Date().toISOString()}] Running: NADDOD pricing`);
|
|
||||||
await scrapeNaddod();
|
|
||||||
});
|
|
||||||
|
|
||||||
await boss.work("scrape:pricing:qsfptek", async (_job) => {
|
|
||||||
console.log(`[${new Date().toISOString()}] Running: QSFPTEK pricing`);
|
|
||||||
await scrapeQsfptek();
|
|
||||||
});
|
|
||||||
|
|
||||||
await boss.work("scrape:pricing:addon", async (_job) => {
|
|
||||||
console.log(`[${new Date().toISOString()}] Running: AddOn Networks pricing`);
|
|
||||||
await scrapeAddonNetworks();
|
|
||||||
});
|
|
||||||
|
|
||||||
await boss.work("scrape:faq", async (_job) => {
|
await boss.work("scrape:faq", async (_job) => {
|
||||||
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
|
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
|
||||||
});
|
});
|
||||||
|
|||||||
@ -1,27 +1,101 @@
|
|||||||
/**
|
/**
|
||||||
* Cisco TMG Matrix Scraper — Transceiver Compatibility
|
* Cisco TMG Matrix Scraper — Transceiver Compatibility
|
||||||
*
|
*
|
||||||
* Source: tmgmatrix.cisco.com
|
* Source: tmgmatrix.cisco.com (JSON API — no auth required)
|
||||||
* Extracts: Switch model ↔ Transceiver compatibility data
|
* Extracts: Switch model ↔ Transceiver compatibility data
|
||||||
* Stores: switches, compatibility table
|
* Stores: switches, compatibility table
|
||||||
*
|
*
|
||||||
* The TMG Matrix has a JSON API behind the scenes.
|
* Uses POST /public/api/networkdevice/search endpoint directly.
|
||||||
*/
|
*/
|
||||||
import { CheerioCrawler } from "crawlee";
|
|
||||||
import { pool, ensureVendor } from "../utils/db";
|
import { pool, ensureVendor } from "../utils/db";
|
||||||
|
|
||||||
const TMG_BASE = "https://tmgmatrix.cisco.com";
|
const TMG_API = "https://tmgmatrix.cisco.com/public/api/networkdevice/search";
|
||||||
|
|
||||||
interface TmgEntry {
|
interface TmgTransceiver {
|
||||||
switchModel: string;
|
tmgId: number;
|
||||||
switchSeries: string;
|
productId: string;
|
||||||
transceiverPid: string;
|
productFamily: string;
|
||||||
transceiverDescription: string;
|
formFactor: string;
|
||||||
speed: string;
|
|
||||||
reach: string;
|
reach: string;
|
||||||
|
temperatureRange: string;
|
||||||
cableType: string;
|
cableType: string;
|
||||||
connector: string;
|
media: string;
|
||||||
minSoftware: string;
|
connectorType: string;
|
||||||
|
transmissionStandard: string;
|
||||||
|
dataRate: string;
|
||||||
|
endOfSale: string;
|
||||||
|
softReleaseMinVer: string;
|
||||||
|
breakoutMode: string;
|
||||||
|
osType: string;
|
||||||
|
domSupport: string;
|
||||||
|
type: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface TmgCompatEntry {
|
||||||
|
productId: string; // switch PID
|
||||||
|
transceivers: TmgTransceiver[];
|
||||||
|
}
|
||||||
|
|
||||||
|
interface TmgDevice {
|
||||||
|
productFamily: string;
|
||||||
|
networkAndTransceiverCompatibility: TmgCompatEntry[];
|
||||||
|
}
|
||||||
|
|
||||||
|
interface TmgSearchResponse {
|
||||||
|
totalCount: number;
|
||||||
|
filters: Array<{ name: string; values: Array<{ id: number; name: string; count: number }> }>;
|
||||||
|
networkDevices: TmgDevice[];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Key Nexus/Catalyst platform family IDs from the TMG API */
|
||||||
|
const PLATFORM_FAMILIES = [
|
||||||
|
{ id: 74, name: "N9300" }, // Nexus 9300 — 8,515 entries
|
||||||
|
{ id: 77, name: "N9500" }, // Nexus 9500 — 2,266 entries
|
||||||
|
{ id: 78, name: "N9200" }, // Nexus 9200 — 708 entries
|
||||||
|
{ id: 661, name: "N9800" }, // Nexus 9800 — 238 entries
|
||||||
|
{ id: 76, name: "C9300" }, // Catalyst 9300 — 260 entries
|
||||||
|
{ id: 601, name: "C9300L" }, // Catalyst 9300L — 720 entries
|
||||||
|
{ id: 1181, name: "C9300X" }, // Catalyst 9300X — 413 entries
|
||||||
|
{ id: 8, name: "C9500" }, // Catalyst 9500 — 1,141 entries
|
||||||
|
{ id: 521, name: "C9600" }, // Catalyst 9600 — 771 entries
|
||||||
|
{ id: 7, name: "C9400" }, // Catalyst 9400 — 561 entries
|
||||||
|
{ id: 341, name: "C9200" }, // Catalyst 9200 — 222 entries
|
||||||
|
{ id: 83, name: "ASR9000" }, // ASR 9000 — 3,644 entries
|
||||||
|
];
|
||||||
|
|
||||||
|
async function searchTmg(familyFilter: { id: number; name: string }): Promise<TmgSearchResponse> {
|
||||||
|
const body = {
|
||||||
|
cableType: [],
|
||||||
|
dataRate: [],
|
||||||
|
formFactor: [],
|
||||||
|
reach: [],
|
||||||
|
searchInput: [""],
|
||||||
|
osType: [],
|
||||||
|
transceiverProductFamily: [],
|
||||||
|
transceiverProductID: [],
|
||||||
|
networkDeviceProductFamily: [familyFilter],
|
||||||
|
networkDeviceProductID: [],
|
||||||
|
media: [],
|
||||||
|
connectorType: [],
|
||||||
|
caseTemperature: [],
|
||||||
|
performanceMonitoring: [],
|
||||||
|
};
|
||||||
|
|
||||||
|
const res = await fetch(TMG_API, {
|
||||||
|
method: "POST",
|
||||||
|
headers: {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
|
||||||
|
"Accept": "application/json",
|
||||||
|
},
|
||||||
|
body: JSON.stringify(body),
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!res.ok) {
|
||||||
|
throw new Error(`TMG API ${res.status}: ${res.statusText}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return res.json() as Promise<TmgSearchResponse>;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function upsertCiscoSwitch(vendorId: string, model: string, series: string): Promise<string> {
|
async function upsertCiscoSwitch(vendorId: string, model: string, series: string): Promise<string> {
|
||||||
@ -38,18 +112,31 @@ async function upsertCiscoSwitch(vendorId: string, model: string, series: string
|
|||||||
async function upsertCompatibility(
|
async function upsertCompatibility(
|
||||||
switchId: string,
|
switchId: string,
|
||||||
transceiverId: string,
|
transceiverId: string,
|
||||||
firmwareMin: string
|
firmwareMin: string,
|
||||||
|
formFactor: string,
|
||||||
|
reach: string,
|
||||||
|
cableType: string,
|
||||||
|
media: string,
|
||||||
|
dataRate: string
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
await pool.query(
|
await pool.query(
|
||||||
`INSERT INTO compatibility (switch_id, transceiver_id, verified_by, verification_method, status, firmware_min, source_url)
|
`INSERT INTO compatibility (switch_id, transceiver_id, verified_by, verification_method, status, firmware_min, source_url, notes)
|
||||||
VALUES ($1, $2, 'Cisco TMG Matrix', 'vendor_matrix', 'compatible', $3, $4)
|
VALUES ($1, $2, 'Cisco TMG Matrix', 'vendor_matrix', 'compatible', $3, $4, $5)
|
||||||
ON CONFLICT (switch_id, transceiver_id) DO UPDATE SET firmware_min = EXCLUDED.firmware_min`,
|
ON CONFLICT (switch_id, transceiver_id) DO UPDATE SET
|
||||||
[switchId, transceiverId, firmwareMin || null, TMG_BASE]
|
firmware_min = EXCLUDED.firmware_min,
|
||||||
|
notes = EXCLUDED.notes`,
|
||||||
|
[
|
||||||
|
switchId,
|
||||||
|
transceiverId,
|
||||||
|
firmwareMin || null,
|
||||||
|
"https://tmgmatrix.cisco.com",
|
||||||
|
`${formFactor} ${dataRate} ${reach} ${media} ${cableType}`.trim(),
|
||||||
|
]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeCiscoTmg(): Promise<void> {
|
export async function scrapeCiscoTmg(): Promise<void> {
|
||||||
console.log("=== Cisco TMG Matrix Scraper Starting ===\n");
|
console.log("=== Cisco TMG Matrix Scraper Starting (API mode) ===\n");
|
||||||
|
|
||||||
const ciscoVendorId = await ensureVendor(
|
const ciscoVendorId = await ensureVendor(
|
||||||
"Cisco",
|
"Cisco",
|
||||||
@ -58,90 +145,69 @@ export async function scrapeCiscoTmg(): Promise<void> {
|
|||||||
undefined
|
undefined
|
||||||
);
|
);
|
||||||
|
|
||||||
const entries: TmgEntry[] = [];
|
let totalSwitches = 0;
|
||||||
|
let totalCompat = 0;
|
||||||
|
let totalTransceivers = 0;
|
||||||
|
|
||||||
// TMG Matrix uses a search API
|
for (const family of PLATFORM_FAMILIES) {
|
||||||
// First, try the public HTML interface
|
console.log(`\nFetching ${family.name}...`);
|
||||||
const crawler = new CheerioCrawler({
|
|
||||||
maxConcurrency: 1,
|
|
||||||
maxRequestsPerMinute: 10, // Very respectful — Cisco rate limits aggressively
|
|
||||||
|
|
||||||
async requestHandler({ request, $, log }) {
|
|
||||||
log.info(`Scraping: ${request.url}`);
|
|
||||||
|
|
||||||
// The TMG Matrix renders a table with compatibility data
|
|
||||||
$("table tbody tr, .matrix-row, [class*='result-row']").each((_i, el) => {
|
|
||||||
const $row = $(el);
|
|
||||||
const cells = $row.find("td").map((_j, td) => $(td).text().trim()).get();
|
|
||||||
|
|
||||||
if (cells.length >= 4) {
|
|
||||||
entries.push({
|
|
||||||
switchModel: cells[0] || "",
|
|
||||||
switchSeries: cells[0]?.split(" ")[0] || "Nexus",
|
|
||||||
transceiverPid: cells[1] || "",
|
|
||||||
transceiverDescription: cells[2] || "",
|
|
||||||
speed: cells[3] || "",
|
|
||||||
reach: cells[4] || "",
|
|
||||||
cableType: cells[5] || "",
|
|
||||||
connector: cells[6] || "",
|
|
||||||
minSoftware: cells[7] || "",
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
// Start with Nexus switches (most relevant for Flexoptix)
|
|
||||||
await crawler.run([
|
|
||||||
`${TMG_BASE}/public/tmg?searchValue=Nexus+9000`,
|
|
||||||
`${TMG_BASE}/public/tmg?searchValue=Nexus+3000`,
|
|
||||||
`${TMG_BASE}/public/tmg?searchValue=Nexus+7000`,
|
|
||||||
`${TMG_BASE}/public/tmg?searchValue=Catalyst+9000`,
|
|
||||||
]);
|
|
||||||
|
|
||||||
console.log(`\nEntries found: ${entries.length}`);
|
|
||||||
|
|
||||||
// Write to database
|
|
||||||
let switches = 0;
|
|
||||||
let compat = 0;
|
|
||||||
|
|
||||||
for (const entry of entries) {
|
|
||||||
if (!entry.switchModel || !entry.transceiverPid) continue;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
const data = await searchTmg(family);
|
||||||
|
console.log(` ${family.name}: ${data.totalCount} total entries, ${data.networkDevices.length} device groups`);
|
||||||
|
|
||||||
|
for (const device of data.networkDevices) {
|
||||||
|
for (const compat of device.networkAndTransceiverCompatibility) {
|
||||||
|
if (!compat.productId) continue;
|
||||||
|
|
||||||
const switchId = await upsertCiscoSwitch(
|
const switchId = await upsertCiscoSwitch(
|
||||||
ciscoVendorId,
|
ciscoVendorId,
|
||||||
entry.switchModel,
|
compat.productId,
|
||||||
entry.switchSeries
|
device.productFamily
|
||||||
);
|
);
|
||||||
switches++;
|
totalSwitches++;
|
||||||
|
|
||||||
// Try to match transceiver in our DB
|
for (const tx of compat.transceivers) {
|
||||||
|
if (!tx.productId) continue;
|
||||||
|
totalTransceivers++;
|
||||||
|
|
||||||
|
// Try to match transceiver in our DB by Cisco PID
|
||||||
const txResult = await pool.query(
|
const txResult = await pool.query(
|
||||||
`SELECT id FROM transceivers
|
`SELECT id FROM transceivers
|
||||||
WHERE part_number = $1
|
WHERE part_number = $1
|
||||||
OR slug LIKE $2
|
OR part_number = $2
|
||||||
OR standard_name ILIKE $3
|
|
||||||
LIMIT 1`,
|
LIMIT 1`,
|
||||||
[
|
[tx.productId, tx.productId.replace(/-S$/, "")]
|
||||||
entry.transceiverPid,
|
|
||||||
`%${entry.transceiverPid.toLowerCase().replace(/[^a-z0-9]/g, "")}%`,
|
|
||||||
`%${entry.speed}%${entry.reach}%`,
|
|
||||||
]
|
|
||||||
);
|
);
|
||||||
|
|
||||||
if (txResult.rows.length > 0) {
|
if (txResult.rows.length > 0) {
|
||||||
await upsertCompatibility(switchId, txResult.rows[0].id, entry.minSoftware);
|
await upsertCompatibility(
|
||||||
compat++;
|
switchId,
|
||||||
|
txResult.rows[0].id,
|
||||||
|
tx.softReleaseMinVer,
|
||||||
|
tx.formFactor,
|
||||||
|
tx.reach,
|
||||||
|
tx.cableType,
|
||||||
|
tx.media,
|
||||||
|
tx.dataRate
|
||||||
|
);
|
||||||
|
totalCompat++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (err) {
|
|
||||||
// Skip duplicates silently
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`Switches upserted: ${switches}`);
|
// Rate limit: 2 seconds between platform families
|
||||||
console.log(`Compatibility entries: ${compat}`);
|
await new Promise((r) => setTimeout(r, 2000));
|
||||||
console.log("=== Cisco TMG Scraper Complete ===\n");
|
} catch (err) {
|
||||||
|
console.error(` Error fetching ${family.name}:`, err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`\n=== Cisco TMG Scraper Complete ===`);
|
||||||
|
console.log(` Switches upserted: ${totalSwitches}`);
|
||||||
|
console.log(` Transceiver entries scanned: ${totalTransceivers}`);
|
||||||
|
console.log(` Compatibility matches: ${totalCompat}\n`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (require.main === module) {
|
if (require.main === module) {
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Fluxlight Scraper — US-based compatible transceiver vendor
|
* Fluxlight Scraper — US-based compatible transceiver vendor
|
||||||
*
|
*
|
||||||
* fluxlight.com — BigCommerce, server-rendered HTML with real prices.
|
* www.fluxlight.com — BigCommerce, server-rendered HTML with real prices.
|
||||||
* ~144+ products across 6 pages. Uses pagination via ?page=N.
|
* ~144+ products across 6 pages. Uses pagination via ?page=N.
|
||||||
*
|
*
|
||||||
* Rate limited: 1 req/2sec.
|
* Rate limited: 1 req/2sec.
|
||||||
@ -91,8 +91,8 @@ function parseProductList(html: string): Product[] {
|
|||||||
const products: Product[] = [];
|
const products: Product[] = [];
|
||||||
|
|
||||||
// BigCommerce product card pattern: product link + price
|
// BigCommerce product card pattern: product link + price
|
||||||
// Pattern: <a href="https://fluxlight.com/PARTNUM-FL/">Product Name</a> ... $29.99
|
// Pattern: <a href="https://www.fluxlight.com/PARTNUM-FL/">Product Name</a> ... $29.99
|
||||||
const productRegex = /href="(https?:\/\/fluxlight\.com\/[^"]*-FL\/)"[^>]*>\s*([^<]{10,})<\/a>/gi;
|
const productRegex = /href="(https?:\/\/(?:www\.)?fluxlight\.com\/[^"]*-FL\/)"[^>]*>\s*([^<]{10,})<\/a>/gi;
|
||||||
let match;
|
let match;
|
||||||
while ((match = productRegex.exec(html)) !== null) {
|
while ((match = productRegex.exec(html)) !== null) {
|
||||||
const url = match[1];
|
const url = match[1];
|
||||||
@ -123,7 +123,7 @@ function parseProductList(html: string): Product[] {
|
|||||||
|
|
||||||
// Fallback: broader link pattern
|
// Fallback: broader link pattern
|
||||||
if (products.length === 0) {
|
if (products.length === 0) {
|
||||||
const simpleRegex = /href="(https?:\/\/fluxlight\.com\/[^"]+)"[^>]*>([^<]{10,}(?:SFP|QSFP|XFP|Base)[^<]*)<\/a>/gi;
|
const simpleRegex = /href="(https?:\/\/(?:www\.)?fluxlight\.com\/[^"]+)"[^>]*>([^<]{10,}(?:SFP|QSFP|XFP|Base)[^<]*)<\/a>/gi;
|
||||||
while ((match = simpleRegex.exec(html)) !== null) {
|
while ((match = simpleRegex.exec(html)) !== null) {
|
||||||
const url = match[1];
|
const url = match[1];
|
||||||
const name = match[2].trim();
|
const name = match[2].trim();
|
||||||
@ -166,7 +166,7 @@ async function fetchPage(url: string): Promise<string> {
|
|||||||
export async function scrapeFluxlight(): Promise<void> {
|
export async function scrapeFluxlight(): Promise<void> {
|
||||||
console.log("=== Fluxlight Scraper Starting ===\n");
|
console.log("=== Fluxlight Scraper Starting ===\n");
|
||||||
|
|
||||||
const vendorId = await ensureVendor("Fluxlight", "compatible", "https://fluxlight.com", "https://fluxlight.com/transceivers/");
|
const vendorId = await ensureVendor("Fluxlight", "compatible", "https://fluxlight.com", "https://www.fluxlight.com/transceivers/");
|
||||||
|
|
||||||
let allProducts: Product[] = [];
|
let allProducts: Product[] = [];
|
||||||
|
|
||||||
@ -210,7 +210,7 @@ export async function scrapeFluxlight(): Promise<void> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash({ price: product.price, part: product.partNumber });
|
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
||||||
const updated = await upsertPriceObservation({
|
const updated = await upsertPriceObservation({
|
||||||
transceiverId: txId, sourceVendorId: vendorId,
|
transceiverId: txId, sourceVendorId: vendorId,
|
||||||
price: product.price, currency: "USD",
|
price: product.price, currency: "USD",
|
||||||
|
|||||||
@ -8,7 +8,7 @@
|
|||||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||||
import { contentHash } from "../utils/hash";
|
import { contentHash } from "../utils/hash";
|
||||||
|
|
||||||
const BASE = "https://gbics.com";
|
const BASE = "https://www.gbics.com";
|
||||||
const HEADERS = {
|
const HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
||||||
Accept: "text/html,application/xhtml+xml",
|
Accept: "text/html,application/xhtml+xml",
|
||||||
@ -100,7 +100,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
|||||||
|
|
||||||
// BigCommerce card-title pattern:
|
// BigCommerce card-title pattern:
|
||||||
// <a aria-label="Product Name, £XX.XX" href="URL" data-event-type="product-click">
|
// <a aria-label="Product Name, £XX.XX" href="URL" data-event-type="product-click">
|
||||||
const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/gbics\.com\/[^"]+)"\s+data-event-type="product-click"/gi;
|
const productRegex = /aria-label="([^"]+)"\s+href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*data-event-type="product-click"/gi;
|
||||||
let match;
|
let match;
|
||||||
while ((match = productRegex.exec(collapsed)) !== null) {
|
while ((match = productRegex.exec(collapsed)) !== null) {
|
||||||
const label = match[1].trim();
|
const label = match[1].trim();
|
||||||
@ -110,7 +110,14 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
|||||||
// Split on last comma to separate name and price
|
// Split on last comma to separate name and price
|
||||||
const priceInLabel = label.match(/,\s*£\s*([\d,.]+)\s*$/);
|
const priceInLabel = label.match(/,\s*£\s*([\d,.]+)\s*$/);
|
||||||
const name = priceInLabel ? label.slice(0, label.lastIndexOf(",")).trim() : label;
|
const name = priceInLabel ? label.slice(0, label.lastIndexOf(",")).trim() : label;
|
||||||
const price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined;
|
let price = priceInLabel ? parseFloat(priceInLabel[1].replace(",", "")) : undefined;
|
||||||
|
|
||||||
|
// Fallback: extract price from data-price-asc attribute on parent <li>
|
||||||
|
if (!price) {
|
||||||
|
const priceContext = collapsed.slice(Math.max(0, match.index - 500), match.index);
|
||||||
|
const dataPriceMatch = priceContext.match(/data-price-asc="(\d+)"/);
|
||||||
|
if (dataPriceMatch) price = parseFloat(dataPriceMatch[1]);
|
||||||
|
}
|
||||||
|
|
||||||
if (name.length < 10) continue;
|
if (name.length < 10) continue;
|
||||||
|
|
||||||
@ -131,7 +138,7 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
|||||||
|
|
||||||
// Fallback: try "Now: £XX.XX" pattern near product links
|
// Fallback: try "Now: £XX.XX" pattern near product links
|
||||||
if (products.length === 0) {
|
if (products.length === 0) {
|
||||||
const altRegex = /href="(https?:\/\/gbics\.com\/[^"]+)"[^>]*>\s*([^<]{15,})<\/a>/gi;
|
const altRegex = /href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*>\s*([^<]{15,})<\/a>/gi;
|
||||||
while ((match = altRegex.exec(collapsed)) !== null) {
|
while ((match = altRegex.exec(collapsed)) !== null) {
|
||||||
const url = match[1];
|
const url = match[1];
|
||||||
const name = match[2].trim();
|
const name = match[2].trim();
|
||||||
@ -172,7 +179,7 @@ async function fetchPage(url: string): Promise<string> {
|
|||||||
export async function scrapeGbics(): Promise<void> {
|
export async function scrapeGbics(): Promise<void> {
|
||||||
console.log("=== GBICS.com Scraper Starting ===\n");
|
console.log("=== GBICS.com Scraper Starting ===\n");
|
||||||
|
|
||||||
const vendorId = await ensureVendor("GBICS", "compatible", "https://gbics.com", "https://gbics.com/optical-transceivers/");
|
const vendorId = await ensureVendor("GBICS", "compatible", "https://www.gbics.com", "https://www.gbics.com/optical-transceivers/");
|
||||||
|
|
||||||
let totalProducts = 0;
|
let totalProducts = 0;
|
||||||
let priceUpdates = 0;
|
let priceUpdates = 0;
|
||||||
@ -196,7 +203,7 @@ export async function scrapeGbics(): Promise<void> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (product.price && product.price > 0) {
|
if (product.price && product.price > 0) {
|
||||||
const hash = contentHash({ price: product.price, part: product.partNumber });
|
const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber }));
|
||||||
const updated = await upsertPriceObservation({
|
const updated = await upsertPriceObservation({
|
||||||
transceiverId: txId, sourceVendorId: vendorId,
|
transceiverId: txId, sourceVendorId: vendorId,
|
||||||
price: product.price, currency: "GBP",
|
price: product.price, currency: "GBP",
|
||||||
|
|||||||
@ -38,19 +38,14 @@ interface NewsArticle {
|
|||||||
const FEEDS: RssFeed[] = [
|
const FEEDS: RssFeed[] = [
|
||||||
// === PRIMARY: Transceiver-specific ===
|
// === PRIMARY: Transceiver-specific ===
|
||||||
{
|
{
|
||||||
name: "Lightwave Online",
|
name: "The Next Platform",
|
||||||
url: "https://www.lightwaveonline.com/rss",
|
url: "https://www.nextplatform.com/feed/",
|
||||||
category: "market_report",
|
category: "market_report",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "Lightwave - Fiber Optics",
|
name: "ServeTheHome",
|
||||||
url: "https://www.lightwaveonline.com/fttx/rss",
|
url: "https://www.servethehome.com/feed/",
|
||||||
category: "market_report",
|
category: "product_launch",
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Fierce Telecom",
|
|
||||||
url: "https://www.fiercetelecom.com/rss/xml",
|
|
||||||
category: "market_report",
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "Optics.org",
|
name: "Optics.org",
|
||||||
@ -69,8 +64,8 @@ const FEEDS: RssFeed[] = [
|
|||||||
category: "market_report",
|
category: "market_report",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "SDxCentral",
|
name: "The Register - Data Centre",
|
||||||
url: "https://www.sdxcentral.com/feed/",
|
url: "https://www.theregister.com/data_centre/headlines.atom",
|
||||||
category: "market_report",
|
category: "market_report",
|
||||||
},
|
},
|
||||||
// === TERTIARY: General tech / photonics ===
|
// === TERTIARY: General tech / photonics ===
|
||||||
|
|||||||
@ -1,22 +1,29 @@
|
|||||||
/**
|
/**
|
||||||
* ProLabs Scraper — Enterprise-grade compatible optics (Legrand subsidiary)
|
* ProLabs Scraper — Enterprise-grade compatible optics (Legrand subsidiary)
|
||||||
*
|
*
|
||||||
* prolabs.com — Server-rendered HTML with public USD pricing.
|
* prolabs.com — CloudFront WAF aggressively blocks datacenter IPs.
|
||||||
|
* Uses PlaywrightCrawler with Firefox for anti-detection.
|
||||||
|
*
|
||||||
|
* KNOWN ISSUE: CloudFront blocks all requests from IONOS/datacenter IPs
|
||||||
|
* (HTTP 403 "Request blocked"). This scraper works correctly from
|
||||||
|
* residential IPs. Solutions:
|
||||||
|
* 1. Set PROXY_URL env var to a residential/rotating proxy
|
||||||
|
* 2. Run from a residential IP (e.g. home server)
|
||||||
|
* 3. Route through WireGuard with internet breakout at home
|
||||||
|
*
|
||||||
* Products listed under /products/networking/fiber-optics/ category pages.
|
* Products listed under /products/networking/fiber-optics/ category pages.
|
||||||
* Pagination via ?page=N. Rate limited: 1 req/2sec. Max 100 pages.
|
* Pagination via ?page=N. Rate limited: maxConcurrency 1, 10 req/min.
|
||||||
*
|
*
|
||||||
* SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR"
|
* SKU format examples: "Q-4X10G-LR-PR", "SFP-10G-SR-PR", "Q28-100G-LR4-PR"
|
||||||
*/
|
*/
|
||||||
|
import { PlaywrightCrawler, RequestQueue } from "crawlee";
|
||||||
|
import { firefox } from "playwright";
|
||||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||||
import { contentHash } from "../utils/hash";
|
import { contentHash } from "../utils/hash";
|
||||||
|
|
||||||
const BASE = "https://www.prolabs.com";
|
const BASE = "https://www.prolabs.com";
|
||||||
const HEADERS = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
|
|
||||||
Accept: "text/html,application/xhtml+xml",
|
|
||||||
};
|
|
||||||
|
|
||||||
const MAX_PAGES = 100;
|
const MAX_PAGES = 100;
|
||||||
|
const PROXY_URL = process.env.PROXY_URL || "";
|
||||||
|
|
||||||
const CATEGORIES = [
|
const CATEGORIES = [
|
||||||
{ path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
{ path: "/products/networking/fiber-optics/sfp-modules", formFactor: "SFP", speed: "1G", speedGbps: 1 },
|
||||||
@ -26,7 +33,6 @@ const CATEGORIES = [
|
|||||||
{ path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
{ path: "/products/networking/fiber-optics/qsfp28-modules", formFactor: "QSFP28", speed: "100G", speedGbps: 100 },
|
||||||
{ path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
{ path: "/products/networking/fiber-optics/qsfp-dd-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
||||||
{ path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
{ path: "/products/networking/fiber-optics/coherent-modules", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 },
|
||||||
// Broad fallback category in case above paths differ on the live site
|
|
||||||
{ path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
{ path: "/products/networking/fiber-optics", formFactor: "SFP+", speed: "10G", speedGbps: 10 },
|
||||||
];
|
];
|
||||||
|
|
||||||
@ -45,9 +51,9 @@ interface Product {
|
|||||||
wavelength?: string;
|
wavelength?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
function sleep(ms: number): Promise<void> {
|
/* ------------------------------------------------------------------ */
|
||||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
/* Helper / detection functions (unchanged from original) */
|
||||||
}
|
/* ------------------------------------------------------------------ */
|
||||||
|
|
||||||
function detectReach(text: string): { label: string; meters: number } | undefined {
|
function detectReach(text: string): { label: string; meters: number } | undefined {
|
||||||
const patterns: [RegExp, string, number][] = [
|
const patterns: [RegExp, string, number][] = [
|
||||||
@ -90,18 +96,6 @@ function detectWavelength(text: string): string {
|
|||||||
return match ? match[1] : "";
|
return match ? match[1] : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Infer form factor and speed from ProLabs SKU prefixes when category context
|
|
||||||
* is not specific enough (e.g. when crawling the broad fallback category).
|
|
||||||
*
|
|
||||||
* ProLabs SKU prefix conventions:
|
|
||||||
* Q- -> QSFP+ 40G
|
|
||||||
* Q28- -> QSFP28 100G
|
|
||||||
* QDD- -> QSFP-DD 400G
|
|
||||||
* SFP28- -> SFP28 25G
|
|
||||||
* SFP- -> SFP+ 10G (most common ProLabs prefix)
|
|
||||||
* S- -> SFP 1G
|
|
||||||
*/
|
|
||||||
function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
|
function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
|
||||||
formFactor: string;
|
formFactor: string;
|
||||||
speed: string;
|
speed: string;
|
||||||
@ -116,121 +110,6 @@ function inferFromSku(sku: string, cat: typeof CATEGORIES[number]): {
|
|||||||
return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps };
|
return { formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps };
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Parse product listings from a ProLabs category page.
 *
 * The HTML is whitespace-collapsed and scanned with regular expressions
 * (no DOM parser dependency). Two strategies are tried:
 *
 *   1. Anchors whose href is a `/products/...` path ending in a SKU-like
 *      segment and whose link text is at least 10 characters long.
 *   2. Only if strategy 1 found nothing: any `/products/...` href that has a
 *      `$` amount in the surrounding text.
 *
 * For each hit, price and stock badge are looked up in a fixed-size text
 * window around the link; reach, fiber type and wavelength are detected from
 * the name plus part number; form factor and speed come from `inferFromSku`.
 *
 * @param html Raw HTML of one category listing page.
 * @param cat  Category entry the page was fetched for; passed to `inferFromSku`.
 * @returns Parsed products, de-duplicated by absolute URL (first hit wins).
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const collapsed = html.replace(/\s+/g, " ");

  // Navigation / filter / pagination links are not products.
  const navLink = /category|filter|sort|page|breadcrumb/i;

  // Strip EVERY thousands separator before parsing. A plain-string
  // .replace(",", "") only removes the first one, which turned "1,000,000"
  // into 1000 and let it slip past the < 100000 sanity check.
  const toAmount = (raw: string): number => parseFloat(raw.replace(/,/g, ""));

  // Strategy 1: product cards with structured href containing a SKU-like segment.
  // Match anchor tags whose href is a deep product path ending in a SKU pattern.
  const productLinkRegex = /href="(\/products\/[^"]*?\/([A-Z0-9][A-Z0-9\-_]{3,}(?:-PR)?))"\s[^>]*>([^<]{10,})<\/a>/gi;
  let match: RegExpExecArray | null;

  while ((match = productLinkRegex.exec(collapsed)) !== null) {
    const relUrl = match[1];
    const skuFromUrl = match[2];
    const linkText = match[3].trim();

    if (navLink.test(relUrl)) continue;
    if (linkText.length > 200) continue;

    const url = BASE + relUrl;
    const partNumber = skuFromUrl.slice(0, 80);
    // The regex guarantees 10+ raw characters, but trimming may shorten it.
    const name = linkText.length > 10 ? linkText : partNumber;

    // Price / stock are searched in a window that starts 100 chars BEFORE the
    // link and extends 700 chars after it; the first "$" amount in that
    // window wins, otherwise the first number following a "price" attribute.
    const context = collapsed.slice(Math.max(0, match.index - 100), match.index + 700);
    const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/) ||
      context.match(/price[^>]*>\s*\$?\s*([\d,]+\.?\d{0,2})/i);
    const price = priceMatch ? toAmount(priceMatch[1]) : undefined;

    const stockMatch = context.match(/(in[\s-]stock|out[\s-]of[\s-]stock|call for availability|available|backordered)/i);
    const stockStatus = stockMatch ? stockMatch[1].toLowerCase() : undefined;

    const combined = name + " " + partNumber;
    const reach = detectReach(combined);
    const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);

    products.push({
      partNumber, name, url,
      // NaN, zero and implausibly large amounts are all dropped.
      price: price && price > 0 && price < 100000 ? price : undefined,
      stockStatus,
      formFactor, speed, speedGbps,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(combined),
      wavelength: detectWavelength(combined),
    });
  }

  // Strategy 2: Fallback — any link to a /products/ URL that has a $ price nearby.
  if (products.length === 0) {
    const altRegex = /href="(\/products\/[^"]{10,})"/gi;
    while ((match = altRegex.exec(collapsed)) !== null) {
      const relUrl = match[1];
      if (navLink.test(relUrl)) continue;

      // Window: 50 chars before the href to 800 chars after it.
      const context = collapsed.slice(Math.max(0, match.index - 50), match.index + 800);
      const priceMatch = context.match(/\$\s*([\d,]+\.?\d{0,2})/);
      if (!priceMatch) continue;

      const price = toAmount(priceMatch[1]);
      const nameMatch = context.match(/<(?:h[23]|strong|span)[^>]*>([^<]{10,150})<\//i);
      const lastSegment = relUrl.split("/").pop() || "";
      const name = nameMatch ? nameMatch[1].trim() : lastSegment;
      // `||` rather than `??`: a trailing-slash URL yields "" (not undefined)
      // as its last segment, and an empty part number must fall back to the name.
      const partNumber = (lastSegment || name).slice(0, 80);

      const url = BASE + relUrl;
      const combined = name + " " + partNumber;
      const reach = detectReach(combined);
      const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);

      products.push({
        partNumber, name, url,
        price: price > 0 && price < 100000 ? price : undefined,
        formFactor, speed, speedGbps,
        reachLabel: reach?.label,
        reachMeters: reach?.meters,
        fiberType: detectFiber(combined),
        wavelength: detectWavelength(combined),
      });
    }
  }

  // Deduplicate by URL, keeping the first occurrence.
  const seen = new Set<string>();
  return products.filter((p) => {
    if (seen.has(p.url)) return false;
    seen.add(p.url);
    return true;
  });
}
|
|
||||||
|
|
||||||
/**
 * Check if the HTML contains a link to the next pagination page.
 *
 * @param html        Raw HTML of the current listing page.
 * @param currentPage 1-based number of the page `html` was fetched for.
 * @returns true when the markup contains `rel="next"`, or a `page=<N>` query
 *          parameter where N is exactly `currentPage + 1`.
 */
function hasNextPage(html: string, currentPage: number): boolean {
  if (/rel="next"/i.test(html)) return true;
  const nextPageNum = currentPage + 1;
  // `&` is normally serialized as `&amp;` inside HTML attribute values, so
  // accept both spellings of the separator. The trailing (?!\d) keeps
  // page=2 from also matching page=20, page=21, ...
  const pattern = new RegExp(`(?:[?&]|&amp;)page=${nextPageNum}(?!\\d)`, "i");
  return pattern.test(html);
}
|
|
||||||
|
|
||||||
/**
 * Download one page and return its body as text.
 *
 * Sends the module-level HEADERS and aborts the request after 30 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body as text.
 * @throws Error with message `HTTP <status> for <url>` on a non-2xx response.
 */
async function fetchPage(url: string): Promise<string> {
  const timeoutSignal = AbortSignal.timeout(30000);
  const response = await fetch(url, { headers: HEADERS, signal: timeoutSignal });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
|
||||||
|
|
||||||
function normalizeStockLevel(
|
function normalizeStockLevel(
|
||||||
raw?: string
|
raw?: string
|
||||||
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
|
): "in_stock" | "low_stock" | "out_of_stock" | "on_request" {
|
||||||
@ -242,8 +121,19 @@ function normalizeStockLevel(
|
|||||||
return "on_request";
|
return "on_request";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ------------------------------------------------------------------ */
|
||||||
|
/* Main scraper */
|
||||||
|
/* ------------------------------------------------------------------ */
|
||||||
|
|
||||||
export async function scrapeProLabs(): Promise<void> {
|
export async function scrapeProLabs(): Promise<void> {
|
||||||
console.log("=== ProLabs Scraper Starting ===\n");
|
console.log("=== ProLabs Scraper Starting (PlaywrightCrawler + Firefox) ===\n");
|
||||||
|
|
||||||
|
if (PROXY_URL) {
|
||||||
|
console.log(`Using proxy: ${PROXY_URL.replace(/:[^:@]+@/, ":***@")}`);
|
||||||
|
} else {
|
||||||
|
console.log("WARNING: No PROXY_URL set. CloudFront WAF blocks datacenter IPs.");
|
||||||
|
console.log("Set PROXY_URL env var for residential proxy if running from VPS.\n");
|
||||||
|
}
|
||||||
|
|
||||||
const vendorId = await ensureVendor(
|
const vendorId = await ensureVendor(
|
||||||
"ProLabs",
|
"ProLabs",
|
||||||
@ -254,30 +144,253 @@ export async function scrapeProLabs(): Promise<void> {
|
|||||||
|
|
||||||
let totalProducts = 0;
|
let totalProducts = 0;
|
||||||
let priceUpdates = 0;
|
let priceUpdates = 0;
|
||||||
|
let blockedPages = 0;
|
||||||
const seenUrls = new Set<string>();
|
const seenUrls = new Set<string>();
|
||||||
|
|
||||||
|
// Map URL -> category metadata
|
||||||
|
const urlToCat = new Map<string, typeof CATEGORIES[number]>();
|
||||||
|
|
||||||
|
const requestQueue = await RequestQueue.open();
|
||||||
|
|
||||||
for (const cat of CATEGORIES) {
|
for (const cat of CATEGORIES) {
|
||||||
console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);
|
const url = `${BASE}${cat.path}`;
|
||||||
|
urlToCat.set(url, cat);
|
||||||
|
await requestQueue.addRequest({ url, userData: { page: 1, catPath: cat.path } });
|
||||||
|
}
|
||||||
|
|
||||||
let page = 1;
|
const crawler = new PlaywrightCrawler({
|
||||||
let pagesThisCat = 0;
|
requestQueue,
|
||||||
let productsThisCat = 0;
|
maxConcurrency: 1,
|
||||||
|
maxRequestsPerMinute: 10,
|
||||||
|
requestHandlerTimeoutSecs: 120,
|
||||||
|
navigationTimeoutSecs: 60,
|
||||||
|
maxRequestRetries: 2,
|
||||||
|
headless: true,
|
||||||
|
// Override default blockedStatusCodes (normally [401, 403, 429]).
|
||||||
|
// We allow 403 so our handler can inspect the page — CloudFront may
|
||||||
|
// serve a JS challenge that resolves, or we can log the block gracefully.
|
||||||
|
sessionPoolOptions: {
|
||||||
|
blockedStatusCodes: [401, 429],
|
||||||
|
},
|
||||||
|
browserPoolOptions: {
|
||||||
|
useFingerprints: false,
|
||||||
|
},
|
||||||
|
launchContext: {
|
||||||
|
launcher: firefox,
|
||||||
|
launchOptions: {
|
||||||
|
firefoxUserPrefs: {
|
||||||
|
"toolkit.telemetry.enabled": false,
|
||||||
|
"privacy.trackingprotection.enabled": false,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
...(PROXY_URL ? {
|
||||||
|
proxyConfiguration: new (require("crawlee").ProxyConfiguration)({
|
||||||
|
proxyUrls: [PROXY_URL],
|
||||||
|
}),
|
||||||
|
} : {}),
|
||||||
|
preNavigationHooks: [
|
||||||
|
async ({ page }, goToOptions) => {
|
||||||
|
// Realistic viewport
|
||||||
|
await page.setViewportSize({ width: 1920, height: 1080 });
|
||||||
|
|
||||||
while (page <= MAX_PAGES) {
|
// Override webdriver detection
|
||||||
const url = page === 1
|
await page.addInitScript(() => {
|
||||||
? `${BASE}${cat.path}`
|
Object.defineProperty(navigator, "webdriver", { get: () => false });
|
||||||
: `${BASE}${cat.path}?page=${page}`;
|
});
|
||||||
|
|
||||||
|
if (goToOptions) {
|
||||||
|
goToOptions.waitUntil = "load";
|
||||||
|
}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
|
||||||
|
async requestHandler({ page, request, log }) {
|
||||||
|
const currentPage: number = request.userData?.page ?? 1;
|
||||||
|
const catPath: string = request.userData?.catPath ?? "";
|
||||||
|
|
||||||
|
const cat = urlToCat.get(request.url) ??
|
||||||
|
CATEGORIES.find((c) => catPath === c.path) ??
|
||||||
|
CATEGORIES[CATEGORIES.length - 1];
|
||||||
|
urlToCat.set(request.url, cat);
|
||||||
|
|
||||||
|
log.info(`[${cat.formFactor} ${cat.speed}] Page ${currentPage}: ${request.url}`);
|
||||||
|
|
||||||
|
// Give JS challenges time to resolve
|
||||||
|
await page.waitForTimeout(8000);
|
||||||
|
|
||||||
|
// Check what we actually got
|
||||||
|
const pageTitle = await page.title();
|
||||||
|
const bodyText = await page.evaluate(() => document.body?.innerText?.slice(0, 500) || "");
|
||||||
|
log.info(` Title: "${pageTitle}"`);
|
||||||
|
|
||||||
|
// Detect CloudFront WAF block
|
||||||
|
if (bodyText.includes("Request blocked") ||
|
||||||
|
bodyText.includes("Access Denied") ||
|
||||||
|
bodyText.includes("403 ERROR") ||
|
||||||
|
pageTitle.includes("ERROR")) {
|
||||||
|
blockedPages++;
|
||||||
|
log.warning(` CloudFront WAF blocked this page (${blockedPages} total blocked)`);
|
||||||
|
if (blockedPages >= 3 && totalProducts === 0) {
|
||||||
|
log.warning(` Multiple blocks detected — likely IP-level block. Consider using PROXY_URL.`);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract products via page.evaluate
|
||||||
|
const productData = await page.evaluate(() => {
|
||||||
|
const results: Array<{
|
||||||
|
name: string;
|
||||||
|
href: string;
|
||||||
|
price: string;
|
||||||
|
stock: string;
|
||||||
|
partNumber: string;
|
||||||
|
}> = [];
|
||||||
|
|
||||||
|
// Strategy 1: Product card links
|
||||||
|
const productLinks = document.querySelectorAll(
|
||||||
|
'a[href*="/products/"], .product-card a, .product-item a, [class*="product"] a[href], .product-list a, .category-products a, [data-product] a'
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const link of productLinks) {
|
||||||
|
const el = link as HTMLAnchorElement;
|
||||||
|
const name = el.textContent?.trim() || "";
|
||||||
|
const href = el.getAttribute("href") || "";
|
||||||
|
|
||||||
|
if (!name || name.length < 5 || name.length > 200 || !href) continue;
|
||||||
|
if (/category|filter|sort|breadcrumb|login|cart|account/i.test(href) && !/products\//i.test(href)) continue;
|
||||||
|
|
||||||
|
const container =
|
||||||
|
el.closest('[class*="product"]') ||
|
||||||
|
el.closest('[class*="item"]') ||
|
||||||
|
el.closest('[class*="card"]') ||
|
||||||
|
el.closest("li") ||
|
||||||
|
el.parentElement?.parentElement?.parentElement;
|
||||||
|
|
||||||
|
let price = "";
|
||||||
|
let stock = "";
|
||||||
|
let pn = "";
|
||||||
|
|
||||||
|
if (container) {
|
||||||
|
const priceEl = container.querySelector(
|
||||||
|
'[class*="price"], [class*="Price"], [data-price], .price'
|
||||||
|
);
|
||||||
|
price = priceEl?.textContent?.trim() || "";
|
||||||
|
if (!price) {
|
||||||
|
const containerText = container.textContent || "";
|
||||||
|
const priceMatch = containerText.match(/\$\s*[\d,]+\.?\d{0,2}/);
|
||||||
|
if (priceMatch) price = priceMatch[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
const stockEl = container.querySelector(
|
||||||
|
'[class*="stock"], [class*="Stock"], [class*="avail"], [class*="Avail"]'
|
||||||
|
);
|
||||||
|
stock = stockEl?.textContent?.trim() || "";
|
||||||
|
|
||||||
|
const skuEl = container.querySelector(
|
||||||
|
'[class*="sku"], [class*="SKU"], [class*="part"], [class*="Part"], [class*="model"]'
|
||||||
|
);
|
||||||
|
pn = skuEl?.textContent?.trim() || "";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!pn) {
|
||||||
|
pn = href.split("/").pop()?.replace(/\.html?$/, "")?.replace(/#.*$/, "") || "";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name && href.includes("/products/")) {
|
||||||
|
results.push({ name, href, price, stock, partNumber: pn });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strategy 2: Scan deeper for anchors with product URLs
|
||||||
|
if (results.length === 0) {
|
||||||
|
const allAnchors = document.querySelectorAll("a[href*='/products/']");
|
||||||
|
for (const el of allAnchors) {
|
||||||
|
const anchor = el as HTMLAnchorElement;
|
||||||
|
const href = anchor.getAttribute("href") || "";
|
||||||
|
const name = anchor.textContent?.trim() || "";
|
||||||
|
if (!name || name.length < 5) continue;
|
||||||
|
|
||||||
|
let parent: Element | null = anchor;
|
||||||
|
let price = "";
|
||||||
|
for (let i = 0; i < 4 && parent; i++) {
|
||||||
|
parent = parent.parentElement;
|
||||||
|
if (parent) {
|
||||||
|
const text = parent.textContent || "";
|
||||||
|
const m = text.match(/\$\s*[\d,]+\.?\d{0,2}/);
|
||||||
|
if (m) { price = m[0]; break; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const pn = href.split("/").pop()?.replace(/\.html?$/, "") || "";
|
||||||
|
results.push({ name, href, price, stock: "", partNumber: pn });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strategy 3: JSON-LD structured data
|
||||||
|
const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
|
||||||
|
for (const script of ldScripts) {
|
||||||
try {
|
try {
|
||||||
const html = await fetchPage(url);
|
const data = JSON.parse(script.textContent || "");
|
||||||
const pageProducts = parseProductList(html, cat);
|
const items = data.itemListElement || (Array.isArray(data) ? data : [data]);
|
||||||
|
for (const item of items) {
|
||||||
|
if (item["@type"] === "Product" || item.offers) {
|
||||||
|
const name = item.name || "";
|
||||||
|
const href = item.url || "";
|
||||||
|
const offers = item.offers || {};
|
||||||
|
const price = offers.price ? `$${offers.price}` : "";
|
||||||
|
const stock = offers.availability || "";
|
||||||
|
const pn = item.sku || item.mpn || href.split("/").pop() || "";
|
||||||
|
if (name) results.push({ name, href, price, stock, partNumber: pn });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch { /* ignore parse errors */ }
|
||||||
|
}
|
||||||
|
|
||||||
// Global dedup: broad fallback category overlaps with specific ones
|
return results;
|
||||||
|
});
|
||||||
|
|
||||||
|
log.info(` Raw items extracted: ${productData.length}`);
|
||||||
|
|
||||||
|
// Process extracted products
|
||||||
|
const pageProducts: Product[] = [];
|
||||||
|
|
||||||
|
for (const item of productData) {
|
||||||
|
if (!item.name) continue;
|
||||||
|
|
||||||
|
const partNumber = (item.partNumber || item.name).slice(0, 80).trim();
|
||||||
|
const name = item.name.slice(0, 200).trim();
|
||||||
|
const url = item.href.startsWith("http") ? item.href : `${BASE}${item.href}`;
|
||||||
|
|
||||||
|
let price: number | undefined;
|
||||||
|
if (item.price) {
|
||||||
|
const cleaned = item.price.replace(/[^\d.,]/g, "").replace(",", "");
|
||||||
|
const parsed = parseFloat(cleaned);
|
||||||
|
if (parsed > 0 && parsed < 100000) price = parsed;
|
||||||
|
}
|
||||||
|
|
||||||
|
const combined = name + " " + partNumber;
|
||||||
|
const reach = detectReach(combined);
|
||||||
|
const { formFactor, speed, speedGbps } = inferFromSku(partNumber, cat);
|
||||||
|
|
||||||
|
pageProducts.push({
|
||||||
|
partNumber, name, url, price,
|
||||||
|
stockStatus: item.stock || undefined,
|
||||||
|
formFactor, speed, speedGbps,
|
||||||
|
reachLabel: reach?.label,
|
||||||
|
reachMeters: reach?.meters,
|
||||||
|
fiberType: detectFiber(combined),
|
||||||
|
wavelength: detectWavelength(combined),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deduplicate against global set
|
||||||
const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url));
|
const newProducts = pageProducts.filter((p) => !seenUrls.has(p.url));
|
||||||
newProducts.forEach((p) => seenUrls.add(p.url));
|
for (const p of newProducts) seenUrls.add(p.url);
|
||||||
|
|
||||||
console.log(` Page ${page}: ${pageProducts.length} found, ${newProducts.length} new`);
|
log.info(` Parsed: ${pageProducts.length} found, ${newProducts.length} new`);
|
||||||
|
|
||||||
|
// Write to database
|
||||||
for (const product of newProducts) {
|
for (const product of newProducts) {
|
||||||
try {
|
try {
|
||||||
const txId = await findOrCreateScrapedTransceiver({
|
const txId = await findOrCreateScrapedTransceiver({
|
||||||
@ -311,33 +424,54 @@ export async function scrapeProLabs(): Promise<void> {
|
|||||||
if (updated) priceUpdates++;
|
if (updated) priceUpdates++;
|
||||||
}
|
}
|
||||||
|
|
||||||
productsThisCat++;
|
|
||||||
totalProducts++;
|
totalProducts++;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.warn(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
|
log.warning(` DB error [${product.partNumber}]: ${(err as Error).message.slice(0, 80)}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pagesThisCat++;
|
// Check for next page
|
||||||
|
const hasNext = await page.evaluate((currentPageNum: number) => {
|
||||||
if (pageProducts.length === 0 || !hasNextPage(html, page)) break;
|
const nextLink = document.querySelector('a[rel="next"], link[rel="next"]');
|
||||||
|
if (nextLink) return true;
|
||||||
page++;
|
const nextNum = currentPageNum + 1;
|
||||||
await sleep(2000);
|
const paginationLinks = document.querySelectorAll('a[href*="page="], .pagination a, nav a');
|
||||||
} catch (err) {
|
for (const link of paginationLinks) {
|
||||||
console.error(` Page ${page} failed: ${(err as Error).message}`);
|
const href = (link as HTMLAnchorElement).getAttribute("href") || "";
|
||||||
break;
|
if (href.includes(`page=${nextNum}`)) return true;
|
||||||
}
|
const text = link.textContent?.trim() || "";
|
||||||
|
if (text === String(nextNum) || text.toLowerCase() === "next" || text === "\u203a" || text === "\u00bb") return true;
|
||||||
}
|
}
|
||||||
|
return false;
|
||||||
|
}, currentPage);
|
||||||
|
|
||||||
console.log(` Category done: ${productsThisCat} products across ${pagesThisCat} page(s)`);
|
if (hasNext && currentPage < MAX_PAGES && newProducts.length > 0) {
|
||||||
|
const nextPageNum = currentPage + 1;
|
||||||
if (cat !== CATEGORIES[CATEGORIES.length - 1]) {
|
const nextUrl = `${BASE}${catPath}?page=${nextPageNum}`;
|
||||||
await sleep(2000);
|
urlToCat.set(nextUrl, cat);
|
||||||
}
|
await requestQueue.addRequest({
|
||||||
|
url: nextUrl,
|
||||||
|
userData: { page: nextPageNum, catPath },
|
||||||
|
});
|
||||||
|
log.info(` Enqueued next page: ${nextPageNum}`);
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
|
||||||
console.log(`\n=== ProLabs Complete: ${totalProducts} products processed, ${priceUpdates} price updates ===`);
|
async failedRequestHandler({ request, log }) {
|
||||||
|
log.error(`Request failed after retries: ${request.url}`);
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
await crawler.run();
|
||||||
|
|
||||||
|
console.log(`\n=== ProLabs Complete ===`);
|
||||||
|
console.log(` Products processed: ${totalProducts}`);
|
||||||
|
console.log(` Price updates: ${priceUpdates}`);
|
||||||
|
console.log(` Pages blocked by WAF: ${blockedPages}`);
|
||||||
|
if (blockedPages > 0 && totalProducts === 0) {
|
||||||
|
console.log(`\n All pages blocked by CloudFront WAF (datacenter IP detected).`);
|
||||||
|
console.log(` Fix: Set PROXY_URL=http://user:pass@proxy:port in .env`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (require.main === module) {
|
if (require.main === module) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user