feat: add NADDOD, QSFPTEK, and AddOn Networks scrapers
Three new fetch-based price scrapers for compatible optics vendors: - NADDOD: WooCommerce, USD, ~800+ SKUs - QSFPTEK: Custom PHP shop, USD, ~1000+ SKUs - AddOn Networks: Magento/custom, USD, ~2500 SKUs All registered in scheduler (8-12h intervals) and index.ts --flags. Build: 0 TypeScript errors.
This commit is contained in:
parent
fcddd1f27b
commit
2348238888
@ -27,6 +27,9 @@
|
||||
* tsx src/index.ts --switch-crawl-pw — Crawl switch assets (Playwright, JS-heavy vendors)
|
||||
* tsx src/index.ts --fetch-only — Run only fetch-based scrapers (no Playwright)
|
||||
* tsx src/index.ts --atgbics — Run ATGBICS scraper once
|
||||
* tsx src/index.ts --naddod — Run NADDOD scraper once
|
||||
* tsx src/index.ts --qsfptek — Run QSFPTEK scraper once
|
||||
* tsx src/index.ts --addon — Run AddOn Networks scraper once
|
||||
*/
|
||||
import { createScheduler, registerSchedules, registerWorkers } from "./scheduler";
|
||||
import { scrapeFs } from "./scrapers/fs-com";
|
||||
@ -54,6 +57,9 @@ import { crawlSwitchAssets } from "./scrapers/switch-assets-crawler";
|
||||
import { crawlSwitchAssetsPlaywright } from "./scrapers/switch-assets-playwright";
|
||||
import { scrapeAtgbics } from "./scrapers/atgbics";
|
||||
import { scrapeProLabs } from "./scrapers/prolabs";
|
||||
import { scrapeNaddod } from "./scrapers/naddod";
|
||||
import { scrapeQsfptek } from "./scrapers/qsfptek";
|
||||
import { scrapeAddonNetworks } from "./scrapers/addon-networks";
|
||||
import { pool } from "./utils/db";
|
||||
|
||||
const args = process.argv.slice(2);
|
||||
@ -86,6 +92,15 @@ async function runOnce(): Promise<void> {
|
||||
if (args.includes("--prolabs") || isAll || isFetchOnly) {
|
||||
await scrapeProLabs();
|
||||
}
|
||||
if (args.includes("--naddod") || isAll || isFetchOnly) {
|
||||
await scrapeNaddod();
|
||||
}
|
||||
if (args.includes("--qsfptek") || isAll || isFetchOnly) {
|
||||
await scrapeQsfptek();
|
||||
}
|
||||
if (args.includes("--addon") || isAll || isFetchOnly) {
|
||||
await scrapeAddonNetworks();
|
||||
}
|
||||
if (args.includes("--juniper") || isAll || isFetchOnly) {
|
||||
await scrapeJuniperHct();
|
||||
}
|
||||
@ -172,7 +187,7 @@ async function runScheduler(): Promise<void> {
|
||||
process.on("SIGTERM", shutdown);
|
||||
}
|
||||
|
||||
const ALL_FLAGS = ["--all", "--fs", "--cisco", "--optcore", "--news", "--flexoptix", "--vendors", "--10gtek", "--champion", "--fluxlight", "--sfpcables", "--gbics", "--prolabs", "--juniper", "--switches", "--whitebox", "--switches-ext", "--flexoptix-vendors", "--sonic-hcl", "--edgecore", "--ufispace", "--switch-assets", "--switch-crawl", "--switch-crawl-pw", "--fetch-only", "--atgbics"];
|
||||
const ALL_FLAGS = ["--all", "--fs", "--cisco", "--optcore", "--news", "--flexoptix", "--vendors", "--10gtek", "--champion", "--fluxlight", "--sfpcables", "--gbics", "--prolabs", "--naddod", "--qsfptek", "--addon", "--juniper", "--switches", "--whitebox", "--switches-ext", "--flexoptix-vendors", "--sonic-hcl", "--edgecore", "--ufispace", "--switch-assets", "--switch-crawl", "--switch-crawl-pw", "--fetch-only", "--atgbics"];
|
||||
|
||||
if (args.some((a) => ALL_FLAGS.includes(a))) {
|
||||
runOnce().catch((err) => {
|
||||
|
||||
@ -61,6 +61,9 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
"scrape:pricing:10gtek",
|
||||
"scrape:pricing:atgbics",
|
||||
"scrape:pricing:prolabs",
|
||||
"scrape:pricing:naddod",
|
||||
"scrape:pricing:qsfptek",
|
||||
"scrape:pricing:addon",
|
||||
"scrape:compat:cisco",
|
||||
"scrape:pricing:flexoptix",
|
||||
"scrape:vendors:flexoptix",
|
||||
@ -120,6 +123,24 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// NADDOD pricing (every 8 hours — WooCommerce, USD prices)
|
||||
await boss.schedule("scrape:pricing:naddod", "0 5/8 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// QSFPTEK pricing (every 10 hours — custom PHP shop, USD prices)
|
||||
await boss.schedule("scrape:pricing:qsfptek", "0 3/10 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// AddOn Networks pricing (every 12 hours — enterprise site, USD prices)
|
||||
await boss.schedule("scrape:pricing:addon", "0 6/12 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
expireInSeconds: 3600,
|
||||
});
|
||||
|
||||
// Flexoptix catalog (every 6 hours — fetch-based, fast)
|
||||
await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, {
|
||||
retryLimit: 2,
|
||||
@ -152,6 +173,9 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
const { scrapeNews } = await import("./scrapers/news");
|
||||
const { scrapeAtgbics } = await import("./scrapers/atgbics");
|
||||
const { scrapeProLabs } = await import("./scrapers/prolabs");
|
||||
const { scrapeNaddod } = await import("./scrapers/naddod");
|
||||
const { scrapeQsfptek } = await import("./scrapers/qsfptek");
|
||||
const { scrapeAddonNetworks } = await import("./scrapers/addon-networks");
|
||||
|
||||
await boss.work("scrape:pricing:fs", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
||||
@ -198,6 +222,21 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
await withIsolatedStorage("prolabs", scrapeProLabs);
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:naddod", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: NADDOD pricing`);
|
||||
await scrapeNaddod();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:qsfptek", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: QSFPTEK pricing`);
|
||||
await scrapeQsfptek();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:addon", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] Running: AddOn Networks pricing`);
|
||||
await scrapeAddonNetworks();
|
||||
});
|
||||
|
||||
await boss.work("scrape:faq", async (_job) => {
|
||||
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
|
||||
});
|
||||
|
||||
303
packages/scraper/src/scrapers/addon-networks.ts
Normal file
303
packages/scraper/src/scrapers/addon-networks.ts
Normal file
@ -0,0 +1,303 @@
|
||||
/**
|
||||
* AddOn Networks Scraper — US-based compatible optics vendor
|
||||
*
|
||||
* addnetworks.com — Enterprise-grade compatible transceivers.
|
||||
* Products browseable under /products/ category pages.
|
||||
* Pricing is public in USD. Rate limited: 1 req/2sec.
|
||||
*
|
||||
* AddOn Networks (AddOn Computer Products) specializes in OEM-compatible
|
||||
* optics for Cisco, Juniper, Arista, HPE, and Dell environments.
|
||||
* ~2500 SKUs, strong US channel presence.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
/** Origin that every AddOn Networks category path is appended to. */
const BASE = "https://www.addnetworks.com";

/** Request headers sent with every page fetch. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};

/** Upper bound on the number of listing pages walked per category. */
const MAX_PAGES = 50;

/** Builds one category descriptor: listing path plus the form factor/speed stamped on its products. */
function category(path: string, formFactor: string, speed: string, speedGbps: number) {
  return { path, formFactor, speed, speedGbps };
}

// AddOn uses "compatible" suffix naming (e.g. "ADD-XSSFP10GE-LR-AO").
// Categories follow the standard form-factor taxonomy; the last entry is the
// broader parent category, used as a fallback.
const CATEGORIES = [
  category("/products/networking/optical-networking/sfp/", "SFP", "1G", 1),
  category("/products/networking/optical-networking/sfp-plus/", "SFP+", "10G", 10),
  category("/products/networking/optical-networking/sfp28/", "SFP28", "25G", 25),
  category("/products/networking/optical-networking/qsfp-plus/", "QSFP+", "40G", 40),
  category("/products/networking/optical-networking/qsfp28/", "QSFP28", "100G", 100),
  category("/products/networking/optical-networking/qsfp-dd/", "QSFP-DD", "400G", 400),
  category("/products/networking/optical-networking/", "SFP+", "10G", 10),
];
|
||||
|
||||
/** One product row extracted from a category listing page. */
interface Product {
  // --- identity ---
  partNumber: string;
  name: string;
  url: string;

  // --- classification copied from the category being scraped ---
  formFactor: string;
  speed: string;
  speedGbps: number;

  // --- optional attributes derived from the listing card / product name ---
  /** Listed price; left undefined when no usable price was found. */
  price?: number;
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelength?: string;
  compatibleWith?: string;
}
|
||||
|
||||
/** Returns a promise that resolves once a timer of `ms` milliseconds fires. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Derives a reach label and distance in meters from free text.
 *
 * Rules are tried in order and the first hit wins: explicit distances
 * ("40km", "550 m", case-insensitive) come before the case-sensitive optic
 * codes (LR, ER, ZR, SR, DR, FR and their "4" variants).
 *
 * @returns the matching label/meters pair, or `undefined` when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: { pattern: RegExp; label: string; meters: number }[] = [
    { pattern: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { pattern: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { pattern: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { pattern: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { pattern: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { pattern: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { pattern: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { pattern: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { pattern: /\b400\s*m\b/i, label: "400m", meters: 400 },
    { pattern: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { pattern: /\b150\s*m\b/i, label: "150m", meters: 150 },
    { pattern: /\b100\s*m\b/i, label: "100m", meters: 100 },
    { pattern: /\bLR4\b/, label: "10km", meters: 10000 },
    { pattern: /\bLR\b/, label: "10km", meters: 10000 },
    { pattern: /\bER4?\b/, label: "40km", meters: 40000 },
    { pattern: /\bZR4?\b/, label: "80km", meters: 80000 },
    { pattern: /\bSR4?\b/, label: "300m", meters: 300 },
    { pattern: /\bDR4?\b/, label: "500m", meters: 500 },
    { pattern: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];

  const hit = rules.find((rule) => rule.pattern.test(text));
  return hit === undefined ? undefined : { label: hit.label, meters: hit.meters };
}
|
||||
|
||||
/**
 * Classifies the medium of a product from its name.
 *
 * Checks run in a fixed order — single-mode, then multi-mode, then copper —
 * and the first hit wins. The two-letter optic codes (LX/LR/ER/ZR, SX/SR)
 * must not be adjacent to another letter; lookarounds are used so that a code
 * at the very start or end of the text (e.g. "SFP-10G-LR") is still
 * recognised, which a consuming `[^a-z]` on both sides would miss.
 *
 * @param text product name or description
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches
 */
function detectFiber(text: string): string {
  const singleMode = /single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
|
||||
|
||||
/**
 * Pulls the first "<3-4 digits>nm" wavelength out of the text.
 *
 * @returns the digits only (no unit), or "" when no wavelength is present.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
|
||||
|
||||
/**
 * Guesses which OEM a product is compatible with.
 *
 * 1. If the name mentions a known brand as a whole word (case-insensitive),
 *    that brand is returned; list order decides ties.
 * 2. Otherwise, if the name ends in an "-AO" or "-IN" suffix, the leading
 *    OEM part-number prefix is used:
 *      SFP- / GLC- / QSFP- / SFP28-  → Cisco
 *      EX- / QFX- / 740-             → Juniper
 *      J + four digits (e.g. J9150A) → HPE (ProCurve/Aruba-style numbers;
 *                                      previously mislabelled as Juniper)
 *
 * @param name product name or part number
 * @returns the brand name, or "" when no rule applies
 */
function extractCompatibleVendor(name: string): string {
  const brands = ["Cisco", "Juniper", "Arista", "HPE", "HP", "Aruba", "Dell", "Brocade", "Extreme",
    "Huawei", "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti", "Force10",
    "Foundry", "Enterasys", "Allied Telesis", "Netgear", "Calix"];
  for (const brand of brands) {
    if (new RegExp(`\\b${brand}\\b`, "i").test(name)) return brand;
  }

  // Prefix heuristics only apply to names carrying the "-AO" / "-IN" suffix.
  if (!/-AO$|-IN$/i.test(name)) return "";

  if (/^SFP-|^GLC-|^QSFP-|^SFP28-/i.test(name)) return "Cisco";
  if (/^EX-|^QFX-|^740-/i.test(name)) return "Juniper";
  if (/^J\d{4}/i.test(name)) return "HPE";
  return "";
}
|
||||
|
||||
/**
 * Parse AddOn Networks product listing HTML.
 *
 * Two strategies are tried:
 *  1. Product-grid cards: every `<li class="…product…">` whose first
 *     addnetworks.com link looks like a product/item URL and that contains a
 *     heading or name element of at least 10 characters.
 *  2. Only if strategy 1 produced nothing: any addnetworks.com link whose
 *     nested text looks like an optics product; the price is searched in a
 *     window of text around that link.
 *
 * Form factor and speed are copied from `cat`; reach, fiber type, wavelength
 * and compatible vendor are derived from the product name. Prices outside
 * (0, 100000) are dropped.
 *
 * @param html raw listing page HTML
 * @param cat  category descriptor the page belongs to
 * @returns products in document order, de-duplicated by URL within this page
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const seen = new Set<string>();
  const collapsed = html.replace(/\s+/g, " ");

  // First "$1,234.56"-style amount in the fragment, if it is a plausible price.
  const parsePrice = (fragment: string): number | undefined => {
    const priceMatch = fragment.match(/\$\s*([\d,]+\.?\d*)/);
    if (!priceMatch) return undefined;
    const value = parseFloat(priceMatch[1].replace(/,/g, ""));
    // NaN fails both comparisons and is therefore rejected as well.
    return value > 0 && value < 100000 ? value : undefined;
  };

  // Shared assembly of a Product from the fields both strategies extract.
  const toProduct = (partNumber: string, name: string, url: string, price: number | undefined): Product => {
    const reach = detectReach(name);
    return {
      partNumber, name, url, price,
      formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
      reachLabel: reach?.label, reachMeters: reach?.meters,
      fiberType: detectFiber(name), wavelength: detectWavelength(name),
      compatibleWith: extractCompatibleVendor(name),
    };
  };

  // Strategy 1: Magento / standard product grid
  for (const m of collapsed.matchAll(/<li[^>]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi)) {
    const card = m[1];

    const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?addnetworks\.com\/[^"?#]+)"/i);
    if (!urlMatch) continue;
    const url = urlMatch[1];
    if (seen.has(url) || !/\/product(?:s)?\/|\/item\//i.test(url)) continue;
    seen.add(url);

    const nameMatch = card.match(/<h[2-4][^>]*>([^<]{10,})<\/h[2-4]>/i) ||
      card.match(/product[_-]?(?:name|title)[^>]*>([^<]{10,})</i) ||
      card.match(/class="name[^"]*"[^>]*>([^<]{10,})</i);
    if (!nameMatch) continue;
    // Drop numeric entities, then decode "&amp;" last so text is never double-decoded.
    const name = nameMatch[1].trim().replace(/&#[0-9]+;/g, "").replace(/&amp;/g, "&");
    if (name.length < 5) continue;

    // AddOn part numbers end in "-AO" or "-IN" suffix
    const partNumber = name.match(/([A-Z0-9](?:[A-Z0-9\-\.\/]{4,}(?:-AO|-IN|-ADD)?))/)?.[1] ||
      name.split(/\s+/)[0]?.slice(0, 80) || name.slice(0, 60);

    products.push(toProduct(partNumber, name, url, parsePrice(card)));
  }

  // Strategy 2: Generic product link fallback
  if (products.length === 0) {
    for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?addnetworks\.com\/[^"?#]+)"[^>]*>\s*<[^>]+>\s*([^<]{10,})/gi)) {
      const url = m[1];
      const name = m[2].trim().replace(/&amp;/g, "&");
      if (seen.has(url) || name.length < 10) continue;
      if (!/transceiver|sfp|qsfp|osfp|dac|aoc|fiber|optical/i.test(name)) continue;
      seen.add(url);

      // Anchor the price window on this match, not on the first place the
      // URL text happens to occur in the page.
      const idx = m.index ?? 0;
      const ctx = collapsed.slice(Math.max(0, idx - 300), idx + 600);

      const partNumber = name.match(/([A-Z0-9][A-Z0-9\-\.\/]{4,})/)?.[1] || name.split(/\s+/)[0]?.slice(0, 80) || "";
      products.push(toProduct(partNumber, name, url, parsePrice(ctx)));
    }
  }

  return products;
}
|
||||
|
||||
/**
 * Downloads one page as text using the module's request headers.
 *
 * The request is aborted after 30000 ms.
 *
 * @throws Error carrying the HTTP status and URL when the response is not OK.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
||||
|
||||
/**
 * Scrapes all AddOn Networks category listings and stores products and prices.
 *
 * For each category: fetch page 1, work out how many pages to walk (capped at
 * MAX_PAGES), fetch the remaining pages with a 2 s pause between requests,
 * de-duplicate by URL, then upsert each product and — when a price was found —
 * a USD price observation.
 *
 * Failures are contained: a failed page stops pagination for that category, a
 * failed category is logged and the loop moves on, and a failed DB write only
 * skips that product.
 *
 * NOTE(review): the host used here is "addnetworks.com"; confirm it is the
 * vendor's actual storefront domain.
 */
export async function scrapeAddonNetworks(): Promise<void> {
  console.log("=== AddOn Networks Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "AddOn Networks",
    "compatible",
    "https://www.addnetworks.com",
    "https://www.addnetworks.com/products/networking/optical-networking/",
  );

  // Thrown values are not guaranteed to be Error instances.
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);

    // Decide about the generic fallback BEFORE spending a request on it.
    if (cat.path === "/products/networking/optical-networking/" && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`);
      continue;
    }

    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);

      if (catProducts.length === 0) {
        console.log(" No products on page 1 — skipping");
        continue; // the finally block still applies the inter-category delay
      }

      seenCategories.add(cat.path);
      console.log(` Found ${catProducts.length} products on page 1`);

      // Detect pagination
      const totalPagesMatch =
        html1.match(/page\s+\d+\s+of\s+(\d+)/i) ||
        html1.match(/aria-label="Last[^"]*"\s+href="[^"]*[?&]p=(\d+)/) ||
        html1.match(/pagination[^>]*>[\s\S]*?(\d+)<\/a>\s*<\/[^>]+>\s*<\/[^>]+>/);
      // With no pagination hint, probe page 2 anyway; an empty page ends the walk.
      const totalPages = totalPagesMatch ? Math.min(parseInt(totalPagesMatch[1], 10), MAX_PAGES) : 2;
      console.log(` Total pages (estimate): ${totalPages}`);

      const allProducts = [...catProducts];

      for (let page = 2; page <= totalPages; page++) {
        await sleep(2000);
        try {
          const pageUrl = BASE + cat.path + `?p=${page}`;
          const html = await fetchPage(pageUrl);
          const pageProds = parseProductList(html, cat);
          if (pageProds.length === 0) break;
          allProducts.push(...pageProds);
          console.log(` Page ${page}: ${pageProds.length} products`);
        } catch (err) {
          console.warn(` Page ${page} failed: ${describe(err).slice(0, 60)}`);
          break;
        }
      }

      // Keep the first occurrence of each URL (single pass).
      const seenUrls = new Set<string>();
      const uniqueProducts = allProducts.filter((p) => {
        if (seenUrls.has(p.url)) return false;
        seenUrls.add(p.url);
        return true;
      });
      console.log(` Total unique: ${uniqueProducts.length}`);

      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });

          if (product.price && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          console.warn(` DB error: ${describe(err).slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${describe(err)}`);
    } finally {
      // Always pause after a category that issued requests, including the
      // early-exit and error paths, so the next fetch is never back-to-back.
      await sleep(2000);
    }
  }

  console.log(`\n=== AddOn Networks Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
|
||||
|
||||
// Direct execution (not imported): run the scraper once, then close the DB pool.
if (require.main === module) {
  const closePool = () => pool.end();
  const failHard = (err: unknown) => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };
  scrapeAddonNetworks().then(closePool).catch(failHard);
}
|
||||
285
packages/scraper/src/scrapers/naddod.ts
Normal file
285
packages/scraper/src/scrapers/naddod.ts
Normal file
@ -0,0 +1,285 @@
|
||||
/**
|
||||
* NADDOD Scraper — Chinese compatible transceiver vendor
|
||||
*
|
||||
* naddod.com — WooCommerce store, server-rendered HTML, USD pricing.
|
||||
* Products listed under product category pages.
|
||||
* Pagination via /page/N/. Rate limited: 1 req/2sec.
|
||||
*
|
||||
* NADDOD (Shenzhen NADDOD Information Co.) makes and sells compatible
|
||||
* optics for Cisco, Juniper, Arista, etc. Transparent USD pricing.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
/** Origin that every NADDOD category path is appended to. */
const BASE = "https://www.naddod.com";

/** Request headers sent with every page fetch. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};

/** Upper bound on the number of listing pages walked per category. */
const MAX_PAGES = 30;

/** Builds one category descriptor: listing path plus the form factor/speed stamped on its products. */
function category(path: string, formFactor: string, speed: string, speedGbps: number) {
  return { path, formFactor, speed, speedGbps };
}

// Specific form-factor categories first; the last entry is the generic
// "transceivers" category.
const CATEGORIES = [
  category("/product-category/1g-sfp-transceivers/", "SFP", "1G", 1),
  category("/product-category/10g-sfp-transceivers/", "SFP+", "10G", 10),
  category("/product-category/25g-sfp28-transceivers/", "SFP28", "25G", 25),
  category("/product-category/40g-qsfp-transceivers/", "QSFP+", "40G", 40),
  category("/product-category/100g-qsfp28-transceivers/", "QSFP28", "100G", 100),
  category("/product-category/200g-qsfp56-transceivers/", "QSFP56", "200G", 200),
  category("/product-category/400g-qsfp-dd-transceivers/", "QSFP-DD", "400G", 400),
  category("/product-category/800g-osfp-transceivers/", "OSFP", "800G", 800),
  category("/product-category/transceivers/", "SFP+", "10G", 10),
];
|
||||
|
||||
/** One product row extracted from a category listing page. */
interface Product {
  // --- identity ---
  partNumber: string;
  name: string;
  url: string;

  // --- classification copied from the category being scraped ---
  formFactor: string;
  speed: string;
  speedGbps: number;

  // --- optional attributes derived from the listing card / product name ---
  /** Listed price; left undefined when no usable price was found. */
  price?: number;
  reachLabel?: string;
  reachMeters?: number;
  fiberType?: string;
  wavelength?: string;
  compatibleWith?: string;
}
|
||||
|
||||
/** Returns a promise that resolves once a timer of `ms` milliseconds fires. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Derives a reach label and distance in meters from free text.
 *
 * Rules are tried in order and the first hit wins: explicit distances
 * ("40km", "550 m", case-insensitive) come before the case-sensitive optic
 * codes (LR, ER, ZR, SR, DR, FR and their "4" variants).
 *
 * @returns the matching label/meters pair, or `undefined` when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: { pattern: RegExp; label: string; meters: number }[] = [
    { pattern: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { pattern: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { pattern: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { pattern: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { pattern: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { pattern: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { pattern: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { pattern: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { pattern: /\b400\s*m\b/i, label: "400m", meters: 400 },
    { pattern: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { pattern: /\b150\s*m\b/i, label: "150m", meters: 150 },
    { pattern: /\b100\s*m\b/i, label: "100m", meters: 100 },
    { pattern: /\bLR4\b/, label: "10km", meters: 10000 },
    { pattern: /\bLR\b/, label: "10km", meters: 10000 },
    { pattern: /\bER4?\b/, label: "40km", meters: 40000 },
    { pattern: /\bZR4?\b/, label: "80km", meters: 80000 },
    { pattern: /\bSR4?\b/, label: "300m", meters: 300 },
    { pattern: /\bDR4?\b/, label: "500m", meters: 500 },
    { pattern: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];

  const hit = rules.find((rule) => rule.pattern.test(text));
  return hit === undefined ? undefined : { label: hit.label, meters: hit.meters };
}
|
||||
|
||||
/**
 * Classifies the medium of a product from its name.
 *
 * Checks run in a fixed order — single-mode, then multi-mode, then copper —
 * and the first hit wins. The two-letter optic codes (LX/LR/ER/ZR, SX/SR)
 * must not be adjacent to another letter; lookarounds are used so that a code
 * at the very start or end of the text (e.g. "SFP-10G-LR") is still
 * recognised, which a consuming `[^a-z]` on both sides would miss.
 *
 * @param text product name or description
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches
 */
function detectFiber(text: string): string {
  const singleMode = /single.?mode|smf|(?<![a-z])(?:lx|lr|er|zr)(?![a-z])|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?<![a-z])(?:sx|sr)(?![a-z])/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
|
||||
|
||||
/**
 * Pulls the first "<3-4 digits>nm" wavelength out of the text.
 *
 * @returns the digits only (no unit), or "" when no wavelength is present.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) {
    return "";
  }
  return found[1];
}
|
||||
|
||||
/**
 * Guesses which OEM a product is compatible with.
 *
 * A known brand mentioned as a whole word (case-insensitive) wins, with list
 * order deciding ties. Otherwise the capitalised word (optionally two words)
 * following a lowercase "for" / "compatible" / "compatible with" is returned.
 *
 * @returns the vendor name, or "" when neither rule matches.
 */
function extractCompatibleVendor(name: string): string {
  const knownBrands = ["Cisco", "Juniper", "Arista", "HPE", "Dell", "Brocade", "Extreme", "Huawei",
    "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti"];

  const listed = knownBrands.find((candidate) => new RegExp(`\\b${candidate}\\b`, "i").test(name));
  if (listed !== undefined) {
    return listed;
  }

  const phrase = /(?:for\s+|compatible\s+(?:with\s+)?)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)/.exec(name);
  return phrase === null ? "" : phrase[1];
}
|
||||
|
||||
/**
 * Parse a NADDOD category listing page.
 *
 * Two strategies are tried:
 *  1. WooCommerce product loop: every `<li class="…product…">` containing a
 *     naddod.com/product/ link and a loop title (or an h2/h3 of at least 10
 *     characters).
 *  2. Only if strategy 1 produced nothing: any /product/ or /shop/ link whose
 *     text looks like an optics product; the price is searched in a window of
 *     text around that link.
 *
 * Form factor and speed are copied from `cat`; reach, fiber type, wavelength
 * and compatible vendor are derived from the product name. Prices outside
 * (0, 100000) are dropped.
 *
 * @param html raw listing page HTML
 * @param cat  category descriptor the page belongs to
 * @returns products in document order, de-duplicated by URL within this page
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const seen = new Set<string>();
  const collapsed = html.replace(/\s+/g, " ");

  // First "$1,234.56"-style amount in the fragment, if it is a plausible price.
  const parsePrice = (fragment: string): number | undefined => {
    const priceMatch = fragment.match(/\$\s*([\d,]+\.?\d*)/);
    if (!priceMatch) return undefined;
    const value = parseFloat(priceMatch[1].replace(/,/g, ""));
    // NaN fails both comparisons and is therefore rejected as well.
    return value > 0 && value < 100000 ? value : undefined;
  };

  // Shared assembly of a Product from the fields both strategies extract.
  const toProduct = (partNumber: string, name: string, url: string, price: number | undefined): Product => {
    const reach = detectReach(name);
    return {
      partNumber, name, url, price,
      formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
      reachLabel: reach?.label, reachMeters: reach?.meters,
      fiberType: detectFiber(name), wavelength: detectWavelength(name),
      compatibleWith: extractCompatibleVendor(name),
    };
  };

  // Strategy 1: WooCommerce standard product loop
  for (const cardMatch of collapsed.matchAll(/<li[^>]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi)) {
    const card = cardMatch[1];

    const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?naddod\.com\/product\/[^"]+)"/i);
    if (!urlMatch) continue;
    const url = urlMatch[1];
    if (seen.has(url)) continue;
    seen.add(url);

    const nameMatch = card.match(/woocommerce-loop-product__title[^>]*>([^<]+)</i) ||
      card.match(/<h2[^>]*>([^<]{10,})<\/h2>/i) ||
      card.match(/<h3[^>]*>([^<]{10,})<\/h3>/i);
    if (!nameMatch) continue;
    // Decode the en-dash entity, then "&amp;" last so text is never double-decoded.
    const name = nameMatch[1].trim().replace(/&#8211;|&ndash;/g, "–").replace(/&amp;/g, "&");
    if (name.length < 5) continue;

    // Part number = text before the first " compatible" / " for" / " sfp" / " qsfp".
    const partNumber = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);

    products.push(toProduct(partNumber, name, url, parsePrice(card)));
  }

  // Strategy 2: Generic product link fallback
  if (products.length === 0) {
    for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?naddod\.com\/(?:product|shop)\/[^"?#]+)"[^>]*>\s*([^<]{10,})/gi)) {
      const url = m[1];
      const name = m[2].trim().replace(/&amp;/g, "&");
      if (seen.has(url) || name.length < 10) continue;
      if (!/transceiver|sfp|qsfp|osfp|dac|aoc|xfp/i.test(name)) continue;
      seen.add(url);

      const idx = m.index ?? 0;
      const ctx = collapsed.slice(Math.max(0, idx - 200), idx + 500);

      products.push(toProduct(name.split(/\s+/)[0]?.slice(0, 80) || "", name, url, parsePrice(ctx)));
    }
  }

  return products;
}
|
||||
|
||||
/**
 * Downloads one page as text using the module's request headers.
 *
 * The request is aborted after 30000 ms.
 *
 * @throws Error carrying the HTTP status and URL when the response is not OK.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: HEADERS,
    signal: AbortSignal.timeout(30000),
  });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
||||
|
||||
/**
 * Scrapes all NADDOD category listings and stores products and prices.
 *
 * For each category: fetch page 1, read the highest page number from the
 * pagination links (capped at MAX_PAGES), fetch the remaining `page/N/` URLs
 * with a 2 s pause between requests, de-duplicate by URL, then upsert each
 * product and — when a price was found — a USD price observation.
 *
 * Failures are contained: a failed page stops pagination for that category, a
 * failed category is logged and the loop moves on, and a failed DB write only
 * skips that product.
 */
export async function scrapeNaddod(): Promise<void> {
  console.log("=== NADDOD Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "NADDOD",
    "compatible",
    "https://www.naddod.com",
    "https://www.naddod.com/product-category/transceivers/",
  );

  // Thrown values are not guaranteed to be Error instances.
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);

    // Decide about the generic fallback BEFORE spending a request on it.
    if (cat.path.includes("/transceivers/") && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`);
      continue;
    }

    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);

      if (catProducts.length === 0) {
        console.log(" No products on page 1 — skipping");
        continue; // the finally block still applies the inter-category delay
      }

      seenCategories.add(cat.path);
      console.log(` Found ${catProducts.length} products on page 1`);

      // Take the largest numeric "page-numbers" entry. Looking for "the last
      // link not followed by another page-numbers" fails when a non-numeric
      // "next" link trails the numbers on the same line.
      const pageNumbers = [...html1.matchAll(/page-numbers[^>]*>\s*(\d+)\s*<\/(?:a|span)>/g)]
        .map((m) => parseInt(m[1], 10));
      const lastPage = pageNumbers.length > 0 ? Math.max(...pageNumbers) : 1;
      const totalPages = Math.min(lastPage, MAX_PAGES);
      console.log(` Total pages: ${totalPages}`);

      const allProducts = [...catProducts];

      for (let page = 2; page <= totalPages; page++) {
        await sleep(2000);
        try {
          const html = await fetchPage(BASE + cat.path + `page/${page}/`);
          const pageProds = parseProductList(html, cat);
          if (pageProds.length === 0) break;
          allProducts.push(...pageProds);
          console.log(` Page ${page}: ${pageProds.length} products`);
        } catch (err) {
          console.warn(` Page ${page} failed: ${describe(err).slice(0, 60)}`);
          break;
        }
      }

      // Keep the first occurrence of each URL (single pass).
      const seenUrls = new Set<string>();
      const uniqueProducts = allProducts.filter((p) => {
        if (seenUrls.has(p.url)) return false;
        seenUrls.add(p.url);
        return true;
      });
      console.log(` Total unique: ${uniqueProducts.length}`);

      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });

          if (product.price && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          console.warn(` DB error: ${describe(err).slice(0, 80)}`);
        }
      }
    } catch (err) {
      console.error(` Category failed: ${describe(err)}`);
    } finally {
      // Always pause after a category that issued requests, including the
      // early-exit and error paths, so the next fetch is never back-to-back.
      await sleep(2000);
    }
  }

  console.log(`\n=== NADDOD Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
|
||||
|
||||
// Direct execution (not imported): run the scraper once, then close the DB pool.
if (require.main === module) {
  const closePool = () => pool.end();
  const failHard = (err: unknown) => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };
  scrapeNaddod().then(closePool).catch(failHard);
}
|
||||
281
packages/scraper/src/scrapers/qsfptek.ts
Normal file
281
packages/scraper/src/scrapers/qsfptek.ts
Normal file
@ -0,0 +1,281 @@
|
||||
/**
|
||||
* QSFPTEK Scraper — Chinese compatible transceiver vendor
|
||||
*
|
||||
* qsfptek.com — Server-rendered HTML shop, USD pricing.
|
||||
* Focuses on QSFP+/QSFP28/QSFP-DD/SFP+ form factors.
|
||||
* Rate limited: 1 req/2sec.
|
||||
*
|
||||
* QSFPTEK (Shenzhen Optotech Technology) — competitive pricing,
|
||||
* transparent USD prices, no account required.
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
|
||||
/** Origin that every QSFPTEK category path is appended to. */
const BASE = "https://www.qsfptek.com";

/** Request headers sent with every page fetch. */
const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};

/** Upper bound on the number of listing pages walked per category. */
const MAX_PAGES = 30;

/** Builds one category descriptor: listing path plus the form factor/speed stamped on its products. */
function category(path: string, formFactor: string, speed: string, speedGbps: number) {
  return { path, formFactor, speed, speedGbps };
}

// Specific form-factor categories first; the last entry is the generic
// "optical-transceiver" category.
const CATEGORIES = [
  category("/c/sfp-transceiver.html", "SFP", "1G", 1),
  category("/c/sfp-plus-transceiver.html", "SFP+", "10G", 10),
  category("/c/sfp28-transceiver.html", "SFP28", "25G", 25),
  category("/c/qsfp-plus-transceiver.html", "QSFP+", "40G", 40),
  category("/c/qsfp28-transceiver.html", "QSFP28", "100G", 100),
  category("/c/qsfp56-transceiver.html", "QSFP56", "200G", 200),
  category("/c/qsfp-dd-transceiver.html", "QSFP-DD", "400G", 400),
  category("/c/osfp-transceiver.html", "OSFP", "800G", 800),
  category("/c/optical-transceiver.html", "SFP+", "10G", 10),
];
|
||||
|
||||
/** One product parsed from a category listing page. */
interface Product {
  /** Identifier derived from the product name by parseProductList. */
  partNumber: string;
  /** Product name as shown in the listing. */
  name: string;
  /** Absolute product page URL; also used as the de-duplication key. */
  url: string;
  /** Price parsed from the first `$` amount found; absent when none is found or it is outside (0, 100000). */
  price?: number;

  // Copied from the category the product was listed under.
  formFactor: string;
  speed: string;
  speedGbps: number;

  // Detected from the product name; absent or empty when nothing matched.
  reachLabel?: string;
  reachMeters?: number;
  /** "SMF", "MMF", "Copper", or "" when undetected. */
  fiberType?: string;
  /** Digits of the first `<n>nm` token in the name, or "". */
  wavelength?: string;
  /** First known switch brand mentioned in the name, or "". */
  compatibleWith?: string;
}
|
||||
|
||||
/**
 * Returns a promise that resolves once a timer of `ms` milliseconds fires.
 * Used to pace requests between page fetches.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Infers an optic's reach from free text such as a product name.
 *
 * Rules are tried in order and the first match wins: explicit distances
 * ("10km", "300 m", matched case-insensitively) come first, followed by
 * optic-type suffixes (LR4, LR, ER, ZR, SR, DR, FR, matched in upper case
 * only), each mapped to a fixed distance.
 *
 * @param text Text to inspect.
 * @returns The reach label and its length in meters, or `undefined` when no rule matches.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const rules: { re: RegExp; label: string; meters: number }[] = [
    // Explicit distances.
    { re: /\b120\s*km\b/i, label: "120km", meters: 120000 },
    { re: /\b80\s*km\b/i, label: "80km", meters: 80000 },
    { re: /\b40\s*km\b/i, label: "40km", meters: 40000 },
    { re: /\b20\s*km\b/i, label: "20km", meters: 20000 },
    { re: /\b10\s*km\b/i, label: "10km", meters: 10000 },
    { re: /\b2\s*km\b/i, label: "2km", meters: 2000 },
    { re: /\b550\s*m\b/i, label: "550m", meters: 550 },
    { re: /\b500\s*m\b/i, label: "500m", meters: 500 },
    { re: /\b300\s*m\b/i, label: "300m", meters: 300 },
    { re: /\b100\s*m\b/i, label: "100m", meters: 100 },
    // Optic-type suffixes, used only when no explicit distance is present.
    { re: /\bLR4\b/, label: "10km", meters: 10000 },
    { re: /\bLR\b/, label: "10km", meters: 10000 },
    { re: /\bER4?\b/, label: "40km", meters: 40000 },
    { re: /\bZR4?\b/, label: "80km", meters: 80000 },
    { re: /\bSR4?\b/, label: "300m", meters: 300 },
    { re: /\bDR4?\b/, label: "500m", meters: 500 },
    { re: /\bFR4?\b/, label: "2km", meters: 2000 },
  ];

  const hit = rules.find((rule) => rule.re.test(text));
  if (hit === undefined) return undefined;
  return { label: hit.label, meters: hit.meters };
}
|
||||
|
||||
/**
 * Classifies the transmission medium mentioned in free text such as a
 * product name.
 *
 * Checks run in order (single-mode, then multi-mode, then copper) and the
 * first hit wins. The two-letter optic codes (LX/LR/ER/ZR for single-mode,
 * SX/SR for multi-mode) must not be adjacent to another letter, but they may
 * sit at the very start or end of the text, so a name that ends in "-LR" or
 * "-SR" is still classified.
 *
 * @param text Text to inspect.
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches.
 */
function detectFiber(text: string): string {
  const singleMode =
    /single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i;
  const multiMode = /multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i;
  const copper = /copper|dac|twinax|rj.?45|base-t/i;

  if (singleMode.test(text)) return "SMF";
  if (multiMode.test(text)) return "MMF";
  if (copper.test(text)) return "Copper";
  return "";
}
|
||||
|
||||
/**
 * Extracts the first wavelength mentioned in the text.
 *
 * @param text Text to inspect.
 * @returns The 3-4 digits preceding the first "nm" (any case, optional
 *          whitespace in between), or "" when there is none.
 */
function detectWavelength(text: string): string {
  const found = /(\d{3,4})\s*nm/i.exec(text);
  if (found === null) return "";
  return found[1];
}
|
||||
|
||||
/**
 * Finds the switch brand a product name claims compatibility with.
 *
 * Brands are tested in list order as whole words, case-insensitively, so when
 * several brands appear in the name the one listed first here is returned.
 *
 * @param name Product name.
 * @returns The brand as spelled in the list, or "" when none is mentioned.
 */
function extractCompatibleVendor(name: string): string {
  const knownBrands = [
    "Cisco", "Juniper", "Arista", "HPE", "Aruba", "Dell", "Brocade", "Extreme",
    "Huawei", "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti", "Allied Telesis",
  ];
  const hit = knownBrands.find((brand) => new RegExp(`\\b${brand}\\b`, "i").test(name));
  return hit ?? "";
}
|
||||
|
||||
/** Parses the first `$<amount>` in an HTML fragment; thousands separators are dropped. */
function parseDollarPrice(fragment: string): number | undefined {
  const priceMatch = fragment.match(/\$\s*([\d,]+\.?\d*)/);
  return priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
}

/** Builds a Product from scraped fields, deriving reach/fiber/wavelength/brand from the name. */
function toProduct(
  cat: typeof CATEGORIES[number],
  partNumber: string,
  name: string,
  url: string,
  rawPrice: number | undefined,
): Product {
  const reach = detectReach(name);
  return {
    partNumber, name, url,
    // Missing, NaN, non-positive or implausibly large amounts are treated as "no price".
    price: rawPrice && rawPrice > 0 && rawPrice < 100000 ? rawPrice : undefined,
    formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps,
    reachLabel: reach?.label, reachMeters: reach?.meters,
    fiberType: detectFiber(name), wavelength: detectWavelength(name),
    compatibleWith: extractCompatibleVendor(name),
  };
}

/**
 * Extracts products from one category listing page.
 *
 * Two strategies are used:
 *  1. Product cards: `<div>` elements whose class contains `product-thumb` or
 *     `product-layout`. A card needs an absolute qsfptek.com link and a name
 *     (h3/h4 anchor text, or an anchor `title`) to count.
 *  2. Only when strategy 1 yields nothing: any anchor to a `/p…` or
 *     `/product…` URL whose text looks like an optics product; the price is
 *     searched in a window around the URL's first occurrence.
 *
 * Products are de-duplicated by URL within the page. `&amp;` in names is
 * decoded to `&`.
 *
 * @param html Raw HTML of the listing page.
 * @param cat  Category the page belongs to; supplies form factor and speed.
 * @returns Products found on the page (possibly empty).
 */
function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] {
  const products: Product[] = [];
  const seen = new Set<string>();
  // Collapse whitespace so the regexes below do not have to span line breaks.
  const collapsed = html.replace(/\s+/g, " ");

  // Strategy 1: OpenCart / custom card layout
  for (const cardMatch of collapsed.matchAll(/<div[^>]+class="[^"]*product-(?:thumb|layout)[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gi)) {
    const card = cardMatch[1];

    const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?qsfptek\.com\/[^"]+)"/i);
    if (!urlMatch) continue;
    const url = urlMatch[1];
    if (seen.has(url)) continue;
    seen.add(url);

    const nameMatch = card.match(/<h[34][^>]*>\s*<a[^>]*>([^<]{10,})<\/a>/i) ||
      card.match(/<a[^>]*title="([^"]{10,})"/i);
    if (!nameMatch) continue;
    // Decode "&amp;" and drop numeric character references.
    const name = nameMatch[1].trim().replace(/&amp;/g, "&").replace(/&#[0-9]+;/g, "");
    if (name.length < 5) continue;

    // The part number is the text before the first " compatible" / " for" / " sfp" / " qsfp".
    const partNumber = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);

    products.push(toProduct(cat, partNumber, name, url, parseDollarPrice(card)));
  }

  // Strategy 2: generic product link scan, used only when no cards were found
  if (products.length === 0) {
    for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?qsfptek\.com\/(?:p|product)[^"?#]+)"[^>]*>([^<]{10,})</gi)) {
      const url = m[1];
      const name = m[2].trim().replace(/&amp;/g, "&");
      if (seen.has(url) || name.length < 10) continue;
      if (!/transceiver|sfp|qsfp|osfp|dac|aoc/i.test(name)) continue;
      seen.add(url);

      // Look for a price near the link: 300 chars before to 600 chars after the URL.
      const idx = collapsed.indexOf(url);
      const ctx = collapsed.slice(Math.max(0, idx - 300), idx + 600);

      const partNumber = name.split(/\s+/)[0]?.slice(0, 80) || "";
      products.push(toProduct(cat, partNumber, name, url, parseDollarPrice(ctx)));
    }
  }

  return products;
}
|
||||
|
||||
/**
 * Downloads a page as text using the scraper's request headers.
 *
 * @param url Absolute URL to fetch.
 * @returns The response body.
 * @throws Error when the response status is not OK; the request is also
 *         aborted after 30 seconds.
 */
async function fetchPage(url: string): Promise<string> {
  const response = await fetch(url, {
    signal: AbortSignal.timeout(30000),
    headers: HEADERS,
  });
  if (response.ok) return response.text();
  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
||||
|
||||
/**
 * Fetches the remaining pages of a category listing and merges them with the
 * products already parsed from page 1.
 *
 * The page count is read from the first page when available (capped at
 * MAX_PAGES); otherwise 3 pages are assumed. Paging stops early on the first
 * empty or failed page. A 2-second pause precedes every request.
 *
 * @returns All products found, de-duplicated by URL (first occurrence wins).
 */
async function collectCategoryProducts(
  cat: typeof CATEGORIES[number],
  firstPageHtml: string,
  firstPageProducts: Product[],
): Promise<Product[]> {
  const totalPagesMatch =
    firstPageHtml.match(/total-page[^>]*>\s*(\d+)/) ||
    firstPageHtml.match(/page\s+\d+\s+of\s+(\d+)/i);
  const totalPages = totalPagesMatch ? Math.min(parseInt(totalPagesMatch[1], 10), MAX_PAGES) : 3;
  console.log(` Total pages (estimate): ${totalPages}`);

  // A Map keeps insertion order, so the first occurrence of each URL is retained.
  const byUrl = new Map<string, Product>();
  const remember = (items: Product[]): void => {
    for (const item of items) {
      if (!byUrl.has(item.url)) byUrl.set(item.url, item);
    }
  };
  remember(firstPageProducts);

  for (let page = 2; page <= totalPages; page++) {
    await sleep(2000);
    try {
      const pageUrl = BASE + cat.path.replace(".html", "") + `?page=${page}`;
      const pageProds = parseProductList(await fetchPage(pageUrl), cat);
      if (pageProds.length === 0) break;
      remember(pageProds);
      console.log(` Page ${page}: ${pageProds.length} products`);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      console.warn(` Page ${page} failed: ${message.slice(0, 60)}`);
      break;
    }
  }

  return [...byUrl.values()];
}

/**
 * Scrapes every configured QSFPTEK category and stores the results.
 *
 * For each category the listing pages are fetched and parsed, each product is
 * registered via findOrCreateScrapedTransceiver, and products with a positive
 * price get a USD price observation. Failures are logged and skipped at the
 * product and category level so one bad page does not abort the run.
 *
 * The generic "/optical-transceiver" listing is skipped, before it is
 * requested, once more than three other categories have yielded products.
 * A 2-second pause follows every category that issued a request, including
 * ones that turned out empty or failed.
 */
export async function scrapeQsfptek(): Promise<void> {
  console.log("=== QSFPTEK Scraper Starting ===\n");

  const vendorId = await ensureVendor(
    "QSFPTEK",
    "compatible",
    "https://www.qsfptek.com",
    "https://www.qsfptek.com/c/optical-transceiver.html",
  );

  let totalProducts = 0;
  let priceUpdates = 0;
  const seenCategories = new Set<string>();

  for (const cat of CATEGORIES) {
    console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`);

    // Decide on the generic fallback before spending a request on it.
    if (cat.path.includes("/optical-transceiver") && seenCategories.size > 3) {
      console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`);
      continue;
    }

    try {
      const html1 = await fetchPage(BASE + cat.path);
      const catProducts = parseProductList(html1, cat);

      if (catProducts.length === 0) {
        console.log(" No products on page 1 — skipping");
        continue; // the finally block below still applies the rate-limit pause
      }

      seenCategories.add(cat.path);
      console.log(` Found ${catProducts.length} products on page 1`);

      const uniqueProducts = await collectCategoryProducts(cat, html1, catProducts);
      console.log(` Total unique: ${uniqueProducts.length}`);

      for (const product of uniqueProducts) {
        try {
          const txId = await findOrCreateScrapedTransceiver({
            partNumber: product.partNumber,
            vendorId,
            formFactor: product.formFactor,
            speedGbps: product.speedGbps,
            speed: product.speed,
            reachMeters: product.reachMeters,
            reachLabel: product.reachLabel,
            fiberType: product.fiberType,
            wavelengths: product.wavelength,
            category: "DataCenter",
          });

          if (product.price && product.price > 0) {
            const hash = contentHash({ price: product.price, part: product.partNumber });
            const updated = await upsertPriceObservation({
              transceiverId: txId,
              sourceVendorId: vendorId,
              price: product.price,
              currency: "USD",
              stockLevel: "in_stock",
              url: product.url,
              contentHash: hash,
            });
            if (updated) priceUpdates++;
          }
          totalProducts++;
        } catch (err) {
          const message = err instanceof Error ? err.message : String(err);
          console.warn(` DB error: ${message.slice(0, 80)}`);
        }
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      console.error(` Category failed: ${message}`);
    } finally {
      // Runs on success, failure and `continue`, so the next category's first
      // request is always spaced from this category's last one.
      await sleep(2000);
    }
  }

  console.log(`\n=== QSFPTEK Complete: ${totalProducts} products, ${priceUpdates} price updates ===`);
}
|
||||
|
||||
// Standalone entry point: when this file is executed directly (not imported),
// run the QSFPTEK scraper once and then close the database pool. On failure the
// error is logged, the pool is asked to close, and the process exits with 1.
if (require.main === module) {
  void (async () => {
    try {
      await scrapeQsfptek();
      await pool.end();
    } catch (err) {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}
|
||||
Loading…
x
Reference in New Issue
Block a user