From fa2d88096fc035820687934675b156347aa55bd7 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Mon, 30 Mar 2026 21:20:23 +0200 Subject: [PATCH] feat: add NADDOD, QSFPTEK, and AddOn Networks scrapers Three new fetch-based price scrapers for compatible optics vendors: - NADDOD: WooCommerce, USD, ~800+ SKUs - QSFPTEK: Custom PHP shop, USD, ~1000+ SKUs - AddOn Networks: Magento/custom, USD, ~2500 SKUs All registered in scheduler (8-12h intervals) and index.ts --flags. Build: 0 TypeScript errors. --- packages/scraper/src/index.ts | 17 +- packages/scraper/src/scheduler.ts | 39 +++ .../scraper/src/scrapers/addon-networks.ts | 303 ++++++++++++++++++ packages/scraper/src/scrapers/naddod.ts | 285 ++++++++++++++++ packages/scraper/src/scrapers/qsfptek.ts | 281 ++++++++++++++++ 5 files changed, 924 insertions(+), 1 deletion(-) create mode 100644 packages/scraper/src/scrapers/addon-networks.ts create mode 100644 packages/scraper/src/scrapers/naddod.ts create mode 100644 packages/scraper/src/scrapers/qsfptek.ts diff --git a/packages/scraper/src/index.ts b/packages/scraper/src/index.ts index 373e6b1..ca13099 100644 --- a/packages/scraper/src/index.ts +++ b/packages/scraper/src/index.ts @@ -27,6 +27,9 @@ * tsx src/index.ts --switch-crawl-pw — Crawl switch assets (Playwright, JS-heavy vendors) * tsx src/index.ts --fetch-only — Run only fetch-based scrapers (no Playwright) * tsx src/index.ts --atgbics — Run ATGBICS scraper once + * tsx src/index.ts --naddod — Run NADDOD scraper once + * tsx src/index.ts --qsfptek — Run QSFPTEK scraper once + * tsx src/index.ts --addon — Run AddOn Networks scraper once */ import { createScheduler, registerSchedules, registerWorkers } from "./scheduler"; import { scrapeFs } from "./scrapers/fs-com"; @@ -54,6 +57,9 @@ import { crawlSwitchAssets } from "./scrapers/switch-assets-crawler"; import { crawlSwitchAssetsPlaywright } from "./scrapers/switch-assets-playwright"; import { scrapeAtgbics } from "./scrapers/atgbics"; import { scrapeProLabs } 
from "./scrapers/prolabs"; +import { scrapeNaddod } from "./scrapers/naddod"; +import { scrapeQsfptek } from "./scrapers/qsfptek"; +import { scrapeAddonNetworks } from "./scrapers/addon-networks"; import { pool } from "./utils/db"; const args = process.argv.slice(2); @@ -86,6 +92,15 @@ async function runOnce(): Promise { if (args.includes("--prolabs") || isAll || isFetchOnly) { await scrapeProLabs(); } + if (args.includes("--naddod") || isAll || isFetchOnly) { + await scrapeNaddod(); + } + if (args.includes("--qsfptek") || isAll || isFetchOnly) { + await scrapeQsfptek(); + } + if (args.includes("--addon") || isAll || isFetchOnly) { + await scrapeAddonNetworks(); + } if (args.includes("--juniper") || isAll || isFetchOnly) { await scrapeJuniperHct(); } @@ -172,7 +187,7 @@ async function runScheduler(): Promise { process.on("SIGTERM", shutdown); } -const ALL_FLAGS = ["--all", "--fs", "--cisco", "--optcore", "--news", "--flexoptix", "--vendors", "--10gtek", "--champion", "--fluxlight", "--sfpcables", "--gbics", "--prolabs", "--juniper", "--switches", "--whitebox", "--switches-ext", "--flexoptix-vendors", "--sonic-hcl", "--edgecore", "--ufispace", "--switch-assets", "--switch-crawl", "--switch-crawl-pw", "--fetch-only", "--atgbics"]; +const ALL_FLAGS = ["--all", "--fs", "--cisco", "--optcore", "--news", "--flexoptix", "--vendors", "--10gtek", "--champion", "--fluxlight", "--sfpcables", "--gbics", "--prolabs", "--naddod", "--qsfptek", "--addon", "--juniper", "--switches", "--whitebox", "--switches-ext", "--flexoptix-vendors", "--sonic-hcl", "--edgecore", "--ufispace", "--switch-assets", "--switch-crawl", "--switch-crawl-pw", "--fetch-only", "--atgbics"]; if (args.some((a) => ALL_FLAGS.includes(a))) { runOnce().catch((err) => { diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 5d039a5..cf08518 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -61,6 +61,9 @@ export async function 
registerSchedules(boss: PgBoss): Promise { "scrape:pricing:10gtek", "scrape:pricing:atgbics", "scrape:pricing:prolabs", + "scrape:pricing:naddod", + "scrape:pricing:qsfptek", + "scrape:pricing:addon", "scrape:compat:cisco", "scrape:pricing:flexoptix", "scrape:vendors:flexoptix", @@ -120,6 +123,24 @@ export async function registerSchedules(boss: PgBoss): Promise { expireInSeconds: 3600, }); + // NADDOD pricing (every 8 hours — WooCommerce, USD prices) + await boss.schedule("scrape:pricing:naddod", "0 5/8 * * *", {}, { + retryLimit: 2, + expireInSeconds: 3600, + }); + + // QSFPTEK pricing (every 10 hours — custom PHP shop, USD prices) + await boss.schedule("scrape:pricing:qsfptek", "0 3/10 * * *", {}, { + retryLimit: 2, + expireInSeconds: 3600, + }); + + // AddOn Networks pricing (every 12 hours — enterprise site, USD prices) + await boss.schedule("scrape:pricing:addon", "0 6/12 * * *", {}, { + retryLimit: 2, + expireInSeconds: 3600, + }); + // Flexoptix catalog (every 6 hours — fetch-based, fast) await boss.schedule("scrape:pricing:flexoptix", "0 1/6 * * *", {}, { retryLimit: 2, @@ -152,6 +173,9 @@ export async function registerWorkers(boss: PgBoss): Promise { const { scrapeNews } = await import("./scrapers/news"); const { scrapeAtgbics } = await import("./scrapers/atgbics"); const { scrapeProLabs } = await import("./scrapers/prolabs"); + const { scrapeNaddod } = await import("./scrapers/naddod"); + const { scrapeQsfptek } = await import("./scrapers/qsfptek"); + const { scrapeAddonNetworks } = await import("./scrapers/addon-networks"); await boss.work("scrape:pricing:fs", async (_job) => { console.log(`[${new Date().toISOString()}] Running: FS.com pricing`); @@ -198,6 +222,21 @@ export async function registerWorkers(boss: PgBoss): Promise { await withIsolatedStorage("prolabs", scrapeProLabs); }); + await boss.work("scrape:pricing:naddod", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: NADDOD pricing`); + await scrapeNaddod(); + }); + + await 
boss.work("scrape:pricing:qsfptek", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: QSFPTEK pricing`); + await scrapeQsfptek(); + }); + + await boss.work("scrape:pricing:addon", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: AddOn Networks pricing`); + await scrapeAddonNetworks(); + }); + await boss.work("scrape:faq", async (_job) => { console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`); }); diff --git a/packages/scraper/src/scrapers/addon-networks.ts b/packages/scraper/src/scrapers/addon-networks.ts new file mode 100644 index 0000000..c4ece01 --- /dev/null +++ b/packages/scraper/src/scrapers/addon-networks.ts @@ -0,0 +1,303 @@ +/** + * AddOn Networks Scraper — US-based compatible optics vendor + * + * addnetworks.com — Enterprise-grade compatible transceivers. + * Products browseable under /products/ category pages. + * Pricing is public in USD. Rate limited: 1 req/2sec. + * + * AddOn Networks (AddOn Computer Products) specializes in OEM-compatible + * optics for Cisco, Juniper, Arista, HPE, and Dell environments. + * ~2500 SKUs, strong US channel presence. + */ +import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; +import { contentHash } from "../utils/hash"; + +const BASE = "https://www.addnetworks.com"; +const HEADERS = { + "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", +}; + +const MAX_PAGES = 50; + +// AddOn uses "compatible" suffix naming (e.g. 
"ADD-XSSFP10GE-LR-AO") +// Categories follow standard form-factor taxonomy +const CATEGORIES = [ + { path: "/products/networking/optical-networking/sfp/", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { path: "/products/networking/optical-networking/sfp-plus/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { path: "/products/networking/optical-networking/sfp28/", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, + { path: "/products/networking/optical-networking/qsfp-plus/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, + { path: "/products/networking/optical-networking/qsfp28/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, + { path: "/products/networking/optical-networking/qsfp-dd/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, + // Broader category fallback + { path: "/products/networking/optical-networking/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, +]; + +interface Product { + partNumber: string; + name: string; + url: string; + price?: number; + formFactor: string; + speed: string; + speedGbps: number; + reachLabel?: string; + reachMeters?: number; + fiberType?: string; + wavelength?: string; + compatibleWith?: string; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function detectReach(text: string): { label: string; meters: number } | undefined { + const patterns: [RegExp, string, number][] = [ + [/\b120\s*km\b/i, "120km", 120000], + [/\b80\s*km\b/i, "80km", 80000], + [/\b40\s*km\b/i, "40km", 40000], + [/\b20\s*km\b/i, "20km", 20000], + [/\b10\s*km\b/i, "10km", 10000], + [/\b2\s*km\b/i, "2km", 2000], + [/\b550\s*m\b/i, "550m", 550], + [/\b500\s*m\b/i, "500m", 500], + [/\b400\s*m\b/i, "400m", 400], + [/\b300\s*m\b/i, "300m", 300], + [/\b150\s*m\b/i, "150m", 150], + [/\b100\s*m\b/i, "100m", 100], + [/\bLR4\b/, "10km", 10000], + [/\bLR\b/, "10km", 10000], + [/\bER4?\b/, "40km", 40000], + [/\bZR4?\b/, "80km", 80000], + [/\bSR4?\b/, "300m", 300], + 
[/\bDR4?\b/, "500m", 500], + [/\bFR4?\b/, "2km", 2000], + ]; + for (const [regex, label, meters] of patterns) { + if (regex.test(text)) return { label, meters }; + } + return undefined; +} + +function detectFiber(text: string): string { + if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; + if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; + if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper"; + return ""; +} + +function detectWavelength(text: string): string { + const match = text.match(/(\d{3,4})\s*nm/i); + return match ? match[1] : ""; +} + +function extractCompatibleVendor(name: string): string { + const brands = ["Cisco", "Juniper", "Arista", "HPE", "HP", "Aruba", "Dell", "Brocade", "Extreme", + "Huawei", "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti", "Force10", + "Foundry", "Enterasys", "Allied Telesis", "Netgear", "Calix"]; + for (const brand of brands) { + if (new RegExp(`\\b${brand}\\b`, "i").test(name)) return brand; + } + // AddOn naming convention: "FOR-XX" suffix + const forMatch = name.match(/-AO$|-IN$/i); + if (forMatch) { + // Check preceding OEM part number pattern, e.g. SFP-10G-SR-AO → Cisco + if (/^SFP-|^GLC-|^QSFP-|^SFP28-/i.test(name)) return "Cisco"; + if (/^EX-|^QFX-/i.test(name)) return "Juniper"; + if (/^740-|^J\d{4}/i.test(name)) return "Juniper"; + } + return ""; +} + +/** + * Parse AddOn Networks product listing HTML. + * Supports multiple CMS patterns (Magento, BigCommerce, custom). 
+ */ +function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { + const products: Product[] = []; + const seen = new Set(); + const collapsed = html.replace(/\s+/g, " "); + + // Strategy 1: Magento / standard product grid + for (const m of collapsed.matchAll(/]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi)) { + const card = m[1]; + + const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?addnetworks\.com\/[^"?#]+)"/i); + if (!urlMatch) continue; + const url = urlMatch[1]; + if (seen.has(url) || !/\/product(?:s)?\/|\/item\//i.test(url)) continue; + seen.add(url); + + const nameMatch = card.match(/]*>([^<]{10,})<\/h[2-4]>/i) || + card.match(/product[_-]?(?:name|title)[^>]*>([^<]{10,})]*>([^<]{10,}) 0 && price < 100000 ? price : undefined, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + compatibleWith: extractCompatibleVendor(name), + }); + } + + // Strategy 2: Generic product link fallback using matchAll + if (products.length === 0) { + for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?addnetworks\.com\/[^"?#]+)"[^>]*>\s*<[^>]+>\s*([^<]{10,})/gi)) { + const url = m[1]; + const name = m[2].trim().replace(/&/g, "&"); + if (seen.has(url) || name.length < 10) continue; + if (!/transceiver|sfp|qsfp|osfp|dac|aoc|fiber|optical/i.test(name)) continue; + seen.add(url); + + const idx = collapsed.indexOf(url); + const ctx = collapsed.slice(Math.max(0, idx - 300), idx + 600); + const priceM = ctx.match(/\$\s*([\d,]+\.?\d*)/); + const price = priceM ? parseFloat(priceM[1].replace(/,/g, "")) : undefined; + const reach = detectReach(name); + + products.push({ + partNumber: name.match(/([A-Z0-9][A-Z0-9\-\.\/]{4,})/)?.[1] || name.split(/\s+/)[0]?.slice(0, 80) || "", + name, url, + price: price && price > 0 && price < 100000 ? 
price : undefined, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + compatibleWith: extractCompatibleVendor(name), + }); + } + } + + return products; +} + +async function fetchPage(url: string): Promise { + const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); + if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); + return resp.text(); +} + +export async function scrapeAddonNetworks(): Promise { + console.log("=== AddOn Networks Scraper Starting ===\n"); + + const vendorId = await ensureVendor( + "AddOn Networks", + "compatible", + "https://www.addnetworks.com", + "https://www.addnetworks.com/products/networking/optical-networking/", + ); + + let totalProducts = 0; + let priceUpdates = 0; + const seenCategories = new Set(); + + for (const cat of CATEGORIES) { + console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`); + + try { + const html1 = await fetchPage(BASE + cat.path); + const catProducts = parseProductList(html1, cat); + + if (cat.path === "/products/networking/optical-networking/" && seenCategories.size > 3) { + console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`); + continue; + } + + if (catProducts.length === 0) { + console.log(" No products on page 1 — skipping"); + continue; + } + + seenCategories.add(cat.path); + console.log(` Found ${catProducts.length} products on page 1`); + + // Detect pagination + const totalPagesMatch = + html1.match(/page\s+\d+\s+of\s+(\d+)/i) || + html1.match(/aria-label="Last[^"]*"\s+href="[^"]*[?&]p=(\d+)/) || + html1.match(/pagination[^>]*>[\s\S]*?(\d+)<\/a>\s*<\/[^>]+>\s*<\/[^>]+>/); + const totalPages = totalPagesMatch ? 
Math.min(parseInt(totalPagesMatch[1]), MAX_PAGES) : 2; + console.log(` Total pages (estimate): ${totalPages}`); + + const allProducts = [...catProducts]; + + for (let page = 2; page <= totalPages; page++) { + await sleep(2000); + try { + const pageUrl = BASE + cat.path + `?p=${page}`; + const html = await fetchPage(pageUrl); + const pageProds = parseProductList(html, cat); + if (pageProds.length === 0) break; + allProducts.push(...pageProds); + console.log(` Page ${page}: ${pageProds.length} products`); + } catch (err) { + console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`); + break; + } + } + + const uniqueProducts = allProducts.filter((p, i, arr) => arr.findIndex((x) => x.url === p.url) === i); + console.log(` Total unique: ${uniqueProducts.length}`); + + for (const product of uniqueProducts) { + try { + const txId = await findOrCreateScrapedTransceiver({ + partNumber: product.partNumber, + vendorId, + formFactor: product.formFactor, + speedGbps: product.speedGbps, + speed: product.speed, + reachMeters: product.reachMeters, + reachLabel: product.reachLabel, + fiberType: product.fiberType, + wavelengths: product.wavelength, + category: "DataCenter", + }); + + if (product.price && product.price > 0) { + const hash = contentHash({ price: product.price, part: product.partNumber }); + const updated = await upsertPriceObservation({ + transceiverId: txId, + sourceVendorId: vendorId, + price: product.price, + currency: "USD", + stockLevel: "in_stock", + url: product.url, + contentHash: hash, + }); + if (updated) priceUpdates++; + } + totalProducts++; + } catch (err) { + console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`); + } + } + } catch (err) { + console.error(` Category failed: ${(err as Error).message}`); + } + + await sleep(2000); + } + + console.log(`\n=== AddOn Networks Complete: ${totalProducts} products, ${priceUpdates} price updates ===`); +} + +if (require.main === module) { + scrapeAddonNetworks() + .then(() => 
pool.end()) + .catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); }); +} diff --git a/packages/scraper/src/scrapers/naddod.ts b/packages/scraper/src/scrapers/naddod.ts new file mode 100644 index 0000000..84ede6e --- /dev/null +++ b/packages/scraper/src/scrapers/naddod.ts @@ -0,0 +1,285 @@ +/** + * NADDOD Scraper — Chinese compatible transceiver vendor + * + * naddod.com — WooCommerce store, server-rendered HTML, USD pricing. + * Products listed under product category pages. + * Pagination via /page/N/. Rate limited: 1 req/2sec. + * + * NADDOD (Shenzhen NADDOD Information Co.) makes and sells compatible + * optics for Cisco, Juniper, Arista, etc. Transparent USD pricing. + */ +import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; +import { contentHash } from "../utils/hash"; + +const BASE = "https://www.naddod.com"; +const HEADERS = { + "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", +}; + +const MAX_PAGES = 30; + +const CATEGORIES = [ + { path: "/product-category/1g-sfp-transceivers/", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { path: "/product-category/10g-sfp-transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { path: "/product-category/25g-sfp28-transceivers/", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, + { path: "/product-category/40g-qsfp-transceivers/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, + { path: "/product-category/100g-qsfp28-transceivers/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, + { path: "/product-category/200g-qsfp56-transceivers/", formFactor: "QSFP56", speed: "200G", speedGbps: 200 }, + { path: "/product-category/400g-qsfp-dd-transceivers/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, + { path: "/product-category/800g-osfp-transceivers/", formFactor: "OSFP", speed: 
"800G", speedGbps: 800 }, + { path: "/product-category/transceivers/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, +]; + +interface Product { + partNumber: string; + name: string; + url: string; + price?: number; + formFactor: string; + speed: string; + speedGbps: number; + reachLabel?: string; + reachMeters?: number; + fiberType?: string; + wavelength?: string; + compatibleWith?: string; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function detectReach(text: string): { label: string; meters: number } | undefined { + const patterns: [RegExp, string, number][] = [ + [/\b120\s*km\b/i, "120km", 120000], + [/\b80\s*km\b/i, "80km", 80000], + [/\b40\s*km\b/i, "40km", 40000], + [/\b20\s*km\b/i, "20km", 20000], + [/\b10\s*km\b/i, "10km", 10000], + [/\b2\s*km\b/i, "2km", 2000], + [/\b550\s*m\b/i, "550m", 550], + [/\b500\s*m\b/i, "500m", 500], + [/\b400\s*m\b/i, "400m", 400], + [/\b300\s*m\b/i, "300m", 300], + [/\b150\s*m\b/i, "150m", 150], + [/\b100\s*m\b/i, "100m", 100], + [/\bLR4\b/, "10km", 10000], + [/\bLR\b/, "10km", 10000], + [/\bER4?\b/, "40km", 40000], + [/\bZR4?\b/, "80km", 80000], + [/\bSR4?\b/, "300m", 300], + [/\bDR4?\b/, "500m", 500], + [/\bFR4?\b/, "2km", 2000], + ]; + for (const [regex, label, meters] of patterns) { + if (regex.test(text)) return { label, meters }; + } + return undefined; +} + +function detectFiber(text: string): string { + if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; + if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; + if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper"; + return ""; +} + +function detectWavelength(text: string): string { + const match = text.match(/(\d{3,4})\s*nm/i); + return match ? 
match[1] : ""; +} + +function extractCompatibleVendor(name: string): string { + const brands = ["Cisco", "Juniper", "Arista", "HPE", "Dell", "Brocade", "Extreme", "Huawei", + "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti"]; + for (const brand of brands) { + if (new RegExp(`\\b${brand}\\b`, "i").test(name)) return brand; + } + const match = name.match(/(?:for\s+|compatible\s+(?:with\s+)?)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)/); + return match ? match[1] : ""; +} + +function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { + const products: Product[] = []; + const seen = new Set(); + const collapsed = html.replace(/\s+/g, " "); + + // Strategy 1: WooCommerce standard product loop + const cardRegex = /]+class="[^"]*product[^"]*"[^>]*>([\s\S]*?)<\/li>/gi; + let cardMatch; + while ((cardMatch = cardRegex.exec(collapsed)) !== null) { + const card = cardMatch[1]; + + const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?naddod\.com\/product\/[^"]+)"/i); + if (!urlMatch) continue; + const url = urlMatch[1]; + if (seen.has(url)) continue; + seen.add(url); + + const nameMatch = card.match(/woocommerce-loop-product__title[^>]*>([^<]+)]*>([^<]{10,})<\/h2>/i) || + card.match(/]*>([^<]{10,})<\/h3>/i); + if (!nameMatch) continue; + const name = nameMatch[1].trim().replace(/&/g, "&").replace(/–/g, "–"); + if (name.length < 5) continue; + + const priceMatch = card.match(/\$\s*([\d,]+\.?\d*)/); + const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined; + + const reach = detectReach(name); + const partNumber = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60); + + products.push({ + partNumber, name, url, + price: price && price > 0 && price < 100000 ? 
price : undefined, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + compatibleWith: extractCompatibleVendor(name), + }); + } + + // Strategy 2: Generic product link fallback + if (products.length === 0) { + const linkRegex = /href="(https?:\/\/(?:www\.)?naddod\.com\/(?:product|shop)\/[^"?#]+)"[^>]*>\s*([^<]{10,})/gi; + let m; + while ((m = linkRegex.exec(collapsed)) !== null) { + const url = m[1]; + const name = m[2].trim().replace(/&/g, "&"); + if (seen.has(url) || name.length < 10) continue; + if (!/transceiver|sfp|qsfp|osfp|dac|aoc|xfp/i.test(name)) continue; + seen.add(url); + + const ctx = collapsed.slice(Math.max(0, m.index - 200), m.index + 500); + const priceM = ctx.match(/\$\s*([\d,]+\.?\d*)/); + const price = priceM ? parseFloat(priceM[1].replace(/,/g, "")) : undefined; + const reach = detectReach(name); + + products.push({ + partNumber: name.split(/\s+/)[0]?.slice(0, 80) || "", + name, url, + price: price && price > 0 && price < 100000 ? 
price : undefined, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + compatibleWith: extractCompatibleVendor(name), + }); + } + } + + return products; +} + +async function fetchPage(url: string): Promise { + const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); + if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); + return resp.text(); +} + +export async function scrapeNaddod(): Promise { + console.log("=== NADDOD Scraper Starting ===\n"); + + const vendorId = await ensureVendor( + "NADDOD", + "compatible", + "https://www.naddod.com", + "https://www.naddod.com/product-category/transceivers/", + ); + + let totalProducts = 0; + let priceUpdates = 0; + const seenCategories = new Set(); + + for (const cat of CATEGORIES) { + console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`); + + try { + const html1 = await fetchPage(BASE + cat.path); + const catProducts = parseProductList(html1, cat); + + if (cat.path.includes("/transceivers/") && seenCategories.size > 3) { + console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`); + continue; + } + + if (catProducts.length === 0) { + console.log(" No products on page 1 — skipping"); + continue; + } + + seenCategories.add(cat.path); + console.log(` Found ${catProducts.length} products on page 1`); + + const totalPagesMatch = html1.match(/page-numbers[^>]*>(\d+)<\/a>(?!.*page-numbers)/); + const totalPages = totalPagesMatch ? 
Math.min(parseInt(totalPagesMatch[1]), MAX_PAGES) : 1; + console.log(` Total pages: ${totalPages}`); + + const allProducts = [...catProducts]; + + for (let page = 2; page <= totalPages; page++) { + await sleep(2000); + try { + const html = await fetchPage(BASE + cat.path + `page/${page}/`); + const pageProds = parseProductList(html, cat); + if (pageProds.length === 0) break; + allProducts.push(...pageProds); + console.log(` Page ${page}: ${pageProds.length} products`); + } catch (err) { + console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`); + break; + } + } + + const uniqueProducts = allProducts.filter((p, i, arr) => arr.findIndex((x) => x.url === p.url) === i); + console.log(` Total unique: ${uniqueProducts.length}`); + + for (const product of uniqueProducts) { + try { + const txId = await findOrCreateScrapedTransceiver({ + partNumber: product.partNumber, + vendorId, + formFactor: product.formFactor, + speedGbps: product.speedGbps, + speed: product.speed, + reachMeters: product.reachMeters, + reachLabel: product.reachLabel, + fiberType: product.fiberType, + wavelengths: product.wavelength, + category: "DataCenter", + }); + + if (product.price && product.price > 0) { + const hash = contentHash({ price: product.price, part: product.partNumber }); + const updated = await upsertPriceObservation({ + transceiverId: txId, + sourceVendorId: vendorId, + price: product.price, + currency: "USD", + stockLevel: "in_stock", + url: product.url, + contentHash: hash, + }); + if (updated) priceUpdates++; + } + totalProducts++; + } catch (err) { + console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`); + } + } + } catch (err) { + console.error(` Category failed: ${(err as Error).message}`); + } + + await sleep(2000); + } + + console.log(`\n=== NADDOD Complete: ${totalProducts} products, ${priceUpdates} price updates ===`); +} + +if (require.main === module) { + scrapeNaddod() + .then(() => pool.end()) + .catch((err) => { console.error("Fatal:", 
err); pool.end(); process.exit(1); }); +} diff --git a/packages/scraper/src/scrapers/qsfptek.ts b/packages/scraper/src/scrapers/qsfptek.ts new file mode 100644 index 0000000..d66a60b --- /dev/null +++ b/packages/scraper/src/scrapers/qsfptek.ts @@ -0,0 +1,281 @@ +/** + * QSFPTEK Scraper — Chinese compatible transceiver vendor + * + * qsfptek.com — Server-rendered HTML shop, USD pricing. + * Focuses on QSFP+/QSFP28/QSFP-DD/SFP+ form factors. + * Rate limited: 1 req/2sec. + * + * QSFPTEK (Shenzhen Optotech Technology) — competitive pricing, + * transparent USD prices, no account required. + */ +import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; +import { contentHash } from "../utils/hash"; + +const BASE = "https://www.qsfptek.com"; +const HEADERS = { + "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", +}; + +const MAX_PAGES = 30; + +const CATEGORIES = [ + { path: "/c/sfp-transceiver.html", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { path: "/c/sfp-plus-transceiver.html", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { path: "/c/sfp28-transceiver.html", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, + { path: "/c/qsfp-plus-transceiver.html", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, + { path: "/c/qsfp28-transceiver.html", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, + { path: "/c/qsfp56-transceiver.html", formFactor: "QSFP56", speed: "200G", speedGbps: 200 }, + { path: "/c/qsfp-dd-transceiver.html", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, + { path: "/c/osfp-transceiver.html", formFactor: "OSFP", speed: "800G", speedGbps: 800 }, + { path: "/c/optical-transceiver.html", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, +]; + +interface Product { + partNumber: string; + name: string; + url: string; + price?: number; + formFactor: 
string; + speed: string; + speedGbps: number; + reachLabel?: string; + reachMeters?: number; + fiberType?: string; + wavelength?: string; + compatibleWith?: string; +} + /** Returns a promise that resolves after ms milliseconds, implemented with setTimeout. */ +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + /** Derives a reach label and a distance in meters from free text. Patterns are tried in array order and the first hit wins: explicit distances (120 km down to 100 m, case-insensitive, optional whitespace before the unit) are listed ahead of the case-sensitive optic codes LR4, LR, ER, ZR, SR, DR and FR (the last five with an optional trailing 4), each of which maps to one fixed distance. Returns undefined when no pattern matches. */ +function detectReach(text: string): { label: string; meters: number } | undefined { + const patterns: [RegExp, string, number][] = [ + [/\b120\s*km\b/i, "120km", 120000], + [/\b80\s*km\b/i, "80km", 80000], + [/\b40\s*km\b/i, "40km", 40000], + [/\b20\s*km\b/i, "20km", 20000], + [/\b10\s*km\b/i, "10km", 10000], + [/\b2\s*km\b/i, "2km", 2000], + [/\b550\s*m\b/i, "550m", 550], + [/\b500\s*m\b/i, "500m", 500], + [/\b300\s*m\b/i, "300m", 300], + [/\b100\s*m\b/i, "100m", 100], + [/\bLR4\b/, "10km", 10000], + [/\bLR\b/, "10km", 10000], + [/\bER4?\b/, "40km", 40000], + [/\bZR4?\b/, "80km", 80000], + [/\bSR4?\b/, "300m", 300], + [/\bDR4?\b/, "500m", 500], + [/\bFR4?\b/, "2km", 2000], + ]; + for (const [regex, label, meters] of patterns) { + if (regex.test(text)) return { label, meters }; + } + return undefined; +} + /** Classifies the medium from keywords in text, checked in the order SMF, MMF, Copper; returns an empty string when nothing matches. NOTE(review): the lx, lr, er, zr, sx and sr alternatives each require a non-letter character on both sides, so a code at the very start or very end of text is not recognised by those alternatives. */ +function detectFiber(text: string): string { + if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(text)) return "SMF"; + if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(text)) return "MMF"; + if (/copper|dac|twinax|rj.?45|base-t/i.test(text)) return "Copper"; + return ""; +} + /** Returns the digits of the first 3 or 4 digit number that is followed by nm (optional whitespace, case-insensitive), or an empty string when there is none. */ +function detectWavelength(text: string): string { + const match = text.match(/(\d{3,4})\s*nm/i); + return match ? 
match[1] : ""; +} + /** Returns the first entry of the fixed brand list that occurs in name as a whole word (case-insensitive), in list order and with the list spelling; returns an empty string when no brand is found. */ +function extractCompatibleVendor(name: string): string { + const brands = ["Cisco", "Juniper", "Arista", "HPE", "Aruba", "Dell", "Brocade", "Extreme", + "Huawei", "Nokia", "MikroTik", "Mellanox", "Nvidia", "Ubiquiti", "Allied Telesis"]; + for (const brand of brands) { + if (new RegExp(`\\b${brand}\\b`, "i").test(name)) return brand; + } + return ""; +} + /** Extracts products from one category listing page. Whitespace runs in html are collapsed to single spaces first. Strategy 1 scans product-thumb and product-layout cards: each card needs a qsfptek.com link (used as url and de-duplicated through the seen set) and a name of at least 10 characters; numeric character references are removed from the name. A dollar price is optional, and a price that is not strictly between 0 and 100000 is stored as undefined. partNumber is the part of the name before the first whitespace that is followed by compatible, for, sfp or qsfp, capped at 80 characters, falling back to the first 60 characters of the name. Form factor and speed are copied from cat; reach, fiber type, wavelength and compatible vendor are derived from the name. Strategy 2, a generic product link scan, runs only when Strategy 1 produced no products. NOTE(review): several regex literals and most of the Strategy 2 loop body appear truncated in this copy of the patch (markup-like fragments seem to have been stripped); compare with the original file before relying on them. */ +function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { + const products: Product[] = []; + const seen = new Set(); + const collapsed = html.replace(/\s+/g, " "); + + // Strategy 1: OpenCart / custom card layout using matchAll + for (const cardMatch of collapsed.matchAll(/]+class="[^"]*product-(?:thumb|layout)[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gi)) { + const card = cardMatch[1]; + + const urlMatch = card.match(/href="(https?:\/\/(?:www\.)?qsfptek\.com\/[^"]+)"/i); + if (!urlMatch) continue; + const url = urlMatch[1]; + if (seen.has(url)) continue; + seen.add(url); + + const nameMatch = card.match(/]*>\s*]*>([^<]{10,})<\/a>/i) || + card.match(/]*title="([^"]{10,})"/i); + if (!nameMatch) continue; + const name = nameMatch[1].trim().replace(/&/g, "&").replace(/&#[0-9]+;/g, ""); + if (name.length < 5) continue; + + const priceMatch = card.match(/\$\s*([\d,]+\.?\d*)/); + const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined; + + const reach = detectReach(name); + const partNumber = name.split(/\s+(?:compatible|for|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60); + + products.push({ + partNumber, name, url, + price: price && price > 0 && price < 100000 ? 
price : undefined, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + compatibleWith: extractCompatibleVendor(name), + }); + } + + // Strategy 2: Generic product link scan using matchAll + if (products.length === 0) { + for (const m of collapsed.matchAll(/href="(https?:\/\/(?:www\.)?qsfptek\.com\/(?:p|product)[^"?#]+)"[^>]*>([^<]{10,}) 0 && price < 100000 ? price : undefined, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + compatibleWith: extractCompatibleVendor(name), + }); + } + } + + return products; +} + /** Fetches url with the shared HEADERS and a 30 second abort timeout. Throws an Error naming the HTTP status and url when the response is not ok; otherwise resolves with the response body as text. */ +async function fetchPage(url: string): Promise { + const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); + if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); + return resp.text(); +} + /** Scraper entry point. Registers the QSFPTEK vendor through ensureVendor, then walks CATEGORIES one at a time. For each category page 1 is fetched and parsed. A category whose path contains optical-transceiver is skipped once more than 3 other categories have yielded products (NOTE(review): this check runs only after page 1 has already been fetched and parsed). A category with no products on page 1 is skipped. The page count is read from a total-page or page-X-of-Y marker and capped at MAX_PAGES, defaulting to 3 when no marker is found. Pages 2 and up are fetched with a 2 second pause before each; an empty page or a failed fetch ends pagination for that category. Products are de-duplicated by url and stored through findOrCreateScrapedTransceiver with category DataCenter; when a positive price is present a USD price observation with stock level in_stock is upserted, and priceUpdates is incremented if upsertPriceObservation returns a truthy value. Per-product and per-category errors are logged and do not abort the run. A 2 second sleep follows each category that reaches the end of the loop body; the continue paths bypass it. The trailing require.main guard runs this function when the file is executed directly and ends the pool afterwards; NOTE(review): on failure pool.end is called without being awaited before process.exit(1). */ +export async function scrapeQsfptek(): Promise { + console.log("=== QSFPTEK Scraper Starting ===\n"); + + const vendorId = await ensureVendor( + "QSFPTEK", + "compatible", + "https://www.qsfptek.com", + "https://www.qsfptek.com/c/optical-transceiver.html", + ); + + let totalProducts = 0; + let priceUpdates = 0; + const seenCategories = new Set(); + + for (const cat of CATEGORIES) { + console.log(`\n--- ${cat.formFactor} (${cat.speed}) [${cat.path}] ---`); + + try { + const html1 = await fetchPage(BASE + cat.path); + const catProducts = parseProductList(html1, cat); + + if (cat.path.includes("/optical-transceiver") && seenCategories.size > 3) { + console.log(` Skipping generic fallback (${seenCategories.size} specific categories scraped)`); + continue; + } + + if (catProducts.length === 0) { + console.log(" No products on page 1 — skipping"); + continue; + } + + seenCategories.add(cat.path); + console.log(` Found ${catProducts.length} 
products on page 1`); + + const totalPagesMatch = + html1.match(/total-page[^>]*>\s*(\d+)/) || + html1.match(/page\s+\d+\s+of\s+(\d+)/i); + const totalPages = totalPagesMatch ? Math.min(parseInt(totalPagesMatch[1]), MAX_PAGES) : 3; + console.log(` Total pages (estimate): ${totalPages}`); + + const allProducts = [...catProducts]; + + for (let page = 2; page <= totalPages; page++) { + await sleep(2000); + try { + const pageUrl = BASE + cat.path.replace(".html", "") + `?page=${page}`; + const html = await fetchPage(pageUrl); + const pageProds = parseProductList(html, cat); + if (pageProds.length === 0) break; + allProducts.push(...pageProds); + console.log(` Page ${page}: ${pageProds.length} products`); + } catch (err) { + console.warn(` Page ${page} failed: ${(err as Error).message.slice(0, 60)}`); + break; + } + } + + const uniqueProducts = allProducts.filter((p, i, arr) => arr.findIndex((x) => x.url === p.url) === i); + console.log(` Total unique: ${uniqueProducts.length}`); + + for (const product of uniqueProducts) { + try { + const txId = await findOrCreateScrapedTransceiver({ + partNumber: product.partNumber, + vendorId, + formFactor: product.formFactor, + speedGbps: product.speedGbps, + speed: product.speed, + reachMeters: product.reachMeters, + reachLabel: product.reachLabel, + fiberType: product.fiberType, + wavelengths: product.wavelength, + category: "DataCenter", + }); + + if (product.price && product.price > 0) { + const hash = contentHash({ price: product.price, part: product.partNumber }); + const updated = await upsertPriceObservation({ + transceiverId: txId, + sourceVendorId: vendorId, + price: product.price, + currency: "USD", + stockLevel: "in_stock", + url: product.url, + contentHash: hash, + }); + if (updated) priceUpdates++; + } + totalProducts++; + } catch (err) { + console.warn(` DB error: ${(err as Error).message.slice(0, 80)}`); + } + } + } catch (err) { + console.error(` Category failed: ${(err as Error).message}`); + } + + await 
sleep(2000); + } + + console.log(`\n=== QSFPTEK Complete: ${totalProducts} products, ${priceUpdates} price updates ===`); +} + +if (require.main === module) { + scrapeQsfptek() + .then(() => pool.end()) + .catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); }); +}