/**
 * NADDOD Scraper v2 — Chinese compatible transceiver vendor
 *
 * naddod.com — Migrated from WooCommerce → Astro/Shopify-style in 2025.
 * Product URLs: /products/XXXXX.html (numeric IDs, not category slugs)
 * Sitemap: /sitemaps/products.xml
 *
 * Phase 1: Parse sitemap to collect all product URLs (plain HTTP)
 * Phase 2: Fetch product detail pages — extract name, price, stock count
 *   Stock format: "In Stock: 543" (exact) | "In Stock: 2.1k+" (rounded) | "In Stock: Available" (boolean)
 *   Per-warehouse JSON (warehouse_stock: {us, nl, sg, cn}) is in a JS hydration
 *   payload that requires JS execution — only the display count is in plain HTML.
 *   → stock_confidence=2 (aggregated global count) for exact/rounded counts
 *   → stock_confidence=1 (boolean) for "Available" only
 *
 * Rate limited: 1 req/2sec.
 */

import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
import { contentHash } from "../utils/hash";

const BASE = "https://www.naddod.com";
const SITEMAP_URL = `${BASE}/sitemaps/products.xml`;

const HEADERS = {
  "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; research)",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};

// Limit detail-page fetches per run to stay reasonable
const MAX_DETAIL_PAGES = 600;

// Pause before every detail-page request (1 request per 2 seconds).
const REQUEST_DELAY_MS = 2000;

// Abort any single HTTP request that takes longer than this.
const FETCH_TIMEOUT_MS = 30000;

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Turn anything thrown into a printable message without assuming it is an Error. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

// ── Classification helpers ──────────────────────────────────────────────────

// Ordered most-specific first; the first matching rule wins. Patterns are
// tested against the lower-cased product name.
const FORM_FACTOR_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [/\bosfp\b/, "OSFP"],
  [/\bqsfp.?dd800\b|\bqsfp.?dd\s+800\b/, "QSFP-DD800"],
  [/\bqsfp.?dd\b/, "QSFP-DD"],
  [/\bqsfp56\b/, "QSFP56"],
  [/\bqsfp112\b/, "QSFP112"],
  [/\bqsfp28\b/, "QSFP28"],
  [/\bqsfp\+|\bqsfp\s*plus\b/, "QSFP+"],
  [/\bsfp28\b/, "SFP28"],
  [/\bsfp.?\+|10g.*sfp|sfp.*10g/, "SFP+"],
  [/\bsfp\b/, "SFP"],
  [/\bxfp\b/, "XFP"],
];

/**
 * Guess the module form factor from free text (typically the product name).
 *
 * @param text - Product name or description; matched case-insensitively.
 * @returns The first matching form-factor label, or "SFP+" when nothing matches.
 */
function detectFormFactor(text: string): string {
  const t = text.toLowerCase();
  for (const [re, label] of FORM_FACTOR_RULES) {
    if (re.test(t)) return label;
  }
  return "SFP+"; // default
}

// Speed tiers checked from fastest to slowest, so a name that mentions several
// rates (e.g. a breakout cable) is classified by its highest one.
const SPEED_RULES: ReadonlyArray<{ gbps: number; re: RegExp }> = [800, 400, 200, 100, 40, 25, 10].map((gbps) => ({
  gbps,
  // Accepts "100G", "100GBE" and IEEE-style "100GBASE-…" spellings.
  re: new RegExp(`\\b${gbps}G(?:BE|BASE)?\\b`),
}));

// 1G has extra spellings: "GIGABIT" and the IEEE "1000BASE-…" family.
const ONE_GIG_RULE = /\b1G(?:BE|BASE)?\b|\bGIGABIT\b|\b1000BASE/;

/**
 * Guess the line rate from free text.
 *
 * @param text - Product name or description; matched case-insensitively.
 * @returns A display label plus numeric Gbps, or `{ speed: "Unknown", speedGbps: 0 }`
 *          when no known rate is mentioned.
 */
function detectSpeedGbps(text: string): { speed: string; speedGbps: number } {
  const t = text.toUpperCase();
  for (const { gbps, re } of SPEED_RULES) {
    if (re.test(t)) return { speed: `${gbps}G`, speedGbps: gbps };
  }
  if (ONE_GIG_RULE.test(t)) return { speed: "1G", speedGbps: 1 };
  return { speed: "Unknown", speedGbps: 0 };
}

// Explicit distances come first; the optical-standard suffixes (LR, SR, …) are
// case-sensitive fallbacks that map to a nominal reach for that family.
const REACH_RULES: ReadonlyArray<readonly [RegExp, string, number]> = [
  [/\b120\s*km\b/i, "120km", 120000],
  [/\b80\s*km\b/i, "80km", 80000],
  [/\b40\s*km\b/i, "40km", 40000],
  [/\b20\s*km\b/i, "20km", 20000],
  [/\b10\s*km\b/i, "10km", 10000],
  [/\b2\s*km\b/i, "2km", 2000],
  [/\b550\s*m\b/i, "550m", 550],
  [/\b500\s*m\b/i, "500m", 500],
  [/\b400\s*m\b/i, "400m", 400],
  [/\b300\s*m\b/i, "300m", 300],
  [/\b150\s*m\b/i, "150m", 150],
  [/\b100\s*m\b/i, "100m", 100],
  [/\bLR4\b/, "10km", 10000],
  [/\bLR\b/, "10km", 10000],
  [/\bER4?\b/, "40km", 40000],
  [/\bZR4?\b/, "80km", 80000],
  [/\bSR4?\b/, "300m", 300],
  [/\bDR4?\b/, "500m", 500],
  [/\bFR4?\b/, "2km", 2000],
];

/**
 * Guess the rated reach from free text.
 *
 * @param text - Product name or description.
 * @returns The label and distance in meters of the first matching rule, or
 *          `undefined` when no distance or known suffix is present.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  for (const [re, label, meters] of REACH_RULES) {
    if (re.test(text)) return { label, meters };
  }
  return undefined;
}

// Two-letter optic codes must not be embedded in a longer word, but they may
// sit at the very start or end of the text — hence the ^ / $ alternatives.
const SMF_HINT = /single.?mode|smf|(?:^|[^a-z])(?:lx|lr|er|zr)(?:[^a-z]|$)|bidi|cwdm|dwdm/i;
const MMF_HINT = /multi.?mode|mmf|(?:^|[^a-z])(?:sx|sr)(?:[^a-z]|$)/i;
const COPPER_HINT = /copper|dac|twinax|rj.?45|base-t/i;

/**
 * Guess the transmission medium from free text.
 *
 * @param text - Product name or description; matched case-insensitively.
 * @returns "SMF", "MMF", "Copper", or "" when nothing matches. Single-mode
 *          hints are checked first, then multi-mode, then copper.
 */
function detectFiber(text: string): string {
  if (SMF_HINT.test(text)) return "SMF";
  if (MMF_HINT.test(text)) return "MMF";
  if (COPPER_HINT.test(text)) return "Copper";
  return "";
}

/**
 * Extract the first wavelength mentioned in the text.
 *
 * @param text - Product name or description.
 * @returns The 3–4 digit number preceding "nm" (e.g. "1310"), or "" if none.
 */
function detectWavelength(text: string): string {
  const m = text.match(/(\d{3,4})\s*nm/i);
  return m ? m[1] : "";
}

/**
 * Decide whether a product name looks like something this scraper tracks.
 *
 * @param name - Product name.
 * @returns `true` when the name mentions a pluggable form factor, a DAC/AOC
 *          cable, or CWDM/DWDM optics.
 */
function isTransceiver(name: string): boolean {
  // Include: SFP, QSFP, OSFP, XFP, DAC/AOC cables count as transceivers for stock tracking
  return /sfp|qsfp|osfp|xfp|transceiver|dac|aoc|cwdm|dwdm/i.test(name);
}

// ── Stock parsing ───────────────────────────────────────────────────────────

/**
 * Parse NADDOD stock display text.
 *   "In Stock: 543"       → { qty: 543, confidence: 2 }
 *   "In Stock: 2.1k+"     → { qty: 2100, confidence: 2 }   (approximate, rounded)
 *   "In Stock: Available" → { confidence: 1 }
 * Any other token after "In Stock" is reported as boolean availability
 * ({ confidence: 1 }) rather than guessing a number from stray digits.
 *
 * @param html - Raw HTML of a product detail page.
 * @returns The parsed stock reading, or `null` if no "In Stock" text is found.
 */
function parseStockText(html: string): { qty?: number; confidence: 1 | 2 } | null {
  // Look for "In Stock: X" pattern in page text
  const m = html.match(/In\s+Stock[:\s]+([^\s<"]+)/i);
  if (!m) return null;

  // Drop sentence punctuation that may trail the token ("543." / "2.1k+,").
  const raw = m[1].trim().toLowerCase().replace(/[.,;:!)]+$/, "");

  // "Available" = boolean only
  if (raw.startsWith("avail")) return { confidence: 1 };

  // Rounded: "2.1k+" or "1.5k" — a single well-formed decimal followed by "k".
  const rounded = raw.match(/^(\d+(?:\.\d+)?)k\+?$/);
  if (rounded) {
    return { qty: Math.round(parseFloat(rounded[1]) * 1000), confidence: 2 };
  }

  // Exact: "543", "1,234" or "500+". Anything else (decimals without "k",
  // other unit suffixes, words) is deliberately NOT reduced to its digits.
  const exact = raw.match(/^(\d[\d,]*)\+?$/);
  if (exact) {
    return { qty: parseInt(exact[1].replace(/,/g, ""), 10), confidence: 2 };
  }

  return { confidence: 1 }; // fallback: boolean
}

// ── HTTP helpers ────────────────────────────────────────────────────────────

/**
 * GET a URL with the scraper's headers and return the response body as text.
 *
 * @param url - Absolute URL to fetch.
 * @returns The response body.
 * @throws Error with message `HTTP <status>` on a non-2xx response; the request
 *         is aborted after FETCH_TIMEOUT_MS.
 */
async function fetchText(url: string): Promise<string> {
  const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(FETCH_TIMEOUT_MS) });
  if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
  return resp.text();
}

/**
 * Parse product detail page HTML → extract name, price, stock.
 *
 * @param html - Raw HTML of the product page.
 * @param url - Page URL. Currently unused; kept so existing call sites stay valid.
 * @returns The extracted fields, or `null` when no usable name is found or the
 *          name does not look like a transceiver.
 */
function parseDetailPage(html: string, url: string): {
  name: string;
  price?: number;
  stock: { qty?: number; confidence: 1 | 2 } | null;
} | null {
  // Product name: og:title, then <h1>, then <title>
  const ogTitle = html.match(/<meta\s+property="og:title"\s+content="([^"]+)"/i)?.[1];
  const h1 = html.match(/<h1[^>]*>([^<]{15,})<\/h1>/i)?.[1]?.trim();
  const titleTag = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim();

  // Collapse whitespace runs, then trim so the length check and stored name
  // are not affected by padding in the og:title attribute.
  const name = (ogTitle || h1 || titleTag || "").replace(/\s+/g, " ").trim().slice(0, 200);
  if (!name || name.length < 10) return null;
  if (!isTransceiver(name)) return null;

  // Price: "US$ 10.90" or "$10.90"
  const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) || html.match(/\$\s*([\d,]+\.\d{2})\b/);
  const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;

  // Stock count
  const stock = parseStockText(html);

  // Discard NaN, zero, and implausibly large prices.
  const saneePriceOrUndefined = price !== undefined && price > 0 && price < 500000 ? price : undefined;
  return { name, price: saneePriceOrUndefined, stock };
}

// ── Sitemap parsing ─────────────────────────────────────────────────────────

/**
 * Download the product sitemap and collect product detail URLs.
 *
 * @returns De-duplicated URLs of the form `…/products/<digits>.html`, excluding
 *          any that contain an `/en/`, `/de/` or `/fr/` path segment.
 * @throws Whatever `fetchText` throws if the sitemap cannot be downloaded.
 */
async function fetchProductUrlsFromSitemap(): Promise<string[]> {
  console.log(` Fetching sitemap: ${SITEMAP_URL}`);
  const xml = await fetchText(SITEMAP_URL);

  // Extract all <loc> URLs that match /products/XXXXX.html
  const urls: string[] = [];
  const locRegex = /<loc>([^<]+\/products\/\d+\.html)<\/loc>/gi;
  let m: RegExpExecArray | null;
  while ((m = locRegex.exec(xml)) !== null) {
    const url = m[1].trim();
    // Keep only canonical English URLs (no language prefix)
    if (!url.includes("/en/") && !url.includes("/de/") && !url.includes("/fr/")) {
      urls.push(url);
    }
  }

  return [...new Set(urls)]; // deduplicate
}

// ── Main scraper ────────────────────────────────────────────────────────────

/**
 * Run the full NADDOD scrape: discover product URLs from the sitemap, fetch up
 * to MAX_DETAIL_PAGES detail pages (one per REQUEST_DELAY_MS), and write price
 * and stock observations through the db helpers.
 *
 * Per-product failures are counted and logged (first 10 only) without stopping
 * the run. A sitemap failure or an empty sitemap ends the run early without
 * throwing.
 */
export async function scrapeNaddod(): Promise<void> {
  console.log("=== NADDOD Scraper v2 Starting (sitemap + detail mode) ===\n");

  const vendorId = await ensureVendor(
    "NADDOD",
    "compatible",
    "https://www.naddod.com",
    "https://www.naddod.com/collection/optical-transceivers",
  );

  // ── Phase 1: Discover product URLs via sitemap ────────────────────────────
  console.log("[Phase 1] Discovering products from sitemap...");
  let productUrls: string[] = [];
  try {
    productUrls = await fetchProductUrlsFromSitemap();
    console.log(` Found ${productUrls.length} product URLs in sitemap`);
  } catch (err: unknown) {
    console.error(` Sitemap fetch failed: ${errorMessage(err)}`);
    return;
  }

  if (productUrls.length === 0) {
    console.warn(" No product URLs found — aborting");
    return;
  }

  // Limit to avoid excessive runtime
  const urls = productUrls.slice(0, MAX_DETAIL_PAGES);
  console.log(` Processing ${urls.length} products (limit: ${MAX_DETAIL_PAGES})`);

  // ── Phase 2: Fetch detail pages + write to DB ─────────────────────────────
  console.log("\n[Phase 2] Fetching product detail pages...");
  let processed = 0;
  let priceUpdates = 0;
  let stockWritten = 0;
  let stockSkipped = 0;
  let skippedNonTx = 0;
  let errors = 0;

  for (const url of urls) {
    await sleep(REQUEST_DELAY_MS);
    try {
      const html = await fetchText(url);
      const detail = parseDetailPage(html, url);
      if (!detail) {
        // Covers both "not a transceiver" and "no usable product name".
        skippedNonTx++;
        continue;
      }

      const { name, price, stock } = detail;
      const { speed, speedGbps } = detectSpeedGbps(name);
      const formFactor = detectFormFactor(name);
      const reach = detectReach(name);
      const fiberType = detectFiber(name);
      const wavelength = detectWavelength(name);

      // Extract part number from name (first word-group before "Compatible" or vendor name)
      const partNumber =
        name.split(/\s+(?:compatible|for\s+[A-Z]|sfp|qsfp)/i)[0]?.trim().slice(0, 80) || name.slice(0, 60);

      const txId = await findOrCreateScrapedTransceiver({
        partNumber,
        vendorId,
        formFactor,
        speedGbps,
        speed,
        reachMeters: reach?.meters,
        reachLabel: reach?.label,
        fiberType,
        wavelengths: wavelength,
        category: "DataCenter",
      });

      // Price observation
      if (price && price > 0) {
        const hash = contentHash({ price, part: partNumber });
        // NOTE(review): an exact count of 0 maps to "on_request" here, whereas
        // the stock observation below records "out_of_stock" — confirm intended.
        const hasPositiveQty = stock?.qty !== undefined && stock.qty > 0;
        const priceStockLevel = hasPositiveQty || stock?.confidence === 1 ? "in_stock" : "on_request";
        const isNew = await upsertPriceObservation({
          transceiverId: txId,
          sourceVendorId: vendorId,
          price,
          currency: "USD",
          stockLevel: priceStockLevel,
          url,
          contentHash: hash,
        });
        if (isNew) priceUpdates++;
      }

      // Stock observation
      if (stock !== null) {
        // No quantity means the page only said "Available" → treat as in stock.
        const stockLevel = stock.qty !== undefined ? (stock.qty > 0 ? "in_stock" : "out_of_stock") : "in_stock";
        const isNew = await upsertStockObservation({
          transceiverId: txId,
          sourceVendorId: vendorId,
          stockLevel,
          quantityAvailable: stock.qty !== undefined && stock.qty > 0 ? stock.qty : undefined,
          productUrl: url,
          stockConfidence: stock.confidence,
          priceCurrency: "USD",
          priceIncludesTax: false,
        });
        if (isNew) stockWritten++;
        else stockSkipped++;
      }

      processed++;
      if (processed % 50 === 0) {
        console.log(` Progress: ${processed}/${urls.length} | prices: ${priceUpdates} | stock: ${stockWritten} new / ${stockSkipped} unchanged`);
      }
    } catch (err: unknown) {
      errors++;
      // Only the first 10 failures are printed; the total is reported at the end.
      if (errors <= 10) console.warn(` Error for ${url}: ${errorMessage(err).slice(0, 120)}`);
    }
  }

  console.log("\n=== NADDOD Scraper v2 Complete ===");
  console.log(` Products processed: ${processed}`);
  console.log(` Non-transceivers skip: ${skippedNonTx}`);
  console.log(` Price observations: ${priceUpdates} new`);
  console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`);
  if (errors > 0) console.warn(` Errors: ${errors}`);
}

// Run the scraper when this file is executed directly, closing the shared
// pool afterwards on both the success and failure paths.
if (require.main === module) {
  scrapeNaddod()
    .then(() => pool.end())
    .catch((err) => {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    });
}