/**
 * SmartOptics Scraper — Premium coherent/DWDM transceiver manufacturer
 *
 * smartoptics.com — WordPress/WooCommerce, no prices (B2B, RFQ only).
 * Scrapes product catalog for specs, images, and datasheets.
 *
 * v2 fixes:
 * - Multi-category crawl (coherent, DWDM, access, SFP, QSFP)
 * - Handles both absolute AND relative product URLs
 * - WooCommerce REST API fallback for complete product list
 * - Up to 10 pagination pages per category
 */

import { pool, findOrCreateScrapedTransceiver, ensureVendor } from "../utils/db";

const BASE = "https://smartoptics.com";

const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};

/** All transceiver-related catalog category pages to crawl */
const CATALOG_PAGES = [
  "/products/optical-transceivers/",
  "/products/",
  "/product-category/optical-transceivers/",
  "/product-category/transceivers/",
  "/product-category/sfp/",
  "/product-category/qsfp/",
  "/product-category/coherent/",
  "/product-category/dwdm/",
];

/** Resolves after `ms` milliseconds; used to throttle requests. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Returns a printable message for any thrown value, not only `Error` instances. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Guesses form factor and line rate from free text (normally the product name).
 *
 * Rules are evaluated top to bottom and the first hit wins, so the more specific
 * tokens ("qsfp28") must stay ahead of the tokens they contain ("sfp28", "sfp").
 *
 * @param text Product name or other descriptive text.
 * @returns The detected form factor / speed, or QSFP28 / 100G when nothing matches.
 */
function detectFormFactor(text: string): { formFactor: string; speed: string; speedGbps: number } {
  // Remove grid spacings such as "100GHz" / "50 GHz" first: otherwise "100ghz"
  // satisfies the "100g" test and a 10G DWDM SFP+ is classified as QSFP28/100G.
  const t = text.toLowerCase().replace(/\d+(?:\.\d+)?\s*ghz/g, " ");

  // "800g" also covers "800ge"; without it "QSFP-DD 800G" fell through to 400G.
  if (t.includes("qsfp-dd800") || t.includes("800g")) return { formFactor: "QSFP-DD", speed: "800G", speedGbps: 800 };
  if (t.includes("qsfp-dd") || (t.includes("400g") && t.includes("qsfp")))
    return { formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 };
  if (t.includes("qsfp112")) return { formFactor: "QSFP112", speed: "400G", speedGbps: 400 };
  if (t.includes("qsfp56")) return { formFactor: "QSFP56", speed: "200G", speedGbps: 200 };
  if (t.includes("qsfp28") || t.includes("100ge") || t.includes("100g"))
    return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
  if (t.includes("sfp28") || t.includes("25g")) return { formFactor: "SFP28", speed: "25G", speedGbps: 25 };
  if (t.includes("qsfp+") || t.includes("40g")) return { formFactor: "QSFP+", speed: "40G", speedGbps: 40 };
  if (t.includes("sfp+") || t.includes("10g")) return { formFactor: "SFP+", speed: "10G", speedGbps: 10 };
  if (t.includes("sfp")) return { formFactor: "SFP", speed: "1G", speedGbps: 1 };
  return { formFactor: "QSFP28", speed: "100G", speedGbps: 100 };
}

/**
 * Extracts the first reach figure found in `text`.
 *
 * A kilometre value ("40km", "1.5 km") takes precedence over a metre value ("100m").
 *
 * @returns A display label plus the reach in whole metres, or `undefined` if none is present.
 */
function detectReach(text: string): { label: string; meters: number } | undefined {
  const kmText = text.match(/(\d+(?:\.\d+)?)\s*km/i)?.[1];
  if (kmText !== undefined) {
    const km = parseFloat(kmText);
    // Round: 1.1 * 1000 is 1100.0000000000002 in floating point.
    return { label: `${km}km`, meters: Math.round(km * 1000) };
  }

  const mText = text.match(/(\d+)\s*m\b/i)?.[1];
  if (mText !== undefined) {
    const m = parseInt(mText, 10);
    return { label: `${m}m`, meters: m };
  }

  return undefined;
}

/**
 * Classifies the fiber type from free text.
 *
 * @returns "MMF" when the text mentions multimode, MMF or an SR-family reach code
 *          (SR, SR4, SR8, ...); otherwise "SMF".
 */
function detectFiber(text: string): string {
  // `\d*` lets "SR4"/"SR8" match; a bare `sr\b` misses them because there is no
  // word boundary between the letter and the digit.
  if (/multi.?mode|mmf|sr\d*\b/i.test(text)) return "MMF";
  return "SMF"; // SmartOptics is almost exclusively SMF/coherent
}

/**
 * Downloads a page as text using the browser-like HEADERS and a 30 s timeout.
 *
 * @throws Error with the message `HTTP <status> for <url>` on a non-2xx response;
 *         fetch/abort errors propagate unchanged.
 */
async function fetchPage(url: string): Promise<string> {
  const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) });
  if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
  return resp.text();
}

/**
 * Extract all /product/xxx/ URLs from an HTML page.
 * Handles absolute (https://smartoptics.com/product/...) and root-relative
 * (/product/...) href patterns, plus WooCommerce data-permalink /
 * data-product-url attributes.
 *
 * @param html Raw HTML of a catalog page.
 * @param pageUrl URL the HTML was fetched from; relative links are resolved against
 *                it (BASE is used instead when it is not a valid absolute URL).
 * @returns De-duplicated, normalized product URLs.
 */
function extractProductUrls(html: string, pageUrl: string): string[] {
  const found = new Set<string>();

  let resolveBase = BASE;
  try {
    resolveBase = new URL(pageUrl).href;
  } catch {
    // pageUrl is not absolute; BASE is a sufficient base for site links.
  }

  // Runs one global regex over the page and records capture group 1 of every hit.
  const harvest = (pattern: RegExp): void => {
    let m: RegExpExecArray | null;
    while ((m = pattern.exec(html)) !== null) {
      const raw = m[1];
      if (raw === undefined) continue;
      try {
        found.add(normalizeProductUrl(new URL(raw, resolveBase).href));
      } catch {
        // Unparseable attribute value; skip it rather than abort the page.
      }
    }
  };

  // Absolute URLs
  harvest(/href="(https?:\/\/(?:www\.)?smartoptics\.com\/product\/[^"#?]+)"/gi);
  // Root-relative: href="/product/..."
  harvest(/href="(\/product\/[^"#?]+)"/gi);
  // WooCommerce data attributes: data-permalink or data-product-url
  harvest(/data-(?:permalink|product-url)="([^"]*\/product\/[^"]+)"/gi);

  // Filter out category pages — only keep individual product URLs
  return Array.from(found).filter((candidate) => {
    try {
      const path = new URL(candidate).pathname;
      // Must be /product/something — not /products/ (that's a category)
      return path.startsWith("/product/") && path.split("/").filter(Boolean).length >= 2;
    } catch {
      return false;
    }
  });
}

/**
 * Canonicalizes a product URL to `origin + path/`: query and fragment are dropped
 * and a trailing slash is enforced. Input that cannot be parsed is returned as-is.
 */
function normalizeProductUrl(url: string): string {
  try {
    const parsed = new URL(url);
    const path = parsed.pathname.endsWith("/") ? parsed.pathname : `${parsed.pathname}/`;
    return `${parsed.origin}${path}`;
  } catch {
    return url;
  }
}

/** Everything extracted from one product detail page. */
interface ProductData {
  sku: string;
  name: string;
  url: string;
  imageUrl?: string;
  datasheetUrl?: string;
  formFactor: string;
  speed: string;
  speedGbps: number;
  reachLabel?: string;
  reachMeters?: number;
  fiberType: string;
  coherent: boolean;
  wdmType?: string;
}

/**
 * Fetches one product page and derives its specs from the markup and product name.
 *
 * @returns The parsed product, or `null` when the page cannot be fetched or has no
 *          usable name (failures are logged, never thrown).
 */
async function scrapeProductPage(url: string): Promise<ProductData | null> {
  try {
    const html = await fetchPage(url);

    // Product name — try OG tag first (most reliable), then H1
    const nameMatch =
      html.match(/property="og:title"\s+content="([^"]+)"/) ||
      html.match(/content="([^"]+)"\s+property="og:title"/) ||
      html.match(/<h1[^>]*class="[^"]*(?:product_title|entry-title)[^"]*"[^>]*>([^<]+)<\/h1>/i) ||
      html.match(/<h1[^>]*>([^<]+)<\/h1>/);
    const rawName = nameMatch?.[1]?.trim() ?? "";
    const name = rawName
      .replace(/\s*\|\s*Smartoptics\s*$/, "")
      .replace(/\s*–\s*Smartoptics\s*$/, "")
      .trim();
    if (!name || name.length < 4) return null;

    // SKU — try WooCommerce SKU field first, then JSON-LD, then the .sku element
    const skuMatch =
      html.match(/(?:SKU|Artikelnummer)[^<]*<\/[^>]+>\s*<[^>]+>([A-Z0-9][-A-Z0-9./]{2,40})/i) ||
      html.match(/"sku"\s*:\s*"([^"]+)"/) ||
      html.match(/class="sku"[^>]*>([^<]+)</i);
    // NOTE(review): the fallback chain below was reconstructed — the original
    // statement was lost from this file. Confirm that URL slug, then name, is the
    // intended substitute when the page exposes no SKU.
    const slug = url.split("/").filter(Boolean).pop() ?? "";
    // `||` rather than `??` so that an empty/whitespace-only match also falls through.
    const sku = skuMatch?.[1]?.trim() || slug || name;

    // Main product image
    // NOTE(review): only the <img> pattern survived in this file; any earlier
    // alternatives (e.g. og:image) could not be recovered.
    const imgMatch = html.match(
      /<img[^>]+src="([^"]*wp-content\/uploads[^"]*\.(?:png|jpg|webp))"[^>]+class="[^"]*(?:wp-post-image|attachment-shop_single)[^"]*"/i
    );
    const imageUrl = imgMatch?.[1];

    // Datasheet PDF link: first .pdf href followed on the same line by a datasheet keyword
    const datasheetUrl = html.match(/href="([^"]*\.pdf)"[^>]*>.*?(?:datasheet|datenblatt|spec)/i)?.[1];

    const ff = detectFormFactor(name);
    const reach = detectReach(name);
    const pageText = html.slice(0, 5000); // only check first 5KB for coherent detection
    const coherent = /coherent|coh-t|coh\.|dp-qpsk|qpsk|cfp2/i.test(name + pageText);
    const wdmType = /dwdm/i.test(name) ? "DWDM" : /cwdm/i.test(name) ? "CWDM" : undefined;

    return {
      sku,
      name,
      url,
      imageUrl,
      datasheetUrl,
      ...ff,
      reachLabel: reach?.label,
      reachMeters: reach?.meters,
      fiberType: detectFiber(name),
      coherent,
      wdmType,
    };
  } catch (err) {
    console.warn(` Failed ${url}: ${errorMessage(err).slice(0, 80)}`);
    return null;
  }
}

/**
 * Try WooCommerce REST API for a complete product list.
 *
 * Best effort: any HTTP error, empty page or exception ends the loop and whatever
 * was collected so far is returned (possibly an empty array).
 *
 * NOTE(review): the WooCommerce docs describe `category` as a category ID, not a
 * slug, and wc/v3 normally requires credentials — confirm this endpoint ever
 * returns data for this site.
 */
async function tryWooCommerceApi(): Promise<string[]> {
  const urls: string[] = [];
  try {
    for (let page = 1; page <= 20; page++) {
      const apiUrl = `${BASE}/wp-json/wc/v3/products?per_page=100&page=${page}&category=optical-transceivers&status=publish`;
      const resp = await fetch(apiUrl, {
        headers: { ...HEADERS, Accept: "application/json" },
        signal: AbortSignal.timeout(10000),
      });
      if (!resp.ok) break;

      const payload: unknown = await resp.json();
      if (!Array.isArray(payload) || payload.length === 0) break;

      for (const item of payload as Array<{ permalink?: unknown; slug?: unknown } | null>) {
        if (typeof item?.permalink === "string" && item.permalink) {
          urls.push(normalizeProductUrl(item.permalink));
        } else if (typeof item?.slug === "string" && item.slug) {
          urls.push(normalizeProductUrl(`${BASE}/product/${item.slug}/`));
        }
      }

      if (payload.length < 100) break; // short page == last page
      await sleep(500);
    }
  } catch {
    // API not accessible — not unusual, fall through to HTML crawl
  }
  return urls;
}

/**
 * Runs the full scrape: discovers product URLs (REST API plus HTML category crawl),
 * scrapes every product page and stores each result via
 * `findOrCreateScrapedTransceiver`. Progress and a summary are written to the console.
 *
 * Per-page and per-record failures are logged and skipped; errors from
 * `ensureVendor` propagate to the caller.
 */
export async function scrapeSmartOptics(): Promise<void> {
  console.log("=== SmartOptics Scraper v2 Starting ===\n");
  console.log("Note: SmartOptics is B2B — no public prices. Scraping specs + catalog only.\n");

  const vendorId = await ensureVendor(
    "SmartOptics",
    "manufacturer",
    "https://www.smartoptics.com",
    "https://smartoptics.com/products/optical-transceivers/"
  );

  const productUrls = new Set<string>();

  // ── Try WooCommerce REST API first (fastest, most complete) ──────────────
  console.log("[1] Trying WooCommerce REST API…");
  const apiUrls = await tryWooCommerceApi();
  if (apiUrls.length > 0) {
    console.log(` API returned ${apiUrls.length} products`);
    for (const u of apiUrls) productUrls.add(u);
  } else {
    console.log(" API not accessible — falling back to HTML crawl");
  }

  // ── HTML catalog crawl (always run to catch any API misses) ───────────────
  console.log("[2] Crawling category pages…");
  for (const catPath of CATALOG_PAGES) {
    const catBase = `${BASE}${catPath}`;
    for (let page = 1; page <= 10; page++) {
      const pageUrl = page === 1 ? catBase : `${catBase}page/${page}/`;
      try {
        const html = await fetchPage(pageUrl);
        const found = extractProductUrls(html, pageUrl);
        // Empty first page: category doesn't exist. Empty later page: no more pages.
        if (found.length === 0) break;
        for (const u of found) productUrls.add(u);
        console.log(` ${catPath} p${page}: ${found.length} products`);
        await sleep(1200);
      } catch (err) {
        const msg = errorMessage(err);
        // A 404 just means the category or page does not exist; stay quiet about it.
        if (!msg.includes("404")) console.warn(` ${pageUrl}: ${msg.slice(0, 60)}`);
        break;
      }
    }
  }

  console.log(`\nTotal unique product URLs: ${productUrls.size}`);
  if (productUrls.size === 0) {
    console.warn("No products found — SmartOptics site structure may have changed");
    return;
  }

  // ── Scrape individual product pages ───────────────────────────────────────
  console.log("\n[3] Scraping product detail pages…");
  let saved = 0;
  let withImages = 0;
  let failed = 0;

  for (const url of productUrls) {
    const product = await scrapeProductPage(url);
    if (!product) {
      failed++;
      continue;
    }

    try {
      await findOrCreateScrapedTransceiver({
        partNumber: product.sku,
        vendorId,
        formFactor: product.formFactor,
        speedGbps: product.speedGbps,
        speed: product.speed,
        reachMeters: product.reachMeters,
        reachLabel: product.reachLabel,
        fiberType: product.fiberType,
        // Only DWDM parts get the DWDM label; previously CWDM parts were tagged
        // "DWDM-tunable" as well.
        wavelengths: product.wdmType === "DWDM" ? "DWDM-tunable" : undefined,
        category: product.coherent ? "Coherent" : "DataCenter",
        imageUrl: product.imageUrl,
      });
      saved++;
      if (product.imageUrl) withImages++;
      console.log(` ✓ ${product.sku.slice(0, 25).padEnd(25)} ${product.name.slice(0, 50)}`);
    } catch (err) {
      console.warn(` ✗ ${product.sku}: ${errorMessage(err).slice(0, 80)}`);
    }

    await sleep(1200);
  }

  console.log(`\n=== SmartOptics v2 Complete ===`);
  console.log(` Products discovered: ${productUrls.size}`);
  console.log(` Saved to DB: ${saved}`);
  console.log(` With images: ${withImages}`);
  if (failed > 0) console.warn(` Failed pages: ${failed}`);
}

// Run the scraper when this file is executed directly rather than imported.
if (require.main === module) {
  scrapeSmartOptics()
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      // Wait for the pool to close before exiting so in-flight work is not cut off.
      try {
        await pool.end();
      } catch {
        // Pool may already be closed (e.g. the failure came from pool.end() itself).
      }
      process.exit(1);
    });
}