From 204e99763ccbc4bc889d3a5dc25ec66548fda7cc Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 28 Mar 2026 01:02:34 +1300 Subject: [PATCH] feat: add Flexoptix product catalog scraper, register in CLI Scrapes flexoptix.net product catalog across 9 categories (SFP through OSFP). Extracts product names, prices, form factors, reach, fiber type, wavelength. CLI: --flexoptix flag, integrated into --all. --- packages/scraper/src/index.ts | 12 +- .../scraper/src/scrapers/flexoptix-catalog.ts | 244 ++++++++++++++++++ 2 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 packages/scraper/src/scrapers/flexoptix-catalog.ts diff --git a/packages/scraper/src/index.ts b/packages/scraper/src/index.ts index ce94f6d..d54c053 100644 --- a/packages/scraper/src/index.ts +++ b/packages/scraper/src/index.ts @@ -8,17 +8,27 @@ * tsx src/index.ts --cisco — Run Cisco TMG scraper once * tsx src/index.ts --optcore — Run Optcore scraper once * tsx src/index.ts --news — Run news aggregator once + * tsx src/index.ts --flexoptix — Run Flexoptix catalog scraper once + * tsx src/index.ts --vendors — Run Flexoptix vendor list scraper once */ import { createScheduler, registerSchedules, registerWorkers } from "./scheduler"; import { scrapeFs } from "./scrapers/fs-com"; import { scrapeCiscoTmg } from "./scrapers/cisco-tmg"; import { scrapeOptcore } from "./scrapers/optcore"; import { scrapeNews } from "./scrapers/news"; +import { scrapeFlexoptixCatalog } from "./scrapers/flexoptix-catalog"; +import { scrapeFlexoptixVendors } from "./scrapers/flexoptix-vendors"; import { pool } from "./utils/db"; const args = process.argv.slice(2); async function runOnce(): Promise { + if (args.includes("--flexoptix") || args.includes("--all")) { + await scrapeFlexoptixCatalog(); + } + if (args.includes("--vendors") || args.includes("--all")) { + await scrapeFlexoptixVendors(); + } if (args.includes("--fs") || args.includes("--all")) { await scrapeFs(); } @@ -56,7 +66,7 @@ async function runScheduler(): Promise { process.on("SIGTERM", shutdown); } -if (args.some((a) => ["--all", "--fs", "--cisco", "--optcore", "--news"].includes(a))) { +if (args.some((a) => ["--all", "--fs", "--cisco", "--optcore", "--news", "--flexoptix", "--vendors"].includes(a))) { runOnce().catch((err) => { console.error("Fatal:", err); process.exit(1); diff --git a/packages/scraper/src/scrapers/flexoptix-catalog.ts b/packages/scraper/src/scrapers/flexoptix-catalog.ts new file mode 100644 index 0000000..d3e8264 --- /dev/null +++ b/packages/scraper/src/scrapers/flexoptix-catalog.ts @@ -0,0 +1,244 @@ +/** + * Flexoptix Product Catalog Scraper + * + * Scrapes flexoptix.net product catalog for transceiver specs and pricing. + * This is our own data — no restrictions. + * + * Categories: SFP, SFP+, SFP28, QSFP+, QSFP28, QSFP-DD, OSFP, XFP, CFP + * + * Uses standard fetch (server-rendered HTML). Rate limited: 1 req/sec. + */ +import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation } from "../utils/db"; +import { contentHash } from "../utils/hash"; + +const BASE = "https://www.flexoptix.net"; +const HEADERS = { + "User-Agent": "Mozilla/5.0 (compatible; TIP-Bot/1.0; internal-flexoptix)", + Accept: "text/html,application/xhtml+xml", +}; + +const CATEGORIES = [ + { path: "/en/sfp/", formFactor: "SFP", speed: "1G", speedGbps: 1 }, + { path: "/en/sfp-plus/", formFactor: "SFP+", speed: "10G", speedGbps: 10 }, + { path: "/en/sfp28/", formFactor: "SFP28", speed: "25G", speedGbps: 25 }, + { path: "/en/qsfp-plus/", formFactor: "QSFP+", speed: "40G", speedGbps: 40 }, + { path: "/en/qsfp28/", formFactor: "QSFP28", speed: "100G", speedGbps: 100 }, + { path: "/en/qsfp-dd/", formFactor: "QSFP-DD", speed: "400G", speedGbps: 400 }, + { path: "/en/osfp/", formFactor: "OSFP", speed: "400G", speedGbps: 400 }, + { path: "/en/xfp/", formFactor: "XFP", speed: "10G", speedGbps: 10 }, + { path: "/en/cfp/", formFactor: "CFP2", speed: "100G", speedGbps: 100 }, +]; + +interface Product { + name: string; + partNumber: string; + url: string; + price?: number; + currency?: string; + formFactor: string; + speed: string; + speedGbps: number; + reachLabel?: string; + reachMeters?: number; + fiberType?: string; + wavelength?: string; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function detectReach(text: string): { label: string; meters: number } | undefined { + const patterns: [RegExp, string, number][] = [ + [/\b80\s*km\b/i, "80km", 80000], + [/\b40\s*km\b/i, "40km", 40000], + [/\b20\s*km\b/i, "20km", 20000], + [/\b10\s*km\b/i, "10km", 10000], + [/\b2\s*km\b/i, "2km", 2000], + [/\b500\s*m\b/i, "500m", 500], + [/\b300\s*m\b/i, "300m", 300], + [/\b100\s*m\b/i, "100m", 100], + [/\bLR4\b/, "10km", 10000], + [/\bLR\b/, "10km", 10000], + [/\bER\b/, "40km", 40000], + [/\bZR\b/, "80km", 80000], + [/\bSR4?\b/, "100m", 100], + [/\bDR4?\b/, "500m", 500], + [/\bFR4?\b/, "2km", 2000], + [/\bCWDM4\b/i, "2km", 2000], + [/\bPSM4\b/i, "500m", 500], + ]; + for (const [regex, label, meters] of patterns) { + if (regex.test(text)) return { label, meters }; + } + return undefined; +} + +function detectFiber(text: string): string { + const t = text.toLowerCase(); + if (/single.?mode|smf|[^a-z]lx[^a-z]|[^a-z]lr[^a-z]|[^a-z]er[^a-z]|[^a-z]zr[^a-z]|bidi|cwdm|dwdm/i.test(t)) return "SMF"; + if (/multi.?mode|mmf|[^a-z]sx[^a-z]|[^a-z]sr[^a-z]/i.test(t)) return "MMF"; + return ""; +} + +function detectWavelength(text: string): string { + const match = text.match(/(\d{3,4})\s*nm/i); + if (match) return match[1]; + return ""; +} + +function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product[] { + const products: Product[] = []; + + // Shopware product box pattern + const itemRegex = /class="[^"]*product-(?:box|item|card|info|name)[^"]*"[\s\S]*?href="(\/en\/[^"]*?\.html)"[^>]*>[\s\S]*?<\/(?:div|article|li)>/gi; + let match; + while ((match = itemRegex.exec(html)) !== null) { + const block = match[0]; + const url = match[1]; + + const titleMatch = block.match(/class="[^"]*product-(?:name|title)[^"]*"[^>]*>([^<]+)/i) + || block.match(/]*>\s*([^<]{5,})<\/a>/i); + if (!titleMatch) continue; + + const name = titleMatch[1].trim(); + if (!name || name.length < 3) continue; + + const priceMatch = block.match(/(?:€|EUR)\s*([\d.,]+)/i) || block.match(/([\d.,]+)\s*(?:€|EUR)/i); + const price = priceMatch ? parseFloat(priceMatch[1].replace(",", ".")) : undefined; + const partNum = name.replace(/\s+/g, "-").slice(0, 80); + const reach = detectReach(name); + + products.push({ + name, partNumber: partNum, url: BASE + url, + price, currency: price ? "EUR" : undefined, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + }); + } + + // Fallback: simple link extraction + if (products.length === 0) { + const simpleRegex = /href="(\/en\/(?:sfp|qsfp|osfp|xfp|cfp)[^"]*?\.html)"[^>]*>\s*([^<]{5,})/gi; + while ((match = simpleRegex.exec(html)) !== null) { + const url = match[1]; + const name = match[2].trim(); + if (products.find((p) => p.url === BASE + url)) continue; + const reach = detectReach(name); + products.push({ + name, partNumber: name.replace(/\s+/g, "-").slice(0, 80), url: BASE + url, + formFactor: cat.formFactor, speed: cat.speed, speedGbps: cat.speedGbps, + reachLabel: reach?.label, reachMeters: reach?.meters, + fiberType: detectFiber(name), wavelength: detectWavelength(name), + }); + } + } + + return products; +} + +function getMaxPage(html: string): number { + const pageMatches = html.match(/[?&]p=(\d+)/g); + if (!pageMatches) return 1; + let max = 1; + for (const m of pageMatches) { + const n = parseInt(m.replace(/[?&]p=/, "")); + if (n > max) max = n; + } + return Math.min(max, 50); +} + +async function fetchPage(url: string): Promise { + const resp = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(30000) }); + if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`); + return resp.text(); +} + +export async function scrapeFlexoptixCatalog(): Promise { + console.log("=== Flexoptix Catalog Scraper Starting ===\n"); + + const vendorId = await ensureVendor("Flexoptix", "reseller", "https://www.flexoptix.net", "https://www.flexoptix.net/en/"); + + let totalProducts = 0; + let priceUpdates = 0; + + for (const cat of CATEGORIES) { + console.log(`\n--- ${cat.formFactor} (${cat.speed}) ---`); + + try { + const firstPage = await fetchPage(BASE + cat.path); + const maxPage = getMaxPage(firstPage); + console.log(` Pages: ${maxPage}`); + + let catProducts: Product[] = parseProductList(firstPage, cat); + + for (let page = 2; page <= maxPage; page++) { + await sleep(1000); + try { + const html = await fetchPage(`${BASE}${cat.path}?p=${page}`); + catProducts.push(...parseProductList(html, cat)); + } catch (err) { + console.warn(` Page ${page} failed: ${(err as Error).message}`); + } + } + + // Dedupe by URL + const seen = new Set(); + catProducts = catProducts.filter((p) => { + if (seen.has(p.url)) return false; + seen.add(p.url); + return true; + }); + + console.log(` Found ${catProducts.length} products`); + + for (const product of catProducts) { + try { + const txId = await findOrCreateScrapedTransceiver({ + partNumber: product.partNumber, + vendorId, + formFactor: product.formFactor, + speedGbps: product.speedGbps, + speed: product.speed, + reachMeters: product.reachMeters, + reachLabel: product.reachLabel, + fiberType: product.fiberType, + wavelengths: product.wavelength, + category: "DataCenter", + }); + + if (product.price && product.price > 0) { + const hash = contentHash(JSON.stringify({ price: product.price, part: product.partNumber })); + const updated = await upsertPriceObservation({ + transceiverId: txId, + sourceVendorId: vendorId, + price: product.price, + currency: product.currency || "EUR", + stockLevel: "in_stock", + url: product.url, + contentHash: hash, + }); + if (updated) priceUpdates++; + } + + totalProducts++; + } catch (err) { + console.warn(` Error: ${(err as Error).message.slice(0, 80)}`); + } + } + } catch (err) { + console.error(` Category failed: ${(err as Error).message}`); + } + + await sleep(2000); + } + + console.log(`\n=== Flexoptix Catalog Complete: ${totalProducts} products, ${priceUpdates} prices ===`); +} + +if (require.main === module) { + scrapeFlexoptixCatalog() + .then(() => pool.end()) + .catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); }); +}