/**
 * UfiSpace Product Catalog Scraper
 *
 * Scrapes switch product pages from ufispace.com for specs and compatibility.
 * UfiSpace publishes clean, well-structured product pages.
 *
 * Source: https://www.ufispace.com/products/datacenter-switches
 */

import { CheerioCrawler } from "crawlee";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import { pool, ensureWhiteboxVendor, findOrCreateSwitch } from "../utils/db";

const BASE_URL = "https://www.ufispace.com";

/** Seed pages the crawl starts from. */
const PRODUCT_URLS = [
  `${BASE_URL}/products/datacenter-switches`,
  `${BASE_URL}/networking-white-box`,
];

/**
 * Matches "<count> x <speed>G[bE|b/s] [form factor]" port descriptions.
 *
 * The negative lookahead right after "G" rejects memory/storage capacities
 * such as "2 x 16GB DDR4" or "1 x 128GB SSD", which would otherwise be counted
 * as ports. Its keywords mirror the RAM/storage patterns used further down;
 * only spaces/tabs are skipped so a keyword on the next line does not veto a
 * genuine port entry.
 */
const PORT_PATTERN =
  /(\d+)\s*x\s*(\d+)\s*G(?!B?[ \t]*(?:DDR\d|RAM\b|memory|SSD|eMMC|M\.2))(?:bE|b\/s)?\s*(QSFP-DD|QSFP28|QSFP\+|QSFP56|SFP28|SFP\+|SFP56|OSFP|CFP2)?/gi;

/** Hrefs/URLs that point at an individual product page. */
const PRODUCT_LINK_PATTERN = /\/S9[0-9]+-|\/product\//i;

/** Model names such as S9600-32X or S9700-53DX. */
const MODEL_PATTERN = /(S\d{4}-\d+[A-Z]*)/i;

interface AsicInfo {
  vendor: string;
  model: string;
  series: string;
}

/**
 * ASIC name patterns, most specific first: the first matching entry wins, so
 * the bare "tomahawk" pattern must stay after the numbered generations.
 * Tomahawk/Trident are Broadcom StrataXGS parts; Jericho is StrataDNX.
 */
const ASIC_PATTERNS: ReadonlyArray<AsicInfo & { pattern: RegExp }> = [
  { pattern: /tomahawk\s*5/i, vendor: "Broadcom", model: "Tomahawk 5", series: "StrataXGS" },
  { pattern: /tomahawk\s*4/i, vendor: "Broadcom", model: "Tomahawk 4", series: "StrataXGS" },
  { pattern: /tomahawk\s*3/i, vendor: "Broadcom", model: "Tomahawk 3", series: "StrataXGS" },
  { pattern: /tomahawk\s*2/i, vendor: "Broadcom", model: "Tomahawk 2", series: "StrataXGS" },
  { pattern: /tomahawk/i, vendor: "Broadcom", model: "Tomahawk", series: "StrataXGS" },
  { pattern: /trident\s*(3|iii)/i, vendor: "Broadcom", model: "Trident III", series: "StrataXGS" },
  { pattern: /jericho\s*2/i, vendor: "Broadcom", model: "Jericho2", series: "StrataDNX" },
  { pattern: /spectrum/i, vendor: "NVIDIA", model: "Spectrum", series: "Spectrum" },
];

/**
 * Extracts the port layout from free-form specification text.
 *
 * @param specText Text to scan for "<count> x <speed>G <form factor>" entries.
 * @returns `portsConfig` keyed by `<speed>G_<form factor>` (the form factor
 *   falls back to `<speed>G` when none is stated), the summed port count, the
 *   highest speed seen, and the distinct upper-cased form factors in
 *   first-seen order. All counters are zero when nothing matches.
 */
function extractPortsFromSpec(specText: string): {
  portsConfig: Record<string, number>;
  totalPorts: number;
  maxSpeedGbps: number;
  formFactors: string[];
} {
  const portsConfig: Record<string, number> = {};
  const formFactors = new Set<string>();
  let totalPorts = 0;
  let maxSpeedGbps = 0;

  // matchAll iterates over a private copy of the regex, so the shared
  // module-level pattern's lastIndex is never mutated.
  for (const match of specText.matchAll(PORT_PATTERN)) {
    const count = Number.parseInt(match[1], 10);
    const speed = Number.parseInt(match[2], 10);
    const formFactor = match[3]?.toUpperCase();

    const key = `${speed}G_${formFactor ?? `${speed}G`}`;
    portsConfig[key] = (portsConfig[key] ?? 0) + count;
    totalPorts += count;
    maxSpeedGbps = Math.max(maxSpeedGbps, speed);

    if (formFactor) {
      formFactors.add(formFactor);
    }
  }

  return { portsConfig, totalPorts, maxSpeedGbps, formFactors: [...formFactors] };
}

/**
 * Identifies the switching ASIC mentioned in the given text.
 *
 * @param text Page text to search.
 * @returns The first matching entry of the pattern table, or a Broadcom
 *   "Unknown" placeholder with an empty series when nothing matches.
 */
function detectAsic(text: string): AsicInfo {
  const hit = ASIC_PATTERNS.find(({ pattern }) => pattern.test(text));
  if (!hit) {
    return { vendor: "Broadcom", model: "Unknown", series: "" };
  }
  return { vendor: hit.vendor, model: hit.model, series: hit.series };
}

/**
 * Filters raw anchor hrefs down to product links and resolves them to
 * absolute, de-duplicated URLs.
 *
 * Resolution goes through the URL constructor so relative ("x/S9600-32X"),
 * root-relative and protocol-relative hrefs all produce valid URLs; hrefs the
 * constructor rejects are skipped.
 */
function resolveProductLinks(hrefs: readonly string[], pageUrl: string): string[] {
  const links = new Set<string>();
  for (const href of hrefs) {
    if (!PRODUCT_LINK_PATTERN.test(href)) continue;
    try {
      links.add(new URL(href, pageUrl).href);
    } catch {
      // Malformed href: nothing usable to enqueue.
    }
  }
  return [...links];
}

/** Parses the first capture group of a match as a base-10 integer. */
function firstInt(match: RegExpMatchArray | null): number | undefined {
  return match ? Number.parseInt(match[1], 10) : undefined;
}

interface ParsedSwitch {
  model: string;
  series: string;
  category: "DataCenter" | "Edge" | "SP";
  portInfo: ReturnType<typeof extractPortsFromSpec>;
  asicInfo: AsicInfo;
  switchingCapacityTbps: number | undefined;
  maxPowerW: number | undefined;
  cpu: string | undefined;
  ramGb: number | undefined;
  storageGb: number | undefined;
}

/**
 * Derives switch specs from a product page's title and body text.
 *
 * @returns `undefined` when the page has no title, no recognisable model
 *   name, or no port descriptions; such pages are skipped by the caller.
 */
function parseProductPage(title: string, pageText: string): ParsedSwitch | undefined {
  if (!title) return undefined;

  // Extract model name (S9600-32X, S9700-53DX, etc.), preferring the title.
  const modelMatch = title.match(MODEL_PATTERN) ?? pageText.match(MODEL_PATTERN);
  if (!modelMatch) return undefined;

  // The pattern is case-insensitive; normalise so the series extraction and
  // the database lookup see one canonical spelling.
  const model = modelMatch[1].toUpperCase();

  const portInfo = extractPortsFromSpec(pageText);
  if (portInfo.totalPorts === 0) return undefined;

  const asicInfo = detectAsic(pageText);

  const powerMatch = pageText.match(/(?:max|maximum)\s*power[:\s]*(\d+)\s*W/i);
  const cpuMatch = pageText.match(/(Intel\s+(?:Xeon|Atom|Core)[^\n,;]+)/i);
  const ramMatch = pageText.match(/(\d+)\s*GB?\s*(?:DDR[34]|RAM|memory)/i);
  const storageMatch = pageText.match(/(\d+)\s*GB?\s*(?:SSD|eMMC|M\.2)/i);
  const switchCapMatch = pageText.match(/switching\s*capacity[:\s]*([\d.]+)\s*Tb/i);

  const seriesMatch = model.match(/^(S\d{4})/);
  const series = seriesMatch ? seriesMatch[1] : "";

  // Determine category based on model/series.
  const isEdge = model.includes("9510") || pageText.toLowerCase().includes("cell site");
  const category: ParsedSwitch["category"] = isEdge ? "Edge" : "DataCenter";

  return {
    model,
    series,
    category,
    portInfo,
    asicInfo,
    switchingCapacityTbps: switchCapMatch ? parseFloat(switchCapMatch[1]) : undefined,
    maxPowerW: firstInt(powerMatch),
    cpu: cpuMatch ? cpuMatch[1].trim() : undefined,
    ramGb: firstInt(ramMatch),
    storageGb: firstInt(storageMatch),
  };
}

/**
 * Crawls the UfiSpace catalog and upserts every switch it can parse.
 *
 * Registers the vendor, follows product links found on catalog pages, and
 * passes each parsed product to `findOrCreateSwitch`. Created/updated counts
 * are logged at the end; pages that cannot be parsed are skipped.
 */
export async function scrapeUfiSpace(): Promise<void> {
  console.log("\n=== UfiSpace Scraper ===\n");

  const vendorId = await ensureWhiteboxVendor("UfiSpace", "https://www.ufispace.com", {
    isOdm: true,
    ocpMember: true,
    sonicContributor: true,
  });

  let created = 0;
  let updated = 0;

  const crawler = new CheerioCrawler(
    {
      maxConcurrency: 2,
      maxRequestsPerMinute: 15,
      requestHandlerTimeoutSecs: 30,

      async requestHandler({ request, $, enqueueLinks }) {
        const url = request.url;
        const isCatalogPath = url.includes("products/") || url.includes("networking-white-box");

        // Product list pages — enqueue individual products
        if (isCatalogPath) {
          console.log(` Parsing: ${url}`);

          // Look for links to individual product pages
          const hrefs: string[] = [];
          $("a").each((_i, el) => {
            const href = $(el).attr("href");
            if (href) {
              hrefs.push(href);
            }
          });

          const productLinks = resolveProductLinks(hrefs, request.loadedUrl ?? url);
          console.log(` Found ${productLinks.length} product links`);

          if (productLinks.length > 0) {
            await enqueueLinks({ urls: productLinks });
          }

          // A product page can itself live under a catalog path (e.g.
          // /products/.../S9600-32X); only pure listing pages stop here.
          if (!PRODUCT_LINK_PATTERN.test(url)) return;
        }

        // Individual product page
        const pageText = $("body").text();
        const title = $("h1, .product-title").first().text().trim();

        const parsed = parseProductPage(title, pageText);
        if (!parsed) return;

        const { model, series, category, portInfo, asicInfo } = parsed;

        // Looked up before the upsert purely to report created vs. updated.
        const existing = await pool.query(
          `SELECT id FROM switches WHERE model = $1 AND vendor_id = $2`,
          [model, vendorId]
        );
        const isNew = existing.rows.length === 0;

        await findOrCreateSwitch({
          model,
          vendorId,
          series,
          category,
          layer: "L3",
          portsConfig: portInfo.portsConfig,
          totalPorts: portInfo.totalPorts,
          maxSpeedGbps: portInfo.maxSpeedGbps,
          switchingCapacityTbps: parsed.switchingCapacityTbps,
          asicVendor: asicInfo.vendor,
          asicModel: asicInfo.model,
          asicSeries: asicInfo.series,
          maxPowerW: parsed.maxPowerW,
          cpu: parsed.cpu,
          ramGb: parsed.ramGb,
          storageGb: parsed.storageGb,
          sonicCompatible: true,
          isWhitebox: true,
          onieSupport: true,
          supportedNos: ["SONiC"],
          transceiverFormFactors: portInfo.formFactors,
          catalogUrl: request.url,
          tags: [
            "whitebox",
            "UfiSpace",
            `${portInfo.maxSpeedGbps}G`,
            asicInfo.model,
            ...(category === "Edge" ? ["cell-site", "DCSG"] : []),
          ],
          scrapeSource: "ufispace-catalog",
        });

        if (isNew) {
          created++;
          console.log(` + ${model} (${portInfo.maxSpeedGbps}G, ${asicInfo.vendor} ${asicInfo.model})`);
        } else {
          updated++;
        }
      },

      failedRequestHandler({ request }, error) {
        // Include the failure cause so a broken page can be diagnosed from logs.
        console.error(` ! Failed: ${request.url} (${error.message})`);
      },
    },
    makeCrawleeConfig("ufispace")
  );

  await crawler.run(PRODUCT_URLS);

  console.log(`\n Created: ${created}, Updated: ${updated}\n`);
}