/**
 * Switch Assets Scraper — Playwright-based for JS-heavy vendor sites
 *
 * Cisco, Arista, HPE/Aruba, Dell, and Extreme require JavaScript rendering
 * to access product pages, datasheets, and images.
 *
 * Uses PlaywrightCrawler for full browser rendering.
 */
import { PlaywrightCrawler } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import {
  downloadSwitchImage,
  downloadSwitchDatasheet,
  downloadSwitchManual,
  setSwitchProductPage,
} from "../utils/assets";

/** One product page to crawl; attached to each crawler request as `userData`. */
interface CrawlTarget {
  switchId: string;
  vendorId: string;
  vendorName: string;
  model: string;
  productPageUrl: string;
}

/**
 * Columns this module reads from the switches/vendors query.
 * NOTE(review): field types mirror `CrawlTarget`; they are asserted, not
 * validated against the database schema.
 */
interface SwitchRow {
  id: string;
  model: string;
  vendor_name: string;
  vendor_id: string;
  product_page_url: string | null;
}

/** A PDF anchor found on a product page: absolute href plus trimmed link text. */
interface PdfLink {
  href: string;
  text: string;
}

/** Vendor name fragments matched (case-insensitively) when no vendor filter is given. */
const JS_VENDORS: readonly string[] = ["Cisco", "Arista", "HPE", "Aruba", "Dell", "Extreme"];

/** Maximum number of manual/guide PDFs downloaded per product page. */
const MAX_MANUALS_PER_PAGE = 3;

// ═══════════════════════════════════════════════════════
// Vendor-specific product page URL builders
// ═══════════════════════════════════════════════════════

/**
 * Cisco model-prefix → landing page rules, evaluated top to bottom.
 * Order matters: the specific Catalyst prefixes (C93/C92/C95) must come
 * before the generic "C9" catch-all.
 */
const CISCO_URL_RULES: ReadonlyArray<{ prefixes: readonly string[]; url: string }> = [
  {
    // Nexus — use datasheet listing page for JS-rendered content
    prefixes: ["N9K-", "N3K-"],
    url: "https://www.cisco.com/c/en/us/products/switches/nexus-9000-series-switches/datasheet-listing.html",
  },
  {
    prefixes: ["C93"],
    url: "https://www.cisco.com/c/en/us/products/switches/catalyst-9300-series-switches/datasheet-listing.html",
  },
  {
    prefixes: ["C92"],
    url: "https://www.cisco.com/c/en/us/products/switches/catalyst-9200-series-switches/index.html",
  },
  {
    prefixes: ["C95"],
    url: "https://www.cisco.com/c/en/us/products/switches/catalyst-9500-series-switches/index.html",
  },
  {
    prefixes: ["C9"],
    url: "https://www.cisco.com/c/en/us/products/switches/catalyst-9000/index.html",
  },
  {
    prefixes: ["NCS-", "81"],
    url: "https://www.cisco.com/c/en/us/products/routers/network-convergence-system-5500-series/index.html",
  },
];

/** Landing page used for Cisco models that match none of the prefix rules. */
const CISCO_FALLBACK_URL =
  "https://www.cisco.com/site/us/en/products/networking/cloud-networking-switches/index.html";

/**
 * Picks the Cisco series landing page for a model based on its prefix.
 *
 * @param model Cisco model string, e.g. one starting with "N9K-" or "C93".
 * @returns The URL of the first matching rule, or the generic fallback page.
 */
function buildCiscoUrl(model: string): string {
  const rule = CISCO_URL_RULES.find(({ prefixes }) =>
    prefixes.some((prefix) => model.startsWith(prefix))
  );
  return rule?.url ?? CISCO_FALLBACK_URL;
}

/**
 * Builds an Arista series page URL.
 *
 * Arista product pages: /en/products/{series}-series (no hyphens in series name).
 * The series is the leading "4 digits + optional letters + optional digits"
 * of the model; if the model does not start that way the whole model is used.
 */
function buildAristaUrl(model: string): string {
  const series = model.match(/^(\d{4}[A-Z]*\d*)/)?.[1] || model;
  const slug = series.toLowerCase().replace(/[^a-z0-9]/g, "");
  return `https://www.arista.com/en/products/${slug}-series`;
}

/**
 * Builds an HPE/Aruba CX series page URL from the number following "CX".
 *
 * When the model contains no "CX <number>" token the parent switches listing
 * is returned instead of a malformed ".../cx--series/" URL.
 */
function buildHpeUrl(model: string): string {
  const base = "https://www.arubanetworks.com/products/switches/";
  const seriesNum = model.match(/CX\s*(\d+)/)?.[1];
  return seriesNum ? `${base}cx-${seriesNum}-series/` : base;
}

/** Builds a Dell shop URL; the model is lower-cased and whitespace runs become hyphens. */
function buildDellUrl(model: string): string {
  const slug = model.toLowerCase().replace(/\s+/g, "-");
  return `https://www.dell.com/en-us/shop/networking-switches/${slug}/spd/${slug}`;
}

/** Builds an Extreme product URL; every non-alphanumeric character becomes a hyphen. */
function buildExtremeUrl(model: string): string {
  const slug = model.toLowerCase().replace(/[^a-z0-9]/g, "-");
  return `https://www.extremenetworks.com/product/${slug}`;
}

/**
 * Dispatches to the vendor-specific URL builder by case-insensitive
 * substring match on the vendor name.
 *
 * @returns A guessed product page URL, or `null` for vendors not handled here.
 */
function buildJsVendorUrl(vendorName: string, model: string): string | null {
  const lower = vendorName.toLowerCase();
  if (lower.includes("cisco")) return buildCiscoUrl(model);
  if (lower.includes("arista")) return buildAristaUrl(model);
  if (lower.includes("hpe") || lower.includes("aruba")) return buildHpeUrl(model);
  if (lower.includes("dell")) return buildDellUrl(model);
  if (lower.includes("extreme")) return buildExtremeUrl(model);
  return null;
}

// ═══════════════════════════════════════════════════════
// PDF link classification
// ═══════════════════════════════════════════════════════

/** True when the link text or href mentions a datasheet. */
function isDatasheetLink(link: PdfLink): boolean {
  const text = link.text.toLowerCase();
  const href = link.href.toLowerCase();
  return (
    text.includes("datasheet") ||
    text.includes("data sheet") ||
    href.includes("datasheet") ||
    href.includes("data-sheet")
  );
}

/** True when the link text looks like a manual, guide, or reference document. */
function isManualLink(link: PdfLink): boolean {
  const text = link.text.toLowerCase();
  return ["guide", "manual", "reference", "quick start", "installation"].some((kw) =>
    text.includes(kw)
  );
}

/**
 * Maps manual link text to a manual type.
 *
 * Precedence (highest first): installation guide, CLI reference, quick start,
 * then plain "manual". "cli" is matched as a whole word so that words such as
 * "client" do not classify a document as a CLI reference.
 */
function classifyManualType(text: string): string {
  const lower = text.toLowerCase();
  if (lower.includes("installation")) return "installation_guide";
  if (/\bcli\b/.test(lower) || lower.includes("reference")) return "cli_reference";
  if (lower.includes("quick start")) return "quick_start";
  return "manual";
}

// ═══════════════════════════════════════════════════════
// Playwright-based asset extraction
// ═══════════════════════════════════════════════════════

/**
 * Crawls product pages for switches that lack an image or datasheet and
 * downloads the image, datasheet, and up to three manuals found on each page.
 *
 * Up to 100 switches are selected per run. Each switch's stored
 * `product_page_url` is used when present; otherwise a URL is guessed from
 * the vendor name and model.
 *
 * @param targetVendor Optional vendor name fragment (case-insensitive
 *   substring match). When omitted or empty, all JS-heavy vendors are crawled.
 *   The value is bound as a query parameter; `%` and `_` in it still act as
 *   LIKE wildcards.
 */
export async function crawlSwitchAssetsPlaywright(targetVendor?: string): Promise<void> {
  console.log("=== Switch Assets Crawler (Playwright) ===\n");

  // Bind the patterns as a parameter instead of splicing them into the SQL
  // text: targetVendor comes from the command line and must not be trusted.
  const vendorPatterns = (targetVendor ? [targetVendor] : JS_VENDORS).map((v) => `%${v}%`);

  const result = await pool.query(
    `
    SELECT sw.id, sw.model, sw.series, sw.product_page_url,
           v.name as vendor_name, v.id as vendor_id
    FROM switches sw
    JOIN vendors v ON sw.vendor_id = v.id
    WHERE (sw.image_url IS NULL OR sw.datasheet_url IS NULL)
      AND v.name ILIKE ANY($1::text[])
    ORDER BY v.name, sw.model
    LIMIT 100
  `,
    [vendorPatterns]
  );

  if (result.rows.length === 0) {
    console.log("No JS-vendor switches need asset scraping.\n");
    return;
  }

  const targets: CrawlTarget[] = [];
  for (const row of result.rows as SwitchRow[]) {
    // `||` on purpose: an empty stored URL is treated the same as a missing one.
    const productPageUrl = row.product_page_url || buildJsVendorUrl(row.vendor_name, row.model);
    if (!productPageUrl) continue;
    targets.push({
      switchId: row.id,
      vendorId: row.vendor_id,
      vendorName: row.vendor_name,
      model: row.model,
      productPageUrl,
    });
  }

  console.log(`Crawling ${targets.length} JS-heavy product pages...\n`);

  const counts = { images: 0, datasheets: 0, manuals: 0 };

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 2,
      maxRequestsPerMinute: 10,
      requestHandlerTimeoutSecs: 60,
      headless: true,
      launchContext: {
        launchOptions: {
          args: ["--no-sandbox", "--disable-setuid-sandbox"],
        },
      },

      async requestHandler({ request, page }) {
        const target = request.userData as CrawlTarget;
        console.log(`  ${target.vendorName} ${target.model}:`);

        // Best effort: pages that never reach network idle are still scraped
        // with whatever has rendered after 15 seconds.
        await page.waitForLoadState("networkidle", { timeout: 15000 }).catch(() => {});

        // Set product page URL
        await setSwitchProductPage(target.switchId, request.url);

        // Extract og:image or first large product image.
        // This callback runs in the browser, so it must be self-contained.
        const imageUrl = await page.evaluate(() => {
          const ogImage = document
            .querySelector('meta[property="og:image"]')
            ?.getAttribute("content")
            ?.trim();
          if (ogImage) {
            try {
              // og:image may be relative; resolve it so the download gets an absolute URL.
              return new URL(ogImage, document.baseURI).href;
            } catch {
              // Unparseable og:image value — fall back to scanning <img> elements.
            }
          }

          const productImg = Array.from(document.querySelectorAll("img")).find((img) => {
            const src = img.src || "";
            const alt = (img.alt || "").toLowerCase();
            const looksLikeProduct =
              src.includes("product") || alt.includes("switch") || alt.includes("router");
            return looksLikeProduct && img.naturalWidth > 200;
          });
          return productImg?.src || null;
        });

        if (imageUrl) {
          const ok = await downloadSwitchImage(
            target.switchId,
            imageUrl,
            target.vendorName,
            target.model
          );
          if (ok) {
            counts.images++;
            console.log(`    ✓ Image`);
          }
        }

        // Collect every PDF link once; datasheet and manuals are picked from it.
        const pdfLinks: PdfLink[] = await page.evaluate(() =>
          Array.from(document.querySelectorAll('a[href*=".pdf"]')).map((a) => ({
            href: (a as HTMLAnchorElement).href,
            text: a.textContent?.trim() || "",
          }))
        );

        const datasheetLink = pdfLinks.find(isDatasheetLink);
        if (datasheetLink) {
          const ok = await downloadSwitchDatasheet(
            target.switchId,
            target.vendorId,
            datasheetLink.href,
            datasheetLink.text || `${target.model} Datasheet`,
            target.vendorName,
            target.model
          );
          if (ok) {
            counts.datasheets++;
            console.log(`    ✓ Datasheet`);
          }
        }

        // Pick the first few distinct manual/guide links; pages often link the
        // same PDF more than once and duplicates would waste the per-page cap.
        const manualLinks: PdfLink[] = [];
        const seenHrefs = new Set<string>();
        for (const link of pdfLinks) {
          if (manualLinks.length >= MAX_MANUALS_PER_PAGE) break;
          if (!isManualLink(link) || seenHrefs.has(link.href)) continue;
          seenHrefs.add(link.href);
          manualLinks.push(link);
        }

        for (const manual of manualLinks) {
          const type = classifyManualType(manual.text);
          const ok = await downloadSwitchManual(
            target.switchId,
            target.vendorId,
            manual.href,
            manual.text,
            type,
            target.vendorName,
            target.model
          );
          if (ok) {
            counts.manuals++;
            console.log(`    ✓ ${type}: ${manual.text}`);
          }
        }
      },

      async failedRequestHandler({ request }) {
        const target = request.userData as CrawlTarget;
        console.log(`  [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
      },
    },
    makeCrawleeConfig("switch-assets-playwright")
  );

  await crawler.run(
    targets.map((t) => ({
      url: t.productPageUrl,
      userData: t,
    }))
  );

  console.log(`\n=== Playwright Crawl Complete ===`);
  console.log(`  Images: ${counts.images}`);
  console.log(`  Datasheets: ${counts.datasheets}`);
  console.log(`  Manuals: ${counts.manuals}`);
}

// CLI entry point: `--vendor=<name>` optionally restricts the crawl to one vendor.
if (require.main === module) {
  const VENDOR_FLAG = "--vendor=";
  // Slice rather than split so a value containing "=" is kept intact.
  const vendor = process.argv
    .find((arg) => arg.startsWith(VENDOR_FLAG))
    ?.slice(VENDOR_FLAG.length);

  void (async () => {
    let failed = false;
    try {
      await crawlSwitchAssetsPlaywright(vendor);
    } catch (err: unknown) {
      console.error("Fatal:", err);
      failed = true;
    } finally {
      // Wait for the pool to close before any forced exit so that
      // connections are not cut off mid-shutdown.
      try {
        await pool.end();
      } catch (err: unknown) {
        console.error("Fatal:", err);
        failed = true;
      }
    }
    if (failed) process.exit(1);
  })();
}