/**
 * Switch Image Fetcher — Playwright edition for bot-blocked vendors
 *
 * Vendors that reject plain HTTP bots (403/406) or require JS rendering:
 *   Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
 *   Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs),
 *   Nokia, Huawei, NVIDIA, Netgear, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise,
 *   Asterfusion, Brocade, UfiSpace, QCT
 *
 * Strategy:
 *   1. Query switches without image_url for JS-blocked vendors
 *   2. Open each product page in headless Chromium (stealth mode)
 *   3. Extract og:image (or fallback: first large product <img>)
 *   4. Apply same isGenericImage() filter as the plain HTTP fetcher
 *   5. Write image_url + product_page_url to switches table
 *
 * Rate limit: maxConcurrency=1, at most 12 requests per minute (~5s between requests).
 * Run: npx tsx src/scrapers/switch-image-playwright.ts [--vendor=arista]
 */

import { PlaywrightCrawler } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";

// ── Stealth headers injected into every page ─────────────────────────────────

const STEALTH_UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

// ── Generic marketing image detector (mirrors switch-image-fetcher.ts) ────────

const GENERIC_IMAGE_PATTERNS: RegExp[] = [
  /[-/_]logo[-_.]|\/logos?\//i,
  /cisco[-_]?logo/i,
  /juniper[-_]networks[-_]logo/i,
  /arista[-_]?logo/i,
  /brand[-_]?logo/i,
  /company[-_]?logo/i,
  /\/svg\//i,
  /\.svg(\?|$)/i,
  /naas-homepag/i,
  /al-enterprise.*\/images\/naas/i,
  /og[-_]default/i,
  /default[-_](?:og|social|share|image)/i,
  /site[-_](?:default|image|og)/i,
  /social[-_](?:default|share)/i,
  /twitter[-_]default/i,
  /default[-_]thumbnail/i,
  /\/homepage\//i,
  /hero[-_](?:banner|bg|background|image)/i,
  /banner[-_](?:bg|background)/i,
  /lifestyle/i,
  /stock[-_]?photo/i,
  /placeholder/i,
  /no[-_]?image/i,
  /image[-_]?not[-_]?found/i,
  /\/fallback[/-]/i,
  /missing[-_]image/i,
  /\/press[-_]kit/i,
  /\/media[-_]kit/i,
  // Vendor-specific brand icons
  /open-graph\.gif/i,
  /social[-_]icon/i,
  /favicon/i,
  /og[-_]image[-_][0-9]+x[0-9]+\./i, // e.g. og-image-1200x630 → family-level generic
  // Cookie consent / GDPR overlay images (OneTrust, Cookiebot, TrustArc, etc.)
  /cdn\.cookielaw\.org/i,
  /cookiebot\.com/i,
  /trustarc\.com/i,
  /consent-manager/i,
  // Nokia CMS marketing banners (not product photos)
  /nok\d+-nokia-com-banner/i,
  // Huawei category/why-buy marketing images
  /whyhuawei-/i,
  /campus-switches/i,
  /bg_products/i,
  // Generic "banners" path segment used by CMSes
  /\/banners?\//i,
  // Vendor error / 404 graphics
  /404[-_]error/i,
  /error[-_]graphic/i,
  /webimage-404/i,
  // Navigation icon libraries (D-Link, other CMSes)
  /\/icon[-_]library\//i,
  // Diagrams and illustrations (not product photos)
  /[-_]illustration[._]/i,
  // Moxa brand/marketing images (not product photos)
  /\/Brand\//i,
  /cybersecurity\.png/i,
];

/**
 * Reports whether an image URL matches any known "generic" pattern
 * (logos, banners, placeholders, consent overlays, error graphics, ...).
 *
 * @param url - Absolute or relative image URL to test.
 * @returns `true` when at least one entry of GENERIC_IMAGE_PATTERNS matches.
 */
function isGenericImage(url: string): boolean {
  return GENERIC_IMAGE_PATTERNS.some((re) => re.test(url));
}

// ── Product page URL builders ─────────────────────────────────────────────────

/**
 * Builds the Arista series page URL for a model, or `null` when the model
 * does not start with a 3–4 digit series prefix.
 */
function buildAristaUrl(model: string): string | null {
  // Map model to its Arista series page (og:image lives on series pages, not individual model pages).
  // Pattern: extract alphanumeric prefix before the first "-<digit>" port-count suffix.
  //   7060X5-32QS  → 7060x5  → /en/products/7060x5-series
  //   7050CX3-32S  → 7050cx3 → /en/products/7050cx3-series
  //   7280R3A-48D5 → 7280r3a → strip trailing sub-variant 'A' → 7280r3 → /en/products/7280r3-series
  //   7020R        → 7020r   → /en/products/7020r-series
  const leadMatch = model.match(/^(\d{3,4}[A-Z0-9]*?)(-\d|$)/i);
  if (!leadMatch) return null;

  let series = leadMatch[1].toLowerCase();
  // Strip trailing sub-variant 'a' (R3A → R3, R2A → R2) — Arista groups these on the base series page
  series = series.replace(/([a-z]\d+)a$/, "$1");
  return `https://www.arista.com/en/products/${series}-series`;
}

/**
 * Builds a Dell shop search URL that filters networking switches by the model keyword.
 */
function buildDellUrl(model: string): string | null {
  // PowerSwitch Z9332F-ON → try Dell networking product page
  const cleanModel = model.replace(/^PowerSwitch\s+/i, "").trim();
  return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
}

// Edgecore uses WooCommerce with /product/<slug>/ URLs (no .html suffix).
// Some models have non-obvious slugs verified via sitemap.
const EDGECORE_SLUG_MAP: Record<string, string> = {
  "AS7712-32X": "as7712-32x-ec", // -ec suffix variant in Edgecore WooCommerce
  "Minipack2": "minipack-as8000-open-modular-platform", // Facebook OCP Minipack2
};

/**
 * Builds the Edgecore product URL, using the explicit slug map first and a
 * normalized model slug otherwise. Returns `null` when the slug would be empty.
 */
function buildEdgecoreUrl(model: string): string | null {
  if (model in EDGECORE_SLUG_MAP) {
    return `https://www.edge-core.com/product/${EDGECORE_SLUG_MAP[model]}/`;
  }
  // Standard slug: lowercase, replace non-alphanum with dash, collapse multiple dashes
  const slug = model
    .toLowerCase()
    .replace(/[^a-z0-9-]/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");
  return slug ? `https://www.edge-core.com/product/${slug}/` : null;
}

/** Always returns `null`: Fortinet models are skipped (see comment below). */
function buildFortinetUrl(_model: string): string | null {
  // Fortinet product pages are fully JS-rendered and all redirect to generic /products/ethernet-switches.
  // No reliable og:image can be extracted — skip entirely.
  return null;
}

/** Builds a fallback Aruba switch page URL from the lower-cased, dash-normalized model. */
function buildHpeArubaUrl(model: string): string | null {
  // HPE Aruba series pages are stored in product_page_url for all known models.
  // Builder is a fallback for unknown models.
  const slug = model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return `https://www.arubanetworks.com/products/switches/${slug}/`;
}

/** Builds the Extreme Networks product URL, or `null` when the slug would be empty. */
function buildExtremeUrl(model: string): string | null {
  // Extreme direct product pages: extremenetworks.com/product/<slug>
  const slug = model
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "")
    .replace(/-+/g, "-");
  return slug ? `https://www.extremenetworks.com/product/${slug}` : null;
}

// ── New vendors (JS-rendered; rely on stored product_page_url or built URL) ────

// Nokia, Huawei, Ciena, Moxa, D-Link, ALE, Asterfusion, Brocade:
// all models have product_page_url in DB → return null so the stored URL is used.
const buildPassthroughUrl = (_model: string): string | null => null;

/** Builds the NVIDIA Spectrum page URL for `SN<digits>` models; `null` for anything else. */
function buildNvidiaUrl(model: string): string | null {
  // NVIDIA Spectrum switches: SN5600, SN4700, SN3700, SN3750-SX, SN2201, etc.
  // ConnectX-7 is an HCA, no relevant product page → skip.
  const snMatch = model.match(/^(SN[\d]+)/i);
  if (snMatch) {
    return `https://www.nvidia.com/en-us/networking/ethernet-switching/${snMatch[1].toLowerCase()}/`;
  }
  return null;
}

/** Builds the Netgear fully-managed switch page URL, or `null` when the slug would be empty. */
function buildNetgearUrl(model: string): string | null {
  // M4300-96X, M4350-48G4XF, M4500-32C → /business/wired/switches/fully-managed/<slug>/
  const slug = model
    .toLowerCase()
    .replace(/[^a-z0-9]/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");
  return slug ? `https://www.netgear.com/business/wired/switches/fully-managed/${slug}/` : null;
}

// UfiSpace: slug map derived from sitemap (non-predictable product URL tree)
const UFISPACE_URL_MAP: Record<string, string> = {
  "S9510-28DC": "https://www.ufispace.com/products/telco/access/s9510-28dc-flexe-tsn-disaggregated-cell-site-gateway",
  "S9600-30DX": "https://www.ufispace.com/products/telco/aggregation/s9600-30dx-open-zr-aggregation-router",
  "S9600-32X": "https://www.ufispace.com/products/telco/aggregation/s9600-32x-25g-100g-aggregation-router",
  "S9600-72XC": "https://www.ufispace.com/products/telco/aggregation/s9600-72xc-25g-100g-open-aggregation-router-tcam",
  "S9700-53DX": "https://www.ufispace.com/products/telco/core-edge/s9700-53dx-100g-core-router",
  "S9710-76D": "https://www.ufispace.com/products/telco/core-edge/s9710-76d-high-density-400g-disaggregated-core-router",
};

/** Looks up the UfiSpace product URL for an exact model name; `null` when unmapped. */
function buildUfiSpaceUrl(model: string): string | null {
  return UFISPACE_URL_MAP[model] ?? null;
}

// QCT: URL map derived from sitemap (category path not predictable from model name)
const QCT_URL_MAP: Record<string, string> = {
  "QuantaMesh T3048-LY8": "https://www.qct.io/product/index/Networking/Ethernet-Switch/T3000-Series/QuantaMesh-T3048-LY8",
  "QuantaMesh T7032-IX1": "https://www.qct.io/product/index/Networking/Bare-Metal-Switch/Spine-Switch/QuantaMesh-BMS-T7032-IX1",
};

/** Looks up the QCT product URL for an exact model name; `null` when unmapped. */
function buildQctUrl(model: string): string | null {
  return QCT_URL_MAP[model] ?? null;
}

// Vendor slug → product page URL builder. The keys double as the default vendor
// filter when fetchSwitchImagesPlaywright() is called without a target slug.
const URL_BUILDERS: Record<string, (model: string) => string | null> = {
  arista: buildAristaUrl,
  dell: buildDellUrl,
  edgecore: buildEdgecoreUrl,
  fortinet: buildFortinetUrl,
  "hpe-aruba": buildHpeArubaUrl,
  extreme: buildExtremeUrl,
  // New JS-rendered vendors (stored product_page_url used where available)
  nokia: buildPassthroughUrl,
  huawei: buildPassthroughUrl,
  ciena: buildPassthroughUrl,
  moxa: buildPassthroughUrl,
  "d-link": buildPassthroughUrl,
  "alcatel-lucent-enterprise": buildPassthroughUrl,
  asterfusion: buildPassthroughUrl,
  brocade: buildPassthroughUrl,
  "nvidia-networking": buildNvidiaUrl,
  netgear: buildNetgearUrl,
  ufispace: buildUfiSpaceUrl,
  "quanta-cloud-technology": buildQctUrl,
};

// ── Request data attached to each crawl URL ──────────────────────────────────

interface SwitchCrawlData {
  switchId: string;
  model: string;
  vendorName: string;
  vendorSlug: string;
  productPageUrl: string;
}

// ── Main scraper ──────────────────────────────────────────────────────────────

/**
 * Crawls vendor product pages with headless Chromium and stores a product
 * image URL for every switch that does not have one yet.
 *
 * @param targetVendorSlug - Optional vendor slug to restrict the run to a single
 *   vendor. When omitted, every slug registered in URL_BUILDERS is processed.
 * @returns Resolves once the crawl has finished and the summary has been logged.
 */
export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Promise<void> {
  console.log("=== Switch Image Fetcher (Playwright) ===\n");

  // The slug may come straight from the command line, so it is passed as a bound
  // parameter instead of being interpolated into the SQL text.
  const targetSlugs = targetVendorSlug ? [targetVendorSlug] : Object.keys(URL_BUILDERS);

  const { rows } = await pool.query<{
    id: string;
    model: string;
    vendor_slug: string;
    vendor_name: string;
    product_page_url: string | null;
  }>(
    `
    SELECT sw.id, sw.model, sw.product_page_url,
           v.slug AS vendor_slug, v.name AS vendor_name
    FROM switches sw
    JOIN vendors v ON v.id = sw.vendor_id
    WHERE (sw.image_url IS NULL OR sw.image_url = '')
      AND v.slug = ANY($1::text[])
    ORDER BY v.slug, sw.model
    `,
    [targetSlugs],
  );

  if (rows.length === 0) {
    console.log(" All target switches already have images.\n");
    return;
  }

  console.log(` ${rows.length} switches need images (Playwright vendors)\n`);

  const requests: Array<{ url: string; uniqueKey: string; userData: SwitchCrawlData }> = [];

  for (const row of rows) {
    const builder = URL_BUILDERS[row.vendor_slug];
    // For Arista: prefer freshly-built series URL over a stale stored model URL
    const builtUrl = builder ? builder(row.model) : null;
    const productUrl =
      row.vendor_slug === "arista"
        ? (builtUrl ?? row.product_page_url) // always use fresh series URL for Arista
        : (row.product_page_url ?? builtUrl); // other vendors: prefer stored URL

    if (!productUrl) {
      console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
      continue;
    }

    requests.push({
      url: productUrl,
      // Use switch ID as uniqueKey so Crawlee doesn't deduplicate series-level URLs.
      // Multiple models can share the same series page (e.g. 7060x5-series) — each needs its own DB write.
      uniqueKey: row.id,
      userData: {
        switchId: row.id,
        model: row.model,
        vendorName: row.vendor_name,
        vendorSlug: row.vendor_slug,
        productPageUrl: productUrl,
      },
    });
  }

  if (requests.length === 0) {
    console.log(" Nothing to crawl.\n");
    return;
  }

  let found = 0;
  let missed = 0;
  let errors = 0;

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 1, // one at a time — server-friendly
      maxRequestsPerMinute: 12, // ~5s per request minimum
      requestHandlerTimeoutSecs: 45,
      navigationTimeoutSecs: 30,
      headless: true,
      launchContext: {
        launchOptions: {
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--window-size=1920,1080",
          ],
        },
      },
      preNavigationHooks: [
        async (_ctx, gotoOptions) => {
          // Guarded instead of asserted non-null: only override when options were supplied.
          if (gotoOptions) {
            gotoOptions.waitUntil = "domcontentloaded";
          }
        },
      ],

      async requestHandler({ request, page }) {
        const data = request.userData as SwitchCrawlData;

        // Inject stealth UA
        // NOTE(review): this handler appears to run after navigation has completed, so
        // these overrides would only affect requests/scripts issued from this point on,
        // not the initial document request — confirm whether they belong in a
        // pre-navigation hook instead.
        await page.setExtraHTTPHeaders({
          "Accept-Language": "en-US,en;q=0.9",
          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        });
        await page.evaluate((ua) => {
          Object.defineProperty(navigator, "userAgent", { value: ua, configurable: true });
          Object.defineProperty(navigator, "webdriver", { value: false, configurable: true });
        }, STEALTH_UA);

        // Wait for page to settle (JS rendering). Best-effort: a timeout here is
        // deliberately ignored and extraction proceeds with whatever has rendered.
        await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});

        // Extract og:image / twitter:image meta tags.
        // We DON'T filter generics here — we filter outside so the img fallback can still run
        // even when og:image exists but is a logo/brand image (e.g. Dell, HPE).
        const metaImageUrl: string | null = await page.evaluate(() => {
          const og = document.querySelector<HTMLMetaElement>('meta[property="og:image"]')?.content;
          if (og) return og;
          const tw = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]')?.content;
          return tw ?? null;
        });

        // Use meta image if it passes the generic filter; otherwise fall through to img fallback.
        let imageUrl: string | null =
          metaImageUrl && !isGenericImage(metaImageUrl) ? metaImageUrl : null;

        if (!imageUrl) {
          // Img fallback: largest visible image that isn't a UI element.
          // Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
          imageUrl = await page.evaluate(() => {
            const imgs = Array.from(document.querySelectorAll("img"));
            const skipPattern =
              /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading|cookielaw|cookiebot|trustarc/i;
            const candidate = imgs
              .filter((img) => {
                const src = img.src || img.getAttribute("data-src") || "";
                return (
                  src.startsWith("http") &&
                  /\.(jpg|jpeg|png|webp)/i.test(src) &&
                  img.naturalWidth >= 200 &&
                  img.naturalHeight >= 150 &&
                  !skipPattern.test(src)
                );
              })
              .sort((a, b) => b.naturalWidth * b.naturalHeight - a.naturalWidth * a.naturalHeight)[0];
            return candidate?.src ?? null;
          });
        }

        if (!imageUrl || isGenericImage(imageUrl)) {
          console.log(
            ` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`,
          );
          missed++;
          // Save product_page_url even on miss to track that we tried.
          // The "no URL stored yet" condition is checked against the database column:
          // data.productPageUrl is always non-empty here (rows without a URL are
          // skipped before being enqueued), so it cannot be used for that check.
          await pool.query(
            `UPDATE switches
                SET product_page_url = $2, assets_scraped_at = NOW()
              WHERE id = $1
                AND (product_page_url IS NULL OR product_page_url = '')`,
            [data.switchId, request.url],
          );
          return;
        }

        await pool.query(
          `UPDATE switches
              SET image_url = $2,
                  product_page_url = COALESCE(product_page_url, $3),
                  assets_scraped_at = NOW()
            WHERE id = $1`,
          [data.switchId, imageUrl, request.url],
        );

        console.log(` [OK] ${data.vendorName} ${data.model} → ${imageUrl.slice(0, 80)}`);
        found++;
      },

      async failedRequestHandler({ request }) {
        const data = request.userData as SwitchCrawlData;
        console.log(
          ` [FAIL] ${data.vendorName} ${data.model} — ${request.errorMessages?.[0] ?? "unknown error"}`,
        );
        errors++;
      },
    },
    // Use a unique run ID to avoid Crawlee temp-dir state contamination when multiple
    // vendor runs execute back-to-back (ENOENT: stale request-queue files from prior run).
    makeCrawleeConfig(`switch-images-playwright-${Date.now()}`),
  );

  await crawler.run(requests);

  console.log(`\n=== Playwright Image Scraper Complete ===`);
  console.log(` Images found: ${found}`);
  console.log(` Missed: ${missed}`);
  if (errors > 0) console.warn(` Errors: ${errors}`);
}

// CLI entry point: `--vendor=<slug>` restricts the run to one vendor.
if (require.main === module) {
  const vendor = process.argv.find((a) => a.startsWith("--vendor="))?.split("=")[1];
  fetchSwitchImagesPlaywright(vendor)
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      // Wait for the pool to close before exiting; ignore close errors so the
      // original failure remains the reported one.
      await pool.end().catch(() => undefined);
      process.exit(1);
    });
}