/**
 * Switch Image Fetcher — og:image based image discovery for all seeded switches
 *
 * Strategy:
 *   1. For each switch without image_url, build the vendor product page URL
 *   2. Fetch page HTML (plain HTTP) and extract og:image meta tag
 *   3. Validate image URL (must be HTTP(S), not empty)
 *   4. Write image_url + product_page_url to switches table
 *
 * Vendors covered:
 *   Cisco (Nexus 9000/9300, NCS 5500/5700, Catalyst 9300/9500, 8000 SP)
 *   Arista (7000 series)
 *   Juniper (QFX, EX series)
 *   NVIDIA Networking (Spectrum SN series — ConnectX skipped)
 *   Edgecore, Celestica, Asterfusion (whitebox)
 *   Fortinet (FortiSwitch series)
 *   Dell, HPE/Aruba, Huawei, Nokia, Extreme, MikroTik, Ubiquiti, FS.COM, Supermicro
 *   Alcatel-Lucent Enterprise, Allied Telesis, Netgear, Quanta Cloud Technology, Ufispace
 *   (Fortinet and Nokia are registered, but their builders currently return no URL.)
 *
 * Rate limit: switches are processed strictly one after another, with a 3.5 s
 * pause before every page fetch. There is no per-domain scheduling or concurrency.
 * The scraper identifies itself as a research bot through its User-Agent header;
 * this module does not itself fetch or evaluate robots.txt.
 */

import { pool } from "../utils/db";

/** Request headers sent with every product page fetch. */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (compatible; TIP-Bot/1.0; research; +https://transceiver-db.fichtmueller.org)",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
};

/**
 * Resolves after `ms` milliseconds.
 *
 * @param ms - Delay in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((r) => setTimeout(r, ms));
}

// ── Product page URL builders ───────────────────────────────────────────────
//
// Every builder maps a model name to a candidate product page URL, or returns
// null when no URL can be derived. A blank model always yields null: without
// that guard the result would be the vendor's listing page, whose og:image
// would then be stored as if it were a product photo.

/**
 * Builds the Cisco product page URL for a switch/router model.
 *
 * @param model - Vendor model name, e.g. "N9K-C9364C", "NCS-5504", "C9500-32C".
 * @returns The candidate URL, or null for model families without a usable page.
 */
function buildCiscoUrl(model: string): string | null {
  const m = model.trim().toUpperCase();
  const dashed = (s: string): string => s.toLowerCase().replace(/[^a-z0-9]/g, "-");
  const series8000Url =
    `https://www.cisco.com/site/us/en/products/networking/sdwan-routers/8000-series/index.html`;

  // Nexus 9300/9500 series: N9K-C9364C, N9K-C93600CD-GX, N9K-C9508 …
  if (m.startsWith("N9K-C")) {
    const slug = dashed(m.slice("N9K-C".length));
    return slug
      ? `https://www.cisco.com/c/en/us/products/switches/nexus-${slug}-switch/index.html`
      : null;
  }

  // NCS 5500/5700: NCS-57C3-MOD, NCS-5504
  if (m.startsWith("NCS-")) {
    const num = dashed(m.slice("NCS-".length));
    return num
      ? `https://www.cisco.com/c/en/us/products/routers/network-convergence-system-${num}/index.html`
      : null;
  }

  // Catalyst: C9300-48UXM, C9500-32C
  if (m.startsWith("C9")) {
    return `https://www.cisco.com/c/en/us/products/switches/catalyst-${dashed(m)}/index.html`;
  }

  // Cisco 8000 SP series chassis: 8101-32FH, 8202-32FH, 8608
  if (/^8[0-9]{3}/.test(m)) {
    return series8000Url;
  }

  // Cisco 8800 line cards (88-LC0-*, 84-MPA-*, 86-MPA-*) → same 8000 family page
  if (/^(88|84|86)-/.test(m)) {
    return series8000Url;
  }

  // ASR 9000 / A900 line cards only return the Cisco logo as og:image — skip
  return null;
}

/**
 * Builds the Alcatel-Lucent Enterprise product page URL.
 *
 * @param model - e.g. "OmniSwitch 6900-X72", "OmniSwitch 9900-C32D".
 * @returns The candidate URL, or null for a blank model.
 */
function buildAlcatelLucentUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
  return slug ? `https://www.al-enterprise.com/en/products/switches/${slug}` : null;
}

/**
 * Builds the Arista product page URL as /en/products/{model-lowercase}.
 *
 * NOTE(review): Arista also publishes pages nested under a series path
 * (e.g. /en/products/7050x3-series/7050cx3-32s). This builder emits the flat
 * form only — confirm the flat URL resolves for the seeded models.
 *
 * @param model - e.g. "7050CX3-32S".
 * @returns The candidate URL, or null for a blank model.
 */
function buildAristaUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://www.arista.com/en/products/${slug}` : null;
}

/**
 * Builds the Juniper product page URL for QFX and EX series models.
 *
 * @param model - e.g. "QFX5130-32CD" → ".../qfx-series/qfx5130-32cd.html".
 * @returns The candidate URL, or null for any other series.
 */
function buildJuniperUrl(model: string): string | null {
  const slug = model.trim().toLowerCase();
  if (slug.startsWith("qfx")) {
    return `https://www.juniper.net/us/en/products/switches/qfx-series/${slug}.html`;
  }
  if (slug.startsWith("ex")) {
    return `https://www.juniper.net/us/en/products/switches/ex-series/${slug}.html`;
  }
  return null;
}

/**
 * Builds the NVIDIA Networking product page URL for Spectrum SN switches.
 *
 * @param model - e.g. "SN5600" → ".../ethernet-switching/sn5600/".
 * @returns The candidate URL, or null for adapters and non-SN models.
 */
function buildNvidiaUrl(model: string): string | null {
  const m = model.toUpperCase();
  // ConnectX-7 / BlueField are adapters, not switches — skip
  if (m.includes("CONNECTX") || m.includes("BLUEFIELD")) return null;
  const slug = m.replace(/[^A-Z0-9]/g, "");
  if (!slug.startsWith("SN")) return null; // only Spectrum switch series
  return `https://www.nvidia.com/en-us/networking/ethernet-switching/${slug.toLowerCase()}/`;
}

/**
 * Builds the Edgecore product page URL.
 *
 * @param model - e.g. "AS7726-32X", "DCS810".
 * @returns The candidate URL, or null for a blank model.
 */
function buildEdgecoreUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://www.edge-core.com/product/${slug}.html` : null;
}

/**
 * Builds the Dell networking shop URL.
 *
 * @param model - Vendor model name.
 * @returns The candidate URL, or null for a blank model.
 */
function buildDellUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://www.dell.com/en-us/shop/networking/sf/${slug}` : null;
}

/**
 * Builds the Huawei enterprise switch page URL.
 * The model's letter case is preserved; only whitespace runs become dashes.
 *
 * @param model - Vendor model name.
 * @returns The candidate URL, or null for a blank model.
 */
function buildHuaweiUrl(model: string): string | null {
  const slug = model.trim().replace(/\s+/g, "-");
  return slug ? `https://e.huawei.com/en/products/enterprise-networking/switches/${slug}` : null;
}

/**
 * Nokia builder (registered under the "nokia" vendor slug).
 * Always returns null.
 */
function buildNobelUrl(_model: string): string | null {
  return null; // Nokia SROS pages require auth
}

/**
 * Builds the Extreme Networks product page URL.
 *
 * @param model - Vendor model name.
 * @returns The candidate URL, or null for a blank model.
 */
function buildExtremeUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://www.extremenetworks.com/product/${slug}/` : null;
}

// MikroTik product URL slugs for models containing '+' are not derivable from
// the model name — their website uses opaque suffixes (_in, _rm, …).
// The models without '+' follow a simple pattern (lowercase, dashes→underscore).
/**
 * Hand-maintained MikroTik URL slugs for models whose product page slug cannot
 * be derived from the model name (models containing '+').
 */
const MIKROTIK_SLUG_MAP: Record<string, string> = {
  "CRS305-1G-4S+": "crs305_1g_4s_in",
  "CRS312-4C+8XG": "crs312_4c_8xg_rm",
  "CRS317-1G-16S+": "crs317_1g_16s_rm",
  "CRS326-24G-2S+": "crs326_24g_2s_in",
  // CRS354-48G-4S+2Q+: URL not discoverable — MikroTik's product listing is JS-rendered
};

/**
 * Builds the MikroTik product page URL.
 *
 * @param model - e.g. "CRS305-1G-4S+" (mapped) or a model without '+'.
 * @returns The candidate URL, or null for unmapped '+' models and blank input.
 */
function buildMikroTikUrl(model: string): string | null {
  const key = model.trim();
  // Own-property check: a bare `in` would also match inherited keys such as
  // "constructor" and interpolate a function into the URL.
  if (Object.prototype.hasOwnProperty.call(MIKROTIK_SLUG_MAP, key)) {
    return `https://mikrotik.com/product/${MIKROTIK_SLUG_MAP[key]}`;
  }
  if (key.includes("+")) return null; // other + models — URL unknown
  // Simple lowercase + dashes→underscores for models without '+'
  const slug = key.toLowerCase().replace(/[-\s]+/g, "_").replace(/[^a-z0-9_]/g, "");
  return slug ? `https://mikrotik.com/product/${slug}` : null;
}

// The builders below return null for a blank model. Without that guard the
// result would be the vendor's listing page, whose og:image would then be
// stored as if it were a product photo.

/** Builds the Ubiquiti store URL, or null for a blank model. */
function buildUbiquitiUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://store.ui.com/us/en/products/${slug}` : null;
}

/** Builds the FS.COM product URL, or null for a blank model. */
function buildFsComUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://www.fs.com/products/${slug}.html` : null;
}

/** Builds the Supermicro switch URL (model upper-cased), or null for a blank model. */
function buildSupermicroUrl(model: string): string | null {
  const slug = model.trim().toUpperCase();
  return slug ? `https://www.supermicro.com/en/products/switches/${slug}` : null;
}

/** Builds the HPE Aruba switch URL, or null for a blank model. */
function buildHpeArubaUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://www.arubanetworks.com/products/switches/${slug}/` : null;
}

/** Builds the Celestica networking URL, or null for a blank model. */
function buildCelesticaUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://www.celestica.com/networking/${slug}` : null;
}

/** Builds the Asterfusion product URL, or null for a blank model. */
function buildAsterfusionUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/[^a-z0-9-]/g, "-");
  return slug ? `https://www.asterfusion.com/products/${slug}/` : null;
}

/** Fortinet builder. Always returns null. */
function buildFortinetUrl(_model: string): string | null {
  // Fortinet product pages are JS-rendered — og:image only returns the brand icon.
  // All /products/fortiswitch/ URLs redirect to the generic /ethernet-switches page.
  // Image scraping is not possible via plain HTTP for this vendor.
  return null;
}

/** Builds the Quanta Cloud Technology switch URL, or null for a blank model. */
function buildQuantaUrl(model: string): string | null {
  // QuantaMesh T3048-LY8, T7032-IX1 etc.
  const slug = model.trim().toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
  return slug
    ? `https://www.qct.io/product/index/Infrastructure-Product/Networking/Switch/${slug}`
    : null;
}

/** Builds the Allied Telesis product URL, or null for a blank model. */
function buildAlliedTelesisUrl(model: string): string | null {
  // AT-x530-28GSX → https://www.alliedtelesis.com/us/en/products/at-x530-28gsx
  const slug = model.trim().toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
  return slug ? `https://www.alliedtelesis.com/us/en/products/${slug}` : null;
}

/** Builds the Ufispace product URL, or null for a blank model. */
function buildUfispaceUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
  return slug ? `https://www.ufispace.com/products/${slug}` : null;
}

/** Builds the Netgear business switch URL, or null for a blank model. */
function buildNetgearUrl(model: string): string | null {
  const slug = model.trim().toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
  return slug ? `https://www.netgear.com/business/products/switches/${slug}` : null;
}

// ── URL dispatcher by vendor slug ───────────────────────────────────────────

/** Maps a vendor slug (vendors.slug) to the builder for that vendor's product page URL. */
const URL_BUILDERS: Record<string, (model: string) => string | null> = {
  cisco: buildCiscoUrl,
  arista: buildAristaUrl,
  juniper: buildJuniperUrl,
  "nvidia-networking": buildNvidiaUrl,
  edgecore: buildEdgecoreUrl,
  celestica: buildCelesticaUrl,
  asterfusion: buildAsterfusionUrl,
  fortinet: buildFortinetUrl,
  dell: buildDellUrl,
  "hpe-aruba": buildHpeArubaUrl,
  huawei: buildHuaweiUrl,
  nokia: buildNobelUrl,
  extreme: buildExtremeUrl,
  mikrotik: buildMikroTikUrl,
  ubiquiti: buildUbiquitiUrl,
  "fs-com": buildFsComUrl,
  supermicro: buildSupermicroUrl,
  "alcatel-lucent": buildAlcatelLucentUrl,
  "alcatel-lucent-enterprise": buildAlcatelLucentUrl, // fix: DB uses this slug
  ale: buildAlcatelLucentUrl,
  "quanta-cloud-technology": buildQuantaUrl,
  "allied-telesis": buildAlliedTelesisUrl,
  ufispace: buildUfispaceUrl,
  netgear: buildNetgearUrl,
  wistron: (_m) => null, // no public product pages
  aruba: buildHpeArubaUrl, // alias
};

// ── Generic marketing image detector ────────────────────────────────────────
// Rejects URLs that are clearly stock photos, homepages, lifestyle shots or
// any other non-product image. Patterns found from real-world scrapes.
const GENERIC_IMAGE_PATTERNS: RegExp[] = [
  // ── Logo / brand marks (never product photos) ────────────────────────────
  /[-/_]logo[-_.]|\/logos?\//i,
  /cisco[-_]?logo/i,
  /juniper[-_]networks[-_]logo/i,
  /arista[-_]?logo/i,
  /brand[-_]?logo/i,
  /company[-_]?logo/i,
  // SVG logos often have these in path
  /\/svg\//i,
  /\.svg(\?|$)/i,
  // ── Alcatel-Lucent Enterprise generic hero images ────────────────────────
  /naas-homepag/i,
  /al-enterprise.*\/images\/naas/i,
  // ── Generic OG / social sharing defaults ─────────────────────────────────
  /og[-_]default/i,
  /default[-_](?:og|social|share|image)/i,
  /site[-_](?:default|image|og)/i,
  /social[-_](?:default|share)/i,
  /twitter[-_]default/i,
  /default[-_]thumbnail/i,
  // ── Homepage / banner / lifestyle ────────────────────────────────────────
  /\/homepage\//i,
  /hero[-_](?:banner|bg|background|image)/i,
  /banner[-_](?:bg|background)/i,
  /lifestyle/i,
  /stock[-_]?photo/i,
  /people[-_](?:at|in|with)/i,
  // ── Placeholder / fallback ────────────────────────────────────────────────
  /placeholder/i,
  /no[-_]?image/i,
  /image[-_]?not[-_]?found/i,
  /\/fallback[/-]/i,
  /missing[-_]image/i,
  // ── Generic about/press/brand pages ──────────────────────────────────────
  /\/press[-_]kit/i,
  /\/media[-_]kit/i,
  // ── Vendor error / 404 graphics ──────────────────────────────────────────
  /404[-_]error/i,
  /error[-_]graphic/i,
  // ── Navigation icon libraries ────────────────────────────────────────────
  /\/icon[-_]library\//i,
  // ── Diagrams and illustrations ───────────────────────────────────────────
  /[-_]illustration[._]/i,
  // ── Vendor 404 hero images ───────────────────────────────────────────────
  /webimage-404/i,
  // ── Moxa brand/marketing images (not product photos) ────────────────────
  /\/Brand\//i,
  /cybersecurity\.png/i,
  // ── Cookie consent / GDPR overlay images ────────────────────────────────
  /cdn\.cookielaw\.org/i,
  /cookiebot\.com/i,
  /trustarc\.com/i,
  /consent-manager/i,
];

/** Returns true when `url` matches any of GENERIC_IMAGE_PATTERNS. */
function isGenericImage(url: string): boolean {
  return GENERIC_IMAGE_PATTERNS.some((re) => re.test(url));
}

// ── og:image extractor ──────────────────────────────────────────────────────

/**
 * Extracts a product image URL from a page's HTML.
 *
 * Tries the og:image meta tag first, then falls back to the first <img> whose
 * src path contains a product keyword. Candidates that are not HTTP(S) or that
 * look like generic marketing images are rejected.
 *
 * @param html - Raw page HTML.
 * @param baseUrl - URL the page was requested from; used to resolve relative image URLs.
 * @returns An absolute HTTP(S) image URL, or null when nothing usable is found.
 */
function extractOgImage(html: string, baseUrl: string): string | null {
  const resolve = (raw: string): string | null => {
    // Attribute values are HTML-escaped; a literal "&amp;" would break query strings.
    const candidate = raw.trim().replace(/&amp;/gi, "&");
    if (!candidate) return null;

    let abs = candidate;
    if (!/^https?:\/\//i.test(candidate)) {
      // Root-relative ("/a.png"), protocol-relative ("//cdn/a.png") and
      // path-relative ("img/a.png") forms are all resolved against the page URL.
      try {
        abs = new URL(candidate, baseUrl).toString();
      } catch {
        return null;
      }
    }
    if (!/^https?:\/\//i.test(abs)) return null; // e.g. data: or mailto:
    if (isGenericImage(abs)) return null; // ← reject logos/marketing images
    return abs;
  };

  // Primary: og:image
  // NOTE(review): the og:image expression was unreadable in the reviewed copy
  // of this file and has been reconstructed. It accepts either attribute order
  // and single or double quotes — confirm against the intended pattern.
  const ogM =
    html.match(/<meta[^>]+property=["']og:image["'][^>]*content=["']([^"']+)["']/i) ??
    html.match(/<meta[^>]+content=["']([^"']+)["'][^>]*property=["']og:image["']/i);
  if (ogM?.[1]) {
    const resolved = resolve(ogM[1]);
    if (resolved) return resolved;
  }

  // Fallback: <img> with product keyword in path
  const imgM = html.match(
    /<img[^>]+src="([^"]+(?:product|switch|router|hardware)[^"]*\.(?:jpg|jpeg|png|webp))"/i,
  );
  if (imgM?.[1]) {
    const resolved = resolve(imgM[1]);
    if (resolved) return resolved;
  }

  return null;
}

// ── HTTP fetch with timeout ─────────────────────────────────────────────────

/**
 * Fetches a page and returns its body as text.
 *
 * Best-effort: a non-2xx status, a network error or the 20 s timeout all yield
 * null instead of throwing. The reason is logged so the caller's generic
 * failure line can be diagnosed.
 *
 * @param url - Page URL to fetch (redirects are followed).
 * @returns The response body, or null on any failure.
 */
async function fetchPageHtml(url: string): Promise<string | null> {
  try {
    const resp = await fetch(url, {
      headers: HEADERS,
      signal: AbortSignal.timeout(20_000),
      redirect: "follow",
    });
    if (!resp.ok) {
      console.warn(`    ${url} → HTTP ${resp.status}`);
      return null;
    }
    return await resp.text();
  } catch (err: unknown) {
    const reason = err instanceof Error ? `${err.name}: ${err.message}` : String(err);
    console.warn(`    ${url} → ${reason}`);
    return null;
  }
}

// ── Main scraper ────────────────────────────────────────────────────────────

/**
 * Finds an image for every switch that has none and stores it.
 *
 * For each switch with a NULL/empty image_url this uses the stored
 * product_page_url, or else the vendor's URL builder, fetches that page,
 * extracts an image URL and writes image_url / product_page_url /
 * assets_scraped_at back to the switches table. Rows are processed
 * sequentially with a fixed pause before each fetch.
 *
 * @param targetVendorSlug - When given, only switches of this vendor slug are processed.
 */
export async function fetchSwitchImages(targetVendorSlug?: string): Promise<void> {
  console.log("=== Switch Image Fetcher ===\n");

  const vendorFilter = targetVendorSlug ? `AND v.slug = $1` : "";
  const params = targetVendorSlug ? [targetVendorSlug] : [];

  const { rows } = await pool.query<{
    id: string;
    model: string;
    series: string;
    vendor_slug: string;
    vendor_name: string;
    product_page_url: string | null;
    image_url: string | null;
  }>(
    `SELECT sw.id, sw.model, sw.series, sw.product_page_url, sw.image_url,
            v.slug AS vendor_slug, v.name AS vendor_name
     FROM switches sw
     JOIN vendors v ON v.id = sw.vendor_id
     WHERE (sw.image_url IS NULL OR sw.image_url = '')
     ${vendorFilter}
     ORDER BY v.slug, sw.model`,
    params,
  );

  if (rows.length === 0) {
    console.log(" All switches already have images — nothing to do.");
    return;
  }

  console.log(` ${rows.length} switches need images\n`);

  let found = 0;
  let skipped = 0;
  let errors = 0;

  for (const row of rows) {
    // Own-property lookup so a vendor slug such as "constructor" cannot pick
    // up an inherited Object.prototype member as its "builder".
    const builderFn = Object.prototype.hasOwnProperty.call(URL_BUILDERS, row.vendor_slug)
      ? URL_BUILDERS[row.vendor_slug]
      : undefined;
    // `||` on purpose: an empty stored URL counts as absent.
    const productUrl = row.product_page_url || (builderFn ? builderFn(row.model) : null);

    if (!productUrl) {
      console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL pattern`);
      skipped++;
      continue;
    }

    await sleep(3500); // 1 req/3.5s — server-friendly rate limit

    const html = await fetchPageHtml(productUrl);
    if (!html) {
      console.log(` [FAIL] ${row.vendor_name} ${row.model} — HTTP error`);
      errors++;
      continue;
    }

    const imageUrl = extractOgImage(html, productUrl);
    if (!imageUrl) {
      console.log(` [MISS] ${row.vendor_name} ${row.model} — no og:image on ${productUrl}`);
      skipped++;
      // Record the URL that was tried together with the scrape timestamp.
      // NOTE(review): the SELECT above filters on image_url only, so this row
      // is still picked up on the next run — confirm whether assets_scraped_at
      // is meant to gate retries.
      if (!row.product_page_url) {
        await pool.query(
          `UPDATE switches SET product_page_url = $2, assets_scraped_at = NOW() WHERE id = $1`,
          [row.id, productUrl],
        );
      }
      continue;
    }

    await pool.query(
      `UPDATE switches
       SET image_url = $2,
           product_page_url = COALESCE(product_page_url, $3),
           assets_scraped_at = NOW()
       WHERE id = $1`,
      [row.id, imageUrl, productUrl],
    );
    console.log(` [OK] ${row.vendor_name} ${row.model} → ${imageUrl.slice(0, 80)}`);
    found++;
  }

  console.log(`\n=== Switch Image Fetcher Complete ===`);
  console.log(` Images found: ${found}`);
  console.log(` Skipped/miss: ${skipped}`);
  if (errors > 0) console.warn(` Errors: ${errors}`);
}

// CLI entry point: `--vendor=<slug>` restricts the run to a single vendor.
if (require.main === module) {
  const vendor = process.argv.find((a) => a.startsWith("--vendor="))?.split("=")[1];
  fetchSwitchImages(vendor)
    .then(() => pool.end())
    .catch((err) => {
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    });
}