/**
 * Switch Assets Crawler — Crawlee-based scraper for product images, datasheets, manuals
 *
 * Uses CheerioCrawler to visit actual vendor product pages and extract:
 * - Product hero images
 * - Datasheet PDF download links
 * - Manual/Guide links
 * - Quick Start Guide links
 *
 * Handles static HTML pages. For JS-heavy vendors (Cisco, Arista),
 * use PlaywrightCrawler variant or the static URL-pattern scraper.
 */

import { CheerioCrawler, Dataset } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import {
  downloadSwitchImage,
  downloadSwitchDatasheet,
  downloadSwitchManual,
  setSwitchProductPage,
} from "../utils/assets";

/** One product page to visit, carried through the crawler as `request.userData`. */
interface CrawlTarget {
  switchId: string;
  vendorId: string;
  vendorName: string;
  model: string;
  productPageUrl: string;
}

// ═══════════════════════════════════════════════════════
// Vendor-specific page parsers
// ═══════════════════════════════════════════════════════

/** Asset links extracted from a single product page. All URLs are absolute. */
interface ParsedAssets {
  imageUrl?: string;
  datasheetUrl?: string;
  /** Only set when `datasheetUrl` is set. */
  datasheetTitle?: string;
  manuals: Array<{ url: string; title: string; type: string }>;
}

// `$` is the Cheerio root handed over by CheerioCrawler. It stays `any` because
// this file does not import Cheerio's own types.
type PageParser = ($: any, url: string) => ParsedAssets;

/** Matches "cli" as a standalone word, so "click"/"client" are not treated as CLI docs. */
const CLI_WORD = /\bcli\b/;

/**
 * Resolves a possibly-relative link against the page URL.
 *
 * @param href    Raw attribute value taken from the page (may be missing).
 * @param baseUrl URL of the page the link was found on.
 * @returns The absolute URL, or `undefined` when `href` is empty or cannot be
 *          parsed. `new URL()` throws on malformed input; one broken link must
 *          not abort parsing of the whole page.
 */
function resolveUrl(href: string | undefined, baseUrl: string): string | undefined {
  if (!href) return undefined;
  try {
    return new URL(href, baseUrl).toString();
  } catch {
    return undefined;
  }
}

/**
 * Extracts assets from a MikroTik product page.
 *
 * @param $       Cheerio root of the loaded page.
 * @param baseUrl URL the page was loaded from; used to resolve relative links.
 */
function parseMikroTikPage($: any, baseUrl: string): ParsedAssets {
  const manuals: ParsedAssets["manuals"] = [];

  // MikroTik product images are on cdn.mikrotik.com with unpredictable numeric IDs
  // Look for: og:image, large product images in gallery, or CDN URLs
  const ogImage = $('meta[property="og:image"]').attr("content");
  const galleryImage = $(
    ".product-image img, #gallery img, .product-hero img, .product_image img, img[src*='cdn.mikrotik.com']"
  )
    .first()
    .attr("src");

  // Also check for large images in the page body
  const bodyImage = $("img")
    .filter((_: unknown, el: unknown) => {
      const src = $(el).attr("src") || "";
      return src.includes("cdn.mikrotik.com") && (src.includes("_lg") || src.includes("_hi"));
    })
    .first()
    .attr("src");

  // Preference order: og:image, then a large CDN image, then any gallery image.
  const imageUrl = ogImage || bodyImage || galleryImage;

  // Datasheets — MikroTik PDFs on cdn.mikrotik.com/web-assets/product_files/
  const datasheetHref = $('a[href*=".pdf"]')
    .filter((_: unknown, el: unknown) => {
      const text = $(el).text().toLowerCase();
      const href = $(el).attr("href")?.toLowerCase() || "";
      return (
        text.includes("datasheet") ||
        text.includes("data sheet") ||
        text.includes("brochure") ||
        href.includes("datasheet") ||
        href.includes("product_files")
      );
    })
    .first()
    .attr("href");

  // Manuals — check help.mikrotik.com links and PDFs
  $('a[href*=".pdf"], a[href*="help.mikrotik.com"]').each((_: unknown, el: unknown) => {
    const href = $(el).attr("href");
    const text = $(el).text().trim();
    if (!href || !text) return;

    const lower = text.toLowerCase();
    const isQuickStart = lower.includes("quick start");
    if (!isQuickStart && !lower.includes("manual") && !lower.includes("guide")) return;

    const url = resolveUrl(href, baseUrl);
    if (!url) return; // unparsable link: skip it instead of failing the page

    manuals.push({ url, title: text, type: isQuickStart ? "quick_start" : "manual" });
  });

  const datasheetUrl = resolveUrl(datasheetHref, baseUrl);
  return {
    imageUrl: resolveUrl(imageUrl, baseUrl),
    datasheetUrl,
    datasheetTitle: datasheetUrl ? "Product Datasheet" : undefined,
    manuals,
  };
}

/**
 * Extracts assets from a Fortinet product page.
 *
 * @param $       Cheerio root of the loaded page.
 * @param baseUrl URL the page was loaded from; used to resolve relative links.
 */
function parseFortinetPage($: any, baseUrl: string): ParsedAssets {
  const manuals: ParsedAssets["manuals"] = [];

  const imageUrl =
    $('meta[property="og:image"]').attr("content") ||
    $(".product-image img, .hero-image img").first().attr("src");

  const datasheetHref = $('a[href*=".pdf"]')
    .filter((_: unknown, el: unknown) => {
      const text = $(el).text().toLowerCase();
      const href = $(el).attr("href")?.toLowerCase() || "";
      return text.includes("datasheet") || text.includes("data-sheet") || href.includes("data-sheet");
    })
    .first()
    .attr("href");

  // Every link into docs.fortinet.com is recorded as a manual.
  $('a[href*="docs.fortinet.com"]').each((_: unknown, el: unknown) => {
    const href = $(el).attr("href");
    const text = $(el).text().trim();
    if (!href || !text) return;

    // Resolve like the other parsers do, so protocol-relative links
    // ("//docs.fortinet.com/...") become absolute URLs.
    const url = resolveUrl(href, baseUrl);
    if (!url) return;

    manuals.push({ url, title: text, type: "manual" });
  });

  const datasheetUrl = resolveUrl(datasheetHref, baseUrl);
  return {
    imageUrl: resolveUrl(imageUrl, baseUrl),
    datasheetUrl,
    datasheetTitle: datasheetUrl ? "FortiSwitch Datasheet" : undefined,
    manuals,
  };
}

/**
 * Fallback parser for vendors without a dedicated parser.
 *
 * @param $       Cheerio root of the loaded page.
 * @param baseUrl URL the page was loaded from; used to resolve relative links.
 */
function parseGenericPage($: any, baseUrl: string): ParsedAssets {
  const manuals: ParsedAssets["manuals"] = [];

  // Generic image extraction
  const imageUrl =
    $('meta[property="og:image"]').attr("content") ||
    $(".product-image img, .hero img, .product-photo img, main img").first().attr("src");

  // Generic datasheet extraction — look for PDF links with "datasheet" in text or URL
  const datasheetHref = $('a[href$=".pdf"]')
    .filter((_: unknown, el: unknown) => {
      const text = $(el).text().toLowerCase();
      const href = $(el).attr("href")?.toLowerCase() || "";
      return (
        text.includes("datasheet") ||
        text.includes("data sheet") ||
        href.includes("datasheet") ||
        href.includes("data-sheet")
      );
    })
    .first()
    .attr("href");

  // Generic manual extraction
  $('a[href$=".pdf"]').each((_: unknown, el: unknown) => {
    const href = $(el).attr("href");
    const text = $(el).text().trim();
    if (!href || !text) return;

    const lower = text.toLowerCase();
    const isQuickStart = lower.includes("quick start");
    const isCli = CLI_WORD.test(lower);
    const isInstallation = lower.includes("installation");
    const isManual =
      lower.includes("manual") ||
      lower.includes("guide") ||
      lower.includes("configuration") ||
      isQuickStart ||
      isCli ||
      isInstallation;
    if (!isManual) return;

    // Later checks win when several keywords match:
    // installation > cli > quick start > plain manual.
    let type = "manual";
    if (isQuickStart) type = "quick_start";
    if (isCli) type = "cli_reference";
    if (isInstallation) type = "installation_guide";

    const url = resolveUrl(href, baseUrl);
    if (!url) return;

    manuals.push({ url, title: text, type });
  });

  const datasheetUrl = resolveUrl(datasheetHref, baseUrl);
  return {
    imageUrl: resolveUrl(imageUrl, baseUrl),
    datasheetUrl,
    datasheetTitle: datasheetUrl ? "Product Datasheet" : undefined,
    manuals,
  };
}

/** Picks the page parser for a vendor by case-insensitive substring match on its name. */
function getParserForVendor(vendorName: string): PageParser {
  const lower = vendorName.toLowerCase();
  if (lower.includes("mikrotik")) return parseMikroTikPage;
  if (lower.includes("fortinet")) return parseFortinetPage;
  return parseGenericPage;
}

// ═══════════════════════════════════════════════════════
// Known vendor product page URL builders
// ═══════════════════════════════════════════════════════

/**
 * Builds the expected product page URL for a vendor/model pair.
 *
 * @param vendorName Vendor display name; matched case-insensitively by substring.
 * @param model      Model name as stored for the switch.
 * @returns The guessed product page URL, or `null` when no URL pattern is
 *          known for the vendor or the model does not fit the vendor's pattern.
 */
function buildProductPageUrl(vendorName: string, model: string): string | null {
  const lower = vendorName.toLowerCase();

  if (lower.includes("mikrotik")) {
    // MikroTik uses underscored slugs: https://mikrotik.com/product/CRS504_4XQ_IN
    // Some models use hyphens in their name (CRS504-4XQ-IN) but URL uses underscores
    return `https://mikrotik.com/product/${model.replace(/[-\s]+/g, "_")}`;
  }

  if (lower.includes("fortinet")) {
    if (model.startsWith("FortiSwitch")) {
      const num = model.match(/\d+[A-Z]*/)?.[0];
      // Without a model number the slug would be the dead link ".../fortiswitch-".
      if (!num) return null;
      return `https://www.fortinet.com/products/switches/fortiswitch-${num.toLowerCase()}`;
    }
  }

  if (lower.includes("ubiquiti") || lower.includes("ui.com")) {
    return `https://store.ui.com/us/en/products/${model.toLowerCase().replace(/\s+/g, "-")}`;
  }

  if (lower.includes("netgear")) {
    return `https://www.netgear.com/business/wired/switches/${model.toLowerCase()}/`;
  }

  if (lower.includes("allied telesis")) {
    return `https://www.alliedtelesis.com/products/${model.toLowerCase()}`;
  }

  if (lower.includes("tp-link")) {
    return `https://www.tp-link.com/us/business-networking/managed-switch/${model.toLowerCase()}/`;
  }

  if (lower.includes("zyxel")) {
    return `https://www.zyxel.com/products/${model}/`;
  }

  if (lower.includes("moxa")) {
    return `https://www.moxa.com/en/products/industrial-network-infrastructure/ethernet-switches/${model.toLowerCase()}`;
  }

  if (lower.includes("hirschmann") || lower.includes("belden")) {
    return `https://catalog.belden.com/techdata/en/${model.replace(/\s+/g, "_")}_en.html`;
  }

  if (lower.includes("siemens")) {
    return `https://mall.industry.siemens.com/mall/en/WW/Catalog/Products/${model.replace(/\s+/g, "")}`;
  }

  if (lower.includes("phoenix")) {
    return `https://www.phoenixcontact.com/en-us/products/${model.toLowerCase().replace(/\s+/g, "-")}`;
  }

  if (lower.includes("westermo")) {
    return `https://www.westermo.com/products/${model.toLowerCase().replace(/\s+/g, "-")}`;
  }

  if (lower.includes("f5")) {
    return `https://www.f5.com/products/big-ip-services`;
  }

  return null;
}

// ═══════════════════════════════════════════════════════
// Main crawler
// ═══════════════════════════════════════════════════════

/**
 * Crawls product pages for switches that are missing an image or a datasheet
 * and hands every asset link found to the download helpers.
 *
 * @param targetVendor Optional case-insensitive substring filter on the vendor
 *                     name. When omitted or empty, all vendors are considered.
 * @returns Resolves once the crawl has finished and the summary was printed.
 */
export async function crawlSwitchAssets(targetVendor?: string): Promise<void> {
  console.log("=== Switch Assets Crawler (Crawlee/Cheerio) ===\n");

  // Get switches that need asset scraping and have a buildable product page URL.
  // The vendor filter comes from the command line, so it is bound as a query
  // parameter rather than spliced into the SQL text.
  // NOTE(review): assumes `pool` is a node-postgres Pool accepting (text, values) — confirm.
  const params: string[] = [];
  let vendorFilter = "";
  if (targetVendor) {
    params.push(`%${targetVendor}%`);
    vendorFilter = `AND v.name ILIKE $${params.length}`;
  }

  const result = await pool.query(
    `
    SELECT sw.id, sw.model, sw.series, sw.product_page_url,
           v.name as vendor_name, v.id as vendor_id
    FROM switches sw
    JOIN vendors v ON sw.vendor_id = v.id
    WHERE (sw.image_url IS NULL OR sw.datasheet_url IS NULL)
    ${vendorFilter}
    ORDER BY v.name, sw.model
    LIMIT 200
  `,
    params
  );

  if (result.rows.length === 0) {
    console.log("No switches need asset scraping.\n");
    return;
  }

  // Build crawl targets: a stored product page URL wins over a guessed one;
  // rows for which no URL can be determined are skipped.
  const targets: CrawlTarget[] = [];
  for (const row of result.rows) {
    const productPageUrl =
      row.product_page_url || buildProductPageUrl(row.vendor_name, row.model);
    if (!productPageUrl) continue;

    targets.push({
      switchId: row.id,
      vendorId: row.vendor_id,
      vendorName: row.vendor_name,
      model: row.model,
      productPageUrl,
    });
  }

  console.log(`Crawling ${targets.length} product pages...\n`);

  // Counters of helper calls that reported success, printed in the summary.
  let images = 0;
  let datasheets = 0;
  let manuals = 0;

  const crawler = new CheerioCrawler(
    {
      maxConcurrency: 3,
      maxRequestsPerMinute: 20,
      requestHandlerTimeoutSecs: 30,

      async requestHandler({ request, $ }) {
        const target = request.userData as CrawlTarget;
        const parser = getParserForVendor(target.vendorName);
        // Resolve relative links against the final URL when one is available.
        const assets = parser($, request.loadedUrl || request.url);

        console.log(` ${target.vendorName} ${target.model}:`);

        // Set product page URL
        await setSwitchProductPage(target.switchId, request.url);

        // Download image
        if (assets.imageUrl) {
          const ok = await downloadSwitchImage(
            target.switchId,
            assets.imageUrl,
            target.vendorName,
            target.model
          );
          if (ok) {
            images++;
            console.log(` ✓ Image`);
          }
        }

        // Download datasheet
        if (assets.datasheetUrl) {
          const ok = await downloadSwitchDatasheet(
            target.switchId,
            target.vendorId,
            assets.datasheetUrl,
            assets.datasheetTitle || `${target.model} Datasheet`,
            target.vendorName,
            target.model
          );
          if (ok) {
            datasheets++;
            console.log(` ✓ Datasheet`);
          }
        }

        // Download manuals
        for (const manual of assets.manuals) {
          const ok = await downloadSwitchManual(
            target.switchId,
            target.vendorId,
            manual.url,
            manual.title,
            manual.type,
            target.vendorName,
            target.model
          );
          if (ok) {
            manuals++;
            console.log(` ✓ ${manual.type}: ${manual.title}`);
          }
        }
      },

      async failedRequestHandler({ request }) {
        const target = request.userData as CrawlTarget;
        console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
      },
    },
    makeCrawleeConfig("switch-assets")
  );

  await crawler.run(
    targets.map((t) => ({
      url: t.productPageUrl,
      userData: t,
    }))
  );

  console.log(`\n=== Crawl Complete ===`);
  console.log(` Images: ${images}`);
  console.log(` Datasheets: ${datasheets}`);
  console.log(` Manuals: ${manuals}`);
}

// CLI entry point: `--vendor=<name>` optionally restricts the crawl to one vendor.
if (require.main === module) {
  const vendorFlag = "--vendor=";
  const vendorArg = process.argv.find((a) => a.startsWith(vendorFlag));
  // Take everything after the flag so values containing "=" are not truncated.
  const vendor = vendorArg?.slice(vendorFlag.length) || undefined;

  void (async () => {
    let exitCode = 0;

    try {
      await crawlSwitchAssets(vendor);
    } catch (err: unknown) {
      console.error("Fatal:", err);
      exitCode = 1;
    }

    // Close the pool exactly once, and wait for it before a forced exit.
    try {
      await pool.end();
    } catch (err: unknown) {
      console.error("Fatal:", err);
      exitCode = 1;
    }

    if (exitCode !== 0) process.exit(exitCode);
  })();
}