transceiver-db/packages/scraper/src/scrapers/switch-assets-crawler.ts
Rene Fichtmueller 419af4a24e fix: remove all withIsolatedStorage wrappers, add makeCrawleeConfig to remaining Crawlee scrapers
- scheduler.ts: remove withIsolatedStorage from ALL scrapers (atgbics,
  optcore, ufispace, edgecore, ebay-*, market-intel, community-issues,
  cisco, juniper, sonic, 10gtek, prolabs, switch-assets, fs)
  eliminates global CRAWLEE_STORAGE_DIR race condition entirely
- fs-com.ts: replace purgeDefaultStorages() with rmSync on isolated
  storage dirs (fs-phase1, fs-phase2); pass makeCrawleeConfig to both
  PlaywrightCrawler instances
- switch-assets-crawler.ts: add makeCrawleeConfig('switch-assets')
- switch-assets-playwright.ts: add makeCrawleeConfig('switch-assets-playwright')
- naddod.ts: restore clean error logging (remove debug instrumentation)
2026-04-18 02:19:53 +02:00

344 lines
13 KiB
TypeScript

/**
* Switch Assets Crawler — Crawlee-based scraper for product images, datasheets, manuals
*
* Uses CheerioCrawler to visit actual vendor product pages and extract:
* - Product hero images
* - Datasheet PDF download links
* - Manual/Guide links
* - Quick Start Guide links
*
* Handles static HTML pages. For JS-heavy vendors (Cisco, Arista),
* use the PlaywrightCrawler variant or the static URL-pattern scraper.
*/
import { CheerioCrawler, Dataset } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import {
downloadSwitchImage,
downloadSwitchDatasheet,
downloadSwitchManual,
setSwitchProductPage,
} from "../utils/assets";
/**
* One product page to crawl, together with the switch and vendor it belongs to.
* crawlSwitchAssets attaches an instance to every crawl request as `userData`
* and reads it back inside the request handlers.
*/
interface CrawlTarget {
// Id of the switch row (`sw.id`); passed to the asset download helpers.
switchId: string;
// Id of the vendor row (`v.id`); passed along with datasheet and manual downloads.
vendorId: string;
// Vendor display name (`v.name`); also used to pick the page parser.
vendorName: string;
// Switch model string (`sw.model`).
model: string;
// URL to fetch: the stored `product_page_url`, or one derived by buildProductPageUrl.
productPageUrl: string;
}
// ═══════════════════════════════════════════════════════
// Vendor-specific page parsers
// ═══════════════════════════════════════════════════════
/**
* Asset links extracted from a single product page by one of the page parsers.
* The parsers in this file resolve URLs against the page URL before returning them.
*/
interface ParsedAssets {
// Product image URL, if one was found.
imageUrl?: string;
// Datasheet PDF URL, if one was found.
datasheetUrl?: string;
// Title to store with the datasheet; the crawler falls back to "<model> Datasheet" when empty.
datasheetTitle?: string;
// Manual/guide links. `type` is one of the strings the parsers emit:
// "manual", "quick_start", "cli_reference" or "installation_guide".
manuals: Array<{ url: string; title: string; type: string }>;
}
// Signature shared by all vendor page parsers: `$` is the query function for the
// loaded page (the crawler passes its Cheerio handle), `url` is the page URL used
// as the base when resolving relative links.
type PageParser = ($: any, url: string) => ParsedAssets;
/**
 * Extracts the hero image, datasheet link and manual links from a MikroTik
 * product page.
 *
 * Every returned URL is resolved against `baseUrl`, so relative links come back
 * absolute. A link whose href cannot be parsed as a URL is skipped instead of
 * throwing, so one malformed anchor does not discard the rest of the page.
 *
 * @param $ Query function for the loaded page (Cheerio-style API).
 * @param baseUrl URL the page was loaded from; base for relative links.
 * @returns The assets found; `manuals` is empty when nothing matched.
 */
function parseMikroTikPage($: any, baseUrl: string): ParsedAssets {
  // `new URL` throws on malformed input; treat such links as "not found".
  const toAbsolute = (href: string | undefined): string | undefined => {
    if (!href) return undefined;
    try {
      return new URL(href, baseUrl).toString();
    } catch {
      return undefined;
    }
  };

  const manuals: ParsedAssets["manuals"] = [];

  // MikroTik product images are on cdn.mikrotik.com with unpredictable numeric IDs.
  // Look for: og:image, large product images in gallery, or CDN URLs.
  const ogImage: string | undefined = $('meta[property="og:image"]').attr("content");
  const galleryImage: string | undefined = $(
    ".product-image img, #gallery img, .product-hero img, .product_image img, img[src*='cdn.mikrotik.com']"
  ).first().attr("src");
  // Also check for large images in the page body ("_lg" / "_hi" variants on the CDN).
  const bodyImage: string | undefined = $("img").filter((_: unknown, el: unknown) => {
    const src: string = $(el).attr("src") || "";
    return src.includes("cdn.mikrotik.com") && (src.includes("_lg") || src.includes("_hi"));
  }).first().attr("src");

  // Same preference order as before (og:image, body image, gallery image), but a
  // candidate that fails to resolve no longer blocks the ones after it.
  const imageUrl = [ogImage, bodyImage, galleryImage]
    .map(toAbsolute)
    .find((candidate) => candidate !== undefined);

  // Datasheets — MikroTik PDFs on cdn.mikrotik.com/web-assets/product_files/
  const datasheetHref: string | undefined = $('a[href*=".pdf"]').filter((_: unknown, el: unknown) => {
    const text: string = $(el).text().toLowerCase();
    const href: string = $(el).attr("href")?.toLowerCase() || "";
    return text.includes("datasheet") || text.includes("data sheet") || text.includes("brochure")
      || href.includes("datasheet") || href.includes("product_files");
  }).first().attr("href");
  const datasheetUrl = toAbsolute(datasheetHref);

  // Manuals — check help.mikrotik.com links and PDFs
  $('a[href*=".pdf"], a[href*="help.mikrotik.com"]').each((_: unknown, el: unknown) => {
    const href: string | undefined = $(el).attr("href");
    const text: string = $(el).text().trim();
    if (!href || !text) return;
    const lower = text.toLowerCase();
    if (lower.includes("manual") || lower.includes("guide") || lower.includes("quick start")) {
      const url = toAbsolute(href);
      if (!url) return; // malformed link: skip it, keep the others
      const type = lower.includes("quick start") ? "quick_start" : "manual";
      manuals.push({ url, title: text, type });
    }
  });

  return {
    imageUrl,
    datasheetUrl,
    datasheetTitle: datasheetUrl ? "Product Datasheet" : undefined,
    manuals,
  };
}
/**
 * Extracts the product image, datasheet link and documentation links from a
 * Fortinet product page.
 *
 * All returned URLs — including the docs.fortinet.com manual links — are
 * resolved against `baseUrl`, so protocol-relative or relative hrefs come back
 * absolute. Hrefs that cannot be parsed as a URL are skipped rather than thrown on.
 *
 * @param $ Query function for the loaded page (Cheerio-style API).
 * @param baseUrl URL the page was loaded from; base for relative links.
 * @returns The assets found; `manuals` is empty when nothing matched.
 */
function parseFortinetPage($: any, baseUrl: string): ParsedAssets {
  // `new URL` throws on malformed input; treat such links as "not found".
  const toAbsolute = (href: string | undefined): string | undefined => {
    if (!href) return undefined;
    try {
      return new URL(href, baseUrl).toString();
    } catch {
      return undefined;
    }
  };

  const manuals: ParsedAssets["manuals"] = [];

  const imageHref: string | undefined = $('meta[property="og:image"]').attr("content")
    || $(".product-image img, .hero-image img").first().attr("src");

  const datasheetHref: string | undefined = $('a[href*=".pdf"]').filter((_: unknown, el: unknown) => {
    const text: string = $(el).text().toLowerCase();
    const href: string = $(el).attr("href")?.toLowerCase() || "";
    return text.includes("datasheet") || text.includes("data-sheet") || href.includes("data-sheet");
  }).first().attr("href");

  // Every link into docs.fortinet.com with visible text is recorded as a manual.
  $('a[href*="docs.fortinet.com"]').each((_: unknown, el: unknown) => {
    const text: string = $(el).text().trim();
    const url = toAbsolute($(el).attr("href"));
    if (url && text) {
      manuals.push({ url, title: text, type: "manual" });
    }
  });

  return {
    imageUrl: toAbsolute(imageHref),
    datasheetUrl: toAbsolute(datasheetHref),
    datasheetTitle: "FortiSwitch Datasheet",
    manuals,
  };
}
/**
 * Fallback parser for vendors without a dedicated parser: extracts a product
 * image, a datasheet PDF and manual PDFs using generic selectors.
 *
 * All returned URLs are resolved against `baseUrl`; hrefs that cannot be parsed
 * as a URL are skipped rather than thrown on. "cli" is matched as a whole word,
 * so link texts such as "Click to download" or "Client brochure" are not
 * mistaken for CLI references.
 *
 * @param $ Query function for the loaded page (Cheerio-style API).
 * @param baseUrl URL the page was loaded from; base for relative links.
 * @returns The assets found; `manuals` is empty when nothing matched.
 */
function parseGenericPage($: any, baseUrl: string): ParsedAssets {
  // `new URL` throws on malformed input; treat such links as "not found".
  const toAbsolute = (href: string | undefined): string | undefined => {
    if (!href) return undefined;
    try {
      return new URL(href, baseUrl).toString();
    } catch {
      return undefined;
    }
  };

  const manuals: ParsedAssets["manuals"] = [];

  // Generic image extraction
  const imageHref: string | undefined = $('meta[property="og:image"]').attr("content")
    || $(".product-image img, .hero img, .product-photo img, main img").first().attr("src");

  // Generic datasheet extraction — look for PDF links with "datasheet" in text or URL
  const datasheetHref: string | undefined = $('a[href$=".pdf"]').filter((_: unknown, el: unknown) => {
    const text: string = $(el).text().toLowerCase();
    const href: string = $(el).attr("href")?.toLowerCase() || "";
    return text.includes("datasheet") || text.includes("data sheet")
      || href.includes("datasheet") || href.includes("data-sheet");
  }).first().attr("href");

  // Generic manual extraction
  $('a[href$=".pdf"]').each((_: unknown, el: unknown) => {
    const href: string | undefined = $(el).attr("href");
    const text: string = $(el).text().trim();
    if (!href || !text) return;

    const lower = text.toLowerCase();
    const isQuickStart = lower.includes("quick start");
    const isInstallation = lower.includes("installation");
    // Whole-word match: a plain substring test also hits "click", "client", ...
    const isCli = /\bcli\b/.test(lower);
    const isManual = lower.includes("manual") || lower.includes("guide")
      || lower.includes("configuration") || isInstallation || isQuickStart || isCli;
    if (!isManual) return;

    const url = toAbsolute(href);
    if (!url) return; // malformed link: skip it, keep the others

    // Precedence when several keywords appear: installation > cli > quick start.
    let type = "manual";
    if (isInstallation) type = "installation_guide";
    else if (isCli) type = "cli_reference";
    else if (isQuickStart) type = "quick_start";
    manuals.push({ url, title: text, type });
  });

  return {
    imageUrl: toAbsolute(imageHref),
    datasheetUrl: toAbsolute(datasheetHref),
    datasheetTitle: "Product Datasheet",
    manuals,
  };
}
/**
 * Picks the page parser for a vendor.
 *
 * The vendor name is matched case-insensitively by substring; vendors without
 * a dedicated parser get the generic one.
 */
function getParserForVendor(vendorName: string): PageParser {
  const needle = vendorName.toLowerCase();
  // Checked in order; the first keyword contained in the vendor name wins.
  const dedicatedParsers: Array<[string, PageParser]> = [
    ["mikrotik", parseMikroTikPage],
    ["fortinet", parseFortinetPage],
  ];
  for (const [keyword, parser] of dedicatedParsers) {
    if (needle.includes(keyword)) {
      return parser;
    }
  }
  return parseGenericPage;
}
// ═══════════════════════════════════════════════════════
// Known vendor product page URL builders
// ═══════════════════════════════════════════════════════
/**
 * Derives a vendor product-page URL from the vendor name and switch model.
 *
 * The vendor is matched case-insensitively by substring. The URL is a
 * best-effort guess based on each vendor's URL pattern; nothing here checks
 * that the page exists.
 *
 * @param vendorName Vendor display name.
 * @param model Switch model string as stored for the switch.
 * @returns The guessed URL, or `null` when the vendor has no known pattern or
 *          the model does not fit it (e.g. a Fortinet model without a number).
 */
function buildProductPageUrl(vendorName: string, model: string): string | null {
  const lower = vendorName.toLowerCase();
  if (lower.includes("mikrotik")) {
    // MikroTik uses underscored slugs: https://mikrotik.com/product/CRS504_4XQ_IN
    // Some models use hyphens in their name (CRS504-4XQ-IN) but URL uses underscores
    return `https://mikrotik.com/product/${model.replace(/[-\s]+/g, "_")}`;
  }
  if (lower.includes("fortinet")) {
    if (model.startsWith("FortiSwitch")) {
      const num = model.match(/\d+[A-Z]*/)?.[0];
      // Without a model number the slug would be the dangling "fortiswitch-",
      // so only build a URL when a number was actually found.
      if (num) {
        return `https://www.fortinet.com/products/switches/fortiswitch-${num.toLowerCase()}`;
      }
    }
  }
  if (lower.includes("ubiquiti") || lower.includes("ui.com")) {
    return `https://store.ui.com/us/en/products/${model.toLowerCase().replace(/\s+/g, "-")}`;
  }
  if (lower.includes("netgear")) {
    return `https://www.netgear.com/business/wired/switches/${model.toLowerCase()}/`;
  }
  if (lower.includes("allied telesis")) {
    return `https://www.alliedtelesis.com/products/${model.toLowerCase()}`;
  }
  if (lower.includes("tp-link")) {
    return `https://www.tp-link.com/us/business-networking/managed-switch/${model.toLowerCase()}/`;
  }
  if (lower.includes("zyxel")) {
    return `https://www.zyxel.com/products/${model}/`;
  }
  if (lower.includes("moxa")) {
    return `https://www.moxa.com/en/products/industrial-network-infrastructure/ethernet-switches/${model.toLowerCase()}`;
  }
  if (lower.includes("hirschmann") || lower.includes("belden")) {
    return `https://catalog.belden.com/techdata/en/${model.replace(/\s+/g, "_")}_en.html`;
  }
  if (lower.includes("siemens")) {
    return `https://mall.industry.siemens.com/mall/en/WW/Catalog/Products/${model.replace(/\s+/g, "")}`;
  }
  if (lower.includes("phoenix")) {
    return `https://www.phoenixcontact.com/en-us/products/${model.toLowerCase().replace(/\s+/g, "-")}`;
  }
  if (lower.includes("westermo")) {
    return `https://www.westermo.com/products/${model.toLowerCase().replace(/\s+/g, "-")}`;
  }
  if (lower.includes("f5")) {
    // Same landing page for every F5 model; the model is not part of the URL.
    return `https://www.f5.com/products/big-ip-services`;
  }
  return null;
}
// ═══════════════════════════════════════════════════════
// Main crawler
// ═══════════════════════════════════════════════════════
/**
 * Crawls vendor product pages for switches that are missing an image or a
 * datasheet, and hands every asset found to the download helpers.
 *
 * Steps:
 *  1. Select up to 200 switches with `image_url` or `datasheet_url` unset,
 *     optionally restricted to vendors whose name contains `targetVendor`.
 *  2. Use the stored `product_page_url`, or derive one with buildProductPageUrl;
 *     switches without either are skipped.
 *  3. Fetch each page with CheerioCrawler, parse it with the vendor's parser,
 *     record the product page URL and download image, datasheet and manuals.
 *  4. Log per-asset progress and a final summary.
 *
 * @param targetVendor Optional case-insensitive substring of the vendor name.
 *        It is passed to the database as a bound parameter, never spliced into
 *        the SQL text.
 */
export async function crawlSwitchAssets(targetVendor?: string): Promise<void> {
  console.log("=== Switch Assets Crawler (Crawlee/Cheerio) ===\n");

  // Get switches that need asset scraping and have a buildable product page URL.
  // The vendor filter uses a placeholder so user input cannot alter the query.
  const queryParams: string[] = [];
  let vendorFilter = "";
  if (targetVendor) {
    queryParams.push(`%${targetVendor}%`);
    vendorFilter = "AND v.name ILIKE $1";
  }
  const result = await pool.query(
    `
    SELECT sw.id, sw.model, sw.series, sw.product_page_url,
           v.name as vendor_name, v.id as vendor_id
    FROM switches sw
    JOIN vendors v ON sw.vendor_id = v.id
    WHERE (sw.image_url IS NULL OR sw.datasheet_url IS NULL)
    ${vendorFilter}
    ORDER BY v.name, sw.model
    LIMIT 200
    `,
    queryParams
  );

  if (result.rows.length === 0) {
    console.log("No switches need asset scraping.\n");
    return;
  }

  // Build crawl targets
  const targets: CrawlTarget[] = [];
  for (const row of result.rows) {
    const productPageUrl = row.product_page_url || buildProductPageUrl(row.vendor_name, row.model);
    if (!productPageUrl) continue; // no known URL pattern for this vendor/model
    targets.push({
      switchId: row.id,
      vendorId: row.vendor_id,
      vendorName: row.vendor_name,
      model: row.model,
      productPageUrl,
    });
  }

  console.log(`Crawling ${targets.length} product pages...\n`);

  // Summary counters; incremented only when a download helper returns a truthy result.
  let images = 0;
  let datasheets = 0;
  let manuals = 0;

  const crawler = new CheerioCrawler({
    maxConcurrency: 3,
    maxRequestsPerMinute: 20,
    requestHandlerTimeoutSecs: 30,

    async requestHandler({ request, $ }) {
      const target = request.userData as CrawlTarget;
      const parser = getParserForVendor(target.vendorName);
      // Prefer the post-redirect URL as base so relative links resolve correctly.
      const assets = parser($, request.loadedUrl || request.url);

      console.log(` ${target.vendorName} ${target.model}:`);

      // Set product page URL
      await setSwitchProductPage(target.switchId, request.url);

      // Download image
      if (assets.imageUrl) {
        const ok = await downloadSwitchImage(
          target.switchId, assets.imageUrl, target.vendorName, target.model
        );
        if (ok) {
          images++;
          console.log(` ✓ Image`);
        }
      }

      // Download datasheet
      if (assets.datasheetUrl) {
        const ok = await downloadSwitchDatasheet(
          target.switchId, target.vendorId, assets.datasheetUrl,
          assets.datasheetTitle || `${target.model} Datasheet`,
          target.vendorName, target.model
        );
        if (ok) {
          datasheets++;
          console.log(` ✓ Datasheet`);
        }
      }

      // Download manuals
      for (const manual of assets.manuals) {
        const ok = await downloadSwitchManual(
          target.switchId, target.vendorId, manual.url,
          manual.title, manual.type, target.vendorName, target.model
        );
        if (ok) {
          manuals++;
          console.log(`${manual.type}: ${manual.title}`);
        }
      }
    },

    // Invoked by the crawler for a request that ultimately failed; log and move on.
    async failedRequestHandler({ request }) {
      const target = request.userData as CrawlTarget;
      console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
    },
  }, makeCrawleeConfig("switch-assets"));

  await crawler.run(
    targets.map((t) => ({
      url: t.productPageUrl,
      userData: t,
    }))
  );

  console.log(`\n=== Crawl Complete ===`);
  console.log(` Images: ${images}`);
  console.log(` Datasheets: ${datasheets}`);
  console.log(` Manuals: ${manuals}`);
}
// CLI entry point: runs the crawl when this file is executed directly.
// An optional `--vendor=<name>` argument restricts the crawl to matching vendors.
if (require.main === module) {
  const vendorFlag = "--vendor=";
  // Slice off the flag prefix instead of splitting on "=", so a vendor value
  // that itself contains "=" is passed through whole.
  const vendor = process.argv.find((a) => a.startsWith(vendorFlag))?.slice(vendorFlag.length);
  crawlSwitchAssets(vendor)
    .then(() => pool.end())
    .catch((err) => { console.error("Fatal:", err); pool.end(); process.exit(1); });
}