- scheduler.ts: remove withIsolatedStorage from ALL scrapers (atgbics,
optcore, ufispace, edgecore, ebay-*, market-intel, community-issues,
cisco, juniper, sonic, 10gtek, prolabs, switch-assets, fs); this
eliminates the global CRAWLEE_STORAGE_DIR race condition entirely
- fs-com.ts: replace purgeDefaultStorages() with rmSync on isolated
storage dirs (fs-phase1, fs-phase2); pass makeCrawleeConfig to both
PlaywrightCrawler instances
- switch-assets-crawler.ts: add makeCrawleeConfig('switch-assets')
- switch-assets-playwright.ts: add makeCrawleeConfig('switch-assets-playwright')
- naddod.ts: restore clean error logging (remove debug instrumentation)
344 lines
13 KiB
TypeScript
344 lines
13 KiB
TypeScript
/**
 * Switch Assets Crawler — Crawlee-based scraper for product images, datasheets, manuals
 *
 * Uses CheerioCrawler to visit actual vendor product pages and extract:
 * - Product hero images
 * - Datasheet PDF download links
 * - Manual/Guide links
 * - Quick Start Guide links
 *
 * Handles static HTML pages. For JS-heavy vendors (Cisco, Arista),
 * use the PlaywrightCrawler variant or the static URL-pattern scraper.
 */
|
|
import { CheerioCrawler, Dataset } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import {
  downloadSwitchImage,
  downloadSwitchDatasheet,
  downloadSwitchManual,
  setSwitchProductPage,
} from "../utils/assets";
|
|
|
|
/**
 * One product page queued for crawling. An instance is attached to each
 * Crawlee request as `userData` and read back inside the request handlers.
 */
interface CrawlTarget {
  /** `switches.id` of the switch whose assets are being collected. */
  switchId: string;
  /** `vendors.id` of the switch's vendor. */
  vendorId: string;
  /** Vendor display name; also used to choose the page parser. */
  vendorName: string;
  /** Switch model name as stored in the database. */
  model: string;
  /** Product page to fetch: the stored URL if present, otherwise a built one. */
  productPageUrl: string;
}
|
|
|
|
// ═══════════════════════════════════════════════════════
// Vendor-specific page parsers
// ═══════════════════════════════════════════════════════
|
|
|
|
/** Asset links a page parser extracted from a single product page. */
interface ParsedAssets {
  /** Absolute URL of the product image, when one was found. */
  imageUrl?: string;
  /** Absolute URL of the datasheet PDF, when one was found. */
  datasheetUrl?: string;
  /** Title to store with the datasheet. */
  datasheetTitle?: string;
  /**
   * Manual-like documents found on the page. The parsers in this file emit the
   * `type` values "manual", "quick_start", "cli_reference" and "installation_guide".
   */
  manuals: Array<{ url: string; title: string; type: string }>;
}
|
|
|
|
/**
 * Signature shared by the vendor page parsers: takes the Cheerio handle of the
 * loaded page and the page URL (used to resolve relative links) and returns
 * the assets found on that page.
 */
type PageParser = ($: any, url: string) => ParsedAssets;
|
|
|
|
/**
 * Extracts the product image, datasheet PDF and manual links from a MikroTik
 * product page.
 *
 * Links that cannot be resolved to an absolute http(s) URL are skipped instead
 * of throwing, and each manual URL is reported at most once.
 *
 * @param $ Cheerio handle for the loaded page.
 * @param baseUrl URL of the page; relative links are resolved against it.
 * @returns The assets found. `imageUrl`, `datasheetUrl` and `datasheetTitle`
 *   are `undefined` when nothing usable was found.
 */
function parseMikroTikPage($: any, baseUrl: string): ParsedAssets {
  // Resolve a raw attribute value against the page URL. Returns undefined for
  // malformed values and non-web schemes (data:, javascript:, mailto:) so a
  // single bad link cannot abort parsing of the whole page.
  const toAbsolute = (raw: string | undefined): string | undefined => {
    if (!raw) return undefined;
    try {
      const resolved = new URL(raw, baseUrl);
      const isWeb = resolved.protocol === "http:" || resolved.protocol === "https:";
      return isWeb ? resolved.toString() : undefined;
    } catch {
      return undefined;
    }
  };

  // MikroTik product images are on cdn.mikrotik.com with unpredictable numeric IDs.
  // Candidates, in priority order: og:image, large CDN images in the body, gallery images.
  const ogImage: string | undefined = $('meta[property="og:image"]').attr("content");
  const galleryImage: string | undefined = $(
    ".product-image img, #gallery img, .product-hero img, .product_image img, img[src*='cdn.mikrotik.com']",
  )
    .first()
    .attr("src");
  const bodyImage: string | undefined = $("img")
    .filter((_: number, el: any) => {
      const src: string = $(el).attr("src") || "";
      return src.includes("cdn.mikrotik.com") && (src.includes("_lg") || src.includes("_hi"));
    })
    .first()
    .attr("src");
  // Take the first candidate that resolves, so an unusable og:image does not
  // hide a usable gallery image.
  const imageUrl = [ogImage, bodyImage, galleryImage]
    .map((candidate) => toAbsolute(candidate))
    .find((candidate) => candidate !== undefined);

  // Datasheets — MikroTik PDFs on cdn.mikrotik.com/web-assets/product_files/
  const datasheetUrl = toAbsolute(
    $('a[href*=".pdf"]')
      .filter((_: number, el: any) => {
        const text: string = $(el).text().toLowerCase();
        const href: string = $(el).attr("href")?.toLowerCase() || "";
        return (
          text.includes("datasheet") ||
          text.includes("data sheet") ||
          text.includes("brochure") ||
          href.includes("datasheet") ||
          href.includes("product_files")
        );
      })
      .first()
      .attr("href"),
  );

  // Manuals — help.mikrotik.com links and PDFs whose link text names a manual or guide.
  const manuals: ParsedAssets["manuals"] = [];
  const seenManualUrls = new Set<string>();
  $('a[href*=".pdf"], a[href*="help.mikrotik.com"]').each((_: number, el: any) => {
    const link = $(el);
    const title: string = link.text().trim();
    const url = toAbsolute(link.attr("href"));
    if (!title || !url || seenManualUrls.has(url)) return;

    const lower = title.toLowerCase();
    const isManual =
      lower.includes("manual") || lower.includes("guide") || lower.includes("quick start");
    if (!isManual) return;

    seenManualUrls.add(url);
    manuals.push({
      url,
      title,
      type: lower.includes("quick start") ? "quick_start" : "manual",
    });
  });

  return {
    imageUrl,
    datasheetUrl,
    datasheetTitle: datasheetUrl ? "Product Datasheet" : undefined,
    manuals,
  };
}
|
|
|
|
/**
 * Extracts the product image, datasheet PDF and documentation links from a
 * Fortinet product page.
 *
 * Every link pointing at docs.fortinet.com is reported as a manual. Links are
 * resolved against the page URL (so protocol-relative hrefs become absolute),
 * unresolvable links are skipped, and each manual URL is reported once.
 *
 * @param $ Cheerio handle for the loaded page.
 * @param baseUrl URL of the page; relative links are resolved against it.
 * @returns The assets found. `imageUrl`, `datasheetUrl` and `datasheetTitle`
 *   are `undefined` when nothing usable was found.
 */
function parseFortinetPage($: any, baseUrl: string): ParsedAssets {
  // Resolve a raw attribute value against the page URL. Returns undefined for
  // malformed values and non-web schemes so a bad link cannot abort parsing.
  const toAbsolute = (raw: string | undefined): string | undefined => {
    if (!raw) return undefined;
    try {
      const resolved = new URL(raw, baseUrl);
      const isWeb = resolved.protocol === "http:" || resolved.protocol === "https:";
      return isWeb ? resolved.toString() : undefined;
    } catch {
      return undefined;
    }
  };

  // Image candidates in priority order: og:image, then the first hero/product image.
  const ogImage: string | undefined = $('meta[property="og:image"]').attr("content");
  const heroImage: string | undefined = $(".product-image img, .hero-image img")
    .first()
    .attr("src");
  const imageUrl = [ogImage, heroImage]
    .map((candidate) => toAbsolute(candidate))
    .find((candidate) => candidate !== undefined);

  // Datasheet: first PDF link whose text or href names a data sheet.
  // "data sheet" (with a space) is matched too, as the other parsers do.
  const datasheetUrl = toAbsolute(
    $('a[href*=".pdf"]')
      .filter((_: number, el: any) => {
        const text: string = $(el).text().toLowerCase();
        const href: string = $(el).attr("href")?.toLowerCase() || "";
        return (
          text.includes("datasheet") ||
          text.includes("data sheet") ||
          text.includes("data-sheet") ||
          href.includes("data-sheet")
        );
      })
      .first()
      .attr("href"),
  );

  const manuals: ParsedAssets["manuals"] = [];
  const seenManualUrls = new Set<string>();
  $('a[href*="docs.fortinet.com"]').each((_: number, el: any) => {
    const link = $(el);
    const title: string = link.text().trim();
    const url = toAbsolute(link.attr("href"));
    if (!title || !url || seenManualUrls.has(url)) return;

    seenManualUrls.add(url);
    manuals.push({ url, title, type: "manual" });
  });

  return {
    imageUrl,
    datasheetUrl,
    // Only label a datasheet that was actually found.
    datasheetTitle: datasheetUrl ? "FortiSwitch Datasheet" : undefined,
    manuals,
  };
}
|
|
|
|
/**
 * Fallback parser for vendors without a dedicated parser: extracts a product
 * image, a datasheet PDF and manual-like PDFs using generic heuristics.
 *
 * PDF links are recognised case-insensitively and with a trailing query string
 * or fragment (`.PDF`, `.pdf?v=2`). Links that cannot be resolved to an
 * absolute http(s) URL are skipped, and each manual URL is reported once.
 *
 * @param $ Cheerio handle for the loaded page.
 * @param baseUrl URL of the page; relative links are resolved against it.
 * @returns The assets found. `imageUrl`, `datasheetUrl` and `datasheetTitle`
 *   are `undefined` when nothing usable was found.
 */
function parseGenericPage($: any, baseUrl: string): ParsedAssets {
  // Resolve a raw attribute value against the page URL. Returns undefined for
  // malformed values and non-web schemes so a bad link cannot abort parsing.
  const toAbsolute = (raw: string | undefined): string | undefined => {
    if (!raw) return undefined;
    try {
      const resolved = new URL(raw, baseUrl);
      const isWeb = resolved.protocol === "http:" || resolved.protocol === "https:";
      return isWeb ? resolved.toString() : undefined;
    } catch {
      return undefined;
    }
  };

  // ".pdf" at the end of the href, or directly before a query string / fragment.
  const pdfHref = /\.pdf(?:$|[?#])/i;
  // Whole-word match so "click" or "client" are not mistaken for a CLI reference.
  const cliWord = /\bcli\b/;

  // Generic image extraction
  const ogImage: string | undefined = $('meta[property="og:image"]').attr("content");
  const pageImage: string | undefined = $(
    ".product-image img, .hero img, .product-photo img, main img",
  )
    .first()
    .attr("src");
  const imageUrl = [ogImage, pageImage]
    .map((candidate) => toAbsolute(candidate))
    .find((candidate) => candidate !== undefined);

  // Collect every usable PDF link once, in document order.
  const pdfLinks: Array<{ url: string; href: string; title: string }> = [];
  $("a[href]").each((_: number, el: any) => {
    const link = $(el);
    const href: string = link.attr("href") || "";
    if (!pdfHref.test(href)) return;
    const url = toAbsolute(href);
    if (!url) return;
    pdfLinks.push({ url, href: href.toLowerCase(), title: link.text().trim() });
  });

  // Generic datasheet extraction — first PDF with "datasheet" in its text or URL
  const datasheet = pdfLinks.find(({ href, title }) => {
    const text = title.toLowerCase();
    return (
      text.includes("datasheet") ||
      text.includes("data sheet") ||
      href.includes("datasheet") ||
      href.includes("data-sheet")
    );
  });
  const datasheetUrl = datasheet?.url;

  // Generic manual extraction
  const manuals: ParsedAssets["manuals"] = [];
  const seenManualUrls = new Set<string>();
  for (const { url, title } of pdfLinks) {
    if (!title || seenManualUrls.has(url)) continue;

    const lower = title.toLowerCase();
    const isCli = cliWord.test(lower);
    const isManual =
      lower.includes("manual") ||
      lower.includes("guide") ||
      lower.includes("installation") ||
      lower.includes("configuration") ||
      lower.includes("quick start") ||
      isCli;
    if (!isManual) continue;

    // Later checks win: installation > cli > quick start > plain manual.
    let type = "manual";
    if (lower.includes("quick start")) type = "quick_start";
    if (isCli) type = "cli_reference";
    if (lower.includes("installation")) type = "installation_guide";

    seenManualUrls.add(url);
    manuals.push({ url, title, type });
  }

  return {
    imageUrl,
    datasheetUrl,
    // Only label a datasheet that was actually found.
    datasheetTitle: datasheetUrl ? "Product Datasheet" : undefined,
    manuals,
  };
}
|
|
|
|
/**
 * Chooses the page parser for a vendor by case-insensitive substring match on
 * the vendor name.
 *
 * @param vendorName Vendor display name.
 * @returns The MikroTik or Fortinet parser when the name contains that vendor,
 *   otherwise the generic parser.
 */
function getParserForVendor(vendorName: string): PageParser {
  const lower = vendorName.toLowerCase();
  // Checked in order; the first needle contained in the vendor name wins.
  const dedicatedParsers: ReadonlyArray<readonly [string, PageParser]> = [
    ["mikrotik", parseMikroTikPage],
    ["fortinet", parseFortinetPage],
  ];
  const match = dedicatedParsers.find(([needle]) => lower.includes(needle));
  return match ? match[1] : parseGenericPage;
}
|
|
|
|
// ═══════════════════════════════════════════════════════
// Known vendor product page URL builders
// ═══════════════════════════════════════════════════════
|
|
|
|
/**
 * Builds the best-guess product page URL for a switch from its vendor and model.
 *
 * The vendor is matched by case-insensitive substring. The model is trimmed,
 * and whitespace, `?` and `#` left in a path segment are percent-encoded so
 * they cannot truncate or corrupt the URL.
 *
 * @param vendorName Vendor display name.
 * @param model Switch model name.
 * @returns The guessed URL, or `null` when the vendor has no URL pattern, the
 *   model is blank, or (Fortinet) no FortiSwitch model number can be extracted.
 */
function buildProductPageUrl(vendorName: string, model: string): string | null {
  const vendor = vendorName.toLowerCase();
  const name = model.trim();
  if (!name) return null;

  // Percent-encode only the characters that would break a path segment; every
  // other character (including "+", common in model names) is left unchanged.
  const seg = (value: string): string =>
    value.replace(/[\s?#]/g, (ch) => encodeURIComponent(ch));
  const lowerName = name.toLowerCase();
  const hyphenated = seg(lowerName.replace(/\s+/g, "-"));

  if (vendor.includes("mikrotik")) {
    // MikroTik uses underscored slugs: https://mikrotik.com/product/CRS504_4XQ_IN
    // Some models use hyphens in their name (CRS504-4XQ-IN) but URL uses underscores
    return `https://mikrotik.com/product/${seg(name.replace(/[-\s]+/g, "_"))}`;
  }
  if (vendor.includes("fortinet") && name.startsWith("FortiSwitch")) {
    const num = name.match(/\d+[A-Z]*/)?.[0];
    // Without a model number the URL would end in "fortiswitch-"; fall through to null.
    if (num) {
      return `https://www.fortinet.com/products/switches/fortiswitch-${num.toLowerCase()}`;
    }
  }
  if (vendor.includes("ubiquiti") || vendor.includes("ui.com")) {
    return `https://store.ui.com/us/en/products/${hyphenated}`;
  }
  if (vendor.includes("netgear")) {
    return `https://www.netgear.com/business/wired/switches/${seg(lowerName)}/`;
  }
  if (vendor.includes("allied telesis")) {
    return `https://www.alliedtelesis.com/products/${seg(lowerName)}`;
  }
  if (vendor.includes("tp-link")) {
    return `https://www.tp-link.com/us/business-networking/managed-switch/${seg(lowerName)}/`;
  }
  if (vendor.includes("zyxel")) {
    return `https://www.zyxel.com/products/${seg(name)}/`;
  }
  if (vendor.includes("moxa")) {
    return `https://www.moxa.com/en/products/industrial-network-infrastructure/ethernet-switches/${seg(lowerName)}`;
  }
  if (vendor.includes("hirschmann") || vendor.includes("belden")) {
    return `https://catalog.belden.com/techdata/en/${seg(name.replace(/\s+/g, "_"))}_en.html`;
  }
  if (vendor.includes("siemens")) {
    return `https://mall.industry.siemens.com/mall/en/WW/Catalog/Products/${seg(name.replace(/\s+/g, ""))}`;
  }
  if (vendor.includes("phoenix")) {
    return `https://www.phoenixcontact.com/en-us/products/${hyphenated}`;
  }
  if (vendor.includes("westermo")) {
    return `https://www.westermo.com/products/${hyphenated}`;
  }
  if (vendor.includes("f5")) {
    // Single landing page shared by every F5 model.
    return `https://www.f5.com/products/big-ip-services`;
  }

  return null;
}
|
|
|
|
// ═══════════════════════════════════════════════════════
// Main crawler
// ═══════════════════════════════════════════════════════
|
|
|
|
/**
 * Crawls vendor product pages for switches that lack an image or a datasheet
 * and stores whatever assets the pages expose.
 *
 * Selects up to 200 switches with `image_url` or `datasheet_url` NULL, works
 * out a product page URL for each (stored value, else a vendor URL pattern),
 * fetches the pages with CheerioCrawler, and passes the links found to the
 * download helpers. Summary counts are printed at the end.
 *
 * @param targetVendor Optional case-insensitive substring; when given, only
 *   vendors whose name contains it are crawled.
 */
export async function crawlSwitchAssets(targetVendor?: string): Promise<void> {
  console.log("=== Switch Assets Crawler (Crawlee/Cheerio) ===\n");

  // Get switches that need asset scraping. The vendor filter is bound as a
  // query parameter: targetVendor can come straight from the command line and
  // must never be spliced into the SQL text.
  const queryParams: string[] = [];
  let vendorFilter = "";
  if (targetVendor) {
    queryParams.push(`%${targetVendor}%`);
    vendorFilter = `AND v.name ILIKE $${queryParams.length}`;
  }

  const result = await pool.query(
    `
    SELECT sw.id, sw.model, sw.series, sw.product_page_url,
           v.name as vendor_name, v.id as vendor_id
    FROM switches sw
    JOIN vendors v ON sw.vendor_id = v.id
    WHERE (sw.image_url IS NULL OR sw.datasheet_url IS NULL)
      ${vendorFilter}
    ORDER BY v.name, sw.model
    LIMIT 200
  `,
    queryParams,
  );

  if (result.rows.length === 0) {
    console.log("No switches need asset scraping.\n");
    return;
  }

  // Build crawl targets; rows without a stored or buildable product page URL are skipped.
  const targets: CrawlTarget[] = [];
  for (const row of result.rows) {
    const productPageUrl = row.product_page_url || buildProductPageUrl(row.vendor_name, row.model);
    if (!productPageUrl) continue;

    targets.push({
      switchId: row.id,
      vendorId: row.vendor_id,
      vendorName: row.vendor_name,
      model: row.model,
      productPageUrl,
    });
  }

  console.log(`Crawling ${targets.length} product pages...\n`);

  let images = 0;
  let datasheets = 0;
  let manuals = 0;

  const crawler = new CheerioCrawler({
    maxConcurrency: 3,
    maxRequestsPerMinute: 20,
    requestHandlerTimeoutSecs: 30,

    async requestHandler({ request, $ }) {
      const target = request.userData as CrawlTarget;
      const parser = getParserForVendor(target.vendorName);
      // Resolve relative links against the final URL after any redirects.
      const assets = parser($, request.loadedUrl || request.url);

      console.log(`  ${target.vendorName} ${target.model}:`);

      // Set product page URL
      await setSwitchProductPage(target.switchId, request.url);

      // Download image
      if (assets.imageUrl) {
        const ok = await downloadSwitchImage(
          target.switchId, assets.imageUrl, target.vendorName, target.model
        );
        if (ok) {
          images++;
          console.log(`    ✓ Image`);
        }
      }

      // Download datasheet
      if (assets.datasheetUrl) {
        const ok = await downloadSwitchDatasheet(
          target.switchId, target.vendorId, assets.datasheetUrl,
          assets.datasheetTitle || `${target.model} Datasheet`,
          target.vendorName, target.model
        );
        if (ok) {
          datasheets++;
          console.log(`    ✓ Datasheet`);
        }
      }

      // Download manuals
      for (const manual of assets.manuals) {
        const ok = await downloadSwitchManual(
          target.switchId, target.vendorId, manual.url,
          manual.title, manual.type, target.vendorName, target.model
        );
        if (ok) {
          manuals++;
          console.log(`    ✓ ${manual.type}: ${manual.title}`);
        }
      }
    },

    async failedRequestHandler({ request }) {
      const target = request.userData as CrawlTarget;
      console.log(`  [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
    },
  }, makeCrawleeConfig("switch-assets"));

  await crawler.run(
    targets.map((t) => ({
      url: t.productPageUrl,
      // Crawlee de-duplicates requests by uniqueKey, which defaults to the URL.
      // Several switches can share one product page (every F5 model maps to the
      // same landing page), so key by switch to keep each target in the queue.
      uniqueKey: `${t.switchId}|${t.productPageUrl}`,
      userData: t,
    }))
  );

  console.log(`\n=== Crawl Complete ===`);
  console.log(`  Images:     ${images}`);
  console.log(`  Datasheets: ${datasheets}`);
  console.log(`  Manuals:    ${manuals}`);
}
|
|
|
|
// CLI entry point: `--vendor=<substring>` optionally restricts the crawl to matching vendors.
if (require.main === module) {
  const vendorFlag = "--vendor=";
  // Slice off the flag prefix rather than splitting on "=", so a value that
  // itself contains "=" is passed through intact.
  const vendor = process.argv
    .find((arg) => arg.startsWith(vendorFlag))
    ?.slice(vendorFlag.length);

  crawlSwitchAssets(vendor)
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      // Give the pool a chance to close before exiting; a failure here (for
      // example the pool was already ended) must not mask the original error.
      try {
        await pool.end();
      } catch {
        // ignore: the process is exiting with a failure status anyway
      }
      process.exit(1);
    });
}
|