- scheduler.ts: remove withIsolatedStorage from ALL scrapers (atgbics,
optcore, ufispace, edgecore, ebay-*, market-intel, community-issues,
cisco, juniper, sonic, 10gtek, prolabs, switch-assets, fs); this
eliminates the global CRAWLEE_STORAGE_DIR race condition entirely
- fs-com.ts: replace purgeDefaultStorages() with rmSync on isolated
storage dirs (fs-phase1, fs-phase2); pass makeCrawleeConfig to both
PlaywrightCrawler instances
- switch-assets-crawler.ts: add makeCrawleeConfig('switch-assets')
- switch-assets-playwright.ts: add makeCrawleeConfig('switch-assets-playwright')
- naddod.ts: restore clean error logging (remove debug instrumentation)
255 lines
9.3 KiB
TypeScript
255 lines
9.3 KiB
TypeScript
/**
|
|
* Switch Assets Scraper — Playwright-based for JS-heavy vendor sites
|
|
*
|
|
* Cisco, Arista, HPE/Aruba, Dell, and Extreme require JavaScript rendering
|
|
* to access product pages, datasheets, and images.
|
|
*
|
|
* Uses PlaywrightCrawler for full browser rendering.
|
|
*/
|
|
import { PlaywrightCrawler } from "crawlee";
|
|
import { pool } from "../utils/db";
|
|
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
|
import {
|
|
downloadSwitchImage,
|
|
downloadSwitchDatasheet,
|
|
downloadSwitchManual,
|
|
setSwitchProductPage,
|
|
} from "../utils/assets";
|
|
|
|
/**
 * One switch product page queued for the Playwright crawl.
 *
 * Instances travel with each crawl request as its `userData`, so the request
 * handler can tell which switch row the extracted assets belong to.
 */
interface CrawlTarget {
  /** Identifier of the switch row (`sw.id` in the selection query). */
  switchId: string;

  /** Identifier of the owning vendor row (`v.id`). */
  vendorId: string;

  /** Vendor display name (`v.name`); used for logging and asset naming. */
  vendorName: string;

  /** Switch model string (`sw.model`). */
  model: string;

  /** URL to crawl: the stored product page URL, or one derived from the model. */
  productPageUrl: string;
}
|
|
|
|
// ═══════════════════════════════════════════════════════
|
|
// Vendor-specific product page URL builders
|
|
// ═══════════════════════════════════════════════════════
|
|
|
|
/**
 * Picks the Cisco page to crawl for a switch, keyed on the model-number prefix.
 *
 * Matching ignores letter case and surrounding whitespace, so
 * "n9k-c9336c-fx2" resolves the same way as "N9K-C9336C-FX2".
 *
 * @param model Cisco model / SKU string.
 * @returns Absolute URL of a series-level Cisco page, or the generic
 *   cloud-networking switches page when no known prefix matches.
 */
function buildCiscoUrl(model: string): string {
  const sku = model.trim().toUpperCase();

  // Ordered most-specific first: "C93" / "C92" / "C95" must be tested before
  // the catch-all "C9" entry.
  const routes: ReadonlyArray<readonly [readonly string[], string]> = [
    // Nexus — datasheet listing page is used for its JS-rendered content.
    // NOTE(review): N3K- models are also sent to the Nexus 9000 listing, as in
    // the original code — confirm that is intended.
    [
      ["N9K-", "N3K-"],
      "https://www.cisco.com/c/en/us/products/switches/nexus-9000-series-switches/datasheet-listing.html",
    ],
    [
      ["C93"],
      "https://www.cisco.com/c/en/us/products/switches/catalyst-9300-series-switches/datasheet-listing.html",
    ],
    [
      ["C92"],
      "https://www.cisco.com/c/en/us/products/switches/catalyst-9200-series-switches/index.html",
    ],
    [
      ["C95"],
      "https://www.cisco.com/c/en/us/products/switches/catalyst-9500-series-switches/index.html",
    ],
    [
      ["C9"],
      "https://www.cisco.com/c/en/us/products/switches/catalyst-9000/index.html",
    ],
    // NOTE(review): "81" prefixed models share the NCS 5500 page — confirm.
    [
      ["NCS-", "81"],
      "https://www.cisco.com/c/en/us/products/routers/network-convergence-system-5500-series/index.html",
    ],
  ];

  for (const [prefixes, url] of routes) {
    if (prefixes.some((prefix) => sku.startsWith(prefix))) return url;
  }

  return "https://www.cisco.com/site/us/en/products/networking/cloud-networking-switches/index.html";
}
|
|
|
|
/**
 * Builds the Arista series page URL for a switch model.
 *
 * Arista series pages live at `/en/products/{series}-series`, where the series
 * is the leading four digits of the model plus any letters/digits directly
 * after them, lower-cased (e.g. "7050X3-48YC8" -> "7050x3").
 *
 * @param model Arista model / SKU; an optional "DCS-" ordering prefix and any
 *   letter case are accepted.
 * @returns Absolute series-page URL. When no series can be extracted, the
 *   whole model (alphanumerics only) is used as the series slug.
 */
function buildAristaUrl(model: string): string {
  const sku = model.trim();
  // Skip the optional "DCS-" prefix so full SKUs resolve to the same series
  // page as the bare model number.
  const series = sku.match(/^(?:DCS-)?(\d{4}[A-Z]*\d*)/i)?.[1] || sku;
  const slug = series.toLowerCase().replace(/[^a-z0-9]/g, "");
  return `https://www.arista.com/en/products/${slug}-series`;
}
|
|
|
|
/**
 * Builds the Aruba (HPE) CX series page URL for a switch model.
 *
 * The series number is the run of digits following "CX" in the model
 * (case-insensitive, optional whitespace), e.g. "CX 6300M" -> "6300".
 *
 * @param model HPE/Aruba model string.
 * @returns The `cx-{series}-series/` page URL, or the Aruba switches landing
 *   page when the model contains no CX series number (instead of a malformed
 *   `cx--series/` URL).
 */
function buildHpeUrl(model: string): string {
  const switchesRoot = "https://www.arubanetworks.com/products/switches/";
  const seriesNum = model.match(/CX\s*(\d+)/i)?.[1];
  return seriesNum ? `${switchesRoot}cx-${seriesNum}-series/` : switchesRoot;
}
|
|
|
|
/**
 * Builds the Dell shop URL for a switch model.
 *
 * The model is trimmed, lower-cased, and has each whitespace run replaced by a
 * single hyphen; that slug appears twice in the path
 * (`/networking-switches/{slug}/spd/{slug}`).
 *
 * @param model Dell model string.
 * @returns Absolute Dell shop URL for the model.
 */
function buildDellUrl(model: string): string {
  // Trim first so padded values do not produce leading/trailing hyphens.
  const slug = model.trim().toLowerCase().replace(/\s+/g, "-");
  return `https://www.dell.com/en-us/shop/networking-switches/${slug}/spd/${slug}`;
}
|
|
|
|
/**
 * Builds the Extreme Networks product page URL for a switch model.
 *
 * The model is lower-cased and every run of non-alphanumeric characters is
 * replaced by a single hyphen; hyphens at either end of the slug are dropped.
 *
 * @param model Extreme model string.
 * @returns Absolute `/product/{slug}` URL.
 */
function buildExtremeUrl(model: string): string {
  const slug = model
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-") // collapse runs so "5520 - 24T" is not "5520---24t"
    .replace(/^-+|-+$/g, ""); // no leading/trailing hyphen from padding
  return `https://www.extremenetworks.com/product/${slug}`;
}
|
|
|
|
/**
 * Dispatches to the vendor-specific URL builder for a JS-heavy vendor.
 *
 * The vendor is recognised by a case-insensitive substring test on its name;
 * entries are checked in the order listed and the first hit wins.
 *
 * @param vendorName Vendor display name.
 * @param model Switch model string, handed to the matching builder unchanged.
 * @returns The product page URL, or `null` when the vendor is not one of the
 *   supported JS-heavy vendors.
 */
function buildJsVendorUrl(vendorName: string, model: string): string | null {
  const needle = vendorName.toLowerCase();

  const builders: Array<[string[], (m: string) => string]> = [
    [["cisco"], buildCiscoUrl],
    [["arista"], buildAristaUrl],
    [["hpe", "aruba"], buildHpeUrl],
    [["dell"], buildDellUrl],
    [["extreme"], buildExtremeUrl],
  ];

  for (const [keywords, build] of builders) {
    if (keywords.some((keyword) => needle.includes(keyword))) {
      return build(model);
    }
  }

  return null;
}
|
|
|
|
// ═══════════════════════════════════════════════════════
|
|
// Playwright-based asset extraction
|
|
// ═══════════════════════════════════════════════════════
|
|
|
|
/**
 * Crawls vendor product pages in a headless browser and downloads the image,
 * datasheet, and up to three manuals found for each switch.
 *
 * Selects at most 100 switches whose `image_url` or `datasheet_url` is NULL
 * and whose vendor name matches the filter. Each switch is crawled at its
 * stored `product_page_url`, or at a URL derived by `buildJsVendorUrl`;
 * switches with neither are skipped. Download counts are logged at the end.
 *
 * @param targetVendor Optional case-insensitive vendor-name substring. When
 *   omitted (or empty), the built-in list of JS-heavy vendors is used.
 * @returns Resolves once the crawl has finished; nothing is returned.
 */
export async function crawlSwitchAssetsPlaywright(targetVendor?: string): Promise<void> {
  console.log("=== Switch Assets Crawler (Playwright) ===\n");

  const jsVendors = ["Cisco", "Arista", "HPE", "Aruba", "Dell", "Extreme"];

  // The vendor filter is bound as a query parameter rather than spliced into
  // the SQL text: `targetVendor` comes from the command line and must not be
  // able to alter the statement.
  const vendorPatterns = (targetVendor ? [targetVendor] : jsVendors).map((v) => `%${v}%`);

  const result = await pool.query(
    `
    SELECT sw.id, sw.model, sw.series, sw.product_page_url,
           v.name as vendor_name, v.id as vendor_id
    FROM switches sw
    JOIN vendors v ON sw.vendor_id = v.id
    WHERE (sw.image_url IS NULL OR sw.datasheet_url IS NULL)
      AND v.name ILIKE ANY($1::text[])
    ORDER BY v.name, sw.model
    LIMIT 100
  `,
    [vendorPatterns],
  );

  if (result.rows.length === 0) {
    console.log("No JS-vendor switches need asset scraping.\n");
    return;
  }

  const targets: CrawlTarget[] = [];
  for (const row of result.rows) {
    // Prefer the URL already stored for the switch; otherwise derive one.
    const productPageUrl = row.product_page_url || buildJsVendorUrl(row.vendor_name, row.model);
    if (!productPageUrl) continue;

    targets.push({
      switchId: row.id,
      vendorId: row.vendor_id,
      vendorName: row.vendor_name,
      model: row.model,
      productPageUrl,
    });
  }

  console.log(`Crawling ${targets.length} JS-heavy product pages...\n`);

  // Running totals of successful downloads, reported after the crawl.
  let images = 0;
  let datasheets = 0;
  let manuals = 0;

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 2,
    maxRequestsPerMinute: 10,
    requestHandlerTimeoutSecs: 60,
    headless: true,
    launchContext: {
      launchOptions: {
        args: ["--no-sandbox", "--disable-setuid-sandbox"],
      },
    },

    async requestHandler({ request, page }) {
      const target = request.userData as CrawlTarget;
      console.log(` ${target.vendorName} ${target.model}:`);

      // Wait for the page to settle; a timeout here is tolerated and the
      // handler continues with whatever has rendered so far.
      await page.waitForLoadState("networkidle", { timeout: 15000 }).catch(() => {});

      // Set product page URL
      await setSwitchProductPage(target.switchId, request.url);

      // Extract og:image, or else the first large product-looking <img>.
      const imageUrl = await page.evaluate(() => {
        const ogImage = document.querySelector('meta[property="og:image"]')?.getAttribute("content");
        if (ogImage) {
          // og:image may be relative; resolve it against the page so the
          // downloader always receives an absolute URL.
          try {
            return new URL(ogImage, document.baseURI).href;
          } catch {
            return ogImage;
          }
        }

        const productImg = Array.from(document.querySelectorAll("img")).find((img) => {
          const src = img.src || "";
          const alt = (img.alt || "").toLowerCase();
          const looksLikeProduct =
            src.includes("product") || alt.includes("switch") || alt.includes("router");
          return looksLikeProduct && img.naturalWidth > 200;
        });
        return productImg?.src || null;
      });

      if (imageUrl) {
        const ok = await downloadSwitchImage(
          target.switchId, imageUrl, target.vendorName, target.model
        );
        if (ok) {
          images++;
          console.log(` ✓ Image`);
        }
      }

      // Collect every PDF link once; datasheet and manuals are picked from it.
      const pdfLinks = await page.evaluate(() =>
        Array.from(document.querySelectorAll('a[href*=".pdf"]')).map((a) => ({
          href: (a as HTMLAnchorElement).href,
          text: a.textContent?.trim() || "",
        }))
      );

      // Datasheet: first PDF whose link text or URL mentions a datasheet.
      const datasheetLink = pdfLinks.find((l) => {
        const t = l.text.toLowerCase();
        const h = l.href.toLowerCase();
        return t.includes("datasheet") || t.includes("data sheet")
          || h.includes("datasheet") || h.includes("data-sheet");
      });

      if (datasheetLink) {
        const ok = await downloadSwitchDatasheet(
          target.switchId, target.vendorId, datasheetLink.href,
          datasheetLink.text || `${target.model} Datasheet`,
          target.vendorName, target.model
        );
        if (ok) {
          datasheets++;
          console.log(` ✓ Datasheet`);
        }
      }

      // Manuals/guides: PDFs whose link text looks like documentation.
      const manualLinks = pdfLinks.filter((l) => {
        const t = l.text.toLowerCase();
        return t.includes("guide") || t.includes("manual") || t.includes("reference")
          || t.includes("quick start") || t.includes("installation");
      });

      // At most three manuals per switch.
      for (const manual of manualLinks.slice(0, 3)) {
        const t = manual.text.toLowerCase();

        // Classification priority: installation > CLI/reference > quick start.
        let type = "manual";
        if (t.includes("installation")) {
          type = "installation_guide";
        } else if (t.includes("cli") || t.includes("reference")) {
          type = "cli_reference";
        } else if (t.includes("quick start")) {
          type = "quick_start";
        }

        const ok = await downloadSwitchManual(
          target.switchId, target.vendorId, manual.href,
          manual.text, type, target.vendorName, target.model
        );
        if (ok) {
          manuals++;
          console.log(` ✓ ${type}: ${manual.text}`);
        }
      }
    },

    async failedRequestHandler({ request }) {
      const target = request.userData as CrawlTarget;
      console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
    },
    // NOTE(review): presumably gives this crawler its own Crawlee storage —
    // confirm against utils/crawlee-config.
  }, makeCrawleeConfig("switch-assets-playwright"));

  await crawler.run(
    targets.map((t) => ({
      url: t.productPageUrl,
      userData: t,
    }))
  );

  console.log(`\n=== Playwright Crawl Complete ===`);
  console.log(` Images: ${images}`);
  console.log(` Datasheets: ${datasheets}`);
  console.log(` Manuals: ${manuals}`);
}
|
|
|
|
// CLI entry point: `--vendor=<name>` optionally restricts the crawl to vendors
// whose name contains <name>.
if (require.main === module) {
  const vendorFlag = "--vendor=";
  // Take everything after the flag so values containing "=" are not truncated.
  const vendor = process.argv
    .find((a) => a.startsWith(vendorFlag))
    ?.slice(vendorFlag.length);

  crawlSwitchAssetsPlaywright(vendor)
    .then(() => pool.end())
    .catch(async (err: unknown) => {
      console.error("Fatal:", err);
      // Let the pool close before exiting. A failure here (for example the
      // pool already being ended) is ignored because the process exits with
      // an error status either way.
      await pool.end().catch(() => {});
      process.exit(1);
    });
}
|