transceiver-db/packages/scraper/src/scrapers/switch-assets-playwright.ts

254 lines
9.2 KiB
TypeScript

/**
* Switch Assets Scraper — Playwright-based for JS-heavy vendor sites
*
* Cisco, Arista, HPE/Aruba, Dell, and Extreme require JavaScript rendering
* to access product pages, datasheets, and images.
*
* Uses PlaywrightCrawler for full browser rendering.
*/
import { PlaywrightCrawler } from "crawlee";
import { pool } from "../utils/db";
import {
downloadSwitchImage,
downloadSwitchDatasheet,
downloadSwitchManual,
setSwitchProductPage,
} from "../utils/assets";
/**
 * One unit of crawl work: a switch row paired with the page to render for it.
 * Instances travel with each crawler request as its `userData`.
 */
interface CrawlTarget {
  /** Primary key of the row in `switches` (taken from `sw.id`). */
  switchId: string;

  /** Primary key of the owning row in `vendors` (taken from `v.id`). */
  vendorId: string;

  /** Vendor display name (taken from `v.name`); used for URL routing and logs. */
  vendorName: string;

  /** Switch model string (taken from `sw.model`). */
  model: string;

  /** Page to render: the stored `product_page_url`, or a vendor-derived guess. */
  productPageUrl: string;
}
// ═══════════════════════════════════════════════════════
// Vendor-specific product page URL builders
// ═══════════════════════════════════════════════════════
/**
 * Maps a Cisco model number to the family landing page that should be crawled.
 *
 * Routing is by model prefix. Rules are evaluated top to bottom and the first
 * match wins, so the broad "C9" rule must stay below the C92/C93/C95 rules.
 * Models that match no rule get the generic cloud-networking switches page.
 *
 * @param model Cisco model string, e.g. "N9K-C93180YC-FX" or "C9300-48P".
 * @returns Absolute URL of the page to render for that model.
 */
function buildCiscoUrl(model: string): string {
  const prefixRoutes: Array<{ prefixes: string[]; url: string }> = [
    {
      // Nexus (N9K and N3K alike) — datasheet listing page for JS-rendered content
      prefixes: ["N9K-", "N3K-"],
      url: "https://www.cisco.com/c/en/us/products/switches/nexus-9000-series-switches/datasheet-listing.html",
    },
    {
      prefixes: ["C93"],
      url: "https://www.cisco.com/c/en/us/products/switches/catalyst-9300-series-switches/datasheet-listing.html",
    },
    {
      prefixes: ["C92"],
      url: "https://www.cisco.com/c/en/us/products/switches/catalyst-9200-series-switches/index.html",
    },
    {
      prefixes: ["C95"],
      url: "https://www.cisco.com/c/en/us/products/switches/catalyst-9500-series-switches/index.html",
    },
    {
      // Any other Catalyst 9000 family member
      prefixes: ["C9"],
      url: "https://www.cisco.com/c/en/us/products/switches/catalyst-9000/index.html",
    },
    {
      prefixes: ["NCS-", "81"],
      url: "https://www.cisco.com/c/en/us/products/routers/network-convergence-system-5500-series/index.html",
    },
  ];

  for (const route of prefixRoutes) {
    if (route.prefixes.some((prefix) => model.startsWith(prefix))) {
      return route.url;
    }
  }

  return "https://www.cisco.com/site/us/en/products/networking/cloud-networking-switches/index.html";
}
/**
 * Derives the Arista series page URL for a model.
 *
 * The series is the leading run of four digits, optional uppercase letters,
 * and optional trailing digits (e.g. "7050X3" from "7050X3-48YC8"). If the
 * model does not start that way, the whole model string is used instead.
 * The series is lowercased and stripped of everything except a-z and 0-9,
 * because Arista series slugs contain no hyphens.
 *
 * @param model Arista model string.
 * @returns URL of the form `https://www.arista.com/en/products/{series}-series`.
 */
function buildAristaUrl(model: string): string {
  const leadingSeries = /^(\d{4}[A-Z]*\d*)/.exec(model);
  const seriesName = leadingSeries?.[1] || model;
  const seriesSlug = seriesName.toLowerCase().replace(/[^a-z0-9]/g, "");
  return "https://www.arista.com/en/products/" + seriesSlug + "-series";
}
/**
 * Builds the Aruba (HPE) CX series page URL for a model.
 *
 * The series number is the run of digits following "CX" (optionally separated
 * by whitespace, matched case-insensitively), e.g. "6300" from "CX 6300M".
 *
 * When the model carries no CX series number, the parent switches listing is
 * returned instead of a malformed ".../cx--series/" address, so the crawler
 * is never sent to a URL that cannot exist.
 *
 * @param model HPE/Aruba model string.
 * @returns The CX series page URL, or the switches listing URL as a fallback.
 */
function buildHpeUrl(model: string): string {
  const switchesListingUrl = "https://www.arubanetworks.com/products/switches/";
  const seriesNum = /CX\s*(\d+)/i.exec(model)?.[1];
  if (!seriesNum) {
    return switchesListingUrl;
  }
  return `${switchesListingUrl}cx-${seriesNum}-series/`;
}
/**
 * Builds the Dell shop URL for a networking switch model.
 *
 * The model is lowercased and each run of whitespace becomes a single hyphen;
 * the resulting slug appears twice in the path (before and after "/spd/").
 *
 * @param model Dell model string, e.g. "PowerSwitch S5248F-ON".
 * @returns Absolute Dell shop URL for the model.
 */
function buildDellUrl(model: string): string {
  const slug = model.toLowerCase().replace(/\s+/g, "-");
  const pathSegments = ["https://www.dell.com/en-us/shop/networking-switches", slug, "spd", slug];
  return pathSegments.join("/");
}
/**
 * Builds the Extreme Networks product page URL for a model.
 *
 * The model is lowercased and every character outside a-z / 0-9 is replaced
 * by a hyphen (one hyphen per character; runs are not collapsed).
 *
 * @param model Extreme model string, e.g. "X465-48T".
 * @returns Absolute product page URL.
 */
function buildExtremeUrl(model: string): string {
  const productSlug = model.toLowerCase().replace(/[^a-z0-9]/g, "-");
  return "https://www.extremenetworks.com/product/" + productSlug;
}
/**
 * Chooses the vendor-specific URL builder for a vendor name and applies it.
 *
 * Matching is a case-insensitive substring test against the vendor name.
 * Entries are tried in order and the first hit wins.
 *
 * @param vendorName Vendor display name, e.g. "Cisco Systems".
 * @param model Switch model passed through to the vendor builder.
 * @returns The guessed product page URL, or `null` for an unsupported vendor.
 */
function buildJsVendorUrl(vendorName: string, model: string): string | null {
  const normalizedVendor = vendorName.toLowerCase();

  const vendorBuilders: Array<{ keywords: string[]; build: (model: string) => string }> = [
    { keywords: ["cisco"], build: buildCiscoUrl },
    { keywords: ["arista"], build: buildAristaUrl },
    { keywords: ["hpe", "aruba"], build: buildHpeUrl },
    { keywords: ["dell"], build: buildDellUrl },
    { keywords: ["extreme"], build: buildExtremeUrl },
  ];

  const matched = vendorBuilders.find((entry) =>
    entry.keywords.some((keyword) => normalizedVendor.includes(keyword))
  );
  return matched ? matched.build(model) : null;
}
// ═══════════════════════════════════════════════════════
// Playwright-based asset extraction
// ═══════════════════════════════════════════════════════
/**
 * Crawls vendor product pages in a headless browser and records the assets
 * found there (product image, datasheet PDF, up to three manuals/guides).
 *
 * Up to 100 switches lacking an image or a datasheet are selected, limited
 * either to `targetVendor` or to the built-in list of JS-heavy vendors. Each
 * switch is crawled at its stored `product_page_url`, or at a vendor-derived
 * guess when none is stored; switches for which no URL can be derived are
 * skipped. Progress and final totals are written to the console.
 *
 * @param targetVendor Optional vendor name fragment; matched with
 *   `ILIKE '%fragment%'`. LIKE wildcards (`%`, `_`) inside the fragment keep
 *   their wildcard meaning. When omitted or empty, all JS-heavy vendors are used.
 * @returns Resolves once the crawl has finished and totals have been printed.
 */
export async function crawlSwitchAssetsPlaywright(targetVendor?: string): Promise<void> {
  console.log("=== Switch Assets Crawler (Playwright) ===\n");

  const jsVendors = ["Cisco", "Arista", "HPE", "Aruba", "Dell", "Extreme"];

  // Vendor names are passed as bound parameters ($1, $2, ...) rather than
  // being spliced into the SQL text: targetVendor comes from the command line
  // and must never be able to alter the statement.
  const vendorPatterns = (targetVendor ? [targetVendor] : jsVendors).map((name) => `%${name}%`);
  const vendorFilter = `AND (${vendorPatterns
    .map((_, index) => `v.name ILIKE $${index + 1}`)
    .join(" OR ")})`;

  const result = await pool.query(
    `
    SELECT sw.id, sw.model, sw.series, sw.product_page_url,
           v.name as vendor_name, v.id as vendor_id
    FROM switches sw
    JOIN vendors v ON sw.vendor_id = v.id
    WHERE (sw.image_url IS NULL OR sw.datasheet_url IS NULL)
    ${vendorFilter}
    ORDER BY v.name, sw.model
    LIMIT 100
    `,
    vendorPatterns
  );

  if (result.rows.length === 0) {
    console.log("No JS-vendor switches need asset scraping.\n");
    return;
  }

  const targets: CrawlTarget[] = [];
  for (const row of result.rows) {
    // Prefer the URL already stored on the row; otherwise guess from vendor + model.
    const productPageUrl = row.product_page_url || buildJsVendorUrl(row.vendor_name, row.model);
    if (!productPageUrl) continue;
    targets.push({
      switchId: row.id,
      vendorId: row.vendor_id,
      vendorName: row.vendor_name,
      model: row.model,
      productPageUrl,
    });
  }

  console.log(`Crawling ${targets.length} JS-heavy product pages...\n`);

  let images = 0;
  let datasheets = 0;
  let manuals = 0;

  const crawler = new PlaywrightCrawler({
    maxConcurrency: 2,
    maxRequestsPerMinute: 10,
    requestHandlerTimeoutSecs: 60,
    headless: true,
    launchContext: {
      launchOptions: {
        args: ["--no-sandbox", "--disable-setuid-sandbox"],
      },
    },
    async requestHandler({ request, page }) {
      const target = request.userData as CrawlTarget;
      console.log(` ${target.vendorName} ${target.model}:`);

      // Best effort: give the page up to 15s to go network-idle, then carry on
      // with whatever has rendered instead of failing the request.
      await page.waitForLoadState("networkidle", { timeout: 15000 }).catch(() => {});

      // Record the URL that was actually loaded as the switch's product page.
      await setSwitchProductPage(target.switchId, request.url);

      // Image: og:image meta tag first, else the first <img> wider than 200px
      // whose src mentions "product" or whose alt mentions "switch"/"router".
      const imageUrl = await page.evaluate(() => {
        const ogImage = document.querySelector('meta[property="og:image"]')?.getAttribute("content");
        if (ogImage) return ogImage;
        const imgs = Array.from(document.querySelectorAll("img"));
        const productImg = imgs.find((img) => {
          const src = img.src || "";
          const alt = (img.alt || "").toLowerCase();
          return (src.includes("product") || alt.includes("switch") || alt.includes("router"))
            && img.naturalWidth > 200;
        });
        return productImg?.src || null;
      });
      if (imageUrl) {
        const ok = await downloadSwitchImage(
          target.switchId, imageUrl, target.vendorName, target.model
        );
        if (ok) {
          images++;
          console.log(` ✓ Image`);
        }
      }

      // Collect every anchor whose href contains ".pdf"; the datasheet and
      // manual selection below both work from this list.
      const pdfLinks = await page.evaluate(() => {
        const links = Array.from(document.querySelectorAll('a[href*=".pdf"]'));
        return links.map((a) => ({
          href: (a as HTMLAnchorElement).href,
          text: a.textContent?.trim() || "",
        }));
      });

      // Datasheet: first PDF whose link text or URL mentions a datasheet.
      const datasheetLink = pdfLinks.find((l) => {
        const t = l.text.toLowerCase();
        const h = l.href.toLowerCase();
        return t.includes("datasheet") || t.includes("data sheet")
          || h.includes("datasheet") || h.includes("data-sheet");
      });
      if (datasheetLink) {
        const ok = await downloadSwitchDatasheet(
          target.switchId, target.vendorId, datasheetLink.href,
          datasheetLink.text || `${target.model} Datasheet`,
          target.vendorName, target.model
        );
        if (ok) {
          datasheets++;
          console.log(` ✓ Datasheet`);
        }
      }

      // Manuals: PDFs whose link text looks like a guide; at most three per page.
      const manualLinks = pdfLinks.filter((l) => {
        const t = l.text.toLowerCase();
        return t.includes("guide") || t.includes("manual") || t.includes("reference")
          || t.includes("quick start") || t.includes("installation");
      });
      for (const manual of manualLinks.slice(0, 3)) {
        // Later checks override earlier ones, so "installation" takes
        // precedence over "cli"/"reference", which takes precedence over
        // "quick start".
        let type = "manual";
        const t = manual.text.toLowerCase();
        if (t.includes("quick start")) type = "quick_start";
        if (t.includes("cli") || t.includes("reference")) type = "cli_reference";
        if (t.includes("installation")) type = "installation_guide";
        const ok = await downloadSwitchManual(
          target.switchId, target.vendorId, manual.href,
          manual.text, type, target.vendorName, target.model
        );
        if (ok) {
          manuals++;
          console.log(`${type}: ${manual.text}`);
        }
      }
    },
    async failedRequestHandler({ request }) {
      const target = request.userData as CrawlTarget;
      console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
    },
  });

  await crawler.run(
    targets.map((t) => ({
      url: t.productPageUrl,
      // Crawlee de-duplicates requests by uniqueKey, which defaults to the
      // normalized URL. Many switches share one landing page (e.g. every N9K
      // model), so without a per-switch key only the first switch for each
      // URL would ever be processed.
      uniqueKey: `${t.switchId}|${t.productPageUrl}`,
      userData: t,
    }))
  );

  console.log(`\n=== Playwright Crawl Complete ===`);
  console.log(` Images: ${images}`);
  console.log(` Datasheets: ${datasheets}`);
  console.log(` Manuals: ${manuals}`);
}
// CLI entry point: runs only when this file is executed directly.
// Accepts an optional `--vendor=<name>` argument to restrict the crawl.
if (require.main === module) {
  const vendorFlag = process.argv.find((arg) => arg.startsWith("--vendor="));
  const vendor = vendorFlag?.split("=")[1];

  const runFromCli = async (): Promise<void> => {
    try {
      await crawlSwitchAssetsPlaywright(vendor);
      await pool.end();
    } catch (err) {
      // Log, start closing the pool (not awaited), and exit non-zero.
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  };

  runFromCli();
}