transceiver-db/packages/scraper/src/scrapers/switch-image-playwright.ts
Rene Fichtmueller f4afe14af4 feat: add 12 new vendor URL builders to Playwright image scraper
- Nokia, Huawei, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise,
  Asterfusion, Brocade: passthrough builders (use stored product_page_url)
- NVIDIA Networking: SN-series URL builder (sn5600 → /ethernet-switching/sn5600/)
- Netgear: lowercase model slug builder for /business/wired/switches/fully-managed/
- UfiSpace: hardcoded sitemap-verified URL map (all 6 S9xxx models)
- QCT: hardcoded URL map for T3048-LY8 and T7032-IX1
- Add Nokia banner / Huawei marketing image patterns to GENERIC_IMAGE_PATTERNS
2026-04-21 07:24:11 +02:00

422 lines
17 KiB
TypeScript

/**
* Switch Image Fetcher — Playwright edition for bot-blocked vendors
*
* Vendors that reject plain HTTP bots (403/406) or require JS rendering:
* Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
* Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs),
* Nokia, Huawei, NVIDIA, Netgear, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise,
* Asterfusion, Brocade, UfiSpace, QCT
*
* Strategy:
* 1. Query switches without image_url for JS-blocked vendors
* 2. Open each product page in headless Chromium (stealth mode)
* 3. Extract og:image (or fallback: first large product <img>)
* 4. Apply same isGenericImage() filter as the plain HTTP fetcher
* 5. Write image_url + product_page_url to switches table
*
* Rate limit: maxConcurrency=1, at most 12 requests per minute (~5s between requests).
* Run: npx tsx src/scrapers/switch-image-playwright.ts [--vendor=arista]
*/
import { PlaywrightCrawler } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";
// ── Stealth user agent ───────────────────────────────────────────────────────
// Desktop Chrome 131 on macOS user-agent string. The request handler assigns it to
// `navigator.userAgent` inside each loaded page via page.evaluate().
const STEALTH_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
// ── Generic marketing image detector (mirrors switch-image-fetcher.ts) ────────
// Every entry is tested against the full image URL; a single hit marks the image
// as "generic" (logo, placeholder, banner, consent overlay, …) rather than a
// photo of the actual product.
const GENERIC_IMAGE_PATTERNS: RegExp[] = [
  // Logos and vector artwork
  /[-/_]logo[-_.]|\/logos?\//i,
  /cisco[-_]?logo/i,
  /juniper[-_]networks[-_]logo/i,
  /arista[-_]?logo/i,
  /brand[-_]?logo/i,
  /company[-_]?logo/i,
  /\/svg\//i,
  /\.svg(\?|$)/i,

  // Alcatel-Lucent Enterprise NaaS landing-page artwork
  /naas-homepag/i,
  /al-enterprise.*\/images\/naas/i,

  // Site-wide default social / share images
  /og[-_]default/i,
  /default[-_](?:og|social|share|image)/i,
  /site[-_](?:default|image|og)/i,
  /social[-_](?:default|share)/i,
  /twitter[-_]default/i,
  /default[-_]thumbnail/i,

  // Homepage, hero and lifestyle imagery
  /\/homepage\//i,
  /hero[-_](?:banner|bg|background|image)/i,
  /banner[-_](?:bg|background)/i,
  /lifestyle/i,
  /stock[-_]?photo/i,

  // Placeholders shown when no image exists
  /placeholder/i,
  /no[-_]?image/i,
  /image[-_]?not[-_]?found/i,
  /\/fallback[/-]/i,
  /missing[-_]image/i,

  // Press / media kits
  /\/press[-_]kit/i,
  /\/media[-_]kit/i,

  // Vendor-specific brand icons
  /open-graph\.gif/i,
  /social[-_]icon/i,
  /favicon/i,
  /og[-_]image[-_][0-9]+x[0-9]+\./i, // e.g. og-image-1200x630 → family-level generic

  // Cookie consent / GDPR overlay images (OneTrust, Cookiebot, TrustArc, etc.)
  /cdn\.cookielaw\.org/i,
  /cookiebot\.com/i,
  /trustarc\.com/i,
  /consent-manager/i,

  // Nokia CMS marketing banners (not product photos)
  /nok\d+-nokia-com-banner/i,

  // Huawei category/why-buy marketing images
  /whyhuawei-/i,
  /campus-switches/i,
  /bg_products/i,

  // Generic "banners" path segment used by CMSes
  /\/banners?\//i,
];

/**
 * Reports whether an image URL looks like generic marketing/branding artwork.
 *
 * @param url Image URL to classify.
 * @returns `true` as soon as any entry of GENERIC_IMAGE_PATTERNS matches, otherwise `false`.
 */
function isGenericImage(url: string): boolean {
  for (const pattern of GENERIC_IMAGE_PATTERNS) {
    if (pattern.test(url)) {
      return true;
    }
  }
  return false;
}
// ── Product page URL builders ─────────────────────────────────────────────────
/**
 * Derives the Arista *series* page URL for a model (og:image is published on
 * series pages rather than on individual model pages).
 *
 * The series is the leading 3–4 digits plus any alphanumerics up to the first
 * "-<digit>" port-count suffix (or the end of the string), lower-cased:
 *   7060X5-32QS  → 7060x5  → /en/products/7060x5-series
 *   7050CX3-32S  → 7050cx3 → /en/products/7050cx3-series
 *   7280R3A-48D5 → 7280r3a → 7280r3 → /en/products/7280r3-series
 *   7020R        → 7020r   → /en/products/7020r-series
 *
 * @param model Arista model name.
 * @returns The series page URL, or `null` when the model does not fit the pattern.
 */
function buildAristaUrl(model: string): string | null {
  const lead = /^(\d{3,4}[A-Z0-9]*?)(-\d|$)/i.exec(model);
  if (lead === null) {
    return null;
  }
  // A trailing sub-variant "a" (R3A → R3, R2A → R2) is dropped because Arista
  // lists those models on the base series page.
  const baseSeries = lead[1].toLowerCase().replace(/([a-z]\d+)a$/, "$1");
  return "https://www.arista.com/en/products/" + baseSeries + "-series";
}
/**
 * Builds a Dell networking-switch search URL for a model.
 *
 * A leading "PowerSwitch " prefix is removed and the remaining model name is
 * URL-encoded into the `appliedRefinements` keyword filter, e.g.
 * "PowerSwitch Z9332F-ON" → …DP_SEARCH_RESULTS_KEYWORDS~Z9332F-ON
 *
 * @param model Dell model name, with or without the "PowerSwitch" prefix.
 * @returns The search URL, or `null` when no model name remains after cleaning
 *          (a keyword-less search would only yield an unrelated listing page).
 */
function buildDellUrl(model: string): string | null {
  const cleanModel = model.replace(/^PowerSwitch\s+/i, "").trim();
  if (cleanModel === "") {
    return null;
  }
  return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
}
// Edgecore uses WooCommerce with /product/<slug>/ URLs (no .html suffix).
// Some models have non-obvious slugs verified via sitemap.
const EDGECORE_SLUG_MAP: Record<string, string> = {
  "AS7712-32X": "as7712-32x-ec", // -ec suffix variant in Edgecore WooCommerce
  "Minipack2": "minipack-as8000-open-modular-platform", // Facebook OCP Minipack2
};

/**
 * Builds the Edgecore product page URL for a model.
 *
 * Models listed in EDGECORE_SLUG_MAP use their mapped slug; all others get a
 * slug derived from the model name (lower-cased, non-alphanumerics replaced by
 * dashes, dash runs collapsed, leading/trailing dash removed).
 *
 * @param model Edgecore model name.
 * @returns The product URL, or `null` when the derived slug is empty.
 */
function buildEdgecoreUrl(model: string): string | null {
  // Own-property check: the `in` operator would also match inherited keys such
  // as "constructor" or "toString" and turn them into a bogus slug.
  if (Object.prototype.hasOwnProperty.call(EDGECORE_SLUG_MAP, model)) {
    return `https://www.edge-core.com/product/${EDGECORE_SLUG_MAP[model]}/`;
  }
  // Standard slug: lowercase, replace non-alphanum with dash, collapse multiple dashes
  const slug = model
    .toLowerCase()
    .replace(/[^a-z0-9-]/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");
  return slug ? `https://www.edge-core.com/product/${slug}/` : null;
}
/**
 * Fortinet URL builder — intentionally yields no URL.
 *
 * Fortinet product pages are fully JS-rendered and redirect to the generic
 * /products/ethernet-switches page, so no reliable og:image can be extracted.
 *
 * @param _model Ignored.
 * @returns Always `null`.
 */
function buildFortinetUrl(_model: string): string | null {
  const noUsableUrl: string | null = null;
  return noUsableUrl;
}
/**
 * Fallback URL builder for HPE Aruba models that have no stored product_page_url.
 *
 * The slug is the lower-cased model with every character outside [a-z0-9-]
 * replaced by a dash (dash runs are NOT collapsed).
 *
 * @param model HPE Aruba model name.
 * @returns A URL under arubanetworks.com/products/switches/; never `null`.
 */
function buildHpeArubaUrl(model: string): string | null {
  const lowered = model.toLowerCase();
  const slug = lowered.replace(/[^a-z0-9-]/g, "-");
  return "https://www.arubanetworks.com/products/switches/" + slug + "/";
}
/**
 * Builds the Extreme Networks product page URL (extremenetworks.com/product/<slug>).
 *
 * Slug rules, applied in order: lower-case, whitespace runs → single dash,
 * drop everything outside [a-z0-9-], collapse dash runs.
 *
 * @param model Extreme Networks model name.
 * @returns The product URL, or `null` when the slug ends up empty.
 */
function buildExtremeUrl(model: string): string | null {
  const dashed = model.toLowerCase().replace(/\s+/g, "-");
  const stripped = dashed.replace(/[^a-z0-9-]/g, "");
  const slug = stripped.replace(/-+/g, "-");
  if (slug === "") {
    return null;
  }
  return "https://www.extremenetworks.com/product/" + slug;
}
// ── New vendors (JS-rendered; rely on stored product_page_url or built URL) ────
// Used for Nokia, Huawei, Ciena, Moxa, D-Link, ALE, Asterfusion and Brocade.
// Their models are expected to carry a product_page_url in the DB, so this
// builder contributes nothing and the stored URL is the one that gets crawled.
const buildPassthroughUrl = (_model: string): string | null => {
  return null;
};
/**
 * Builds the NVIDIA ethernet-switching page URL for SN-series (Spectrum) models,
 * e.g. SN5600, SN4700, SN3700, SN3750-SX, SN2201.
 *
 * Only the leading "SN<digits>" part is used, lower-cased; any suffix is ignored.
 * Models that do not start with SN<digits> (e.g. ConnectX-7, an HCA) are skipped.
 *
 * @param model NVIDIA model name.
 * @returns The product URL, or `null` for non-SN models.
 */
function buildNvidiaUrl(model: string): string | null {
  const sn = /^(SN[\d]+)/i.exec(model);
  if (sn === null) {
    return null;
  }
  const series = sn[1].toLowerCase();
  return `https://www.nvidia.com/en-us/networking/ethernet-switching/${series}/`;
}
/**
 * Builds the Netgear fully-managed switch page URL, e.g. M4300-96X,
 * M4350-48G4XF, M4500-32C → /business/wired/switches/fully-managed/<slug>/.
 *
 * Slug rules, applied in order: lower-case, every non-alphanumeric → dash,
 * collapse dash runs, drop one leading/trailing dash.
 *
 * @param model Netgear model name.
 * @returns The product URL, or `null` when the slug ends up empty.
 */
function buildNetgearUrl(model: string): string | null {
  const dashed = model.toLowerCase().replace(/[^a-z0-9]/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  const slug = collapsed.replace(/^-|-$/g, "");
  if (slug === "") {
    return null;
  }
  return "https://www.netgear.com/business/wired/switches/fully-managed/" + slug + "/";
}
// UfiSpace: slug map derived from sitemap (non-predictable product URL tree)
const UFISPACE_URL_MAP: Record<string, string> = {
  "S9510-28DC": "https://www.ufispace.com/products/telco/access/s9510-28dc-flexe-tsn-disaggregated-cell-site-gateway",
  "S9600-30DX": "https://www.ufispace.com/products/telco/aggregation/s9600-30dx-open-zr-aggregation-router",
  "S9600-32X": "https://www.ufispace.com/products/telco/aggregation/s9600-32x-25g-100g-aggregation-router",
  "S9600-72XC": "https://www.ufispace.com/products/telco/aggregation/s9600-72xc-25g-100g-open-aggregation-router-tcam",
  "S9700-53DX": "https://www.ufispace.com/products/telco/core-edge/s9700-53dx-100g-core-router",
  "S9710-76D": "https://www.ufispace.com/products/telco/core-edge/s9710-76d-high-density-400g-disaggregated-core-router",
};

/**
 * Looks up the UfiSpace product page URL for a model (exact, case-sensitive match).
 *
 * @param model UfiSpace model name, e.g. "S9600-32X".
 * @returns The mapped URL, or `null` when the model is not in UFISPACE_URL_MAP.
 */
function buildUfiSpaceUrl(model: string): string | null {
  // Own-property check: a plain index lookup would return inherited members
  // (e.g. the "constructor" function) for models named like Object.prototype keys.
  return Object.prototype.hasOwnProperty.call(UFISPACE_URL_MAP, model)
    ? UFISPACE_URL_MAP[model]
    : null;
}
// QCT: URL map derived from sitemap (category path not predictable from model name)
const QCT_URL_MAP: Record<string, string> = {
  "QuantaMesh T3048-LY8": "https://www.qct.io/product/index/Networking/Ethernet-Switch/T3000-Series/QuantaMesh-T3048-LY8",
  "QuantaMesh T7032-IX1": "https://www.qct.io/product/index/Networking/Bare-Metal-Switch/Spine-Switch/QuantaMesh-BMS-T7032-IX1",
};

/**
 * Looks up the QCT product page URL for a model (exact, case-sensitive match).
 *
 * @param model QCT model name including the "QuantaMesh " prefix.
 * @returns The mapped URL, or `null` when the model is not in QCT_URL_MAP.
 */
function buildQctUrl(model: string): string | null {
  // Own-property check: a plain index lookup would return inherited members
  // (e.g. the "constructor" function) for models named like Object.prototype keys.
  return Object.prototype.hasOwnProperty.call(QCT_URL_MAP, model)
    ? QCT_URL_MAP[model]
    : null;
}
// Vendor slug → product-page URL builder.
// The scraper looks up the builder by each row's vendor slug, and the key list
// doubles as the default vendor filter when no specific vendor is requested.
// A builder returns null when it cannot (or should not) produce a URL.
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
arista: buildAristaUrl,
dell: buildDellUrl,
edgecore: buildEdgecoreUrl,
fortinet: buildFortinetUrl,
"hpe-aruba": buildHpeArubaUrl,
extreme: buildExtremeUrl,
// New JS-rendered vendors (stored product_page_url used where available)
nokia: buildPassthroughUrl,
huawei: buildPassthroughUrl,
ciena: buildPassthroughUrl,
moxa: buildPassthroughUrl,
"d-link": buildPassthroughUrl,
"alcatel-lucent-enterprise": buildPassthroughUrl,
asterfusion: buildPassthroughUrl,
brocade: buildPassthroughUrl,
// Vendors with a model-derived or hard-coded URL
"nvidia-networking": buildNvidiaUrl,
netgear: buildNetgearUrl,
ufispace: buildUfiSpaceUrl,
"quanta-cloud-technology": buildQctUrl,
};
// ── Request data attached to each crawl URL ──────────────────────────────────
/** Per-switch payload carried in each crawl request's `userData`. */
interface SwitchCrawlData {
/** `switches.id` of the row to update. */
switchId: string;
/** Switch model name (used in log lines). */
model: string;
/** Vendor display name (used in log lines). */
vendorName: string;
/** Vendor slug of the row. */
vendorSlug: string;
/** The URL chosen for crawling (stored or builder-generated). */
productPageUrl: string;
}
// ── Main scraper ──────────────────────────────────────────────────────────────
/**
 * Finds switches that have no image yet, opens each product page in headless
 * Chromium, extracts a product image URL and stores it in the `switches` table.
 *
 * For every row the product page is the stored `product_page_url` or, failing
 * that, the URL from the vendor's builder (Arista prefers the built URL). The
 * image is taken from og:image / twitter:image, falling back to the largest
 * suitable <img>; anything matched by isGenericImage() is rejected.
 *
 * @param targetVendorSlug Optional vendor slug to restrict the run to. When
 *        omitted (or empty), every vendor listed in URL_BUILDERS is processed.
 * @returns Resolves when the crawl has finished. Per-switch outcomes are
 *          logged and summarised on the console, not returned.
 */
export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Promise<void> {
  console.log("=== Switch Image Fetcher (Playwright) ===\n");

  // The slug list is bound as a query parameter instead of being interpolated
  // into the SQL text: targetVendorSlug comes straight from the command line.
  const vendorSlugs = targetVendorSlug ? [targetVendorSlug] : Object.keys(URL_BUILDERS);

  const { rows } = await pool.query<{
    id: string;
    model: string;
    vendor_slug: string;
    vendor_name: string;
    product_page_url: string | null;
  }>(
    `
    SELECT sw.id, sw.model, sw.product_page_url,
           v.slug AS vendor_slug, v.name AS vendor_name
    FROM switches sw
    JOIN vendors v ON v.id = sw.vendor_id
    WHERE (sw.image_url IS NULL OR sw.image_url = '')
      AND v.slug = ANY($1::text[])
    ORDER BY v.slug, sw.model
    `,
    [vendorSlugs],
  );

  if (rows.length === 0) {
    console.log(" All target switches already have images.\n");
    return;
  }
  console.log(` ${rows.length} switches need images (Playwright vendors)\n`);

  const requests: Array<{ url: string; uniqueKey: string; userData: SwitchCrawlData }> = [];
  // IDs of rows that had no stored product_page_url, so the URL we crawl is
  // builder-generated and worth persisting even when no image is found.
  const switchesWithoutStoredUrl = new Set<string>();

  for (const row of rows) {
    const builder = URL_BUILDERS[row.vendor_slug];
    const builtUrl = builder ? builder(row.model) : null;
    // An empty stored URL counts as missing, mirroring how image_url = '' is treated.
    const storedUrl = row.product_page_url || null;
    const productUrl = row.vendor_slug === "arista"
      ? (builtUrl ?? storedUrl) // Arista: prefer the fresh series URL over a stale stored model URL
      : (storedUrl ?? builtUrl); // other vendors: prefer the stored URL

    if (!productUrl) {
      console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
      continue;
    }
    if (!storedUrl) {
      switchesWithoutStoredUrl.add(row.id);
    }

    requests.push({
      url: productUrl,
      // Use switch ID as uniqueKey so Crawlee doesn't deduplicate series-level URLs.
      // Multiple models can share the same series page (e.g. 7060x5-series) — each needs its own DB write.
      uniqueKey: row.id,
      userData: {
        switchId: row.id,
        model: row.model,
        vendorName: row.vendor_name,
        vendorSlug: row.vendor_slug,
        productPageUrl: productUrl,
      },
    });
  }

  if (requests.length === 0) {
    console.log(" Nothing to crawl.\n");
    return;
  }

  let found = 0;
  let missed = 0;
  let errors = 0;

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 1, // one at a time — server-friendly
      maxRequestsPerMinute: 12, // ~5s per request minimum
      requestHandlerTimeoutSecs: 45,
      navigationTimeoutSecs: 30,
      headless: true,
      launchContext: {
        launchOptions: {
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--window-size=1920,1080",
          ],
        },
      },
      preNavigationHooks: [
        async (_ctx, gotoOptions) => {
          // Hand over to the handler at DOMContentLoaded; the handler itself
          // waits for network idle afterwards.
          if (gotoOptions) {
            gotoOptions.waitUntil = "domcontentloaded";
          }
        },
      ],
      async requestHandler({ request, page }) {
        const data = request.userData as SwitchCrawlData;

        // Stealth tweaks applied to the page.
        // NOTE(review): this appears to run after the initial navigation, so the
        // extra headers would only affect later requests and the navigator
        // overrides only scripts that read them afterwards — confirm whether
        // this should move into a pre-navigation hook.
        await page.setExtraHTTPHeaders({
          "Accept-Language": "en-US,en;q=0.9",
          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        });
        await page.evaluate((ua) => {
          Object.defineProperty(navigator, "userAgent", { value: ua, configurable: true });
          Object.defineProperty(navigator, "webdriver", { value: false, configurable: true });
        }, STEALTH_UA);

        // Wait for page to settle (JS rendering); a timeout here is tolerated
        // and extraction proceeds with whatever has rendered.
        await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});

        // Extract og:image / twitter:image meta tags.
        // We DON'T filter generics here — we filter outside so the img fallback can still run
        // even when og:image exists but is a logo/brand image (e.g. Dell, HPE).
        const metaImageUrl: string | null = await page.evaluate(() => {
          const og = document.querySelector<HTMLMetaElement>('meta[property="og:image"]')?.content;
          if (og) return og;
          const tw = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]')?.content;
          return tw ?? null;
        });

        // Use meta image if it passes the generic filter; otherwise fall through to img fallback.
        let imageUrl: string | null = (metaImageUrl && !isGenericImage(metaImageUrl)) ? metaImageUrl : null;

        if (!imageUrl) {
          // Img fallback: largest visible image that isn't a UI element.
          // Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
          imageUrl = await page.evaluate(() => {
            const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
            const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading|cookielaw|cookiebot|trustarc/i;
            const candidate = imgs
              .filter((img) => {
                const src = img.src || img.getAttribute("data-src") || "";
                return src.startsWith("http") &&
                  /\.(jpg|jpeg|png|webp)/i.test(src) &&
                  img.naturalWidth >= 200 &&
                  img.naturalHeight >= 150 &&
                  !skipPattern.test(src);
              })
              .sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
            return candidate?.src ?? null;
          });
        }

        if (!imageUrl || isGenericImage(imageUrl)) {
          console.log(` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`);
          missed++;
          // Save product_page_url even on miss to track that we tried — but only
          // for rows that had no stored URL, so an existing one is never overwritten.
          if (switchesWithoutStoredUrl.has(data.switchId)) {
            await pool.query(
              `UPDATE switches SET product_page_url = $2, assets_scraped_at = NOW() WHERE id = $1`,
              [data.switchId, request.url],
            );
          }
          return;
        }

        // NULLIF makes an empty stored URL behave like NULL, so the crawled URL
        // fills it in; a non-empty stored URL is kept.
        await pool.query(
          `UPDATE switches
           SET image_url = $2,
               product_page_url = COALESCE(NULLIF(product_page_url, ''), $3),
               assets_scraped_at = NOW()
           WHERE id = $1`,
          [data.switchId, imageUrl, request.url],
        );
        console.log(` [OK] ${data.vendorName} ${data.model} — ${imageUrl.slice(0, 80)}`);
        found++;
      },
      async failedRequestHandler({ request }) {
        const data = request.userData as SwitchCrawlData;
        console.log(` [FAIL] ${data.vendorName} ${data.model} — ${request.errorMessages?.[0] ?? "unknown error"}`);
        errors++;
      },
    },
    // Use a unique run ID to avoid Crawlee temp-dir state contamination when multiple
    // vendor runs execute back-to-back (ENOENT: stale request-queue files from prior run).
    makeCrawleeConfig(`switch-images-playwright-${Date.now()}`),
  );

  await crawler.run(requests);

  console.log(`\n=== Playwright Image Scraper Complete ===`);
  console.log(` Images found: ${found}`);
  console.log(` Missed: ${missed}`);
  if (errors > 0) console.warn(` Errors: ${errors}`);
}
// CLI entry point — runs only when this file is executed directly.
// Usage: npx tsx src/scrapers/switch-image-playwright.ts [--vendor=<slug>]
if (require.main === module) {
  const vendorFlag = process.argv.find((arg) => arg.startsWith("--vendor="));
  const vendor = vendorFlag?.split("=")[1];

  void (async () => {
    try {
      await fetchSwitchImagesPlaywright(vendor);
      await pool.end();
    } catch (err) {
      // Any failure (scrape or pool shutdown) is fatal: report and exit non-zero.
      console.error("Fatal:", err);
      pool.end();
      process.exit(1);
    }
  })();
}