- switch-image-playwright.ts + switch-image-fetcher.ts: add filter patterns for /webimage-404/ (Netgear 404 hero), /Brand/ + /cybersecurity.png/ (Moxa brand marketing images not product photos) - sql/047: Moxa 4/4 models — CDN getattachment paths (hotlink-protected, Referer: moxa.com required; R2 proxy needed for production display) - sql/048: UfiSpace 6/6 models — ufispace.com/image/<hash>/ direct PNGs; Brocade G720+G730 — broadcom.com og:image; ICX 7850-48FS — CommScope/Ruckus vistancenetworks.com ImageServer (rand param is cache-bust only, not auth) - sql/049: NVIDIA SN-series 6/6 — docscontent.nvidia.com (SN2201/3700/4700) and S3 direct (SN5400/5600); SN3750-SX via uvation reseller CDN
433 lines
18 KiB
TypeScript
433 lines
18 KiB
TypeScript
/**
 * Switch Image Fetcher — Playwright edition for bot-blocked vendors
 *
 * Vendors that reject plain HTTP bots (403/406) or require JS rendering:
 *   Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
 *   Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs),
 *   Nokia, Huawei, NVIDIA, Netgear, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise,
 *   Asterfusion, Brocade, UfiSpace, QCT
 *
 * Strategy:
 *   1. Query switches without image_url for JS-blocked vendors
 *   2. Open each product page in headless Chromium (stealth mode)
 *   3. Extract og:image / twitter:image (or fallback: the largest qualifying <img>)
 *   4. Apply same isGenericImage() filter as the plain HTTP fetcher
 *   5. Write image_url + product_page_url to switches table
 *
 * Rate limit: maxConcurrency=1, at most 12 requests per minute (~5s between requests).
 * Run: npx tsx src/scrapers/switch-image-playwright.ts [--vendor=arista]
 */
|
|
|
|
import { PlaywrightCrawler } from "crawlee";
|
|
import { pool } from "../utils/db";
|
|
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
|
|
|
// ── Stealth headers injected into every page ─────────────────────────────────
|
|
|
|
/** Desktop Chrome 131 on macOS user-agent string, applied to `navigator.userAgent` by the request handler. */
const STEALTH_UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
|
|
|
// ── Generic marketing image detector (mirrors switch-image-fetcher.ts) ────────
|
|
|
|
/**
 * URL patterns that identify images which are NOT product photos: logos,
 * site-wide social/OG defaults, hero banners, placeholders, consent overlays,
 * error graphics and a handful of vendor-specific marketing assets.
 *
 * All patterns are case-insensitive and none uses the `g` flag, so `test()`
 * carries no state between calls.
 */
const GENERIC_IMAGE_PATTERNS: RegExp[] = [
  // Logos and vector assets
  /[-/_]logo[-_.]|\/logos?\//i,
  /cisco[-_]?logo/i,
  /juniper[-_]networks[-_]logo/i,
  /arista[-_]?logo/i,
  /brand[-_]?logo/i,
  /company[-_]?logo/i,
  /\/svg\//i,
  /\.svg(\?|$)/i,

  // Alcatel-Lucent Enterprise NaaS homepage imagery
  /naas-homepag/i,
  /al-enterprise.*\/images\/naas/i,

  // Site-wide default / social-share images
  /og[-_]default/i,
  /default[-_](?:og|social|share|image)/i,
  /site[-_](?:default|image|og)/i,
  /social[-_](?:default|share)/i,
  /twitter[-_]default/i,
  /default[-_]thumbnail/i,

  // Homepage, hero, banner and stock imagery
  /\/homepage\//i,
  /hero[-_](?:banner|bg|background|image)/i,
  /banner[-_](?:bg|background)/i,
  /lifestyle/i,
  /stock[-_]?photo/i,

  // Placeholders and "missing image" stand-ins
  /placeholder/i,
  /no[-_]?image/i,
  /image[-_]?not[-_]?found/i,
  /\/fallback[/-]/i,
  /missing[-_]image/i,

  // Press / media kits
  /\/press[-_]kit/i,
  /\/media[-_]kit/i,

  // Brand icons
  /open-graph\.gif/i,
  /social[-_]icon/i,
  /favicon/i,
  /og[-_]image[-_][0-9]+x[0-9]+\./i, // e.g. og-image-1200x630 → family-level generic

  // Cookie consent / GDPR overlay images (OneTrust, Cookiebot, TrustArc, etc.)
  /cdn\.cookielaw\.org/i,
  /cookiebot\.com/i,
  /trustarc\.com/i,
  /consent-manager/i,

  // Nokia CMS marketing banners
  /nok\d+-nokia-com-banner/i,

  // Huawei category / why-buy marketing images
  /whyhuawei-/i,
  /campus-switches/i,
  /bg_products/i,

  // Generic "banner(s)" path segment used by CMSes
  /\/banners?\//i,

  // Vendor error / 404 graphics
  /404[-_]error/i,
  /error[-_]graphic/i,
  /webimage-404/i,

  // Navigation icon libraries (D-Link, other CMSes)
  /\/icon[-_]library\//i,

  // Diagrams and illustrations
  /[-_]illustration[._]/i,

  // Moxa brand / marketing images
  /\/Brand\//i,
  /cybersecurity\.png/i,
];

/**
 * Reports whether an image URL looks like generic marketing/brand artwork
 * rather than a product photo.
 *
 * @param url Image URL to classify.
 * @returns `true` when any entry of GENERIC_IMAGE_PATTERNS matches the URL.
 */
function isGenericImage(url: string): boolean {
  for (const pattern of GENERIC_IMAGE_PATTERNS) {
    if (pattern.test(url)) {
      return true;
    }
  }
  return false;
}
|
|
|
|
// ── Product page URL builders ─────────────────────────────────────────────────
|
|
|
|
/**
 * Builds the Arista *series* page URL for a model (og:image lives on series
 * pages, not on individual model pages).
 *
 * The series is the alphanumeric prefix before the first "-<digit>" port-count
 * suffix, lowercased, with a trailing sub-variant "a" removed:
 *   7060X5-32QS  → 7060x5  → /en/products/7060x5-series
 *   7050CX3-32S  → 7050cx3 → /en/products/7050cx3-series
 *   7280R3A-48D5 → 7280r3a → 7280r3 → /en/products/7280r3-series
 *   7020R        → 7020r   → /en/products/7020r-series
 *
 * @param model Arista model name.
 * @returns The series page URL, or `null` when the model does not start with a
 *          3–4 digit series number.
 */
function buildAristaUrl(model: string): string | null {
  const parsed = /^(\d{3,4}[A-Z0-9]*?)(-\d|$)/i.exec(model);
  if (parsed === null) {
    return null;
  }

  const seriesPrefix = parsed[1].toLowerCase();
  // Arista groups sub-variants such as R3A / R2A on the base series page (R3 / R2).
  const baseSeries = seriesPrefix.replace(/([a-z]\d+)a$/, "$1");

  return `https://www.arista.com/en/products/${baseSeries}-series`;
}
|
|
|
|
/**
 * Builds a Dell networking-switch search URL for a model.
 *
 * A leading "PowerSwitch " prefix is removed and the remaining model name is
 * URL-encoded into the keyword refinement, e.g.
 * "PowerSwitch Z9332F-ON" → ...KEYWORDS~Z9332F-ON.
 *
 * @param model Dell model name, with or without the "PowerSwitch" prefix.
 * @returns The search-results URL; this builder never returns `null`.
 */
function buildDellUrl(model: string): string | null {
  const cleanModel = model.replace(/^PowerSwitch\s+/i, "").trim();
  return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
}
|
|
|
|
// Edgecore uses WooCommerce with /product/<slug>/ URLs (no .html suffix).
// Some models have non-obvious slugs verified via sitemap.
const EDGECORE_SLUG_MAP: Record<string, string> = {
  "AS7712-32X": "as7712-32x-ec", // -ec suffix variant in Edgecore WooCommerce
  "Minipack2": "minipack-as8000-open-modular-platform", // Facebook OCP Minipack2
};

/**
 * Builds the Edgecore product page URL for a model.
 *
 * Models listed in EDGECORE_SLUG_MAP use their mapped slug; all others use a
 * derived slug (lowercased, non-alphanumerics replaced by "-", repeated dashes
 * collapsed, leading/trailing dash removed).
 *
 * @param model Edgecore model name.
 * @returns The product page URL, or `null` when the derived slug is empty.
 */
function buildEdgecoreUrl(model: string): string | null {
  // Own-property check: the `in` operator also matches keys inherited from
  // Object.prototype (e.g. "toString"), which would interpolate a function
  // into the URL instead of a slug.
  if (Object.prototype.hasOwnProperty.call(EDGECORE_SLUG_MAP, model)) {
    return `https://www.edge-core.com/product/${EDGECORE_SLUG_MAP[model]}/`;
  }

  const slug = model
    .toLowerCase()
    .replace(/[^a-z0-9-]/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");

  return slug ? `https://www.edge-core.com/product/${slug}/` : null;
}
|
|
|
|
/**
 * Fortinet builder — intentionally produces no URL.
 *
 * Fortinet product pages are fully JS-rendered and all redirect to the generic
 * /products/ethernet-switches page, so no reliable og:image can be extracted.
 *
 * @param _model Ignored.
 * @returns Always `null`.
 */
function buildFortinetUrl(_model: string): string | null {
  return null;
}
|
|
|
|
/**
 * Fallback URL builder for HPE Aruba models that have no stored
 * product_page_url (known models carry their series page in the database).
 *
 * The slug is the lowercased model with every character outside [a-z0-9-]
 * replaced by "-"; repeated dashes are NOT collapsed.
 *
 * @param model HPE Aruba model name.
 * @returns The arubanetworks.com switch page URL for the derived slug.
 */
function buildHpeArubaUrl(model: string): string | null {
  const lowered = model.toLowerCase();
  const slug = lowered.replace(/[^a-z0-9-]/g, "-");
  return "https://www.arubanetworks.com/products/switches/" + slug + "/";
}
|
|
|
|
/**
 * Builds an Extreme Networks direct product page URL
 * (extremenetworks.com/product/<slug>).
 *
 * The slug is the lowercased model with whitespace runs turned into "-", all
 * other characters outside [a-z0-9-] dropped, and repeated dashes collapsed.
 *
 * @param model Extreme Networks model name.
 * @returns The product URL, or `null` when the slug is empty.
 */
function buildExtremeUrl(model: string): string | null {
  const dashed = model.toLowerCase().replace(/\s+/g, "-");
  const cleaned = dashed.replace(/[^a-z0-9-]/g, "");
  const slug = cleaned.replace(/-+/g, "-");

  if (!slug) {
    return null;
  }
  return `https://www.extremenetworks.com/product/${slug}`;
}
|
|
|
|
// ── New vendors (JS-rendered; rely on stored product_page_url or built URL) ────
|
|
|
|
/**
 * No-op builder for vendors whose models all carry a product_page_url in the
 * database (Nokia, Huawei, Ciena, Moxa, D-Link, ALE, Asterfusion, Brocade).
 * Returning `null` makes the caller fall back to the stored URL.
 *
 * @param _model Ignored.
 * @returns Always `null`.
 */
const buildPassthroughUrl = (_model: string): string | null => {
  return null;
};
|
|
|
|
/**
 * Builds the NVIDIA Spectrum switch page URL from the leading "SN<digits>"
 * part of the model (SN5600, SN4700, SN3700, SN3750-SX, SN2201, ...).
 *
 * Models that do not start with "SN<digits>" — e.g. ConnectX-7, which is an
 * HCA without a relevant switch page — are skipped.
 *
 * @param model NVIDIA model name.
 * @returns The ethernet-switching page URL for the lowercased SN prefix, or
 *          `null` when the model has no such prefix.
 */
function buildNvidiaUrl(model: string): string | null {
  const spectrum = /^(SN[\d]+)/i.exec(model);
  if (!spectrum) {
    return null;
  }

  const family = spectrum[1].toLowerCase();
  return `https://www.nvidia.com/en-us/networking/ethernet-switching/${family}/`;
}
|
|
|
|
/**
 * Builds the Netgear fully-managed switch page URL for a model
 * (M4300-96X, M4350-48G4XF, M4500-32C → /business/wired/switches/fully-managed/<slug>/).
 *
 * The slug is the lowercased model with every non-alphanumeric character
 * replaced by "-", repeated dashes collapsed, and a leading/trailing dash removed.
 *
 * @param model Netgear model name.
 * @returns The product page URL, or `null` when the slug is empty.
 */
function buildNetgearUrl(model: string): string | null {
  const dashed = model.toLowerCase().replace(/[^a-z0-9]/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  const slug = collapsed.replace(/^-|-$/g, "");

  if (slug === "") {
    return null;
  }
  return `https://www.netgear.com/business/wired/switches/fully-managed/${slug}/`;
}
|
|
|
|
// UfiSpace: slug map derived from sitemap (non-predictable product URL tree)
const UFISPACE_URL_MAP: Record<string, string> = {
  "S9510-28DC": "https://www.ufispace.com/products/telco/access/s9510-28dc-flexe-tsn-disaggregated-cell-site-gateway",
  "S9600-30DX": "https://www.ufispace.com/products/telco/aggregation/s9600-30dx-open-zr-aggregation-router",
  "S9600-32X": "https://www.ufispace.com/products/telco/aggregation/s9600-32x-25g-100g-aggregation-router",
  "S9600-72XC": "https://www.ufispace.com/products/telco/aggregation/s9600-72xc-25g-100g-open-aggregation-router-tcam",
  "S9700-53DX": "https://www.ufispace.com/products/telco/core-edge/s9700-53dx-100g-core-router",
  "S9710-76D": "https://www.ufispace.com/products/telco/core-edge/s9710-76d-high-density-400g-disaggregated-core-router",
};

/**
 * Looks up the UfiSpace product page URL for a model.
 *
 * @param model UfiSpace model name; must match a key of UFISPACE_URL_MAP exactly.
 * @returns The mapped URL, or `null` for models not in the map.
 */
function buildUfiSpaceUrl(model: string): string | null {
  // Own-property check: a plain index lookup would return inherited
  // Object.prototype members (e.g. for "toString") instead of null.
  return Object.prototype.hasOwnProperty.call(UFISPACE_URL_MAP, model)
    ? UFISPACE_URL_MAP[model]
    : null;
}
|
|
|
|
// QCT: URL map derived from sitemap (category path not predictable from model name)
const QCT_URL_MAP: Record<string, string> = {
  "QuantaMesh T3048-LY8": "https://www.qct.io/product/index/Networking/Ethernet-Switch/T3000-Series/QuantaMesh-T3048-LY8",
  "QuantaMesh T7032-IX1": "https://www.qct.io/product/index/Networking/Bare-Metal-Switch/Spine-Switch/QuantaMesh-BMS-T7032-IX1",
};

/**
 * Looks up the QCT product page URL for a model.
 *
 * @param model QCT model name; must match a key of QCT_URL_MAP exactly.
 * @returns The mapped URL, or `null` for models not in the map.
 */
function buildQctUrl(model: string): string | null {
  // Own-property check: a plain index lookup would return inherited
  // Object.prototype members (e.g. for "toString") instead of null.
  return Object.prototype.hasOwnProperty.call(QCT_URL_MAP, model)
    ? QCT_URL_MAP[model]
    : null;
}
|
|
|
|
/**
 * Vendor slug → product-page URL builder.
 *
 * The keys double as the default vendor list for a run without a vendor
 * filter. A builder returning `null` means "no URL can be built for this
 * model"; the caller then relies on the stored product_page_url, if any.
 */
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
  // Original bot-blocked vendors
  arista: buildAristaUrl,
  dell: buildDellUrl,
  edgecore: buildEdgecoreUrl,
  fortinet: buildFortinetUrl,
  "hpe-aruba": buildHpeArubaUrl,
  extreme: buildExtremeUrl,

  // JS-rendered vendors whose stored product_page_url is used as-is
  nokia: buildPassthroughUrl,
  huawei: buildPassthroughUrl,
  ciena: buildPassthroughUrl,
  moxa: buildPassthroughUrl,
  "d-link": buildPassthroughUrl,
  "alcatel-lucent-enterprise": buildPassthroughUrl,
  asterfusion: buildPassthroughUrl,
  brocade: buildPassthroughUrl,

  // JS-rendered vendors with a pattern- or map-based builder
  "nvidia-networking": buildNvidiaUrl,
  netgear: buildNetgearUrl,
  ufispace: buildUfiSpaceUrl,
  "quanta-cloud-technology": buildQctUrl,
};
|
|
|
|
// ── Request data attached to each crawl URL ──────────────────────────────────
|
|
|
|
/** Per-request payload carried in `request.userData` for each crawled product page. */
interface SwitchCrawlData {
  /** Primary key of the row in the `switches` table that the result is written to. */
  switchId: string;
  /** Switch model name, used for log output. */
  model: string;
  /** Vendor display name, used for log output. */
  vendorName: string;
  /** Vendor slug (the key used to select a URL builder). */
  vendorSlug: string;
  /** Product page URL that is being crawled for this switch. */
  productPageUrl: string;
}
|
|
|
|
// ── Main scraper ──────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Finds switches that still lack an image, opens each product page in headless
 * Chromium and stores the first non-generic product image it can extract.
 *
 * For every switch the crawl URL is chosen as follows: Arista always prefers
 * the freshly built series URL; every other vendor prefers the stored
 * product_page_url and falls back to the vendor's URL builder. Switches with
 * no usable URL are skipped.
 *
 * On each page the og:image / twitter:image meta tag is tried first; if it is
 * absent or generic, the largest qualifying `<img>` is used instead. Results
 * are written to the `switches` table (image_url, product_page_url,
 * assets_scraped_at). A miss records the attempted URL and timestamp only.
 *
 * @param targetVendorSlug Optional vendor slug to restrict the run to. When
 *        omitted (or empty), every slug registered in URL_BUILDERS is processed.
 * @returns Resolves when the crawl has finished and the summary is logged.
 */
export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Promise<void> {
  console.log("=== Switch Image Fetcher (Playwright) ===\n");

  // The slug can come straight from the command line, so it is bound as a
  // query parameter instead of being spliced into the SQL text.
  const vendorSlugs = targetVendorSlug ? [targetVendorSlug] : Object.keys(URL_BUILDERS);

  const { rows } = await pool.query<{
    id: string;
    model: string;
    vendor_slug: string;
    vendor_name: string;
    product_page_url: string | null;
  }>(
    `
    SELECT sw.id, sw.model, sw.product_page_url,
           v.slug AS vendor_slug, v.name AS vendor_name
    FROM switches sw
    JOIN vendors v ON v.id = sw.vendor_id
    WHERE (sw.image_url IS NULL OR sw.image_url = '')
      AND v.slug = ANY($1::text[])
    ORDER BY v.slug, sw.model
    `,
    [vendorSlugs],
  );

  if (rows.length === 0) {
    console.log(" All target switches already have images.\n");
    return;
  }

  console.log(` ${rows.length} switches need images (Playwright vendors)\n`);

  const requests: Array<{ url: string; uniqueKey: string; userData: SwitchCrawlData }> = [];

  for (const row of rows) {
    const builder = URL_BUILDERS[row.vendor_slug];
    const builtUrl = builder ? builder(row.model) : null;
    // An empty stored URL is treated like a missing one (the query above does
    // the same for image_url), so the built URL can still be used.
    const storedUrl = row.product_page_url || null;
    const productUrl = row.vendor_slug === "arista"
      ? (builtUrl ?? storedUrl) // Arista: fresh series URL beats a stale stored model URL
      : (storedUrl ?? builtUrl); // other vendors: prefer stored URL
    if (!productUrl) {
      console.log(` [SKIP] ${row.vendor_name} ${row.model} — no URL`);
      continue;
    }
    requests.push({
      url: productUrl,
      // Use switch ID as uniqueKey so Crawlee doesn't deduplicate series-level URLs.
      // Multiple models can share the same series page (e.g. 7060x5-series) — each needs its own DB write.
      uniqueKey: row.id,
      userData: {
        switchId: row.id,
        model: row.model,
        vendorName: row.vendor_name,
        vendorSlug: row.vendor_slug,
        productPageUrl: productUrl,
      },
    });
  }

  if (requests.length === 0) {
    console.log(" Nothing to crawl.\n");
    return;
  }

  let found = 0;
  let missed = 0;
  let errors = 0;

  const crawler = new PlaywrightCrawler(
    {
      maxConcurrency: 1, // one at a time — server-friendly
      maxRequestsPerMinute: 12, // ~5s per request minimum
      requestHandlerTimeoutSecs: 45,
      navigationTimeoutSecs: 30,
      headless: true,
      launchContext: {
        launchOptions: {
          args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--window-size=1920,1080",
          ],
        },
      },

      preNavigationHooks: [
        async (_ctx, gotoOptions) => {
          // Don't wait for the full load event; the handler waits for
          // network idle itself with its own (shorter) timeout.
          if (gotoOptions) {
            gotoOptions.waitUntil = "domcontentloaded";
          }
        },
      ],

      async requestHandler({ request, page }) {
        const data = request.userData as SwitchCrawlData;

        // Inject stealth UA
        // NOTE(review): this runs after navigation has already completed, so the
        // extra headers only affect requests issued from here on, and the
        // navigator overrides are not visible to scripts that ran during page
        // load. If they are meant to apply to the initial document they likely
        // belong in a preNavigationHook (page.addInitScript) — confirm before
        // moving, since that changes what vendor bot checks observe.
        await page.setExtraHTTPHeaders({
          "Accept-Language": "en-US,en;q=0.9",
          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        });
        await page.evaluate((ua) => {
          Object.defineProperty(navigator, "userAgent", { value: ua, configurable: true });
          Object.defineProperty(navigator, "webdriver", { value: false, configurable: true });
        }, STEALTH_UA);

        // Wait for page to settle (JS rendering); a timeout here is not fatal.
        await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});

        // Extract og:image / twitter:image meta tags.
        // We DON'T filter generics here — we filter outside so the img fallback can still run
        // even when og:image exists but is a logo/brand image (e.g. Dell, HPE).
        const metaImageUrl: string | null = await page.evaluate(() => {
          const og = document.querySelector<HTMLMetaElement>('meta[property="og:image"]')?.content;
          if (og) return og;
          const tw = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]')?.content;
          return tw ?? null;
        });

        // Use meta image if it passes the generic filter; otherwise fall through to img fallback.
        let imageUrl: string | null = (metaImageUrl && !isGenericImage(metaImageUrl)) ? metaImageUrl : null;

        if (!imageUrl) {
          // Img fallback: largest visible image that isn't a UI element.
          // Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
          imageUrl = await page.evaluate(() => {
            const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
            const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading|cookielaw|cookiebot|trustarc/i;
            const candidate = imgs
              .filter((img) => {
                const src = img.src || img.getAttribute("data-src") || "";
                return src.startsWith("http") &&
                  /\.(jpg|jpeg|png|webp)/i.test(src) &&
                  img.naturalWidth >= 200 &&
                  img.naturalHeight >= 150 &&
                  !skipPattern.test(src);
              })
              .sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
            return candidate?.src ?? null;
          });
        }

        if (!imageUrl || isGenericImage(imageUrl)) {
          console.log(` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`);

          // Save product_page_url even on miss to track that we tried. The
          // previous guard (`!data.productPageUrl`) could never be true because
          // switches without a URL are skipped before crawling, so the attempt
          // was never recorded. COALESCE keeps an existing stored URL intact.
          await pool.query(
            `UPDATE switches
             SET product_page_url = COALESCE(NULLIF(product_page_url, ''), $2),
                 assets_scraped_at = NOW()
             WHERE id = $1`,
            [data.switchId, request.url],
          );
          // Counted after the write so a retried request is not counted twice.
          missed++;
          return;
        }

        await pool.query(
          `UPDATE switches
           SET image_url = $2,
               product_page_url = COALESCE(NULLIF(product_page_url, ''), $3),
               assets_scraped_at = NOW()
           WHERE id = $1`,
          [data.switchId, imageUrl, request.url],
        );
        console.log(` [OK] ${data.vendorName} ${data.model} → ${imageUrl.slice(0, 80)}`);
        found++;
      },

      async failedRequestHandler({ request }) {
        const data = request.userData as SwitchCrawlData;
        console.log(` [FAIL] ${data.vendorName} ${data.model} — ${request.errorMessages?.[0] ?? "unknown error"}`);
        errors++;
      },
    },
    // Use a unique run ID to avoid Crawlee temp-dir state contamination when multiple
    // vendor runs execute back-to-back (ENOENT: stale request-queue files from prior run).
    makeCrawleeConfig(`switch-images-playwright-${Date.now()}`),
  );

  await crawler.run(requests);

  console.log(`\n=== Playwright Image Scraper Complete ===`);
  console.log(` Images found: ${found}`);
  console.log(` Missed: ${missed}`);
  if (errors > 0) console.warn(` Errors: ${errors}`);
}
|
|
|
|
// CLI entry point: runs the fetcher when this file is executed directly,
// optionally restricted to one vendor via `--vendor=<slug>`.
if (require.main === module) {
  const vendorFlag = process.argv.find((arg) => arg.startsWith("--vendor="));
  const vendor = vendorFlag?.split("=")[1];

  // Logs the failure, starts closing the pool and exits with a non-zero status.
  const exitOnFatal = (err: unknown): void => {
    console.error("Fatal:", err);
    pool.end();
    process.exit(1);
  };

  fetchSwitchImagesPlaywright(vendor)
    .then(() => pool.end())
    .catch(exitOnFatal);
}
|