feat: add 12 new vendor URL builders to Playwright image scraper

- Nokia, Huawei, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise,
  Asterfusion, Brocade: passthrough builders (use stored product_page_url)
- NVIDIA Networking: SN-series URL builder (sn5600 → /ethernet-switching/sn5600/)
- Netgear: lowercase model slug builder for /business/wired/switches/fully-managed/
- UfiSpace: hardcoded sitemap-verified URL map (all 6 S9xxx models)
- QCT: hardcoded URL map for T3048-LY8 and T7032-IX1
- Add Nokia banner / Huawei marketing image patterns to GENERIC_IMAGE_PATTERNS
This commit is contained in:
Rene Fichtmueller 2026-04-21 07:24:11 +02:00
parent 07e1fc9178
commit 88403eb7eb

View File

@ -3,7 +3,9 @@
*
* Vendors that reject plain HTTP bots (403/406) or require JS rendering:
* Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
* Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs),
* Nokia, Huawei, NVIDIA, Netgear, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise,
* Asterfusion, Brocade, UfiSpace, QCT
*
* Strategy:
* 1. Query switches without image_url for JS-blocked vendors
@ -65,6 +67,14 @@ const GENERIC_IMAGE_PATTERNS: RegExp[] = [
/cookiebot\.com/i,
/trustarc\.com/i,
/consent-manager/i,
// Nokia CMS marketing banners (not product photos)
/nok\d+-nokia-com-banner/i,
// Huawei category/why-buy marketing images
/whyhuawei-/i,
/campus-switches/i,
/bg_products/i,
// Generic "banners" path segment used by CMSes
/\/banners?\//i,
];
function isGenericImage(url: string): boolean {
@ -138,13 +148,73 @@ function buildExtremeUrl(model: string): string | null {
return slug ? `https://www.extremenetworks.com/product/${slug}` : null;
}
// ── New vendors (JS-rendered; rely on stored product_page_url or built URL) ────
/**
 * URL builder for vendors whose product page is not derived from the model
 * name: Nokia, Huawei, Ciena, Moxa, D-Link, ALE, Asterfusion, Brocade.
 *
 * Per the original note, all models of these vendors already carry a
 * product_page_url in the DB, so this builder yields null and the stored URL
 * is used instead.
 *
 * @param _model - Model name; intentionally ignored.
 * @returns Always null.
 */
function buildPassthroughUrl(_model: string): string | null {
  return null;
}
/**
 * Builds the NVIDIA product page URL for an SN-series (Spectrum) switch.
 *
 * Only the leading "SN<digits>" part of the model is used, lower-cased, so a
 * suffixed model such as "SN3750-SX" resolves to the ".../sn3750/" page.
 * Models that do not start with "SN" followed by at least one digit
 * (e.g. ConnectX-7, which is an HCA) have no relevant page and are skipped.
 *
 * @param model - Model name, e.g. "SN5600".
 * @returns The product page URL, or null when the model is not SN-series.
 */
function buildNvidiaUrl(model: string): string | null {
  const seriesMatch = /^(SN\d+)/i.exec(model);
  if (seriesMatch === null) {
    return null;
  }

  const seriesSlug = seriesMatch[1].toLowerCase();
  return `https://www.nvidia.com/en-us/networking/ethernet-switching/${seriesSlug}/`;
}
/**
 * Builds the Netgear fully-managed switch product page URL from a model name.
 *
 * The model is lower-cased and reduced to its runs of [a-z0-9], which are
 * joined with single hyphens (e.g. "M4300-96X" -> "m4300-96x"). This gives
 * the same slug as replacing every other character with "-", collapsing
 * repeated hyphens and trimming hyphens at both ends.
 *
 * @param model - Model name, e.g. "M4350-48G4XF".
 * @returns The product page URL, or null when the model contains no
 *          alphanumeric characters.
 */
function buildNetgearUrl(model: string): string | null {
  const slugParts = model
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((part) => part.length > 0);

  if (slugParts.length === 0) {
    return null;
  }

  return `https://www.netgear.com/business/wired/switches/fully-managed/${slugParts.join("-")}/`;
}
// UfiSpace: slug map derived from sitemap (non-predictable product URL tree)
const UFISPACE_URL_MAP: Record<string, string> = {
  "S9510-28DC": "https://www.ufispace.com/products/telco/access/s9510-28dc-flexe-tsn-disaggregated-cell-site-gateway",
  "S9600-30DX": "https://www.ufispace.com/products/telco/aggregation/s9600-30dx-open-zr-aggregation-router",
  "S9600-32X": "https://www.ufispace.com/products/telco/aggregation/s9600-32x-25g-100g-aggregation-router",
  "S9600-72XC": "https://www.ufispace.com/products/telco/aggregation/s9600-72xc-25g-100g-open-aggregation-router-tcam",
  "S9700-53DX": "https://www.ufispace.com/products/telco/core-edge/s9700-53dx-100g-core-router",
  "S9710-76D": "https://www.ufispace.com/products/telco/core-edge/s9710-76d-high-density-400g-disaggregated-core-router",
};

/**
 * Looks up the UfiSpace product page URL for an exact model name.
 *
 * @param model - Model name; must match a key of UFISPACE_URL_MAP exactly
 *                (case-sensitive, no surrounding whitespace).
 * @returns The mapped URL, or null when the model is not in the map.
 */
function buildUfiSpaceUrl(model: string): string | null {
  // Only own keys count: a bare index lookup would also resolve inherited
  // Object.prototype members ("constructor", "toString", "__proto__", ...)
  // and hand back a non-string value instead of null.
  if (!Object.prototype.hasOwnProperty.call(UFISPACE_URL_MAP, model)) {
    return null;
  }
  return UFISPACE_URL_MAP[model] ?? null;
}
// QCT: URL map derived from sitemap (category path not predictable from model name)
const QCT_URL_MAP: Record<string, string> = {
  "QuantaMesh T3048-LY8": "https://www.qct.io/product/index/Networking/Ethernet-Switch/T3000-Series/QuantaMesh-T3048-LY8",
  "QuantaMesh T7032-IX1": "https://www.qct.io/product/index/Networking/Bare-Metal-Switch/Spine-Switch/QuantaMesh-BMS-T7032-IX1",
};

/**
 * Looks up the QCT product page URL for an exact model name.
 *
 * @param model - Model name; must match a key of QCT_URL_MAP exactly,
 *                including the "QuantaMesh " prefix.
 * @returns The mapped URL, or null when the model is not in the map.
 */
function buildQctUrl(model: string): string | null {
  // Only own keys count: a bare index lookup would also resolve inherited
  // Object.prototype members ("constructor", "toString", "__proto__", ...)
  // and hand back a non-string value instead of null.
  if (!Object.prototype.hasOwnProperty.call(QCT_URL_MAP, model)) {
    return null;
  }
  return QCT_URL_MAP[model] ?? null;
}
/**
 * Per-vendor URL builders, keyed by vendor identifier.
 *
 * Each builder takes a model name and returns a product page URL, or null
 * when no URL can be built for that model.
 *
 * Every key is listed exactly once: repeating a key in one object literal is
 * a TypeScript compile error.
 */
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
  arista: buildAristaUrl,
  dell: buildDellUrl,
  edgecore: buildEdgecoreUrl,
  fortinet: buildFortinetUrl,
  "hpe-aruba": buildHpeArubaUrl,
  extreme: buildExtremeUrl,
  // New JS-rendered vendors (stored product_page_url used where available)
  nokia: buildPassthroughUrl,
  huawei: buildPassthroughUrl,
  ciena: buildPassthroughUrl,
  moxa: buildPassthroughUrl,
  "d-link": buildPassthroughUrl,
  "alcatel-lucent-enterprise": buildPassthroughUrl,
  asterfusion: buildPassthroughUrl,
  brocade: buildPassthroughUrl,
  "nvidia-networking": buildNvidiaUrl,
  netgear: buildNetgearUrl,
  ufispace: buildUfiSpaceUrl,
  "quanta-cloud-technology": buildQctUrl,
};
// ── Request data attached to each crawl URL ──────────────────────────────────