feat: add 12 new vendor URL builders to Playwright image scraper
- Nokia, Huawei, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise, Asterfusion, Brocade: passthrough builders (use stored product_page_url)
- NVIDIA Networking: SN-series URL builder (sn5600 → /ethernet-switching/sn5600/)
- Netgear: lowercase model slug builder for /business/wired/switches/fully-managed/
- UfiSpace: hardcoded sitemap-verified URL map (all 6 S9xxx models)
- QCT: hardcoded URL map for T3048-LY8 and T7032-IX1
- Add Nokia banner / Huawei marketing image patterns to GENERIC_IMAGE_PATTERNS
This commit is contained in:
parent
6a4b4700cb
commit
f4afe14af4
@ -3,7 +3,9 @@
|
||||
*
|
||||
* Vendors that reject plain HTTP bots (403/406) or require JS rendering:
|
||||
* Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
|
||||
* Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs),
|
||||
* Nokia, Huawei, NVIDIA, Netgear, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise,
|
||||
* Asterfusion, Brocade, UfiSpace, QCT
|
||||
*
|
||||
* Strategy:
|
||||
* 1. Query switches without image_url for JS-blocked vendors
|
||||
@ -65,6 +67,14 @@ const GENERIC_IMAGE_PATTERNS: RegExp[] = [
|
||||
/cookiebot\.com/i,
|
||||
/trustarc\.com/i,
|
||||
/consent-manager/i,
|
||||
// Nokia CMS marketing banners (not product photos)
|
||||
/nok\d+-nokia-com-banner/i,
|
||||
// Huawei category/why-buy marketing images
|
||||
/whyhuawei-/i,
|
||||
/campus-switches/i,
|
||||
/bg_products/i,
|
||||
// Generic "banners" path segment used by CMSes
|
||||
/\/banners?\//i,
|
||||
];
|
||||
|
||||
function isGenericImage(url: string): boolean {
|
||||
@ -138,13 +148,73 @@ function buildExtremeUrl(model: string): string | null {
|
||||
return slug ? `https://www.extremenetworks.com/product/${slug}` : null;
|
||||
}
|
||||
|
||||
// ── New vendors (JS-rendered; rely on stored product_page_url or built URL) ────
|
||||
|
||||
// Nokia, Huawei, Ciena, Moxa, D-Link, ALE, Asterfusion, Brocade:
|
||||
// all models have product_page_url in DB → return null so the stored URL is used.
|
||||
/**
 * URL builder for vendors that need no constructed URL.
 *
 * Always yields `null`, whatever model is passed in; a `null` result lets the
 * product_page_url already stored for the model be used instead.
 *
 * @param _model - Ignored; present only so the function matches the builder signature.
 * @returns Always `null`.
 */
function buildPassthroughUrl(_model: string): string | null {
  return null;
}
|
||||
|
||||
/**
 * Build the NVIDIA product-page URL for a Spectrum SN-series switch model.
 *
 * Only the leading `SN<digits>` part of the model is used (matched
 * case-insensitively, emitted in lower case), so a suffixed model such as
 * `SN3750-SX` resolves to the `sn3750` page.
 *
 * @param model - Model name, e.g. `SN5600`. Surrounding whitespace is ignored.
 * @returns The product-page URL, or `null` when the model does not start with
 *   an SN-series identifier.
 */
function buildNvidiaUrl(model: string): string | null {
  // NVIDIA Spectrum switches: SN5600, SN4700, SN3700, SN3750-SX, SN2201, etc.
  // ConnectX-7 is an HCA, no relevant product page → skip.
  const series = /^SN\d+/i.exec(model.trim())?.[0];
  if (series === undefined) {
    return null;
  }
  return `https://www.nvidia.com/en-us/networking/ethernet-switching/${series.toLowerCase()}/`;
}
|
||||
|
||||
/**
 * Build the Netgear fully-managed-switch product URL from a model name.
 *
 * The model is lower-cased and turned into a URL slug: every run of characters
 * other than `a-z` / `0-9` becomes a single dash, and a dash left at either
 * end is removed.
 *
 * @param model - Model name, e.g. `M4300-96X`.
 * @returns The product-page URL, or `null` when the model contains no
 *   alphanumeric characters (the slug would be empty).
 */
function buildNetgearUrl(model: string): string | null {
  // M4300-96X, M4350-48G4XF, M4500-32C → /business/wired/switches/fully-managed/<slug>/
  const slug = model
    .toLowerCase()
    // One pass: a whole run of non-alphanumerics collapses to a single dash.
    .replace(/[^a-z0-9]+/g, "-")
    // At most one dash can remain at each end after the collapse above.
    .replace(/^-|-$/g, "");
  return slug ? `https://www.netgear.com/business/wired/switches/fully-managed/${slug}/` : null;
}
|
||||
|
||||
// UfiSpace: slug map derived from sitemap (non-predictable product URL tree)
|
||||
/** UfiSpace product-page URLs keyed by upper-case model name. */
const UFISPACE_URL_MAP: Record<string, string> = {
  "S9510-28DC": "https://www.ufispace.com/products/telco/access/s9510-28dc-flexe-tsn-disaggregated-cell-site-gateway",
  "S9600-30DX": "https://www.ufispace.com/products/telco/aggregation/s9600-30dx-open-zr-aggregation-router",
  "S9600-32X": "https://www.ufispace.com/products/telco/aggregation/s9600-32x-25g-100g-aggregation-router",
  "S9600-72XC": "https://www.ufispace.com/products/telco/aggregation/s9600-72xc-25g-100g-open-aggregation-router-tcam",
  "S9700-53DX": "https://www.ufispace.com/products/telco/core-edge/s9700-53dx-100g-core-router",
  "S9710-76D": "https://www.ufispace.com/products/telco/core-edge/s9710-76d-high-density-400g-disaggregated-core-router",
};

/**
 * Look up the UfiSpace product-page URL for a model.
 *
 * The model is trimmed and upper-cased before the lookup, so `s9600-32x` and
 * ` S9600-32X ` both resolve to the `S9600-32X` entry.
 *
 * @param model - Model name, e.g. `S9600-32X`.
 * @returns The mapped URL, or `null` when the model has no entry.
 */
function buildUfiSpaceUrl(model: string): string | null {
  const key = model.trim().toUpperCase();
  // Own-property check: a bare index lookup on a plain object would also
  // return inherited members such as `constructor` or `toString`.
  if (!Object.prototype.hasOwnProperty.call(UFISPACE_URL_MAP, key)) {
    return null;
  }
  return UFISPACE_URL_MAP[key] ?? null;
}
|
||||
|
||||
// QCT: URL map derived from sitemap (category path not predictable from model name)
|
||||
/** QCT product-page URLs keyed by full model name (including the `QuantaMesh ` prefix). */
const QCT_URL_MAP: Record<string, string> = {
  "QuantaMesh T3048-LY8": "https://www.qct.io/product/index/Networking/Ethernet-Switch/T3000-Series/QuantaMesh-T3048-LY8",
  "QuantaMesh T7032-IX1": "https://www.qct.io/product/index/Networking/Bare-Metal-Switch/Spine-Switch/QuantaMesh-BMS-T7032-IX1",
};

/**
 * Look up the QCT product-page URL for a model.
 *
 * The trimmed model is tried as given and then with a `QuantaMesh ` prefix,
 * so both `QuantaMesh T3048-LY8` and the bare `T3048-LY8` resolve.
 *
 * @param model - Model name, with or without the `QuantaMesh ` prefix.
 * @returns The mapped URL, or `null` when the model has no entry.
 */
function buildQctUrl(model: string): string | null {
  const name = model.trim();
  for (const key of [name, `QuantaMesh ${name}`]) {
    // Own-property check: a bare index lookup on a plain object would also
    // return inherited members such as `constructor` or `toString`.
    if (Object.prototype.hasOwnProperty.call(QCT_URL_MAP, key)) {
      return QCT_URL_MAP[key] ?? null;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Product-page URL builder for each supported vendor key.
 *
 * Each builder takes a model name and returns a URL or `null`. Every key is
 * listed exactly once: a repeated property name in an object literal is a
 * TypeScript error, and at runtime only the last occurrence would take effect.
 *
 * NOTE(review): keys look like vendor slugs as stored alongside each switch —
 * confirm against the caller that indexes this table.
 */
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
  arista: buildAristaUrl,
  dell: buildDellUrl,
  edgecore: buildEdgecoreUrl,
  fortinet: buildFortinetUrl,
  "hpe-aruba": buildHpeArubaUrl,
  extreme: buildExtremeUrl,
  // New JS-rendered vendors (stored product_page_url used where available)
  nokia: buildPassthroughUrl,
  huawei: buildPassthroughUrl,
  ciena: buildPassthroughUrl,
  moxa: buildPassthroughUrl,
  "d-link": buildPassthroughUrl,
  "alcatel-lucent-enterprise": buildPassthroughUrl,
  asterfusion: buildPassthroughUrl,
  brocade: buildPassthroughUrl,
  "nvidia-networking": buildNvidiaUrl,
  netgear: buildNetgearUrl,
  ufispace: buildUfiSpaceUrl,
  "quanta-cloud-technology": buildQctUrl,
};
|
||||
|
||||
// ── Request data attached to each crawl URL ──────────────────────────────────
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user