feat: add 12 new vendor URL builders to Playwright image scraper
- Nokia, Huawei, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise, Asterfusion, Brocade: passthrough builders (use stored product_page_url)
- NVIDIA Networking: SN-series URL builder (sn5600 → /ethernet-switching/sn5600/)
- Netgear: lowercase model slug builder for /business/wired/switches/fully-managed/
- UfiSpace: hardcoded sitemap-verified URL map (all 6 S9xxx models)
- QCT: hardcoded URL map for T3048-LY8 and T7032-IX1
- Add Nokia banner / Huawei marketing image patterns to GENERIC_IMAGE_PATTERNS
This commit is contained in:
parent
07e1fc9178
commit
88403eb7eb
@ -3,7 +3,9 @@
|
|||||||
*
|
*
|
||||||
* Vendors that reject plain HTTP bots (403/406) or require JS rendering:
|
* Vendors that reject plain HTTP bots (403/406) or require JS rendering:
|
||||||
* Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
|
* Arista (HTTP 406), Dell (HTTP 403), Edgecore (HTTP 403),
|
||||||
* Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs)
|
* Fortinet (JS-rendered), HPE/Aruba (HTTP 403), Extreme Networks (no static URLs),
|
||||||
|
* Nokia, Huawei, NVIDIA, Netgear, Ciena, Moxa, D-Link, Alcatel-Lucent Enterprise,
|
||||||
|
* Asterfusion, Brocade, UfiSpace, QCT
|
||||||
*
|
*
|
||||||
* Strategy:
|
* Strategy:
|
||||||
* 1. Query switches without image_url for JS-blocked vendors
|
* 1. Query switches without image_url for JS-blocked vendors
|
||||||
@ -65,6 +67,14 @@ const GENERIC_IMAGE_PATTERNS: RegExp[] = [
|
|||||||
/cookiebot\.com/i,
|
/cookiebot\.com/i,
|
||||||
/trustarc\.com/i,
|
/trustarc\.com/i,
|
||||||
/consent-manager/i,
|
/consent-manager/i,
|
||||||
|
// Nokia CMS marketing banners (not product photos)
|
||||||
|
/nok\d+-nokia-com-banner/i,
|
||||||
|
// Huawei category/why-buy marketing images
|
||||||
|
/whyhuawei-/i,
|
||||||
|
/campus-switches/i,
|
||||||
|
/bg_products/i,
|
||||||
|
// Generic "banners" path segment used by CMSes
|
||||||
|
/\/banners?\//i,
|
||||||
];
|
];
|
||||||
|
|
||||||
function isGenericImage(url: string): boolean {
|
function isGenericImage(url: string): boolean {
|
||||||
@ -138,13 +148,73 @@ function buildExtremeUrl(model: string): string | null {
|
|||||||
return slug ? `https://www.extremenetworks.com/product/${slug}` : null;
|
return slug ? `https://www.extremenetworks.com/product/${slug}` : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── New vendors (JS-rendered; rely on stored product_page_url or built URL) ────
|
||||||
|
|
||||||
|
/**
 * Shared builder for Nokia, Huawei, Ciena, Moxa, D-Link, ALE, Asterfusion
 * and Brocade.
 *
 * All models of these vendors have a product_page_url in the DB, so no URL is
 * derived from the model name: the builder always answers null and the stored
 * URL is used instead.
 *
 * @param _model Model name; intentionally ignored.
 * @returns Always null.
 */
const buildPassthroughUrl = (_model: string): string | null => {
  return null;
};
|
||||||
|
|
||||||
|
/**
 * Builds the nvidia.com product page URL for an NVIDIA Spectrum switch.
 *
 * Only the leading "SN<digits>" part of the model is used (matched
 * case-insensitively, then lowercased), so both SN5600 and SN3750-SX resolve
 * to /ethernet-switching/<sn-number>/ pages.
 *
 * @param model Model name, e.g. SN5600, SN4700, SN3700, SN3750-SX, SN2201.
 * @returns The product page URL, or null when the model does not start with
 *   an SN-series identifier (e.g. ConnectX-7, an HCA with no relevant page).
 */
function buildNvidiaUrl(model: string): string | null {
  const series = /^(SN[\d]+)/i.exec(model);
  if (series === null) {
    // Not an SN-series switch → nothing to build.
    return null;
  }
  const slug = series[1].toLowerCase();
  return "https://www.nvidia.com/en-us/networking/ethernet-switching/" + slug + "/";
}
|
||||||
|
|
||||||
|
/**
 * Builds the netgear.com product page URL for a fully-managed switch.
 *
 * The model is lowercased and slugified: every character outside [a-z0-9]
 * becomes a dash, runs of dashes collapse to one, and a leading/trailing dash
 * is dropped. E.g. M4300-96X → /business/wired/switches/fully-managed/m4300-96x/
 *
 * @param model Model name, e.g. M4300-96X, M4350-48G4XF, M4500-32C.
 * @returns The product page URL, or null when the slug comes out empty.
 */
function buildNetgearUrl(model: string): string | null {
  const lowered = model.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  const slug = collapsed.replace(/^-|-$/g, "");

  if (!slug) {
    return null;
  }
  return `https://www.netgear.com/business/wired/switches/fully-managed/${slug}/`;
}
|
||||||
|
|
||||||
|
// UfiSpace: slug map derived from sitemap (non-predictable product URL tree).
// Keys are upper-case model names; values are full product page URLs.
const UFISPACE_URL_MAP: Record<string, string> = {
  "S9510-28DC": "https://www.ufispace.com/products/telco/access/s9510-28dc-flexe-tsn-disaggregated-cell-site-gateway",
  "S9600-30DX": "https://www.ufispace.com/products/telco/aggregation/s9600-30dx-open-zr-aggregation-router",
  "S9600-32X": "https://www.ufispace.com/products/telco/aggregation/s9600-32x-25g-100g-aggregation-router",
  "S9600-72XC": "https://www.ufispace.com/products/telco/aggregation/s9600-72xc-25g-100g-open-aggregation-router-tcam",
  "S9700-53DX": "https://www.ufispace.com/products/telco/core-edge/s9700-53dx-100g-core-router",
  "S9710-76D": "https://www.ufispace.com/products/telco/core-edge/s9710-76d-high-density-400g-disaggregated-core-router",
};

/**
 * Looks up the product page URL for a UfiSpace model in UFISPACE_URL_MAP.
 *
 * The model is trimmed and upper-cased before the lookup, so matching is
 * case-insensitive like the regex/slug based builders in this file.
 *
 * @param model Model name, e.g. S9600-32X.
 * @returns The mapped URL, or null when the model is not in the map.
 */
function buildUfiSpaceUrl(model: string): string | null {
  const key = model.trim().toUpperCase();
  // Own-property check: a bare index lookup would also walk the prototype
  // chain and hand back inherited members (e.g. for "constructor" or
  // "__proto__") instead of null.
  if (!Object.prototype.hasOwnProperty.call(UFISPACE_URL_MAP, key)) {
    return null;
  }
  return UFISPACE_URL_MAP[key] ?? null;
}
|
||||||
|
|
||||||
|
// QCT: URL map derived from sitemap (category path not predictable from model name).
// Keys are full model names including the "QuantaMesh" prefix.
const QCT_URL_MAP: Record<string, string> = {
  "QuantaMesh T3048-LY8": "https://www.qct.io/product/index/Networking/Ethernet-Switch/T3000-Series/QuantaMesh-T3048-LY8",
  "QuantaMesh T7032-IX1": "https://www.qct.io/product/index/Networking/Bare-Metal-Switch/Spine-Switch/QuantaMesh-BMS-T7032-IX1",
};

/**
 * Looks up the product page URL for a QCT model in QCT_URL_MAP.
 *
 * Matching ignores surrounding whitespace and letter case, consistent with
 * the regex/slug based builders in this file.
 *
 * @param model Model name, e.g. "QuantaMesh T3048-LY8".
 * @returns The mapped URL, or null when the model is not in the map.
 */
function buildQctUrl(model: string): string | null {
  const wanted = model.trim().toLowerCase();
  // Object.entries only yields the map's own keys, so inherited members such
  // as "constructor" or "__proto__" can never be returned as a URL.
  for (const [name, url] of Object.entries(QCT_URL_MAP)) {
    if (name.toLowerCase() === wanted) {
      return url;
    }
  }
  return null;
}
|
||||||
|
|
||||||
// Dispatch table: vendor key → function turning a model name into a product
// page URL (or null when no URL can be built from the model).
// NOTE(review): keys look like vendor slugs — confirm against the caller.
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
  // Vendors with a model-derived URL scheme
  "arista": buildAristaUrl,
  "dell": buildDellUrl,
  "edgecore": buildEdgecoreUrl,
  "fortinet": buildFortinetUrl,
  "hpe-aruba": buildHpeArubaUrl,
  "extreme": buildExtremeUrl,

  // New JS-rendered vendors (stored product_page_url used where available)
  "nokia": buildPassthroughUrl,
  "huawei": buildPassthroughUrl,
  "ciena": buildPassthroughUrl,
  "moxa": buildPassthroughUrl,
  "d-link": buildPassthroughUrl,
  "alcatel-lucent-enterprise": buildPassthroughUrl,
  "asterfusion": buildPassthroughUrl,
  "brocade": buildPassthroughUrl,
  "nvidia-networking": buildNvidiaUrl,
  "netgear": buildNetgearUrl,
  "ufispace": buildUfiSpaceUrl,
  "quanta-cloud-technology": buildQctUrl,
};
|
||||||
|
|
||||||
// ── Request data attached to each crawl URL ──────────────────────────────────
|
// ── Request data attached to each crawl URL ──────────────────────────────────
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user