fix(scraper): fix Edgecore/Extreme URL builders, broaden img fallback, fix ENOENT

- buildEdgecoreUrl: /product/<slug>/ (WooCommerce, no .html) with EDGECORE_SLUG_MAP
  for AS7712-32X→as7712-32x-ec, Minipack2→minipack-as8000-open-modular-platform
- buildFortinetUrl: returns null (all product pages redirect to the generic
  /products/ethernet-switches page, so there is no usable og:image)
- buildExtremeUrl: direct product URL (extremenetworks.com/product/<slug>)
- img fallback: remove strict 'product/switch/router/hardware' path requirement;
  now takes largest image >=200x150px excluding flags/icons/spinners — isGenericImage()
  filters hero/banner/logo afterward
- ENOENT fix: unique per-run Crawlee storage dir (timestamp suffix) prevents
  stale request-queue file contamination between back-to-back vendor runs
This commit is contained in:
Rene Fichtmueller 2026-04-21 06:33:32 +02:00
parent 87b9416592
commit 09d3a60b7c

View File

@ -92,33 +92,45 @@ function buildDellUrl(model: string): string | null {
return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
}
/**
 * Edgecore product slugs that cannot be derived from the model name.
 *
 * Edgecore uses WooCommerce with /product/<slug>/ URLs (no .html suffix).
 * Some models have non-obvious slugs verified via sitemap.
 * Keys are matched exactly (case-sensitive) against the model string.
 */
const EDGECORE_SLUG_MAP: Record<string, string> = {
  "AS7712-32X": "as7712-32x-ec", // -ec suffix variant in Edgecore WooCommerce
  "Minipack2": "minipack-as8000-open-modular-platform", // Facebook OCP Minipack2
};

/**
 * Builds the Edgecore product page URL for a switch model.
 *
 * @param model - Model name as stored for the switch (e.g. "AS7712-32X").
 * @returns `https://www.edge-core.com/product/<slug>/`, or `null` when the
 *   model yields an empty slug (empty string or only non-alphanumeric characters).
 */
function buildEdgecoreUrl(model: string): string | null {
  // Own-property lookup: a bare `in` check also matches inherited keys such as
  // "toString" or "constructor", which would put a function into the URL.
  const mappedSlug = Object.prototype.hasOwnProperty.call(EDGECORE_SLUG_MAP, model)
    ? EDGECORE_SLUG_MAP[model]
    : undefined;
  if (mappedSlug) {
    return `https://www.edge-core.com/product/${mappedSlug}/`;
  }

  // Standard slug: lowercase, replace non-alphanum with dash, collapse multiple
  // dashes, then drop a leading/trailing dash left over from the replacement.
  const slug = model
    .toLowerCase()
    .replace(/[^a-z0-9-]/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");
  return slug ? `https://www.edge-core.com/product/${slug}/` : null;
}
/**
 * URL builder for Fortinet models — intentionally disabled.
 *
 * Kept so the vendor still has a builder with the same signature as the other
 * vendors; it never produces a URL.
 *
 * @param _model - Model name; unused.
 * @returns Always `null`.
 */
function buildFortinetUrl(_model: string): string | null {
  // Fortinet product pages are fully JS-rendered and all redirect to generic /products/ethernet-switches.
  // No reliable og:image can be extracted — skip entirely.
  return null;
}
/**
 * Builds a fallback HPE Aruba switch page URL from a model name.
 *
 * HPE Aruba series pages are stored in product_page_url for all known models.
 * This builder is a fallback for unknown models.
 *
 * @param model - Model name (e.g. "6300M").
 * @returns `https://www.arubanetworks.com/products/switches/<slug>/`, or `null`
 *   when the model yields an empty slug.
 */
function buildHpeArubaUrl(model: string): string | null {
  // Same slug normalisation as the other vendor builders: lowercase, replace
  // non-alphanum with dash, collapse dash runs, strip edge dashes. Without the
  // last two steps "CX 6300 (JL658A)" would become "cx-6300--jl658a-".
  const slug = model
    .toLowerCase()
    .replace(/[^a-z0-9-]/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");
  // An empty slug would produce ".../switches//", so report "no URL" instead.
  return slug ? `https://www.arubanetworks.com/products/switches/${slug}/` : null;
}
/**
 * Builds the Extreme Networks direct product page URL for a switch model.
 *
 * Extreme direct product pages: extremenetworks.com/product/<slug>
 *
 * @param model - Model name (e.g. "X465-24W", "SLX 9150").
 * @returns `https://www.extremenetworks.com/product/<slug>`, or `null` when the
 *   model yields an empty slug.
 */
function buildExtremeUrl(model: string): string | null {
  const slug = model
    .toLowerCase()
    .replace(/\s+/g, "-") // whitespace runs become a single separator
    .replace(/[^a-z0-9-]/g, "") // drop punctuation such as parentheses or slashes
    .replace(/-+/g, "-") // collapse dashes left adjacent by the removal above
    .replace(/^-|-$/g, ""); // no leading/trailing dash (e.g. from padded input)
  return slug ? `https://www.extremenetworks.com/product/${slug}` : null;
}
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
@ -256,16 +268,18 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
const twitterMeta = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]');
if (twitterMeta?.content) return twitterMeta.content;
// Fallback: largest product-looking image
// Fallback: largest visible image that isn't a UI element.
// Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i;
const candidate = imgs
.filter((img) => {
const src = img.src || img.getAttribute("data-src") || "";
return src.startsWith("http") &&
(src.match(/\.(jpg|jpeg|png|webp)/i)) &&
img.naturalWidth > 300 &&
img.naturalHeight > 200 &&
(src.includes("product") || src.includes("switch") || src.includes("router") || src.includes("hardware"));
/\.(jpg|jpeg|png|webp)/i.test(src) &&
img.naturalWidth >= 200 &&
img.naturalHeight >= 150 &&
!skipPattern.test(src);
})
.sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
return candidate?.src ?? null;
@ -303,7 +317,9 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
errors++;
},
},
makeCrawleeConfig("switch-images-playwright"),
// Use a unique run ID to avoid Crawlee temp-dir state contamination when multiple
// vendor runs execute back-to-back (ENOENT: stale request-queue files from prior run).
makeCrawleeConfig(`switch-images-playwright-${Date.now()}`),
);
await crawler.run(requests);