fix(scraper): fix Edgecore/Extreme URL builders, broaden img fallback, fix ENOENT
- buildEdgecoreUrl: /product/<slug>/ (WooCommerce, no .html) with EDGECORE_SLUG_MAP for AS7712-32X→as7712-32x-ec, Minipack2→minipack-as8000-open-modular-platform
- buildFortinetUrl: returns null (all product pages redirect to the generic /products/ethernet-switches page, so there is no usable og:image)
- buildExtremeUrl: direct product URL (extremenetworks.com/product/<slug>)
- img fallback: remove strict 'product/switch/router/hardware' path requirement; now takes the largest image >=200x150px, excluding flags/icons/spinners — isGenericImage() filters hero/banner/logo images afterward
- ENOENT fix: unique per-run Crawlee storage dir (timestamp suffix) prevents stale request-queue file contamination between back-to-back vendor runs
This commit is contained in:
parent
87b9416592
commit
09d3a60b7c
@ -92,33 +92,45 @@ function buildDellUrl(model: string): string | null {
|
|||||||
return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
|
return `https://www.dell.com/en-us/shop/dell-networking-switches/sc/networking-switches?appliedRefinements=DP_SEARCH_RESULTS_KEYWORDS~${encodeURIComponent(cleanModel)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Edgecore uses WooCommerce with /product/<slug>/ URLs (no .html suffix).
|
||||||
|
// Some models have non-obvious slugs verified via sitemap.
|
||||||
|
const EDGECORE_SLUG_MAP: Record<string, string> = {
|
||||||
|
"AS7712-32X": "as7712-32x-ec", // -ec suffix variant in Edgecore WooCommerce
|
||||||
|
"Minipack2": "minipack-as8000-open-modular-platform", // Facebook OCP Minipack2
|
||||||
|
};
|
||||||
|
|
||||||
function buildEdgecoreUrl(model: string): string | null {
|
function buildEdgecoreUrl(model: string): string | null {
|
||||||
const slug = model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
|
if (model in EDGECORE_SLUG_MAP) {
|
||||||
return `https://www.edge-core.com/product/${slug}.html`;
|
return `https://www.edge-core.com/product/${EDGECORE_SLUG_MAP[model]}/`;
|
||||||
|
}
|
||||||
|
// Standard slug: lowercase, replace non-alphanum with dash, collapse multiple dashes
|
||||||
|
const slug = model.toLowerCase()
|
||||||
|
.replace(/[^a-z0-9-]/g, "-")
|
||||||
|
.replace(/-+/g, "-")
|
||||||
|
.replace(/^-|-$/g, "");
|
||||||
|
return slug ? `https://www.edge-core.com/product/${slug}/` : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildFortinetUrl(model: string): string | null {
|
function buildFortinetUrl(_model: string): string | null {
|
||||||
// FortiSwitch 424E → fortiswitch-424e
|
// Fortinet product pages are fully JS-rendered and all redirect to generic /products/ethernet-switches.
|
||||||
const slug = model.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
// No reliable og:image can be extracted — skip entirely.
|
||||||
return `https://www.fortinet.com/products/fortiswitch/${slug}`;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildHpeArubaUrl(model: string): string | null {
|
function buildHpeArubaUrl(model: string): string | null {
|
||||||
|
// HPE Aruba series pages are stored in product_page_url for all known models.
|
||||||
|
// Builder is a fallback for unknown models.
|
||||||
const slug = model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
|
const slug = model.toLowerCase().replace(/[^a-z0-9-]/g, "-");
|
||||||
return `https://www.arubanetworks.com/products/switches/${slug}/`;
|
return `https://www.arubanetworks.com/products/switches/${slug}/`;
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildExtremeUrl(model: string): string | null {
|
function buildExtremeUrl(model: string): string | null {
|
||||||
// Extreme uses Coveo JS search — try series page
|
// Extreme direct product pages: extremenetworks.com/product/<slug>
|
||||||
const m = model.toLowerCase();
|
const slug = model.toLowerCase()
|
||||||
if (m.startsWith("x6")) {
|
.replace(/\s+/g, "-")
|
||||||
const series = model.match(/^(X\d+)/i)?.[1]?.toLowerCase() ?? "";
|
.replace(/[^a-z0-9-]/g, "")
|
||||||
return `https://www.extremenetworks.com/products/switching/${series}-series/`;
|
.replace(/-+/g, "-");
|
||||||
}
|
return slug ? `https://www.extremenetworks.com/product/${slug}` : null;
|
||||||
if (m.startsWith("slx") || m.startsWith("8720") || m.startsWith("5520")) {
|
|
||||||
return `https://www.extremenetworks.com/products/switching/`;
|
|
||||||
}
|
|
||||||
return `https://www.extremenetworks.com/products/switching/`;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
|
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
|
||||||
@ -256,16 +268,18 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
|
|||||||
const twitterMeta = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]');
|
const twitterMeta = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]');
|
||||||
if (twitterMeta?.content) return twitterMeta.content;
|
if (twitterMeta?.content) return twitterMeta.content;
|
||||||
|
|
||||||
// Fallback: largest product-looking image
|
// Fallback: largest visible image that isn't a UI element.
|
||||||
|
// Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
|
||||||
const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
|
const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
|
||||||
|
const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i;
|
||||||
const candidate = imgs
|
const candidate = imgs
|
||||||
.filter((img) => {
|
.filter((img) => {
|
||||||
const src = img.src || img.getAttribute("data-src") || "";
|
const src = img.src || img.getAttribute("data-src") || "";
|
||||||
return src.startsWith("http") &&
|
return src.startsWith("http") &&
|
||||||
(src.match(/\.(jpg|jpeg|png|webp)/i)) &&
|
/\.(jpg|jpeg|png|webp)/i.test(src) &&
|
||||||
img.naturalWidth > 300 &&
|
img.naturalWidth >= 200 &&
|
||||||
img.naturalHeight > 200 &&
|
img.naturalHeight >= 150 &&
|
||||||
(src.includes("product") || src.includes("switch") || src.includes("router") || src.includes("hardware"));
|
!skipPattern.test(src);
|
||||||
})
|
})
|
||||||
.sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
|
.sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
|
||||||
return candidate?.src ?? null;
|
return candidate?.src ?? null;
|
||||||
@ -303,7 +317,9 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
|
|||||||
errors++;
|
errors++;
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
makeCrawleeConfig("switch-images-playwright"),
|
// Use a unique run ID to avoid Crawlee temp-dir state contamination when multiple
|
||||||
|
// vendor runs execute back-to-back (ENOENT: stale request-queue files from prior run).
|
||||||
|
makeCrawleeConfig(`switch-images-playwright-${Date.now()}`),
|
||||||
);
|
);
|
||||||
|
|
||||||
await crawler.run(requests);
|
await crawler.run(requests);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user