fix(scraper): fall through to img fallback when og:image is generic/logo

Previously, if an og:image or twitter:image meta tag existed (even one pointing at a Dell
logo URL), page.evaluate() returned early and the img fallback was never tried. Now the
meta tags are extracted first, isGenericImage() is then checked on the Node.js side, and
the img fallback runs if the meta image is null or generic. This lets product pages from
vendors like Dell (og:image = logo) still yield a product image via the DOM fallback.
This commit is contained in:
Rene Fichtmueller 2026-04-21 06:36:12 +02:00
parent 55d4d6a8f8
commit fcb8fb8c90

View File

@ -260,31 +260,39 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
// Wait for page to settle (JS rendering) // Wait for page to settle (JS rendering)
await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {}); await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});
// Extract og:image // Extract og:image / twitter:image meta tags.
const imageUrl: string | null = await page.evaluate(() => { // We DON'T filter generics here — we filter outside so the img fallback can still run
const ogMeta = document.querySelector<HTMLMetaElement>('meta[property="og:image"]'); // even when og:image exists but is a logo/brand image (e.g. Dell, HPE).
if (ogMeta?.content) return ogMeta.content; const metaImageUrl: string | null = await page.evaluate(() => {
const og = document.querySelector<HTMLMetaElement>('meta[property="og:image"]')?.content;
const twitterMeta = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]'); if (og) return og;
if (twitterMeta?.content) return twitterMeta.content; const tw = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]')?.content;
return tw ?? null;
// Fallback: largest visible image that isn't a UI element.
// Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i;
const candidate = imgs
.filter((img) => {
const src = img.src || img.getAttribute("data-src") || "";
return src.startsWith("http") &&
/\.(jpg|jpeg|png|webp)/i.test(src) &&
img.naturalWidth >= 200 &&
img.naturalHeight >= 150 &&
!skipPattern.test(src);
})
.sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
return candidate?.src ?? null;
}); });
// Use meta image if it passes the generic filter; otherwise fall through to img fallback.
let imageUrl: string | null = (metaImageUrl && !isGenericImage(metaImageUrl)) ? metaImageUrl : null;
if (!imageUrl) {
// Img fallback: largest visible image that isn't a UI element.
// Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
imageUrl = await page.evaluate(() => {
const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i;
const candidate = imgs
.filter((img) => {
const src = img.src || img.getAttribute("data-src") || "";
return src.startsWith("http") &&
/\.(jpg|jpeg|png|webp)/i.test(src) &&
img.naturalWidth >= 200 &&
img.naturalHeight >= 150 &&
!skipPattern.test(src);
})
.sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
return candidate?.src ?? null;
});
}
if (!imageUrl || isGenericImage(imageUrl)) { if (!imageUrl || isGenericImage(imageUrl)) {
console.log(` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`); console.log(` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`);
missed++; missed++;