From fcb8fb8c90ea0d8933264d855b9d342efdc01339 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Tue, 21 Apr 2026 06:36:12 +0200 Subject: [PATCH] fix(scraper): fall through to img fallback when og:image is generic/logo Previously: if og:image existed (even as a Dell logo URL), page.evaluate() returned early and the img fallback was never tried. Now: meta tags are extracted first, then isGenericImage() is checked in Node.js, and the img fallback runs if meta image is null or generic. This allows vendors like Dell (og:image = logo) to still get product images via the DOM fallback. --- .../src/scrapers/switch-image-playwright.ts | 54 +++++++++++-------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/packages/scraper/src/scrapers/switch-image-playwright.ts b/packages/scraper/src/scrapers/switch-image-playwright.ts index 89aa17e..fba0399 100644 --- a/packages/scraper/src/scrapers/switch-image-playwright.ts +++ b/packages/scraper/src/scrapers/switch-image-playwright.ts @@ -260,31 +260,39 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr // Wait for page to settle (JS rendering) await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {}); - // Extract og:image - const imageUrl: string | null = await page.evaluate(() => { - const ogMeta = document.querySelector('meta[property="og:image"]'); - if (ogMeta?.content) return ogMeta.content; - - const twitterMeta = document.querySelector('meta[name="twitter:image"]'); - if (twitterMeta?.content) return twitterMeta.content; - - // Fallback: largest visible image that isn't a UI element. - // Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward. - const imgs = Array.from(document.querySelectorAll("img")); - const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i; - const candidate = imgs - .filter((img) => { - const src = img.src || img.getAttribute("data-src") || ""; - return src.startsWith("http") && - /\.(jpg|jpeg|png|webp)/i.test(src) && - img.naturalWidth >= 200 && - img.naturalHeight >= 150 && - !skipPattern.test(src); - }) - .sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0]; - return candidate?.src ?? null; + // Extract og:image / twitter:image meta tags. + // We DON'T filter generics here — we filter outside so the img fallback can still run + // even when og:image exists but is a logo/brand image (e.g. Dell, HPE). + const metaImageUrl: string | null = await page.evaluate(() => { + const og = document.querySelector('meta[property="og:image"]')?.content; + if (og) return og; + const tw = document.querySelector('meta[name="twitter:image"]')?.content; + return tw ?? null; }); + // Use meta image if it passes the generic filter; otherwise fall through to img fallback. + let imageUrl: string | null = (metaImageUrl && !isGenericImage(metaImageUrl)) ? metaImageUrl : null; + + if (!imageUrl) { + // Img fallback: largest visible image that isn't a UI element. + // Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward. + imageUrl = await page.evaluate(() => { + const imgs = Array.from(document.querySelectorAll("img")); + const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i; + const candidate = imgs + .filter((img) => { + const src = img.src || img.getAttribute("data-src") || ""; + return src.startsWith("http") && + /\.(jpg|jpeg|png|webp)/i.test(src) && + img.naturalWidth >= 200 && + img.naturalHeight >= 150 && + !skipPattern.test(src); + }) + .sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0]; + return candidate?.src ?? null; + }); + } + if (!imageUrl || isGenericImage(imageUrl)) { console.log(` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`); missed++;