fix(scraper): fall through to img fallback when og:image is generic/logo
Previously, if og:image existed (even when it was just a Dell logo URL), page.evaluate() returned early and the img fallback was never tried. Now the meta tags (og:image, then twitter:image) are extracted first, isGenericImage() is then checked on the Node.js side, and the img fallback runs if the meta image is null or generic. This allows vendors like Dell (whose og:image is a logo) to still get product images via the DOM fallback.
This commit is contained in:
parent
09d3a60b7c
commit
d67fbe31da
@ -260,16 +260,23 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
|
|||||||
// Wait for page to settle (JS rendering)
|
// Wait for page to settle (JS rendering)
|
||||||
await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});
|
await page.waitForLoadState("networkidle", { timeout: 20_000 }).catch(() => {});
|
||||||
|
|
||||||
// Extract og:image
|
// Extract og:image / twitter:image meta tags.
|
||||||
const imageUrl: string | null = await page.evaluate(() => {
|
// We DON'T filter generics here — we filter outside so the img fallback can still run
|
||||||
const ogMeta = document.querySelector<HTMLMetaElement>('meta[property="og:image"]');
|
// even when og:image exists but is a logo/brand image (e.g. Dell, HPE).
|
||||||
if (ogMeta?.content) return ogMeta.content;
|
const metaImageUrl: string | null = await page.evaluate(() => {
|
||||||
|
const og = document.querySelector<HTMLMetaElement>('meta[property="og:image"]')?.content;
|
||||||
|
if (og) return og;
|
||||||
|
const tw = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]')?.content;
|
||||||
|
return tw ?? null;
|
||||||
|
});
|
||||||
|
|
||||||
const twitterMeta = document.querySelector<HTMLMetaElement>('meta[name="twitter:image"]');
|
// Use meta image if it passes the generic filter; otherwise fall through to img fallback.
|
||||||
if (twitterMeta?.content) return twitterMeta.content;
|
let imageUrl: string | null = (metaImageUrl && !isGenericImage(metaImageUrl)) ? metaImageUrl : null;
|
||||||
|
|
||||||
// Fallback: largest visible image that isn't a UI element.
|
if (!imageUrl) {
|
||||||
|
// Img fallback: largest visible image that isn't a UI element.
|
||||||
// Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
|
// Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
|
||||||
|
imageUrl = await page.evaluate(() => {
|
||||||
const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
|
const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
|
||||||
const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i;
|
const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i;
|
||||||
const candidate = imgs
|
const candidate = imgs
|
||||||
@ -284,6 +291,7 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
|
|||||||
.sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
|
.sort((a, b) => (b.naturalWidth * b.naturalHeight) - (a.naturalWidth * a.naturalHeight))[0];
|
||||||
return candidate?.src ?? null;
|
return candidate?.src ?? null;
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (!imageUrl || isGenericImage(imageUrl)) {
|
if (!imageUrl || isGenericImage(imageUrl)) {
|
||||||
console.log(` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`);
|
console.log(` [MISS] ${data.vendorName} ${data.model} — no product image (${imageUrl?.slice(0, 60) ?? "null"})`);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user