fix(scraper): filter OneTrust/cookie-consent images + skip in img fallback

cdn.cookielaw.org logos appear as the largest DOM image on Dell/Extreme
product pages when the cookie consent overlay is present. Patterns for the
OneTrust, Cookiebot and TrustArc consent hosts are now added to both
GENERIC_IMAGE_PATTERNS (the isGenericImage filter) and the img fallback
skipPattern, so the next-largest actual product image can be found.
This commit is contained in:
Rene Fichtmueller 2026-04-21 06:45:41 +02:00
parent fcb8fb8c90
commit 53cfebb6f4

View File

@ -60,6 +60,11 @@ const GENERIC_IMAGE_PATTERNS: RegExp[] = [
/social[-_]icon/i, /social[-_]icon/i,
/favicon/i, /favicon/i,
/og[-_]image[-_][0-9]+x[0-9]+\./i, // e.g. og-image-1200x630 → family-level generic /og[-_]image[-_][0-9]+x[0-9]+\./i, // e.g. og-image-1200x630 → family-level generic
// Cookie consent / GDPR overlay images (OneTrust, Cookiebot, TrustArc, etc.)
/cdn\.cookielaw\.org/i,
/cookiebot\.com/i,
/trustarc\.com/i,
/consent-manager/i,
]; ];
function isGenericImage(url: string): boolean { function isGenericImage(url: string): boolean {
@ -278,7 +283,7 @@ export async function fetchSwitchImagesPlaywright(targetVendorSlug?: string): Pr
// Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward. // Deliberately broad — isGenericImage() will filter hero/banner/logo images afterward.
imageUrl = await page.evaluate(() => { imageUrl = await page.evaluate(() => {
const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img")); const imgs = Array.from(document.querySelectorAll<HTMLImageElement>("img"));
const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading/i; const skipPattern = /\/flags?\/|\/icons?\/|\/avatars?\/|social[-_]icon|favicon|spinner|loading|cookielaw|cookiebot|trustarc/i;
const candidate = imgs const candidate = imgs
.filter((img) => { .filter((img) => {
const src = img.src || img.getAttribute("data-src") || ""; const src = img.src || img.getAttribute("data-src") || "";