From b65e4452db2fc0df72fe58406eb88536fcae2440 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Tue, 21 Apr 2026 07:38:01 +0200 Subject: [PATCH] fix: add error-graphic, icon-library, illustration filters to GENERIC_IMAGE_PATTERNS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - /404[-_]error/i, /error[-_]graphic/i — Broadcom 404-ERROR-GRAPHIC.png - /\/icon[-_]library\//i — D-Link navigation/icon-library path images - /[-_]illustration[._]/i — Arista Cloud-Legacy_Illustration and similar diagrams - Nokia banner, Huawei marketing, banners/ path patterns (Playwright scraper) - Cookie consent patterns synced to switch-image-fetcher.ts (was only in Playwright) --- .../scraper/src/scrapers/switch-image-fetcher.ts | 12 ++++++++++++ .../scraper/src/scrapers/switch-image-playwright.ts | 7 +++++++ 2 files changed, 19 insertions(+) diff --git a/packages/scraper/src/scrapers/switch-image-fetcher.ts b/packages/scraper/src/scrapers/switch-image-fetcher.ts index b206318..d30267d 100644 --- a/packages/scraper/src/scrapers/switch-image-fetcher.ts +++ b/packages/scraper/src/scrapers/switch-image-fetcher.ts @@ -280,6 +280,18 @@ const GENERIC_IMAGE_PATTERNS: RegExp[] = [ // ── Generic about/press/brand pages ────────────────────────────────────── /\/press[-_]kit/i, /\/media[-_]kit/i, + // ── Vendor error / 404 graphics ────────────────────────────────────────── + /404[-_]error/i, + /error[-_]graphic/i, + // ── Navigation icon libraries ──────────────────────────────────────────── + /\/icon[-_]library\//i, + // ── Diagrams and illustrations ─────────────────────────────────────────── + /[-_]illustration[._]/i, + // ── Cookie consent / GDPR overlay images ──────────────────────────────── + /cdn\.cookielaw\.org/i, + /cookiebot\.com/i, + /trustarc\.com/i, + /consent-manager/i, ]; function isGenericImage(url: string): boolean { diff --git a/packages/scraper/src/scrapers/switch-image-playwright.ts b/packages/scraper/src/scrapers/switch-image-playwright.ts index bd3bfcd..d5795df 100644 --- a/packages/scraper/src/scrapers/switch-image-playwright.ts +++ b/packages/scraper/src/scrapers/switch-image-playwright.ts @@ -75,6 +75,13 @@ const GENERIC_IMAGE_PATTERNS: RegExp[] = [ /bg_products/i, // Generic "banners" path segment used by CMSes /\/banners?\//i, + // Vendor error / 404 graphics + /404[-_]error/i, + /error[-_]graphic/i, + // Navigation icon libraries (D-Link, other CMSes) + /\/icon[-_]library\//i, + // Diagrams and illustrations (not product photos) + /[-_]illustration[._]/i, ]; function isGenericImage(url: string): boolean {