From 8391b194a5e47702298c97c7111b254ddbb02eff Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 18 Apr 2026 03:02:39 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20GBICS=20scraper=20=E2=80=94=20fall=20bac?= =?UTF-8?q?k=20to=20aria-label-first=20pattern=20when=20href-first=20finds?= =?UTF-8?q?=20no=20priced=20products?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pattern 1 (href→aria-label) finds 127 navigation links on GBICS BigCommerce pages — none contain GBP prices. Pattern 2 (aria-label→href) correctly finds 16-30 product links per category page with £XX.XX prices in aria-labels. The fallback from P1 to P2 now triggers when P1 finds results but none contain '£', rather than only when P1 finds 0 total results. --- packages/scraper/src/scrapers/gbics.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/scraper/src/scrapers/gbics.ts b/packages/scraper/src/scrapers/gbics.ts index ce6c9a7..472ca4f 100644 --- a/packages/scraper/src/scrapers/gbics.ts +++ b/packages/scraper/src/scrapers/gbics.ts @@ -117,7 +117,10 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product // BigCommerce card pattern (attribute order varies by theme version): // Old: // New: - // Two-pass approach: find all product tags regardless of attribute order + // Two-pass approach: find all product tags regardless of attribute order. + // GBICS BigCommerce theme: product cards have aria-label BEFORE href ("aria-label first"), + // while navigation links have href BEFORE aria-label. Try pattern 2 when pattern 1 + // finds results but none contain GBP prices (£), which indicates only nav links were matched. const productRegex = /href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*aria-label="([^"]+)"/gi; const productRegex2 = /aria-label="([^"]+)"[^>]*href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"/gi; let match; @@ -125,7 +128,11 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product while ((match = productRegex.exec(collapsed)) !== null) { rawMatches.push({ url: match[1].trim(), label: match[2].trim(), index: match.index }); } - if (rawMatches.length === 0) { + // Fall back to pattern 2 when pattern 1 finds no results, OR when no results contain + // GBP prices (£) — indicating only navigation links were matched by pattern 1. + const hasPricesInP1 = rawMatches.some((m) => m.label.includes("£")); + if (rawMatches.length === 0 || !hasPricesInP1) { + rawMatches.length = 0; // clear nav-link pollution while ((match = productRegex2.exec(collapsed)) !== null) { rawMatches.push({ url: match[2].trim(), label: match[1].trim(), index: match.index }); }