fix: GBICS scraper — fall back to aria-label-first pattern when href-first finds no priced products

Pattern 1 (href→aria-label) finds 127 navigation links on GBICS BigCommerce
pages — none contain GBP prices. Pattern 2 (aria-label→href) correctly
finds 16-30 product links per category page with £XX.XX prices in aria-labels.
The fallback from pattern 1 to pattern 2 now also triggers when pattern 1 finds
results but none contain '£', in addition to when pattern 1 finds 0 total results.
This commit is contained in:
Rene Fichtmueller 2026-04-18 03:02:39 +02:00
parent f191ece0e4
commit 4797fccd7f

View File

@ -117,7 +117,10 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
// BigCommerce card pattern (attribute order varies by theme version): // BigCommerce card pattern (attribute order varies by theme version):
// Old: <a aria-label="Name, £XX.XX" href="URL" data-event-type="product-click"> // Old: <a aria-label="Name, £XX.XX" href="URL" data-event-type="product-click">
// New: <a href="URL" class="card-figure__link..." aria-label="Name, £XX.XX"> // New: <a href="URL" class="card-figure__link..." aria-label="Name, £XX.XX">
// Two-pass approach: find all product <a> tags regardless of attribute order // Two-pass approach: find all product <a> tags regardless of attribute order.
// GBICS BigCommerce theme: product cards have aria-label BEFORE href ("aria-label first"),
// while navigation links have href BEFORE aria-label. Try pattern 2 when pattern 1
// finds results but none contain GBP prices (£), which indicates only nav links were matched.
const productRegex = /href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*aria-label="([^"]+)"/gi; const productRegex = /href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*aria-label="([^"]+)"/gi;
const productRegex2 = /aria-label="([^"]+)"[^>]*href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"/gi; const productRegex2 = /aria-label="([^"]+)"[^>]*href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"/gi;
let match; let match;
@ -125,7 +128,11 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
while ((match = productRegex.exec(collapsed)) !== null) { while ((match = productRegex.exec(collapsed)) !== null) {
rawMatches.push({ url: match[1].trim(), label: match[2].trim(), index: match.index }); rawMatches.push({ url: match[1].trim(), label: match[2].trim(), index: match.index });
} }
if (rawMatches.length === 0) { // Fall back to pattern 2 when pattern 1 finds no results, OR when no results contain
// GBP prices (£) — indicating only navigation links were matched by pattern 1.
const hasPricesInP1 = rawMatches.some((m) => m.label.includes("£"));
if (rawMatches.length === 0 || !hasPricesInP1) {
rawMatches.length = 0; // clear nav-link pollution
while ((match = productRegex2.exec(collapsed)) !== null) { while ((match = productRegex2.exec(collapsed)) !== null) {
rawMatches.push({ url: match[2].trim(), label: match[1].trim(), index: match.index }); rawMatches.push({ url: match[2].trim(), label: match[1].trim(), index: match.index });
} }