fix: GBICS scraper — fall back to aria-label-first pattern when href-first finds no priced products
Pattern 1 (href→aria-label) matches 127 navigation links on GBICS BigCommerce pages, none of which contain GBP prices. Pattern 2 (aria-label→href) correctly finds the 16-30 product links per category page, whose aria-labels carry £XX.XX prices. The fallback from pattern 1 to pattern 2 now triggers when pattern 1 finds results but none of them contain '£', rather than only when pattern 1 finds 0 total results.
This commit is contained in:
parent
f191ece0e4
commit
4797fccd7f
@ -117,7 +117,10 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
||||
// BigCommerce card pattern (attribute order varies by theme version):
|
||||
// Old: <a aria-label="Name, £XX.XX" href="URL" data-event-type="product-click">
|
||||
// New: <a href="URL" class="card-figure__link..." aria-label="Name, £XX.XX">
|
||||
// Two-pass approach: find all product <a> tags regardless of attribute order
|
||||
// Two-pass approach: find all product <a> tags regardless of attribute order.
|
||||
// GBICS BigCommerce theme: product cards have aria-label BEFORE href ("aria-label first"),
|
||||
// while navigation links have href BEFORE aria-label. Try pattern 2 when pattern 1
|
||||
// finds results but none contain GBP prices (£), which indicates only nav links were matched.
|
||||
const productRegex = /href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"[^>]*aria-label="([^"]+)"/gi;
|
||||
const productRegex2 = /aria-label="([^"]+)"[^>]*href="(https?:\/\/(?:www\.)?gbics\.com\/[^"]+)"/gi;
|
||||
let match;
|
||||
@ -125,7 +128,11 @@ function parseProductList(html: string, cat: typeof CATEGORIES[number]): Product
|
||||
while ((match = productRegex.exec(collapsed)) !== null) {
|
||||
rawMatches.push({ url: match[1].trim(), label: match[2].trim(), index: match.index });
|
||||
}
|
||||
if (rawMatches.length === 0) {
|
||||
// Fall back to pattern 2 when pattern 1 finds no results, OR when no results contain
|
||||
// GBP prices (£) — indicating only navigation links were matched by pattern 1.
|
||||
const hasPricesInP1 = rawMatches.some((m) => m.label.includes("£"));
|
||||
if (rawMatches.length === 0 || !hasPricesInP1) {
|
||||
rawMatches.length = 0; // clear nav-link pollution
|
||||
while ((match = productRegex2.exec(collapsed)) !== null) {
|
||||
rawMatches.push({ url: match[2].trim(), label: match[1].trim(), index: match.index });
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user