From 57e20efe492c0e6e21296165a364eec83571a871 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Wed, 6 May 2026 23:55:55 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20NADDOD=20price=20extraction=20=E2=80=94?= =?UTF-8?q?=20read=20from=20LD+JSON=20offers.price?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NADDOD uses LD+JSON for pricing (Astro/Shopify structure): {"offers":{"price":"731.00","priceCurrency":"USD",...}} Old regex (/US$\s*.../) never matched → all 132 price obs were lucky text matches, not systematic. Now: parse all ld+json blocks first, fall back to regex. Also broaden sitemap URL regex to capture new-style URLs without .html: /products/nvidia-networking/102612 (was being missed) --- packages/scraper/src/scrapers/naddod.ts | 46 ++++++++++++++++++++----- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/packages/scraper/src/scrapers/naddod.ts b/packages/scraper/src/scrapers/naddod.ts index 075866c..81bfcb3 100644 --- a/packages/scraper/src/scrapers/naddod.ts +++ b/packages/scraper/src/scrapers/naddod.ts @@ -187,15 +187,40 @@ function parseDetailPage(html: string, url: string): { if (!name || name.length < 10) return null; if (!isTransceiver(name)) return null; - // Price: "US$ 10.90" or "$10.90" - const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) || - html.match(/\$\s*([\d,]+\.\d{2})\b/); - const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined; + // Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00") + // Fall back to "US$ 10.90" or "$10.90" visible text patterns + let price: number | undefined; + const ldJsonMatch = html.match(/]+type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi); + if (ldJsonMatch) { + for (const block of ldJsonMatch) { + try { + const jsonStr = block.replace(/]*>/, "").replace(/<\/script>/, ""); + const ld = JSON.parse(jsonStr) as Record; + const offers = ld["offers"] as Record | undefined; + if (offers && typeof offers["price"] === "string") { + const p = parseFloat(offers["price"].replace(/,/g, "")); + if (p > 0 && p < 500000) { price = p; break; } + } + if (offers && typeof offers["price"] === "number" && offers["price"] > 0 && (offers["price"] as number) < 500000) { + price = offers["price"] as number; + break; + } + } catch { /* skip malformed blocks */ } + } + } + if (!price) { + const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) || + html.match(/\$\s*([\d,]+\.\d{2})\b/); + if (priceMatch) { + const p = parseFloat(priceMatch[1].replace(/,/g, "")); + if (p > 0 && p < 500000) price = p; + } + } // Stock count const stock = parseStockText(html); - return { name, price: price && price > 0 && price < 500000 ? price : undefined, stock }; + return { name, price, stock }; } // ── Sitemap parsing ───────────────────────────────────────────────────────── @@ -204,14 +229,17 @@ async function fetchProductUrlsFromSitemap(): Promise { console.log(` Fetching sitemap: ${SITEMAP_URL}`); const xml = await fetchText(SITEMAP_URL); - // Extract all URLs that match /products/XXXXX.html + // Extract all product URLs — supports both: + // /products/12345.html (most products) + // /products/brand-slug/12345 (new URL style without .html) const urls: string[] = []; - const locRegex = /([^<]+\/products\/\d+\.html)<\/loc>/gi; + const locRegex = /([^<]+\/products\/[^<\s]+)<\/loc>/gi; let m: RegExpExecArray | null; while ((m = locRegex.exec(xml)) !== null) { const url = m[1].trim(); - // Keep only canonical English URLs (no language prefix) - if (!url.includes("/en/") && !url.includes("/de/") && !url.includes("/fr/")) { + // Keep only canonical English URLs (no language prefix like /en/, /de/, /fr/, /ja/ etc.) + const hasLangPrefix = /\/[a-z]{2}\/products\//.test(url); + if (!hasLangPrefix) { urls.push(url); } }