From 57e20efe492c0e6e21296165a364eec83571a871 Mon Sep 17 00:00:00 2001
From: Rene Fichtmueller <renefichtmueller@MacStudio-von-Rene-8.local>
Date: Wed, 6 May 2026 23:55:55 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20NADDOD=20price=20extraction=20=E2=80=94?=
 =?UTF-8?q?=20read=20from=20LD+JSON=20offers.price?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NADDOD uses LD+JSON for pricing (Astro/Shopify structure):
  {"offers":{"price":"731.00","priceCurrency":"USD",...}}

Old regex (/US\$\s*.../) never matched the LD+JSON price → all 132 price
observations were lucky visible-text matches, not systematic. Now: parse
all ld+json blocks first, fall back to the regex.

Also broaden sitemap URL regex to capture new-style URLs without .html:
  /products/nvidia-networking/102612 (was being missed)
---
 packages/scraper/src/scrapers/naddod.ts | 46 ++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 9 deletions(-)
diff --git a/packages/scraper/src/scrapers/naddod.ts b/packages/scraper/src/scrapers/naddod.ts
index 075866c..81bfcb3 100644
--- a/packages/scraper/src/scrapers/naddod.ts
+++ b/packages/scraper/src/scrapers/naddod.ts
@@ -187,15 +187,40 @@ function parseDetailPage(html: string, url: string): {
   if (!name || name.length < 10) return null;
   if (!isTransceiver(name)) return null;
 
-  // Price: "US$ 10.90" or "$10.90"
-  const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) ||
-                     html.match(/\$\s*([\d,]+\.\d{2})\b/);
-  const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
+  // Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00")
+  // Fall back to "US$ 10.90" or "$10.90" visible text patterns
+  let price: number | undefined;
+  const ldJsonMatch = html.match(/<script[^>]+type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi);
+  if (ldJsonMatch) {
+    for (const block of ldJsonMatch) {
+      try {
+        const jsonStr = block.replace(/<script[^>]*>/, "").replace(/<\/script>/, "");
+        const ld = JSON.parse(jsonStr) as Record<string, unknown>;
+        const offers = ld["offers"] as Record<string, unknown> | undefined;
+        if (offers && typeof offers["price"] === "string") {
+          const p = parseFloat(offers["price"].replace(/,/g, ""));
+          if (p > 0 && p < 500000) { price = p; break; }
+        }
+        if (offers && typeof offers["price"] === "number" && offers["price"] > 0 && (offers["price"] as number) < 500000) {
+          price = offers["price"] as number;
+          break;
+        }
+      } catch { /* skip malformed blocks */ }
+    }
+  }
+  if (!price) {
+    const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) ||
+                       html.match(/\$\s*([\d,]+\.\d{2})\b/);
+    if (priceMatch) {
+      const p = parseFloat(priceMatch[1].replace(/,/g, ""));
+      if (p > 0 && p < 500000) price = p;
+    }
+  }
 
   // Stock count
   const stock = parseStockText(html);
 
-  return { name, price: price && price > 0 && price < 500000 ? price : undefined, stock };
+  return { name, price, stock };
 }
 
 // ── Sitemap parsing ─────────────────────────────────────────────────────────
@@ -204,14 +229,17 @@ async function fetchProductUrlsFromSitemap(): Promise<string[]> {
   console.log(`  Fetching sitemap: ${SITEMAP_URL}`);
   const xml = await fetchText(SITEMAP_URL);
 
-  // Extract all <loc> URLs that match /products/XXXXX.html
+  // Extract all <loc> product URLs — supports both:
+  //   /products/12345.html  (most products)
+  //   /products/brand-slug/12345  (new URL style without .html)
   const urls: string[] = [];
-  const locRegex = /<loc>([^<]+\/products\/\d+\.html)<\/loc>/gi;
+  const locRegex = /<loc>([^<]+\/products\/[^<\s]+)<\/loc>/gi;
   let m: RegExpExecArray | null;
   while ((m = locRegex.exec(xml)) !== null) {
     const url = m[1].trim();
-    // Keep only canonical English URLs (no language prefix)
-    if (!url.includes("/en/") && !url.includes("/de/") && !url.includes("/fr/")) {
+    // Keep only canonical English URLs (no language prefix like /en/, /de/, /fr/, /ja/ etc.)
+    const hasLangPrefix = /\/[a-z]{2}\/products\//.test(url);
+    if (!hasLangPrefix) {
       urls.push(url);
     }
   }