fix: NADDOD price extraction — read from LD+JSON offers.price

NADDOD exposes pricing via LD+JSON (Astro/Shopify structure): {"offers":{"price":"731.00","priceCurrency":"USD",...}}. The old regex (/US\$\s*.../) never matched, so all 132 existing price observations were lucky text matches rather than systematic extractions. Now: parse all ld+json blocks first, then fall back to the regex. Also broaden the sitemap URL regex to capture new-style URLs without .html, e.g. /products/nvidia-networking/102612 (these were previously being missed).
2026-05-06 23:55:55 +02:00 · 2026-05-06 23:55:55 +02:00 · 57e20efe49
commit 57e20efe49
parent 1a7c928120
1 changed files with 37 additions and 9 deletions
--- a/packages/scraper/src/scrapers/naddod.ts
+++ b/packages/scraper/src/scrapers/naddod.ts
@ -187,15 +187,40 @@ function parseDetailPage(html: string, url: string): {
  if (!name || name.length < 10) return null;
  if (!isTransceiver(name)) return null;
-  // Price: "US$ 10.90" or "$10.90"
+  // Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00")
-  const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) ||
+  // Fall back to "US$ 10.90" or "$10.90" visible text patterns
-                     html.match(/\$\s*([\d,]+\.\d{2})\b/);
+  let price: number | undefined;
-  const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
+  const ldJsonMatch = html.match(/<script[^>]+type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi);
  if (ldJsonMatch) {
    for (const block of ldJsonMatch) {
      try {
        const jsonStr = block.replace(/<script[^>]*>/, "").replace(/<\/script>/, "");
        const ld = JSON.parse(jsonStr) as Record<string, unknown>;
        const offers = ld["offers"] as Record<string, unknown> | undefined;
        if (offers && typeof offers["price"] === "string") {
          const p = parseFloat(offers["price"].replace(/,/g, ""));
          if (p > 0 && p < 500000) { price = p; break; }
        }
        if (offers && typeof offers["price"] === "number" && offers["price"] > 0 && (offers["price"] as number) < 500000) {
          price = offers["price"] as number;
          break;
        }
      } catch { /* skip malformed blocks */ }
    }
  }
  if (!price) {
    const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) ||
                       html.match(/\$\s*([\d,]+\.\d{2})\b/);
    if (priceMatch) {
      const p = parseFloat(priceMatch[1].replace(/,/g, ""));
      if (p > 0 && p < 500000) price = p;
    }
  }
  // Stock count
  const stock = parseStockText(html);
-  return { name, price: price && price > 0 && price < 500000 ? price : undefined, stock };
+  return { name, price, stock };
 }
 // ── Sitemap parsing ─────────────────────────────────────────────────────────
@ -204,14 +229,17 @@ async function fetchProductUrlsFromSitemap(): Promise<string[]> {
  console.log(`  Fetching sitemap: ${SITEMAP_URL}`);
  const xml = await fetchText(SITEMAP_URL);
-  // Extract all <loc> URLs that match /products/XXXXX.html
+  // Extract all <loc> product URLs — supports both:
  //   /products/12345.html  (most products)
  //   /products/brand-slug/12345  (new URL style without .html)
  const urls: string[] = [];
-  const locRegex = /<loc>([^<]+\/products\/\d+\.html)<\/loc>/gi;
+  const locRegex = /<loc>([^<]+\/products\/[^<\s]+)<\/loc>/gi;
  let m: RegExpExecArray | null;
  while ((m = locRegex.exec(xml)) !== null) {
    const url = m[1].trim();
-    // Keep only canonical English URLs (no language prefix)
+    // Keep only canonical English URLs (no language prefix like /en/, /de/, /fr/, /ja/ etc.)
-    if (!url.includes("/en/") && !url.includes("/de/") && !url.includes("/fr/")) {
+    const hasLangPrefix = /\/[a-z]{2}\/products\//.test(url);
    if (!hasLangPrefix) {
      urls.push(url);
    }
  }