fix: NADDOD price extraction — read from LD+JSON offers.price

NADDOD uses LD+JSON for pricing (Astro/Shopify structure):
  {"offers":{"price":"731.00","priceCurrency":"USD",...}}

Old regex (/US\$\s*.../) never matched → all 132 price observations were
lucky text matches, not systematic. Now: parse all LD+JSON blocks first,
then fall back to the regex.

Also broaden sitemap URL regex to capture new-style URLs without .html:
  /products/nvidia-networking/102612 (was being missed)
This commit is contained in:
Rene Fichtmueller 2026-05-06 23:55:55 +02:00
parent 1a7c928120
commit 57e20efe49

View File

@ -187,15 +187,40 @@ function parseDetailPage(html: string, url: string): {
if (!name || name.length < 10) return null; if (!name || name.length < 10) return null;
if (!isTransceiver(name)) return null; if (!isTransceiver(name)) return null;
// Price: "US$ 10.90" or "$10.90" // Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00")
const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) || // Fall back to "US$ 10.90" or "$10.90" visible text patterns
html.match(/\$\s*([\d,]+\.\d{2})\b/); let price: number | undefined;
const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined; const ldJsonMatch = html.match(/<script[^>]+type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi);
if (ldJsonMatch) {
for (const block of ldJsonMatch) {
try {
const jsonStr = block.replace(/<script[^>]*>/, "").replace(/<\/script>/, "");
const ld = JSON.parse(jsonStr) as Record<string, unknown>;
const offers = ld["offers"] as Record<string, unknown> | undefined;
if (offers && typeof offers["price"] === "string") {
const p = parseFloat(offers["price"].replace(/,/g, ""));
if (p > 0 && p < 500000) { price = p; break; }
}
if (offers && typeof offers["price"] === "number" && offers["price"] > 0 && (offers["price"] as number) < 500000) {
price = offers["price"] as number;
break;
}
} catch { /* skip malformed blocks */ }
}
}
if (!price) {
const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) ||
html.match(/\$\s*([\d,]+\.\d{2})\b/);
if (priceMatch) {
const p = parseFloat(priceMatch[1].replace(/,/g, ""));
if (p > 0 && p < 500000) price = p;
}
}
// Stock count // Stock count
const stock = parseStockText(html); const stock = parseStockText(html);
return { name, price: price && price > 0 && price < 500000 ? price : undefined, stock }; return { name, price, stock };
} }
// ── Sitemap parsing ───────────────────────────────────────────────────────── // ── Sitemap parsing ─────────────────────────────────────────────────────────
@ -204,14 +229,17 @@ async function fetchProductUrlsFromSitemap(): Promise<string[]> {
console.log(` Fetching sitemap: ${SITEMAP_URL}`); console.log(` Fetching sitemap: ${SITEMAP_URL}`);
const xml = await fetchText(SITEMAP_URL); const xml = await fetchText(SITEMAP_URL);
// Extract all <loc> URLs that match /products/XXXXX.html // Extract all <loc> product URLs — supports both:
// /products/12345.html (most products)
// /products/brand-slug/12345 (new URL style without .html)
const urls: string[] = []; const urls: string[] = [];
const locRegex = /<loc>([^<]+\/products\/\d+\.html)<\/loc>/gi; const locRegex = /<loc>([^<]+\/products\/[^<\s]+)<\/loc>/gi;
let m: RegExpExecArray | null; let m: RegExpExecArray | null;
while ((m = locRegex.exec(xml)) !== null) { while ((m = locRegex.exec(xml)) !== null) {
const url = m[1].trim(); const url = m[1].trim();
// Keep only canonical English URLs (no language prefix) // Keep only canonical English URLs (no language prefix like /en/, /de/, /fr/, /ja/ etc.)
if (!url.includes("/en/") && !url.includes("/de/") && !url.includes("/fr/")) { const hasLangPrefix = /\/[a-z]{2}\/products\//.test(url);
if (!hasLangPrefix) {
urls.push(url); urls.push(url);
} }
} }