fix: NADDOD price extraction — read from LD+JSON offers.price
NADDOD uses LD+JSON for pricing (Astro/Shopify structure):
{"offers":{"price":"731.00","priceCurrency":"USD",...}}
Old regex (/US\$\s*.../) never matched the actual price markup → all 132 price
observations were lucky text matches, not systematic extraction. Now: parse all
ld+json blocks first, then fall back to the regex.
Also broaden the sitemap URL regex to capture new-style URLs without .html,
e.g. /products/nvidia-networking/102612 (these were previously missed).
This commit is contained in:
parent
1a7c928120
commit
57e20efe49
@ -187,15 +187,40 @@ function parseDetailPage(html: string, url: string): {
|
|||||||
if (!name || name.length < 10) return null;
|
if (!name || name.length < 10) return null;
|
||||||
if (!isTransceiver(name)) return null;
|
if (!isTransceiver(name)) return null;
|
||||||
|
|
||||||
// Price: "US$ 10.90" or "$10.90"
|
// Price: prefer LD+JSON offers.price (Astro/Shopify structure, e.g. "price":"731.00")
|
||||||
|
// Fall back to "US$ 10.90" or "$10.90" visible text patterns
|
||||||
|
let price: number | undefined;
|
||||||
|
const ldJsonMatch = html.match(/<script[^>]+type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi);
|
||||||
|
if (ldJsonMatch) {
|
||||||
|
for (const block of ldJsonMatch) {
|
||||||
|
try {
|
||||||
|
const jsonStr = block.replace(/<script[^>]*>/, "").replace(/<\/script>/, "");
|
||||||
|
const ld = JSON.parse(jsonStr) as Record<string, unknown>;
|
||||||
|
const offers = ld["offers"] as Record<string, unknown> | undefined;
|
||||||
|
if (offers && typeof offers["price"] === "string") {
|
||||||
|
const p = parseFloat(offers["price"].replace(/,/g, ""));
|
||||||
|
if (p > 0 && p < 500000) { price = p; break; }
|
||||||
|
}
|
||||||
|
if (offers && typeof offers["price"] === "number" && offers["price"] > 0 && (offers["price"] as number) < 500000) {
|
||||||
|
price = offers["price"] as number;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch { /* skip malformed blocks */ }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!price) {
|
||||||
const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) ||
|
const priceMatch = html.match(/US\$\s*([\d,]+\.?\d{0,2})/i) ||
|
||||||
html.match(/\$\s*([\d,]+\.\d{2})\b/);
|
html.match(/\$\s*([\d,]+\.\d{2})\b/);
|
||||||
const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, "")) : undefined;
|
if (priceMatch) {
|
||||||
|
const p = parseFloat(priceMatch[1].replace(/,/g, ""));
|
||||||
|
if (p > 0 && p < 500000) price = p;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Stock count
|
// Stock count
|
||||||
const stock = parseStockText(html);
|
const stock = parseStockText(html);
|
||||||
|
|
||||||
return { name, price: price && price > 0 && price < 500000 ? price : undefined, stock };
|
return { name, price, stock };
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Sitemap parsing ─────────────────────────────────────────────────────────
|
// ── Sitemap parsing ─────────────────────────────────────────────────────────
|
||||||
@ -204,14 +229,17 @@ async function fetchProductUrlsFromSitemap(): Promise<string[]> {
|
|||||||
console.log(` Fetching sitemap: ${SITEMAP_URL}`);
|
console.log(` Fetching sitemap: ${SITEMAP_URL}`);
|
||||||
const xml = await fetchText(SITEMAP_URL);
|
const xml = await fetchText(SITEMAP_URL);
|
||||||
|
|
||||||
// Extract all <loc> URLs that match /products/XXXXX.html
|
// Extract all <loc> product URLs — supports both:
|
||||||
|
// /products/12345.html (most products)
|
||||||
|
// /products/brand-slug/12345 (new URL style without .html)
|
||||||
const urls: string[] = [];
|
const urls: string[] = [];
|
||||||
const locRegex = /<loc>([^<]+\/products\/\d+\.html)<\/loc>/gi;
|
const locRegex = /<loc>([^<]+\/products\/[^<\s]+)<\/loc>/gi;
|
||||||
let m: RegExpExecArray | null;
|
let m: RegExpExecArray | null;
|
||||||
while ((m = locRegex.exec(xml)) !== null) {
|
while ((m = locRegex.exec(xml)) !== null) {
|
||||||
const url = m[1].trim();
|
const url = m[1].trim();
|
||||||
// Keep only canonical English URLs (no language prefix)
|
// Keep only canonical English URLs (no language prefix like /en/, /de/, /fr/, /ja/ etc.)
|
||||||
if (!url.includes("/en/") && !url.includes("/de/") && !url.includes("/fr/")) {
|
const hasLangPrefix = /\/[a-z]{2}\/products\//.test(url);
|
||||||
|
if (!hasLangPrefix) {
|
||||||
urls.push(url);
|
urls.push(url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user