From 1da4abc488d9e144a550f1f258ee239a9d03b588 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 18 Apr 2026 13:10:35 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20FS.com=20price=20extraction=20=E2=80=94?= =?UTF-8?q?=20DOM-based=20prices=20+=20shipping-context=20exclusion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - All 247 FS.com prices were €79 (shipping threshold, not product prices) - Root cause: 'Gratis Versand ab 79 € (ohne MwSt.)' banner matched first - Fix 1: DOM price extraction in page.evaluate with bad-parent skip list - Fix 2: bodyText qualified patterns skip matches near shipping keywords - Fix 3: waitForSelector for price DOM element before evaluate - Fix 4: Deleted 247 invalid €79 observations from DB Also included from previous session: - db.ts: set has_image=true on image writes (fix 632 desync rows) - spec-updater.ts: DR/FR/LR/ER/ZR → SMF, SR → MMF fiber type inference --- CHANGELOG_PENDING.md | 4 + packages/scraper/src/scrapers/fs-com.ts | 94 +++++++++++++++++----- packages/scraper/src/utils/db.ts | 6 +- packages/scraper/src/utils/spec-updater.ts | 8 ++ 4 files changed, 90 insertions(+), 22 deletions(-) diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index 3755885..29078ed 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -3,6 +3,10 @@ Format: `{"d":"YYYY-MM-DD","t":"TYPE","m":"Description"}` Types: FEAT · FIX · UI · DATA · AI · INFRA +{"d":"2026-04-18","t":"FIX","m":"FS.com scraper: all 247 prices written as €79 (wrong) — root cause: 'Gratis Versand ab 79 € (ohne MwSt.)' free-shipping banner appears on every FS.com product page. PRICE_QUALIFIED bodyText regex matched this banner text before reaching the actual product price. Fix: (1) DOM-based price extraction added to page.evaluate — targets [class*='price-value']/[class*='product-price'] etc., skipping elements inside shipping/banner/footer parents; (2) bodyText qualified patterns now check 200-char context for versand/shipping/gratis keywords and skip matches that appear in shipping context; (3) waitForSelector for price elements added before evaluate; (4) deleted 247 invalid €79 observations from DB."} +{"d":"2026-04-18","t":"FIX","m":"has_image flag desync: 671 transceivers had image_url set but has_image=false. Fixed: (1) db.ts findOrCreateScrapedTransceiver now sets has_image=true, image_verified=true on both INSERT (ON CONFLICT DO UPDATE) and UPDATE path; (2) DB bulk UPDATE SET has_image=true WHERE image_url IS NOT NULL AND has_image=false (632 rows fixed)."} +{"d":"2026-04-18","t":"FIX","m":"Fiber type missing for 400G/800G parallel-optic modules (DR8/SR8/FR8 etc.): spec-updater parseSpecTable did not recognize standard abbreviations. Added DR/FR/LR/ER/ZR → SMF and SR → MMF patterns for both 'Fiber Type' field values and part-number-style keys. DB bulk UPDATE applied: 55 transceivers set to SMF, 20 to MMF."} +{"d":"2026-04-18","t":"FIX","m":"Dashboard blog generation: both generateBlog() and generateBlogManual() were calling POST /api/blog/generate without Authorization: Bearer header. requireAuth middleware correctly returned 401, shown as 'Unauthorized — please log in' toast. Fixed: read loadToken() before each fetch and include token in header. Also added r.status===401 guard to redirect to login page on token expiry."} {"d":"2026-04-18","t":"FIX","m":"PM2 SKIP_FS_SCRAPER env not picked up by tip-scraper-daemon: pm2 restart --update-env did not apply new ecosystem.config.js vars because PM2 loaded from its saved dump. Fixed: pm2 delete + pm2 start ecosystem.config.js --only tip-scraper-daemon + pm2 save. Daemon restarted fresh (ID 83, 0 restarts) with SKIP_FS_SCRAPER=true now confirmed live. FS.com job now correctly skips on Erik instead of failing with ENOENT."} {"d":"2026-04-18","t":"FIX","m":"FS.com Mac scraper: suppress Crawlee post-run ENOENT unhandledRejection — Crawlee's FileSystemStorage fires a final _isTaskReadyFunction call after run() resolves, reading a request .json that was already processed/cleaned-up. This ENOENT triggered process.exit(1) before Phase 2 completed, causing 7 days of missing FS.com price data. Fixed: targeted unhandledRejection handler in require.main block swallows ENOENT from request_queues paths while re-raising real errors."} {"d":"2026-04-18","t":"FIX","m":"FS.com Mac scraper: PID lock (/tmp/tip-fs-scraper.lock) added to run-fs-scraper-mac.sh — prevents concurrent instances when launchd 2am fire overlaps with a still-running earlier run. Previous concurrent instances caused rmSync(storage-fs-phase1) race (one instance deletes the storage dir while another is using it), crashing Phase 2."} diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index 235489b..1a1b711 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -410,10 +410,16 @@ async function scrapeProductDetails( const url = request.url; try { + // Wait for the page structure AND ideally a price element to render await page.waitForSelector( 'h1, .product-detail, [class*="product-info"], [class*="product-main"]', { timeout: 12000 } ); + // Give JS-rendered price elements a moment to appear after the DOM is ready + await page.waitForSelector( + '[class*="price-value"], [class*="product-price"], [class*="prod-price"], [class*="final-price"]', + { timeout: 5000 } + ).catch(() => { /* price element optional — proceed with bodyText fallback */ }); } catch { await page.waitForTimeout(7000); } @@ -421,6 +427,7 @@ async function scrapeProductDetails( const raw = await page.evaluate( (): { bodyText: string; + priceRaw: string; specs: Record; brands: string[]; imageUrl: string; @@ -429,6 +436,37 @@ async function scrapeProductDetails( } => { const bodyText = (document.body?.innerText ?? "").replace(/\n{3,}/g, "\n\n"); + // ── DOM price extraction (avoids matching site-wide shipping threshold) ── + // FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)" on every page. + // bodyText regex matches this and returns 79 for ALL products. We extract + // the actual product price from its own DOM element, skipping bad parents. + let priceRaw = ""; + const PRICE_SELS = [ + "[class*='price-value']", + "[class*='product-price']", + "[class*='prod-price']", + "[class*='final-price']", + "[class*='regular-price']", + "[class*='price-amount']", + "[data-cy='price']", + ".price-box", + ]; + const SKIP_PARENT = + "[class*='shipping'], [class*='banner'], [class*='delivery'], " + + "[class*='free-ship'], [class*='cart'], [class*='checkout'], " + + "[class*='notice'], [class*='promo'], footer, header, nav"; + outer: for (const sel of PRICE_SELS) { + for (const el of Array.from(document.querySelectorAll(sel))) { + if (el.closest(SKIP_PARENT)) continue; + const txt = (el.textContent ?? "").replace(/\s+/g, " ").trim(); + // Must contain a digit, currency marker, and be short (<40 chars) + if (/\d/.test(txt) && txt.length < 40 && /[€$]|EUR/i.test(txt)) { + priceRaw = txt; + break outer; + } + } + } + const specs: Record = {}; const SEL = [ ".product-param tr", ".product-specs tr", ".param-table tr", @@ -488,7 +526,7 @@ async function scrapeProductDetails( const datasheetUrl = dsEl?.href ?? dsEl?.getAttribute("href") ?? ""; const h1 = document.querySelector("h1")?.textContent?.trim() ?? ""; - return { bodyText, specs, brands, imageUrl, datasheetUrl, h1 }; + return { bodyText, priceRaw, specs, brands, imageUrl, datasheetUrl, h1 }; } ); @@ -496,27 +534,45 @@ async function scrapeProductDetails( const t = raw.bodyText; // ── Net price (ohne MwSt, EUR) ───────────────────────────────────────── - // Priority: patterns that require "ohne MwSt" or "netto" qualifier (FS.com shows - // real prices this way). Fallback broad patterns are only accepted above €100 - // to avoid matching FS.com's €79 placeholder/template price. + // Strategy: + // 1. DOM extraction (priceRaw) — most reliable, avoids shipping-threshold text + // 2. bodyText qualified patterns (ohne MwSt / netto) with shipping-ctx exclusion + // 3. Broad bodyText fallback — only >€100 to skip free-shipping threshold + // + // Root cause of the €79 bug: FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)" + // on every page. The qualified regex matched that and returned 79 for every product. let priceNet: number | undefined; - const PRICE_QUALIFIED: RegExp[] = [ - /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/i, - /€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/i, - /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/i, - /Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/i, - // DOM-extracted price element (set by page.evaluate in raw) - /Preis[:\s]*([0-9]{1,5},[0-9]{2})\s*€/i, - /([0-9]{1,5},[0-9]{2})\s*€\s*(?:exkl\.|exklusive|excl\.)/i, - ]; - for (const pat of PRICE_QUALIFIED) { - const m = t.match(pat); - if (m?.[1]) { - const p = parseGermanPrice(m[1]); - if (p && p > 0.5 && p < 500_000) { priceNet = p; break; } + + // 1. DOM-extracted price string + if (raw.priceRaw) { + const p = parseGermanPrice(raw.priceRaw); + if (p && p > 0.5 && p < 500_000) priceNet = p; + } + + // 2. bodyText qualified patterns — with shipping-context exclusion + if (!priceNet) { + const PRICE_QUALIFIED: RegExp[] = [ + /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/i, + /€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/i, + /([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/i, + /Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/i, + /Preis[:\s]*([0-9]{1,5},[0-9]{2})\s*€/i, + /([0-9]{1,5},[0-9]{2})\s*€\s*(?:exkl\.|exklusive|excl\.)/i, + ]; + for (const pat of PRICE_QUALIFIED) { + const m = t.match(pat); + if (m?.[1]) { + // Skip if the match appears in a shipping/free-delivery context + const matchIdx = m.index ?? t.indexOf(m[0]); + const ctx = t.slice(Math.max(0, matchIdx - 200), matchIdx + 200); + if (/versand|shipping|lieferung.*\bab\b|\bab\b.*versand|gratis.*ab|kostenlos/i.test(ctx)) continue; + const p = parseGermanPrice(m[1]); + if (p && p > 0.5 && p < 500_000) { priceNet = p; break; } + } } } - // Broad fallback — only accept if price > €100 (avoids FS.com's €79 placeholder) + + // 3. Broad bodyText fallback — only accept > €100 (free-shipping threshold is €79) if (!priceNet) { for (const pat of [/([0-9]{1,5},[0-9]{2})\s*€/, /€\s*([0-9]{1,5},[0-9]{2})/]) { const m = t.match(pat); diff --git a/packages/scraper/src/utils/db.ts b/packages/scraper/src/utils/db.ts index b113913..ea16329 100644 --- a/packages/scraper/src/utils/db.ts +++ b/packages/scraper/src/utils/db.ts @@ -295,10 +295,10 @@ export async function findOrCreateScrapedTransceiver(params: { ); if (existing.rows.length > 0) { - // Update image_url and image_verified if we have a new image for a record without one + // Update image_url, has_image and image_verified if we have a new image for a record without one if (params.imageUrl && !existing.rows[0].image_url) { await pool.query( - `UPDATE transceivers SET image_url = $1, image_verified = true, updated_at = NOW() WHERE id = $2`, + `UPDATE transceivers SET image_url = $1, has_image = true, image_verified = true, updated_at = NOW() WHERE id = $2`, [params.imageUrl, existing.rows[0].id] ); await checkAndSetFullyVerified(existing.rows[0].id); @@ -311,7 +311,7 @@ export async function findOrCreateScrapedTransceiver(params: { const result = await pool.query( `INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status, image_url, image_verified) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream', $12, $13) - ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW() + ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), has_image = COALESCE(transceivers.has_image, EXCLUDED.has_image), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW() RETURNING id`, [ slug, diff --git a/packages/scraper/src/utils/spec-updater.ts b/packages/scraper/src/utils/spec-updater.ts index 9b16b6e..c8aa714 100644 --- a/packages/scraper/src/utils/spec-updater.ts +++ b/packages/scraper/src/utils/spec-updater.ts @@ -120,6 +120,14 @@ export function parseSpecTable(specs: Record): Partial