fix: FS.com price extraction — DOM-based prices + shipping-context exclusion

- All 247 FS.com prices were €79 (shipping threshold, not product prices)
- Root cause: 'Gratis Versand ab 79 € (ohne MwSt.)' banner matched first
- Fix 1: DOM price extraction in page.evaluate with bad-parent skip list
- Fix 2: bodyText qualified patterns skip matches near shipping keywords
- Fix 3: waitForSelector for price DOM element before evaluate
- Fix 4: Deleted 247 invalid €79 observations from DB

Also included from previous session:
- db.ts: set has_image=true on image writes (fix 632 desync rows)
- spec-updater.ts: DR/FR/LR/ER/ZR → SMF, SR → MMF fiber type inference
This commit is contained in:
Rene Fichtmueller 2026-04-18 13:10:35 +02:00
parent 0e91e8b11c
commit ff0cee2e80
4 changed files with 90 additions and 22 deletions

View File

@ -3,6 +3,10 @@
Format: `{"d":"YYYY-MM-DD","t":"TYPE","m":"Description"}`
Types: FEAT · FIX · UI · DATA · AI · INFRA
{"d":"2026-04-18","t":"FIX","m":"FS.com scraper: all 247 prices written as €79 (wrong) — root cause: 'Gratis Versand ab 79 € (ohne MwSt.)' free-shipping banner appears on every FS.com product page. PRICE_QUALIFIED bodyText regex matched this banner text before reaching the actual product price. Fix: (1) DOM-based price extraction added to page.evaluate — targets [class*='price-value']/[class*='product-price'] etc., skipping elements inside shipping/banner/footer parents; (2) bodyText qualified patterns now check 200-char context for versand/shipping/gratis keywords and skip matches that appear in shipping context; (3) waitForSelector for price elements added before evaluate; (4) deleted 247 invalid €79 observations from DB."}
{"d":"2026-04-18","t":"FIX","m":"has_image flag desync: 671 transceivers had image_url set but has_image=false. Fixed: (1) db.ts findOrCreateScrapedTransceiver now sets has_image=true, image_verified=true on both INSERT (ON CONFLICT DO UPDATE) and UPDATE path; (2) DB bulk UPDATE SET has_image=true WHERE image_url IS NOT NULL AND has_image=false (632 rows fixed)."}
{"d":"2026-04-18","t":"FIX","m":"Fiber type missing for 400G/800G parallel-optic modules (DR8/SR8/FR8 etc.): spec-updater parseSpecTable did not recognize standard abbreviations. Added DR/FR/LR/ER/ZR → SMF and SR → MMF patterns for both 'Fiber Type' field values and part-number-style keys. DB bulk UPDATE applied: 55 transceivers set to SMF, 20 to MMF."}
{"d":"2026-04-18","t":"FIX","m":"Dashboard blog generation: both generateBlog() and generateBlogManual() were calling POST /api/blog/generate without Authorization: Bearer header. requireAuth middleware correctly returned 401, shown as 'Unauthorized — please log in' toast. Fixed: read loadToken() before each fetch and include token in header. Also added r.status===401 guard to redirect to login page on token expiry."}
{"d":"2026-04-18","t":"FIX","m":"PM2 SKIP_FS_SCRAPER env not picked up by tip-scraper-daemon: pm2 restart --update-env did not apply new ecosystem.config.js vars because PM2 loaded from its saved dump. Fixed: pm2 delete + pm2 start ecosystem.config.js --only tip-scraper-daemon + pm2 save. Daemon restarted fresh (ID 83, 0 restarts) with SKIP_FS_SCRAPER=true now confirmed live. FS.com job now correctly skips on Erik instead of failing with ENOENT."}
{"d":"2026-04-18","t":"FIX","m":"FS.com Mac scraper: suppress Crawlee post-run ENOENT unhandledRejection — Crawlee's FileSystemStorage fires a final _isTaskReadyFunction call after run() resolves, reading a request .json that was already processed/cleaned-up. This ENOENT triggered process.exit(1) before Phase 2 completed, causing 7 days of missing FS.com price data. Fixed: targeted unhandledRejection handler in require.main block swallows ENOENT from request_queues paths while re-raising real errors."}
{"d":"2026-04-18","t":"FIX","m":"FS.com Mac scraper: PID lock (/tmp/tip-fs-scraper.lock) added to run-fs-scraper-mac.sh — prevents concurrent instances when launchd 2am fire overlaps with a still-running earlier run. Previous concurrent instances caused rmSync(storage-fs-phase1) race (one instance deletes the storage dir while another is using it), crashing Phase 2."}

View File

@ -410,10 +410,16 @@ async function scrapeProductDetails(
const url = request.url;
try {
// Wait for the page structure AND ideally a price element to render
await page.waitForSelector(
'h1, .product-detail, [class*="product-info"], [class*="product-main"]',
{ timeout: 12000 }
);
// Give JS-rendered price elements a moment to appear after the DOM is ready
await page.waitForSelector(
'[class*="price-value"], [class*="product-price"], [class*="prod-price"], [class*="final-price"]',
{ timeout: 5000 }
).catch(() => { /* price element optional — proceed with bodyText fallback */ });
} catch {
await page.waitForTimeout(7000);
}
@ -421,6 +427,7 @@ async function scrapeProductDetails(
const raw = await page.evaluate(
(): {
bodyText: string;
priceRaw: string;
specs: Record<string, string>;
brands: string[];
imageUrl: string;
@ -429,6 +436,37 @@ async function scrapeProductDetails(
} => {
const bodyText = (document.body?.innerText ?? "").replace(/\n{3,}/g, "\n\n");
// ── DOM price extraction (avoids matching site-wide shipping threshold) ──
// FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)" on every page.
// bodyText regex matches this and returns 79 for ALL products. We extract
// the actual product price from its own DOM element, skipping bad parents.
let priceRaw = "";
const PRICE_SELS = [
"[class*='price-value']",
"[class*='product-price']",
"[class*='prod-price']",
"[class*='final-price']",
"[class*='regular-price']",
"[class*='price-amount']",
"[data-cy='price']",
".price-box",
];
const SKIP_PARENT =
"[class*='shipping'], [class*='banner'], [class*='delivery'], " +
"[class*='free-ship'], [class*='cart'], [class*='checkout'], " +
"[class*='notice'], [class*='promo'], footer, header, nav";
outer: for (const sel of PRICE_SELS) {
for (const el of Array.from(document.querySelectorAll<HTMLElement>(sel))) {
if (el.closest(SKIP_PARENT)) continue;
const txt = (el.textContent ?? "").replace(/\s+/g, " ").trim();
// Must contain a digit, currency marker, and be short (<40 chars)
if (/\d/.test(txt) && txt.length < 40 && /[€$]|EUR/i.test(txt)) {
priceRaw = txt;
break outer;
}
}
}
const specs: Record<string, string> = {};
const SEL = [
".product-param tr", ".product-specs tr", ".param-table tr",
@ -488,7 +526,7 @@ async function scrapeProductDetails(
const datasheetUrl = dsEl?.href ?? dsEl?.getAttribute("href") ?? "";
const h1 = document.querySelector("h1")?.textContent?.trim() ?? "";
return { bodyText, specs, brands, imageUrl, datasheetUrl, h1 };
return { bodyText, priceRaw, specs, brands, imageUrl, datasheetUrl, h1 };
}
);
@ -496,27 +534,45 @@ async function scrapeProductDetails(
const t = raw.bodyText;
// ── Net price (ohne MwSt, EUR) ─────────────────────────────────────────
// Priority: patterns that require "ohne MwSt" or "netto" qualifier (FS.com shows
// real prices this way). Fallback broad patterns are only accepted above €100
// to avoid matching FS.com's €79 placeholder/template price.
// Strategy:
// 1. DOM extraction (priceRaw) — most reliable, avoids shipping-threshold text
// 2. bodyText qualified patterns (ohne MwSt / netto) with shipping-ctx exclusion
// 3. Broad bodyText fallback — only >€100 to skip free-shipping threshold
//
// Root cause of the €79 bug: FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)"
// on every page. The qualified regex matched that and returned 79 for every product.
let priceNet: number | undefined;
// 1. DOM-extracted price string
if (raw.priceRaw) {
const p = parseGermanPrice(raw.priceRaw);
if (p && p > 0.5 && p < 500_000) priceNet = p;
}
// 2. bodyText qualified patterns — with shipping-context exclusion
if (!priceNet) {
const PRICE_QUALIFIED: RegExp[] = [
/([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/i,
/€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/i,
/([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/i,
/Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/i,
// Labelled price ("Preis: 12,34 €") and explicit excl.-VAT suffix ("12,34 € exkl.") patterns
/Preis[:\s]*([0-9]{1,5},[0-9]{2})\s*€/i,
/([0-9]{1,5},[0-9]{2})\s*€\s*(?:exkl\.|exklusive|excl\.)/i,
];
for (const pat of PRICE_QUALIFIED) {
const m = t.match(pat);
if (m?.[1]) {
// Skip if the match appears in a shipping/free-delivery context
const matchIdx = m.index ?? t.indexOf(m[0]);
const ctx = t.slice(Math.max(0, matchIdx - 200), matchIdx + 200);
if (/versand|shipping|lieferung.*\bab\b|\bab\b.*versand|gratis.*ab|kostenlos/i.test(ctx)) continue;
const p = parseGermanPrice(m[1]);
if (p && p > 0.5 && p < 500_000) { priceNet = p; break; }
}
}
// Broad fallback — only accept if price > €100 (avoids FS.com's €79 placeholder)
}
// 3. Broad bodyText fallback — only accept > €100 (free-shipping threshold is €79)
if (!priceNet) {
for (const pat of [/([0-9]{1,5},[0-9]{2})\s*€/, /€\s*([0-9]{1,5},[0-9]{2})/]) {
const m = t.match(pat);

View File

@ -295,10 +295,10 @@ export async function findOrCreateScrapedTransceiver(params: {
);
if (existing.rows.length > 0) {
// Update image_url and image_verified if we have a new image for a record without one
// Update image_url, has_image and image_verified if we have a new image for a record without one
if (params.imageUrl && !existing.rows[0].image_url) {
await pool.query(
`UPDATE transceivers SET image_url = $1, image_verified = true, updated_at = NOW() WHERE id = $2`,
`UPDATE transceivers SET image_url = $1, has_image = true, image_verified = true, updated_at = NOW() WHERE id = $2`,
[params.imageUrl, existing.rows[0].id]
);
await checkAndSetFullyVerified(existing.rows[0].id);
@ -311,7 +311,7 @@ export async function findOrCreateScrapedTransceiver(params: {
const result = await pool.query(
`INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status, image_url, image_verified)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream', $12, $13)
ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW()
ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), has_image = COALESCE(transceivers.has_image, EXCLUDED.has_image), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW()
RETURNING id`,
[
slug,

View File

@ -120,6 +120,14 @@ export function parseSpecTable(specs: Record<string, string>): Partial<VerifiedS
else if (/multi.?mode|mmf|om[1-5]/i.test(val)) result.fiberType = "MMF";
else if (/copper|cat[56]/i.test(val)) result.fiberType = "Copper";
else if (/aoc|active.optical/i.test(val)) result.fiberType = "AOC";
// 400G/800G parallel-optic standards: DR/FR/LR/ER/ZR = SMF, SR = MMF
else if (/\bdr\d*\b|\bfr\d*\b|\blr\d*\b|\ber\d*\b|\bzr\d*\b/i.test(val)) result.fiberType = "SMF";
else if (/\bsr\d*\b/i.test(val)) result.fiberType = "MMF";
}
// Also infer fiber type from part-number-style keys when fiber key absent
if (!result.fiberType && (key === "part number" || key === "model" || key === "sku")) {
if (/\b(dr|fr|lr|er|zr)\d*\b/i.test(val)) result.fiberType = "SMF";
else if (/\bsr\d*\b/i.test(val)) result.fiberType = "MMF";
}
// Connector