fix: FS.com price extraction — DOM-based prices + shipping-context exclusion
- All 247 FS.com prices were €79 (shipping threshold, not product prices)
- Root cause: 'Gratis Versand ab 79 € (ohne MwSt.)' banner matched first
- Fix 1: DOM price extraction in page.evaluate with bad-parent skip list
- Fix 2: bodyText qualified patterns skip matches near shipping keywords
- Fix 3: waitForSelector for price DOM element before evaluate
- Fix 4: Deleted 247 invalid €79 observations from DB

Also included from previous session:
- db.ts: set has_image=true on image writes (fix 632 desync rows)
- spec-updater.ts: DR/FR/LR/ER/ZR → SMF, SR → MMF fiber type inference
This commit is contained in:
parent
0e91e8b11c
commit
ff0cee2e80
@ -3,6 +3,10 @@
|
||||
Format: `{"d":"YYYY-MM-DD","t":"TYPE","m":"Description"}`
|
||||
Types: FEAT · FIX · UI · DATA · AI · INFRA
|
||||
|
||||
{"d":"2026-04-18","t":"FIX","m":"FS.com scraper: all 247 prices written as €79 (wrong) — root cause: 'Gratis Versand ab 79 € (ohne MwSt.)' free-shipping banner appears on every FS.com product page. PRICE_QUALIFIED bodyText regex matched this banner text before reaching the actual product price. Fix: (1) DOM-based price extraction added to page.evaluate — targets [class*='price-value']/[class*='product-price'] etc., skipping elements inside shipping/banner/footer parents; (2) bodyText qualified patterns now check 200-char context for versand/shipping/gratis keywords and skip matches that appear in shipping context; (3) waitForSelector for price elements added before evaluate; (4) deleted 247 invalid €79 observations from DB."}
|
||||
{"d":"2026-04-18","t":"FIX","m":"has_image flag desync: 671 transceivers had image_url set but has_image=false. Fixed: (1) db.ts findOrCreateScrapedTransceiver now sets has_image=true, image_verified=true on both INSERT (ON CONFLICT DO UPDATE) and UPDATE path; (2) DB bulk UPDATE SET has_image=true WHERE image_url IS NOT NULL AND has_image=false (632 rows fixed)."}
|
||||
{"d":"2026-04-18","t":"FIX","m":"Fiber type missing for 400G/800G parallel-optic modules (DR8/SR8/FR8 etc.): spec-updater parseSpecTable did not recognize standard abbreviations. Added DR/FR/LR/ER/ZR → SMF and SR → MMF patterns for both 'Fiber Type' field values and part-number-style keys. DB bulk UPDATE applied: 55 transceivers set to SMF, 20 to MMF."}
|
||||
{"d":"2026-04-18","t":"FIX","m":"Dashboard blog generation: both generateBlog() and generateBlogManual() were calling POST /api/blog/generate without Authorization: Bearer header. requireAuth middleware correctly returned 401, shown as 'Unauthorized — please log in' toast. Fixed: read loadToken() before each fetch and include token in header. Also added r.status===401 guard to redirect to login page on token expiry."}
|
||||
{"d":"2026-04-18","t":"FIX","m":"PM2 SKIP_FS_SCRAPER env not picked up by tip-scraper-daemon: pm2 restart --update-env did not apply new ecosystem.config.js vars because PM2 loaded from its saved dump. Fixed: pm2 delete + pm2 start ecosystem.config.js --only tip-scraper-daemon + pm2 save. Daemon restarted fresh (ID 83, 0 restarts) with SKIP_FS_SCRAPER=true now confirmed live. FS.com job now correctly skips on Erik instead of failing with ENOENT."}
|
||||
{"d":"2026-04-18","t":"FIX","m":"FS.com Mac scraper: suppress Crawlee post-run ENOENT unhandledRejection — Crawlee's FileSystemStorage fires a final _isTaskReadyFunction call after run() resolves, reading a request .json that was already processed/cleaned-up. This ENOENT triggered process.exit(1) before Phase 2 completed, causing 7 days of missing FS.com price data. Fixed: targeted unhandledRejection handler in require.main block swallows ENOENT from request_queues paths while re-raising real errors."}
|
||||
{"d":"2026-04-18","t":"FIX","m":"FS.com Mac scraper: PID lock (/tmp/tip-fs-scraper.lock) added to run-fs-scraper-mac.sh — prevents concurrent instances when launchd 2am fire overlaps with a still-running earlier run. Previous concurrent instances caused rmSync(storage-fs-phase1) race (one instance deletes the storage dir while another is using it), crashing Phase 2."}
|
||||
|
||||
@ -410,10 +410,16 @@ async function scrapeProductDetails(
|
||||
const url = request.url;
|
||||
|
||||
try {
|
||||
// Wait for the page structure AND ideally a price element to render
|
||||
await page.waitForSelector(
|
||||
'h1, .product-detail, [class*="product-info"], [class*="product-main"]',
|
||||
{ timeout: 12000 }
|
||||
);
|
||||
// Give JS-rendered price elements a moment to appear after the DOM is ready
|
||||
await page.waitForSelector(
|
||||
'[class*="price-value"], [class*="product-price"], [class*="prod-price"], [class*="final-price"]',
|
||||
{ timeout: 5000 }
|
||||
).catch(() => { /* price element optional — proceed with bodyText fallback */ });
|
||||
} catch {
|
||||
await page.waitForTimeout(7000);
|
||||
}
|
||||
@ -421,6 +427,7 @@ async function scrapeProductDetails(
|
||||
const raw = await page.evaluate(
|
||||
(): {
|
||||
bodyText: string;
|
||||
priceRaw: string;
|
||||
specs: Record<string, string>;
|
||||
brands: string[];
|
||||
imageUrl: string;
|
||||
@ -429,6 +436,37 @@ async function scrapeProductDetails(
|
||||
} => {
|
||||
const bodyText = (document.body?.innerText ?? "").replace(/\n{3,}/g, "\n\n");
|
||||
|
||||
// ── DOM price extraction (avoids matching site-wide shipping threshold) ──
|
||||
// FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)" on every page.
|
||||
// bodyText regex matches this and returns 79 for ALL products. We extract
|
||||
// the actual product price from its own DOM element, skipping bad parents.
|
||||
let priceRaw = "";
|
||||
const PRICE_SELS = [
|
||||
"[class*='price-value']",
|
||||
"[class*='product-price']",
|
||||
"[class*='prod-price']",
|
||||
"[class*='final-price']",
|
||||
"[class*='regular-price']",
|
||||
"[class*='price-amount']",
|
||||
"[data-cy='price']",
|
||||
".price-box",
|
||||
];
|
||||
const SKIP_PARENT =
|
||||
"[class*='shipping'], [class*='banner'], [class*='delivery'], " +
|
||||
"[class*='free-ship'], [class*='cart'], [class*='checkout'], " +
|
||||
"[class*='notice'], [class*='promo'], footer, header, nav";
|
||||
outer: for (const sel of PRICE_SELS) {
|
||||
for (const el of Array.from(document.querySelectorAll<HTMLElement>(sel))) {
|
||||
if (el.closest(SKIP_PARENT)) continue;
|
||||
const txt = (el.textContent ?? "").replace(/\s+/g, " ").trim();
|
||||
// Must contain a digit, currency marker, and be short (<40 chars)
|
||||
if (/\d/.test(txt) && txt.length < 40 && /[€$]|EUR/i.test(txt)) {
|
||||
priceRaw = txt;
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const specs: Record<string, string> = {};
|
||||
const SEL = [
|
||||
".product-param tr", ".product-specs tr", ".param-table tr",
|
||||
@ -488,7 +526,7 @@ async function scrapeProductDetails(
|
||||
const datasheetUrl = dsEl?.href ?? dsEl?.getAttribute("href") ?? "";
|
||||
|
||||
const h1 = document.querySelector("h1")?.textContent?.trim() ?? "";
|
||||
return { bodyText, specs, brands, imageUrl, datasheetUrl, h1 };
|
||||
return { bodyText, priceRaw, specs, brands, imageUrl, datasheetUrl, h1 };
|
||||
}
|
||||
);
|
||||
|
||||
@ -496,27 +534,45 @@ async function scrapeProductDetails(
|
||||
const t = raw.bodyText;
|
||||
|
||||
// ── Net price (ohne MwSt, EUR) ─────────────────────────────────────────
|
||||
// Priority: patterns that require "ohne MwSt" or "netto" qualifier (FS.com shows
|
||||
// real prices this way). Fallback broad patterns are only accepted above €100
|
||||
// to avoid matching FS.com's €79 placeholder/template price.
|
||||
// Strategy:
|
||||
// 1. DOM extraction (priceRaw) — most reliable, avoids shipping-threshold text
|
||||
// 2. bodyText qualified patterns (ohne MwSt / netto) with shipping-ctx exclusion
|
||||
// 3. Broad bodyText fallback — only >€100 to skip free-shipping threshold
|
||||
//
|
||||
// Root cause of the €79 bug: FS.com shows "Gratis Versand ab 79 € (ohne MwSt.)"
|
||||
// on every page. The qualified regex matched that and returned 79 for every product.
|
||||
let priceNet: number | undefined;
|
||||
const PRICE_QUALIFIED: RegExp[] = [
|
||||
/([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/i,
|
||||
/€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/i,
|
||||
/([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/i,
|
||||
/Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/i,
|
||||
// DOM-extracted price element (set by page.evaluate in raw)
|
||||
/Preis[:\s]*([0-9]{1,5},[0-9]{2})\s*€/i,
|
||||
/([0-9]{1,5},[0-9]{2})\s*€\s*(?:exkl\.|exklusive|excl\.)/i,
|
||||
];
|
||||
for (const pat of PRICE_QUALIFIED) {
|
||||
const m = t.match(pat);
|
||||
if (m?.[1]) {
|
||||
const p = parseGermanPrice(m[1]);
|
||||
if (p && p > 0.5 && p < 500_000) { priceNet = p; break; }
|
||||
|
||||
// 1. DOM-extracted price string
|
||||
if (raw.priceRaw) {
|
||||
const p = parseGermanPrice(raw.priceRaw);
|
||||
if (p && p > 0.5 && p < 500_000) priceNet = p;
|
||||
}
|
||||
|
||||
// 2. bodyText qualified patterns — with shipping-context exclusion
|
||||
if (!priceNet) {
|
||||
const PRICE_QUALIFIED: RegExp[] = [
|
||||
/([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*\(?ohne\s+MwSt\.?\)?/i,
|
||||
/€\s*([0-9]{1,5}[,.]?[0-9]{0,3})\s*\(?ohne\s+MwSt\.?\)?/i,
|
||||
/([0-9]{1,5}[,.]?[0-9]{0,3})\s*€\s*netto/i,
|
||||
/Netto[:\s]*€?\s*([0-9]{1,5}[,.]?[0-9]{0,3})/i,
|
||||
/Preis[:\s]*([0-9]{1,5},[0-9]{2})\s*€/i,
|
||||
/([0-9]{1,5},[0-9]{2})\s*€\s*(?:exkl\.|exklusive|excl\.)/i,
|
||||
];
|
||||
for (const pat of PRICE_QUALIFIED) {
|
||||
const m = t.match(pat);
|
||||
if (m?.[1]) {
|
||||
// Skip if the match appears in a shipping/free-delivery context
|
||||
const matchIdx = m.index ?? t.indexOf(m[0]);
|
||||
const ctx = t.slice(Math.max(0, matchIdx - 200), matchIdx + 200);
|
||||
if (/versand|shipping|lieferung.*\bab\b|\bab\b.*versand|gratis.*ab|kostenlos/i.test(ctx)) continue;
|
||||
const p = parseGermanPrice(m[1]);
|
||||
if (p && p > 0.5 && p < 500_000) { priceNet = p; break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
// Broad fallback — only accept if price > €100 (avoids FS.com's €79 placeholder)
|
||||
|
||||
// 3. Broad bodyText fallback — only accept > €100 (free-shipping threshold is €79)
|
||||
if (!priceNet) {
|
||||
for (const pat of [/([0-9]{1,5},[0-9]{2})\s*€/, /€\s*([0-9]{1,5},[0-9]{2})/]) {
|
||||
const m = t.match(pat);
|
||||
|
||||
@ -295,10 +295,10 @@ export async function findOrCreateScrapedTransceiver(params: {
|
||||
);
|
||||
|
||||
if (existing.rows.length > 0) {
|
||||
// Update image_url and image_verified if we have a new image for a record without one
|
||||
// Update image_url, has_image and image_verified if we have a new image for a record without one
|
||||
if (params.imageUrl && !existing.rows[0].image_url) {
|
||||
await pool.query(
|
||||
`UPDATE transceivers SET image_url = $1, image_verified = true, updated_at = NOW() WHERE id = $2`,
|
||||
`UPDATE transceivers SET image_url = $1, has_image = true, image_verified = true, updated_at = NOW() WHERE id = $2`,
|
||||
[params.imageUrl, existing.rows[0].id]
|
||||
);
|
||||
await checkAndSetFullyVerified(existing.rows[0].id);
|
||||
@ -311,7 +311,7 @@ export async function findOrCreateScrapedTransceiver(params: {
|
||||
const result = await pool.query(
|
||||
`INSERT INTO transceivers (slug, part_number, vendor_id, form_factor, speed_gbps, speed, reach_meters, reach_label, fiber_type, wavelengths, category, market_status, image_url, image_verified)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'Mainstream', $12, $13)
|
||||
ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW()
|
||||
ON CONFLICT (slug) DO UPDATE SET image_url = COALESCE(transceivers.image_url, EXCLUDED.image_url), has_image = COALESCE(transceivers.has_image, EXCLUDED.has_image), image_verified = COALESCE(transceivers.image_verified, EXCLUDED.image_verified), updated_at = NOW()
|
||||
RETURNING id`,
|
||||
[
|
||||
slug,
|
||||
|
||||
@ -120,6 +120,14 @@ export function parseSpecTable(specs: Record<string, string>): Partial<VerifiedS
|
||||
else if (/multi.?mode|mmf|om[1-5]/i.test(val)) result.fiberType = "MMF";
|
||||
else if (/copper|cat[56]/i.test(val)) result.fiberType = "Copper";
|
||||
else if (/aoc|active.optical/i.test(val)) result.fiberType = "AOC";
|
||||
// 400G/800G parallel-optic standards: DR/FR/LR/ER/ZR = SMF, SR = MMF
|
||||
else if (/\bdr\d*\b|\bfr\d*\b|\blr\d*\b|\ber\d*\b|\bzr\d*\b/i.test(val)) result.fiberType = "SMF";
|
||||
else if (/\bsr\d*\b/i.test(val)) result.fiberType = "MMF";
|
||||
}
|
||||
// Also infer fiber type from part-number-style keys when fiber key absent
|
||||
if (!result.fiberType && (key === "part number" || key === "model" || key === "sku")) {
|
||||
if (/\b(dr|fr|lr|er|zr)\d*\b/i.test(val)) result.fiberType = "SMF";
|
||||
else if (/\bsr\d*\b/i.test(val)) result.fiberType = "MMF";
|
||||
}
|
||||
|
||||
// Connector
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user