From 24ff9822acb862a6f97a0a98660260f0530ec4d2 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 18 Apr 2026 02:54:28 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20improve=20scraper=20health=20monitor=20?= =?UTF-8?q?=E2=80=94=20tiered=20alerts,=20suppress=20stable-price=20false?= =?UTF-8?q?=20positives?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous logic fired an alert whenever prices_6h=0, even when prices were genuinely stable (content hash dedup prevents duplicate inserts). This caused Flexoptix, ATGBICS and others to trigger alerts every 3h despite their scrapers running successfully. New logic: 🔴 CRITICAL: last price > 7 days or never scraped (genuine failure) 🟡 WARNING: last price 48h–7 days (possibly stale) ✅ STABLE: last price ≤48h, 0 new in the last 6h (prices unchanged, scraper OK) Also shows the most recent pg-boss job state/completion time (jobs created in the last 12h) alongside each vendor for faster root-cause diagnosis. Trimmed EXPECTED_VENDORS to vendors with actual scraper implementations (removed never-scraped placeholders) and added NADDOD. --- packages/scraper/src/scheduler.ts | 99 ++++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 20 deletions(-) diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 8713c90..8dbaa91 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -685,15 +685,25 @@ export async function registerWorkers(boss: PgBoss): Promise { await boss.work("monitor:scraper-health", async () => { const { pool } = await import("./utils/db"); - // Vendors we expect to see prices from regularly - const EXPECTED_VENDORS = [ - "FiberMall", "QSFPTEK", "Flexoptix", "FS.COM", "10Gtek", - "ATGBICS", "GBICS", "BlueOptics", "ShopFiber24", "T&S Communication", - "Fluxlight", "Optcore", "Champion ONE", "SFPcables", - "Vcelink", "OpticsBay", + // Vendors we expect to see prices from regularly. + // Mapped: display name → exact pg-boss job name (for last-run lookup).
+ const EXPECTED_VENDORS: Array<{ name: string; jobName: string }> = [ + { name: "FiberMall", jobName: "scrape:pricing:fibermall" }, + { name: "QSFPTEK", jobName: "scrape:pricing:qsfptek" }, + { name: "Flexoptix", jobName: "scrape:pricing:flexoptix" }, + { name: "FS.COM", jobName: "scrape:pricing:fs" }, + { name: "10Gtek", jobName: "scrape:pricing:10gtek" }, + { name: "ATGBICS", jobName: "scrape:pricing:atgbics" }, + { name: "GBICS", jobName: "scrape:pricing:gbics" }, + { name: "SFPcables", jobName: "scrape:pricing:sfpcables" }, + { name: "NADDOD", jobName: "scrape:pricing:naddod" }, ]; - const result = await pool.query(` + const vendorNames = EXPECTED_VENDORS.map((v) => v.name); + const jobNames = EXPECTED_VENDORS.map((v) => v.jobName); + + // Price observation recency per vendor + const priceResult = await pool.query(` SELECT v.name, SUM(CASE WHEN po.time > NOW() - INTERVAL '6 hours' THEN 1 ELSE 0 END) AS prices_6h, MAX(po.time) AS last_seen, @@ -703,26 +713,75 @@ export async function registerWorkers(boss: PgBoss): Promise { WHERE v.name = ANY($1) GROUP BY v.name ORDER BY last_seen ASC NULLS FIRST - `, [EXPECTED_VENDORS]); + `, [vendorNames]); - const problems: string[] = []; - for (const row of result.rows) { + // Most recent pg-boss job per vendor scraper, in any state (created within last 12h) + const jobResult = await pool.query(` + SELECT DISTINCT ON (name) name, state, completed_on + FROM pgboss.job + WHERE name = ANY($1) + AND created_on > NOW() - INTERVAL '12 hours' + ORDER BY name, created_on DESC + `, [jobNames]); + + const jobMap = new Map(); + for (const row of jobResult.rows) { + jobMap.set(row.name as string, { state: row.state as string, completed_on: row.completed_on as Date | null }); + } + + // Thresholds for alerting: + // CRITICAL (🔴): last price > 168h (7 days) — genuinely broken + // WARNING (🟡): last price > 48h (2 days) — possibly stale + // STABLE (✅): 0 new prices but last price ≤48h — prices unchanged, scraper OK + const CRITICAL_HOURS = 168; + const
WARN_HOURS = 48; + + const critical: string[] = []; + const warnings: string[] = []; + const stable: string[] = []; + + for (const row of priceResult.rows) { const h = parseFloat(row.hours_since ?? "9999"); const n = parseInt(row.prices_6h ?? "0", 10); - if (n === 0) { - const lastStr = row.last_seen - ? `last seen ${h.toFixed(1)}h ago (${new Date(row.last_seen).toISOString().slice(0, 16)})` - : "NEVER scraped"; - problems.push(`⚠ ${row.name}: 0 prices in last 6h — ${lastStr}`); + if (n > 0) continue; // new prices written → healthy + + const lastStr = row.last_seen + ? `last price ${h.toFixed(1)}h ago (${new Date(row.last_seen as string).toISOString().slice(0, 16)})` + : "NEVER scraped"; + + const vendor = EXPECTED_VENDORS.find((v) => v.name === row.name); + const jobInfo = vendor ? jobMap.get(vendor.jobName) : undefined; + const jobStr = jobInfo + ? ` | job=${jobInfo.state} at ${jobInfo.completed_on ? new Date(jobInfo.completed_on).toISOString().slice(11, 16) : "?"}` + : " | job=not run in 12h"; + + if (!row.last_seen || h > CRITICAL_HOURS) { + critical.push(`🔴 ${row.name}: ${lastStr}${jobStr}`); + } else if (h > WARN_HOURS) { + warnings.push(`🟡 ${row.name}: ${lastStr}${jobStr}`); + } else { + stable.push(`✅ ${row.name}: prices stable (${h.toFixed(1)}h unchanged)${jobStr}`); } } - if (problems.length > 0) { - console.error("=== SCRAPER HEALTH ALERT ==="); - for (const p of problems) console.error(p); - console.error("=== Check pm2 logs tip-scraper-daemon ==="); + if (critical.length > 0 || warnings.length > 0) { + if (critical.length > 0) { + console.error("=== 🔴 SCRAPER CRITICAL — vendors with no prices for 7+ days ==="); + for (const p of critical) console.error(p); + } + if (warnings.length > 0) { + console.warn("=== 🟡 SCRAPER WARNING — vendors with stale prices (48h+) ==="); + for (const p of warnings) console.warn(p); + } + console.error("=== Check: pm2 logs tip-scraper-daemon ==="); } else { - console.log(`[monitor] Scraper health OK — all 
${EXPECTED_VENDORS.length} vendors active in last 6h`); + const activeCount = EXPECTED_VENDORS.length - stable.length; + if (stable.length > 0) { + console.log(`[monitor] Scraper health OK — ${activeCount} vendors active, ${stable.length} stable (no price changes)`); + for (const s of stable) console.log(` ${s}`); + } else { + console.log(`[monitor] Scraper health OK — all ${EXPECTED_VENDORS.length} vendors active in last 6h`); + } } });