From 5d09b954f56a9a000dea1cade6c4e40bf5c0230f Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Mon, 20 Apr 2026 23:35:02 +0200 Subject: [PATCH] perf: load-aware scraper guard + higher rate limits + /tmp crawlee storage --- packages/scraper/src/scheduler.ts | 23 ++++++++++++ .../scraper/src/scrapers/flexoptix-compat.ts | 2 +- .../src/scrapers/switch-image-fetcher.ts | 2 +- packages/scraper/src/utils/crawlee-config.ts | 36 ++++++++++++++++--- 4 files changed, 57 insertions(+), 6 deletions(-) diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index c56c084..20da0bf 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -21,10 +21,25 @@ import PgBoss from "pg-boss"; import { config } from "dotenv"; import { join } from "path"; +import { loadavg } from "os"; // withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig() // for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts +/** + * Load-aware guard — skip heavy scrapers when the server is already busy. + * Uses the 1-minute load average; maxLoad defaults to 2.5 (50% of 5 vCPUs). + * Logs a warning and returns false when load is too high. 
+ */ +function isLoadAcceptable(maxLoad = 2.5): boolean { + const [avg1] = loadavg(); + if (avg1 > maxLoad) { + console.warn(`[load-guard] 1m load avg ${avg1.toFixed(2)} > ${maxLoad} — deferring heavy scraper`); + return false; + } + return true; +} + config({ path: join(__dirname, "..", "..", "..", ".env") }); const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`; @@ -474,6 +489,10 @@ export async function registerWorkers(boss: PgBoss): Promise { await boss.work("scrape:compat:flexoptix", async () => { console.log(`[${new Date().toISOString()}] Running: Flexoptix compatibility mapping`); + if (!isLoadAcceptable(2.5)) { + console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping Flexoptix compat scrape`); + return; + } await scrapeFlexoptixCompatibility(); }); @@ -511,6 +530,10 @@ export async function registerWorkers(boss: PgBoss): Promise { await boss.work("scrape:images:switches", async () => { console.log(`[${new Date().toISOString()}] Running: Switch og:image fetcher`); + if (!isLoadAcceptable(2.5)) { + console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping switch image fetch`); + return; + } await fetchSwitchImages(); }); diff --git a/packages/scraper/src/scrapers/flexoptix-compat.ts b/packages/scraper/src/scrapers/flexoptix-compat.ts index e5ac083..d191905 100644 --- a/packages/scraper/src/scrapers/flexoptix-compat.ts +++ b/packages/scraper/src/scrapers/flexoptix-compat.ts @@ -133,7 +133,7 @@ export async function scrapeFlexoptixCompatibility(): Promise { const addedForSwitch: string[] = []; // ── Strategy 1: Search Flexoptix by switch model ────────────────────── - await sleep(1500); + await sleep(3000); // 1 req/3s — server-friendly rate limit const suggestions = await searchFlexoptix(sw.model); const matchedBySku = new Set(); 
diff --git a/packages/scraper/src/scrapers/switch-image-fetcher.ts b/packages/scraper/src/scrapers/switch-image-fetcher.ts index 4f96917..597cca3 100644 --- a/packages/scraper/src/scrapers/switch-image-fetcher.ts +++ b/packages/scraper/src/scrapers/switch-image-fetcher.ts @@ -332,7 +332,7 @@ export async function fetchSwitchImages(targetVendorSlug?: string): Promise/ (NOT next to the repo). + * /tmp is tmpfs → survives process restarts but not server reboots. + * Keeps build artefacts and persistent storage separate. + */ +const CRAWLEE_TMP_ROOT = process.env.CRAWLEE_TMP_ROOT ?? "/tmp/tip-crawlers"; /** Absolute path to the per-scraper Crawlee storage root on disk. */ export function crawleeStorageDir(scraperName: string): string { - // dist layout: packages/scraper/dist/utils/ → go 4 levels up → repo root - // Then store beside packages/ as storage-/ - return join(__dirname, "..", "..", "..", "..", `storage-${scraperName}`); + return join(CRAWLEE_TMP_ROOT, scraperName); +} + +/** + * Remove crawler temp dirs older than `maxAgeMs` (default 48 h). + * Called automatically on each scraper start — keeps /tmp clean. 
+ */ +export function cleanCrawleeTempDirs(maxAgeMs = 48 * 60 * 60 * 1_000): void { + if (!existsSync(CRAWLEE_TMP_ROOT)) return; + const cutoff = Date.now() - maxAgeMs; + try { + for (const entry of readdirSync(CRAWLEE_TMP_ROOT)) { + const full = join(CRAWLEE_TMP_ROOT, entry); + try { + const st = statSync(full); + if (st.isDirectory() && st.mtimeMs < cutoff) { + rmSync(full, { recursive: true, force: true }); + } + } catch { /* skip */ } + } + } catch { /* skip if the temp root cannot be listed */ } } /** @@ -50,6 +75,9 @@ export function crawleeStorageDir(scraperName: string): string { export function makeCrawleeConfig(scraperName: string): Configuration { const storageDir = crawleeStorageDir(scraperName); + // Clean up stale tmp dirs (older than 48 h, any scraper's) before starting + cleanCrawleeTempDirs(); + // Wipe the request queue from the previous run so Crawlee doesn't skip URLs // that were already marked as HANDLED (state=4, orderNo=null). The queue // persists between runs because purgeOnStart is false. Without this clear,