From 5d09b954f56a9a000dea1cade6c4e40bf5c0230f Mon Sep 17 00:00:00 2001
From: Rene Fichtmueller <renefichtmueller@MacStudio-von-Rene-8.local>
Date: Mon, 20 Apr 2026 23:35:02 +0200
Subject: [PATCH] perf: load-aware scraper guard + higher rate limits + /tmp
 crawlee storage

---
 packages/scraper/src/scheduler.ts             | 23 ++++++++++++
 .../scraper/src/scrapers/flexoptix-compat.ts  |  2 +-
 .../src/scrapers/switch-image-fetcher.ts      |  2 +-
 packages/scraper/src/utils/crawlee-config.ts  | 36 ++++++++++++++++---
 4 files changed, 57 insertions(+), 6 deletions(-)
diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts
index c56c084..20da0bf 100644
--- a/packages/scraper/src/scheduler.ts
+++ b/packages/scraper/src/scheduler.ts
@@ -21,10 +21,25 @@
 import PgBoss from "pg-boss";
 import { config } from "dotenv";
 import { join } from "path";
+import { loadavg } from "os";
 
 // withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig()
 // for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts
 
+/**
+ * Load-aware guard — skip heavy scrapers when the server is already busy.
+ * Uses the 1-minute load average; maxLoad defaults to 2.5 (50% of 5 vCPUs).
+ * Logs a warning and returns false when load is too high.
+ */
+function isLoadAcceptable(maxLoad = 2.5): boolean {
+  const [avg1] = loadavg();
+  if (avg1 > maxLoad) {
+    console.warn(`[load-guard] 1m load avg ${avg1.toFixed(2)} > ${maxLoad} — deferring heavy scraper`);
+    return false;
+  }
+  return true;
+}
+
 config({ path: join(__dirname, "..", "..", "..", ".env") });
 
 const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`;
@@ -474,6 +489,10 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
 
   await boss.work("scrape:compat:flexoptix", async () => {
     console.log(`[${new Date().toISOString()}] Running: Flexoptix compatibility mapping`);
+    if (!isLoadAcceptable(2.5)) {
+      console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping Flexoptix compat scrape`);
+      return;
+    }
     await scrapeFlexoptixCompatibility();
   });
 
@@ -511,6 +530,10 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
 
   await boss.work("scrape:images:switches", async () => {
     console.log(`[${new Date().toISOString()}] Running: Switch og:image fetcher`);
+    if (!isLoadAcceptable(2.5)) {
+      console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping switch image fetch`);
+      return;
+    }
     await fetchSwitchImages();
   });
 
diff --git a/packages/scraper/src/scrapers/flexoptix-compat.ts b/packages/scraper/src/scrapers/flexoptix-compat.ts
index e5ac083..d191905 100644
--- a/packages/scraper/src/scrapers/flexoptix-compat.ts
+++ b/packages/scraper/src/scrapers/flexoptix-compat.ts
@@ -133,7 +133,7 @@ export async function scrapeFlexoptixCompatibility(): Promise<void> {
     const addedForSwitch: string[] = [];
 
     // ── Strategy 1: Search Flexoptix by switch model ──────────────────────
-    await sleep(1500);
+    await sleep(3000); // 1 req/3s — server-friendly rate limit
     const suggestions = await searchFlexoptix(sw.model);
 
     const matchedBySku = new Set<string>();
diff --git a/packages/scraper/src/scrapers/switch-image-fetcher.ts b/packages/scraper/src/scrapers/switch-image-fetcher.ts
index 4f96917..597cca3 100644
--- a/packages/scraper/src/scrapers/switch-image-fetcher.ts
+++ b/packages/scraper/src/scrapers/switch-image-fetcher.ts
@@ -332,7 +332,7 @@ export async function fetchSwitchImages(targetVendorSlug?: string): Promise<void
       continue;
     }
 
-    await sleep(2000); // 1 req/2sec
+    await sleep(3500); // 1 req/3.5s — server-friendly rate limit
 
     const html = await fetchPageHtml(productUrl);
     if (!html) {
diff --git a/packages/scraper/src/utils/crawlee-config.ts b/packages/scraper/src/utils/crawlee-config.ts
index 5aa02bd..8140d05 100644
--- a/packages/scraper/src/utils/crawlee-config.ts
+++ b/packages/scraper/src/utils/crawlee-config.ts
@@ -29,13 +29,38 @@
 
 import { Configuration } from "crawlee";
 import { join } from "node:path";
-import { mkdirSync, existsSync, writeFileSync, rmSync } from "node:fs";
+import { mkdirSync, existsSync, writeFileSync, rmSync, readdirSync, statSync } from "node:fs";
+
+/**
+ * Crawlee scratch space lives in /tmp/tip-crawlers/<name>/ (NOT next to the repo).
+ * /tmp is tmpfs → survives process restarts but not server reboots.
+ * Keeps build artefacts and persistent storage separate.
+ */
+const CRAWLEE_TMP_ROOT = process.env.CRAWLEE_TMP_ROOT ?? "/tmp/tip-crawlers";
 
 /** Absolute path to the per-scraper Crawlee storage root on disk. */
 export function crawleeStorageDir(scraperName: string): string {
-  // dist layout: packages/scraper/dist/utils/ → go 4 levels up → repo root
-  // Then store beside packages/ as storage-<name>/
-  return join(__dirname, "..", "..", "..", "..", `storage-${scraperName}`);
+  return join(CRAWLEE_TMP_ROOT, scraperName);
+}
+
+/**
+ * Remove crawler temp dirs older than `maxAgeMs` (default 48 h).
+ * Called automatically on each scraper start — keeps /tmp clean.
+ */
+export function cleanCrawleeTempDirs(maxAgeMs = 48 * 60 * 60 * 1_000): void {
+  if (!existsSync(CRAWLEE_TMP_ROOT)) return;
+  const cutoff = Date.now() - maxAgeMs;
+  try {
+    for (const entry of readdirSync(CRAWLEE_TMP_ROOT)) {
+      const full = join(CRAWLEE_TMP_ROOT, entry);
+      try {
+        const st = statSync(full);
+        if (st.isDirectory() && st.mtimeMs < cutoff) {
+          rmSync(full, { recursive: true, force: true });
+        }
+      } catch { /* skip */ }
+    }
+  } catch { /* skip if /tmp not writable */ }
 }
 
 /**
@@ -50,6 +75,9 @@ export function crawleeStorageDir(scraperName: string): string {
 export function makeCrawleeConfig(scraperName: string): Configuration {
   const storageDir = crawleeStorageDir(scraperName);
 
+  // Clean up stale tmp dirs from other scrapers before starting
+  cleanCrawleeTempDirs();
+
   // Wipe the request queue from the previous run so Crawlee doesn't skip URLs
   // that were already marked as HANDLED (state=4, orderNo=null). The queue
   // persists between runs because purgeOnStart is false. Without this clear,