perf: load-aware scraper guard + higher rate limits + /tmp crawlee storage
This commit is contained in:
parent
a2492d833b
commit
823b64bd24
@ -21,10 +21,25 @@
|
|||||||
import PgBoss from "pg-boss";
|
import PgBoss from "pg-boss";
|
||||||
import { config } from "dotenv";
|
import { config } from "dotenv";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
|
import { loadavg } from "os";
|
||||||
|
|
||||||
// withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig()
|
// withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig()
|
||||||
// for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts
|
// for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts
|
||||||
|
|
||||||
|
/**
 * Load-aware guard — reports whether the host is idle enough to start a heavy scraper.
 *
 * Compares the 1-minute load average from `os.loadavg()` against `maxLoad`.
 * When the load exceeds the threshold, a warning is logged and `false` is returned.
 *
 * Note: `os.loadavg()` always reports `[0, 0, 0]` on Windows, so the guard never
 * trips on that platform.
 *
 * @param maxLoad - Highest acceptable 1-minute load average. Defaults to 2.5
 *   (50% of 5 vCPUs).
 * @returns `true` when the current load is at or below `maxLoad`, otherwise `false`.
 */
function isLoadAcceptable(maxLoad = 2.5): boolean {
  // `loadavg()` is typed as `number[]`; under `noUncheckedIndexedAccess` the first
  // element is `number | undefined`, so default it to 0 (i.e. "not busy").
  const [avg1 = 0] = loadavg();

  if (avg1 > maxLoad) {
    console.warn(`[load-guard] 1m load avg ${avg1.toFixed(2)} > ${maxLoad} — deferring heavy scraper`);
    return false;
  }

  return true;
}
|
||||||
|
|
||||||
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
||||||
|
|
||||||
const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`;
|
const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`;
|
||||||
@ -474,6 +489,10 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
|
|
||||||
await boss.work("scrape:compat:flexoptix", async () => {
|
await boss.work("scrape:compat:flexoptix", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Flexoptix compatibility mapping`);
|
console.log(`[${new Date().toISOString()}] Running: Flexoptix compatibility mapping`);
|
||||||
|
if (!isLoadAcceptable(2.5)) {
|
||||||
|
console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping Flexoptix compat scrape`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
await scrapeFlexoptixCompatibility();
|
await scrapeFlexoptixCompatibility();
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -511,6 +530,10 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
|
|
||||||
await boss.work("scrape:images:switches", async () => {
|
await boss.work("scrape:images:switches", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Switch og:image fetcher`);
|
console.log(`[${new Date().toISOString()}] Running: Switch og:image fetcher`);
|
||||||
|
if (!isLoadAcceptable(2.5)) {
|
||||||
|
console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping switch image fetch`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
await fetchSwitchImages();
|
await fetchSwitchImages();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@ -133,7 +133,7 @@ export async function scrapeFlexoptixCompatibility(): Promise<void> {
|
|||||||
const addedForSwitch: string[] = [];
|
const addedForSwitch: string[] = [];
|
||||||
|
|
||||||
// ── Strategy 1: Search Flexoptix by switch model ──────────────────────
|
// ── Strategy 1: Search Flexoptix by switch model ──────────────────────
|
||||||
await sleep(1500);
|
await sleep(3000); // 1 req/3s — server-friendly rate limit
|
||||||
const suggestions = await searchFlexoptix(sw.model);
|
const suggestions = await searchFlexoptix(sw.model);
|
||||||
|
|
||||||
const matchedBySku = new Set<string>();
|
const matchedBySku = new Set<string>();
|
||||||
|
|||||||
@ -332,7 +332,7 @@ export async function fetchSwitchImages(targetVendorSlug?: string): Promise<void
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
await sleep(2000); // 1 req/2sec
|
await sleep(3500); // 1 req/3.5s — server-friendly rate limit
|
||||||
|
|
||||||
const html = await fetchPageHtml(productUrl);
|
const html = await fetchPageHtml(productUrl);
|
||||||
if (!html) {
|
if (!html) {
|
||||||
|
|||||||
@ -29,13 +29,38 @@
|
|||||||
|
|
||||||
import { Configuration } from "crawlee";
|
import { Configuration } from "crawlee";
|
||||||
import { join } from "node:path";
|
import { join } from "node:path";
|
||||||
import { mkdirSync, existsSync, writeFileSync, rmSync } from "node:fs";
|
import { mkdirSync, existsSync, writeFileSync, rmSync, readdirSync, statSync } from "node:fs";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Crawlee scratch space lives in /tmp/tip-crawlers/<name>/ (NOT next to the repo).
|
||||||
|
* /tmp is typically wiped on reboot (and is RAM-backed tmpfs on some systems) → contents survive process restarts but not server reboots.
|
||||||
|
* Keeps build artefacts and persistent storage separate.
|
||||||
|
*/
|
||||||
|
const CRAWLEE_TMP_ROOT = process.env.CRAWLEE_TMP_ROOT ?? "/tmp/tip-crawlers";
|
||||||
|
|
||||||
/**
 * Resolve the absolute on-disk Crawlee storage root for a single scraper.
 *
 * The result is `<CRAWLEE_TMP_ROOT>/<scraperName>`; this function only builds
 * the path and does not create the directory.
 */
export function crawleeStorageDir(scraperName: string): string {
  const scraperRoot = join(CRAWLEE_TMP_ROOT, scraperName);
  return scraperRoot;
}
|
|
||||||
|
/**
 * Best-effort removal of stale crawler scratch directories under `CRAWLEE_TMP_ROOT`.
 *
 * A direct child directory is deleted recursively when its own mtime is older
 * than `maxAgeMs`. Plain files and symlinks are left untouched. This function
 * never throws: a path that disappears mid-scan (or a missing root) is ignored,
 * and any other failure is logged as a warning so that a cleanup which stops
 * working does not go unnoticed. `makeCrawleeConfig` calls this before each
 * scraper start.
 *
 * NOTE(review): only the top-level directory's mtime is inspected, which changes
 * when its immediate children are added or removed — confirm that this tracks
 * scraper activity closely enough for long-running crawls.
 *
 * @param maxAgeMs - Minimum age in milliseconds before a directory is removed
 *   (default 48 h).
 */
export function cleanCrawleeTempDirs(maxAgeMs = 48 * 60 * 60 * 1_000): void {
  const cutoff = Date.now() - maxAgeMs;

  // ENOENT just means "already gone" (or the root was never created) — not worth reporting.
  const isMissingPath = (err: unknown): boolean =>
    err instanceof Error && (err as NodeJS.ErrnoException).code === "ENOENT";
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  try {
    // `withFileTypes` lets us skip files and symlinks without a stat call per entry.
    for (const entry of readdirSync(CRAWLEE_TMP_ROOT, { withFileTypes: true })) {
      if (!entry.isDirectory()) continue;

      const full = join(CRAWLEE_TMP_ROOT, entry.name);
      try {
        if (statSync(full).mtimeMs < cutoff) {
          rmSync(full, { recursive: true, force: true });
        }
      } catch (err: unknown) {
        // Keep going: one undeletable directory must not block cleanup of the rest.
        if (!isMissingPath(err)) {
          console.warn(`[crawlee-tmp] could not clean ${full}: ${describe(err)}`);
        }
      }
    }
  } catch (err: unknown) {
    if (!isMissingPath(err)) {
      console.warn(`[crawlee-tmp] could not scan ${CRAWLEE_TMP_ROOT}: ${describe(err)}`);
    }
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -50,6 +75,9 @@ export function crawleeStorageDir(scraperName: string): string {
|
|||||||
export function makeCrawleeConfig(scraperName: string): Configuration {
|
export function makeCrawleeConfig(scraperName: string): Configuration {
|
||||||
const storageDir = crawleeStorageDir(scraperName);
|
const storageDir = crawleeStorageDir(scraperName);
|
||||||
|
|
||||||
|
// Clean up stale tmp dirs from other scrapers before starting
|
||||||
|
cleanCrawleeTempDirs();
|
||||||
|
|
||||||
// Wipe the request queue from the previous run so Crawlee doesn't skip URLs
|
// Wipe the request queue from the previous run so Crawlee doesn't skip URLs
|
||||||
// that were already marked as HANDLED (state=4, orderNo=null). The queue
|
// that were already marked as HANDLED (state=4, orderNo=null). The queue
|
||||||
// persists between runs because purgeOnStart is false. Without this clear,
|
// persists between runs because purgeOnStart is false. Without this clear,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user