fix(scraper): set CRAWLEE_PURGE_ON_START=1 in withIsolatedStorage

Crawlee's SessionPool throws 'Could not find SDK_SESSION_POOL_STATE.json'
when it initializes against a freshly created isolated storage dir.
Setting CRAWLEE_PURGE_ON_START=1 tells Crawlee to start fresh instead
of trying to load non-existent session state. This fixes the FS.com and
ATGBICS crashes at the start of every 2h cycle after the isolated storage
dirs were cleaned up.
This commit is contained in:
Rene Fichtmueller 2026-04-11 07:27:24 +02:00
parent 45c48755e4
commit 148d2e1000

View File

@ -31,11 +31,16 @@ async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promi
mkdirSync(join(dir, "datasets", "default"), { recursive: true }); mkdirSync(join(dir, "datasets", "default"), { recursive: true });
mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true }); mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true });
const prev = process.env.CRAWLEE_STORAGE_DIR; const prev = process.env.CRAWLEE_STORAGE_DIR;
const prevPurge = process.env.CRAWLEE_PURGE_ON_START;
process.env.CRAWLEE_STORAGE_DIR = dir; process.env.CRAWLEE_STORAGE_DIR = dir;
// Force Crawlee to initialize fresh — prevents "Could not find SDK_SESSION_POOL_STATE.json"
// when the isolated storage dir was just created and has no pre-existing state files.
process.env.CRAWLEE_PURGE_ON_START = "1";
try { try {
await fn(); await fn();
} finally { } finally {
process.env.CRAWLEE_STORAGE_DIR = prev ?? ""; process.env.CRAWLEE_STORAGE_DIR = prev ?? "";
process.env.CRAWLEE_PURGE_ON_START = prevPurge ?? "";
// Clean up after successful run // Clean up after successful run
try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ } try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
} }