From e11e351f5e46d9b742f6ccfd39757e8f72889137 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 18 Apr 2026 05:37:45 +0200 Subject: [PATCH] fix: crawlee-config clear request queue on each run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crawlee's FileSystemStorage marks request URLs as HANDLED (state=4, orderNo=null) after processing. With purgeOnStart=false these entries persist, so on the next run crawler.run(startUrls) deduplicates them → requestsTotal=0 → immediate finish with 0 scraped pages. Fix: rmSync request_queues/default/ before each makeCrawleeConfig() call. Safe: session pool state lives in key_value_stores/, not in request_queues/. Affects all Crawlee-based scrapers (ATGBICS, Optcore, Switch-assets, etc.). --- packages/scraper/src/utils/crawlee-config.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/packages/scraper/src/utils/crawlee-config.ts b/packages/scraper/src/utils/crawlee-config.ts index 42611a2..5aa02bd 100644 --- a/packages/scraper/src/utils/crawlee-config.ts +++ b/packages/scraper/src/utils/crawlee-config.ts @@ -29,7 +29,7 @@ import { Configuration } from "crawlee"; import { join } from "node:path"; -import { mkdirSync, existsSync, writeFileSync } from "node:fs"; +import { mkdirSync, existsSync, writeFileSync, rmSync } from "node:fs"; /** Absolute path to the per-scraper Crawlee storage root on disk. */ export function crawleeStorageDir(scraperName: string): string { @@ -50,6 +50,14 @@ export function crawleeStorageDir(scraperName: string): string { export function makeCrawleeConfig(scraperName: string): Configuration { const storageDir = crawleeStorageDir(scraperName); + // Wipe the request queue from the previous run so Crawlee doesn't skip URLs + // that were already marked as HANDLED (state=4, orderNo=null). The queue + // persists between runs because purgeOnStart is false. Without this clear, + // crawler.run(startUrls) deduplicates all start URLs → requestsTotal=0 → + // immediate finish with 0 scraped pages. Session pool state lives in + // key_value_stores/ (not request_queues/), so wiping the queue is safe. + rmSync(join(storageDir, "request_queues", "default"), { recursive: true, force: true }); + // Pre-create internal directory tree mkdirSync(join(storageDir, "request_queues", "default"), { recursive: true }); mkdirSync(join(storageDir, "datasets", "default"), { recursive: true });