diff --git a/packages/scraper/src/utils/crawlee-config.ts b/packages/scraper/src/utils/crawlee-config.ts index 42611a2..5aa02bd 100644 --- a/packages/scraper/src/utils/crawlee-config.ts +++ b/packages/scraper/src/utils/crawlee-config.ts @@ -29,7 +29,7 @@ import { Configuration } from "crawlee"; import { join } from "node:path"; -import { mkdirSync, existsSync, writeFileSync } from "node:fs"; +import { mkdirSync, existsSync, writeFileSync, rmSync } from "node:fs"; /** Absolute path to the per-scraper Crawlee storage root on disk. */ export function crawleeStorageDir(scraperName: string): string { @@ -50,6 +50,14 @@ export function crawleeStorageDir(scraperName: string): string { export function makeCrawleeConfig(scraperName: string): Configuration { const storageDir = crawleeStorageDir(scraperName); + // Wipe the request queue from the previous run so Crawlee doesn't skip URLs + // that were already marked as HANDLED (state=4, orderNo=null). The queue + // persists between runs because purgeOnStart is false. Without this clear, + // crawler.run(startUrls) deduplicates all start URLs → requestsTotal=0 → + // immediate finish with 0 scraped pages. Session pool state lives in + // key_value_stores/ (not request_queues/), so wiping the queue is safe. + rmSync(join(storageDir, "request_queues", "default"), { recursive: true, force: true }); + // Pre-create internal directory tree mkdirSync(join(storageDir, "request_queues", "default"), { recursive: true }); mkdirSync(join(storageDir, "datasets", "default"), { recursive: true });