fix(crawlee-config): clear the request queue on each run

Crawlee's FileSystemStorage marks request URLs as HANDLED (state=4,
orderNo=null) after processing. With purgeOnStart=false these entries
persist across runs, so on the next run crawler.run(startUrls) treats
every start URL as a duplicate of an already-handled request
→ requestsTotal=0 → the crawl finishes immediately with 0 scraped pages.

Fix: rmSync request_queues/default/ at the start of every
makeCrawleeConfig() call, before the storage directory tree is
re-created. This is safe because session pool state lives in
key_value_stores/, not in request_queues/. Affects all Crawlee-based
scrapers (ATGBICS, Optcore, Switch-assets, etc.).
This commit is contained in:
Rene Fichtmueller 2026-04-18 05:37:45 +02:00
parent 1378a9bee8
commit e11e351f5e

View File

@ -29,7 +29,7 @@
import { Configuration } from "crawlee"; import { Configuration } from "crawlee";
import { join } from "node:path"; import { join } from "node:path";
import { mkdirSync, existsSync, writeFileSync } from "node:fs"; import { mkdirSync, existsSync, writeFileSync, rmSync } from "node:fs";
/** Absolute path to the per-scraper Crawlee storage root on disk. */ /** Absolute path to the per-scraper Crawlee storage root on disk. */
export function crawleeStorageDir(scraperName: string): string { export function crawleeStorageDir(scraperName: string): string {
@ -50,6 +50,14 @@ export function crawleeStorageDir(scraperName: string): string {
export function makeCrawleeConfig(scraperName: string): Configuration { export function makeCrawleeConfig(scraperName: string): Configuration {
const storageDir = crawleeStorageDir(scraperName); const storageDir = crawleeStorageDir(scraperName);
// Wipe the request queue from the previous run so Crawlee doesn't skip URLs
// that were already marked as HANDLED (state=4, orderNo=null). The queue
// persists between runs because purgeOnStart is false. Without this clear,
// crawler.run(startUrls) deduplicates all start URLs → requestsTotal=0 →
// immediate finish with 0 scraped pages. Session pool state lives in
// key_value_stores/ (not request_queues/), so wiping the queue is safe.
rmSync(join(storageDir, "request_queues", "default"), { recursive: true, force: true });
// Pre-create internal directory tree // Pre-create internal directory tree
mkdirSync(join(storageDir, "request_queues", "default"), { recursive: true }); mkdirSync(join(storageDir, "request_queues", "default"), { recursive: true });
mkdirSync(join(storageDir, "datasets", "default"), { recursive: true }); mkdirSync(join(storageDir, "datasets", "default"), { recursive: true });