fix(crawlee-config): clear request queue on each run
Crawlee's FileSystemStorage marks request URLs as HANDLED (state=4, orderNo=null) after processing. With purgeOnStart=false these entries persist, so on the next run crawler.run(startUrls) deduplicates all of the start URLs → requestsTotal=0 → the crawler finishes immediately with 0 scraped pages. Fix: makeCrawleeConfig() now removes request_queues/default/ (rmSync with recursive and force) before re-creating the storage directory tree, so every run starts with an empty queue. This is safe because session pool state lives in key_value_stores/, not in request_queues/. Affects all Crawlee-based scrapers (ATGBICS, Optcore, Switch-assets, etc.).
This commit is contained in:
parent
19ff1a779b
commit
1d79094872
@ -29,7 +29,7 @@
|
||||
|
||||
import { Configuration } from "crawlee";
|
||||
import { join } from "node:path";
|
||||
import { mkdirSync, existsSync, writeFileSync } from "node:fs";
|
||||
import { mkdirSync, existsSync, writeFileSync, rmSync } from "node:fs";
|
||||
|
||||
/** Absolute path to the per-scraper Crawlee storage root on disk. */
|
||||
export function crawleeStorageDir(scraperName: string): string {
|
||||
@ -50,6 +50,14 @@ export function crawleeStorageDir(scraperName: string): string {
|
||||
export function makeCrawleeConfig(scraperName: string): Configuration {
|
||||
const storageDir = crawleeStorageDir(scraperName);
|
||||
|
||||
// Wipe the request queue from the previous run so Crawlee doesn't skip URLs
|
||||
// that were already marked as HANDLED (state=4, orderNo=null). The queue
|
||||
// persists between runs because purgeOnStart is false. Without this clear,
|
||||
// crawler.run(startUrls) deduplicates all start URLs → requestsTotal=0 →
|
||||
// immediate finish with 0 scraped pages. Session pool state lives in
|
||||
// key_value_stores/ (not request_queues/), so wiping the queue is safe.
|
||||
rmSync(join(storageDir, "request_queues", "default"), { recursive: true, force: true });
|
||||
|
||||
// Pre-create internal directory tree
|
||||
mkdirSync(join(storageDir, "request_queues", "default"), { recursive: true });
|
||||
mkdirSync(join(storageDir, "datasets", "default"), { recursive: true });
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user