fix(crawlee-config): clear the request queue on each run
Crawlee's FileSystemStorage marks request URLs as HANDLED (state=4, orderNo=null) after processing. With purgeOnStart=false these entries persist across runs, so on the next run crawler.run(startUrls) deduplicates every start URL → requestsTotal=0 → the crawler finishes immediately with 0 scraped pages. Fix: makeCrawleeConfig() now calls rmSync on request_queues/default/ before it re-creates the storage directory tree, so every run starts with an empty queue. This is safe because session pool state lives in key_value_stores/, not in request_queues/. Affects all Crawlee-based scrapers (ATGBICS, Optcore, Switch-assets, etc.).
This commit is contained in:
parent
1378a9bee8
commit
e11e351f5e
@ -29,7 +29,7 @@
|
|||||||
|
|
||||||
import { Configuration } from "crawlee";
|
import { Configuration } from "crawlee";
|
||||||
import { join } from "node:path";
|
import { join } from "node:path";
|
||||||
import { mkdirSync, existsSync, writeFileSync } from "node:fs";
|
import { mkdirSync, existsSync, writeFileSync, rmSync } from "node:fs";
|
||||||
|
|
||||||
/** Absolute path to the per-scraper Crawlee storage root on disk. */
|
/** Absolute path to the per-scraper Crawlee storage root on disk. */
|
||||||
export function crawleeStorageDir(scraperName: string): string {
|
export function crawleeStorageDir(scraperName: string): string {
|
||||||
@ -50,6 +50,14 @@ export function crawleeStorageDir(scraperName: string): string {
|
|||||||
export function makeCrawleeConfig(scraperName: string): Configuration {
|
export function makeCrawleeConfig(scraperName: string): Configuration {
|
||||||
const storageDir = crawleeStorageDir(scraperName);
|
const storageDir = crawleeStorageDir(scraperName);
|
||||||
|
|
||||||
|
// Wipe the request queue from the previous run so Crawlee doesn't skip URLs
|
||||||
|
// that were already marked as HANDLED (state=4, orderNo=null). The queue
|
||||||
|
// persists between runs because purgeOnStart is false. Without this clear,
|
||||||
|
// crawler.run(startUrls) deduplicates all start URLs → requestsTotal=0 →
|
||||||
|
// immediate finish with 0 scraped pages. Session pool state lives in
|
||||||
|
// key_value_stores/ (not request_queues/), so wiping the queue is safe.
|
||||||
|
rmSync(join(storageDir, "request_queues", "default"), { recursive: true, force: true });
|
||||||
|
|
||||||
// Pre-create internal directory tree
|
// Pre-create internal directory tree
|
||||||
mkdirSync(join(storageDir, "request_queues", "default"), { recursive: true });
|
mkdirSync(join(storageDir, "request_queues", "default"), { recursive: true });
|
||||||
mkdirSync(join(storageDir, "datasets", "default"), { recursive: true });
|
mkdirSync(join(storageDir, "datasets", "default"), { recursive: true });
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user