From a4460b9dcaeb4bbeb8bc4700397cdc66477cdbb2 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Sat, 18 Apr 2026 02:19:53 +0200 Subject: [PATCH] fix: remove all withIsolatedStorage wrappers, add makeCrawleeConfig to remaining Crawlee scrapers - scheduler.ts: remove withIsolatedStorage from ALL scrapers (atgbics, optcore, ufispace, edgecore, ebay-*, market-intel, community-issues, cisco, juniper, sonic, 10gtek, prolabs, switch-assets, fs) eliminates global CRAWLEE_STORAGE_DIR race condition entirely - fs-com.ts: replace purgeDefaultStorages() with rmSync on isolated storage dirs (fs-phase1, fs-phase2); pass makeCrawleeConfig to both PlaywrightCrawler instances - switch-assets-crawler.ts: add makeCrawleeConfig('switch-assets') - switch-assets-playwright.ts: add makeCrawleeConfig('switch-assets-playwright') - naddod.ts: restore clean error logging (remove debug instrumentation) --- packages/scraper/src/scheduler.ts | 57 ++++++------------- packages/scraper/src/scrapers/fs-com.ts | 18 +++--- packages/scraper/src/scrapers/naddod.ts | 8 +-- .../src/scrapers/switch-assets-crawler.ts | 3 +- .../src/scrapers/switch-assets-playwright.ts | 3 +- 5 files changed, 35 insertions(+), 54 deletions(-) diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index 38a165d..8713c90 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -21,32 +21,9 @@ import PgBoss from "pg-boss"; import { config } from "dotenv"; import { join } from "path"; -import { mkdirSync, existsSync, writeFileSync } from "fs"; -/** Run a scraper with an isolated Crawlee storage directory to prevent queue collisions */ -async function withIsolatedStorage(name: string, fn: () => Promise): Promise { - const dir = join(__dirname, "..", "..", "..", `storage-${name}`); - // Pre-create Crawlee's internal subdirectory tree to avoid ENOENT races - mkdirSync(join(dir, "request_queues", "default"), { recursive: true }); - mkdirSync(join(dir, "datasets", "default"), { recursive: true }); - mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true }); - // Pre-seed session pool state file to prevent "Could not find file" crash - // on first run (Crawlee reads this before writing it on some versions) - const sessionFile = join(dir, "key_value_stores", "default", "SDK_SESSION_POOL_STATE.json"); - if (!existsSync(sessionFile)) { - writeFileSync(sessionFile, JSON.stringify({ usableSessionsCount: 0, retiredSessionsCount: 0, sessions: [] })); - } - const prev = process.env.CRAWLEE_STORAGE_DIR; - process.env.CRAWLEE_STORAGE_DIR = dir; - // Do NOT set CRAWLEE_PURGE_ON_START — let Crawlee reuse session pool state - // between runs (better scraping, no "SDK_SESSION_POOL_STATE.json not found" crashes). - // The dir is intentionally kept between runs so Crawlee can persist its state. - try { - await fn(); - } finally { - process.env.CRAWLEE_STORAGE_DIR = prev ?? ""; - } -} +// withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig() +// for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts config({ path: join(__dirname, "..", "..", "..", ".env") }); @@ -350,22 +327,22 @@ export async function registerWorkers(boss: PgBoss): Promise { await boss.work("scrape:pricing:fs", async () => { console.log(`[${new Date().toISOString()}] Running: FS.com pricing`); - await withIsolatedStorage("fs", scrapeFs); + await scrapeFs(); }); await boss.work("scrape:pricing:10gtek", async () => { console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`); - await withIsolatedStorage("10gtek", scrape10Gtek); + await scrape10Gtek(); }); await boss.work("scrape:pricing:atgbics", async () => { console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`); - await withIsolatedStorage("atgbics", scrapeAtgbics); + await scrapeAtgbics(); }); await boss.work("scrape:pricing:prolabs", async () => { console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`); - await withIsolatedStorage("prolabs", scrapeProLabs); + await scrapeProLabs(); }); // ── Lightweight fetch/cheerio scrapers ─────────────────────────────── @@ -384,7 +361,7 @@ export async function registerWorkers(boss: PgBoss): Promise { await boss.work("scrape:pricing:optcore", async () => { console.log(`[${new Date().toISOString()}] Running: Optcore pricing`); const { scrapeOptcore } = await import("./scrapers/optcore"); - await withIsolatedStorage("optcore", scrapeOptcore); + await scrapeOptcore(); }); await boss.work("scrape:pricing:champion-one", async () => { @@ -474,34 +451,34 @@ export async function registerWorkers(boss: PgBoss): Promise { await boss.work("scrape:compat:cisco", async () => { console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`); - await withIsolatedStorage("cisco", scrapeCiscoTmg); + await scrapeCiscoTmg(); }); await boss.work("scrape:compat:juniper", async () => { console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`); - await withIsolatedStorage("juniper", scrapeJuniperHct); + await scrapeJuniperHct(); }); await boss.work("scrape:compat:sonic", async () => { console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`); - await withIsolatedStorage("sonic", scrapeSonicHcl); + await scrapeSonicHcl(); }); await boss.work("scrape:compat:ufispace", async () => { console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`); - await withIsolatedStorage("ufispace", scrapeUfiSpace); + await scrapeUfiSpace(); }); await boss.work("scrape:compat:edgecore", async () => { console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`); - await withIsolatedStorage("edgecore", scrapeEdgecore); + await scrapeEdgecore(); }); // ── Switch assets ───────────────────────────────────────────────────── await boss.work("scrape:assets:switches", async () => { console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`); - await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets()); + await scrapeSwitchAssets(); }); // ── eBay enrichment ─────────────────────────────────────────────────── @@ -509,20 +486,20 @@ export async function registerWorkers(boss: PgBoss): Promise { await boss.work("enrich:ebay-transceivers", async () => { console.log(`[${new Date().toISOString()}] Running: eBay transceiver pricing`); const { enrichTransceiversFromEbay } = await import("./scrapers/ebay-enricher"); - await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100)); + await enrichTransceiversFromEbay(100); }); await boss.work("enrich:ebay-switches", async () => { console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`); const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher"); - await withIsolatedStorage("ebay-switches", () => enrichSwitchesFromEbay(30)); + await enrichSwitchesFromEbay(30); }); // ── Intelligence & community ────────────────────────────────────────── await boss.work("scrape:market-intel", async () => { console.log(`[${new Date().toISOString()}] Running: Market intelligence`); - await withIsolatedStorage("market-intel", scrapeMarketIntelligence); + await scrapeMarketIntelligence(); }); await boss.work("scrape:nog-talks", async () => { @@ -534,7 +511,7 @@ export async function registerWorkers(boss: PgBoss): Promise { await boss.work("scrape:community-issues", async () => { console.log(`[${new Date().toISOString()}] Running: Community issues`); const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues"); - await withIsolatedStorage("community-issues", () => scrapeAllSwitchIssues(30)); + await scrapeAllSwitchIssues(30); }); await boss.work("scrape:datasheet-links", async () => { diff --git a/packages/scraper/src/scrapers/fs-com.ts b/packages/scraper/src/scrapers/fs-com.ts index 6744d3e..d6fbf43 100644 --- a/packages/scraper/src/scrapers/fs-com.ts +++ b/packages/scraper/src/scrapers/fs-com.ts @@ -13,7 +13,9 @@ * * Respects robots.txt and rate limits (≤12 req/min listing, ≤10 req/min detail). */ -import { PlaywrightCrawler, ProxyConfiguration, purgeDefaultStorages } from "crawlee"; +import { PlaywrightCrawler, ProxyConfiguration } from "crawlee"; +import { makeCrawleeConfig, crawleeStorageDir } from "../utils/crawlee-config"; +import { rmSync } from "node:fs"; import type { Page } from "playwright"; /** Apply stealth patches to evade Cloudflare TLS/bot fingerprinting */ @@ -268,8 +270,9 @@ interface ProductDetail extends ProductSummary { async function collectProductUrls( proxyConfiguration: ProxyConfiguration | undefined ): Promise> { - // Purge any leftover Crawlee storage from previous runs - await purgeDefaultStorages(); + // Purge leftover request queue from previous runs (instance-isolated storage) + const fsPhase1Dir = crawleeStorageDir("fs-phase1"); + try { rmSync(fsPhase1Dir, { recursive: true, force: true }); } catch { /* ignore */ } const products = new Map(); const exhausted = new Set(); @@ -349,7 +352,7 @@ async function collectProductUrls( } } }, - }); + }, makeCrawleeConfig("fs-phase1")); await crawler.run(listingRequests); console.log(`[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`); @@ -362,8 +365,9 @@ async function scrapeProductDetails( requests: Array<{ url: string; userData: { name: string; partNumber: string } }>, proxyConfiguration: ProxyConfiguration | undefined ): Promise { - // Purge Phase 1 storage so Phase 2 starts with a clean request queue - await purgeDefaultStorages(); + // Purge Phase 2 storage so it starts with a clean request queue + const fsPhase2Dir = crawleeStorageDir("fs-phase2"); + try { rmSync(fsPhase2Dir, { recursive: true, force: true }); } catch { /* ignore */ } const details: ProductDetail[] = []; const crawler = new PlaywrightCrawler({ @@ -610,7 +614,7 @@ async function scrapeProductDetails( datasheetUrl: resolveUrl(raw.datasheetUrl), }); }, - }); + }, makeCrawleeConfig("fs-phase2")); await crawler.run(requests); return details; diff --git a/packages/scraper/src/scrapers/naddod.ts b/packages/scraper/src/scrapers/naddod.ts index 8dc5ebc..7e92204 100644 --- a/packages/scraper/src/scrapers/naddod.ts +++ b/packages/scraper/src/scrapers/naddod.ts @@ -275,15 +275,13 @@ export async function scrapeNaddod(): Promise { // Price observation if (price && price > 0) { const hash = contentHash({ price, part: partNumber }); - const sl: string = stock?.qty !== undefined && stock.qty > 0 ? "in_stock" : - stock?.confidence === 1 ? "in_stock" : "on_request"; - console.log(` [DEBUG] upsertPrice: txId=${txId} price=${price} stockLevel=${sl} hash=${hash.slice(0,8)}`); const isNew = await upsertPriceObservation({ transceiverId: txId, sourceVendorId: vendorId, price, currency: "USD", - stockLevel: sl, + stockLevel: stock?.qty !== undefined && stock.qty > 0 ? "in_stock" : + stock?.confidence === 1 ? "in_stock" : "on_request", url, contentHash: hash, }); @@ -313,7 +311,7 @@ export async function scrapeNaddod(): Promise { } } catch (err) { errors++; - if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 300)}`); + if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 120)}`); } } diff --git a/packages/scraper/src/scrapers/switch-assets-crawler.ts b/packages/scraper/src/scrapers/switch-assets-crawler.ts index d9bde58..51274c1 100644 --- a/packages/scraper/src/scrapers/switch-assets-crawler.ts +++ b/packages/scraper/src/scrapers/switch-assets-crawler.ts @@ -12,6 +12,7 @@ */ import { CheerioCrawler, Dataset } from "crawlee"; import { pool } from "../utils/db"; +import { makeCrawleeConfig } from "../utils/crawlee-config"; import { downloadSwitchImage, downloadSwitchDatasheet, @@ -319,7 +320,7 @@ export async function crawlSwitchAssets(targetVendor?: string): Promise { const target = request.userData as CrawlTarget; console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`); }, - }); + }, makeCrawleeConfig("switch-assets")); await crawler.run( targets.map((t) => ({ diff --git a/packages/scraper/src/scrapers/switch-assets-playwright.ts b/packages/scraper/src/scrapers/switch-assets-playwright.ts index 9b35b1c..a1b5d45 100644 --- a/packages/scraper/src/scrapers/switch-assets-playwright.ts +++ b/packages/scraper/src/scrapers/switch-assets-playwright.ts @@ -8,6 +8,7 @@ */ import { PlaywrightCrawler } from "crawlee"; import { pool } from "../utils/db"; +import { makeCrawleeConfig } from "../utils/crawlee-config"; import { downloadSwitchImage, downloadSwitchDatasheet, @@ -230,7 +231,7 @@ export async function crawlSwitchAssetsPlaywright(targetVendor?: string): Promis const target = request.userData as CrawlTarget; console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`); }, - }); + }, makeCrawleeConfig("switch-assets-playwright")); await crawler.run( targets.map((t) => ({