fix: remove all withIsolatedStorage wrappers, add makeCrawleeConfig to remaining Crawlee scrapers

- scheduler.ts: remove withIsolatedStorage from ALL scrapers (atgbics,
  optcore, ufispace, edgecore, ebay-*, market-intel, community-issues,
  cisco, juniper, sonic, 10gtek, prolabs, switch-assets, fs); this
  eliminates the global CRAWLEE_STORAGE_DIR race condition entirely
- fs-com.ts: replace purgeDefaultStorages() with rmSync on isolated
  storage dirs (fs-phase1, fs-phase2); pass makeCrawleeConfig to both
  PlaywrightCrawler instances
- switch-assets-crawler.ts: add makeCrawleeConfig('switch-assets')
- switch-assets-playwright.ts: add makeCrawleeConfig('switch-assets-playwright')
- naddod.ts: restore clean error logging (remove debug instrumentation)
This commit is contained in:
Rene Fichtmueller 2026-04-18 02:19:53 +02:00
parent d9e5331161
commit 419af4a24e
5 changed files with 35 additions and 54 deletions

View File

@ -21,32 +21,9 @@
import PgBoss from "pg-boss";
import { config } from "dotenv";
import { join } from "path";
import { mkdirSync, existsSync, writeFileSync } from "fs";
/**
 * Run a scraper with an isolated Crawlee storage directory to prevent queue collisions.
 *
 * While `fn` runs, `process.env.CRAWLEE_STORAGE_DIR` points at a
 * `storage-<name>` directory three levels above this file. The previous value
 * is restored afterwards, even when `fn` rejects; if the variable was unset
 * before the call it is unset again rather than left as an empty string.
 *
 * NOTE: the environment variable is process-global, so overlapping calls can
 * observe each other's directory while they run concurrently.
 *
 * @param name - Suffix used to build the `storage-<name>` directory.
 * @param fn - Scraper entry point to await inside the isolated environment.
 * @returns Resolves once `fn` has settled and the environment is restored.
 */
async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> {
  const dir = join(__dirname, "..", "..", "..", `storage-${name}`);
  const defaultKvStoreDir = join(dir, "key_value_stores", "default");

  // Pre-create Crawlee's internal subdirectory tree to avoid ENOENT races
  mkdirSync(join(dir, "request_queues", "default"), { recursive: true });
  mkdirSync(join(dir, "datasets", "default"), { recursive: true });
  mkdirSync(defaultKvStoreDir, { recursive: true });

  // Pre-seed session pool state file to prevent "Could not find file" crash
  // on first run (Crawlee reads this before writing it on some versions)
  const sessionFile = join(defaultKvStoreDir, "SDK_SESSION_POOL_STATE.json");
  if (!existsSync(sessionFile)) {
    writeFileSync(
      sessionFile,
      JSON.stringify({ usableSessionsCount: 0, retiredSessionsCount: 0, sessions: [] })
    );
  }

  const prev = process.env.CRAWLEE_STORAGE_DIR;
  process.env.CRAWLEE_STORAGE_DIR = dir;

  // Do NOT set CRAWLEE_PURGE_ON_START — let Crawlee reuse session pool state
  // between runs (better scraping, no "SDK_SESSION_POOL_STATE.json not found" crashes).
  // The dir is intentionally kept between runs so Crawlee can persist its state.
  try {
    await fn();
  } finally {
    // Assigning to process.env always stores a string, so an originally-unset
    // variable must be deleted to truly restore the prior state.
    if (prev === undefined) {
      delete process.env.CRAWLEE_STORAGE_DIR;
    } else {
      process.env.CRAWLEE_STORAGE_DIR = prev;
    }
  }
}
// withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig()
// for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts
config({ path: join(__dirname, "..", "..", "..", ".env") });
@ -350,22 +327,22 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await boss.work("scrape:pricing:fs", async () => {
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
await withIsolatedStorage("fs", scrapeFs);
await scrapeFs();
});
await boss.work("scrape:pricing:10gtek", async () => {
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
await withIsolatedStorage("10gtek", scrape10Gtek);
await scrape10Gtek();
});
await boss.work("scrape:pricing:atgbics", async () => {
console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
await withIsolatedStorage("atgbics", scrapeAtgbics);
await scrapeAtgbics();
});
await boss.work("scrape:pricing:prolabs", async () => {
console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`);
await withIsolatedStorage("prolabs", scrapeProLabs);
await scrapeProLabs();
});
// ── Lightweight fetch/cheerio scrapers ───────────────────────────────
@ -384,7 +361,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await boss.work("scrape:pricing:optcore", async () => {
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
const { scrapeOptcore } = await import("./scrapers/optcore");
await withIsolatedStorage("optcore", scrapeOptcore);
await scrapeOptcore();
});
await boss.work("scrape:pricing:champion-one", async () => {
@ -474,34 +451,34 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await boss.work("scrape:compat:cisco", async () => {
console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`);
await withIsolatedStorage("cisco", scrapeCiscoTmg);
await scrapeCiscoTmg();
});
await boss.work("scrape:compat:juniper", async () => {
console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`);
await withIsolatedStorage("juniper", scrapeJuniperHct);
await scrapeJuniperHct();
});
await boss.work("scrape:compat:sonic", async () => {
console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`);
await withIsolatedStorage("sonic", scrapeSonicHcl);
await scrapeSonicHcl();
});
await boss.work("scrape:compat:ufispace", async () => {
console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`);
await withIsolatedStorage("ufispace", scrapeUfiSpace);
await scrapeUfiSpace();
});
await boss.work("scrape:compat:edgecore", async () => {
console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`);
await withIsolatedStorage("edgecore", scrapeEdgecore);
await scrapeEdgecore();
});
// ── Switch assets ─────────────────────────────────────────────────────
await boss.work("scrape:assets:switches", async () => {
console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`);
await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets());
await scrapeSwitchAssets();
});
// ── eBay enrichment ───────────────────────────────────────────────────
@ -509,20 +486,20 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await boss.work("enrich:ebay-transceivers", async () => {
console.log(`[${new Date().toISOString()}] Running: eBay transceiver pricing`);
const { enrichTransceiversFromEbay } = await import("./scrapers/ebay-enricher");
await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100));
await enrichTransceiversFromEbay(100);
});
await boss.work("enrich:ebay-switches", async () => {
console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`);
const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher");
await withIsolatedStorage("ebay-switches", () => enrichSwitchesFromEbay(30));
await enrichSwitchesFromEbay(30);
});
// ── Intelligence & community ──────────────────────────────────────────
await boss.work("scrape:market-intel", async () => {
console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
await withIsolatedStorage("market-intel", scrapeMarketIntelligence);
await scrapeMarketIntelligence();
});
await boss.work("scrape:nog-talks", async () => {
@ -534,7 +511,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await boss.work("scrape:community-issues", async () => {
console.log(`[${new Date().toISOString()}] Running: Community issues`);
const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues");
await withIsolatedStorage("community-issues", () => scrapeAllSwitchIssues(30));
await scrapeAllSwitchIssues(30);
});
await boss.work("scrape:datasheet-links", async () => {

View File

@ -13,7 +13,9 @@
*
* Respects robots.txt and rate limits (12 req/min listing, 10 req/min detail).
*/
import { PlaywrightCrawler, ProxyConfiguration, purgeDefaultStorages } from "crawlee";
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
import { makeCrawleeConfig, crawleeStorageDir } from "../utils/crawlee-config";
import { rmSync } from "node:fs";
import type { Page } from "playwright";
/** Apply stealth patches to evade Cloudflare TLS/bot fingerprinting */
@ -268,8 +270,9 @@ interface ProductDetail extends ProductSummary {
async function collectProductUrls(
proxyConfiguration: ProxyConfiguration | undefined
): Promise<Map<string, ProductSummary>> {
// Purge any leftover Crawlee storage from previous runs
await purgeDefaultStorages();
// Purge leftover request queue from previous runs (instance-isolated storage)
const fsPhase1Dir = crawleeStorageDir("fs-phase1");
try { rmSync(fsPhase1Dir, { recursive: true, force: true }); } catch { /* ignore */ }
const products = new Map<string, ProductSummary>();
const exhausted = new Set<string>();
@ -349,7 +352,7 @@ async function collectProductUrls(
}
}
},
});
}, makeCrawleeConfig("fs-phase1"));
await crawler.run(listingRequests);
console.log(`[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`);
@ -362,8 +365,9 @@ async function scrapeProductDetails(
requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
proxyConfiguration: ProxyConfiguration | undefined
): Promise<ProductDetail[]> {
// Purge Phase 1 storage so Phase 2 starts with a clean request queue
await purgeDefaultStorages();
// Purge Phase 2 storage so it starts with a clean request queue
const fsPhase2Dir = crawleeStorageDir("fs-phase2");
try { rmSync(fsPhase2Dir, { recursive: true, force: true }); } catch { /* ignore */ }
const details: ProductDetail[] = [];
const crawler = new PlaywrightCrawler({
@ -610,7 +614,7 @@ async function scrapeProductDetails(
datasheetUrl: resolveUrl(raw.datasheetUrl),
});
},
});
}, makeCrawleeConfig("fs-phase2"));
await crawler.run(requests);
return details;

View File

@ -275,15 +275,13 @@ export async function scrapeNaddod(): Promise<void> {
// Price observation
if (price && price > 0) {
const hash = contentHash({ price, part: partNumber });
const sl: string = stock?.qty !== undefined && stock.qty > 0 ? "in_stock" :
stock?.confidence === 1 ? "in_stock" : "on_request";
console.log(` [DEBUG] upsertPrice: txId=${txId} price=${price} stockLevel=${sl} hash=${hash.slice(0,8)}`);
const isNew = await upsertPriceObservation({
transceiverId: txId,
sourceVendorId: vendorId,
price,
currency: "USD",
stockLevel: sl,
stockLevel: stock?.qty !== undefined && stock.qty > 0 ? "in_stock" :
stock?.confidence === 1 ? "in_stock" : "on_request",
url,
contentHash: hash,
});
@ -313,7 +311,7 @@ export async function scrapeNaddod(): Promise<void> {
}
} catch (err) {
errors++;
if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 300)}`);
if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 120)}`);
}
}

View File

@ -12,6 +12,7 @@
*/
import { CheerioCrawler, Dataset } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import {
downloadSwitchImage,
downloadSwitchDatasheet,
@ -319,7 +320,7 @@ export async function crawlSwitchAssets(targetVendor?: string): Promise<void> {
const target = request.userData as CrawlTarget;
console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
},
});
}, makeCrawleeConfig("switch-assets"));
await crawler.run(
targets.map((t) => ({

View File

@ -8,6 +8,7 @@
*/
import { PlaywrightCrawler } from "crawlee";
import { pool } from "../utils/db";
import { makeCrawleeConfig } from "../utils/crawlee-config";
import {
downloadSwitchImage,
downloadSwitchDatasheet,
@ -230,7 +231,7 @@ export async function crawlSwitchAssetsPlaywright(targetVendor?: string): Promis
const target = request.userData as CrawlTarget;
console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
},
});
}, makeCrawleeConfig("switch-assets-playwright"));
await crawler.run(
targets.map((t) => ({