fix: remove all withIsolatedStorage wrappers, add makeCrawleeConfig to remaining Crawlee scrapers
- scheduler.ts: remove withIsolatedStorage from ALL scrapers (atgbics,
optcore, ufispace, edgecore, ebay-*, market-intel, community-issues,
cisco, juniper, sonic, 10gtek, prolabs, switch-assets, fs)
eliminates global CRAWLEE_STORAGE_DIR race condition entirely
- fs-com.ts: replace purgeDefaultStorages() with rmSync on isolated
storage dirs (fs-phase1, fs-phase2); pass makeCrawleeConfig to both
PlaywrightCrawler instances
- switch-assets-crawler.ts: add makeCrawleeConfig('switch-assets')
- switch-assets-playwright.ts: add makeCrawleeConfig('switch-assets-playwright')
- naddod.ts: restore clean error logging (remove debug instrumentation)
This commit is contained in:
parent
d9e5331161
commit
419af4a24e
@ -21,32 +21,9 @@
|
||||
import PgBoss from "pg-boss";
|
||||
import { config } from "dotenv";
|
||||
import { join } from "path";
|
||||
import { mkdirSync, existsSync, writeFileSync } from "fs";
|
||||
|
||||
/** Run a scraper with an isolated Crawlee storage directory to prevent queue collisions */
|
||||
async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> {
|
||||
const dir = join(__dirname, "..", "..", "..", `storage-${name}`);
|
||||
// Pre-create Crawlee's internal subdirectory tree to avoid ENOENT races
|
||||
mkdirSync(join(dir, "request_queues", "default"), { recursive: true });
|
||||
mkdirSync(join(dir, "datasets", "default"), { recursive: true });
|
||||
mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true });
|
||||
// Pre-seed session pool state file to prevent "Could not find file" crash
|
||||
// on first run (Crawlee reads this before writing it on some versions)
|
||||
const sessionFile = join(dir, "key_value_stores", "default", "SDK_SESSION_POOL_STATE.json");
|
||||
if (!existsSync(sessionFile)) {
|
||||
writeFileSync(sessionFile, JSON.stringify({ usableSessionsCount: 0, retiredSessionsCount: 0, sessions: [] }));
|
||||
}
|
||||
const prev = process.env.CRAWLEE_STORAGE_DIR;
|
||||
process.env.CRAWLEE_STORAGE_DIR = dir;
|
||||
// Do NOT set CRAWLEE_PURGE_ON_START — let Crawlee reuse session pool state
|
||||
// between runs (better scraping, no "SDK_SESSION_POOL_STATE.json not found" crashes).
|
||||
// The dir is intentionally kept between runs so Crawlee can persist its state.
|
||||
try {
|
||||
await fn();
|
||||
} finally {
|
||||
process.env.CRAWLEE_STORAGE_DIR = prev ?? "";
|
||||
}
|
||||
}
|
||||
// withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig()
|
||||
// for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts
|
||||
|
||||
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
||||
|
||||
@ -350,22 +327,22 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
|
||||
await boss.work("scrape:pricing:fs", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
||||
await withIsolatedStorage("fs", scrapeFs);
|
||||
await scrapeFs();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:10gtek", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
|
||||
await withIsolatedStorage("10gtek", scrape10Gtek);
|
||||
await scrape10Gtek();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:atgbics", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
|
||||
await withIsolatedStorage("atgbics", scrapeAtgbics);
|
||||
await scrapeAtgbics();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:prolabs", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`);
|
||||
await withIsolatedStorage("prolabs", scrapeProLabs);
|
||||
await scrapeProLabs();
|
||||
});
|
||||
|
||||
// ── Lightweight fetch/cheerio scrapers ───────────────────────────────
|
||||
@ -384,7 +361,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
await boss.work("scrape:pricing:optcore", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
|
||||
const { scrapeOptcore } = await import("./scrapers/optcore");
|
||||
await withIsolatedStorage("optcore", scrapeOptcore);
|
||||
await scrapeOptcore();
|
||||
});
|
||||
|
||||
await boss.work("scrape:pricing:champion-one", async () => {
|
||||
@ -474,34 +451,34 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
|
||||
await boss.work("scrape:compat:cisco", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`);
|
||||
await withIsolatedStorage("cisco", scrapeCiscoTmg);
|
||||
await scrapeCiscoTmg();
|
||||
});
|
||||
|
||||
await boss.work("scrape:compat:juniper", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`);
|
||||
await withIsolatedStorage("juniper", scrapeJuniperHct);
|
||||
await scrapeJuniperHct();
|
||||
});
|
||||
|
||||
await boss.work("scrape:compat:sonic", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`);
|
||||
await withIsolatedStorage("sonic", scrapeSonicHcl);
|
||||
await scrapeSonicHcl();
|
||||
});
|
||||
|
||||
await boss.work("scrape:compat:ufispace", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`);
|
||||
await withIsolatedStorage("ufispace", scrapeUfiSpace);
|
||||
await scrapeUfiSpace();
|
||||
});
|
||||
|
||||
await boss.work("scrape:compat:edgecore", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`);
|
||||
await withIsolatedStorage("edgecore", scrapeEdgecore);
|
||||
await scrapeEdgecore();
|
||||
});
|
||||
|
||||
// ── Switch assets ─────────────────────────────────────────────────────
|
||||
|
||||
await boss.work("scrape:assets:switches", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`);
|
||||
await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets());
|
||||
await scrapeSwitchAssets();
|
||||
});
|
||||
|
||||
// ── eBay enrichment ───────────────────────────────────────────────────
|
||||
@ -509,20 +486,20 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
await boss.work("enrich:ebay-transceivers", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: eBay transceiver pricing`);
|
||||
const { enrichTransceiversFromEbay } = await import("./scrapers/ebay-enricher");
|
||||
await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100));
|
||||
await enrichTransceiversFromEbay(100);
|
||||
});
|
||||
|
||||
await boss.work("enrich:ebay-switches", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`);
|
||||
const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher");
|
||||
await withIsolatedStorage("ebay-switches", () => enrichSwitchesFromEbay(30));
|
||||
await enrichSwitchesFromEbay(30);
|
||||
});
|
||||
|
||||
// ── Intelligence & community ──────────────────────────────────────────
|
||||
|
||||
await boss.work("scrape:market-intel", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
|
||||
await withIsolatedStorage("market-intel", scrapeMarketIntelligence);
|
||||
await scrapeMarketIntelligence();
|
||||
});
|
||||
|
||||
await boss.work("scrape:nog-talks", async () => {
|
||||
@ -534,7 +511,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||||
await boss.work("scrape:community-issues", async () => {
|
||||
console.log(`[${new Date().toISOString()}] Running: Community issues`);
|
||||
const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues");
|
||||
await withIsolatedStorage("community-issues", () => scrapeAllSwitchIssues(30));
|
||||
await scrapeAllSwitchIssues(30);
|
||||
});
|
||||
|
||||
await boss.work("scrape:datasheet-links", async () => {
|
||||
|
||||
@ -13,7 +13,9 @@
|
||||
*
|
||||
* Respects robots.txt and rate limits (≤12 req/min listing, ≤10 req/min detail).
|
||||
*/
|
||||
import { PlaywrightCrawler, ProxyConfiguration, purgeDefaultStorages } from "crawlee";
|
||||
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
||||
import { makeCrawleeConfig, crawleeStorageDir } from "../utils/crawlee-config";
|
||||
import { rmSync } from "node:fs";
|
||||
import type { Page } from "playwright";
|
||||
|
||||
/** Apply stealth patches to evade Cloudflare TLS/bot fingerprinting */
|
||||
@ -268,8 +270,9 @@ interface ProductDetail extends ProductSummary {
|
||||
async function collectProductUrls(
|
||||
proxyConfiguration: ProxyConfiguration | undefined
|
||||
): Promise<Map<string, ProductSummary>> {
|
||||
// Purge any leftover Crawlee storage from previous runs
|
||||
await purgeDefaultStorages();
|
||||
// Purge leftover request queue from previous runs (instance-isolated storage)
|
||||
const fsPhase1Dir = crawleeStorageDir("fs-phase1");
|
||||
try { rmSync(fsPhase1Dir, { recursive: true, force: true }); } catch { /* ignore */ }
|
||||
|
||||
const products = new Map<string, ProductSummary>();
|
||||
const exhausted = new Set<string>();
|
||||
@ -349,7 +352,7 @@ async function collectProductUrls(
|
||||
}
|
||||
}
|
||||
},
|
||||
});
|
||||
}, makeCrawleeConfig("fs-phase1"));
|
||||
|
||||
await crawler.run(listingRequests);
|
||||
console.log(`[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`);
|
||||
@ -362,8 +365,9 @@ async function scrapeProductDetails(
|
||||
requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
|
||||
proxyConfiguration: ProxyConfiguration | undefined
|
||||
): Promise<ProductDetail[]> {
|
||||
// Purge Phase 1 storage so Phase 2 starts with a clean request queue
|
||||
await purgeDefaultStorages();
|
||||
// Purge Phase 2 storage so it starts with a clean request queue
|
||||
const fsPhase2Dir = crawleeStorageDir("fs-phase2");
|
||||
try { rmSync(fsPhase2Dir, { recursive: true, force: true }); } catch { /* ignore */ }
|
||||
const details: ProductDetail[] = [];
|
||||
|
||||
const crawler = new PlaywrightCrawler({
|
||||
@ -610,7 +614,7 @@ async function scrapeProductDetails(
|
||||
datasheetUrl: resolveUrl(raw.datasheetUrl),
|
||||
});
|
||||
},
|
||||
});
|
||||
}, makeCrawleeConfig("fs-phase2"));
|
||||
|
||||
await crawler.run(requests);
|
||||
return details;
|
||||
|
||||
@ -275,15 +275,13 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
// Price observation
|
||||
if (price && price > 0) {
|
||||
const hash = contentHash({ price, part: partNumber });
|
||||
const sl: string = stock?.qty !== undefined && stock.qty > 0 ? "in_stock" :
|
||||
stock?.confidence === 1 ? "in_stock" : "on_request";
|
||||
console.log(` [DEBUG] upsertPrice: txId=${txId} price=${price} stockLevel=${sl} hash=${hash.slice(0,8)}`);
|
||||
const isNew = await upsertPriceObservation({
|
||||
transceiverId: txId,
|
||||
sourceVendorId: vendorId,
|
||||
price,
|
||||
currency: "USD",
|
||||
stockLevel: sl,
|
||||
stockLevel: stock?.qty !== undefined && stock.qty > 0 ? "in_stock" :
|
||||
stock?.confidence === 1 ? "in_stock" : "on_request",
|
||||
url,
|
||||
contentHash: hash,
|
||||
});
|
||||
@ -313,7 +311,7 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
}
|
||||
} catch (err) {
|
||||
errors++;
|
||||
if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 300)}`);
|
||||
if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 120)}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
*/
|
||||
import { CheerioCrawler, Dataset } from "crawlee";
|
||||
import { pool } from "../utils/db";
|
||||
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
||||
import {
|
||||
downloadSwitchImage,
|
||||
downloadSwitchDatasheet,
|
||||
@ -319,7 +320,7 @@ export async function crawlSwitchAssets(targetVendor?: string): Promise<void> {
|
||||
const target = request.userData as CrawlTarget;
|
||||
console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
|
||||
},
|
||||
});
|
||||
}, makeCrawleeConfig("switch-assets"));
|
||||
|
||||
await crawler.run(
|
||||
targets.map((t) => ({
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
*/
|
||||
import { PlaywrightCrawler } from "crawlee";
|
||||
import { pool } from "../utils/db";
|
||||
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
||||
import {
|
||||
downloadSwitchImage,
|
||||
downloadSwitchDatasheet,
|
||||
@ -230,7 +231,7 @@ export async function crawlSwitchAssetsPlaywright(targetVendor?: string): Promis
|
||||
const target = request.userData as CrawlTarget;
|
||||
console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
|
||||
},
|
||||
});
|
||||
}, makeCrawleeConfig("switch-assets-playwright"));
|
||||
|
||||
await crawler.run(
|
||||
targets.map((t) => ({
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user