fix: remove all withIsolatedStorage wrappers, add makeCrawleeConfig to remaining Crawlee scrapers
- scheduler.ts: remove withIsolatedStorage from ALL scrapers (atgbics,
optcore, ufispace, edgecore, ebay-*, market-intel, community-issues,
cisco, juniper, sonic, 10gtek, prolabs, switch-assets, fs)
eliminates global CRAWLEE_STORAGE_DIR race condition entirely
- fs-com.ts: replace purgeDefaultStorages() with rmSync on isolated
storage dirs (fs-phase1, fs-phase2); pass makeCrawleeConfig to both
PlaywrightCrawler instances
- switch-assets-crawler.ts: add makeCrawleeConfig('switch-assets')
- switch-assets-playwright.ts: add makeCrawleeConfig('switch-assets-playwright')
- naddod.ts: restore clean error logging (remove debug instrumentation)
This commit is contained in:
parent
d9e5331161
commit
419af4a24e
@ -21,32 +21,9 @@
|
|||||||
import PgBoss from "pg-boss";
|
import PgBoss from "pg-boss";
|
||||||
import { config } from "dotenv";
|
import { config } from "dotenv";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import { mkdirSync, existsSync, writeFileSync } from "fs";
|
|
||||||
|
|
||||||
/** Run a scraper with an isolated Crawlee storage directory to prevent queue collisions */
|
// withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig()
|
||||||
async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> {
|
// for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts
|
||||||
const dir = join(__dirname, "..", "..", "..", `storage-${name}`);
|
|
||||||
// Pre-create Crawlee's internal subdirectory tree to avoid ENOENT races
|
|
||||||
mkdirSync(join(dir, "request_queues", "default"), { recursive: true });
|
|
||||||
mkdirSync(join(dir, "datasets", "default"), { recursive: true });
|
|
||||||
mkdirSync(join(dir, "key_value_stores", "default"), { recursive: true });
|
|
||||||
// Pre-seed session pool state file to prevent "Could not find file" crash
|
|
||||||
// on first run (Crawlee reads this before writing it on some versions)
|
|
||||||
const sessionFile = join(dir, "key_value_stores", "default", "SDK_SESSION_POOL_STATE.json");
|
|
||||||
if (!existsSync(sessionFile)) {
|
|
||||||
writeFileSync(sessionFile, JSON.stringify({ usableSessionsCount: 0, retiredSessionsCount: 0, sessions: [] }));
|
|
||||||
}
|
|
||||||
const prev = process.env.CRAWLEE_STORAGE_DIR;
|
|
||||||
process.env.CRAWLEE_STORAGE_DIR = dir;
|
|
||||||
// Do NOT set CRAWLEE_PURGE_ON_START — let Crawlee reuse session pool state
|
|
||||||
// between runs (better scraping, no "SDK_SESSION_POOL_STATE.json not found" crashes).
|
|
||||||
// The dir is intentionally kept between runs so Crawlee can persist its state.
|
|
||||||
try {
|
|
||||||
await fn();
|
|
||||||
} finally {
|
|
||||||
process.env.CRAWLEE_STORAGE_DIR = prev ?? "";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
||||||
|
|
||||||
@ -350,22 +327,22 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
|
|
||||||
await boss.work("scrape:pricing:fs", async () => {
|
await boss.work("scrape:pricing:fs", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
||||||
await withIsolatedStorage("fs", scrapeFs);
|
await scrapeFs();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:pricing:10gtek", async () => {
|
await boss.work("scrape:pricing:10gtek", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
|
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
|
||||||
await withIsolatedStorage("10gtek", scrape10Gtek);
|
await scrape10Gtek();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:pricing:atgbics", async () => {
|
await boss.work("scrape:pricing:atgbics", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
|
console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
|
||||||
await withIsolatedStorage("atgbics", scrapeAtgbics);
|
await scrapeAtgbics();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:pricing:prolabs", async () => {
|
await boss.work("scrape:pricing:prolabs", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`);
|
console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`);
|
||||||
await withIsolatedStorage("prolabs", scrapeProLabs);
|
await scrapeProLabs();
|
||||||
});
|
});
|
||||||
|
|
||||||
// ── Lightweight fetch/cheerio scrapers ───────────────────────────────
|
// ── Lightweight fetch/cheerio scrapers ───────────────────────────────
|
||||||
@ -384,7 +361,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
await boss.work("scrape:pricing:optcore", async () => {
|
await boss.work("scrape:pricing:optcore", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
|
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
|
||||||
const { scrapeOptcore } = await import("./scrapers/optcore");
|
const { scrapeOptcore } = await import("./scrapers/optcore");
|
||||||
await withIsolatedStorage("optcore", scrapeOptcore);
|
await scrapeOptcore();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:pricing:champion-one", async () => {
|
await boss.work("scrape:pricing:champion-one", async () => {
|
||||||
@ -474,34 +451,34 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
|
|
||||||
await boss.work("scrape:compat:cisco", async () => {
|
await boss.work("scrape:compat:cisco", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`);
|
console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`);
|
||||||
await withIsolatedStorage("cisco", scrapeCiscoTmg);
|
await scrapeCiscoTmg();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:compat:juniper", async () => {
|
await boss.work("scrape:compat:juniper", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`);
|
console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`);
|
||||||
await withIsolatedStorage("juniper", scrapeJuniperHct);
|
await scrapeJuniperHct();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:compat:sonic", async () => {
|
await boss.work("scrape:compat:sonic", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`);
|
console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`);
|
||||||
await withIsolatedStorage("sonic", scrapeSonicHcl);
|
await scrapeSonicHcl();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:compat:ufispace", async () => {
|
await boss.work("scrape:compat:ufispace", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`);
|
console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`);
|
||||||
await withIsolatedStorage("ufispace", scrapeUfiSpace);
|
await scrapeUfiSpace();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:compat:edgecore", async () => {
|
await boss.work("scrape:compat:edgecore", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`);
|
console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`);
|
||||||
await withIsolatedStorage("edgecore", scrapeEdgecore);
|
await scrapeEdgecore();
|
||||||
});
|
});
|
||||||
|
|
||||||
// ── Switch assets ─────────────────────────────────────────────────────
|
// ── Switch assets ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
await boss.work("scrape:assets:switches", async () => {
|
await boss.work("scrape:assets:switches", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`);
|
console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`);
|
||||||
await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets());
|
await scrapeSwitchAssets();
|
||||||
});
|
});
|
||||||
|
|
||||||
// ── eBay enrichment ───────────────────────────────────────────────────
|
// ── eBay enrichment ───────────────────────────────────────────────────
|
||||||
@ -509,20 +486,20 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
await boss.work("enrich:ebay-transceivers", async () => {
|
await boss.work("enrich:ebay-transceivers", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: eBay transceiver pricing`);
|
console.log(`[${new Date().toISOString()}] Running: eBay transceiver pricing`);
|
||||||
const { enrichTransceiversFromEbay } = await import("./scrapers/ebay-enricher");
|
const { enrichTransceiversFromEbay } = await import("./scrapers/ebay-enricher");
|
||||||
await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100));
|
await enrichTransceiversFromEbay(100);
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("enrich:ebay-switches", async () => {
|
await boss.work("enrich:ebay-switches", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`);
|
console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`);
|
||||||
const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher");
|
const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher");
|
||||||
await withIsolatedStorage("ebay-switches", () => enrichSwitchesFromEbay(30));
|
await enrichSwitchesFromEbay(30);
|
||||||
});
|
});
|
||||||
|
|
||||||
// ── Intelligence & community ──────────────────────────────────────────
|
// ── Intelligence & community ──────────────────────────────────────────
|
||||||
|
|
||||||
await boss.work("scrape:market-intel", async () => {
|
await boss.work("scrape:market-intel", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
|
console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
|
||||||
await withIsolatedStorage("market-intel", scrapeMarketIntelligence);
|
await scrapeMarketIntelligence();
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:nog-talks", async () => {
|
await boss.work("scrape:nog-talks", async () => {
|
||||||
@ -534,7 +511,7 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
|
|||||||
await boss.work("scrape:community-issues", async () => {
|
await boss.work("scrape:community-issues", async () => {
|
||||||
console.log(`[${new Date().toISOString()}] Running: Community issues`);
|
console.log(`[${new Date().toISOString()}] Running: Community issues`);
|
||||||
const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues");
|
const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues");
|
||||||
await withIsolatedStorage("community-issues", () => scrapeAllSwitchIssues(30));
|
await scrapeAllSwitchIssues(30);
|
||||||
});
|
});
|
||||||
|
|
||||||
await boss.work("scrape:datasheet-links", async () => {
|
await boss.work("scrape:datasheet-links", async () => {
|
||||||
|
|||||||
@ -13,7 +13,9 @@
|
|||||||
*
|
*
|
||||||
* Respects robots.txt and rate limits (≤12 req/min listing, ≤10 req/min detail).
|
* Respects robots.txt and rate limits (≤12 req/min listing, ≤10 req/min detail).
|
||||||
*/
|
*/
|
||||||
import { PlaywrightCrawler, ProxyConfiguration, purgeDefaultStorages } from "crawlee";
|
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
||||||
|
import { makeCrawleeConfig, crawleeStorageDir } from "../utils/crawlee-config";
|
||||||
|
import { rmSync } from "node:fs";
|
||||||
import type { Page } from "playwright";
|
import type { Page } from "playwright";
|
||||||
|
|
||||||
/** Apply stealth patches to evade Cloudflare TLS/bot fingerprinting */
|
/** Apply stealth patches to evade Cloudflare TLS/bot fingerprinting */
|
||||||
@ -268,8 +270,9 @@ interface ProductDetail extends ProductSummary {
|
|||||||
async function collectProductUrls(
|
async function collectProductUrls(
|
||||||
proxyConfiguration: ProxyConfiguration | undefined
|
proxyConfiguration: ProxyConfiguration | undefined
|
||||||
): Promise<Map<string, ProductSummary>> {
|
): Promise<Map<string, ProductSummary>> {
|
||||||
// Purge any leftover Crawlee storage from previous runs
|
// Purge leftover request queue from previous runs (instance-isolated storage)
|
||||||
await purgeDefaultStorages();
|
const fsPhase1Dir = crawleeStorageDir("fs-phase1");
|
||||||
|
try { rmSync(fsPhase1Dir, { recursive: true, force: true }); } catch { /* ignore */ }
|
||||||
|
|
||||||
const products = new Map<string, ProductSummary>();
|
const products = new Map<string, ProductSummary>();
|
||||||
const exhausted = new Set<string>();
|
const exhausted = new Set<string>();
|
||||||
@ -349,7 +352,7 @@ async function collectProductUrls(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
});
|
}, makeCrawleeConfig("fs-phase1"));
|
||||||
|
|
||||||
await crawler.run(listingRequests);
|
await crawler.run(listingRequests);
|
||||||
console.log(`[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`);
|
console.log(`[Phase 1] ${products.size} unique products across ${CATEGORY_URLS.length} categories`);
|
||||||
@ -362,8 +365,9 @@ async function scrapeProductDetails(
|
|||||||
requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
|
requests: Array<{ url: string; userData: { name: string; partNumber: string } }>,
|
||||||
proxyConfiguration: ProxyConfiguration | undefined
|
proxyConfiguration: ProxyConfiguration | undefined
|
||||||
): Promise<ProductDetail[]> {
|
): Promise<ProductDetail[]> {
|
||||||
// Purge Phase 1 storage so Phase 2 starts with a clean request queue
|
// Purge Phase 2 storage so it starts with a clean request queue
|
||||||
await purgeDefaultStorages();
|
const fsPhase2Dir = crawleeStorageDir("fs-phase2");
|
||||||
|
try { rmSync(fsPhase2Dir, { recursive: true, force: true }); } catch { /* ignore */ }
|
||||||
const details: ProductDetail[] = [];
|
const details: ProductDetail[] = [];
|
||||||
|
|
||||||
const crawler = new PlaywrightCrawler({
|
const crawler = new PlaywrightCrawler({
|
||||||
@ -610,7 +614,7 @@ async function scrapeProductDetails(
|
|||||||
datasheetUrl: resolveUrl(raw.datasheetUrl),
|
datasheetUrl: resolveUrl(raw.datasheetUrl),
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
});
|
}, makeCrawleeConfig("fs-phase2"));
|
||||||
|
|
||||||
await crawler.run(requests);
|
await crawler.run(requests);
|
||||||
return details;
|
return details;
|
||||||
|
|||||||
@ -275,15 +275,13 @@ export async function scrapeNaddod(): Promise<void> {
|
|||||||
// Price observation
|
// Price observation
|
||||||
if (price && price > 0) {
|
if (price && price > 0) {
|
||||||
const hash = contentHash({ price, part: partNumber });
|
const hash = contentHash({ price, part: partNumber });
|
||||||
const sl: string = stock?.qty !== undefined && stock.qty > 0 ? "in_stock" :
|
|
||||||
stock?.confidence === 1 ? "in_stock" : "on_request";
|
|
||||||
console.log(` [DEBUG] upsertPrice: txId=${txId} price=${price} stockLevel=${sl} hash=${hash.slice(0,8)}`);
|
|
||||||
const isNew = await upsertPriceObservation({
|
const isNew = await upsertPriceObservation({
|
||||||
transceiverId: txId,
|
transceiverId: txId,
|
||||||
sourceVendorId: vendorId,
|
sourceVendorId: vendorId,
|
||||||
price,
|
price,
|
||||||
currency: "USD",
|
currency: "USD",
|
||||||
stockLevel: sl,
|
stockLevel: stock?.qty !== undefined && stock.qty > 0 ? "in_stock" :
|
||||||
|
stock?.confidence === 1 ? "in_stock" : "on_request",
|
||||||
url,
|
url,
|
||||||
contentHash: hash,
|
contentHash: hash,
|
||||||
});
|
});
|
||||||
@ -313,7 +311,7 @@ export async function scrapeNaddod(): Promise<void> {
|
|||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
errors++;
|
errors++;
|
||||||
if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 300)}`);
|
if (errors <= 10) console.warn(` Error for ${url}: ${(err as Error).message.slice(0, 120)}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -12,6 +12,7 @@
|
|||||||
*/
|
*/
|
||||||
import { CheerioCrawler, Dataset } from "crawlee";
|
import { CheerioCrawler, Dataset } from "crawlee";
|
||||||
import { pool } from "../utils/db";
|
import { pool } from "../utils/db";
|
||||||
|
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
||||||
import {
|
import {
|
||||||
downloadSwitchImage,
|
downloadSwitchImage,
|
||||||
downloadSwitchDatasheet,
|
downloadSwitchDatasheet,
|
||||||
@ -319,7 +320,7 @@ export async function crawlSwitchAssets(targetVendor?: string): Promise<void> {
|
|||||||
const target = request.userData as CrawlTarget;
|
const target = request.userData as CrawlTarget;
|
||||||
console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
|
console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
|
||||||
},
|
},
|
||||||
});
|
}, makeCrawleeConfig("switch-assets"));
|
||||||
|
|
||||||
await crawler.run(
|
await crawler.run(
|
||||||
targets.map((t) => ({
|
targets.map((t) => ({
|
||||||
|
|||||||
@ -8,6 +8,7 @@
|
|||||||
*/
|
*/
|
||||||
import { PlaywrightCrawler } from "crawlee";
|
import { PlaywrightCrawler } from "crawlee";
|
||||||
import { pool } from "../utils/db";
|
import { pool } from "../utils/db";
|
||||||
|
import { makeCrawleeConfig } from "../utils/crawlee-config";
|
||||||
import {
|
import {
|
||||||
downloadSwitchImage,
|
downloadSwitchImage,
|
||||||
downloadSwitchDatasheet,
|
downloadSwitchDatasheet,
|
||||||
@ -230,7 +231,7 @@ export async function crawlSwitchAssetsPlaywright(targetVendor?: string): Promis
|
|||||||
const target = request.userData as CrawlTarget;
|
const target = request.userData as CrawlTarget;
|
||||||
console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
|
console.log(` [FAIL] ${target.vendorName} ${target.model}: ${request.url}`);
|
||||||
},
|
},
|
||||||
});
|
}, makeCrawleeConfig("switch-assets-playwright"));
|
||||||
|
|
||||||
await crawler.run(
|
await crawler.run(
|
||||||
targets.map((t) => ({
|
targets.map((t) => ({
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user