Rene Fichtmueller 93d825dc04 fix: daemon stability + health monitor accuracy
- Add global unhandledRejection handler in scheduler daemon to swallow
  Crawlee's benign post-run ENOENT lock-file races (prevents process.exit(1))
- Add SKIP_FS_SCRAPER env var: skip FS.com worker on Erik where Cloudflare
  WAF blocks datacenter IPs (Mac launchd handles FS.com from residential IP)
- Remove FS.COM from health monitor EXPECTED_VENDORS (skipped on Erik)
- Health monitor: extend pg-boss lookup from 12h → 26h, add completed-job
  map; if job ran OK in last 26h + vendor has historical prices → mark
  STABLE instead of CRITICAL (fixes ATGBICS/Fluxlight hash-dedup false positives)
- Install Playwright Chromium on Erik (fixes ATGBICS BrowserLaunchError)
- Create missing Crawlee storage dirs on Erik (storage-fs-phase1/2,
  storage-ebay-transceivers) to prevent ENOENT on first Crawlee run
2026-04-18 03:16:59 +02:00

239 lines
9.7 KiB
TypeScript

/**
* TIP Scraper Engine — Main entry point.
*
* Usage:
* tsx src/index.ts — Start scheduler (production mode)
* tsx src/index.ts --all — Run all scrapers once
* tsx src/index.ts --fs — Run FS.com scraper once
* tsx src/index.ts --cisco — Run Cisco TMG scraper once
* tsx src/index.ts --optcore — Run Optcore scraper once
* tsx src/index.ts --news — Run news aggregator once
* tsx src/index.ts --flexoptix — Run Flexoptix catalog scraper once
* tsx src/index.ts --vendors — Run Flexoptix vendor list scraper once
* tsx src/index.ts --10gtek — Run 10Gtek scraper once
* tsx src/index.ts --champion — Run Champion ONE scraper once
* tsx src/index.ts --fluxlight — Run Fluxlight scraper once
* tsx src/index.ts --sfpcables — Run SfpCables scraper once
* tsx src/index.ts --gbics — Run GBICS.com scraper once
* tsx src/index.ts --prolabs — Run ProLabs scraper once
* tsx src/index.ts --juniper — Run Juniper HCT scraper once
* tsx src/index.ts --switches — Seed switch/router database
* tsx src/index.ts --whitebox — Seed whitebox switch database (Edgecore, Celestica, etc.)
* tsx src/index.ts --switches-ext — Seed extended switches (Fortinet, MikroTik, Industrial, etc.)
* tsx src/index.ts --flexoptix-vendors — Seed Flexoptix supported-vendor list
* tsx src/index.ts --sonic-hcl — Scrape SONiC Hardware Compatibility List
* tsx src/index.ts --edgecore — Scrape Edgecore product catalog
* tsx src/index.ts --ufispace — Scrape UfiSpace product catalog
* tsx src/index.ts --switch-assets — Scrape switch assets via URL patterns (optional --vendor=<name>)
* tsx src/index.ts --switch-crawl — Crawl switch assets (Cheerio, static HTML vendors; optional --vendor=<name>)
* tsx src/index.ts --switch-crawl-pw — Crawl switch assets (Playwright, JS-heavy vendors; optional --vendor=<name>)
* tsx src/index.ts --fetch-only — Run only fetch-based scrapers (no Playwright)
* tsx src/index.ts --atgbics — Run ATGBICS scraper once
* tsx src/index.ts --naddod — Run NADDOD scraper once
* tsx src/index.ts --qsfptek — Run QSFPTEK scraper once
* tsx src/index.ts --addon — Run AddOn Networks scraper once
*/
import { createScheduler, registerSchedules, registerWorkers } from "./scheduler";
import { scrapeFs } from "./scrapers/fs-com";
import { scrapeCiscoTmg } from "./scrapers/cisco-tmg";
import { scrapeOptcore } from "./scrapers/optcore";
import { scrapeNews } from "./scrapers/news";
import { scrapeFlexoptixCatalog } from "./scrapers/flexoptix-catalog";
import { scrapeFlexoptixVendors } from "./scrapers/flexoptix-vendors";
import { scrape10Gtek } from "./scrapers/tenGtek";
import { scrapeChampionOne } from "./scrapers/champion-one";
import { scrapeFluxlight } from "./scrapers/fluxlight";
import { scrapeSfpCables } from "./scrapers/sfpcables";
import { scrapeGbics } from "./scrapers/gbics";
import { scrapeJuniperHct } from "./scrapers/juniper-hct";
import { seedSwitches } from "./scrapers/switch-seed";
import { seedWhiteboxSwitches } from "./scrapers/whitebox-seed";
import { seedFlexoptixVendors } from "./scrapers/flexoptix-supported-vendors";
import { scrapeSonicHcl } from "./scrapers/sonic-hcl";
import { scrapeEdgecore } from "./scrapers/edgecore";
import { scrapeUfiSpace } from "./scrapers/ufispace";
import { seedExtendedSwitches } from "./scrapers/switch-seed-extended";
import { seedBulkSwitches } from "./scrapers/switch-seed-bulk";
import { scrapeSwitchAssets } from "./scrapers/switch-assets";
import { crawlSwitchAssets } from "./scrapers/switch-assets-crawler";
import { crawlSwitchAssetsPlaywright } from "./scrapers/switch-assets-playwright";
import { scrapeAtgbics } from "./scrapers/atgbics";
import { scrapeProLabs } from "./scrapers/prolabs";
import { scrapeNaddod } from "./scrapers/naddod";
import { scrapeQsfptek } from "./scrapers/qsfptek";
import { scrapeAddonNetworks } from "./scrapers/addon-networks";
import { pool } from "./utils/db";
// Raw CLI flags for this invocation: everything after `node <script>`.
const [, , ...args] = process.argv;
// `--fetch-only` and `--all` are the two group selectors checked by runOnce().
const isFetchOnly = args.includes("--fetch-only");
const isAll = args.includes("--all");
/**
 * One-shot mode: runs every scraper/seeder selected by the CLI flags, strictly
 * one after another in a fixed order, then closes the database pool.
 *
 * Selection rules:
 * - Fetch-based jobs run when their own flag, `--all`, or `--fetch-only` is present.
 * - Switch-asset and Crawlee (Cheerio) jobs run when their own flag or `--all` is present.
 * - Playwright jobs run when their own flag or `--all` is present, but never
 *   when `--fetch-only` is also present.
 *
 * A rejection from any job propagates to the caller immediately: the remaining
 * jobs are skipped and the pool is not closed on that path.
 */
async function runOnce(): Promise<void> {
  type Job = readonly [string, () => unknown];

  // Value of the optional `--vendor=<name>` argument, forwarded to the switch-asset jobs.
  const vendor = args.find((a) => a.startsWith("--vendor="))?.split("=")[1];
  const requested = (flag: string): boolean => args.includes(flag) || isAll;

  // Fetch-based scrapers (no Playwright/Chromium needed — fast, reliable)
  const fetchJobs: readonly Job[] = [
    ["--flexoptix", scrapeFlexoptixCatalog],
    ["--vendors", scrapeFlexoptixVendors],
    ["--10gtek", scrape10Gtek],
    ["--champion", scrapeChampionOne],
    ["--fluxlight", scrapeFluxlight],
    ["--sfpcables", scrapeSfpCables],
    ["--gbics", scrapeGbics],
    ["--prolabs", scrapeProLabs],
    ["--naddod", scrapeNaddod],
    ["--qsfptek", scrapeQsfptek],
    ["--addon", scrapeAddonNetworks],
    ["--juniper", scrapeJuniperHct],
    ["--switches", seedSwitches],
    ["--whitebox", seedWhiteboxSwitches],
    ["--flexoptix-vendors", seedFlexoptixVendors],
    ["--switches-ext", seedExtendedSwitches],
    ["--switches-bulk", seedBulkSwitches],
    ["--sonic-hcl", scrapeSonicHcl],
    ["--news", scrapeNews],
  ];

  // Jobs that are NOT part of `--fetch-only`: switch-asset scrapers, then the
  // Crawlee-based scrapers (Cheerio, no Playwright needed).
  const flagOrAllJobs: readonly Job[] = [
    ["--switch-assets", () => scrapeSwitchAssets(vendor)],
    ["--switch-crawl", () => crawlSwitchAssets(vendor)],
    ["--edgecore", scrapeEdgecore],
    ["--ufispace", scrapeUfiSpace],
  ];

  // Playwright-based scrapers (need Chromium installed)
  const playwrightJobs: readonly Job[] = [
    ["--fs", scrapeFs],
    ["--cisco", scrapeCiscoTmg],
    ["--optcore", scrapeOptcore],
    ["--switch-crawl-pw", () => crawlSwitchAssetsPlaywright(vendor)],
    ["--atgbics", scrapeAtgbics],
  ];

  for (const [flag, run] of fetchJobs) {
    if (requested(flag) || isFetchOnly) {
      await run();
    }
  }

  for (const [flag, run] of flagOrAllJobs) {
    if (requested(flag)) {
      await run();
    }
  }

  // `--fetch-only` vetoes the whole Playwright group, even combined with `--all`.
  if (!isFetchOnly) {
    for (const [flag, run] of playwrightJobs) {
      if (requested(flag)) {
        await run();
      }
    }
  }

  await pool.end();
}
/**
 * Daemon mode: creates the pg-boss scheduler, fails stale "active" jobs left
 * behind by a previous run, registers schedules and workers, and installs
 * SIGINT/SIGTERM handlers for a graceful shutdown.
 *
 * The returned promise resolves once startup is complete. Any rejection during
 * startup (other than the best-effort zombie cleanup) propagates to the caller.
 */
async function runScheduler(): Promise<void> {
  console.log("=== TIP Scraper Engine ===\n");
  console.log("Mode: Scheduler (pg-boss)\n");

  // Crawlee's FileSystemStorage fires spurious unhandledRejection errors after
  // crawler.run() resolves: its internal task loop schedules one final
  // _isTaskReadyFunction call that tries to mkdir/lock a request file that
  // Crawlee already cleaned up. If not suppressed, Node.js calls process.exit(1)
  // and takes down the whole scheduler daemon. Swallow ENOENT from request_queues
  // paths (the Crawlee storage root); treat everything else as fatal.
  process.on("unhandledRejection", (reason) => {
    const msg = reason instanceof Error ? reason.message : String(reason);
    if ((msg.includes("ENOENT") || msg.includes("no such file")) && msg.includes("request_queues")) {
      // Benign Crawlee post-run lock-file race — ignore
      return;
    }
    console.error("[scheduler] Unhandled rejection:", reason);
    process.exit(1);
  });

  const boss = await createScheduler();

  // Cleanup zombie jobs left by previous daemon crash.
  // Active jobs > 5 min old on startup = orphaned (prev daemon died mid-run).
  // They won't be re-queued until next cron tick without this cleanup.
  try {
    const { rowCount } = await pool.query(`
UPDATE pgboss.job
SET state = 'failed'::pgboss.job_state,
completed_on = NOW(),
output = '{"message":"startup_zombie_cleanup"}'::jsonb
WHERE state = 'active'
AND started_on < NOW() - INTERVAL '5 minutes'
`);
    if (rowCount && rowCount > 0) {
      console.log(`Startup cleanup: ${rowCount} zombie job(s) marked failed.\n`);
    }
  } catch (err) {
    // Best-effort: a failed cleanup must not prevent the daemon from starting.
    const detail = err instanceof Error ? err.message : String(err);
    console.warn("Startup zombie cleanup failed (non-fatal):", detail);
  }

  await registerSchedules(boss);
  await registerWorkers(boss);
  console.log("\nScheduler running. Press Ctrl+C to stop.\n");

  // Graceful shutdown. Guarded so that a second SIGINT/SIGTERM arriving while
  // boss.stop() is still pending does not run the teardown twice.
  // NOTE(review): presumably `pool` is a node-postgres Pool, whose end()
  // rejects when called a second time — confirm against ./utils/db.
  let shuttingDown = false;
  const shutdown = async () => {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;
    console.log("\nShutting down...");
    await boss.stop();
    await pool.end();
    process.exit(0);
  };
  process.on("SIGINT", shutdown);
  process.on("SIGTERM", shutdown);
}
// Every flag that selects one-shot mode. This list must contain each flag that
// runOnce() checks: a flag missing here falls through to the scheduler branch
// below and starts the long-running daemon instead of a single run.
const ALL_FLAGS = [
  "--all",
  "--fs",
  "--cisco",
  "--optcore",
  "--news",
  "--flexoptix",
  "--vendors",
  "--10gtek",
  "--champion",
  "--fluxlight",
  "--sfpcables",
  "--gbics",
  "--prolabs",
  "--naddod",
  "--qsfptek",
  "--addon",
  "--juniper",
  "--switches",
  "--whitebox",
  "--switches-ext",
  "--switches-bulk",
  "--flexoptix-vendors",
  "--sonic-hcl",
  "--edgecore",
  "--ufispace",
  "--switch-assets",
  "--switch-crawl",
  "--switch-crawl-pw",
  "--fetch-only",
  "--atgbics",
];

// Any recognised flag → run the selected scrapers once; otherwise start the scheduler daemon.
if (args.some((a) => ALL_FLAGS.includes(a))) {
  runOnce().catch((err) => {
    console.error("Fatal:", err);
    process.exit(1);
  });
} else {
  runScheduler().catch((err) => {
    console.error("Fatal:", err);
    process.exit(1);
  });
}