/**
 * TIP Scraper Engine — Main entry point.
 *
 * Usage:
 *   tsx src/index.ts                      — Start scheduler (production mode)
 *   tsx src/index.ts --all                — Run all scrapers once
 *   tsx src/index.ts --fs                 — Run FS.com scraper once
 *   tsx src/index.ts --cisco              — Run Cisco TMG scraper once
 *   tsx src/index.ts --optcore            — Run Optcore scraper once
 *   tsx src/index.ts --news               — Run news aggregator once
 *   tsx src/index.ts --flexoptix          — Run Flexoptix catalog scraper once
 *   tsx src/index.ts --vendors            — Run Flexoptix vendor list scraper once
 *   tsx src/index.ts --flexoptix-vendors  — Seed Flexoptix supported vendors
 *   tsx src/index.ts --10gtek             — Run 10Gtek scraper once
 *   tsx src/index.ts --champion           — Run Champion ONE scraper once
 *   tsx src/index.ts --fluxlight          — Run Fluxlight scraper once
 *   tsx src/index.ts --sfpcables          — Run SfpCables scraper once
 *   tsx src/index.ts --gbics              — Run GBICS.com scraper once
 *   tsx src/index.ts --prolabs            — Run ProLabs scraper once
 *   tsx src/index.ts --juniper            — Run Juniper HCT scraper once
 *   tsx src/index.ts --switches           — Seed switch/router database
 *   tsx src/index.ts --whitebox           — Seed whitebox switch database (Edgecore, Celestica, etc.)
 *   tsx src/index.ts --switches-ext       — Seed extended switches (Fortinet, MikroTik, Industrial, etc.)
 *   tsx src/index.ts --switches-bulk      — Seed bulk switches
 *   tsx src/index.ts --sonic-hcl          — Scrape SONiC Hardware Compatibility List
 *   tsx src/index.ts --edgecore           — Scrape Edgecore product catalog
 *   tsx src/index.ts --ufispace           — Scrape UfiSpace product catalog
 *   tsx src/index.ts --switch-assets      — Scrape switch assets via URL patterns
 *   tsx src/index.ts --switch-crawl       — Crawl switch assets (Cheerio, static HTML vendors)
 *   tsx src/index.ts --switch-crawl-pw    — Crawl switch assets (Playwright, JS-heavy vendors)
 *   tsx src/index.ts --fetch-only         — Run only fetch-based scrapers (no Playwright)
 *   tsx src/index.ts --atgbics            — Run ATGBICS scraper once
 *   tsx src/index.ts --naddod             — Run NADDOD scraper once
 *   tsx src/index.ts --qsfptek            — Run QSFPTEK scraper once
 *   tsx src/index.ts --addon              — Run AddOn Networks scraper once
 *   tsx src/index.ts --fiber24            — Run ShopFiber24 scraper once (sitemap-based)
 *   tsx src/index.ts --fibermall          — Run FiberMall scraper once
 *
 * Options:
 *   --vendor=<name>  — Passed through to --switch-assets, --switch-crawl and
 *                      --switch-crawl-pw. It does not select run-once mode by itself.
 */

import { createScheduler, registerSchedules, registerWorkers } from "./scheduler";
import { scrapeFs } from "./scrapers/fs-com";
import { scrapeCiscoTmg } from "./scrapers/cisco-tmg";
import { scrapeOptcore } from "./scrapers/optcore";
import { scrapeNews } from "./scrapers/news";
import { scrapeFlexoptixCatalog } from "./scrapers/flexoptix-catalog";
import { scrapeFlexoptixVendors } from "./scrapers/flexoptix-vendors";
import { scrape10Gtek } from "./scrapers/tenGtek";
import { scrapeChampionOne } from "./scrapers/champion-one";
import { scrapeFluxlight } from "./scrapers/fluxlight";
import { scrapeSfpCables } from "./scrapers/sfpcables";
import { scrapeGbics } from "./scrapers/gbics";
import { scrapeJuniperHct } from "./scrapers/juniper-hct";
import { seedSwitches } from "./scrapers/switch-seed";
import { seedWhiteboxSwitches } from "./scrapers/whitebox-seed";
import { seedFlexoptixVendors } from "./scrapers/flexoptix-supported-vendors";
import { scrapeSonicHcl } from "./scrapers/sonic-hcl";
import { scrapeEdgecore } from "./scrapers/edgecore";
import { scrapeUfiSpace } from "./scrapers/ufispace";
import { seedExtendedSwitches } from "./scrapers/switch-seed-extended";
import { seedBulkSwitches } from "./scrapers/switch-seed-bulk";
import { scrapeSwitchAssets } from "./scrapers/switch-assets";
import { crawlSwitchAssets } from "./scrapers/switch-assets-crawler";
import { crawlSwitchAssetsPlaywright } from "./scrapers/switch-assets-playwright";
import { scrapeAtgbics } from "./scrapers/atgbics";
import { scrapeProLabs } from "./scrapers/prolabs";
import { scrapeNaddod } from "./scrapers/naddod";
import { scrapeQsfptek } from "./scrapers/qsfptek";
import { scrapeAddonNetworks } from "./scrapers/addon-networks";
import { scrapeFiber24 } from "./scrapers/fiber24";
import { scrapeFiberMall } from "./scrapers/fibermall";
import { pool } from "./utils/db";

const args = process.argv.slice(2);
const isAll = args.includes("--all");
const isFetchOnly = args.includes("--fetch-only");

/** Prefix of the optional vendor filter argument (`--vendor=<name>`). */
const VENDOR_PREFIX = "--vendor=";

/**
 * How a task reacts to the two group flags:
 * - `fetch`      — runs on its own flag, `--all`, or `--fetch-only`.
 * - `allOnly`    — runs on its own flag or `--all`; `--fetch-only` does not select it.
 * - `playwright` — runs on its own flag or `--all`, but never when `--fetch-only`
 *                  is present (these need Chromium installed).
 */
type ScraperKind = "fetch" | "allOnly" | "playwright";

/** One run-once job: the CLI flag that selects it and the function it invokes. */
interface ScraperTask {
  readonly flag: string;
  readonly kind: ScraperKind;
  /** Result is awaited and discarded; `vendor` is the parsed `--vendor=` value, if any. */
  readonly run: (vendor: string | undefined) => unknown;
}

/**
 * Every run-once job, in execution order. This table is the single source of
 * truth for both `runOnce()` and `ALL_FLAGS`, so a newly added scraper cannot
 * be runnable via `--all` yet unrecognised as a standalone flag.
 */
const SCRAPER_TASKS: readonly ScraperTask[] = [
  // Fetch-based scrapers (no Playwright/Chromium needed — fast, reliable)
  { flag: "--flexoptix", kind: "fetch", run: () => scrapeFlexoptixCatalog() },
  { flag: "--vendors", kind: "fetch", run: () => scrapeFlexoptixVendors() },
  { flag: "--10gtek", kind: "fetch", run: () => scrape10Gtek() },
  { flag: "--champion", kind: "fetch", run: () => scrapeChampionOne() },
  { flag: "--fluxlight", kind: "fetch", run: () => scrapeFluxlight() },
  { flag: "--sfpcables", kind: "fetch", run: () => scrapeSfpCables() },
  { flag: "--gbics", kind: "fetch", run: () => scrapeGbics() },
  { flag: "--prolabs", kind: "fetch", run: () => scrapeProLabs() },
  { flag: "--naddod", kind: "fetch", run: () => scrapeNaddod() },
  { flag: "--qsfptek", kind: "fetch", run: () => scrapeQsfptek() },
  { flag: "--addon", kind: "fetch", run: () => scrapeAddonNetworks() },
  { flag: "--juniper", kind: "fetch", run: () => scrapeJuniperHct() },
  { flag: "--switches", kind: "fetch", run: () => seedSwitches() },
  { flag: "--whitebox", kind: "fetch", run: () => seedWhiteboxSwitches() },
  { flag: "--flexoptix-vendors", kind: "fetch", run: () => seedFlexoptixVendors() },
  { flag: "--switches-ext", kind: "fetch", run: () => seedExtendedSwitches() },
  { flag: "--switches-bulk", kind: "fetch", run: () => seedBulkSwitches() },
  { flag: "--sonic-hcl", kind: "fetch", run: () => scrapeSonicHcl() },
  { flag: "--news", kind: "fetch", run: () => scrapeNews() },

  // Switch asset jobs: not part of --fetch-only; accept the optional vendor filter.
  { flag: "--switch-assets", kind: "allOnly", run: (vendor) => scrapeSwitchAssets(vendor) },
  { flag: "--switch-crawl", kind: "allOnly", run: (vendor) => crawlSwitchAssets(vendor) },

  // Crawlee-based scrapers (Cheerio, no Playwright needed)
  { flag: "--edgecore", kind: "allOnly", run: () => scrapeEdgecore() },
  { flag: "--ufispace", kind: "allOnly", run: () => scrapeUfiSpace() },

  // Fetch-based price scrapers (added 2026-04-18)
  // atgbics: static HTML, no Playwright (rewritten 2026-04-18)
  { flag: "--atgbics", kind: "fetch", run: () => scrapeAtgbics() },
  // fiber24: sitemap-based, microdata parsing
  { flag: "--fiber24", kind: "fetch", run: () => scrapeFiber24() },
  { flag: "--fibermall", kind: "fetch", run: () => scrapeFiberMall() },

  // Playwright-based scrapers (need Chromium installed)
  { flag: "--fs", kind: "playwright", run: () => scrapeFs() },
  { flag: "--cisco", kind: "playwright", run: () => scrapeCiscoTmg() },
  { flag: "--optcore", kind: "playwright", run: () => scrapeOptcore() },
  { flag: "--switch-crawl-pw", kind: "playwright", run: (vendor) => crawlSwitchAssetsPlaywright(vendor) },
];

/**
 * Extracts the value of the first `--vendor=<name>` argument.
 *
 * Everything after the prefix is kept, so a value that itself contains `=`
 * is not truncated.
 *
 * @returns The vendor string (possibly empty), or `undefined` when the
 *          argument was not supplied.
 */
function getVendorFilter(): string | undefined {
  return args.find((arg) => arg.startsWith(VENDOR_PREFIX))?.slice(VENDOR_PREFIX.length);
}

/**
 * Decides whether a task is selected by the current command line.
 *
 * @param task - Entry from `SCRAPER_TASKS`.
 * @returns `true` when the task's own flag or an applicable group flag is present.
 */
function shouldRun(task: ScraperTask): boolean {
  const requested = isAll || args.includes(task.flag);
  switch (task.kind) {
    case "fetch":
      return requested || isFetchOnly;
    case "allOnly":
      return requested;
    case "playwright":
      // --fetch-only wins even over an explicit Playwright flag.
      return requested && !isFetchOnly;
  }
}

/**
 * Runs every selected task once, strictly sequentially in table order, then
 * closes the database pool.
 *
 * A rejection from any task propagates to the caller immediately; the pool is
 * only closed after all selected tasks have succeeded.
 */
async function runOnce(): Promise<void> {
  const vendor = getVendorFilter();

  for (const task of SCRAPER_TASKS) {
    if (shouldRun(task)) {
      await task.run(vendor);
    }
  }

  await pool.end();
}

/**
 * Starts the long-running scheduler daemon: installs the unhandled-rejection
 * filter, creates the pg-boss instance, fails orphaned jobs from a previous
 * run, registers schedules and workers, and wires up signal-driven shutdown.
 */
async function runScheduler(): Promise<void> {
  console.log("=== TIP Scraper Engine ===\n");
  console.log("Mode: Scheduler (pg-boss)\n");

  // Crawlee's FileSystemStorage fires spurious unhandledRejection errors after
  // crawler.run() resolves: its internal task loop schedules one final
  // _isTaskReadyFunction call that tries to mkdir/lock a request file that
  // Crawlee already cleaned up. If not suppressed, Node.js calls process.exit(1)
  // and takes down the whole scheduler daemon. Swallow ENOENT from request_queues
  // paths (the Crawlee storage root); re-raise everything else.
  process.on("unhandledRejection", (reason) => {
    const msg = reason instanceof Error ? reason.message : String(reason);
    const isMissingFile = msg.includes("ENOENT") || msg.includes("no such file");
    if (isMissingFile && msg.includes("request_queues")) {
      // Benign Crawlee post-run lock-file race — ignore
      return;
    }
    console.error("[scheduler] Unhandled rejection:", reason);
    process.exit(1);
  });

  const boss = await createScheduler();

  // Cleanup zombie jobs left by previous daemon crash.
  // Active jobs > 5 min old on startup = orphaned (prev daemon died mid-run).
  // They won't be re-queued until next cron tick without this cleanup.
  try {
    const { rowCount } = await pool.query(`
      UPDATE pgboss.job
      SET state = 'failed'::pgboss.job_state,
          completed_on = NOW(),
          output = '{"message":"startup_zombie_cleanup"}'::jsonb
      WHERE state = 'active'
        AND started_on < NOW() - INTERVAL '5 minutes'
    `);
    if ((rowCount ?? 0) > 0) {
      console.log(`Startup cleanup: ${rowCount} zombie job(s) marked failed.\n`);
    }
  } catch (err: unknown) {
    // Best-effort: the daemon is still useful without the cleanup.
    const detail = err instanceof Error ? err.message : String(err);
    console.warn("Startup zombie cleanup failed (non-fatal):", detail);
  }

  await registerSchedules(boss);
  await registerWorkers(boss);

  console.log("\nScheduler running. Press Ctrl+C to stop.\n");

  // Graceful shutdown. Guarded so a repeated SIGINT/SIGTERM while teardown is
  // already in flight does not stop the scheduler or end the pool a second time.
  let shuttingDown = false;
  const shutdown = async (): Promise<void> => {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;
    console.log("\nShutting down...");
    await boss.stop();
    await pool.end();
    process.exit(0);
  };

  // A rejected shutdown surfaces through the unhandledRejection handler above.
  process.on("SIGINT", () => void shutdown());
  process.on("SIGTERM", () => void shutdown());
}

/**
 * Flags that select run-once mode. Derived from the task table so it always
 * matches what `runOnce()` can actually execute.
 */
const ALL_FLAGS: ReadonlySet<string> = new Set([
  "--all",
  "--fetch-only",
  ...SCRAPER_TASKS.map((task) => task.flag),
]);

if (args.some((arg) => ALL_FLAGS.has(arg))) {
  runOnce().catch((err: unknown) => {
    console.error("Fatal:", err);
    process.exit(1);
  });
} else {
  runScheduler().catch((err: unknown) => {
    console.error("Fatal:", err);
    process.exit(1);
  });
}