- scheduler: patch boss.schedule() to call createQueue() first (idempotent), fixing FK constraint errors after DB reset — no need to touch 277 call sites - index: registerWorkers() before registerSchedules() since boss.work() must register handlers before schedules fire - dashboard: fix switchBlogLlm() to use api() helper (adds Bearer auth token) instead of raw fetch() which was returning 401 Unauthorized
258 lines
11 KiB
TypeScript
258 lines
11 KiB
TypeScript
/**
 * TIP Scraper Engine — Main entry point.
 *
 * Usage:
 *   tsx src/index.ts                     — Start scheduler (production mode)
 *   tsx src/index.ts --all               — Run all scrapers once
 *   tsx src/index.ts --fs                — Run FS.com scraper once
 *   tsx src/index.ts --cisco             — Run Cisco TMG scraper once
 *   tsx src/index.ts --optcore           — Run Optcore scraper once
 *   tsx src/index.ts --news              — Run news aggregator once
 *   tsx src/index.ts --flexoptix         — Run Flexoptix catalog scraper once
 *   tsx src/index.ts --vendors           — Run Flexoptix vendor list scraper once
 *   tsx src/index.ts --flexoptix-vendors — Seed Flexoptix supported-vendor list
 *   tsx src/index.ts --10gtek            — Run 10Gtek scraper once
 *   tsx src/index.ts --champion          — Run Champion ONE scraper once
 *   tsx src/index.ts --fluxlight         — Run Fluxlight scraper once
 *   tsx src/index.ts --sfpcables         — Run sfpcables scraper once
 *   tsx src/index.ts --gbics             — Run GBICS.com scraper once
 *   tsx src/index.ts --prolabs           — Run ProLabs scraper once
 *   tsx src/index.ts --juniper           — Run Juniper HCT scraper once
 *   tsx src/index.ts --switches          — Seed switch/router database
 *   tsx src/index.ts --whitebox          — Seed whitebox switch database (Edgecore, Celestica, etc.)
 *   tsx src/index.ts --switches-ext      — Seed extended switches (Fortinet, MikroTik, Industrial, etc.)
 *   tsx src/index.ts --sonic-hcl         — Scrape SONiC Hardware Compatibility List
 *   tsx src/index.ts --edgecore          — Scrape Edgecore product catalog
 *   tsx src/index.ts --ufispace          — Scrape UfiSpace product catalog
 *   tsx src/index.ts --switch-assets     — Scrape switch assets via URL patterns
 *   tsx src/index.ts --switch-crawl      — Crawl switch assets (Cheerio, static HTML vendors)
 *   tsx src/index.ts --switch-crawl-pw   — Crawl switch assets (Playwright, JS-heavy vendors)
 *   tsx src/index.ts --fetch-only        — Run only fetch-based scrapers (no Playwright)
 *   tsx src/index.ts --atgbics           — Run ATGBICS scraper once
 *   tsx src/index.ts --naddod            — Run NADDOD scraper once
 *   tsx src/index.ts --qsfptek           — Run QSFPTEK scraper once
 *   tsx src/index.ts --addon             — Run AddOn Networks scraper once
 *   tsx src/index.ts --fiber24           — Run ShopFiber24 scraper once (sitemap-based)
 *   tsx src/index.ts --fibermall         — Run FiberMall scraper once
 *   tsx src/index.ts --backfill-images   — Fill missing transceiver product photos
 *
 * The three switch-asset commands (--switch-assets, --switch-crawl,
 * --switch-crawl-pw) additionally accept --vendor=<name>, which is passed
 * through to the scraper.
 */
|
|
import { createScheduler, registerSchedules, registerWorkers } from "./scheduler";
|
|
import { scrapeFs } from "./scrapers/fs-com";
|
|
import { scrapeCiscoTmg } from "./scrapers/cisco-tmg";
|
|
import { scrapeOptcore } from "./scrapers/optcore";
|
|
import { scrapeNews } from "./scrapers/news";
|
|
import { scrapeFlexoptixCatalog } from "./scrapers/flexoptix-catalog";
|
|
import { scrapeFlexoptixVendors } from "./scrapers/flexoptix-vendors";
|
|
import { scrape10Gtek } from "./scrapers/tenGtek";
|
|
import { scrapeChampionOne } from "./scrapers/champion-one";
|
|
import { scrapeFluxlight } from "./scrapers/fluxlight";
|
|
import { scrapeSfpCables } from "./scrapers/sfpcables";
|
|
import { scrapeGbics } from "./scrapers/gbics";
|
|
import { scrapeJuniperHct } from "./scrapers/juniper-hct";
|
|
import { seedSwitches } from "./scrapers/switch-seed";
|
|
import { seedWhiteboxSwitches } from "./scrapers/whitebox-seed";
|
|
import { seedFlexoptixVendors } from "./scrapers/flexoptix-supported-vendors";
|
|
import { scrapeSonicHcl } from "./scrapers/sonic-hcl";
|
|
import { scrapeEdgecore } from "./scrapers/edgecore";
|
|
import { scrapeUfiSpace } from "./scrapers/ufispace";
|
|
import { seedExtendedSwitches } from "./scrapers/switch-seed-extended";
|
|
import { seedBulkSwitches } from "./scrapers/switch-seed-bulk";
|
|
import { scrapeSwitchAssets } from "./scrapers/switch-assets";
|
|
import { crawlSwitchAssets } from "./scrapers/switch-assets-crawler";
|
|
import { crawlSwitchAssetsPlaywright } from "./scrapers/switch-assets-playwright";
|
|
import { scrapeAtgbics } from "./scrapers/atgbics";
|
|
import { scrapeProLabs } from "./scrapers/prolabs";
|
|
import { scrapeNaddod } from "./scrapers/naddod";
|
|
import { scrapeQsfptek } from "./scrapers/qsfptek";
|
|
import { scrapeAddonNetworks } from "./scrapers/addon-networks";
|
|
import { scrapeFiber24 } from "./scrapers/fiber24";
|
|
import { scrapeFiberMall } from "./scrapers/fibermall";
|
|
import { pool } from "./utils/db";
|
|
|
|
const args = process.argv.slice(2);
|
|
const isAll = args.includes("--all");
|
|
const isFetchOnly = args.includes("--fetch-only");
|
|
|
|
/**
 * One-shot mode: run every scraper/seeder selected by the CLI flags, strictly
 * sequentially and in the fixed order of the table below, then close the
 * database pool.
 *
 * Each step is selected by one of four rules:
 *  - "fetch":      its own flag, `--all`, or `--fetch-only`
 *  - "all":        its own flag or `--all`
 *  - "explicit":   its own flag only
 *  - "playwright": its own flag or `--all`, but never when `--fetch-only` is set
 *
 * A failing step rejects the returned promise immediately; later steps and
 * the pool shutdown are then skipped (the caller handles the rejection).
 */
async function runOnce(): Promise<void> {
  type Trigger = "fetch" | "all" | "explicit" | "playwright";

  interface Step {
    flag: string;
    trigger: Trigger;
    run: () => unknown;
  }

  // Optional `--vendor=<name>` value forwarded to the switch-asset steps.
  const vendor = args.find((a) => a.startsWith("--vendor="))?.split("=")[1];

  const steps: Step[] = [
    // Fetch-based scrapers (no Playwright/Chromium needed — fast, reliable)
    { flag: "--flexoptix", trigger: "fetch", run: () => scrapeFlexoptixCatalog() },
    { flag: "--vendors", trigger: "fetch", run: () => scrapeFlexoptixVendors() },
    { flag: "--10gtek", trigger: "fetch", run: () => scrape10Gtek() },
    { flag: "--champion", trigger: "fetch", run: () => scrapeChampionOne() },
    { flag: "--fluxlight", trigger: "fetch", run: () => scrapeFluxlight() },
    { flag: "--sfpcables", trigger: "fetch", run: () => scrapeSfpCables() },
    { flag: "--gbics", trigger: "fetch", run: () => scrapeGbics() },
    { flag: "--prolabs", trigger: "fetch", run: () => scrapeProLabs() },
    { flag: "--naddod", trigger: "fetch", run: () => scrapeNaddod() },
    { flag: "--qsfptek", trigger: "fetch", run: () => scrapeQsfptek() },
    { flag: "--addon", trigger: "fetch", run: () => scrapeAddonNetworks() },
    { flag: "--juniper", trigger: "fetch", run: () => scrapeJuniperHct() },
    { flag: "--switches", trigger: "fetch", run: () => seedSwitches() },
    { flag: "--whitebox", trigger: "fetch", run: () => seedWhiteboxSwitches() },
    { flag: "--flexoptix-vendors", trigger: "fetch", run: () => seedFlexoptixVendors() },
    { flag: "--switches-ext", trigger: "fetch", run: () => seedExtendedSwitches() },
    { flag: "--switches-bulk", trigger: "fetch", run: () => seedBulkSwitches() },
    { flag: "--sonic-hcl", trigger: "fetch", run: () => scrapeSonicHcl() },
    { flag: "--news", trigger: "fetch", run: () => scrapeNews() },
    { flag: "--switch-assets", trigger: "all", run: () => scrapeSwitchAssets(vendor) },
    { flag: "--switch-crawl", trigger: "all", run: () => crawlSwitchAssets(vendor) },

    // Crawlee-based scrapers (Cheerio, no Playwright needed)
    { flag: "--edgecore", trigger: "all", run: () => scrapeEdgecore() },
    { flag: "--ufispace", trigger: "all", run: () => scrapeUfiSpace() },

    // Fetch-based price scrapers (added 2026-04-18)
    // ATGBICS: static HTML, no Playwright (rewritten 2026-04-18)
    { flag: "--atgbics", trigger: "fetch", run: () => scrapeAtgbics() },
    // Fiber24: sitemap-based, microdata parsing
    { flag: "--fiber24", trigger: "fetch", run: () => scrapeFiber24() },
    { flag: "--fibermall", trigger: "fetch", run: () => scrapeFiberMall() },

    // Image backfill is loaded lazily and only runs when asked for by name.
    {
      flag: "--backfill-images",
      trigger: "explicit",
      run: async () => {
        const { backfillImages } = await import("./utils/backfill-images");
        await backfillImages();
      },
    },

    // Playwright-based scrapers (need Chromium installed)
    { flag: "--fs", trigger: "playwright", run: () => scrapeFs() },
    { flag: "--cisco", trigger: "playwright", run: () => scrapeCiscoTmg() },
    { flag: "--optcore", trigger: "playwright", run: () => scrapeOptcore() },
    { flag: "--switch-crawl-pw", trigger: "playwright", run: () => crawlSwitchAssetsPlaywright(vendor) },
  ];

  const isSelected = ({ flag, trigger }: Step): boolean => {
    const requested = args.includes(flag);
    switch (trigger) {
      case "fetch":
        return requested || isAll || isFetchOnly;
      case "all":
        return requested || isAll;
      case "explicit":
        return requested;
      case "playwright":
        return !isFetchOnly && (requested || isAll);
    }
  };

  for (const step of steps) {
    if (isSelected(step)) {
      await step.run();
    }
  }

  await pool.end();
}
|
|
|
|
/**
 * Daemon mode: start the pg-boss scheduler and keep the process alive until
 * SIGINT/SIGTERM.
 *
 * Steps, in order:
 *  1. Install an `unhandledRejection` filter (see comment below).
 *  2. Create the scheduler via `createScheduler()`.
 *  3. Mark stale `active` rows in `pgboss.job` as failed (best effort).
 *  4. Register workers, then schedules.
 *  5. Install signal handlers that stop the scheduler, close the DB pool and
 *     exit with code 0.
 *
 * Rejects if scheduler creation or worker/schedule registration fails; the
 * startup cleanup query is non-fatal and only logs a warning.
 */
async function runScheduler(): Promise<void> {
  console.log("=== TIP Scraper Engine ===\n");
  console.log("Mode: Scheduler (pg-boss)\n");

  // Crawlee's FileSystemStorage fires spurious unhandledRejection errors after
  // crawler.run() resolves: its internal task loop schedules one final
  // _isTaskReadyFunction call that tries to mkdir/lock a request file that
  // Crawlee already cleaned up. If not suppressed, Node.js calls process.exit(1)
  // and takes down the whole scheduler daemon. Swallow ENOENT from request_queues
  // paths (the Crawlee storage root); re-raise everything else.
  process.on("unhandledRejection", (reason) => {
    const msg = reason instanceof Error ? reason.message : String(reason);
    if ((msg.includes("ENOENT") || msg.includes("no such file")) && msg.includes("request_queues")) {
      // Benign Crawlee post-run lock-file race — ignore
      return;
    }
    console.error("[scheduler] Unhandled rejection:", reason);
    process.exit(1);
  });

  const boss = await createScheduler();

  // Cleanup zombie jobs left by previous daemon crash.
  // Active jobs > 5 min old on startup = orphaned (prev daemon died mid-run).
  // They won't be re-queued until next cron tick without this cleanup.
  try {
    const { rowCount } = await pool.query(`
      UPDATE pgboss.job
      SET state = 'failed'::pgboss.job_state,
          completed_on = NOW(),
          output = '{"message":"startup_zombie_cleanup"}'::jsonb
      WHERE state = 'active'
        AND started_on < NOW() - INTERVAL '5 minutes'
    `);
    if (rowCount && rowCount > 0) {
      console.log(`Startup cleanup: ${rowCount} zombie job(s) marked failed.\n`);
    }
  } catch (err: unknown) {
    // Narrow instead of asserting: a non-Error throwable still yields a message.
    const detail = err instanceof Error ? err.message : String(err);
    console.warn("Startup zombie cleanup failed (non-fatal):", detail);
  }

  // Workers must register before schedules — boss.work() auto-creates queues,
  // boss.schedule() requires the queue to already exist (pg-boss v10 FK constraint)
  await registerWorkers(boss);
  await registerSchedules(boss);

  console.log("\nScheduler running. Press Ctrl+C to stop.\n");

  // Graceful shutdown. Guarded so that a repeated signal (e.g. Ctrl+C pressed
  // twice while boss.stop() is still draining) does not run the teardown a
  // second time: a repeated pool.end() would reject (assuming `pool` is a
  // node-postgres Pool), and that rejection would hit the unhandledRejection
  // handler above and turn a clean shutdown into exit code 1.
  let shuttingDown = false;
  const shutdown = async (): Promise<void> => {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;

    console.log("\nShutting down...");
    await boss.stop();
    await pool.end();
    process.exit(0);
  };

  process.on("SIGINT", shutdown);
  process.on("SIGTERM", shutdown);
}
|
|
|
|
/**
 * Every CLI flag that selects one-shot mode. If at least one of these is
 * present on the command line the process runs `runOnce()`; otherwise it
 * starts the scheduler daemon. Keep this list in sync with the flags checked
 * inside `runOnce()` — a flag missing here would silently start the scheduler
 * instead of running its scraper.
 */
const ALL_FLAGS: readonly string[] = [
  "--all",
  "--fs",
  "--cisco",
  "--optcore",
  "--news",
  "--flexoptix",
  "--vendors",
  "--10gtek",
  "--champion",
  "--fluxlight",
  "--sfpcables",
  "--gbics",
  "--prolabs",
  "--naddod",
  "--qsfptek",
  "--addon",
  "--juniper",
  "--switches",
  "--whitebox",
  "--switches-ext",
  // Previously missing: `--switches-bulk` on its own fell through to scheduler mode.
  "--switches-bulk",
  "--flexoptix-vendors",
  "--sonic-hcl",
  "--edgecore",
  "--ufispace",
  "--switch-assets",
  "--switch-crawl",
  "--switch-crawl-pw",
  "--fetch-only",
  "--atgbics",
  "--fiber24",
  "--fibermall",
  "--backfill-images",
];

// Shared last-resort handler for both modes: log the error and exit non-zero.
const exitOnFatal = (err: unknown): never => {
  console.error("Fatal:", err);
  process.exit(1);
};

// Mode selection: any recognised one-shot flag → run once; no flag → scheduler.
if (args.some((a) => ALL_FLAGS.includes(a))) {
  runOnce().catch(exitOnFatal);
} else {
  runScheduler().catch(exitOnFatal);
}
|