Rene Fichtmueller 199f36be48 fix(scraper): auto-create pg-boss queues before scheduling + worker/schedule order
- scheduler: patch boss.schedule() to call createQueue() first (idempotent),
  fixing FK constraint errors after DB reset — no need to touch 277 call sites
- index: registerWorkers() before registerSchedules() since boss.work() must
  register handlers before schedules fire
- dashboard: fix switchBlogLlm() to use api() helper (adds Bearer auth token)
  instead of raw fetch() which was returning 401 Unauthorized
2026-04-29 16:14:25 +02:00

258 lines
11 KiB
TypeScript

/**
* TIP Scraper Engine — Main entry point.
*
* Usage:
* tsx src/index.ts — Start scheduler (production mode)
* tsx src/index.ts --all — Run all scrapers once
* tsx src/index.ts --fs — Run FS.com scraper once
* tsx src/index.ts --cisco — Run Cisco TMG scraper once
* tsx src/index.ts --optcore — Run Optcore scraper once
* tsx src/index.ts --news — Run news aggregator once
* tsx src/index.ts --flexoptix — Run Flexoptix catalog scraper once
* tsx src/index.ts --vendors — Run Flexoptix vendor list scraper once
* tsx src/index.ts --10gtek — Run 10Gtek scraper once
* tsx src/index.ts --champion — Run Champion ONE scraper once
* tsx src/index.ts --fluxlight — Run Fluxlight scraper once
* tsx src/index.ts --gbics — Run GBICS.com scraper once
* tsx src/index.ts --prolabs — Run ProLabs scraper once
* tsx src/index.ts --juniper — Run Juniper HCT scraper once
* tsx src/index.ts --switches — Seed switch/router database
* tsx src/index.ts --whitebox — Seed whitebox switch database (Edgecore, Celestica, etc.)
* tsx src/index.ts --switches-ext — Seed extended switches (Fortinet, MikroTik, Industrial, etc.)
* tsx src/index.ts --sonic-hcl — Scrape SONiC Hardware Compatibility List
* tsx src/index.ts --edgecore — Scrape Edgecore product catalog
* tsx src/index.ts --ufispace — Scrape UfiSpace product catalog
* tsx src/index.ts --switch-assets — Scrape switch assets via URL patterns
* tsx src/index.ts --switch-crawl — Crawl switch assets (Cheerio, static HTML vendors)
* tsx src/index.ts --switch-crawl-pw — Crawl switch assets (Playwright, JS-heavy vendors)
* tsx src/index.ts --fetch-only — Run only fetch-based scrapers (no Playwright)
* tsx src/index.ts --atgbics — Run ATGBICS scraper once
* tsx src/index.ts --naddod — Run NADDOD scraper once
* tsx src/index.ts --qsfptek — Run QSFPTEK scraper once
* tsx src/index.ts --addon — Run AddOn Networks scraper once
* tsx src/index.ts --fiber24 — Run ShopFiber24 scraper once (sitemap-based)
* tsx src/index.ts --fibermall — Run FiberMall scraper once
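* tsx src/index.ts --backfill-images — Fill missing transceiver product photos
* tsx src/index.ts --sfpcables — Run sfpcables scraper once
* tsx src/index.ts --flexoptix-vendors — Seed Flexoptix supported vendors
*
* Optional: --vendor=<name> is passed through to --switch-assets,
* --switch-crawl and --switch-crawl-pw.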
*/
import { createScheduler, registerSchedules, registerWorkers } from "./scheduler";
import { scrapeFs } from "./scrapers/fs-com";
import { scrapeCiscoTmg } from "./scrapers/cisco-tmg";
import { scrapeOptcore } from "./scrapers/optcore";
import { scrapeNews } from "./scrapers/news";
import { scrapeFlexoptixCatalog } from "./scrapers/flexoptix-catalog";
import { scrapeFlexoptixVendors } from "./scrapers/flexoptix-vendors";
import { scrape10Gtek } from "./scrapers/tenGtek";
import { scrapeChampionOne } from "./scrapers/champion-one";
import { scrapeFluxlight } from "./scrapers/fluxlight";
import { scrapeSfpCables } from "./scrapers/sfpcables";
import { scrapeGbics } from "./scrapers/gbics";
import { scrapeJuniperHct } from "./scrapers/juniper-hct";
import { seedSwitches } from "./scrapers/switch-seed";
import { seedWhiteboxSwitches } from "./scrapers/whitebox-seed";
import { seedFlexoptixVendors } from "./scrapers/flexoptix-supported-vendors";
import { scrapeSonicHcl } from "./scrapers/sonic-hcl";
import { scrapeEdgecore } from "./scrapers/edgecore";
import { scrapeUfiSpace } from "./scrapers/ufispace";
import { seedExtendedSwitches } from "./scrapers/switch-seed-extended";
import { seedBulkSwitches } from "./scrapers/switch-seed-bulk";
import { scrapeSwitchAssets } from "./scrapers/switch-assets";
import { crawlSwitchAssets } from "./scrapers/switch-assets-crawler";
import { crawlSwitchAssetsPlaywright } from "./scrapers/switch-assets-playwright";
import { scrapeAtgbics } from "./scrapers/atgbics";
import { scrapeProLabs } from "./scrapers/prolabs";
import { scrapeNaddod } from "./scrapers/naddod";
import { scrapeQsfptek } from "./scrapers/qsfptek";
import { scrapeAddonNetworks } from "./scrapers/addon-networks";
import { scrapeFiber24 } from "./scrapers/fiber24";
import { scrapeFiberMall } from "./scrapers/fibermall";
import { pool } from "./utils/db";
// CLI arguments with the leading runtime path and script path dropped.
const [, , ...args] = process.argv;
// --all: one-shot run of every scraper/seeder.
const isAll = args.indexOf("--all") !== -1;
// --fetch-only: one-shot run restricted to scrapers that do not need Playwright.
const isFetchOnly = args.some((arg) => arg === "--fetch-only");
/**
 * One-shot mode: runs every scraper/seeder selected by the CLI flags, strictly
 * one after another in the order of the step table below, then closes the
 * shared database pool.
 *
 * Selection rule per step ("trigger"):
 *  - "fetch":    the step's own flag, --all, or --fetch-only
 *  - "all":      the step's own flag or --all
 *  - "explicit": the step's own flag only
 *  - "browser":  the step's own flag or --all, but never when --fetch-only is set
 *
 * A rejection from any step propagates to the caller immediately; the
 * remaining steps and the final pool.end() are then skipped.
 */
async function runOnce(): Promise<void> {
  type Trigger = "fetch" | "all" | "explicit" | "browser";
  interface Step {
    flag: string;
    trigger: Trigger;
    run: () => unknown;
  }

  // Optional --vendor=<name> value, forwarded to the switch-asset scrapers.
  const vendor = args.find((a) => a.startsWith("--vendor="))?.split("=")[1];

  const steps: Step[] = [
    // Fetch-based scrapers (no Playwright/Chromium needed — fast, reliable)
    { flag: "--flexoptix", trigger: "fetch", run: () => scrapeFlexoptixCatalog() },
    { flag: "--vendors", trigger: "fetch", run: () => scrapeFlexoptixVendors() },
    { flag: "--10gtek", trigger: "fetch", run: () => scrape10Gtek() },
    { flag: "--champion", trigger: "fetch", run: () => scrapeChampionOne() },
    { flag: "--fluxlight", trigger: "fetch", run: () => scrapeFluxlight() },
    { flag: "--sfpcables", trigger: "fetch", run: () => scrapeSfpCables() },
    { flag: "--gbics", trigger: "fetch", run: () => scrapeGbics() },
    { flag: "--prolabs", trigger: "fetch", run: () => scrapeProLabs() },
    { flag: "--naddod", trigger: "fetch", run: () => scrapeNaddod() },
    { flag: "--qsfptek", trigger: "fetch", run: () => scrapeQsfptek() },
    { flag: "--addon", trigger: "fetch", run: () => scrapeAddonNetworks() },
    { flag: "--juniper", trigger: "fetch", run: () => scrapeJuniperHct() },
    { flag: "--switches", trigger: "fetch", run: () => seedSwitches() },
    { flag: "--whitebox", trigger: "fetch", run: () => seedWhiteboxSwitches() },
    { flag: "--flexoptix-vendors", trigger: "fetch", run: () => seedFlexoptixVendors() },
    { flag: "--switches-ext", trigger: "fetch", run: () => seedExtendedSwitches() },
    { flag: "--switches-bulk", trigger: "fetch", run: () => seedBulkSwitches() },
    { flag: "--sonic-hcl", trigger: "fetch", run: () => scrapeSonicHcl() },
    { flag: "--news", trigger: "fetch", run: () => scrapeNews() },
    { flag: "--switch-assets", trigger: "all", run: () => scrapeSwitchAssets(vendor) },
    { flag: "--switch-crawl", trigger: "all", run: () => crawlSwitchAssets(vendor) },
    // Crawlee-based scrapers (Cheerio, no Playwright needed)
    { flag: "--edgecore", trigger: "all", run: () => scrapeEdgecore() },
    { flag: "--ufispace", trigger: "all", run: () => scrapeUfiSpace() },
    // Fetch-based price scrapers (added 2026-04-18)
    // ATGBICS: static HTML, no Playwright (rewritten 2026-04-18)
    { flag: "--atgbics", trigger: "fetch", run: () => scrapeAtgbics() },
    // Fiber24: sitemap-based, microdata parsing
    { flag: "--fiber24", trigger: "fetch", run: () => scrapeFiber24() },
    { flag: "--fibermall", trigger: "fetch", run: () => scrapeFiberMall() },
    {
      flag: "--backfill-images",
      trigger: "explicit",
      // Loaded on demand so the module is only imported when the flag is given.
      run: async () => {
        const { backfillImages } = await import("./utils/backfill-images");
        await backfillImages();
      },
    },
    // Playwright-based scrapers (need Chromium installed)
    { flag: "--fs", trigger: "browser", run: () => scrapeFs() },
    { flag: "--cisco", trigger: "browser", run: () => scrapeCiscoTmg() },
    { flag: "--optcore", trigger: "browser", run: () => scrapeOptcore() },
    { flag: "--switch-crawl-pw", trigger: "browser", run: () => crawlSwitchAssetsPlaywright(vendor) },
  ];

  // Decides whether a step runs for the current command line.
  const isSelected = ({ flag, trigger }: Step): boolean => {
    const requested = args.includes(flag);
    switch (trigger) {
      case "fetch":
        return requested || isAll || isFetchOnly;
      case "all":
        return requested || isAll;
      case "explicit":
        return requested;
      case "browser":
        return !isFetchOnly && (requested || isAll);
    }
  };

  for (const step of steps) {
    if (isSelected(step)) {
      await step.run();
    }
  }

  await pool.end();
}
/**
 * Scheduler (daemon) mode.
 *
 * Steps, in order:
 *  1. Install an unhandledRejection handler that ignores one known-benign
 *     Crawlee error and exits with code 1 on anything else.
 *  2. Create the pg-boss instance via createScheduler().
 *  3. Mark long-running "active" jobs as failed (best effort; failure is logged
 *     and does not abort startup).
 *  4. Register workers, then schedules.
 *  5. Install SIGINT/SIGTERM handlers that stop pg-boss, close the DB pool and
 *     exit with code 0.
 *
 * The returned promise resolves once startup is complete; the process then
 * stays alive until a shutdown signal arrives.
 */
async function runScheduler(): Promise<void> {
  console.log("=== TIP Scraper Engine ===\n");
  console.log("Mode: Scheduler (pg-boss)\n");

  // Crawlee's FileSystemStorage fires spurious unhandledRejection errors after
  // crawler.run() resolves: its internal task loop schedules one final
  // _isTaskReadyFunction call that tries to mkdir/lock a request file that
  // Crawlee already cleaned up. If not suppressed, Node.js calls process.exit(1)
  // and takes down the whole scheduler daemon. Swallow ENOENT from request_queues
  // paths (the Crawlee storage root); re-raise everything else.
  process.on("unhandledRejection", (reason) => {
    const msg = reason instanceof Error ? reason.message : String(reason);
    if ((msg.includes("ENOENT") || msg.includes("no such file")) && msg.includes("request_queues")) {
      // Benign Crawlee post-run lock-file race — ignore
      return;
    }
    console.error("[scheduler] Unhandled rejection:", reason);
    process.exit(1);
  });

  const boss = await createScheduler();

  // Cleanup zombie jobs left by previous daemon crash.
  // Active jobs > 5 min old on startup = orphaned (prev daemon died mid-run).
  // They won't be re-queued until next cron tick without this cleanup.
  try {
    const { rowCount } = await pool.query(`
UPDATE pgboss.job
SET state = 'failed'::pgboss.job_state,
completed_on = NOW(),
output = '{"message":"startup_zombie_cleanup"}'::jsonb
WHERE state = 'active'
AND started_on < NOW() - INTERVAL '5 minutes'
`);
    if (rowCount && rowCount > 0) {
      console.log(`Startup cleanup: ${rowCount} zombie job(s) marked failed.\n`);
    }
  } catch (err) {
    // Narrow instead of asserting: a non-Error throw would otherwise log "undefined".
    const detail = err instanceof Error ? err.message : String(err);
    console.warn("Startup zombie cleanup failed (non-fatal):", detail);
  }

  // Workers must register before schedules — boss.work() auto-creates queues,
  // boss.schedule() requires the queue to already exist (pg-boss v10 FK constraint)
  await registerWorkers(boss);
  await registerSchedules(boss);
  console.log("\nScheduler running. Press Ctrl+C to stop.\n");

  // Graceful shutdown. The flag makes the handler run at most once: without it,
  // a second SIGINT/SIGTERM while boss.stop() is still in progress would call
  // boss.stop() and pool.end() again, and the resulting rejection would reach
  // the unhandledRejection handler above and turn a clean stop into exit(1).
  let shuttingDown = false;
  const shutdown = async () => {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;
    console.log("\nShutting down...");
    await boss.stop();
    await pool.end();
    process.exit(0);
  };
  process.on("SIGINT", shutdown);
  process.on("SIGTERM", shutdown);
}
// Flags that select one-shot mode. Every flag that runOnce() checks must be
// listed here; a flag missing from this list falls through to scheduler mode
// when it is the only argument (this was the case for --switches-bulk).
const ALL_FLAGS = [
  "--all",
  "--fs",
  "--cisco",
  "--optcore",
  "--news",
  "--flexoptix",
  "--vendors",
  "--10gtek",
  "--champion",
  "--fluxlight",
  "--sfpcables",
  "--gbics",
  "--prolabs",
  "--naddod",
  "--qsfptek",
  "--addon",
  "--juniper",
  "--switches",
  "--whitebox",
  "--switches-ext",
  "--switches-bulk",
  "--flexoptix-vendors",
  "--sonic-hcl",
  "--edgecore",
  "--ufispace",
  "--switch-assets",
  "--switch-crawl",
  "--switch-crawl-pw",
  "--fetch-only",
  "--atgbics",
  "--fiber24",
  "--fibermall",
  "--backfill-images",
];

// Any recognised one-shot flag → run once and exit; otherwise start the
// pg-boss scheduler daemon. Both modes share the same fatal-error handling.
const entryPoint = args.some((a) => ALL_FLAGS.includes(a)) ? runOnce : runScheduler;
entryPoint().catch((err: unknown) => {
  console.error("Fatal:", err);
  process.exit(1);
});