/**
 * pg-boss Job Scheduler — 24/7 Continuous Scraping
 *
 * ARCHITECTURE:
 * - Erik (VPS, .82)     : Playwright-heavy scrapers (FS.com, 10Gtek, ATGBICS, ProLabs)
 *                         + all compatibility + eBay + compute + NAS sync
 * - Raspberry Pi Fleet  : Lightweight fetch/cheerio scrapers run continuously all day
 *                         (BlueOptics, Fiber24, T&S Com, Fluxlight, GBICs, Optcore,
 *                         Champion ONE, SFPCables, SmartOptics, HUBER+SUHNER, etc.)
 *
 * SCHEDULE PHILOSOPHY:
 * - Playwright scrapers: every 8h (resource-heavy, VPS only)
 * - Fetch/Cheerio scrapers: every 4h (lightweight, Pi-friendly)
 * - Catalog scrapers (Flexoptix): every 2h (fast GraphQL, primary price source)
 * - Compatibility matrices: every 12h (rarely change)
 * - eBay enrichment: every 6h
 * - Intelligence/community: every 6h (except scrape:docs, which runs every 12h)
 * - Compute jobs: after each pricing wave
 * - NAS sync: once a day at 07:55
 *
 * NOTE(review): no `tz` option is passed to `boss.schedule`, so the cron times below
 * are evaluated in pg-boss's default time zone (UTC per its docs) — confirm.
 */
import PgBoss from "pg-boss";
import { config } from "dotenv";
import { join } from "path";
import { rmSync, mkdirSync } from "fs";

/** Number of `withIsolatedStorage` calls currently in flight. */
let activeIsolatedRuns = 0;

/** Value of CRAWLEE_STORAGE_DIR observed before the first in-flight isolated run started. */
let baselineStorageDir: string | undefined;

/**
 * Run a scraper with an isolated Crawlee storage directory to prevent queue collisions.
 *
 * Creates `storage-<name>` three levels above this file, points CRAWLEE_STORAGE_DIR at it
 * for the duration of `fn`, then restores the environment and deletes the directory.
 *
 * The environment variable is process-global, so when several isolated runs overlap the
 * pre-existing value is only restored once the last of them has finished. Restoring per
 * call would put back a directory that belongs to another (possibly already deleted) run.
 * NOTE(review): overlapping runs still share one variable while they execute — confirm
 * that each scraper reads it only while starting up.
 *
 * @param name Suffix of the storage directory (`storage-<name>`).
 * @param fn   Scraper to execute; its resolved value is ignored.
 * @throws Whatever `fn` rejects with (cleanup still runs).
 */
async function withIsolatedStorage(name: string, fn: () => Promise<unknown>): Promise<void> {
  const dir = join(__dirname, "..", "..", "..", `storage-${name}`);

  // Pre-create Crawlee's internal subdirectory tree to avoid ENOENT races
  for (const store of ["request_queues", "datasets", "key_value_stores"]) {
    mkdirSync(join(dir, store, "default"), { recursive: true });
  }

  if (activeIsolatedRuns === 0) {
    baselineStorageDir = process.env.CRAWLEE_STORAGE_DIR;
  }
  activeIsolatedRuns += 1;
  process.env.CRAWLEE_STORAGE_DIR = dir;

  try {
    await fn();
  } finally {
    activeIsolatedRuns -= 1;
    if (activeIsolatedRuns === 0) {
      if (baselineStorageDir === undefined) {
        // Assigning undefined would store the string "undefined"; remove the key instead.
        delete process.env.CRAWLEE_STORAGE_DIR;
      } else {
        process.env.CRAWLEE_STORAGE_DIR = baselineStorageDir;
      }
    }
    try {
      rmSync(dir, { recursive: true, force: true });
    } catch (error: unknown) {
      // Best-effort cleanup: a leftover directory must not fail the job.
      console.warn(`Could not remove storage directory ${dir}:`, error);
    }
  }
}

config({ path: join(__dirname, "..", "..", "..", ".env") });

// Credentials are percent-encoded so characters such as "@", ":" or "/" cannot corrupt the URL.
// NOTE(review): the fallback password is a hard-coded development default.
const pgUser = encodeURIComponent(process.env.POSTGRES_USER || "tip");
const pgPassword = encodeURIComponent(process.env.POSTGRES_PASSWORD || "tip_dev_2026");
const pgHost = process.env.POSTGRES_HOST || "localhost";
const pgPort = process.env.POSTGRES_PORT || "5433";
const pgDatabase = process.env.POSTGRES_DB || "transceiver_db";
const connectionString = `postgres://${pgUser}:${pgPassword}@${pgHost}:${pgPort}/${pgDatabase}`;

/**
 * Create and start the pg-boss instance used by the scheduler.
 *
 * @returns The started pg-boss instance.
 */
export async function createScheduler(): Promise<PgBoss> {
  const boss = new PgBoss({
    connectionString,
    retryLimit: 3,
    retryDelay: 30,
    retryBackoff: true,
    expireInSeconds: 300,
    monitorStateIntervalSeconds: 30,
  });

  boss.on("error", (error) => console.error("pg-boss error:", error));
  await boss.start();
  console.log("pg-boss scheduler started");
  return boss;
}

/** One cron schedule: target queue, cron expression and per-job options. */
interface ScheduleSpec {
  readonly queue: string;
  readonly cron: string;
  readonly retryLimit: number;
  readonly expireInSeconds: number;
}

/** Every scheduled queue, in registration order. */
const SCHEDULES: readonly ScheduleSpec[] = [
  // ── Playwright scrapers — every 8h (resource-heavy, runs on Erik VPS) ──
  { queue: "scrape:pricing:fs", cron: "0 1,9,17 * * *", retryLimit: 3, expireInSeconds: 5400 }, // FS.com: 01:00, 09:00, 17:00
  { queue: "scrape:pricing:10gtek", cron: "20 1,9,17 * * *", retryLimit: 2, expireInSeconds: 3600 }, // 10Gtek: 01:20, 09:20, 17:20
  { queue: "scrape:pricing:atgbics", cron: "50 1,9,17 * * *", retryLimit: 2, expireInSeconds: 3600 }, // ATGBICS: 01:50, 09:50, 17:50
  { queue: "scrape:pricing:prolabs", cron: "20 2,10,18 * * *", retryLimit: 2, expireInSeconds: 3600 }, // ProLabs: 02:20, 10:20, 18:20

  // ── Fetch/Cheerio scrapers — every 4h (lightweight, Pi-friendly) ───────
  { queue: "scrape:pricing:fluxlight", cron: "5 0,4,8,12,16,20 * * *", retryLimit: 2, expireInSeconds: 3600 }, // Fluxlight: xx:05
  { queue: "scrape:pricing:gbics", cron: "15 0,4,8,12,16,20 * * *", retryLimit: 2, expireInSeconds: 3600 }, // GBICs: xx:15
  { queue: "scrape:pricing:optcore", cron: "30 0,4,8,12,16,20 * * *", retryLimit: 2, expireInSeconds: 3600 }, // Optcore: xx:30
  { queue: "scrape:pricing:champion-one", cron: "45 0,4,8,12,16,20 * * *", retryLimit: 2, expireInSeconds: 3600 }, // Champion ONE: xx:45
  { queue: "scrape:pricing:sfpcables", cron: "0 1,5,9,13,17,21 * * *", retryLimit: 2, expireInSeconds: 3600 }, // SFPCables: xx:00
  { queue: "scrape:pricing:blueoptics", cron: "15 1,5,9,13,17,21 * * *", retryLimit: 2, expireInSeconds: 3600 }, // BlueOptics: xx:15
  { queue: "scrape:pricing:fiber24", cron: "30 1,5,9,13,17,21 * * *", retryLimit: 2, expireInSeconds: 3600 }, // ShopFiber24: xx:30
  { queue: "scrape:pricing:tscom", cron: "45 1,5,9,13,17,21 * * *", retryLimit: 2, expireInSeconds: 3600 }, // T&S Communication: xx:45
  { queue: "scrape:pricing:skylane", cron: "0 2,6,10,14,18,22 * * *", retryLimit: 2, expireInSeconds: 3600 }, // Skylane: xx:00
  { queue: "scrape:pricing:ascentoptics", cron: "15 2,6,10,14,18,22 * * *", retryLimit: 2, expireInSeconds: 3600 }, // AscentOptics: xx:15
  { queue: "scrape:pricing:gaotek", cron: "30 2,6,10,14,18,22 * * *", retryLimit: 2, expireInSeconds: 3600 }, // GAO Tek: xx:30

  // ── Catalog scrapers — Flexoptix every 2h (primary price source) ───────
  { queue: "scrape:pricing:flexoptix", cron: "0 */2 * * *", retryLimit: 3, expireInSeconds: 3600 },

  // ── Manufacturer catalogs — every 8h (product data, no prices) ─────────
  { queue: "scrape:catalog:smartoptics", cron: "10 3,11,19 * * *", retryLimit: 2, expireInSeconds: 3600 },
  { queue: "scrape:catalog:hubersuhner", cron: "25 3,11,19 * * *", retryLimit: 2, expireInSeconds: 3600 },

  // ── Vendor lists — every 12h ───────────────────────────────────────────
  { queue: "scrape:vendors:flexoptix", cron: "0 5,17 * * *", retryLimit: 2, expireInSeconds: 1800 },
  { queue: "scrape:vendors:flexoptix-supported", cron: "15 5,17 * * *", retryLimit: 2, expireInSeconds: 1800 },

  // ── Compatibility matrices — every 12h ─────────────────────────────────
  { queue: "scrape:compat:cisco", cron: "0 6,18 * * *", retryLimit: 2, expireInSeconds: 3600 },
  { queue: "scrape:compat:juniper", cron: "15 6,18 * * *", retryLimit: 2, expireInSeconds: 3600 },
  { queue: "scrape:compat:sonic", cron: "30 6,18 * * *", retryLimit: 2, expireInSeconds: 3600 },
  { queue: "scrape:compat:ufispace", cron: "45 6,18 * * *", retryLimit: 2, expireInSeconds: 1800 },
  { queue: "scrape:compat:edgecore", cron: "55 6,18 * * *", retryLimit: 2, expireInSeconds: 1800 },

  // ── Switch assets — every 12h ──────────────────────────────────────────
  { queue: "scrape:assets:switches", cron: "30 7,19 * * *", retryLimit: 1, expireInSeconds: 3600 },

  // ── eBay enrichment — every 6h ─────────────────────────────────────────
  { queue: "enrich:ebay-transceivers", cron: "0 0,6,12,18 * * *", retryLimit: 2, expireInSeconds: 7200 },
  { queue: "enrich:ebay-switches", cron: "30 0,6,12,18 * * *", retryLimit: 2, expireInSeconds: 7200 },

  // ── Intelligence & community — every 6h (scrape:docs: every 12h) ───────
  { queue: "scrape:market-intel", cron: "0 2,8,14,20 * * *", retryLimit: 2, expireInSeconds: 3600 },
  { queue: "scrape:community-issues", cron: "30 2,8,14,20 * * *", retryLimit: 1, expireInSeconds: 3600 },
  { queue: "scrape:datasheet-links", cron: "0 3,9,15,21 * * *", retryLimit: 1, expireInSeconds: 3600 },
  { queue: "scrape:news", cron: "20 3,9,15,21 * * *", retryLimit: 2, expireInSeconds: 1800 },
  { queue: "scrape:faq", cron: "40 3,9,15,21 * * *", retryLimit: 2, expireInSeconds: 3600 },
  { queue: "scrape:docs", cron: "50 4,16 * * *", retryLimit: 2, expireInSeconds: 3600 },

  // ── Compute jobs — every 4h (after pricing waves settle) ───────────────
  { queue: "compute:abc", cron: "50 3,7,11,15,19,23 * * *", retryLimit: 2, expireInSeconds: 600 },
  { queue: "compute:reorder-signals", cron: "55 3,7,11,15,19,23 * * *", retryLimit: 2, expireInSeconds: 600 },

  // ── NAS sync — once a day ──────────────────────────────────────────────
  { queue: "sync:nas", cron: "55 7 * * *", retryLimit: 1, expireInSeconds: 1800 },
];

/**
 * Create every queue listed in `SCHEDULES`, then register its cron schedule.
 *
 * All queues are created before the first schedule is registered. The job count in the
 * final log line is derived from the table rather than hard-coded.
 *
 * @param boss Started pg-boss instance.
 * @throws If `boss.schedule` rejects for any queue.
 */
export async function registerSchedules(boss: PgBoss): Promise<void> {
  for (const { queue } of SCHEDULES) {
    // Rejections are ignored on the assumption that the queue already exists.
    // NOTE(review): this also hides other failures; they should resurface from
    // `boss.schedule` below — confirm.
    await boss.createQueue(queue).catch(() => {
      /* already exists */
    });
  }

  for (const { queue, cron, retryLimit, expireInSeconds } of SCHEDULES) {
    await boss.schedule(queue, cron, {}, { retryLimit, expireInSeconds });
  }

  console.log(`All schedules registered — 24/7 continuous scraping (${SCHEDULES.length} jobs)`);
}

/** One worker registration: queue, the text logged when a job starts, and the work to do. */
interface WorkerSpec {
  readonly queue: string;
  readonly message: string;
  /** Omitted for queues whose worker only logs. */
  readonly run?: () => Promise<unknown>;
}

/**
 * Attach a worker to every queue.
 *
 * The scraper modules imported at the top of the function are loaded when this function is
 * called; the eBay, market-intelligence, community-issues and NAS-sync modules are imported
 * inside their handlers, i.e. only when such a job actually runs. Each handler logs a
 * timestamped line first and then performs its work; handlers resolve to `undefined`.
 *
 * @param boss Started pg-boss instance.
 * @throws If a scraper module fails to load or `boss.work` rejects.
 */
export async function registerWorkers(boss: PgBoss): Promise<void> {
  // Lazy-load all scrapers
  const { scrapeFs } = await import("./scrapers/fs-com");
  const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
  const { scrapeOptcore } = await import("./scrapers/optcore");
  const { scrape10Gtek } = await import("./scrapers/tenGtek");
  const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog");
  const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
  const { seedFlexoptixVendors } = await import("./scrapers/flexoptix-supported-vendors");
  const { scrapeNews } = await import("./scrapers/news");
  const { scrapeAtgbics } = await import("./scrapers/atgbics");
  const { scrapeProLabs } = await import("./scrapers/prolabs");
  const { scrapeChampionOne } = await import("./scrapers/champion-one");
  const { scrapeFluxlight } = await import("./scrapers/fluxlight");
  const { scrapeGbics } = await import("./scrapers/gbics");
  const { scrapeSfpCables } = await import("./scrapers/sfpcables");
  const { scrapeJuniperHct } = await import("./scrapers/juniper-hct");
  const { scrapeSonicHcl } = await import("./scrapers/sonic-hcl");
  const { scrapeUfiSpace } = await import("./scrapers/ufispace");
  const { scrapeEdgecore } = await import("./scrapers/edgecore");
  const { scrapeSwitchAssets } = await import("./scrapers/switch-assets");
  const { scrapeBlueOptics } = await import("./scrapers/blueoptics");
  const { scrapeFiber24 } = await import("./scrapers/fiber24");
  const { scrapeTsCom } = await import("./scrapers/tscom");
  const { scrapeSmartOptics } = await import("./scrapers/smartoptics");
  const { scrapeHuberSuhner } = await import("./scrapers/hubersuhner");
  const { scrapeSkylane } = await import("./scrapers/skylane");
  const { scrapeAscentOptics } = await import("./scrapers/ascentoptics");
  const { scrapeGaoTek } = await import("./scrapers/gaotek");

  /** Wrap a scraper so it runs inside its own `storage-<name>` directory. */
  const isolated =
    (name: string, fn: () => Promise<unknown>) =>
    (): Promise<void> =>
      withIsolatedStorage(name, fn);

  const workers: readonly WorkerSpec[] = [
    // ── Playwright scrapers ──────────────────────────────────────────────
    { queue: "scrape:pricing:fs", message: "Running: FS.com pricing", run: isolated("fs", scrapeFs) },
    { queue: "scrape:pricing:10gtek", message: "Running: 10Gtek pricing", run: isolated("10gtek", scrape10Gtek) },
    { queue: "scrape:pricing:atgbics", message: "Running: ATGBICS pricing", run: isolated("atgbics", scrapeAtgbics) },
    { queue: "scrape:pricing:prolabs", message: "Running: ProLabs pricing", run: isolated("prolabs", scrapeProLabs) },

    // ── Fetch/Cheerio scrapers ───────────────────────────────────────────
    { queue: "scrape:pricing:fluxlight", message: "Running: Fluxlight pricing", run: isolated("fluxlight", scrapeFluxlight) },
    { queue: "scrape:pricing:gbics", message: "Running: GBICs pricing", run: isolated("gbics", scrapeGbics) },
    { queue: "scrape:pricing:optcore", message: "Running: Optcore pricing", run: isolated("optcore", scrapeOptcore) },
    { queue: "scrape:pricing:champion-one", message: "Running: Champion ONE pricing", run: isolated("champion-one", scrapeChampionOne) },
    { queue: "scrape:pricing:sfpcables", message: "Running: SFPCables pricing", run: isolated("sfpcables", scrapeSfpCables) },
    { queue: "scrape:pricing:blueoptics", message: "Running: BlueOptics pricing", run: isolated("blueoptics", scrapeBlueOptics) },
    { queue: "scrape:pricing:fiber24", message: "Running: ShopFiber24 pricing", run: isolated("fiber24", scrapeFiber24) },
    { queue: "scrape:pricing:tscom", message: "Running: T&S Communication pricing", run: isolated("tscom", scrapeTsCom) },
    { queue: "scrape:pricing:skylane", message: "Running: Skylane Optics pricing", run: isolated("skylane", scrapeSkylane) },
    { queue: "scrape:pricing:ascentoptics", message: "Running: AscentOptics pricing", run: isolated("ascentoptics", scrapeAscentOptics) },
    { queue: "scrape:pricing:gaotek", message: "Running: GAO Tek pricing", run: isolated("gaotek", scrapeGaoTek) },

    // ── Catalog scrapers ─────────────────────────────────────────────────
    { queue: "scrape:pricing:flexoptix", message: "Running: Flexoptix catalog", run: () => scrapeFlexoptixCatalog() },
    { queue: "scrape:catalog:smartoptics", message: "Running: SmartOptics catalog", run: isolated("smartoptics", scrapeSmartOptics) },
    { queue: "scrape:catalog:hubersuhner", message: "Running: HUBER+SUHNER catalog", run: isolated("hubersuhner", scrapeHuberSuhner) },

    // ── Vendor lists ─────────────────────────────────────────────────────
    { queue: "scrape:vendors:flexoptix", message: "Running: Flexoptix vendor list", run: () => scrapeFlexoptixVendors() },
    { queue: "scrape:vendors:flexoptix-supported", message: "Running: Flexoptix supported vendors", run: () => seedFlexoptixVendors() },

    // ── Compatibility scrapers ───────────────────────────────────────────
    { queue: "scrape:compat:cisco", message: "Running: Cisco TMG compatibility", run: isolated("cisco", scrapeCiscoTmg) },
    { queue: "scrape:compat:juniper", message: "Running: Juniper HCT compatibility", run: isolated("juniper", scrapeJuniperHct) },
    { queue: "scrape:compat:sonic", message: "Running: SONiC HCL compatibility", run: isolated("sonic", scrapeSonicHcl) },
    { queue: "scrape:compat:ufispace", message: "Running: Ufispace switch data", run: isolated("ufispace", scrapeUfiSpace) },
    { queue: "scrape:compat:edgecore", message: "Running: Edgecore switch data", run: isolated("edgecore", scrapeEdgecore) },

    // ── Switch assets ────────────────────────────────────────────────────
    { queue: "scrape:assets:switches", message: "Running: Switch assets enrichment", run: isolated("switch-assets", () => scrapeSwitchAssets()) },

    // ── eBay enrichment ──────────────────────────────────────────────────
    {
      queue: "enrich:ebay-transceivers",
      message: "Running: eBay transceiver pricing",
      run: async () => {
        const { enrichTransceiversFromEbay } = await import("./scrapers/ebay-enricher");
        await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100));
      },
    },
    {
      queue: "enrich:ebay-switches",
      message: "Running: eBay switch enrichment",
      run: async () => {
        const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher");
        await withIsolatedStorage("ebay-switches", () => enrichSwitchesFromEbay(30));
      },
    },

    // ── Intelligence & community ─────────────────────────────────────────
    {
      queue: "scrape:market-intel",
      message: "Running: Market intelligence",
      run: async () => {
        const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence");
        await withIsolatedStorage("market-intel", scrapeMarketIntelligence);
      },
    },
    {
      queue: "scrape:community-issues",
      message: "Running: Community issues",
      run: async () => {
        const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues");
        await withIsolatedStorage("community-issues", () => scrapeAllSwitchIssues(30));
      },
    },
    {
      queue: "scrape:datasheet-links",
      message: "Running: Datasheet links",
      run: async () => {
        const { findAndSeedDatasheetLinks } = await import("./scrapers/community-issues");
        await findAndSeedDatasheetLinks(50);
      },
    },
    { queue: "scrape:news", message: "Running: News aggregation", run: () => scrapeNews() },
    // Placeholders: these queues are scheduled, but their workers only log.
    { queue: "scrape:faq", message: "FAQ scraper — not yet implemented" },
    { queue: "scrape:docs", message: "Docs scraper — not yet implemented" },

    // ── Compute jobs ─────────────────────────────────────────────────────
    {
      queue: "compute:abc",
      message: "Computing: ABC classification",
      run: async () => {
        const { computeAbcClassification } = await import("./scrapers/market-intelligence");
        await computeAbcClassification();
      },
    },
    {
      queue: "compute:reorder-signals",
      message: "Computing: Reorder signals",
      run: async () => {
        const { computeReorderSignals } = await import("./scrapers/market-intelligence");
        await computeReorderSignals();
      },
    },

    // ── NAS sync ─────────────────────────────────────────────────────────
    {
      queue: "sync:nas",
      message: "Running: NAS sync to Fearghas",
      run: async () => {
        const { runNightlyNasSync } = await import("./utils/nas-sync");
        await runNightlyNasSync();
      },
    },
  ];

  for (const { queue, message, run } of workers) {
    await boss.work(queue, async () => {
      console.log(`[${new Date().toISOString()}] ${message}`);
      await run?.();
    });
  }

  console.log(`All workers registered (${workers.length} jobs, 24/7 continuous)`);
}