/**
 * pg-boss Job Scheduler — 24/7 Continuous Scraping
 *
 * ARCHITECTURE:
 * - Erik (VPS, .82)    : Playwright-heavy scrapers (FS.com, 10Gtek, ATGBICS, ProLabs)
 *                        + all compatibility + eBay + compute + NAS sync
 * - Raspberry Pi Fleet : Lightweight fetch/cheerio scrapers run continuously all day
 *                        (BlueOptics, Fiber24, T&S Com, Fluxlight, GBICs, Optcore,
 *                        Champion ONE, SFPCables, SmartOptics, HUBER+SUHNER, etc.)
 *
 * SCHEDULE OVERVIEW (the SCHEDULES table below is the single source of truth):
 * - All pricing scrapers (Playwright and fetch/cheerio): every 2h, staggered by minute
 * - Flexoptix catalog: every 2h (primary price source)
 * - Manufacturer catalogs: every 4h
 * - Vendor lists, compatibility matrices, switch assets: twice a day
 * - eBay enrichment: every 6h
 * - Intelligence/community: mostly every 6h (NOG talks weekly, docs twice a day)
 * - Compute jobs (ABC, reorder signals): every 4h
 * - Prediction signals, forecast, hype cycle, maintenance: daily or weekly
 * - NAS sync: daily at 07:55
 */
import PgBoss from "pg-boss";
import { config } from "dotenv";
import { join } from "path";
import { mkdirSync, existsSync, writeFileSync } from "fs";

/**
 * Run a scraper with an isolated Crawlee storage directory to prevent queue
 * collisions between scrapers.
 *
 * The directory `storage-<name>` (three levels above this file) is created if
 * needed, `CRAWLEE_STORAGE_DIR` is pointed at it for the duration of `fn`, and
 * the previous value of the variable is restored afterwards — also when `fn`
 * rejects. The rejection itself is propagated to the caller.
 *
 * NOTE(review): `process.env` is process-global. Several schedules below fire in
 * the same minute (e.g. "scrape:pricing:fs" and "scrape:compat:cisco" at 06:00),
 * so if their workers overlap inside one process, the save/restore here can
 * interleave and leave a stale directory behind. Confirm how workers are
 * deployed, or pass the storage dir to Crawlee explicitly instead of via env.
 *
 * @param name Suffix for the storage directory (`storage-<name>`).
 * @param fn   The scraper to run; its resolved value is ignored.
 */
async function withIsolatedStorage(name: string, fn: () => Promise<unknown>): Promise<void> {
  const dir = join(__dirname, "..", "..", "..", `storage-${name}`);

  // Pre-create Crawlee's internal subdirectory tree to avoid ENOENT races
  for (const sub of ["request_queues", "datasets", "key_value_stores"]) {
    mkdirSync(join(dir, sub, "default"), { recursive: true });
  }

  // Pre-seed session pool state file to prevent "Could not find file" crash
  // on first run (Crawlee reads this before writing it on some versions)
  const sessionFile = join(dir, "key_value_stores", "default", "SDK_SESSION_POOL_STATE.json");
  if (!existsSync(sessionFile)) {
    writeFileSync(
      sessionFile,
      JSON.stringify({ usableSessionsCount: 0, retiredSessionsCount: 0, sessions: [] }),
    );
  }

  const prev = process.env.CRAWLEE_STORAGE_DIR;
  process.env.CRAWLEE_STORAGE_DIR = dir;
  // Do NOT set CRAWLEE_PURGE_ON_START — let Crawlee reuse session pool state
  // between runs (better scraping, no "SDK_SESSION_POOL_STATE.json not found" crashes).
  // The dir is intentionally kept between runs so Crawlee can persist its state.
  try {
    await fn();
  } finally {
    // Restore the exact previous state. An env var that was unset must be
    // deleted: assigning "" would leave it defined as an empty string, which
    // is not the same thing to code that checks whether the variable is set.
    if (prev === undefined) {
      delete process.env.CRAWLEE_STORAGE_DIR;
    } else {
      process.env.CRAWLEE_STORAGE_DIR = prev;
    }
  }
}

config({ path: join(__dirname, "..", "..", "..", ".env") });

// NOTE(review): the fallback password is hard-coded, and user/password are
// interpolated without URL-encoding, so credentials containing characters such
// as "@" or "/" would produce an invalid URL. Left unchanged here because
// existing deployments may already supply pre-encoded values — confirm before
// changing.
const connectionString = `postgres://${process.env.POSTGRES_USER || "tip"}:${process.env.POSTGRES_PASSWORD || "tip_dev_2026"}@${process.env.POSTGRES_HOST || "localhost"}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB || "transceiver_db"}`;

/**
 * Create and start the pg-boss instance used by the scheduler and workers.
 *
 * Errors emitted by pg-boss are logged to stderr rather than thrown.
 *
 * @returns The started PgBoss instance.
 */
export async function createScheduler(): Promise<PgBoss> {
  const boss = new PgBoss({
    connectionString,
    retryLimit: 3,
    retryDelay: 30,
    retryBackoff: true,
    expireInSeconds: 300,
    monitorStateIntervalSeconds: 60,
  });

  boss.on("error", (error) => console.error("pg-boss error:", error));

  await boss.start();
  console.log("pg-boss scheduler started");
  return boss;
}

/** One scheduled job: queue name, cron expression, and per-schedule job options. */
type ScheduleSpec = readonly [
  name: string,
  cron: string,
  retryLimit: number,
  expireInSeconds: number,
];

/**
 * Every queue and its cron schedule. A queue is created for each entry and then
 * scheduled, so the queue list and the schedule list cannot drift apart.
 */
const SCHEDULES: readonly ScheduleSpec[] = [
  // ══════════════════════════════════════════════════════════════════════
  // ALL PRICING SCRAPERS — 24/7, every 2h, staggered
  // Goal: complete competitor coverage, no gaps, database always fresh
  // ══════════════════════════════════════════════════════════════════════

  // Playwright scrapers (resource-heavy) — every 2h, 10min apart
  ["scrape:pricing:fs", "0 */2 * * *", 3, 5400],
  ["scrape:pricing:10gtek", "10 */2 * * *", 2, 3600],
  ["scrape:pricing:atgbics", "20 */2 * * *", 2, 3600],
  ["scrape:pricing:prolabs", "30 */2 * * *", 2, 3600],

  // Fetch/Cheerio scrapers (lightweight) — every 2h, 5min apart
  ["scrape:pricing:fluxlight", "0 */2 * * *", 2, 3600],
  ["scrape:pricing:gbics", "5 */2 * * *", 2, 3600],
  ["scrape:pricing:optcore", "10 */2 * * *", 2, 3600],
  ["scrape:pricing:champion-one", "15 */2 * * *", 2, 3600],
  ["scrape:pricing:sfpcables", "20 */2 * * *", 2, 3600],
  ["scrape:pricing:blueoptics", "25 */2 * * *", 2, 3600],
  ["scrape:pricing:fiber24", "30 */2 * * *", 2, 3600],
  ["scrape:pricing:tscom", "35 */2 * * *", 2, 3600],
  ["scrape:pricing:skylane", "40 */2 * * *", 2, 3600],
  ["scrape:pricing:ascentoptics", "45 */2 * * *", 2, 3600],
  ["scrape:pricing:gaotek", "50 */2 * * *", 2, 3600],

  // Form-factor coverage scrapers — every 2h
  ["scrape:pricing:comms-express", "5 */2 * * *", 2, 5400],
  ["scrape:pricing:router-switch", "15 */2 * * *", 2, 5400],
  ["scrape:pricing:multimode-inc", "25 */2 * * *", 2, 3600],
  ["scrape:pricing:optictransceiver", "35 */2 * * *", 2, 3600],
  ["scrape:pricing:wiitek", "45 */2 * * *", 2, 3600],

  // Fetch-based scrapers running on Erik — every 2h
  ["scrape:pricing:naddod", "48 */2 * * *", 2, 3600],
  ["scrape:pricing:qsfptek", "52 */2 * * *", 2, 3600],
  ["scrape:pricing:addon", "55 */2 * * *", 2, 3600],
  ["scrape:pricing:fibermall", "57 */2 * * *", 2, 3600],
  ["scrape:pricing:vcelink", "3 */2 * * *", 2, 3600],
  ["scrape:pricing:opticsbay", "7 */2 * * *", 2, 3600],

  // OEM reference prices via Mouser API — once daily at 03:00 (slow: 2s/PID × 475 PIDs ≈ 16min)
  // Requires MOUSER_API_KEY env var (free at mouser.com/api-hub)
  ["scrape:pricing:mouser-oem", "0 3 * * *", 1, 3600],

  // ══════════════════════════════════════════════════════════════════════
  // FLEXOPTIX CATALOG — every 2h (primary price source)
  // ══════════════════════════════════════════════════════════════════════
  ["scrape:pricing:flexoptix", "0 */2 * * *", 3, 3600],

  // ══════════════════════════════════════════════════════════════════════
  // MANUFACTURER CATALOGS — every 4h (product data, no prices)
  // ══════════════════════════════════════════════════════════════════════
  ["scrape:catalog:smartoptics", "10 */4 * * *", 2, 3600],
  ["scrape:catalog:hubersuhner", "25 */4 * * *", 2, 3600],
  ["scrape:catalog:eoptolink", "40 */4 * * *", 2, 3600],

  // ══════════════════════════════════════════════════════════════════════
  // VENDOR LISTS — every 12h
  // ══════════════════════════════════════════════════════════════════════
  ["scrape:vendors:flexoptix", "0 5,17 * * *", 2, 1800],
  ["scrape:vendors:flexoptix-supported", "15 5,17 * * *", 2, 1800],

  // ══════════════════════════════════════════════════════════════════════
  // COMPATIBILITY MATRICES — every 12h
  // ══════════════════════════════════════════════════════════════════════
  ["scrape:compat:cisco", "0 6,18 * * *", 2, 3600],
  ["scrape:compat:juniper", "15 6,18 * * *", 2, 3600],
  ["scrape:compat:sonic", "30 6,18 * * *", 2, 3600],
  ["scrape:compat:ufispace", "45 6,18 * * *", 2, 1800],
  ["scrape:compat:edgecore", "55 6,18 * * *", 2, 1800],

  // ══════════════════════════════════════════════════════════════════════
  // SWITCH ASSETS — every 12h
  // ══════════════════════════════════════════════════════════════════════
  ["scrape:assets:switches", "30 7,19 * * *", 1, 3600],

  // ══════════════════════════════════════════════════════════════════════
  // EBAY ENRICHMENT — every 6h
  // ══════════════════════════════════════════════════════════════════════
  ["enrich:ebay-transceivers", "0 0,6,12,18 * * *", 2, 7200],
  ["enrich:ebay-switches", "30 0,6,12,18 * * *", 2, 7200],

  // ══════════════════════════════════════════════════════════════════════
  // INTELLIGENCE & COMMUNITY — every 6h unless noted
  // ══════════════════════════════════════════════════════════════════════
  ["scrape:market-intel", "0 2,8,14,20 * * *", 2, 3600],
  // NOG conference talks — weekly on Mondays 06:00 UTC
  ["scrape:nog-talks", "0 6 * * 1", 2, 7200],
  ["scrape:community-issues", "30 2,8,14,20 * * *", 1, 3600],
  ["scrape:datasheet-links", "0 3,9,15,21 * * *", 1, 3600],
  ["scrape:news", "20 3,9,15,21 * * *", 2, 1800],
  ["scrape:faq", "40 3,9,15,21 * * *", 2, 3600],
  // Docs — twice a day at 04:50 and 16:50
  ["scrape:docs", "50 4,16 * * *", 2, 3600],

  // ══════════════════════════════════════════════════════════════════════
  // COMPUTE JOBS — every 4h (after pricing waves settle)
  // ══════════════════════════════════════════════════════════════════════
  ["compute:abc", "50 3,7,11,15,19,23 * * *", 2, 600],
  ["compute:reorder-signals", "55 3,7,11,15,19,23 * * *", 2, 600],

  // ══════════════════════════════════════════════════════════════════════
  // PREDICTION SIGNAL SCRAPERS
  // ══════════════════════════════════════════════════════════════════════
  // SEC EDGAR CapEx — weekly Monday 06:00 (filings don't change that fast)
  ["scrape:signals:sec-edgar", "0 6 * * 1", 2, 3600],
  // GitHub signals — weekly Sunday 05:00
  ["scrape:signals:github", "0 5 * * 0", 2, 7200],
  // eBay sold velocity — every 12h (fast-moving market signal)
  ["scrape:signals:ebay-velocity", "0 4,16 * * *", 2, 3600],
  // AI cluster RSS feeds — every 4h (news moves fast)
  ["scrape:signals:ai-clusters", "10 0,4,8,12,16,20 * * *", 2, 1800],
  // Distributor lead times — daily 03:30 (stock changes overnight)
  ["scrape:signals:distributor-leads", "30 3 * * *", 2, 3600],
  // Standards tracker — weekly Wednesday 04:00 (standards move slowly)
  ["scrape:signals:standards", "0 4 * * 3", 1, 3600],

  // ══════════════════════════════════════════════════════════════════════
  // FORECAST ENGINE — daily at 08:00 (after all nightly scrapers done)
  // ══════════════════════════════════════════════════════════════════════
  ["compute:forecast", "0 8 * * *", 2, 600],
  // Hype Cycle Engine runs daily at 04:30 (after Mouser OEM scraper at 03:00)
  ["compute:hype-cycle", "30 4 * * *", 1, 600],

  // ══════════════════════════════════════════════════════════════════════
  // NAS SYNC — daily at 07:55
  // ══════════════════════════════════════════════════════════════════════
  ["sync:nas", "55 7 * * *", 1, 1800],

  // Health check: every 3h — warns if any vendor has no new prices recently
  ["monitor:scraper-health", "17 */3 * * *", 1, 600],

  // Verification reconciliation: nightly at 01:00 UTC
  // Resets competitor_verified/fully_verified for any transceiver that no longer
  // has a real non-Flexoptix price in the last 30 days — prevents stale ★ 100% badges
  ["maintenance:reconcile-verification", "0 1 * * *", 1, 1800],
  // Equivalence matching: nightly at 02:00 UTC (after reconcile)
  ["maintenance:find-equivalences", "0 2 * * *", 1, 3600],
  // Re-research approved equivalences: daily at 03:00 UTC, processes 200 items per run
  ["maintenance:re-research-equivalences", "0 3 * * *", 1, 3600],
];

/**
 * Create a queue on a best-effort basis. A failure never aborts registration,
 * but anything that does not look like "queue already exists" is logged instead
 * of being silently discarded.
 */
async function ensureQueue(boss: PgBoss, name: string): Promise<void> {
  try {
    await boss.createQueue(name);
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    if (!/already exists|duplicate key/i.test(message)) {
      console.warn(`pg-boss createQueue("${name}") failed: ${message}`);
    }
  }
}

/**
 * Create every queue listed in SCHEDULES and register its cron schedule.
 *
 * All queues are created first, then all schedules are registered, each with
 * an empty payload and the per-schedule `retryLimit` / `expireInSeconds`.
 *
 * @param boss A started PgBoss instance.
 */
export async function registerSchedules(boss: PgBoss): Promise<void> {
  for (const [name] of SCHEDULES) {
    await ensureQueue(boss, name);
  }

  for (const [name, cron, retryLimit, expireInSeconds] of SCHEDULES) {
    await boss.schedule(name, cron, {}, { retryLimit, expireInSeconds });
  }

  // The count is derived from the table so the log line cannot go stale.
  console.log(`All schedules registered — 24/7 continuous scraping (${SCHEDULES.length} jobs)`);
}

export async function registerWorkers(boss: PgBoss): Promise {
  // Lazy-load all scrapers
  const { scrapeFs } = await import("./scrapers/fs-com");
  const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
  const { scrapeSmartOptics } = await import("./scrapers/smartoptics");
  const { scrapeHuberSuhner } = await import("./scrapers/hubersuhner");
  const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence");
  const { scrapeNews } = await import("./scrapers/news");
  const { scrape10Gtek } = await import("./scrapers/tenGtek");
  const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog");
  const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
  const { seedFlexoptixVendors } = await import("./scrapers/flexoptix-supported-vendors");
  const { scrapeAtgbics } = await
import("./scrapers/atgbics"); const { scrapeProLabs } = await import("./scrapers/prolabs"); const { scrapeJuniperHct } = await import("./scrapers/juniper-hct"); const { scrapeSonicHcl } = await import("./scrapers/sonic-hcl"); const { scrapeUfiSpace } = await import("./scrapers/ufispace"); const { scrapeEdgecore } = await import("./scrapers/edgecore"); const { scrapeSwitchAssets } = await import("./scrapers/switch-assets"); // ── Prediction signal scrapers ──────────────────────────────────────── const { scrapeSecEdgar } = await import("./scrapers/sec-edgar"); const { scrapeGithubSignals } = await import("./scrapers/github-signals"); const { scrapeEbayVelocity } = await import("./scrapers/ebay-velocity"); const { scrapeAiClusters } = await import("./scrapers/ai-clusters"); const { scrapeDistributorLeads }= await import("./scrapers/distributor-leads"); const { scrapeStandardsTracker }= await import("./scrapers/standards-tracker"); const { runForecastEngine } = await import("./utils/forecast-engine"); // ── Playwright scrapers ─────────────────────────────────────────────── await boss.work("scrape:pricing:fs", async () => { console.log(`[${new Date().toISOString()}] Running: FS.com pricing`); await withIsolatedStorage("fs", scrapeFs); }); await boss.work("scrape:pricing:10gtek", async () => { console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`); await withIsolatedStorage("10gtek", scrape10Gtek); }); await boss.work("scrape:pricing:atgbics", async () => { console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`); await withIsolatedStorage("atgbics", scrapeAtgbics); }); await boss.work("scrape:pricing:prolabs", async () => { console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`); await withIsolatedStorage("prolabs", scrapeProLabs); }); // ── Lightweight fetch/cheerio scrapers ─────────────────────────────── await boss.work("scrape:pricing:fluxlight", async () => { console.log(`[${new Date().toISOString()}] Running: Fluxlight 
pricing`); const { scrapeFluxlight } = await import("./scrapers/fluxlight"); await scrapeFluxlight(); }); await boss.work("scrape:pricing:gbics", async () => { console.log(`[${new Date().toISOString()}] Running: GBICS pricing`); const { scrapeGbics } = await import("./scrapers/gbics"); await scrapeGbics(); }); await boss.work("scrape:pricing:optcore", async () => { console.log(`[${new Date().toISOString()}] Running: Optcore pricing`); const { scrapeOptcore } = await import("./scrapers/optcore"); await scrapeOptcore(); }); await boss.work("scrape:pricing:champion-one", async () => { console.log(`[${new Date().toISOString()}] Running: Champion ONE pricing`); const { scrapeChampionOne } = await import("./scrapers/champion-one"); await scrapeChampionOne(); }); await boss.work("scrape:pricing:sfpcables", async () => { console.log(`[${new Date().toISOString()}] Running: SFPcables pricing`); const { scrapeSfpCables } = await import("./scrapers/sfpcables"); await scrapeSfpCables(); }); await boss.work("scrape:pricing:blueoptics", async () => { console.log(`[${new Date().toISOString()}] Running: BlueOptics pricing`); const { scrapeBlueOptics } = await import("./scrapers/blueoptics"); await scrapeBlueOptics(); }); await boss.work("scrape:pricing:fiber24", async () => { console.log(`[${new Date().toISOString()}] Running: Fiber24 pricing`); const { scrapeFiber24 } = await import("./scrapers/fiber24"); await scrapeFiber24(); }); await boss.work("scrape:pricing:tscom", async () => { console.log(`[${new Date().toISOString()}] Running: T&S Communication pricing`); const { scrapeTsCom } = await import("./scrapers/tscom"); await scrapeTsCom(); }); await boss.work("scrape:pricing:skylane", async () => { console.log(`[${new Date().toISOString()}] Running: Skylane pricing`); const { scrapeSkylane } = await import("./scrapers/skylane"); await scrapeSkylane(); }); await boss.work("scrape:pricing:ascentoptics", async () => { console.log(`[${new Date().toISOString()}] Running: Ascent 
Optics pricing`); const { scrapeAscentOptics } = await import("./scrapers/ascentoptics"); await scrapeAscentOptics(); }); await boss.work("scrape:pricing:gaotek", async () => { console.log(`[${new Date().toISOString()}] Running: GAO Tek pricing`); const { scrapeGaoTek } = await import("./scrapers/gaotek"); await scrapeGaoTek(); }); // ── Catalog scrapers ────────────────────────────────────────────────── await boss.work("scrape:pricing:flexoptix", async () => { console.log(`[${new Date().toISOString()}] Running: Flexoptix catalog`); await scrapeFlexoptixCatalog(); }); await boss.work("scrape:catalog:smartoptics", async () => { console.log(`[${new Date().toISOString()}] Running: SmartOptics catalog`); await scrapeSmartOptics(); }); await boss.work("scrape:catalog:hubersuhner", async () => { console.log(`[${new Date().toISOString()}] Running: HUBER+SUHNER catalog`); await scrapeHuberSuhner(); }); await boss.work("scrape:catalog:eoptolink", async () => { console.log(`[${new Date().toISOString()}] Running: Eoptolink OEM catalog`); const { scrapeEoptolink } = await import("./scrapers/eoptolink"); await scrapeEoptolink(); }); // ── Vendor lists ────────────────────────────────────────────────────── await boss.work("scrape:vendors:flexoptix", async () => { console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`); await scrapeFlexoptixVendors(); }); await boss.work("scrape:vendors:flexoptix-supported", async () => { console.log(`[${new Date().toISOString()}] Running: Flexoptix supported vendors`); await seedFlexoptixVendors(); }); // ── Compatibility scrapers ──────────────────────────────────────────── await boss.work("scrape:compat:cisco", async () => { console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`); await withIsolatedStorage("cisco", scrapeCiscoTmg); }); await boss.work("scrape:compat:juniper", async () => { console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`); await 
withIsolatedStorage("juniper", scrapeJuniperHct); }); await boss.work("scrape:compat:sonic", async () => { console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`); await withIsolatedStorage("sonic", scrapeSonicHcl); }); await boss.work("scrape:compat:ufispace", async () => { console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`); await withIsolatedStorage("ufispace", scrapeUfiSpace); }); await boss.work("scrape:compat:edgecore", async () => { console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`); await withIsolatedStorage("edgecore", scrapeEdgecore); }); // ── Switch assets ───────────────────────────────────────────────────── await boss.work("scrape:assets:switches", async () => { console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`); await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets()); }); // ── eBay enrichment ─────────────────────────────────────────────────── await boss.work("enrich:ebay-transceivers", async () => { console.log(`[${new Date().toISOString()}] Running: eBay transceiver pricing`); const { enrichTransceiversFromEbay } = await import("./scrapers/ebay-enricher"); await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100)); }); await boss.work("enrich:ebay-switches", async () => { console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`); const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher"); await withIsolatedStorage("ebay-switches", () => enrichSwitchesFromEbay(30)); }); // ── Intelligence & community ────────────────────────────────────────── await boss.work("scrape:market-intel", async () => { console.log(`[${new Date().toISOString()}] Running: Market intelligence`); await scrapeMarketIntelligence(); }); await boss.work("scrape:nog-talks", async () => { console.log(`[${new Date().toISOString()}] Running: NOG conference talks`); const { scrapeNogTalks } = await 
import("./scrapers/nog-talks"); await scrapeNogTalks(); }); await boss.work("scrape:community-issues", async () => { console.log(`[${new Date().toISOString()}] Running: Community issues`); const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues"); await withIsolatedStorage("community-issues", () => scrapeAllSwitchIssues(30)); }); await boss.work("scrape:datasheet-links", async () => { console.log(`[${new Date().toISOString()}] Running: Datasheet links`); const { findAndSeedDatasheetLinks } = await import("./scrapers/community-issues"); await findAndSeedDatasheetLinks(50); }); await boss.work("scrape:news", async () => { console.log(`[${new Date().toISOString()}] Running: News scraper`); await scrapeNews(); }); await boss.work("scrape:faq", async () => { console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`); }); await boss.work("scrape:docs", async () => { console.log(`[${new Date().toISOString()}] Docs scraper — not yet implemented`); }); // ── Compute jobs ────────────────────────────────────────────────────── await boss.work("compute:abc", async () => { console.log(`[${new Date().toISOString()}] Computing: ABC classification`); const { computeAbcClassification } = await import("./scrapers/market-intelligence"); await computeAbcClassification(); }); await boss.work("compute:reorder-signals", async () => { console.log(`[${new Date().toISOString()}] Computing: Reorder signals`); const { computeReorderSignals } = await import("./scrapers/market-intelligence"); await computeReorderSignals(); }); // ── NAS sync ────────────────────────────────────────────────────────── await boss.work("sync:nas", async () => { console.log(`[${new Date().toISOString()}] Running: NAS sync to Fearghas`); const { runNightlyNasSync } = await import("./utils/nas-sync"); await runNightlyNasSync(); }); // ── Prediction signal scrapers ──────────────────────────────────────── await boss.work("scrape:signals:sec-edgar", async () => { 
console.log(`[${new Date().toISOString()}] Running: SEC EDGAR CapEx`); await scrapeSecEdgar(); }); await boss.work("scrape:signals:github", async () => { console.log(`[${new Date().toISOString()}] Running: GitHub tech signals`); await scrapeGithubSignals(); }); await boss.work("scrape:signals:ebay-velocity", async () => { console.log(`[${new Date().toISOString()}] Running: eBay sold velocity`); await scrapeEbayVelocity(); }); await boss.work("scrape:signals:ai-clusters", async () => { console.log(`[${new Date().toISOString()}] Running: AI cluster announcements`); await scrapeAiClusters(); }); await boss.work("scrape:signals:distributor-leads", async () => { console.log(`[${new Date().toISOString()}] Running: Distributor lead times`); await scrapeDistributorLeads(); }); await boss.work("scrape:signals:standards", async () => { console.log(`[${new Date().toISOString()}] Running: Standards tracker`); await scrapeStandardsTracker(); }); // ── Forecast engine ─────────────────────────────────────────────────── await boss.work("compute:forecast", async () => { console.log(`[${new Date().toISOString()}] Running: Forecast engine`); await runForecastEngine(); }); // ── Hype Cycle Engine (Norton-Bass diffusion model) ─────────────────── await boss.work("compute:hype-cycle", async () => { console.log(`[${new Date().toISOString()}] Running: Hype Cycle Engine (Norton-Bass)`); const { computeHypeCycle } = await import("./utils/hype-cycle-engine"); await computeHypeCycle(); }); // ── Form-factor coverage scrapers ───────────────────────────────────── await boss.work("scrape:pricing:comms-express", async () => { console.log(`[${new Date().toISOString()}] Running: Comms-Express pricing`); const { scrapeCommsExpress } = await import("./scrapers/comms-express"); await scrapeCommsExpress(); }); await boss.work("scrape:pricing:router-switch", async () => { console.log(`[${new Date().toISOString()}] Running: Router-Switch.com pricing`); const { scrapeRouterSwitch } = await 
import("./scrapers/router-switch");
    await scrapeRouterSwitch();
  });

  // Each pricing worker below follows the same shape: log a timestamped start
  // line, lazily import the scraper module (so a worker process only loads the
  // scrapers it actually runs), then await the scraper to completion. A thrown
  // error propagates to pg-boss, which applies the queue's retry policy.
  await boss.work("scrape:pricing:multimode-inc", async () => {
    console.log(`[${new Date().toISOString()}] Running: Multimode Inc pricing`);
    const { scrapeMultimodeInc } = await import("./scrapers/multimode-inc");
    await scrapeMultimodeInc();
  });

  await boss.work("scrape:pricing:optictransceiver", async () => {
    console.log(`[${new Date().toISOString()}] Running: OpticTransceiver.com pricing`);
    const { scrapeOpticTransceiver } = await import("./scrapers/optictransceiver");
    await scrapeOpticTransceiver();
  });

  await boss.work("scrape:pricing:wiitek", async () => {
    console.log(`[${new Date().toISOString()}] Running: Wiitek pricing`);
    const { scrapeWiitek } = await import("./scrapers/wiitek");
    await scrapeWiitek();
  });

  await boss.work("scrape:pricing:naddod", async () => {
    console.log(`[${new Date().toISOString()}] Running: NADDOD pricing`);
    const { scrapeNaddod } = await import("./scrapers/naddod");
    await scrapeNaddod();
  });

  await boss.work("scrape:pricing:qsfptek", async () => {
    console.log(`[${new Date().toISOString()}] Running: QSFPTEK pricing`);
    const { scrapeQsfptek } = await import("./scrapers/qsfptek");
    await scrapeQsfptek();
  });

  await boss.work("scrape:pricing:addon", async () => {
    console.log(`[${new Date().toISOString()}] Running: AddOn Networks pricing`);
    const { scrapeAddonNetworks } = await import("./scrapers/addon-networks");
    await scrapeAddonNetworks();
  });

  await boss.work("scrape:pricing:fibermall", async () => {
    console.log(`[${new Date().toISOString()}] Running: FiberMall pricing`);
    const { scrapeFiberMall } = await import("./scrapers/fibermall");
    await scrapeFiberMall();
  });

  await boss.work("scrape:pricing:vcelink", async () => {
    console.log(`[${new Date().toISOString()}] Running: Vcelink pricing`);
    const { scrapeVcelink } = await import("./scrapers/vcelink");
    await scrapeVcelink();
  });

  await boss.work("scrape:pricing:opticsbay", async () => {
    console.log(`[${new Date().toISOString()}] Running: OpticsBay pricing`);
    const { scrapeOpticsBay } = await import("./scrapers/opticsbay");
    await scrapeOpticsBay();
  });

  // ── Mouser OEM reference prices ────────────────────────────────────────
  // Skipped (job completes successfully) when MOUSER_API_KEY is not configured.
  await boss.work("scrape:pricing:mouser-oem", async () => {
    console.log(`[${new Date().toISOString()}] Running: Mouser OEM reference prices`);
    if (!process.env["MOUSER_API_KEY"]) {
      console.warn(" [mouser-oem] Skipping — MOUSER_API_KEY not set");
      return;
    }
    const { scrapeMouser } = await import("./scrapers/mouser");
    await scrapeMouser();
  });

  // ── Health monitor ──────────────────────────────────────────────────────
  // Logs an alert for every expected vendor that produced no price
  // observations in the last 6 hours, including vendors that have no row in
  // the `vendors` table at all.
  await boss.work("monitor:scraper-health", async () => {
    const { pool } = await import("./utils/db");

    // Vendors we expect to see prices from regularly.
    // Names are matched exactly (case-sensitive) against vendors.name.
    const EXPECTED_VENDORS = [
      "FiberMall", "QSFPTEK", "Flexoptix", "FS.COM", "10Gtek", "ATGBICS",
      "GBICS", "BlueOptics", "ShopFiber24", "T&S Communication", "Fluxlight",
      "Optcore", "Champion ONE", "SFPcables", "Vcelink", "OpticsBay",
    ];

    const result = await pool.query(`
      SELECT
        v.name,
        SUM(CASE WHEN po.time > NOW() - INTERVAL '6 hours' THEN 1 ELSE 0 END) AS prices_6h,
        MAX(po.time) AS last_seen,
        EXTRACT(EPOCH FROM (NOW() - MAX(po.time))) / 3600.0 AS hours_since
      FROM vendors v
      LEFT JOIN price_observations po ON po.source_vendor_id = v.id
      WHERE v.name = ANY($1)
      GROUP BY v.name
      ORDER BY last_seen ASC NULLS FIRST
    `, [EXPECTED_VENDORS]);

    const problems: string[] = [];
    const vendorsInDb = new Set<string>();

    for (const row of result.rows) {
      vendorsInDb.add(row.name);
      const h = parseFloat(row.hours_since ?? "9999");
      const n = parseInt(row.prices_6h ?? "0", 10);
      // `!(n > 0)` also treats an unparsable count (NaN) as "no prices".
      if (!(n > 0)) {
        const lastStr = row.last_seen
          ? `last seen ${h.toFixed(1)}h ago (${new Date(row.last_seen).toISOString().slice(0, 16)})`
          : "NEVER scraped";
        problems.push(`⚠ ${row.name}: 0 prices in last 6h — ${lastStr}`);
      }
    }

    // The query only returns vendors that exist in the table, so an expected
    // vendor with no row would otherwise be silently reported as healthy.
    for (const vendor of EXPECTED_VENDORS) {
      if (!vendorsInDb.has(vendor)) {
        problems.push(`⚠ ${vendor}: not found in vendors table — NEVER scraped or name mismatch`);
      }
    }

    if (problems.length > 0) {
      console.error("=== SCRAPER HEALTH ALERT ===");
      for (const p of problems) console.error(p);
      console.error("=== Check pm2 logs tip-scraper-daemon ===");
    } else {
      console.log(`[monitor] Scraper health OK — all ${EXPECTED_VENDORS.length} vendors active in last 6h`);
    }
  });

  // ── Verification reconciliation ─────────────────────────────────────────
  // Keeps the *_verified flags on `transceivers` consistent with each other
  // and with the price observations actually present.
  await boss.work("maintenance:reconcile-verification", async () => {
    const { pool } = await import("./utils/db");

    // 1. Reset competitor_verified=false for products with no non-Flexoptix price in last 30 days
    // NOTE(review): this only considers observations stored under the
    // transceiver's own id. If competitor prices are stored under the
    // competitor's transceiver id, rows verified by
    // "maintenance:find-equivalences" may be reset here — confirm.
    const resetComp = await pool.query(`
      UPDATE transceivers t
      SET competitor_verified = false, competitor_verified_at = NULL
      WHERE competitor_verified = true
        AND NOT EXISTS (
          SELECT 1 FROM price_observations po
          JOIN vendors v ON po.source_vendor_id = v.id
          WHERE po.transceiver_id = t.id
            AND po.time > NOW() - INTERVAL '30 days'
            AND UPPER(v.name) NOT LIKE '%FLEXOPTIX%'
        )
    `);

    // 2. Reset fully_verified=false for products that no longer meet all four
    //    criteria (e.g. lost competitor_verified in step 1)
    const resetFull = await pool.query(`
      UPDATE transceivers
      SET fully_verified = false, fully_verified_at = NULL
      WHERE fully_verified = true
        AND (competitor_verified = false OR price_verified = false
             OR image_verified = false OR details_verified = false)
    `);

    // 3. 
// Set fully_verified=true for products that now meet all 4 criteria
    const setFull = await pool.query(`
      UPDATE transceivers
      SET fully_verified = true,
          fully_verified_at = COALESCE(fully_verified_at, NOW())
      WHERE competitor_verified = true
        AND price_verified = true
        AND image_verified = true
        AND details_verified = true
        AND fully_verified = false
    `);

    console.log(
      `[reconcile] competitor_verified reset: ${resetComp.rowCount}, ` +
      `fully_verified cleared: ${resetFull.rowCount}, ` +
      `fully_verified earned: ${setFull.rowCount}`
    );
  });

  // ── Equivalence matching ────────────────────────────────────────────────
  // Matches Flexoptix SKUs to technically equivalent competitor products by specs.
  // Candidates are pre-filtered on identical form_factor and speed_gbps and must
  // have a price observation in the last 30 days.
  // Confidence scoring (115 pts max, confidence = clamp(score / 115, 0, 1)):
  //   form_factor(25) + speed_gbps(20) + standard_name(30) +
  //   wavelength ±15nm(20) + fiber_type(10) + reach ±25%(10)
  //   Penalties: wavelength mismatch −20, fiber_type mismatch −15,
  //   reach mismatch −15. Both reaches unknown: +5.
  //   ≥0.73      → auto_approved + competitor_verified=true
  //   0.50–0.72  → pending (Manual Review queue in dashboard)
  //   <0.50      → skipped
  // Pairs a reviewer has already rejected are never re-scored.
  await boss.work("maintenance:find-equivalences", async () => {
    const { pool } = await import("./utils/db");
    const ts = new Date().toISOString();
    console.log(`[${ts}] Running: Equivalence matching`);

    const MAX_SCORE = 115;
    const AUTO_APPROVE_MIN_CONFIDENCE = 0.73;
    const REVIEW_MIN_CONFIDENCE = 0.50;

    // Extract the first 3–4 digit number from a free-text wavelength field.
    // "wavelengths" is text: "1310 nm", "850nm", "1270/1290/1310/1330 nm" etc.
    const extractNm = (w: string | null): number | null => {
      if (!w) return null;
      const m = w.match(/(\d{3,4})/);
      return m ? parseInt(m[1], 10) : null;
    };

    // Find all Flexoptix transceivers that are NOT yet competitor_verified
    const flexResult = await pool.query(`
      SELECT t.id, t.part_number, t.standard_name, t.form_factor, t.speed_gbps,
             t.fiber_type, t.reach_meters, t.wavelengths, t.connector,
             t.wdm_type, t.coherent
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
        AND t.competitor_verified = false
    `);

    let autoApproved = 0;
    let queued = 0;
    let skipped = 0;

    for (const fx of flexResult.rows) {
      // Parsed once per Flexoptix product rather than once per candidate.
      const fxNm = extractNm(fx.wavelengths);

      // Find competitor transceivers with recent price observations and matching specs.
      // Pairs with status 'rejected' are excluded: the upsert below already leaves
      // such rows untouched, and without this filter a rejected pair would still
      // flip competitor_verified back to true on every run.
      const candidates = await pool.query(`
        SELECT t.id AS competitor_id, t.part_number, t.standard_name, t.form_factor,
               t.speed_gbps, t.fiber_type, t.reach_meters, t.wavelengths, t.connector,
               v.name AS vendor_name,
               MAX(po.time) AS last_price,
               COUNT(*) AS price_count
        FROM transceivers t
        JOIN vendors v ON v.id = t.vendor_id
        JOIN price_observations po ON po.transceiver_id = t.id
        WHERE UPPER(v.name) NOT LIKE '%FLEXOPTIX%'
          AND po.time > NOW() - INTERVAL '30 days'
          AND t.form_factor = $1
          AND t.speed_gbps = $2
          AND t.id != $3
          AND NOT EXISTS (
            SELECT 1 FROM transceiver_equivalences te
            WHERE te.flexoptix_id = $3
              AND te.competitor_id = t.id
              AND te.status = 'rejected'
          )
        GROUP BY t.id, t.part_number, t.standard_name, t.form_factor, t.speed_gbps,
                 t.fiber_type, t.reach_meters, t.wavelengths, t.connector, v.name
      `, [fx.form_factor, fx.speed_gbps, fx.id]);

      for (const cand of candidates.rows) {
        // Confidence scoring
        // Max points: form_factor(25) + speed_gbps(20) + standard_name(30) +
        //             wavelength_nm(20) + fiber_type(10) + reach(10) = 115
        let score = 0;
        const basis: string[] = [];

        // form_factor already matched (pre-filter), award points
        score += 25;
        basis.push("form_factor");

        // speed_gbps already matched (pre-filter)
        score += 20;
        basis.push("speed_gbps");

        // standard_name match (strong signal — e.g. "10GBASE-LR")
        if (fx.standard_name && cand.standard_name &&
            fx.standard_name.trim().toUpperCase() === cand.standard_name.trim().toUpperCase()) {
          score += 30;
          basis.push("standard_name");
        }

        // wavelength match — compare first numeric nm value within ±15nm
        const candNm = extractNm(cand.wavelengths);
        if (fxNm !== null && candNm !== null) {
          if (Math.abs(fxNm - candNm) <= 15) {
            score += 20;
            basis.push(`wavelength_${fxNm}nm`);
          } else {
            score -= 20; // hard penalize wrong wavelength (1310 vs 1550 = completely different product)
          }
        }

        // fiber_type match (SMF vs MMF — critical)
        if (fx.fiber_type && cand.fiber_type) {
          if (fx.fiber_type.trim().toUpperCase() === cand.fiber_type.trim().toUpperCase()) {
            score += 10;
            basis.push("fiber_type");
          } else {
            score -= 15; // SMF vs MMF = wrong product
          }
        }

        // reach within ±25% (tolerance is relative to the Flexoptix reach)
        if (fx.reach_meters && cand.reach_meters && fx.reach_meters > 0 && cand.reach_meters > 0) {
          const diff = Math.abs(fx.reach_meters - cand.reach_meters);
          const tolerance = Math.max(fx.reach_meters, 1) * 0.25;
          if (diff <= tolerance) {
            score += 10;
            basis.push("reach");
          } else {
            score -= 15; // penalize mismatched reach
          }
        } else if (!fx.reach_meters && !cand.reach_meters) {
          score += 5;
          basis.push("reach_null");
        }

        const confidence = Math.max(0, Math.min(1, score / MAX_SCORE));

        if (confidence < REVIEW_MIN_CONFIDENCE) {
          skipped++;
          continue;
        }

        const notes = `${fx.part_number} ↔ ${cand.part_number} (${cand.vendor_name}) | ` +
          `basis: ${basis.join(", ")} | reach: ${fx.reach_meters}m vs ${cand.reach_meters}m | ` +
          `wavelength: ${fx.wavelengths||"?"} vs ${cand.wavelengths||"?"}`;

        // Upsert equivalence candidate. On conflict only the score and notes are
        // refreshed; the stored status is kept, and manually approved or rejected
        // rows are not modified at all.
        const autoApprove = confidence >= AUTO_APPROVE_MIN_CONFIDENCE;
        const status = autoApprove ? "auto_approved" : "pending";
        await pool.query(`
          INSERT INTO transceiver_equivalences
            (flexoptix_id, competitor_id, confidence, match_basis, match_notes, status)
          VALUES ($1, $2, $3, $4, $5, $6)
          ON CONFLICT (flexoptix_id, competitor_id) DO UPDATE SET
            confidence = EXCLUDED.confidence,
            match_basis = EXCLUDED.match_basis,
            match_notes = EXCLUDED.match_notes,
            updated_at = NOW()
          WHERE transceiver_equivalences.status NOT IN ('approved', 'rejected')
        `, [fx.id, cand.competitor_id, confidence, basis, notes, status]);

        if (autoApprove) {
          // Auto-approve: set competitor_verified on the Flexoptix transceiver
          await pool.query(`
            UPDATE transceivers
            SET competitor_verified = true, competitor_verified_at = NOW()
            WHERE id = $1 AND competitor_verified = false
          `, [fx.id]);
          autoApproved++;
        } else {
          queued++;
        }
      }
    }

    console.log(
      `[find-equivalences] auto_approved: ${autoApproved}, ` +
      `queued for review: ${queued}, skipped (low confidence): ${skipped}`
    );

    // After auto-approvals, rerun fully_verified check
    if (autoApproved > 0) {
      await pool.query(`
        UPDATE transceivers
        SET fully_verified = true,
            fully_verified_at = COALESCE(fully_verified_at, NOW())
        WHERE competitor_verified = true
          AND price_verified = true
          AND image_verified = true
          AND details_verified = true
          AND fully_verified = false
      `);
    }
  });

  // ── Re-research approved equivalences ────────────────────────────────────────
  // Processes up to 200 approved/auto_approved equivalences per run that have
  // re_research_due_at <= NOW().
  // Current check: the competitor transceiver must still have a price observation
  // from the last 45 days.
  //   - yes → approval is confirmed (re_researched_at = NOW(), next check in 30 days)
  //   - no  → the equivalence reverts to pending
  // NOTE: spec confidence is not re-scored here; only price recency is checked.
await boss.work("maintenance:re-research-equivalences", async () => {
    // Re-validates due approved/auto_approved equivalences, at most 200 per run.
    // The only criterion checked is whether the competitor transceiver still has
    // a price observation within the last 45 days; specs are not re-scored.
    const { pool } = await import("./utils/db");
    const ts = new Date().toISOString();
    console.log(`[${ts}] Running: Re-research approved equivalences`);

    // The price-recency check is computed in the batch query itself so the job
    // does not issue one extra round trip per equivalence.
    // The join to transceivers only keeps rows whose Flexoptix product exists.
    const batch = await pool.query(`
      SELECT eq.id, eq.flexoptix_id, eq.competitor_id,
             EXISTS (
               SELECT 1 FROM price_observations po
               WHERE po.transceiver_id = eq.competitor_id
                 AND po.time > NOW() - INTERVAL '45 days'
             ) AS has_recent_price
      FROM transceiver_equivalences eq
      JOIN transceivers fx ON eq.flexoptix_id = fx.id
      WHERE eq.status IN ('approved', 'auto_approved')
        AND eq.re_research_due_at IS NOT NULL
        AND eq.re_research_due_at <= NOW()
      ORDER BY eq.re_research_due_at ASC
      LIMIT 200
    `);

    let confirmed = 0;
    let reverted = 0;

    for (const eq of batch.rows) {
      const hasRecentPrice = eq.has_recent_price === true;

      if (!hasRecentPrice) {
        // Competitor no longer carries this — revert to pending for manual review
        await pool.query(`
          UPDATE transceiver_equivalences
          SET status = 'pending',
              re_research_due_at = NULL,
              re_researched_at = NULL,
              match_notes = CONCAT(match_notes, E'\n[Re-research ' || NOW()::date || ': no recent price — reverted to pending]')
          WHERE id = $1
        `, [eq.id]);

        // Reset competitor_verified if no other approved equivalence covers this transceiver
        await pool.query(`
          UPDATE transceivers
          SET competitor_verified = false, competitor_verified_at = NULL,
              fully_verified = false, fully_verified_at = NULL
          WHERE id = $1
            AND NOT EXISTS (
              SELECT 1 FROM transceiver_equivalences
              WHERE flexoptix_id = $1
                AND status IN ('approved', 'auto_approved')
                AND id != $2
            )
        `, [eq.flexoptix_id, eq.id]);
        reverted++;
      } else {
        // Still valid — confirm and schedule next re-research in 30 days
        await pool.query(`
          UPDATE transceiver_equivalences
          SET re_researched_at = NOW(),
              re_research_due_at = NOW() + INTERVAL '30 days'
          WHERE id = $1
        `, [eq.id]);
        confirmed++;
      }
    }

    console.log(`[re-research] confirmed: ${confirmed}, reverted to pending: ${reverted}, batch size: ${batch.rows.length}`);
  });

  // NOTE(review): the job count in this message is hard-coded — confirm it
  // still matches the number of boss.work() registrations.
  console.log("All workers registered (76 jobs, 24/7 continuous)");
}