1179 lines
60 KiB
TypeScript
1179 lines
60 KiB
TypeScript
/**
|
||
* pg-boss Job Scheduler — 24/7 Continuous Scraping
|
||
*
|
||
* ARCHITECTURE:
|
||
* - Erik (VPS, .82) : Playwright-heavy scrapers (FS.com, 10Gtek, ATGBICS, ProLabs)
|
||
* + all compatibility + eBay + compute + NAS sync
|
||
* - Raspberry Pi Fleet : Lightweight fetch/cheerio scrapers run continuously all day
|
||
* (BlueOptics, Fiber24, T&S Com, Fluxlight, GBICs, Optcore,
|
||
* Champion ONE, SFPCables, SmartOptics, HUBER+SUHNER, etc.)
|
||
*
|
||
 * SCHEDULE PHILOSOPHY (kept in line with the cron expressions in registerSchedules):
 * - Pricing scrapers (Playwright and fetch/cheerio alike): every 2h, staggered by minute offset
 * - Flexoptix catalog: every 2h (fast GraphQL, primary price source)
 * - Manufacturer catalogs (no prices): every 4h
 * - Vendor lists: every 12h
 * - Compatibility matrices: every 12h (rarely change); Flexoptix compatibility once daily at 09:00
 * - eBay enrichment: every 6h
 * - Intelligence/community: every 6h (NOG talks weekly, docs every 12h)
 * - Compute jobs: every 4h, late in the hour so the pricing scrapers have run
 * - NAS sync: once daily at 07:55
|
||
*/
|
||
import PgBoss from "pg-boss";
|
||
import { config } from "dotenv";
|
||
import { join } from "path";
|
||
import { loadavg } from "os";
|
||
|
||
// withIsolatedStorage removed — all Crawlee scrapers now use makeCrawleeConfig()
|
||
// for instance-level storage isolation. See packages/scraper/src/utils/crawlee-config.ts
|
||
|
||
/**
 * Load guard for resource-heavy scrapers.
 *
 * Reads the 1-minute system load average and compares it with `maxLoad`.
 * When the load exceeds the limit a warning is written to the console.
 *
 * @param maxLoad Highest 1-minute load average still considered acceptable.
 *                Defaults to 2.5 (sized by the original author as 50% of 5 vCPUs).
 * @returns `true` when the current load does not exceed `maxLoad`, `false` otherwise.
 */
function isLoadAcceptable(maxLoad = 2.5): boolean {
  const [oneMinuteLoad] = loadavg();
  // Phrased as a negated ">" so a NaN limit keeps yielding "acceptable".
  const withinBudget = !(oneMinuteLoad > maxLoad);

  if (!withinBudget) {
    console.warn(`[load-guard] 1m load avg ${oneMinuteLoad.toFixed(2)} > ${maxLoad} — deferring heavy scraper`);
  }

  return withinBudget;
}
|
||
|
||
// Load environment variables from the .env file located three directories
// above this module's directory.
config({ path: join(__dirname, "../../..", ".env") });
|
||
|
||
/**
 * Assembles the Postgres connection URI from the POSTGRES_* variables.
 *
 * The user name and password are percent-encoded so that credentials
 * containing URI-reserved characters (`@`, `/`, `:`, `#`, …) cannot corrupt
 * the URI. Values made only of unreserved characters — including every
 * default below — come out unchanged.
 *
 * NOTE(review): a password that is already percent-encoded in .env would now
 * be encoded twice — confirm no deployment relies on pre-encoded credentials.
 *
 * @param env Variable lookup table; defaults to `process.env`. Unset or empty
 *            variables fall back to the local development defaults.
 * @returns A `postgres://user:password@host:port/database` URI.
 */
function buildConnectionString(env: Record<string, string | undefined> = process.env): string {
  const user = encodeURIComponent(env["POSTGRES_USER"] || "tip");
  const password = encodeURIComponent(env["POSTGRES_PASSWORD"] || "tip_dev_2026");
  const host = env["POSTGRES_HOST"] || "localhost";
  const port = env["POSTGRES_PORT"] || "5433";
  const database = env["POSTGRES_DB"] || "transceiver_db";

  return `postgres://${user}:${password}@${host}:${port}/${database}`;
}

const connectionString = buildConnectionString();
|
||
|
||
/**
 * Creates a pg-boss instance on the module-level connection string, attaches
 * an error logger, starts it and returns the started instance.
 *
 * @returns The started PgBoss instance.
 */
export async function createScheduler(): Promise<PgBoss> {
  // Instance-wide defaults; individual schedules pass their own retryLimit / expireInSeconds.
  const bossOptions = {
    connectionString,
    retryLimit: 3,
    retryDelay: 30,
    retryBackoff: true,
    expireInSeconds: 300,
    monitorStateIntervalSeconds: 60,
  };
  const scheduler = new PgBoss(bossOptions);

  const logBossError = (err: Error): void => {
    console.error("pg-boss error:", err);
  };
  scheduler.on("error", logBossError);

  await scheduler.start();
  console.log("pg-boss scheduler started");

  return scheduler;
}
|
||
|
||
/** One recurring job: queue name, cron expression, retry limit and expiry in seconds. */
type CronJobSpec = readonly [queue: string, cron: string, retryLimit: number, expireInSeconds: number];

/**
 * Single source of truth for every recurring job.
 *
 * Both the queue list and the schedule registrations are derived from this
 * table, so a job cannot be scheduled without its queue being created and
 * the logged job count cannot go stale.
 */
const CRON_JOBS: readonly CronJobSpec[] = [
  // ── Pricing: Playwright scrapers (resource-heavy) — every 2h, 10min apart ──
  ["scrape:pricing:fs", "0 */2 * * *", 3, 5400],
  ["scrape:pricing:10gtek", "10 */2 * * *", 2, 3600],
  ["scrape:pricing:atgbics", "20 */2 * * *", 2, 3600],
  ["scrape:pricing:prolabs", "30 */2 * * *", 2, 3600],

  // ── Pricing: fetch/cheerio scrapers (lightweight) — every 2h, 5min apart ──
  ["scrape:pricing:fluxlight", "0 */2 * * *", 2, 3600],
  ["scrape:pricing:gbics", "5 */2 * * *", 2, 3600],
  ["scrape:pricing:optcore", "10 */2 * * *", 2, 3600],
  ["scrape:pricing:champion-one", "15 */2 * * *", 2, 3600],
  ["scrape:pricing:sfpcables", "20 */2 * * *", 2, 3600],
  ["scrape:pricing:blueoptics", "25 */2 * * *", 2, 3600],
  ["scrape:pricing:fiber24", "30 */2 * * *", 2, 3600],
  ["scrape:pricing:tscom", "35 */2 * * *", 2, 3600],
  ["scrape:pricing:skylane", "40 */2 * * *", 2, 3600],
  ["scrape:pricing:ascentoptics", "45 */2 * * *", 2, 3600],
  ["scrape:pricing:gaotek", "50 */2 * * *", 2, 3600],

  // ── Pricing: form-factor coverage scrapers — every 2h ──
  ["scrape:pricing:comms-express", "5 */2 * * *", 2, 5400],
  ["scrape:pricing:router-switch", "15 */2 * * *", 2, 5400],
  ["scrape:pricing:multimode-inc", "25 */2 * * *", 2, 3600],
  ["scrape:pricing:optictransceiver", "35 */2 * * *", 2, 3600],
  ["scrape:pricing:wiitek", "45 */2 * * *", 2, 3600],

  // ── Pricing: fetch-based scrapers running on Erik — every 2h, end-of-cycle slots ──
  ["scrape:pricing:naddod", "48 */2 * * *", 2, 3600],
  ["scrape:pricing:qsfptek", "52 */2 * * *", 2, 3600],
  ["scrape:pricing:addon", "55 */2 * * *", 2, 3600],
  ["scrape:pricing:fibermall", "57 */2 * * *", 2, 3600],
  ["scrape:pricing:vcelink", "3 */2 * * *", 2, 3600],
  ["scrape:pricing:opticsbay", "7 */2 * * *", 2, 3600],

  // ── OEM reference prices via Mouser API — once daily at 03:00 ──
  // Slow: 2s/PID × 475 PIDs ≈ 16min. Requires MOUSER_API_KEY env var (free at mouser.com/api-hub).
  ["scrape:pricing:mouser-oem", "0 3 * * *", 1, 3600],

  // ── Flexoptix catalog — every 2h (primary price source) ──
  ["scrape:pricing:flexoptix", "0 */2 * * *", 3, 3600],

  // ── Manufacturer catalogs — every 4h (product data, no prices) ──
  ["scrape:catalog:smartoptics", "10 */4 * * *", 2, 3600],
  ["scrape:catalog:hubersuhner", "25 */4 * * *", 2, 3600],
  ["scrape:catalog:eoptolink", "40 */4 * * *", 2, 3600],

  // ── Vendor lists — every 12h ──
  ["scrape:vendors:flexoptix", "0 5,17 * * *", 2, 1800],
  ["scrape:vendors:flexoptix-supported", "15 5,17 * * *", 2, 1800],

  // ── Compatibility matrices ──
  // Flexoptix compatibility: once daily at 09:00 (after both switch-assets + image-fetcher).
  ["scrape:compat:flexoptix", "0 9 * * *", 1, 7200],
  // Remaining matrices: every 12h.
  ["scrape:compat:cisco", "0 6,18 * * *", 2, 3600],
  ["scrape:compat:juniper", "15 6,18 * * *", 2, 3600],
  ["scrape:compat:sonic", "30 6,18 * * *", 2, 3600],
  ["scrape:compat:ufispace", "45 6,18 * * *", 2, 1800],
  ["scrape:compat:edgecore", "55 6,18 * * *", 2, 1800],

  // ── Switch assets — every 12h ──
  ["scrape:assets:switches", "30 7,19 * * *", 1, 3600],
  // og:image fetcher: daily at 08:30, one hour after the 07:30 switch-assets run.
  ["scrape:images:switches", "30 8 * * *", 1, 7200],
  // Playwright image scraper for bot-blocked vendors (Arista/Dell/Edgecore/Fortinet/Extreme).
  // "*/3" in the day-of-month field fires on days 1, 4, 7, …, so the 3-day
  // spacing restarts at every month boundary.
  ["scrape:images:switches:playwright", "0 9 */3 * *", 1, 10800],

  // ── eBay enrichment — every 6h ──
  ["enrich:ebay-transceivers", "0 0,6,12,18 * * *", 2, 7200],
  ["enrich:ebay-switches", "30 0,6,12,18 * * *", 2, 7200],

  // ── Intelligence & community — every 6h unless noted ──
  ["scrape:market-intel", "0 2,8,14,20 * * *", 2, 3600],
  // NOG conference talks — weekly on Mondays 06:00 UTC.
  ["scrape:nog-talks", "0 6 * * 1", 2, 7200],
  ["scrape:community-issues", "30 2,8,14,20 * * *", 1, 3600],
  ["scrape:datasheet-links", "0 3,9,15,21 * * *", 1, 3600],
  ["scrape:news", "20 3,9,15,21 * * *", 2, 1800],
  ["scrape:faq", "40 3,9,15,21 * * *", 2, 3600],
  // Docs — every 12h.
  ["scrape:docs", "50 4,16 * * *", 2, 3600],

  // ── Compute jobs — every 4h, late in the hour ──
  ["compute:abc", "50 3,7,11,15,19,23 * * *", 2, 600],
  ["compute:reorder-signals", "55 3,7,11,15,19,23 * * *", 2, 600],

  // ── Prediction signal scrapers ──
  // SEC EDGAR CapEx — weekly Monday 06:00 (filings don't change that fast).
  ["scrape:signals:sec-edgar", "0 6 * * 1", 2, 3600],
  // GitHub signals — weekly Sunday 05:00.
  ["scrape:signals:github", "0 5 * * 0", 2, 7200],
  // eBay sold velocity — every 12h (fast-moving market signal).
  ["scrape:signals:ebay-velocity", "0 4,16 * * *", 2, 3600],
  // AI cluster RSS feeds — every 4h (news moves fast).
  ["scrape:signals:ai-clusters", "10 0,4,8,12,16,20 * * *", 2, 1800],
  // Distributor lead times — daily 03:30 (stock changes overnight).
  ["scrape:signals:distributor-leads", "30 3 * * *", 2, 3600],
  // Standards tracker — weekly Wednesday 04:00 (standards move slowly).
  ["scrape:signals:standards", "0 4 * * 3", 1, 3600],

  // ── Forecast engine — daily at 08:00 ──
  ["compute:forecast", "0 8 * * *", 2, 600],
  // Hype Cycle Engine — daily at 04:30 (after the Mouser OEM scraper at 03:00).
  ["compute:hype-cycle", "30 4 * * *", 1, 600],

  // ── NAS sync — once daily at 07:55 ──
  ["sync:nas", "55 7 * * *", 1, 1800],

  // ── Health check — every 3h; warns if any vendor has no new prices recently ──
  ["monitor:scraper-health", "17 */3 * * *", 1, 600],

  // ── Verification reconciliation — nightly at 01:00 UTC ──
  // Resets competitor_verified/fully_verified for any transceiver that no longer
  // has a real non-Flexoptix price in the last 30 days — prevents stale ★ 100% badges.
  ["maintenance:reconcile-verification", "0 1 * * *", 1, 1800],

  // ── Equivalence matching — nightly at 02:00 UTC (after reconcile) ──
  ["maintenance:find-equivalences", "0 2 * * *", 1, 3600],

  // ── Re-research approved equivalences — daily at 03:00 UTC, 200 items per run ──
  ["maintenance:re-research-equivalences", "0 3 * * *", 1, 3600],
];

/**
 * Creates every job queue and registers its cron schedule with pg-boss.
 *
 * Queues are created first (sequentially), then all schedules are registered
 * (sequentially), each with an empty payload and its own `retryLimit` /
 * `expireInSeconds`. The number reported in the final log line is taken from
 * the table above rather than hard-coded.
 *
 * @param boss Started PgBoss instance to register queues and schedules on.
 * @returns Resolves once every schedule has been registered; rejects if a
 *          `schedule()` call rejects.
 */
export async function registerSchedules(boss: PgBoss): Promise<void> {
  for (const [queue] of CRON_JOBS) {
    // Best-effort: a rejection is treated as "queue already exists".
    // NOTE(review): this also hides genuine createQueue failures — confirm
    // that schedule() below fails loudly when a queue is really missing.
    await boss.createQueue(queue).catch(() => { /* already exists */ });
  }

  for (const [queue, cron, retryLimit, expireInSeconds] of CRON_JOBS) {
    await boss.schedule(queue, cron, {}, { retryLimit, expireInSeconds });
  }

  console.log(`All schedules registered — 24/7 continuous scraping (${CRON_JOBS.length} jobs)`);
}
|
||
|
||
export async function registerWorkers(boss: PgBoss): Promise<void> {
|
||
// Lazy-load all scrapers
|
||
const { scrapeFs } = await import("./scrapers/fs-com");
|
||
const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
|
||
const { scrapeSmartOptics } = await import("./scrapers/smartoptics");
|
||
const { scrapeHuberSuhner } = await import("./scrapers/hubersuhner");
|
||
const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence");
|
||
const { scrapeNews } = await import("./scrapers/news");
|
||
const { scrape10Gtek } = await import("./scrapers/tenGtek");
|
||
const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog");
|
||
const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
|
||
const { seedFlexoptixVendors } = await import("./scrapers/flexoptix-supported-vendors");
|
||
const { scrapeAtgbics } = await import("./scrapers/atgbics");
|
||
const { scrapeProLabs } = await import("./scrapers/prolabs");
|
||
const { scrapeJuniperHct } = await import("./scrapers/juniper-hct");
|
||
const { scrapeSonicHcl } = await import("./scrapers/sonic-hcl");
|
||
const { scrapeUfiSpace } = await import("./scrapers/ufispace");
|
||
const { scrapeEdgecore } = await import("./scrapers/edgecore");
|
||
const { scrapeSwitchAssets } = await import("./scrapers/switch-assets");
|
||
const { fetchSwitchImages } = await import("./scrapers/switch-image-fetcher");
|
||
const { fetchSwitchImagesPlaywright } = await import("./scrapers/switch-image-playwright");
|
||
const { scrapeFlexoptixCompatibility } = await import("./scrapers/flexoptix-compat");
|
||
// ── Prediction signal scrapers ────────────────────────────────────────
|
||
const { scrapeSecEdgar } = await import("./scrapers/sec-edgar");
|
||
const { scrapeGithubSignals } = await import("./scrapers/github-signals");
|
||
const { scrapeEbayVelocity } = await import("./scrapers/ebay-velocity");
|
||
const { scrapeAiClusters } = await import("./scrapers/ai-clusters");
|
||
const { scrapeDistributorLeads }= await import("./scrapers/distributor-leads");
|
||
const { scrapeStandardsTracker }= await import("./scrapers/standards-tracker");
|
||
const { runForecastEngine } = await import("./utils/forecast-engine");
|
||
|
||
// ── Playwright scrapers ───────────────────────────────────────────────
|
||
|
||
await boss.work("scrape:pricing:fs", async () => {
|
||
// FS.com uses Playwright + Cloudflare bypass. On datacenter servers the
|
||
// datacenter IP is blocked by Cloudflare WAF. Set SKIP_FS_SCRAPER=true to
|
||
// skip on Erik; the Mac launchd cron handles FS.com from a residential IP.
|
||
if (process.env["SKIP_FS_SCRAPER"] === "true") {
|
||
console.log(`[${new Date().toISOString()}] FS.com pricing: SKIPPED (SKIP_FS_SCRAPER=true)`);
|
||
return;
|
||
}
|
||
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
|
||
await scrapeFs();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:10gtek", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
|
||
await scrape10Gtek();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:atgbics", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
|
||
await scrapeAtgbics();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:prolabs", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: ProLabs pricing`);
|
||
await scrapeProLabs();
|
||
});
|
||
|
||
// ── Lightweight fetch/cheerio scrapers ───────────────────────────────
|
||
await boss.work("scrape:pricing:fluxlight", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Fluxlight pricing`);
|
||
const { scrapeFluxlight } = await import("./scrapers/fluxlight");
|
||
await scrapeFluxlight();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:gbics", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: GBICS pricing`);
|
||
const { scrapeGbics } = await import("./scrapers/gbics");
|
||
await scrapeGbics();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:optcore", async () => {
|
||
// Optcore.net WP REST API is blocked by Cloudflare WAF for Erik's datacenter
|
||
// IP (82.165.222.127). Set SKIP_OPTCORE_SCRAPER=true to suppress the wasted
|
||
// run. A residential IP (Mac launchd) would be needed to scrape Optcore.
|
||
if (process.env["SKIP_OPTCORE_SCRAPER"] === "true") {
|
||
console.log(`[${new Date().toISOString()}] Optcore pricing: SKIPPED (SKIP_OPTCORE_SCRAPER=true)`);
|
||
return;
|
||
}
|
||
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
|
||
const { scrapeOptcore } = await import("./scrapers/optcore");
|
||
await scrapeOptcore();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:champion-one", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Champion ONE pricing`);
|
||
const { scrapeChampionOne } = await import("./scrapers/champion-one");
|
||
await scrapeChampionOne();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:sfpcables", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: SFPcables pricing`);
|
||
const { scrapeSfpCables } = await import("./scrapers/sfpcables");
|
||
await scrapeSfpCables();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:blueoptics", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: BlueOptics pricing`);
|
||
const { scrapeBlueOptics } = await import("./scrapers/blueoptics");
|
||
await scrapeBlueOptics();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:fiber24", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Fiber24 pricing`);
|
||
const { scrapeFiber24 } = await import("./scrapers/fiber24");
|
||
await scrapeFiber24();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:tscom", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: T&S Communication pricing`);
|
||
const { scrapeTsCom } = await import("./scrapers/tscom");
|
||
await scrapeTsCom();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:skylane", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Skylane pricing`);
|
||
const { scrapeSkylane } = await import("./scrapers/skylane");
|
||
await scrapeSkylane();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:ascentoptics", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Ascent Optics pricing`);
|
||
const { scrapeAscentOptics } = await import("./scrapers/ascentoptics");
|
||
await scrapeAscentOptics();
|
||
});
|
||
|
||
await boss.work("scrape:pricing:gaotek", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: GAO Tek pricing`);
|
||
const { scrapeGaoTek } = await import("./scrapers/gaotek");
|
||
await scrapeGaoTek();
|
||
});
|
||
|
||
// ── Catalog scrapers ──────────────────────────────────────────────────
|
||
|
||
await boss.work("scrape:pricing:flexoptix", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Flexoptix catalog`);
|
||
await scrapeFlexoptixCatalog();
|
||
});
|
||
|
||
await boss.work("scrape:catalog:smartoptics", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: SmartOptics catalog`);
|
||
await scrapeSmartOptics();
|
||
});
|
||
|
||
await boss.work("scrape:catalog:hubersuhner", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: HUBER+SUHNER catalog`);
|
||
await scrapeHuberSuhner();
|
||
});
|
||
|
||
await boss.work("scrape:catalog:eoptolink", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Eoptolink OEM catalog`);
|
||
const { scrapeEoptolink } = await import("./scrapers/eoptolink");
|
||
await scrapeEoptolink();
|
||
});
|
||
|
||
// ── Vendor lists ──────────────────────────────────────────────────────
|
||
|
||
await boss.work("scrape:vendors:flexoptix", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`);
|
||
await scrapeFlexoptixVendors();
|
||
});
|
||
|
||
await boss.work("scrape:vendors:flexoptix-supported", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Flexoptix supported vendors`);
|
||
await seedFlexoptixVendors();
|
||
});
|
||
|
||
// ── Compatibility scrapers ────────────────────────────────────────────
|
||
|
||
await boss.work("scrape:compat:flexoptix", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Flexoptix compatibility mapping`);
|
||
if (!isLoadAcceptable(2.5)) {
|
||
console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping Flexoptix compat scrape`);
|
||
return;
|
||
}
|
||
await scrapeFlexoptixCompatibility();
|
||
});
|
||
|
||
await boss.work("scrape:compat:cisco", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`);
|
||
await scrapeCiscoTmg();
|
||
});
|
||
|
||
await boss.work("scrape:compat:juniper", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`);
|
||
await scrapeJuniperHct();
|
||
});
|
||
|
||
await boss.work("scrape:compat:sonic", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`);
|
||
await scrapeSonicHcl();
|
||
});
|
||
|
||
await boss.work("scrape:compat:ufispace", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`);
|
||
await scrapeUfiSpace();
|
||
});
|
||
|
||
await boss.work("scrape:compat:edgecore", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`);
|
||
await scrapeEdgecore();
|
||
});
|
||
|
||
// ── Switch assets ─────────────────────────────────────────────────────
|
||
|
||
await boss.work("scrape:assets:switches", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`);
|
||
await scrapeSwitchAssets();
|
||
});
|
||
|
||
await boss.work("scrape:images:switches", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Switch og:image fetcher`);
|
||
if (!isLoadAcceptable(2.5)) {
|
||
console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping switch image fetch`);
|
||
return;
|
||
}
|
||
await fetchSwitchImages();
|
||
});
|
||
|
||
await boss.work("scrape:images:switches:playwright", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Switch image fetcher (Playwright — bot-blocked vendors)`);
|
||
if (!isLoadAcceptable(2.0)) {
|
||
console.warn(`[${new Date().toISOString()}] ⚠ Load too high — skipping Playwright image fetch`);
|
||
return;
|
||
}
|
||
await fetchSwitchImagesPlaywright();
|
||
});
|
||
|
||
// ── eBay enrichment ───────────────────────────────────────────────────
|
||
|
||
await boss.work("enrich:ebay-transceivers", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: eBay transceiver pricing`);
|
||
const { enrichTransceiversFromEbay } = await import("./scrapers/ebay-enricher");
|
||
await enrichTransceiversFromEbay(100);
|
||
});
|
||
|
||
await boss.work("enrich:ebay-switches", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`);
|
||
const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher");
|
||
await enrichSwitchesFromEbay(30);
|
||
});
|
||
|
||
// ── Intelligence & community ──────────────────────────────────────────
|
||
|
||
await boss.work("scrape:market-intel", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
|
||
await scrapeMarketIntelligence();
|
||
});
|
||
|
||
await boss.work("scrape:nog-talks", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: NOG conference talks`);
|
||
const { scrapeNogTalks } = await import("./scrapers/nog-talks");
|
||
await scrapeNogTalks();
|
||
});
|
||
|
||
await boss.work("scrape:community-issues", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Community issues`);
|
||
const { scrapeAllSwitchIssues, scrapeTransceiverCompatIssues } = await import("./scrapers/community-issues");
|
||
await scrapeAllSwitchIssues(30);
|
||
await scrapeTransceiverCompatIssues(15);
|
||
});
|
||
|
||
await boss.work("scrape:datasheet-links", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: Datasheet links`);
|
||
const { findAndSeedDatasheetLinks } = await import("./scrapers/community-issues");
|
||
await findAndSeedDatasheetLinks(50);
|
||
});
|
||
|
||
await boss.work("scrape:news", async () => {
|
||
console.log(`[${new Date().toISOString()}] Running: News scraper`);
|
||
await scrapeNews();
|
||
});
|
||
|
||
await boss.work("scrape:faq", async () => {
|
||
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
|
||
});
|
||
|
||
await boss.work("scrape:docs", async () => {
|
||
console.log(`[${new Date().toISOString()}] Docs scraper — not yet implemented`);
|
||
});
|
||
|
||
// ── Compute jobs ──────────────────────────────────────────────────────
|
||
|
||
// ABC classification — computed by the market-intelligence module, loaded on demand.
await boss.work("compute:abc", async () => {
  const startedAt = new Date().toISOString();
  console.log(`[${startedAt}] Computing: ABC classification`);
  const { computeAbcClassification: compute } = await import("./scrapers/market-intelligence");
  await compute();
});

// Reorder signals — same module as the ABC job, loaded on demand.
await boss.work("compute:reorder-signals", async () => {
  const startedAt = new Date().toISOString();
  console.log(`[${startedAt}] Computing: Reorder signals`);
  const { computeReorderSignals: compute } = await import("./scrapers/market-intelligence");
  await compute();
});
|
||
|
||
// ── NAS sync ──────────────────────────────────────────────────────────
|
||
|
||
// NAS sync — loads the sync utility when the job fires and awaits its completion.
await boss.work("sync:nas", async () => {
  const startedAt = new Date().toISOString();
  console.log(`[${startedAt}] Running: NAS sync to Fearghas`);
  const { runNightlyNasSync: syncToNas } = await import("./utils/nas-sync");
  await syncToNas();
});
|
||
|
||
// ── Prediction signal scrapers ────────────────────────────────────────
|
||
|
||
// Prediction-signal workers share one shape (log, then run the scraper), so they
// are registered from a table. Registration order matches the table order, and
// each scraper is invoked through a thunk so it is only resolved when a job fires.
for (const { queue, label, run } of [
  { queue: "scrape:signals:sec-edgar", label: "SEC EDGAR CapEx", run: () => scrapeSecEdgar() },
  { queue: "scrape:signals:github", label: "GitHub tech signals", run: () => scrapeGithubSignals() },
  { queue: "scrape:signals:ebay-velocity", label: "eBay sold velocity", run: () => scrapeEbayVelocity() },
  { queue: "scrape:signals:ai-clusters", label: "AI cluster announcements", run: () => scrapeAiClusters() },
  { queue: "scrape:signals:distributor-leads", label: "Distributor lead times", run: () => scrapeDistributorLeads() },
  { queue: "scrape:signals:standards", label: "Standards tracker", run: () => scrapeStandardsTracker() },
]) {
  await boss.work(queue, async () => {
    console.log(`[${new Date().toISOString()}] Running: ${label}`);
    await run();
  });
}
|
||
|
||
// ── Forecast engine ───────────────────────────────────────────────────
|
||
|
||
// Forecast engine — delegates to runForecastEngine from the enclosing scope.
await boss.work("compute:forecast", async () => {
  const startedAt = new Date().toISOString();
  console.log(`[${startedAt}] Running: Forecast engine`);
  await runForecastEngine();
});
|
||
|
||
// ── Hype Cycle Engine (Norton-Bass diffusion model) ───────────────────
|
||
|
||
// Hype cycle — the engine module is loaded lazily when the job fires.
await boss.work("compute:hype-cycle", async () => {
  const startedAt = new Date().toISOString();
  console.log(`[${startedAt}] Running: Hype Cycle Engine (Norton-Bass)`);
  const { computeHypeCycle: compute } = await import("./utils/hype-cycle-engine");
  await compute();
});
|
||
|
||
// ── Form-factor coverage scrapers ─────────────────────────────────────
|
||
|
||
// Form-factor pricing workers all follow the same pattern: log a "Running" line,
// lazily import the vendor scraper, await it. They are registered from a table in
// the same order as before; import paths stay literal so they remain statically
// discoverable.
for (const { queue, label, run } of [
  {
    queue: "scrape:pricing:comms-express",
    label: "Comms-Express pricing",
    run: async () => {
      const { scrapeCommsExpress } = await import("./scrapers/comms-express");
      await scrapeCommsExpress();
    },
  },
  {
    queue: "scrape:pricing:router-switch",
    label: "Router-Switch.com pricing",
    run: async () => {
      const { scrapeRouterSwitch } = await import("./scrapers/router-switch");
      await scrapeRouterSwitch();
    },
  },
  {
    queue: "scrape:pricing:multimode-inc",
    label: "Multimode Inc pricing",
    run: async () => {
      const { scrapeMultimodeInc } = await import("./scrapers/multimode-inc");
      await scrapeMultimodeInc();
    },
  },
  {
    queue: "scrape:pricing:optictransceiver",
    label: "OpticTransceiver.com pricing",
    run: async () => {
      const { scrapeOpticTransceiver } = await import("./scrapers/optictransceiver");
      await scrapeOpticTransceiver();
    },
  },
  {
    queue: "scrape:pricing:wiitek",
    label: "Wiitek pricing",
    run: async () => {
      const { scrapeWiitek } = await import("./scrapers/wiitek");
      await scrapeWiitek();
    },
  },
  {
    queue: "scrape:pricing:naddod",
    label: "NADDOD pricing",
    run: async () => {
      const { scrapeNaddod } = await import("./scrapers/naddod");
      await scrapeNaddod();
    },
  },
  {
    queue: "scrape:pricing:qsfptek",
    label: "QSFPTEK pricing",
    run: async () => {
      const { scrapeQsfptek } = await import("./scrapers/qsfptek");
      await scrapeQsfptek();
    },
  },
  {
    queue: "scrape:pricing:addon",
    label: "AddOn Networks pricing",
    run: async () => {
      const { scrapeAddonNetworks } = await import("./scrapers/addon-networks");
      await scrapeAddonNetworks();
    },
  },
  {
    queue: "scrape:pricing:fibermall",
    label: "FiberMall pricing",
    run: async () => {
      const { scrapeFiberMall } = await import("./scrapers/fibermall");
      await scrapeFiberMall();
    },
  },
  {
    queue: "scrape:pricing:vcelink",
    label: "Vcelink pricing",
    run: async () => {
      const { scrapeVcelink } = await import("./scrapers/vcelink");
      await scrapeVcelink();
    },
  },
  {
    queue: "scrape:pricing:opticsbay",
    label: "OpticsBay pricing",
    run: async () => {
      const { scrapeOpticsBay } = await import("./scrapers/opticsbay");
      await scrapeOpticsBay();
    },
  },
]) {
  await boss.work(queue, async () => {
    console.log(`[${new Date().toISOString()}] Running: ${label}`);
    await run();
  });
}
|
||
|
||
// ── Mouser OEM reference prices ────────────────────────────────────────
|
||
// Mouser OEM reference prices. The scraper is only loaded and run when
// MOUSER_API_KEY is set; otherwise the job logs a warning and completes as a no-op.
await boss.work("scrape:pricing:mouser-oem", async () => {
  const startedAt = new Date().toISOString();
  console.log(`[${startedAt}] Running: Mouser OEM reference prices`);

  const apiKey = process.env["MOUSER_API_KEY"];
  if (apiKey) {
    const { scrapeMouser: run } = await import("./scrapers/mouser");
    await run();
  } else {
    console.warn(" [mouser-oem] Skipping — MOUSER_API_KEY not set");
  }
});
|
||
|
||
// ── Health monitor ──────────────────────────────────────────────────────
|
||
// Scraper health monitor.
// For every expected vendor it combines (a) how recently price observations were
// written and (b) the state of the vendor's pg-boss job in the last 26h, then logs
// one of: healthy (new prices in 6h), stable (job completed, prices unchanged),
// warning (stale > 48h) or critical (stale > 168h, never scraped, or the vendor
// row is missing altogether). Output goes to the console only.
await boss.work("monitor:scraper-health", async () => {
  const { pool } = await import("./utils/db");

  // Vendors we expect to see prices from regularly.
  // Mapped: display name → pg-boss job name (for last-run lookup).
  const EXPECTED_VENDORS: Array<{ name: string; jobName: string }> = [
    { name: "FiberMall", jobName: "scrape:pricing:fibermall" },
    { name: "QSFPTEK", jobName: "scrape:pricing:qsfptek" },
    { name: "Flexoptix", jobName: "scrape:pricing:flexoptix" },
    // FS.COM is skipped on Erik (SKIP_FS_SCRAPER=true) — Mac launchd handles it.
    // Excluded from the monitor so a stale last_seen doesn't trigger false positives.
    { name: "10Gtek", jobName: "scrape:pricing:10gtek" },
    { name: "ATGBICS", jobName: "scrape:pricing:atgbics" },
    { name: "GBICS", jobName: "scrape:pricing:gbics" },
    { name: "SFPcables", jobName: "scrape:pricing:sfpcables" },
    { name: "NADDOD", jobName: "scrape:pricing:naddod" },
  ];

  const vendorNames = EXPECTED_VENDORS.map((v) => v.name);
  const jobNames = EXPECTED_VENDORS.map((v) => v.jobName);
  const vendorByName = new Map(EXPECTED_VENDORS.map((v) => [v.name, v] as const));

  // Price observation recency per vendor. Only vendors that exist in the
  // `vendors` table (exact, case-sensitive name match) produce a row here.
  const priceResult = await pool.query(`
    SELECT v.name,
           SUM(CASE WHEN po.time > NOW() - INTERVAL '6 hours' THEN 1 ELSE 0 END) AS prices_6h,
           MAX(po.time) AS last_seen,
           EXTRACT(EPOCH FROM (NOW() - MAX(po.time))) / 3600.0 AS hours_since
    FROM vendors v
    LEFT JOIN price_observations po ON po.source_vendor_id = v.id
    WHERE v.name = ANY($1)
    GROUP BY v.name
    ORDER BY last_seen ASC NULLS FIRST
  `, [vendorNames]);

  // Most recent pg-boss job (any state) per vendor scraper within the last 26h,
  // so the report can distinguish "running but prices stable" from "job failing".
  // NOTE(review): pg-boss moves finished jobs out of pgboss.job after its
  // configured retention — confirm that retention covers 26h, otherwise this
  // window is effectively shorter.
  const jobResult = await pool.query(`
    SELECT DISTINCT ON (name) name, state, completed_on
    FROM pgboss.job
    WHERE name = ANY($1)
      AND created_on > NOW() - INTERVAL '26 hours'
    ORDER BY name, created_on DESC
  `, [jobNames]);

  const jobMap = new Map<string, { state: string; completed_on: Date | null }>();
  for (const row of jobResult.rows) {
    jobMap.set(row.name as string, { state: row.state as string, completed_on: row.completed_on as Date | null });
  }

  // Last COMPLETED job per vendor (to know when the job last ran successfully)
  const completedResult = await pool.query(`
    SELECT DISTINCT ON (name) name, completed_on AS last_completed
    FROM pgboss.job
    WHERE name = ANY($1)
      AND state = 'completed'
      AND created_on > NOW() - INTERVAL '26 hours'
    ORDER BY name, completed_on DESC
  `, [jobNames]);

  const completedMap = new Map<string, Date>();
  for (const row of completedResult.rows) {
    if (row.last_completed) completedMap.set(row.name as string, new Date(row.last_completed as string));
  }

  // Formats the " | job=…" suffix appended to every report line.
  const describeJob = (jobName: string | undefined): string => {
    const info = jobName ? jobMap.get(jobName) : undefined;
    if (!info) return " | job=not run in 26h";
    const at = info.completed_on ? new Date(info.completed_on).toISOString().slice(11, 16) : "?";
    return ` | job=${info.state} at ${at}`;
  };

  // Thresholds for alerting:
  //   CRITICAL (🔴): no job completed in 26h AND (never scraped OR last price > 168h)
  //   WARNING  (🟡): no job completed in 26h AND last price > 48h
  //   STABLE   (✅): job completed in last 26h AND vendor has historical prices
  //                  — prices unchanged (hash dedup), scraper is healthy
  const CRITICAL_HOURS = 168;
  const WARN_HOURS = 48;

  const critical: string[] = [];
  const warnings: string[] = [];
  const stable: string[] = [];
  const seenVendors = new Set<string>();
  let activeCount = 0;

  for (const row of priceResult.rows) {
    const name = row.name as string;
    seenVendors.add(name);

    const h = parseFloat(row.hours_since ?? "9999");
    const n = parseInt(row.prices_6h ?? "0", 10);
    if (n > 0) {
      // New prices written in the last 6h → definitely healthy.
      activeCount++;
      continue;
    }

    const lastStr = row.last_seen
      ? `last price ${h.toFixed(1)}h ago (${new Date(row.last_seen as string).toISOString().slice(0, 16)})`
      : "NEVER scraped";

    const vendor = vendorByName.get(name);
    const lastCompleted = vendor ? completedMap.get(vendor.jobName) : undefined;
    const jobStr = describeJob(vendor?.jobName);

    // If the job completed successfully in the last 26h AND the vendor has
    // historical prices, prices are just stable (hash dedup) — not an outage.
    const jobRunningOk = lastCompleted !== undefined && Boolean(row.last_seen);
    if (jobRunningOk) {
      stable.push(`✅ ${name}: prices stable (${h.toFixed(1)}h unchanged, job OK)${jobStr}`);
      continue;
    }

    if (!row.last_seen || h > CRITICAL_HOURS) {
      critical.push(`🔴 ${name}: ${lastStr}${jobStr}`);
    } else if (h > WARN_HOURS) {
      warnings.push(`🟡 ${name}: ${lastStr}${jobStr}`);
    } else {
      stable.push(`✅ ${name}: prices stable (${h.toFixed(1)}h unchanged)${jobStr}`);
    }
  }

  // An expected vendor with no row in `vendors` produced no result above. Without
  // this check it would never alert and would be silently counted as "active".
  for (const vendor of EXPECTED_VENDORS) {
    if (seenVendors.has(vendor.name)) continue;
    critical.push(
      `🔴 ${vendor.name}: NEVER scraped (no matching row in vendors table — check name spelling/case)${describeJob(vendor.jobName)}`
    );
  }

  if (critical.length > 0 || warnings.length > 0) {
    if (critical.length > 0) {
      console.error("=== 🔴 SCRAPER CRITICAL — vendors with no prices for 7+ days ===");
      for (const p of critical) console.error(p);
    }
    if (warnings.length > 0) {
      console.warn("=== 🟡 SCRAPER WARNING — vendors with stale prices (48h+) ===");
      for (const p of warnings) console.warn(p);
    }
    console.error("=== Check: pm2 logs tip-scraper-daemon ===");
  } else if (stable.length > 0) {
    console.log(`[monitor] Scraper health OK — ${activeCount} vendors active, ${stable.length} stable (no price changes)`);
    for (const s of stable) console.log(` ${s}`);
  } else {
    console.log(`[monitor] Scraper health OK — all ${EXPECTED_VENDORS.length} vendors active in last 6h`);
  }
});
|
||
|
||
// ── Verification reconciliation ─────────────────────────────────────────
|
||
// Verification reconciliation: three sequential UPDATEs on `transceivers`, each
// awaited before the next starts, followed by a one-line summary of affected rows.
await boss.work("maintenance:reconcile-verification", async () => {
  const { pool: db } = await import("./utils/db");

  // Step 1 — drop competitor_verified where no non-Flexoptix price was observed
  // for the transceiver within the last 30 days.
  const competitorReset = await db.query(`
    UPDATE transceivers t
    SET competitor_verified = false,
        competitor_verified_at = NULL
    WHERE competitor_verified = true
      AND NOT EXISTS (
        SELECT 1 FROM price_observations po
        JOIN vendors v ON po.source_vendor_id = v.id
        WHERE po.transceiver_id = t.id
          AND po.time > NOW() - INTERVAL '30 days'
          AND UPPER(v.name) NOT LIKE '%FLEXOPTIX%'
      )
  `);

  // Step 2 — drop fully_verified wherever any of the four component flags is false.
  const fullCleared = await db.query(`
    UPDATE transceivers
    SET fully_verified = false,
        fully_verified_at = NULL
    WHERE fully_verified = true
      AND (competitor_verified = false OR price_verified = false OR image_verified = false OR details_verified = false)
  `);

  // Step 3 — grant fully_verified where all four flags hold; an existing
  // fully_verified_at timestamp is kept via COALESCE.
  const fullEarned = await db.query(`
    UPDATE transceivers
    SET fully_verified = true,
        fully_verified_at = COALESCE(fully_verified_at, NOW())
    WHERE competitor_verified = true
      AND price_verified = true
      AND image_verified = true
      AND details_verified = true
      AND fully_verified = false
  `);

  const summary = [
    `competitor_verified reset: ${competitorReset.rowCount}`,
    `fully_verified cleared: ${fullCleared.rowCount}`,
    `fully_verified earned: ${fullEarned.rowCount}`,
  ].join(", ");
  console.log(`[reconcile] ${summary}`);
});
|
||
|
||
// ── Equivalence matching ────────────────────────────────────────────────
|
||
// Matches Flexoptix SKUs to technically equivalent competitor products by specs.
// Candidates are pre-filtered to the same form_factor and speed_gbps and must have
// a price observation within the last 30 days.
// Confidence = score / 115, clamped to [0, 1]:
//   form_factor   +25 (always awarded — pre-filter)
//   speed_gbps    +20 (always awarded — pre-filter)
//   standard_name +30 on case-insensitive match
//   wavelength    +20 if the first nm value is within ±15 nm, −20 otherwise
//   fiber_type    +10 on match, −15 on mismatch
//   reach         +10 within ±25% of the Flexoptix reach, −15 otherwise;
//                 +5 when neither side has a reach
//   ≥0.73     → auto_approved + competitor_verified=true
//   0.50–0.72 → pending (Manual Review queue in dashboard)
//   <0.50     → skipped
// Pairs a reviewer already marked 'approved' or 'rejected' are never modified, and
// a rejected pair can never be the reason a transceiver becomes competitor_verified.
await boss.work("maintenance:find-equivalences", async () => {
  const { pool } = await import("./utils/db");
  const ts = new Date().toISOString();
  console.log(`[${ts}] Running: Equivalence matching`);

  const MAX_SCORE = 115;
  const AUTO_APPROVE_CONFIDENCE = 0.73;
  const MIN_CONFIDENCE = 0.5;

  // "wavelengths" is text: "1310 nm", "850nm", "1270/1290/1310/1330 nm" etc.
  // Only the first 3–4 digit run is used for comparison.
  const extractNm = (w: string | null): number | null => {
    if (!w) return null;
    const m = w.match(/(\d{3,4})/);
    return m ? parseInt(m[1], 10) : null;
  };

  // Find all Flexoptix transceivers that are NOT yet competitor_verified
  const flexResult = await pool.query(`
    SELECT t.id, t.part_number, t.standard_name, t.form_factor,
           t.speed_gbps, t.fiber_type, t.reach_meters, t.wavelengths,
           t.connector, t.wdm_type, t.coherent
    FROM transceivers t
    JOIN vendors v ON v.id = t.vendor_id
    WHERE UPPER(v.name) LIKE '%FLEXOPTIX%'
      AND t.competitor_verified = false
  `);

  let autoApproved = 0;
  let queued = 0;
  let skipped = 0;
  let humanDecided = 0; // pairs left untouched because a reviewer already decided
  let newlyVerified = 0; // transceivers whose competitor_verified flipped to true

  for (const fx of flexResult.rows) {
    // Find competitor transceivers with recent price observations and matching specs
    const candidates = await pool.query(`
      SELECT t.id AS competitor_id, t.part_number, t.standard_name,
             t.form_factor, t.speed_gbps, t.fiber_type, t.reach_meters,
             t.wavelengths, t.connector, v.name AS vendor_name,
             MAX(po.time) AS last_price, COUNT(*) AS price_count
      FROM transceivers t
      JOIN vendors v ON v.id = t.vendor_id
      JOIN price_observations po ON po.transceiver_id = t.id
      WHERE UPPER(v.name) NOT LIKE '%FLEXOPTIX%'
        AND po.time > NOW() - INTERVAL '30 days'
        AND t.form_factor = $1
        AND t.speed_gbps = $2
        AND t.id != $3
      GROUP BY t.id, t.part_number, t.standard_name, t.form_factor,
               t.speed_gbps, t.fiber_type, t.reach_meters,
               t.wavelengths, t.connector, v.name
    `, [fx.form_factor, fx.speed_gbps, fx.id]);

    // Invariant across candidates, so parsed once per Flexoptix transceiver.
    const fxNm = extractNm(fx.wavelengths);

    for (const cand of candidates.rows) {
      let score = 0;
      const basis: string[] = [];

      // form_factor and speed_gbps already matched (pre-filter), award points
      score += 25; basis.push("form_factor");
      score += 20; basis.push("speed_gbps");

      // standard_name match (strong signal — e.g. "10GBASE-LR")
      if (fx.standard_name && cand.standard_name &&
          fx.standard_name.trim().toUpperCase() === cand.standard_name.trim().toUpperCase()) {
        score += 30; basis.push("standard_name");
      }

      // wavelength match — compare first nm value within ±15nm
      const candNm = extractNm(cand.wavelengths);
      if (fxNm !== null && candNm !== null) {
        if (Math.abs(fxNm - candNm) <= 15) {
          score += 20; basis.push(`wavelength_${fxNm}nm`);
        } else {
          score -= 20; // hard penalize wrong wavelength (1310 vs 1550 = completely different product)
        }
      }

      // fiber_type match (SMF vs MMF — critical)
      if (fx.fiber_type && cand.fiber_type) {
        if (fx.fiber_type.trim().toUpperCase() === cand.fiber_type.trim().toUpperCase()) {
          score += 10; basis.push("fiber_type");
        } else {
          score -= 15; // SMF vs MMF = wrong product
        }
      }

      // reach within ±25%
      if (fx.reach_meters && cand.reach_meters && fx.reach_meters > 0 && cand.reach_meters > 0) {
        const diff = Math.abs(fx.reach_meters - cand.reach_meters);
        const tolerance = Math.max(fx.reach_meters, 1) * 0.25;
        if (diff <= tolerance) {
          score += 10; basis.push("reach");
        } else {
          score -= 15; // penalize mismatched reach
        }
      } else if (!fx.reach_meters && !cand.reach_meters) {
        score += 5; basis.push("reach_null");
      }

      const confidence = Math.max(0, Math.min(1, score / MAX_SCORE));

      if (confidence < MIN_CONFIDENCE) { skipped++; continue; }

      const notes = `${fx.part_number} ↔ ${cand.part_number} (${cand.vendor_name}) | ` +
        `basis: ${basis.join(", ")} | reach: ${fx.reach_meters}m vs ${cand.reach_meters}m | ` +
        `wavelength: ${fx.wavelengths||"?"} vs ${cand.wavelengths||"?"}`;

      const status = confidence >= AUTO_APPROVE_CONFIDENCE ? "auto_approved" : "pending";

      // Upsert equivalence candidate. For machine-managed rows (pending /
      // auto_approved) the status now follows the latest score, so the row never
      // disagrees with the verification flag set below. Rows a reviewer approved
      // or rejected are left untouched, in which case RETURNING yields no row.
      const upsert = await pool.query(`
        INSERT INTO transceiver_equivalences
          (flexoptix_id, competitor_id, confidence, match_basis, match_notes, status)
        VALUES ($1, $2, $3, $4, $5, $6)
        ON CONFLICT (flexoptix_id, competitor_id) DO UPDATE SET
          confidence = EXCLUDED.confidence,
          match_basis = EXCLUDED.match_basis,
          match_notes = EXCLUDED.match_notes,
          status = EXCLUDED.status,
          updated_at = NOW()
        WHERE transceiver_equivalences.status NOT IN ('approved', 'rejected')
        RETURNING status
      `, [fx.id, cand.competitor_id, confidence, basis, notes, status]);

      if (confidence >= AUTO_APPROVE_CONFIDENCE) {
        // Set competitor_verified on the Flexoptix transceiver, but only when the
        // stored pair is actually in an approved state. This keeps a manually
        // rejected pair from verifying the product despite a high score.
        const verified = await pool.query(`
          UPDATE transceivers
          SET competitor_verified = true,
              competitor_verified_at = NOW()
          WHERE id = $1 AND competitor_verified = false
            AND EXISTS (
              SELECT 1 FROM transceiver_equivalences eq
              WHERE eq.flexoptix_id = $1
                AND eq.competitor_id = $2
                AND eq.status IN ('approved', 'auto_approved')
            )
        `, [fx.id, cand.competitor_id]);
        newlyVerified += verified.rowCount ?? 0;
      }

      if ((upsert.rowCount ?? 0) === 0) {
        humanDecided++;
      } else if (status === "auto_approved") {
        autoApproved++;
      } else {
        queued++;
      }
    }
  }

  console.log(
    `[find-equivalences] auto_approved: ${autoApproved}, ` +
    `queued for review: ${queued}, skipped (low confidence): ${skipped}, ` +
    `left untouched (human-reviewed): ${humanDecided}`
  );

  // After any verification change, rerun fully_verified check
  if (autoApproved > 0 || newlyVerified > 0) {
    await pool.query(`
      UPDATE transceivers
      SET fully_verified = true,
          fully_verified_at = COALESCE(fully_verified_at, NOW())
      WHERE competitor_verified = true
        AND price_verified = true
        AND image_verified = true
        AND details_verified = true
        AND fully_verified = false
    `);
  }
});
|
||
|
||
// ── Re-research approved equivalences ────────────────────────────────────────
|
||
// Processes up to 200 approved / auto_approved equivalences per run whose
// re_research_due_at is set and has passed, oldest due date first.
// The only check performed is price recency: if the competitor transceiver has a
// price observation within the last 45 days the approval is confirmed
// (re_researched_at = NOW(), next check in 30 days). Otherwise the equivalence is
// reverted to pending, and the Flexoptix transceiver loses competitor_verified /
// fully_verified unless another approved equivalence still covers it.
// Spec matching / confidence is NOT re-evaluated here.
// NOTE(review): the two UPDATEs of a revert are not wrapped in a transaction.
await boss.work("maintenance:re-research-equivalences", async () => {
  const { pool } = await import("./utils/db");
  const ts = new Date().toISOString();
  console.log(`[${ts}] Running: Re-research approved equivalences`);

  // Only the ids are needed below. The join to transceivers is kept so rows whose
  // Flexoptix transceiver cannot be joined are excluded, as before.
  const batch = await pool.query(`
    SELECT eq.id, eq.flexoptix_id, eq.competitor_id
    FROM transceiver_equivalences eq
    JOIN transceivers fx ON eq.flexoptix_id = fx.id
    WHERE eq.status IN ('approved', 'auto_approved')
      AND eq.re_research_due_at IS NOT NULL
      AND eq.re_research_due_at <= NOW()
    ORDER BY eq.re_research_due_at ASC
    LIMIT 200
  `);

  let confirmed = 0;
  let reverted = 0;

  for (const eq of batch.rows) {
    // Check if competitor still has a recent price observation. EXISTS lets the
    // database stop at the first matching row instead of counting all of them.
    const priceCheck = await pool.query(`
      SELECT EXISTS (
        SELECT 1
        FROM price_observations
        WHERE transceiver_id = $1 AND time > NOW() - INTERVAL '45 days'
      ) AS has_recent
    `, [eq.competitor_id]);

    const hasRecentPrice = priceCheck.rows[0]?.has_recent === true;

    if (!hasRecentPrice) {
      // Competitor no longer carries this — revert to pending for manual review
      await pool.query(`
        UPDATE transceiver_equivalences
        SET status = 'pending', re_research_due_at = NULL, re_researched_at = NULL,
            match_notes = CONCAT(match_notes, E'\n[Re-research ' || NOW()::date || ': no recent price — reverted to pending]')
        WHERE id = $1
      `, [eq.id]);

      // Reset competitor_verified if no other approved equivalence covers this transceiver
      await pool.query(`
        UPDATE transceivers
        SET competitor_verified = false, competitor_verified_at = NULL,
            fully_verified = false, fully_verified_at = NULL
        WHERE id = $1
          AND NOT EXISTS (
            SELECT 1 FROM transceiver_equivalences
            WHERE flexoptix_id = $1
              AND status IN ('approved', 'auto_approved')
              AND id != $2
          )
      `, [eq.flexoptix_id, eq.id]);

      reverted++;
    } else {
      // Still valid — confirm and schedule next re-research in 30 days
      await pool.query(`
        UPDATE transceiver_equivalences
        SET re_researched_at = NOW(),
            re_research_due_at = NOW() + INTERVAL '30 days'
        WHERE id = $1
      `, [eq.id]);
      confirmed++;
    }
  }

  console.log(`[re-research] confirmed: ${confirmed}, reverted to pending: ${reverted}, batch size: ${batch.rows.length}`);
});
|
||
|
||
console.log("All workers registered (78 jobs, 24/7 continuous)");
|
||
}
|