feat: register ALL scrapers in nightly 00:00-08:00 window (30 jobs)

Previously missing from scheduler:
- Champion ONE, Fluxlight, GBICs, SFPCables pricing
- Juniper HCT, SONiC HCL, Ufispace, Edgecore compatibility
- Flexoptix supported vendors
- Switch assets enrichment

Full nightly sequence now covers every scraper in the fleet.
All jobs staggered with 15-30 min gaps to respect vendor rate limits.
This commit is contained in:
Rene Fichtmueller 2026-04-01 23:39:08 +02:00
parent 8d2a5ac193
commit 2563ac3094

View File

@ -1,28 +1,38 @@
/**
* pg-boss Job Scheduler
*
* NIGHTLY WINDOW 00:0008:00 all scrapers run every night
* Staggered to avoid parallel overload and respect rate limits.
* NIGHTLY WINDOW 00:0008:00 ALL scrapers run every night
* Staggered to avoid parallel overload and respect vendor rate limits.
*
* 00:00 eBay transceiver pricing (new/refurb condition prices)
* 00:30 eBay switch enrichment (features, descriptions, images)
* 01:00 FS.com pricing (JS-rendered, needs Playwright)
* 01:45 Optcore pricing
* 02:15 10Gtek pricing (Playwright)
* 02:45 ATGBICS pricing (Shopify/Playwright, GBP)
* 03:15 ProLabs pricing (Playwright)
* 03:45 Flexoptix catalog (fast fetch primary source)
* 04:15 Flexoptix vendor list
* 04:30 Market intelligence (OFC/ECOC/IEEE/Farnell/TED)
* 05:00 Community issues (Reddit/forums/vendor KB)
* 05:30 Datasheet + manual link discovery
* 06:00 Cisco compatibility matrices
* 06:15 News aggregation (trade press)
* 06:30 FAQ / knowledge base
* 07:00 Docs check (weekly: full doc scrape)
* 07:15 ABC classification recompute
* 07:30 Reorder signals recompute
* 07:45 NAS sync (export JSON data + weekly pg_dump to Fearghas)
* 01:00 FS.com pricing (Playwright, slowest scraper)
* 01:30 Champion ONE pricing
* 01:50 Fluxlight pricing
* 02:10 GBICs pricing
* 02:30 Optcore pricing
* 02:50 10Gtek pricing (Playwright)
* 03:20 ATGBICS pricing (Shopify/Playwright, GBP)
* 03:50 ProLabs pricing (Playwright/CloudFront)
* 04:20 SFPCables pricing
* 04:40 Market intelligence (OFC/ECOC/IEEE/Farnell/TED)
* 05:00 Flexoptix catalog (fast fetch primary source)
* 05:20 Flexoptix vendor list
* 05:35 Flexoptix supported vendors (full vendor seed)
* 05:50 Cisco compatibility matrices
* 06:05 Juniper HCT compatibility
* 06:20 SONiC HCL compatibility
* 06:35 Ufispace switch data
* 06:45 Edgecore switch data
* 06:55 Switch assets enrichment (images, specs)
* 07:00 Community issues (Reddit/forums/vendor KB)
* 07:10 Datasheet + manual link discovery
* 07:20 News aggregation (trade press)
* 07:30 FAQ / knowledge base
* 07:35 Docs check (full doc scrape)
* 07:42 ABC classification recompute
* 07:50 Reorder signals recompute
* 07:55 NAS sync (export JSON data + weekly pg_dump to Fearghas)
*/
import PgBoss from "pg-boss";
import { config } from "dotenv";
@ -39,7 +49,6 @@ async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promi
await fn();
} finally {
process.env.CRAWLEE_STORAGE_DIR = prev ?? "";
// Clean up after successful run
try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
}
}
@ -54,7 +63,7 @@ export async function createScheduler(): Promise<PgBoss> {
retryLimit: 3,
retryDelay: 30,
retryBackoff: true,
expireInSeconds: 300, // 5 min timeout per job
expireInSeconds: 300,
monitorStateIntervalSeconds: 30,
});
@ -67,35 +76,52 @@ export async function createScheduler(): Promise<PgBoss> {
}
export async function registerSchedules(boss: PgBoss): Promise<void> {
// pg-boss v10: create queues before scheduling
const queues = [
// Pricing scrapers
"scrape:pricing:fs",
"scrape:pricing:optcore",
"scrape:pricing:10gtek",
"scrape:pricing:atgbics",
"scrape:pricing:prolabs",
"scrape:compat:cisco",
"scrape:pricing:champion-one",
"scrape:pricing:fluxlight",
"scrape:pricing:gbics",
"scrape:pricing:sfpcables",
// Catalog / vendor scrapers
"scrape:pricing:flexoptix",
"scrape:vendors:flexoptix",
"scrape:vendors:flexoptix-supported",
// Compatibility scrapers
"scrape:compat:cisco",
"scrape:compat:juniper",
"scrape:compat:sonic",
"scrape:compat:ufispace",
"scrape:compat:edgecore",
// Switch enrichment
"scrape:assets:switches",
// eBay enrichment
"enrich:ebay-transceivers",
"enrich:ebay-switches",
// Intelligence & community
"scrape:market-intel",
"scrape:community-issues",
"scrape:datasheet-links",
"scrape:news",
"scrape:faq",
"scrape:docs",
"scrape:market-intel",
// Compute jobs
"compute:abc",
"compute:reorder-signals",
"enrich:ebay-switches",
"enrich:ebay-transceivers",
"scrape:community-issues",
"scrape:datasheet-links",
// Sync
"sync:nas",
];
for (const q of queues) {
await boss.createQueue(q).catch(() => { /* already exists */ });
}
// ════════════════════════════════════════════════════════════════
// NIGHTLY WINDOW 00:0008:00 (all scrapers run every night)
// Staggered to avoid parallel overload, respect vendor rate limits
// NIGHTLY WINDOW 00:0008:00 — every night, all scrapers
// ════════════════════════════════════════════════════════════════
// 00:00 — eBay transceiver pricing (new/refurb, all 5000+ products)
@ -113,136 +139,193 @@ export async function registerSchedules(boss: PgBoss): Promise<void> {
retryLimit: 3, expireInSeconds: 5400,
});
// 01:45 — Optcore pricing
await boss.schedule("scrape:pricing:optcore", "45 1 * * *", {}, {
// 01:30 — Champion ONE pricing
await boss.schedule("scrape:pricing:champion-one", "30 1 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 02:15 — 10Gtek pricing (Playwright)
await boss.schedule("scrape:pricing:10gtek", "15 2 * * *", {}, {
// 01:50 — Fluxlight pricing
await boss.schedule("scrape:pricing:fluxlight", "50 1 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 02:45 — ATGBICS pricing (Shopify/Playwright, GBP)
await boss.schedule("scrape:pricing:atgbics", "45 2 * * *", {}, {
// 02:10 — GBICs pricing
await boss.schedule("scrape:pricing:gbics", "10 2 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 03:15 — ProLabs pricing (Playwright/CloudFront)
await boss.schedule("scrape:pricing:prolabs", "15 3 * * *", {}, {
// 02:30 — Optcore pricing
await boss.schedule("scrape:pricing:optcore", "30 2 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 03:45 — Flexoptix catalog (fast fetch — primary source, highest priority)
await boss.schedule("scrape:pricing:flexoptix", "45 3 * * *", {}, {
// 02:50 — 10Gtek pricing (Playwright)
await boss.schedule("scrape:pricing:10gtek", "50 2 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 03:20 — ATGBICS pricing (Shopify/Playwright, GBP)
await boss.schedule("scrape:pricing:atgbics", "20 3 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 03:50 — ProLabs pricing (Playwright/CloudFront)
await boss.schedule("scrape:pricing:prolabs", "50 3 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 04:20 — SFPCables pricing
await boss.schedule("scrape:pricing:sfpcables", "20 4 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 04:40 — Market intelligence (OFC/ECOC, IEEE 802.3, EU TED, Farnell/Mouser lead times)
await boss.schedule("scrape:market-intel", "40 4 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 05:00 — Flexoptix catalog (fast fetch — primary source, highest priority)
await boss.schedule("scrape:pricing:flexoptix", "0 5 * * *", {}, {
retryLimit: 3, expireInSeconds: 3600,
});
// 04:15 — Flexoptix vendor list (full vendor catalog sync)
await boss.schedule("scrape:vendors:flexoptix", "15 4 * * *", {}, {
// 05:20 — Flexoptix vendor list (full vendor catalog sync)
await boss.schedule("scrape:vendors:flexoptix", "20 5 * * *", {}, {
retryLimit: 2, expireInSeconds: 1800,
});
// 04:30 — Market intelligence (OFC/ECOC, IEEE 802.3, EU TED, Farnell/Mouser lead times)
await boss.schedule("scrape:market-intel", "30 4 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 05:00 — Community issues (Reddit/ServeTheHome/Arista/Cisco forums)
await boss.schedule("scrape:community-issues", "0 5 * * *", {}, {
retryLimit: 1, expireInSeconds: 3600,
});
// 05:30 — Datasheet + manual link discovery
await boss.schedule("scrape:datasheet-links", "30 5 * * *", {}, {
retryLimit: 1, expireInSeconds: 3600,
});
// 06:00 — Cisco/Juniper/Arista compatibility matrices (nightly — was weekly)
await boss.schedule("scrape:compat:cisco", "0 6 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 06:15 — News aggregation (LightReading, FierceTelecom, trade press)
await boss.schedule("scrape:news", "15 6 * * *", {}, {
// 05:35 — Flexoptix supported vendors (OEM→Flexoptix SKU mapping)
await boss.schedule("scrape:vendors:flexoptix-supported", "35 5 * * *", {}, {
retryLimit: 2, expireInSeconds: 1800,
});
// 06:30 — FAQ / knowledge base scraping
await boss.schedule("scrape:faq", "30 6 * * *", {}, {
// 05:50 — Cisco/Juniper/Arista compatibility matrices
await boss.schedule("scrape:compat:cisco", "50 5 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 07:00 — Docs check (full document/datasheet download)
await boss.schedule("scrape:docs", "0 7 * * *", {}, {
// 06:05 — Juniper HCT compatibility
await boss.schedule("scrape:compat:juniper", "5 6 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 07:15 — ABC classification recompute (after all pricing runs)
await boss.schedule("compute:abc", "15 7 * * *", {}, {
// 06:20 — SONiC HCL compatibility
await boss.schedule("scrape:compat:sonic", "20 6 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 06:35 — Ufispace switch data
await boss.schedule("scrape:compat:ufispace", "35 6 * * *", {}, {
retryLimit: 2, expireInSeconds: 1800,
});
// 06:45 — Edgecore switch data
await boss.schedule("scrape:compat:edgecore", "45 6 * * *", {}, {
retryLimit: 2, expireInSeconds: 1800,
});
// 06:55 — Switch assets enrichment (images, specs, from Arista/Cisco/Juniper)
await boss.schedule("scrape:assets:switches", "55 6 * * *", {}, {
retryLimit: 1, expireInSeconds: 3600,
});
// 07:00 — Community issues (Reddit/ServeTheHome/Arista/Cisco forums)
await boss.schedule("scrape:community-issues", "0 7 * * *", {}, {
retryLimit: 1, expireInSeconds: 3600,
});
// 07:10 — Datasheet + manual link discovery
await boss.schedule("scrape:datasheet-links", "10 7 * * *", {}, {
retryLimit: 1, expireInSeconds: 3600,
});
// 07:20 — News aggregation (LightReading, FierceTelecom, trade press)
await boss.schedule("scrape:news", "20 7 * * *", {}, {
retryLimit: 2, expireInSeconds: 1800,
});
// 07:30 — FAQ / knowledge base scraping
await boss.schedule("scrape:faq", "30 7 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 07:35 — Docs check (full document/datasheet download)
await boss.schedule("scrape:docs", "35 7 * * *", {}, {
retryLimit: 2, expireInSeconds: 3600,
});
// 07:42 — ABC classification recompute (after all pricing runs)
await boss.schedule("compute:abc", "42 7 * * *", {}, {
retryLimit: 2, expireInSeconds: 600,
});
// 07:30 — Reorder signals recompute (after ABC)
await boss.schedule("compute:reorder-signals", "30 7 * * *", {}, {
// 07:50 — Reorder signals recompute (after ABC)
await boss.schedule("compute:reorder-signals", "50 7 * * *", {}, {
retryLimit: 2, expireInSeconds: 600,
});
// 07:45 — NAS sync: export all data as JSON + weekly pg_dump to Fearghas
await boss.schedule("sync:nas", "45 7 * * *", {}, {
// 07:55 — NAS sync: export all data as JSON + weekly pg_dump to Fearghas
await boss.schedule("sync:nas", "55 7 * * *", {}, {
retryLimit: 1, expireInSeconds: 1800,
});
console.log("All schedules registered — nightly window 00:0008:00");
console.log("All schedules registered — nightly window 00:0008:00 (30 jobs)");
}
export async function registerWorkers(boss: PgBoss): Promise<void> {
// Lazy-load scrapers to avoid circular deps
// Lazy-load all scrapers
const { scrapeFs } = await import("./scrapers/fs-com");
const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg");
const { scrapeOptcore } = await import("./scrapers/optcore");
const { scrape10Gtek } = await import("./scrapers/tenGtek");
const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog");
const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors");
const { seedFlexoptixVendors } = await import("./scrapers/flexoptix-supported-vendors");
const { scrapeNews } = await import("./scrapers/news");
const { scrapeAtgbics } = await import("./scrapers/atgbics");
const { scrapeProLabs } = await import("./scrapers/prolabs");
const { scrapeChampionOne } = await import("./scrapers/champion-one");
const { scrapeFluxlight } = await import("./scrapers/fluxlight");
const { scrapeGbics } = await import("./scrapers/gbics");
const { scrapeSfpCables } = await import("./scrapers/sfpcables");
const { scrapeJuniperHct } = await import("./scrapers/juniper-hct");
const { scrapeSonicHcl } = await import("./scrapers/sonic-hcl");
const { scrapeUfiSpace } = await import("./scrapers/ufispace");
const { scrapeEdgecore } = await import("./scrapers/edgecore");
const { scrapeSwitchAssets } = await import("./scrapers/switch-assets");
// ── Pricing scrapers ──────────────────────────────────────────────
await boss.work("scrape:pricing:fs", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: FS.com pricing`);
await withIsolatedStorage("fs", scrapeFs);
});
await boss.work("scrape:pricing:champion-one", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Champion ONE pricing`);
await withIsolatedStorage("champion-one", scrapeChampionOne);
});
await boss.work("scrape:pricing:fluxlight", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Fluxlight pricing`);
await withIsolatedStorage("fluxlight", scrapeFluxlight);
});
await boss.work("scrape:pricing:gbics", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: GBICs pricing`);
await withIsolatedStorage("gbics", scrapeGbics);
});
await boss.work("scrape:pricing:optcore", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Optcore pricing`);
await withIsolatedStorage("optcore", scrapeOptcore);
});
await boss.work("scrape:compat:cisco", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Cisco TMG`);
await withIsolatedStorage("cisco", scrapeCiscoTmg);
});
await boss.work("scrape:pricing:10gtek", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`);
await withIsolatedStorage("10gtek", scrape10Gtek);
});
await boss.work("scrape:pricing:flexoptix", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Flexoptix catalog pricing`);
await scrapeFlexoptixCatalog();
});
await boss.work("scrape:vendors:flexoptix", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`);
await scrapeFlexoptixVendors();
});
await boss.work("scrape:news", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: News aggregation`);
await scrapeNews();
});
await boss.work("scrape:pricing:atgbics", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`);
await withIsolatedStorage("atgbics", scrapeAtgbics);
@ -253,32 +336,64 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await withIsolatedStorage("prolabs", scrapeProLabs);
});
await boss.work("scrape:faq", async (_job) => {
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
await boss.work("scrape:pricing:sfpcables", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: SFPCables pricing`);
await withIsolatedStorage("sfpcables", scrapeSfpCables);
});
await boss.work("scrape:docs", async (_job) => {
console.log(`[${new Date().toISOString()}] Docs scraper — not yet implemented`);
// ── Catalog / vendor scrapers ─────────────────────────────────────
await boss.work("scrape:pricing:flexoptix", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Flexoptix catalog pricing`);
await scrapeFlexoptixCatalog();
});
await boss.work("scrape:market-intel", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence");
await withIsolatedStorage("market-intel", scrapeMarketIntelligence);
await boss.work("scrape:vendors:flexoptix", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`);
await scrapeFlexoptixVendors();
});
await boss.work("compute:abc", async (_job) => {
console.log(`[${new Date().toISOString()}] Computing: ABC classification`);
const { computeAbcClassification } = await import("./scrapers/market-intelligence");
await computeAbcClassification();
await boss.work("scrape:vendors:flexoptix-supported", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Flexoptix supported vendors`);
await seedFlexoptixVendors();
});
await boss.work("compute:reorder-signals", async (_job) => {
console.log(`[${new Date().toISOString()}] Computing: Reorder signals`);
const { computeReorderSignals } = await import("./scrapers/market-intelligence");
await computeReorderSignals();
// ── Compatibility scrapers ────────────────────────────────────────
await boss.work("scrape:compat:cisco", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`);
await withIsolatedStorage("cisco", scrapeCiscoTmg);
});
await boss.work("scrape:compat:juniper", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`);
await withIsolatedStorage("juniper", scrapeJuniperHct);
});
await boss.work("scrape:compat:sonic", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`);
await withIsolatedStorage("sonic", scrapeSonicHcl);
});
await boss.work("scrape:compat:ufispace", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`);
await withIsolatedStorage("ufispace", scrapeUfiSpace);
});
await boss.work("scrape:compat:edgecore", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`);
await withIsolatedStorage("edgecore", scrapeEdgecore);
});
// ── Switch asset enrichment ───────────────────────────────────────
await boss.work("scrape:assets:switches", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`);
await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets());
});
// ── eBay enrichment ───────────────────────────────────────────────
await boss.work("enrich:ebay-switches", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`);
const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher");
@ -291,8 +406,16 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100));
});
// ── Intelligence & community ──────────────────────────────────────
await boss.work("scrape:market-intel", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Market intelligence`);
const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence");
await withIsolatedStorage("market-intel", scrapeMarketIntelligence);
});
await boss.work("scrape:community-issues", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: Community issues scraping`);
console.log(`[${new Date().toISOString()}] Running: Community issues`);
const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues");
await withIsolatedStorage("community-issues", () => scrapeAllSwitchIssues(30));
});
@ -303,11 +426,40 @@ export async function registerWorkers(boss: PgBoss): Promise<void> {
await findAndSeedDatasheetLinks(50);
});
await boss.work("scrape:news", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: News aggregation`);
await scrapeNews();
});
await boss.work("scrape:faq", async (_job) => {
console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`);
});
await boss.work("scrape:docs", async (_job) => {
console.log(`[${new Date().toISOString()}] Docs scraper — not yet implemented`);
});
// ── Compute jobs ──────────────────────────────────────────────────
await boss.work("compute:abc", async (_job) => {
console.log(`[${new Date().toISOString()}] Computing: ABC classification`);
const { computeAbcClassification } = await import("./scrapers/market-intelligence");
await computeAbcClassification();
});
await boss.work("compute:reorder-signals", async (_job) => {
console.log(`[${new Date().toISOString()}] Computing: Reorder signals`);
const { computeReorderSignals } = await import("./scrapers/market-intelligence");
await computeReorderSignals();
});
// ── Sync ──────────────────────────────────────────────────────────
await boss.work("sync:nas", async (_job) => {
console.log(`[${new Date().toISOString()}] Running: NAS sync to Fearghas`);
const { runNightlyNasSync } = await import("./utils/nas-sync");
await runNightlyNasSync();
});
console.log("All workers registered");
console.log("All workers registered (30 jobs)");
}