diff --git a/packages/scraper/src/scheduler.ts b/packages/scraper/src/scheduler.ts index c6707bf..b0252cb 100644 --- a/packages/scraper/src/scheduler.ts +++ b/packages/scraper/src/scheduler.ts @@ -1,28 +1,38 @@ /** * pg-boss Job Scheduler * - * NIGHTLY WINDOW 00:00–08:00 — all scrapers run every night - * Staggered to avoid parallel overload and respect rate limits. + * NIGHTLY WINDOW 00:00–08:00 — ALL scrapers run every night + * Staggered to avoid parallel overload and respect vendor rate limits. * * 00:00 eBay transceiver pricing (new/refurb condition prices) * 00:30 eBay switch enrichment (features, descriptions, images) - * 01:00 FS.com pricing (JS-rendered, needs Playwright) - * 01:45 Optcore pricing - * 02:15 10Gtek pricing (Playwright) - * 02:45 ATGBICS pricing (Shopify/Playwright, GBP) - * 03:15 ProLabs pricing (Playwright) - * 03:45 Flexoptix catalog (fast fetch — primary source) - * 04:15 Flexoptix vendor list - * 04:30 Market intelligence (OFC/ECOC/IEEE/Farnell/TED) - * 05:00 Community issues (Reddit/forums/vendor KB) - * 05:30 Datasheet + manual link discovery - * 06:00 Cisco compatibility matrices - * 06:15 News aggregation (trade press) - * 06:30 FAQ / knowledge base - * 07:00 Docs check (weekly: full doc scrape) - * 07:15 ABC classification recompute - * 07:30 Reorder signals recompute - * 07:45 NAS sync (export JSON data + weekly pg_dump to Fearghas) + * 01:00 FS.com pricing (Playwright, slowest scraper) + * 01:30 Champion ONE pricing + * 01:50 Fluxlight pricing + * 02:10 GBICs pricing + * 02:30 Optcore pricing + * 02:50 10Gtek pricing (Playwright) + * 03:20 ATGBICS pricing (Shopify/Playwright, GBP) + * 03:50 ProLabs pricing (Playwright/CloudFront) + * 04:20 SFPCables pricing + * 04:40 Market intelligence (OFC/ECOC/IEEE/Farnell/TED) + * 05:00 Flexoptix catalog (fast fetch — primary source) + * 05:20 Flexoptix vendor list + * 05:35 Flexoptix supported vendors (full vendor seed) + * 05:50 Cisco compatibility matrices + * 06:05 Juniper HCT compatibility + * 06:20 SONiC HCL compatibility + * 06:35 Ufispace switch data + * 06:45 Edgecore switch data + * 06:55 Switch assets enrichment (images, specs) + * 07:00 Community issues (Reddit/forums/vendor KB) + * 07:10 Datasheet + manual link discovery + * 07:20 News aggregation (trade press) + * 07:30 FAQ / knowledge base + * 07:35 Docs check (full doc scrape) + * 07:42 ABC classification recompute + * 07:50 Reorder signals recompute + * 07:55 NAS sync (export JSON data + weekly pg_dump to Fearghas) */ import PgBoss from "pg-boss"; import { config } from "dotenv"; @@ -39,7 +49,6 @@ async function withIsolatedStorage(name: string, fn: () => Promise): Promi await fn(); } finally { process.env.CRAWLEE_STORAGE_DIR = prev ?? ""; - // Clean up after successful run try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ } } } @@ -54,7 +63,7 @@ export async function createScheduler(): Promise { retryLimit: 3, retryDelay: 30, retryBackoff: true, - expireInSeconds: 300, // 5 min timeout per job + expireInSeconds: 300, monitorStateIntervalSeconds: 30, }); @@ -67,35 +76,52 @@ export async function createScheduler(): Promise { } export async function registerSchedules(boss: PgBoss): Promise { - // pg-boss v10: create queues before scheduling const queues = [ + // Pricing scrapers "scrape:pricing:fs", "scrape:pricing:optcore", "scrape:pricing:10gtek", "scrape:pricing:atgbics", "scrape:pricing:prolabs", - "scrape:compat:cisco", + "scrape:pricing:champion-one", + "scrape:pricing:fluxlight", + "scrape:pricing:gbics", + "scrape:pricing:sfpcables", + // Catalog / vendor scrapers "scrape:pricing:flexoptix", "scrape:vendors:flexoptix", + "scrape:vendors:flexoptix-supported", + // Compatibility scrapers + "scrape:compat:cisco", + "scrape:compat:juniper", + "scrape:compat:sonic", + "scrape:compat:ufispace", + "scrape:compat:edgecore", + // Switch enrichment + "scrape:assets:switches", + // eBay enrichment + "enrich:ebay-transceivers", + "enrich:ebay-switches", + // Intelligence & community + "scrape:market-intel", + "scrape:community-issues", + "scrape:datasheet-links", "scrape:news", "scrape:faq", "scrape:docs", - "scrape:market-intel", + // Compute jobs "compute:abc", "compute:reorder-signals", - "enrich:ebay-switches", - "enrich:ebay-transceivers", - "scrape:community-issues", - "scrape:datasheet-links", + // Sync "sync:nas", ]; + for (const q of queues) { await boss.createQueue(q).catch(() => { /* already exists */ }); } // ════════════════════════════════════════════════════════════════ - // NIGHTLY WINDOW 00:00–08:00 (all scrapers run every night) - // Staggered to avoid parallel overload, respect vendor rate limits + // NIGHTLY WINDOW 00:00–08:00 — every night, all scrapers // ════════════════════════════════════════════════════════════════ // 00:00 — eBay transceiver pricing (new/refurb, all 5000+ products) @@ -113,136 +139,193 @@ export async function registerSchedules(boss: PgBoss): Promise { retryLimit: 3, expireInSeconds: 5400, }); - // 01:45 — Optcore pricing - await boss.schedule("scrape:pricing:optcore", "45 1 * * *", {}, { + // 01:30 — Champion ONE pricing + await boss.schedule("scrape:pricing:champion-one", "30 1 * * *", {}, { retryLimit: 2, expireInSeconds: 3600, }); - // 02:15 — 10Gtek pricing (Playwright) - await boss.schedule("scrape:pricing:10gtek", "15 2 * * *", {}, { + // 01:50 — Fluxlight pricing + await boss.schedule("scrape:pricing:fluxlight", "50 1 * * *", {}, { retryLimit: 2, expireInSeconds: 3600, }); - // 02:45 — ATGBICS pricing (Shopify/Playwright, GBP) - await boss.schedule("scrape:pricing:atgbics", "45 2 * * *", {}, { + // 02:10 — GBICs pricing + await boss.schedule("scrape:pricing:gbics", "10 2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600, }); - // 03:15 — ProLabs pricing (Playwright/CloudFront) - await boss.schedule("scrape:pricing:prolabs", "15 3 * * *", {}, { + // 02:30 — Optcore pricing + await boss.schedule("scrape:pricing:optcore", "30 2 * * *", {}, { retryLimit: 2, expireInSeconds: 3600, }); - // 03:45 — Flexoptix catalog (fast fetch — primary source, highest priority) - await boss.schedule("scrape:pricing:flexoptix", "45 3 * * *", {}, { + // 02:50 — 10Gtek pricing (Playwright) + await boss.schedule("scrape:pricing:10gtek", "50 2 * * *", {}, { + retryLimit: 2, expireInSeconds: 3600, + }); + + // 03:20 — ATGBICS pricing (Shopify/Playwright, GBP) + await boss.schedule("scrape:pricing:atgbics", "20 3 * * *", {}, { + retryLimit: 2, expireInSeconds: 3600, + }); + + // 03:50 — ProLabs pricing (Playwright/CloudFront) + await boss.schedule("scrape:pricing:prolabs", "50 3 * * *", {}, { + retryLimit: 2, expireInSeconds: 3600, + }); + + // 04:20 — SFPCables pricing + await boss.schedule("scrape:pricing:sfpcables", "20 4 * * *", {}, { + retryLimit: 2, expireInSeconds: 3600, + }); + + // 04:40 — Market intelligence (OFC/ECOC, IEEE 802.3, EU TED, Farnell/Mouser lead times) + await boss.schedule("scrape:market-intel", "40 4 * * *", {}, { + retryLimit: 2, expireInSeconds: 3600, + }); + + // 05:00 — Flexoptix catalog (fast fetch — primary source, highest priority) + await boss.schedule("scrape:pricing:flexoptix", "0 5 * * *", {}, { retryLimit: 3, expireInSeconds: 3600, }); - // 04:15 — Flexoptix vendor list (full vendor catalog sync) - await boss.schedule("scrape:vendors:flexoptix", "15 4 * * *", {}, { + // 05:20 — Flexoptix vendor list (full vendor catalog sync) + await boss.schedule("scrape:vendors:flexoptix", "20 5 * * *", {}, { retryLimit: 2, expireInSeconds: 1800, }); - // 04:30 — Market intelligence (OFC/ECOC, IEEE 802.3, EU TED, Farnell/Mouser lead times) - await boss.schedule("scrape:market-intel", "30 4 * * *", {}, { - retryLimit: 2, expireInSeconds: 3600, - }); - - // 05:00 — Community issues (Reddit/ServeTheHome/Arista/Cisco forums) - await boss.schedule("scrape:community-issues", "0 5 * * *", {}, { - retryLimit: 1, expireInSeconds: 3600, - }); - - // 05:30 — Datasheet + manual link discovery - await boss.schedule("scrape:datasheet-links", "30 5 * * *", {}, { - retryLimit: 1, expireInSeconds: 3600, - }); - - // 06:00 — Cisco/Juniper/Arista compatibility matrices (nightly — was weekly) - await boss.schedule("scrape:compat:cisco", "0 6 * * *", {}, { - retryLimit: 2, expireInSeconds: 3600, - }); - - // 06:15 — News aggregation (LightReading, FierceTelecom, trade press) - await boss.schedule("scrape:news", "15 6 * * *", {}, { + // 05:35 — Flexoptix supported vendors (OEM→Flexoptix SKU mapping) + await boss.schedule("scrape:vendors:flexoptix-supported", "35 5 * * *", {}, { retryLimit: 2, expireInSeconds: 1800, }); - // 06:30 — FAQ / knowledge base scraping - await boss.schedule("scrape:faq", "30 6 * * *", {}, { + // 05:50 — Cisco/Juniper/Arista compatibility matrices + await boss.schedule("scrape:compat:cisco", "50 5 * * *", {}, { retryLimit: 2, expireInSeconds: 3600, }); - // 07:00 — Docs check (full document/datasheet download) - await boss.schedule("scrape:docs", "0 7 * * *", {}, { + // 06:05 — Juniper HCT compatibility + await boss.schedule("scrape:compat:juniper", "5 6 * * *", {}, { retryLimit: 2, expireInSeconds: 3600, }); - // 07:15 — ABC classification recompute (after all pricing runs) - await boss.schedule("compute:abc", "15 7 * * *", {}, { + // 06:20 — SONiC HCL compatibility + await boss.schedule("scrape:compat:sonic", "20 6 * * *", {}, { + retryLimit: 2, expireInSeconds: 3600, + }); + + // 06:35 — Ufispace switch data + await boss.schedule("scrape:compat:ufispace", "35 6 * * *", {}, { + retryLimit: 2, expireInSeconds: 1800, + }); + + // 06:45 — Edgecore switch data + await boss.schedule("scrape:compat:edgecore", "45 6 * * *", {}, { + retryLimit: 2, expireInSeconds: 1800, + }); + + // 06:55 — Switch assets enrichment (images, specs, from Arista/Cisco/Juniper) + await boss.schedule("scrape:assets:switches", "55 6 * * *", {}, { + retryLimit: 1, expireInSeconds: 3600, + }); + + // 07:00 — Community issues (Reddit/ServeTheHome/Arista/Cisco forums) + await boss.schedule("scrape:community-issues", "0 7 * * *", {}, { + retryLimit: 1, expireInSeconds: 3600, + }); + + // 07:10 — Datasheet + manual link discovery + await boss.schedule("scrape:datasheet-links", "10 7 * * *", {}, { + retryLimit: 1, expireInSeconds: 3600, + }); + + // 07:20 — News aggregation (LightReading, FierceTelecom, trade press) + await boss.schedule("scrape:news", "20 7 * * *", {}, { + retryLimit: 2, expireInSeconds: 1800, + }); + + // 07:30 — FAQ / knowledge base scraping + await boss.schedule("scrape:faq", "30 7 * * *", {}, { + retryLimit: 2, expireInSeconds: 3600, + }); + + // 07:35 — Docs check (full document/datasheet download) + await boss.schedule("scrape:docs", "35 7 * * *", {}, { + retryLimit: 2, expireInSeconds: 3600, + }); + + // 07:42 — ABC classification recompute (after all pricing runs) + await boss.schedule("compute:abc", "42 7 * * *", {}, { retryLimit: 2, expireInSeconds: 600, }); - // 07:30 — Reorder signals recompute (after ABC) - await boss.schedule("compute:reorder-signals", "30 7 * * *", {}, { + // 07:50 — Reorder signals recompute (after ABC) + await boss.schedule("compute:reorder-signals", "50 7 * * *", {}, { retryLimit: 2, expireInSeconds: 600, }); - // 07:45 — NAS sync: export all data as JSON + weekly pg_dump to Fearghas - await boss.schedule("sync:nas", "45 7 * * *", {}, { + // 07:55 — NAS sync: export all data as JSON + weekly pg_dump to Fearghas + await boss.schedule("sync:nas", "55 7 * * *", {}, { retryLimit: 1, expireInSeconds: 1800, }); - console.log("All schedules registered — nightly window 00:00–08:00"); + console.log("All schedules registered — nightly window 00:00–08:00 (30 jobs)"); } export async function registerWorkers(boss: PgBoss): Promise { - // Lazy-load scrapers to avoid circular deps + // Lazy-load all scrapers const { scrapeFs } = await import("./scrapers/fs-com"); const { scrapeCiscoTmg } = await import("./scrapers/cisco-tmg"); const { scrapeOptcore } = await import("./scrapers/optcore"); const { scrape10Gtek } = await import("./scrapers/tenGtek"); const { scrapeFlexoptixCatalog } = await import("./scrapers/flexoptix-catalog"); const { scrapeFlexoptixVendors } = await import("./scrapers/flexoptix-vendors"); + const { seedFlexoptixVendors } = await import("./scrapers/flexoptix-supported-vendors"); const { scrapeNews } = await import("./scrapers/news"); const { scrapeAtgbics } = await import("./scrapers/atgbics"); const { scrapeProLabs } = await import("./scrapers/prolabs"); + const { scrapeChampionOne } = await import("./scrapers/champion-one"); + const { scrapeFluxlight } = await import("./scrapers/fluxlight"); + const { scrapeGbics } = await import("./scrapers/gbics"); + const { scrapeSfpCables } = await import("./scrapers/sfpcables"); + const { scrapeJuniperHct } = await import("./scrapers/juniper-hct"); + const { scrapeSonicHcl } = await import("./scrapers/sonic-hcl"); + const { scrapeUfiSpace } = await import("./scrapers/ufispace"); + const { scrapeEdgecore } = await import("./scrapers/edgecore"); + const { scrapeSwitchAssets } = await import("./scrapers/switch-assets"); + + // ── Pricing scrapers ────────────────────────────────────────────── await boss.work("scrape:pricing:fs", async (_job) => { console.log(`[${new Date().toISOString()}] Running: FS.com pricing`); await withIsolatedStorage("fs", scrapeFs); }); + await boss.work("scrape:pricing:champion-one", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Champion ONE pricing`); + await withIsolatedStorage("champion-one", scrapeChampionOne); + }); + + await boss.work("scrape:pricing:fluxlight", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Fluxlight pricing`); + await withIsolatedStorage("fluxlight", scrapeFluxlight); + }); + + await boss.work("scrape:pricing:gbics", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: GBICs pricing`); + await withIsolatedStorage("gbics", scrapeGbics); + }); + await boss.work("scrape:pricing:optcore", async (_job) => { console.log(`[${new Date().toISOString()}] Running: Optcore pricing`); await withIsolatedStorage("optcore", scrapeOptcore); }); - await boss.work("scrape:compat:cisco", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: Cisco TMG`); - await withIsolatedStorage("cisco", scrapeCiscoTmg); - }); - await boss.work("scrape:pricing:10gtek", async (_job) => { console.log(`[${new Date().toISOString()}] Running: 10Gtek pricing`); await withIsolatedStorage("10gtek", scrape10Gtek); }); - await boss.work("scrape:pricing:flexoptix", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: Flexoptix catalog pricing`); - await scrapeFlexoptixCatalog(); - }); - - await boss.work("scrape:vendors:flexoptix", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`); - await scrapeFlexoptixVendors(); - }); - - await boss.work("scrape:news", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: News aggregation`); - await scrapeNews(); - }); - await boss.work("scrape:pricing:atgbics", async (_job) => { console.log(`[${new Date().toISOString()}] Running: ATGBICS pricing`); await withIsolatedStorage("atgbics", scrapeAtgbics); @@ -253,32 +336,64 @@ export async function registerWorkers(boss: PgBoss): Promise { await withIsolatedStorage("prolabs", scrapeProLabs); }); - await boss.work("scrape:faq", async (_job) => { - console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`); + await boss.work("scrape:pricing:sfpcables", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: SFPCables pricing`); + await withIsolatedStorage("sfpcables", scrapeSfpCables); }); - await boss.work("scrape:docs", async (_job) => { - console.log(`[${new Date().toISOString()}] Docs scraper — not yet implemented`); + // ── Catalog / vendor scrapers ───────────────────────────────────── + + await boss.work("scrape:pricing:flexoptix", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Flexoptix catalog pricing`); + await scrapeFlexoptixCatalog(); }); - await boss.work("scrape:market-intel", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: Market intelligence`); - const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence"); - await withIsolatedStorage("market-intel", scrapeMarketIntelligence); + await boss.work("scrape:vendors:flexoptix", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Flexoptix vendor list`); + await scrapeFlexoptixVendors(); }); - await boss.work("compute:abc", async (_job) => { - console.log(`[${new Date().toISOString()}] Computing: ABC classification`); - const { computeAbcClassification } = await import("./scrapers/market-intelligence"); - await computeAbcClassification(); + await boss.work("scrape:vendors:flexoptix-supported", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Flexoptix supported vendors`); + await seedFlexoptixVendors(); }); - await boss.work("compute:reorder-signals", async (_job) => { - console.log(`[${new Date().toISOString()}] Computing: Reorder signals`); - const { computeReorderSignals } = await import("./scrapers/market-intelligence"); - await computeReorderSignals(); + // ── Compatibility scrapers ──────────────────────────────────────── + + await boss.work("scrape:compat:cisco", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Cisco TMG compatibility`); + await withIsolatedStorage("cisco", scrapeCiscoTmg); }); + await boss.work("scrape:compat:juniper", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Juniper HCT compatibility`); + await withIsolatedStorage("juniper", scrapeJuniperHct); + }); + + await boss.work("scrape:compat:sonic", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: SONiC HCL compatibility`); + await withIsolatedStorage("sonic", scrapeSonicHcl); + }); + + await boss.work("scrape:compat:ufispace", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Ufispace switch data`); + await withIsolatedStorage("ufispace", scrapeUfiSpace); + }); + + await boss.work("scrape:compat:edgecore", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Edgecore switch data`); + await withIsolatedStorage("edgecore", scrapeEdgecore); + }); + + // ── Switch asset enrichment ─────────────────────────────────────── + + await boss.work("scrape:assets:switches", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Switch assets enrichment`); + await withIsolatedStorage("switch-assets", () => scrapeSwitchAssets()); + }); + + // ── eBay enrichment ─────────────────────────────────────────────── + await boss.work("enrich:ebay-switches", async (_job) => { console.log(`[${new Date().toISOString()}] Running: eBay switch enrichment`); const { enrichSwitchesFromEbay } = await import("./scrapers/ebay-enricher"); @@ -291,8 +406,16 @@ export async function registerWorkers(boss: PgBoss): Promise { await withIsolatedStorage("ebay-transceivers", () => enrichTransceiversFromEbay(100)); }); + // ── Intelligence & community ────────────────────────────────────── + + await boss.work("scrape:market-intel", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: Market intelligence`); + const { scrapeMarketIntelligence } = await import("./scrapers/market-intelligence"); + await withIsolatedStorage("market-intel", scrapeMarketIntelligence); + }); + await boss.work("scrape:community-issues", async (_job) => { - console.log(`[${new Date().toISOString()}] Running: Community issues scraping`); + console.log(`[${new Date().toISOString()}] Running: Community issues`); const { scrapeAllSwitchIssues } = await import("./scrapers/community-issues"); await withIsolatedStorage("community-issues", () => scrapeAllSwitchIssues(30)); }); @@ -303,11 +426,40 @@ export async function registerWorkers(boss: PgBoss): Promise { await findAndSeedDatasheetLinks(50); }); + await boss.work("scrape:news", async (_job) => { + console.log(`[${new Date().toISOString()}] Running: News aggregation`); + await scrapeNews(); + }); + + await boss.work("scrape:faq", async (_job) => { + console.log(`[${new Date().toISOString()}] FAQ scraper — not yet implemented`); + }); + + await boss.work("scrape:docs", async (_job) => { + console.log(`[${new Date().toISOString()}] Docs scraper — not yet implemented`); + }); + + // ── Compute jobs ────────────────────────────────────────────────── + + await boss.work("compute:abc", async (_job) => { + console.log(`[${new Date().toISOString()}] Computing: ABC classification`); + const { computeAbcClassification } = await import("./scrapers/market-intelligence"); + await computeAbcClassification(); + }); + + await boss.work("compute:reorder-signals", async (_job) => { + console.log(`[${new Date().toISOString()}] Computing: Reorder signals`); + const { computeReorderSignals } = await import("./scrapers/market-intelligence"); + await computeReorderSignals(); + }); + + // ── Sync ────────────────────────────────────────────────────────── + await boss.work("sync:nas", async (_job) => { console.log(`[${new Date().toISOString()}] Running: NAS sync to Fearghas`); const { runNightlyNasSync } = await import("./utils/nas-sync"); await runNightlyNasSync(); }); - console.log("All workers registered"); + console.log("All workers registered (30 jobs)"); }