Rene Fichtmueller e9fb50a248 feat: TIP Phase 0+1 — monorepo, DB schema, API, scraper engine
Phase 0 - Foundation:
- Restructure into npm workspace monorepo (packages/core, api, scraper)
- PostgreSQL 17 + TimescaleDB schema (15 tables incl. hypertables)
- Docker Compose for local dev (PostgreSQL on 5433 + Qdrant)
- Express 5 API on port 3200 with 6 routes
- Seed script to migrate 159 transceivers + 42 standards from npm package
- Erik server setup script + PM2 ecosystem config

Phase 1 - Scraper Engine:
- Crawlee + Playwright framework with pg-boss scheduler
- FS.com scraper (PlaywrightCrawler, anti-bot workaround)
- Optcore.net scraper (WP REST API enumeration + PlaywrightCrawler)
  - Uses /wp-json/wp/v2/product to get 2000+ product URLs
  - Playwright renders individual product pages for price extraction
- Cisco TMG Matrix scraper (compatibility data)
- News RSS aggregator (optics.org, SPIE, Network World, Nature Photonics)
  - Keyword relevance scoring for transceiver/fiber topics
  - xml2js with malformed XML sanitization
- SHA-256 content hashing for change detection (skip unchanged records)
- pg-boss v10 with explicit queue creation before scheduling
2026-03-27 16:27:31 +13:00

128 lines
4.1 KiB
TypeScript

/**
* pg-boss Job Scheduler — registers recurring scrape jobs on fixed cron schedules.
*
* Job types:
* scrape:pricing:fs — Every 4 hours for FS.com prices/stock
* scrape:pricing:optcore — Every 6 hours for Optcore prices/stock
* scrape:compat:cisco — Weekly for OEM compatibility matrices
* scrape:news — Every 6 hours for trade press and news
* scrape:docs — Weekly for manuals and datasheets (worker not yet implemented)
* scrape:faq — Weekly for vendor FAQ/troubleshooting pages (worker not yet implemented)
*/
import PgBoss from "pg-boss";
import { config } from "dotenv";
import { join } from "path";
// Load environment variables from the `.env` file three directories above
// this module's directory (presumably the monorepo root — confirm if the
// package layout changes).
config({ path: join(__dirname, "..", "..", "..", ".env") });

/**
 * Builds the PostgreSQL connection URI from `POSTGRES_*` variables.
 *
 * User and password are percent-encoded so that credentials containing URI
 * delimiters (`@`, `:`, `/`, `#`, `%`, ...) cannot corrupt the connection
 * string. The built-in fallbacks contain no such characters, so the result
 * for an unconfigured environment is unchanged.
 *
 * NOTE(review): the fallback credentials look like local-development values;
 * production deployments should always set the POSTGRES_* variables.
 *
 * @param env - environment map to read the settings from
 * @returns a `postgres://user:password@host:port/database` URI
 */
function buildConnectionString(env: Record<string, string | undefined>): string {
  // `||` (not `??`) is deliberate: an empty variable also falls back.
  const user = encodeURIComponent(env.POSTGRES_USER || "tip");
  const password = encodeURIComponent(env.POSTGRES_PASSWORD || "tip_dev_2026");
  const host = env.POSTGRES_HOST || "localhost";
  const port = env.POSTGRES_PORT || "5433";
  const database = env.POSTGRES_DB || "transceiver_db";
  return `postgres://${user}:${password}@${host}:${port}/${database}`;
}

const connectionString = buildConnectionString(process.env);
/**
 * Creates a pg-boss instance bound to the module's connection string,
 * attaches an error logger, starts it and hands it back to the caller.
 *
 * @returns the started pg-boss instance
 * @throws whatever `boss.start()` rejects with
 */
export async function createScheduler(): Promise<PgBoss> {
  // Handler for pg-boss "error" events: log and carry on.
  const reportBossError = (error: Error): void => {
    console.error("pg-boss error:", error);
  };

  const scheduler = new PgBoss({
    connectionString,
    // Instance-wide retry settings.
    retryLimit: 3,
    retryDelay: 30,
    retryBackoff: true,
    // Instance-wide expiry; individual schedules may pass their own value.
    expireInSeconds: 300, // 5 min timeout per job
    monitorStateIntervalSeconds: 30,
  });

  // Subscribe before starting, exactly as the listener must exist for any
  // error emitted during or after startup.
  scheduler.on("error", reportBossError);

  await scheduler.start();
  console.log("pg-boss scheduler started");

  return scheduler;
}
/** One recurring scrape job: its queue, cron expression and per-job limits. */
interface ScrapeSchedule {
  readonly queue: string;
  readonly cron: string;
  readonly retryLimit: number;
  readonly expireInSeconds: number;
}

/**
 * Single source of truth for every recurring scrape job. The same table
 * drives queue creation and scheduling, so a queue can never be scheduled
 * without having been created (or vice versa).
 *
 * NOTE(review): pg-boss is documented to evaluate cron expressions in UTC
 * unless a `tz` option is passed — confirm the "am" times below are meant
 * as UTC.
 */
const SCRAPE_SCHEDULES: readonly ScrapeSchedule[] = [
  // FS.com pricing (every 4 hours — JS rendering is slow)
  { queue: "scrape:pricing:fs", cron: "0 */4 * * *", retryLimit: 2, expireInSeconds: 3600 },
  // Optcore pricing (every 6 hours — WP API enumeration + Playwright)
  { queue: "scrape:pricing:optcore", cron: "0 */6 * * *", retryLimit: 2, expireInSeconds: 7200 },
  // Compatibility matrices (every Sunday at 3am)
  { queue: "scrape:compat:cisco", cron: "0 3 * * 0", retryLimit: 3, expireInSeconds: 3600 },
  // News aggregation (every 6 hours)
  { queue: "scrape:news", cron: "0 */6 * * *", retryLimit: 2, expireInSeconds: 1800 },
  // FAQ/KB scraping (every Wednesday at 2am)
  { queue: "scrape:faq", cron: "0 2 * * 3", retryLimit: 3, expireInSeconds: 3600 },
  // Document/datasheet check (every Saturday at 4am)
  { queue: "scrape:docs", cron: "0 4 * * 6", retryLimit: 3, expireInSeconds: 7200 },
];

/**
 * Creates every scrape queue and registers its cron schedule.
 *
 * Queue creation is best-effort: a failure is logged as a warning and the
 * remaining queues are still processed, because the queue may simply exist
 * already. Unlike a silent catch, the warning keeps genuine problems
 * (connection, permissions) visible in the logs.
 *
 * @param boss - a started pg-boss instance
 * @throws whatever `boss.schedule()` rejects with
 */
export async function registerSchedules(boss: PgBoss): Promise<void> {
  // pg-boss v10: create queues before scheduling
  for (const { queue } of SCRAPE_SCHEDULES) {
    try {
      await boss.createQueue(queue);
    } catch (error) {
      console.warn(`pg-boss createQueue("${queue}") failed (continuing):`, error);
    }
  }

  for (const { queue, cron, retryLimit, expireInSeconds } of SCRAPE_SCHEDULES) {
    // No job payload ({}); only retry/expiry overrides are passed.
    await boss.schedule(queue, cron, {}, { retryLimit, expireInSeconds });
  }

  console.log("All schedules registered");
}
/**
 * Attaches one worker to each scrape queue.
 *
 * Scraper modules are pulled in with dynamic `import()` inside this function
 * (rather than at module top level) to avoid circular dependencies. The FAQ
 * and docs queues get placeholder workers that only log a message.
 *
 * @param boss - the pg-boss instance to register workers on
 * @throws whatever a dynamic import or `boss.work()` rejects with
 */
export async function registerWorkers(boss: PgBoss): Promise<void> {
  // Lazy-load scrapers to avoid circular deps
  const { scrapeFs: runFs } = await import("./scrapers/fs-com");
  const { scrapeCiscoTmg: runCiscoTmg } = await import("./scrapers/cisco-tmg");
  const { scrapeOptcore: runOptcore } = await import("./scrapers/optcore");
  const { scrapeNews: runNews } = await import("./scrapers/news");

  // Stand-in for scrapers that do not exist yet: the worker just logs.
  const nothingToRun = async (): Promise<void> => {};

  // Registration order matches the queue order used throughout the module.
  const workers = [
    { queue: "scrape:pricing:fs", message: "Running: FS.com pricing", run: runFs },
    { queue: "scrape:pricing:optcore", message: "Running: Optcore pricing", run: runOptcore },
    { queue: "scrape:compat:cisco", message: "Running: Cisco TMG", run: runCiscoTmg },
    { queue: "scrape:news", message: "Running: News aggregation", run: runNews },
    { queue: "scrape:faq", message: "FAQ scraper — not yet implemented", run: nothingToRun },
    { queue: "scrape:docs", message: "Docs scraper — not yet implemented", run: nothingToRun },
  ];

  for (const { queue, message, run } of workers) {
    await boss.work(queue, async (_job) => {
      // Timestamp is taken when the job runs, not when the worker is registered.
      console.log(`[${new Date().toISOString()}] ${message}`);
      await run();
    });
  }

  console.log("All workers registered");
}