/**
 * TIP FS.COM + NADDOD Dedicated Scraper — index-fs-only.ts
 *
 * Runs on ERIK but routes traffic through Pi's SOCKS5 proxy so target sites
 * see a residential IP instead of the IONOS datacenter IP range.
 *
 * Required env: PROXY_URLS=socks5://10.10.0.6:1080
 *
 * Jobs consumed from pg-boss:
 *   scrape:pricing:fs     — FS.com (Playwright, scheduled 02:00 + 14:00)
 *   scrape:pricing:naddod — NADDOD (fetch, scheduled 00:00/06:00/12:00/18:00)
 */
import { config } from "dotenv";
import { join } from "path";

// Load the .env file three directories above this one before anything below
// reads process.env.
config({ path: join(__dirname, "..", "..", "..", ".env") });

import PgBoss from "pg-boss";
import { mkdirSync, rmSync } from "fs";

// Postgres connection URL assembled from POSTGRES_* env vars; the port falls
// back to 5433 when POSTGRES_PORT is unset or empty.
const connectionString = `postgres://${process.env.POSTGRES_USER}:${process.env.POSTGRES_PASSWORD}@${process.env.POSTGRES_HOST}:${process.env.POSTGRES_PORT || "5433"}/${process.env.POSTGRES_DB}`;

/**
 * Runs `fn` with CRAWLEE_STORAGE_DIR pointing at a fresh temporary directory,
 * then restores the previous value and removes the directory.
 *
 * The directory is `/tmp/tip-crawlee-<name>-<timestamp>` and is pre-populated
 * with `request_queues/default`, `datasets/default` and
 * `key_value_stores/default` subdirectories.
 *
 * Note: the env var is process-global, so overlapping calls would observe each
 * other's directory while they run.
 *
 * @param name Label embedded in the temporary directory name.
 * @param fn   Async work to run while the isolated storage dir is active.
 * @throws Whatever `fn` rejects with; cleanup still runs in that case.
 */
async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> {
  const dir = `/tmp/tip-crawlee-${name}-${Date.now()}`;
  for (const store of ["request_queues", "datasets", "key_value_stores"]) {
    mkdirSync(join(dir, store, "default"), { recursive: true });
  }

  const prev = process.env.CRAWLEE_STORAGE_DIR;
  process.env.CRAWLEE_STORAGE_DIR = dir;
  try {
    await fn();
  } finally {
    // Restore the exact previous state. If the variable was not set before,
    // remove it again rather than leaving an empty string behind.
    if (prev === undefined) {
      delete process.env.CRAWLEE_STORAGE_DIR;
    } else {
      process.env.CRAWLEE_STORAGE_DIR = prev;
    }
    try {
      rmSync(dir, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup: a leftover temp dir must not fail the job.
    }
  }
}

/**
 * Starts pg-boss, makes sure both scrape queues exist, registers one worker
 * per queue and installs SIGTERM/SIGINT handlers that stop pg-boss before
 * exiting.
 */
async function main(): Promise<void> {
  const proxy = process.env.PROXY_URLS;
  console.log(`\n=== TIP FS.COM/NADDOD Scraper (proxy: ${proxy ?? "NONE — datacenter IP!"}) ===\n`);
  if (!proxy) {
    console.warn("WARNING: PROXY_URLS not set — FS.com will see IONOS IP (likely blocked by Cloudflare)");
  }

  const boss = new PgBoss({
    connectionString,
    retryLimit: 3,
    retryDelay: 300, // 5 min retry on failure
    expireInSeconds: 7200, // 2h timeout for full FS catalog run
    monitorStateIntervalSeconds: 60,
  });
  boss.on("error", (e: Error) => console.error("pg-boss error:", e.message));
  await boss.start();

  // Queue creation is best-effort: failures are deliberately ignored
  // (presumably to tolerate the queue already existing — TODO confirm).
  await boss.createQueue("scrape:pricing:fs").catch(() => {});
  await boss.createQueue("scrape:pricing:naddod").catch(() => {});

  // Scrapers are imported lazily, after dotenv has populated process.env.
  const { scrapeFs } = await import("./scrapers/fs-com");
  const { scrapeNaddod } = await import("./scrapers/naddod");

  // FS.com: Playwright crawler — PROXY_URLS auto-picked up by buildProxyConfiguration()
  await boss.work("scrape:pricing:fs", async () => {
    console.log(`[${new Date().toISOString()}] FS.COM → via ${proxy ?? "direct (no proxy)"}`);
    await withIsolatedStorage("fs", scrapeFs);
  });

  // NADDOD: fetch-based — residential IP reduces rate-limit risk on US CDN
  await boss.work("scrape:pricing:naddod", async () => {
    console.log(`[${new Date().toISOString()}] NADDOD → via ${proxy ?? "direct (no proxy)"}`);
    await scrapeNaddod();
  });

  console.log("FS.COM + NADDOD workers active — waiting for pg-boss jobs\n");

  // Shared shutdown path for both signals. A failing boss.stop() is logged and
  // turned into a non-zero exit instead of an unhandled rejection.
  const shutdown = async (signal: string): Promise<void> => {
    try {
      await boss.stop();
    } catch (e) {
      console.error(`Failed to stop pg-boss on ${signal}:`, e);
      process.exit(1);
    }
    process.exit(0);
  };
  process.on("SIGTERM", (signal) => void shutdown(signal));
  process.on("SIGINT", (signal) => void shutdown(signal));
}

main().catch((e: unknown) => {
  console.error("Fatal:", e);
  process.exit(1);
});