transceiver-db/packages/scraper/src/index-fs-only.ts
Rene Fichtmueller 0edc6e3f3a feat: Pi scraper fleet — fetch-only index-pi.ts + FS.COM/NADDOD via SOCKS5
- index-pi.ts: removed Playwright scrapers (FS.COM, eBay enricher, switch assets)
  added NADDOD (fetch-based, benefits from residential IP)
  now 32 fetch-only queues safe for ARM/Pi without Chromium
- index-fs-only.ts: new dedicated FS.COM + NADDOD worker for Erik
  routes through Pi SOCKS5 via PROXY_URLS=socks5://10.10.0.6:1080
  Crawlee ProxyConfiguration automatically applies to Playwright crawler
- pi-scraper-setup.sh: removed inline index-pi.ts override (repo version now authoritative)
- CODEX-TASK-pi-scraper-deploy.md: full 9-step Codex spec for Pi fleet setup
  covers WireGuard keypair, Erik peer config, setup script, ecosystem.config.js
- CODEX-TASK-zero-manual-review.md: deterministic equivalence matcher spec
2026-05-10 09:53:55 +02:00

78 lines
3.1 KiB
TypeScript

/**
* TIP FS.COM + NADDOD Dedicated Scraper — index-fs-only.ts
*
* Runs on ERIK but routes traffic through Pi's SOCKS5 proxy so target sites
* see a residential IP instead of the IONOS datacenter IP range.
*
* Required env: PROXY_URLS=socks5://10.10.0.6:1080
*
* Jobs consumed from pg-boss:
* scrape:pricing:fs — FS.com (Playwright, scheduled 02:00 + 14:00)
* scrape:pricing:naddod — NADDOD (fetch, scheduled 00:00/06:00/12:00/18:00)
*/
import { config } from "dotenv";
import { join } from "path";
config({ path: join(__dirname, "..", "..", "..", ".env") });
import PgBoss from "pg-boss";
import { mkdirSync, rmSync } from "fs";
/**
 * Builds the Postgres connection URI handed to pg-boss from POSTGRES_* variables.
 *
 * User and password are percent-encoded so that credentials containing URI
 * delimiters (`@`, `:`, `/`, `#`, ...) cannot break or silently change the
 * parsed URI. Values are expected to be stored raw (not pre-encoded) in the
 * environment. A missing user or password becomes an empty string.
 *
 * @param env Variable source; defaults to `process.env`.
 * @returns A `postgres://user:password@host:port/db` URI. The port falls back
 *          to "5433" when POSTGRES_PORT is unset or empty.
 */
function buildConnectionString(env: Record<string, string | undefined> = process.env): string {
  const user = encodeURIComponent(env.POSTGRES_USER ?? "");
  const password = encodeURIComponent(env.POSTGRES_PASSWORD ?? "");
  // `||` is deliberate: an empty POSTGRES_PORT should also select the default.
  const port = env.POSTGRES_PORT || "5433";
  return `postgres://${user}:${password}@${env.POSTGRES_HOST}:${port}/${env.POSTGRES_DB}`;
}
const connectionString = buildConnectionString();
/**
 * Runs `fn` while CRAWLEE_STORAGE_DIR points at a fresh throwaway directory.
 *
 * A directory `/tmp/tip-crawlee-<name>-<timestamp>` is created with the
 * `request_queues`, `datasets` and `key_value_stores` sub-stores (each with a
 * `default` folder). After `fn` settles, whether it resolved or rejected, the
 * previous value of CRAWLEE_STORAGE_DIR is restored and the directory is
 * removed on a best-effort basis.
 *
 * NOTE: `process.env` is process-wide, so overlapping calls would see each
 * other's directory; callers are presumably expected to run one at a time.
 *
 * @param name Label embedded in the temporary directory name.
 * @param fn   Async work to run against the isolated storage.
 * @throws Whatever `fn` rejects with (cleanup still runs).
 */
async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> {
  const dir = `/tmp/tip-crawlee-${name}-${Date.now()}`;
  for (const store of ["request_queues", "datasets", "key_value_stores"]) {
    mkdirSync(join(dir, store, "default"), { recursive: true });
  }

  const prev = process.env.CRAWLEE_STORAGE_DIR;
  process.env.CRAWLEE_STORAGE_DIR = dir;
  try {
    await fn();
  } finally {
    // Restore exactly: an originally-unset variable must be deleted. Assigning
    // "" would leave it defined (as an empty path) for every later reader.
    if (prev === undefined) {
      delete process.env.CRAWLEE_STORAGE_DIR;
    } else {
      process.env.CRAWLEE_STORAGE_DIR = prev;
    }
    try {
      rmSync(dir, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup: a leftover temp directory must not fail the job.
    }
  }
}
/**
 * Boots the dedicated FS.COM + NADDOD worker process.
 *
 * Steps: report the configured proxy, start pg-boss, ensure both pricing
 * queues exist, register one worker per queue, then install SIGTERM/SIGINT
 * handlers that stop pg-boss before exiting.
 *
 * @throws If pg-boss fails to start, a scraper module fails to load, or a
 *         worker cannot be registered. Queue-creation failures are logged
 *         and tolerated.
 */
async function main(): Promise<void> {
  const proxy = process.env.PROXY_URLS;
  console.log(`\n=== TIP FS.COM/NADDOD Scraper (proxy: ${proxy ?? "NONE — datacenter IP!"}) ===\n`);
  if (!proxy) {
    console.warn("WARNING: PROXY_URLS not set — FS.com will see IONOS IP (likely blocked by Cloudflare)");
  }

  const boss = new PgBoss({
    connectionString,
    retryLimit: 3,
    retryDelay: 300, // 5 min retry on failure
    expireInSeconds: 7200, // 2h timeout for full FS catalog run
    monitorStateIntervalSeconds: 60,
  });
  boss.on("error", (e: Error) => console.error("pg-boss error:", e.message));
  await boss.start();

  // Queue creation stays non-fatal (the queue may already exist), but the
  // failure is logged so real problems are not hidden.
  for (const queue of ["scrape:pricing:fs", "scrape:pricing:naddod"]) {
    try {
      await boss.createQueue(queue);
    } catch (e: unknown) {
      console.warn(`createQueue(${queue}) failed — continuing:`, e instanceof Error ? e.message : e);
    }
  }

  // Loaded lazily, after pg-boss is up, as in the original start-up order.
  const { scrapeFs } = await import("./scrapers/fs-com");
  const { scrapeNaddod } = await import("./scrapers/naddod");

  // FS.com: Playwright crawler — PROXY_URLS auto-picked up by buildProxyConfiguration()
  await boss.work("scrape:pricing:fs", async () => {
    console.log(`[${new Date().toISOString()}] FS.COM → via ${proxy ?? "direct (no proxy)"}`);
    await withIsolatedStorage("fs", scrapeFs);
  });

  // NADDOD: fetch-based — residential IP reduces rate-limit risk on US CDN
  await boss.work("scrape:pricing:naddod", async () => {
    console.log(`[${new Date().toISOString()}] NADDOD → via ${proxy ?? "direct (no proxy)"}`);
    await scrapeNaddod();
  });

  console.log("FS.COM + NADDOD workers active — waiting for pg-boss jobs\n");

  // One shutdown routine for both signals. A failing boss.stop() is reported
  // and turned into a non-zero exit instead of an unhandled rejection.
  const shutdown = async (signal: string): Promise<void> => {
    try {
      await boss.stop();
      process.exit(0);
    } catch (e: unknown) {
      console.error(`pg-boss stop failed during ${signal}:`, e);
      process.exit(1);
    }
  };
  process.on("SIGTERM", () => void shutdown("SIGTERM"));
  process.on("SIGINT", () => void shutdown("SIGINT"));
}
// Entry point: any error escaping main() is fatal — log it and exit non-zero.
main().catch((err) => {
  console.error("Fatal:", err);
  process.exit(1);
});