- index-pi.ts: removed the Playwright scrapers (FS.COM, eBay enricher, switch assets) and added NADDOD (fetch-based, benefits from a residential IP); it now runs 32 fetch-only queues that are safe for ARM/Pi without Chromium.
- index-fs-only.ts: new dedicated FS.COM + NADDOD worker for Erik; it routes through the Pi's SOCKS5 proxy via PROXY_URLS=socks5://10.10.0.6:1080, and Crawlee's ProxyConfiguration automatically applies it to the Playwright crawler.
- pi-scraper-setup.sh: removed the inline index-pi.ts override (the repo version is now authoritative).
- CODEX-TASK-pi-scraper-deploy.md: full 9-step Codex spec for the Pi fleet setup; covers the WireGuard keypair, Erik peer config, setup script, and ecosystem.config.js.
- CODEX-TASK-zero-manual-review.md: deterministic equivalence matcher spec.
78 lines
3.1 KiB
TypeScript
78 lines
3.1 KiB
TypeScript
/**
 * TIP FS.COM + NADDOD Dedicated Scraper — index-fs-only.ts
 *
 * Runs on ERIK but routes traffic through the Pi's SOCKS5 proxy so target
 * sites see a residential IP instead of the IONOS datacenter IP range.
 *
 * Required env: PROXY_URLS=socks5://10.10.0.6:1080
 *
 * Jobs consumed from pg-boss:
 *   scrape:pricing:fs     — FS.com (Playwright, scheduled 02:00 + 14:00)
 *   scrape:pricing:naddod — NADDOD (fetch, scheduled 00:00/06:00/12:00/18:00)
 */
|
|
import { config } from "dotenv";
|
|
import { join } from "path";
|
|
config({ path: join(__dirname, "..", "..", "..", ".env") });
|
|
|
|
import PgBoss from "pg-boss";
|
|
import { mkdirSync, rmSync } from "fs";
|
|
|
|
/**
 * Builds the postgres:// URI handed to pg-boss from POSTGRES_* variables.
 *
 * The user and password are percent-encoded so that credentials containing
 * URI-reserved characters (`@`, `:`, `/`, `#`, `%`, ...) do not corrupt the
 * URI. A missing user or password becomes an empty component.
 * NOTE(review): this assumes the values in .env are stored raw, not already
 * percent-encoded — confirm before deploying.
 *
 * @param env - Environment map to read POSTGRES_USER, POSTGRES_PASSWORD,
 *   POSTGRES_HOST, POSTGRES_PORT and POSTGRES_DB from.
 * @returns The connection URI; the port falls back to "5433" when
 *   POSTGRES_PORT is unset or empty.
 */
function buildConnectionString(env: Record<string, string | undefined>): string {
  const user = encodeURIComponent(env.POSTGRES_USER ?? "");
  const password = encodeURIComponent(env.POSTGRES_PASSWORD ?? "");
  const port = env.POSTGRES_PORT || "5433";
  return `postgres://${user}:${password}@${env.POSTGRES_HOST}:${port}/${env.POSTGRES_DB}`;
}

const connectionString = buildConnectionString(process.env);
|
|
|
|
/**
 * Runs `fn` while CRAWLEE_STORAGE_DIR points at a fresh scratch directory.
 *
 * A directory `/tmp/tip-crawlee-<name>-<timestamp>` is created with the
 * `request_queues`, `datasets` and `key_value_stores` default sub-folders,
 * exposed through `process.env.CRAWLEE_STORAGE_DIR` for the duration of
 * `fn`, and removed afterwards. The previous value of the variable is
 * restored even when `fn` rejects.
 *
 * The environment variable is process-global, so overlapping calls would
 * see each other's directory; callers should not run this concurrently.
 *
 * @param name - Short label embedded in the scratch directory name.
 * @param fn - Async work to run against the isolated storage directory.
 * @returns Resolves when `fn` has finished and cleanup was attempted;
 *   rejects with whatever `fn` rejected with.
 */
async function withIsolatedStorage(name: string, fn: () => Promise<void>): Promise<void> {
  const dir = `/tmp/tip-crawlee-${name}-${Date.now()}`;
  for (const store of ["request_queues", "datasets", "key_value_stores"]) {
    mkdirSync(join(dir, store, "default"), { recursive: true });
  }

  const prev = process.env.CRAWLEE_STORAGE_DIR;
  process.env.CRAWLEE_STORAGE_DIR = dir;
  try {
    await fn();
  } finally {
    // Restore the exact prior state: an originally-unset variable must be
    // deleted, not set to "" (an empty string still counts as "defined").
    if (prev === undefined) {
      delete process.env.CRAWLEE_STORAGE_DIR;
    } else {
      process.env.CRAWLEE_STORAGE_DIR = prev;
    }

    // Cleanup is best-effort: never mask the outcome of fn(), but leave a trace.
    try {
      rmSync(dir, { recursive: true, force: true });
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`withIsolatedStorage: could not remove ${dir}: ${reason}`);
    }
  }
}
|
|
|
|
/**
 * Starts the dedicated FS.COM + NADDOD worker.
 *
 * Connects to pg-boss, attempts to create both pricing queues, registers one
 * handler per queue, and installs SIGTERM/SIGINT handlers that stop pg-boss
 * before the process exits. The function returns once the workers are
 * registered; jobs are processed afterwards as pg-boss delivers them.
 *
 * PROXY_URLS is read here only for logging and the startup warning.
 *
 * @returns Resolves when both workers are registered; rejects if pg-boss
 *   fails to start or a scraper module fails to load.
 */
async function main(): Promise<void> {
  const fsQueue = "scrape:pricing:fs";
  const naddodQueue = "scrape:pricing:naddod";
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  // Treat an empty/whitespace PROXY_URLS like "unset" so the banner and the
  // warning below always agree.
  const proxy = process.env.PROXY_URLS?.trim() || undefined;
  console.log(`\n=== TIP FS.COM/NADDOD Scraper (proxy: ${proxy ?? "NONE — datacenter IP!"}) ===\n`);
  if (!proxy) {
    console.warn("WARNING: PROXY_URLS not set — FS.com will see IONOS IP (likely blocked by Cloudflare)");
  }

  const boss = new PgBoss({
    connectionString,
    retryLimit: 3,
    retryDelay: 300, // 5 min retry on failure
    expireInSeconds: 7200, // 2h timeout for full FS catalog run
    monitorStateIntervalSeconds: 60,
  });

  boss.on("error", (e: Error) => console.error("pg-boss error:", e.message));
  await boss.start();

  // Queue creation stays best-effort (a failure does not abort startup),
  // but the reason is logged instead of being silently discarded.
  for (const queue of [fsQueue, naddodQueue]) {
    await boss.createQueue(queue).catch((err: unknown) => {
      console.warn(`createQueue(${queue}) did not succeed (continuing): ${describe(err)}`);
    });
  }

  const { scrapeFs } = await import("./scrapers/fs-com");
  const { scrapeNaddod } = await import("./scrapers/naddod");

  // FS.com: Playwright crawler — PROXY_URLS auto-picked up by buildProxyConfiguration()
  await boss.work(fsQueue, async () => {
    console.log(`[${new Date().toISOString()}] FS.COM → via ${proxy ?? "direct (no proxy)"}`);
    await withIsolatedStorage("fs", scrapeFs);
  });

  // NADDOD: fetch-based — residential IP reduces rate-limit risk on US CDN
  await boss.work(naddodQueue, async () => {
    console.log(`[${new Date().toISOString()}] NADDOD → via ${proxy ?? "direct (no proxy)"}`);
    await scrapeNaddod();
  });

  console.log("FS.COM + NADDOD workers active — waiting for pg-boss jobs\n");

  // One shared shutdown path for both signals. A failing boss.stop() is
  // reported and turned into a non-zero exit instead of an unhandled
  // rejection, and a repeated signal does not start a second stop.
  let stopping = false;
  const shutdown = async (signal: string): Promise<void> => {
    if (stopping) return;
    stopping = true;
    let exitCode = 0;
    try {
      await boss.stop();
    } catch (err) {
      console.error(`pg-boss failed to stop cleanly on ${signal}:`, describe(err));
      exitCode = 1;
    }
    process.exit(exitCode);
  };
  process.on("SIGTERM", () => void shutdown("SIGTERM"));
  process.on("SIGINT", () => void shutdown("SIGINT"));
}
|
|
|
|
// Entry point: any startup failure is reported and ends the process with exit code 1.
main().catch((err) => {
  console.error("Fatal:", err);
  process.exit(1);
});
|