feat: NADDOD cursor rotation — covers all 7300+ URLs across ~13 runs (~26h)

Previously always sliced first 600 URLs from sitemap, missing 6700+ products.
Now stores offset in naddod-cursor.json, advances by 600 per run with wrap-around.
Full sitemap coverage in ~13 runs (26h). Also adds TIP_STORAGE_DIR env support.
This commit is contained in:
Rene Fichtmueller 2026-05-06 23:26:58 +02:00
parent efb0c24a19
commit 5a77fce9f3

View File

@ -17,6 +17,8 @@
*/ */
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db"; import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
import { contentHash } from "../utils/hash"; import { contentHash } from "../utils/hash";
import { readFileSync, writeFileSync, existsSync } from "node:fs";
import { join } from "node:path";
const BASE = "https://www.naddod.com"; const BASE = "https://www.naddod.com";
const SITEMAP_URL = `${BASE}/sitemaps/products.xml`; const SITEMAP_URL = `${BASE}/sitemaps/products.xml`;
@ -29,6 +31,27 @@ const HEADERS = {
// Limit detail-page fetches per run to stay reasonable // Limit detail-page fetches per run to stay reasonable
const MAX_DETAIL_PAGES = 600; const MAX_DETAIL_PAGES = 600;
// Cursor file: persists across runs so each run covers the next 600 URLs.
// Stored as "naddod-cursor.json" inside the directory named by the TIP_STORAGE_DIR
// environment variable; when that variable is unset the directory is "/opt/tip".
// Note: `??` only substitutes for an unset variable — an empty string is used as-is.
const CURSOR_FILE = join(process.env["TIP_STORAGE_DIR"] ?? "/opt/tip", "naddod-cursor.json");
/**
 * Reads the persisted sitemap offset from CURSOR_FILE.
 *
 * Never throws: a missing file yields 0 silently (the normal first-run case), while an
 * unreadable file, malformed JSON, or an `offset` that is not a non-negative safe
 * integer yields 0 with a warning, so an unexpected restart of the rotation is visible
 * in the logs instead of happening silently.
 *
 * @returns The stored offset, or 0 when no valid offset is available.
 */
function readCursor(): number {
  // A missing cursor file simply means "start from the beginning".
  if (!existsSync(CURSOR_FILE)) return 0;

  try {
    // JSON.parse returns arbitrary data; treat it as unknown and validate before use.
    const parsed: unknown = JSON.parse(readFileSync(CURSOR_FILE, "utf-8"));
    const candidate = (parsed as { offset?: unknown } | null)?.offset;

    // Require an integer: a fractional offset would survive the caller's modulo
    // arithmetic and be written back unchanged on every subsequent run.
    if (typeof candidate === "number" && Number.isSafeInteger(candidate) && candidate >= 0) {
      return candidate;
    }
    console.warn(" [cursor] Cursor file has no valid offset, restarting at 0");
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(` [cursor] Failed to read cursor, restarting at 0: ${reason}`);
  }
  return 0;
}
/**
 * Saves the offset the next run should start from into CURSOR_FILE.
 *
 * The file content is a JSON object holding `offset` and an ISO-8601 `updated`
 * timestamp. Writing is best-effort: a failure is reported with console.warn and
 * is not rethrown.
 *
 * @param offset Index to persist for the next run.
 */
function writeCursor(offset: number): void {
  try {
    const record = { offset, updated: new Date().toISOString() };
    const serialized = JSON.stringify(record);
    writeFileSync(CURSOR_FILE, serialized);
  } catch (err) {
    const failure = err as Error;
    console.warn(` [cursor] Failed to write cursor: ${failure.message}`);
  }
}
function sleep(ms: number): Promise<void> { function sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms)); return new Promise((resolve) => setTimeout(resolve, ms));
} }
@ -224,9 +247,19 @@ export async function scrapeNaddod(): Promise<void> {
return; return;
} }
// Limit to avoid excessive runtime // Cursor-based rotation: each run advances by MAX_DETAIL_PAGES so over ~12 runs
const urls = productUrls.slice(0, MAX_DETAIL_PAGES); // (24 hours) we cover all ~7300 products. Wraps around when exhausted.
console.log(` Processing ${urls.length} products (limit: ${MAX_DETAIL_PAGES})`); const totalUrls = productUrls.length;
const offset = readCursor() % totalUrls;
const endIdx = Math.min(offset + MAX_DETAIL_PAGES, totalUrls);
let urls = productUrls.slice(offset, endIdx);
// Wrap around if we got fewer than MAX_DETAIL_PAGES (hit the end of the list)
if (urls.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) {
const wrap = MAX_DETAIL_PAGES - urls.length;
urls = urls.concat(productUrls.slice(0, wrap));
}
const nextOffset = (offset + MAX_DETAIL_PAGES) % totalUrls;
console.log(` Offset: ${offset}/${totalUrls} → processing ${urls.length} products (next run starts at ${nextOffset})`);
// ── Phase 2: Fetch detail pages + write to DB ───────────────────────────── // ── Phase 2: Fetch detail pages + write to DB ─────────────────────────────
console.log("\n[Phase 2] Fetching product detail pages..."); console.log("\n[Phase 2] Fetching product detail pages...");
@ -315,12 +348,17 @@ export async function scrapeNaddod(): Promise<void> {
} }
} }
// Advance cursor for next run
writeCursor(nextOffset);
console.log("\n=== NADDOD Scraper v2 Complete ==="); console.log("\n=== NADDOD Scraper v2 Complete ===");
console.log(` URL range processed: ${offset}${offset + urls.length - 1} of ${totalUrls}`);
console.log(` Products processed: ${processed}`); console.log(` Products processed: ${processed}`);
console.log(` Non-transceivers skip: ${skippedNonTx}`); console.log(` Non-transceivers skip: ${skippedNonTx}`);
console.log(` Price observations: ${priceUpdates} new`); console.log(` Price observations: ${priceUpdates} new`);
console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`); console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`);
if (errors > 0) console.warn(` Errors: ${errors}`); if (errors > 0) console.warn(` Errors: ${errors}`);
console.log(` Next run starts at: offset ${nextOffset}`);
} }
if (require.main === module) { if (require.main === module) {