From 5a77fce9f3d01321f1e4e03704aaa3d127cc5d25 Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Wed, 6 May 2026 23:26:58 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20NADDOD=20cursor=20rotation=20=E2=80=94?= =?UTF-8?q?=20covers=20all=207300+=20URLs=20across=2013=20runs=20(26h)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously always sliced first 600 URLs from sitemap, missing 6700+ products. Now stores offset in naddod-cursor.json, advances by 600 per run with wrap-around. Full sitemap coverage in ~13 runs (26h). Also adds TIP_STORAGE_DIR env support. --- packages/scraper/src/scrapers/naddod.ts | 44 +++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/packages/scraper/src/scrapers/naddod.ts b/packages/scraper/src/scrapers/naddod.ts index f147d75..075866c 100644 --- a/packages/scraper/src/scrapers/naddod.ts +++ b/packages/scraper/src/scrapers/naddod.ts @@ -17,6 +17,8 @@ */ import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db"; import { contentHash } from "../utils/hash"; +import { readFileSync, writeFileSync, existsSync } from "node:fs"; +import { join } from "node:path"; const BASE = "https://www.naddod.com"; const SITEMAP_URL = `${BASE}/sitemaps/products.xml`; @@ -29,6 +31,27 @@ const HEADERS = { // Limit detail-page fetches per run to stay reasonable const MAX_DETAIL_PAGES = 600; +// Cursor file: persists across runs so each run covers the next 600 URLs +const CURSOR_FILE = join(process.env["TIP_STORAGE_DIR"] ?? "/opt/tip", "naddod-cursor.json"); + +function readCursor(): number { + try { + if (existsSync(CURSOR_FILE)) { + const data = JSON.parse(readFileSync(CURSOR_FILE, "utf-8")) as { offset: number }; + return typeof data.offset === "number" && data.offset >= 0 ? 
data.offset : 0; + } + } catch { /* fall through */ } + return 0; +} + +function writeCursor(offset: number): void { + try { + writeFileSync(CURSOR_FILE, JSON.stringify({ offset, updated: new Date().toISOString() })); + } catch (err) { + console.warn(` [cursor] Failed to write cursor: ${(err as Error).message}`); + } +} + function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)); } @@ -224,9 +247,19 @@ export async function scrapeNaddod(): Promise { return; } - // Limit to avoid excessive runtime - const urls = productUrls.slice(0, MAX_DETAIL_PAGES); - console.log(` Processing ${urls.length} products (limit: ${MAX_DETAIL_PAGES})`); + // Cursor-based rotation: each run advances by MAX_DETAIL_PAGES so over ~13 runs + // (26 hours) we cover all ~7300 products. Wraps around when exhausted. + const totalUrls = productUrls.length; + const offset = readCursor() % totalUrls; + const endIdx = Math.min(offset + MAX_DETAIL_PAGES, totalUrls); + let urls = productUrls.slice(offset, endIdx); + // Wrap around if we got fewer than MAX_DETAIL_PAGES (hit the end of the list) + if (urls.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) { + const wrap = MAX_DETAIL_PAGES - urls.length; + urls = urls.concat(productUrls.slice(0, wrap)); + } + const nextOffset = (offset + MAX_DETAIL_PAGES) % totalUrls; + console.log(` Offset: ${offset}/${totalUrls} → processing ${urls.length} products (next run starts at ${nextOffset})`); // ── Phase 2: Fetch detail pages + write to DB ───────────────────────────── console.log("\n[Phase 2] Fetching product detail pages..."); @@ -315,12 +348,17 @@ export async function scrapeNaddod(): Promise { } } + // Advance cursor for next run + writeCursor(nextOffset); + console.log("\n=== NADDOD Scraper v2 Complete ==="); + console.log(` URL range processed: ${offset}–${offset + urls.length - 1} of ${totalUrls}`); console.log(` Products processed: ${processed}`); console.log(` Non-transceivers skip: ${skippedNonTx}`); 
console.log(` Price observations: ${priceUpdates} new`); console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`); if (errors > 0) console.warn(` Errors: ${errors}`); + console.log(` Next run starts at: offset ${nextOffset}`); } if (require.main === module) {