feat: NADDOD cursor rotation — covers all 7300+ URLs across ~13 runs (~26h)
Previously always sliced first 600 URLs from sitemap, missing 6700+ products. Now stores offset in naddod-cursor.json, advances by 600 per run with wrap-around. Full sitemap coverage in ~13 runs (26h). Also adds TIP_STORAGE_DIR env support.
This commit is contained in:
parent
efb0c24a19
commit
5a77fce9f3
@ -17,6 +17,8 @@
|
||||
*/
|
||||
import { pool, findOrCreateScrapedTransceiver, ensureVendor, upsertPriceObservation, upsertStockObservation } from "../utils/db";
|
||||
import { contentHash } from "../utils/hash";
|
||||
import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
|
||||
const BASE = "https://www.naddod.com";
|
||||
const SITEMAP_URL = `${BASE}/sitemaps/products.xml`;
|
||||
@ -29,6 +31,27 @@ const HEADERS = {
|
||||
// Limit detail-page fetches per run to stay reasonable
|
||||
const MAX_DETAIL_PAGES = 600;
|
||||
|
||||
// Cursor file: persists across runs so each run covers the next 600 URLs
|
||||
const CURSOR_FILE = join(process.env["TIP_STORAGE_DIR"] ?? "/opt/tip", "naddod-cursor.json");
|
||||
|
||||
/**
 * Load the sitemap offset persisted by the previous run.
 *
 * The cursor file is expected to hold a JSON object with a numeric `offset`
 * field. Anything else (missing file, unreadable file, malformed JSON, wrong
 * shape, negative or fractional offset) yields 0 so the rotation restarts from
 * the beginning of the sitemap instead of aborting the scrape.
 *
 * @returns A non-negative integer offset, or 0 when no usable cursor exists.
 */
function readCursor(): number {
  let raw: string;
  try {
    // Read directly instead of existsSync + readFileSync: one syscall, and no
    // window in which the file can disappear between the check and the read.
    raw = readFileSync(CURSOR_FILE, "utf-8");
  } catch (err) {
    // A missing file is the normal first-run case; anything else is worth a log line.
    if ((err as { code?: string } | null)?.code !== "ENOENT") {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`  [cursor] Failed to read cursor, starting at 0: ${reason}`);
    }
    return 0;
  }

  try {
    const data: unknown = JSON.parse(raw);
    if (typeof data === "object" && data !== null && "offset" in data) {
      const offset = (data as { offset: unknown }).offset;
      // The offset is used as an array index, so only whole, non-negative
      // numbers are meaningful.
      if (typeof offset === "number" && Number.isSafeInteger(offset) && offset >= 0) {
        return offset;
      }
    }
    console.warn("  [cursor] Cursor file has no valid offset, starting at 0");
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  [cursor] Cursor file is not valid JSON, starting at 0: ${reason}`);
  }
  return 0;
}
|
||||
|
||||
/**
 * Persist the offset the next run should start from.
 *
 * Best-effort: a failure is logged and swallowed so a cursor problem never
 * fails an otherwise successful scrape. The payload is a JSON object holding
 * `offset` and an ISO-8601 `updated` timestamp.
 *
 * @param offset - Index into the sitemap URL list where the next run begins.
 */
function writeCursor(offset: number): void {
  const payload = JSON.stringify({ offset, updated: new Date().toISOString() });
  const tmpFile = `${CURSOR_FILE}.tmp`;
  try {
    // The storage directory may not exist yet (e.g. a freshly configured
    // TIP_STORAGE_DIR); without it every write would fail and the cursor
    // would never advance.
    mkdirSync(dirname(CURSOR_FILE), { recursive: true });
    // Write to a sibling temp file and rename over the target so an
    // interrupted run cannot leave a truncated cursor behind.
    writeFileSync(tmpFile, payload);
    renameSync(tmpFile, CURSOR_FILE);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`  [cursor] Failed to write cursor: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Pause for roughly `ms` milliseconds.
 *
 * Wraps `setTimeout` in a promise so async callers can `await` a delay.
 * Only `resolve` is ever invoked, so the returned promise never rejects.
 *
 * @param ms - Delay passed straight to `setTimeout`, in milliseconds.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
@ -224,9 +247,19 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
return;
|
||||
}
|
||||
|
||||
// Limit to avoid excessive runtime
|
||||
const urls = productUrls.slice(0, MAX_DETAIL_PAGES);
|
||||
console.log(` Processing ${urls.length} products (limit: ${MAX_DETAIL_PAGES})`);
|
||||
// Cursor-based rotation: each run advances by MAX_DETAIL_PAGES, so ~13 runs
|
||||
// (~26 hours) cover all ~7300 products. Wraps around when exhausted.
|
||||
const totalUrls = productUrls.length;
|
||||
const offset = readCursor() % totalUrls;
|
||||
const endIdx = Math.min(offset + MAX_DETAIL_PAGES, totalUrls);
|
||||
let urls = productUrls.slice(offset, endIdx);
|
||||
// Wrap around if we got fewer than MAX_DETAIL_PAGES (hit the end of the list)
|
||||
if (urls.length < MAX_DETAIL_PAGES && totalUrls > MAX_DETAIL_PAGES) {
|
||||
const wrap = MAX_DETAIL_PAGES - urls.length;
|
||||
urls = urls.concat(productUrls.slice(0, wrap));
|
||||
}
|
||||
const nextOffset = (offset + MAX_DETAIL_PAGES) % totalUrls;
|
||||
console.log(` Offset: ${offset}/${totalUrls} → processing ${urls.length} products (next run starts at ${nextOffset})`);
|
||||
|
||||
// ── Phase 2: Fetch detail pages + write to DB ─────────────────────────────
|
||||
console.log("\n[Phase 2] Fetching product detail pages...");
|
||||
@ -315,12 +348,17 @@ export async function scrapeNaddod(): Promise<void> {
|
||||
}
|
||||
}
|
||||
|
||||
// Advance cursor for next run
|
||||
writeCursor(nextOffset);
|
||||
|
||||
console.log("\n=== NADDOD Scraper v2 Complete ===");
|
||||
console.log(` URL range processed: ${offset}–${offset + urls.length - 1} of ${totalUrls}`);
|
||||
console.log(` Products processed: ${processed}`);
|
||||
console.log(` Non-transceivers skip: ${skippedNonTx}`);
|
||||
console.log(` Price observations: ${priceUpdates} new`);
|
||||
console.log(` Stock observations: ${stockWritten} new, ${stockSkipped} unchanged`);
|
||||
if (errors > 0) console.warn(` Errors: ${errors}`);
|
||||
console.log(` Next run starts at: offset ${nextOffset}`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user