From fc7d74e6808b85814388c9d605af044dcfff6a92 Mon Sep 17 00:00:00 2001
From: Rene Fichtmueller
Date: Thu, 2 Apr 2026 01:47:16 +0200
Subject: [PATCH] feat: download datasheets + manuals to Fearghas NAS in nightly sync
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- downloadDocuments(): fetches PDFs from product_documents and documents tables
  using curl, organises into switches/ transceivers/ whitepapers/ other/ subdirs
- curl is invoked via execFile (no shell) with --fail; only http(s) URLs are
  fetched; file names carry the row id so same-titled documents do not collide
- Integrated into runNightlyNasSync() — runs after JSON exports
- rsync incremental — only new/changed files transferred
- NAS dir structure: /volume1/tip-data/datasheets/{switches,transceivers,whitepapers,other}
- max-filesize 50MB guard per file
---
 packages/scraper/src/utils/nas-sync.ts | 123 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 118 insertions(+), 5 deletions(-)

diff --git a/packages/scraper/src/utils/nas-sync.ts b/packages/scraper/src/utils/nas-sync.ts
index 4804890..ee851c6 100644
--- a/packages/scraper/src/utils/nas-sync.ts
+++ b/packages/scraper/src/utils/nas-sync.ts
@@ -28,7 +28,7 @@
 
-import { exec } from "child_process";
+import { exec, execFile } from "child_process";
 import { promisify } from "util";
-import { writeFile, mkdir, rm } from "fs/promises";
+import { writeFile, mkdir, rm, access } from "fs/promises";
 import { join } from "path";
 import { tmpdir } from "os";
 import { db as pool } from "./db";
@@ -253,10 +253,17 @@ export async function runNightlyNasSync(): Promise<void> {
       await createDbBackup(backupDir, date);
     }
 
-    // 4. Sync exports to NAS
+    // 4. Download datasheets / manuals / whitepapers
+    const datasheetDir = join(tmpBase, "datasheets");
+    await downloadDocuments(datasheetDir);
+
+    // 5. Sync exports to NAS
     await rsyncToNas(join(tmpBase, "exports"), "exports");
 
-    // 5. Sync backups if created
+    // 6. Sync datasheets to NAS (incremental — only new files)
+    await rsyncToNas(datasheetDir, "datasheets");
+
+    // 7. Sync backups if created
     if (isBackupDay) {
       await rsyncToNas(backupDir, "db-backups");
     }
@@ -268,11 +275,117 @@ export async function runNightlyNasSync(): Promise<void> {
   }
 }
 
+// ─────────────────────────────────────────────────────────────────────────────
+// Download datasheets, manuals & documents to local dir for NAS sync
+// ─────────────────────────────────────────────────────────────────────────────
+const execFileAsync = promisify(execFile);
+
+/**
+ * Downloads every document referenced by `product_documents` and `documents`
+ * into `destDir`, sorted into switches/ transceivers/ whitepapers/ other/.
+ *
+ * A failed download is logged and skipped so that a single bad URL cannot
+ * abort the nightly sync.
+ *
+ * @param destDir Local staging directory that is rsynced to the NAS afterwards.
+ * @returns Number of files fetched during this call.
+ */
+async function downloadDocuments(destDir: string): Promise<number> {
+  // recursive: true creates destDir itself along with each subdirectory
+  for (const subdir of ["switches", "transceivers", "whitepapers", "other"]) {
+    await mkdir(join(destDir, subdir), { recursive: true });
+  }
+
+  let downloaded = 0;
+
+  // 1. product_documents (switch + transceiver datasheets, manuals)
+  const pdRows = await pool.query<{
+    id: string; doc_type: string; title: string;
+    url: string; switch_id: string | null; transceiver_id: string | null;
+  }>(`
+    SELECT id, doc_type, title,
+           COALESCE(download_url, source_url) AS url,
+           switch_id, transceiver_id
+    FROM product_documents
+    WHERE COALESCE(download_url, source_url) IS NOT NULL
+    ORDER BY created_at DESC
+  `);
+
+  // 2. documents table (whitepapers, research)
+  const docsRows = await pool.query<{
+    id: string; doc_type: string; title: string; url: string;
+  }>(`
+    SELECT id, doc_type, title, source_url AS url
+    FROM documents
+    WHERE source_url IS NOT NULL
+    ORDER BY created_at DESC
+  `);
+
+  const allDocs = [
+    ...pdRows.rows.map(r => ({ ...r, source: "product_documents" as const })),
+    ...docsRows.rows.map(r => ({ ...r, switch_id: null, transceiver_id: null, source: "documents" as const })),
+  ];
+
+  for (const doc of allDocs) {
+    if (!doc.url) continue;
+
+    // URLs originate from scraped data: only fetch plain http(s) targets.
+    if (!/^https?:\/\//i.test(doc.url)) {
+      logger.warn(`Skipping doc with unsupported URL: ${doc.title}`, { url: doc.url });
+      continue;
+    }
+
+    const subdir = doc.switch_id ? "switches"
+      : doc.transceiver_id ? "transceivers"
+      : doc.doc_type === "whitepaper" ? "whitepapers"
+      : "other";
+
+    // Titles are not unique ("Datasheet", "User Manual", ...), so the row id is
+    // appended to keep same-titled documents from shadowing each other.
+    const safeTitle = (doc.title || "doc").replace(/[^a-z0-9_\-\.]/gi, "_").slice(0, 80);
+    const safeId = String(doc.id).replace(/[^a-z0-9_-]/gi, "_");
+    const fileName = `${safeTitle}-${safeId}.pdf`;
+    const filePath = join(destDir, subdir, fileName);
+
+    // Skip if the file is already present locally
+    try { await access(filePath); continue; } catch { /* not cached, download */ }
+
+    try {
+      // execFile hands argv straight to curl (no shell), so quotes, `$()` or
+      // backticks inside a URL are inert. --fail keeps HTTP error pages from
+      // being stored as "PDFs".
+      await execFileAsync("curl", [
+        "--silent", "--location", "--fail",
+        "--proto", "=http,https",
+        "--max-time", "60",
+        "--max-filesize", "52428800",
+        "--user-agent", "TIP-DataCollector/1.0",
+        "--output", filePath,
+        "--url", doc.url,
+      ], { timeout: 65000 });
+      downloaded++;
+      logger.info(`Downloaded: ${subdir}/${fileName}`);
+    } catch (err) {
+      // Best-effort cleanup: a truncated file would otherwise be rsynced and
+      // treated as cached by the access() check above.
+      await rm(filePath, { force: true }).catch(() => undefined);
+      logger.warn(`Failed to download doc: ${doc.title}`, { url: doc.url, err });
+    }
+  }
+
+  logger.info(`Documents downloaded: ${downloaded}/${allDocs.length}`);
+  return downloaded;
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // Setup NAS directory structure (run once)
 // ─────────────────────────────────────────────────────────────────────────────
 export async function setupNasDirectories(): Promise<void> {
-  const dirs = ["exports", "db-backups", "raw-cache", "logs"];
+  const dirs = [
+    "exports", "db-backups", "raw-cache", "logs",
+    "datasheets", "datasheets/switches", "datasheets/transceivers",
+    "datasheets/whitepapers", "datasheets/other",
+  ];
   for (const dir of dirs) {
     try {
       await execAsync(