From 9e6be570a3cb704bee43920f9bd0a44eaabac3ee Mon Sep 17 00:00:00 2001 From: Rene Fichtmueller Date: Tue, 21 Apr 2026 00:31:43 +0200 Subject: [PATCH] feat: more switch image coverage + system health metrics + Erik monitor switch-image-fetcher: - Add Fortinet URL builder (11 FortiSwitch models) - Add Quanta Cloud Technology, Allied Telesis, Ufispace, Netgear URL builders - Fix alcatel-lucent-enterprise slug missing from URL_BUILDERS dispatcher - Fix NVIDIA builder to skip ConnectX/BlueField adapters (not switches) - Add aruba slug alias for hpe-aruba health endpoint: - Add system metrics: CPU load (1/5/15m), memory usage, disk usage - Add load_status indicator (ok/busy/overloaded) - Expose process RSS memory - Used by external monitors scripts/monitor-erik.sh: - Cron-ready health check script for Claudi (.82) and Raspberry Pis - Checks TIP API health endpoint (load, memory, disk, DB latency) - Checks PM2 process state via SSH (errored/stopped detection) - ntfy.sh push notifications (set NTFY_TOPIC env var) - Includes systemd service + timer unit comments for auto-install --- packages/api/src/routes/health.ts | 36 ++++ .../src/scrapers/switch-image-fetcher.ts | 86 ++++++--- scripts/monitor-erik.sh | 166 ++++++++++++++++++ 3 files changed, 266 insertions(+), 22 deletions(-) create mode 100755 scripts/monitor-erik.sh diff --git a/packages/api/src/routes/health.ts b/packages/api/src/routes/health.ts index 6d7cee9..66beeff 100644 --- a/packages/api/src/routes/health.ts +++ b/packages/api/src/routes/health.ts @@ -1,6 +1,8 @@ import { Router, Request, Response } from "express"; import { getDbStats } from "../db/queries"; import { pool } from "../db/client"; +import { loadavg, totalmem, freemem, cpus } from "os"; +import { execSync } from "child_process"; export const healthRouter = Router(); @@ -36,11 +38,45 @@ healthRouter.get("/", async (_req: Request, res: Response) => { `).catch(() => ({ rows: [{}] })); const s = stockStats.rows[0] || {}; + // System metrics: load average, memory and root-filesystem usage, sampled on every request + const
[load1, load5, load15] = loadavg(); + const totalMem = totalmem(); + const freeMem = freemem(); + const usedMem = totalMem - freeMem; + const coreCount = cpus().length; + + let diskUsedPct: number | null = null; + let diskFreeGb: number | null = null; + try { + const df = execSync("df -Pk / 2>/dev/null | tail -1", { timeout: 2000 }).toString().trim(); // -P: one fixed-format line per filesystem, -k: sizes in KiB (df -h emits mixed K/M/G/T suffixes) + const [, , , availKb = "", usedPct = ""] = df.split(/\s+/); // columns: filesystem, 1024-blocks, used, available, capacity, mount + diskUsedPct = /^\d+%$/.test(usedPct) ? parseInt(usedPct, 10) : null; // a real 0% stays 0 instead of becoming null + diskFreeGb = /^\d+$/.test(availKb) ? +(Number(availKb) / 1024 / 1024).toFixed(1) : null; // KiB → GiB, one decimal + } catch { /* skip on systems without df */ } + + const loadStatus = load1 > coreCount * 0.9 ? "overloaded" : load1 > coreCount * 0.6 ? "busy" : "ok"; + res.json({ success: true, status: "healthy", version: "0.3.0", uptime: process.uptime(), + system: { + load: { "1m": +load1.toFixed(2), "5m": +load5.toFixed(2), "15m": +load15.toFixed(2) }, + load_status: loadStatus, + cpu_cores: coreCount, + memory: { + total_mb: Math.round(totalMem / 1024 / 1024), + used_mb: Math.round(usedMem / 1024 / 1024), + free_mb: Math.round(freeMem / 1024 / 1024), + used_pct: Math.round(usedMem / totalMem * 100), + }, + disk: { + used_pct: diskUsedPct, + free_gb: diskFreeGb, + }, + process_rss_mb: Math.round(process.memoryUsage().rss / 1024 / 1024), + }, database: { connected: true, latency_ms: latencyMs, diff --git a/packages/scraper/src/scrapers/switch-image-fetcher.ts b/packages/scraper/src/scrapers/switch-image-fetcher.ts index 597cca3..1b198e6 100644 --- a/packages/scraper/src/scrapers/switch-image-fetcher.ts +++ b/packages/scraper/src/scrapers/switch-image-fetcher.ts @@ -8,12 +8,14 @@ * 4.
Write image_url + product_page_url to switches table * * Vendors covered: - * Cisco (Nexus 9000/9300, NCS 5500/5700, Catalyst 9300/9500) + * Cisco (Nexus 9000/9300, NCS 5500/5700, Catalyst 9300/9500, 8000 SP) * Arista (7000 series) * Juniper (QFX, EX series) - * NVIDIA Networking (Spectrum SN series) + * NVIDIA Networking (Spectrum SN series — ConnectX skipped) * Edgecore, Celestica, Asterfusion (whitebox) + * Fortinet (FortiSwitch series) * Dell, HPE/Aruba, Huawei, Nokia, Extreme, MikroTik, Ubiquiti, FS.COM, Supermicro + * Alcatel-Lucent Enterprise, Allied Telesis, Netgear, Quanta Cloud Technology, Ufispace * * Rate limit: 1 req/2sec per domain, max 3 concurrent domains. * Respects robots.txt: User-Agent identifies as research bot. @@ -89,7 +91,11 @@ function buildJuniperUrl(model: string): string | null { function buildNvidiaUrl(model: string): string | null { // SN5600 → https://www.nvidia.com/en-us/networking/ethernet-switching/sn5600/ // SN4700 → https://www.nvidia.com/en-us/networking/ethernet-switching/sn4700/ - const slug = model.toUpperCase().replace(/[^A-Z0-9]/g, ""); + // ConnectX-7 / BlueField are adapters, not switches — skip + const m = model.toUpperCase(); + if (m.includes("CONNECTX") || m.includes("BLUEFIELD")) return null; + const slug = m.replace(/[^A-Z0-9]/g, ""); + if (!slug.startsWith("SN")) return null; // only Spectrum switch series return `https://www.nvidia.com/en-us/networking/ethernet-switching/${slug.toLowerCase()}/`; } @@ -154,28 +160,64 @@ function buildAsterfusionUrl(model: string): string | null { return `https://www.asterfusion.com/products/${slug}/`; } +function buildFortinetUrl(model: string): string | null { + // FortiSwitch 1024E → fortiswitch-1024e + // FortiSwitch 124F-POE → fortiswitch-124f-poe + const slug = model.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, ""); + return `https://www.fortinet.com/products/switches/${slug}.html`; +} + +function buildQuantaUrl(model: string): string | null { + // QuantaMesh
T3048-LY8, T7032-IX1 etc. + const slug = model.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, ""); + return `https://www.qct.io/product/index/Infrastructure-Product/Networking/Switch/${slug}`; +} + +function buildAlliedTelesisUrl(model: string): string | null { + // AT-x530-28GSX → https://www.alliedtelesis.com/us/en/products/at-x530-28gsx + const slug = model.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, ""); + return `https://www.alliedtelesis.com/us/en/products/${slug}`; +} + +function buildUfispaceUrl(model: string): string | null { + const slug = model.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, ""); + return `https://www.ufispace.com/products/${slug}`; +} + +function buildNetgearUrl(model: string): string | null { + const slug = model.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, ""); + return `https://www.netgear.com/business/products/switches/${slug}`; +} + // ── URL dispatcher by vendor slug ─────────────────────────────────────────── const URL_BUILDERS: Record<string, (model: string) => string | null> = { - cisco: buildCiscoUrl, - arista: buildAristaUrl, - juniper: buildJuniperUrl, - "nvidia-networking": buildNvidiaUrl, - edgecore: buildEdgecoreUrl, - celestica: buildCelesticaUrl, - asterfusion: buildAsterfusionUrl, - dell: buildDellUrl, - "hpe-aruba": buildHpeArubaUrl, - huawei: buildHuaweiUrl, - nokia: buildNobelUrl, - extreme: buildExtremeUrl, - mikrotik: buildMikroTikUrl, - ubiquiti: buildUbiquitiUrl, - "fs-com": buildFsComUrl, - supermicro: buildSupermicroUrl, - "alcatel-lucent": buildAlcatelLucentUrl, - "ale": buildAlcatelLucentUrl, - wistron: (_m) => null, // no public product pages + cisco: buildCiscoUrl, + arista: buildAristaUrl, + juniper: buildJuniperUrl, + "nvidia-networking": buildNvidiaUrl, + edgecore: buildEdgecoreUrl, + celestica: buildCelesticaUrl, + asterfusion: buildAsterfusionUrl, + fortinet: buildFortinetUrl, + dell: buildDellUrl, + "hpe-aruba": buildHpeArubaUrl, + huawei: buildHuaweiUrl, + nokia: buildNobelUrl,
+ extreme: buildExtremeUrl, + mikrotik: buildMikroTikUrl, + ubiquiti: buildUbiquitiUrl, + "fs-com": buildFsComUrl, + supermicro: buildSupermicroUrl, + "alcatel-lucent": buildAlcatelLucentUrl, + "alcatel-lucent-enterprise": buildAlcatelLucentUrl, // fix: DB uses this slug + ale: buildAlcatelLucentUrl, + "quanta-cloud-technology": buildQuantaUrl, + "allied-telesis": buildAlliedTelesisUrl, + ufispace: buildUfispaceUrl, + netgear: buildNetgearUrl, + wistron: (_m) => null, // no public product pages + aruba: buildHpeArubaUrl, // alias }; // ── Generic marketing image detector ──────────────────────────────────────── diff --git a/scripts/monitor-erik.sh b/scripts/monitor-erik.sh new file mode 100755 index 0000000..72a882b --- /dev/null +++ b/scripts/monitor-erik.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# ───────────────────────────────────────────────────────────────────────────── +# TIP / Erik Health Monitor — run on Claudi (.82) or Raspberry Pi via cron +# +# Checks: +# 1. TIP API health endpoint (CPU load, memory, disk) +# 2. PM2 process status on Erik (errored/stopped processes) +# 3. DB query roundtrip latency +# +# Alerting: +# - ntfy.sh push notification (set NTFY_TOPIC below) +# - Appends to /var/log/tip-monitor.log +# +# Setup (Claudi / Pi): +# chmod +x monitor-erik.sh +# sudo cp monitor-erik.sh /usr/local/bin/tip-monitor +# +# Add to crontab (every 5 minutes): +# */5 * * * * /usr/local/bin/tip-monitor >> /var/log/tip-monitor.log 2>&1 +# +# Or for systemd timer — see tip-monitor.service / tip-monitor.timer below +# ───────────────────────────────────────────────────────────────────────────── + +set -euo pipefail + +# ── Config ─────────────────────────────────────────────────────────────────── +TIP_API="${TIP_API:-https://tip.fichtmueller.org/api/health}" +NTFY_TOPIC="${NTFY_TOPIC:-}" # e.g.
"tip-erik-alerts" +SSH_TARGET="${SSH_TARGET:-erik-cf}" # SSH alias for Erik +LOAD_WARN="${LOAD_WARN:-4.0}" # 1m load warning threshold +DISK_WARN="${DISK_WARN:-85}" # disk % warning threshold +MEM_WARN="${MEM_WARN:-90}" # memory % warning threshold +LOG_FILE="${LOG_FILE:-/var/log/tip-monitor.log}" + +# ── Helpers ────────────────────────────────────────────────────────────────── +TS() { date '+%Y-%m-%d %H:%M:%S'; } +log() { echo "[$(TS)] $*"; } +warn() { echo "[$(TS)] ⚠️ WARN: $*"; } +crit() { echo "[$(TS)] 🔴 CRIT: $*"; alert "$*"; } + +ALERTS=() +alert() { + ALERTS+=("$1") + if [[ -n "$NTFY_TOPIC" ]]; then + curl -s -m 5 \ + -H "Title: TIP/Erik Alert" \ + -H "Tags: warning,server" \ + -H "Priority: urgent" \ + -d "$1" \ + "https://ntfy.sh/${NTFY_TOPIC}" > /dev/null 2>&1 || true + fi +} + +# ── 1. TIP API health check ────────────────────────────────────────────────── +log "Checking TIP API health …" +HEALTH_JSON="" +HTTP_CODE=0 + +HTTP_CODE=$(curl -s -m 15 -o /tmp/tip-health.json -w "%{http_code}" "$TIP_API" 2>/dev/null) || HTTP_CODE="${HTTP_CODE:-000}" # curl's -w already prints 000 on failure; echoing an extra 0 produced "0000" + +if [[ "$HTTP_CODE" != "200" ]]; then + crit "TIP API unreachable (HTTP $HTTP_CODE) — $TIP_API" +else + HEALTH_JSON=$(cat /tmp/tip-health.json 2>/dev/null || echo "{}") + + # Extract fields (requires jq) + if command -v jq &>/dev/null; then + LOAD1=$(echo "$HEALTH_JSON" | jq -r '.system.load."1m" // "N/A"') + MEM_PCT=$(echo "$HEALTH_JSON" | jq -r '.system.memory.used_pct // "N/A"') + DISK_PCT=$(echo "$HEALTH_JSON" | jq -r '.system.disk.used_pct // "N/A"') + DISK_FREE=$(echo "$HEALTH_JSON"| jq -r '.system.disk.free_gb // "N/A"') + DB_LAT=$(echo "$HEALTH_JSON" | jq -r '.database.latency_ms // "N/A"') + STATUS=$(echo "$HEALTH_JSON" | jq -r '.status // "unknown"') + + log " Status: $STATUS | Load: $LOAD1 | Mem: ${MEM_PCT}% | Disk: ${DISK_PCT}% (${DISK_FREE}GB free) | DB: ${DB_LAT}ms" + + # Load check + if command -v bc &>/dev/null && [[ "$LOAD1" != "N/A" ]]; then + if (( $(echo "$LOAD1 > $LOAD_WARN" | bc -l) )); then + crit "High load on
Erik: $LOAD1 (threshold $LOAD_WARN)" + fi + fi + + # Memory check + if [[ "$MEM_PCT" != "N/A" ]] && [[ "$MEM_PCT" -ge "$MEM_WARN" ]]; then + crit "High memory usage on Erik: ${MEM_PCT}%" + fi + + # Disk check + if [[ "$DISK_PCT" != "N/A" ]] && [[ "$DISK_PCT" -ge "$DISK_WARN" ]]; then + crit "Disk usage on Erik: ${DISK_PCT}% (${DISK_FREE}GB free)" + fi + + # DB latency + if command -v bc &>/dev/null && [[ "$DB_LAT" != "N/A" ]]; then + if (( $(echo "$DB_LAT > 2000" | bc -l) )); then + warn "High DB latency: ${DB_LAT}ms" + fi + fi + else + log " API OK (HTTP 200) — install jq for detailed metrics" + fi +fi + +# ── 2. PM2 process check (via SSH) ────────────────────────────────────────── +log "Checking PM2 processes on Erik …" +if ssh -o ConnectTimeout=10 -o BatchMode=yes "$SSH_TARGET" true 2>/dev/null; then + ERRORED=$(ssh -o ConnectTimeout=10 -o BatchMode=yes "$SSH_TARGET" \ + "pm2 list --no-color 2>/dev/null | grep -E 'errored|stopped' | grep -v 'ecosystem-stable'" \ + 2>/dev/null || echo "") + + if [[ -n "$ERRORED" ]]; then + COUNT=$(echo "$ERRORED" | wc -l | tr -d ' ') + crit "${COUNT} PM2 process(es) errored/stopped on Erik" + log " Errored: $ERRORED" + else + log " PM2: all processes running" + fi + + # Check restart counts (> 5 in the last run = likely crashing); only rows whose 16th field is a plain number are compared, so non-numeric rows cannot match via string comparison + HIGH_RESTARTS=$(ssh -o ConnectTimeout=10 -o BatchMode=yes "$SSH_TARGET" \ + "pm2 list --no-color 2>/dev/null | awk '\$16 ~ /^[0-9]+\$/ && \$16 + 0 > 5 {print \$2, \"restarts:\", \$16}'" \ + 2>/dev/null || echo "") + + if [[ -n "$HIGH_RESTARTS" ]]; then + warn "High restart count: $HIGH_RESTARTS" + fi +else + crit "SSH connection to Erik failed (via $SSH_TARGET)" +fi + +# ── 3.
Summary ─────────────────────────────────────────────────────────────── +if [[ ${#ALERTS[@]} -eq 0 ]]; then + log "✅ All checks passed" +else + log "🔴 ${#ALERTS[@]} alert(s) sent" +fi + +# ── Optional: truncate log file at 5000 lines ──────────────────────────────── +if [[ -f "$LOG_FILE" ]] && [[ $(wc -l < "$LOG_FILE") -gt 5000 ]]; then + tail -n 2500 "$LOG_FILE" > /tmp/tip-monitor-trim && mv /tmp/tip-monitor-trim "$LOG_FILE" +fi + +# ── Systemd unit (paste into /etc/systemd/system/tip-monitor.service) ──────── +# [Unit] +# Description=TIP/Erik Health Monitor +# After=network.target +# +# [Service] +# Type=oneshot +# ExecStart=/usr/local/bin/tip-monitor +# StandardOutput=append:/var/log/tip-monitor.log +# StandardError=append:/var/log/tip-monitor.log +# Environment=NTFY_TOPIC=tip-erik-alerts +# +# [Install] +# WantedBy=multi-user.target +# +# Systemd timer (paste into /etc/systemd/system/tip-monitor.timer): +# [Unit] +# Description=Run TIP monitor every 5 minutes +# [Timer] +# OnBootSec=60 +# OnUnitActiveSec=300 +# [Install] +# WantedBy=timers.target