feat: more switch image coverage + system health metrics + Erik monitor
switch-image-fetcher:
- Add Fortinet URL builder (11 FortiSwitch models)
- Add Quanta Cloud Technology, Allied Telesis, Ufispace, Netgear URL builders
- Fix alcatel-lucent-enterprise slug missing from URL_BUILDERS dispatcher
- Fix NVIDIA builder to skip ConnectX/BlueField adapters (not switches)
- Add aruba slug alias for hpe-aruba

health endpoint:
- Add system metrics: CPU load (1/5/15m), memory usage, disk usage
- Add load_status indicator (ok/busy/overloaded)
- Expose process RSS memory
- Used by external monitors

scripts/monitor-erik.sh:
- Cron-ready health check script for Claudi (.82) and Raspberry Pis
- Checks TIP API health endpoint (load, memory, disk, DB latency)
- Checks PM2 process state via SSH (errored/stopped detection)
- ntfy.sh push notifications (set NTFY_TOPIC env var)
- Includes systemd service + timer unit comments for auto-install
This commit is contained in:
parent
5d09b954f5
commit
ea6ef606d3
@ -1,6 +1,8 @@
|
||||
import { Router, Request, Response } from "express";
|
||||
import { getDbStats } from "../db/queries";
|
||||
import { pool } from "../db/client";
|
||||
import { loadavg, totalmem, freemem, cpus } from "os";
|
||||
import { execSync } from "child_process";
|
||||
|
||||
export const healthRouter = Router();
|
||||
|
||||
@ -36,11 +38,45 @@ healthRouter.get("/", async (_req: Request, res: Response) => {
|
||||
`).catch(() => ({ rows: [{}] }));
|
||||
const s = stockStats.rows[0] || {};
|
||||
|
||||
// System metrics
|
||||
const [load1, load5, load15] = loadavg();
|
||||
const totalMem = totalmem();
|
||||
const freeMem = freemem();
|
||||
const usedMem = totalMem - freeMem;
|
||||
const coreCount = cpus().length;
|
||||
|
||||
let diskUsedPct: number | null = null;
|
||||
let diskFreeGb: number | null = null;
|
||||
try {
|
||||
const df = execSync("df -h / 2>/dev/null | tail -1", { timeout: 2000 }).toString().trim();
|
||||
const parts = df.split(/\s+/);
|
||||
diskUsedPct = parseInt(parts[4] ?? "0", 10) || null;
|
||||
diskFreeGb = parseFloat(parts[3] ?? "0") || null;
|
||||
} catch { /* skip on systems without df */ }
|
||||
|
||||
const loadStatus = load1 > coreCount * 0.9 ? "overloaded" : load1 > coreCount * 0.6 ? "busy" : "ok";
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
status: "healthy",
|
||||
version: "0.3.0",
|
||||
uptime: process.uptime(),
|
||||
system: {
|
||||
load: { "1m": +load1.toFixed(2), "5m": +load5.toFixed(2), "15m": +load15.toFixed(2) },
|
||||
load_status: loadStatus,
|
||||
cpu_cores: coreCount,
|
||||
memory: {
|
||||
total_mb: Math.round(totalMem / 1024 / 1024),
|
||||
used_mb: Math.round(usedMem / 1024 / 1024),
|
||||
free_mb: Math.round(freeMem / 1024 / 1024),
|
||||
used_pct: Math.round(usedMem / totalMem * 100),
|
||||
},
|
||||
disk: {
|
||||
used_pct: diskUsedPct,
|
||||
free_gb: diskFreeGb,
|
||||
},
|
||||
process_rss_mb: Math.round(process.memoryUsage().rss / 1024 / 1024),
|
||||
},
|
||||
database: {
|
||||
connected: true,
|
||||
latency_ms: latencyMs,
|
||||
|
||||
@ -8,12 +8,14 @@
|
||||
* 4. Write image_url + product_page_url to switches table
|
||||
*
|
||||
* Vendors covered:
|
||||
* Cisco (Nexus 9000/9300, NCS 5500/5700, Catalyst 9300/9500)
|
||||
* Cisco (Nexus 9000/9300, NCS 5500/5700, Catalyst 9300/9500, 8000 SP)
|
||||
* Arista (7000 series)
|
||||
* Juniper (QFX, EX series)
|
||||
* NVIDIA Networking (Spectrum SN series)
|
||||
* NVIDIA Networking (Spectrum SN series — ConnectX skipped)
|
||||
* Edgecore, Celestica, Asterfusion (whitebox)
|
||||
* Fortinet (FortiSwitch series)
|
||||
* Dell, HPE/Aruba, Huawei, Nokia, Extreme, MikroTik, Ubiquiti, FS.COM, Supermicro
|
||||
* Alcatel-Lucent Enterprise, Allied Telesis, Netgear, Quanta Cloud Technology, Ufispace
|
||||
*
|
||||
* Rate limit: 1 req/2sec per domain, max 3 concurrent domains.
|
||||
* Respects robots.txt: User-Agent identifies as research bot.
|
||||
@ -89,7 +91,11 @@ function buildJuniperUrl(model: string): string | null {
|
||||
/**
 * Build the NVIDIA product-page URL for a Spectrum (SN-series) switch.
 *
 * Examples:
 *   SN5600 → https://www.nvidia.com/en-us/networking/ethernet-switching/sn5600/
 *   SN4700 → https://www.nvidia.com/en-us/networking/ethernet-switching/sn4700/
 *
 * @param model - Model name of the device.
 * @returns The product-page URL, or `null` when the model is a ConnectX /
 *   BlueField adapter or does not start with "SN" once non-alphanumeric
 *   characters are removed.
 */
function buildNvidiaUrl(model: string): string | null {
  const upper = model.toUpperCase();
  // ConnectX-7 / BlueField are adapters, not switches — skip
  if (upper.includes("CONNECTX") || upper.includes("BLUEFIELD")) return null;
  // Single slug declaration: the stale pre-change `const slug` line is dropped.
  const slug = upper.replace(/[^A-Z0-9]/g, "");
  if (!slug.startsWith("SN")) return null; // only Spectrum switch series
  return `https://www.nvidia.com/en-us/networking/ethernet-switching/${slug.toLowerCase()}/`;
}
|
||||
|
||||
@ -154,6 +160,35 @@ function buildAsterfusionUrl(model: string): string | null {
|
||||
return `https://www.asterfusion.com/products/${slug}/`;
|
||||
}
|
||||
|
||||
/**
 * Build the Fortinet product-page URL for a FortiSwitch model.
 *
 * The model is trimmed, lower-cased, whitespace runs become "-", and every
 * character outside `[a-z0-9-]` is dropped:
 *   FortiSwitch 1024E    → fortiswitch-1024e
 *   FortiSwitch 124F-POE → fortiswitch-124f-poe
 *
 * @param model - Model name of the switch.
 * @returns The product-page URL, or `null` when the model contains no
 *   letters or digits (an empty slug would not identify any product).
 */
function buildFortinetUrl(model: string): string | null {
  const slug = model
    .trim() // surrounding whitespace must not turn into leading/trailing "-"
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "");
  if (!/[a-z0-9]/.test(slug)) return null;
  return `https://www.fortinet.com/products/switches/${slug}.html`;
}
|
||||
|
||||
/**
 * Build the Quanta Cloud Technology product-page URL for a switch model
 * (QuantaMesh T3048-LY8, T7032-IX1 etc.).
 *
 * The model is trimmed, lower-cased, whitespace runs become "-", and every
 * character outside `[a-z0-9-]` is dropped.
 *
 * @param model - Model name of the switch.
 * @returns The product-page URL, or `null` when the model contains no
 *   letters or digits (an empty slug would not identify any product).
 */
function buildQuantaUrl(model: string): string | null {
  const slug = model
    .trim() // surrounding whitespace must not turn into leading/trailing "-"
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "");
  if (!/[a-z0-9]/.test(slug)) return null;
  return `https://www.qct.io/product/index/Infrastructure-Product/Networking/Switch/${slug}`;
}
|
||||
|
||||
/**
 * Build the Allied Telesis product-page URL for a switch model.
 *
 * Example: AT-x530-28GSX → https://www.alliedtelesis.com/us/en/products/at-x530-28gsx
 *
 * The model is trimmed, lower-cased, whitespace runs become "-", and every
 * character outside `[a-z0-9-]` is dropped.
 *
 * @param model - Model name of the switch.
 * @returns The product-page URL, or `null` when the model contains no
 *   letters or digits (an empty slug would not identify any product).
 */
function buildAlliedTelesisUrl(model: string): string | null {
  const slug = model
    .trim() // surrounding whitespace must not turn into leading/trailing "-"
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "");
  if (!/[a-z0-9]/.test(slug)) return null;
  return `https://www.alliedtelesis.com/us/en/products/${slug}`;
}
|
||||
|
||||
/**
 * Build the Ufispace product-page URL for a switch model.
 *
 * The model is trimmed, lower-cased, whitespace runs become "-", and every
 * character outside `[a-z0-9-]` is dropped.
 *
 * @param model - Model name of the switch.
 * @returns The product-page URL, or `null` when the model contains no
 *   letters or digits (an empty slug would not identify any product).
 */
function buildUfispaceUrl(model: string): string | null {
  const slug = model
    .trim() // surrounding whitespace must not turn into leading/trailing "-"
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "");
  if (!/[a-z0-9]/.test(slug)) return null;
  return `https://www.ufispace.com/products/${slug}`;
}
|
||||
|
||||
/**
 * Build the Netgear business product-page URL for a switch model.
 *
 * The model is trimmed, lower-cased, whitespace runs become "-", and every
 * character outside `[a-z0-9-]` is dropped.
 *
 * @param model - Model name of the switch.
 * @returns The product-page URL, or `null` when the model contains no
 *   letters or digits (an empty slug would not identify any product).
 */
function buildNetgearUrl(model: string): string | null {
  const slug = model
    .trim() // surrounding whitespace must not turn into leading/trailing "-"
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "");
  if (!/[a-z0-9]/.test(slug)) return null;
  return `https://www.netgear.com/business/products/switches/${slug}`;
}
|
||||
|
||||
// ── URL dispatcher by vendor slug ───────────────────────────────────────────
|
||||
|
||||
const URL_BUILDERS: Record<string, (m: string) => string | null> = {
|
||||
@ -164,6 +199,7 @@ const URL_BUILDERS: Record<string, (m: string) => string | null> = {
|
||||
edgecore: buildEdgecoreUrl,
|
||||
celestica: buildCelesticaUrl,
|
||||
asterfusion: buildAsterfusionUrl,
|
||||
fortinet: buildFortinetUrl,
|
||||
dell: buildDellUrl,
|
||||
"hpe-aruba": buildHpeArubaUrl,
|
||||
huawei: buildHuaweiUrl,
|
||||
@ -174,8 +210,14 @@ const URL_BUILDERS: Record<string, (m: string) => string | null> = {
|
||||
"fs-com": buildFsComUrl,
|
||||
supermicro: buildSupermicroUrl,
|
||||
"alcatel-lucent": buildAlcatelLucentUrl,
|
||||
"ale": buildAlcatelLucentUrl,
|
||||
"alcatel-lucent-enterprise": buildAlcatelLucentUrl, // fix: DB uses this slug
|
||||
ale: buildAlcatelLucentUrl,
|
||||
"quanta-cloud-technology": buildQuantaUrl,
|
||||
"allied-telesis": buildAlliedTelesisUrl,
|
||||
ufispace: buildUfispaceUrl,
|
||||
netgear: buildNetgearUrl,
|
||||
wistron: (_m) => null, // no public product pages
|
||||
aruba: buildHpeArubaUrl, // alias
|
||||
};
|
||||
|
||||
// ── Generic marketing image detector ────────────────────────────────────────
|
||||
|
||||
166
scripts/monitor-erik.sh
Executable file
166
scripts/monitor-erik.sh
Executable file
@ -0,0 +1,166 @@
|
||||
#!/bin/bash
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# TIP / Erik Health Monitor — run on Claudi (.82) or Raspberry Pi via cron
|
||||
#
|
||||
# Checks:
|
||||
# 1. TIP API health endpoint (CPU load, memory, disk)
|
||||
# 2. PM2 process status on Erik (errored/stopped processes)
|
||||
# 3. DB query roundtrip latency
|
||||
#
|
||||
# Alerting:
|
||||
# - ntfy.sh push notification (set NTFY_TOPIC below)
|
||||
# - Appends to /var/log/tip-monitor.log
|
||||
#
|
||||
# Setup (Claudi / Pi):
|
||||
# chmod +x monitor-erik.sh
|
||||
# sudo cp monitor-erik.sh /usr/local/bin/tip-monitor
|
||||
#
|
||||
# Add to crontab (every 5 minutes):
|
||||
# */5 * * * * /usr/local/bin/tip-monitor >> /var/log/tip-monitor.log 2>&1
|
||||
#
|
||||
# Or for systemd timer — see tip-monitor.service / tip-monitor.timer below
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ── Config ───────────────────────────────────────────────────────────────────
# Each setting keeps a value supplied through the environment; the default
# given here is applied only when the variable is unset or empty.
: "${TIP_API:=https://tip.fichtmueller.org/api/health}"
: "${NTFY_TOPIC:=}"                          # e.g. "tip-erik-alerts"
: "${SSH_TARGET:=erik-cf}"                   # SSH alias for Erik
: "${LOAD_WARN:=4.0}"                        # 1m load warning threshold
: "${DISK_WARN:=85}"                         # disk % warning threshold
: "${MEM_WARN:=90}"                          # memory % warning threshold
: "${LOG_FILE:=/var/log/tip-monitor.log}"
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────────

# Messages passed to alert() during this run.
ALERTS=()

# TS — print the current local time as "YYYY-MM-DD HH:MM:SS".
TS() {
  date '+%Y-%m-%d %H:%M:%S'
}

# log MESSAGE… — print the message to stdout behind a "[timestamp]" prefix.
log() {
  echo "[$(TS)] $*"
}

# warn MESSAGE… — log a warning line; nothing is recorded or pushed.
warn() {
  log "⚠️ WARN: $*"
}

# alert MESSAGE — record MESSAGE in ALERTS and, when NTFY_TOPIC is non-empty,
# post it to ntfy.sh. A failed push is ignored so it cannot stop the script.
alert() {
  ALERTS+=("$1")
  if [[ -z "$NTFY_TOPIC" ]]; then
    return 0
  fi
  curl -s -m 5 \
    -H "Title: TIP/Erik Alert" \
    -H "Tags: warning,server" \
    -H "Priority: urgent" \
    -d "$1" \
    "https://ntfy.sh/${NTFY_TOPIC}" > /dev/null 2>&1 || true
}

# crit MESSAGE… — log a critical line, then hand the message to alert().
crit() {
  log "🔴 CRIT: $*"
  alert "$*"
}
|
||||
|
||||
# ── 1. TIP API health check ──────────────────────────────────────────────────
# Requests $TIP_API once and compares load, memory, disk and DB latency from
# the JSON body with the configured thresholds.

# health_field FILTER DEFAULT — print one value of $HEALTH_JSON selected by the
# jq FILTER. Prints DEFAULT when the value is null/missing or jq fails, so an
# unexpected response shape cannot abort the script under `set -e`.
health_field() {
  jq -r --arg fallback "$2" "($1) // \$fallback" <<<"$HEALTH_JSON" 2>/dev/null || echo "$2"
}

# is_number VALUE — true for a plain non-negative integer or decimal.
is_number() {
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]]
}

# num_test A OP B — true when the numeric comparison "A OP B" holds. Done in
# awk so decimals work and the checks do not depend on bc being installed.
num_test() {
  awk -v a="$1" -v b="$3" "BEGIN { exit !((a + 0) $2 (b + 0)) }"
}

log "Checking TIP API health …"

# One request returns both body and status: -w appends the status code on a
# final line of its own, so no file with a predictable /tmp name is needed.
# When the request fails curl itself prints "000" as the code.
NEWLINE=$'\n'
HEALTH_RESPONSE=$(curl -s -m 15 -w '\n%{http_code}' "$TIP_API" 2>/dev/null || true)
HTTP_CODE="${HEALTH_RESPONSE##*$NEWLINE}"
HTTP_CODE="${HTTP_CODE:-000}"          # empty when curl produced no output at all
HEALTH_JSON="${HEALTH_RESPONSE%$NEWLINE*}"

if [[ "$HTTP_CODE" != "200" ]]; then
  crit "TIP API unreachable (HTTP $HTTP_CODE) — $TIP_API"
elif ! command -v jq &>/dev/null; then
  log " API OK (HTTP 200) — install jq for detailed metrics"
elif ! jq -e 'type == "object"' <<<"$HEALTH_JSON" > /dev/null 2>&1; then
  # e.g. an HTML error page served with status 200 by a proxy
  crit "TIP API returned HTTP 200 but no JSON object — $TIP_API"
else
  LOAD1=$(health_field '.system.load."1m"' "N/A")
  MEM_PCT=$(health_field '.system.memory.used_pct' "N/A")
  DISK_PCT=$(health_field '.system.disk.used_pct' "N/A")
  DISK_FREE=$(health_field '.system.disk.free_gb' "N/A")
  DB_LAT=$(health_field '.database.latency_ms' "N/A")
  STATUS=$(health_field '.status' "unknown")

  log " Status: $STATUS | Load: $LOAD1 | Mem: ${MEM_PCT}% | Disk: ${DISK_PCT}% (${DISK_FREE}GB free) | DB: ${DB_LAT}ms"

  # Each check runs only when the reported value is numeric ("N/A" is skipped).

  # Load check
  if is_number "$LOAD1" && num_test "$LOAD1" ">" "$LOAD_WARN"; then
    crit "High load on Erik: $LOAD1 (threshold $LOAD_WARN)"
  fi

  # Memory check
  if is_number "$MEM_PCT" && num_test "$MEM_PCT" ">=" "$MEM_WARN"; then
    crit "High memory usage on Erik: ${MEM_PCT}%"
  fi

  # Disk check
  if is_number "$DISK_PCT" && num_test "$DISK_PCT" ">=" "$DISK_WARN"; then
    crit "Disk usage on Erik: ${DISK_PCT}% (${DISK_FREE}GB free)"
  fi

  # DB latency (warning only, fixed 2000 ms threshold)
  if is_number "$DB_LAT" && num_test "$DB_LAT" ">" 2000; then
    warn "High DB latency: ${DB_LAT}ms"
  fi
fi
|
||||
|
||||
# ── 2. PM2 process check (via SSH) ──────────────────────────────────────────
log "Checking PM2 processes on Erik …"

# The same options go on every ssh call. BatchMode=yes makes ssh fail instead
# of waiting for a password or passphrase prompt.
SSH_OPTS=(-o ConnectTimeout=10 -o BatchMode=yes)

if ssh "${SSH_OPTS[@]}" "$SSH_TARGET" true 2>/dev/null; then
  # Rows of `pm2 list` that mention errored/stopped, ignoring any row that
  # contains 'ecosystem-stable'. An empty result (grep exits 1) is not an error.
  ERRORED=$(ssh "${SSH_OPTS[@]}" "$SSH_TARGET" \
    "pm2 list --no-color 2>/dev/null | grep -E 'errored|stopped' | grep -v 'ecosystem-stable'" \
    2>/dev/null || echo "")

  if [[ -n "$ERRORED" ]]; then
    COUNT=$(echo "$ERRORED" | wc -l | tr -d ' ')
    crit "${COUNT} PM2 process(es) errored/stopped on Erik"
    log " Errored: $ERRORED"
  else
    log " PM2: all processes running"
  fi

  # Check restart counts (more than 5 = likely crashing).
  # NOTE(review): assumes awk field 16 of the `pm2 list` table is the restart
  # counter and field 2 the process name — confirm against the pm2 version on Erik.
  HIGH_RESTARTS=$(ssh "${SSH_OPTS[@]}" "$SSH_TARGET" \
    "pm2 list --no-color 2>/dev/null | awk '{if(\$16 > 5) print \$2, \"restarts:\", \$16}'" \
    2>/dev/null || echo "")

  if [[ -n "$HIGH_RESTARTS" ]]; then
    warn "High restart count: $HIGH_RESTARTS"
  fi
else
  crit "SSH connection to Erik failed (via $SSH_TARGET)"
fi
|
||||
|
||||
# ── 3. Summary ───────────────────────────────────────────────────────────────
# One closing line: the number of messages recorded in ALERTS during this run,
# or a success note when there were none.
ALERT_COUNT=${#ALERTS[@]}
if (( ALERT_COUNT > 0 )); then
  log "🔴 ${ALERT_COUNT} alert(s) sent"
else
  log "✅ All checks passed"
fi
|
||||
|
||||
# ── Optional: truncate log file at 5000 lines ────────────────────────────────
# Once the log exceeds 5000 lines, keep only the newest 2500. The scratch file
# comes from mktemp rather than a fixed /tmp name, and the result is written
# back into the existing file so its owner and permissions stay as they are.
if [[ -f "$LOG_FILE" ]] && [[ $(wc -l < "$LOG_FILE") -gt 5000 ]]; then
  if TRIM_FILE=$(mktemp); then
    if ! { tail -n 2500 "$LOG_FILE" > "$TRIM_FILE" && cat "$TRIM_FILE" > "$LOG_FILE"; }; then
      warn "Could not trim $LOG_FILE"
    fi
    rm -f "$TRIM_FILE"
  else
    warn "Could not create a temporary file to trim $LOG_FILE"
  fi
fi
|
||||
|
||||
# ── Systemd unit (paste into /etc/systemd/system/tip-monitor.service) ────────
|
||||
# [Unit]
|
||||
# Description=TIP/Erik Health Monitor
|
||||
# After=network.target
|
||||
#
|
||||
# [Service]
|
||||
# Type=oneshot
|
||||
# ExecStart=/usr/local/bin/tip-monitor
|
||||
# StandardOutput=append:/var/log/tip-monitor.log
|
||||
# StandardError=append:/var/log/tip-monitor.log
|
||||
# Environment=NTFY_TOPIC=tip-erik-alerts
|
||||
#
|
||||
# [Install]
|
||||
# WantedBy=multi-user.target
|
||||
#
|
||||
# Systemd timer (paste into /etc/systemd/system/tip-monitor.timer):
|
||||
# [Unit]
|
||||
# Description=Run TIP monitor every 5 minutes
|
||||
# [Timer]
|
||||
# OnBootSec=60
|
||||
# OnUnitActiveSec=300
|
||||
# [Install]
|
||||
# WantedBy=timers.target
|
||||
Loading…
x
Reference in New Issue
Block a user