#!/bin/bash
# ─────────────────────────────────────────────────────────────────────────────
# TIP / Erik Health Monitor — run on Claudi (.82) or Raspberry Pi via cron
#
# Checks:
#   1. TIP API health endpoint (CPU load, memory, disk)
#   2. PM2 process status on Erik (errored/stopped processes)
#   3. DB query roundtrip latency
#
# Alerting:
#   - ntfy.sh push notification (set NTFY_TOPIC below)
#   - Appends to /var/log/tip-monitor.log
#
# Setup (Claudi / Pi):
#   chmod +x monitor-erik.sh
#   sudo cp monitor-erik.sh /usr/local/bin/tip-monitor
#
# Add to crontab (every 5 minutes):
#   */5 * * * * /usr/local/bin/tip-monitor >> /var/log/tip-monitor.log 2>&1
#
# Or for a systemd timer — see the tip-monitor.service / tip-monitor.timer
# units documented at the bottom of this file.
# ─────────────────────────────────────────────────────────────────────────────

# Strict mode lives on its own line: anything that shares a line with the
# shebang is parsed as a comment and never takes effect.
set -euo pipefail

# ── Config ───────────────────────────────────────────────────────────────────
# Every setting can be overridden from the environment (cron line, systemd
# Environment=, or an interactive shell).
TIP_API="${TIP_API:-https://transceiver-db.context-x.org/api/health}"
NTFY_TOPIC="${NTFY_TOPIC:-}"                     # e.g. "tip-erik-alerts"; empty disables push
SSH_TARGET="${SSH_TARGET:-root@82.165.222.127}"  # Erik IONOS direct (Claudi key authorized)
LOAD_WARN="${LOAD_WARN:-4.0}"                    # 1m load warning threshold
DISK_WARN="${DISK_WARN:-85}"                     # disk % warning threshold
MEM_WARN="${MEM_WARN:-90}"                       # memory % warning threshold
LOG_FILE="${LOG_FILE:-/var/log/tip-monitor.log}"

# ── Helpers ──────────────────────────────────────────────────────────────────

# Messages passed to alert() during this run; the summary step counts them.
ALERTS=()

# Print the current local time as "YYYY-MM-DD HH:MM:SS".
TS() { date '+%Y-%m-%d %H:%M:%S'; }

# Write a timestamped informational line to stdout.
log() { printf '[%s] %s\n' "$(TS)" "$*"; }

# Write a timestamped warning line to stdout. Warnings are logged only; they
# are not recorded in ALERTS and do not trigger a push notification.
warn() { printf '[%s] ⚠️ WARN: %s\n' "$(TS)" "$*"; }

# Write a timestamped critical line to stdout and raise an alert with the
# same message.
crit() {
  printf '[%s] 🔴 CRIT: %s\n' "$(TS)" "$*"
  alert "$*"
}

#######################################
# Record an alert and, when NTFY_TOPIC is set, push it to ntfy.sh.
# Globals:   ALERTS (appended), NTFY_TOPIC (read)
# Arguments: $1 - alert message
# Outputs:   nothing (curl output is discarded)
# Returns:   0 always; delivery is best-effort so that a notification
#            outage can never abort the monitor under `set -e`
#######################################
alert() {
  local message=$1
  ALERTS+=("$message")
  if [[ -n "$NTFY_TOPIC" ]]; then
    curl -s -m 5 \
      -H "Title: TIP/Erik Alert" \
      -H "Tags: warning,server" \
      -H "Priority: urgent" \
      -d "$message" \
      "https://ntfy.sh/${NTFY_TOPIC}" > /dev/null 2>&1 || true
  fi
}
# ── 1. TIP API health check ──────────────────────────────────────────────────

# DB latency (ms) above which a warning is logged; overridable like the other
# thresholds.
DB_LAT_WARN="${DB_LAT_WARN:-2000}"

#######################################
# Compare two decimal numbers without relying on bc or on bash integer
# arithmetic (which rejects fractional values such as "45.2").
# Arguments: $1 - measured value
#            $2 - operator: "gt" (strictly greater) or "ge" (greater/equal)
#            $3 - threshold
# Returns:   0 if the comparison holds, 1 if it does not,
#            2 if either operand is not a plain decimal number (e.g. "N/A")
#            or the operator is unknown — callers treat that as "skip".
#######################################
_tip_num_cmp() {
  local value=$1 op=$2 limit=$3
  local num_re='^-?[0-9]+([.][0-9]+)?$'
  [[ "$value" =~ $num_re && "$limit" =~ $num_re ]] || return 2
  case "$op" in
    gt) awk -v a="$value" -v b="$limit" 'BEGIN { exit !(a + 0 >  b + 0) }' ;;
    ge) awk -v a="$value" -v b="$limit" 'BEGIN { exit !(a + 0 >= b + 0) }' ;;
    *)  return 2 ;;
  esac
}

#######################################
# Extract one field from the health document.
# Globals:   HEALTH_JSON (read)
# Arguments: $1 - jq path expression, $2 - fallback text
# Outputs:   the field value, or the fallback when the field is
#            null/false/missing or jq cannot evaluate the path
# Returns:   0 always, so a malformed document cannot trip `set -e`
#######################################
_tip_json_field() {
  local filter=$1 fallback=$2
  jq -r --arg fallback "$fallback" "($filter) // \$fallback" \
    <<<"$HEALTH_JSON" 2>/dev/null || printf '%s\n' "$fallback"
}

log "Checking TIP API health …"

HEALTH_JSON=""
HTTP_CODE=0

# Fetch body and status in one request and keep both in memory: the status
# code is appended as the final line, so no predictable file in /tmp is
# needed. `|| true` keeps a failed transfer from aborting the script; curl
# still reports the code as 000 in that case.
HTTP_RESPONSE=$(curl -s -m 15 -w '\n%{http_code}' "$TIP_API" 2>/dev/null) || true
HTTP_CODE=${HTTP_RESPONSE##*$'\n'}
HEALTH_JSON=${HTTP_RESPONSE%$'\n'*}
# No usable status line at all (e.g. curl is not installed).
[[ "$HTTP_CODE" =~ ^[0-9]+$ ]] || HTTP_CODE=0
[[ -n "$HEALTH_JSON" ]] || HEALTH_JSON='{}'

if [[ "$HTTP_CODE" != "200" ]]; then
  crit "TIP API unreachable (HTTP $HTTP_CODE) — $TIP_API"
elif ! command -v jq &>/dev/null; then
  log " API OK (HTTP 200) — install jq for detailed metrics"
elif ! jq empty <<<"$HEALTH_JSON" &>/dev/null; then
  # A 200 with an unparseable body means the endpoint is not really healthy;
  # report it instead of letting jq abort the whole run.
  crit "TIP API returned a non-JSON body (HTTP 200) — $TIP_API"
else
  LOAD1=$(_tip_json_field '.system.load."1m"' 'N/A')
  MEM_PCT=$(_tip_json_field '.system.memory.used_pct' 'N/A')
  DISK_PCT=$(_tip_json_field '.system.disk.used_pct' 'N/A')
  DISK_FREE=$(_tip_json_field '.system.disk.free_gb' 'N/A')
  DB_LAT=$(_tip_json_field '.database.latency_ms' 'N/A')
  STATUS=$(_tip_json_field '.status' 'unknown')

  log " Status: $STATUS | Load: $LOAD1 | Mem: ${MEM_PCT}% | Disk: ${DISK_PCT}% (${DISK_FREE}GB free) | DB: ${DB_LAT}ms"

  # Non-numeric values ("N/A") make _tip_num_cmp return 2, which skips the
  # corresponding check.

  # Load check
  if _tip_num_cmp "$LOAD1" gt "$LOAD_WARN"; then
    crit "High load on Erik: $LOAD1 (threshold $LOAD_WARN)"
  fi

  # Memory check
  if _tip_num_cmp "$MEM_PCT" ge "$MEM_WARN"; then
    crit "High memory usage on Erik: ${MEM_PCT}%"
  fi

  # Disk check
  if _tip_num_cmp "$DISK_PCT" ge "$DISK_WARN"; then
    crit "Disk usage on Erik: ${DISK_PCT}% (${DISK_FREE}GB free)"
  fi

  # DB latency (warning only — logged, no push notification)
  if _tip_num_cmp "$DB_LAT" gt "$DB_LAT_WARN"; then
    warn "High DB latency: ${DB_LAT}ms"
  fi
fi
# ── 2. PM2 process check (via SSH) ──────────────────────────────────────────
log "Checking PM2 processes on Erik …"

# One option set for every connection. BatchMode=yes makes ssh fail instead
# of prompting, so an unattended run can never block on a password or
# host-key question.
SSH_OPTS=(-o ConnectTimeout=10 -o BatchMode=yes)

if ssh "${SSH_OPTS[@]}" "$SSH_TARGET" true 2>/dev/null; then
  # grep exits non-zero when nothing matches, which is the healthy case here;
  # the fallback keeps that from aborting the script under `set -e`.
  ERRORED=$(ssh "${SSH_OPTS[@]}" "$SSH_TARGET" \
    "pm2 list --no-color 2>/dev/null | grep -E 'errored|stopped' | grep -v 'ecosystem-stable'" \
    2>/dev/null || echo "")

  if [[ -n "$ERRORED" ]]; then
    COUNT=$(grep -c '' <<<"$ERRORED")
    crit "${COUNT} PM2 process(es) errored/stopped on Erik"
    log " Errored: $ERRORED"
  else
    log " PM2: all processes running"
  fi

  # Check restart counts (> 5 in the last run = likely crashing)
  # NOTE(review): the awk field numbers ($16 for the restart column, $2 for
  # the identifier) assume the whitespace-split layout of the `pm2 list`
  # table — confirm against the pm2 version installed on Erik.
  HIGH_RESTARTS=$(ssh "${SSH_OPTS[@]}" "$SSH_TARGET" \
    "pm2 list --no-color 2>/dev/null | awk 'NR>3 && \$16~/^[0-9]+$/ && \$16+0 > 5 {print \$2, \"restarts:\", \$16}'" \
    2>/dev/null || echo "")

  if [[ -n "$HIGH_RESTARTS" ]]; then
    warn "High restart count: $HIGH_RESTARTS"
  fi
else
  crit "SSH connection to Erik failed (via $SSH_TARGET)"
fi

# ── 3. Summary ───────────────────────────────────────────────────────────────
if [[ ${#ALERTS[@]} -eq 0 ]]; then
  log "✅ All checks passed"
else
  log "🔴 ${#ALERTS[@]} alert(s) sent"
fi

# ── Optional: truncate log file at 5000 lines ────────────────────────────────
# The kept tail goes to an unpredictable mktemp file and is then copied back
# INTO the existing log rather than moved over it. That preserves the log's
# inode, owner and mode, so writers that hold it open in append mode (the
# cron `>>` redirect) keep writing to the live file.
if [[ -f "$LOG_FILE" ]] && [[ $(wc -l < "$LOG_FILE") -gt 5000 ]]; then
  TRIM_TMP=""
  if TRIM_TMP=$(mktemp) && tail -n 2500 -- "$LOG_FILE" > "$TRIM_TMP"; then
    cat -- "$TRIM_TMP" > "$LOG_FILE" || warn "Could not trim $LOG_FILE"
  else
    warn "Could not trim $LOG_FILE"
  fi
  [[ -z "$TRIM_TMP" ]] || rm -f -- "$TRIM_TMP"
fi

# ── Systemd unit (paste into /etc/systemd/system/tip-monitor.service) ────────
# [Unit]
# Description=TIP/Erik Health Monitor
# After=network.target
#
# [Service]
# Type=oneshot
# ExecStart=/usr/local/bin/tip-monitor
# StandardOutput=append:/var/log/tip-monitor.log
# StandardError=append:/var/log/tip-monitor.log
# Environment=NTFY_TOPIC=tip-erik-alerts
#
# [Install]
# WantedBy=multi-user.target
#
# Systemd timer (paste into /etc/systemd/system/tip-monitor.timer):
# [Unit]
# Description=Run TIP monitor every 5 minutes
# [Timer]
# OnBootSec=60
# OnUnitActiveSec=300
# [Install]
# WantedBy=timers.target