#!/bin/bash
# ─────────────────────────────────────────────────────────────────────────────
# TIP / Erik Health Monitor — run on Claudi (.82) or Raspberry Pi via cron
#
# Checks:
#   1. TIP API health endpoint (CPU load, memory, disk)
#   2. PM2 process status on Erik (errored/stopped processes)
#   3. DB query roundtrip latency
#
# Alerting:
#   - ntfy.sh push notification (set NTFY_TOPIC below)
#   - Appends to /var/log/tip-monitor.log
#
# Setup (Claudi / Pi):
#   chmod +x monitor-erik.sh
#   sudo cp monitor-erik.sh /usr/local/bin/tip-monitor
#
# Add to crontab (every 5 minutes):
#   */5 * * * * /usr/local/bin/tip-monitor >> /var/log/tip-monitor.log 2>&1
#
# Or, for a systemd timer, see the tip-monitor.service / tip-monitor.timer units at the end of this file
# ─────────────────────────────────────────────────────────────────────────────

# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ── Config ───────────────────────────────────────────────────────────────────
# Each setting can be overridden from the environment; the default on the
# right is applied only when the variable is unset or empty.

# Health endpoint polled by check 1.
: "${TIP_API:=https://transceiver-db.context-x.org/api/health}"
# ntfy.sh topic, e.g. "tip-erik-alerts"; empty disables push notifications.
: "${NTFY_TOPIC:=}"
# Erik IONOS direct (Claudi key authorized).
: "${SSH_TARGET:=root@82.165.222.127}"
# 1m load warning threshold.
: "${LOAD_WARN:=4.0}"
# Disk % warning threshold.
: "${DISK_WARN:=85}"
# Memory % warning threshold.
: "${MEM_WARN:=90}"
# Log file that the trim step at the end of the script keeps bounded.
: "${LOG_FILE:=/var/log/tip-monitor.log}"

# ── Helpers ──────────────────────────────────────────────────────────────────
# TS — print the current local time as "YYYY-MM-DD HH:MM:SS".
TS() {
  date '+%Y-%m-%d %H:%M:%S'
}

# log MESSAGE… — print a timestamped informational line to stdout.
log() {
  printf '[%s] %s\n' "$(TS)" "$*"
}

# warn MESSAGE… — print a timestamped warning line to stdout (no notification).
warn() {
  printf '[%s] ⚠️  WARN: %s\n' "$(TS)" "$*"
}

# crit MESSAGE… — print a timestamped critical line to stdout, then hand the
# same message to alert().
crit() {
  printf '[%s] 🔴 CRIT: %s\n' "$(TS)" "$*"
  alert "$*"
}

# Messages raised during this run, in the order they were raised.
ALERTS=()

# alert MESSAGE
# Appends MESSAGE to ALERTS and, when NTFY_TOPIC is non-empty, posts it to
# https://ntfy.sh/<topic>. Delivery is best-effort: curl is capped at 5 s and
# a failed push is ignored, so this function always returns 0.
alert() {
  local message=$1
  ALERTS+=("$message")

  # No topic configured: record only.
  [[ -n "$NTFY_TOPIC" ]] || return 0

  curl --silent --max-time 5 \
    --header "Title: TIP/Erik Alert" \
    --header "Tags: warning,server" \
    --header "Priority: urgent" \
    --data "$message" \
    "https://ntfy.sh/${NTFY_TOPIC}" > /dev/null 2>&1 || true
}

# ── 1. TIP API health check ──────────────────────────────────────────────────
# Fetches $TIP_API and, when jq is available, compares the reported load,
# memory, disk and DB latency figures against the configured thresholds.

DB_LAT_WARN="${DB_LAT_WARN:-2000}"                # DB latency warning threshold (ms)

# num_cmp VALUE OP LIMIT
# Succeeds when "VALUE OP LIMIT" holds numerically; OP is '>' or '>='.
# Integers, decimals and exponent notation are accepted. Returns 1 when the
# comparison is false and 2 when an operand is not a number (e.g. "N/A") or OP
# is unknown, so it can be used directly as an `if` condition.
num_cmp() {
  local lhs=$1 op=$2 rhs=$3
  local num_re='^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'

  [[ "$lhs" =~ $num_re && "$rhs" =~ $num_re ]] || return 2

  # awk rather than [[ -ge ]] (integers only) or bc (may not be installed).
  # LC_ALL=C keeps "." as the decimal point regardless of the caller's locale.
  LC_ALL=C awk -v a="$lhs" -v b="$rhs" -v op="$op" 'BEGIN {
    a += 0; b += 0
    if (op == ">")       ok = (a > b)
    else if (op == ">=") ok = (a >= b)
    else exit 2
    exit (ok ? 0 : 1)
  }'
}

# health_field FILTER FALLBACK
# Prints the value the jq FILTER selects from $HEALTH_JSON, or FALLBACK when
# that value is null/false/absent or jq cannot evaluate the filter.
health_field() {
  local filter=$1 fallback=$2
  jq -r --arg fallback "$fallback" "(${filter}) // \$fallback" \
    <<<"$HEALTH_JSON" 2>/dev/null \
    || printf '%s\n' "$fallback"
}

log "Checking TIP API health …"
HEALTH_JSON=""
HTTP_CODE=0

# Body and status code arrive in one stream, with the code alone on the last
# line, so no scratch file in /tmp is needed. curl itself reports 000 when the
# request fails; `|| true` only keeps `set -e` from aborting the run.
HEALTH_RESPONSE=$(curl -s -m 15 -w '\n%{http_code}' "$TIP_API" 2>/dev/null) || true
HTTP_CODE=${HEALTH_RESPONSE##*$'\n'}
HTTP_CODE=${HTTP_CODE:-000}                       # curl missing or printed nothing

if [[ "$HTTP_CODE" != "200" ]]; then
  crit "TIP API unreachable (HTTP $HTTP_CODE) — $TIP_API"
else
  # Everything before the final newline is the response body.
  HEALTH_JSON=${HEALTH_RESPONSE%$'\n'*}

  # Extract fields (requires jq)
  if ! command -v jq &>/dev/null; then
    log "  API OK (HTTP 200) — install jq for detailed metrics"
  elif ! jq -e 'type == "object"' <<<"$HEALTH_JSON" >/dev/null 2>&1; then
    # A 200 with an unparsable body (e.g. a proxy error page) must raise an
    # alert instead of killing the script through a failed jq call.
    crit "TIP API returned HTTP 200 but the body is not a JSON object — $TIP_API"
  else
    LOAD1=$(health_field '.system.load."1m"' "N/A")
    MEM_PCT=$(health_field '.system.memory.used_pct' "N/A")
    DISK_PCT=$(health_field '.system.disk.used_pct' "N/A")
    DISK_FREE=$(health_field '.system.disk.free_gb' "N/A")
    DB_LAT=$(health_field '.database.latency_ms' "N/A")
    STATUS=$(health_field '.status' "unknown")

    log "  Status: $STATUS | Load: $LOAD1 | Mem: ${MEM_PCT}% | Disk: ${DISK_PCT}% (${DISK_FREE}GB free) | DB: ${DB_LAT}ms"

    # Load check
    if num_cmp "$LOAD1" '>' "$LOAD_WARN"; then
      crit "High load on Erik: $LOAD1 (threshold $LOAD_WARN)"
    fi

    # Memory check
    if num_cmp "$MEM_PCT" '>=' "$MEM_WARN"; then
      crit "High memory usage on Erik: ${MEM_PCT}%"
    fi

    # Disk check
    if num_cmp "$DISK_PCT" '>=' "$DISK_WARN"; then
      crit "Disk usage on Erik: ${DISK_PCT}% (${DISK_FREE}GB free)"
    fi

    # DB latency (warning only, no push notification)
    if num_cmp "$DB_LAT" '>' "$DB_LAT_WARN"; then
      warn "High DB latency: ${DB_LAT}ms"
    fi
  fi
fi

# ── 2. PM2 process check (via SSH) ──────────────────────────────────────────
# Probes the SSH connection, fetches `pm2 list` once, and inspects that single
# snapshot locally for errored/stopped processes and high restart counters.
log "Checking PM2 processes on Erik …"

# BatchMode=yes on every call so an unattended run can never block on a
# password or host-key prompt.
SSH_OPTS=(-o ConnectTimeout=10 -o BatchMode=yes)

if ! ssh "${SSH_OPTS[@]}" "$SSH_TARGET" true 2>/dev/null; then
  crit "SSH connection to Erik failed (via $SSH_TARGET)"
elif ! PM2_LIST=$(ssh "${SSH_OPTS[@]}" "$SSH_TARGET" "pm2 list --no-color" 2>/dev/null); then
  # A failing remote command (pm2 missing from the non-interactive PATH, daemon
  # unreachable, …) must not be mistaken for "nothing errored".
  crit "Could not run 'pm2 list' on Erik (via $SSH_TARGET)"
else
  # grep exits 1 when nothing matches; that is the healthy case, hence || true.
  ERRORED=$(grep -E 'errored|stopped' <<<"$PM2_LIST" | grep -v 'ecosystem-stable' || true)

  if [[ -n "$ERRORED" ]]; then
    COUNT=$(grep -c '' <<<"$ERRORED")
    crit "${COUNT} PM2 process(es) errored/stopped on Erik"
    log "  Errored: $ERRORED"
  else
    log "  PM2: all processes running"
  fi

  # Check restart counts (> 5 in the last run = likely crashing)
  # NOTE(review): assumes the pm2 table layout puts the restart counter in
  # whitespace-separated field 16 and an identifying column in field 2, after
  # three header lines — confirm against the pm2 version installed on Erik.
  HIGH_RESTARTS=$(awk 'NR>3 && $16~/^[0-9]+$/ && $16+0 > 5 {print $2, "restarts:", $16}' <<<"$PM2_LIST")

  if [[ -n "$HIGH_RESTARTS" ]]; then
    warn "High restart count: $HIGH_RESTARTS"
  fi
fi

# ── 3. Summary ───────────────────────────────────────────────────────────────
# One closing line: either everything passed, or how many alerts were raised.
alert_total=${#ALERTS[@]}
if (( alert_total > 0 )); then
  log "🔴 ${alert_total} alert(s) sent"
else
  log "✅ All checks passed"
fi

# ── Optional: truncate log file at 5000 lines ────────────────────────────────
# trim_log FILE [MAX_LINES] [KEEP_LINES]
# When FILE is a regular file with more than MAX_LINES lines (default 5000),
# cuts it down to its last KEEP_LINES lines (default 2500).
#   - The scratch copy is created with mktemp next to FILE, not under a fixed
#     name in /tmp.
#   - FILE is rewritten in place, so its inode, owner and mode are kept and
#     processes appending to it (cron `>>`, systemd `append:`) keep writing to
#     the live file.
# Returns 0 when nothing had to be done or the trim succeeded, 1 otherwise.
trim_log() {
  local file=$1 max=${2:-5000} keep=${3:-2500}
  local lines tmp status=0

  [[ -f "$file" ]] || return 0
  lines=$(wc -l < "$file") || return 1
  (( lines > max )) || return 0

  tmp=$(mktemp "${file}.trim.XXXXXX") || return 1
  if tail -n "$keep" -- "$file" > "$tmp"; then
    cat -- "$tmp" > "$file" || status=1
  else
    status=1
  fi
  rm -f -- "$tmp"
  return "$status"
}

# Limits can be tuned through LOG_MAX_LINES / LOG_KEEP_LINES.
trim_log "$LOG_FILE" "${LOG_MAX_LINES:-5000}" "${LOG_KEEP_LINES:-2500}" \
  || warn "Could not trim $LOG_FILE"

# ── Systemd unit (paste into /etc/systemd/system/tip-monitor.service) ────────
# [Unit]
# Description=TIP/Erik Health Monitor
# After=network.target
#
# [Service]
# Type=oneshot
# ExecStart=/usr/local/bin/tip-monitor
# StandardOutput=append:/var/log/tip-monitor.log
# StandardError=append:/var/log/tip-monitor.log
# Environment=NTFY_TOPIC=tip-erik-alerts
#
# [Install]
# WantedBy=multi-user.target
#
# Systemd timer (paste into /etc/systemd/system/tip-monitor.timer):
# [Unit]
# Description=Run TIP monitor every 5 minutes
# [Timer]
# OnBootSec=60
# OnUnitActiveSec=300
# [Install]
# WantedBy=timers.target