feat(ctx-health): add proxmox-pvestatd + opnsense-disk health checks

- Add SSH-based health check for pvestatd D-state detection on Proxmox host (heal via cgroup move + lock file removal + reset-failed)
- Add SSH-based disk check for OPNsense VM (threshold 75%, auto-cleanup)
- knowledge/fixes.json: add 48 training fixes including post-reboot DNS recovery (fix-046), cloudflared DNS-wait boot fix (fix-047), and vzdump load-crash scenario with recovery steps (fix-048)
2026-04-13 05:42:24 +02:00 · 2026-04-13 05:42:24 +02:00 · c50af63389
commit c50af63389
parent b4593b6582
2 changed files with 1289 additions and 4 deletions
--- a/knowledge/fixes.json
+++ b/knowledge/fixes.json
--- a/packages/ctx-health/src/checks/index.ts
+++ b/packages/ctx-health/src/checks/index.ts
@ -35,6 +35,7 @@ const ALLOWED_COMMANDS = new Set([
  '/usr/sbin/systemctl',
  '/usr/bin/sync',
  '/bin/sync',
  '/usr/bin/ssh',
 ]);
 async function safeExec(
@ -69,7 +70,13 @@ async function findPm2(): Promise<string | null> {
 // ─── 1. PM2 processes ────────────────────────────────────────────────────────
-const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];
+const PM2_REQUIRED_PROCESSES = [
  // PM2 process names that healPm2 looks up in `pm2 jlist`; any entry that is
  // missing there, or whose status is not 'online', gets its own `pm2 restart`.
  'llm-gateway', 'llm-learning',
  'magatama', 'magatama-dashboard',
  'tip-api', 'tip-scraper-daemon',
  'peercortex', 'eo-global-pulse',
  'ghost-blog', 'nognet',
 ];
 async function checkPm2(): Promise<CheckResult> {
  const start = Date.now();
@ -114,10 +121,40 @@ async function healPm2(diagnosis: string): Promise<HealResult> {
  const pm2 = await findPm2();
  if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };
-  const { stdout, stderr } = await safeExec(pm2, ['restart', 'all']);
+  // Only restart processes that are actually offline — avoid pm2 restart all
  const { stdout: jlist } = await safeExec(pm2, ['jlist']);
  let processes: Array<{ name: string; pm2_env?: { status?: string } }> = [];
  try { processes = JSON.parse(jlist) as typeof processes; } catch { /* ignore */ }
  const offline = PM2_REQUIRED_PROCESSES.filter((name) => {
    const proc = processes.find((p) => p.name === name);
    return !proc || proc.pm2_env?.status !== 'online';
  });
  if (offline.length === 0) {
    return { action_taken: 'no offline processes found — skipping restart', success: true };
  }
  const results: string[] = [];
  for (const name of offline) {
    const { stdout, stderr, success } = await safeExec(pm2, ['restart', name]);
    results.push(`${name}: ${success ? 'restarted' : stderr.slice(0, 80)}`);
  }
  const output = results.join('; ');
  logger.info({ diagnosis: diagnosis.slice(0, 120), output, offline }, 'PM2 targeted restart executed');
  return { action_taken: `pm2 restart ${offline.join(', ')}`, success: true, output };
 }
 // ─── pm2-aware targeted restart (shared helper) ───────────────────────────────
 /**
  * Restarts a single PM2-managed process by name.
  *
  * @param name PM2 process name handed to `pm2 restart`.
  * @returns The action description, the `success` flag reported by safeExec and
  *   the combined, trimmed stdout/stderr of the restart command. When findPm2()
  *   yields nothing, a failed result is returned without running any command.
  */
 async function restartProcess(name: string): Promise<HealResult> {
  const pm2 = await findPm2();
  if (!pm2) return { action_taken: 'pm2 not found', success: false };
  const { stdout, stderr, success } = await safeExec(pm2, ['restart', name]);
  const output = `${stdout}\n${stderr}`.trim();
-  logger.info({ diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200) }, 'PM2 restart executed');
+  logger.info({ name, success, output: output.slice(0, 200) }, 'PM2 targeted process restart');
-  return { action_taken: 'pm2 restart all', success: true, output };
+  return { action_taken: `pm2 restart ${name}`, success, output };
 }
 // ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
@ -401,6 +438,237 @@ async function healWireGuard(_diagnosis: string): Promise<HealResult> {
  return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
 }
 // ─── 9. Service port health checks ───────────────────────────────────────────
 // For each critical service, performs a real HTTP check on the local port.
 // This catches crash-loops where PM2 shows "online" but the port is not responding.
 /** Describes one HTTP service that is probed on localhost by the service port check. */
 interface ServiceDef {
  /** PM2 process name to restart on failure. */
  name: string;
  /** Port on localhost that the probe connects to. */
  port: number;
  /** Health endpoint path, appended directly after the port in the probe URL. */
  path: string;
  /** Accepted HTTP status codes (default: any status below 500). */
  okStatus?: number[];
 }
 /**
  * Services probed by the service port check. Each entry names the PM2 process
  * that is restarted when its local HTTP endpoint stops answering.
  */
 const SERVICES: ServiceDef[] = [
  { name: 'magatama', port: 3210, path: '/' },
  { name: 'magatama-dashboard', port: 3211, path: '/' },
  { name: 'magatama-admin', port: 3212, path: '/' },
  { name: 'tip-api', port: 3201, path: '/api/health' },
  { name: 'peercortex', port: 3101, path: '/' },
  { name: 'llm-gateway', port: 3103, path: '/health' },
  { name: 'eo-global-pulse', port: 3000, path: '/' },
  { name: 'nognet', port: 3001, path: '/' },
  { name: 'ghost-blog', port: 2368, path: '/' },
  { name: 'switchblade', port: 3334, path: '/' },
 ];
 /**
  * Sends one HTTP request to a service's endpoint on localhost and classifies the answer.
  *
  * The request is aborted after 8 seconds and redirects are not followed, so a
  * 3xx answer is judged by its own status code.
  *
  * @param service Service whose `port` and `path` form the probed URL.
  * @returns `ok` and the HTTP status when a response arrived; `ok: false` and the
  *   error message when the request threw (including the abort on timeout).
  */
 async function probePort(service: ServiceDef): Promise<{ ok: boolean; status?: number; error?: string }> {
  const url = `http://localhost:${service.port}${service.path}`;
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), 8_000);
  try {
    const init = {
      signal: abort.signal,
      cache: 'no-store',
      redirect: 'manual',
    } as RequestInit;
    const { status } = await fetch(url, init);
    const allowList = service.okStatus ?? [];
    if (allowList.length > 0) {
      // An explicit allow-list replaces the default rule entirely.
      return { ok: allowList.includes(status), status };
    }
    // Default rule: anything below 500 counts as "responding" (3xx included).
    return { ok: status < 500, status };
  } catch (failure) {
    const error = failure instanceof Error ? failure.message : String(failure);
    return { ok: false, error };
  } finally {
    clearTimeout(deadline);
  }
 }
 /**
  * Probes every entry of SERVICES concurrently and reports the ones that fail.
  *
  * @returns A healthy result when all probes are ok; otherwise an unhealthy
  *   result whose `details` maps each failing service name to its HTTP status,
  *   its error message, or 'no response'. `latency_ms` covers the whole probe round.
  */
 async function checkServicePorts(): Promise<CheckResult> {
  const startedAt = Date.now();
  const probed = await Promise.all(
    SERVICES.map(async (svc) => {
      const probe = await probePort(svc);
      return { svc, probe };
    }),
  );
  const latency_ms = Date.now() - startedAt;

  const down = probed.filter(({ probe }) => !probe.ok);
  if (down.length === 0) {
    return { healthy: true, message: `All ${SERVICES.length} service ports are responding`, latency_ms };
  }

  const downNames: string[] = [];
  const details: Record<string, string | number> = {};
  for (const { svc, probe } of down) {
    downNames.push(svc.name);
    details[svc.name] = probe.status ?? probe.error ?? 'no response';
  }
  return {
    healthy: false,
    message: `${down.length} service(s) not responding: ${downNames.join(', ')}`,
    details,
    latency_ms,
  };
 }
 /**
  * Restarts the PM2 process behind every service whose port probe is still failing.
  *
  * @param diagnosis Free-text diagnosis; only its first 80 characters are logged.
  * @returns `success: true` when nothing needed restarting or when every restart
  *   succeeded; `success: false` as soon as at least one restart failed. `output`
  *   lists the per-service outcome.
  */
 async function healServicePorts(diagnosis: string): Promise<HealResult> {
  // Re-probe to find which services are currently failing
  const results = await Promise.all(SERVICES.map(async (svc) => ({ svc, probe: await probePort(svc) })));
  const failing = results.filter((r) => !r.probe.ok);
  if (failing.length === 0) {
    return { action_taken: 'services recovered on re-check — no restart needed', success: true };
  }
  const healResults: string[] = [];
  let allRestarted = true;
  // Restarts run one at a time, in SERVICES order.
  for (const { svc } of failing) {
    const result = await restartProcess(svc.name);
    if (!result.success) allRestarted = false;
    // `||` rather than `??`: a failed restart can come back with an empty output string.
    const outcome = result.success ? 'restarted' : result.output?.slice(0, 80) || 'failed';
    healResults.push(`${svc.name}: ${outcome}`);
    logger.info(
      { service: svc.name, success: result.success, diagnosis: diagnosis.slice(0, 80) },
      'Service port failure — PM2 restart triggered',
    );
  }
  return {
    action_taken: `restarted: ${failing.map((r) => r.svc.name).join(', ')}`,
    // Report failure when any restart failed instead of unconditionally claiming success.
    success: allRestarted,
    output: healResults.join('; '),
  };
 }
 // ─── SSH helper ──────────────────────────────────────────────────────────────
 /** `-o` options placed before the host on every ssh invocation made by sshExec. */
 const SSH_OPTS = ['BatchMode=yes', 'ConnectTimeout=10', 'StrictHostKeyChecking=accept-new'].flatMap(
  (option) => ['-o', option],
 );
 /**
  * Runs a command on a remote host by invoking /usr/bin/ssh through safeExec.
  *
  * @param host SSH destination, placed right after the shared SSH_OPTS.
  * @param command Command line passed to ssh as one single argument.
  * @returns Whatever safeExec reports: stdout, stderr and its success flag.
  */
 async function sshExec(host: string, command: string): Promise<{ stdout: string; stderr: string; success: boolean }> {
  const argv = SSH_OPTS.concat(host, command);
  return safeExec('/usr/bin/ssh', argv);
 }
 // ─── 10. Proxmox pvestatd health ─────────────────────────────────────────────
 /** SSH destination used by the pvestatd check and heal; the PROXMOX_HOST env var overrides the default. */
 const PROXMOX_HOST =
  process.env['PROXMOX_HOST'] ?? 'root@192.168.178.10';
 /**
  * Checks over SSH whether pvestatd on the Proxmox host is running and not in D-state.
  *
  * The remote command prints the `State:` line from /proc/<pid>/status of the
  * first pvestatd PID, or the literal NOT_RUNNING.
  *
  * @returns Unhealthy when the SSH call fails, when pvestatd is not running, or
  *   when its state letter is `D`; healthy otherwise. A state line that cannot
  *   be parsed is reported as state `?` and still counts as healthy.
  */
 async function checkProxmoxPvestatd(): Promise<CheckResult> {
  const probeCommand =
    'pid=$(pgrep pvestatd 2>/dev/null | head -1); [ -n "$pid" ] && cat /proc/$pid/status 2>/dev/null | grep "^State:" || echo "NOT_RUNNING"';
  const startedAt = Date.now();
  const remote = await sshExec(PROXMOX_HOST, probeCommand);
  const latency_ms = Date.now() - startedAt;

  if (!remote.success) {
    return { healthy: false, message: 'Cannot SSH to Proxmox to check pvestatd', latency_ms };
  }

  const report = remote.stdout.trim();
  if (report === 'NOT_RUNNING') {
    return { healthy: false, message: 'pvestatd is not running on Proxmox', details: { state: 'not_running' }, latency_ms };
  }

  // First non-whitespace character after "State:" is the process state letter.
  const state = /State:\s+(\S)/.exec(report)?.[1] ?? '?';
  if (state !== 'D') {
    return { healthy: true, message: `pvestatd is running (state: ${state})`, details: { state }, latency_ms };
  }
  return {
    healthy: false,
    message: 'pvestatd is in D-state (kernel deadlock) — Proxmox GUI graphs will be empty',
    details: { state: 'D', raw: report.slice(0, 200) },
    latency_ms,
  };
 }
 /**
  * Tries to recover pvestatd on the Proxmox host over SSH.
  *
  * If no pvestatd process exists it is simply started. Otherwise the existing
  * PID is moved to the root cgroup, the pid/lock files are removed, and the unit
  * is reset and started again.
  *
  * @param diagnosis Free-text diagnosis; only its first 120 characters are logged.
  * @returns The action taken, the success flag of the last SSH command that was
  *   run (or `false` when the heal bailed out early), and that command's output.
  */
 async function healProxmoxPvestatd(diagnosis: string): Promise<HealResult> {
  // Step 1: Get PID
  const { stdout: pidOut, success: reachable } = await sshExec(PROXMOX_HOST, 'pgrep pvestatd 2>/dev/null | head -1');
  if (!reachable) {
    // The pipeline ends in `head`, so a failure here is the SSH call itself, not
    // "no such process". Do not mistake an unreachable host for a stopped daemon.
    return { action_taken: 'none — cannot SSH to Proxmox to locate pvestatd', success: false };
  }
  const pid = pidOut.trim();
  if (!pid) {
    // Not running at all — just start it
    const { success, stdout, stderr } = await sshExec(PROXMOX_HOST, 'systemctl start pvestatd 2>&1');
    return { action_taken: 'systemctl start pvestatd', success, output: `${stdout}\n${stderr}`.trim() };
  }
  if (!/^\d+$/.test(pid)) {
    // The value is interpolated into a root shell command below, so anything
    // other than a plain number is refused.
    return { action_taken: 'none — unexpected pgrep output for pvestatd', success: false, output: pid.slice(0, 200) };
  }
  // Step 2: Move stuck process to root cgroup so systemd can reclaim the unit
  await sshExec(PROXMOX_HOST, `echo ${pid} > /sys/fs/cgroup/cgroup.procs 2>/dev/null || true`);
  // Step 3: Remove stale lock files
  await sshExec(PROXMOX_HOST, 'rm -f /var/run/pvestatd.pid.lock /var/run/pvestatd.pid 2>/dev/null || true');
  // Step 4: Reset failed state and start
  const { stdout, stderr, success } = await sshExec(
    PROXMOX_HOST,
    'systemctl reset-failed pvestatd 2>/dev/null; systemctl start pvestatd 2>&1; sleep 3; systemctl is-active pvestatd',
  );
  const output = `${stdout}\n${stderr}`.trim();
  logger.info({ pid, diagnosis: diagnosis.slice(0, 120), output }, 'pvestatd D-state heal executed');
  return { action_taken: `moved pid ${pid} to root cgroup, reset-failed, started pvestatd`, success, output };
 }
 // ─── 11. OPNsense disk space ─────────────────────────────────────────────────
 /** SSH destination used by the OPNsense disk check and heal; the OPNSENSE_HOST env var overrides the default. */
 const OPNSENSE_HOST =
  process.env['OPNSENSE_HOST'] ?? 'root@192.168.178.11';
 /** Usage of `/` in percent at or above which the OPNsense disk check reports unhealthy. */
 const OPNSENSE_DISK_THRESHOLD = 75;
 /**
  * Reads the usage percentage of `/` on the OPNsense host over SSH and compares
  * it with OPNSENSE_DISK_THRESHOLD.
  *
  * @returns Unhealthy when the SSH call fails, when the output is not a number,
  *   or when usage is at or above the threshold; healthy otherwise.
  */
 async function checkOpnsenseDisk(): Promise<CheckResult> {
  const startedAt = Date.now();
  const remote = await sshExec(OPNSENSE_HOST, "df -h / | awk 'NR==2{print $5}' | tr -d '%'");
  const latency_ms = Date.now() - startedAt;

  if (!remote.success) {
    return { healthy: false, message: 'Cannot SSH to OPNsense to check disk', latency_ms };
  }

  const raw = remote.stdout.trim();
  const usedPct = Number.parseInt(raw, 10);
  if (Number.isNaN(usedPct)) {
    return { healthy: false, message: `Cannot parse OPNsense disk usage: "${raw}"`, latency_ms };
  }

  const belowThreshold = usedPct < OPNSENSE_DISK_THRESHOLD;
  if (belowThreshold) {
    return {
      healthy: true,
      message: `OPNsense disk usage: ${usedPct}%`,
      details: { usedPercent: usedPct },
      latency_ms,
    };
  }
  return {
    healthy: false,
    message: `OPNsense disk usage ${usedPct}% ≥ threshold ${OPNSENSE_DISK_THRESHOLD}%`,
    details: { usedPercent: usedPct, threshold: OPNSENSE_DISK_THRESHOLD },
    latency_ms,
  };
 }
 /**
  * Frees disk space on the OPNsense host by running a fixed cleanup script over SSH.
  *
  * All steps are joined with `; ` into one remote command; the last step prints
  * the resulting usage of `/`, which is echoed in `action_taken`.
  *
  * @param diagnosis Free-text diagnosis; only its first 120 characters are logged.
  * @returns The action description (with the new usage, or `?` when none was
  *   reported), the success flag of the SSH call, and up to 500 characters of output.
  */
 async function healOpnsenseDisk(diagnosis: string): Promise<HealResult> {
  const steps = [
    // Remove logs older than 7 days
    'find /var/log -name "*.log" -mtime +7 -delete 2>/dev/null || true',
    // Suricata JSON logs older than 3 days
    'find /var/log/suricata -name "*.json" -mtime +3 -delete 2>/dev/null || true',
    'find /var/log/suricata -name "*.json.gz" -mtime +1 -delete 2>/dev/null || true',
    // Tmp files older than 1 day
    'find /tmp /var/tmp -mtime +1 -delete 2>/dev/null || true',
    // pkg cache
    'pkg clean -y 2>/dev/null || true',
    // Report new usage
    "df -h / | awk 'NR==2{print $5}'",
  ];
  const { stdout, stderr, success } = await sshExec(OPNSENSE_HOST, steps.join('; '));
  const output = `${stdout}\n${stderr}`.trim();
  // split() always yields at least one element, so empty stdout produces '' rather
  // than undefined; `||` (not `??`) is needed for the '?' placeholder to apply.
  const newUsage = stdout.trim().split('\n').at(-1)?.trim() || '?';
  logger.info({ diagnosis: diagnosis.slice(0, 120), newUsage, output: output.slice(0, 400) }, 'OPNsense disk cleanup executed');
  return {
    action_taken: `cleaned logs, tmp, pkg cache on OPNsense — disk now at ${newUsage}`,
    success,
    output: output.slice(0, 500),
  };
 }
 // ─── Exported check list ──────────────────────────────────────────────────────
 export const healthChecks: HealthCheck[] = [
@ -408,8 +676,11 @@ export const healthChecks: HealthCheck[] = [
  { name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
  { name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
  { name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
  { name: 'service-ports', category: 'service', check: checkServicePorts, heal: healServicePorts },
  { name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
  { name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
  { name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
  { name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
  { name: 'proxmox-pvestatd', category: 'service', check: checkProxmoxPvestatd, heal: healProxmoxPvestatd },
  { name: 'opnsense-disk', category: 'service', check: checkOpnsenseDisk, heal: healOpnsenseDisk },
 ];