// llm-gateway/packages/ctx-health/src/checks/index.ts

/**
 * CtxHealth — Health check implementations.
 *
 * Each check exposes:
 *   check()         → Promise<CheckResult>
 *   heal(diagnosis) → Promise<HealResult>
 *
 * exec safety: all external commands are run via execFile (no shell) and must be on an allow-list.
 * Never pass user-supplied strings to any exec call.
 */

import { execFile as execFileCb } from 'node:child_process';
import { readFile } from 'node:fs/promises';
import { promisify } from 'node:util';
import { logger } from '../observability/logger.js';
import { resetCircuitBreaker } from '../gateway-client.js';
import type { CheckResult, HealResult, HealthCheck } from '../types.js';

// Promise-returning form of child_process.execFile.
const execFile = promisify(execFileCb);
// Timeout in milliseconds passed as the `timeout` option to execFile.
const EXEC_TIMEOUT_MS = 15_000;

// ─── Allowed command allow-list ───────────────────────────────────────────────

/**
 * Absolute paths of the executables that may be run from this module.
 * Membership is tested by exact string match, so a binary has to be listed
 * under every path it should be runnable from (several appear more than once,
 * presumably to cover different distribution layouts — confirm before pruning).
 */
const ALLOWED_COMMANDS = new Set([
  // ICMP reachability probe
  '/usr/bin/ping',
  // WireGuard CLI
  '/usr/bin/wg',
  '/usr/sbin/wg',
  // Disk usage
  '/usr/bin/df',
  '/bin/df',
  // PM2 process manager
  '/usr/local/bin/pm2',
  '/usr/bin/pm2',
  // Node.js runtime
  '/usr/local/bin/node',
  // systemd service control
  '/usr/bin/systemctl',
  '/bin/systemctl',
  '/usr/sbin/systemctl',
  // Flush filesystem buffers
  '/usr/bin/sync',
  '/bin/sync',
]);

/**
 * Runs an allow-listed executable via execFile and reports the outcome as a
 * value rather than by throwing.
 *
 * @param cmd  Absolute path of the executable; must be a member of ALLOWED_COMMANDS.
 * @param args Arguments handed to the executable (copied before the call).
 * @returns Captured stdout/stderr plus `success`. `success` is false when the
 *          command is not allow-listed or when execFile rejects; in the latter
 *          case stderr falls back to the error message if no stderr was captured.
 */
async function safeExec(
  cmd: string,
  args: readonly string[],
): Promise<{ stdout: string; stderr: string; success: boolean }> {
  const permitted = ALLOWED_COMMANDS.has(cmd);
  if (permitted === false) {
    logger.error({ cmd }, 'Command not in allow-list — refusing to execute');
    return { stdout: '', stderr: `Command not allowed: ${cmd}`, success: false };
  }

  try {
    const result = await execFile(cmd, Array.from(args), { timeout: EXEC_TIMEOUT_MS });
    return { stdout: result.stdout, stderr: result.stderr, success: true };
  } catch (err) {
    // A rejected execFile carries whatever output was captured before the failure.
    const failure = err as { stdout?: string; stderr?: string; message?: string };
    const capturedStdout = failure.stdout ?? '';
    const capturedStderr = failure.stderr ?? failure.message ?? String(err);
    return { stdout: capturedStdout, stderr: capturedStderr, success: false };
  }
}

// ─── pm2 resolve helper (tries common locations) ─────────────────────────────

/**
 * Locates a working pm2 binary.
 *
 * Tries each known install path in order and returns the first one for which
 * `pm2 --version` succeeds.
 *
 * @returns The absolute path of a usable pm2, or null when none responds.
 */
async function findPm2(): Promise<string | null> {
  for (const candidate of ['/usr/local/bin/pm2', '/usr/bin/pm2']) {
    // Skip paths that are not allow-listed instead of letting safeExec log a refusal.
    if (!ALLOWED_COMMANDS.has(candidate)) continue;

    const probe = await safeExec(candidate, ['--version']);
    if (probe.success) return candidate;
  }
  return null;
}

// ─── 1. PM2 processes ────────────────────────────────────────────────────────

// Names of the PM2 processes that must report status "online".
const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];

/**
 * Verifies that every process in PM2_REQUIRED_PROCESSES is online according
 * to `pm2 jlist`.
 *
 * The jlist payload is validated before use: anything that is not a JSON
 * array is reported as a parse failure, and array entries without a string
 * `name` are ignored, so malformed output yields an unhealthy result instead
 * of an exception. When several entries share a name the last one wins.
 *
 * @returns A CheckResult; `details.statusMap` maps process name → status
 *          whenever the listing could be parsed.
 */
async function checkPm2(): Promise<CheckResult> {
  const start = Date.now();
  const pm2 = await findPm2();
  if (!pm2) {
    return { healthy: false, message: 'pm2 binary not found on this host', latency_ms: Date.now() - start };
  }

  const { stdout, success } = await safeExec(pm2, ['jlist']);
  const latency_ms = Date.now() - start;

  if (!success) {
    return { healthy: false, message: 'pm2 jlist failed', latency_ms };
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch {
    return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
  }

  // Valid JSON that is not a list (null, object, number…) is just as unusable as invalid JSON.
  if (!Array.isArray(parsed)) {
    return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
  }

  const pairs: Array<[string, string]> = [];
  for (const entry of parsed as unknown[]) {
    if (typeof entry !== 'object' || entry === null) continue;
    const { name, pm2_env } = entry as { name?: unknown; pm2_env?: { status?: unknown } | null };
    if (typeof name !== 'string') continue;
    const status = pm2_env?.status;
    pairs.push([name, typeof status === 'string' ? status : 'unknown']);
  }
  const statusMap = Object.fromEntries(pairs);

  // A required process that is absent from the listing counts as not online.
  const offline = PM2_REQUIRED_PROCESSES.filter((name) => statusMap[name] !== 'online');

  if (offline.length > 0) {
    return {
      healthy: false,
      message: `PM2 processes not online: ${offline.join(', ')}`,
      details: { statusMap },
      latency_ms,
    };
  }

  return { healthy: true, message: 'All required PM2 processes are online', details: { statusMap }, latency_ms };
}

/**
 * Attempts to recover PM2-managed processes by running `pm2 restart all`.
 *
 * @param diagnosis Free-text diagnosis; only logged (truncated), never passed to a command.
 * @returns A HealResult whose `success` mirrors the outcome of the restart
 *          command; false as well when no pm2 binary could be found.
 */
async function healPm2(diagnosis: string): Promise<HealResult> {
  const pm2 = await findPm2();
  if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };

  const { stdout, stderr, success } = await safeExec(pm2, ['restart', 'all']);
  const output = `${stdout}\n${stderr}`.trim();
  const logContext = { diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200) };
  if (success) {
    logger.info(logContext, 'PM2 restart executed');
  } else {
    logger.error(logContext, 'PM2 restart failed');
  }
  // Report the real outcome so a failed restart is not recorded as a successful heal.
  return { action_taken: 'pm2 restart all', success, output };
}

// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────

/**
 * Checks that PostgreSQL answers a trivial `SELECT 1`.
 *
 * Both loading the DB client and running the query happen inside the try
 * block, so any failure is turned into an unhealthy CheckResult instead of a
 * rejected promise.
 *
 * @returns A CheckResult with the elapsed time in `latency_ms`.
 */
async function checkPostgres(): Promise<CheckResult> {
  const start = Date.now();
  try {
    // Dynamic import to avoid top-level pool creation before env is loaded
    const { query } = await import('../db/client.js');
    await query('SELECT 1');
    return { healthy: true, message: 'PostgreSQL is reachable', latency_ms: Date.now() - start };
  } catch (err) {
    return {
      healthy: false,
      message: `PostgreSQL unreachable: ${err instanceof Error ? err.message : String(err)}`,
      latency_ms: Date.now() - start,
    };
  }
}

/**
 * Restarts the `postgresql` unit through systemctl.
 *
 * @param _diagnosis Unused.
 * @returns A HealResult carrying the command's combined output and its success flag.
 */
async function healPostgres(_diagnosis: string): Promise<HealResult> {
  const restart = await safeExec('/bin/systemctl', ['restart', 'postgresql']);
  const output = [restart.stdout, restart.stderr].join('\n').trim();
  logger.info({ output: output.slice(0, 200) }, 'PostgreSQL restart executed');
  return { action_taken: 'systemctl restart postgresql', success: restart.success, output };
}

// ─── 3. Ollama ───────────────────────────────────────────────────────────────

// Base URL of the Ollama endpoint; overridable through the OLLAMA_URL environment variable.
const OLLAMA_URL = process.env['OLLAMA_URL'] ?? 'https://ollama.fichtmueller.org';

/**
 * Probes Ollama by requesting `/api/tags`, aborting after 10 seconds.
 *
 * @returns Healthy on a 2xx response; unhealthy on any other status or when
 *          the request fails or is aborted.
 */
async function checkOllama(): Promise<CheckResult> {
  const startedAt = Date.now();
  const abort = new AbortController();
  const timeoutHandle = setTimeout(() => abort.abort(), 10_000);

  try {
    const response = await fetch(`${OLLAMA_URL}/api/tags`, {
      signal: abort.signal,
      cache: 'no-store',
    } as RequestInit);
    const latency_ms = Date.now() - startedAt;

    return response.ok
      ? { healthy: true, message: 'Ollama is reachable', latency_ms }
      : { healthy: false, message: `Ollama returned HTTP ${response.status}`, latency_ms };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return { healthy: false, message: `Ollama unreachable: ${reason}`, latency_ms: Date.now() - startedAt };
  } finally {
    // Always disarm the abort timer, whichever path was taken.
    clearTimeout(timeoutHandle);
  }
}

/**
 * Asks the gateway client to reset its circuit breaker.
 *
 * @param _diagnosis Unused.
 * @returns A successful HealResult once resetCircuitBreaker() has resolved.
 *          NOTE(review): a rejection from resetCircuitBreaker() propagates to the caller.
 */
async function healOllama(_diagnosis: string): Promise<HealResult> {
  await resetCircuitBreaker();

  const outcome: HealResult = {
    action_taken: 'circuit-breaker reset requested via gateway',
    success: true,
  };
  return outcome;
}

// ─── 4. Cloudflare tunnel ────────────────────────────────────────────────────

// Base URL used to probe the gateway; overridable through CTX_HEALTH_GATEWAY_URL.
const GATEWAY_PING_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';

/**
 * Judges tunnel health from two signals: whether PM2 lists an online process
 * whose name contains "cloudflared", and whether `${GATEWAY_PING_URL}/health`
 * answers with a status below 500 within 8 seconds.
 *
 * @returns Unhealthy only when both signals are negative; both flags are
 *          always included in `details`.
 */
async function checkCloudflareTunnel(): Promise<CheckResult> {
  const startedAt = Date.now();

  // Signal 1: cloudflared listed as online by pm2 (stays false if pm2 is absent).
  let cloudflaredRunning = false;
  const pm2 = await findPm2();
  if (pm2) {
    const listing = await safeExec(pm2, ['jlist']);
    try {
      const entries = JSON.parse(listing.stdout) as Array<{ name: string; pm2_env?: { status?: string } }>;
      cloudflaredRunning = entries.some(
        (entry) => entry.name.includes('cloudflared') && entry.pm2_env?.status === 'online',
      );
    } catch {
      // Unparseable listing: leave the flag at false.
    }
  }

  // Signal 2: the gateway health endpoint responds without a 5xx status.
  let gatewayReachable = false;
  const abort = new AbortController();
  const timeoutHandle = setTimeout(() => abort.abort(), 8_000);
  try {
    const response = await fetch(`${GATEWAY_PING_URL}/health`, {
      signal: abort.signal,
      cache: 'no-store',
    } as RequestInit);
    gatewayReachable = response.status < 500;
  } catch {
    // Expected when the endpoint cannot be reached; flag stays false.
  } finally {
    clearTimeout(timeoutHandle);
  }

  const latency_ms = Date.now() - startedAt;
  const details = { cloudflaredRunning, gatewayReachable };

  if (cloudflaredRunning || gatewayReachable) {
    return { healthy: true, message: 'Cloudflare tunnel appears healthy', details, latency_ms };
  }

  return {
    healthy: false,
    message: 'cloudflared not running via pm2 and gateway is unreachable',
    details,
    latency_ms,
  };
}

async function healCloudflareTunnel(_diagnosis: string): Promise<HealResult> {
  const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'cloudflared']);
  const output = `${stdout}\n${stderr}`.trim();
  logger.info({ output: output.slice(0, 200) }, 'cloudflared restart executed');
  return { action_taken: 'systemctl restart cloudflared', success, output };
}

// ─── 5. Disk space ───────────────────────────────────────────────────────────

// Usage percentage of the root filesystem above which the check reports unhealthy.
const DISK_ALERT_PERCENT = 85;

/**
 * Reads the usage percentage of `/` from `df` and compares it with
 * DISK_ALERT_PERCENT.
 *
 * df is invoked with `-P` so each filesystem is reported on a single line,
 * and the capacity column must look like `NN%`. Output that is missing or
 * malformed is reported as a parse failure rather than being read as 0% used.
 *
 * @returns Unhealthy when df fails, when its output cannot be parsed, or when
 *          usage exceeds the threshold; healthy otherwise.
 */
async function checkDiskSpace(): Promise<CheckResult> {
  const start = Date.now();
  // -P: POSIX portable format, so a long device name cannot wrap the record onto two lines.
  const { stdout, success } = await safeExec('/bin/df', ['-h', '-P', '/']);
  const latency_ms = Date.now() - start;

  if (!success) {
    return { healthy: false, message: 'df command failed', latency_ms };
  }

  // Line 0 is the header; line 1 is the record for "/".
  const dataLine = stdout.trim().split('\n')[1] ?? '';
  // Columns: filesystem, size, used, available, capacity, mount point.
  const capacityField = dataLine.trim().split(/\s+/)[4] ?? '';
  const capacityMatch = /^(\d+)%$/.exec(capacityField);
  const usedPct = capacityMatch ? Number.parseInt(capacityMatch[1] ?? '', 10) : Number.NaN;

  if (Number.isNaN(usedPct)) {
    return { healthy: false, message: `Could not parse disk usage: ${dataLine}`, latency_ms };
  }

  if (usedPct > DISK_ALERT_PERCENT) {
    return {
      healthy: false,
      message: `Disk usage is ${usedPct}% (threshold: ${DISK_ALERT_PERCENT}%)`,
      details: { usedPercent: usedPct, raw: dataLine },
      latency_ms,
    };
  }

  return {
    healthy: true,
    message: `Disk usage: ${usedPct}%`,
    details: { usedPercent: usedPct },
    latency_ms,
  };
}

async function healDiskSpace(diagnosis: string): Promise<HealResult> {
  // Never auto-delete — just log the LLM advisory
  logger.warn({ diagnosis: diagnosis.slice(0, 300) }, 'Disk space advisory — manual action required');
  return {
    action_taken: 'logged LLM advisory — manual cleanup required',
    success: true,
    output: diagnosis.slice(0, 500),
  };
}

// ─── 6. Memory ───────────────────────────────────────────────────────────────

// Minimum MemAvailable, in megabytes, below which the check reports unhealthy.
const MEMORY_FREE_MIN_MB = 500;

/**
 * Reads `MemAvailable` from /proc/meminfo and compares it (in whole MB) with
 * MEMORY_FREE_MIN_MB.
 *
 * @returns Unhealthy when the value cannot be parsed or is below the minimum.
 *          If /proc/meminfo cannot be read at all, the check is reported as
 *          skipped and healthy.
 */
async function checkMemory(): Promise<CheckResult> {
  const startedAt = Date.now();

  try {
    const meminfo = await readFile('/proc/meminfo', 'utf-8');
    const availableLine = meminfo.split('\n').find((line) => line.startsWith('MemAvailable:')) ?? '';
    // Keep only the digits of the line; the value is expressed in kB.
    const digits = availableLine.replace(/[^0-9]/g, '');
    const availableMb = Math.floor(parseInt(digits, 10) / 1024);
    const latency_ms = Date.now() - startedAt;

    if (Number.isNaN(availableMb)) {
      return { healthy: false, message: 'Could not parse /proc/meminfo', latency_ms };
    }

    const details = { availableMb };
    if (availableMb >= MEMORY_FREE_MIN_MB) {
      return { healthy: true, message: `Available memory: ${availableMb}MB`, details, latency_ms };
    }

    return {
      healthy: false,
      message: `Available memory is ${availableMb}MB (minimum: ${MEMORY_FREE_MIN_MB}MB)`,
      details,
      latency_ms,
    };
  } catch (err) {
    // /proc/meminfo is unavailable (for example on a macOS development machine).
    const reason = err instanceof Error ? err.message : String(err);
    return { healthy: true, message: `Memory check skipped: ${reason}`, latency_ms: Date.now() - startedAt };
  }
}

/**
 * Runs `sync` as the only memory-related remediation performed here.
 * Dropping caches is deliberately not attempted from this process.
 *
 * @param _diagnosis Unused.
 * @returns Success with the (truncated) sync output, or failure with sync's stderr.
 */
async function healMemory(_diagnosis: string): Promise<HealResult> {
  const syncResult = await safeExec('/bin/sync', []);

  if (syncResult.success) {
    // Writing to /proc/sys is intentionally left to a privileged helper script.
    logger.warn('Memory healing: sync executed. Drop caches requires privileged script.');
    return {
      action_taken: 'sync executed; drop_caches requires privileged helper',
      success: true,
      output: syncResult.stdout.slice(0, 200),
    };
  }

  return {
    action_taken: 'sync failed — cannot drop caches without root',
    success: false,
    output: syncResult.stderr,
  };
}

// ─── 7. Network connectivity ─────────────────────────────────────────────────

/**
 * Tests outbound connectivity by pinging 1.1.1.1 (`ping -c 3 -W 3`).
 *
 * @returns Healthy when the ping command succeeds; otherwise unhealthy with
 *          truncated stdout/stderr in `details`.
 */
async function checkNetwork(): Promise<CheckResult> {
  const startedAt = Date.now();
  const ping = await safeExec('/usr/bin/ping', ['-c', '3', '-W', '3', '1.1.1.1']);
  const latency_ms = Date.now() - startedAt;

  if (ping.success) {
    return { healthy: true, message: 'Network connectivity OK (1.1.1.1 reachable)', latency_ms };
  }

  return {
    healthy: false,
    message: 'Cannot ping 1.1.1.1 — network connectivity issue',
    details: { stdout: ping.stdout.slice(0, 200), stderr: ping.stderr.slice(0, 200) },
    latency_ms,
  };
}

/**
 * Records that a network problem needs a human; performs no remediation.
 *
 * @param diagnosis Free-text diagnosis, logged truncated to 300 characters.
 * @returns Always a failed HealResult.
 */
async function healNetwork(diagnosis: string): Promise<HealResult> {
  const truncatedDiagnosis = diagnosis.slice(0, 300);
  logger.error(
    { diagnosis: truncatedDiagnosis },
    'Network issue detected — cannot self-heal, manual intervention required',
  );

  const outcome: HealResult = {
    action_taken: 'logged critical alert — network issues require manual intervention',
    success: false,
    output: 'Cannot auto-heal network connectivity issues.',
  };
  return outcome;
}

// ─── 8. WireGuard ────────────────────────────────────────────────────────────

/**
 * Locates a working `wg` binary.
 *
 * Each allow-listed candidate path is probed with `wg --version`, and the
 * first one that runs successfully is returned. Probing (rather than only
 * consulting the allow-list) is what lets the result be null on hosts without
 * WireGuard tools and lets the second candidate be chosen when the first is
 * absent.
 *
 * @returns The absolute path of a usable wg, or null when none responds.
 */
async function findWg(): Promise<string | null> {
  const candidates = ['/usr/bin/wg', '/usr/sbin/wg'];
  for (const p of candidates) {
    if (!ALLOWED_COMMANDS.has(p)) continue;
    const { success } = await safeExec(p, ['--version']);
    if (success) return p;
  }
  return null;
}

/**
 * Checks WireGuard by running `wg show` and looking for the text
 * "latest handshake" in its output.
 *
 * @returns Healthy (skipped) when findWg() yields no binary; unhealthy when
 *          `wg show` fails or its output contains no "latest handshake";
 *          healthy otherwise.
 */
async function checkWireGuard(): Promise<CheckResult> {
  const startedAt = Date.now();

  const wgPath = await findWg();
  if (wgPath === null) {
    return {
      healthy: true,
      message: 'wg binary not found — skipping WireGuard check',
      latency_ms: Date.now() - startedAt,
    };
  }

  const show = await safeExec(wgPath, ['show']);
  const latency_ms = Date.now() - startedAt;

  if (!show.success) {
    return { healthy: false, message: 'wg show failed — WireGuard may not be running', latency_ms };
  }

  // The presence of a handshake line is treated as evidence of an active peer.
  if (show.stdout.includes('latest handshake')) {
    return { healthy: true, message: 'WireGuard peers active', latency_ms };
  }

  return {
    healthy: false,
    message: 'WireGuard: no active peers with recent handshake detected',
    details: { output: show.stdout.slice(0, 300) },
    latency_ms,
  };
}

/**
 * Restarts the `wg-quick@wg0` unit through systemctl.
 *
 * @param _diagnosis Unused.
 * @returns A HealResult carrying the command's combined output and its success flag.
 */
async function healWireGuard(_diagnosis: string): Promise<HealResult> {
  const restart = await safeExec('/bin/systemctl', ['restart', 'wg-quick@wg0']);
  const output = [restart.stdout, restart.stderr].join('\n').trim();
  logger.info({ output: output.slice(0, 200) }, 'WireGuard restart executed');
  return { action_taken: 'systemctl restart wg-quick@wg0', success: restart.success, output };
}

// ─── Exported check list ──────────────────────────────────────────────────────

/**
 * Registry of all health checks, in declaration order. Each entry pairs a
 * `check` function with the `heal` function for the same subsystem.
 */
export const healthChecks: HealthCheck[] = [
  {
    name: 'pm2-processes',
    category: 'process',
    check: checkPm2,
    heal: healPm2,
  },
  {
    name: 'postgresql',
    category: 'database',
    check: checkPostgres,
    heal: healPostgres,
  },
  {
    name: 'ollama',
    category: 'service',
    check: checkOllama,
    heal: healOllama,
  },
  {
    name: 'cloudflare-tunnel',
    category: 'tunnel',
    check: checkCloudflareTunnel,
    heal: healCloudflareTunnel,
  },
  {
    name: 'disk-space',
    category: 'service',
    check: checkDiskSpace,
    heal: healDiskSpace,
  },
  {
    name: 'memory',
    category: 'service',
    check: checkMemory,
    heal: healMemory,
  },
  {
    name: 'network',
    category: 'network',
    check: checkNetwork,
    heal: healNetwork,
  },
  {
    name: 'wireguard',
    category: 'network',
    check: checkWireGuard,
    heal: healWireGuard,
  },
];