/**
 * CtxHealth — Health check implementations.
 *
 * Each check exposes:
 *   check() → CheckResult
 *   heal()  → HealResult
 *
 * exec safety: all shell commands use execFile with an allow-list.
 * Never pass user-supplied strings to any exec call.
 */
import { execFile as execFileCb } from 'node:child_process';
import { readFile } from 'node:fs/promises';
import { promisify } from 'node:util';
import { logger } from '../observability/logger.js';
import { resetCircuitBreaker } from '../gateway-client.js';
import type { CheckResult, HealResult, HealthCheck } from '../types.js';

const execFile = promisify(execFileCb);

/** Timeout, in milliseconds, handed to execFile for every command started by safeExec. */
const EXEC_TIMEOUT_MS = 15_000;

// ─── Allowed command allow-list ───────────────────────────────────────────────

/** Absolute paths of the only binaries safeExec will run. */
const ALLOWED_COMMANDS = new Set([
  '/usr/bin/ping',
  '/usr/bin/wg',
  '/usr/sbin/wg',
  '/usr/bin/df',
  '/bin/df',
  '/usr/local/bin/pm2',
  '/usr/bin/pm2',
  '/usr/local/bin/node',
  '/usr/bin/systemctl',
  '/bin/systemctl',
  '/usr/sbin/systemctl',
  '/usr/bin/sync',
  '/bin/sync',
]);

/**
 * Runs an allow-listed binary through execFile (no shell) and never throws.
 *
 * @param cmd  Absolute path of the binary; must be a member of ALLOWED_COMMANDS.
 * @param args Argument vector passed to the binary unchanged.
 * @returns The captured stdout/stderr and a `success` flag. `success` is false when
 *          `cmd` is not allow-listed or when the execFile promise rejects.
 */
async function safeExec(
  cmd: string,
  args: readonly string[],
): Promise<{ stdout: string; stderr: string; success: boolean }> {
  if (!ALLOWED_COMMANDS.has(cmd)) {
    logger.error({ cmd }, 'Command not in allow-list — refusing to execute');
    return { stdout: '', stderr: `Command not allowed: ${cmd}`, success: false };
  }
  try {
    const { stdout, stderr } = await execFile(cmd, [...args], { timeout: EXEC_TIMEOUT_MS });
    return { stdout, stderr, success: true };
  } catch (err) {
    const e = err as { stdout?: string; stderr?: string; message?: string };
    // `||` instead of `??`: a rejection that carries an empty stderr string must still
    // fall back to the error message, otherwise the failure reason is lost.
    return { stdout: e.stdout ?? '', stderr: e.stderr || e.message || String(err), success: false };
  }
}

// ─── pm2 resolve helper (tries common locations) ─────────────────────────────

/**
 * Locates a working pm2 binary.
 *
 * Each allow-listed candidate path is probed with `--version`; the first one that
 * runs successfully wins.
 *
 * @returns The path of the first working candidate, or null when none succeeds.
 */
async function findPm2(): Promise<string | null> {
  const candidates = ['/usr/local/bin/pm2', '/usr/bin/pm2'];
  for (const p of candidates) {
    if (ALLOWED_COMMANDS.has(p)) {
      const { success } = await safeExec(p, ['--version']);
      if (success) return p;
    }
  }
  return null;
}

// ─── 1.
// PM2 processes ────────────────────────────────────────────────────────

/** Names of the PM2 processes that must report the status `online`. */
const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];

/**
 * Checks that every process in PM2_REQUIRED_PROCESSES is `online` according to `pm2 jlist`.
 *
 * @returns An unhealthy result when pm2 cannot be found, `jlist` fails, its output is not
 *          a JSON array, or any required process is missing or not online. Otherwise a
 *          healthy result. `details.statusMap` maps process name → reported status.
 */
async function checkPm2(): Promise<CheckResult> {
  const start = Date.now();
  const pm2 = await findPm2();
  if (!pm2) {
    return { healthy: false, message: 'pm2 binary not found on this host', latency_ms: Date.now() - start };
  }

  const { stdout, success } = await safeExec(pm2, ['jlist']);
  const latency_ms = Date.now() - start;
  if (!success) {
    return { healthy: false, message: 'pm2 jlist failed', latency_ms };
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch {
    return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
  }
  // Valid JSON that is not an array (null, object, …) would make the mapping below throw.
  if (!Array.isArray(parsed)) {
    return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
  }

  const processes = parsed as Array<{ name?: unknown; pm2_env?: { status?: string } } | null>;
  const statusMap: Record<string, string> = Object.fromEntries(
    processes.flatMap((p): Array<[string, string]> =>
      // Skip malformed entries instead of throwing on them.
      p !== null && typeof p === 'object' && typeof p.name === 'string'
        ? [[p.name, p.pm2_env?.status ?? 'unknown']]
        : [],
    ),
  );

  const offline = PM2_REQUIRED_PROCESSES.filter((name) => statusMap[name] !== 'online');
  if (offline.length > 0) {
    return {
      healthy: false,
      message: `PM2 processes not online: ${offline.join(', ')}`,
      details: { statusMap },
      latency_ms,
    };
  }
  return { healthy: true, message: 'All required PM2 processes are online', details: { statusMap }, latency_ms };
}

/**
 * Heals PM2 by running `pm2 restart all`.
 *
 * @param diagnosis Free-text diagnosis; only logged (truncated to 120 characters).
 * @returns `success` mirrors the outcome of the restart command, and is false when pm2
 *          cannot be found. `output` holds the combined stdout/stderr.
 */
async function healPm2(diagnosis: string): Promise<HealResult> {
  const pm2 = await findPm2();
  if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };

  const { stdout, stderr, success } = await safeExec(pm2, ['restart', 'all']);
  const output = `${stdout}\n${stderr}`.trim();
  logger.info(
    { diagnosis: diagnosis.slice(0, 120), success, output: output.slice(0, 200) },
    'PM2 restart executed',
  );
  // Report the real outcome: a failed restart must not be recorded as a successful heal.
  return { action_taken: 'pm2 restart all', success, output };
}

// ─── 2.
// PostgreSQL ────────────────────────────────────────────────────────────

/**
 * Checks PostgreSQL reachability by running `SELECT 1` through the project's db client.
 *
 * @returns A healthy result when the query resolves. An unhealthy result carrying the
 *          error message when loading the client or running the query fails.
 */
async function checkPostgres(): Promise<CheckResult> {
  const start = Date.now();
  try {
    // Dynamic import to avoid top-level pool creation before env is loaded.
    // It sits inside the try so that a failing import is reported as an unhealthy
    // result instead of escaping as an exception.
    const { query } = await import('../db/client.js');
    await query('SELECT 1');
    return { healthy: true, message: 'PostgreSQL is reachable', latency_ms: Date.now() - start };
  } catch (err) {
    return {
      healthy: false,
      message: `PostgreSQL unreachable: ${err instanceof Error ? err.message : String(err)}`,
      latency_ms: Date.now() - start,
    };
  }
}

/**
 * Heals PostgreSQL by running `systemctl restart postgresql`.
 *
 * @param _diagnosis Unused.
 * @returns `success` mirrors the outcome of the systemctl call; `output` holds the
 *          combined stdout/stderr.
 */
async function healPostgres(_diagnosis: string): Promise<HealResult> {
  const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'postgresql']);
  const output = `${stdout}\n${stderr}`.trim();
  logger.info({ output: output.slice(0, 200) }, 'PostgreSQL restart executed');
  return { action_taken: 'systemctl restart postgresql', success, output };
}

// ─── 3. Ollama ───────────────────────────────────────────────────────────────

/** Base URL of the Ollama endpoint; overridable through the OLLAMA_URL environment variable. */
const OLLAMA_URL = process.env['OLLAMA_URL'] ?? 'https://ollama.fichtmueller.org';

/**
 * Checks Ollama by requesting `${OLLAMA_URL}/api/tags`, aborting after 10 seconds.
 *
 * @returns A healthy result for a 2xx response. An unhealthy result for any other status,
 *          or when the request fails or is aborted.
 */
async function checkOllama(): Promise<CheckResult> {
  const start = Date.now();
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 10_000);
  try {
    const res = await fetch(`${OLLAMA_URL}/api/tags`, { signal: controller.signal, cache: 'no-store' } as RequestInit);
    const latency_ms = Date.now() - start;
    // Only the status matters; cancel the unread body (best effort) so the connection
    // is released right away rather than whenever the response is garbage-collected.
    void res.body?.cancel().catch(() => undefined);
    if (!res.ok) {
      return { healthy: false, message: `Ollama returned HTTP ${res.status}`, latency_ms };
    }
    return { healthy: true, message: 'Ollama is reachable', latency_ms };
  } catch (err) {
    return {
      healthy: false,
      message: `Ollama unreachable: ${err instanceof Error ? err.message : String(err)}`,
      latency_ms: Date.now() - start,
    };
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Heals Ollama by calling resetCircuitBreaker() from the gateway client.
 *
 * NOTE(review): a rejection from resetCircuitBreaker propagates to the caller, and any
 * value it resolves with is ignored — confirm both are intended.
 *
 * @param _diagnosis Unused.
 * @returns Always `success: true` once resetCircuitBreaker has resolved.
 */
async function healOllama(_diagnosis: string): Promise<HealResult> {
  await resetCircuitBreaker();
  return { action_taken: 'circuit-breaker reset requested via gateway', success: true };
}

// ─── 4.
// Cloudflare tunnel ────────────────────────────────────────────────────

/** Base URL probed at `/health`; overridable through CTX_HEALTH_GATEWAY_URL. */
const GATEWAY_PING_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';

/**
 * Checks the Cloudflare tunnel using two independent signals:
 *   1. a PM2 process whose name contains `cloudflared` and whose status is `online`;
 *   2. `${GATEWAY_PING_URL}/health` answering with a status below 500 within 8 seconds.
 *
 * The check is unhealthy only when BOTH signals are negative.
 * NOTE(review): with the default localhost gateway URL the second signal may not
 * exercise the tunnel at all — confirm the OR semantics are intended.
 *
 * @returns The check result; `details` carries both signals.
 */
async function checkCloudflareTunnel(): Promise<CheckResult> {
  const start = Date.now();

  const pm2 = await findPm2();
  let cloudflaredRunning = false;
  if (pm2) {
    const { stdout } = await safeExec(pm2, ['jlist']);
    try {
      const procs = JSON.parse(stdout) as Array<{ name: string; pm2_env?: { status?: string } }>;
      cloudflaredRunning = procs.some((p) => p.name.includes('cloudflared') && p.pm2_env?.status === 'online');
    } catch {
      /* ignore parse errors — cloudflaredRunning stays false */
    }
  }

  // Also attempt to ping the gateway URL to confirm tunnel is routing traffic
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 8_000);
  let gatewayReachable = false;
  try {
    const res = await fetch(`${GATEWAY_PING_URL}/health`, { signal: controller.signal, cache: 'no-store' } as RequestInit);
    gatewayReachable = res.status < 500;
    // The payload is irrelevant; cancel the unread body (best effort) so the
    // connection is released immediately.
    void res.body?.cancel().catch(() => undefined);
  } catch {
    /* expected if tunnel is down */
  } finally {
    clearTimeout(timer);
  }

  const latency_ms = Date.now() - start;
  if (!cloudflaredRunning && !gatewayReachable) {
    return {
      healthy: false,
      message: 'cloudflared not running via pm2 and gateway is unreachable',
      details: { cloudflaredRunning, gatewayReachable },
      latency_ms,
    };
  }
  return {
    healthy: true,
    message: 'Cloudflare tunnel appears healthy',
    details: { cloudflaredRunning, gatewayReachable },
    latency_ms,
  };
}

/**
 * Heals the tunnel by running `systemctl restart cloudflared`.
 *
 * @param _diagnosis Unused.
 * @returns `success` mirrors the outcome of the systemctl call; `output` holds the
 *          combined stdout/stderr.
 */
async function healCloudflareTunnel(_diagnosis: string): Promise<HealResult> {
  const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'cloudflared']);
  const output = `${stdout}\n${stderr}`.trim();
  logger.info({ output: output.slice(0, 200) }, 'cloudflared restart executed');
  return { action_taken: 'systemctl restart cloudflared', success, output };
}

// ─── 5.
// Disk space ───────────────────────────────────────────────────────────

/** Usage percentage of `/` above which the disk check reports unhealthy. */
const DISK_ALERT_PERCENT = 85;

/**
 * Checks the usage of the root filesystem with `df`.
 *
 * The fifth whitespace-separated column of the first data line is read as the usage
 * percentage.
 *
 * @returns An unhealthy result when df fails, when the percentage cannot be parsed, or
 *          when it exceeds DISK_ALERT_PERCENT. Otherwise a healthy result.
 */
async function checkDiskSpace(): Promise<CheckResult> {
  const start = Date.now();
  // -P requests the POSIX output format: exactly one line per filesystem, so a long
  // device name cannot wrap the columns onto a second line.
  const { stdout, success } = await safeExec('/bin/df', ['-h', '-P', '/']);
  const latency_ms = Date.now() - start;
  if (!success) {
    return { healthy: false, message: 'df command failed', latency_ms };
  }

  const dataLine = stdout.trim().split('\n')[1] ?? '';
  // A missing column parses to NaN and is reported as a parse failure. Defaulting it
  // to "0" would turn unparseable output into a false "healthy".
  const usedPct = parseInt(dataLine.split(/\s+/)[4] ?? '', 10);
  if (isNaN(usedPct)) {
    return { healthy: false, message: `Could not parse disk usage: ${dataLine}`, latency_ms };
  }

  if (usedPct > DISK_ALERT_PERCENT) {
    return {
      healthy: false,
      message: `Disk usage is ${usedPct}% (threshold: ${DISK_ALERT_PERCENT}%)`,
      details: { usedPercent: usedPct, raw: dataLine },
      latency_ms,
    };
  }
  return {
    healthy: true,
    message: `Disk usage: ${usedPct}%`,
    details: { usedPercent: usedPct },
    latency_ms,
  };
}

/**
 * "Heals" low disk space by logging the diagnosis only; nothing is deleted.
 *
 * @param diagnosis Advisory text; logged (first 300 characters) and echoed back in
 *                  `output` (first 500 characters).
 * @returns Always `success: true`.
 */
async function healDiskSpace(diagnosis: string): Promise<HealResult> {
  // Never auto-delete — just log the LLM advisory
  logger.warn({ diagnosis: diagnosis.slice(0, 300) }, 'Disk space advisory — manual action required');
  return {
    action_taken: 'logged LLM advisory — manual cleanup required',
    success: true,
    output: diagnosis.slice(0, 500),
  };
}

// ─── 6. Memory ───────────────────────────────────────────────────────────────

/** Minimum `MemAvailable`, in MB, below which the memory check reports unhealthy. */
const MEMORY_FREE_MIN_MB = 500;

/**
 * Checks available memory by reading the `MemAvailable:` line of /proc/meminfo.
 *
 * @returns An unhealthy result when the value cannot be parsed or is below
 *          MEMORY_FREE_MIN_MB. A healthy "skipped" result when /proc/meminfo cannot be
 *          read. Otherwise a healthy result.
 */
async function checkMemory(): Promise<CheckResult> {
  const start = Date.now();
  try {
    const meminfo = await readFile('/proc/meminfo', 'utf-8');
    const freeLine = meminfo.split('\n').find((l) => l.startsWith('MemAvailable:'));
    // Strip everything but digits; the value is divided by 1024 below to get MB.
    const freeKb = parseInt((freeLine ?? '').replace(/[^0-9]/g, ''), 10);
    const freeMb = Math.floor(freeKb / 1024);
    const latency_ms = Date.now() - start;

    if (isNaN(freeMb)) {
      return { healthy: false, message: 'Could not parse /proc/meminfo', latency_ms };
    }
    if (freeMb < MEMORY_FREE_MIN_MB) {
      return {
        healthy: false,
        message: `Available memory is ${freeMb}MB (minimum: ${MEMORY_FREE_MIN_MB}MB)`,
        details: { availableMb: freeMb },
        latency_ms,
      };
    }
    return { healthy: true, message: `Available memory: ${freeMb}MB`, details: { availableMb: freeMb }, latency_ms };
  } catch (err) {
    // /proc/meminfo not available (e.g. macOS dev environment)
    return {
      healthy: true,
      message: `Memory check skipped: ${err instanceof Error ? err.message : String(err)}`,
      latency_ms: Date.now() - start,
    };
  }
}

/**
 * Memory "healing": runs `sync` and logs that dropping caches needs a privileged helper.
 *
 * @param _diagnosis Unused.
 * @returns `success: false` with stderr when sync fails. Otherwise `success: true` with
 *          the first 200 characters of stdout.
 */
async function healMemory(_diagnosis: string): Promise<HealResult> {
  const { stdout, stderr, success } = await safeExec('/bin/sync', []);
  if (!success) {
    return { action_taken: 'sync failed — cannot drop caches without root', success: false, output: stderr };
  }
  // Writing to /proc/sys requires execFile with shell — skipped for safety.
  // In production, a privileged helper script should handle this.
  logger.warn('Memory healing: sync executed. Drop caches requires privileged script.');
  return {
    action_taken: 'sync executed; drop_caches requires privileged helper',
    success: true,
    output: stdout.slice(0, 200),
  };
}

// ─── 7.
// Network connectivity ─────────────────────────────────────────────────

/**
 * Checks outbound connectivity by pinging 1.1.1.1 (`ping -c 3 -W 3`).
 *
 * @returns A healthy result when ping succeeds. Otherwise an unhealthy result whose
 *          `details` hold the first 200 characters of stdout and stderr.
 */
async function checkNetwork(): Promise<CheckResult> {
  const start = Date.now();
  const { success, stdout, stderr } = await safeExec('/usr/bin/ping', ['-c', '3', '-W', '3', '1.1.1.1']);
  const latency_ms = Date.now() - start;
  if (!success) {
    return {
      healthy: false,
      message: 'Cannot ping 1.1.1.1 — network connectivity issue',
      details: { stdout: stdout.slice(0, 200), stderr: stderr.slice(0, 200) },
      latency_ms,
    };
  }
  return { healthy: true, message: 'Network connectivity OK (1.1.1.1 reachable)', latency_ms };
}

/**
 * Network problems are not auto-healed; the diagnosis is logged at error level.
 *
 * @param diagnosis Diagnosis text; logged (first 300 characters).
 * @returns Always `success: false`.
 */
async function healNetwork(diagnosis: string): Promise<HealResult> {
  logger.error({ diagnosis: diagnosis.slice(0, 300) }, 'Network issue detected — cannot self-heal, manual intervention required');
  return {
    action_taken: 'logged critical alert — network issues require manual intervention',
    success: false,
    output: 'Cannot auto-heal network connectivity issues.',
  };
}

// ─── 8. WireGuard ────────────────────────────────────────────────────────────

/**
 * Locates a working `wg` binary.
 *
 * Each allow-listed candidate is probed with `--version`, the same way findPm2 probes
 * pm2. Testing allow-list membership alone would always pick the first candidate,
 * whether or not the binary exists on this host.
 *
 * @returns The path of the first candidate that runs, or null when none does.
 */
async function findWg(): Promise<string | null> {
  const candidates = ['/usr/bin/wg', '/usr/sbin/wg'];
  for (const p of candidates) {
    if (!ALLOWED_COMMANDS.has(p)) continue;
    const { success } = await safeExec(p, ['--version']);
    if (success) return p;
  }
  return null;
}

/**
 * Checks WireGuard by running `wg show` and looking for a `latest handshake` entry.
 *
 * @returns A healthy "skipped" result when no wg binary is found. An unhealthy result
 *          when `wg show` fails or its output contains no `latest handshake`.
 *          Otherwise a healthy result.
 */
async function checkWireGuard(): Promise<CheckResult> {
  const start = Date.now();
  const wg = await findWg();
  if (!wg) {
    return { healthy: true, message: 'wg binary not found — skipping WireGuard check', latency_ms: Date.now() - start };
  }

  const { stdout, success } = await safeExec(wg, ['show']);
  const latency_ms = Date.now() - start;
  if (!success) {
    return { healthy: false, message: 'wg show failed — WireGuard may not be running', latency_ms };
  }

  const hasActivePeer = stdout.includes('latest handshake');
  if (!hasActivePeer) {
    return {
      healthy: false,
      message: 'WireGuard: no active peers with recent handshake detected',
      details: { output: stdout.slice(0, 300) },
      latency_ms,
    };
  }
  return { healthy: true, message: 'WireGuard peers active', latency_ms };
}

/**
 * Heals WireGuard by running `systemctl restart wg-quick@wg0`.
 *
 * @param _diagnosis Unused.
 * @returns `success` mirrors the outcome of the systemctl call; `output` holds the
 *          combined stdout/stderr.
 */
async function healWireGuard(_diagnosis: string): Promise<HealResult> {
  const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'wg-quick@wg0']);
  const output = `${stdout}\n${stderr}`.trim();
  logger.info({ output: output.slice(0, 200) }, 'WireGuard restart executed');
  return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
}

// ─── Exported check list ──────────────────────────────────────────────────────

/** Every health check in this module, paired with its heal action. */
export const healthChecks: HealthCheck[] = [
  { name: 'pm2-processes', category: 'process', check: checkPm2, heal: healPm2 },
  { name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
  { name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
  { name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
  { name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
  { name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
  { name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
  { name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
];