Rene Fichtmueller e0b9fa1f53 feat: add CtxHealth self-healing daemon as new workspace package
New package @llm-gateway/ctx-health (packages/ctx-health/) — a TypeScript
infrastructure monitoring and auto-healing daemon. Monitors 8 subsystems
every 60s (PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory,
network, WireGuard), gets AI-powered root cause analysis via the gateway
(ctxhealer caller / ctx_health_diagnose task_type), executes healing
actions with cooldown (5min) and escalation guards (3+ failures → human
escalation), persists all incidents to ctx_health_incidents and
ctx_health_status tables. Dry-run mode via CTX_HEALTH_DRY_RUN=true.
Runs as ctx-health PM2 process on Erik server.
2026-04-03 00:16:08 +02:00

416 lines
16 KiB
TypeScript

/**
* CtxHealth — Health check implementations.
*
* Each check exposes:
* check() → Promise<CheckResult>
* heal(diagnosis) → Promise<HealResult>
*
* exec safety: every external command runs through execFile (no shell) and
* must be on the allow-list. Never pass user-supplied strings to any exec call.
*/
import { execFile as execFileCb } from 'node:child_process';
import { readFile } from 'node:fs/promises';
import { promisify } from 'node:util';
import { logger } from '../observability/logger.js';
import { resetCircuitBreaker } from '../gateway-client.js';
import type { CheckResult, HealResult, HealthCheck } from '../types.js';
const execFile = promisify(execFileCb);
/** Timeout, in milliseconds, passed to every execFile call made by this module. */
const EXEC_TIMEOUT_MS = 15 * 1000;
// ─── Allowed command allow-list ───────────────────────────────────────────────
/**
 * Absolute paths of the only binaries safeExec() is willing to run.
 * Several tools are listed under more than one path because their install
 * location differs between hosts.
 */
const ALLOWED_COMMANDS = new Set<string>([
  // network probe
  '/usr/bin/ping',
  // WireGuard CLI
  '/usr/bin/wg',
  '/usr/sbin/wg',
  // disk usage
  '/usr/bin/df',
  '/bin/df',
  // PM2 process manager
  '/usr/local/bin/pm2',
  '/usr/bin/pm2',
  // node runtime
  '/usr/local/bin/node',
  // service manager
  '/usr/bin/systemctl',
  '/bin/systemctl',
  '/usr/sbin/systemctl',
  // filesystem sync
  '/usr/bin/sync',
  '/bin/sync',
]);
/**
 * Runs an allow-listed binary through execFile (no shell) with a timeout.
 *
 * @param cmd  Absolute path of the binary; must be a member of ALLOWED_COMMANDS.
 * @param args Arguments handed to the binary; copied before being passed on.
 * @returns Captured stdout/stderr plus a success flag. A command that is not
 *          allow-listed, or whose execFile call rejects, yields
 *          `success: false` instead of a rejected promise.
 */
async function safeExec(
  cmd: string,
  args: readonly string[],
): Promise<{ stdout: string; stderr: string; success: boolean }> {
  const permitted = ALLOWED_COMMANDS.has(cmd);
  if (!permitted) {
    logger.error({ cmd }, 'Command not in allow-list — refusing to execute');
    return { stdout: '', stderr: `Command not allowed: ${cmd}`, success: false };
  }

  try {
    const result = await execFile(cmd, Array.from(args), { timeout: EXEC_TIMEOUT_MS });
    return { stdout: result.stdout, stderr: result.stderr, success: true };
  } catch (err) {
    // A rejected execFile promise may still carry whatever the child printed.
    const failure = err as { stdout?: string; stderr?: string; message?: string };
    const capturedStdout = failure.stdout ?? '';
    const capturedStderr = failure.stderr ?? failure.message ?? String(err);
    return { stdout: capturedStdout, stderr: capturedStderr, success: false };
  }
}
// ─── pm2 resolve helper (tries common locations) ─────────────────────────────
/**
 * Locates a working pm2 binary.
 *
 * Each known install path is probed with `pm2 --version`; the first path whose
 * probe succeeds is returned.
 *
 * @returns The absolute pm2 path, or null when no candidate responds.
 */
async function findPm2(): Promise<string | null> {
  const locations = ['/usr/local/bin/pm2', '/usr/bin/pm2'];
  for (const location of locations) {
    if (!ALLOWED_COMMANDS.has(location)) {
      continue;
    }
    const probe = await safeExec(location, ['--version']);
    if (probe.success) {
      return location;
    }
  }
  return null;
}
// ─── 1. PM2 processes ────────────────────────────────────────────────────────
/** PM2 process names that must report status "online" for the check to pass. */
const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];

/**
 * Verifies that every process in PM2_REQUIRED_PROCESSES is online.
 *
 * Runs `pm2 jlist`, parses its JSON output and builds a name → status map.
 * The check is unhealthy when pm2 cannot be found, `jlist` fails, its output
 * is not valid JSON, or any required process has a status other than "online"
 * (including being absent from the list).
 *
 * @returns Result with the status map in `details` whenever the list was parsed.
 */
async function checkPm2(): Promise<CheckResult> {
  const startedAt = Date.now();

  const pm2Path = await findPm2();
  if (!pm2Path) {
    return { healthy: false, message: 'pm2 binary not found on this host', latency_ms: Date.now() - startedAt };
  }

  const listing = await safeExec(pm2Path, ['jlist']);
  const latency_ms = Date.now() - startedAt;
  if (!listing.success) {
    return { healthy: false, message: 'pm2 jlist failed', latency_ms };
  }

  let processes: Array<{ name: string; pm2_env?: { status?: string } }>;
  try {
    processes = JSON.parse(listing.stdout) as typeof processes;
  } catch {
    return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
  }

  // Processes without a reported status are recorded as "unknown".
  const entries = processes.map((proc) => [proc.name, proc.pm2_env?.status ?? 'unknown']);
  const statusMap = Object.fromEntries(entries);

  const offline = PM2_REQUIRED_PROCESSES.filter((required) => statusMap[required] !== 'online');
  if (offline.length === 0) {
    return { healthy: true, message: 'All required PM2 processes are online', details: { statusMap }, latency_ms };
  }

  return {
    healthy: false,
    message: `PM2 processes not online: ${offline.join(', ')}`,
    details: { statusMap },
    latency_ms,
  };
}
/**
 * Attempts to recover PM2-managed processes by running `pm2 restart all`.
 *
 * @param diagnosis Free-text diagnosis; only its first 120 characters are logged.
 * @returns The action taken, whether the restart command succeeded, and the
 *          combined stdout/stderr of the command.
 */
async function healPm2(diagnosis: string): Promise<HealResult> {
  const pm2 = await findPm2();
  if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };

  // NOTE(review): `restart all` also restarts processes that are healthy, and
  // presumably this daemon itself if it runs under PM2 — confirm this is intended.
  const { stdout, stderr, success } = await safeExec(pm2, ['restart', 'all']);
  const output = `${stdout}\n${stderr}`.trim();

  const logFields = { diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200) };
  if (success) {
    logger.info(logFields, 'PM2 restart executed');
  } else {
    logger.error(logFields, 'PM2 restart failed');
  }

  // Report the real outcome so callers do not treat a failed restart as healed.
  return { action_taken: 'pm2 restart all', success, output };
}
// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
/**
 * Confirms PostgreSQL answers a trivial `SELECT 1`.
 *
 * The database client is imported lazily so that no pool is created at module
 * load time, before the environment is loaded.
 *
 * @returns Healthy when the query resolves; otherwise unhealthy with the error
 *          message embedded in `message`.
 */
async function checkPostgres(): Promise<CheckResult> {
  const startedAt = Date.now();
  const { query } = await import('../db/client.js');

  try {
    await query('SELECT 1');
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      healthy: false,
      message: `PostgreSQL unreachable: ${reason}`,
      latency_ms: Date.now() - startedAt,
    };
  }

  return { healthy: true, message: 'PostgreSQL is reachable', latency_ms: Date.now() - startedAt };
}
/**
 * Restarts the `postgresql` unit via systemctl.
 *
 * @param _diagnosis Unused.
 * @returns The action taken, the command's success flag and its combined output.
 */
async function healPostgres(_diagnosis: string): Promise<HealResult> {
  const restart = await safeExec('/bin/systemctl', ['restart', 'postgresql']);
  const output = `${restart.stdout}\n${restart.stderr}`.trim();
  const loggedOutput = output.slice(0, 200);
  logger.info({ output: loggedOutput }, 'PostgreSQL restart executed');
  return {
    action_taken: 'systemctl restart postgresql',
    success: restart.success,
    output,
  };
}
// ─── 3. Ollama ───────────────────────────────────────────────────────────────
/** Base URL of the Ollama instance; overridable through the OLLAMA_URL env var. */
const OLLAMA_URL = process.env['OLLAMA_URL'] ?? 'https://ollama.fichtmueller.org';

/**
 * Probes Ollama by requesting `${OLLAMA_URL}/api/tags`.
 *
 * The request is aborted after 10 seconds. A non-OK HTTP status, a network
 * error or an abort all produce an unhealthy result.
 *
 * @returns Check result including the elapsed time in `latency_ms`.
 */
async function checkOllama(): Promise<CheckResult> {
  const startedAt = Date.now();
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), 10_000);

  try {
    const requestInit = { signal: abort.signal, cache: 'no-store' } as RequestInit;
    const response = await fetch(`${OLLAMA_URL}/api/tags`, requestInit);
    const latency_ms = Date.now() - startedAt;
    return response.ok
      ? { healthy: true, message: 'Ollama is reachable', latency_ms }
      : { healthy: false, message: `Ollama returned HTTP ${response.status}`, latency_ms };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      healthy: false,
      message: `Ollama unreachable: ${reason}`,
      latency_ms: Date.now() - startedAt,
    };
  } finally {
    // Always disarm the abort timer, whichever path was taken.
    clearTimeout(deadline);
  }
}
/**
 * Asks the gateway client to reset its circuit breaker.
 *
 * @param _diagnosis Unused.
 * @returns A successful heal result once resetCircuitBreaker() has resolved;
 *          a rejection from that call propagates to the caller.
 */
async function healOllama(_diagnosis: string): Promise<HealResult> {
  await resetCircuitBreaker();
  const outcome = {
    action_taken: 'circuit-breaker reset requested via gateway',
    success: true,
  };
  return outcome;
}
// ─── 4. Cloudflare tunnel ────────────────────────────────────────────────────
/** Base URL probed at `/health`; overridable through CTX_HEALTH_GATEWAY_URL. */
const GATEWAY_PING_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';

/**
 * Checks the Cloudflare tunnel using two independent signals:
 *
 *  1. whether `pm2 jlist` shows an online process whose name contains
 *     "cloudflared", and
 *  2. whether `${GATEWAY_PING_URL}/health` answers with a status below 500
 *     within 8 seconds.
 *
 * The check is unhealthy only when BOTH signals are negative.
 *
 * @returns Check result with both signals reported in `details`.
 */
async function checkCloudflareTunnel(): Promise<CheckResult> {
  const startedAt = Date.now();

  // Signal 1: cloudflared managed by pm2 and online.
  let cloudflaredRunning = false;
  const pm2Path = await findPm2();
  if (pm2Path) {
    const listing = await safeExec(pm2Path, ['jlist']);
    try {
      const procs = JSON.parse(listing.stdout) as Array<{ name: string; pm2_env?: { status?: string } }>;
      cloudflaredRunning = procs.some(
        (proc) => proc.name.includes('cloudflared') && proc.pm2_env?.status === 'online',
      );
    } catch {
      /* unparseable jlist output leaves the flag false */
    }
  }

  // Signal 2: the gateway health endpoint responds.
  let gatewayReachable = false;
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), 8_000);
  try {
    const requestInit = { signal: abort.signal, cache: 'no-store' } as RequestInit;
    const response = await fetch(`${GATEWAY_PING_URL}/health`, requestInit);
    gatewayReachable = response.status < 500;
  } catch {
    /* a failed or aborted request leaves the flag false */
  } finally {
    clearTimeout(deadline);
  }

  const latency_ms = Date.now() - startedAt;

  if (cloudflaredRunning || gatewayReachable) {
    return {
      healthy: true,
      message: 'Cloudflare tunnel appears healthy',
      details: { cloudflaredRunning, gatewayReachable },
      latency_ms,
    };
  }

  return {
    healthy: false,
    message: 'cloudflared not running via pm2 and gateway is unreachable',
    details: { cloudflaredRunning, gatewayReachable },
    latency_ms,
  };
}
/**
 * Restarts the `cloudflared` unit via systemctl.
 *
 * @param _diagnosis Unused.
 * @returns The action taken, the command's success flag and its combined output.
 */
async function healCloudflareTunnel(_diagnosis: string): Promise<HealResult> {
  const restart = await safeExec('/bin/systemctl', ['restart', 'cloudflared']);
  const output = `${restart.stdout}\n${restart.stderr}`.trim();
  const loggedOutput = output.slice(0, 200);
  logger.info({ output: loggedOutput }, 'cloudflared restart executed');
  return {
    action_taken: 'systemctl restart cloudflared',
    success: restart.success,
    output,
  };
}
// ─── 5. Disk space ───────────────────────────────────────────────────────────
/** Root filesystem usage (percent) above which the disk check is unhealthy. */
const DISK_ALERT_PERCENT = 85;

/**
 * Extracts the used-percentage column from a single `df` data row.
 *
 * @param dataLine One row of df output (not the header).
 * @returns The integer percentage, or null when the fifth column is missing
 *          or is not of the form `<digits>%`.
 */
function parseDfUsedPercent(dataLine: string): number | null {
  const capacity = dataLine.split(/\s+/)[4];
  if (capacity === undefined) return null;
  const digits = /^(\d+)%$/.exec(capacity)?.[1];
  return digits === undefined ? null : parseInt(digits, 10);
}

/**
 * Checks usage of the root filesystem with `df -h -P /`.
 *
 * `-P` requests the POSIX output format, which keeps each filesystem on a
 * single line so the column positions are stable even for long device names.
 * Output that lacks a data row or a parseable percentage is reported as
 * unhealthy rather than being silently treated as 0% used.
 *
 * @returns Unhealthy when df fails, its output cannot be parsed, or usage is
 *          above DISK_ALERT_PERCENT; healthy otherwise.
 */
async function checkDiskSpace(): Promise<CheckResult> {
  const start = Date.now();
  const { stdout, success } = await safeExec('/bin/df', ['-h', '-P', '/']);
  const latency_ms = Date.now() - start;
  if (!success) {
    return { healthy: false, message: 'df command failed', latency_ms };
  }

  // Line 0 is the header; line 1 is the row for "/".
  const dataLine = stdout.trim().split('\n')[1] ?? '';
  const usedPct = parseDfUsedPercent(dataLine);
  if (usedPct === null) {
    return { healthy: false, message: `Could not parse disk usage: ${dataLine}`, latency_ms };
  }

  if (usedPct > DISK_ALERT_PERCENT) {
    return {
      healthy: false,
      message: `Disk usage is ${usedPct}% (threshold: ${DISK_ALERT_PERCENT}%)`,
      details: { usedPercent: usedPct, raw: dataLine },
      latency_ms,
    };
  }

  return {
    healthy: true,
    message: `Disk usage: ${usedPct}%`,
    details: { usedPercent: usedPct },
    latency_ms,
  };
}
/**
 * Records the diagnosis as an advisory; nothing is deleted automatically.
 *
 * @param diagnosis Free-text diagnosis. The first 300 characters are logged and
 *                  the first 500 are returned in `output`.
 * @returns A heal result flagged successful, carrying the truncated advisory.
 */
async function healDiskSpace(diagnosis: string): Promise<HealResult> {
  const advisoryForLog = diagnosis.slice(0, 300);
  logger.warn({ diagnosis: advisoryForLog }, 'Disk space advisory — manual action required');

  const advisoryForResult = diagnosis.slice(0, 500);
  return {
    action_taken: 'logged LLM advisory — manual cleanup required',
    success: true,
    output: advisoryForResult,
  };
}
// ─── 6. Memory ───────────────────────────────────────────────────────────────
/** Minimum available memory, in MB, for the memory check to be healthy. */
const MEMORY_FREE_MIN_MB = 500;

/**
 * Reads the `MemAvailable:` entry from /proc/meminfo and compares it, in whole
 * megabytes, against MEMORY_FREE_MIN_MB.
 *
 * When /proc/meminfo cannot be read (e.g. on a macOS dev machine) the check is
 * skipped and reported as healthy. A file that is readable but lacks a
 * parseable MemAvailable value is reported as unhealthy.
 *
 * @returns Check result; `details.availableMb` is set whenever parsing succeeded.
 */
async function checkMemory(): Promise<CheckResult> {
  const startedAt = Date.now();

  let meminfo: string;
  try {
    meminfo = await readFile('/proc/meminfo', 'utf-8');
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      healthy: true,
      message: `Memory check skipped: ${reason}`,
      latency_ms: Date.now() - startedAt,
    };
  }

  const availableLine = meminfo.split('\n').find((line) => line.startsWith('MemAvailable:')) ?? '';
  // Strip everything except digits, leaving the numeric value of the entry.
  const availableKb = parseInt(availableLine.replace(/[^0-9]/g, ''), 10);
  const availableMb = Math.floor(availableKb / 1024);
  const latency_ms = Date.now() - startedAt;

  if (Number.isNaN(availableMb)) {
    return { healthy: false, message: 'Could not parse /proc/meminfo', latency_ms };
  }

  const details = { availableMb };
  if (availableMb >= MEMORY_FREE_MIN_MB) {
    return { healthy: true, message: `Available memory: ${availableMb}MB`, details, latency_ms };
  }

  return {
    healthy: false,
    message: `Available memory is ${availableMb}MB (minimum: ${MEMORY_FREE_MIN_MB}MB)`,
    details,
    latency_ms,
  };
}
/**
 * Runs `sync`. Dropping caches is intentionally not attempted here; a warning
 * is logged noting that it needs a privileged helper.
 *
 * @param _diagnosis Unused.
 * @returns Failure with sync's stderr when the command fails; otherwise
 *          success with the first 200 characters of its stdout.
 */
async function healMemory(_diagnosis: string): Promise<HealResult> {
  const syncResult = await safeExec('/bin/sync', []);

  if (syncResult.success) {
    // Writing to /proc/sys is deliberately left to a privileged helper script.
    logger.warn('Memory healing: sync executed. Drop caches requires privileged script.');
    return {
      action_taken: 'sync executed; drop_caches requires privileged helper',
      success: true,
      output: syncResult.stdout.slice(0, 200),
    };
  }

  return {
    action_taken: 'sync failed — cannot drop caches without root',
    success: false,
    output: syncResult.stderr,
  };
}
// ─── 7. Network connectivity ─────────────────────────────────────────────────
/**
 * Tests outbound connectivity by running `ping -c 3 -W 3 1.1.1.1`.
 *
 * @returns Healthy when the ping command succeeds; otherwise unhealthy, with
 *          the first 200 characters of stdout and stderr in `details`.
 */
async function checkNetwork(): Promise<CheckResult> {
  const startedAt = Date.now();
  const ping = await safeExec('/usr/bin/ping', ['-c', '3', '-W', '3', '1.1.1.1']);
  const latency_ms = Date.now() - startedAt;

  if (ping.success) {
    return { healthy: true, message: 'Network connectivity OK (1.1.1.1 reachable)', latency_ms };
  }

  const details = {
    stdout: ping.stdout.slice(0, 200),
    stderr: ping.stderr.slice(0, 200),
  };
  return {
    healthy: false,
    message: 'Cannot ping 1.1.1.1 — network connectivity issue',
    details,
    latency_ms,
  };
}
/**
 * Network problems are not auto-healed: the diagnosis is logged at error level
 * and a failed heal result is returned.
 *
 * @param diagnosis Free-text diagnosis; only its first 300 characters are logged.
 * @returns A heal result with `success: false`.
 */
async function healNetwork(diagnosis: string): Promise<HealResult> {
  const loggedDiagnosis = diagnosis.slice(0, 300);
  logger.error(
    { diagnosis: loggedDiagnosis },
    'Network issue detected — cannot self-heal, manual intervention required',
  );

  const outcome = {
    action_taken: 'logged critical alert — network issues require manual intervention',
    success: false,
    output: 'Cannot auto-heal network connectivity issues.',
  };
  return outcome;
}
// ─── 8. WireGuard ────────────────────────────────────────────────────────────
/**
 * Locates a working `wg` binary.
 *
 * Each known install path is probed with `wg --version`, mirroring findPm2().
 * Membership in the allow-list alone says nothing about whether the binary
 * exists on this host, so without the probe the first candidate would always
 * be returned and the remaining paths (and the null result) never reached.
 *
 * @returns The absolute wg path, or null when no candidate responds.
 */
async function findWg(): Promise<string | null> {
  const candidates = ['/usr/bin/wg', '/usr/sbin/wg'];
  for (const p of candidates) {
    if (!ALLOWED_COMMANDS.has(p)) continue;
    const { success } = await safeExec(p, ['--version']);
    if (success) return p;
  }
  return null;
}
/**
 * Checks WireGuard by running `wg show` and looking for the text
 * "latest handshake" in its output.
 *
 * When findWg() yields no binary the check is skipped and reported healthy.
 * Note that only the presence of the text is tested; how long ago the
 * handshake happened is not evaluated.
 *
 * @returns Unhealthy when `wg show` fails or its output lacks the handshake
 *          text (first 300 characters of output in `details`).
 */
async function checkWireGuard(): Promise<CheckResult> {
  const startedAt = Date.now();

  const wgPath = await findWg();
  if (!wgPath) {
    return {
      healthy: true,
      message: 'wg binary not found — skipping WireGuard check',
      latency_ms: Date.now() - startedAt,
    };
  }

  const show = await safeExec(wgPath, ['show']);
  const latency_ms = Date.now() - startedAt;
  if (!show.success) {
    return { healthy: false, message: 'wg show failed — WireGuard may not be running', latency_ms };
  }

  if (show.stdout.includes('latest handshake')) {
    return { healthy: true, message: 'WireGuard peers active', latency_ms };
  }

  return {
    healthy: false,
    message: 'WireGuard: no active peers with recent handshake detected',
    details: { output: show.stdout.slice(0, 300) },
    latency_ms,
  };
}
/**
 * Restarts the `wg-quick@wg0` unit via systemctl.
 *
 * @param _diagnosis Unused.
 * @returns The action taken, the command's success flag and its combined output.
 */
async function healWireGuard(_diagnosis: string): Promise<HealResult> {
  const restart = await safeExec('/bin/systemctl', ['restart', 'wg-quick@wg0']);
  const output = `${restart.stdout}\n${restart.stderr}`.trim();
  const loggedOutput = output.slice(0, 200);
  logger.info({ output: loggedOutput }, 'WireGuard restart executed');
  return {
    action_taken: 'systemctl restart wg-quick@wg0',
    success: restart.success,
    output,
  };
}
// ─── Exported check list ──────────────────────────────────────────────────────
/**
 * Every health check defined in this module, each paired with its heal
 * routine, in the order the checks are declared above.
 */
export const healthChecks: HealthCheck[] = [
  {
    name: 'pm2-processes',
    category: 'process',
    check: checkPm2,
    heal: healPm2,
  },
  {
    name: 'postgresql',
    category: 'database',
    check: checkPostgres,
    heal: healPostgres,
  },
  {
    name: 'ollama',
    category: 'service',
    check: checkOllama,
    heal: healOllama,
  },
  {
    name: 'cloudflare-tunnel',
    category: 'tunnel',
    check: checkCloudflareTunnel,
    heal: healCloudflareTunnel,
  },
  {
    name: 'disk-space',
    category: 'service',
    check: checkDiskSpace,
    heal: healDiskSpace,
  },
  {
    name: 'memory',
    category: 'service',
    check: checkMemory,
    heal: healMemory,
  },
  {
    name: 'network',
    category: 'network',
    check: checkNetwork,
    heal: healNetwork,
  },
  {
    name: 'wireguard',
    category: 'network',
    check: checkWireGuard,
    heal: healWireGuard,
  },
];