New package @llm-gateway/ctx-health (packages/ctx-health/) — a TypeScript infrastructure monitoring and auto-healing daemon. Monitors 8 subsystems every 60s (PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory, network, WireGuard), gets AI-powered root cause analysis via the gateway (ctxhealer caller / ctx_health_diagnose task_type), executes healing actions with cooldown (5min) and escalation guards (3+ failures → human escalation), persists all incidents to ctx_health_incidents and ctx_health_status tables. Dry-run mode via CTX_HEALTH_DRY_RUN=true. Runs as ctx-health PM2 process on Erik server.
416 lines
16 KiB
TypeScript
416 lines
16 KiB
TypeScript
/**
 * CtxHealth — Health check implementations.
 *
 * Each check exposes:
 *   check() → CheckResult
 *   heal()  → HealResult
 *
 * exec safety: all shell commands use execFile with an allow-list.
 * Never pass user-supplied strings to any exec call.
 */
|
|
|
|
import { execFile as execFileCb } from 'node:child_process';
|
|
import { readFile } from 'node:fs/promises';
|
|
import { promisify } from 'node:util';
|
|
import { logger } from '../observability/logger.js';
|
|
import { resetCircuitBreaker } from '../gateway-client.js';
|
|
import type { CheckResult, HealResult, HealthCheck } from '../types.js';
|
|
|
|
// Promise-returning form of child_process.execFile (imported above as execFileCb).
const execFile = promisify(execFileCb);
|
|
// Value, in milliseconds, handed to execFile as its `timeout` option.
const EXEC_TIMEOUT_MS = 15_000;
|
|
|
|
// ─── Allowed command allow-list ───────────────────────────────────────────────
|
|
|
|
/**
 * Absolute binary paths that may be executed. Membership is an exact string
 * match on the full path, so several install locations of the same tool are
 * listed individually.
 */
const ALLOWED_COMMANDS = new Set([
  // connectivity probe
  '/usr/bin/ping',
  // WireGuard CLI
  '/usr/bin/wg', '/usr/sbin/wg',
  // disk usage
  '/usr/bin/df', '/bin/df',
  // process manager
  '/usr/local/bin/pm2', '/usr/bin/pm2',
  // node runtime
  '/usr/local/bin/node',
  // service manager
  '/usr/bin/systemctl', '/bin/systemctl', '/usr/sbin/systemctl',
  // flush filesystem buffers
  '/usr/bin/sync', '/bin/sync',
]);
|
|
|
|
/**
 * Runs an allow-listed binary via execFile (no shell) and never throws.
 *
 * @param cmd  Absolute path of the binary; must be a member of ALLOWED_COMMANDS.
 * @param args Argument vector handed to the binary.
 * @returns Captured stdout/stderr plus `success`, which is false when the
 *          command is not allow-listed or when execFile rejects.
 */
async function safeExec(
  cmd: string,
  args: readonly string[],
): Promise<{ stdout: string; stderr: string; success: boolean }> {
  const permitted = ALLOWED_COMMANDS.has(cmd);
  if (!permitted) {
    logger.error({ cmd }, 'Command not in allow-list — refusing to execute');
    return { stdout: '', stderr: `Command not allowed: ${cmd}`, success: false };
  }

  let captured: { stdout: string; stderr: string };
  try {
    // execFile wants a mutable array, so hand it a copy of the readonly args.
    captured = await execFile(cmd, Array.from(args), { timeout: EXEC_TIMEOUT_MS });
  } catch (err) {
    // A rejected execFile may still carry whatever output was captured.
    const failure = err as { stdout?: string; stderr?: string; message?: string };
    const stdout = failure.stdout ?? '';
    const stderr = failure.stderr ?? failure.message ?? String(err);
    return { stdout, stderr, success: false };
  }

  return { stdout: captured.stdout, stderr: captured.stderr, success: true };
}
|
|
|
|
// ─── pm2 resolve helper (tries common locations) ─────────────────────────────
|
|
|
|
/**
 * Locates a working pm2 binary by running `<path> --version` on each known
 * install location, in order.
 *
 * @returns The first path whose probe succeeds, or null if none does.
 */
async function findPm2(): Promise<string | null> {
  for (const candidate of ['/usr/local/bin/pm2', '/usr/bin/pm2']) {
    if (!ALLOWED_COMMANDS.has(candidate)) continue;

    const probe = await safeExec(candidate, ['--version']);
    if (probe.success) {
      return candidate;
    }
  }
  return null;
}
|
|
|
|
// ─── 1. PM2 processes ────────────────────────────────────────────────────────
|
|
|
|
// PM2 process names whose status must be 'online' for the pm2 check to pass.
const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];
|
|
|
|
/**
 * Verifies that every process in PM2_REQUIRED_PROCESSES is reported as
 * 'online' by `pm2 jlist`.
 *
 * @returns An unhealthy result when pm2 is missing, `jlist` fails, its output
 *          is not a JSON array, or any required process is not online;
 *          otherwise a healthy result. `details.statusMap` maps each listed
 *          process name to its status ('unknown' when pm2 reports none).
 */
async function checkPm2(): Promise<CheckResult> {
  const start = Date.now();
  const pm2 = await findPm2();
  if (!pm2) {
    return { healthy: false, message: 'pm2 binary not found on this host', latency_ms: Date.now() - start };
  }

  const { stdout, success } = await safeExec(pm2, ['jlist']);
  const latency_ms = Date.now() - start;

  if (!success) {
    return { healthy: false, message: 'pm2 jlist failed', latency_ms };
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch {
    return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
  }

  // Valid JSON that is not an array (null, an object, a number, …) is just as
  // unusable as malformed JSON; report it instead of throwing on `.map`.
  if (!Array.isArray(parsed)) {
    return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
  }

  const entries: Array<[string, string]> = [];
  for (const item of parsed as unknown[]) {
    // Skip anything that is not a process record with a string name.
    if (typeof item !== 'object' || item === null) continue;
    const { name, pm2_env } = item as { name?: unknown; pm2_env?: { status?: unknown } | null };
    if (typeof name !== 'string') continue;
    const status = pm2_env?.status;
    entries.push([name, typeof status === 'string' ? status : 'unknown']);
  }
  const statusMap = Object.fromEntries(entries);

  // A required process that is absent from the list also counts as not online.
  const offline = PM2_REQUIRED_PROCESSES.filter((name) => statusMap[name] !== 'online');

  if (offline.length > 0) {
    return {
      healthy: false,
      message: `PM2 processes not online: ${offline.join(', ')}`,
      details: { statusMap },
      latency_ms,
    };
  }

  return { healthy: true, message: 'All required PM2 processes are online', details: { statusMap }, latency_ms };
}
|
|
|
|
/**
 * Attempts to recover PM2-managed processes by running `pm2 restart all`.
 *
 * @param diagnosis Free-text diagnosis; only its first 120 characters are logged.
 * @returns `success` mirrors whether the restart command itself succeeded, so a
 *          failed restart is no longer reported as a successful heal.
 *          `output` is the combined, trimmed stdout/stderr of the command.
 *
 * NOTE(review): `restart all` targets every process PM2 manages, which
 * presumably includes this daemon when it runs under PM2 — confirm that
 * restarting itself mid-heal is acceptable, or restrict the targets.
 */
async function healPm2(diagnosis: string): Promise<HealResult> {
  const pm2 = await findPm2();
  if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };

  const { stdout, stderr, success } = await safeExec(pm2, ['restart', 'all']);
  const output = `${stdout}\n${stderr}`.trim();
  logger.info(
    { diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200), success },
    'PM2 restart executed',
  );
  // Propagate the real outcome, consistent with the other systemctl-based healers.
  return { action_taken: 'pm2 restart all', success, output };
}
|
|
|
|
// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Probes PostgreSQL by running `SELECT 1` through the project's db client.
 *
 * @returns Healthy when the query resolves; unhealthy with the error message
 *          when it rejects. Latency covers the module import and the query.
 */
async function checkPostgres(): Promise<CheckResult> {
  const start = Date.now();
  // Imported lazily so no pool is created at module load, before env is loaded.
  const { query } = await import('../db/client.js');

  try {
    await query('SELECT 1');
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      healthy: false,
      message: `PostgreSQL unreachable: ${reason}`,
      latency_ms: Date.now() - start,
    };
  }

  return { healthy: true, message: 'PostgreSQL is reachable', latency_ms: Date.now() - start };
}
|
|
|
|
/**
 * Restarts the `postgresql` unit via `/bin/systemctl`.
 *
 * @param _diagnosis Unused.
 * @returns The command outcome and its combined, trimmed stdout/stderr.
 */
async function healPostgres(_diagnosis: string): Promise<HealResult> {
  const restart = await safeExec('/bin/systemctl', ['restart', 'postgresql']);
  const output = `${restart.stdout}\n${restart.stderr}`.trim();

  logger.info({ output: output.slice(0, 200) }, 'PostgreSQL restart executed');

  return {
    action_taken: 'systemctl restart postgresql',
    success: restart.success,
    output,
  };
}
|
|
|
|
// ─── 3. Ollama ───────────────────────────────────────────────────────────────
|
|
|
|
// Base URL of the Ollama endpoint; the OLLAMA_URL environment variable overrides the default.
const OLLAMA_URL = process.env['OLLAMA_URL'] ?? 'https://ollama.fichtmueller.org';
|
|
|
|
/**
 * Probes Ollama with a GET to `${OLLAMA_URL}/api/tags`, aborted after 10 seconds.
 *
 * @returns Healthy on a 2xx response; unhealthy on any other status or when
 *          the request throws (network error or abort).
 */
async function checkOllama(): Promise<CheckResult> {
  const start = Date.now();
  const abort = new AbortController();
  const timeoutHandle = setTimeout(() => abort.abort(), 10_000);
  const requestInit = { signal: abort.signal, cache: 'no-store' } as RequestInit;

  try {
    const response = await fetch(`${OLLAMA_URL}/api/tags`, requestInit);
    const latency_ms = Date.now() - start;

    return response.ok
      ? { healthy: true, message: 'Ollama is reachable', latency_ms }
      : { healthy: false, message: `Ollama returned HTTP ${response.status}`, latency_ms };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      healthy: false,
      message: `Ollama unreachable: ${reason}`,
      latency_ms: Date.now() - start,
    };
  } finally {
    // Always disarm the abort timer, whichever path returned.
    clearTimeout(timeoutHandle);
  }
}
|
|
|
|
/**
 * Heals Ollama by awaiting resetCircuitBreaker() from the gateway client.
 *
 * @param _diagnosis Unused.
 * @returns A successful HealResult once the reset call resolves; a rejection
 *          from the reset call propagates to the caller.
 */
async function healOllama(_diagnosis: string): Promise<HealResult> {
  await resetCircuitBreaker();

  const outcome: HealResult = {
    action_taken: 'circuit-breaker reset requested via gateway',
    success: true,
  };
  return outcome;
}
|
|
|
|
// ─── 4. Cloudflare tunnel ────────────────────────────────────────────────────
|
|
|
|
// Base URL whose /health endpoint is fetched by the tunnel check; CTX_HEALTH_GATEWAY_URL overrides the default.
const GATEWAY_PING_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';
|
|
|
|
/**
 * Checks the Cloudflare tunnel using two independent signals:
 *   1. a pm2 process whose name contains 'cloudflared' is 'online';
 *   2. `${GATEWAY_PING_URL}/health` answers with a status below 500 within 8s.
 *
 * @returns Unhealthy only when BOTH signals are negative; both flags are
 *          always reported in `details`.
 */
async function checkCloudflareTunnel(): Promise<CheckResult> {
  const start = Date.now();

  // Signal 1: cloudflared listed as online by pm2 (stays false without pm2).
  let cloudflaredRunning = false;
  const pm2 = await findPm2();
  if (pm2) {
    const listing = await safeExec(pm2, ['jlist']);
    try {
      const procs = JSON.parse(listing.stdout) as Array<{ name: string; pm2_env?: { status?: string } }>;
      cloudflaredRunning = procs.some(
        ({ name, pm2_env }) => name.includes('cloudflared') && pm2_env?.status === 'online',
      );
    } catch { /* ignore parse errors */ }
  }

  // Signal 2: the gateway health endpoint responds without a 5xx.
  let gatewayReachable = false;
  const abort = new AbortController();
  const timeoutHandle = setTimeout(() => abort.abort(), 8_000);
  try {
    const response = await fetch(
      `${GATEWAY_PING_URL}/health`,
      { signal: abort.signal, cache: 'no-store' } as RequestInit,
    );
    gatewayReachable = response.status < 500;
  } catch { /* expected if tunnel is down */ } finally {
    clearTimeout(timeoutHandle);
  }

  const latency_ms = Date.now() - start;
  const details = { cloudflaredRunning, gatewayReachable };
  const healthy = cloudflaredRunning || gatewayReachable;

  return {
    healthy,
    message: healthy
      ? 'Cloudflare tunnel appears healthy'
      : 'cloudflared not running via pm2 and gateway is unreachable',
    details,
    latency_ms,
  };
}
|
|
|
|
/**
 * Restarts the `cloudflared` unit via `/bin/systemctl`.
 *
 * @param _diagnosis Unused.
 * @returns The command outcome and its combined, trimmed stdout/stderr.
 */
async function healCloudflareTunnel(_diagnosis: string): Promise<HealResult> {
  const restart = await safeExec('/bin/systemctl', ['restart', 'cloudflared']);
  const output = `${restart.stdout}\n${restart.stderr}`.trim();

  logger.info({ output: output.slice(0, 200) }, 'cloudflared restart executed');

  return {
    action_taken: 'systemctl restart cloudflared',
    success: restart.success,
    output,
  };
}
|
|
|
|
// ─── 5. Disk space ───────────────────────────────────────────────────────────
|
|
|
|
// Root filesystem usage (percent) above which the disk check reports unhealthy.
const DISK_ALERT_PERCENT = 85;
|
|
|
|
/**
 * Checks root filesystem usage by parsing the use-percentage out of `df -h /`.
 *
 * @returns Unhealthy when df fails, when no `NN%` field can be found in its
 *          output, or when usage exceeds DISK_ALERT_PERCENT; healthy otherwise.
 *          Previously a missing or short data row silently parsed as 0% and was
 *          reported as healthy.
 */
async function checkDiskSpace(): Promise<CheckResult> {
  const start = Date.now();
  const { stdout, success } = await safeExec('/bin/df', ['-h', '/']);
  const latency_ms = Date.now() - start;

  if (!success) {
    return { healthy: false, message: 'df command failed', latency_ms };
  }

  // Everything after the header row, folded onto one line so a data row that
  // was split across lines is still parsed as a single record.
  const dataLine = stdout.trim().split('\n').slice(1).join(' ').trim();

  // Locate the percentage column by shape (a standalone `NN%` token) rather
  // than by position, and do NOT default to 0 when it is absent.
  const pctMatch = /(?:^|\s)(\d+)%(?=\s|$)/.exec(dataLine);
  const usedPct = pctMatch ? parseInt(pctMatch[1] ?? '', 10) : Number.NaN;

  if (Number.isNaN(usedPct)) {
    return { healthy: false, message: `Could not parse disk usage: ${dataLine}`, latency_ms };
  }

  if (usedPct > DISK_ALERT_PERCENT) {
    return {
      healthy: false,
      message: `Disk usage is ${usedPct}% (threshold: ${DISK_ALERT_PERCENT}%)`,
      details: { usedPercent: usedPct, raw: dataLine },
      latency_ms,
    };
  }

  return {
    healthy: true,
    message: `Disk usage: ${usedPct}%`,
    details: { usedPercent: usedPct },
    latency_ms,
  };
}
|
|
|
|
/**
 * Disk "healing" is advisory only: nothing is deleted. The diagnosis is logged
 * as a warning (first 300 chars) and echoed back in `output` (first 500 chars).
 *
 * @param diagnosis Free-text diagnosis to surface to operators.
 * @returns Always `success: true`.
 */
async function healDiskSpace(diagnosis: string): Promise<HealResult> {
  const logExcerpt = diagnosis.slice(0, 300);
  // Never auto-delete — a human has to act on the advisory.
  logger.warn({ diagnosis: logExcerpt }, 'Disk space advisory — manual action required');

  const outcome: HealResult = {
    action_taken: 'logged LLM advisory — manual cleanup required',
    success: true,
    output: diagnosis.slice(0, 500),
  };
  return outcome;
}
|
|
|
|
// ─── 6. Memory ───────────────────────────────────────────────────────────────
|
|
|
|
// Minimum MemAvailable (in MB) below which the memory check reports unhealthy.
const MEMORY_FREE_MIN_MB = 500;
|
|
|
|
/**
 * Checks available memory using the `MemAvailable:` line of /proc/meminfo.
 *
 * @returns Healthy (check skipped) when /proc/meminfo cannot be read;
 *          unhealthy when the value cannot be parsed or is below
 *          MEMORY_FREE_MIN_MB; healthy otherwise.
 */
async function checkMemory(): Promise<CheckResult> {
  const start = Date.now();

  let meminfo: string;
  try {
    meminfo = await readFile('/proc/meminfo', 'utf-8');
  } catch (err) {
    // /proc/meminfo not available (e.g. macOS dev environment)
    const reason = err instanceof Error ? err.message : String(err);
    return {
      healthy: true,
      message: `Memory check skipped: ${reason}`,
      latency_ms: Date.now() - start,
    };
  }

  const availableLine = meminfo.split('\n').find((line) => line.startsWith('MemAvailable:')) ?? '';
  // Keep only the digits of the line; an absent line yields '' and therefore NaN.
  const availableKb = parseInt(availableLine.replace(/[^0-9]/g, ''), 10);
  const availableMb = Math.floor(availableKb / 1024);
  const latency_ms = Date.now() - start;

  if (isNaN(availableMb)) {
    return { healthy: false, message: 'Could not parse /proc/meminfo', latency_ms };
  }

  const details = { availableMb };
  if (availableMb < MEMORY_FREE_MIN_MB) {
    return {
      healthy: false,
      message: `Available memory is ${availableMb}MB (minimum: ${MEMORY_FREE_MIN_MB}MB)`,
      details,
      latency_ms,
    };
  }

  return { healthy: true, message: `Available memory: ${availableMb}MB`, details, latency_ms };
}
|
|
|
|
/**
 * Memory "healing" only runs `/bin/sync`; page caches are not dropped here.
 *
 * @param _diagnosis Unused.
 * @returns Failure (with stderr) when sync fails; otherwise success with the
 *          first 200 characters of sync's stdout.
 */
async function healMemory(_diagnosis: string): Promise<HealResult> {
  const syncResult = await safeExec('/bin/sync', []);

  if (syncResult.success) {
    // Writing to /proc/sys is deliberately not attempted from this process;
    // a privileged helper script should handle it in production.
    logger.warn('Memory healing: sync executed. Drop caches requires privileged script.');
    return {
      action_taken: 'sync executed; drop_caches requires privileged helper',
      success: true,
      output: syncResult.stdout.slice(0, 200),
    };
  }

  return {
    action_taken: 'sync failed — cannot drop caches without root',
    success: false,
    output: syncResult.stderr,
  };
}
|
|
|
|
// ─── 7. Network connectivity ─────────────────────────────────────────────────
|
|
|
|
/**
 * Checks outbound connectivity by running `ping -c 3 -W 3 1.1.1.1`.
 *
 * @returns Healthy when ping succeeds; otherwise unhealthy with the first 200
 *          characters of ping's stdout and stderr in `details`.
 */
async function checkNetwork(): Promise<CheckResult> {
  const start = Date.now();
  const ping = await safeExec('/usr/bin/ping', ['-c', '3', '-W', '3', '1.1.1.1']);
  const latency_ms = Date.now() - start;

  if (ping.success) {
    return { healthy: true, message: 'Network connectivity OK (1.1.1.1 reachable)', latency_ms };
  }

  return {
    healthy: false,
    message: 'Cannot ping 1.1.1.1 — network connectivity issue',
    details: {
      stdout: ping.stdout.slice(0, 200),
      stderr: ping.stderr.slice(0, 200),
    },
    latency_ms,
  };
}
|
|
|
|
/**
 * Network problems are never auto-healed: the diagnosis (first 300 chars) is
 * logged at error level and a failing HealResult is returned.
 *
 * @param diagnosis Free-text diagnosis to surface to operators.
 * @returns Always `success: false`.
 */
async function healNetwork(diagnosis: string): Promise<HealResult> {
  const logExcerpt = diagnosis.slice(0, 300);
  logger.error(
    { diagnosis: logExcerpt },
    'Network issue detected — cannot self-heal, manual intervention required',
  );

  const outcome: HealResult = {
    action_taken: 'logged critical alert — network issues require manual intervention',
    success: false,
    output: 'Cannot auto-heal network connectivity issues.',
  };
  return outcome;
}
|
|
|
|
// ─── 8. WireGuard ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Locates a working `wg` binary by running `<path> --version` on each known
 * install location, in order — the same probing strategy findPm2 uses.
 *
 * Previously the first candidate was returned unconditionally (both are in the
 * allow-list), so `/usr/sbin/wg` was never tried and a null result was
 * impossible.
 *
 * @returns The first path whose probe succeeds, or null if none does.
 */
async function findWg(): Promise<string | null> {
  const candidates = ['/usr/bin/wg', '/usr/sbin/wg'];
  for (const p of candidates) {
    if (!ALLOWED_COMMANDS.has(p)) continue;
    // The version probe only confirms that the binary exists and can be executed.
    const { success } = await safeExec(p, ['--version']);
    if (success) return p;
  }
  return null;
}
|
|
|
|
/**
 * Checks WireGuard by running `wg show` and looking for the text
 * 'latest handshake' in its output.
 *
 * @returns Healthy (check skipped) when no wg binary is found; unhealthy when
 *          `wg show` fails or its output contains no 'latest handshake';
 *          healthy otherwise.
 */
async function checkWireGuard(): Promise<CheckResult> {
  const start = Date.now();

  const wgPath = await findWg();
  if (wgPath === null) {
    return { healthy: true, message: 'wg binary not found — skipping WireGuard check', latency_ms: Date.now() - start };
  }

  const shown = await safeExec(wgPath, ['show']);
  const latency_ms = Date.now() - start;

  if (!shown.success) {
    return { healthy: false, message: 'wg show failed — WireGuard may not be running', latency_ms };
  }

  // The substring appears in the output only when a peer reports a handshake.
  if (shown.stdout.includes('latest handshake')) {
    return { healthy: true, message: 'WireGuard peers active', latency_ms };
  }

  return {
    healthy: false,
    message: 'WireGuard: no active peers with recent handshake detected',
    details: { output: shown.stdout.slice(0, 300) },
    latency_ms,
  };
}
|
|
|
|
/**
 * Restarts the `wg-quick@wg0` unit via `/bin/systemctl`.
 *
 * @param _diagnosis Unused.
 * @returns The command outcome and its combined, trimmed stdout/stderr.
 */
async function healWireGuard(_diagnosis: string): Promise<HealResult> {
  const restart = await safeExec('/bin/systemctl', ['restart', 'wg-quick@wg0']);
  const output = `${restart.stdout}\n${restart.stderr}`.trim();

  logger.info({ output: output.slice(0, 200) }, 'WireGuard restart executed');

  return {
    action_taken: 'systemctl restart wg-quick@wg0',
    success: restart.success,
    output,
  };
}
|
|
|
|
// ─── Exported check list ──────────────────────────────────────────────────────
|
|
|
|
/**
 * Registry of all health checks, in the order they are listed here. Each entry
 * pairs a `check` probe with the `heal` action for the same subsystem.
 */
export const healthChecks: HealthCheck[] = [
  {
    name: 'pm2-processes',
    category: 'process',
    check: checkPm2,
    heal: healPm2,
  },
  {
    name: 'postgresql',
    category: 'database',
    check: checkPostgres,
    heal: healPostgres,
  },
  {
    name: 'ollama',
    category: 'service',
    check: checkOllama,
    heal: healOllama,
  },
  {
    name: 'cloudflare-tunnel',
    category: 'tunnel',
    check: checkCloudflareTunnel,
    heal: healCloudflareTunnel,
  },
  {
    name: 'disk-space',
    category: 'service',
    check: checkDiskSpace,
    heal: healDiskSpace,
  },
  {
    name: 'memory',
    category: 'service',
    check: checkMemory,
    heal: healMemory,
  },
  {
    name: 'network',
    category: 'network',
    check: checkNetwork,
    heal: healNetwork,
  },
  {
    name: 'wireguard',
    category: 'network',
    check: checkWireGuard,
    heal: healWireGuard,
  },
];
|