feat(ctx-health): add proxmox-pvestatd + opnsense-disk health checks

- Add SSH-based health check for pvestatd D-state detection on Proxmox host
  (heal via cgroup move + lock file removal + reset-failed)
- Add SSH-based disk check for OPNsense VM (threshold 75%, auto-cleanup)
- knowledge/fixes.json: add 48 training fixes including post-reboot DNS
  recovery (fix-046), cloudflared DNS-wait boot fix (fix-047), and
  vzdump load-crash scenario with recovery steps (fix-048)
This commit is contained in:
Rene Fichtmueller 2026-04-13 05:42:24 +02:00
parent b4593b6582
commit c50af63389
2 changed files with 1289 additions and 4 deletions

1014
knowledge/fixes.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -35,6 +35,7 @@ const ALLOWED_COMMANDS = new Set([
'/usr/sbin/systemctl',
'/usr/bin/sync',
'/bin/sync',
'/usr/bin/ssh',
]);
async function safeExec(
@ -69,7 +70,13 @@ async function findPm2(): Promise<string | null> {
// ─── 1. PM2 processes ────────────────────────────────────────────────────────
/**
 * PM2 process names the PM2 heal step looks up in `pm2 jlist`; any of them
 * that is missing or not `online` is restarted individually.
 *
 * Declared exactly once — a second `const` with the same name is a
 * redeclaration error.
 */
const PM2_REQUIRED_PROCESSES = [
  'llm-gateway', 'llm-learning',
  'magatama', 'magatama-dashboard',
  'tip-api', 'tip-scraper-daemon',
  'peercortex', 'eo-global-pulse',
  'ghost-blog', 'nognet',
];
async function checkPm2(): Promise<CheckResult> {
const start = Date.now();
@ -114,10 +121,40 @@ async function healPm2(diagnosis: string): Promise<HealResult> {
const pm2 = await findPm2();
if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };
const { stdout, stderr } = await safeExec(pm2, ['restart', 'all']);
// Only restart processes that are actually offline — avoid pm2 restart all
const { stdout: jlist } = await safeExec(pm2, ['jlist']);
let processes: Array<{ name: string; pm2_env?: { status?: string } }> = [];
try { processes = JSON.parse(jlist) as typeof processes; } catch { /* ignore */ }
const offline = PM2_REQUIRED_PROCESSES.filter((name) => {
const proc = processes.find((p) => p.name === name);
return !proc || proc.pm2_env?.status !== 'online';
});
if (offline.length === 0) {
return { action_taken: 'no offline processes found — skipping restart', success: true };
}
const results: string[] = [];
for (const name of offline) {
const { stdout, stderr, success } = await safeExec(pm2, ['restart', name]);
results.push(`${name}: ${success ? 'restarted' : stderr.slice(0, 80)}`);
}
const output = results.join('; ');
logger.info({ diagnosis: diagnosis.slice(0, 120), output, offline }, 'PM2 targeted restart executed');
return { action_taken: `pm2 restart ${offline.join(', ')}`, success: true, output };
}
// ─── pm2-aware targeted restart (shared helper) ───────────────────────────────
/**
 * Restart a single PM2-managed process by name.
 *
 * @param name PM2 process name, passed verbatim to `pm2 restart <name>`.
 * @returns A heal result whose `success` mirrors the outcome reported by
 *   `safeExec`, with stdout and stderr combined in `output`. When no pm2
 *   binary can be located, returns `success: false` without running anything.
 */
async function restartProcess(name: string): Promise<HealResult> {
  const pm2 = await findPm2();
  if (!pm2) return { action_taken: 'pm2 not found', success: false };

  const { stdout, stderr, success } = await safeExec(pm2, ['restart', name]);
  const output = `${stdout}\n${stderr}`.trim();

  // Log only a bounded slice of the output; the full text is still returned.
  logger.info({ name, success, output: output.slice(0, 200) }, 'PM2 targeted process restart');
  return { action_taken: `pm2 restart ${name}`, success, output };
}
// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
@ -401,6 +438,237 @@ async function healWireGuard(_diagnosis: string): Promise<HealResult> {
return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
}
// ─── 9. Service port health checks ───────────────────────────────────────────
// For each critical service, performs a real HTTP check on the local port.
// This catches crash-loops where PM2 shows "online" but the port is not responding.
/**
 * One entry in the service-port health check table: which local port to
 * probe over HTTP and which PM2 process to restart when the probe fails.
 */
interface ServiceDef {
/** PM2 process name; handed to `restartProcess` when the probe fails. */
name: string;
/** Local port the probe connects to (`http://localhost:<port>`). */
port: number;
/** Request path appended to the probe URL (health endpoint path). */
path: string;
/**
 * Accepted HTTP status codes. When omitted or empty, any status below 500
 * is treated as healthy.
 */
okStatus?: number[];
}
/** Compact `[pm2 name, port, health path]` rows backing {@link SERVICES}. */
const SERVICE_ROWS: ReadonlyArray<readonly [string, number, string]> = [
  ['magatama', 3210, '/'],
  ['magatama-dashboard', 3211, '/'],
  ['magatama-admin', 3212, '/'],
  ['tip-api', 3201, '/api/health'],
  ['peercortex', 3101, '/'],
  ['llm-gateway', 3103, '/health'],
  ['eo-global-pulse', 3000, '/'],
  ['nognet', 3001, '/'],
  ['ghost-blog', 2368, '/'],
  ['switchblade', 3334, '/'],
];

/**
 * Services probed by the service-port check, in probe order. None of them
 * sets `okStatus`, so each uses the default "status below 500" rule.
 */
const SERVICES: ServiceDef[] = SERVICE_ROWS.map(([name, port, path]) => ({ name, port, path }));
/**
 * Perform one HTTP GET against a service's local port and classify the result.
 *
 * The request is aborted after 8 seconds, does not follow redirects
 * (`redirect: 'manual'`) and bypasses HTTP caches.
 *
 * @param service Service definition supplying port, path and optional
 *   `okStatus` whitelist.
 * @returns `{ ok, status }` when a response arrived, or `{ ok: false, error }`
 *   when the request failed or timed out. With a non-empty `okStatus` the
 *   status must be listed there; otherwise any status below 500 is OK.
 */
async function probePort(service: ServiceDef): Promise<{ ok: boolean; status?: number; error?: string }> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 8_000);
  try {
    const res = await fetch(`http://localhost:${service.port}${service.path}`, {
      signal: controller.signal,
      cache: 'no-store',
      redirect: 'manual',
    } as RequestInit);

    // Only the status line matters. Cancel the unread body so the underlying
    // connection is released right away instead of lingering until GC.
    // Fire-and-forget: a failed cancel must not turn a good probe into a failure.
    void res.body?.cancel().catch(() => undefined);

    const acceptedCodes = service.okStatus ?? [];
    const ok = acceptedCodes.length > 0
      ? acceptedCodes.includes(res.status)
      : res.status < 500; // 3xx is already covered by "< 500"
    return { ok, status: res.status };
  } catch (err) {
    return { ok: false, error: err instanceof Error ? err.message : String(err) };
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Probe every entry of `SERVICES` concurrently and summarise the outcome.
 *
 * @returns Healthy when every probe reports `ok`. Otherwise unhealthy, with
 *   the failing service names in the message and, per failing service, its
 *   HTTP status, error text or `'no response'` in `details`. `latency_ms`
 *   covers the whole concurrent probe round.
 */
async function checkServicePorts(): Promise<CheckResult> {
  const startedAt = Date.now();
  const outcomes = await Promise.all(
    SERVICES.map(async (svc) => {
      const probe = await probePort(svc);
      return { svc, probe };
    }),
  );
  const latency_ms = Date.now() - startedAt;

  // Collect failing services in table order.
  const downNames: string[] = [];
  const details: Record<string, string | number> = {};
  for (const { svc, probe } of outcomes) {
    if (probe.ok) continue;
    downNames.push(svc.name);
    details[svc.name] = probe.status ?? probe.error ?? 'no response';
  }

  if (downNames.length > 0) {
    return {
      healthy: false,
      message: `${downNames.length} service(s) not responding: ${downNames.join(', ')}`,
      details,
      latency_ms,
    };
  }
  return { healthy: true, message: `All ${SERVICES.length} service ports are responding`, latency_ms };
}
/**
 * Restart, via PM2, every service whose port probe is still failing.
 *
 * All services are probed again first, so anything that recovered since the
 * check ran is left alone. Restarts run one at a time.
 *
 * @param diagnosis Free-text diagnosis; only a truncated copy is logged.
 * @returns `success` is true only when every attempted restart succeeded
 *   (or nothing needed restarting). `output` lists the per-service outcome.
 */
async function healServicePorts(diagnosis: string): Promise<HealResult> {
  // Re-probe to find which services are currently failing
  const results = await Promise.all(SERVICES.map(async (svc) => ({ svc, probe: await probePort(svc) })));
  const failing = results.filter((r) => !r.probe.ok);
  if (failing.length === 0) {
    return { action_taken: 'services recovered on re-check — no restart needed', success: true };
  }

  const healResults: string[] = [];
  let allRestarted = true;
  for (const { svc } of failing) {
    const result = await restartProcess(svc.name);
    if (!result.success) allRestarted = false;
    const outcome = result.success ? 'restarted' : (result.output?.slice(0, 80) ?? 'failed');
    healResults.push(`${svc.name}: ${outcome}`);
    logger.info({ service: svc.name, diagnosis: diagnosis.slice(0, 80) }, 'Service port failure — PM2 restart triggered');
  }

  return {
    action_taken: `restarted: ${failing.map((r) => r.svc.name).join(', ')}`,
    // Report failure if any restart failed, rather than always claiming success.
    success: allRestarted,
    output: healResults.join('; '),
  };
}
// ─── SSH helper ──────────────────────────────────────────────────────────────
/**
 * Options placed before the host on every ssh invocation: batch mode (no
 * interactive prompts), a 10-second connect timeout, and automatic acceptance
 * of host keys that have not been seen before.
 */
const SSH_OPTS = [
  '-o', 'BatchMode=yes',
  '-o', 'ConnectTimeout=10',
  '-o', 'StrictHostKeyChecking=accept-new',
];

/**
 * Run one command on a remote host through `/usr/bin/ssh`.
 *
 * @param host SSH destination, e.g. `user@address`.
 * @param command Command line handed to ssh as a single argument.
 * @returns Whatever `safeExec` reports for the ssh invocation.
 */
async function sshExec(host: string, command: string): Promise<{ stdout: string; stderr: string; success: boolean }> {
  const argv = SSH_OPTS.concat(host, command);
  return safeExec('/usr/bin/ssh', argv);
}
// ─── 10. Proxmox pvestatd health ─────────────────────────────────────────────
/** SSH destination of the Proxmox host; overridable through `PROXMOX_HOST`. */
const PROXMOX_HOST = process.env['PROXMOX_HOST'] ?? 'root@192.168.178.10';

/**
 * Check over SSH whether pvestatd is running on the Proxmox host and what
 * process state `/proc/<pid>/status` reports for it.
 *
 * @returns Unhealthy when SSH fails, when pvestatd is not running, or when
 *   its state letter is `D`; healthy for every other state (an unparseable
 *   state is reported as `?`).
 */
async function checkProxmoxPvestatd(): Promise<CheckResult> {
  const startedAt = Date.now();
  // Remote one-liner: prints the "State:" line of the first pvestatd PID,
  // or the NOT_RUNNING sentinel when no such line can be produced.
  const probe = await sshExec(
    PROXMOX_HOST,
    'pid=$(pgrep pvestatd 2>/dev/null | head -1); [ -n "$pid" ] && cat /proc/$pid/status 2>/dev/null | grep "^State:" || echo "NOT_RUNNING"',
  );
  const latency_ms = Date.now() - startedAt;

  if (!probe.success) {
    return { healthy: false, message: 'Cannot SSH to Proxmox to check pvestatd', latency_ms };
  }

  const raw = probe.stdout.trim();
  if (raw === 'NOT_RUNNING') {
    return { healthy: false, message: 'pvestatd is not running on Proxmox', details: { state: 'not_running' }, latency_ms };
  }

  // First non-space character after "State:" is the one-letter process state.
  const state = /State:\s+(\S)/.exec(raw)?.[1] ?? '?';
  if (state !== 'D') {
    return { healthy: true, message: `pvestatd is running (state: ${state})`, details: { state }, latency_ms };
  }

  return {
    healthy: false,
    message: 'pvestatd is in D-state (kernel deadlock) — Proxmox GUI graphs will be empty',
    details: { state: 'D', raw: raw.slice(0, 200) },
    latency_ms,
  };
}
/**
 * Try to bring pvestatd back on the Proxmox host.
 *
 * If no pvestatd PID is found the service is simply started. Otherwise the
 * existing PID is written to the root cgroup's `cgroup.procs`, the pvestatd
 * pid/lock files are removed, and the unit is reset and started again.
 *
 * @param diagnosis Free-text diagnosis; only a truncated copy is logged.
 * @returns `success` reflects the final remote command (which ends with
 *   `systemctl is-active pvestatd`), or `false` when the PID lookup returned
 *   something that is not a plain number.
 */
async function healProxmoxPvestatd(diagnosis: string): Promise<HealResult> {
  // Step 1: Get PID
  const { stdout: pidOut } = await sshExec(PROXMOX_HOST, 'pgrep pvestatd 2>/dev/null | head -1');
  const pid = pidOut.trim();

  if (!pid) {
    // Not running at all — just start it
    const { success, stdout, stderr } = await sshExec(PROXMOX_HOST, 'systemctl start pvestatd 2>&1');
    return { action_taken: 'systemctl start pvestatd', success, output: `${stdout}\n${stderr}`.trim() };
  }

  // The PID is interpolated into a shell command that runs as root on the
  // hypervisor. Refuse anything that is not purely numeric (banner text,
  // error output, ...) instead of executing it.
  if (!/^\d+$/.test(pid)) {
    const output = pid.slice(0, 200);
    logger.info({ diagnosis: diagnosis.slice(0, 120), output }, 'pvestatd heal aborted — unexpected pgrep output');
    return { action_taken: 'unexpected pgrep output — pvestatd heal aborted', success: false, output };
  }

  // Step 2: Move stuck process to root cgroup so systemd can reclaim the unit
  await sshExec(PROXMOX_HOST, `echo ${pid} > /sys/fs/cgroup/cgroup.procs 2>/dev/null || true`);

  // Step 3: Remove stale lock files
  await sshExec(PROXMOX_HOST, 'rm -f /var/run/pvestatd.pid.lock /var/run/pvestatd.pid 2>/dev/null || true');

  // Step 4: Reset failed state and start; the exit status comes from is-active
  const { stdout, stderr, success } = await sshExec(
    PROXMOX_HOST,
    'systemctl reset-failed pvestatd 2>/dev/null; systemctl start pvestatd 2>&1; sleep 3; systemctl is-active pvestatd',
  );
  const output = `${stdout}\n${stderr}`.trim();
  logger.info({ pid, diagnosis: diagnosis.slice(0, 120), output }, 'pvestatd D-state heal executed');
  return { action_taken: `moved pid ${pid} to root cgroup, reset-failed, started pvestatd`, success, output };
}
// ─── 11. OPNsense disk space ─────────────────────────────────────────────────
/** SSH destination of the OPNsense VM; overridable through `OPNSENSE_HOST`. */
const OPNSENSE_HOST = process.env['OPNSENSE_HOST'] ?? 'root@192.168.178.11';
/** Root filesystem usage (percent) at or above which the check fails. */
const OPNSENSE_DISK_THRESHOLD = 75;

/**
 * Read the root filesystem usage of the OPNsense host over SSH and compare
 * it with `OPNSENSE_DISK_THRESHOLD`.
 *
 * @returns Unhealthy when SSH fails, when the reported value is not a
 *   number, or when usage is at or above the threshold; healthy otherwise.
 */
async function checkOpnsenseDisk(): Promise<CheckResult> {
  const startedAt = Date.now();
  // Prints the fifth column of the second `df -h /` line without its "%".
  const probe = await sshExec(OPNSENSE_HOST, "df -h / | awk 'NR==2{print $5}' | tr -d '%'");
  const latency_ms = Date.now() - startedAt;

  if (!probe.success) {
    return { healthy: false, message: 'Cannot SSH to OPNsense to check disk', latency_ms };
  }

  const reported = probe.stdout.trim();
  const usedPct = Number.parseInt(reported, 10);
  if (Number.isNaN(usedPct)) {
    return { healthy: false, message: `Cannot parse OPNsense disk usage: "${reported}"`, latency_ms };
  }

  const overThreshold = usedPct >= OPNSENSE_DISK_THRESHOLD;
  return overThreshold
    ? {
        healthy: false,
        message: `OPNsense disk usage ${usedPct}% ≥ threshold ${OPNSENSE_DISK_THRESHOLD}%`,
        details: { usedPercent: usedPct, threshold: OPNSENSE_DISK_THRESHOLD },
        latency_ms,
      }
    : {
        healthy: true,
        message: `OPNsense disk usage: ${usedPct}%`,
        details: { usedPercent: usedPct },
        latency_ms,
      };
}
/**
 * Free disk space on the OPNsense host with one SSH session that deletes old
 * logs, old Suricata JSON logs, old temp files and the pkg cache, then prints
 * the new root filesystem usage.
 *
 * @param diagnosis Free-text diagnosis; only a truncated copy is logged.
 * @returns `success` as reported for the SSH invocation; `action_taken` ends
 *   with the new usage (`NN%`), or `?` when the last output line is not a
 *   percentage (for example when SSH failed or produced no output).
 */
async function healOpnsenseDisk(diagnosis: string): Promise<HealResult> {
  const steps = [
    // Remove logs older than 7 days
    'find /var/log -name "*.log" -mtime +7 -delete 2>/dev/null || true',
    // Suricata JSON logs older than 3 days
    'find /var/log/suricata -name "*.json" -mtime +3 -delete 2>/dev/null || true',
    'find /var/log/suricata -name "*.json.gz" -mtime +1 -delete 2>/dev/null || true',
    // Tmp files older than 1 day
    // NOTE(review): no -type filter here, so this matches any entry type — confirm that is intended.
    'find /tmp /var/tmp -mtime +1 -delete 2>/dev/null || true',
    // pkg cache
    'pkg clean -y 2>/dev/null || true',
    // Report new usage
    "df -h / | awk 'NR==2{print $5}'",
  ];

  const { stdout, stderr, success } = await sshExec(OPNSENSE_HOST, steps.join('; '));
  const output = `${stdout}\n${stderr}`.trim();

  // The df step runs last, so its figure is the final stdout line. Splitting
  // an empty string yields [''], so a nullish fallback alone would never
  // trigger; validate the shape and fall back to '?' explicitly.
  const lines = stdout.trim().split('\n');
  const lastLine = (lines[lines.length - 1] ?? '').trim();
  const newUsage = /^\d+%$/.test(lastLine) ? lastLine : '?';

  logger.info({ diagnosis: diagnosis.slice(0, 120), newUsage, output: output.slice(0, 400) }, 'OPNsense disk cleanup executed');
  return {
    action_taken: `cleaned logs, tmp, pkg cache on OPNsense — disk now at ${newUsage}`,
    success,
    output: output.slice(0, 500),
  };
}
// ─── Exported check list ──────────────────────────────────────────────────────
export const healthChecks: HealthCheck[] = [
@ -408,8 +676,11 @@ export const healthChecks: HealthCheck[] = [
{ name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
{ name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
{ name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
{ name: 'service-ports', category: 'service', check: checkServicePorts, heal: healServicePorts },
{ name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
{ name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
{ name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
{ name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
{ name: 'proxmox-pvestatd', category: 'service', check: checkProxmoxPvestatd, heal: healProxmoxPvestatd },
{ name: 'opnsense-disk', category: 'service', check: checkOpnsenseDisk, heal: healOpnsenseDisk },
];