feat(ctx-health): add proxmox-pvestatd + opnsense-disk health checks
- Add SSH-based health check for pvestatd D-state detection on the Proxmox host (heal via cgroup move + lock file removal + reset-failed)
- Add SSH-based disk check for the OPNsense VM (threshold 75%, auto-cleanup)
- knowledge/fixes.json: add 48 training fixes, including post-reboot DNS recovery (fix-046), the cloudflared DNS-wait boot fix (fix-047), and a vzdump load-crash scenario with recovery steps (fix-048)
This commit is contained in:
parent
b4593b6582
commit
c50af63389
1014
knowledge/fixes.json
Normal file
1014
knowledge/fixes.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -35,6 +35,7 @@ const ALLOWED_COMMANDS = new Set([
|
|||||||
'/usr/sbin/systemctl',
|
'/usr/sbin/systemctl',
|
||||||
'/usr/bin/sync',
|
'/usr/bin/sync',
|
||||||
'/bin/sync',
|
'/bin/sync',
|
||||||
|
'/usr/bin/ssh',
|
||||||
]);
|
]);
|
||||||
|
|
||||||
async function safeExec(
|
async function safeExec(
|
||||||
@ -69,7 +70,13 @@ async function findPm2(): Promise<string | null> {
|
|||||||
|
|
||||||
// ─── 1. PM2 processes ────────────────────────────────────────────────────────
|
// ─── 1. PM2 processes ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];
|
/**
 * PM2 process names that are expected to be running.
 * The PM2 heal step restarts any of these that are missing from `pm2 jlist`
 * or whose status is not 'online'.
 */
const PM2_REQUIRED_PROCESSES = [
  'llm-gateway',
  'llm-learning',
  'magatama',
  'magatama-dashboard',
  'tip-api',
  'tip-scraper-daemon',
  'peercortex',
  'eo-global-pulse',
  'ghost-blog',
  'nognet',
];
|
||||||
|
|
||||||
async function checkPm2(): Promise<CheckResult> {
|
async function checkPm2(): Promise<CheckResult> {
|
||||||
const start = Date.now();
|
const start = Date.now();
|
||||||
@ -114,10 +121,40 @@ async function healPm2(diagnosis: string): Promise<HealResult> {
|
|||||||
const pm2 = await findPm2();
|
const pm2 = await findPm2();
|
||||||
if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };
|
if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };
|
||||||
|
|
||||||
const { stdout, stderr } = await safeExec(pm2, ['restart', 'all']);
|
// Only restart processes that are actually offline — avoid pm2 restart all
|
||||||
|
const { stdout: jlist } = await safeExec(pm2, ['jlist']);
|
||||||
|
let processes: Array<{ name: string; pm2_env?: { status?: string } }> = [];
|
||||||
|
try { processes = JSON.parse(jlist) as typeof processes; } catch { /* ignore */ }
|
||||||
|
|
||||||
|
const offline = PM2_REQUIRED_PROCESSES.filter((name) => {
|
||||||
|
const proc = processes.find((p) => p.name === name);
|
||||||
|
return !proc || proc.pm2_env?.status !== 'online';
|
||||||
|
});
|
||||||
|
|
||||||
|
if (offline.length === 0) {
|
||||||
|
return { action_taken: 'no offline processes found — skipping restart', success: true };
|
||||||
|
}
|
||||||
|
|
||||||
|
const results: string[] = [];
|
||||||
|
for (const name of offline) {
|
||||||
|
const { stdout, stderr, success } = await safeExec(pm2, ['restart', name]);
|
||||||
|
results.push(`${name}: ${success ? 'restarted' : stderr.slice(0, 80)}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const output = results.join('; ');
|
||||||
|
logger.info({ diagnosis: diagnosis.slice(0, 120), output, offline }, 'PM2 targeted restart executed');
|
||||||
|
return { action_taken: `pm2 restart ${offline.join(', ')}`, success: true, output };
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── pm2-aware targeted restart (shared helper) ───────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Restarts a single PM2-managed process by name.
 *
 * @param name - PM2 process name passed to `pm2 restart`.
 * @returns The heal result; `success` mirrors what `safeExec` reported for the
 *   restart command, and `output` is the combined stdout/stderr. When no pm2
 *   binary can be located the result is a failure and nothing is executed.
 */
async function restartProcess(name: string): Promise<HealResult> {
  const pm2Path = await findPm2();
  if (!pm2Path) {
    return { action_taken: 'pm2 not found', success: false };
  }

  const exec = await safeExec(pm2Path, ['restart', name]);
  const success = exec.success;
  const output = `${exec.stdout}\n${exec.stderr}`.trim();

  // Keep the log entry bounded; the full output still goes back to the caller.
  logger.info({ name, success, output: output.slice(0, 200) }, 'PM2 targeted process restart');

  return { action_taken: `pm2 restart ${name}`, success, output };
}
|
||||||
|
|
||||||
// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
|
// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
|
||||||
@ -401,6 +438,237 @@ async function healWireGuard(_diagnosis: string): Promise<HealResult> {
|
|||||||
return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
|
return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── 9. Service port health checks ───────────────────────────────────────────
|
||||||
|
// For each critical service, performs a real HTTP check on the local port.
|
||||||
|
// This catches crash-loops where PM2 shows "online" but the port is not responding.
|
||||||
|
|
||||||
|
/** One HTTP service that is probed on localhost and restarted via PM2 when it fails. */
interface ServiceDef {
  /** PM2 process name to restart on failure. */
  name: string;
  /** Local TCP port the probe connects to. */
  port: number;
  /** Health endpoint path appended to `http://localhost:<port>`. */
  path: string;
  /** Accepted HTTP status codes (default: any status <500). */
  okStatus?: number[];
}
|
||||||
|
|
||||||
|
/**
 * Services covered by the port health check.
 * Each entry is probed over HTTP on localhost; `name` is the PM2 process that
 * gets restarted when the probe fails.
 */
const SERVICES: ServiceDef[] = [
  { name: 'magatama',           port: 3210, path: '/' },
  { name: 'magatama-dashboard', port: 3211, path: '/' },
  { name: 'magatama-admin',     port: 3212, path: '/' },
  { name: 'tip-api',            port: 3201, path: '/api/health' },
  { name: 'peercortex',         port: 3101, path: '/' },
  { name: 'llm-gateway',        port: 3103, path: '/health' },
  { name: 'eo-global-pulse',    port: 3000, path: '/' },
  { name: 'nognet',             port: 3001, path: '/' },
  { name: 'ghost-blog',         port: 2368, path: '/' },
  { name: 'switchblade',        port: 3334, path: '/' },
];
|
||||||
|
|
||||||
|
/**
 * Probes a service's HTTP endpoint on localhost and classifies the answer.
 *
 * The request is aborted after 8 seconds and redirects are not followed, so a
 * 3xx response is judged by its own status code.
 *
 * @param service - Port, path and optional accepted-status list to probe.
 * @returns `ok` plus the HTTP `status` when a response arrived; `ok: false`
 *   with the `error` message when the request threw (connection refused,
 *   timeout abort, …). Without `okStatus`, any status below 500 counts as ok.
 */
async function probePort(service: ServiceDef): Promise<{ ok: boolean; status?: number; error?: string }> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 8_000);
  try {
    const res = await fetch(`http://localhost:${service.port}${service.path}`, {
      signal: controller.signal,
      cache: 'no-store',
      redirect: 'manual',
    } as RequestInit);

    // Only the status is needed. Cancel the unread body so the underlying
    // connection is released right away instead of lingering until GC.
    void res.body?.cancel().catch(() => undefined);

    const acceptedCodes = service.okStatus ?? [];
    // 3xx is already covered by "< 500", so no separate redirect clause is needed.
    const ok = acceptedCodes.length > 0 ? acceptedCodes.includes(res.status) : res.status < 500;
    return { ok, status: res.status };
  } catch (err) {
    return { ok: false, error: err instanceof Error ? err.message : String(err) };
  } finally {
    clearTimeout(timer);
  }
}
|
||||||
|
|
||||||
|
/**
 * Probes every entry in SERVICES in parallel and reports which ones fail.
 *
 * @returns A healthy result when every probe is ok; otherwise an unhealthy
 *   result whose `details` maps each failing service name to its HTTP status,
 *   error message, or 'no response'. `latency_ms` covers the whole probe round.
 */
async function checkServicePorts(): Promise<CheckResult> {
  const startedAt = Date.now();
  const probes = await Promise.all(
    SERVICES.map(async (svc) => {
      const probe = await probePort(svc);
      return { svc, probe };
    }),
  );
  const latency_ms = Date.now() - startedAt;

  const down = probes.filter(({ probe }) => !probe.ok);
  if (down.length === 0) {
    return { healthy: true, message: `All ${SERVICES.length} service ports are responding`, latency_ms };
  }

  // name → status code, error text, or a generic marker when neither is known
  const details: Record<string, string | number> = {};
  const downNames: string[] = [];
  for (const { svc, probe } of down) {
    details[svc.name] = probe.status ?? probe.error ?? 'no response';
    downNames.push(svc.name);
  }

  return {
    healthy: false,
    message: `${down.length} service(s) not responding: ${downNames.join(', ')}`,
    details,
    latency_ms,
  };
}
|
||||||
|
|
||||||
|
/**
 * Restarts the PM2 process behind every service whose port probe still fails.
 *
 * Services are re-probed first so that anything which recovered since the
 * check ran is left alone. Restarts run one at a time.
 *
 * @param diagnosis - Free-text diagnosis; only used (truncated) in log entries.
 * @returns `success` is true only when every attempted restart succeeded.
 *   `action_taken` lists restarted and failed services separately, and
 *   `output` holds one "name: outcome" entry per service.
 */
async function healServicePorts(diagnosis: string): Promise<HealResult> {
  // Re-probe to find which services are currently failing
  const results = await Promise.all(SERVICES.map(async (svc) => ({ svc, probe: await probePort(svc) })));
  const failing = results.filter((r) => !r.probe.ok);

  if (failing.length === 0) {
    return { action_taken: 'services recovered on re-check — no restart needed', success: true };
  }

  const restarted: string[] = [];
  const failed: string[] = [];
  const healResults: string[] = [];

  for (const { svc } of failing) {
    const result = await restartProcess(svc.name);
    if (result.success) {
      restarted.push(svc.name);
      healResults.push(`${svc.name}: restarted`);
    } else {
      failed.push(svc.name);
      // `||` rather than `??`: an empty output string must also fall back to 'failed'.
      healResults.push(`${svc.name}: ${result.output?.slice(0, 80) || 'failed'}`);
    }
    logger.info({ service: svc.name, diagnosis: diagnosis.slice(0, 80) }, 'Service port failure — PM2 restart triggered');
  }

  const summary: string[] = [];
  if (restarted.length > 0) summary.push(`restarted: ${restarted.join(', ')}`);
  if (failed.length > 0) summary.push(`failed to restart: ${failed.join(', ')}`);

  return {
    action_taken: summary.join('; '),
    // Report failure if any restart did not go through, so callers do not
    // treat a half-healed state as fully healed.
    success: failed.length === 0,
    output: healResults.join('; '),
  };
}
|
||||||
|
|
||||||
|
// ─── SSH helper ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Options prepended to every ssh invocation: no interactive prompts, a
 * 10-second connect timeout, and automatic acceptance of unknown host keys.
 */
const SSH_OPTS = [
  '-o', 'BatchMode=yes',
  '-o', 'ConnectTimeout=10',
  '-o', 'StrictHostKeyChecking=accept-new',
];

/**
 * Runs a single command string on a remote host through /usr/bin/ssh.
 *
 * @param host - ssh destination, e.g. `user@address`.
 * @param command - Command line handed to the remote side as one argument.
 * @returns Whatever `safeExec` yields for the ssh process: stdout, stderr and a success flag.
 */
async function sshExec(host: string, command: string): Promise<{ stdout: string; stderr: string; success: boolean }> {
  const argv = SSH_OPTS.concat(host, command);
  return safeExec('/usr/bin/ssh', argv);
}
|
||||||
|
|
||||||
|
// ─── 10. Proxmox pvestatd health ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
// ssh destination of the Proxmox host; the PROXMOX_HOST environment variable overrides the default.
const PROXMOX_HOST = process.env['PROXMOX_HOST'] ?? 'root@192.168.178.10';

/**
 * Checks over SSH whether pvestatd on the Proxmox host is running and not
 * stuck in uninterruptible sleep (process state "D").
 *
 * The remote command prints the `State:` line from /proc/<pid>/status for the
 * first pvestatd PID, or the literal NOT_RUNNING when there is none.
 *
 * @returns Unhealthy when SSH fails, when pvestatd is not running, or when its
 *   state letter is 'D'; healthy for any other state letter ('?' when the
 *   output could not be parsed).
 */
async function checkProxmoxPvestatd(): Promise<CheckResult> {
  const remoteProbe =
    'pid=$(pgrep pvestatd 2>/dev/null | head -1); [ -n "$pid" ] && cat /proc/$pid/status 2>/dev/null | grep "^State:" || echo "NOT_RUNNING"';

  const startedAt = Date.now();
  const ssh = await sshExec(PROXMOX_HOST, remoteProbe);
  const latency_ms = Date.now() - startedAt;

  if (!ssh.success) {
    return { healthy: false, message: 'Cannot SSH to Proxmox to check pvestatd', latency_ms };
  }

  const trimmed = ssh.stdout.trim();
  if (trimmed === 'NOT_RUNNING') {
    return { healthy: false, message: 'pvestatd is not running on Proxmox', details: { state: 'not_running' }, latency_ms };
  }

  // First non-space character after "State:" is the one-letter process state.
  const state = /State:\s+(\S)/.exec(trimmed)?.[1] ?? '?';

  if (state !== 'D') {
    return { healthy: true, message: `pvestatd is running (state: ${state})`, details: { state }, latency_ms };
  }

  return {
    healthy: false,
    message: 'pvestatd is in D-state (kernel deadlock) — Proxmox GUI graphs will be empty',
    details: { state: 'D', raw: trimmed.slice(0, 200) },
    latency_ms,
  };
}
|
||||||
|
|
||||||
|
/**
 * Attempts to bring pvestatd on the Proxmox host back to a working state.
 *
 * When no pvestatd process exists it is simply started. Otherwise the existing
 * PID is moved into the root cgroup, the pid/lock files are removed, and the
 * unit is reset and started again.
 *
 * @param diagnosis - Free-text diagnosis; only used (truncated) in the log entry.
 * @returns `success` reflects the result of the final remote command
 *   (`systemctl start` alone, or the reset/start/is-active sequence). If the
 *   PID lookup returns anything other than digits, no further remote command
 *   is sent and the result is a failure.
 */
async function healProxmoxPvestatd(diagnosis: string): Promise<HealResult> {
  // Step 1: Get PID
  const { stdout: pidOut } = await sshExec(PROXMOX_HOST, 'pgrep pvestatd 2>/dev/null | head -1');
  const pid = pidOut.trim();

  if (!pid) {
    // Not running at all — just start it
    const { success, stdout, stderr } = await sshExec(PROXMOX_HOST, 'systemctl start pvestatd 2>&1');
    return { action_taken: 'systemctl start pvestatd', success, output: `${stdout}\n${stderr}`.trim() };
  }

  // The PID is spliced into a shell command that runs as the remote user, so
  // accept digits only — never forward a login banner or other stray text.
  if (!/^\d+$/.test(pid)) {
    return {
      action_taken: 'unexpected pgrep output — pvestatd heal aborted',
      success: false,
      output: pid.slice(0, 200),
    };
  }

  // Step 2: Move stuck process to root cgroup so systemd can reclaim the unit (best effort)
  await sshExec(PROXMOX_HOST, `echo ${pid} > /sys/fs/cgroup/cgroup.procs 2>/dev/null || true`);

  // Step 3: Remove stale lock files (best effort)
  await sshExec(PROXMOX_HOST, 'rm -f /var/run/pvestatd.pid.lock /var/run/pvestatd.pid 2>/dev/null || true');

  // Step 4: Reset failed state and start; is-active runs last after a 3-second pause
  const { stdout, stderr, success } = await sshExec(
    PROXMOX_HOST,
    'systemctl reset-failed pvestatd 2>/dev/null; systemctl start pvestatd 2>&1; sleep 3; systemctl is-active pvestatd',
  );
  const output = `${stdout}\n${stderr}`.trim();
  logger.info({ pid, diagnosis: diagnosis.slice(0, 120), output }, 'pvestatd D-state heal executed');
  return { action_taken: `moved pid ${pid} to root cgroup, reset-failed, started pvestatd`, success, output };
}
|
||||||
|
|
||||||
|
// ─── 11. OPNsense disk space ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// ssh destination of the OPNsense VM; the OPNSENSE_HOST environment variable overrides the default.
const OPNSENSE_HOST = process.env['OPNSENSE_HOST'] ?? 'root@192.168.178.11';
// Root filesystem usage (percent) at or above which the check reports unhealthy.
const OPNSENSE_DISK_THRESHOLD = 75;

/**
 * Reads the root filesystem usage of the OPNsense host over SSH and compares
 * it with OPNSENSE_DISK_THRESHOLD.
 *
 * @returns Unhealthy when SSH fails, when the reported value cannot be parsed
 *   as an integer, or when usage is at or above the threshold; healthy
 *   otherwise. `details.usedPercent` carries the parsed value when available.
 */
async function checkOpnsenseDisk(): Promise<CheckResult> {
  const startedAt = Date.now();
  // Second line of `df -h /`, fifth column, with the trailing '%' stripped.
  const ssh = await sshExec(OPNSENSE_HOST, "df -h / | awk 'NR==2{print $5}' | tr -d '%'");
  const latency_ms = Date.now() - startedAt;

  if (!ssh.success) {
    return { healthy: false, message: 'Cannot SSH to OPNsense to check disk', latency_ms };
  }

  const reported = ssh.stdout.trim();
  const usedPct = Number.parseInt(reported, 10);
  if (Number.isNaN(usedPct)) {
    return { healthy: false, message: `Cannot parse OPNsense disk usage: "${reported}"`, latency_ms };
  }

  const belowThreshold = usedPct < OPNSENSE_DISK_THRESHOLD;
  if (belowThreshold) {
    return {
      healthy: true,
      message: `OPNsense disk usage: ${usedPct}%`,
      details: { usedPercent: usedPct },
      latency_ms,
    };
  }

  return {
    healthy: false,
    message: `OPNsense disk usage ${usedPct}% ≥ threshold ${OPNSENSE_DISK_THRESHOLD}%`,
    details: { usedPercent: usedPct, threshold: OPNSENSE_DISK_THRESHOLD },
    latency_ms,
  };
}
|
||||||
|
|
||||||
|
/**
 * Frees disk space on the OPNsense host by deleting old logs, old temporary
 * files and the pkg cache, then reports the resulting root filesystem usage.
 *
 * All steps are sent as one `;`-joined command in a single SSH call; each
 * cleanup step is best-effort (`|| true`).
 *
 * @param diagnosis - Free-text diagnosis; only used (truncated) in the log entry.
 * @returns `success` as reported by `sshExec` for the combined command,
 *   `action_taken` including the usage figure from the last stdout line
 *   ('?' when nothing was printed), and `output` capped at 500 characters.
 */
async function healOpnsenseDisk(diagnosis: string): Promise<HealResult> {
  // NOTE(review): the /tmp and /var/tmp step is not limited to regular files —
  // confirm that nothing on the firewall keeps long-lived state there.
  const steps = [
    // Remove logs older than 7 days
    'find /var/log -name "*.log" -mtime +7 -delete 2>/dev/null || true',
    // Suricata JSON logs older than 3 days
    'find /var/log/suricata -name "*.json" -mtime +3 -delete 2>/dev/null || true',
    'find /var/log/suricata -name "*.json.gz" -mtime +1 -delete 2>/dev/null || true',
    // Tmp files older than 1 day
    'find /tmp /var/tmp -mtime +1 -delete 2>/dev/null || true',
    // pkg cache
    'pkg clean -y 2>/dev/null || true',
    // Report new usage
    "df -h / | awk 'NR==2{print $5}'",
  ];

  const { stdout, stderr, success } = await sshExec(OPNSENSE_HOST, steps.join('; '));
  const output = `${stdout}\n${stderr}`.trim();

  // The df step runs last, so its figure is the final stdout line. Splitting an
  // empty string yields [''], so fall back with `||` — `??` would let '' through.
  const lines = stdout.trim().split('\n');
  const newUsage = lines[lines.length - 1]?.trim() || '?';

  logger.info({ diagnosis: diagnosis.slice(0, 120), newUsage, output: output.slice(0, 400) }, 'OPNsense disk cleanup executed');

  return {
    action_taken: `cleaned logs, tmp, pkg cache on OPNsense — disk now at ${newUsage}`,
    success,
    output: output.slice(0, 500),
  };
}
|
||||||
|
|
||||||
// ─── Exported check list ──────────────────────────────────────────────────────
|
// ─── Exported check list ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
export const healthChecks: HealthCheck[] = [
|
export const healthChecks: HealthCheck[] = [
|
||||||
@ -408,8 +676,11 @@ export const healthChecks: HealthCheck[] = [
|
|||||||
{ name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
|
{ name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
|
||||||
{ name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
|
{ name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
|
||||||
{ name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
|
{ name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
|
||||||
|
{ name: 'service-ports', category: 'service', check: checkServicePorts, heal: healServicePorts },
|
||||||
{ name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
|
{ name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
|
||||||
{ name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
|
{ name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
|
||||||
{ name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
|
{ name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
|
||||||
{ name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
|
{ name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
|
||||||
|
{ name: 'proxmox-pvestatd', category: 'service', check: checkProxmoxPvestatd, heal: healProxmoxPvestatd },
|
||||||
|
{ name: 'opnsense-disk', category: 'service', check: checkOpnsenseDisk, heal: healOpnsenseDisk },
|
||||||
];
|
];
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user