llm-gateway/packages/ctx-health/src/gateway-client.ts
Rene Fichtmueller e0b9fa1f53 feat: add CtxHealth self-healing daemon as new workspace package
New package @llm-gateway/ctx-health (packages/ctx-health/) — a TypeScript
infrastructure monitoring and auto-healing daemon. Monitors 8 subsystems
every 60s (PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory,
network, WireGuard), gets AI-powered root cause analysis via the gateway
(ctxhealer caller / ctx_health_diagnose task_type), executes healing
actions with cooldown (5min) and escalation guards (3+ failures → human
escalation), persists all incidents to ctx_health_incidents and
ctx_health_status tables. Dry-run mode via CTX_HEALTH_DRY_RUN=true.
Runs as ctx-health PM2 process on Erik server.
2026-04-03 00:16:08 +02:00

138 lines
4.8 KiB
TypeScript

/**
* HTTP client for calling the LLM Gateway to get AI-powered diagnoses.
* Fail-open: if gateway is unavailable, returns a default diagnosis.
*/
import { logger } from './observability/logger.js';
import type { CheckResult, DiagnosisResult, IncidentRecord } from './types.js';
/**
 * Base URL of the LLM Gateway. Taken from the CTX_HEALTH_GATEWAY_URL environment
 * variable; the localhost default applies only when the variable is undefined
 * (`??` does not replace an empty string).
 */
const GATEWAY_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';
/**
 * Value sent in the `X-Internal-Secret` header by the requests in this module.
 * NOTE(review): falls back to a hard-coded literal when INTERNAL_SECRET is unset —
 * confirm that this default is acceptable outside local development.
 */
const INTERNAL_SECRET = process.env['INTERNAL_SECRET'] ?? 'internal-learning-secret';
/** Milliseconds after which diagnoseIssue aborts its gateway request. */
const TIMEOUT_MS = 30_000;
/**
 * Shape this module assumes for the JSON body of a successful `/v1/generate`
 * response. diagnoseIssue casts the parsed body to this type.
 */
interface GatewayResponse {
/** Text returned by the gateway; parseGatewayOutput searches it for a JSON object. */
output: string;
/** Not read in this module. NOTE(review): presumably a confidence score — confirm range with the gateway API. */
confidence: number;
/** Not read in this module. NOTE(review): presumably the identifier of the model that answered — confirm. */
model: string;
/** Not read in this module. NOTE(review): presumably gateway-side latency in milliseconds — confirm. */
latency_ms: number;
}
/**
 * Builds the plain-text prompt sent to the gateway for a failed check.
 *
 * The prompt contains the check name, the failure message, latency, the
 * JSON-serialized details, up to five entries of `recentHistory` (taken from
 * the front of the array), and the instructions describing the JSON answer
 * format that parseGatewayOutput expects.
 *
 * This function never throws on malformed data: an incident whose
 * `created_at` is not a valid date is rendered with the timestamp
 * `unknown-time`, and details that cannot be serialized (circular
 * references, BigInt values) are rendered as `(unserializable details)`.
 * Previously either case raised an exception that aborted the whole
 * diagnosis before the gateway was even contacted.
 *
 * @param checkName - Name of the infrastructure check that failed.
 * @param checkResult - Result object of the failed check.
 * @param recentHistory - Earlier incidents for the same check; only the first five are used.
 * @returns The newline-joined prompt text.
 */
function buildDiagnosisInput(
  checkName: string,
  checkResult: CheckResult,
  recentHistory: IncidentRecord[],
): string {
  const historyLines = recentHistory.slice(0, 5).map((inc) => {
    const created = new Date(inc.created_at);
    // toISOString() throws RangeError on an invalid Date, so guard first.
    const ts = Number.isNaN(created.getTime()) ? 'unknown-time' : created.toISOString();
    return ` - [${ts}] ${inc.severity}: ${inc.error_message} (healed=${inc.auto_healed})`;
  });

  let details: string;
  try {
    details = JSON.stringify(checkResult.details ?? {});
  } catch {
    // JSON.stringify throws TypeError on circular structures and BigInt values.
    details = '(unserializable details)';
  }

  return [
    `Infrastructure check FAILED: ${checkName}`,
    `Error: ${checkResult.message}`,
    `Latency: ${checkResult.latency_ms ?? 'n/a'}ms`,
    `Details: ${details}`,
    '',
    `Recent incident history (last ${historyLines.length}):`,
    historyLines.length > 0 ? historyLines.join('\n') : ' (none)',
    '',
    'Provide: root cause analysis, recommended action, severity (info/warning/critical), and whether auto-healing is safe (true/false).',
    'Format your response as JSON: {"action":"...","severity":"warning","auto_heal":true,"explanation":"..."}',
  ].join('\n');
}
/**
 * Returns the index of the `}` that closes the `{` at `start`, or -1 when the
 * brace is never closed. Braces inside double-quoted strings (with backslash
 * escapes) are ignored.
 */
function findMatchingBrace(text: string, start: number): number {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Finds the first balanced `{...}` span in `text` that parses as a JSON
 * object and returns the parsed object, or `undefined` if there is none.
 */
function extractFirstJsonObject(text: string): Record<string, unknown> | undefined {
  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    const end = findMatchingBrace(text, start);
    if (end === -1) continue;
    try {
      const value: unknown = JSON.parse(text.slice(start, end + 1));
      if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
        return value as Record<string, unknown>;
      }
    } catch {
      // This span is not JSON (e.g. a `{placeholder}` in prose); try the next `{`.
    }
  }
  return undefined;
}

/**
 * Extracts the structured diagnosis from the free-text gateway output.
 *
 * The output is scanned for the first balanced JSON object. This replaces a
 * greedy first-`{`-to-last-`}` match, which failed whenever the model put
 * braces in prose before or after the JSON; every input the greedy match
 * parsed yields the same result here.
 *
 * Each field is validated individually:
 * - `action` must be a string, otherwise a generic action text is used;
 * - `severity` must be `info`, `warning` or `critical`, otherwise `warning`;
 * - `auto_heal` must be a boolean, otherwise `false`.
 *
 * Never throws. When no JSON object is found (or `raw` is not a string at
 * runtime) a conservative fallback with `auto_heal: false` is returned.
 *
 * @param raw - Text returned by the gateway.
 * @returns The diagnosis fields without `raw_output`.
 */
function parseGatewayOutput(raw: string): Omit<DiagnosisResult, 'raw_output'> {
  const parsed = typeof raw === 'string' ? extractFirstJsonObject(raw) : undefined;
  if (parsed === undefined) {
    return { action: 'Review logs and consider manual restart.', severity: 'warning', auto_heal: false };
  }

  const rawAction = parsed['action'];
  const rawSeverity = parsed['severity'];
  const rawAutoHeal = parsed['auto_heal'];

  const action = typeof rawAction === 'string' ? rawAction : 'Review logs and restart service.';
  const severity =
    rawSeverity === 'info' || rawSeverity === 'warning' || rawSeverity === 'critical'
      ? rawSeverity
      : 'warning';
  // Only a literal boolean enables auto-healing; strings such as "true" do not.
  const auto_heal = typeof rawAutoHeal === 'boolean' ? rawAutoHeal : false;
  return { action, severity, auto_heal };
}
/**
 * Creates the diagnosis used when no answer could be obtained from the gateway.
 *
 * The result recommends restarting the service tied to the given check, is
 * rated `warning`, and has auto-healing enabled.
 *
 * @param checkName - Name of the failed check, embedded in the action text.
 * @returns The default diagnosis with a placeholder `raw_output`.
 */
function buildDefaultDiagnosis(checkName: string): DiagnosisResult {
  const restartAction = "Default healing: restart service associated with check '" + checkName + "'.";
  const fallback: DiagnosisResult = {
    action: restartAction,
    severity: 'warning',
    auto_heal: true,
    raw_output: '(gateway unavailable — default diagnosis)',
  };
  return fallback;
}
/**
 * Asks the LLM Gateway for a diagnosis of a failed infrastructure check.
 *
 * Sends a POST to `${GATEWAY_URL}/v1/generate` with task type
 * `ctx_health_diagnose` and aborts the request after TIMEOUT_MS.
 *
 * This function never rejects (fail-open):
 * - a non-OK HTTP status is logged (status plus the first 200 characters of
 *   the body) and answered with the default diagnosis;
 * - a network error, timeout/abort, or unreadable JSON body is logged and
 *   answered with the default diagnosis;
 * - a successful response whose `output` is not a string is logged and
 *   treated as empty output, which yields parseGatewayOutput's conservative
 *   fallback and an empty-string `raw_output` instead of a non-string value.
 *
 * @param checkName - Name of the failed check.
 * @param checkResult - Result object of the failed check.
 * @param recentHistory - Earlier incidents passed on to the prompt builder.
 * @returns The parsed diagnosis plus the raw gateway text, or the default diagnosis.
 */
export async function diagnoseIssue(
  checkName: string,
  checkResult: CheckResult,
  recentHistory: IncidentRecord[],
): Promise<DiagnosisResult> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), TIMEOUT_MS);
  try {
    const input = buildDiagnosisInput(checkName, checkResult, recentHistory);
    const response = await fetch(`${GATEWAY_URL}/v1/generate`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'X-Caller': 'ctxhealer',
        'X-Internal-Secret': INTERNAL_SECRET,
        'Cache-Control': 'no-store',
      },
      body: JSON.stringify({
        task_type: 'ctx_health_diagnose',
        input,
        user_context: '',
      }),
      signal: controller.signal,
    });
    if (!response.ok) {
      const body = await response.text();
      logger.warn({ checkName, status: response.status, body: body.slice(0, 200) }, 'Gateway returned non-OK for diagnosis');
      return buildDefaultDiagnosis(checkName);
    }
    const data = (await response.json()) as GatewayResponse;
    // The cast above is unchecked, so verify the one field that is actually used.
    const rawOutput: unknown = data.output;
    if (typeof rawOutput !== 'string') {
      logger.warn({ checkName, outputType: typeof rawOutput }, 'Gateway response has no string output — treating as empty');
    }
    const outputText = typeof rawOutput === 'string' ? rawOutput : '';
    const parsed = parseGatewayOutput(outputText);
    logger.info({ checkName, severity: parsed.severity, auto_heal: parsed.auto_heal }, 'Gateway diagnosis received');
    return { ...parsed, raw_output: outputText };
  } catch (err) {
    const isAbort = err instanceof Error && err.name === 'AbortError';
    logger.warn({ err, checkName, isAbort }, 'Gateway call failed — using default diagnosis (fail-open)');
    return buildDefaultDiagnosis(checkName);
  } finally {
    // Runs on every path so the abort timer never outlives the request.
    clearTimeout(timer);
  }
}
/**
 * Asks the gateway to reset its circuit breaker via a POST to
 * `${GATEWAY_URL}/internal/circuit-breaker/reset`, aborting after 5 seconds.
 *
 * Best-effort: this function never rejects. The outcome is only logged:
 * - info when the gateway answered with an OK status;
 * - warn with status and the first 200 characters of the body when the
 *   gateway answered with a non-OK status (fetch resolves for HTTP errors,
 *   so this must be checked explicitly);
 * - warn with the error when the request itself failed or timed out.
 */
export async function resetCircuitBreaker(): Promise<void> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 5_000);
  try {
    const response = await fetch(`${GATEWAY_URL}/internal/circuit-breaker/reset`, {
      method: 'POST',
      headers: { 'X-Internal-Secret': INTERNAL_SECRET, 'Cache-Control': 'no-store' },
      signal: controller.signal,
    });
    if (!response.ok) {
      const body = await response.text();
      logger.warn({ status: response.status, body: body.slice(0, 200) }, 'Circuit breaker reset rejected by gateway');
      return;
    }
    logger.info('Circuit breaker reset requested');
  } catch (err) {
    logger.warn({ err }, 'Circuit breaker reset failed — gateway may be down');
  } finally {
    clearTimeout(timer);
  }
}