New package @llm-gateway/ctx-health (packages/ctx-health/): a TypeScript infrastructure monitoring and auto-healing daemon. It monitors 8 subsystems every 60s (PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory, network, WireGuard), obtains AI-powered root cause analysis via the gateway (ctxhealer caller / ctx_health_diagnose task_type), executes healing actions guarded by a cooldown (5min) and an escalation rule (3+ failures → human escalation), and persists all incidents to the ctx_health_incidents and ctx_health_status tables. Dry-run mode is enabled via CTX_HEALTH_DRY_RUN=true. Runs as the ctx-health PM2 process on the Erik server.
138 lines
4.8 KiB
TypeScript
138 lines
4.8 KiB
TypeScript
/**
|
|
* HTTP client for calling the LLM Gateway to get AI-powered diagnoses.
|
|
* Fail-open: if gateway is unavailable, returns a default diagnosis.
|
|
*/
|
|
|
|
import { logger } from './observability/logger.js';
|
|
import type { CheckResult, DiagnosisResult, IncidentRecord } from './types.js';
|
|
|
|
/** Gateway base URL used when `CTX_HEALTH_GATEWAY_URL` is unset or blank. */
const DEFAULT_GATEWAY_URL = 'http://localhost:3103';

/**
 * Normalizes the configured gateway base URL.
 *
 * - `undefined`, empty, or whitespace-only values fall back to the default, so a
 *   blank env entry does not produce a relative request URL such as `/v1/generate`.
 * - Trailing slashes are stripped so that appending a path that starts with `/`
 *   never yields a double slash.
 *
 * @param raw - The raw environment value, if any.
 * @returns A base URL without surrounding whitespace or trailing slashes.
 */
function resolveGatewayUrl(raw: string | undefined): string {
  const trimmed = (raw ?? '').trim().replace(/\/+$/, '');
  return trimmed === '' ? DEFAULT_GATEWAY_URL : trimmed;
}

/** Base URL of the LLM Gateway; request paths are appended to it. */
const GATEWAY_URL = resolveGatewayUrl(process.env['CTX_HEALTH_GATEWAY_URL']);

// NOTE(review): the fallback below is a hard-coded shared secret. It is kept
// unchanged for compatibility, but deployments should always set INTERNAL_SECRET
// explicitly — confirm whether this default can be removed.
const INTERNAL_SECRET = process.env['INTERNAL_SECRET'] ?? 'internal-learning-secret';

/** Time budget, in milliseconds, before an in-flight gateway request is aborted. */
const TIMEOUT_MS = 30_000;
|
|
|
|
/**
 * JSON body expected from a successful gateway generate call.
 *
 * Only `output` is read in this file; the remaining fields are declared for
 * completeness and their exact semantics are defined by the gateway.
 */
interface GatewayResponse {
  /** Model-generated text; scanned for an embedded JSON diagnosis object. */
  output: string;
  /** Confidence value reported by the gateway (presumably 0..1 — TODO confirm). */
  confidence: number;
  /** Identifier of the model that produced `output`. */
  model: string;
  /** Latency reported by the gateway (milliseconds, judging by the field name). */
  latency_ms: number;
}
|
|
|
|
/**
 * Renders the plain-text prompt describing a failed check for the gateway.
 *
 * The prompt contains the check name, its error message, latency, JSON-encoded
 * details, up to five entries from `recentHistory` (taken from the front of the
 * array), and the instructions asking for a JSON-formatted diagnosis.
 *
 * This function is deliberately total: malformed history timestamps or
 * unserializable details are rendered as placeholders instead of throwing, so
 * one bad record cannot prevent a diagnosis request from being built.
 *
 * @param checkName - Name of the failed infrastructure check.
 * @param checkResult - Result object of the failed check.
 * @param recentHistory - Prior incidents; only the first five are included.
 * @returns The newline-joined prompt text.
 */
function buildDiagnosisInput(
  checkName: string,
  checkResult: CheckResult,
  recentHistory: IncidentRecord[],
): string {
  const historyLines = recentHistory.slice(0, 5).map((inc) => {
    const when = new Date(inc.created_at);
    // toISOString() throws RangeError on an invalid Date, so guard it.
    const ts = Number.isNaN(when.getTime()) ? 'unknown-time' : when.toISOString();
    return ` - [${ts}] ${inc.severity}: ${inc.error_message} (healed=${inc.auto_healed})`;
  });

  // Only append the unit when there is a number to attach it to.
  const latency = checkResult.latency_ms == null ? 'n/a' : `${checkResult.latency_ms}ms`;

  // JSON.stringify throws on circular structures and BigInt values.
  let details: string;
  try {
    details = JSON.stringify(checkResult.details ?? {});
  } catch {
    details = '(unserializable)';
  }

  return [
    `Infrastructure check FAILED: ${checkName}`,
    `Error: ${checkResult.message}`,
    `Latency: ${latency}`,
    `Details: ${details}`,
    '',
    `Recent incident history (last ${historyLines.length}):`,
    historyLines.length > 0 ? historyLines.join('\n') : ' (none)',
    '',
    'Provide: root cause analysis, recommended action, severity (info/warning/critical), and whether auto-healing is safe (true/false).',
    'Format your response as JSON: {"action":"...","severity":"warning","auto_heal":true,"explanation":"..."}',
  ].join('\n');
}
|
|
|
|
/**
 * Finds the first substring of `text` that is a syntactically valid JSON object.
 *
 * Each `{` is tried as a candidate start. From there, braces are balanced while
 * skipping over JSON string literals (including escaped quotes), and the balanced
 * span is handed to `JSON.parse`. A candidate that fails to parse is skipped and
 * the scan resumes at the next `{`.
 *
 * Worst case is quadratic in the number of `{` characters, which is acceptable
 * for short model outputs.
 *
 * @param text - Free-form text that may embed a JSON object.
 * @returns The parsed object, or `undefined` when no candidate parses.
 */
function extractFirstJsonObject(text: string): Record<string, unknown> | undefined {
  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    let depth = 0;
    let inString = false;
    let escaped = false;

    for (let i = start; i < text.length; i++) {
      const ch = text[i];

      if (inString) {
        if (escaped) escaped = false;
        else if (ch === '\\') escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }

      if (ch === '"') {
        inString = true;
      } else if (ch === '{') {
        depth++;
      } else if (ch === '}') {
        depth--;
        if (depth === 0) {
          try {
            const value: unknown = JSON.parse(text.slice(start, i + 1));
            if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
              return value as Record<string, unknown>;
            }
          } catch {
            // Balanced braces but not valid JSON — try the next candidate.
          }
          break;
        }
      }
    }
  }
  return undefined;
}

/**
 * Extracts the structured diagnosis from the gateway's free-form output.
 *
 * The model is asked to answer with a JSON object, but may wrap it in prose.
 * The first parseable JSON object in `raw` is used; every field is validated
 * individually and replaced by a conservative default when missing or of the
 * wrong type. Any other keys in the object (e.g. `explanation`) are ignored.
 *
 * Never throws: when no JSON object can be found (or `raw` is not a string at
 * runtime), a manual-review diagnosis with `auto_heal: false` is returned.
 *
 * @param raw - The gateway's `output` text.
 * @returns The diagnosis fields without `raw_output`.
 */
function parseGatewayOutput(raw: string): Omit<DiagnosisResult, 'raw_output'> {
  const parsed = typeof raw === 'string' ? extractFirstJsonObject(raw) : undefined;
  if (parsed === undefined) {
    return { action: 'Review logs and consider manual restart.', severity: 'warning', auto_heal: false };
  }

  const rawAction = parsed['action'];
  const rawSeverity = parsed['severity'];
  const rawAutoHeal = parsed['auto_heal'];

  const action = typeof rawAction === 'string' ? rawAction : 'Review logs and restart service.';
  const severity =
    rawSeverity === 'info' || rawSeverity === 'warning' || rawSeverity === 'critical'
      ? rawSeverity
      : 'warning';
  // Auto-healing must be opted into explicitly with a real boolean.
  const auto_heal = typeof rawAutoHeal === 'boolean' ? rawAutoHeal : false;

  return { action, severity, auto_heal };
}
|
|
|
|
/**
 * Produces the canned diagnosis used when no gateway answer is available.
 *
 * The result recommends restarting the service tied to the check, is rated
 * `warning`, and permits auto-healing (the module's fail-open behavior).
 *
 * @param checkName - Name of the failed check, embedded in the action text.
 * @returns A complete diagnosis with a placeholder `raw_output`.
 */
function buildDefaultDiagnosis(checkName: string): DiagnosisResult {
  const action = `Default healing: restart service associated with check '${checkName}'.`;
  const fallback: DiagnosisResult = {
    action,
    severity: 'warning',
    auto_heal: true,
    raw_output: '(gateway unavailable — default diagnosis)',
  };
  return fallback;
}
|
|
|
|
/**
 * Asks the LLM Gateway to diagnose a failed infrastructure check.
 *
 * Posts the rendered prompt to `${GATEWAY_URL}/v1/generate` and aborts the
 * request once `TIMEOUT_MS` elapses. This function fails open: a non-OK status,
 * a timeout, a network error, or any other exception raised while building or
 * handling the request is logged as a warning and answered with the default
 * diagnosis instead of being thrown.
 *
 * @param checkName - Name of the failed check.
 * @param checkResult - Result object of the failed check.
 * @param recentHistory - Prior incidents passed on to the prompt builder.
 * @returns The parsed gateway diagnosis, or the default diagnosis on failure.
 */
export async function diagnoseIssue(
  checkName: string,
  checkResult: CheckResult,
  recentHistory: IncidentRecord[],
): Promise<DiagnosisResult> {
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), TIMEOUT_MS);

  try {
    // Built inside the try so that a prompt-building failure also fails open.
    const prompt = buildDiagnosisInput(checkName, checkResult, recentHistory);

    const requestHeaders = {
      'Content-Type': 'application/json',
      'X-Caller': 'ctxhealer',
      'X-Internal-Secret': INTERNAL_SECRET,
      'Cache-Control': 'no-store',
    };
    const payload = JSON.stringify({
      task_type: 'ctx_health_diagnose',
      input: prompt,
      user_context: '',
    });

    const res = await fetch(`${GATEWAY_URL}/v1/generate`, {
      method: 'POST',
      headers: requestHeaders,
      body: payload,
      signal: abort.signal,
    });

    if (res.ok) {
      const data = (await res.json()) as GatewayResponse;
      const diagnosis = parseGatewayOutput(data.output);

      logger.info(
        { checkName, severity: diagnosis.severity, auto_heal: diagnosis.auto_heal },
        'Gateway diagnosis received',
      );

      return { ...diagnosis, raw_output: data.output };
    }

    // Non-OK status: log a bounded excerpt of the body and fall back.
    const errorText = await res.text();
    logger.warn(
      { checkName, status: res.status, body: errorText.slice(0, 200) },
      'Gateway returned non-OK for diagnosis',
    );
    return buildDefaultDiagnosis(checkName);
  } catch (cause) {
    // An AbortError here means the timeout fired.
    const isAbort = cause instanceof Error && cause.name === 'AbortError';
    logger.warn({ err: cause, checkName, isAbort }, 'Gateway call failed — using default diagnosis (fail-open)');
    return buildDefaultDiagnosis(checkName);
  } finally {
    clearTimeout(deadline);
  }
}
|
|
|
|
/**
 * Asks the gateway to reset its circuit breaker (best effort).
 *
 * Sends a POST to `${GATEWAY_URL}/internal/circuit-breaker/reset` with a
 * 5-second abort timeout. Failures are logged rather than thrown: a non-OK
 * status is reported as a rejected reset, and network errors or timeouts are
 * reported as a failed reset. Success is logged only when the gateway actually
 * answered with an OK status.
 */
export async function resetCircuitBreaker(): Promise<void> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 5_000);
  try {
    const response = await fetch(`${GATEWAY_URL}/internal/circuit-breaker/reset`, {
      method: 'POST',
      headers: { 'X-Internal-Secret': INTERNAL_SECRET, 'Cache-Control': 'no-store' },
      signal: controller.signal,
    });

    // fetch() resolves for any HTTP status, so 4xx/5xx must be checked explicitly.
    if (!response.ok) {
      const body = await response.text();
      logger.warn(
        { status: response.status, body: body.slice(0, 200) },
        'Circuit breaker reset rejected by gateway',
      );
      return;
    }

    logger.info('Circuit breaker reset requested');
  } catch (err) {
    logger.warn({ err }, 'Circuit breaker reset failed — gateway may be down');
  } finally {
    clearTimeout(timer);
  }
}
|