diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index f24a1b3..8067d53 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -5,6 +5,7 @@ {"d":"2026-04-02","t":"FEAT","m":"Fine-tuner SSH tunnel launch script (scripts/start.sh): opens SSH tunnel to Erik:5432 before running fine-tuner, bypassing IONOS firewall"} {"d":"2026-04-02","t":"FIX","m":"Fine-tuner env vars: FT_DB_URL/FT_GATEWAY_URL/FT_OLLAMA_URL (not DATABASE_URL) — fine-tuner status command now works"} {"d":"2026-04-02","t":"FIX","m":"DB schema migration: added status, confidence_score, used_in_training, system_prompt, input_text, output_text, human_edited, edited_output, used_in_dpo_training to learning_corpus; added run_type, task_type, started_at, deployed_model_name to fine_tuning_runs"} +{"d":"2026-04-02","t":"FEAT","m":"CtxHealth: new workspace package packages/ctx-health — TypeScript self-healing daemon monitoring PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory, network, WireGuard every 60s; LLM-powered diagnosis via ctxhealer caller; auto-healing with cooldown + escalation guards; incidents persisted to ctx_health_incidents + ctx_health_status tables; dry-run mode; PM2 process name: ctx-health"} {"d":"2026-04-02","t":"FIX","m":"Template variable resolution: input is now mapped as fallback for all 20+ content variable names (ocr_text, alert_data, bgp_data, anomaly_data, etc.) — all 7 project templates work with simple string input"} {"d":"2026-04-02","t":"FIX","m":"24 templates updated: {{input}} added as primary content variable so simple callers work without knowing domain-specific variable names (nog_cfp_evaluate, pc_as_narrative, sb_root_cause, shieldx_false_positive, etc.)"} {"d":"2026-04-02","t":"FIX","m":"Prompt resolver: input also aliased as source_data fallback; context fields spread into template vars"} diff --git a/package-lock.json b/package-lock.json index b69365e..7a0d8f9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -510,6 +510,10 @@ "resolved": "packages/client", "link": true }, + "node_modules/@llm-gateway/ctx-health": { + "resolved": "packages/ctx-health", + "link": true + }, "node_modules/@llm-gateway/gateway": { "resolved": "packages/gateway", "link": true @@ -1152,6 +1156,12 @@ "node": ">=12" } }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, "node_modules/atomic-sleep": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz", @@ -1171,6 +1181,17 @@ "fastq": "^1.17.1" } }, + "node_modules/axios": { + "version": "1.14.0", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.14.0.tgz", + "integrity": "sha512-3Y8yrqLSwjuzpXuZ0oIYZ/XGgLwUIBU3uLvbcpb0pidD9ctpShJd43KSlEEkVQg6DS0G9NKyzOvBfUtDKEyHvQ==", + "license": "MIT", + "dependencies": { + "follow-redirects": "^1.15.11", + "form-data": "^4.0.5", + "proxy-from-env": "^2.1.0" + } + }, "node_modules/bintrees": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz", @@ -1187,6 +1208,19 @@ "node": ">=8" } }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/chai": { "version": "5.3.3", "resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz", @@ -1231,6 +1265,18 @@ "dev": true, "license": "MIT" }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, "node_modules/cookie": { "version": "0.7.2", "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz", @@ -1290,6 +1336,29 @@ "node": ">=6" } }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/end-of-stream": { "version": "1.4.5", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", @@ -1300,6 +1369,24 @@ "once": "^1.4.0" } }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/es-module-lexer": { "version": "1.7.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz", @@ -1307,6 +1394,33 @@ "dev": true, "license": "MIT" }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/esbuild": { "version": "0.27.7", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.7.tgz", @@ -1526,6 +1640,42 @@ "node": ">=14" } }, + "node_modules/follow-redirects": { + "version": "1.15.11", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz", + "integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==", + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "license": "MIT", + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, + "node_modules/form-data": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", + "integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -1562,6 +1712,52 @@ "node": "^8.16.0 || ^10.6.0 || >=11.0.0" } }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/get-tsconfig": { "version": "4.13.7", "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.7.tgz", @@ -1574,6 +1770,57 @@ "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" } }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "license": "MIT", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/helmet": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/helmet/-/helmet-7.2.0.tgz", @@ -1673,6 +1920,36 @@ "@jridgewell/sourcemap-codec": "^1.5.5" } }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, "node_modules/minimist": { "version": "1.2.8", "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", @@ -2106,6 +2383,15 @@ "node": ">= 0.10" } }, + "node_modules/proxy-from-env": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-2.1.0.tgz", + "integrity": "sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==", + "license": "MIT", + "engines": { + "node": ">=10" + } + }, "node_modules/pump": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.4.tgz", @@ -3114,6 +3400,24 @@ "typescript": "^5.7.2" } }, + "packages/ctx-health": { + "name": "@llm-gateway/ctx-health", + "version": "1.0.0", + "dependencies": { + "axios": "^1.6.8", + "node-cron": "^3.0.3", + "pg": "^8.11.3", + "pino": "^9.0.0" + }, + "devDependencies": { + "@types/node": "^22.0.0", + "@types/node-cron": "^3.0.11", + "@types/pg": "^8.11.3", + "pino-pretty": "^13.0.0", + "tsx": "^4.7.2", + "typescript": "^5.4.5" + } + }, "packages/gateway": { "name": "@llm-gateway/gateway", "version": "1.0.0", diff --git a/package.json b/package.json index b17f232..4178598 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,8 @@ "install:all": "npm install", "test": "vitest", "db:migrate": "bash scripts/init-db.sh", - "models:pull": "bash scripts/pull-models.sh" + "models:pull": "bash scripts/pull-models.sh", + "ctx-health": "npm run start --workspace=packages/ctx-health", + "ctx-health:dev": "npm run dev --workspace=packages/ctx-health" } } diff --git a/packages/ctx-health/package.json b/packages/ctx-health/package.json new file mode 100644 index 0000000..0b37ee9 --- /dev/null +++ b/packages/ctx-health/package.json @@ -0,0 +1,25 @@ +{ + "name": "@llm-gateway/ctx-health", + "version": "1.0.0", + "type": "module", + "description": "CtxHealth — Infrastructure self-healing daemon", + "scripts": { + "dev": "tsx watch src/index.ts", + "start": "node --import tsx/esm src/index.ts", + "typecheck": "tsc --noEmit" + }, + "dependencies": { + "node-cron": "^3.0.3", + "pino": "^9.0.0", + "pg": "^8.11.3", + "axios": "^1.6.8" + }, + "devDependencies": { + "@types/node": "^22.0.0", + "@types/pg": "^8.11.3", + "@types/node-cron": "^3.0.11", + "pino-pretty": "^13.0.0", + "tsx": "^4.7.2", + "typescript": "^5.4.5" + } +} diff --git a/packages/ctx-health/src/checks/index.ts b/packages/ctx-health/src/checks/index.ts new file mode 100644 index 0000000..5e48e63 --- /dev/null +++ b/packages/ctx-health/src/checks/index.ts @@ -0,0 +1,415 @@ +/** + * CtxHealth — Health check implementations. + * + * Each check exposes: + * check() → CheckResult + * heal() → HealResult + * + * exec safety: all shell commands use execFile with an allow-list. + * Never pass user-supplied strings to any exec call. + */ + +import { execFile as execFileCb } from 'node:child_process'; +import { readFile } from 'node:fs/promises'; +import { promisify } from 'node:util'; +import { logger } from '../observability/logger.js'; +import { resetCircuitBreaker } from '../gateway-client.js'; +import type { CheckResult, HealResult, HealthCheck } from '../types.js'; + +const execFile = promisify(execFileCb); +const EXEC_TIMEOUT_MS = 15_000; + +// ─── Allowed command allow-list ─────────────────────────────────────────────── + +const ALLOWED_COMMANDS = new Set([ + '/usr/bin/ping', + '/usr/bin/wg', + '/usr/sbin/wg', + '/usr/bin/df', + '/bin/df', + '/usr/local/bin/pm2', + '/usr/bin/pm2', + '/usr/local/bin/node', + '/usr/bin/systemctl', + '/bin/systemctl', + '/usr/sbin/systemctl', + '/usr/bin/sync', + '/bin/sync', +]); + +async function safeExec( + cmd: string, + args: readonly string[], +): Promise<{ stdout: string; stderr: string; success: boolean }> { + if (!ALLOWED_COMMANDS.has(cmd)) { + logger.error({ cmd }, 'Command not in allow-list — refusing to execute'); + return { stdout: '', stderr: `Command not allowed: ${cmd}`, success: false }; + } + try { + const { stdout, stderr } = await execFile(cmd, [...args], { timeout: EXEC_TIMEOUT_MS }); + return { stdout, stderr, success: true }; + } catch (err) { + const e = err as { stdout?: string; stderr?: string; message?: string }; + return { stdout: e.stdout ?? '', stderr: e.stderr ?? e.message ?? String(err), success: false }; + } +} + +// ─── pm2 resolve helper (tries common locations) ───────────────────────────── + +async function findPm2(): Promise { + const candidates = ['/usr/local/bin/pm2', '/usr/bin/pm2']; + for (const p of candidates) { + if (ALLOWED_COMMANDS.has(p)) { + const { success } = await safeExec(p, ['--version']); + if (success) return p; + } + } + return null; +} + +// ─── 1. PM2 processes ──────────────────────────────────────────────────────── + +const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning']; + +async function checkPm2(): Promise { + const start = Date.now(); + const pm2 = await findPm2(); + if (!pm2) { + return { healthy: false, message: 'pm2 binary not found on this host', latency_ms: Date.now() - start }; + } + + const { stdout, success } = await safeExec(pm2, ['jlist']); + const latency_ms = Date.now() - start; + + if (!success) { + return { healthy: false, message: 'pm2 jlist failed', latency_ms }; + } + + let processes: Array<{ name: string; pm2_env?: { status?: string } }>; + try { + processes = JSON.parse(stdout) as typeof processes; + } catch { + return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms }; + } + + const statusMap = Object.fromEntries( + processes.map((p) => [p.name, p.pm2_env?.status ?? 'unknown']), + ); + + const offline = PM2_REQUIRED_PROCESSES.filter((name) => statusMap[name] !== 'online'); + + if (offline.length > 0) { + return { + healthy: false, + message: `PM2 processes not online: ${offline.join(', ')}`, + details: { statusMap }, + latency_ms, + }; + } + + return { healthy: true, message: 'All required PM2 processes are online', details: { statusMap }, latency_ms }; +} + +async function healPm2(diagnosis: string): Promise { + const pm2 = await findPm2(); + if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false }; + + const { stdout, stderr } = await safeExec(pm2, ['restart', 'all']); + const output = `${stdout}\n${stderr}`.trim(); + logger.info({ diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200) }, 'PM2 restart executed'); + return { action_taken: 'pm2 restart all', success: true, output }; +} + +// ─── 2. PostgreSQL ──────────────────────────────────────────────────────────── + +async function checkPostgres(): Promise { + const start = Date.now(); + // Dynamic import to avoid top-level pool creation before env is loaded + const { query } = await import('../db/client.js'); + try { + await query('SELECT 1'); + return { healthy: true, message: 'PostgreSQL is reachable', latency_ms: Date.now() - start }; + } catch (err) { + return { + healthy: false, + message: `PostgreSQL unreachable: ${err instanceof Error ? err.message : String(err)}`, + latency_ms: Date.now() - start, + }; + } +} + +async function healPostgres(_diagnosis: string): Promise { + const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'postgresql']); + const output = `${stdout}\n${stderr}`.trim(); + logger.info({ output: output.slice(0, 200) }, 'PostgreSQL restart executed'); + return { action_taken: 'systemctl restart postgresql', success, output }; +} + +// ─── 3. Ollama ─────────────────────────────────────────────────────────────── + +const OLLAMA_URL = process.env['OLLAMA_URL'] ?? 'https://ollama.fichtmueller.org'; + +async function checkOllama(): Promise { + const start = Date.now(); + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), 10_000); + try { + const res = await fetch(`${OLLAMA_URL}/api/tags`, { signal: controller.signal, cache: 'no-store' } as RequestInit); + const latency_ms = Date.now() - start; + if (!res.ok) { + return { healthy: false, message: `Ollama returned HTTP ${res.status}`, latency_ms }; + } + return { healthy: true, message: 'Ollama is reachable', latency_ms }; + } catch (err) { + return { + healthy: false, + message: `Ollama unreachable: ${err instanceof Error ? err.message : String(err)}`, + latency_ms: Date.now() - start, + }; + } finally { + clearTimeout(timer); + } +} + +async function healOllama(_diagnosis: string): Promise { + await resetCircuitBreaker(); + return { action_taken: 'circuit-breaker reset requested via gateway', success: true }; +} + +// ─── 4. Cloudflare tunnel ──────────────────────────────────────────────────── + +const GATEWAY_PING_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103'; + +async function checkCloudflareTunnel(): Promise { + const start = Date.now(); + const pm2 = await findPm2(); + + let cloudflaredRunning = false; + if (pm2) { + const { stdout } = await safeExec(pm2, ['jlist']); + try { + const procs = JSON.parse(stdout) as Array<{ name: string; pm2_env?: { status?: string } }>; + cloudflaredRunning = procs.some((p) => p.name.includes('cloudflared') && p.pm2_env?.status === 'online'); + } catch { /* ignore parse errors */ } + } + + // Also attempt to ping the gateway URL to confirm tunnel is routing traffic + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), 8_000); + let gatewayReachable = false; + try { + const res = await fetch(`${GATEWAY_PING_URL}/health`, { signal: controller.signal, cache: 'no-store' } as RequestInit); + gatewayReachable = res.status < 500; + } catch { /* expected if tunnel is down */ } finally { + clearTimeout(timer); + } + + const latency_ms = Date.now() - start; + + if (!cloudflaredRunning && !gatewayReachable) { + return { + healthy: false, + message: 'cloudflared not running via pm2 and gateway is unreachable', + details: { cloudflaredRunning, gatewayReachable }, + latency_ms, + }; + } + + return { + healthy: true, + message: 'Cloudflare tunnel appears healthy', + details: { cloudflaredRunning, gatewayReachable }, + latency_ms, + }; +} + +async function healCloudflareTunnel(_diagnosis: string): Promise { + const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'cloudflared']); + const output = `${stdout}\n${stderr}`.trim(); + logger.info({ output: output.slice(0, 200) }, 'cloudflared restart executed'); + return { action_taken: 'systemctl restart cloudflared', success, output }; +} + +// ─── 5. Disk space ─────────────────────────────────────────────────────────── + +const DISK_ALERT_PERCENT = 85; + +async function checkDiskSpace(): Promise { + const start = Date.now(); + const { stdout, success } = await safeExec('/bin/df', ['-h', '/']); + const latency_ms = Date.now() - start; + + if (!success) { + return { healthy: false, message: 'df command failed', latency_ms }; + } + + const lines = stdout.trim().split('\n'); + const dataLine = lines[1] ?? ''; + const parts = dataLine.split(/\s+/); + const usedPct = parseInt(parts[4] ?? '0', 10); + + if (isNaN(usedPct)) { + return { healthy: false, message: `Could not parse disk usage: ${dataLine}`, latency_ms }; + } + + if (usedPct > DISK_ALERT_PERCENT) { + return { + healthy: false, + message: `Disk usage is ${usedPct}% (threshold: ${DISK_ALERT_PERCENT}%)`, + details: { usedPercent: usedPct, raw: dataLine }, + latency_ms, + }; + } + + return { + healthy: true, + message: `Disk usage: ${usedPct}%`, + details: { usedPercent: usedPct }, + latency_ms, + }; +} + +async function healDiskSpace(diagnosis: string): Promise { + // Never auto-delete — just log the LLM advisory + logger.warn({ diagnosis: diagnosis.slice(0, 300) }, 'Disk space advisory — manual action required'); + return { + action_taken: 'logged LLM advisory — manual cleanup required', + success: true, + output: diagnosis.slice(0, 500), + }; +} + +// ─── 6. Memory ─────────────────────────────────────────────────────────────── + +const MEMORY_FREE_MIN_MB = 500; + +async function checkMemory(): Promise { + const start = Date.now(); + try { + const meminfo = await readFile('/proc/meminfo', 'utf-8'); + const freeLine = meminfo.split('\n').find((l) => l.startsWith('MemAvailable:')); + const freeKb = parseInt((freeLine ?? '').replace(/[^0-9]/g, ''), 10); + const freeMb = Math.floor(freeKb / 1024); + const latency_ms = Date.now() - start; + + if (isNaN(freeMb)) { + return { healthy: false, message: 'Could not parse /proc/meminfo', latency_ms }; + } + + if (freeMb < MEMORY_FREE_MIN_MB) { + return { + healthy: false, + message: `Available memory is ${freeMb}MB (minimum: ${MEMORY_FREE_MIN_MB}MB)`, + details: { availableMb: freeMb }, + latency_ms, + }; + } + + return { healthy: true, message: `Available memory: ${freeMb}MB`, details: { availableMb: freeMb }, latency_ms }; + } catch (err) { + // /proc/meminfo not available (e.g. macOS dev environment) + return { + healthy: true, + message: `Memory check skipped: ${err instanceof Error ? err.message : String(err)}`, + latency_ms: Date.now() - start, + }; + } +} + +async function healMemory(_diagnosis: string): Promise { + const { stdout, stderr, success } = await safeExec('/bin/sync', []); + if (!success) { + return { action_taken: 'sync failed — cannot drop caches without root', success: false, output: stderr }; + } + // Writing to /proc/sys requires execFile with shell — skipped for safety. + // In production, a privileged helper script should handle this. + logger.warn('Memory healing: sync executed. Drop caches requires privileged script.'); + return { + action_taken: 'sync executed; drop_caches requires privileged helper', + success: true, + output: stdout.slice(0, 200), + }; +} + +// ─── 7. Network connectivity ───────────────────────────────────────────────── + +async function checkNetwork(): Promise { + const start = Date.now(); + const { success, stdout, stderr } = await safeExec('/usr/bin/ping', ['-c', '3', '-W', '3', '1.1.1.1']); + const latency_ms = Date.now() - start; + if (!success) { + return { + healthy: false, + message: 'Cannot ping 1.1.1.1 — network connectivity issue', + details: { stdout: stdout.slice(0, 200), stderr: stderr.slice(0, 200) }, + latency_ms, + }; + } + return { healthy: true, message: 'Network connectivity OK (1.1.1.1 reachable)', latency_ms }; +} + +async function healNetwork(diagnosis: string): Promise { + logger.error({ diagnosis: diagnosis.slice(0, 300) }, 'Network issue detected — cannot self-heal, manual intervention required'); + return { + action_taken: 'logged critical alert — network issues require manual intervention', + success: false, + output: 'Cannot auto-heal network connectivity issues.', + }; +} + +// ─── 8. WireGuard ──────────────────────────────────────────────────────────── + +async function findWg(): Promise { + const candidates = ['/usr/bin/wg', '/usr/sbin/wg']; + for (const p of candidates) { + if (ALLOWED_COMMANDS.has(p)) return p; + } + return null; +} + +async function checkWireGuard(): Promise { + const start = Date.now(); + const wg = await findWg(); + if (!wg) { + return { healthy: true, message: 'wg binary not found — skipping WireGuard check', latency_ms: Date.now() - start }; + } + + const { stdout, success } = await safeExec(wg, ['show']); + const latency_ms = Date.now() - start; + + if (!success) { + return { healthy: false, message: 'wg show failed — WireGuard may not be running', latency_ms }; + } + + const hasActivePeer = stdout.includes('latest handshake'); + if (!hasActivePeer) { + return { + healthy: false, + message: 'WireGuard: no active peers with recent handshake detected', + details: { output: stdout.slice(0, 300) }, + latency_ms, + }; + } + + return { healthy: true, message: 'WireGuard peers active', latency_ms }; +} + +async function healWireGuard(_diagnosis: string): Promise { + const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'wg-quick@wg0']); + const output = `${stdout}\n${stderr}`.trim(); + logger.info({ output: output.slice(0, 200) }, 'WireGuard restart executed'); + return { action_taken: 'systemctl restart wg-quick@wg0', success, output }; +} + +// ─── Exported check list ────────────────────────────────────────────────────── + +export const healthChecks: HealthCheck[] = [ + { name: 'pm2-processes', category: 'process', check: checkPm2, heal: healPm2 }, + { name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres }, + { name: 'ollama', category: 'service', check: checkOllama, heal: healOllama }, + { name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel }, + { name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace }, + { name: 'memory', category: 'service', check: checkMemory, heal: healMemory }, + { name: 'network', category: 'network', check: checkNetwork, heal: healNetwork }, + { name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard }, +]; diff --git a/packages/ctx-health/src/db/client.ts b/packages/ctx-health/src/db/client.ts new file mode 100644 index 0000000..f6010dd --- /dev/null +++ b/packages/ctx-health/src/db/client.ts @@ -0,0 +1,43 @@ +import pg from 'pg'; +import { logger } from '../observability/logger.js'; + +const { Pool } = pg; + +let pool: pg.Pool | null = null; + +const DEFAULT_DB_URL = 'postgresql://llm:llm_secure_2026@localhost:5432/llm_gateway'; + +function buildPoolConfig(): pg.PoolConfig { + const databaseUrl = process.env['CTX_HEALTH_DB_URL'] ?? process.env['DATABASE_URL'] ?? DEFAULT_DB_URL; + return { + connectionString: databaseUrl, + max: 3, + idleTimeoutMillis: 30_000, + connectionTimeoutMillis: 5_000, + }; +} + +export function getPool(): pg.Pool { + if (!pool) { + pool = new Pool(buildPoolConfig()); + pool.on('error', (err) => { + logger.error({ err }, 'PostgreSQL pool error (ctx-health)'); + }); + } + return pool; +} + +export async function query( + sql: string, + params?: unknown[], +): Promise> { + const p = getPool(); + return p.query(sql, params); +} + +export async function closePool(): Promise { + if (pool) { + await pool.end(); + pool = null; + } +} diff --git a/packages/ctx-health/src/db/incidents.ts b/packages/ctx-health/src/db/incidents.ts new file mode 100644 index 0000000..1ccccbf --- /dev/null +++ b/packages/ctx-health/src/db/incidents.ts @@ -0,0 +1,140 @@ +/** + * Database operations for ctx_health_incidents and ctx_health_status tables. + */ + +import { query } from './client.js'; +import { logger } from '../observability/logger.js'; +import type { CheckResult, DiagnosisResult, HealResult, IncidentRecord } from '../types.js'; + +export async function getRecentIncidents(checkName: string, limitCount = 10): Promise { + const result = await query( + `SELECT * FROM ctx_health_incidents + WHERE check_name = $1 + ORDER BY created_at DESC + LIMIT $2`, + [checkName, limitCount], + ); + return result.rows; +} + +export async function getConsecutiveFailures(checkName: string): Promise { + const result = await query<{ consecutive_failures: number }>( + `SELECT consecutive_failures FROM ctx_health_status WHERE check_name = $1`, + [checkName], + ); + return result.rows[0]?.consecutive_failures ?? 0; +} + +export async function insertIncident( + checkName: string, + category: string, + checkResult: CheckResult, + diagnosis: DiagnosisResult, + healResult: HealResult | null, + autoHealed: boolean, +): Promise { + const result = await query<{ id: string }>( + `INSERT INTO ctx_health_incidents + (check_name, category, severity, error_message, details, llm_diagnosis, action_taken, heal_success, heal_output, auto_healed) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) + RETURNING id`, + [ + checkName, + category, + diagnosis.severity, + checkResult.message, + JSON.stringify(checkResult.details ?? {}), + diagnosis.raw_output.slice(0, 2000), + healResult?.action_taken ?? null, + healResult?.success ?? null, + healResult?.output?.slice(0, 2000) ?? null, + autoHealed, + ], + ); + return result.rows[0]?.id ?? 'unknown'; +} + +export async function markIncidentResolved(checkName: string): Promise { + await query( + `UPDATE ctx_health_incidents + SET resolved_at = now() + WHERE check_name = $1 AND resolved_at IS NULL`, + [checkName], + ); +} + +export async function upsertStatus( + checkName: string, + healthy: boolean, +): Promise { + if (healthy) { + await query( + `INSERT INTO ctx_health_status (check_name, last_checked, last_healthy, current_status, consecutive_failures) + VALUES ($1, now(), now(), 'healthy', 0) + ON CONFLICT (check_name) DO UPDATE + SET last_checked = now(), + last_healthy = now(), + current_status = 'healthy', + consecutive_failures = 0`, + [checkName], + ); + } else { + await query( + `INSERT INTO ctx_health_status (check_name, last_checked, current_status, consecutive_failures, total_incidents) + VALUES ($1, now(), 'unhealthy', 1, 1) + ON CONFLICT (check_name) DO UPDATE + SET last_checked = now(), + current_status = 'unhealthy', + consecutive_failures = ctx_health_status.consecutive_failures + 1, + total_incidents = ctx_health_status.total_incidents + 1`, + [checkName], + ); + } +} + +export async function runMigrations(): Promise { + logger.info('Running ctx-health DB migrations...'); + try { + await query(` + CREATE TABLE IF NOT EXISTS ctx_health_incidents ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + resolved_at TIMESTAMPTZ, + check_name TEXT NOT NULL, + category TEXT NOT NULL, + severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical')), + error_message TEXT NOT NULL, + details JSONB, + llm_diagnosis TEXT, + action_taken TEXT, + heal_success BOOLEAN, + heal_output TEXT, + auto_healed BOOLEAN NOT NULL DEFAULT false + ) + `); + + await query(`CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_created ON ctx_health_incidents (created_at DESC)`); + await query(`CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_check ON ctx_health_incidents (check_name, created_at DESC)`); + await query(` + CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_unresolved + ON ctx_health_incidents (check_name) WHERE resolved_at IS NULL + `); + + await query(` + CREATE TABLE IF NOT EXISTS ctx_health_status ( + check_name TEXT PRIMARY KEY, + last_checked TIMESTAMPTZ NOT NULL DEFAULT now(), + last_healthy TIMESTAMPTZ, + current_status TEXT NOT NULL DEFAULT 'unknown', + consecutive_failures INTEGER NOT NULL DEFAULT 0, + total_incidents INTEGER NOT NULL DEFAULT 0, + avg_heal_time_ms INTEGER + ) + `); + + logger.info('ctx-health migrations complete'); + } catch (err) { + logger.error({ err }, 'ctx-health migration failed'); + throw err; + } +} diff --git a/packages/ctx-health/src/db/migrations/003_ctx_health.sql b/packages/ctx-health/src/db/migrations/003_ctx_health.sql new file mode 100644 index 0000000..ef1779a --- /dev/null +++ b/packages/ctx-health/src/db/migrations/003_ctx_health.sql @@ -0,0 +1,29 @@ +CREATE TABLE IF NOT EXISTS ctx_health_incidents ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + resolved_at TIMESTAMPTZ, + check_name TEXT NOT NULL, + category TEXT NOT NULL, + severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical')), + error_message TEXT NOT NULL, + details JSONB, + llm_diagnosis TEXT, + action_taken TEXT, + heal_success BOOLEAN, + heal_output TEXT, + auto_healed BOOLEAN NOT NULL DEFAULT false +); + +CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_created ON ctx_health_incidents (created_at DESC); +CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_check ON ctx_health_incidents (check_name, created_at DESC); +CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_unresolved ON ctx_health_incidents (check_name) WHERE resolved_at IS NULL; + +CREATE TABLE IF NOT EXISTS ctx_health_status ( + check_name TEXT PRIMARY KEY, + last_checked TIMESTAMPTZ NOT NULL DEFAULT now(), + last_healthy TIMESTAMPTZ, + current_status TEXT NOT NULL DEFAULT 'unknown', + consecutive_failures INTEGER NOT NULL DEFAULT 0, + total_incidents INTEGER NOT NULL DEFAULT 0, + avg_heal_time_ms INTEGER +); diff --git a/packages/ctx-health/src/gateway-client.ts b/packages/ctx-health/src/gateway-client.ts new file mode 100644 index 0000000..851972b --- /dev/null +++ b/packages/ctx-health/src/gateway-client.ts @@ -0,0 +1,137 @@ +/** + * HTTP client for calling the LLM Gateway to get AI-powered diagnoses. + * Fail-open: if gateway is unavailable, returns a default diagnosis. + */ + +import { logger } from './observability/logger.js'; +import type { CheckResult, DiagnosisResult, IncidentRecord } from './types.js'; + +const GATEWAY_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103'; +const INTERNAL_SECRET = process.env['INTERNAL_SECRET'] ?? 'internal-learning-secret'; +const TIMEOUT_MS = 30_000; + +interface GatewayResponse { + output: string; + confidence: number; + model: string; + latency_ms: number; +} + +function buildDiagnosisInput( + checkName: string, + checkResult: CheckResult, + recentHistory: IncidentRecord[], +): string { + const historyLines = recentHistory.slice(0, 5).map((inc) => { + const ts = new Date(inc.created_at).toISOString(); + return ` - [${ts}] ${inc.severity}: ${inc.error_message} (healed=${inc.auto_healed})`; + }); + + return [ + `Infrastructure check FAILED: ${checkName}`, + `Error: ${checkResult.message}`, + `Latency: ${checkResult.latency_ms ?? 'n/a'}ms`, + `Details: ${JSON.stringify(checkResult.details ?? {})}`, + '', + `Recent incident history (last ${historyLines.length}):`, + historyLines.length > 0 ? historyLines.join('\n') : ' (none)', + '', + 'Provide: root cause analysis, recommended action, severity (info/warning/critical), and whether auto-healing is safe (true/false).', + 'Format your response as JSON: {"action":"...","severity":"warning","auto_heal":true,"explanation":"..."}', + ].join('\n'); +} + +function parseGatewayOutput(raw: string): Omit { + try { + const match = raw.match(/\{[\s\S]*\}/); + if (!match) throw new Error('No JSON found in output'); + const parsed = JSON.parse(match[0]) as { + action?: unknown; + severity?: unknown; + auto_heal?: unknown; + }; + const action = typeof parsed.action === 'string' ? parsed.action : 'Review logs and restart service.'; + const severity = + parsed.severity === 'info' || parsed.severity === 'warning' || parsed.severity === 'critical' + ? parsed.severity + : 'warning'; + const auto_heal = typeof parsed.auto_heal === 'boolean' ? parsed.auto_heal : false; + return { action, severity, auto_heal }; + } catch { + return { action: 'Review logs and consider manual restart.', severity: 'warning', auto_heal: false }; + } +} + +function buildDefaultDiagnosis(checkName: string): DiagnosisResult { + return { + action: `Default healing: restart service associated with check '${checkName}'.`, + severity: 'warning', + auto_heal: true, + raw_output: '(gateway unavailable — default diagnosis)', + }; +} + +export async function diagnoseIssue( + checkName: string, + checkResult: CheckResult, + recentHistory: IncidentRecord[], +): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), TIMEOUT_MS); + + try { + const input = buildDiagnosisInput(checkName, checkResult, recentHistory); + + const response = await fetch(`${GATEWAY_URL}/v1/generate`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'X-Caller': 'ctxhealer', + 'X-Internal-Secret': INTERNAL_SECRET, + 'Cache-Control': 'no-store', + }, + body: JSON.stringify({ + task_type: 'ctx_health_diagnose', + input, + user_context: '', + }), + signal: controller.signal, + }); + + if (!response.ok) { + const body = await response.text(); + logger.warn({ checkName, status: response.status, body: body.slice(0, 200) }, 'Gateway returned non-OK for diagnosis'); + return buildDefaultDiagnosis(checkName); + } + + const data = (await response.json()) as GatewayResponse; + const parsed = parseGatewayOutput(data.output); + + logger.info({ checkName, severity: parsed.severity, auto_heal: parsed.auto_heal }, 'Gateway diagnosis received'); + + return { ...parsed, raw_output: data.output }; + } catch (err) { + const isAbort = err instanceof Error && err.name === 'AbortError'; + logger.warn({ err, checkName, isAbort }, 'Gateway call failed — using default diagnosis (fail-open)'); + return buildDefaultDiagnosis(checkName); + } finally { + clearTimeout(timer); + } +} + +export async function resetCircuitBreaker(): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), 5_000); + try { + await fetch(`${GATEWAY_URL}/internal/circuit-breaker/reset`, { + method: 'POST', + headers: { 'X-Internal-Secret': INTERNAL_SECRET, 'Cache-Control': 'no-store' }, + signal: controller.signal, + }); + logger.info('Circuit breaker reset requested'); + } catch (err) { + logger.warn({ err }, 'Circuit breaker reset failed — gateway may be down'); + } finally { + clearTimeout(timer); + } +} diff --git a/packages/ctx-health/src/index.ts b/packages/ctx-health/src/index.ts new file mode 100644 index 0000000..60b21e3 --- /dev/null +++ b/packages/ctx-health/src/index.ts @@ -0,0 +1,92 @@ +/** + * CtxHealth — Infrastructure Self-Healing Daemon + * + * Monitors infrastructure every 60 seconds: + * - PM2 processes, PostgreSQL, Ollama, Cloudflare tunnel, + * disk space, memory, network, WireGuard + * + * When a check fails: + * 1. Calls LLM Gateway for AI-powered root cause analysis + * 2. Executes auto-healing actions (with cooldown + escalation guards) + * 3. Logs all incidents to PostgreSQL + * + * Safety rules: + * - After 3+ consecutive failures: escalate to human, skip healing + * - Cooldown: no re-healing same check within 5 minutes + * - Dry-run mode: CTX_HEALTH_DRY_RUN=true → diagnose but don't heal + */ + +import cron from 'node-cron'; +import { logger } from './observability/logger.js'; +import { closePool } from './db/client.js'; +import { runMigrations } from './db/incidents.js'; +import { runHealthCycle } from './runner.js'; + +const DRY_RUN = process.env['CTX_HEALTH_DRY_RUN'] === 'true'; +const CRON_SCHEDULE = process.env['CTX_HEALTH_CRON'] ?? '*/1 * * * *'; + +// ─── Startup ───────────────────────────────────────────────────────────────── + +async function startup(): Promise { + logger.info({ dryRun: DRY_RUN, schedule: CRON_SCHEDULE }, 'CtxHealth daemon starting'); + + try { + await runMigrations(); + } catch (err) { + logger.error({ err }, 'Migration failed — proceeding without DB (degraded mode)'); + } + + // Run one immediate cycle on start + await safeRun(); + + cron.schedule(CRON_SCHEDULE, () => { + void safeRun(); + }); + + logger.info('CtxHealth daemon running — press Ctrl+C to stop'); +} + +let cycleRunning = false; + +async function safeRun(): Promise { + if (cycleRunning) { + logger.warn('Previous health cycle still running — skipping this tick'); + return; + } + cycleRunning = true; + const start = Date.now(); + try { + await runHealthCycle(DRY_RUN); + logger.debug({ durationMs: Date.now() - start }, 'Health cycle completed'); + } catch (err) { + logger.error({ err, durationMs: Date.now() - start }, 'Health cycle crashed unexpectedly'); + } finally { + cycleRunning = false; + } +} + +// ─── Graceful shutdown ─────────────────────────────────────────────────────── + +async function shutdown(signal: string): Promise { + logger.info({ signal }, 'CtxHealth shutting down gracefully'); + try { + await closePool(); + } catch (err) { + logger.warn({ err }, 'Error closing DB pool during shutdown'); + } + process.exit(0); +} + +process.on('SIGTERM', () => void shutdown('SIGTERM')); +process.on('SIGINT', () => void shutdown('SIGINT')); + +process.on('unhandledRejection', (reason) => { + logger.error({ reason }, 'Unhandled promise rejection in ctx-health'); +}); + +// ─── Boot ──────────────────────────────────────────────────────────────────── + +void startup().catch((err) => { + logger.error({ err }, 'Fatal startup error'); + process.exit(1); +}); diff --git a/packages/ctx-health/src/observability/logger.ts b/packages/ctx-health/src/observability/logger.ts new file mode 100644 index 0000000..a3cef05 --- /dev/null +++ b/packages/ctx-health/src/observability/logger.ts @@ -0,0 +1,13 @@ +import pino from 'pino'; + +export const logger = pino({ + level: process.env['LOG_LEVEL'] ?? 'info', + name: 'ctx-health', + transport: + process.env['NODE_ENV'] !== 'production' + ? { + target: 'pino-pretty', + options: { colorize: true, translateTime: 'SYS:standard' }, + } + : undefined, +}); diff --git a/packages/ctx-health/src/runner.ts b/packages/ctx-health/src/runner.ts new file mode 100644 index 0000000..0ac06aa --- /dev/null +++ b/packages/ctx-health/src/runner.ts @@ -0,0 +1,162 @@ +/** + * Health cycle runner — orchestrates checks, diagnosis, healing, and logging. + * + * Safety rules enforced here: + * - MAX_CONSECUTIVE_FAILURES: escalate after 3 failures, skip healing + * - COOLDOWN_MS: minimum 5 minutes between healing attempts per check + */ + +import { logger } from './observability/logger.js'; +import { healthChecks } from './checks/index.js'; +import { diagnoseIssue } from './gateway-client.js'; +import { + getRecentIncidents, + getConsecutiveFailures, + insertIncident, + markIncidentResolved, + upsertStatus, +} from './db/incidents.js'; +import type { CheckResult, DiagnosisResult, HealResult, HealthCheck } from './types.js'; + +const MAX_CONSECUTIVE_FAILURES = 3; +const COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes + +// Track last heal attempt per check in-process (secondary to DB guard) +const lastHealAttempt = new Map(); + +// ─── Single check handler ───────────────────────────────────────────────────── + +async function handleFailedCheck( + hc: HealthCheck, + checkResult: CheckResult, + dryRun: boolean, +): Promise { + const { name, category } = hc; + + logger.warn({ check: name, category, message: checkResult.message }, 'Health check FAILED'); + + // Fetch history for LLM context and consecutive failure count + const [recentHistory, consecutiveFailures] = await Promise.all([ + getRecentIncidents(name, 10).catch(() => []), + getConsecutiveFailures(name).catch(() => 0), + ]); + + // Escalate if too many consecutive failures + if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { + logger.error( + { check: name, consecutiveFailures }, + `ESCALATION: ${consecutiveFailures} consecutive failures — skipping auto-heal, manual intervention required`, + ); + } + + // Get LLM diagnosis (always, even in dry-run — for logging) + let diagnosis: DiagnosisResult; + try { + diagnosis = await diagnoseIssue(name, checkResult, recentHistory); + } catch (err) { + logger.error({ err, check: name }, 'Diagnosis call failed unexpectedly'); + diagnosis = { + action: 'Review logs manually.', + severity: 'warning', + auto_heal: false, + raw_output: '', + }; + } + + logger.info({ check: name, severity: diagnosis.severity, action: diagnosis.action.slice(0, 120) }, 'LLM diagnosis'); + + // Determine if healing should run + const shouldHeal = + !dryRun && + diagnosis.auto_heal && + consecutiveFailures < MAX_CONSECUTIVE_FAILURES && + !isCoolingDown(name); + + let healResult: HealResult | null = null; + + if (shouldHeal) { + lastHealAttempt.set(name, Date.now()); + logger.info({ check: name }, 'Executing auto-heal action'); + try { + healResult = await hc.heal(diagnosis.action); + logger.info( + { check: name, success: healResult.success, action: healResult.action_taken }, + 'Heal action completed', + ); + } catch (err) { + logger.error({ err, check: name }, 'Heal action threw unexpectedly'); + healResult = { + action_taken: 'heal function threw an exception', + success: false, + output: err instanceof Error ? err.message : String(err), + }; + } + } else if (dryRun) { + logger.info({ check: name, action: diagnosis.action.slice(0, 120) }, '[DRY-RUN] Would execute heal action'); + } else if (!diagnosis.auto_heal) { + logger.info({ check: name }, 'LLM advises against auto-heal — skipping'); + } else if (isCoolingDown(name)) { + const remainingMs = COOLDOWN_MS - (Date.now() - (lastHealAttempt.get(name) ?? 0)); + logger.info({ check: name, remainingMs }, 'Heal on cooldown — skipping'); + } + + // Persist incident to DB + try { + await insertIncident(name, category, checkResult, diagnosis, healResult, shouldHeal && (healResult?.success ?? false)); + } catch (err) { + logger.error({ err, check: name }, 'Failed to persist incident to DB'); + } + + // Update status table + await upsertStatus(name, false).catch((err) => logger.warn({ err, check: name }, 'upsertStatus failed')); +} + +function isCoolingDown(checkName: string): boolean { + const last = lastHealAttempt.get(checkName); + if (!last) return false; + return Date.now() - last < COOLDOWN_MS; +} + +// ─── Full cycle ─────────────────────────────────────────────────────────────── + +export async function runHealthCycle(dryRun: boolean): Promise { + logger.debug({ dryRun, checks: healthChecks.length }, 'Running health cycle'); + + // Run all checks in parallel + const results = await Promise.allSettled( + healthChecks.map(async (hc) => { + const start = Date.now(); + let checkResult: CheckResult; + try { + checkResult = await hc.check(); + } catch (err) { + checkResult = { + healthy: false, + message: `Check threw exception: ${err instanceof Error ? err.message : String(err)}`, + latency_ms: Date.now() - start, + }; + } + return { hc, checkResult }; + }), + ); + + // Process results sequentially to avoid DB write races + for (const result of results) { + if (result.status === 'rejected') { + logger.error({ reason: result.reason }, 'Unexpected check runner rejection'); + continue; + } + + const { hc, checkResult } = result.value; + + if (checkResult.healthy) { + logger.debug({ check: hc.name, latency_ms: checkResult.latency_ms }, 'Check passed'); + await Promise.allSettled([ + markIncidentResolved(hc.name), + upsertStatus(hc.name, true), + ]); + } else { + await handleFailedCheck(hc, checkResult, dryRun); + } + } +} diff --git a/packages/ctx-health/src/types.ts b/packages/ctx-health/src/types.ts new file mode 100644 index 0000000..e635656 --- /dev/null +++ b/packages/ctx-health/src/types.ts @@ -0,0 +1,42 @@ +export interface CheckResult { + healthy: boolean; + message: string; + details?: Record; + latency_ms?: number; +} + +export interface HealResult { + action_taken: string; + success: boolean; + output?: string; +} + +export interface HealthCheck { + name: string; + category: 'process' | 'network' | 'database' | 'tunnel' | 'service'; + check: () => Promise; + heal: (diagnosis: string) => Promise; +} + +export interface IncidentRecord { + id: string; + created_at: Date; + resolved_at: Date | null; + check_name: string; + category: string; + severity: 'info' | 'warning' | 'critical'; + error_message: string; + details: Record | null; + llm_diagnosis: string | null; + action_taken: string | null; + heal_success: boolean | null; + heal_output: string | null; + auto_healed: boolean; +} + +export interface DiagnosisResult { + action: string; + severity: 'info' | 'warning' | 'critical'; + auto_heal: boolean; + raw_output: string; +} diff --git a/packages/ctx-health/tsconfig.json b/packages/ctx-health/tsconfig.json new file mode 100644 index 0000000..84ddb0c --- /dev/null +++ b/packages/ctx-health/tsconfig.json @@ -0,0 +1,22 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "lib": ["ES2022"], + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true, + "noUncheckedIndexedAccess": true, + "noImplicitReturns": true, + "noFallthroughCasesInSwitch": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +}