feat: add CtxHealth self-healing daemon as new workspace package

New package @llm-gateway/ctx-health (packages/ctx-health/) — a TypeScript
infrastructure monitoring and auto-healing daemon. Monitors 8 subsystems
every 60s (PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory,
network, WireGuard), gets AI-powered root cause analysis via the gateway
(ctxhealer caller / ctx_health_diagnose task_type), executes healing
actions with cooldown (5min) and escalation guards (3+ failures → human
escalation), persists all incidents to ctx_health_incidents and
ctx_health_status tables. Dry-run mode via CTX_HEALTH_DRY_RUN=true.
Runs as ctx-health PM2 process on Erik server.
This commit is contained in:
Rene Fichtmueller 2026-04-03 00:16:08 +02:00
parent a8a77e689c
commit e0b9fa1f53
14 changed files with 1428 additions and 1 deletions

View File

@ -5,6 +5,7 @@
{"d":"2026-04-02","t":"FEAT","m":"Fine-tuner SSH tunnel launch script (scripts/start.sh): opens SSH tunnel to Erik:5432 before running fine-tuner, bypassing IONOS firewall"} {"d":"2026-04-02","t":"FEAT","m":"Fine-tuner SSH tunnel launch script (scripts/start.sh): opens SSH tunnel to Erik:5432 before running fine-tuner, bypassing IONOS firewall"}
{"d":"2026-04-02","t":"FIX","m":"Fine-tuner env vars: FT_DB_URL/FT_GATEWAY_URL/FT_OLLAMA_URL (not DATABASE_URL) — fine-tuner status command now works"} {"d":"2026-04-02","t":"FIX","m":"Fine-tuner env vars: FT_DB_URL/FT_GATEWAY_URL/FT_OLLAMA_URL (not DATABASE_URL) — fine-tuner status command now works"}
{"d":"2026-04-02","t":"FIX","m":"DB schema migration: added status, confidence_score, used_in_training, system_prompt, input_text, output_text, human_edited, edited_output, used_in_dpo_training to learning_corpus; added run_type, task_type, started_at, deployed_model_name to fine_tuning_runs"} {"d":"2026-04-02","t":"FIX","m":"DB schema migration: added status, confidence_score, used_in_training, system_prompt, input_text, output_text, human_edited, edited_output, used_in_dpo_training to learning_corpus; added run_type, task_type, started_at, deployed_model_name to fine_tuning_runs"}
{"d":"2026-04-02","t":"FEAT","m":"CtxHealth: new workspace package packages/ctx-health — TypeScript self-healing daemon monitoring PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory, network, WireGuard every 60s; LLM-powered diagnosis via ctxhealer caller; auto-healing with cooldown + escalation guards; incidents persisted to ctx_health_incidents + ctx_health_status tables; dry-run mode; PM2 process name: ctx-health"}
{"d":"2026-04-02","t":"FIX","m":"Template variable resolution: input is now mapped as fallback for all 20+ content variable names (ocr_text, alert_data, bgp_data, anomaly_data, etc.) — all 7 project templates work with simple string input"} {"d":"2026-04-02","t":"FIX","m":"Template variable resolution: input is now mapped as fallback for all 20+ content variable names (ocr_text, alert_data, bgp_data, anomaly_data, etc.) — all 7 project templates work with simple string input"}
{"d":"2026-04-02","t":"FIX","m":"24 templates updated: {{input}} added as primary content variable so simple callers work without knowing domain-specific variable names (nog_cfp_evaluate, pc_as_narrative, sb_root_cause, shieldx_false_positive, etc.)"} {"d":"2026-04-02","t":"FIX","m":"24 templates updated: {{input}} added as primary content variable so simple callers work without knowing domain-specific variable names (nog_cfp_evaluate, pc_as_narrative, sb_root_cause, shieldx_false_positive, etc.)"}
{"d":"2026-04-02","t":"FIX","m":"Prompt resolver: input also aliased as source_data fallback; context fields spread into template vars"} {"d":"2026-04-02","t":"FIX","m":"Prompt resolver: input also aliased as source_data fallback; context fields spread into template vars"}

304
package-lock.json generated
View File

@ -510,6 +510,10 @@
"resolved": "packages/client", "resolved": "packages/client",
"link": true "link": true
}, },
"node_modules/@llm-gateway/ctx-health": {
"resolved": "packages/ctx-health",
"link": true
},
"node_modules/@llm-gateway/gateway": { "node_modules/@llm-gateway/gateway": {
"resolved": "packages/gateway", "resolved": "packages/gateway",
"link": true "link": true
@ -1152,6 +1156,12 @@
"node": ">=12" "node": ">=12"
} }
}, },
"node_modules/asynckit": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
"license": "MIT"
},
"node_modules/atomic-sleep": { "node_modules/atomic-sleep": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz", "resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz",
@ -1171,6 +1181,17 @@
"fastq": "^1.17.1" "fastq": "^1.17.1"
} }
}, },
"node_modules/axios": {
"version": "1.14.0",
"resolved": "https://registry.npmjs.org/axios/-/axios-1.14.0.tgz",
"integrity": "sha512-3Y8yrqLSwjuzpXuZ0oIYZ/XGgLwUIBU3uLvbcpb0pidD9ctpShJd43KSlEEkVQg6DS0G9NKyzOvBfUtDKEyHvQ==",
"license": "MIT",
"dependencies": {
"follow-redirects": "^1.15.11",
"form-data": "^4.0.5",
"proxy-from-env": "^2.1.0"
}
},
"node_modules/bintrees": { "node_modules/bintrees": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz", "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz",
@ -1187,6 +1208,19 @@
"node": ">=8" "node": ">=8"
} }
}, },
"node_modules/call-bind-apply-helpers": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
"integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
"license": "MIT",
"dependencies": {
"es-errors": "^1.3.0",
"function-bind": "^1.1.2"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/chai": { "node_modules/chai": {
"version": "5.3.3", "version": "5.3.3",
"resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz", "resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz",
@ -1231,6 +1265,18 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/combined-stream": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
"license": "MIT",
"dependencies": {
"delayed-stream": "~1.0.0"
},
"engines": {
"node": ">= 0.8"
}
},
"node_modules/cookie": { "node_modules/cookie": {
"version": "0.7.2", "version": "0.7.2",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz", "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz",
@ -1290,6 +1336,29 @@
"node": ">=6" "node": ">=6"
} }
}, },
"node_modules/delayed-stream": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
"license": "MIT",
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/dunder-proto": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
"integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
"license": "MIT",
"dependencies": {
"call-bind-apply-helpers": "^1.0.1",
"es-errors": "^1.3.0",
"gopd": "^1.2.0"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/end-of-stream": { "node_modules/end-of-stream": {
"version": "1.4.5", "version": "1.4.5",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
@ -1300,6 +1369,24 @@
"once": "^1.4.0" "once": "^1.4.0"
} }
}, },
"node_modules/es-define-property": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
"integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
"license": "MIT",
"engines": {
"node": ">= 0.4"
}
},
"node_modules/es-errors": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
"integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
"license": "MIT",
"engines": {
"node": ">= 0.4"
}
},
"node_modules/es-module-lexer": { "node_modules/es-module-lexer": {
"version": "1.7.0", "version": "1.7.0",
"resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz",
@ -1307,6 +1394,33 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/es-object-atoms": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
"integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
"license": "MIT",
"dependencies": {
"es-errors": "^1.3.0"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/es-set-tostringtag": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz",
"integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==",
"license": "MIT",
"dependencies": {
"es-errors": "^1.3.0",
"get-intrinsic": "^1.2.6",
"has-tostringtag": "^1.0.2",
"hasown": "^2.0.2"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/esbuild": { "node_modules/esbuild": {
"version": "0.27.7", "version": "0.27.7",
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.7.tgz", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.7.tgz",
@ -1526,6 +1640,42 @@
"node": ">=14" "node": ">=14"
} }
}, },
"node_modules/follow-redirects": {
"version": "1.15.11",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
"integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==",
"funding": [
{
"type": "individual",
"url": "https://github.com/sponsors/RubenVerborgh"
}
],
"license": "MIT",
"engines": {
"node": ">=4.0"
},
"peerDependenciesMeta": {
"debug": {
"optional": true
}
}
},
"node_modules/form-data": {
"version": "4.0.5",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz",
"integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==",
"license": "MIT",
"dependencies": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.8",
"es-set-tostringtag": "^2.1.0",
"hasown": "^2.0.2",
"mime-types": "^2.1.12"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/forwarded": { "node_modules/forwarded": {
"version": "0.2.0", "version": "0.2.0",
"resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz",
@ -1562,6 +1712,52 @@
"node": "^8.16.0 || ^10.6.0 || >=11.0.0" "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
} }
}, },
"node_modules/function-bind": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
"integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
"license": "MIT",
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/get-intrinsic": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
"integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
"license": "MIT",
"dependencies": {
"call-bind-apply-helpers": "^1.0.2",
"es-define-property": "^1.0.1",
"es-errors": "^1.3.0",
"es-object-atoms": "^1.1.1",
"function-bind": "^1.1.2",
"get-proto": "^1.0.1",
"gopd": "^1.2.0",
"has-symbols": "^1.1.0",
"hasown": "^2.0.2",
"math-intrinsics": "^1.1.0"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/get-proto": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
"integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
"license": "MIT",
"dependencies": {
"dunder-proto": "^1.0.1",
"es-object-atoms": "^1.0.0"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/get-tsconfig": { "node_modules/get-tsconfig": {
"version": "4.13.7", "version": "4.13.7",
"resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.7.tgz", "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.7.tgz",
@ -1574,6 +1770,57 @@
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
} }
}, },
"node_modules/gopd": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
"integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
"license": "MIT",
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/has-symbols": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
"integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
"license": "MIT",
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/has-tostringtag": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz",
"integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==",
"license": "MIT",
"dependencies": {
"has-symbols": "^1.0.3"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/hasown": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
"integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
"license": "MIT",
"dependencies": {
"function-bind": "^1.1.2"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/helmet": { "node_modules/helmet": {
"version": "7.2.0", "version": "7.2.0",
"resolved": "https://registry.npmjs.org/helmet/-/helmet-7.2.0.tgz", "resolved": "https://registry.npmjs.org/helmet/-/helmet-7.2.0.tgz",
@ -1673,6 +1920,36 @@
"@jridgewell/sourcemap-codec": "^1.5.5" "@jridgewell/sourcemap-codec": "^1.5.5"
} }
}, },
"node_modules/math-intrinsics": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
"integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
"license": "MIT",
"engines": {
"node": ">= 0.4"
}
},
"node_modules/mime-db": {
"version": "1.52.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
"license": "MIT",
"engines": {
"node": ">= 0.6"
}
},
"node_modules/mime-types": {
"version": "2.1.35",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
"license": "MIT",
"dependencies": {
"mime-db": "1.52.0"
},
"engines": {
"node": ">= 0.6"
}
},
"node_modules/minimist": { "node_modules/minimist": {
"version": "1.2.8", "version": "1.2.8",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
@ -2106,6 +2383,15 @@
"node": ">= 0.10" "node": ">= 0.10"
} }
}, },
"node_modules/proxy-from-env": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-2.1.0.tgz",
"integrity": "sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==",
"license": "MIT",
"engines": {
"node": ">=10"
}
},
"node_modules/pump": { "node_modules/pump": {
"version": "3.0.4", "version": "3.0.4",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.4.tgz", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.4.tgz",
@ -3114,6 +3400,24 @@
"typescript": "^5.7.2" "typescript": "^5.7.2"
} }
}, },
"packages/ctx-health": {
"name": "@llm-gateway/ctx-health",
"version": "1.0.0",
"dependencies": {
"axios": "^1.6.8",
"node-cron": "^3.0.3",
"pg": "^8.11.3",
"pino": "^9.0.0"
},
"devDependencies": {
"@types/node": "^22.0.0",
"@types/node-cron": "^3.0.11",
"@types/pg": "^8.11.3",
"pino-pretty": "^13.0.0",
"tsx": "^4.7.2",
"typescript": "^5.4.5"
}
},
"packages/gateway": { "packages/gateway": {
"name": "@llm-gateway/gateway", "name": "@llm-gateway/gateway",
"version": "1.0.0", "version": "1.0.0",

View File

@ -11,6 +11,8 @@
"install:all": "npm install", "install:all": "npm install",
"test": "vitest", "test": "vitest",
"db:migrate": "bash scripts/init-db.sh", "db:migrate": "bash scripts/init-db.sh",
"models:pull": "bash scripts/pull-models.sh" "models:pull": "bash scripts/pull-models.sh",
"ctx-health": "npm run start --workspace=packages/ctx-health",
"ctx-health:dev": "npm run dev --workspace=packages/ctx-health"
} }
} }

View File

@ -0,0 +1,25 @@
{
"name": "@llm-gateway/ctx-health",
"version": "1.0.0",
"type": "module",
"description": "CtxHealth — Infrastructure self-healing daemon",
"scripts": {
"dev": "tsx watch src/index.ts",
"start": "node --import tsx/esm src/index.ts",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"node-cron": "^3.0.3",
"pino": "^9.0.0",
"pg": "^8.11.3",
"axios": "^1.6.8"
},
"devDependencies": {
"@types/node": "^22.0.0",
"@types/pg": "^8.11.3",
"@types/node-cron": "^3.0.11",
"pino-pretty": "^13.0.0",
"tsx": "^4.7.2",
"typescript": "^5.4.5"
}
}

View File

@ -0,0 +1,415 @@
/**
* CtxHealth Health check implementations.
*
* Each check exposes:
* check() CheckResult
* heal() HealResult
*
* exec safety: all shell commands use execFile with an allow-list.
* Never pass user-supplied strings to any exec call.
*/
import { execFile as execFileCb } from 'node:child_process';
import { readFile } from 'node:fs/promises';
import { promisify } from 'node:util';
import { logger } from '../observability/logger.js';
import { resetCircuitBreaker } from '../gateway-client.js';
import type { CheckResult, HealResult, HealthCheck } from '../types.js';
const execFile = promisify(execFileCb);
const EXEC_TIMEOUT_MS = 15_000;
// ─── Allowed command allow-list ───────────────────────────────────────────────
const ALLOWED_COMMANDS = new Set([
'/usr/bin/ping',
'/usr/bin/wg',
'/usr/sbin/wg',
'/usr/bin/df',
'/bin/df',
'/usr/local/bin/pm2',
'/usr/bin/pm2',
'/usr/local/bin/node',
'/usr/bin/systemctl',
'/bin/systemctl',
'/usr/sbin/systemctl',
'/usr/bin/sync',
'/bin/sync',
]);
async function safeExec(
cmd: string,
args: readonly string[],
): Promise<{ stdout: string; stderr: string; success: boolean }> {
if (!ALLOWED_COMMANDS.has(cmd)) {
logger.error({ cmd }, 'Command not in allow-list — refusing to execute');
return { stdout: '', stderr: `Command not allowed: ${cmd}`, success: false };
}
try {
const { stdout, stderr } = await execFile(cmd, [...args], { timeout: EXEC_TIMEOUT_MS });
return { stdout, stderr, success: true };
} catch (err) {
const e = err as { stdout?: string; stderr?: string; message?: string };
return { stdout: e.stdout ?? '', stderr: e.stderr ?? e.message ?? String(err), success: false };
}
}
// ─── pm2 resolve helper (tries common locations) ─────────────────────────────
async function findPm2(): Promise<string | null> {
const candidates = ['/usr/local/bin/pm2', '/usr/bin/pm2'];
for (const p of candidates) {
if (ALLOWED_COMMANDS.has(p)) {
const { success } = await safeExec(p, ['--version']);
if (success) return p;
}
}
return null;
}
// ─── 1. PM2 processes ────────────────────────────────────────────────────────
const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];
async function checkPm2(): Promise<CheckResult> {
const start = Date.now();
const pm2 = await findPm2();
if (!pm2) {
return { healthy: false, message: 'pm2 binary not found on this host', latency_ms: Date.now() - start };
}
const { stdout, success } = await safeExec(pm2, ['jlist']);
const latency_ms = Date.now() - start;
if (!success) {
return { healthy: false, message: 'pm2 jlist failed', latency_ms };
}
let processes: Array<{ name: string; pm2_env?: { status?: string } }>;
try {
processes = JSON.parse(stdout) as typeof processes;
} catch {
return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
}
const statusMap = Object.fromEntries(
processes.map((p) => [p.name, p.pm2_env?.status ?? 'unknown']),
);
const offline = PM2_REQUIRED_PROCESSES.filter((name) => statusMap[name] !== 'online');
if (offline.length > 0) {
return {
healthy: false,
message: `PM2 processes not online: ${offline.join(', ')}`,
details: { statusMap },
latency_ms,
};
}
return { healthy: true, message: 'All required PM2 processes are online', details: { statusMap }, latency_ms };
}
async function healPm2(diagnosis: string): Promise<HealResult> {
const pm2 = await findPm2();
if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };
const { stdout, stderr } = await safeExec(pm2, ['restart', 'all']);
const output = `${stdout}\n${stderr}`.trim();
logger.info({ diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200) }, 'PM2 restart executed');
return { action_taken: 'pm2 restart all', success: true, output };
}
// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
async function checkPostgres(): Promise<CheckResult> {
const start = Date.now();
// Dynamic import to avoid top-level pool creation before env is loaded
const { query } = await import('../db/client.js');
try {
await query('SELECT 1');
return { healthy: true, message: 'PostgreSQL is reachable', latency_ms: Date.now() - start };
} catch (err) {
return {
healthy: false,
message: `PostgreSQL unreachable: ${err instanceof Error ? err.message : String(err)}`,
latency_ms: Date.now() - start,
};
}
}
async function healPostgres(_diagnosis: string): Promise<HealResult> {
const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'postgresql']);
const output = `${stdout}\n${stderr}`.trim();
logger.info({ output: output.slice(0, 200) }, 'PostgreSQL restart executed');
return { action_taken: 'systemctl restart postgresql', success, output };
}
// ─── 3. Ollama ───────────────────────────────────────────────────────────────
const OLLAMA_URL = process.env['OLLAMA_URL'] ?? 'https://ollama.fichtmueller.org';
async function checkOllama(): Promise<CheckResult> {
const start = Date.now();
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), 10_000);
try {
const res = await fetch(`${OLLAMA_URL}/api/tags`, { signal: controller.signal, cache: 'no-store' } as RequestInit);
const latency_ms = Date.now() - start;
if (!res.ok) {
return { healthy: false, message: `Ollama returned HTTP ${res.status}`, latency_ms };
}
return { healthy: true, message: 'Ollama is reachable', latency_ms };
} catch (err) {
return {
healthy: false,
message: `Ollama unreachable: ${err instanceof Error ? err.message : String(err)}`,
latency_ms: Date.now() - start,
};
} finally {
clearTimeout(timer);
}
}
async function healOllama(_diagnosis: string): Promise<HealResult> {
await resetCircuitBreaker();
return { action_taken: 'circuit-breaker reset requested via gateway', success: true };
}
// ─── 4. Cloudflare tunnel ────────────────────────────────────────────────────
const GATEWAY_PING_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';
async function checkCloudflareTunnel(): Promise<CheckResult> {
const start = Date.now();
const pm2 = await findPm2();
let cloudflaredRunning = false;
if (pm2) {
const { stdout } = await safeExec(pm2, ['jlist']);
try {
const procs = JSON.parse(stdout) as Array<{ name: string; pm2_env?: { status?: string } }>;
cloudflaredRunning = procs.some((p) => p.name.includes('cloudflared') && p.pm2_env?.status === 'online');
} catch { /* ignore parse errors */ }
}
// Also attempt to ping the gateway URL to confirm tunnel is routing traffic
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), 8_000);
let gatewayReachable = false;
try {
const res = await fetch(`${GATEWAY_PING_URL}/health`, { signal: controller.signal, cache: 'no-store' } as RequestInit);
gatewayReachable = res.status < 500;
} catch { /* expected if tunnel is down */ } finally {
clearTimeout(timer);
}
const latency_ms = Date.now() - start;
if (!cloudflaredRunning && !gatewayReachable) {
return {
healthy: false,
message: 'cloudflared not running via pm2 and gateway is unreachable',
details: { cloudflaredRunning, gatewayReachable },
latency_ms,
};
}
return {
healthy: true,
message: 'Cloudflare tunnel appears healthy',
details: { cloudflaredRunning, gatewayReachable },
latency_ms,
};
}
async function healCloudflareTunnel(_diagnosis: string): Promise<HealResult> {
const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'cloudflared']);
const output = `${stdout}\n${stderr}`.trim();
logger.info({ output: output.slice(0, 200) }, 'cloudflared restart executed');
return { action_taken: 'systemctl restart cloudflared', success, output };
}
// ─── 5. Disk space ───────────────────────────────────────────────────────────
const DISK_ALERT_PERCENT = 85;
async function checkDiskSpace(): Promise<CheckResult> {
const start = Date.now();
const { stdout, success } = await safeExec('/bin/df', ['-h', '/']);
const latency_ms = Date.now() - start;
if (!success) {
return { healthy: false, message: 'df command failed', latency_ms };
}
const lines = stdout.trim().split('\n');
const dataLine = lines[1] ?? '';
const parts = dataLine.split(/\s+/);
const usedPct = parseInt(parts[4] ?? '0', 10);
if (isNaN(usedPct)) {
return { healthy: false, message: `Could not parse disk usage: ${dataLine}`, latency_ms };
}
if (usedPct > DISK_ALERT_PERCENT) {
return {
healthy: false,
message: `Disk usage is ${usedPct}% (threshold: ${DISK_ALERT_PERCENT}%)`,
details: { usedPercent: usedPct, raw: dataLine },
latency_ms,
};
}
return {
healthy: true,
message: `Disk usage: ${usedPct}%`,
details: { usedPercent: usedPct },
latency_ms,
};
}
async function healDiskSpace(diagnosis: string): Promise<HealResult> {
// Never auto-delete — just log the LLM advisory
logger.warn({ diagnosis: diagnosis.slice(0, 300) }, 'Disk space advisory — manual action required');
return {
action_taken: 'logged LLM advisory — manual cleanup required',
success: true,
output: diagnosis.slice(0, 500),
};
}
// ─── 6. Memory ───────────────────────────────────────────────────────────────
const MEMORY_FREE_MIN_MB = 500;
async function checkMemory(): Promise<CheckResult> {
const start = Date.now();
try {
const meminfo = await readFile('/proc/meminfo', 'utf-8');
const freeLine = meminfo.split('\n').find((l) => l.startsWith('MemAvailable:'));
const freeKb = parseInt((freeLine ?? '').replace(/[^0-9]/g, ''), 10);
const freeMb = Math.floor(freeKb / 1024);
const latency_ms = Date.now() - start;
if (isNaN(freeMb)) {
return { healthy: false, message: 'Could not parse /proc/meminfo', latency_ms };
}
if (freeMb < MEMORY_FREE_MIN_MB) {
return {
healthy: false,
message: `Available memory is ${freeMb}MB (minimum: ${MEMORY_FREE_MIN_MB}MB)`,
details: { availableMb: freeMb },
latency_ms,
};
}
return { healthy: true, message: `Available memory: ${freeMb}MB`, details: { availableMb: freeMb }, latency_ms };
} catch (err) {
// /proc/meminfo not available (e.g. macOS dev environment)
return {
healthy: true,
message: `Memory check skipped: ${err instanceof Error ? err.message : String(err)}`,
latency_ms: Date.now() - start,
};
}
}
async function healMemory(_diagnosis: string): Promise<HealResult> {
const { stdout, stderr, success } = await safeExec('/bin/sync', []);
if (!success) {
return { action_taken: 'sync failed — cannot drop caches without root', success: false, output: stderr };
}
// Writing to /proc/sys requires execFile with shell — skipped for safety.
// In production, a privileged helper script should handle this.
logger.warn('Memory healing: sync executed. Drop caches requires privileged script.');
return {
action_taken: 'sync executed; drop_caches requires privileged helper',
success: true,
output: stdout.slice(0, 200),
};
}
// ─── 7. Network connectivity ─────────────────────────────────────────────────
async function checkNetwork(): Promise<CheckResult> {
const start = Date.now();
const { success, stdout, stderr } = await safeExec('/usr/bin/ping', ['-c', '3', '-W', '3', '1.1.1.1']);
const latency_ms = Date.now() - start;
if (!success) {
return {
healthy: false,
message: 'Cannot ping 1.1.1.1 — network connectivity issue',
details: { stdout: stdout.slice(0, 200), stderr: stderr.slice(0, 200) },
latency_ms,
};
}
return { healthy: true, message: 'Network connectivity OK (1.1.1.1 reachable)', latency_ms };
}
async function healNetwork(diagnosis: string): Promise<HealResult> {
logger.error({ diagnosis: diagnosis.slice(0, 300) }, 'Network issue detected — cannot self-heal, manual intervention required');
return {
action_taken: 'logged critical alert — network issues require manual intervention',
success: false,
output: 'Cannot auto-heal network connectivity issues.',
};
}
// ─── 8. WireGuard ────────────────────────────────────────────────────────────
async function findWg(): Promise<string | null> {
const candidates = ['/usr/bin/wg', '/usr/sbin/wg'];
for (const p of candidates) {
if (ALLOWED_COMMANDS.has(p)) return p;
}
return null;
}
async function checkWireGuard(): Promise<CheckResult> {
const start = Date.now();
const wg = await findWg();
if (!wg) {
return { healthy: true, message: 'wg binary not found — skipping WireGuard check', latency_ms: Date.now() - start };
}
const { stdout, success } = await safeExec(wg, ['show']);
const latency_ms = Date.now() - start;
if (!success) {
return { healthy: false, message: 'wg show failed — WireGuard may not be running', latency_ms };
}
const hasActivePeer = stdout.includes('latest handshake');
if (!hasActivePeer) {
return {
healthy: false,
message: 'WireGuard: no active peers with recent handshake detected',
details: { output: stdout.slice(0, 300) },
latency_ms,
};
}
return { healthy: true, message: 'WireGuard peers active', latency_ms };
}
async function healWireGuard(_diagnosis: string): Promise<HealResult> {
const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'wg-quick@wg0']);
const output = `${stdout}\n${stderr}`.trim();
logger.info({ output: output.slice(0, 200) }, 'WireGuard restart executed');
return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
}
// ─── Exported check list ──────────────────────────────────────────────────────
export const healthChecks: HealthCheck[] = [
{ name: 'pm2-processes', category: 'process', check: checkPm2, heal: healPm2 },
{ name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
{ name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
{ name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
{ name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
{ name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
{ name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
{ name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
];

View File

@ -0,0 +1,43 @@
import pg from 'pg';
import { logger } from '../observability/logger.js';
const { Pool } = pg;
let pool: pg.Pool | null = null;
const DEFAULT_DB_URL = 'postgresql://llm:llm_secure_2026@localhost:5432/llm_gateway';
function buildPoolConfig(): pg.PoolConfig {
const databaseUrl = process.env['CTX_HEALTH_DB_URL'] ?? process.env['DATABASE_URL'] ?? DEFAULT_DB_URL;
return {
connectionString: databaseUrl,
max: 3,
idleTimeoutMillis: 30_000,
connectionTimeoutMillis: 5_000,
};
}
export function getPool(): pg.Pool {
if (!pool) {
pool = new Pool(buildPoolConfig());
pool.on('error', (err) => {
logger.error({ err }, 'PostgreSQL pool error (ctx-health)');
});
}
return pool;
}
export async function query<T extends pg.QueryResultRow = pg.QueryResultRow>(
sql: string,
params?: unknown[],
): Promise<pg.QueryResult<T>> {
const p = getPool();
return p.query<T>(sql, params);
}
export async function closePool(): Promise<void> {
if (pool) {
await pool.end();
pool = null;
}
}

View File

@ -0,0 +1,140 @@
/**
* Database operations for ctx_health_incidents and ctx_health_status tables.
*/
import { query } from './client.js';
import { logger } from '../observability/logger.js';
import type { CheckResult, DiagnosisResult, HealResult, IncidentRecord } from '../types.js';
export async function getRecentIncidents(checkName: string, limitCount = 10): Promise<IncidentRecord[]> {
const result = await query<IncidentRecord>(
`SELECT * FROM ctx_health_incidents
WHERE check_name = $1
ORDER BY created_at DESC
LIMIT $2`,
[checkName, limitCount],
);
return result.rows;
}
export async function getConsecutiveFailures(checkName: string): Promise<number> {
const result = await query<{ consecutive_failures: number }>(
`SELECT consecutive_failures FROM ctx_health_status WHERE check_name = $1`,
[checkName],
);
return result.rows[0]?.consecutive_failures ?? 0;
}
export async function insertIncident(
checkName: string,
category: string,
checkResult: CheckResult,
diagnosis: DiagnosisResult,
healResult: HealResult | null,
autoHealed: boolean,
): Promise<string> {
const result = await query<{ id: string }>(
`INSERT INTO ctx_health_incidents
(check_name, category, severity, error_message, details, llm_diagnosis, action_taken, heal_success, heal_output, auto_healed)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
RETURNING id`,
[
checkName,
category,
diagnosis.severity,
checkResult.message,
JSON.stringify(checkResult.details ?? {}),
diagnosis.raw_output.slice(0, 2000),
healResult?.action_taken ?? null,
healResult?.success ?? null,
healResult?.output?.slice(0, 2000) ?? null,
autoHealed,
],
);
return result.rows[0]?.id ?? 'unknown';
}
export async function markIncidentResolved(checkName: string): Promise<void> {
await query(
`UPDATE ctx_health_incidents
SET resolved_at = now()
WHERE check_name = $1 AND resolved_at IS NULL`,
[checkName],
);
}
export async function upsertStatus(
checkName: string,
healthy: boolean,
): Promise<void> {
if (healthy) {
await query(
`INSERT INTO ctx_health_status (check_name, last_checked, last_healthy, current_status, consecutive_failures)
VALUES ($1, now(), now(), 'healthy', 0)
ON CONFLICT (check_name) DO UPDATE
SET last_checked = now(),
last_healthy = now(),
current_status = 'healthy',
consecutive_failures = 0`,
[checkName],
);
} else {
await query(
`INSERT INTO ctx_health_status (check_name, last_checked, current_status, consecutive_failures, total_incidents)
VALUES ($1, now(), 'unhealthy', 1, 1)
ON CONFLICT (check_name) DO UPDATE
SET last_checked = now(),
current_status = 'unhealthy',
consecutive_failures = ctx_health_status.consecutive_failures + 1,
total_incidents = ctx_health_status.total_incidents + 1`,
[checkName],
);
}
}
export async function runMigrations(): Promise<void> {
logger.info('Running ctx-health DB migrations...');
try {
await query(`
CREATE TABLE IF NOT EXISTS ctx_health_incidents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
resolved_at TIMESTAMPTZ,
check_name TEXT NOT NULL,
category TEXT NOT NULL,
severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical')),
error_message TEXT NOT NULL,
details JSONB,
llm_diagnosis TEXT,
action_taken TEXT,
heal_success BOOLEAN,
heal_output TEXT,
auto_healed BOOLEAN NOT NULL DEFAULT false
)
`);
await query(`CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_created ON ctx_health_incidents (created_at DESC)`);
await query(`CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_check ON ctx_health_incidents (check_name, created_at DESC)`);
await query(`
CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_unresolved
ON ctx_health_incidents (check_name) WHERE resolved_at IS NULL
`);
await query(`
CREATE TABLE IF NOT EXISTS ctx_health_status (
check_name TEXT PRIMARY KEY,
last_checked TIMESTAMPTZ NOT NULL DEFAULT now(),
last_healthy TIMESTAMPTZ,
current_status TEXT NOT NULL DEFAULT 'unknown',
consecutive_failures INTEGER NOT NULL DEFAULT 0,
total_incidents INTEGER NOT NULL DEFAULT 0,
avg_heal_time_ms INTEGER
)
`);
logger.info('ctx-health migrations complete');
} catch (err) {
logger.error({ err }, 'ctx-health migration failed');
throw err;
}
}

View File

@ -0,0 +1,29 @@
CREATE TABLE IF NOT EXISTS ctx_health_incidents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
resolved_at TIMESTAMPTZ,
check_name TEXT NOT NULL,
category TEXT NOT NULL,
severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical')),
error_message TEXT NOT NULL,
details JSONB,
llm_diagnosis TEXT,
action_taken TEXT,
heal_success BOOLEAN,
heal_output TEXT,
auto_healed BOOLEAN NOT NULL DEFAULT false
);
CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_created ON ctx_health_incidents (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_check ON ctx_health_incidents (check_name, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_unresolved ON ctx_health_incidents (check_name) WHERE resolved_at IS NULL;
CREATE TABLE IF NOT EXISTS ctx_health_status (
check_name TEXT PRIMARY KEY,
last_checked TIMESTAMPTZ NOT NULL DEFAULT now(),
last_healthy TIMESTAMPTZ,
current_status TEXT NOT NULL DEFAULT 'unknown',
consecutive_failures INTEGER NOT NULL DEFAULT 0,
total_incidents INTEGER NOT NULL DEFAULT 0,
avg_heal_time_ms INTEGER
);

View File

@ -0,0 +1,137 @@
/**
* HTTP client for calling the LLM Gateway to get AI-powered diagnoses.
* Fail-open: if gateway is unavailable, returns a default diagnosis.
*/
import { logger } from './observability/logger.js';
import type { CheckResult, DiagnosisResult, IncidentRecord } from './types.js';
const GATEWAY_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';
const INTERNAL_SECRET = process.env['INTERNAL_SECRET'] ?? 'internal-learning-secret';
const TIMEOUT_MS = 30_000;
interface GatewayResponse {
output: string;
confidence: number;
model: string;
latency_ms: number;
}
function buildDiagnosisInput(
checkName: string,
checkResult: CheckResult,
recentHistory: IncidentRecord[],
): string {
const historyLines = recentHistory.slice(0, 5).map((inc) => {
const ts = new Date(inc.created_at).toISOString();
return ` - [${ts}] ${inc.severity}: ${inc.error_message} (healed=${inc.auto_healed})`;
});
return [
`Infrastructure check FAILED: ${checkName}`,
`Error: ${checkResult.message}`,
`Latency: ${checkResult.latency_ms ?? 'n/a'}ms`,
`Details: ${JSON.stringify(checkResult.details ?? {})}`,
'',
`Recent incident history (last ${historyLines.length}):`,
historyLines.length > 0 ? historyLines.join('\n') : ' (none)',
'',
'Provide: root cause analysis, recommended action, severity (info/warning/critical), and whether auto-healing is safe (true/false).',
'Format your response as JSON: {"action":"...","severity":"warning","auto_heal":true,"explanation":"..."}',
].join('\n');
}
function parseGatewayOutput(raw: string): Omit<DiagnosisResult, 'raw_output'> {
try {
const match = raw.match(/\{[\s\S]*\}/);
if (!match) throw new Error('No JSON found in output');
const parsed = JSON.parse(match[0]) as {
action?: unknown;
severity?: unknown;
auto_heal?: unknown;
};
const action = typeof parsed.action === 'string' ? parsed.action : 'Review logs and restart service.';
const severity =
parsed.severity === 'info' || parsed.severity === 'warning' || parsed.severity === 'critical'
? parsed.severity
: 'warning';
const auto_heal = typeof parsed.auto_heal === 'boolean' ? parsed.auto_heal : false;
return { action, severity, auto_heal };
} catch {
return { action: 'Review logs and consider manual restart.', severity: 'warning', auto_heal: false };
}
}
function buildDefaultDiagnosis(checkName: string): DiagnosisResult {
return {
action: `Default healing: restart service associated with check '${checkName}'.`,
severity: 'warning',
auto_heal: true,
raw_output: '(gateway unavailable — default diagnosis)',
};
}
export async function diagnoseIssue(
checkName: string,
checkResult: CheckResult,
recentHistory: IncidentRecord[],
): Promise<DiagnosisResult> {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), TIMEOUT_MS);
try {
const input = buildDiagnosisInput(checkName, checkResult, recentHistory);
const response = await fetch(`${GATEWAY_URL}/v1/generate`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-Caller': 'ctxhealer',
'X-Internal-Secret': INTERNAL_SECRET,
'Cache-Control': 'no-store',
},
body: JSON.stringify({
task_type: 'ctx_health_diagnose',
input,
user_context: '',
}),
signal: controller.signal,
});
if (!response.ok) {
const body = await response.text();
logger.warn({ checkName, status: response.status, body: body.slice(0, 200) }, 'Gateway returned non-OK for diagnosis');
return buildDefaultDiagnosis(checkName);
}
const data = (await response.json()) as GatewayResponse;
const parsed = parseGatewayOutput(data.output);
logger.info({ checkName, severity: parsed.severity, auto_heal: parsed.auto_heal }, 'Gateway diagnosis received');
return { ...parsed, raw_output: data.output };
} catch (err) {
const isAbort = err instanceof Error && err.name === 'AbortError';
logger.warn({ err, checkName, isAbort }, 'Gateway call failed — using default diagnosis (fail-open)');
return buildDefaultDiagnosis(checkName);
} finally {
clearTimeout(timer);
}
}
export async function resetCircuitBreaker(): Promise<void> {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), 5_000);
try {
await fetch(`${GATEWAY_URL}/internal/circuit-breaker/reset`, {
method: 'POST',
headers: { 'X-Internal-Secret': INTERNAL_SECRET, 'Cache-Control': 'no-store' },
signal: controller.signal,
});
logger.info('Circuit breaker reset requested');
} catch (err) {
logger.warn({ err }, 'Circuit breaker reset failed — gateway may be down');
} finally {
clearTimeout(timer);
}
}

View File

@ -0,0 +1,92 @@
/**
* CtxHealth Infrastructure Self-Healing Daemon
*
* Monitors infrastructure every 60 seconds:
* - PM2 processes, PostgreSQL, Ollama, Cloudflare tunnel,
* disk space, memory, network, WireGuard
*
* When a check fails:
* 1. Calls LLM Gateway for AI-powered root cause analysis
* 2. Executes auto-healing actions (with cooldown + escalation guards)
* 3. Logs all incidents to PostgreSQL
*
* Safety rules:
* - After 3+ consecutive failures: escalate to human, skip healing
* - Cooldown: no re-healing same check within 5 minutes
* - Dry-run mode: CTX_HEALTH_DRY_RUN=true diagnose but don't heal
*/
import cron from 'node-cron';
import { logger } from './observability/logger.js';
import { closePool } from './db/client.js';
import { runMigrations } from './db/incidents.js';
import { runHealthCycle } from './runner.js';
const DRY_RUN = process.env['CTX_HEALTH_DRY_RUN'] === 'true';
const CRON_SCHEDULE = process.env['CTX_HEALTH_CRON'] ?? '*/1 * * * *';
// ─── Startup ─────────────────────────────────────────────────────────────────
async function startup(): Promise<void> {
logger.info({ dryRun: DRY_RUN, schedule: CRON_SCHEDULE }, 'CtxHealth daemon starting');
try {
await runMigrations();
} catch (err) {
logger.error({ err }, 'Migration failed — proceeding without DB (degraded mode)');
}
// Run one immediate cycle on start
await safeRun();
cron.schedule(CRON_SCHEDULE, () => {
void safeRun();
});
logger.info('CtxHealth daemon running — press Ctrl+C to stop');
}
let cycleRunning = false;
async function safeRun(): Promise<void> {
if (cycleRunning) {
logger.warn('Previous health cycle still running — skipping this tick');
return;
}
cycleRunning = true;
const start = Date.now();
try {
await runHealthCycle(DRY_RUN);
logger.debug({ durationMs: Date.now() - start }, 'Health cycle completed');
} catch (err) {
logger.error({ err, durationMs: Date.now() - start }, 'Health cycle crashed unexpectedly');
} finally {
cycleRunning = false;
}
}
// ─── Graceful shutdown ───────────────────────────────────────────────────────
async function shutdown(signal: string): Promise<void> {
logger.info({ signal }, 'CtxHealth shutting down gracefully');
try {
await closePool();
} catch (err) {
logger.warn({ err }, 'Error closing DB pool during shutdown');
}
process.exit(0);
}
process.on('SIGTERM', () => void shutdown('SIGTERM'));
process.on('SIGINT', () => void shutdown('SIGINT'));
process.on('unhandledRejection', (reason) => {
logger.error({ reason }, 'Unhandled promise rejection in ctx-health');
});
// ─── Boot ────────────────────────────────────────────────────────────────────
void startup().catch((err) => {
logger.error({ err }, 'Fatal startup error');
process.exit(1);
});

View File

@ -0,0 +1,13 @@
import pino from 'pino';
export const logger = pino({
level: process.env['LOG_LEVEL'] ?? 'info',
name: 'ctx-health',
transport:
process.env['NODE_ENV'] !== 'production'
? {
target: 'pino-pretty',
options: { colorize: true, translateTime: 'SYS:standard' },
}
: undefined,
});

View File

@ -0,0 +1,162 @@
/**
* Health cycle runner orchestrates checks, diagnosis, healing, and logging.
*
* Safety rules enforced here:
* - MAX_CONSECUTIVE_FAILURES: escalate after 3 failures, skip healing
* - COOLDOWN_MS: minimum 5 minutes between healing attempts per check
*/
import { logger } from './observability/logger.js';
import { healthChecks } from './checks/index.js';
import { diagnoseIssue } from './gateway-client.js';
import {
getRecentIncidents,
getConsecutiveFailures,
insertIncident,
markIncidentResolved,
upsertStatus,
} from './db/incidents.js';
import type { CheckResult, DiagnosisResult, HealResult, HealthCheck } from './types.js';
const MAX_CONSECUTIVE_FAILURES = 3;
const COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes
// Track last heal attempt per check in-process (secondary to DB guard)
const lastHealAttempt = new Map<string, number>();
// ─── Single check handler ─────────────────────────────────────────────────────
async function handleFailedCheck(
hc: HealthCheck,
checkResult: CheckResult,
dryRun: boolean,
): Promise<void> {
const { name, category } = hc;
logger.warn({ check: name, category, message: checkResult.message }, 'Health check FAILED');
// Fetch history for LLM context and consecutive failure count
const [recentHistory, consecutiveFailures] = await Promise.all([
getRecentIncidents(name, 10).catch(() => []),
getConsecutiveFailures(name).catch(() => 0),
]);
// Escalate if too many consecutive failures
if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
logger.error(
{ check: name, consecutiveFailures },
`ESCALATION: ${consecutiveFailures} consecutive failures — skipping auto-heal, manual intervention required`,
);
}
// Get LLM diagnosis (always, even in dry-run — for logging)
let diagnosis: DiagnosisResult;
try {
diagnosis = await diagnoseIssue(name, checkResult, recentHistory);
} catch (err) {
logger.error({ err, check: name }, 'Diagnosis call failed unexpectedly');
diagnosis = {
action: 'Review logs manually.',
severity: 'warning',
auto_heal: false,
raw_output: '',
};
}
logger.info({ check: name, severity: diagnosis.severity, action: diagnosis.action.slice(0, 120) }, 'LLM diagnosis');
// Determine if healing should run
const shouldHeal =
!dryRun &&
diagnosis.auto_heal &&
consecutiveFailures < MAX_CONSECUTIVE_FAILURES &&
!isCoolingDown(name);
let healResult: HealResult | null = null;
if (shouldHeal) {
lastHealAttempt.set(name, Date.now());
logger.info({ check: name }, 'Executing auto-heal action');
try {
healResult = await hc.heal(diagnosis.action);
logger.info(
{ check: name, success: healResult.success, action: healResult.action_taken },
'Heal action completed',
);
} catch (err) {
logger.error({ err, check: name }, 'Heal action threw unexpectedly');
healResult = {
action_taken: 'heal function threw an exception',
success: false,
output: err instanceof Error ? err.message : String(err),
};
}
} else if (dryRun) {
logger.info({ check: name, action: diagnosis.action.slice(0, 120) }, '[DRY-RUN] Would execute heal action');
} else if (!diagnosis.auto_heal) {
logger.info({ check: name }, 'LLM advises against auto-heal — skipping');
} else if (isCoolingDown(name)) {
const remainingMs = COOLDOWN_MS - (Date.now() - (lastHealAttempt.get(name) ?? 0));
logger.info({ check: name, remainingMs }, 'Heal on cooldown — skipping');
}
// Persist incident to DB
try {
await insertIncident(name, category, checkResult, diagnosis, healResult, shouldHeal && (healResult?.success ?? false));
} catch (err) {
logger.error({ err, check: name }, 'Failed to persist incident to DB');
}
// Update status table
await upsertStatus(name, false).catch((err) => logger.warn({ err, check: name }, 'upsertStatus failed'));
}
function isCoolingDown(checkName: string): boolean {
const last = lastHealAttempt.get(checkName);
if (!last) return false;
return Date.now() - last < COOLDOWN_MS;
}
// ─── Full cycle ───────────────────────────────────────────────────────────────
export async function runHealthCycle(dryRun: boolean): Promise<void> {
logger.debug({ dryRun, checks: healthChecks.length }, 'Running health cycle');
// Run all checks in parallel
const results = await Promise.allSettled(
healthChecks.map(async (hc) => {
const start = Date.now();
let checkResult: CheckResult;
try {
checkResult = await hc.check();
} catch (err) {
checkResult = {
healthy: false,
message: `Check threw exception: ${err instanceof Error ? err.message : String(err)}`,
latency_ms: Date.now() - start,
};
}
return { hc, checkResult };
}),
);
// Process results sequentially to avoid DB write races
for (const result of results) {
if (result.status === 'rejected') {
logger.error({ reason: result.reason }, 'Unexpected check runner rejection');
continue;
}
const { hc, checkResult } = result.value;
if (checkResult.healthy) {
logger.debug({ check: hc.name, latency_ms: checkResult.latency_ms }, 'Check passed');
await Promise.allSettled([
markIncidentResolved(hc.name),
upsertStatus(hc.name, true),
]);
} else {
await handleFailedCheck(hc, checkResult, dryRun);
}
}
}

View File

@ -0,0 +1,42 @@
export interface CheckResult {
healthy: boolean;
message: string;
details?: Record<string, unknown>;
latency_ms?: number;
}
export interface HealResult {
action_taken: string;
success: boolean;
output?: string;
}
export interface HealthCheck {
name: string;
category: 'process' | 'network' | 'database' | 'tunnel' | 'service';
check: () => Promise<CheckResult>;
heal: (diagnosis: string) => Promise<HealResult>;
}
export interface IncidentRecord {
id: string;
created_at: Date;
resolved_at: Date | null;
check_name: string;
category: string;
severity: 'info' | 'warning' | 'critical';
error_message: string;
details: Record<string, unknown> | null;
llm_diagnosis: string | null;
action_taken: string | null;
heal_success: boolean | null;
heal_output: string | null;
auto_healed: boolean;
}
export interface DiagnosisResult {
action: string;
severity: 'info' | 'warning' | 'critical';
auto_heal: boolean;
raw_output: string;
}

View File

@ -0,0 +1,22 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "NodeNext",
"moduleResolution": "NodeNext",
"lib": ["ES2022"],
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true,
"noUncheckedIndexedAccess": true,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": true,
"resolveJsonModule": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}