feat: add CtxHealth self-healing daemon as new workspace package
New package @llm-gateway/ctx-health (packages/ctx-health/) — a TypeScript infrastructure monitoring and auto-healing daemon. Monitors 8 subsystems every 60s (PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory, network, WireGuard), gets AI-powered root cause analysis via the gateway (ctxhealer caller / ctx_health_diagnose task_type), executes healing actions with cooldown (5min) and escalation guards (3+ failures → human escalation), persists all incidents to ctx_health_incidents and ctx_health_status tables. Dry-run mode via CTX_HEALTH_DRY_RUN=true. Runs as ctx-health PM2 process on Erik server.
This commit is contained in:
parent
a8a77e689c
commit
e0b9fa1f53
@ -5,6 +5,7 @@
|
||||
{"d":"2026-04-02","t":"FEAT","m":"Fine-tuner SSH tunnel launch script (scripts/start.sh): opens SSH tunnel to Erik:5432 before running fine-tuner, bypassing IONOS firewall"}
|
||||
{"d":"2026-04-02","t":"FIX","m":"Fine-tuner env vars: FT_DB_URL/FT_GATEWAY_URL/FT_OLLAMA_URL (not DATABASE_URL) — fine-tuner status command now works"}
|
||||
{"d":"2026-04-02","t":"FIX","m":"DB schema migration: added status, confidence_score, used_in_training, system_prompt, input_text, output_text, human_edited, edited_output, used_in_dpo_training to learning_corpus; added run_type, task_type, started_at, deployed_model_name to fine_tuning_runs"}
|
||||
{"d":"2026-04-02","t":"FEAT","m":"CtxHealth: new workspace package packages/ctx-health — TypeScript self-healing daemon monitoring PM2, PostgreSQL, Ollama, Cloudflare tunnel, disk, memory, network, WireGuard every 60s; LLM-powered diagnosis via ctxhealer caller; auto-healing with cooldown + escalation guards; incidents persisted to ctx_health_incidents + ctx_health_status tables; dry-run mode; PM2 process name: ctx-health"}
|
||||
{"d":"2026-04-02","t":"FIX","m":"Template variable resolution: input is now mapped as fallback for all 20+ content variable names (ocr_text, alert_data, bgp_data, anomaly_data, etc.) — all 7 project templates work with simple string input"}
|
||||
{"d":"2026-04-02","t":"FIX","m":"24 templates updated: {{input}} added as primary content variable so simple callers work without knowing domain-specific variable names (nog_cfp_evaluate, pc_as_narrative, sb_root_cause, shieldx_false_positive, etc.)"}
|
||||
{"d":"2026-04-02","t":"FIX","m":"Prompt resolver: input also aliased as source_data fallback; context fields spread into template vars"}
|
||||
|
||||
304
package-lock.json
generated
304
package-lock.json
generated
@ -510,6 +510,10 @@
|
||||
"resolved": "packages/client",
|
||||
"link": true
|
||||
},
|
||||
"node_modules/@llm-gateway/ctx-health": {
|
||||
"resolved": "packages/ctx-health",
|
||||
"link": true
|
||||
},
|
||||
"node_modules/@llm-gateway/gateway": {
|
||||
"resolved": "packages/gateway",
|
||||
"link": true
|
||||
@ -1152,6 +1156,12 @@
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/asynckit": {
|
||||
"version": "0.4.0",
|
||||
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
|
||||
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/atomic-sleep": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz",
|
||||
@ -1171,6 +1181,17 @@
|
||||
"fastq": "^1.17.1"
|
||||
}
|
||||
},
|
||||
"node_modules/axios": {
|
||||
"version": "1.14.0",
|
||||
"resolved": "https://registry.npmjs.org/axios/-/axios-1.14.0.tgz",
|
||||
"integrity": "sha512-3Y8yrqLSwjuzpXuZ0oIYZ/XGgLwUIBU3uLvbcpb0pidD9ctpShJd43KSlEEkVQg6DS0G9NKyzOvBfUtDKEyHvQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"follow-redirects": "^1.15.11",
|
||||
"form-data": "^4.0.5",
|
||||
"proxy-from-env": "^2.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/bintrees": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz",
|
||||
@ -1187,6 +1208,19 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/call-bind-apply-helpers": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
|
||||
"integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"es-errors": "^1.3.0",
|
||||
"function-bind": "^1.1.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/chai": {
|
||||
"version": "5.3.3",
|
||||
"resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz",
|
||||
@ -1231,6 +1265,18 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/combined-stream": {
|
||||
"version": "1.0.8",
|
||||
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
|
||||
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"delayed-stream": "~1.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.8"
|
||||
}
|
||||
},
|
||||
"node_modules/cookie": {
|
||||
"version": "0.7.2",
|
||||
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz",
|
||||
@ -1290,6 +1336,29 @@
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/delayed-stream": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
|
||||
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/dunder-proto": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
|
||||
"integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"call-bind-apply-helpers": "^1.0.1",
|
||||
"es-errors": "^1.3.0",
|
||||
"gopd": "^1.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/end-of-stream": {
|
||||
"version": "1.4.5",
|
||||
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
|
||||
@ -1300,6 +1369,24 @@
|
||||
"once": "^1.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/es-define-property": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
|
||||
"integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/es-errors": {
|
||||
"version": "1.3.0",
|
||||
"resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
|
||||
"integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/es-module-lexer": {
|
||||
"version": "1.7.0",
|
||||
"resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz",
|
||||
@ -1307,6 +1394,33 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/es-object-atoms": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
|
||||
"integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"es-errors": "^1.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/es-set-tostringtag": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz",
|
||||
"integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"es-errors": "^1.3.0",
|
||||
"get-intrinsic": "^1.2.6",
|
||||
"has-tostringtag": "^1.0.2",
|
||||
"hasown": "^2.0.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/esbuild": {
|
||||
"version": "0.27.7",
|
||||
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.7.tgz",
|
||||
@ -1526,6 +1640,42 @@
|
||||
"node": ">=14"
|
||||
}
|
||||
},
|
||||
"node_modules/follow-redirects": {
|
||||
"version": "1.15.11",
|
||||
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
|
||||
"integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "individual",
|
||||
"url": "https://github.com/sponsors/RubenVerborgh"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=4.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"debug": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/form-data": {
|
||||
"version": "4.0.5",
|
||||
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz",
|
||||
"integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"asynckit": "^0.4.0",
|
||||
"combined-stream": "^1.0.8",
|
||||
"es-set-tostringtag": "^2.1.0",
|
||||
"hasown": "^2.0.2",
|
||||
"mime-types": "^2.1.12"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 6"
|
||||
}
|
||||
},
|
||||
"node_modules/forwarded": {
|
||||
"version": "0.2.0",
|
||||
"resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz",
|
||||
@ -1562,6 +1712,52 @@
|
||||
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/function-bind": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
|
||||
"integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
|
||||
"license": "MIT",
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/get-intrinsic": {
|
||||
"version": "1.3.0",
|
||||
"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
|
||||
"integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"call-bind-apply-helpers": "^1.0.2",
|
||||
"es-define-property": "^1.0.1",
|
||||
"es-errors": "^1.3.0",
|
||||
"es-object-atoms": "^1.1.1",
|
||||
"function-bind": "^1.1.2",
|
||||
"get-proto": "^1.0.1",
|
||||
"gopd": "^1.2.0",
|
||||
"has-symbols": "^1.1.0",
|
||||
"hasown": "^2.0.2",
|
||||
"math-intrinsics": "^1.1.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/get-proto": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
|
||||
"integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"dunder-proto": "^1.0.1",
|
||||
"es-object-atoms": "^1.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/get-tsconfig": {
|
||||
"version": "4.13.7",
|
||||
"resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.7.tgz",
|
||||
@ -1574,6 +1770,57 @@
|
||||
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/gopd": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
|
||||
"integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/has-symbols": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
|
||||
"integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/has-tostringtag": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz",
|
||||
"integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"has-symbols": "^1.0.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/hasown": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
|
||||
"integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"function-bind": "^1.1.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/helmet": {
|
||||
"version": "7.2.0",
|
||||
"resolved": "https://registry.npmjs.org/helmet/-/helmet-7.2.0.tgz",
|
||||
@ -1673,6 +1920,36 @@
|
||||
"@jridgewell/sourcemap-codec": "^1.5.5"
|
||||
}
|
||||
},
|
||||
"node_modules/math-intrinsics": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
|
||||
"integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/mime-db": {
|
||||
"version": "1.52.0",
|
||||
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
|
||||
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/mime-types": {
|
||||
"version": "2.1.35",
|
||||
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
|
||||
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"mime-db": "1.52.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/minimist": {
|
||||
"version": "1.2.8",
|
||||
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
|
||||
@ -2106,6 +2383,15 @@
|
||||
"node": ">= 0.10"
|
||||
}
|
||||
},
|
||||
"node_modules/proxy-from-env": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-2.1.0.tgz",
|
||||
"integrity": "sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/pump": {
|
||||
"version": "3.0.4",
|
||||
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.4.tgz",
|
||||
@ -3114,6 +3400,24 @@
|
||||
"typescript": "^5.7.2"
|
||||
}
|
||||
},
|
||||
"packages/ctx-health": {
|
||||
"name": "@llm-gateway/ctx-health",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"axios": "^1.6.8",
|
||||
"node-cron": "^3.0.3",
|
||||
"pg": "^8.11.3",
|
||||
"pino": "^9.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^22.0.0",
|
||||
"@types/node-cron": "^3.0.11",
|
||||
"@types/pg": "^8.11.3",
|
||||
"pino-pretty": "^13.0.0",
|
||||
"tsx": "^4.7.2",
|
||||
"typescript": "^5.4.5"
|
||||
}
|
||||
},
|
||||
"packages/gateway": {
|
||||
"name": "@llm-gateway/gateway",
|
||||
"version": "1.0.0",
|
||||
|
||||
@ -11,6 +11,8 @@
|
||||
"install:all": "npm install",
|
||||
"test": "vitest",
|
||||
"db:migrate": "bash scripts/init-db.sh",
|
||||
"models:pull": "bash scripts/pull-models.sh"
|
||||
"models:pull": "bash scripts/pull-models.sh",
|
||||
"ctx-health": "npm run start --workspace=packages/ctx-health",
|
||||
"ctx-health:dev": "npm run dev --workspace=packages/ctx-health"
|
||||
}
|
||||
}
|
||||
|
||||
25
packages/ctx-health/package.json
Normal file
25
packages/ctx-health/package.json
Normal file
@ -0,0 +1,25 @@
|
||||
{
|
||||
"name": "@llm-gateway/ctx-health",
|
||||
"version": "1.0.0",
|
||||
"type": "module",
|
||||
"description": "CtxHealth — Infrastructure self-healing daemon",
|
||||
"scripts": {
|
||||
"dev": "tsx watch src/index.ts",
|
||||
"start": "node --import tsx/esm src/index.ts",
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"dependencies": {
|
||||
"node-cron": "^3.0.3",
|
||||
"pino": "^9.0.0",
|
||||
"pg": "^8.11.3",
|
||||
"axios": "^1.6.8"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^22.0.0",
|
||||
"@types/pg": "^8.11.3",
|
||||
"@types/node-cron": "^3.0.11",
|
||||
"pino-pretty": "^13.0.0",
|
||||
"tsx": "^4.7.2",
|
||||
"typescript": "^5.4.5"
|
||||
}
|
||||
}
|
||||
415
packages/ctx-health/src/checks/index.ts
Normal file
415
packages/ctx-health/src/checks/index.ts
Normal file
@ -0,0 +1,415 @@
|
||||
/**
|
||||
* CtxHealth — Health check implementations.
|
||||
*
|
||||
* Each check exposes:
|
||||
* check() → CheckResult
|
||||
* heal() → HealResult
|
||||
*
|
||||
* exec safety: all shell commands use execFile with an allow-list.
|
||||
* Never pass user-supplied strings to any exec call.
|
||||
*/
|
||||
|
||||
import { execFile as execFileCb } from 'node:child_process';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { promisify } from 'node:util';
|
||||
import { logger } from '../observability/logger.js';
|
||||
import { resetCircuitBreaker } from '../gateway-client.js';
|
||||
import type { CheckResult, HealResult, HealthCheck } from '../types.js';
|
||||
|
||||
const execFile = promisify(execFileCb);
|
||||
const EXEC_TIMEOUT_MS = 15_000;
|
||||
|
||||
// ─── Allowed command allow-list ───────────────────────────────────────────────
|
||||
|
||||
const ALLOWED_COMMANDS = new Set([
|
||||
'/usr/bin/ping',
|
||||
'/usr/bin/wg',
|
||||
'/usr/sbin/wg',
|
||||
'/usr/bin/df',
|
||||
'/bin/df',
|
||||
'/usr/local/bin/pm2',
|
||||
'/usr/bin/pm2',
|
||||
'/usr/local/bin/node',
|
||||
'/usr/bin/systemctl',
|
||||
'/bin/systemctl',
|
||||
'/usr/sbin/systemctl',
|
||||
'/usr/bin/sync',
|
||||
'/bin/sync',
|
||||
]);
|
||||
|
||||
async function safeExec(
|
||||
cmd: string,
|
||||
args: readonly string[],
|
||||
): Promise<{ stdout: string; stderr: string; success: boolean }> {
|
||||
if (!ALLOWED_COMMANDS.has(cmd)) {
|
||||
logger.error({ cmd }, 'Command not in allow-list — refusing to execute');
|
||||
return { stdout: '', stderr: `Command not allowed: ${cmd}`, success: false };
|
||||
}
|
||||
try {
|
||||
const { stdout, stderr } = await execFile(cmd, [...args], { timeout: EXEC_TIMEOUT_MS });
|
||||
return { stdout, stderr, success: true };
|
||||
} catch (err) {
|
||||
const e = err as { stdout?: string; stderr?: string; message?: string };
|
||||
return { stdout: e.stdout ?? '', stderr: e.stderr ?? e.message ?? String(err), success: false };
|
||||
}
|
||||
}
|
||||
|
||||
// ─── pm2 resolve helper (tries common locations) ─────────────────────────────
|
||||
|
||||
async function findPm2(): Promise<string | null> {
|
||||
const candidates = ['/usr/local/bin/pm2', '/usr/bin/pm2'];
|
||||
for (const p of candidates) {
|
||||
if (ALLOWED_COMMANDS.has(p)) {
|
||||
const { success } = await safeExec(p, ['--version']);
|
||||
if (success) return p;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// ─── 1. PM2 processes ────────────────────────────────────────────────────────
|
||||
|
||||
const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];
|
||||
|
||||
async function checkPm2(): Promise<CheckResult> {
|
||||
const start = Date.now();
|
||||
const pm2 = await findPm2();
|
||||
if (!pm2) {
|
||||
return { healthy: false, message: 'pm2 binary not found on this host', latency_ms: Date.now() - start };
|
||||
}
|
||||
|
||||
const { stdout, success } = await safeExec(pm2, ['jlist']);
|
||||
const latency_ms = Date.now() - start;
|
||||
|
||||
if (!success) {
|
||||
return { healthy: false, message: 'pm2 jlist failed', latency_ms };
|
||||
}
|
||||
|
||||
let processes: Array<{ name: string; pm2_env?: { status?: string } }>;
|
||||
try {
|
||||
processes = JSON.parse(stdout) as typeof processes;
|
||||
} catch {
|
||||
return { healthy: false, message: 'Could not parse pm2 jlist output', latency_ms };
|
||||
}
|
||||
|
||||
const statusMap = Object.fromEntries(
|
||||
processes.map((p) => [p.name, p.pm2_env?.status ?? 'unknown']),
|
||||
);
|
||||
|
||||
const offline = PM2_REQUIRED_PROCESSES.filter((name) => statusMap[name] !== 'online');
|
||||
|
||||
if (offline.length > 0) {
|
||||
return {
|
||||
healthy: false,
|
||||
message: `PM2 processes not online: ${offline.join(', ')}`,
|
||||
details: { statusMap },
|
||||
latency_ms,
|
||||
};
|
||||
}
|
||||
|
||||
return { healthy: true, message: 'All required PM2 processes are online', details: { statusMap }, latency_ms };
|
||||
}
|
||||
|
||||
async function healPm2(diagnosis: string): Promise<HealResult> {
|
||||
const pm2 = await findPm2();
|
||||
if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };
|
||||
|
||||
const { stdout, stderr } = await safeExec(pm2, ['restart', 'all']);
|
||||
const output = `${stdout}\n${stderr}`.trim();
|
||||
logger.info({ diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200) }, 'PM2 restart executed');
|
||||
return { action_taken: 'pm2 restart all', success: true, output };
|
||||
}
|
||||
|
||||
// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
|
||||
|
||||
async function checkPostgres(): Promise<CheckResult> {
|
||||
const start = Date.now();
|
||||
// Dynamic import to avoid top-level pool creation before env is loaded
|
||||
const { query } = await import('../db/client.js');
|
||||
try {
|
||||
await query('SELECT 1');
|
||||
return { healthy: true, message: 'PostgreSQL is reachable', latency_ms: Date.now() - start };
|
||||
} catch (err) {
|
||||
return {
|
||||
healthy: false,
|
||||
message: `PostgreSQL unreachable: ${err instanceof Error ? err.message : String(err)}`,
|
||||
latency_ms: Date.now() - start,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
async function healPostgres(_diagnosis: string): Promise<HealResult> {
|
||||
const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'postgresql']);
|
||||
const output = `${stdout}\n${stderr}`.trim();
|
||||
logger.info({ output: output.slice(0, 200) }, 'PostgreSQL restart executed');
|
||||
return { action_taken: 'systemctl restart postgresql', success, output };
|
||||
}
|
||||
|
||||
// ─── 3. Ollama ───────────────────────────────────────────────────────────────
|
||||
|
||||
const OLLAMA_URL = process.env['OLLAMA_URL'] ?? 'https://ollama.fichtmueller.org';
|
||||
|
||||
async function checkOllama(): Promise<CheckResult> {
|
||||
const start = Date.now();
|
||||
const controller = new AbortController();
|
||||
const timer = setTimeout(() => controller.abort(), 10_000);
|
||||
try {
|
||||
const res = await fetch(`${OLLAMA_URL}/api/tags`, { signal: controller.signal, cache: 'no-store' } as RequestInit);
|
||||
const latency_ms = Date.now() - start;
|
||||
if (!res.ok) {
|
||||
return { healthy: false, message: `Ollama returned HTTP ${res.status}`, latency_ms };
|
||||
}
|
||||
return { healthy: true, message: 'Ollama is reachable', latency_ms };
|
||||
} catch (err) {
|
||||
return {
|
||||
healthy: false,
|
||||
message: `Ollama unreachable: ${err instanceof Error ? err.message : String(err)}`,
|
||||
latency_ms: Date.now() - start,
|
||||
};
|
||||
} finally {
|
||||
clearTimeout(timer);
|
||||
}
|
||||
}
|
||||
|
||||
async function healOllama(_diagnosis: string): Promise<HealResult> {
|
||||
await resetCircuitBreaker();
|
||||
return { action_taken: 'circuit-breaker reset requested via gateway', success: true };
|
||||
}
|
||||
|
||||
// ─── 4. Cloudflare tunnel ────────────────────────────────────────────────────
|
||||
|
||||
const GATEWAY_PING_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';
|
||||
|
||||
async function checkCloudflareTunnel(): Promise<CheckResult> {
|
||||
const start = Date.now();
|
||||
const pm2 = await findPm2();
|
||||
|
||||
let cloudflaredRunning = false;
|
||||
if (pm2) {
|
||||
const { stdout } = await safeExec(pm2, ['jlist']);
|
||||
try {
|
||||
const procs = JSON.parse(stdout) as Array<{ name: string; pm2_env?: { status?: string } }>;
|
||||
cloudflaredRunning = procs.some((p) => p.name.includes('cloudflared') && p.pm2_env?.status === 'online');
|
||||
} catch { /* ignore parse errors */ }
|
||||
}
|
||||
|
||||
// Also attempt to ping the gateway URL to confirm tunnel is routing traffic
|
||||
const controller = new AbortController();
|
||||
const timer = setTimeout(() => controller.abort(), 8_000);
|
||||
let gatewayReachable = false;
|
||||
try {
|
||||
const res = await fetch(`${GATEWAY_PING_URL}/health`, { signal: controller.signal, cache: 'no-store' } as RequestInit);
|
||||
gatewayReachable = res.status < 500;
|
||||
} catch { /* expected if tunnel is down */ } finally {
|
||||
clearTimeout(timer);
|
||||
}
|
||||
|
||||
const latency_ms = Date.now() - start;
|
||||
|
||||
if (!cloudflaredRunning && !gatewayReachable) {
|
||||
return {
|
||||
healthy: false,
|
||||
message: 'cloudflared not running via pm2 and gateway is unreachable',
|
||||
details: { cloudflaredRunning, gatewayReachable },
|
||||
latency_ms,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
healthy: true,
|
||||
message: 'Cloudflare tunnel appears healthy',
|
||||
details: { cloudflaredRunning, gatewayReachable },
|
||||
latency_ms,
|
||||
};
|
||||
}
|
||||
|
||||
async function healCloudflareTunnel(_diagnosis: string): Promise<HealResult> {
|
||||
const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'cloudflared']);
|
||||
const output = `${stdout}\n${stderr}`.trim();
|
||||
logger.info({ output: output.slice(0, 200) }, 'cloudflared restart executed');
|
||||
return { action_taken: 'systemctl restart cloudflared', success, output };
|
||||
}
|
||||
|
||||
// ─── 5. Disk space ───────────────────────────────────────────────────────────
|
||||
|
||||
const DISK_ALERT_PERCENT = 85;
|
||||
|
||||
async function checkDiskSpace(): Promise<CheckResult> {
|
||||
const start = Date.now();
|
||||
const { stdout, success } = await safeExec('/bin/df', ['-h', '/']);
|
||||
const latency_ms = Date.now() - start;
|
||||
|
||||
if (!success) {
|
||||
return { healthy: false, message: 'df command failed', latency_ms };
|
||||
}
|
||||
|
||||
const lines = stdout.trim().split('\n');
|
||||
const dataLine = lines[1] ?? '';
|
||||
const parts = dataLine.split(/\s+/);
|
||||
const usedPct = parseInt(parts[4] ?? '0', 10);
|
||||
|
||||
if (isNaN(usedPct)) {
|
||||
return { healthy: false, message: `Could not parse disk usage: ${dataLine}`, latency_ms };
|
||||
}
|
||||
|
||||
if (usedPct > DISK_ALERT_PERCENT) {
|
||||
return {
|
||||
healthy: false,
|
||||
message: `Disk usage is ${usedPct}% (threshold: ${DISK_ALERT_PERCENT}%)`,
|
||||
details: { usedPercent: usedPct, raw: dataLine },
|
||||
latency_ms,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
healthy: true,
|
||||
message: `Disk usage: ${usedPct}%`,
|
||||
details: { usedPercent: usedPct },
|
||||
latency_ms,
|
||||
};
|
||||
}
|
||||
|
||||
async function healDiskSpace(diagnosis: string): Promise<HealResult> {
|
||||
// Never auto-delete — just log the LLM advisory
|
||||
logger.warn({ diagnosis: diagnosis.slice(0, 300) }, 'Disk space advisory — manual action required');
|
||||
return {
|
||||
action_taken: 'logged LLM advisory — manual cleanup required',
|
||||
success: true,
|
||||
output: diagnosis.slice(0, 500),
|
||||
};
|
||||
}
|
||||
|
||||
// ─── 6. Memory ───────────────────────────────────────────────────────────────
|
||||
|
||||
const MEMORY_FREE_MIN_MB = 500;
|
||||
|
||||
async function checkMemory(): Promise<CheckResult> {
|
||||
const start = Date.now();
|
||||
try {
|
||||
const meminfo = await readFile('/proc/meminfo', 'utf-8');
|
||||
const freeLine = meminfo.split('\n').find((l) => l.startsWith('MemAvailable:'));
|
||||
const freeKb = parseInt((freeLine ?? '').replace(/[^0-9]/g, ''), 10);
|
||||
const freeMb = Math.floor(freeKb / 1024);
|
||||
const latency_ms = Date.now() - start;
|
||||
|
||||
if (isNaN(freeMb)) {
|
||||
return { healthy: false, message: 'Could not parse /proc/meminfo', latency_ms };
|
||||
}
|
||||
|
||||
if (freeMb < MEMORY_FREE_MIN_MB) {
|
||||
return {
|
||||
healthy: false,
|
||||
message: `Available memory is ${freeMb}MB (minimum: ${MEMORY_FREE_MIN_MB}MB)`,
|
||||
details: { availableMb: freeMb },
|
||||
latency_ms,
|
||||
};
|
||||
}
|
||||
|
||||
return { healthy: true, message: `Available memory: ${freeMb}MB`, details: { availableMb: freeMb }, latency_ms };
|
||||
} catch (err) {
|
||||
// /proc/meminfo not available (e.g. macOS dev environment)
|
||||
return {
|
||||
healthy: true,
|
||||
message: `Memory check skipped: ${err instanceof Error ? err.message : String(err)}`,
|
||||
latency_ms: Date.now() - start,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
async function healMemory(_diagnosis: string): Promise<HealResult> {
|
||||
const { stdout, stderr, success } = await safeExec('/bin/sync', []);
|
||||
if (!success) {
|
||||
return { action_taken: 'sync failed — cannot drop caches without root', success: false, output: stderr };
|
||||
}
|
||||
// Writing to /proc/sys requires execFile with shell — skipped for safety.
|
||||
// In production, a privileged helper script should handle this.
|
||||
logger.warn('Memory healing: sync executed. Drop caches requires privileged script.');
|
||||
return {
|
||||
action_taken: 'sync executed; drop_caches requires privileged helper',
|
||||
success: true,
|
||||
output: stdout.slice(0, 200),
|
||||
};
|
||||
}
|
||||
|
||||
// ─── 7. Network connectivity ─────────────────────────────────────────────────
|
||||
|
||||
async function checkNetwork(): Promise<CheckResult> {
|
||||
const start = Date.now();
|
||||
const { success, stdout, stderr } = await safeExec('/usr/bin/ping', ['-c', '3', '-W', '3', '1.1.1.1']);
|
||||
const latency_ms = Date.now() - start;
|
||||
if (!success) {
|
||||
return {
|
||||
healthy: false,
|
||||
message: 'Cannot ping 1.1.1.1 — network connectivity issue',
|
||||
details: { stdout: stdout.slice(0, 200), stderr: stderr.slice(0, 200) },
|
||||
latency_ms,
|
||||
};
|
||||
}
|
||||
return { healthy: true, message: 'Network connectivity OK (1.1.1.1 reachable)', latency_ms };
|
||||
}
|
||||
|
||||
async function healNetwork(diagnosis: string): Promise<HealResult> {
|
||||
logger.error({ diagnosis: diagnosis.slice(0, 300) }, 'Network issue detected — cannot self-heal, manual intervention required');
|
||||
return {
|
||||
action_taken: 'logged critical alert — network issues require manual intervention',
|
||||
success: false,
|
||||
output: 'Cannot auto-heal network connectivity issues.',
|
||||
};
|
||||
}
|
||||
|
||||
// ─── 8. WireGuard ────────────────────────────────────────────────────────────
|
||||
|
||||
async function findWg(): Promise<string | null> {
|
||||
const candidates = ['/usr/bin/wg', '/usr/sbin/wg'];
|
||||
for (const p of candidates) {
|
||||
if (ALLOWED_COMMANDS.has(p)) return p;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
async function checkWireGuard(): Promise<CheckResult> {
|
||||
const start = Date.now();
|
||||
const wg = await findWg();
|
||||
if (!wg) {
|
||||
return { healthy: true, message: 'wg binary not found — skipping WireGuard check', latency_ms: Date.now() - start };
|
||||
}
|
||||
|
||||
const { stdout, success } = await safeExec(wg, ['show']);
|
||||
const latency_ms = Date.now() - start;
|
||||
|
||||
if (!success) {
|
||||
return { healthy: false, message: 'wg show failed — WireGuard may not be running', latency_ms };
|
||||
}
|
||||
|
||||
const hasActivePeer = stdout.includes('latest handshake');
|
||||
if (!hasActivePeer) {
|
||||
return {
|
||||
healthy: false,
|
||||
message: 'WireGuard: no active peers with recent handshake detected',
|
||||
details: { output: stdout.slice(0, 300) },
|
||||
latency_ms,
|
||||
};
|
||||
}
|
||||
|
||||
return { healthy: true, message: 'WireGuard peers active', latency_ms };
|
||||
}
|
||||
|
||||
async function healWireGuard(_diagnosis: string): Promise<HealResult> {
|
||||
const { stdout, stderr, success } = await safeExec('/bin/systemctl', ['restart', 'wg-quick@wg0']);
|
||||
const output = `${stdout}\n${stderr}`.trim();
|
||||
logger.info({ output: output.slice(0, 200) }, 'WireGuard restart executed');
|
||||
return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
|
||||
}
|
||||
|
||||
// ─── Exported check list ──────────────────────────────────────────────────────
|
||||
|
||||
export const healthChecks: HealthCheck[] = [
|
||||
{ name: 'pm2-processes', category: 'process', check: checkPm2, heal: healPm2 },
|
||||
{ name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
|
||||
{ name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
|
||||
{ name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
|
||||
{ name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
|
||||
{ name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
|
||||
{ name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
|
||||
{ name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
|
||||
];
|
||||
43
packages/ctx-health/src/db/client.ts
Normal file
43
packages/ctx-health/src/db/client.ts
Normal file
@ -0,0 +1,43 @@
|
||||
import pg from 'pg';
|
||||
import { logger } from '../observability/logger.js';
|
||||
|
||||
const { Pool } = pg;
|
||||
|
||||
let pool: pg.Pool | null = null;
|
||||
|
||||
const DEFAULT_DB_URL = 'postgresql://llm:llm_secure_2026@localhost:5432/llm_gateway';
|
||||
|
||||
function buildPoolConfig(): pg.PoolConfig {
|
||||
const databaseUrl = process.env['CTX_HEALTH_DB_URL'] ?? process.env['DATABASE_URL'] ?? DEFAULT_DB_URL;
|
||||
return {
|
||||
connectionString: databaseUrl,
|
||||
max: 3,
|
||||
idleTimeoutMillis: 30_000,
|
||||
connectionTimeoutMillis: 5_000,
|
||||
};
|
||||
}
|
||||
|
||||
export function getPool(): pg.Pool {
|
||||
if (!pool) {
|
||||
pool = new Pool(buildPoolConfig());
|
||||
pool.on('error', (err) => {
|
||||
logger.error({ err }, 'PostgreSQL pool error (ctx-health)');
|
||||
});
|
||||
}
|
||||
return pool;
|
||||
}
|
||||
|
||||
export async function query<T extends pg.QueryResultRow = pg.QueryResultRow>(
|
||||
sql: string,
|
||||
params?: unknown[],
|
||||
): Promise<pg.QueryResult<T>> {
|
||||
const p = getPool();
|
||||
return p.query<T>(sql, params);
|
||||
}
|
||||
|
||||
export async function closePool(): Promise<void> {
|
||||
if (pool) {
|
||||
await pool.end();
|
||||
pool = null;
|
||||
}
|
||||
}
|
||||
140
packages/ctx-health/src/db/incidents.ts
Normal file
140
packages/ctx-health/src/db/incidents.ts
Normal file
@ -0,0 +1,140 @@
|
||||
/**
|
||||
* Database operations for ctx_health_incidents and ctx_health_status tables.
|
||||
*/
|
||||
|
||||
import { query } from './client.js';
|
||||
import { logger } from '../observability/logger.js';
|
||||
import type { CheckResult, DiagnosisResult, HealResult, IncidentRecord } from '../types.js';
|
||||
|
||||
export async function getRecentIncidents(checkName: string, limitCount = 10): Promise<IncidentRecord[]> {
|
||||
const result = await query<IncidentRecord>(
|
||||
`SELECT * FROM ctx_health_incidents
|
||||
WHERE check_name = $1
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $2`,
|
||||
[checkName, limitCount],
|
||||
);
|
||||
return result.rows;
|
||||
}
|
||||
|
||||
export async function getConsecutiveFailures(checkName: string): Promise<number> {
|
||||
const result = await query<{ consecutive_failures: number }>(
|
||||
`SELECT consecutive_failures FROM ctx_health_status WHERE check_name = $1`,
|
||||
[checkName],
|
||||
);
|
||||
return result.rows[0]?.consecutive_failures ?? 0;
|
||||
}
|
||||
|
||||
export async function insertIncident(
|
||||
checkName: string,
|
||||
category: string,
|
||||
checkResult: CheckResult,
|
||||
diagnosis: DiagnosisResult,
|
||||
healResult: HealResult | null,
|
||||
autoHealed: boolean,
|
||||
): Promise<string> {
|
||||
const result = await query<{ id: string }>(
|
||||
`INSERT INTO ctx_health_incidents
|
||||
(check_name, category, severity, error_message, details, llm_diagnosis, action_taken, heal_success, heal_output, auto_healed)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
|
||||
RETURNING id`,
|
||||
[
|
||||
checkName,
|
||||
category,
|
||||
diagnosis.severity,
|
||||
checkResult.message,
|
||||
JSON.stringify(checkResult.details ?? {}),
|
||||
diagnosis.raw_output.slice(0, 2000),
|
||||
healResult?.action_taken ?? null,
|
||||
healResult?.success ?? null,
|
||||
healResult?.output?.slice(0, 2000) ?? null,
|
||||
autoHealed,
|
||||
],
|
||||
);
|
||||
return result.rows[0]?.id ?? 'unknown';
|
||||
}
|
||||
|
||||
export async function markIncidentResolved(checkName: string): Promise<void> {
|
||||
await query(
|
||||
`UPDATE ctx_health_incidents
|
||||
SET resolved_at = now()
|
||||
WHERE check_name = $1 AND resolved_at IS NULL`,
|
||||
[checkName],
|
||||
);
|
||||
}
|
||||
|
||||
export async function upsertStatus(
|
||||
checkName: string,
|
||||
healthy: boolean,
|
||||
): Promise<void> {
|
||||
if (healthy) {
|
||||
await query(
|
||||
`INSERT INTO ctx_health_status (check_name, last_checked, last_healthy, current_status, consecutive_failures)
|
||||
VALUES ($1, now(), now(), 'healthy', 0)
|
||||
ON CONFLICT (check_name) DO UPDATE
|
||||
SET last_checked = now(),
|
||||
last_healthy = now(),
|
||||
current_status = 'healthy',
|
||||
consecutive_failures = 0`,
|
||||
[checkName],
|
||||
);
|
||||
} else {
|
||||
await query(
|
||||
`INSERT INTO ctx_health_status (check_name, last_checked, current_status, consecutive_failures, total_incidents)
|
||||
VALUES ($1, now(), 'unhealthy', 1, 1)
|
||||
ON CONFLICT (check_name) DO UPDATE
|
||||
SET last_checked = now(),
|
||||
current_status = 'unhealthy',
|
||||
consecutive_failures = ctx_health_status.consecutive_failures + 1,
|
||||
total_incidents = ctx_health_status.total_incidents + 1`,
|
||||
[checkName],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
export async function runMigrations(): Promise<void> {
|
||||
logger.info('Running ctx-health DB migrations...');
|
||||
try {
|
||||
await query(`
|
||||
CREATE TABLE IF NOT EXISTS ctx_health_incidents (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
resolved_at TIMESTAMPTZ,
|
||||
check_name TEXT NOT NULL,
|
||||
category TEXT NOT NULL,
|
||||
severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical')),
|
||||
error_message TEXT NOT NULL,
|
||||
details JSONB,
|
||||
llm_diagnosis TEXT,
|
||||
action_taken TEXT,
|
||||
heal_success BOOLEAN,
|
||||
heal_output TEXT,
|
||||
auto_healed BOOLEAN NOT NULL DEFAULT false
|
||||
)
|
||||
`);
|
||||
|
||||
await query(`CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_created ON ctx_health_incidents (created_at DESC)`);
|
||||
await query(`CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_check ON ctx_health_incidents (check_name, created_at DESC)`);
|
||||
await query(`
|
||||
CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_unresolved
|
||||
ON ctx_health_incidents (check_name) WHERE resolved_at IS NULL
|
||||
`);
|
||||
|
||||
await query(`
|
||||
CREATE TABLE IF NOT EXISTS ctx_health_status (
|
||||
check_name TEXT PRIMARY KEY,
|
||||
last_checked TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
last_healthy TIMESTAMPTZ,
|
||||
current_status TEXT NOT NULL DEFAULT 'unknown',
|
||||
consecutive_failures INTEGER NOT NULL DEFAULT 0,
|
||||
total_incidents INTEGER NOT NULL DEFAULT 0,
|
||||
avg_heal_time_ms INTEGER
|
||||
)
|
||||
`);
|
||||
|
||||
logger.info('ctx-health migrations complete');
|
||||
} catch (err) {
|
||||
logger.error({ err }, 'ctx-health migration failed');
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
29
packages/ctx-health/src/db/migrations/003_ctx_health.sql
Normal file
29
packages/ctx-health/src/db/migrations/003_ctx_health.sql
Normal file
@ -0,0 +1,29 @@
|
||||
CREATE TABLE IF NOT EXISTS ctx_health_incidents (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
resolved_at TIMESTAMPTZ,
|
||||
check_name TEXT NOT NULL,
|
||||
category TEXT NOT NULL,
|
||||
severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical')),
|
||||
error_message TEXT NOT NULL,
|
||||
details JSONB,
|
||||
llm_diagnosis TEXT,
|
||||
action_taken TEXT,
|
||||
heal_success BOOLEAN,
|
||||
heal_output TEXT,
|
||||
auto_healed BOOLEAN NOT NULL DEFAULT false
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_created ON ctx_health_incidents (created_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_check ON ctx_health_incidents (check_name, created_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_ctx_health_incidents_unresolved ON ctx_health_incidents (check_name) WHERE resolved_at IS NULL;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ctx_health_status (
|
||||
check_name TEXT PRIMARY KEY,
|
||||
last_checked TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
last_healthy TIMESTAMPTZ,
|
||||
current_status TEXT NOT NULL DEFAULT 'unknown',
|
||||
consecutive_failures INTEGER NOT NULL DEFAULT 0,
|
||||
total_incidents INTEGER NOT NULL DEFAULT 0,
|
||||
avg_heal_time_ms INTEGER
|
||||
);
|
||||
137
packages/ctx-health/src/gateway-client.ts
Normal file
137
packages/ctx-health/src/gateway-client.ts
Normal file
@ -0,0 +1,137 @@
|
||||
/**
|
||||
* HTTP client for calling the LLM Gateway to get AI-powered diagnoses.
|
||||
* Fail-open: if gateway is unavailable, returns a default diagnosis.
|
||||
*/
|
||||
|
||||
import { logger } from './observability/logger.js';
|
||||
import type { CheckResult, DiagnosisResult, IncidentRecord } from './types.js';
|
||||
|
||||
const GATEWAY_URL = process.env['CTX_HEALTH_GATEWAY_URL'] ?? 'http://localhost:3103';
|
||||
const INTERNAL_SECRET = process.env['INTERNAL_SECRET'] ?? 'internal-learning-secret';
|
||||
const TIMEOUT_MS = 30_000;
|
||||
|
||||
interface GatewayResponse {
|
||||
output: string;
|
||||
confidence: number;
|
||||
model: string;
|
||||
latency_ms: number;
|
||||
}
|
||||
|
||||
function buildDiagnosisInput(
|
||||
checkName: string,
|
||||
checkResult: CheckResult,
|
||||
recentHistory: IncidentRecord[],
|
||||
): string {
|
||||
const historyLines = recentHistory.slice(0, 5).map((inc) => {
|
||||
const ts = new Date(inc.created_at).toISOString();
|
||||
return ` - [${ts}] ${inc.severity}: ${inc.error_message} (healed=${inc.auto_healed})`;
|
||||
});
|
||||
|
||||
return [
|
||||
`Infrastructure check FAILED: ${checkName}`,
|
||||
`Error: ${checkResult.message}`,
|
||||
`Latency: ${checkResult.latency_ms ?? 'n/a'}ms`,
|
||||
`Details: ${JSON.stringify(checkResult.details ?? {})}`,
|
||||
'',
|
||||
`Recent incident history (last ${historyLines.length}):`,
|
||||
historyLines.length > 0 ? historyLines.join('\n') : ' (none)',
|
||||
'',
|
||||
'Provide: root cause analysis, recommended action, severity (info/warning/critical), and whether auto-healing is safe (true/false).',
|
||||
'Format your response as JSON: {"action":"...","severity":"warning","auto_heal":true,"explanation":"..."}',
|
||||
].join('\n');
|
||||
}
|
||||
|
||||
function parseGatewayOutput(raw: string): Omit<DiagnosisResult, 'raw_output'> {
|
||||
try {
|
||||
const match = raw.match(/\{[\s\S]*\}/);
|
||||
if (!match) throw new Error('No JSON found in output');
|
||||
const parsed = JSON.parse(match[0]) as {
|
||||
action?: unknown;
|
||||
severity?: unknown;
|
||||
auto_heal?: unknown;
|
||||
};
|
||||
const action = typeof parsed.action === 'string' ? parsed.action : 'Review logs and restart service.';
|
||||
const severity =
|
||||
parsed.severity === 'info' || parsed.severity === 'warning' || parsed.severity === 'critical'
|
||||
? parsed.severity
|
||||
: 'warning';
|
||||
const auto_heal = typeof parsed.auto_heal === 'boolean' ? parsed.auto_heal : false;
|
||||
return { action, severity, auto_heal };
|
||||
} catch {
|
||||
return { action: 'Review logs and consider manual restart.', severity: 'warning', auto_heal: false };
|
||||
}
|
||||
}
|
||||
|
||||
function buildDefaultDiagnosis(checkName: string): DiagnosisResult {
|
||||
return {
|
||||
action: `Default healing: restart service associated with check '${checkName}'.`,
|
||||
severity: 'warning',
|
||||
auto_heal: true,
|
||||
raw_output: '(gateway unavailable — default diagnosis)',
|
||||
};
|
||||
}
|
||||
|
||||
export async function diagnoseIssue(
|
||||
checkName: string,
|
||||
checkResult: CheckResult,
|
||||
recentHistory: IncidentRecord[],
|
||||
): Promise<DiagnosisResult> {
|
||||
const controller = new AbortController();
|
||||
const timer = setTimeout(() => controller.abort(), TIMEOUT_MS);
|
||||
|
||||
try {
|
||||
const input = buildDiagnosisInput(checkName, checkResult, recentHistory);
|
||||
|
||||
const response = await fetch(`${GATEWAY_URL}/v1/generate`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'X-Caller': 'ctxhealer',
|
||||
'X-Internal-Secret': INTERNAL_SECRET,
|
||||
'Cache-Control': 'no-store',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
task_type: 'ctx_health_diagnose',
|
||||
input,
|
||||
user_context: '',
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const body = await response.text();
|
||||
logger.warn({ checkName, status: response.status, body: body.slice(0, 200) }, 'Gateway returned non-OK for diagnosis');
|
||||
return buildDefaultDiagnosis(checkName);
|
||||
}
|
||||
|
||||
const data = (await response.json()) as GatewayResponse;
|
||||
const parsed = parseGatewayOutput(data.output);
|
||||
|
||||
logger.info({ checkName, severity: parsed.severity, auto_heal: parsed.auto_heal }, 'Gateway diagnosis received');
|
||||
|
||||
return { ...parsed, raw_output: data.output };
|
||||
} catch (err) {
|
||||
const isAbort = err instanceof Error && err.name === 'AbortError';
|
||||
logger.warn({ err, checkName, isAbort }, 'Gateway call failed — using default diagnosis (fail-open)');
|
||||
return buildDefaultDiagnosis(checkName);
|
||||
} finally {
|
||||
clearTimeout(timer);
|
||||
}
|
||||
}
|
||||
|
||||
export async function resetCircuitBreaker(): Promise<void> {
|
||||
const controller = new AbortController();
|
||||
const timer = setTimeout(() => controller.abort(), 5_000);
|
||||
try {
|
||||
await fetch(`${GATEWAY_URL}/internal/circuit-breaker/reset`, {
|
||||
method: 'POST',
|
||||
headers: { 'X-Internal-Secret': INTERNAL_SECRET, 'Cache-Control': 'no-store' },
|
||||
signal: controller.signal,
|
||||
});
|
||||
logger.info('Circuit breaker reset requested');
|
||||
} catch (err) {
|
||||
logger.warn({ err }, 'Circuit breaker reset failed — gateway may be down');
|
||||
} finally {
|
||||
clearTimeout(timer);
|
||||
}
|
||||
}
|
||||
92
packages/ctx-health/src/index.ts
Normal file
92
packages/ctx-health/src/index.ts
Normal file
@ -0,0 +1,92 @@
|
||||
/**
|
||||
* CtxHealth — Infrastructure Self-Healing Daemon
|
||||
*
|
||||
* Monitors infrastructure every 60 seconds:
|
||||
* - PM2 processes, PostgreSQL, Ollama, Cloudflare tunnel,
|
||||
* disk space, memory, network, WireGuard
|
||||
*
|
||||
* When a check fails:
|
||||
* 1. Calls LLM Gateway for AI-powered root cause analysis
|
||||
* 2. Executes auto-healing actions (with cooldown + escalation guards)
|
||||
* 3. Logs all incidents to PostgreSQL
|
||||
*
|
||||
* Safety rules:
|
||||
* - After 3+ consecutive failures: escalate to human, skip healing
|
||||
* - Cooldown: no re-healing same check within 5 minutes
|
||||
* - Dry-run mode: CTX_HEALTH_DRY_RUN=true → diagnose but don't heal
|
||||
*/
|
||||
|
||||
import cron from 'node-cron';
|
||||
import { logger } from './observability/logger.js';
|
||||
import { closePool } from './db/client.js';
|
||||
import { runMigrations } from './db/incidents.js';
|
||||
import { runHealthCycle } from './runner.js';
|
||||
|
||||
const DRY_RUN = process.env['CTX_HEALTH_DRY_RUN'] === 'true';
|
||||
const CRON_SCHEDULE = process.env['CTX_HEALTH_CRON'] ?? '*/1 * * * *';
|
||||
|
||||
// ─── Startup ─────────────────────────────────────────────────────────────────
|
||||
|
||||
async function startup(): Promise<void> {
|
||||
logger.info({ dryRun: DRY_RUN, schedule: CRON_SCHEDULE }, 'CtxHealth daemon starting');
|
||||
|
||||
try {
|
||||
await runMigrations();
|
||||
} catch (err) {
|
||||
logger.error({ err }, 'Migration failed — proceeding without DB (degraded mode)');
|
||||
}
|
||||
|
||||
// Run one immediate cycle on start
|
||||
await safeRun();
|
||||
|
||||
cron.schedule(CRON_SCHEDULE, () => {
|
||||
void safeRun();
|
||||
});
|
||||
|
||||
logger.info('CtxHealth daemon running — press Ctrl+C to stop');
|
||||
}
|
||||
|
||||
let cycleRunning = false;
|
||||
|
||||
async function safeRun(): Promise<void> {
|
||||
if (cycleRunning) {
|
||||
logger.warn('Previous health cycle still running — skipping this tick');
|
||||
return;
|
||||
}
|
||||
cycleRunning = true;
|
||||
const start = Date.now();
|
||||
try {
|
||||
await runHealthCycle(DRY_RUN);
|
||||
logger.debug({ durationMs: Date.now() - start }, 'Health cycle completed');
|
||||
} catch (err) {
|
||||
logger.error({ err, durationMs: Date.now() - start }, 'Health cycle crashed unexpectedly');
|
||||
} finally {
|
||||
cycleRunning = false;
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Graceful shutdown ───────────────────────────────────────────────────────
|
||||
|
||||
async function shutdown(signal: string): Promise<void> {
|
||||
logger.info({ signal }, 'CtxHealth shutting down gracefully');
|
||||
try {
|
||||
await closePool();
|
||||
} catch (err) {
|
||||
logger.warn({ err }, 'Error closing DB pool during shutdown');
|
||||
}
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
process.on('SIGTERM', () => void shutdown('SIGTERM'));
|
||||
process.on('SIGINT', () => void shutdown('SIGINT'));
|
||||
|
||||
process.on('unhandledRejection', (reason) => {
|
||||
logger.error({ reason }, 'Unhandled promise rejection in ctx-health');
|
||||
});
|
||||
|
||||
// ─── Boot ────────────────────────────────────────────────────────────────────
|
||||
|
||||
void startup().catch((err) => {
|
||||
logger.error({ err }, 'Fatal startup error');
|
||||
process.exit(1);
|
||||
});
|
||||
13
packages/ctx-health/src/observability/logger.ts
Normal file
13
packages/ctx-health/src/observability/logger.ts
Normal file
@ -0,0 +1,13 @@
|
||||
import pino from 'pino';
|
||||
|
||||
export const logger = pino({
|
||||
level: process.env['LOG_LEVEL'] ?? 'info',
|
||||
name: 'ctx-health',
|
||||
transport:
|
||||
process.env['NODE_ENV'] !== 'production'
|
||||
? {
|
||||
target: 'pino-pretty',
|
||||
options: { colorize: true, translateTime: 'SYS:standard' },
|
||||
}
|
||||
: undefined,
|
||||
});
|
||||
162
packages/ctx-health/src/runner.ts
Normal file
162
packages/ctx-health/src/runner.ts
Normal file
@ -0,0 +1,162 @@
|
||||
/**
|
||||
* Health cycle runner — orchestrates checks, diagnosis, healing, and logging.
|
||||
*
|
||||
* Safety rules enforced here:
|
||||
* - MAX_CONSECUTIVE_FAILURES: escalate after 3 failures, skip healing
|
||||
* - COOLDOWN_MS: minimum 5 minutes between healing attempts per check
|
||||
*/
|
||||
|
||||
import { logger } from './observability/logger.js';
|
||||
import { healthChecks } from './checks/index.js';
|
||||
import { diagnoseIssue } from './gateway-client.js';
|
||||
import {
|
||||
getRecentIncidents,
|
||||
getConsecutiveFailures,
|
||||
insertIncident,
|
||||
markIncidentResolved,
|
||||
upsertStatus,
|
||||
} from './db/incidents.js';
|
||||
import type { CheckResult, DiagnosisResult, HealResult, HealthCheck } from './types.js';
|
||||
|
||||
const MAX_CONSECUTIVE_FAILURES = 3;
|
||||
const COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
// Track last heal attempt per check in-process (secondary to DB guard)
|
||||
const lastHealAttempt = new Map<string, number>();
|
||||
|
||||
// ─── Single check handler ─────────────────────────────────────────────────────
|
||||
|
||||
async function handleFailedCheck(
|
||||
hc: HealthCheck,
|
||||
checkResult: CheckResult,
|
||||
dryRun: boolean,
|
||||
): Promise<void> {
|
||||
const { name, category } = hc;
|
||||
|
||||
logger.warn({ check: name, category, message: checkResult.message }, 'Health check FAILED');
|
||||
|
||||
// Fetch history for LLM context and consecutive failure count
|
||||
const [recentHistory, consecutiveFailures] = await Promise.all([
|
||||
getRecentIncidents(name, 10).catch(() => []),
|
||||
getConsecutiveFailures(name).catch(() => 0),
|
||||
]);
|
||||
|
||||
// Escalate if too many consecutive failures
|
||||
if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
|
||||
logger.error(
|
||||
{ check: name, consecutiveFailures },
|
||||
`ESCALATION: ${consecutiveFailures} consecutive failures — skipping auto-heal, manual intervention required`,
|
||||
);
|
||||
}
|
||||
|
||||
// Get LLM diagnosis (always, even in dry-run — for logging)
|
||||
let diagnosis: DiagnosisResult;
|
||||
try {
|
||||
diagnosis = await diagnoseIssue(name, checkResult, recentHistory);
|
||||
} catch (err) {
|
||||
logger.error({ err, check: name }, 'Diagnosis call failed unexpectedly');
|
||||
diagnosis = {
|
||||
action: 'Review logs manually.',
|
||||
severity: 'warning',
|
||||
auto_heal: false,
|
||||
raw_output: '',
|
||||
};
|
||||
}
|
||||
|
||||
logger.info({ check: name, severity: diagnosis.severity, action: diagnosis.action.slice(0, 120) }, 'LLM diagnosis');
|
||||
|
||||
// Determine if healing should run
|
||||
const shouldHeal =
|
||||
!dryRun &&
|
||||
diagnosis.auto_heal &&
|
||||
consecutiveFailures < MAX_CONSECUTIVE_FAILURES &&
|
||||
!isCoolingDown(name);
|
||||
|
||||
let healResult: HealResult | null = null;
|
||||
|
||||
if (shouldHeal) {
|
||||
lastHealAttempt.set(name, Date.now());
|
||||
logger.info({ check: name }, 'Executing auto-heal action');
|
||||
try {
|
||||
healResult = await hc.heal(diagnosis.action);
|
||||
logger.info(
|
||||
{ check: name, success: healResult.success, action: healResult.action_taken },
|
||||
'Heal action completed',
|
||||
);
|
||||
} catch (err) {
|
||||
logger.error({ err, check: name }, 'Heal action threw unexpectedly');
|
||||
healResult = {
|
||||
action_taken: 'heal function threw an exception',
|
||||
success: false,
|
||||
output: err instanceof Error ? err.message : String(err),
|
||||
};
|
||||
}
|
||||
} else if (dryRun) {
|
||||
logger.info({ check: name, action: diagnosis.action.slice(0, 120) }, '[DRY-RUN] Would execute heal action');
|
||||
} else if (!diagnosis.auto_heal) {
|
||||
logger.info({ check: name }, 'LLM advises against auto-heal — skipping');
|
||||
} else if (isCoolingDown(name)) {
|
||||
const remainingMs = COOLDOWN_MS - (Date.now() - (lastHealAttempt.get(name) ?? 0));
|
||||
logger.info({ check: name, remainingMs }, 'Heal on cooldown — skipping');
|
||||
}
|
||||
|
||||
// Persist incident to DB
|
||||
try {
|
||||
await insertIncident(name, category, checkResult, diagnosis, healResult, shouldHeal && (healResult?.success ?? false));
|
||||
} catch (err) {
|
||||
logger.error({ err, check: name }, 'Failed to persist incident to DB');
|
||||
}
|
||||
|
||||
// Update status table
|
||||
await upsertStatus(name, false).catch((err) => logger.warn({ err, check: name }, 'upsertStatus failed'));
|
||||
}
|
||||
|
||||
function isCoolingDown(checkName: string): boolean {
|
||||
const last = lastHealAttempt.get(checkName);
|
||||
if (!last) return false;
|
||||
return Date.now() - last < COOLDOWN_MS;
|
||||
}
|
||||
|
||||
// ─── Full cycle ───────────────────────────────────────────────────────────────
|
||||
|
||||
export async function runHealthCycle(dryRun: boolean): Promise<void> {
|
||||
logger.debug({ dryRun, checks: healthChecks.length }, 'Running health cycle');
|
||||
|
||||
// Run all checks in parallel
|
||||
const results = await Promise.allSettled(
|
||||
healthChecks.map(async (hc) => {
|
||||
const start = Date.now();
|
||||
let checkResult: CheckResult;
|
||||
try {
|
||||
checkResult = await hc.check();
|
||||
} catch (err) {
|
||||
checkResult = {
|
||||
healthy: false,
|
||||
message: `Check threw exception: ${err instanceof Error ? err.message : String(err)}`,
|
||||
latency_ms: Date.now() - start,
|
||||
};
|
||||
}
|
||||
return { hc, checkResult };
|
||||
}),
|
||||
);
|
||||
|
||||
// Process results sequentially to avoid DB write races
|
||||
for (const result of results) {
|
||||
if (result.status === 'rejected') {
|
||||
logger.error({ reason: result.reason }, 'Unexpected check runner rejection');
|
||||
continue;
|
||||
}
|
||||
|
||||
const { hc, checkResult } = result.value;
|
||||
|
||||
if (checkResult.healthy) {
|
||||
logger.debug({ check: hc.name, latency_ms: checkResult.latency_ms }, 'Check passed');
|
||||
await Promise.allSettled([
|
||||
markIncidentResolved(hc.name),
|
||||
upsertStatus(hc.name, true),
|
||||
]);
|
||||
} else {
|
||||
await handleFailedCheck(hc, checkResult, dryRun);
|
||||
}
|
||||
}
|
||||
}
|
||||
42
packages/ctx-health/src/types.ts
Normal file
42
packages/ctx-health/src/types.ts
Normal file
@ -0,0 +1,42 @@
|
||||
export interface CheckResult {
|
||||
healthy: boolean;
|
||||
message: string;
|
||||
details?: Record<string, unknown>;
|
||||
latency_ms?: number;
|
||||
}
|
||||
|
||||
export interface HealResult {
|
||||
action_taken: string;
|
||||
success: boolean;
|
||||
output?: string;
|
||||
}
|
||||
|
||||
export interface HealthCheck {
|
||||
name: string;
|
||||
category: 'process' | 'network' | 'database' | 'tunnel' | 'service';
|
||||
check: () => Promise<CheckResult>;
|
||||
heal: (diagnosis: string) => Promise<HealResult>;
|
||||
}
|
||||
|
||||
export interface IncidentRecord {
|
||||
id: string;
|
||||
created_at: Date;
|
||||
resolved_at: Date | null;
|
||||
check_name: string;
|
||||
category: string;
|
||||
severity: 'info' | 'warning' | 'critical';
|
||||
error_message: string;
|
||||
details: Record<string, unknown> | null;
|
||||
llm_diagnosis: string | null;
|
||||
action_taken: string | null;
|
||||
heal_success: boolean | null;
|
||||
heal_output: string | null;
|
||||
auto_healed: boolean;
|
||||
}
|
||||
|
||||
export interface DiagnosisResult {
|
||||
action: string;
|
||||
severity: 'info' | 'warning' | 'critical';
|
||||
auto_heal: boolean;
|
||||
raw_output: string;
|
||||
}
|
||||
22
packages/ctx-health/tsconfig.json
Normal file
22
packages/ctx-health/tsconfig.json
Normal file
@ -0,0 +1,22 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "NodeNext",
|
||||
"moduleResolution": "NodeNext",
|
||||
"lib": ["ES2022"],
|
||||
"outDir": "./dist",
|
||||
"rootDir": "./src",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"skipLibCheck": true,
|
||||
"declaration": true,
|
||||
"declarationMap": true,
|
||||
"sourceMap": true,
|
||||
"noUncheckedIndexedAccess": true,
|
||||
"noImplicitReturns": true,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
"resolveJsonModule": true
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
"exclude": ["node_modules", "dist"]
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user