From a8a77e689c87f04c56830416ca5102aaa948b7de Mon Sep 17 00:00:00 2001
From: Rene Fichtmueller
Date: Fri, 3 Apr 2026 00:14:23 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20add=20CtxHealth=20+=20CtxSecurity=20to?=
 =?UTF-8?q?=20gateway=20=E2=80=94=20ctxhealer:latest=20model,=205=20routin?=
 =?UTF-8?q?g=20rules,=202=20templates?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (free-form text in this position is ignored by `git am`):

* Layout: this copy was restored from a whitespace-collapsed paste. Line
  breaks were re-derived so that every hunk matches its own header counts
  (122 / 94 / 12->31 / 7->7 / 6->64 lines) and the diffstat totals.
  Indentation is conventional 2-space YAML; NOTE(review): exact column
  counts, in particular on context lines, could not be recovered from the
  collapsed text. Confirm against the original commit before applying.
* Adds two prompt templates (ctx_health_diagnose, ctx_security_classify),
  three model entries (qwen2.5:7b, ctxhealer:latest, llama-guard3:1b),
  inserts qwen2.5:7b into the `fast` fallback chain, and adds five
  routing rules.
* ctx_security_report sets `output_format: text` but reuses the
  ctx_security_classify template, whose system prompt says "Return ONLY
  valid JSON". TODO confirm this is intended or give it its own template.
* ctx_health_alert reuses the ctx_health_diagnose template unchanged.
* llama-guard3:1b is registered in models.yaml but no routing rule in
  this patch references it.
* ctx_health_diagnose lists its primary model as the first entry of its
  own fallback_chain. TODO confirm the router expects that.
* ctx_security_classify, second few-shot example: the answer uses
  `false_positive_risk: low` with action `rtbh`. NOTE(review): confirm the
  example source IP is not a shared/Tor-exit address, since the template's
  own rule asks for `high` on those.
* Both system prompts embed the production server's public IP address.
  NOTE(review): confirm this is acceptable to commit.

 .../templates/ctx_health_diagnose.yaml        | 122 ++++++++++++++++++
 .../templates/ctx_security_classify.yaml      |  94 ++++++++++++++
 packages/gateway/src/config/models.yaml       |  21 ++-
 .../gateway/src/config/routing-rules.yaml     |  58 +++++++++
 4 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 packages/gateway/prompts/templates/ctx_health_diagnose.yaml
 create mode 100644 packages/gateway/prompts/templates/ctx_security_classify.yaml

diff --git a/packages/gateway/prompts/templates/ctx_health_diagnose.yaml b/packages/gateway/prompts/templates/ctx_health_diagnose.yaml
new file mode 100644
index 0000000..6fcc0c0
--- /dev/null
+++ b/packages/gateway/prompts/templates/ctx_health_diagnose.yaml
@@ -0,0 +1,122 @@
+id: ctx_health_diagnose
+version: "1.0.0"
+task_type: ctx_health_diagnose
+description: Diagnose infrastructure problems and recommend auto-healing actions for CtxHealth daemon
+model_preference: ctxhealer:latest
+model_minimum: qwen2.5:14b
+temperature: 0.1
+max_tokens: 512
+output_format: json
+
+system_prompt: |
+  You are CtxHealer — an expert infrastructure reliability engineer and SRE AI assistant.
+  Your job is to diagnose infrastructure problems on the Context X production server (Erik, IONOS VPS, Ubuntu)
+  and recommend specific, executable healing actions.
+
+  Infrastructure context:
+  - Server: Erik (217.154.82.179, IONOS VPS, Ubuntu 22.04)
+  - Services: PM2 (llm-gateway:3103, llm-learning, eo-global-pulse, peercortex, switchblade, ctxevent, tip-api)
+  - Tunnels: Cloudflare tunnel (cloudflared) → context-x.org, ollama.fichtmueller.org
+  - DB: PostgreSQL 15 (llm_gateway, transceiver_db, ctxmeet)
+  - VPN: WireGuard wg0 (10.10.0.1/24)
+  - Network: Hetzner upstream BGP (AS24940)
+
+  Return ONLY valid JSON with this structure:
+  {
+    "severity": "info" | "warning" | "critical",
+    "root_cause": "Brief root cause explanation (1-2 sentences)",
+    "confidence": 0.0-1.0,
+    "auto_heal": true | false,
+    "actions": [
+      {
+        "command": "pm2 restart llm-gateway",
+        "description": "Restart the gateway process",
+        "safe": true,
+        "estimated_downtime_s": 5
+      }
+    ],
+    "monitoring_advice": "What to watch after healing",
+    "escalate_if": "Condition that requires human intervention"
+  }
+
+  Rules:
+  - Only recommend safe, reversible actions
+  - Set auto_heal=false if root cause is unclear or action could cause data loss
+  - Set severity=critical only if the issue affects end users right now
+  - Prefer PM2 restarts over system reboots
+  - Never recommend deleting data
+
+user_template: |
+  Infrastructure check FAILED:
+
+  Check name: {{check_name}}
+  Category: {{category}}
+  Error: {{error_message}}
+  Consecutive failures: {{consecutive_failures}}
+  Last healthy: {{last_healthy}}
+
+  Recent incident history (last 24h):
+  {{incident_history}}
+
+  System context:
+  {{input}}
+
+  Diagnose and recommend healing actions:
+
+few_shot_examples:
+  - user: |
+      Check name: pm2-llm-gateway
+      Category: process
+      Error: Process llm-gateway is stopped (status: stopped, restarts: 5)
+      Consecutive failures: 2
+      Last healthy: 2026-04-02T20:00:00Z
+      Recent incident history: 1 restart in last 6h
+      System context: Memory 78% used, CPU 12%, disk 45%
+    assistant: |
+      {
+        "severity": "critical",
+        "root_cause": "PM2 process llm-gateway has stopped, likely due to an uncaught exception or OOM kill. 5 restarts suggests a crash loop.",
+        "confidence": 0.85,
+        "auto_heal": true,
+        "actions": [
+          {
+            "command": "pm2 restart llm-gateway",
+            "description": "Restart the crashed gateway process",
+            "safe": true,
+            "estimated_downtime_s": 8
+          }
+        ],
+        "monitoring_advice": "Watch PM2 logs for 5 minutes: pm2 logs llm-gateway --lines 50",
+        "escalate_if": "Process crashes again within 10 minutes — indicates unresolved code error"
+      }
+  - user: |
+      Check name: disk-space
+      Category: system
+      Error: Disk usage at 91% (/dev/sda1)
+      Consecutive failures: 1
+      Last healthy: 2026-04-02T18:00:00Z
+      Recent incident history: none
+      System context: /var/log growing fast, /opt stable
+    assistant: |
+      {
+        "severity": "warning",
+        "root_cause": "Disk usage at 91%, likely caused by log file accumulation in /var/log. Immediate risk of service failure if disk fills.",
+        "confidence": 0.80,
+        "auto_heal": false,
+        "actions": [
+          {
+            "command": "journalctl --vacuum-size=500M",
+            "description": "Truncate system journal to 500MB",
+            "safe": true,
+            "estimated_downtime_s": 0
+          },
+          {
+            "command": "pm2 flush",
+            "description": "Clear PM2 log files",
+            "safe": true,
+            "estimated_downtime_s": 0
+          }
+        ],
+        "monitoring_advice": "Check disk usage again in 1 hour: df -h /",
+        "escalate_if": "Disk reaches 95% — manual cleanup required to prevent data loss"
+      }
diff --git a/packages/gateway/prompts/templates/ctx_security_classify.yaml b/packages/gateway/prompts/templates/ctx_security_classify.yaml
new file mode 100644
index 0000000..544fdd3
--- /dev/null
+++ b/packages/gateway/prompts/templates/ctx_security_classify.yaml
@@ -0,0 +1,94 @@
+id: ctx_security_classify
+version: "1.0.0"
+task_type: ctx_security_classify
+description: Classify security threats and recommend automated defense actions for CtxSecurity daemon
+model_preference: qwen2.5:14b
+model_minimum: qwen2.5:7b
+temperature: 0.05
+max_tokens: 512
+output_format: json
+
+system_prompt: |
+  You are CtxSecurity — an expert Blue Team security analyst AI for the Context X infrastructure.
+  Your job is to classify security threats and recommend precise, automated defense actions.
+
+  Infrastructure:
+  - Server: Erik (217.154.82.179, IONOS VPS)
+  - Exposed services: SSH:22, Cloudflare tunnel (HTTP/S only), WireGuard:51820
+  - Protected by: iptables/ipset (ctx-security-block), fail2ban, Cloudflare DDoS protection
+  - BGP RTBH available for DDoS mitigation (via AS24940 upstream)
+
+  Threat classification levels:
+  - low: informational, log only
+  - medium: rate-limit or temporary block (1h)
+  - high: 24h block, alert
+  - critical: permanent block + BGP RTBH for DDoS
+
+  Return ONLY valid JSON:
+  {
+    "threat_level": "low" | "medium" | "high" | "critical",
+    "threat_type": "brute_force" | "ddos" | "port_scan" | "credential_stuffing" | "web_scraping" | "other",
+    "confidence": 0.0-1.0,
+    "automated_action": "block_1h" | "block_24h" | "block_permanent" | "rtbh" | "rate_limit" | "monitor" | "none",
+    "block_duration_minutes": null | number,
+    "reasoning": "Brief explanation (1-2 sentences)",
+    "ioc": ["list of indicators of compromise"],
+    "false_positive_risk": "low" | "medium" | "high"
+  }
+
+  Rules:
+  - Never recommend blocking RFC1918/private ranges
+  - RTBH only for confirmed DDoS (> 500 req/min from single IP or coordinated)
+  - Set false_positive_risk=high for shared IPs (CDNs, Tor exit nodes, residential ISPs)
+  - Brute force threshold: > 5 failures/2min = medium, > 20 failures/10min = high
+
+user_template: |
+  Security event detected:
+
+  Event type: {{event_type}}
+  Source IP: {{source_ip}}
+  Timestamp: {{timestamp}}
+
+  Event details:
+  {{input}}
+
+  Recent events from this IP (last 24h):
+  {{ip_history}}
+
+  Classify this threat and recommend action:
+
+few_shot_examples:
+  - user: |
+      Event type: brute_force
+      Source IP: 45.152.66.122
+      Timestamp: 2026-04-02T21:00:00Z
+      Event details: 47 failed SSH login attempts in 8 minutes. Usernames tried: root, admin, ubuntu, pi, test, postgres
+      Recent events from this IP: None previously seen
+    assistant: |
+      {
+        "threat_level": "high",
+        "threat_type": "brute_force",
+        "confidence": 0.97,
+        "automated_action": "block_24h",
+        "block_duration_minutes": 1440,
+        "reasoning": "47 failed SSH attempts in 8 minutes using common credential lists — clear automated brute force attack. No legitimate use case.",
+        "ioc": ["45.152.66.122", "username_root", "username_admin", "username_ubuntu"],
+        "false_positive_risk": "low"
+      }
+  - user: |
+      Event type: ddos
+      Source IP: 185.220.101.45
+      Timestamp: 2026-04-02T22:00:00Z
+      Event details: 1240 HTTP requests per minute to nginx. All targeting /v1/completion endpoint. User-Agent: curl/7.x. All returning 429.
+      Recent events from this IP: 200 req/min 1h ago
+    assistant: |
+      {
+        "threat_level": "critical",
+        "threat_type": "ddos",
+        "confidence": 0.92,
+        "automated_action": "rtbh",
+        "block_duration_minutes": 30,
+        "reasoning": "1240 req/min HTTP flood targeting API endpoint from single IP, escalating from 200 req/min 1h ago. BGP RTBH appropriate to protect upstream bandwidth.",
+        "ioc": ["185.220.101.45", "http_flood_v1_completion"],
+        "false_positive_risk": "low"
+      }
diff --git a/packages/gateway/src/config/models.yaml b/packages/gateway/src/config/models.yaml
index 6945091..ab04a54 100644
--- a/packages/gateway/src/config/models.yaml
+++ b/packages/gateway/src/config/models.yaml
@@ -25,12 +25,31 @@ models:
     strengths: [classification, short_text, routing]
     max_tokens_default: 512
 
+  qwen2.5:7b:
+    tier: fast
+    context_length: 32768
+    strengths: [classification, summarization, short_analysis]
+    max_tokens_default: 1024
+
   phi3.5:3.8b:
     tier: fast
     context_length: 128000
     strengths: [classification, summarization]
     max_tokens_default: 512
 
+  # Custom fine-tuned models (Context X)
+  ctxhealer:latest:
+    tier: medium
+    context_length: 32768
+    strengths: [infrastructure_diagnosis, root_cause_analysis, remediation_steps]
+    max_tokens_default: 1024
+
+  llama-guard3:1b:
+    tier: fast
+    context_length: 8192
+    strengths: [safety_classification, threat_detection]
+    max_tokens_default: 256
+
   # Medium tier
   qwen2.5:14b:
     tier: medium
@@ -77,7 +96,7 @@ models:
 
 # Fallback chains per tier
 fallback_chains:
-  fast: [qwen2.5:3b, phi3.5:3.8b]
+  fast: [qwen2.5:3b, qwen2.5:7b, phi3.5:3.8b]
   medium: [qwen2.5:14b, mistral:7b, llama3.2:8b]
   large: [qwen2.5:32b, llama3.3:70b, deepseek-r1:32b]
 
diff --git a/packages/gateway/src/config/routing-rules.yaml b/packages/gateway/src/config/routing-rules.yaml
index 4f4de12..dd543cc 100644
--- a/packages/gateway/src/config/routing-rules.yaml
+++ b/packages/gateway/src/config/routing-rules.yaml
@@ -1229,6 +1229,64 @@ routing_rules:
     validators: [schema]
     callers: [all]
 
+  # ─── CtxHealth — Infrastructure Self-Healing ───────────────────────────────
+  ctx_health_diagnose:
+    model: ctxhealer:latest
+    tier: medium
+    prompt_template: ctx_health_diagnose
+    temperature: 0.1
+    max_tokens: 512
+    output_format: json
+    requires_fact_check: false
+    validators: [schema, length]
+    callers: [ctx-health, internal]
+    fallback_chain: [ctxhealer:latest, qwen2.5:14b]
+
+  ctx_health_alert:
+    model: qwen2.5:14b
+    tier: medium
+    prompt_template: ctx_health_diagnose
+    temperature: 0.1
+    max_tokens: 512
+    output_format: json
+    requires_fact_check: false
+    validators: [schema, length]
+    callers: [ctx-health, internal]
+
+  # ─── CtxSecurity — Blue/Red Team Defense ──────────────────────────────────
+  ctx_security_classify:
+    model: qwen2.5:14b
+    tier: medium
+    prompt_template: ctx_security_classify
+    temperature: 0.05
+    max_tokens: 512
+    output_format: json
+    requires_fact_check: false
+    validators: [schema, length]
+    callers: [ctx-security, internal]
+
+  ctx_security_ddos:
+    model: qwen2.5:14b
+    tier: medium
+    prompt_template: ctx_security_classify
+    temperature: 0.05
+    max_tokens: 256
+    output_format: json
+    requires_fact_check: false
+    validators: [schema]
+    callers: [ctx-security, internal]
+
+  ctx_security_report:
+    model: qwen2.5:14b
+    tier: medium
+    prompt_template: ctx_security_classify
+    temperature: 0.2
+    max_tokens: 2048
+    output_format: text
+    requires_fact_check: false
+    validators: [banlist, length]
+    callers: [ctx-security, internal]
+
 # Validator configuration
 validators:
   schema: