feat: add CtxHealth + CtxSecurity to gateway — ctxhealer:latest model, 5 routing rules, 2 templates

2026-04-03 00:14:23 +02:00 · 2026-04-03 00:14:23 +02:00 · a8a77e689c
commit a8a77e689c
parent 9b4d1caa8a
4 changed files with 294 additions and 1 deletions
--- a/packages/gateway/prompts/templates/ctx_health_diagnose.yaml
+++ b/packages/gateway/prompts/templates/ctx_health_diagnose.yaml
@ -0,0 +1,122 @@
+id: ctx_health_diagnose  # same identifier as task_type below
+version: "1.0.0"
+task_type: ctx_health_diagnose
+description: Diagnose infrastructure problems and recommend auto-healing actions for CtxHealth daemon
+model_preference: ctxhealer:latest  # the ctx_health_diagnose routing rule selects this same model
+model_minimum: qwen2.5:14b  # presumably the weakest acceptable substitute - confirm loader semantics
+temperature: 0.1
+max_tokens: 512
+output_format: json  # system_prompt below instructs the model to return ONLY valid JSON
+
+system_prompt: |
+  You are CtxHealer — an expert infrastructure reliability engineer and SRE AI assistant.
+  Your job is to diagnose infrastructure problems on the Context X production server (Erik, IONOS VPS, Ubuntu)
+  and recommend specific, executable healing actions.
+
+  Infrastructure context:
+  - Server: Erik (217.154.82.179, IONOS VPS, Ubuntu 22.04)
+  - Services: PM2 (llm-gateway:3103, llm-learning, eo-global-pulse, peercortex, switchblade, ctxevent, tip-api)
+  - Tunnels: Cloudflare tunnel (cloudflared) → context-x.org, ollama.fichtmueller.org
+  - DB: PostgreSQL 15 (llm_gateway, transceiver_db, ctxmeet)
+  - VPN: WireGuard wg0 (10.10.0.1/24)
+  - Network: Hetzner upstream BGP (AS24940)
+
+  Return ONLY valid JSON with this structure:
+  {
+    "severity": "info" | "warning" | "critical",
+    "root_cause": "Brief root cause explanation (1-2 sentences)",
+    "confidence": 0.0-1.0,
+    "auto_heal": true | false,
+    "actions": [
+      {
+        "command": "pm2 restart llm-gateway",
+        "description": "Restart the gateway process",
+        "safe": true,
+        "estimated_downtime_s": 5
+      }
+    ],
+    "monitoring_advice": "What to watch after healing",
+    "escalate_if": "Condition that requires human intervention"
+  }
+
+  Rules:
+  - Only recommend safe, reversible actions
+  - Set auto_heal=false if root cause is unclear or action could cause data loss
+  - Set severity=critical only if the issue affects end users right now
+  - Prefer PM2 restarts over system reboots
+  - Never recommend deleting application or database data; bounded log cleanup (journal vacuum, pm2 flush) is acceptable
+
+user_template: |
+  Infrastructure check FAILED:
+
+  Check name: {{check_name}}
+  Category: {{category}}
+  Error: {{error_message}}
+  Consecutive failures: {{consecutive_failures}}
+  Last healthy: {{last_healthy}}
+
+  Recent incident history (last 24h):
+  {{incident_history}}
+
+  System context:
+  {{input}}
+
+  Diagnose and recommend healing actions:
+
+few_shot_examples:
+  - user: |
+      Check name: pm2-llm-gateway
+      Category: process
+      Error: Process llm-gateway is stopped (status: stopped, restarts: 5)
+      Consecutive failures: 2
+      Last healthy: 2026-04-02T20:00:00Z
+      Recent incident history: 1 restart in last 6h
+      System context: Memory 78% used, CPU 12%, disk 45%
+    assistant: |
+      {
+        "severity": "critical",
+        "root_cause": "PM2 process llm-gateway has stopped, likely due to an uncaught exception or OOM kill. 5 restarts suggests a crash loop.",
+        "confidence": 0.85,
+        "auto_heal": true,
+        "actions": [
+          {
+            "command": "pm2 restart llm-gateway",
+            "description": "Restart the crashed gateway process",
+            "safe": true,
+            "estimated_downtime_s": 8
+          }
+        ],
+        "monitoring_advice": "Watch PM2 logs for 5 minutes: pm2 logs llm-gateway --lines 50",
+        "escalate_if": "Process crashes again within 10 minutes — indicates unresolved code error"
+      }
+  - user: |
+      Check name: disk-space
+      Category: system
+      Error: Disk usage at 91% (/dev/sda1)
+      Consecutive failures: 1
+      Last healthy: 2026-04-02T18:00:00Z
+      Recent incident history: none
+      System context: /var/log growing fast, /opt stable
+    assistant: |
+      {
+        "severity": "warning",
+        "root_cause": "Disk usage at 91%, likely caused by log file accumulation in /var/log. Immediate risk of service failure if disk fills.",
+        "confidence": 0.80,
+        "auto_heal": false,
+        "actions": [
+          {
+            "command": "journalctl --vacuum-size=500M",
+            "description": "Remove the oldest archived journal files until the journal uses at most 500MB",
+            "safe": true,
+            "estimated_downtime_s": 0
+          },
+          {
+            "command": "pm2 flush",
+            "description": "Clear PM2 log files",
+            "safe": true,
+            "estimated_downtime_s": 0
+          }
+        ],
+        "monitoring_advice": "Check disk usage again in 1 hour: df -h /",
+        "escalate_if": "Disk reaches 95% — manual cleanup required to prevent data loss"
+      }
--- a/packages/gateway/prompts/templates/ctx_security_classify.yaml
+++ b/packages/gateway/prompts/templates/ctx_security_classify.yaml
@ -0,0 +1,94 @@
+id: ctx_security_classify  # same identifier as task_type below
+version: "1.0.0"
+task_type: ctx_security_classify
+description: Classify security threats and recommend automated defense actions for CtxSecurity daemon
+model_preference: qwen2.5:14b  # the ctx_security_* routing rules select this same model
+model_minimum: qwen2.5:7b  # registered as a fast-tier model in src/config/models.yaml
+temperature: 0.05
+max_tokens: 512
+output_format: json  # system_prompt below instructs the model to return ONLY valid JSON
+
+system_prompt: |
+  You are CtxSecurity — an expert Blue Team security analyst AI for the Context X infrastructure.
+  Your job is to classify security threats and recommend precise, automated defense actions.
+
+  Infrastructure:
+  - Server: Erik (217.154.82.179, IONOS VPS)
+  - Exposed services: SSH:22, Cloudflare tunnel (HTTP/S only), WireGuard:51820
+  - Protected by: iptables/ipset (ctx-security-block), fail2ban, Cloudflare DDoS protection
+  - BGP RTBH available for DDoS mitigation (via AS24940 upstream)
+
+  Threat classification levels:
+  - low: informational, log only
+  - medium: rate-limit or temporary block (1h)
+  - high: 24h block, alert
+  - critical: permanent block + BGP RTBH for DDoS
+
+  Return ONLY valid JSON:
+  {
+    "threat_level": "low" | "medium" | "high" | "critical",
+    "threat_type": "brute_force" | "ddos" | "port_scan" | "credential_stuffing" | "web_scraping" | "other",
+    "confidence": 0.0-1.0,
+    "automated_action": "block_1h" | "block_24h" | "block_permanent" | "rtbh" | "rate_limit" | "monitor" | "none",
+    "block_duration_minutes": null | number,
+    "reasoning": "Brief explanation (1-2 sentences)",
+    "ioc": ["list of indicators of compromise"],
+    "false_positive_risk": "low" | "medium" | "high"
+  }
+
+  Rules:
+  - Never recommend blocking RFC1918/private ranges
+  - RTBH only for confirmed DDoS (> 500 req/min from single IP or coordinated)
+  - Set false_positive_risk=high for shared IPs (CDNs, Tor exit nodes, residential ISPs)
+  - Brute force threshold: > 5 failures/2min = medium, > 20 failures/10min = high
+
+user_template: |
+  Security event detected:
+
+  Event type: {{event_type}}
+  Source IP: {{source_ip}}
+  Timestamp: {{timestamp}}
+
+  Event details:
+  {{input}}
+
+  Recent events from this IP (last 24h):
+  {{ip_history}}
+
+  Classify this threat and recommend action:
+
+few_shot_examples:
+  - user: |
+      Event type: brute_force
+      Source IP: 45.152.66.122
+      Timestamp: 2026-04-02T21:00:00Z
+      Event details: 47 failed SSH login attempts in 8 minutes. Usernames tried: root, admin, ubuntu, pi, test, postgres
+      Recent events from this IP: None previously seen
+    assistant: |
+      {
+        "threat_level": "high",
+        "threat_type": "brute_force",
+        "confidence": 0.97,
+        "automated_action": "block_24h",
+        "block_duration_minutes": 1440,
+        "reasoning": "47 failed SSH attempts in 8 minutes using common credential lists — clear automated brute force attack. No legitimate use case.",
+        "ioc": ["45.152.66.122", "username_root", "username_admin", "username_ubuntu"],
+        "false_positive_risk": "low"
+      }
+  - user: |
+      Event type: ddos
+      Source IP: 185.220.101.45
+      Timestamp: 2026-04-02T22:00:00Z
+      Event details: 1240 HTTP requests per minute to nginx. All targeting /v1/completion endpoint. User-Agent: curl/7.x. All returning 429.
+      Recent events from this IP: 200 req/min 1h ago
+    assistant: |
+      {
+        "threat_level": "critical",
+        "threat_type": "ddos",
+        "confidence": 0.92,
+        "automated_action": "rtbh",
+        "block_duration_minutes": 30,
+        "reasoning": "1240 req/min HTTP flood targeting API endpoint from single IP, escalating from 200 req/min 1h ago. BGP RTBH appropriate to protect upstream bandwidth.",
+        "ioc": ["185.220.101.45", "http_flood_v1_completion"],
+        "false_positive_risk": "low"
+      }
--- a/packages/gateway/src/config/models.yaml
+++ b/packages/gateway/src/config/models.yaml
@ -25,12 +25,31 @@ models:
    strengths: [classification, short_text, routing]
    max_tokens_default: 512

+  qwen2.5:7b:  # model_minimum of the ctx_security_classify prompt template
+    tier: fast  # also listed in fallback_chains.fast
+    context_length: 32768
+    strengths: [classification, summarization, short_analysis]
+    max_tokens_default: 1024  # NOTE(review): fast-tier phi3.5:3.8b uses 512 - confirm 1024 is intended
+
  phi3.5:3.8b:
    tier: fast
    context_length: 128000
    strengths: [classification, summarization]
    max_tokens_default: 512

+  # Custom fine-tuned models (Context X)
+  ctxhealer:latest:  # primary model of the ctx_health_diagnose routing rule
+    tier: medium  # NOTE(review): not listed in fallback_chains.medium - only reachable via rules that name it?
+    context_length: 32768
+    strengths: [infrastructure_diagnosis, root_cause_analysis, remediation_steps]
+    max_tokens_default: 1024  # the ctx_health_diagnose rule and template both set max_tokens to 512
+
+  llama-guard3:1b:  # NOTE(review): sits under the "Custom fine-tuned models" heading - confirm it is a Context X fine-tune
+    tier: fast  # not part of fallback_chains.fast
+    context_length: 8192
+    strengths: [safety_classification, threat_detection]
+    max_tokens_default: 256  # NOTE(review): no routing rule or template visible in this change references this model
+
  # Medium tier
  qwen2.5:14b:
    tier: medium
@ -77,7 +96,7 @@ models:

 # Fallback chains per tier
 fallback_chains:
-  fast: [qwen2.5:3b, phi3.5:3.8b]
+  fast: [qwen2.5:3b, qwen2.5:7b, phi3.5:3.8b]  # presumably tried left to right - confirm; llama-guard3:1b (fast) is intentionally absent?
  medium: [qwen2.5:14b, mistral:7b, llama3.2:8b]
  large: [qwen2.5:32b, llama3.3:70b, deepseek-r1:32b]

--- a/packages/gateway/src/config/routing-rules.yaml
+++ b/packages/gateway/src/config/routing-rules.yaml
@ -1229,6 +1229,64 @@ routing_rules:
    validators: [schema]
    callers: [all]

+  # ─── CtxHealth — Infrastructure Self-Healing ───────────────────────────────
+  ctx_health_diagnose:  # diagnosis task routed to the ctxhealer model
+    model: ctxhealer:latest
+    tier: medium  # same tier models.yaml declares for ctxhealer:latest
+    prompt_template: ctx_health_diagnose
+    temperature: 0.1  # same value as the template's own temperature
+    max_tokens: 512  # same value as the template's own max_tokens
+    output_format: json
+    requires_fact_check: false
+    validators: [schema, length]
+    callers: [ctx-health, internal]
+    fallback_chain: [ctxhealer:latest, qwen2.5:14b]  # NOTE(review): first entry repeats `model` - confirm the gateway expects the primary listed here
+
+  ctx_health_alert:  # NOTE(review): reuses the diagnose template unchanged - confirm alerts need no dedicated prompt
+    model: qwen2.5:14b  # the template prefers ctxhealer:latest; this rule picks the template's model_minimum instead
+    tier: medium
+    prompt_template: ctx_health_diagnose
+    temperature: 0.1
+    max_tokens: 512
+    output_format: json
+    requires_fact_check: false
+    validators: [schema, length]
+    callers: [ctx-health, internal]  # no fallback_chain key here, unlike ctx_health_diagnose
+
+  # ─── CtxSecurity — Blue/Red Team Defense ──────────────────────────────────
+  ctx_security_classify:  # threat classification using the template of the same name
+    model: qwen2.5:14b  # equals the template's model_preference
+    tier: medium
+    prompt_template: ctx_security_classify
+    temperature: 0.05  # same value as the template's own temperature
+    max_tokens: 512  # same value as the template's own max_tokens
+    output_format: json
+    requires_fact_check: false
+    validators: [schema, length]
+    callers: [ctx-security, internal]
+
+  ctx_security_ddos:  # shares the ctx_security_classify template; differs in max_tokens and validators
+    model: qwen2.5:14b
+    tier: medium
+    prompt_template: ctx_security_classify
+    temperature: 0.05
+    max_tokens: 256  # half of the template's 512 - NOTE(review): confirm the full JSON reply still fits
+    output_format: json
+    requires_fact_check: false
+    validators: [schema]  # no `length` validator, unlike ctx_security_classify
+    callers: [ctx-security, internal]
+
+  ctx_security_report:  # free-text report task; currently bound to the JSON classification template
+    model: qwen2.5:14b
+    tier: medium
+    prompt_template: ctx_security_classify  # NOTE(review): this template's system prompt says "Return ONLY valid JSON" - conflicts with output_format: text below; likely needs its own report template
+    temperature: 0.2  # template itself declares 0.05
+    max_tokens: 2048  # template itself declares 512
+    output_format: text
+    requires_fact_check: false
+    validators: [banlist, length]  # no `schema` validator, consistent with text output
+    callers: [ctx-security, internal]
+
 # Validator configuration
 validators:
  schema: