feat: add CtxHealth + CtxSecurity to gateway — ctxhealer:latest model, 5 routing rules, 2 templates
This commit is contained in:
parent
9b4d1caa8a
commit
a8a77e689c
122
packages/gateway/prompts/templates/ctx_health_diagnose.yaml
Normal file
122
packages/gateway/prompts/templates/ctx_health_diagnose.yaml
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
id: ctx_health_diagnose
|
||||||
|
version: "1.0.0"
|
||||||
|
task_type: ctx_health_diagnose
|
||||||
|
description: Diagnose infrastructure problems and recommend auto-healing actions for CtxHealth daemon
|
||||||
|
model_preference: ctxhealer:latest
|
||||||
|
model_minimum: qwen2.5:14b
|
||||||
|
temperature: 0.1
|
||||||
|
max_tokens: 512
|
||||||
|
output_format: json
|
||||||
|
|
||||||
|
system_prompt: |
|
||||||
|
You are CtxHealer — an expert infrastructure reliability engineer and SRE AI assistant.
|
||||||
|
Your job is to diagnose infrastructure problems on the Context X production server (Erik, IONOS VPS, Ubuntu)
|
||||||
|
and recommend specific, executable healing actions.
|
||||||
|
|
||||||
|
Infrastructure context:
|
||||||
|
- Server: Erik (217.154.82.179, IONOS VPS, Ubuntu 22.04)
|
||||||
|
- Services: PM2 (llm-gateway:3103, llm-learning, eo-global-pulse, peercortex, switchblade, ctxevent, tip-api)
|
||||||
|
- Tunnels: Cloudflare tunnel (cloudflared) → context-x.org, ollama.fichtmueller.org
|
||||||
|
- DB: PostgreSQL 15 (llm_gateway, transceiver_db, ctxmeet)
|
||||||
|
- VPN: WireGuard wg0 (10.10.0.1/24)
|
||||||
|
- Network: Hetzner upstream BGP (AS24940)
|
||||||
|
|
||||||
|
Return ONLY valid JSON with this structure:
|
||||||
|
{
|
||||||
|
"severity": "info" | "warning" | "critical",
|
||||||
|
"root_cause": "Brief root cause explanation (1-2 sentences)",
|
||||||
|
"confidence": 0.0-1.0,
|
||||||
|
"auto_heal": true | false,
|
||||||
|
"actions": [
|
||||||
|
{
|
||||||
|
"command": "pm2 restart llm-gateway",
|
||||||
|
"description": "Restart the gateway process",
|
||||||
|
"safe": true,
|
||||||
|
"estimated_downtime_s": 5
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"monitoring_advice": "What to watch after healing",
|
||||||
|
"escalate_if": "Condition that requires human intervention"
|
||||||
|
}
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- Only recommend safe, reversible actions
|
||||||
|
- Set auto_heal=false if the root cause is unclear or an action could cause data loss
|
||||||
|
- Set severity=critical only if the issue affects end users right now
|
||||||
|
- Prefer PM2 restarts over system reboots
|
||||||
|
- Never recommend deleting data (log cleanup such as journal vacuuming or pm2 flush is allowed)
|
||||||
|
|
||||||
|
user_template: |
|
||||||
|
Infrastructure check FAILED:
|
||||||
|
|
||||||
|
Check name: {{check_name}}
|
||||||
|
Category: {{category}}
|
||||||
|
Error: {{error_message}}
|
||||||
|
Consecutive failures: {{consecutive_failures}}
|
||||||
|
Last healthy: {{last_healthy}}
|
||||||
|
|
||||||
|
Recent incident history (last 24h):
|
||||||
|
{{incident_history}}
|
||||||
|
|
||||||
|
System context:
|
||||||
|
{{input}}
|
||||||
|
|
||||||
|
Diagnose and recommend healing actions:
|
||||||
|
|
||||||
|
few_shot_examples:
|
||||||
|
- user: |
|
||||||
|
Check name: pm2-llm-gateway
|
||||||
|
Category: process
|
||||||
|
Error: Process llm-gateway is stopped (status: stopped, restarts: 5)
|
||||||
|
Consecutive failures: 2
|
||||||
|
Last healthy: 2026-04-02T20:00:00Z
|
||||||
|
Recent incident history: 1 restart in last 6h
|
||||||
|
System context: Memory 78% used, CPU 12%, disk 45%
|
||||||
|
assistant: |
|
||||||
|
{
|
||||||
|
"severity": "critical",
|
||||||
|
"root_cause": "PM2 process llm-gateway has stopped, likely due to an uncaught exception or OOM kill. 5 restarts suggests a crash loop.",
|
||||||
|
"confidence": 0.85,
|
||||||
|
"auto_heal": true,
|
||||||
|
"actions": [
|
||||||
|
{
|
||||||
|
"command": "pm2 restart llm-gateway",
|
||||||
|
"description": "Restart the crashed gateway process",
|
||||||
|
"safe": true,
|
||||||
|
"estimated_downtime_s": 8
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"monitoring_advice": "Watch PM2 logs for 5 minutes: pm2 logs llm-gateway --lines 50",
|
||||||
|
"escalate_if": "Process crashes again within 10 minutes — indicates unresolved code error"
|
||||||
|
}
|
||||||
|
- user: |
|
||||||
|
Check name: disk-space
|
||||||
|
Category: system
|
||||||
|
Error: Disk usage at 91% (/dev/sda1)
|
||||||
|
Consecutive failures: 1
|
||||||
|
Last healthy: 2026-04-02T18:00:00Z
|
||||||
|
Recent incident history: none
|
||||||
|
System context: /var/log growing fast, /opt stable
|
||||||
|
assistant: |
|
||||||
|
{
|
||||||
|
"severity": "warning",
|
||||||
|
"root_cause": "Disk usage at 91%, likely caused by log file accumulation in /var/log. Immediate risk of service failure if disk fills.",
|
||||||
|
"confidence": 0.80,
|
||||||
|
"auto_heal": false,
|
||||||
|
"actions": [
|
||||||
|
{
|
||||||
|
"command": "journalctl --vacuum-size=500M",
|
||||||
|
"description": "Truncate system journal to 500MB",
|
||||||
|
"safe": true,
|
||||||
|
"estimated_downtime_s": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"command": "pm2 flush",
|
||||||
|
"description": "Clear PM2 log files",
|
||||||
|
"safe": true,
|
||||||
|
"estimated_downtime_s": 0
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"monitoring_advice": "Check disk usage again in 1 hour: df -h /",
|
||||||
|
"escalate_if": "Disk reaches 95% — manual cleanup required to prevent data loss"
|
||||||
|
}
|
||||||
@ -0,0 +1,94 @@
|
|||||||
|
id: ctx_security_classify
|
||||||
|
version: "1.0.0"
|
||||||
|
task_type: ctx_security_classify
|
||||||
|
description: Classify security threats and recommend automated defense actions for CtxSecurity daemon
|
||||||
|
model_preference: qwen2.5:14b
|
||||||
|
model_minimum: qwen2.5:7b
|
||||||
|
temperature: 0.05
|
||||||
|
max_tokens: 512
|
||||||
|
output_format: json
|
||||||
|
|
||||||
|
system_prompt: |
|
||||||
|
You are CtxSecurity — an expert Blue Team security analyst AI for the Context X infrastructure.
|
||||||
|
Your job is to classify security threats and recommend precise, automated defense actions.
|
||||||
|
|
||||||
|
Infrastructure:
|
||||||
|
- Server: Erik (217.154.82.179, IONOS VPS)
|
||||||
|
- Exposed services: SSH:22, Cloudflare tunnel (HTTP/S only), WireGuard:51820
|
||||||
|
- Protected by: iptables/ipset (ctx-security-block), fail2ban, Cloudflare DDoS protection
|
||||||
|
- BGP RTBH available for DDoS mitigation (via AS24940 upstream)
|
||||||
|
|
||||||
|
Threat classification levels:
|
||||||
|
- low: informational, log only
|
||||||
|
- medium: rate-limit or temporary block (1h)
|
||||||
|
- high: 24h block, alert
|
||||||
|
- critical: permanent block, plus BGP RTBH when the threat is a DDoS
|
||||||
|
|
||||||
|
Return ONLY valid JSON:
|
||||||
|
{
|
||||||
|
"threat_level": "low" | "medium" | "high" | "critical",
|
||||||
|
"threat_type": "brute_force" | "ddos" | "port_scan" | "credential_stuffing" | "web_scraping" | "other",
|
||||||
|
"confidence": 0.0-1.0,
|
||||||
|
"automated_action": "block_1h" | "block_24h" | "block_permanent" | "rtbh" | "rate_limit" | "monitor" | "none",
|
||||||
|
"block_duration_minutes": null | number,
|
||||||
|
"reasoning": "Brief explanation (1-2 sentences)",
|
||||||
|
"ioc": ["list of indicators of compromise"],
|
||||||
|
"false_positive_risk": "low" | "medium" | "high"
|
||||||
|
}
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- Never recommend blocking RFC1918/private ranges
|
||||||
|
- Use RTBH only for confirmed DDoS (> 500 req/min from a single IP, or a coordinated multi-source attack)
|
||||||
|
- Set false_positive_risk=high for shared IPs (CDNs, Tor exit nodes, residential ISPs)
|
||||||
|
- Brute force threshold: > 5 failures/2min = medium, > 20 failures/10min = high
|
||||||
|
|
||||||
|
user_template: |
|
||||||
|
Security event detected:
|
||||||
|
|
||||||
|
Event type: {{event_type}}
|
||||||
|
Source IP: {{source_ip}}
|
||||||
|
Timestamp: {{timestamp}}
|
||||||
|
|
||||||
|
Event details:
|
||||||
|
{{input}}
|
||||||
|
|
||||||
|
Recent events from this IP (last 24h):
|
||||||
|
{{ip_history}}
|
||||||
|
|
||||||
|
Classify this threat and recommend action:
|
||||||
|
|
||||||
|
few_shot_examples:
|
||||||
|
- user: |
|
||||||
|
Event type: brute_force
|
||||||
|
Source IP: 45.152.66.122
|
||||||
|
Timestamp: 2026-04-02T21:00:00Z
|
||||||
|
Event details: 47 failed SSH login attempts in 8 minutes. Usernames tried: root, admin, ubuntu, pi, test, postgres
|
||||||
|
Recent events from this IP: None previously seen
|
||||||
|
assistant: |
|
||||||
|
{
|
||||||
|
"threat_level": "high",
|
||||||
|
"threat_type": "brute_force",
|
||||||
|
"confidence": 0.97,
|
||||||
|
"automated_action": "block_24h",
|
||||||
|
"block_duration_minutes": 1440,
|
||||||
|
"reasoning": "47 failed SSH attempts in 8 minutes using common credential lists — clear automated brute force attack. No legitimate use case.",
|
||||||
|
"ioc": ["45.152.66.122", "username_root", "username_admin", "username_ubuntu"],
|
||||||
|
"false_positive_risk": "low"
|
||||||
|
}
|
||||||
|
- user: |
|
||||||
|
Event type: ddos
|
||||||
|
Source IP: 185.220.101.45
|
||||||
|
Timestamp: 2026-04-02T22:00:00Z
|
||||||
|
Event details: 1240 HTTP requests per minute to nginx. All targeting /v1/completion endpoint. User-Agent: curl/7.x. All returning 429.
|
||||||
|
Recent events from this IP: 200 req/min 1h ago
|
||||||
|
assistant: |
|
||||||
|
{
|
||||||
|
"threat_level": "critical",
|
||||||
|
"threat_type": "ddos",
|
||||||
|
"confidence": 0.92,
|
||||||
|
"automated_action": "rtbh",
|
||||||
|
"block_duration_minutes": 30,
|
||||||
|
"reasoning": "1240 req/min HTTP flood targeting API endpoint from single IP, escalating from 200 req/min 1h ago. BGP RTBH appropriate to protect upstream bandwidth.",
|
||||||
|
"ioc": ["185.220.101.45", "http_flood_v1_completion"],
|
||||||
|
"false_positive_risk": "low"
|
||||||
|
}
|
||||||
@ -25,12 +25,31 @@ models:
|
|||||||
strengths: [classification, short_text, routing]
|
strengths: [classification, short_text, routing]
|
||||||
max_tokens_default: 512
|
max_tokens_default: 512
|
||||||
|
|
||||||
|
qwen2.5:7b:
|
||||||
|
tier: fast
|
||||||
|
context_length: 32768
|
||||||
|
strengths: [classification, summarization, short_analysis]
|
||||||
|
max_tokens_default: 1024
|
||||||
|
|
||||||
phi3.5:3.8b:
|
phi3.5:3.8b:
|
||||||
tier: fast
|
tier: fast
|
||||||
context_length: 128000
|
context_length: 128000
|
||||||
strengths: [classification, summarization]
|
strengths: [classification, summarization]
|
||||||
max_tokens_default: 512
|
max_tokens_default: 512
|
||||||
|
|
||||||
|
# Specialized models: Context X fine-tune (ctxhealer) and safety classifier (llama-guard3)
|
||||||
|
ctxhealer:latest:
|
||||||
|
tier: medium
|
||||||
|
context_length: 32768
|
||||||
|
strengths: [infrastructure_diagnosis, root_cause_analysis, remediation_steps]
|
||||||
|
max_tokens_default: 1024
|
||||||
|
|
||||||
|
llama-guard3:1b:
|
||||||
|
tier: fast
|
||||||
|
context_length: 8192
|
||||||
|
strengths: [safety_classification, threat_detection]
|
||||||
|
max_tokens_default: 256
|
||||||
|
|
||||||
# Medium tier
|
# Medium tier
|
||||||
qwen2.5:14b:
|
qwen2.5:14b:
|
||||||
tier: medium
|
tier: medium
|
||||||
@ -77,7 +96,7 @@ models:
|
|||||||
|
|
||||||
# Fallback chains per tier
|
# Fallback chains per tier
|
||||||
fallback_chains:
|
fallback_chains:
|
||||||
fast: [qwen2.5:3b, phi3.5:3.8b]
|
fast: [qwen2.5:3b, qwen2.5:7b, phi3.5:3.8b]
|
||||||
medium: [qwen2.5:14b, mistral:7b, llama3.2:8b]
|
medium: [qwen2.5:14b, mistral:7b, llama3.2:8b]
|
||||||
large: [qwen2.5:32b, llama3.3:70b, deepseek-r1:32b]
|
large: [qwen2.5:32b, llama3.3:70b, deepseek-r1:32b]
|
||||||
|
|
||||||
|
|||||||
@ -1229,6 +1229,64 @@ routing_rules:
|
|||||||
validators: [schema]
|
validators: [schema]
|
||||||
callers: [all]
|
callers: [all]
|
||||||
|
|
||||||
|
# ─── CtxHealth — Infrastructure Self-Healing ───────────────────────────────
|
||||||
|
ctx_health_diagnose:
|
||||||
|
model: ctxhealer:latest
|
||||||
|
tier: medium
|
||||||
|
prompt_template: ctx_health_diagnose
|
||||||
|
temperature: 0.1
|
||||||
|
max_tokens: 512
|
||||||
|
output_format: json
|
||||||
|
requires_fact_check: false
|
||||||
|
validators: [schema, length]
|
||||||
|
callers: [ctx-health, internal]
|
||||||
|
fallback_chain: [ctxhealer:latest, qwen2.5:14b]
|
||||||
|
|
||||||
|
ctx_health_alert:
|
||||||
|
model: qwen2.5:14b
|
||||||
|
tier: medium
|
||||||
|
prompt_template: ctx_health_diagnose
|
||||||
|
temperature: 0.1
|
||||||
|
max_tokens: 512
|
||||||
|
output_format: json
|
||||||
|
requires_fact_check: false
|
||||||
|
validators: [schema, length]
|
||||||
|
callers: [ctx-health, internal]
|
||||||
|
|
||||||
|
# ─── CtxSecurity — Blue/Red Team Defense ──────────────────────────────────
|
||||||
|
ctx_security_classify:
|
||||||
|
model: qwen2.5:14b
|
||||||
|
tier: medium
|
||||||
|
prompt_template: ctx_security_classify
|
||||||
|
temperature: 0.05
|
||||||
|
max_tokens: 512
|
||||||
|
output_format: json
|
||||||
|
requires_fact_check: false
|
||||||
|
validators: [schema, length]
|
||||||
|
callers: [ctx-security, internal]
|
||||||
|
|
||||||
|
ctx_security_ddos:
|
||||||
|
model: qwen2.5:14b
|
||||||
|
tier: medium
|
||||||
|
prompt_template: ctx_security_classify
|
||||||
|
temperature: 0.05
|
||||||
|
max_tokens: 256
|
||||||
|
output_format: json
|
||||||
|
requires_fact_check: false
|
||||||
|
validators: [schema]
|
||||||
|
callers: [ctx-security, internal]
|
||||||
|
|
||||||
|
ctx_security_report:
|
||||||
|
model: qwen2.5:14b
|
||||||
|
tier: medium
|
||||||
|
prompt_template: ctx_security_classify
|
||||||
|
temperature: 0.2
|
||||||
|
max_tokens: 2048
|
||||||
|
output_format: text
|
||||||
|
requires_fact_check: false
|
||||||
|
validators: [banlist, length]
|
||||||
|
callers: [ctx-security, internal]
|
||||||
|
|
||||||
# Validator configuration
|
# Validator configuration
|
||||||
validators:
|
validators:
|
||||||
schema:
|
schema:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user