Compare commits

...

2 Commits

Author SHA1 Message Date
Rene Fichtmueller
c50af63389 feat(ctx-health): add proxmox-pvestatd + opnsense-disk health checks
- Add SSH-based health check for pvestatd D-state detection on Proxmox host
  (heal via cgroup move + lock file removal + reset-failed)
- Add SSH-based disk check for OPNsense VM (threshold 75%, auto-cleanup)
- knowledge/fixes.json: add 48 training fixes including post-reboot DNS
  recovery (fix-046), cloudflared DNS-wait boot fix (fix-047), and
  vzdump load-crash scenario with recovery steps (fix-048)
2026-04-13 05:42:24 +02:00
Rene Fichtmueller
b4593b6582 feat: integrate real @shieldx/core library into gateway pipeline
Replace recursive HTTP-based ShieldX scan with direct library integration.
- 547+ rules, 50+ languages, sub-millisecond scans
- Enables: rules, entropy, indirect injection, behavioral, unicode,
  tokenizer, compressed payload detection
- Disables Ollama-dependent scanners for zero external dependency
- Response now includes threat_level, kill_chain_phase, shieldx_latency_ms
2026-04-07 09:03:02 +02:00
5 changed files with 1382 additions and 36 deletions

1014
knowledge/fixes.json Normal file

File diff suppressed because it is too large Load Diff

37
package-lock.json generated
View File

@ -11,6 +11,38 @@
"packages/*"
]
},
"../../../shieldx": {
"name": "@shieldx/core",
"version": "0.5.0",
"license": "Apache-2.0",
"dependencies": {
"pg": "^8.13.0",
"pgvector": "^0.2.0",
"pino": "^9.6.0",
"zod": "^3.24.0"
},
"devDependencies": {
"@types/node": "^22.0.0",
"@types/pg": "^8.11.0",
"@vitest/coverage-v8": "^3.0.0",
"eslint": "^9.0.0",
"tsup": "^8.3.0",
"tsx": "^4.19.0",
"typescript": "^5.7.0",
"vitest": "^3.0.0"
},
"engines": {
"node": ">=20.0.0"
},
"peerDependencies": {
"next": ">=15.0.0"
},
"peerDependenciesMeta": {
"next": {
"optional": true
}
}
},
"node_modules/@esbuild/aix-ppc64": {
"version": "0.27.7",
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.7.tgz",
@ -935,6 +967,10 @@
"win32"
]
},
"node_modules/@shieldx/core": {
"resolved": "../../../shieldx",
"link": true
},
"node_modules/@types/estree": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
@ -3425,6 +3461,7 @@
"@fastify/cors": "^9.0.1",
"@fastify/helmet": "^11.1.1",
"@fastify/rate-limit": "^9.1.0",
"@shieldx/core": "file:../../../../../shieldx",
"ajv": "^8.17.1",
"fastify": "^4.28.1",
"franc": "^6.2.0",

View File

@ -35,6 +35,7 @@ const ALLOWED_COMMANDS = new Set([
'/usr/sbin/systemctl',
'/usr/bin/sync',
'/bin/sync',
'/usr/bin/ssh',
]);
async function safeExec(
@ -69,7 +70,13 @@ async function findPm2(): Promise<string | null> {
// ─── 1. PM2 processes ────────────────────────────────────────────────────────
const PM2_REQUIRED_PROCESSES = ['llm-gateway', 'llm-learning'];
const PM2_REQUIRED_PROCESSES = [
'llm-gateway', 'llm-learning',
'magatama', 'magatama-dashboard',
'tip-api', 'tip-scraper-daemon',
'peercortex', 'eo-global-pulse',
'ghost-blog', 'nognet',
];
async function checkPm2(): Promise<CheckResult> {
const start = Date.now();
@ -114,10 +121,40 @@ async function healPm2(diagnosis: string): Promise<HealResult> {
const pm2 = await findPm2();
if (!pm2) return { action_taken: 'pm2 not found — cannot restart', success: false };
const { stdout, stderr } = await safeExec(pm2, ['restart', 'all']);
// Only restart processes that are actually offline — avoid pm2 restart all
const { stdout: jlist } = await safeExec(pm2, ['jlist']);
let processes: Array<{ name: string; pm2_env?: { status?: string } }> = [];
try { processes = JSON.parse(jlist) as typeof processes; } catch { /* ignore */ }
const offline = PM2_REQUIRED_PROCESSES.filter((name) => {
const proc = processes.find((p) => p.name === name);
return !proc || proc.pm2_env?.status !== 'online';
});
if (offline.length === 0) {
return { action_taken: 'no offline processes found — skipping restart', success: true };
}
const results: string[] = [];
for (const name of offline) {
const { stdout, stderr, success } = await safeExec(pm2, ['restart', name]);
results.push(`${name}: ${success ? 'restarted' : stderr.slice(0, 80)}`);
}
const output = results.join('; ');
logger.info({ diagnosis: diagnosis.slice(0, 120), output, offline }, 'PM2 targeted restart executed');
return { action_taken: `pm2 restart ${offline.join(', ')}`, success: true, output };
}
// ─── pm2-aware targeted restart (shared helper) ───────────────────────────────
async function restartProcess(name: string): Promise<HealResult> {
const pm2 = await findPm2();
if (!pm2) return { action_taken: 'pm2 not found', success: false };
const { stdout, stderr, success } = await safeExec(pm2, ['restart', name]);
const output = `${stdout}\n${stderr}`.trim();
logger.info({ diagnosis: diagnosis.slice(0, 120), output: output.slice(0, 200) }, 'PM2 restart executed');
return { action_taken: 'pm2 restart all', success: true, output };
logger.info({ name, success, output: output.slice(0, 200) }, 'PM2 targeted process restart');
return { action_taken: `pm2 restart ${name}`, success, output };
}
// ─── 2. PostgreSQL ────────────────────────────────────────────────────────────
@ -401,6 +438,237 @@ async function healWireGuard(_diagnosis: string): Promise<HealResult> {
return { action_taken: 'systemctl restart wg-quick@wg0', success, output };
}
// ─── 9. Service port health checks ───────────────────────────────────────────
// For each critical service, performs a real HTTP check on the local port.
// This catches crash-loops where PM2 shows "online" but the port is not responding.
interface ServiceDef {
name: string; // PM2 process name to restart on failure
port: number;
path: string; // health endpoint path
okStatus?: number[];// accepted HTTP status codes (default: <500)
}
const SERVICES: ServiceDef[] = [
{ name: 'magatama', port: 3210, path: '/' },
{ name: 'magatama-dashboard', port: 3211, path: '/' },
{ name: 'magatama-admin', port: 3212, path: '/' },
{ name: 'tip-api', port: 3201, path: '/api/health' },
{ name: 'peercortex', port: 3101, path: '/' },
{ name: 'llm-gateway', port: 3103, path: '/health' },
{ name: 'eo-global-pulse', port: 3000, path: '/' },
{ name: 'nognet', port: 3001, path: '/' },
{ name: 'ghost-blog', port: 2368, path: '/' },
{ name: 'switchblade', port: 3334, path: '/' },
];
async function probePort(service: ServiceDef): Promise<{ ok: boolean; status?: number; error?: string }> {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), 8_000);
try {
const res = await fetch(`http://localhost:${service.port}${service.path}`, {
signal: controller.signal,
cache: 'no-store',
redirect: 'manual',
} as RequestInit);
const acceptedCodes = service.okStatus ?? [];
const ok = acceptedCodes.length > 0
? acceptedCodes.includes(res.status)
: res.status < 500 || (res.status >= 300 && res.status < 400);
return { ok, status: res.status };
} catch (err) {
return { ok: false, error: err instanceof Error ? err.message : String(err) };
} finally {
clearTimeout(timer);
}
}
async function checkServicePorts(): Promise<CheckResult> {
const start = Date.now();
const results = await Promise.all(SERVICES.map(async (svc) => ({ svc, probe: await probePort(svc) })));
const latency_ms = Date.now() - start;
const failing = results.filter((r) => !r.probe.ok);
if (failing.length === 0) {
return { healthy: true, message: `All ${SERVICES.length} service ports are responding`, latency_ms };
}
const details = Object.fromEntries(
failing.map((r) => [r.svc.name, r.probe.status ?? r.probe.error ?? 'no response']),
);
return {
healthy: false,
message: `${failing.length} service(s) not responding: ${failing.map((r) => r.svc.name).join(', ')}`,
details,
latency_ms,
};
}
async function healServicePorts(diagnosis: string): Promise<HealResult> {
// Re-probe to find which services are currently failing
const results = await Promise.all(SERVICES.map(async (svc) => ({ svc, probe: await probePort(svc) })));
const failing = results.filter((r) => !r.probe.ok);
if (failing.length === 0) {
return { action_taken: 'services recovered on re-check — no restart needed', success: true };
}
const healResults: string[] = [];
for (const { svc } of failing) {
const result = await restartProcess(svc.name);
healResults.push(`${svc.name}: ${result.success ? 'restarted' : result.output?.slice(0, 80) ?? 'failed'}`);
logger.info({ service: svc.name, diagnosis: diagnosis.slice(0, 80) }, 'Service port failure — PM2 restart triggered');
}
return {
action_taken: `restarted: ${failing.map((r) => r.svc.name).join(', ')}`,
success: true,
output: healResults.join('; '),
};
}
// ─── SSH helper ──────────────────────────────────────────────────────────────
const SSH_OPTS = ['-o', 'BatchMode=yes', '-o', 'ConnectTimeout=10', '-o', 'StrictHostKeyChecking=accept-new'];
async function sshExec(host: string, command: string): Promise<{ stdout: string; stderr: string; success: boolean }> {
return safeExec('/usr/bin/ssh', [...SSH_OPTS, host, command]);
}
// ─── 10. Proxmox pvestatd health ─────────────────────────────────────────────
const PROXMOX_HOST = process.env['PROXMOX_HOST'] ?? 'root@192.168.178.10';
async function checkProxmoxPvestatd(): Promise<CheckResult> {
const start = Date.now();
const { stdout, success } = await sshExec(
PROXMOX_HOST,
'pid=$(pgrep pvestatd 2>/dev/null | head -1); [ -n "$pid" ] && cat /proc/$pid/status 2>/dev/null | grep "^State:" || echo "NOT_RUNNING"',
);
const latency_ms = Date.now() - start;
if (!success) {
return { healthy: false, message: 'Cannot SSH to Proxmox to check pvestatd', latency_ms };
}
const out = stdout.trim();
if (out === 'NOT_RUNNING') {
return { healthy: false, message: 'pvestatd is not running on Proxmox', details: { state: 'not_running' }, latency_ms };
}
const stateMatch = out.match(/State:\s+(\S)/);
const state = stateMatch?.[1] ?? '?';
if (state === 'D') {
return {
healthy: false,
message: 'pvestatd is in D-state (kernel deadlock) — Proxmox GUI graphs will be empty',
details: { state: 'D', raw: out.slice(0, 200) },
latency_ms,
};
}
return { healthy: true, message: `pvestatd is running (state: ${state})`, details: { state }, latency_ms };
}
async function healProxmoxPvestatd(diagnosis: string): Promise<HealResult> {
// Step 1: Get PID
const { stdout: pidOut } = await sshExec(PROXMOX_HOST, 'pgrep pvestatd 2>/dev/null | head -1');
const pid = pidOut.trim();
if (!pid) {
// Not running at all — just start it
const { success, stdout, stderr } = await sshExec(PROXMOX_HOST, 'systemctl start pvestatd 2>&1');
return { action_taken: 'systemctl start pvestatd', success, output: `${stdout}\n${stderr}`.trim() };
}
// Step 2: Move stuck process to root cgroup so systemd can reclaim the unit
await sshExec(PROXMOX_HOST, `echo ${pid} > /sys/fs/cgroup/cgroup.procs 2>/dev/null || true`);
// Step 3: Remove stale lock files
await sshExec(PROXMOX_HOST, 'rm -f /var/run/pvestatd.pid.lock /var/run/pvestatd.pid 2>/dev/null || true');
// Step 4: Reset failed state and start
const { stdout, stderr, success } = await sshExec(
PROXMOX_HOST,
'systemctl reset-failed pvestatd 2>/dev/null; systemctl start pvestatd 2>&1; sleep 3; systemctl is-active pvestatd',
);
const output = `${stdout}\n${stderr}`.trim();
logger.info({ pid, diagnosis: diagnosis.slice(0, 120), output }, 'pvestatd D-state heal executed');
return { action_taken: `moved pid ${pid} to root cgroup, reset-failed, started pvestatd`, success, output };
}
// ─── 11. OPNsense disk space ─────────────────────────────────────────────────
const OPNSENSE_HOST = process.env['OPNSENSE_HOST'] ?? 'root@192.168.178.11';
const OPNSENSE_DISK_THRESHOLD = 75;
async function checkOpnsenseDisk(): Promise<CheckResult> {
const start = Date.now();
const { stdout, success } = await sshExec(
OPNSENSE_HOST,
"df -h / | awk 'NR==2{print $5}' | tr -d '%'",
);
const latency_ms = Date.now() - start;
if (!success) {
return { healthy: false, message: 'Cannot SSH to OPNsense to check disk', latency_ms };
}
const usedPct = parseInt(stdout.trim(), 10);
if (isNaN(usedPct)) {
return { healthy: false, message: `Cannot parse OPNsense disk usage: "${stdout.trim()}"`, latency_ms };
}
if (usedPct >= OPNSENSE_DISK_THRESHOLD) {
return {
healthy: false,
message: `OPNsense disk usage ${usedPct}% ≥ threshold ${OPNSENSE_DISK_THRESHOLD}%`,
details: { usedPercent: usedPct, threshold: OPNSENSE_DISK_THRESHOLD },
latency_ms,
};
}
return {
healthy: true,
message: `OPNsense disk usage: ${usedPct}%`,
details: { usedPercent: usedPct },
latency_ms,
};
}
async function healOpnsenseDisk(diagnosis: string): Promise<HealResult> {
const steps = [
// Remove logs older than 7 days
'find /var/log -name "*.log" -mtime +7 -delete 2>/dev/null || true',
// Suricata JSON logs older than 3 days
'find /var/log/suricata -name "*.json" -mtime +3 -delete 2>/dev/null || true',
'find /var/log/suricata -name "*.json.gz" -mtime +1 -delete 2>/dev/null || true',
// Tmp files older than 1 day
'find /tmp /var/tmp -mtime +1 -delete 2>/dev/null || true',
// pkg cache
'pkg clean -y 2>/dev/null || true',
// Report new usage
"df -h / | awk 'NR==2{print $5}'",
];
const { stdout, stderr, success } = await sshExec(OPNSENSE_HOST, steps.join('; '));
const output = `${stdout}\n${stderr}`.trim();
const newUsage = stdout.trim().split('\n').at(-1) ?? '?';
logger.info({ diagnosis: diagnosis.slice(0, 120), newUsage, output: output.slice(0, 400) }, 'OPNsense disk cleanup executed');
return {
action_taken: `cleaned logs, tmp, pkg cache on OPNsense — disk now at ${newUsage}`,
success,
output: output.slice(0, 500),
};
}
// ─── Exported check list ──────────────────────────────────────────────────────
export const healthChecks: HealthCheck[] = [
@ -408,8 +676,11 @@ export const healthChecks: HealthCheck[] = [
{ name: 'postgresql', category: 'database', check: checkPostgres, heal: healPostgres },
{ name: 'ollama', category: 'service', check: checkOllama, heal: healOllama },
{ name: 'cloudflare-tunnel', category: 'tunnel', check: checkCloudflareTunnel, heal: healCloudflareTunnel },
{ name: 'service-ports', category: 'service', check: checkServicePorts, heal: healServicePorts },
{ name: 'disk-space', category: 'service', check: checkDiskSpace, heal: healDiskSpace },
{ name: 'memory', category: 'service', check: checkMemory, heal: healMemory },
{ name: 'network', category: 'network', check: checkNetwork, heal: healNetwork },
{ name: 'wireguard', category: 'network', check: checkWireGuard, heal: healWireGuard },
{ name: 'proxmox-pvestatd', category: 'service', check: checkProxmoxPvestatd, heal: healProxmoxPvestatd },
{ name: 'opnsense-disk', category: 'service', check: checkOpnsenseDisk, heal: healOpnsenseDisk },
];

View File

@ -12,6 +12,7 @@
"@fastify/cors": "^9.0.1",
"@fastify/helmet": "^11.1.1",
"@fastify/rate-limit": "^9.1.0",
"@shieldx/core": "file:../../../../../shieldx",
"ajv": "^8.17.1",
"fastify": "^4.28.1",
"franc": "^6.2.0",

View File

@ -17,6 +17,32 @@ import {
validationFailuresTotal,
} from '../observability/metrics.js';
import { logger } from '../observability/logger.js';
import { ShieldX } from '@shieldx/core';
// Singleton ShieldX instance — initialized once, sub-millisecond scans
// Disable Ollama-dependent scanners (sentinel, constitutional, embedding, attention)
// to keep gateway scans fast and dependency-free
const shieldx = new ShieldX({
scanners: {
rules: true, // 547+ rules, 50+ languages
sentinel: false, // Requires Ollama
constitutional: false, // Requires Ollama
embedding: false, // Requires Ollama
embeddingAnomaly: false,
entropy: true, // Zero-cost entropy analysis
yara: false, // Requires YARA binary
attention: false, // Requires Ollama
canary: false, // Not needed in gateway context
indirect: true, // RAG/tool injection detection
selfConsciousness: false,
crossModel: false,
behavioral: true, // Session profiling
unicode: true, // Homoglyph/script detection
tokenizer: true, // I.g.n.o.r.e-style attacks
compressedPayload: true,
},
logging: { level: 'warn', structured: true, incidentLog: false },
} as any); // DeepPartial config — merges with defaults
const CompletionRequestSchema = z.object({
caller: z.string().min(1).max(100),
@ -38,43 +64,37 @@ type CompletionRequest = z.infer<typeof CompletionRequestSchema>;
const SKIP_SHIELDX_CALLERS = new Set(['internal', 'shieldx']);
async function runShieldXScan(input: string, caller: string): Promise<{ passed: boolean; reason?: string }> {
const GATEWAY_URL = `http://localhost:${process.env['PORT'] ?? '3100'}`;
async function runShieldXScan(
input: string,
caller: string,
): Promise<{ passed: boolean; reason?: string; threatLevel?: string; phase?: string; latencyMs?: number }> {
try {
const response = await fetch(`${GATEWAY_URL}/v1/completion`, {
method: 'POST',
headers: { 'Content-Type': 'application/json', 'X-Caller-ID': 'internal' },
body: JSON.stringify({
caller: 'internal',
task_type: 'shieldx_threat_classification',
input,
options: { return_validation_details: false },
}),
signal: AbortSignal.timeout(8000),
});
const result = await shieldx.scanInput(input);
if (!response.ok) return { passed: true }; // Fail open if ShieldX is down
if (result.detected) {
logger.warn({
caller,
threatLevel: result.threatLevel,
phase: result.killChainPhase,
action: result.action,
latencyMs: result.latencyMs,
ensemble: result.ensemble,
atlasMapping: result.atlasMapping?.techniqueIds?.slice(0, 5),
scannerCount: result.scanResults.length,
}, 'ShieldX threat detected — input blocked');
const result = await response.json() as { output?: string; status?: string };
if (result.status !== 'approved' || !result.output) return { passed: true };
type ShieldResult = { threat_detected: boolean; threat_type?: string; confidence?: number };
let parsed: ShieldResult;
try {
parsed = JSON.parse(result.output) as ShieldResult;
} catch {
return { passed: true };
return {
passed: false,
reason: `Prompt injection detected: ${result.killChainPhase} (${result.threatLevel})`,
threatLevel: result.threatLevel,
phase: result.killChainPhase,
latencyMs: result.latencyMs,
};
}
if (parsed.threat_detected && (parsed.confidence ?? 0) > 0.8) {
logger.warn({ caller, threat_type: parsed.threat_type }, 'ShieldX threat detected');
return { passed: false, reason: `Threat detected: ${parsed.threat_type ?? 'unknown'}` };
}
return { passed: true };
return { passed: true, latencyMs: result.latencyMs };
} catch (err) {
// ShieldX unavailable — fail open (log but continue)
logger.warn({ err, caller }, 'ShieldX scan failed, continuing without scan');
logger.error({ err, caller }, 'ShieldX scan error — failing open');
return { passed: true };
}
}
@ -102,7 +122,7 @@ export async function completionRoute(fastify: FastifyInstance): Promise<void> {
const { caller, input, language, context, options } = body;
const returnValidationDetails = options?.return_validation_details ?? false;
// Stage 2: ShieldX scan
// Stage 2: ShieldX scan (real library, 547+ rules, sub-millisecond)
if (!SKIP_SHIELDX_CALLERS.has(caller)) {
const shieldResult = await runShieldXScan(input, caller);
if (!shieldResult.passed) {
@ -111,6 +131,9 @@ export async function completionRoute(fastify: FastifyInstance): Promise<void> {
statusCode: 400,
error: 'Rejected',
message: shieldResult.reason ?? 'Input rejected by security scan',
threat_level: shieldResult.threatLevel,
kill_chain_phase: shieldResult.phase,
shieldx_latency_ms: shieldResult.latencyMs,
});
}
}