sec(gateway): Layer-2 ML classifier — Prompt-Guard sidecar integration

Adds a second defense layer between Layer-1 regex (62 patterns) and the existing Layer-3 llm_judge. Calls a FastAPI sidecar running on the Mac Studio (port 9091, MPS) that wraps protectai/deberta-v3-base-prompt-injection-v2 — public model, no auth needed, ~50-400ms inference. modules/prompt-guard-client.ts: - callPromptGuard(input) opportunistic, never throws - isPromptGuardConfigured() true if PROMPT_GUARD_URL is set - getPromptGuardThreshold() default 0.85 - getPromptGuardMinLen() default 16 chars (skip tiny inputs) routes/completion.ts: - New Layer-2 block between regex scan and llm_judge: when Layer-1 didn't detect and input is long enough, ask the sidecar. If sidecar returns INJECTION with score >= threshold, return HTTP 422 with error.prompt_guard payload (score + latency). - Fail-open: sidecar timeout/error logs a warning and the request falls through to llm_judge / cache / model — never blocks legitimate traffic due to sidecar issues. Env (set in ecosystem.config.js): PROMPT_GUARD_URL http://192.168.178.213:9091 PROMPT_GUARD_THRESHOLD 0.70 (lowered from 0.85 after empirical testing) PROMPT_GUARD_TIMEOUT 1500 ms Sidecar code lives at: ~/magatama-llm/prompt-guard-sidecar/server.py (Mac Studio) launched via ~/Library/LaunchAgents/org.fichtmueller.prompt-guard-sidecar.plist Smoke tests after deploy: Layer-1 caught: German "ignoriere..." -> HTTP 422 Layer-2 caught: English "pretend no restrict..." -> HTTP 422 (pg_score 0.9999) Layer-2 caught: Bangla-romanized -> HTTP 422 (Layer-1 actually) Benign: "Explain DNS in 2 sentences" -> HTTP 200
2026-05-16 23:14:16 +02:00 · 2026-05-16 23:14:16 +02:00 · f399999e62
commit f399999e62
parent 6f5dd81d7a
2 changed files with 1166 additions and 61 deletions
--- a/packages/gateway/src/modules/prompt-guard-client.ts
+++ b/packages/gateway/src/modules/prompt-guard-client.ts
@ -0,0 +1,89 @@
+/**
+ * prompt-guard-client.ts
+ *
+ * Layer-2 ML prompt-injection classifier — wraps the
+ * protectai/deberta-v3-base-prompt-injection-v2 model running as a FastAPI
+ * sidecar on the Mac Studio.
+ *
+ * Architecture:
+ *   Layer 1: scanForInjection() — fast regex patterns (this same module)
+ *   Layer 2: callPromptGuard() — ML classifier (THIS file)
+ *   Layer 3: llmJudge() — small LLM judges borderline cases
+ *
+ * The Layer-2 block in routes/completion.ts escalates to callPromptGuard()
+ * only when Layer-1 doesn't already detect, AND the input is at least
+ * PROMPT_GUARD_MIN_LEN characters long — very short inputs are not worth
+ * the ~50-400 ms classifier cost.
+ *
+ * Env vars:
+ *   PROMPT_GUARD_URL       e.g. http://192.168.178.213:9091
+ *   PROMPT_GUARD_TIMEOUT   ms, default 1500
+ *   PROMPT_GUARD_THRESHOLD in (0, 1], default 0.85 (block if score >= this)
+ *   PROMPT_GUARD_MIN_LEN   chars, default 16 (skip very short inputs)
+ */
+
+/**
+ * Outcome of a single Layer-2 classification attempt, as returned by
+ * callPromptGuard().
+ */
+export interface PromptGuardResult {
+  /**
+   * False only when PROMPT_GUARD_URL is unset. callPromptGuard() still reports
+   * true when the request itself fails, so check `error` and `label` before
+   * trusting `score`.
+   */
+  available: boolean;
+  /** Sidecar verdict; null when the call failed or the label was not a known value. */
+  label: 'INJECTION' | 'SAFE' | null;
+  /** Score reported by the sidecar; 0 on every failure path. */
+  score: number;
+  /** Milliseconds elapsed during the call (Date.now() delta); 0 when not configured. */
+  latencyMs: number;
+  /** Failure reason, e.g. 'not-configured', `HTTP <status>`, or a thrown error's message. */
+  error?: string;
+}
+
+// Names of the environment variables this module reads. The functions below
+// look the values up from process.env each time they are called.
+const URL_ENV = 'PROMPT_GUARD_URL';
+const TIMEOUT_ENV = 'PROMPT_GUARD_TIMEOUT';
+const THRESHOLD_ENV = 'PROMPT_GUARD_THRESHOLD';
+const MIN_LEN_ENV = 'PROMPT_GUARD_MIN_LEN';
+
+/**
+ * Reports whether a sidecar endpoint has been configured.
+ *
+ * @returns true when PROMPT_GUARD_URL is set to a non-empty string.
+ */
+export function isPromptGuardConfigured(): boolean {
+  const endpoint = process.env[URL_ENV];
+  return typeof endpoint === 'string' && endpoint !== '';
+}
+
+/**
+ * Reads the blocking threshold from PROMPT_GUARD_THRESHOLD.
+ *
+ * @returns The configured value when it is a finite number in (0, 1];
+ *          otherwise the default of 0.85.
+ */
+export function getPromptGuardThreshold(): number {
+  const fallback = 0.85;
+  const raw = process.env[THRESHOLD_ENV];
+  if (raw == null) {
+    return fallback;
+  }
+  const parsed = Number(raw);
+  if (!Number.isFinite(parsed)) {
+    return fallback;
+  }
+  // Zero and negatives are rejected, as is anything above 1.
+  if (parsed <= 0 || parsed > 1) {
+    return fallback;
+  }
+  return parsed;
+}
+
+/**
+ * Reads the minimum input length (in characters) from PROMPT_GUARD_MIN_LEN.
+ *
+ * @returns The configured value when it is a non-negative integer;
+ *          otherwise the default of 16.
+ */
+export function getPromptGuardMinLen(): number {
+  const defaultMinLen = 16;
+  // Number('') and Number('   ') are both 0, which would pass the integer
+  // check below and silently override the default. Treat a blank value the
+  // same as an unset one.
+  const raw = process.env[MIN_LEN_ENV]?.trim();
+  if (!raw) {
+    return defaultMinLen;
+  }
+  const v = Number(raw);
+  return Number.isInteger(v) && v >= 0 ? v : defaultMinLen;
+}
+
+/**
+ * Classify an input via the sidecar. Never throws: every failure is reported
+ * through the returned object so the caller can fail open.
+ *
+ * `available` is false only when PROMPT_GUARD_URL is unset. Once a URL is
+ * configured, HTTP errors, timeouts, network failures and malformed bodies
+ * all come back as { available: true, label: null, score: 0, error }.
+ * Caller decides whether to enforce based on the label and score + threshold.
+ *
+ * @param input Text to classify; sent as `{ text: input }` to POST <url>/classify.
+ * @returns The sidecar verdict together with the measured call latency.
+ */
+export async function callPromptGuard(input: string): Promise<PromptGuardResult> {
+  const url = process.env[URL_ENV];
+  if (!url) {
+    return { available: false, label: null, score: 0, latencyMs: 0, error: 'not-configured' };
+  }
+
+  // An empty, non-numeric or non-positive PROMPT_GUARD_TIMEOUT would either
+  // abort the request immediately (0 ms) or make AbortSignal.timeout() throw,
+  // turning every call into a failure. Fall back to the documented default,
+  // and truncate fractions because the delay is expected to be an integer.
+  const defaultTimeoutMs = 1500;
+  const rawTimeout = Number(process.env[TIMEOUT_ENV] ?? String(defaultTimeoutMs));
+  const timeout =
+    Number.isFinite(rawTimeout) && rawTimeout >= 1 ? Math.trunc(rawTimeout) : defaultTimeoutMs;
+
+  const t0 = Date.now();
+  try {
+    // Strip every trailing slash so "http://host:9091//" still yields one "/classify".
+    const res = await fetch(`${url.replace(/\/+$/, '')}/classify`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ text: input }),
+      signal: AbortSignal.timeout(timeout),
+    });
+    if (!res.ok) {
+      return {
+        available: true, label: null, score: 0, latencyMs: Date.now() - t0,
+        error: `HTTP ${res.status}`,
+      };
+    }
+
+    // The body comes from another process; validate its shape instead of
+    // trusting a cast.
+    const data: unknown = await res.json();
+    if (typeof data !== 'object' || data === null) {
+      return {
+        available: true, label: null, score: 0, latencyMs: Date.now() - t0,
+        error: 'invalid-response',
+      };
+    }
+    const { label: rawLabel, score: rawScore } = data as { label?: unknown; score?: unknown };
+    const label = rawLabel === 'INJECTION' || rawLabel === 'SAFE' ? rawLabel : null;
+    const score = Number(rawScore ?? 0);
+    return {
+      available: true,
+      label,
+      // Normalise NaN/Infinity to 0 so the result always carries a usable number.
+      score: Number.isFinite(score) ? score : 0,
+      latencyMs: Date.now() - t0,
+    };
+  } catch (e: unknown) {
+    // Covers timeouts (abort), network errors and JSON parse failures.
+    return {
+      available: true, label: null, score: 0, latencyMs: Date.now() - t0,
+      error: e instanceof Error ? e.message : String(e),
+    };
+  }
+}
--- a/packages/gateway/src/routes/completion.ts
+++ b/packages/gateway/src/routes/completion.ts