sec(gateway): Layer-2 ML classifier — Prompt-Guard sidecar integration
Adds a second defense layer between Layer-1 regex (62 patterns) and the
existing Layer-3 llm_judge. Calls a FastAPI sidecar running on the Mac
Studio (port 9091, MPS) that wraps protectai/deberta-v3-base-prompt-
injection-v2 — public model, no auth needed, ~50-400ms inference.
modules/prompt-guard-client.ts:
- callPromptGuard(input) opportunistic, never throws
- isPromptGuardConfigured() true if PROMPT_GUARD_URL is set
- getPromptGuardThreshold() default 0.85
- getPromptGuardMinLen() default 16 chars (skip tiny inputs)
routes/completion.ts:
- New Layer-2 block between regex scan and llm_judge: when Layer-1
didn't detect and input is long enough, ask the sidecar. If sidecar
returns INJECTION with score >= threshold, return HTTP 422 with
error.prompt_guard payload (score + latency).
- Fail-open: sidecar timeout/error logs a warning and the request
falls through to llm_judge / cache / model — never blocks legitimate
traffic due to sidecar issues.
Env (set in ecosystem.config.js):
PROMPT_GUARD_URL http://192.168.178.213:9091
PROMPT_GUARD_THRESHOLD 0.70 (lowered from 0.85 after empirical testing)
PROMPT_GUARD_TIMEOUT 1500 ms
Sidecar code lives at:
~/magatama-llm/prompt-guard-sidecar/server.py (Mac Studio)
launched via ~/Library/LaunchAgents/org.fichtmueller.prompt-guard-sidecar.plist
Smoke tests after deploy:
Layer-1 caught: German "ignoriere..." -> HTTP 422
Layer-2 caught: English "pretend no restrict.." -> HTTP 422 (pg_score 0.9999)
Layer-1 caught: Bangla-romanized -> HTTP 422 (intended as a Layer-2 case, but Layer-1 matched first)
Benign: "Explain DNS in 2 sentences" -> HTTP 200
This commit is contained in:
parent
6f5dd81d7a
commit
f399999e62
89
packages/gateway/src/modules/prompt-guard-client.ts
Normal file
89
packages/gateway/src/modules/prompt-guard-client.ts
Normal file
@ -0,0 +1,89 @@
|
||||
/**
|
||||
* prompt-guard-client.ts
|
||||
*
|
||||
* Layer-2 LLM injection classifier — wraps the protectai DeBERTa-prompt-
|
||||
* injection-v2 model running as a FastAPI sidecar on the Mac Studio.
|
||||
*
|
||||
* Architecture:
|
||||
* Layer 1: scanForInjection() — fast regex patterns (defined outside this file)
|
||||
* Layer 2: callPromptGuard() — ML classifier (THIS file)
|
||||
* Layer 3: llmJudge() — small LLM judges borderline cases
|
||||
*
|
||||
* The caller (routes/completion.ts) escalates to Layer-2 only when Layer-1
* has not already detected an injection AND the input is at least
* PROMPT_GUARD_MIN_LEN characters long, so the ~50-400 ms classifier cost
* is only paid for inputs the regex layer let through.
|
||||
*
|
||||
* Env vars:
|
||||
* PROMPT_GUARD_URL e.g. http://192.168.178.213:9091
|
||||
* PROMPT_GUARD_TIMEOUT ms, default 1500
|
||||
* PROMPT_GUARD_THRESHOLD greater than 0.0 and at most 1.0, default 0.85 (block if score >= this; out-of-range values fall back to the default)
|
||||
* PROMPT_GUARD_MIN_LEN chars, default 16 (skip very short inputs)
|
||||
*/
|
||||
|
||||
/**
 * Outcome of one Prompt-Guard sidecar classification attempt.
 *
 * Failures are reported through this shape rather than by throwing:
 * `label` is `null` and `score` is `0` whenever no usable verdict exists.
 */
export interface PromptGuardResult {
  /** Whether the sidecar could be consulted for this call. */
  available: boolean;
  /** Classifier verdict, or `null` when no recognised verdict was obtained. */
  label: 'INJECTION' | 'SAFE' | null;
  /** Score reported by the sidecar for `label`; `0` when there is no verdict. */
  score: number;
  /** Wall-clock time spent on the call, in milliseconds. */
  latencyMs: number;
  /** Short failure reason (e.g. `'not-configured'`, `'HTTP 500'`); absent on success. */
  error?: string;
}
|
||||
|
||||
// Names of the environment variables that configure the Prompt-Guard client.
const URL_ENV = 'PROMPT_GUARD_URL';
const TIMEOUT_ENV = 'PROMPT_GUARD_TIMEOUT';
const THRESHOLD_ENV = 'PROMPT_GUARD_THRESHOLD';
const MIN_LEN_ENV = 'PROMPT_GUARD_MIN_LEN';
|
||||
|
||||
/**
 * Reports whether a sidecar URL has been configured.
 *
 * @returns `true` when PROMPT_GUARD_URL is set to a non-blank value. A
 *   whitespace-only value counts as unset because it can never be a usable URL.
 */
export function isPromptGuardConfigured(): boolean {
  const configuredUrl = process.env[URL_ENV];
  return configuredUrl !== undefined && configuredUrl.trim().length > 0;
}
|
||||
|
||||
/**
 * Reads the blocking threshold from PROMPT_GUARD_THRESHOLD.
 *
 * @returns The configured value when it is a finite number greater than 0 and
 *   at most 1; otherwise the default of 0.85 (also used when the variable is
 *   unset, blank, or not numeric).
 */
export function getPromptGuardThreshold(): number {
  const fallback = 0.85;
  const raw = process.env[THRESHOLD_ENV];
  if (raw === undefined) {
    return fallback;
  }
  const parsed = Number(raw);
  // A blank string parses to 0, which the lower bound rejects, so it also
  // ends up on the default.
  const inRange = Number.isFinite(parsed) && parsed > 0 && parsed <= 1;
  return inRange ? parsed : fallback;
}
|
||||
|
||||
/**
 * Reads the minimum input length from PROMPT_GUARD_MIN_LEN.
 *
 * @returns The configured value when it is a non-negative integer; otherwise
 *   the default of 16 (also used when the variable is unset or blank).
 */
export function getPromptGuardMinLen(): number {
  const fallback = 16;
  const raw = process.env[MIN_LEN_ENV]?.trim();
  // Number('') is 0, which would pass the integer check below and silently
  // disable the short-input skip; treat a blank value as "not set" instead.
  if (!raw) {
    return fallback;
  }
  const parsed = Number(raw);
  return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback;
}
|
||||
|
||||
/**
 * Classify an input via the sidecar (POST `<PROMPT_GUARD_URL>/classify`).
 *
 * Never throws; every failure is reported through the returned object:
 * - `available: false` — PROMPT_GUARD_URL is unset or blank
 *   (`error: 'not-configured'`), or the request failed before any response
 *   arrived (network error, timeout).
 * - `available: true`, `label: null`, `error` set — the sidecar answered, but
 *   with a non-2xx status or a body that could not be used.
 * - `available: true`, no `error` — the sidecar's answer; `label` is `null`
 *   if it sent a label other than `'INJECTION'` or `'SAFE'`.
 *
 * `label` is `null` and `score` is `0` in every failure case. The caller
 * decides whether to enforce, based on the label, score and threshold.
 *
 * @param input Text to classify; sent as the JSON body `{ text: input }`.
 * @returns The sidecar's verdict, or a description of why none was obtained.
 */
export async function callPromptGuard(input: string): Promise<PromptGuardResult> {
  const defaultTimeoutMs = 1500;

  // Builds the "no verdict" result shared by every failure path.
  const noVerdict = (available: boolean, latencyMs: number, error: string): PromptGuardResult => ({
    available,
    label: null,
    score: 0,
    latencyMs,
    error,
  });

  const baseUrl = process.env[URL_ENV]?.trim();
  if (!baseUrl) {
    return noVerdict(false, 0, 'not-configured');
  }

  // A blank, zero, negative or non-numeric timeout would make
  // AbortSignal.timeout() abort immediately or throw, silently disabling the
  // classifier; fall back to the default instead. The value is floored and
  // capped because the timer only accepts a 32-bit unsigned integer.
  const requestedTimeout = Number(process.env[TIMEOUT_ENV] ?? String(defaultTimeoutMs));
  const timeoutMs =
    Number.isFinite(requestedTimeout) && requestedTimeout >= 1
      ? Math.min(Math.floor(requestedTimeout), 0xffffffff)
      : defaultTimeoutMs;

  const describe = (e: unknown): string => (e instanceof Error ? e.message : String(e));
  const t0 = Date.now();

  let res: Response;
  try {
    res = await fetch(`${baseUrl.replace(/\/+$/, '')}/classify`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text: input }),
      signal: AbortSignal.timeout(timeoutMs),
    });
  } catch (e: unknown) {
    // No response at all: the sidecar is unreachable or timed out.
    return noVerdict(false, Date.now() - t0, describe(e));
  }

  if (!res.ok) {
    return noVerdict(true, Date.now() - t0, `HTTP ${res.status}`);
  }

  let payload: unknown;
  try {
    payload = await res.json();
  } catch (e: unknown) {
    // The sidecar answered but the body was not readable as JSON.
    return noVerdict(true, Date.now() - t0, describe(e));
  }

  if (typeof payload !== 'object' || payload === null) {
    return noVerdict(true, Date.now() - t0, 'malformed-response');
  }
  const { label: rawLabel, score: rawScore } = payload as Record<string, unknown>;

  // A missing score counts as 0; anything that does not coerce to a finite
  // number is rejected so NaN never reaches the caller's threshold comparison.
  const score = Number(rawScore ?? 0);
  if (!Number.isFinite(score)) {
    return noVerdict(true, Date.now() - t0, 'malformed-response');
  }

  const label = rawLabel === 'INJECTION' || rawLabel === 'SAFE' ? rawLabel : null;
  return {
    available: true,
    label,
    score,
    latencyMs: Date.now() - t0,
  };
}
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user