/** * Prompt-Injection Defense Layer * * First-class LLM security: detects prompt injection, jailbreak attempts, * role-bypass, indirect injection, data-exfiltration, and policy violations * before the request hits the upstream model. * * Modes (env var INJECTION_DEFENSE_MODE): * - off → no scanning (default off for backward compat) * - warn → scan and tag metadata, but allow through * - block → reject HTTP 422 if any pattern matches above threshold * - llm_judge → block + fall back to a cheap LLM classifier for ambiguous * cases that pattern matching alone marks as borderline * * Tuned for low false-positive rate. Detection is multilingual (EN/DE plus * the additional languages listed in the pattern catalog below) and covers * the OWASP LLM Top-10 attack families. * * Inspired by patterns documented in academic literature on prompt * injection (Greshake et al. 2023, Yi et al. 2023) and the OWASP LLM-01: * Prompt Injection category. All detection logic is original to this repo. */ import { logger } from '../observability/logger.js'; // ─── Pattern catalog ───────────────────────────────────────────────────────── interface InjectionPattern { readonly id: string; readonly category: 'jailbreak' | 'role_bypass' | 'indirect' | 'exfiltration' | 'policy' | 'system_prompt_leak'; readonly severity: 'low' | 'medium' | 'high' | 'critical'; readonly pattern: RegExp; readonly description: string; } const PATTERNS: readonly InjectionPattern[] = [ // ─── Direct jailbreak attempts (English) ────────────────────────────────── { id: 'ignore-previous-en', category: 'jailbreak', severity: 'high', pattern: /\bignore\s+(?:all\s+)?(?:previous|prior|above|earlier)\s+(?:instructions?|prompts?|rules?|directions?)\b/i, description: 'Classic "ignore previous instructions" injection' }, { id: 'disregard-en', category: 'jailbreak', severity: 'high', pattern: /\b(?:disregard|forget|cancel)\s+(?:all\s+)?(?:previous|prior|above|earlier)\s+(?:instructions?|prompts?|rules?)\b/i, description: 'Variant of ignore-previous using disregard/forget/cancel' }, { id: 
'override-instructions-en', category: 'jailbreak', severity: 'high', pattern: /\b(?:override|bypass|supersede|replace)\s+(?:the\s+)?(?:previous|system|original|initial)\s+(?:instructions?|prompt|rules?)\b/i, description: 'Direct override of system instructions' }, // ─── German equivalents ───────────────────────────────────────────────── { id: 'ignore-previous-de', category: 'jailbreak', severity: 'high', pattern: /\b(?:ignoriere|vergiss|verwerfe)\s+(?:alle\s+)?(?:vorherigen|vorigen|obigen|bisherigen)\s+(?:anweisungen|instruktionen|regeln|prompts?)\b/i, description: 'German: "ignoriere vorherige Anweisungen"' }, { id: 'override-de', category: 'jailbreak', severity: 'high', pattern: /\b(?:überschreibe|umgehe|ersetze)\s+(?:die\s+)?(?:vorherigen|system|ursprünglichen)\s+(?:anweisungen|regeln)\b/i, description: 'German: override system instructions' }, // ─── Role bypass / persona injection ──────────────────────────────────── { id: 'dan-persona', category: 'role_bypass', severity: 'high', pattern: /\b(?:you\s+are\s+now\s+|act\s+as\s+|pretend\s+to\s+be\s+)?(?:DAN|Developer\s*Mode|jailbreak\s*mode|unrestricted\s+mode|god\s+mode)\b/i, description: 'DAN / Developer Mode / unrestricted persona injection' }, { id: 'new-system-prompt', category: 'role_bypass', severity: 'critical', pattern: /\bnew\s+system\s+prompt\s*[:=]/i, description: 'Attempt to redefine the system prompt mid-conversation' }, { id: 'pretend-rolemix', category: 'role_bypass', severity: 'medium', pattern: /\bpretend\s+you\s+(?:are\s+not\s+|don't\s+have\s+|have\s+no\s+)(?:bound\s+by|restricted\s+by|limited\s+by|filtered\s+by)\b/i, description: 'Pretend-you-are-not-restricted bypass' }, // ─── System-prompt extraction ─────────────────────────────────────────── { id: 'reveal-system-prompt', category: 'system_prompt_leak', severity: 'high', pattern: /\b(?:reveal|show|display|print|output|repeat|tell\s+me)\s+(?:your\s+|the\s+)?(?:system\s+prompt|initial\s+prompt|original\s+instructions?|hidden\s+prompt)\b/i, 
description: 'Direct request to leak system prompt' }, { id: 'repeat-instructions', category: 'system_prompt_leak', severity: 'medium', pattern: /\brepeat\s+(?:the\s+|your\s+)?(?:instructions?|prompt|context|message)\s+(?:above|verbatim|word\s+for\s+word|exactly\s+as\s+given)\b/i, description: 'Sneaky system-prompt extraction via verbatim repeat' }, { id: 'starts-with', category: 'system_prompt_leak', severity: 'medium', pattern: /\b(?:what|tell\s+me|repeat)\s+(?:everything|all\s+text|the\s+text)\s+(?:above|before|that\s+comes\s+before)\b/i, description: 'Indirect: ask for text before the user message' }, // ─── Indirect injection markers (data poisoning) ──────────────────────── { id: 'fake-system-tag', category: 'indirect', severity: 'high', pattern: /<\|im_start\|>(?:system|developer)|<\|system\|>|\[\[SYSTEM\]\]|^---\s*system\s*---$/im, description: 'Embedded role-delimiter tokens trying to spoof system role' }, { id: 'embedded-user-msg', category: 'indirect', severity: 'medium', pattern: /<\|im_start\|>user|\[\[USER\]\]|^---\s*user\s*---$/im, description: 'Embedded user tokens to inject fake messages' }, { id: 'instruction-in-data', category: 'indirect', severity: 'medium', pattern: /^[\s\S]{50,}?\n\s*(?:IMPORTANT|ATTENTION|URGENT|SYSTEM)\s*[:!]\s*(?:ignore|disregard|forget|override|new\s+(?:rules?|instructions?))/im, description: 'Mid-document IMPORTANT/SYSTEM marker followed by jailbreak verb' }, // ─── Data exfiltration ────────────────────────────────────────────────── { id: 'markdown-image-exfil', category: 'exfiltration', severity: 'high', pattern: /!\[[^\]]*\]\(https?:\/\/[^)]*\?[^)]*(?:data|secret|key|token|password|prompt)=/i, description: 'Markdown image with secret-bearing query string (browser exfil)' }, { id: 'send-data-to', category: 'exfiltration', severity: 'high', pattern: /\b(?:send|post|transmit|email|share|leak)\s+(?:this\s+)?(?:conversation|history|prompt|context|data|secrets?)\s+to\s+(?:https?:|email|webhook)/i, description: 'Explicit 
request to send data to external endpoint' }, { id: 'base64-instruction', category: 'exfiltration', severity: 'medium', pattern: /\b(?:decode|execute|run|interpret)\s+(?:this\s+)?base64\s*[:.]?\s*[A-Za-z0-9+/]{40,}={0,2}/i, description: 'Hidden instructions encoded in base64' }, // ─── Policy bypass / harmful content ──────────────────────────────────── { id: 'no-refusal', category: 'policy', severity: 'medium', pattern: /\byou\s+(?:must\s+not|cannot|are\s+not\s+allowed\s+to)\s+(?:refuse|decline|say\s+no|apologize)\b/i, description: 'Refusal-suppression attempt' }, { id: 'illegal-content-demand', category: 'policy', severity: 'high', pattern: /\b(?:without\s+any\s+(?:warnings?|disclaimers?|safety|filters?|restrictions?)|no\s+matter\s+(?:what|how\s+harmful))/i, description: 'Demand for filter-free / unrestricted output' }, // ═════════════════════════════════════════════════════════════════════════ // 2026 expansion — new patterns added after CVE-2026-45321 / Shai-Hulud // event triggered comprehensive review of jailbreak surface. 
// Sources: PromptArmor PoC repo, L1B3RT4S, stepsecurity blog, OWASP LLM Top10 // ═════════════════════════════════════════════════════════════════════════ // ─── 2026 jailbreak personas (the meta keeps reinventing names) ───────── { id: 'aim-niccolo', category: 'jailbreak', severity: 'high', pattern: /\b(?:AIM|Niccolo\s+Machiavelli|Always\s+Intelligent\s+and\s+Machiavellian)\b/i, description: 'AIM (Always Intelligent Machiavellian) persona — popular 2024+ jailbreak' }, { id: 'evilbot-omega-dude', category: 'jailbreak', severity: 'high', pattern: /\b(?:EvilBOT|OmegaGPT|AntiGPT|BetterDAN|DUDE|Maximum|STAN|MongoTom|HackerGPT|FreeGPT|WormGPT|FraudGPT|DarkGPT)\b/i, description: '2024-2026 known jailbreak persona names' }, { id: 'fiction-framing', category: 'jailbreak', severity: 'medium', pattern: /\b(?:in\s+a\s+(?:fictional|hypothetical|imaginary|alternate|parallel)\s+(?:world|universe|reality|scenario)|for\s+(?:educational|research|academic)\s+purposes\s+only)\b[\s\S]{0,200}\b(?:bomb|exploit|malware|hack|kill|poison|weapon|drug|illegal)/i, description: 'Hypothetical/fiction framing combined with harmful topic' }, { id: 'do-anything-now', category: 'jailbreak', severity: 'high', pattern: /\b(?:do\s+anything\s+now|free\s+from\s+the\s+typical\s+(?:confines|rules|restrictions)|broke\s+free\s+of\s+the)\b/i, description: 'DAN-style "do anything now" framing' }, { id: 'goal-hijack', category: 'jailbreak', severity: 'high', pattern: /\byour\s+(?:new|real|true|only|primary)\s+(?:goal|purpose|mission|directive|objective)\s+is\s+(?:now\s+)?to\b/i, description: 'Goal hijacking — redefines model purpose mid-conversation' }, // ─── Multilingual jailbreak (Romance + Slavic + CJK) ──────────────────── { id: 'ignore-fr', category: 'jailbreak', severity: 'high', pattern: /\b(?:ignorez|ignore|oubliez|écartez|annulez)\s+(?:toutes\s+les\s+)?(?:instructions?|consignes|règles)\s+(?:précédentes?|antérieures?|ci-dessus)\b/i, description: 'French: ignorez instructions précédentes' }, { id: 
'ignore-es', category: 'jailbreak', severity: 'high', pattern: /\b(?:ignora|olvida|descarta|cancela)\s+(?:todas\s+las\s+)?(?:instrucciones?|reglas)\s+(?:anteriores?|previas?)\b/i, description: 'Spanish: ignora instrucciones anteriores' }, { id: 'ignore-it', category: 'jailbreak', severity: 'high', pattern: /\b(?:ignora|dimentica|annulla)\s+(?:tutte\s+le\s+)?(?:istruzioni|regole|prompt)\s+(?:precedenti|anteriori)\b/i, description: 'Italian: ignora istruzioni precedenti' }, { id: 'ignore-ru', category: 'jailbreak', severity: 'high', pattern: /\b(?:игнорируй|забудь|отмени)\s+(?:все\s+)?(?:предыдущие|прошлые)\s+(?:инструкции|указания|правила|промпт)\b/i, description: 'Russian: игнорируй предыдущие инструкции' }, { id: 'ignore-zh', category: 'jailbreak', severity: 'high', pattern: /(?:忽略|无视|忘记|取消)(?:之前|以前|先前|上面|所有)(?:的)?(?:指令|指示|规则|提示|命令)/, description: 'Chinese (Simplified): 忽略之前的指令' }, { id: 'ignore-ja', category: 'jailbreak', severity: 'high', pattern: /(?:以前の|これまでの|先の)(?:指示|命令|プロンプト|ルール)を(?:無視|忘れて|キャンセル)/, description: 'Japanese: 以前の指示を無視' }, // ─── 2026-05-16 expansion: 15 more languages to close the bypass gap ───── { id: 'ignore-bn', category: 'jailbreak', severity: 'high', pattern: /(?:পূর্ববর্তী|আগের|উপরের)\s*(?:সমস্ত\s*)?(?:নির্দেশ|নির্দেশাবলী|আদেশ|প্রম্পট)\s*(?:উপেক্ষা|ভুলে|বাতিল)/u, description: 'Bangla / Bengali: পূর্ববর্তী নির্দেশাবলী উপেক্ষা করুন' }, { id: 'ignore-af', category: 'jailbreak', severity: 'high', pattern: /\b(?:ignoreer|vergeet|misken)\s+(?:alle\s+)?(?:vorige|voorafgaande|bostaande)\s+(?:instruksies?|opdragte|reëls)\b/i, description: 'Afrikaans: ignoreer alle vorige instruksies' }, { id: 'ignore-hi', category: 'jailbreak', severity: 'high', pattern: /(?:पिछले|पहले\s*के|पूर्व|उपरोक्त)\s*(?:सभी\s*)?(?:निर्देश|आदेश|नियम|प्रॉम्प्ट|हिदायत)[ोंकामिकेय्ाऀ-ॿ]{0,4}\s*(?:को\s+)?(?:अनदेखा|भूल|रद्द|नज़रअंदाज|उपेक्षा)/u, description: 'Hindi: पिछले निर्देशों को अनदेखा करें' }, { id: 'ignore-ar', category: 'jailbreak', severity: 'high', pattern: 
/(?:تجاهل|انس|ألغ[ِي]?|اطرح)\s*(?:جميع\s+)?(?:التعليمات|الإرشادات|الأوامر|القواعد|التوجيهات)\s*(?:السابقة|السالفة|أعلاه)/u, description: 'Arabic: تجاهل التعليمات السابقة' }, { id: 'ignore-he', category: 'jailbreak', severity: 'high', pattern: /(?:התעלם|שכח|בטל)\s*(?:מ?(?:ה?כל)\s+)?(?:[מבכל]?ה?(?:הוראות|פקודות|כללים|פרומפט|הנחיות))\s*(?:ה?קודמות?|ה?קודמים|לעיל|ה?קודם)/u, description: 'Hebrew: התעלם מההוראות הקודמות (with prefix prepositions)' }, { id: 'ignore-fa', category: 'jailbreak', severity: 'high', pattern: /(?:(?:دستورات|دستورالعمل|قوانین|پرامپت|پرامپت‌ها)\s*(?:قبلی|پیشین|بالا|قبل)\s*(?:را\s*)?(?:نادیده\s*بگیر|فراموش\s*کن|لغو\s*کن))|(?:(?:نادیده\s*بگیر|فراموش\s*کن|لغو\s*کن)\s*(?:تمام\s+)?(?:دستورات|دستورالعمل|قوانین|پرامپت)\s*(?:قبلی|پیشین))/u, description: 'Persian / Farsi: دستورات قبلی را نادیده بگیر (SOV + VSO orders)' }, { id: 'ignore-tr', category: 'jailbreak', severity: 'high', pattern: /(?:önceki|evvelki|yukarıdaki|geçmiş)\s+(?:tüm\s+|bütün\s+)?(?:talimat\p{L}*|kural\p{L}*|yönerge\p{L}*|prompt\p{L}*|emir\p{L}*)\s+(?:yok\s*say|unut|iptal\s*et|göz\s*ardı|atla|umursam)/iu, description: 'Turkish: önceki talimatları yok say (uses \\p{L} for Turkish ı/ş/ç/etc)' }, { id: 'ignore-vi', category: 'jailbreak', severity: 'high', pattern: /\b(?:bỏ\s*qua|quên|hủy)\s+(?:tất\s*cả\s+)?(?:các\s+)?(?:hướng\s*dẫn|chỉ\s*dẫn|chỉ\s*thị|lệnh|quy\s*tắc)\s+(?:trước\s*đó|phía\s*trên|trước)\b/i, description: 'Vietnamese: bỏ qua các hướng dẫn trước đó' }, { id: 'ignore-th', category: 'jailbreak', severity: 'high', pattern: /(?:เพิกเฉย|ละเลย|ลืม|ยกเลิก)\s*(?:ต่อ\s*)?(?:คำสั่ง|คำแนะนำ|กฎ|prompt)\s*(?:ก่อนหน้า|ที่ผ่านมา|ทั้งหมด)/u, description: 'Thai: เพิกเฉยต่อคำสั่งก่อนหน้า' }, { id: 'ignore-ko', category: 'jailbreak', severity: 'high', pattern: /(?:이전|이전의|위의|앞선)\s*(?:모든\s+)?(?:지시|명령|규칙|프롬프트)(?:사항|문)?(?:을|를)\s*(?:무시|잊어|취소)/u, description: 'Korean: 이전 지시를 무시하세요' }, { id: 'ignore-pl', category: 'jailbreak', severity: 'high', pattern: 
/\b(?:zignoruj|pomiń|zapomnij|anuluj)\s+(?:wszystkie\s+)?(?:poprzednie|wcześniejsze|powyższe)\s+(?:instrukcje|polecenia|zasady|reguły|prompt)\b/i, description: 'Polish: zignoruj poprzednie instrukcje' }, { id: 'ignore-nl', category: 'jailbreak', severity: 'high', pattern: /\b(?:negeer|vergeet|annuleer)\s+(?:alle\s+)?(?:vorige|voorgaande|bovenstaande)\s+(?:instructies?|opdrachten|regels|prompts?)\b/i, description: 'Dutch: negeer alle vorige instructies' }, { id: 'ignore-id', category: 'jailbreak', severity: 'high', pattern: /\b(?:abaikan|lupakan|batalkan)\s+(?:semua\s+)?(?:instruksi|perintah|aturan|prompt)\s+(?:sebelumnya|yang\s+lalu|di\s+atas)\b/i, description: 'Indonesian: abaikan semua instruksi sebelumnya' }, { id: 'ignore-tl', category: 'jailbreak', severity: 'high', pattern: /\b(?:huwag\s+pansinin|kalimutan|kanselahin|balewalain)\s+(?:ang\s+|sa\s+)?(?:lahat\s+ng\s+)?(?:mga\s+)?(?:nakaraang|naunang|naunang)\s+(?:tagubilin|utos|patakaran|prompt)\b/i, description: 'Tagalog / Filipino: huwag pansinin (ang mga) nakaraang tagubilin' }, { id: 'ignore-sw', category: 'jailbreak', severity: 'high', pattern: /\b(?:puuza|sahau|ghairi)\s+(?:zote\s+)?(?:maagizo|maelekezo|amri|sheria|prompt)\s+(?:ya\s+awali|za\s+awali|zilizotangulia)\b/i, description: 'Swahili: puuza maagizo ya awali' }, // ─── Universal non-Latin script catch-all (script-detector heuristic) ──── // If input contains substantial non-Latin script AND any "instruction verb" // marker we haven't explicitly translated, flag for llm_judge escalation. // This is a SOFT-flag (severity: medium) — paired with the script detector // below to escalate to llm_judge mode rather than auto-block. 
{ id: 'non-latin-instruction-marker', category: 'jailbreak', severity: 'medium', pattern: /[\p{Script=Arabic}\p{Script=Bengali}\p{Script=Devanagari}\p{Script=Hebrew}\p{Script=Thai}\p{Script=Hangul}\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Cyrillic}\p{Script=Tamil}\p{Script=Telugu}\p{Script=Gujarati}\p{Script=Gurmukhi}\p{Script=Myanmar}\p{Script=Khmer}\p{Script=Lao}\p{Script=Tibetan}\p{Script=Georgian}\p{Script=Armenian}\p{Script=Sinhala}]{20,}/u, description: 'Substantial non-Latin script (≥20 chars) — escalate to llm_judge' }, // ─── Token / chat-template smuggling (LLM control-token spoofing) ─────── { id: 'chatml-smuggle', category: 'indirect', severity: 'critical', pattern: /<\|(?:im_start|im_end|im_sep|fim_prefix|fim_middle|fim_suffix|endoftext|start_header_id|end_header_id|eot_id)\|>/, description: 'Smuggled ChatML / Llama / Qwen control tokens in user input' }, { id: 'inst-smuggle', category: 'indirect', severity: 'critical', pattern: /\[\/?INST\]|<\/?s>|<>|<<\/SYS>>/, description: 'Smuggled Llama-2 [INST] or <> control sequences' }, { id: 'tool-output-poison', category: 'indirect', severity: 'high', pattern: /