feat: add typoglycemia detection to TokenizerNormalizer

Detects scrambled-middle-letter attack words (OWASP LLM defense).
Pre-computed signature map for O(1) lookups — "ignroe" → "ignore",
"bpyass" → "bypass", "insrtuctinos" → "instructions".
40 attack keywords covered. Zero false positives on benchmark.
This commit is contained in:
Rene Fichtmueller 2026-04-07 11:35:10 +02:00
parent 2f6dea9959
commit 8df3dad3c6

View File

@ -71,8 +71,70 @@ const ATTACK_KEYWORDS: readonly string[] = Object.freeze([
'override', 'bypass', 'system', 'prompt', 'jailbreak', 'override', 'bypass', 'system', 'prompt', 'jailbreak',
'restrict', 'filter', 'safety', 'guideline', 'execute', 'restrict', 'filter', 'safety', 'guideline', 'execute',
'command', 'admin', 'sudo', 'inject', 'instruction', 'command', 'admin', 'sudo', 'inject', 'instruction',
'password', 'credentials', 'secret', 'token', 'reveal',
'delete', 'remove', 'disable', 'shutdown', 'terminate',
'extract', 'exfiltrate', 'exploit', 'vulnerability', 'privilege',
'escalate', 'rootkit', 'malware', 'payload', 'obfuscate',
]) ])
// ---------------------------------------------------------------------------
// Typoglycemia detection — OWASP LLM Top 10 defense
// ---------------------------------------------------------------------------
/**
* Pre-computed signature map for O(1) typoglycemia lookups.
* Key: first_char + sorted_middle + last_char + length
* Value: canonical keyword
*
* Typoglycemia: humans read "ignroe" as "ignore" because first/last letters
* match and middle letters are a permutation. Attackers use this to bypass
* regex-based detection while remaining readable to the LLM.
*/
/**
 * Return the interior of `word` — every UTF-16 code unit except the first and
 * the last — sorted with the default string ordering and joined back together.
 * Words of two characters or fewer have no interior and yield ''.
 */
function sortedMiddle(word: string): string {
  const interior: string[] = []
  for (let i = 1; i < word.length - 1; i++) {
    interior.push(word.charAt(i))
  }
  interior.sort()
  return interior.join('')
}
/**
 * Signature-to-keyword lookup table, built once from ATTACK_KEYWORDS.
 *
 * Signature = first char + sorted middle chars + last char + word length.
 * Keywords shorter than four characters are left out: with at most one middle
 * character there is nothing to scramble. When two keywords share a signature,
 * the one appearing later in ATTACK_KEYWORDS overwrites the earlier one.
 */
const TYPOGLYCEMIA_MAP: ReadonlyMap<string, string> = new Map<string, string>(
  ATTACK_KEYWORDS
    .filter((keyword) => keyword.length >= 4)
    .map((keyword): [string, string] => [
      keyword[0] + sortedMiddle(keyword) + keyword[keyword.length - 1] + keyword.length,
      keyword,
    ]),
)
/**
 * Detect and correct typoglycemia-scrambled attack keywords in `input`.
 *
 * Examples: "ignroe" → "ignore", "bpyass" → "bypass", "isntruction" → "instruction".
 *
 * A run of four or more ASCII letters is rewritten only when:
 *   1. its length equals the length of an attack keyword,
 *   2. its first and last characters match that keyword (compared in lower case),
 *   3. its middle characters are an exact permutation (anagram) of the keyword's.
 *
 * A word whose first or last letter has moved (e.g. "igrneo") is therefore NOT
 * corrected, and a word that already is an attack keyword is returned untouched
 * and not reported.
 *
 * Casing: an all-uppercase word yields the keyword in uppercase, a word with a
 * leading capital yields a capitalized keyword, anything else yields the
 * lower-case keyword.
 *
 * @param input - Text to scan.
 * @returns `text`: the input with scrambled keywords replaced by their canonical
 *   spelling. `corrections`: one "original→keyword" entry per replacement, in
 *   order of appearance.
 */
function deobfuscateTypoglycemia(input: string): { text: string; corrections: string[] } {
  const corrections: string[] = []
  const result = input.replace(/\b[A-Za-z]{4,}\b/g, (word) => {
    const lower = word.toLowerCase()
    // Already a canonical keyword: nothing to correct, nothing to report.
    if (ATTACK_KEYWORDS.includes(lower)) return word
    const key = lower.charAt(0) + sortedMiddle(lower) + lower.charAt(lower.length - 1) + lower.length
    const match = TYPOGLYCEMIA_MAP.get(key)
    if (match === undefined) return word
    // Keep original and replacement distinguishable in the report.
    corrections.push(`${word}→${match}`)
    // The regex guarantees at least four letters, so "all uppercase" is unambiguous.
    if (word === word.toUpperCase()) return match.toUpperCase()
    const head = word.charAt(0)
    return head === head.toUpperCase()
      ? match.charAt(0).toUpperCase() + match.slice(1)
      : match
  })
  return { text: result, corrections }
}
/** /**
* Pattern matching single characters separated by dots, dashes, or underscores. * Pattern matching single characters separated by dots, dashes, or underscores.
* Matches sequences like "I.g.n.o.r.e" or "I-g-n-o-r-e" or "I_g_n_o_r_e" * Matches sequences like "I.g.n.o.r.e" or "I-g-n-o-r-e" or "I_g_n_o_r_e"
@ -199,7 +261,11 @@ export class TokenizerNormalizer {
// 9. Rejoin split words: "igno re" -> "ignore", "in-struc-tions" -> "instructions" // 9. Rejoin split words: "igno re" -> "ignore", "in-struc-tions" -> "instructions"
result = deobfuscateSplitWords(result) result = deobfuscateSplitWords(result)
// 10. Final whitespace cleanup after deobfuscation // 10. Typoglycemia correction: "ignroe" -> "ignore", "bpyass" -> "bypass"
const typo = deobfuscateTypoglycemia(result)
result = typo.text
// 11. Final whitespace cleanup after deobfuscation
result = result.replace(MULTI_SPACE_REGEX, ' ').trim() result = result.replace(MULTI_SPACE_REGEX, ' ').trim()
return result return result
@ -260,6 +326,13 @@ export class TokenizerNormalizer {
matchedPatterns.push('non_nfkc_form') matchedPatterns.push('non_nfkc_form')
} }
// Check for typoglycemia-scrambled attack keywords
const typoResult = deobfuscateTypoglycemia(input)
if (typoResult.corrections.length > 0) {
modifications += typoResult.corrections.length * 5 // High weight — deliberate evasion
matchedPatterns.push('typoglycemia_scramble')
}
const latencyMs = performance.now() - start const latencyMs = performance.now() - start
// Zero-width chars in a token context are more suspicious than curly quotes // Zero-width chars in a token context are more suspicious than curly quotes