10-layer defense pipeline with kill chain mapping, self-healing, self-learning, and compliance reporting. Local-first, zero cloud deps. - 72 detection rules across 7 kill chain phases - 294 unit tests, 500+ attack corpus samples - Management dashboard (Next.js 15, 10 pages) - Automated resistance testing (2x daily, 31 probes) - MITRE ATLAS, OWASP LLM Top 10, EU AI Act compliance - Integrations: Next.js middleware, Ollama, n8n - PostgreSQL 17 + pgvector for persistent learning
312 lines
9.6 KiB
TypeScript
312 lines
9.6 KiB
TypeScript
/**
 * UnicodeNormalizer — Layer 0 critical preprocessing.
 *
 * Strips invisible characters and steganographic Unicode, and maps homoglyphs
 * to their ASCII look-alikes, so that payloads which can evade commercial
 * guardrails are neutralized before any downstream scanner sees the input.
 * This is the single most impactful zero-cost defense in the pipeline.
 *
 * Covers: Unicode Tags, Zero-Width, BiDi overrides, Variation Selectors,
 * Cyrillic/Greek/Armenian homoglyphs, invisible formatting, control chars.
 */
|
||
|
||
import type { ScanResult, ScannerType, ShieldXConfig } from '../types/detection.js'
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Constants
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/** Identifier this scanner reports as `scannerId` in every ScanResult it emits. */
const SCANNER_ID = 'unicode-normalizer'

/** Scanner category this scanner reports as `scannerType` in every ScanResult it emits. */
const SCANNER_TYPE: ScannerType = 'unicode'
|
||
|
||
/**
 * Build the mapping for every fullwidth Latin letter
 * (U+FF21–U+FF3A → A–Z, U+FF41–U+FF5A → a–z).
 *
 * Generated rather than hand-listed so no letter can be forgotten; a partial
 * table would let most of a fullwidth-encoded word pass through unchanged.
 */
function buildFullwidthLatinMap(): Record<string, string> {
  const entries: Record<string, string> = {}
  for (let offset = 0; offset < 26; offset++) {
    entries[String.fromCharCode(0xff21 + offset)] = String.fromCharCode(0x41 + offset)
    entries[String.fromCharCode(0xff41 + offset)] = String.fromCharCode(0x61 + offset)
  }
  return entries
}

/**
 * Homoglyph map: visually confusable non-ASCII characters mapped to ASCII.
 * Covers Cyrillic, Greek, Armenian, and the full fullwidth Latin alphabet.
 *
 * Every key is a single BMP code unit, which is what allows the keys to be
 * joined into one character class for single-pass replacement.
 */
const HOMOGLYPH_MAP: Readonly<Record<string, string>> = Object.freeze({
  // Cyrillic → Latin
  '\u0430': 'a', // а → a
  '\u0435': 'e', // е → e
  '\u043E': 'o', // о → o
  '\u0440': 'p', // р → p
  '\u0441': 'c', // с → c
  '\u0443': 'y', // у → y
  '\u0445': 'x', // х → x
  '\u0456': 'i', // і → i (Ukrainian)
  '\u0458': 'j', // ј → j (Serbian)
  '\u04BB': 'h', // һ → h
  '\u0455': 's', // ѕ → s
  '\u0454': 'e', // є → e (Ukrainian)
  '\u0457': 'i', // ї → i (Ukrainian)
  '\u043A': 'k', // к → k
  '\u043C': 'm', // м → m (visually similar in some fonts)
  '\u0422': 'T', // Т → T
  '\u0410': 'A', // А → A
  '\u0412': 'B', // В → B
  '\u0415': 'E', // Е → E
  '\u041A': 'K', // К → K
  '\u041C': 'M', // М → M
  '\u041D': 'H', // Н → H
  '\u041E': 'O', // О → O
  '\u0420': 'P', // Р → P
  '\u0421': 'C', // С → C
  '\u0423': 'Y', // У → Y (uppercase)
  '\u0425': 'X', // Х → X
  '\u0417': '3', // З → 3
  '\u0432': 'b', // в → b (in some fonts)

  // Greek → Latin
  '\u03B1': 'a', // α → a
  '\u03B5': 'e', // ε → e (visually close)
  '\u03BF': 'o', // ο → o
  '\u03C1': 'p', // ρ → p
  '\u03C4': 't', // τ → t (visually close in some fonts)
  '\u03BD': 'v', // ν → v
  '\u0391': 'A', // Α → A
  '\u0392': 'B', // Β → B
  '\u0395': 'E', // Ε → E
  '\u0396': 'Z', // Ζ → Z
  '\u0397': 'H', // Η → H
  '\u0399': 'I', // Ι → I
  '\u039A': 'K', // Κ → K
  '\u039C': 'M', // Μ → M
  '\u039D': 'N', // Ν → N
  '\u039F': 'O', // Ο → O
  '\u03A1': 'P', // Ρ → P
  '\u03A4': 'T', // Τ → T
  '\u03A5': 'Y', // Υ → Y
  '\u03A7': 'X', // Χ → X
  '\u03B9': 'i', // ι → i

  // Armenian → Latin
  '\u0570': 'h', // հ → h
  '\u0578': 'n', // ո → n
  '\u057D': 's', // ս → s
  '\u0585': 'o', // օ → o
  '\u0575': 'j', // յ → j

  // Fullwidth Latin letters (U+FF21–U+FF3A, U+FF41–U+FF5A) → ASCII
  ...buildFullwidthLatinMap(),
})
|
||
|
||
/**
 * Matches every code point in the Unicode Tags range U+E0000–U+E007F.
 * These invisible "language tag" characters are abused to smuggle
 * steganographic payloads. The `u` flag makes each astral code point match
 * as a single unit (not as two surrogate halves).
 */
const UNICODE_TAGS_REGEX = /[\u{E0000}-\u{E007F}]/gu
|
||
|
||
/**
 * Matches zero-width characters used for invisible payloads:
 * ZWSP (U+200B), ZWNJ (U+200C), ZWJ (U+200D) and BOM/ZWNBSP (U+FEFF).
 */
const ZERO_WIDTH_REGEX = /[\u200B-\u200D\uFEFF]/g
|
||
|
||
/**
 * Matches bidirectional embedding, override and isolate controls:
 * LRE (U+202A), RLE (U+202B), PDF (U+202C), LRO (U+202D), RLO (U+202E),
 * LRI (U+2066), RLI (U+2067), FSI (U+2068), PDI (U+2069).
 *
 * The implicit marks LRM (U+200E) and RLM (U+200F) are not matched.
 */
const BIDI_REGEX = /[\u202A-\u202E\u2066-\u2069]/g
|
||
|
||
/**
 * Matches variation selectors: VS1–VS16 (U+FE00–U+FE0F) and
 * VS17–VS256 (U+E0100–U+E01EF). The `u` flag is required so the
 * supplementary-plane range is interpreted as code points.
 */
const VARIATION_SELECTORS_REGEX = /[\uFE00-\uFE0F\u{E0100}-\u{E01EF}]/gu
|
||
|
||
/**
 * Matches invisible formatting characters:
 * - U+00AD soft hyphen
 * - U+034F combining grapheme joiner
 * - U+061C Arabic letter mark
 * - U+115F, U+1160 Hangul Choseong/Jungseong fillers
 * - U+17B4, U+17B5 Khmer invisible vowels
 * - U+180E Mongolian vowel separator
 * - U+2060 word joiner
 * - U+2061–U+2064 invisible math operators
 * - U+3164 Hangul filler, U+FFA0 halfwidth Hangul filler
 *
 * U+3164 and U+FFA0 render as blank and are Default_Ignorable code points,
 * so they are stripped together with the other Hangul fillers.
 */
const INVISIBLE_FORMATTING_REGEX =
  /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u2060-\u2064\u3164\uFFA0]/g
|
||
|
||
/**
 * Matches control characters: C0 (U+0000–U+001F) except tab (U+0009),
 * line feed (U+000A) and carriage return (U+000D), plus DEL (U+007F) and
 * the C1 block (U+0080–U+009F). DEL and C1 are contiguous, so they are
 * written as one range.
 */
const CONTROL_CHARS_REGEX = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g
|
||
|
||
/** Every character that has an ASCII replacement in HOMOGLYPH_MAP. */
const HOMOGLYPH_CHARS = Object.keys(HOMOGLYPH_MAP)

/**
 * Single character class matching any key of HOMOGLYPH_MAP, pre-built once so
 * replacement is a single pass over the input. `null` when the map is empty,
 * because an empty character class would be a pointless never-matching regex.
 *
 * Keys are escaped before interpolation so that a future key which is special
 * inside a character class (`\`, `]`, `^`, `-`) cannot corrupt the pattern.
 */
const HOMOGLYPH_REGEX =
  HOMOGLYPH_CHARS.length > 0
    ? new RegExp(
        `[${HOMOGLYPH_CHARS.map((ch) => ch.replace(/[\\\]^-]/g, '\\$&')).join('')}]`,
        'g',
      )
    : null
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Result type for normalize()
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/** Outcome of a Unicode normalization pass, with counters intended for logging. */
export interface UnicodeNormalizationResult {
  /** Input after stripping invisible characters and replacing homoglyphs. */
  readonly normalized: string
  /** Total number of characters removed across all strip categories. */
  readonly strippedChars: number
  /** Number of homoglyph characters replaced by their ASCII counterpart. */
  readonly homoglyphsReplaced: number
  /** Labels of the categories that were found (e.g. `zero_width_characters`). */
  readonly suspiciousPatterns: readonly string[]
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// UnicodeNormalizer class
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
 * Layer 0 preprocessor: removes invisible/steganographic Unicode, replaces
 * homoglyphs with ASCII, and reports how much was changed.
 */
export class UnicodeNormalizer {
  /**
   * Strip passes in the order they are applied. Each pass removes every match
   * of `pattern`; if at least one character was removed, `label` is reported
   * in `suspiciousPatterns`. The order also fixes the order of the labels.
   */
  private static readonly STRIP_PASSES: ReadonlyArray<{
    readonly pattern: RegExp
    readonly label: string
  }> = [
    { pattern: UNICODE_TAGS_REGEX, label: 'unicode_tag_characters' },
    { pattern: ZERO_WIDTH_REGEX, label: 'zero_width_characters' },
    { pattern: BIDI_REGEX, label: 'bidi_override_characters' },
    { pattern: VARIATION_SELECTORS_REGEX, label: 'variation_selectors' },
    { pattern: INVISIBLE_FORMATTING_REGEX, label: 'invisible_formatting' },
    { pattern: CONTROL_CHARS_REGEX, label: 'control_characters' },
  ]

  /** `scan` flags the input when more than this many characters were stripped. */
  private readonly strippedCharsThreshold: number

  /** `scan` flags the input when more than this many homoglyphs were replaced. */
  private readonly homoglyphThreshold: number

  /**
   * Create a UnicodeNormalizer.
   *
   * @param config - ShieldX configuration. Only `config.thresholds` is read,
   *   and only to map a confidence score to a threat level. The suspicion
   *   thresholds below are fixed defaults and are not taken from `config`.
   */
  constructor(private readonly config: ShieldXConfig) {
    // Flag if more than 5 characters were stripped or more than 3 homoglyphs replaced.
    this.strippedCharsThreshold = 5
    this.homoglyphThreshold = 3
  }

  /**
   * Normalize input by stripping invisible/steganographic Unicode and then
   * replacing homoglyphs with their ASCII counterparts. Synchronous.
   *
   * @param input - Raw user input string
   * @returns The normalized text, the number of stripped and replaced
   *   characters, and the labels of every category that was actually found
   */
  normalize(input: string): UnicodeNormalizationResult {
    const suspiciousPatterns: string[] = []
    let strippedChars = 0
    let text = input

    for (const { pattern, label } of UnicodeNormalizer.STRIP_PASSES) {
      // Count via the replacer so an astral code point counts once, not as
      // two UTF-16 code units.
      let removed = 0
      text = text.replace(pattern, () => {
        removed++
        return ''
      })

      // Deriving the label from the same pass keeps suspiciousPatterns
      // consistent with strippedChars and avoids re-scanning the raw input.
      if (removed > 0) {
        strippedChars += removed
        suspiciousPatterns.push(label)
      }
    }

    // Homoglyphs are replaced last, on text that no longer contains
    // invisible characters.
    let homoglyphsReplaced = 0
    if (HOMOGLYPH_REGEX) {
      text = text.replace(HOMOGLYPH_REGEX, (ch) => {
        homoglyphsReplaced++
        return HOMOGLYPH_MAP[ch] ?? ch
      })
    }
    if (homoglyphsReplaced > 0) {
      suspiciousPatterns.push('homoglyph_substitution')
    }

    return {
      normalized: text,
      strippedChars,
      homoglyphsReplaced,
      suspiciousPatterns,
    }
  }

  /**
   * Scan input for suspicious Unicode and return a ScanResult. Synchronous.
   *
   * The input is reported as detected only when the stripped-character count
   * or the homoglyph count exceeds its threshold. `matchedPatterns` lists
   * every category found, even when the input is not flagged.
   *
   * @param input - Raw user input string
   * @returns ScanResult with detection details; `latencyMs` covers only the
   *   normalization step
   */
  scan(input: string): ScanResult {
    const start = performance.now()
    const result = this.normalize(input)
    const latencyMs = performance.now() - start

    const isSuspicious =
      result.strippedChars > this.strippedCharsThreshold ||
      result.homoglyphsReplaced > this.homoglyphThreshold

    // Score grows linearly with both counters and saturates at 1.0
    // (20 stripped characters or 10 homoglyphs alone reach the cap).
    const rawScore = Math.min(
      1.0,
      result.strippedChars / 20 + result.homoglyphsReplaced / 10,
    )

    // A flagged input never reports less than 0.4 confidence.
    const confidence = isSuspicious ? Math.max(0.4, rawScore) : rawScore

    const threatLevel = this.computeThreatLevel(confidence)

    return {
      scannerId: SCANNER_ID,
      scannerType: SCANNER_TYPE,
      detected: isSuspicious,
      confidence,
      threatLevel,
      killChainPhase: isSuspicious ? 'initial_access' : 'none',
      matchedPatterns: result.suspiciousPatterns,
      rawScore,
      latencyMs,
      metadata: {
        strippedChars: result.strippedChars,
        homoglyphsReplaced: result.homoglyphsReplaced,
      },
    }
  }

  /**
   * Map a confidence score to a threat level using `config.thresholds`,
   * checking from the most to the least severe level.
   *
   * @param confidence - Confidence score produced by `scan`
   * @returns The first level whose threshold the score reaches, else 'none'
   */
  private computeThreatLevel(confidence: number): ScanResult['threatLevel'] {
    if (confidence >= this.config.thresholds.critical) return 'critical'
    if (confidence >= this.config.thresholds.high) return 'high'
    if (confidence >= this.config.thresholds.medium) return 'medium'
    if (confidence >= this.config.thresholds.low) return 'low'
    return 'none'
  }
}
|