- Layer 4 EntropyScanner: Shannon entropy, Base32/Base64 detection, CVE-2025-55284 ping/nslookup exfil, EchoLeak markdown pattern, DNS tunneling (iodine/dnscat) - Layer 5 UnicodeScanner: ASCII Smuggling (U+E0000 Tags Block), Variation Selectors, Zero-Width steganography, CamoLeak image-ordering (CVE-2025-53773), homoglyphs, BiDi override, high-entropy URL params - 30 DNS covert channel rules (dns-001 to dns-030) - ATLASMapper: 29 techniques (ATLAS v5.4.0 Feb 2026), added AML.T0062 (Agent Tool Invocation), AML.TA0015 (C2 tactic), memory poisoning, multi-agent trust, CamoLeak, Unicode steganography mappings - Rule count: 72 → 102 - Build: tsup 316ms, zero TypeScript errors
248 lines
9.3 KiB
TypeScript
248 lines
9.3 KiB
TypeScript
import { describe, it, expect, beforeEach } from 'vitest'
|
||
import { UnicodeNormalizer } from '../../../src/preprocessing/UnicodeNormalizer.js'
|
||
import { defaultConfig } from '../../../src/core/config.js'
|
||
|
||
describe('UnicodeNormalizer', () => {
|
||
// Shared fixture for the suite; reassigned before every test case so each
// one runs against a freshly constructed instance built from defaultConfig.
let normalizer: UnicodeNormalizer

beforeEach(() => {
  normalizer = new UnicodeNormalizer(defaultConfig)
})
|
||
|
||
describe('normalize()', () => {
|
||
// Cases covering removal of zero-width code points (U+200B, U+200C, U+200D,
// U+FEFF) by normalize() and the matching suspicious-pattern label.
describe('zero-width character stripping', () => {
  it('should strip zero-width space (U+200B)', () => {
    const { normalized, strippedChars } = normalizer.normalize('hello\u200Bworld')

    expect(normalized).toBe('helloworld')
    expect(strippedChars).toBeGreaterThan(0)
  })

  it('should strip zero-width non-joiner (U+200C)', () => {
    const { normalized } = normalizer.normalize('test\u200Cinput')

    expect(normalized).toBe('testinput')
  })

  it('should strip zero-width joiner (U+200D)', () => {
    const { normalized } = normalizer.normalize('a\u200Db')

    expect(normalized).toBe('ab')
  })

  it('should strip BOM / zero-width no-break space (U+FEFF)', () => {
    const { normalized } = normalizer.normalize('\uFEFFhello')

    expect(normalized).toBe('hello')
  })

  it('should report zero_width_characters in suspicious patterns', () => {
    const { suspiciousPatterns } = normalizer.normalize('ig\u200Bnore prev\u200Cious')

    expect(suspiciousPatterns).toContain('zero_width_characters')
  })
})
|
||
|
||
// Cases covering code points in the U+E0000 range: they must be removed from
// the output and reported under the unicode_tag_characters label.
describe('Unicode tag character stripping', () => {
  it('should strip Unicode tag characters (U+E0000 range)', () => {
    const { normalized, strippedChars } = normalizer.normalize(
      'hello\u{E0001}\u{E0069}\u{E0067}\u{E006E}world',
    )

    expect(normalized).toBe('helloworld')
    // Four tag code points are embedded in the input above.
    expect(strippedChars).toBe(4)
  })

  it('should report unicode_tag_characters in suspicious patterns', () => {
    const { suspiciousPatterns } = normalizer.normalize('test\u{E0020}\u{E0041}data')

    expect(suspiciousPatterns).toContain('unicode_tag_characters')
  })
})
|
||
|
||
// Cases covering the bidirectional control code points U+202A-U+202E and
// U+2066-U+2069: each one must be stripped, counted, and reported.
describe('bidi override removal', () => {
  it('should strip LRO (U+202D) and RLO (U+202E)', () => {
    const { normalized, strippedChars } = normalizer.normalize('normal\u202Dtext\u202E')

    expect(normalized).toBe('normaltext')
    expect(strippedChars).toBe(2)
  })

  it('should strip LRE, RLE, PDF, LRI, RLI, FSI, PDI', () => {
    const { normalized, strippedChars } = normalizer.normalize(
      '\u202A\u202B\u202C\u2066\u2067\u2068\u2069text',
    )

    expect(normalized).toBe('text')
    // One count per control code point prefixed to "text".
    expect(strippedChars).toBe(7)
  })

  it('should report bidi_override_characters in suspicious patterns', () => {
    const { suspiciousPatterns } = normalizer.normalize('\u202Ehello')

    expect(suspiciousPatterns).toContain('bidi_override_characters')
  })
})
|
||
|
||
// Cases covering look-alike letters (Cyrillic, Greek, fullwidth Latin): the
// normalized output must use the Latin letter and the replacement count must
// match the number of look-alikes in the input.
describe('homoglyph normalization', () => {
  it('should normalize Cyrillic а (U+0430) to Latin a', () => {
    const { normalized, homoglyphsReplaced } = normalizer.normalize('\u0430dmin')

    expect(normalized).toBe('admin')
    expect(homoglyphsReplaced).toBe(1)
  })

  it('should normalize Cyrillic о (U+043E) to Latin o', () => {
    const { normalized } = normalizer.normalize('hell\u043E')

    expect(normalized).toBe('hello')
  })

  it('should normalize multiple Cyrillic homoglyphs', () => {
    const { normalized, homoglyphsReplaced } = normalizer.normalize('\u0441\u043E\u0440\u0435')

    expect(normalized).toBe('cope')
    expect(homoglyphsReplaced).toBe(4)
  })

  it('should normalize Greek homoglyphs (Α → A, ο → o)', () => {
    const { normalized, homoglyphsReplaced } = normalizer.normalize('\u0391\u03BFtest')

    expect(normalized).toBe('Aotest')
    expect(homoglyphsReplaced).toBe(2)
  })

  it('should normalize fullwidth Latin characters', () => {
    const { normalized, homoglyphsReplaced } = normalizer.normalize('\uFF41\uFF42\uFF43')

    expect(normalized).toBe('abc')
    expect(homoglyphsReplaced).toBe(3)
  })

  it('should report homoglyph_substitution in suspicious patterns', () => {
    const { suspiciousPatterns } = normalizer.normalize('\u0430\u0435\u043E\u0440')

    expect(suspiciousPatterns).toContain('homoglyph_substitution')
  })
})
|
||
|
||
// Negative cases: ordinary text (ASCII, whitespace, Chinese, Arabic, emoji)
// must come back from normalize() unmodified.
describe('normal text passthrough', () => {
  it('should pass through normal ASCII text unchanged', () => {
    const text = 'Hello, this is a normal message.'
    const { normalized, strippedChars, homoglyphsReplaced, suspiciousPatterns } =
      normalizer.normalize(text)

    expect(normalized).toBe(text)
    expect(strippedChars).toBe(0)
    expect(homoglyphsReplaced).toBe(0)
    expect(suspiciousPatterns).toHaveLength(0)
  })

  it('should preserve tabs and newlines', () => {
    const text = 'line one\nline two\ttab'
    const { normalized, strippedChars } = normalizer.normalize(text)

    expect(normalized).toBe(text)
    expect(strippedChars).toBe(0)
  })

  it('should preserve normal Unicode text (Chinese)', () => {
    const { normalized, strippedChars } = normalizer.normalize('你好世界')

    expect(normalized).toBe('你好世界')
    expect(strippedChars).toBe(0)
  })

  it('should preserve normal Unicode text (Arabic)', () => {
    const { normalized, strippedChars } = normalizer.normalize('مرحبا بالعالم')

    expect(normalized).toBe('مرحبا بالعالم')
    expect(strippedChars).toBe(0)
  })

  it('should preserve emoji', () => {
    const text = 'Hello 👋 World 🌍'
    const { normalized } = normalizer.normalize(text)

    expect(normalized).toBe(text)
  })
})
|
||
|
||
// A variation selector (U+FE0F here) must be removed from the output and
// reported under the variation_selectors label.
describe('variation selector stripping', () => {
  it('should strip variation selectors (FE00-FE0F)', () => {
    const { normalized, suspiciousPatterns } = normalizer.normalize('text\uFE0Fmore')

    expect(normalized).toBe('textmore')
    expect(suspiciousPatterns).toContain('variation_selectors')
  })
})
|
||
|
||
// Cases covering the soft hyphen (U+00AD) and word joiner (U+2060): both must
// be dropped from the normalized output.
describe('invisible formatting stripping', () => {
  it('should strip soft hyphen (U+00AD)', () => {
    const { normalized } = normalizer.normalize('in\u00ADvisible')

    expect(normalized).toBe('invisible')
  })

  it('should strip word joiner (U+2060)', () => {
    const { normalized } = normalizer.normalize('hello\u2060world')

    expect(normalized).toBe('helloworld')
  })
})
|
||
|
||
// Cases covering C0 (\x00-\x02 here) and C1 (\x80, \x90 here) control bytes:
// they must be stripped, and C0 controls reported as control_characters.
describe('control character stripping', () => {
  it('should strip null bytes and other C0 controls', () => {
    const { normalized, suspiciousPatterns } = normalizer.normalize('hello\x00\x01\x02world')

    expect(normalized).toBe('helloworld')
    expect(suspiciousPatterns).toContain('control_characters')
  })

  it('should strip C1 control characters (U+0080-009F)', () => {
    const { normalized } = normalizer.normalize('test\x80\x90data')

    expect(normalized).toBe('testdata')
  })
})
|
||
})
|
||
|
||
// Cases for scan(): result metadata, the detection thresholds on stripped
// characters and homoglyphs, and the bounds of the reported confidence.
describe('scan()', () => {
  it('should return ScanResult with correct scanner metadata', () => {
    const { scannerId, scannerType, latencyMs } = normalizer.scan('clean text')

    expect(scannerId).toBe('unicode-normalizer')
    expect(scannerType).toBe('unicode')
    expect(latencyMs).toBeGreaterThanOrEqual(0)
  })

  it('should not detect clean text as suspicious', () => {
    const { detected, killChainPhase } = normalizer.scan('This is perfectly normal text.')

    expect(detected).toBe(false)
    expect(killChainPhase).toBe('none')
  })

  it('should detect text with many stripped chars (>5) as suspicious', () => {
    // Six zero-width code points: one above the threshold named in the title.
    const payload = '\u200B\u200C\u200D\uFEFF\u200B\u200C extra zero widths'
    const { detected, killChainPhase } = normalizer.scan(payload)

    expect(detected).toBe(true)
    expect(killChainPhase).toBe('initial_access')
  })

  it('should detect text with many homoglyphs (>3) as suspicious', () => {
    // Four Cyrillic look-alikes: one above the threshold named in the title.
    const payload = '\u0430\u0435\u043E\u0440 looks like aeop'
    const { detected, confidence } = normalizer.scan(payload)

    expect(detected).toBe(true)
    expect(confidence).toBeGreaterThanOrEqual(0.4)
  })

  it('should not flag text with exactly 5 stripped chars as suspicious', () => {
    // Boundary case: exactly five zero-width code points must not trigger.
    const payload = '\u200B\u200C\u200D\uFEFF\u200B text'
    const { detected } = normalizer.scan(payload)

    expect(detected).toBe(false)
  })

  it('should include confidence score', () => {
    const payload = '\u200B'.repeat(20) + 'test'
    const { confidence } = normalizer.scan(payload)

    expect(confidence).toBeGreaterThan(0)
    expect(confidence).toBeLessThanOrEqual(1.0)
  })

  it('should compute threat level based on confidence', () => {
    const payload = '\u200B'.repeat(20) + '\u0430'.repeat(10) + 'test'
    const { threatLevel } = normalizer.scan(payload)

    expect(['low', 'medium', 'high', 'critical']).toContain(threatLevel)
  })
})
|
||
})
|