shieldx/tests/unit/preprocessing/UnicodeNormalizer.test.ts
Rene Fichtmueller 1c4c034483 feat: ShieldX v0.3.0 — UnicodeScanner (L5), DNS Covert Channel rules, ATLAS v5.4 mappings
- Layer 4 EntropyScanner: Shannon entropy, Base32/Base64 detection, CVE-2025-55284
  ping/nslookup exfil, EchoLeak markdown pattern, DNS tunneling (iodine/dnscat)
- Layer 5 UnicodeScanner: ASCII Smuggling (U+E0000 Tags Block), Variant Selectors,
  Zero-Width steganography, CamoLeak image-ordering (CVE-2025-53773), homoglyphs,
  BiDi override, high-entropy URL params
- 30 DNS covert channel rules (dns-001 to dns-030)
- ATLASMapper: 29 techniques (ATLAS v5.4.0 Feb 2026), added AML.T0062 (Agent Tool
  Invocation), AML.TA0015 (C2 tactic), memory poisoning, multi-agent trust,
  CamoLeak, Unicode steganography mappings
- Rule count: 72 → 102
- Build: tsup 316ms, zero TypeScript errors
2026-03-31 16:32:16 +02:00

248 lines
9.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { describe, it, expect, beforeEach } from 'vitest'
import { UnicodeNormalizer } from '../../../src/preprocessing/UnicodeNormalizer.js'
import { defaultConfig } from '../../../src/core/config.js'
describe('UnicodeNormalizer', () => {
// Instance under test, shared by every case in this suite.
let normalizer: UnicodeNormalizer
// Rebuild the normalizer from defaultConfig before each test so that every
// case starts from a newly constructed instance.
beforeEach(() => {
normalizer = new UnicodeNormalizer(defaultConfig)
})
describe('normalize()', () => {
describe('zero-width character stripping', () => {
  // Each case hides one zero-width code point inside plain ASCII and checks
  // that normalize() returns the text without it.
  it('should strip zero-width space (U+200B)', () => {
    const { normalized, strippedChars } = normalizer.normalize('hello\u200Bworld')

    expect(normalized).toBe('helloworld')
    expect(strippedChars).toBeGreaterThan(0)
  })

  it('should strip zero-width non-joiner (U+200C)', () => {
    const { normalized } = normalizer.normalize('test\u200Cinput')

    expect(normalized).toBe('testinput')
  })

  it('should strip zero-width joiner (U+200D)', () => {
    const { normalized } = normalizer.normalize('a\u200Db')

    expect(normalized).toBe('ab')
  })

  it('should strip BOM / zero-width no-break space (U+FEFF)', () => {
    const { normalized } = normalizer.normalize('\uFEFFhello')

    expect(normalized).toBe('hello')
  })

  // The pattern label must be reported when zero-width characters are present.
  it('should report zero_width_characters in suspicious patterns', () => {
    const { suspiciousPatterns } = normalizer.normalize('ig\u200Bnore prev\u200Cious')

    expect(suspiciousPatterns).toContain('zero_width_characters')
  })
})
describe('Unicode tag character stripping', () => {
  // Tag-block code points are written with \u{...} escapes because they lie
  // outside the range a four-digit \uXXXX escape can express.
  it('should strip Unicode tag characters (U+E0000 range)', () => {
    const tagged = 'hello\u{E0001}\u{E0069}\u{E0067}\u{E006E}world'
    const { normalized, strippedChars } = normalizer.normalize(tagged)

    expect(normalized).toBe('helloworld')
    // Four tag code points were embedded in the input above.
    expect(strippedChars).toBe(4)
  })

  it('should report unicode_tag_characters in suspicious patterns', () => {
    const { suspiciousPatterns } = normalizer.normalize('test\u{E0020}\u{E0041}data')

    expect(suspiciousPatterns).toContain('unicode_tag_characters')
  })
})
describe('bidi override removal', () => {
  // Directional override controls: one of each is placed in the input.
  it('should strip LRO (U+202D) and RLO (U+202E)', () => {
    const { normalized, strippedChars } = normalizer.normalize('normal\u202Dtext\u202E')

    expect(normalized).toBe('normaltext')
    expect(strippedChars).toBe(2)
  })

  // Seven embedding/isolate controls prefixed to the word "text".
  it('should strip LRE, RLE, PDF, LRI, RLI, FSI, PDI', () => {
    const controls = '\u202A\u202B\u202C\u2066\u2067\u2068\u2069text'
    const { normalized, strippedChars } = normalizer.normalize(controls)

    expect(normalized).toBe('text')
    expect(strippedChars).toBe(7)
  })

  it('should report bidi_override_characters in suspicious patterns', () => {
    const { suspiciousPatterns } = normalizer.normalize('\u202Ehello')

    expect(suspiciousPatterns).toContain('bidi_override_characters')
  })
})
describe('homoglyph normalization', () => {
  // Inputs use escapes for the look-alike letters so the non-Latin code
  // points are visible in the source rather than hidden behind similar glyphs.
  it('should normalize Cyrillic а (U+0430) to Latin a', () => {
    const { normalized, homoglyphsReplaced } = normalizer.normalize('\u0430dmin')

    expect(normalized).toBe('admin')
    expect(homoglyphsReplaced).toBe(1)
  })

  it('should normalize Cyrillic о (U+043E) to Latin o', () => {
    const { normalized } = normalizer.normalize('hell\u043E')

    expect(normalized).toBe('hello')
  })

  // All four letters of the input are Cyrillic look-alikes.
  it('should normalize multiple Cyrillic homoglyphs', () => {
    const { normalized, homoglyphsReplaced } = normalizer.normalize('\u0441\u043E\u0440\u0435')

    expect(normalized).toBe('cope')
    expect(homoglyphsReplaced).toBe(4)
  })

  it('should normalize Greek homoglyphs (Α → A, ο → o)', () => {
    const { normalized, homoglyphsReplaced } = normalizer.normalize('\u0391\u03BFtest')

    expect(normalized).toBe('Aotest')
    expect(homoglyphsReplaced).toBe(2)
  })

  it('should normalize fullwidth Latin characters', () => {
    const { normalized, homoglyphsReplaced } = normalizer.normalize('\uFF41\uFF42\uFF43')

    expect(normalized).toBe('abc')
    expect(homoglyphsReplaced).toBe(3)
  })

  it('should report homoglyph_substitution in suspicious patterns', () => {
    const { suspiciousPatterns } = normalizer.normalize('\u0430\u0435\u043E\u0440')

    expect(suspiciousPatterns).toContain('homoglyph_substitution')
  })
})
describe('normal text passthrough', () => {
  // Benign input must come back untouched, with zeroed counters and no
  // suspicious-pattern labels.
  it('should pass through normal ASCII text unchanged', () => {
    const plain = 'Hello, this is a normal message.'
    const outcome = normalizer.normalize(plain)

    expect(outcome.normalized).toBe(plain)
    expect(outcome.strippedChars).toBe(0)
    expect(outcome.homoglyphsReplaced).toBe(0)
    expect(outcome.suspiciousPatterns).toHaveLength(0)
  })

  // The input contains a newline and a tab; both are expected to be kept.
  it('should preserve tabs and newlines', () => {
    const multiline = 'line one\nline two\ttab'
    const { normalized, strippedChars } = normalizer.normalize(multiline)

    expect(normalized).toBe(multiline)
    expect(strippedChars).toBe(0)
  })

  it('should preserve normal Unicode text (Chinese)', () => {
    const { normalized, strippedChars } = normalizer.normalize('你好世界')

    expect(normalized).toBe('你好世界')
    expect(strippedChars).toBe(0)
  })

  it('should preserve normal Unicode text (Arabic)', () => {
    const { normalized, strippedChars } = normalizer.normalize('مرحبا بالعالم')

    expect(normalized).toBe('مرحبا بالعالم')
    expect(strippedChars).toBe(0)
  })

  it('should preserve emoji', () => {
    const greeting = 'Hello 👋 World 🌍'
    const { normalized } = normalizer.normalize(greeting)

    expect(normalized).toBe(greeting)
  })
})
describe('variation selector stripping', () => {
  // U+FE0F is placed between two ASCII words; the test expects it removed
  // and the matching pattern label reported.
  it('should strip variation selectors (FE00-FE0F)', () => {
    const { normalized, suspiciousPatterns } = normalizer.normalize('text\uFE0Fmore')

    expect(normalized).toBe('textmore')
    expect(suspiciousPatterns).toContain('variation_selectors')
  })
})
describe('invisible formatting stripping', () => {
  // Both cases insert a single invisible formatting character mid-word.
  it('should strip soft hyphen (U+00AD)', () => {
    const { normalized } = normalizer.normalize('in\u00ADvisible')

    expect(normalized).toBe('invisible')
  })

  it('should strip word joiner (U+2060)', () => {
    const { normalized } = normalizer.normalize('hello\u2060world')

    expect(normalized).toBe('helloworld')
  })
})
describe('control character stripping', () => {
  // Three C0 controls (0x00, 0x01, 0x02) sit between the two words.
  it('should strip null bytes and other C0 controls', () => {
    const { normalized, suspiciousPatterns } = normalizer.normalize('hello\x00\x01\x02world')

    expect(normalized).toBe('helloworld')
    expect(suspiciousPatterns).toContain('control_characters')
  })

  // \x80 and \x90 are code points U+0080 and U+0090, inside the C1 range.
  it('should strip C1 control characters (U+0080-009F)', () => {
    const { normalized } = normalizer.normalize('test\x80\x90data')

    expect(normalized).toBe('testdata')
  })
})
})
describe('scan()', () => {
  // Metadata fields every scan result is expected to carry.
  it('should return ScanResult with correct scanner metadata', () => {
    const { scannerId, scannerType, latencyMs } = normalizer.scan('clean text')

    expect(scannerId).toBe('unicode-normalizer')
    expect(scannerType).toBe('unicode')
    expect(latencyMs).toBeGreaterThanOrEqual(0)
  })

  it('should not detect clean text as suspicious', () => {
    const { detected, killChainPhase } = normalizer.scan('This is perfectly normal text.')

    expect(detected).toBe(false)
    expect(killChainPhase).toBe('none')
  })

  // Six zero-width code points: one more than the limit named in the title.
  it('should detect text with many stripped chars (>5) as suspicious', () => {
    const sixHidden = '\u200B\u200C\u200D\uFEFF\u200B\u200C extra zero widths'
    const { detected, killChainPhase } = normalizer.scan(sixHidden)

    expect(detected).toBe(true)
    expect(killChainPhase).toBe('initial_access')
  })

  // Four Cyrillic look-alikes: one more than the limit named in the title.
  it('should detect text with many homoglyphs (>3) as suspicious', () => {
    const fourLookalikes = '\u0430\u0435\u043E\u0440 looks like aeop'
    const { detected, confidence } = normalizer.scan(fourLookalikes)

    expect(detected).toBe(true)
    expect(confidence).toBeGreaterThanOrEqual(0.4)
  })

  // Boundary case: exactly five stripped characters must not trigger detection.
  it('should not flag text with exactly 5 stripped chars as suspicious', () => {
    const fiveHidden = '\u200B\u200C\u200D\uFEFF\u200B text'
    const { detected } = normalizer.scan(fiveHidden)

    expect(detected).toBe(false)
  })

  it('should include confidence score', () => {
    const { confidence } = normalizer.scan('\u200B'.repeat(20) + 'test')

    expect(confidence).toBeGreaterThan(0)
    expect(confidence).toBeLessThanOrEqual(1.0)
  })

  // Only checks that the reported level is one of the four listed values.
  it('should compute threat level based on confidence', () => {
    const mixedPayload = '\u200B'.repeat(20) + '\u0430'.repeat(10) + 'test'
    const { threatLevel } = normalizer.scan(mixedPayload)

    expect(['low', 'medium', 'high', 'critical']).toContain(threatLevel)
  })
})
})