shieldx/src/core/DefenseEnsemble.ts

/**
 * DefenseEnsemble — ShieldX Phase 3: Ensemble Voting Layer.
 *
 * Three independent voters (Rule-Based, Semantic, Behavioral) evaluate
 * disjoint subsets of ScanResult[], then a weighted-majority aggregation
 * produces the final EnsembleVerdict.
 *
 * Voter weights:
 *   Rule-Based   0.35
 *   Semantic     0.30
 *   Behavioral   0.35
 *
 * Decision logic:
 *   2+ voters 'threat'     → final 'threat'
 *   2+ voters 'suspicious' → final 'suspicious'
 *   otherwise              → final 'clean'
 *   unanimous 'threat'     → confidence boosted +0.1 (capped 1.0)
 *
 * All returned objects are deeply frozen (immutable).
 */

import type { ScanResult, ScannerType, ThreatLevel } from '../types/detection.js'

// ---------------------------------------------------------------------------
// Public interfaces
// ---------------------------------------------------------------------------

/** Vote produced by a single voter */
export interface VoterVerdict {
  readonly voterId: string
  readonly vote: 'clean' | 'suspicious' | 'threat'
  readonly confidence: number
  readonly maxThreatLevel: ThreatLevel
  readonly resultCount: number
  readonly detectedCount: number
}

/** Aggregated verdict from the DefenseEnsemble */
export interface EnsembleVerdict {
  readonly finalVote: 'clean' | 'suspicious' | 'threat'
  readonly finalConfidence: number
  readonly maxThreatLevel: ThreatLevel
  readonly ruleVoter: VoterVerdict
  readonly semanticVoter: VoterVerdict
  readonly behavioralVoter: VoterVerdict
  readonly unanimous: boolean
  readonly evaluatedAt: string
}

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/** Voter weight distribution (must sum to 1.0) */
const WEIGHTS = Object.freeze({
  rule: 0.35,
  semantic: 0.30,
  behavioral: 0.35,
} as const)

/** Confidence boost when all three voters agree on 'threat' */
const UNANIMOUS_BOOST = 0.1

/** Detection ratio thresholds for voter verdicts */
const RATIO_THREAT = 0.5
const RATIO_SUSPICIOUS = 0.2

/** Threat level severity ordering (higher index = more severe) */
const THREAT_SEVERITY: readonly ThreatLevel[] = Object.freeze([
  'none', 'low', 'medium', 'high', 'critical',
])

// ---------------------------------------------------------------------------
// Scanner-to-voter classification
// ---------------------------------------------------------------------------

/** ScannerTypes routed to the RuleBasedVoter */
const RULE_SCANNER_TYPES: ReadonlySet<ScannerType> = new Set<ScannerType>([
  'rule', 'tokenizer', 'entropy', 'unicode',
])

/** ScannerTypes routed to the SemanticVoter */
const SEMANTIC_SCANNER_TYPES: ReadonlySet<ScannerType> = new Set<ScannerType>([
  'embedding', 'sentinel',
])

/** ScannerTypes routed to the BehavioralVoter */
const BEHAVIORAL_SCANNER_TYPES: ReadonlySet<ScannerType> = new Set<ScannerType>([
  'behavioral', 'conversation', 'context_integrity',
  'memory_integrity', 'intent_guard', 'tool_chain',
])

/** ScannerId substrings that override type-based classification */
const RULE_ID_PATTERNS: readonly string[] = Object.freeze([
  'cipher', 'emoji', 'upside', 'unicode', 'entropy',
  'rule', 'indirect', 'resource', 'output-payload',
])

const SEMANTIC_ID_PATTERNS: readonly string[] = Object.freeze([
  'semantic', 'embedding', 'sentinel',
])

const BEHAVIORAL_ID_PATTERNS: readonly string[] = Object.freeze([
  'conversation', 'intent', 'context', 'auth',
  'decomposition', 'tool-call', 'melon',
])

// ---------------------------------------------------------------------------
// Classification helpers
// ---------------------------------------------------------------------------

type VoterCategory = 'rule' | 'semantic' | 'behavioral'

function classifyResult(result: ScanResult): VoterCategory | null {
  const id = result.scannerId.toLowerCase()

  if (RULE_SCANNER_TYPES.has(result.scannerType)) return 'rule'
  if (SEMANTIC_SCANNER_TYPES.has(result.scannerType)) return 'semantic'
  if (BEHAVIORAL_SCANNER_TYPES.has(result.scannerType)) return 'behavioral'

  if (RULE_ID_PATTERNS.some((p) => id.includes(p))) return 'rule'
  if (SEMANTIC_ID_PATTERNS.some((p) => id.includes(p))) return 'semantic'
  if (BEHAVIORAL_ID_PATTERNS.some((p) => id.includes(p))) return 'behavioral'

  return null
}

function partitionResults(
  results: readonly ScanResult[],
): Readonly<Record<VoterCategory, readonly ScanResult[]>> {
  const rule: ScanResult[] = []
  const semantic: ScanResult[] = []
  const behavioral: ScanResult[] = []

  for (const result of results) {
    const category = classifyResult(result)
    if (category === 'rule') rule.push(result)
    else if (category === 'semantic') semantic.push(result)
    else if (category === 'behavioral') behavioral.push(result)
    // Unclassified results are intentionally dropped — each voter
    // only sees results from its domain.
  }

  return Object.freeze({
    rule: Object.freeze(rule),
    semantic: Object.freeze(semantic),
    behavioral: Object.freeze(behavioral),
  })
}

// ---------------------------------------------------------------------------
// Threat level helpers
// ---------------------------------------------------------------------------

function threatSeverityIndex(level: ThreatLevel): number {
  const idx = THREAT_SEVERITY.indexOf(level)
  return idx >= 0 ? idx : 0
}

function highestThreatLevel(results: readonly ScanResult[]): ThreatLevel {
  let maxIdx = 0
  for (const r of results) {
    const idx = threatSeverityIndex(r.threatLevel)
    if (idx > maxIdx) maxIdx = idx
  }
  return THREAT_SEVERITY[maxIdx] ?? 'none'
}

// ---------------------------------------------------------------------------
// Individual voter evaluation
// ---------------------------------------------------------------------------

function evaluateVoter(
  voterId: string,
  results: readonly ScanResult[],
): VoterVerdict {
  if (results.length === 0) {
    return Object.freeze({
      voterId,
      vote: 'clean' as const,
      confidence: 0,
      maxThreatLevel: 'none' as const,
      resultCount: 0,
      detectedCount: 0,
    })
  }

  const detectedResults = results.filter((r) => r.detected)
  const detectedCount = detectedResults.length
  const detectedRatio = detectedCount / results.length

  const avgConfidence = detectedCount > 0
    ? detectedResults.reduce((sum, r) => sum + r.confidence, 0) / detectedCount
    : 0

  const maxThreat = highestThreatLevel(results)
  const hasHighOrCritical = results.some(
    (r) => r.threatLevel === 'high' || r.threatLevel === 'critical',
  )

  let vote: VoterVerdict['vote']
  if (detectedRatio >= RATIO_THREAT) {
    vote = 'threat'
  } else if (detectedRatio >= RATIO_SUSPICIOUS || hasHighOrCritical) {
    vote = 'suspicious'
  } else {
    vote = 'clean'
  }

  return Object.freeze({
    voterId,
    vote,
    confidence: Math.round(avgConfidence * 1000) / 1000,
    maxThreatLevel: maxThreat,
    resultCount: results.length,
    detectedCount,
  })
}

// ---------------------------------------------------------------------------
// Ensemble aggregation
// ---------------------------------------------------------------------------

type VoteLevel = 'clean' | 'suspicious' | 'threat'

const VOTE_SEVERITY: Readonly<Record<VoteLevel, number>> = Object.freeze({
  clean: 0,
  suspicious: 1,
  threat: 2,
})

function aggregateVotes(
  ruleVoter: VoterVerdict,
  semanticVoter: VoterVerdict,
  behavioralVoter: VoterVerdict,
): { readonly finalVote: VoteLevel; readonly finalConfidence: number; readonly unanimous: boolean } {
  const votes: readonly VoterVerdict[] = [ruleVoter, semanticVoter, behavioralVoter]

  const threatCount = votes.filter((v) => v.vote === 'threat').length
  const suspiciousOrHigherCount = votes.filter(
    (v) => VOTE_SEVERITY[v.vote] >= VOTE_SEVERITY['suspicious'],
  ).length

  let finalVote: VoteLevel
  if (threatCount >= 2) {
    finalVote = 'threat'
  } else if (suspiciousOrHigherCount >= 2) {
    finalVote = 'suspicious'
  } else {
    finalVote = 'clean'
  }

  const weightedConfidence =
    ruleVoter.confidence * WEIGHTS.rule +
    semanticVoter.confidence * WEIGHTS.semantic +
    behavioralVoter.confidence * WEIGHTS.behavioral

  const unanimous = threatCount === 3
  const boostedConfidence = unanimous
    ? Math.min(weightedConfidence + UNANIMOUS_BOOST, 1.0)
    : weightedConfidence

  const finalConfidence = Math.round(boostedConfidence * 1000) / 1000

  return Object.freeze({ finalVote, finalConfidence, unanimous })
}

// ---------------------------------------------------------------------------
// DefenseEnsemble
// ---------------------------------------------------------------------------

/**
 * Defense Ensemble — weighted majority voting across three independent voters.
 *
 * Classifies each ScanResult by scanner type/id, feeds subsets to the
 * Rule-Based, Semantic, and Behavioral voters, then aggregates their
 * verdicts into a final EnsembleVerdict.
 *
 * Stateless: no mutable fields, every call to evaluate() is independent.
 *
 * @example
 * ```typescript
 * const ensemble = new DefenseEnsemble()
 * const verdict = ensemble.evaluate(scanResults)
 * if (verdict.finalVote === 'threat') blockRequest()
 * ```
 */
export class DefenseEnsemble {
  /**
   * Evaluate a set of ScanResults and produce an ensemble verdict.
   *
   * @param results - Array of ScanResult from the ShieldX pipeline scanners
   * @returns Frozen EnsembleVerdict with individual voter verdicts + final decision
   */
  evaluate(results: readonly ScanResult[]): EnsembleVerdict {
    const partitions = partitionResults(results)

    const ruleVoter = evaluateVoter('rule-based-voter', partitions.rule)
    const semanticVoter = evaluateVoter('semantic-voter', partitions.semantic)
    const behavioralVoter = evaluateVoter('behavioral-voter', partitions.behavioral)

    const { finalVote, finalConfidence, unanimous } = aggregateVotes(
      ruleVoter,
      semanticVoter,
      behavioralVoter,
    )

    const allResults = [
      ...partitions.rule,
      ...partitions.semantic,
      ...partitions.behavioral,
    ]
    const maxThreatLevel = allResults.length > 0
      ? highestThreatLevel(allResults)
      : 'none' as ThreatLevel

    return Object.freeze({
      finalVote,
      finalConfidence,
      maxThreatLevel,
      ruleVoter,
      semanticVoter,
      behavioralVoter,
      unanimous,
      evaluatedAt: new Date().toISOString(),
    })
  }
}