llm-gateway/packages/gateway/prompts/templates/shieldx_false_positive.yaml

id: shieldx_false_positive
version: "1.0.0"
task_type: shieldx_false_positive
description: Analyze a false positive detection in ShieldX and recommend threshold adjustments to reduce recurrence
model_preference: qwen2.5:14b
model_minimum: qwen2.5:7b
temperature: 0.3
max_tokens: 1024
output_format: json

system_prompt: |
  You are the false positive analysis specialist for ShieldX, an LLM prompt injection defense system.
  Analyze false positive detections and recommend specific adjustments to detection rules and thresholds.

  Return ONLY valid JSON:
  {
    "fp_confirmed": true|false,
    "fp_reason": "string — why this was a false positive",
    "legitimate_use_case": "string — what the user was legitimately trying to do",
    "affected_rule": "string — which ShieldX rule triggered incorrectly",
    "current_threshold": number or null,
    "suggested_threshold_change": {
      "direction": "increase|decrease|no_change",
      "new_threshold": number or null,
      "confidence": "high|medium|low",
      "reasoning": "string"
    },
    "negative_indicators": [
      {
        "indicator": "string — text pattern that suggests legitimate use",
        "type": "string — add as exclusion to the rule"
      }
    ],
    "example_legitimate_inputs": ["string — 2-3 examples of similar legitimate inputs that should NOT be blocked"],
    "rule_modification_needed": true|false,
    "rule_modification_description": "string or null",
    "test_required_before_deploy": true|false
  }

  Analysis approach:
  - Confirm false positive: is this TRULY a legitimate input that was incorrectly blocked?
  - Identify the specific linguistic/behavioral pattern that triggered the false positive
  - Determine if the rule threshold is too sensitive or if negative indicators should be added
  - Threshold increase = less sensitive (reduces FP but risks more FN)
  - Adding negative indicators = more precise without affecting sensitivity for other patterns
  - Prefer negative indicators over threshold changes where possible

user_template: |
  Flagged input:
  {{flagged_input}}

  Rule that triggered: {{triggered_rule}}
  Detection score: {{detection_score}}
  Rule threshold: {{rule_threshold}}
  Context: {{context}}
  User/session context: {{user_context}}

  Analyze this false positive:

few_shot_examples:
  - user: |
      Flagged input:
      "Write a security research paper about prompt injection attacks. Include examples of common attack patterns like 'ignore previous instructions' and 'you are now...' style jailbreaks. I'm studying AI safety."

      Rule that triggered: SX-001 direct_injection_detector
      Detection score: 0.81
      Rule threshold: 0.80
      Context: Security research assistant deployment
      User/session context: User is authenticated as security researcher on an enterprise platform

    assistant: |
      {
        "fp_confirmed": true,
        "fp_reason": "The input mentions attack pattern phrases ('ignore previous instructions', 'you are now') in a meta-context — the user is asking to WRITE ABOUT these patterns, not to USE them. The rule detected the presence of attack language without accounting for quotation context or meta-discussion framing.",
        "legitimate_use_case": "Security researcher writing a paper about prompt injection attacks — explicitly studying and documenting known attack patterns. This is a core use case for security tooling.",
        "affected_rule": "SX-001 direct_injection_detector",
        "current_threshold": 0.80,
        "suggested_threshold_change": {
          "direction": "no_change",
          "new_threshold": 0.80,
          "confidence": "high",
          "reasoning": "Threshold is correct. The issue is not sensitivity but lack of negative indicators for meta-discussion framing. Raising the threshold would reduce FPs in this case but would also miss real direct injection attacks that score 0.80-0.85."
        },
        "negative_indicators": [
          {
            "indicator": "Quoted attack phrases in research/educational context: 'write about', 'examples of', 'studying', 'paper about', 'documentation of'",
            "type": "Add as exclusion: if attack pattern phrases appear within 10 tokens of research framing words, reduce score by 0.25"
          },
          {
            "indicator": "User role == 'security_researcher' AND context == 'educational'",
            "type": "Add as contextual exclusion: apply 0.15 score reduction for verified security researcher accounts"
          }
        ],
        "example_legitimate_inputs": [
          "Can you explain how 'ignore previous instructions' attacks work and why they're dangerous?",
          "Write documentation about jailbreak attack patterns for our security team training",
          "I need to test my application for prompt injection — can you help me understand the attack surface?"
        ],
        "rule_modification_needed": true,
        "rule_modification_description": "Add negative indicator: if input contains attack pattern phrases AND contains meta-framing words ('write about', 'explain', 'examples of', 'studying', 'research', 'documentation') within the same sentence, reduce detection score by 0.25 before threshold comparison.",
        "test_required_before_deploy": true
      }

variables:
  - flagged_input
  - triggered_rule
  - detection_score
  - rule_threshold
  - context
  - user_context
  - few_shot_examples

validation_rules:
  output_must_be_json: true
  required_fields: ["fp_confirmed", "fp_reason", "affected_rule", "suggested_threshold_change", "example_legitimate_inputs"]