zeph_common/
patterns.rs

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Shared injection-detection patterns for the security sanitization layers.
//!
//! This module is the single source of truth for prompt-injection detection patterns
//! used by both `zeph-mcp` (MCP tool definition sanitization) and `zeph-core`
//! (content isolation pipeline). Each consumer compiles its own `Regex` instances
//! from [`RAW_INJECTION_PATTERNS`] at startup via `LazyLock`.
//!
//! # Known limitations
//!
//! The patterns cover common English-language prompt-injection techniques. Known evasion
//! vectors include: non-English injections, semantic rephrasing, encoded payloads in
//! markdown code blocks, multi-line splitting (regex `.` does not match `\n` by default),
//! and homoglyph substitution. [`strip_format_chars`] mitigates Unicode Cf-category bypass
//! but does not handle homoglyphs. This scanner is **advisory and defense-in-depth only**,
//! not a security boundary. The trust gate (tool blocking via `TrustGateExecutor`) is the
//! primary enforcement mechanism.
20
/// Raw `(name, regex pattern)` pairs for prompt-injection detection in untrusted *input*.
///
/// Covers common English-language techniques from OWASP LLM Top 10, Unicode bypass
/// vectors (handled upstream by [`strip_format_chars`]), exfiltration channels
/// (markdown/HTML images), and delimiter-escape attempts against Zeph's own wrapper tags.
///
/// Each entry is `(stable identifier, regex source)`. Every pattern starts with the inline
/// `(?i)` flag, so matching is case-insensitive without any builder configuration.
///
/// Both `zeph-mcp` and `zeph-core::sanitizer` compile their own `regex::Regex` instances
/// from this slice. Do not export a compiled `LazyLock` — let each consumer own its state.
pub const RAW_INJECTION_PATTERNS: &[(&str, &str)] = &[
    (
        "ignore_instructions",
        // The quantifier (`all`/`any`/`your`) and the recency qualifier
        // (`previous`/`prior`) are separate optional groups so that stacked forms such
        // as "ignore all previous instructions" match, not only single-qualifier ones.
        // This mirrors the shape of `ignore_instructions_response`.
        r"(?i)ignore\s+(all\s+|any\s+|your\s+)?(previous\s+|prior\s+)?instructions",
    ),
    ("role_override", r"(?i)you\s+are\s+now"),
    (
        "new_directive",
        r"(?i)new\s+(instructions?|directives?)\s*:",
    ),
    ("developer_mode", r"(?i)developer\s+mode"),
    (
        "system_prompt_leak",
        // Two alternatives: an imperative verb followed by "system prompt", or a
        // "what is/are/was ... system prompt" question.
        r"(?i)((reveal|show|print|output|display|repeat|expose|dump|leak|copy|give)\s+(me\s+)?(your\s+|the\s+|my\s+)?(full\s+|entire\s+|exact\s+|complete\s+)?system\s+prompt|what\s+(is|are|was)\s+(your\s+|the\s+)?system\s+prompt)",
    ),
    (
        "reveal_instructions",
        r"(?i)(reveal|show|display|print)\s+your\s+(instructions?|prompts?|rules?)",
    ),
    // NOTE(review): because of `(?i)`, `DAN` also matches the given name "Dan".
    ("jailbreak", r"(?i)\b(DAN|jailbreak)\b"),
    ("base64_payload", r"(?i)(decode|eval|execute).*base64"),
    (
        "xml_tag_injection",
        r"(?i)</?\s*(system|assistant|user|tool_result|function_call)\s*>",
    ),
    ("markdown_image_exfil", r"(?i)!\[.*?\]\(https?://[^)]+\)"),
    ("forget_everything", r"(?i)forget\s+(everything|all)"),
    (
        "disregard_instructions",
        r"(?i)disregard\s+(your|all|previous)",
    ),
    (
        "override_directives",
        r"(?i)override\s+(your|all)\s+(directives?|instructions?|rules?)",
    ),
    ("act_as_if", r"(?i)act\s+as\s+if"),
    ("html_image_exfil", r"(?i)<img\s+[^>]*src\s*="),
    // Opening or closing forms of Zeph's own wrapper tags appearing inside content.
    ("delimiter_escape_tool_output", r"(?i)</?tool-output[\s>]"),
    (
        "delimiter_escape_external_data",
        r"(?i)</?external-data[\s>]",
    ),
];
72
/// Raw `(name, regex pattern)` pairs applied to LLM *output* by the response
/// verification layer.
///
/// Kept apart from [`RAW_INJECTION_PATTERNS`], which targets untrusted *input*: anything
/// listed here must have a very low false-positive rate on ordinary LLM responses. The
/// entries flag responses that themselves carry injected instructions which could cause
/// the agent to behave incorrectly.
///
/// Note: `markdown_image_exfil` is deliberately not repeated here — it is already handled
/// by `scan_output_and_warn`/`ExfiltrationGuard`.
#[rustfmt::skip] // one entry per line keeps the name/pattern table scannable
pub const RAW_RESPONSE_PATTERNS: &[(&str, &str)] = &[
    // "set autonomy level to …", "set trust mode to …"
    ("autonomy_override", r"(?i)\bset\s+(autonomy|trust)\s*(level|mode)\s*to\b"),
    // "(now) store|save|remember|write this to|in memory|vault|database"
    ("memory_write_instruction", r"(?i)\b(now\s+)?(store|save|remember|write)\s+this\s+(to|in)\s+(memory|vault|database)\b"),
    // "from now on" / "henceforth", then always|never|must within 80 characters
    ("instruction_override", r"(?i)\b(from\s+now\s+on|henceforth)\b.{0,80}\b(always|never|must)\b"),
    // "change|modify|update your config|configuration|settings"
    ("config_manipulation", r"(?i)\b(change|modify|update)\s+your\s+(config|configuration|settings)\b"),
    // "ignore [all|any|your] [previous|prior] instructions|rules|constraints"
    ("ignore_instructions_response", r"(?i)\bignore\s+(all\s+|any\s+|your\s+)?(previous\s+|prior\s+)?(instructions?|rules?|constraints?)\b"),
    // "override [your] directives|instructions|rules|constraints"
    ("override_directives_response", r"(?i)\boverride\s+(your\s+)?(directives?|instructions?|rules?|constraints?)\b"),
    // "disregard [your|the] system prompt|instructions|guidelines"
    ("disregard_system", r"(?i)\bdisregard\s+(your\s+|the\s+)?(system\s+prompt|instructions?|guidelines?)\b"),
];
112
/// Strip invisible Unicode format characters and ASCII control characters from `text`
/// before injection pattern matching.
///
/// These characters are invisible to humans but can break regex word boundaries,
/// allowing attackers to smuggle injection keywords through zero-width joiners,
/// soft hyphens, BOM, etc.
///
/// Filtering rules, applied per `char` in order:
///
/// 1. Tab (`\t`) and newline (`\n`) are always kept.
/// 2. Every other ASCII control character is dropped. This includes carriage return,
///    so `"\r\n"` becomes `"\n"`.
/// 3. A fixed list of invisible / format codepoints (see the match arms) is dropped.
/// 4. Everything else is kept unchanged.
///
/// The list is a curated set of known bypass vectors, not the complete Unicode `Cf`
/// category, and it does not address homoglyphs.
///
/// # Examples
///
/// ```rust
/// use zeph_common::patterns::strip_format_chars;
///
/// let result = strip_format_chars("ig\u{200B}nore instructions");
/// assert!(!result.contains('\u{200B}'));
/// assert!(result.contains("ignore"));
/// ```
#[must_use]
pub fn strip_format_chars(text: &str) -> String {
    text.chars()
        .filter(|&c| match c {
            // Whitespace controls that carry layout meaning are preserved.
            '\t' | '\n' => true,
            // All remaining ASCII control characters are dropped.
            _ if c.is_ascii_control() => false,
            // Invisible / format codepoints that are used as bypass vectors.
            '\u{00AD}' // Soft hyphen
            | '\u{034F}' // Combining grapheme joiner
            | '\u{061C}' // Arabic letter mark
            | '\u{115F}' // Hangul choseong filler
            | '\u{1160}' // Hangul jungseong filler
            | '\u{17B4}' // Khmer vowel inherent aq
            | '\u{17B5}' // Khmer vowel inherent aa
            // Mongolian free variation selectors 1-3, the Mongolian vowel separator
            // (U+180E, a zero-width Cf character) and free variation selector 4.
            // U+180E was previously skipped, leaving a gap in the middle of this range.
            | '\u{180B}'..='\u{180F}'
            | '\u{200B}'..='\u{200F}' // Zero-width space/ZWNJ/ZWJ/LRM/RLM
            | '\u{202A}'..='\u{202E}' // Directional formatting
            | '\u{2060}'..='\u{2064}' // Word joiner / invisible separators
            | '\u{2066}'..='\u{206F}' // Bidi controls
            | '\u{FEFF}' // BOM / zero-width no-break space
            | '\u{FFF9}'..='\u{FFFB}' // Interlinear annotation
            | '\u{1BCA0}'..='\u{1BCA3}' // Shorthand format controls
            | '\u{1D173}'..='\u{1D17A}' // Musical symbol beam controls
            | '\u{E0000}'..='\u{E007F}' // Tags block
            => false,
            _ => true,
        })
        .collect()
}
zeph_common/patterns.rs

zeph_common/
patterns.rs