Skip to main content

agentzero_core/security/
perplexity.rs

1//! Character-class bigram perplexity filter for detecting adversarial suffixes.
2//!
3//! Adversarial prompt injection attacks often append high-perplexity suffix strings
4//! that look nothing like natural language. This filter scores the suffix window of
5//! incoming prompts using character-class bigram frequencies and blocks prompts that
6//! exceed the perplexity threshold.
7
/// Coarse character category that forms the alphabet of the bigram model.
///
/// Only ASCII characters are split into fine-grained categories; every
/// non-ASCII character collapses into [`CharClass::Other`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum CharClass {
    /// ASCII lowercase letter (`a`-`z`).
    Lower,
    /// ASCII uppercase letter (`A`-`Z`).
    Upper,
    /// ASCII decimal digit (`0`-`9`).
    Digit,
    /// ASCII whitespace as defined by `char::is_ascii_whitespace`.
    Space,
    /// ASCII punctuation as defined by `char::is_ascii_punctuation`.
    Punct,
    /// Any ASCII character not covered by the variants above. Because
    /// `char::is_ascii_punctuation` already covers every printable ASCII
    /// symbol, this is effectively the non-whitespace control characters.
    Symbol,
    /// Any non-ASCII character.
    Other,
}
19
20fn classify(c: char) -> CharClass {
21    if c.is_ascii_lowercase() {
22        CharClass::Lower
23    } else if c.is_ascii_uppercase() {
24        CharClass::Upper
25    } else if c.is_ascii_digit() {
26        CharClass::Digit
27    } else if c.is_ascii_whitespace() {
28        CharClass::Space
29    } else if c.is_ascii_punctuation() {
30        CharClass::Punct
31    } else if c.is_ascii() {
32        CharClass::Symbol
33    } else {
34        CharClass::Other
35    }
36}
37
38const NUM_CLASSES: usize = 7;
39
40fn class_index(c: CharClass) -> usize {
41    match c {
42        CharClass::Lower => 0,
43        CharClass::Upper => 1,
44        CharClass::Digit => 2,
45        CharClass::Space => 3,
46        CharClass::Punct => 4,
47        CharClass::Symbol => 5,
48        CharClass::Other => 6,
49    }
50}
51
52/// Compute character-class bigram perplexity for a string.
53///
54/// Returns the perplexity score — higher values indicate more "random" text.
55/// Natural English text typically scores 3-8; adversarial suffixes score 15+.
56pub fn bigram_perplexity(text: &str) -> f64 {
57    if text.len() < 2 {
58        return 0.0;
59    }
60
61    let chars: Vec<CharClass> = text.chars().map(classify).collect();
62    let n = chars.len();
63
64    // Count bigram frequencies
65    let mut bigram_counts = [[0u32; NUM_CLASSES]; NUM_CLASSES];
66    let mut total_bigrams = 0u32;
67
68    for window in chars.windows(2) {
69        let a = class_index(window[0]);
70        let b = class_index(window[1]);
71        bigram_counts[a][b] += 1;
72        total_bigrams += 1;
73    }
74
75    if total_bigrams == 0 {
76        return 0.0;
77    }
78
79    // Compute perplexity using log-probability
80    // P(bigram) = count(bigram) / total_bigrams
81    // Perplexity = exp(-1/N * sum(log(P(bigram))))
82    let total_f = total_bigrams as f64;
83    let mut log_prob_sum = 0.0;
84
85    for window in chars.windows(2) {
86        let a = class_index(window[0]);
87        let b = class_index(window[1]);
88        let count = bigram_counts[a][b] as f64;
89        // Laplace smoothing to avoid log(0)
90        let prob = (count + 0.1) / (total_f + 0.1 * (NUM_CLASSES * NUM_CLASSES) as f64);
91        log_prob_sum += prob.ln();
92    }
93
94    let avg_log_prob = log_prob_sum / (n - 1) as f64;
95    (-avg_log_prob).exp()
96}
97
98/// Compute the ratio of symbol/punctuation characters in the text.
99pub fn symbol_ratio(text: &str) -> f64 {
100    if text.is_empty() {
101        return 0.0;
102    }
103
104    let symbol_count = text
105        .chars()
106        .filter(|c| {
107            let cls = classify(*c);
108            matches!(cls, CharClass::Punct | CharClass::Symbol | CharClass::Other)
109        })
110        .count();
111
112    symbol_count as f64 / text.len() as f64
113}
114
/// Outcome of running the perplexity filter over a prompt.
#[derive(Clone, Debug, PartialEq)]
pub enum PerplexityResult {
    /// Nothing suspicious was found; the text passes the filter.
    Pass,
    /// The analyzed text looks potentially adversarial.
    Flagged {
        /// Bigram perplexity score measured on the analyzed text.
        perplexity: f64,
        /// Symbol ratio measured on the analyzed text.
        symbol_ratio: f64,
        /// Human-readable explanation of which check tripped.
        reason: String,
    },
}
127
128/// Analyze the suffix window of a prompt for adversarial content.
129///
130/// - `text`: full prompt text
131/// - `suffix_window_chars`: number of trailing characters to analyze
132/// - `perplexity_threshold`: perplexity score above which to flag
133/// - `symbol_ratio_threshold`: symbol ratio above which to flag
134/// - `min_prompt_chars`: minimum prompt length to apply the filter
135pub fn analyze_suffix(
136    text: &str,
137    suffix_window_chars: usize,
138    perplexity_threshold: f64,
139    symbol_ratio_threshold: f64,
140    min_prompt_chars: usize,
141) -> PerplexityResult {
142    if text.len() < min_prompt_chars {
143        return PerplexityResult::Pass;
144    }
145
146    // Extract suffix window
147    let suffix_start = text.len().saturating_sub(suffix_window_chars);
148    let suffix = &text[suffix_start..];
149
150    let perp = bigram_perplexity(suffix);
151    let sym_ratio = symbol_ratio(suffix);
152
153    // Flag if either threshold is exceeded
154    if perp > perplexity_threshold {
155        return PerplexityResult::Flagged {
156            perplexity: perp,
157            symbol_ratio: sym_ratio,
158            reason: format!(
159                "Suffix perplexity {perp:.2} exceeds threshold {perplexity_threshold:.2}"
160            ),
161        };
162    }
163
164    if sym_ratio > symbol_ratio_threshold {
165        return PerplexityResult::Flagged {
166            perplexity: perp,
167            symbol_ratio: sym_ratio,
168            reason: format!(
169                "Suffix symbol ratio {sym_ratio:.2} exceeds threshold {symbol_ratio_threshold:.2}"
170            ),
171        };
172    }
173
174    PerplexityResult::Pass
175}
176
#[cfg(test)]
mod tests {
    use super::*;

    /// Ordinary prose should stay below the perplexity ceiling used here.
    #[test]
    fn natural_english_low_perplexity() {
        let perp =
            bigram_perplexity("Hello, this is a normal English sentence about programming.");
        assert!(perp < 10.0, "English text perplexity {perp} should be < 10");
    }

    /// A string that keeps hopping between character classes should score high.
    #[test]
    fn random_chars_high_perplexity() {
        let perp = bigram_perplexity("xK7!mQ@3#zP$9&wR*5^yL%2(eN)8+bT");
        assert!(perp > 5.0, "Random chars perplexity {perp} should be > 5");
    }

    /// Inputs too short to contain a bigram score exactly zero.
    #[test]
    fn empty_text_zero_perplexity() {
        for too_short in ["", "a"] {
            assert_eq!(bigram_perplexity(too_short), 0.0);
        }
    }

    /// A single repeated character produces only one kind of bigram.
    #[test]
    fn repeated_chars_low_perplexity() {
        let perp = bigram_perplexity("aaaaaaaaaaaaaaaaaaa");
        assert!(perp < 3.0, "Repeated chars perplexity {perp} should be < 3");
    }

    /// Light punctuation in normal text keeps the symbol ratio small.
    #[test]
    fn symbol_ratio_normal_text() {
        let ratio = symbol_ratio("Hello, world!");
        assert!(
            ratio < 0.20,
            "Normal text symbol ratio {ratio} should be < 0.20"
        );
    }

    /// Text made entirely of punctuation has a ratio close to one.
    #[test]
    fn symbol_ratio_heavy_symbols() {
        let ratio = symbol_ratio("!@#$%^&*()_+-=[]{}|;':\",./<>?");
        assert!(
            ratio > 0.80,
            "Heavy symbol text ratio {ratio} should be > 0.80"
        );
    }

    /// Empty input is defined to have a ratio of zero.
    #[test]
    fn symbol_ratio_empty() {
        assert_eq!(symbol_ratio(""), 0.0);
    }

    /// A plain English request must not be flagged.
    #[test]
    fn analyze_suffix_passes_normal_text() {
        let prompt = "Can you help me write a function that calculates the fibonacci sequence?";
        assert_eq!(
            analyze_suffix(prompt, 64, 18.0, 0.20, 32),
            PerplexityResult::Pass
        );
    }

    /// Normal text followed by gibberish simulates an adversarial prompt.
    #[test]
    fn analyze_suffix_flags_adversarial_suffix() {
        let normal = "Please write a function.";
        let adversarial = "xK7!mQ@3#zP$9&wR*5^yL%2(eN)8+bT!@#$%^&*()_+-=[]{}|xK7!mQ@3#";
        let text = format!("{normal} {adversarial}");

        assert!(
            matches!(
                analyze_suffix(&text, 64, 4.0, 0.20, 32),
                PerplexityResult::Flagged { .. }
            ),
            "adversarial suffix should be flagged"
        );
    }

    /// Prompts below the minimum length bypass the filter entirely.
    #[test]
    fn analyze_suffix_skips_short_prompts() {
        assert_eq!(
            analyze_suffix("hi", 64, 18.0, 0.20, 32),
            PerplexityResult::Pass
        );
    }

    /// With the perplexity check effectively disabled, the symbol ratio alone
    /// must trigger the flag.
    #[test]
    fn analyze_suffix_symbol_ratio_flag() {
        let prompt = "Please help me with this: !@#$%^&*()!@#$%^&*()!@#$%^&*()!@#$%^&*()";
        if let PerplexityResult::Flagged { symbol_ratio, .. } =
            analyze_suffix(prompt, 40, 100.0, 0.10, 32)
        {
            assert!(symbol_ratio > 0.10);
        } else {
            panic!("high symbol ratio should be flagged");
        }
    }
}