Skip to main content

oxideshield_core/
perplexity.rs

1//! Perplexity and entropy analysis for adversarial suffix detection
2//!
3//! This module provides tools to detect adversarial suffixes like those
4//! generated by AutoDAN and GCG attacks by analyzing character-level
5//! perplexity and token entropy.
6//!
7//! ## Research References
8//!
9//! - [AutoDAN](https://arxiv.org/abs/2310.04451) - Genetic algorithm adversarial prompts
10//! - [GCG Attack](https://arxiv.org/abs/2307.15043) - Zou et al., 2023
11//!   Gradient-based universal attacks that produce gibberish suffixes
12//!
13//! ## Detection Approach
14//!
15//! Adversarial suffixes typically exhibit unusual statistical properties:
16//! - Very high perplexity: Random/gibberish character sequences
17//! - Very low perplexity: Repeated characters or patterns
18//! - Unusual character n-gram distributions
19//! - Low token entropy (many repeated tokens)
20
21use std::collections::HashMap;
22use tracing::debug;
23
24/// Default character n-gram order for perplexity calculation
25pub const DEFAULT_NGRAM_ORDER: usize = 3;
26
27/// Default window size for sliding window analysis
28pub const DEFAULT_WINDOW_SIZE: usize = 50;
29
30/// Represents an anomalous segment detected in text
31#[derive(Debug, Clone)]
32pub struct AnomalySegment {
33    /// Start position in the text
34    pub start: usize,
35    /// End position in the text
36    pub end: usize,
37    /// The anomalous text segment
38    pub text: String,
39    /// Perplexity score of this segment
40    pub perplexity: f32,
41    /// Entropy score of this segment
42    pub entropy: f32,
43    /// Type of anomaly detected
44    pub anomaly_type: AnomalyType,
45}
46
47/// Types of anomalies that can be detected
48#[derive(Debug, Clone, Copy, PartialEq, Eq)]
49pub enum AnomalyType {
50    /// High perplexity (gibberish/random)
51    HighPerplexity,
52    /// Low perplexity (repetitive patterns)
53    LowPerplexity,
54    /// Low entropy (many repeated tokens)
55    LowEntropy,
56    /// Unusual character distribution
57    UnusualDistribution,
58}
59
60impl std::fmt::Display for AnomalyType {
61    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62        match self {
63            AnomalyType::HighPerplexity => write!(f, "high_perplexity"),
64            AnomalyType::LowPerplexity => write!(f, "low_perplexity"),
65            AnomalyType::LowEntropy => write!(f, "low_entropy"),
66            AnomalyType::UnusualDistribution => write!(f, "unusual_distribution"),
67        }
68    }
69}
70
71/// Configuration for perplexity analysis
72#[derive(Debug, Clone)]
73pub struct PerplexityConfig {
74    /// N-gram order for character perplexity
75    pub ngram_order: usize,
76    /// Window size for sliding analysis
77    pub window_size: usize,
78    /// Minimum segment length to analyze
79    pub min_segment_length: usize,
80}
81
82impl Default for PerplexityConfig {
83    fn default() -> Self {
84        Self {
85            ngram_order: DEFAULT_NGRAM_ORDER,
86            window_size: DEFAULT_WINDOW_SIZE,
87            min_segment_length: 10,
88        }
89    }
90}
91
92/// Perplexity analyzer for detecting adversarial patterns
93///
94/// Uses character-level n-gram models and entropy calculations
95/// to detect unusual text patterns that may indicate adversarial
96/// suffixes or manipulated content.
97pub struct PerplexityAnalyzer {
98    /// Pre-computed n-gram frequencies for English text
99    char_ngram_model: HashMap<String, f32>,
100    /// Configuration
101    config: PerplexityConfig,
102}
103
104impl PerplexityAnalyzer {
105    /// Create a new analyzer with default English n-gram model
106    pub fn new() -> Self {
107        Self::with_config(PerplexityConfig::default())
108    }
109
110    /// Create a new analyzer with custom configuration
111    pub fn with_config(config: PerplexityConfig) -> Self {
112        let char_ngram_model = Self::build_english_ngram_model(config.ngram_order);
113        Self {
114            char_ngram_model,
115            config,
116        }
117    }
118
119    /// Build a simple English character n-gram model
120    ///
121    /// This is a simplified model based on common English character patterns.
122    /// A production system would use a model trained on large corpora.
123    fn build_english_ngram_model(ngram_order: usize) -> HashMap<String, f32> {
124        let mut model = HashMap::new();
125
126        // Common English bigrams/trigrams with approximate log probabilities
127        // These are simplified estimates based on English letter frequency
128        let common_patterns = [
129            // Common bigrams
130            ("th", -2.0),
131            ("he", -2.1),
132            ("in", -2.3),
133            ("er", -2.4),
134            ("an", -2.5),
135            ("re", -2.6),
136            ("on", -2.7),
137            ("at", -2.8),
138            ("en", -2.9),
139            ("nd", -3.0),
140            ("ti", -3.1),
141            ("es", -3.2),
142            ("or", -3.3),
143            ("te", -3.4),
144            ("of", -3.5),
145            ("ed", -3.6),
146            ("is", -3.7),
147            ("it", -3.8),
148            ("al", -3.9),
149            ("ar", -4.0),
150            ("st", -4.1),
151            ("to", -4.2),
152            ("nt", -4.3),
153            ("ng", -4.4),
154            ("se", -4.5),
155            // Common trigrams
156            ("the", -3.0),
157            ("and", -3.5),
158            ("ing", -3.8),
159            ("ion", -4.0),
160            ("tio", -4.2),
161            ("ent", -4.4),
162            ("ati", -4.6),
163            ("for", -4.8),
164            ("her", -5.0),
165            ("ter", -5.2),
166            ("hat", -5.4),
167            ("tha", -5.6),
168            ("ere", -5.8),
169            ("ate", -6.0),
170            ("his", -6.2),
171            ("con", -6.4),
172            ("res", -6.6),
173            ("ver", -6.8),
174            ("all", -7.0),
175            ("ons", -7.2),
176            // Spaces and punctuation
177            (" th", -2.5),
178            ("e ", -2.8),
179            (" a ", -3.0),
180            (" of", -3.2),
181            (" to", -3.4),
182            (" in", -3.6),
183            ("s ", -3.8),
184            (". ", -4.0),
185            (", ", -4.2),
186        ];
187
188        for (pattern, log_prob) in common_patterns {
189            if pattern.len() <= ngram_order {
190                model.insert(pattern.to_lowercase(), log_prob);
191            }
192        }
193
194        model
195    }
196
197    /// Calculate character-level perplexity for text
198    ///
199    /// Perplexity measures how "surprising" the text is according to the
200    /// character n-gram model. Higher values indicate unusual text.
201    pub fn char_perplexity(&self, text: &str) -> f32 {
202        let text = text.to_lowercase();
203        let chars: Vec<char> = text.chars().collect();
204
205        if chars.len() < self.config.ngram_order {
206            return 0.0;
207        }
208
209        let mut total_log_prob = 0.0f32;
210        let mut count = 0;
211
212        for i in 0..=(chars.len() - self.config.ngram_order) {
213            let ngram: String = chars[i..i + self.config.ngram_order].iter().collect();
214
215            // Use model probability or backoff
216            let log_prob = self.char_ngram_model.get(&ngram).copied().unwrap_or(-10.0);
217            total_log_prob += log_prob;
218            count += 1;
219        }
220
221        if count == 0 {
222            return 0.0;
223        }
224
225        // Perplexity = exp(-average_log_prob)
226        let avg_log_prob = total_log_prob / count as f32;
227        (-avg_log_prob).exp()
228    }
229
230    /// Calculate token/character entropy
231    ///
232    /// Low entropy indicates repetitive patterns (e.g., "aaaaaaa").
233    /// Normal text has moderate entropy.
234    pub fn token_entropy(&self, text: &str) -> f32 {
235        let chars: Vec<char> = text.chars().collect();
236        if chars.is_empty() {
237            return 0.0;
238        }
239
240        // Count character frequencies
241        let mut freq: HashMap<char, usize> = HashMap::new();
242        for &c in &chars {
243            *freq.entry(c).or_insert(0) += 1;
244        }
245
246        // Calculate Shannon entropy
247        let n = chars.len() as f32;
248        let entropy: f32 = freq
249            .values()
250            .map(|&count| {
251                let p = count as f32 / n;
252                if p > 0.0 {
253                    -p * p.log2()
254                } else {
255                    0.0
256                }
257            })
258            .sum();
259
260        entropy
261    }
262
263    /// Calculate unique character ratio
264    ///
265    /// Very low ratio indicates repetitive text.
266    /// Very high ratio with long text may indicate random characters.
267    pub fn unique_char_ratio(&self, text: &str) -> f32 {
268        let chars: Vec<char> = text.chars().collect();
269        if chars.is_empty() {
270            return 0.0;
271        }
272
273        let unique: std::collections::HashSet<char> = chars.iter().copied().collect();
274        unique.len() as f32 / chars.len() as f32
275    }
276
277    /// Detect anomalous segments in text using sliding window analysis
278    ///
279    /// Returns segments that have unusual perplexity or entropy scores.
280    pub fn find_anomalous_segments(
281        &self,
282        text: &str,
283        max_perplexity: f32,
284        min_perplexity: f32,
285        min_entropy: f32,
286    ) -> Vec<AnomalySegment> {
287        let mut anomalies = Vec::new();
288        let chars: Vec<char> = text.chars().collect();
289
290        if chars.len() < self.config.min_segment_length {
291            return anomalies;
292        }
293
294        let window_size = self.config.window_size.min(chars.len());
295        let step = window_size / 4; // 75% overlap
296
297        let mut i = 0;
298        while i + window_size <= chars.len() {
299            let segment: String = chars[i..i + window_size].iter().collect();
300
301            let perplexity = self.char_perplexity(&segment);
302            let entropy = self.token_entropy(&segment);
303
304            let anomaly_type = if perplexity > max_perplexity {
305                Some(AnomalyType::HighPerplexity)
306            } else if perplexity < min_perplexity && perplexity > 0.0 {
307                Some(AnomalyType::LowPerplexity)
308            } else if entropy < min_entropy {
309                Some(AnomalyType::LowEntropy)
310            } else {
311                None
312            };
313
314            if let Some(atype) = anomaly_type {
315                debug!(
316                    start = i,
317                    end = i + window_size,
318                    perplexity = %perplexity,
319                    entropy = %entropy,
320                    anomaly_type = %atype,
321                    "Anomalous segment detected"
322                );
323
324                anomalies.push(AnomalySegment {
325                    start: i,
326                    end: i + window_size,
327                    text: segment,
328                    perplexity,
329                    entropy,
330                    anomaly_type: atype,
331                });
332            }
333
334            i += step;
335        }
336
337        // Merge overlapping anomalies
338        Self::merge_overlapping_anomalies(anomalies)
339    }
340
341    /// Merge overlapping anomaly segments
342    fn merge_overlapping_anomalies(mut segments: Vec<AnomalySegment>) -> Vec<AnomalySegment> {
343        if segments.is_empty() {
344            return segments;
345        }
346
347        segments.sort_by_key(|s| s.start);
348
349        let mut merged = Vec::new();
350        let mut current = segments.remove(0);
351
352        for next in segments {
353            if next.start <= current.end {
354                // Overlapping, merge
355                current.end = current.end.max(next.end);
356                current.perplexity = current.perplexity.max(next.perplexity);
357                current.entropy = current.entropy.min(next.entropy);
358                // Keep the more severe anomaly type
359                if next.anomaly_type == AnomalyType::HighPerplexity {
360                    current.anomaly_type = AnomalyType::HighPerplexity;
361                }
362            } else {
363                merged.push(current);
364                current = next;
365            }
366        }
367        merged.push(current);
368
369        merged
370    }
371
372    /// Quick check for adversarial patterns
373    ///
374    /// Returns true if the text shows signs of adversarial manipulation.
375    pub fn is_suspicious(&self, text: &str, max_perplexity: f32, min_entropy: f32) -> bool {
376        if text.len() < self.config.min_segment_length {
377            return false;
378        }
379
380        let perplexity = self.char_perplexity(text);
381        let entropy = self.token_entropy(text);
382
383        perplexity > max_perplexity || entropy < min_entropy
384    }
385
386    /// Analyze the suffix portion of text
387    ///
388    /// GCG and AutoDAN attacks typically add adversarial suffixes.
389    /// This method focuses on the last portion of the text.
390    ///
391    /// # Arguments
392    /// * `text` - The text to analyze
393    /// * `suffix_ratio` - Portion of text to analyze as suffix (0.1 - 0.5)
394    /// * `max_perplexity` - Maximum allowed perplexity
395    /// * `min_entropy` - Minimum allowed entropy
396    pub fn analyze_suffix(
397        &self,
398        text: &str,
399        suffix_ratio: f32,
400        max_perplexity: f32,
401        min_entropy: f32,
402    ) -> Option<AnomalySegment> {
403        let chars: Vec<char> = text.chars().collect();
404        let suffix_len = (chars.len() as f32 * suffix_ratio.clamp(0.1, 0.5)) as usize;
405
406        if suffix_len < self.config.min_segment_length {
407            return None;
408        }
409
410        let start = chars.len() - suffix_len;
411        let suffix: String = chars[start..].iter().collect();
412
413        let perplexity = self.char_perplexity(&suffix);
414        let entropy = self.token_entropy(&suffix);
415
416        if perplexity > max_perplexity {
417            Some(AnomalySegment {
418                start,
419                end: chars.len(),
420                text: suffix,
421                perplexity,
422                entropy,
423                anomaly_type: AnomalyType::HighPerplexity,
424            })
425        } else if entropy < min_entropy {
426            Some(AnomalySegment {
427                start,
428                end: chars.len(),
429                text: suffix,
430                perplexity,
431                entropy,
432                anomaly_type: AnomalyType::LowEntropy,
433            })
434        } else {
435            None
436        }
437    }
438}
439
440impl Default for PerplexityAnalyzer {
441    fn default() -> Self {
442        Self::new()
443    }
444}
445
446#[cfg(test)]
447mod tests {
448    use super::*;
449
450    #[test]
451    fn test_normal_english_perplexity() {
452        let analyzer = PerplexityAnalyzer::new();
453
454        let normal_text = "The quick brown fox jumps over the lazy dog.";
455        let perplexity = analyzer.char_perplexity(normal_text);
456
457        // Normal English should have moderate perplexity
458        // Note: With our simplified n-gram model, perplexity values are higher
459        // than production models trained on large corpora
460        assert!(perplexity > 0.0);
461        // The simplified model produces higher values - this is expected
462        assert!(
463            perplexity < 50000.0,
464            "Normal text perplexity too high: {}",
465            perplexity
466        );
467    }
468
469    #[test]
470    fn test_gibberish_perplexity() {
471        let analyzer = PerplexityAnalyzer::new();
472
473        let gibberish = "xyzqkjwfpvbn zxcvqwert yuiopasdfghjkl";
474        let perplexity = analyzer.char_perplexity(gibberish);
475
476        let normal_text = "The quick brown fox jumps over the lazy dog.";
477        let normal_perplexity = analyzer.char_perplexity(normal_text);
478
479        // Gibberish should have higher perplexity than normal text
480        assert!(
481            perplexity > normal_perplexity,
482            "Gibberish ({}) should have higher perplexity than normal ({})",
483            perplexity,
484            normal_perplexity
485        );
486    }
487
488    #[test]
489    fn test_repetitive_text_entropy() {
490        let analyzer = PerplexityAnalyzer::new();
491
492        let repetitive = "aaaaaaaaaaaaaaaaaaaaaaaaaaa";
493        let entropy = analyzer.token_entropy(repetitive);
494
495        // Repetitive text should have very low entropy
496        assert!(
497            entropy < 0.5,
498            "Repetitive text entropy too high: {}",
499            entropy
500        );
501
502        let normal_text = "The quick brown fox jumps over the lazy dog.";
503        let normal_entropy = analyzer.token_entropy(normal_text);
504
505        // Normal text should have higher entropy
506        assert!(normal_entropy > entropy);
507    }
508
509    #[test]
510    fn test_unique_char_ratio() {
511        let analyzer = PerplexityAnalyzer::new();
512
513        let repetitive = "aaaaaaaaaa";
514        let ratio = analyzer.unique_char_ratio(repetitive);
515        assert!(ratio < 0.2, "Repetitive text should have low unique ratio");
516
517        let varied = "abcdefghij";
518        let varied_ratio = analyzer.unique_char_ratio(varied);
519        assert!(
520            varied_ratio > 0.9,
521            "Varied text should have high unique ratio"
522        );
523    }
524
525    #[test]
526    fn test_find_anomalous_segments() {
527        let analyzer = PerplexityAnalyzer::with_config(PerplexityConfig {
528            ngram_order: 3,
529            window_size: 20,
530            min_segment_length: 10,
531        });
532
533        let text = "Normal text here. xxxxxxxxxxxxxxxxxxxxxxx More normal text.";
534        let anomalies = analyzer.find_anomalous_segments(text, 1000.0, 1.0, 1.0);
535
536        // Should detect the repetitive section
537        assert!(!anomalies.is_empty(), "Should detect repetitive segment");
538    }
539
540    #[test]
541    fn test_is_suspicious() {
542        let analyzer = PerplexityAnalyzer::new();
543
544        // With our simplified model, use higher thresholds
545        let normal = "This is a normal sentence with common words.";
546        assert!(!analyzer.is_suspicious(normal, 50000.0, 1.5));
547
548        let repetitive = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
549        // Repetitive text has very low entropy, which should trigger
550        assert!(analyzer.is_suspicious(repetitive, 50000.0, 1.5));
551    }
552
553    #[test]
554    fn test_analyze_suffix() {
555        let analyzer = PerplexityAnalyzer::new();
556
557        // Normal text with adversarial-like suffix
558        let text = "Please answer the following question. zde yz q xk wj pv bn zde yz";
559        // Use thresholds appropriate for simplified model
560        let anomaly = analyzer.analyze_suffix(text, 0.3, 100000.0, 1.0);
561
562        // The suffix should be detected as anomalous (low entropy due to repeated patterns)
563        if let Some(a) = anomaly {
564            assert!(a.perplexity > 0.0);
565        }
566    }
567
568    #[test]
569    fn test_short_text() {
570        let analyzer = PerplexityAnalyzer::new();
571
572        let short = "Hi";
573        let perplexity = analyzer.char_perplexity(short);
574        assert_eq!(perplexity, 0.0, "Short text should return 0 perplexity");
575
576        let entropy = analyzer.token_entropy(short);
577        assert!(entropy >= 0.0);
578    }
579
580    #[test]
581    fn test_empty_text() {
582        let analyzer = PerplexityAnalyzer::new();
583
584        assert_eq!(analyzer.char_perplexity(""), 0.0);
585        assert_eq!(analyzer.token_entropy(""), 0.0);
586        assert_eq!(analyzer.unique_char_ratio(""), 0.0);
587    }
588}