Skip to main content

oxideshield_guard/guards/
perplexity.rs

1//! Perplexity-based guard for detecting adversarial suffixes
2//!
3//! This guard uses character-level perplexity and entropy analysis to detect
4//! adversarial patterns like those generated by AutoDAN and GCG attacks.
5//!
6//! ## Research References
7//!
8//! - [AutoDAN](https://arxiv.org/abs/2310.04451) - Liu et al., 2023
9//!   Genetic algorithm-based adversarial prompt generation
10//! - [GCG Attack](https://arxiv.org/abs/2307.15043) - Zou et al., 2023
11//!   Gradient-based universal attacks producing gibberish suffixes
12//! - [JailbreakBench](https://jailbreakbench.github.io/) - 2024
13//!   Benchmark for evaluating jailbreak attacks
14//!
15//! ## Detection Mechanism
16//!
17//! Adversarial suffixes typically exhibit unusual statistical properties:
18//! - **High Perplexity**: Random character sequences that don't match natural language
19//! - **Low Entropy**: Repetitive patterns used to extend prompts
20//! - **Unusual n-gram distributions**: Rare character combinations
21//!
22//! ## Example
23//!
24//! ```rust
25//! use oxideshield_guard::guards::PerplexityGuard;
26//! use oxideshield_guard::{Guard, GuardAction};
27//!
28//! // Create guard with thresholds appropriate for our n-gram model
29//! let guard = PerplexityGuard::new("perplexity")
30//!     .with_max_perplexity(50000.0)  // Higher for simplified model
31//!     .with_min_entropy(1.0)         // Low entropy catches repetition
32//!     .with_action(GuardAction::Block);
33//!
34//! // Normal text passes
35//! let result = guard.check("Hello, how are you today? This is normal text.");
36//! assert!(result.passed);
37//!
38//! // Repetitive text fails (low entropy)
39//! let result = guard.check("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
40//! assert!(!result.passed);
41//! ```
42
43use std::collections::HashMap;
44
45use oxideshield_core::{AnomalySegment, PerplexityAnalyzer, PerplexityConfig, Severity};
46use serde::{Deserialize, Serialize};
47use tracing::{debug, instrument};
48
49use crate::guard::{Guard, GuardAction, GuardCheckResult};
50use oxideshield_core::Match;
51
/// Perplexity ceiling applied when the caller does not configure one.
///
/// The simplified n-gram model requires a deliberately high value here;
/// production systems with trained models would use roughly 1000.
pub const DEFAULT_MAX_PERPLEXITY: f32 = 50_000.0;

/// Perplexity floor applied by default (anything lower is treated as too repetitive).
pub const DEFAULT_MIN_PERPLEXITY: f32 = 2.0;

/// Entropy floor applied by default.
pub const DEFAULT_MIN_ENTROPY: f32 = 1.5;

/// Suffix analysis ratio applied by default.
pub const DEFAULT_SUFFIX_RATIO: f32 = 0.3;
65
66/// Perplexity guard configuration
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct PerplexityGuardConfig {
69    /// Maximum allowed perplexity (above = gibberish)
70    pub max_perplexity: f32,
71    /// Minimum allowed perplexity (below = too repetitive)
72    pub min_perplexity: f32,
73    /// Minimum allowed entropy
74    pub min_entropy: f32,
75    /// Ratio of text to analyze as suffix (0.0 - 1.0)
76    pub suffix_ratio: f32,
77    /// Whether to analyze the full text
78    pub analyze_full_text: bool,
79    /// Whether to focus on suffix analysis
80    pub analyze_suffix: bool,
81    /// Action to take when triggered
82    pub action: GuardAction,
83    /// Severity to assign to matches
84    pub severity: Severity,
85}
86
87impl Default for PerplexityGuardConfig {
88    fn default() -> Self {
89        Self {
90            max_perplexity: DEFAULT_MAX_PERPLEXITY,
91            min_perplexity: DEFAULT_MIN_PERPLEXITY,
92            min_entropy: DEFAULT_MIN_ENTROPY,
93            suffix_ratio: DEFAULT_SUFFIX_RATIO,
94            analyze_full_text: true,
95            analyze_suffix: true,
96            action: GuardAction::Block,
97            severity: Severity::High,
98        }
99    }
100}
101
102/// Guard for detecting adversarial patterns using perplexity analysis
103///
104/// This guard is particularly effective at detecting:
105/// - GCG-style adversarial suffixes (random gibberish)
106/// - AutoDAN-style generated attacks
107/// - Repetitive padding attacks
108/// - Unusual character distributions
109pub struct PerplexityGuard {
110    name: String,
111    analyzer: PerplexityAnalyzer,
112    config: PerplexityGuardConfig,
113}
114
115impl PerplexityGuard {
116    /// Create a new perplexity guard with default settings
117    pub fn new(name: impl Into<String>) -> Self {
118        Self {
119            name: name.into(),
120            analyzer: PerplexityAnalyzer::new(),
121            config: PerplexityGuardConfig::default(),
122        }
123    }
124
125    /// Create with custom analyzer configuration
126    pub fn with_analyzer_config(name: impl Into<String>, config: PerplexityConfig) -> Self {
127        Self {
128            name: name.into(),
129            analyzer: PerplexityAnalyzer::with_config(config),
130            config: PerplexityGuardConfig::default(),
131        }
132    }
133
134    /// Set the maximum perplexity threshold
135    pub fn with_max_perplexity(mut self, max: f32) -> Self {
136        self.config.max_perplexity = max.max(1.0);
137        self
138    }
139
140    /// Set the minimum perplexity threshold
141    pub fn with_min_perplexity(mut self, min: f32) -> Self {
142        self.config.min_perplexity = min.max(0.1);
143        self
144    }
145
146    /// Set the minimum entropy threshold
147    pub fn with_min_entropy(mut self, min: f32) -> Self {
148        self.config.min_entropy = min.max(0.0);
149        self
150    }
151
152    /// Set the suffix analysis ratio
153    pub fn with_suffix_ratio(mut self, ratio: f32) -> Self {
154        self.config.suffix_ratio = ratio.clamp(0.1, 0.9);
155        self
156    }
157
158    /// Enable/disable full text analysis
159    pub fn analyze_full_text(mut self, enabled: bool) -> Self {
160        self.config.analyze_full_text = enabled;
161        self
162    }
163
164    /// Enable/disable suffix-focused analysis
165    pub fn analyze_suffix(mut self, enabled: bool) -> Self {
166        self.config.analyze_suffix = enabled;
167        self
168    }
169
170    /// Set the action to take
171    pub fn with_action(mut self, action: GuardAction) -> Self {
172        self.config.action = action;
173        self
174    }
175
176    /// Set the severity for matches
177    pub fn with_severity(mut self, severity: Severity) -> Self {
178        self.config.severity = severity;
179        self
180    }
181
182    /// Get the current configuration
183    pub fn config(&self) -> &PerplexityGuardConfig {
184        &self.config
185    }
186
187    /// Create a Match from an anomaly segment
188    fn create_match(&self, anomaly: &AnomalySegment) -> Match {
189        let mut metadata = HashMap::new();
190        metadata.insert(
191            "perplexity".to_string(),
192            format!("{:.2}", anomaly.perplexity),
193        );
194        metadata.insert("entropy".to_string(), format!("{:.2}", anomaly.entropy));
195        metadata.insert("anomaly_type".to_string(), anomaly.anomaly_type.to_string());
196
197        Match {
198            id: uuid::Uuid::new_v4(),
199            pattern: format!("[perplexity:{}]", anomaly.anomaly_type),
200            matched_text: if anomaly.text.len() > 50 {
201                format!("{}...", &anomaly.text[..50])
202            } else {
203                anomaly.text.clone()
204            },
205            start: anomaly.start,
206            end: anomaly.end,
207            severity: self.config.severity,
208            category: "adversarial".to_string(),
209            metadata,
210        }
211    }
212
213    /// Analyze text and return all detected anomalies
214    fn detect_anomalies(&self, content: &str) -> Vec<AnomalySegment> {
215        let mut all_anomalies = Vec::new();
216
217        // Full text sliding window analysis
218        if self.config.analyze_full_text {
219            let anomalies = self.analyzer.find_anomalous_segments(
220                content,
221                self.config.max_perplexity,
222                self.config.min_perplexity,
223                self.config.min_entropy,
224            );
225            all_anomalies.extend(anomalies);
226        }
227
228        // Suffix-focused analysis
229        if self.config.analyze_suffix {
230            if let Some(suffix_anomaly) = self.analyzer.analyze_suffix(
231                content,
232                self.config.suffix_ratio,
233                self.config.max_perplexity,
234                self.config.min_entropy,
235            ) {
236                // Check if not already covered by full text analysis
237                let already_covered = all_anomalies
238                    .iter()
239                    .any(|a| a.start <= suffix_anomaly.start && a.end >= suffix_anomaly.end);
240
241                if !already_covered {
242                    all_anomalies.push(suffix_anomaly);
243                }
244            }
245        }
246
247        all_anomalies
248    }
249}
250
251impl Guard for PerplexityGuard {
252    fn name(&self) -> &str {
253        &self.name
254    }
255
256    #[instrument(skip(self, content), fields(guard = %self.name, content_len = content.len()))]
257    fn check(&self, content: &str) -> GuardCheckResult {
258        // Quick check for minimum length
259        if content.len() < 10 {
260            debug!("Content too short for perplexity analysis");
261            return GuardCheckResult::pass(&self.name);
262        }
263
264        let anomalies = self.detect_anomalies(content);
265
266        if anomalies.is_empty() {
267            debug!("No perplexity anomalies detected");
268            return GuardCheckResult::pass(&self.name);
269        }
270
271        // Convert anomalies to matches
272        let matches: Vec<Match> = anomalies.iter().map(|a| self.create_match(a)).collect();
273
274        // Build reason string
275        let anomaly_types: Vec<String> = anomalies
276            .iter()
277            .map(|a| a.anomaly_type.to_string())
278            .collect::<std::collections::HashSet<_>>()
279            .into_iter()
280            .collect();
281
282        let reason = format!(
283            "Detected {} perplexity anomalies: {}",
284            anomalies.len(),
285            anomaly_types.join(", ")
286        );
287
288        debug!(
289            anomaly_count = anomalies.len(),
290            types = ?anomaly_types,
291            "Perplexity guard triggered"
292        );
293
294        GuardCheckResult::fail(&self.name, self.config.action, matches, reason)
295    }
296
297    fn action(&self) -> GuardAction {
298        self.config.action
299    }
300
301    fn severity_threshold(&self) -> Severity {
302        Severity::Low
303    }
304}
305
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder methods store the supplied values and the guard reports them.
    #[test]
    fn test_perplexity_guard_creation() {
        let guard = PerplexityGuard::new("test")
            .with_max_perplexity(50000.0)
            .with_min_entropy(2.0)
            .with_action(GuardAction::Block);

        assert_eq!(guard.name(), "test");
        assert_eq!(guard.config.max_perplexity, 50000.0);
        assert_eq!(guard.config.min_entropy, 2.0);
        assert_eq!(guard.action(), GuardAction::Block);
    }

    /// Ordinary English prose must not trigger the default guard.
    #[test]
    fn test_normal_text_passes() {
        let guard = PerplexityGuard::new("test");

        let normal = "The quick brown fox jumps over the lazy dog. This is a normal sentence with common English words and phrases.";
        let result = guard.check(normal);

        assert!(
            result.passed,
            "Normal text should pass: {:?}",
            result.reason
        );
    }

    /// A single repeated character is rejected with the configured action.
    #[test]
    fn test_repetitive_text_fails() {
        let guard = PerplexityGuard::new("test").with_min_entropy(1.0);

        let repetitive = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        let result = guard.check(repetitive);

        assert!(!result.passed, "Repetitive text should fail");
        assert_eq!(result.action, GuardAction::Block);
    }

    /// Input below the minimum length is passed without analysis.
    #[test]
    fn test_short_text_passes() {
        let guard = PerplexityGuard::new("test");

        let short = "Hello";
        let result = guard.check(short);

        assert!(result.passed, "Short text should pass (skip analysis)");
    }

    /// Out-of-range builder arguments are clamped to the documented bounds.
    #[test]
    fn test_config_clamping() {
        let guard = PerplexityGuard::new("test")
            .with_max_perplexity(-100.0) // Should clamp to 1.0
            .with_min_perplexity(-50.0) // Should clamp to 0.1
            .with_min_entropy(-10.0) // Should clamp to 0.0
            .with_suffix_ratio(2.0); // Should clamp to 0.9

        // Pin the exact clamped values; a mere range check would also accept
        // a setter that clamps to the wrong bound.
        assert_eq!(guard.config.max_perplexity, 1.0);
        assert_eq!(guard.config.min_perplexity, 0.1);
        assert_eq!(guard.config.min_entropy, 0.0);
        assert_eq!(guard.config.suffix_ratio, 0.9);

        // The lower bound of the suffix ratio is clamped as well.
        let guard = PerplexityGuard::new("test").with_suffix_ratio(-1.0);
        assert_eq!(guard.config.suffix_ratio, 0.1);
    }

    /// With only the suffix pass enabled, any reported match implies failure.
    #[test]
    fn test_suffix_analysis() {
        let guard = PerplexityGuard::new("test")
            .analyze_full_text(false)
            .analyze_suffix(true)
            .with_suffix_ratio(0.5);

        // Text with adversarial-like suffix
        let text = "Please answer my question about the weather. zxcvbnmasdfghjkqwertyuiopzxcvbnm";
        let result = guard.check(text);

        // Whether the suffix is flagged depends on the perplexity thresholds,
        // but a result must never carry matches while reporting a pass.
        assert!(result.matches.is_empty() || !result.passed);
    }

    /// A failed check on mixed content always carries at least one match.
    #[test]
    fn test_mixed_text() {
        let guard = PerplexityGuard::new("test").with_max_perplexity(800.0);

        // Mixed normal and potentially adversarial content
        let text = "Normal text at the start. xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx More normal text at the end.";
        let result = guard.check(text);

        // Detection of the repetitive section depends on the analyzer; when
        // it is detected, matches must be present.
        if !result.passed {
            assert!(!result.matches.is_empty());
        }
    }

    /// Matches carry the severity configured on the guard.
    #[test]
    fn test_severity_assignment() {
        let guard = PerplexityGuard::new("test")
            .with_severity(Severity::Critical)
            .with_min_entropy(2.0);

        let repetitive = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        let result = guard.check(repetitive);

        if !result.matches.is_empty() {
            assert_eq!(result.matches[0].severity, Severity::Critical);
        }
    }

    /// A GCG-like repetitive suffix, when rejected, is reported with matches.
    #[test]
    fn test_gcg_like_pattern() {
        let guard = PerplexityGuard::new("test");

        // Simulated GCG-style adversarial suffix (repeated "! " tokens)
        let gcg_like =
            "What is 2+2? ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !";
        let result = guard.check(gcg_like);

        // Should detect as low entropy due to repetitive pattern
        if !result.passed {
            let has_low_entropy = result.matches.iter().any(|m| {
                m.metadata
                    .get("anomaly_type")
                    .map(|t| t == "low_entropy")
                    .unwrap_or(false)
            });
            // A rejection must be backed by a low-entropy match or, at the
            // very least, by some match.
            assert!(has_low_entropy || !result.matches.is_empty());
        }
    }
}