1use std::collections::HashMap;
22use tracing::debug;
23
/// Default length of character n-grams used by the language model.
pub const DEFAULT_NGRAM_ORDER: usize = 3;

/// Default sliding-window width, in characters, for segment scanning.
pub const DEFAULT_WINDOW_SIZE: usize = 50;
/// A contiguous span of input text flagged as statistically anomalous.
#[derive(Debug, Clone)]
pub struct AnomalySegment {
    /// Start offset of the span, in characters (not bytes).
    pub start: usize,
    /// End offset of the span (exclusive), in characters.
    pub end: usize,
    /// The flagged text itself.
    pub text: String,
    /// Character-level perplexity computed for this span.
    pub perplexity: f32,
    /// Shannon entropy (bits) of this span's character distribution.
    pub entropy: f32,
    /// Which statistical check the span failed.
    pub anomaly_type: AnomalyType,
}
46
/// The kind of statistical anomaly detected in a segment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AnomalyType {
    /// Perplexity above the caller's maximum (e.g. gibberish / random text).
    HighPerplexity,
    /// Perplexity below the caller's minimum (suspiciously predictable text).
    LowPerplexity,
    /// Character entropy below threshold (e.g. heavy repetition).
    LowEntropy,
    /// NOTE(review): never constructed by any detector visible in this
    /// file — presumably reserved for a future distribution-shape check.
    UnusualDistribution,
}
59
60impl std::fmt::Display for AnomalyType {
61 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62 match self {
63 AnomalyType::HighPerplexity => write!(f, "high_perplexity"),
64 AnomalyType::LowPerplexity => write!(f, "low_perplexity"),
65 AnomalyType::LowEntropy => write!(f, "low_entropy"),
66 AnomalyType::UnusualDistribution => write!(f, "unusual_distribution"),
67 }
68 }
69}
70
/// Tuning knobs for the perplexity analyzer.
#[derive(Debug, Clone)]
pub struct PerplexityConfig {
    /// Length of the character n-grams looked up in the language model.
    pub ngram_order: usize,
    /// Width, in characters, of the sliding window used when scanning.
    pub window_size: usize,
    /// Inputs shorter than this (in characters) are never analyzed/flagged.
    pub min_segment_length: usize,
}
81
82impl Default for PerplexityConfig {
83 fn default() -> Self {
84 Self {
85 ngram_order: DEFAULT_NGRAM_ORDER,
86 window_size: DEFAULT_WINDOW_SIZE,
87 min_segment_length: 10,
88 }
89 }
90}
91
92pub struct PerplexityAnalyzer {
98 char_ngram_model: HashMap<String, f32>,
100 config: PerplexityConfig,
102}
103
104impl PerplexityAnalyzer {
105 pub fn new() -> Self {
107 Self::with_config(PerplexityConfig::default())
108 }
109
110 pub fn with_config(config: PerplexityConfig) -> Self {
112 let char_ngram_model = Self::build_english_ngram_model(config.ngram_order);
113 Self {
114 char_ngram_model,
115 config,
116 }
117 }
118
119 fn build_english_ngram_model(ngram_order: usize) -> HashMap<String, f32> {
124 let mut model = HashMap::new();
125
126 let common_patterns = [
129 ("th", -2.0),
131 ("he", -2.1),
132 ("in", -2.3),
133 ("er", -2.4),
134 ("an", -2.5),
135 ("re", -2.6),
136 ("on", -2.7),
137 ("at", -2.8),
138 ("en", -2.9),
139 ("nd", -3.0),
140 ("ti", -3.1),
141 ("es", -3.2),
142 ("or", -3.3),
143 ("te", -3.4),
144 ("of", -3.5),
145 ("ed", -3.6),
146 ("is", -3.7),
147 ("it", -3.8),
148 ("al", -3.9),
149 ("ar", -4.0),
150 ("st", -4.1),
151 ("to", -4.2),
152 ("nt", -4.3),
153 ("ng", -4.4),
154 ("se", -4.5),
155 ("the", -3.0),
157 ("and", -3.5),
158 ("ing", -3.8),
159 ("ion", -4.0),
160 ("tio", -4.2),
161 ("ent", -4.4),
162 ("ati", -4.6),
163 ("for", -4.8),
164 ("her", -5.0),
165 ("ter", -5.2),
166 ("hat", -5.4),
167 ("tha", -5.6),
168 ("ere", -5.8),
169 ("ate", -6.0),
170 ("his", -6.2),
171 ("con", -6.4),
172 ("res", -6.6),
173 ("ver", -6.8),
174 ("all", -7.0),
175 ("ons", -7.2),
176 (" th", -2.5),
178 ("e ", -2.8),
179 (" a ", -3.0),
180 (" of", -3.2),
181 (" to", -3.4),
182 (" in", -3.6),
183 ("s ", -3.8),
184 (". ", -4.0),
185 (", ", -4.2),
186 ];
187
188 for (pattern, log_prob) in common_patterns {
189 if pattern.len() <= ngram_order {
190 model.insert(pattern.to_lowercase(), log_prob);
191 }
192 }
193
194 model
195 }
196
197 pub fn char_perplexity(&self, text: &str) -> f32 {
202 let text = text.to_lowercase();
203 let chars: Vec<char> = text.chars().collect();
204
205 if chars.len() < self.config.ngram_order {
206 return 0.0;
207 }
208
209 let mut total_log_prob = 0.0f32;
210 let mut count = 0;
211
212 for i in 0..=(chars.len() - self.config.ngram_order) {
213 let ngram: String = chars[i..i + self.config.ngram_order].iter().collect();
214
215 let log_prob = self.char_ngram_model.get(&ngram).copied().unwrap_or(-10.0);
217 total_log_prob += log_prob;
218 count += 1;
219 }
220
221 if count == 0 {
222 return 0.0;
223 }
224
225 let avg_log_prob = total_log_prob / count as f32;
227 (-avg_log_prob).exp()
228 }
229
230 pub fn token_entropy(&self, text: &str) -> f32 {
235 let chars: Vec<char> = text.chars().collect();
236 if chars.is_empty() {
237 return 0.0;
238 }
239
240 let mut freq: HashMap<char, usize> = HashMap::new();
242 for &c in &chars {
243 *freq.entry(c).or_insert(0) += 1;
244 }
245
246 let n = chars.len() as f32;
248 let entropy: f32 = freq
249 .values()
250 .map(|&count| {
251 let p = count as f32 / n;
252 if p > 0.0 {
253 -p * p.log2()
254 } else {
255 0.0
256 }
257 })
258 .sum();
259
260 entropy
261 }
262
263 pub fn unique_char_ratio(&self, text: &str) -> f32 {
268 let chars: Vec<char> = text.chars().collect();
269 if chars.is_empty() {
270 return 0.0;
271 }
272
273 let unique: std::collections::HashSet<char> = chars.iter().copied().collect();
274 unique.len() as f32 / chars.len() as f32
275 }
276
277 pub fn find_anomalous_segments(
281 &self,
282 text: &str,
283 max_perplexity: f32,
284 min_perplexity: f32,
285 min_entropy: f32,
286 ) -> Vec<AnomalySegment> {
287 let mut anomalies = Vec::new();
288 let chars: Vec<char> = text.chars().collect();
289
290 if chars.len() < self.config.min_segment_length {
291 return anomalies;
292 }
293
294 let window_size = self.config.window_size.min(chars.len());
295 let step = window_size / 4; let mut i = 0;
298 while i + window_size <= chars.len() {
299 let segment: String = chars[i..i + window_size].iter().collect();
300
301 let perplexity = self.char_perplexity(&segment);
302 let entropy = self.token_entropy(&segment);
303
304 let anomaly_type = if perplexity > max_perplexity {
305 Some(AnomalyType::HighPerplexity)
306 } else if perplexity < min_perplexity && perplexity > 0.0 {
307 Some(AnomalyType::LowPerplexity)
308 } else if entropy < min_entropy {
309 Some(AnomalyType::LowEntropy)
310 } else {
311 None
312 };
313
314 if let Some(atype) = anomaly_type {
315 debug!(
316 start = i,
317 end = i + window_size,
318 perplexity = %perplexity,
319 entropy = %entropy,
320 anomaly_type = %atype,
321 "Anomalous segment detected"
322 );
323
324 anomalies.push(AnomalySegment {
325 start: i,
326 end: i + window_size,
327 text: segment,
328 perplexity,
329 entropy,
330 anomaly_type: atype,
331 });
332 }
333
334 i += step;
335 }
336
337 Self::merge_overlapping_anomalies(anomalies)
339 }
340
341 fn merge_overlapping_anomalies(mut segments: Vec<AnomalySegment>) -> Vec<AnomalySegment> {
343 if segments.is_empty() {
344 return segments;
345 }
346
347 segments.sort_by_key(|s| s.start);
348
349 let mut merged = Vec::new();
350 let mut current = segments.remove(0);
351
352 for next in segments {
353 if next.start <= current.end {
354 current.end = current.end.max(next.end);
356 current.perplexity = current.perplexity.max(next.perplexity);
357 current.entropy = current.entropy.min(next.entropy);
358 if next.anomaly_type == AnomalyType::HighPerplexity {
360 current.anomaly_type = AnomalyType::HighPerplexity;
361 }
362 } else {
363 merged.push(current);
364 current = next;
365 }
366 }
367 merged.push(current);
368
369 merged
370 }
371
372 pub fn is_suspicious(&self, text: &str, max_perplexity: f32, min_entropy: f32) -> bool {
376 if text.len() < self.config.min_segment_length {
377 return false;
378 }
379
380 let perplexity = self.char_perplexity(text);
381 let entropy = self.token_entropy(text);
382
383 perplexity > max_perplexity || entropy < min_entropy
384 }
385
386 pub fn analyze_suffix(
397 &self,
398 text: &str,
399 suffix_ratio: f32,
400 max_perplexity: f32,
401 min_entropy: f32,
402 ) -> Option<AnomalySegment> {
403 let chars: Vec<char> = text.chars().collect();
404 let suffix_len = (chars.len() as f32 * suffix_ratio.clamp(0.1, 0.5)) as usize;
405
406 if suffix_len < self.config.min_segment_length {
407 return None;
408 }
409
410 let start = chars.len() - suffix_len;
411 let suffix: String = chars[start..].iter().collect();
412
413 let perplexity = self.char_perplexity(&suffix);
414 let entropy = self.token_entropy(&suffix);
415
416 if perplexity > max_perplexity {
417 Some(AnomalySegment {
418 start,
419 end: chars.len(),
420 text: suffix,
421 perplexity,
422 entropy,
423 anomaly_type: AnomalyType::HighPerplexity,
424 })
425 } else if entropy < min_entropy {
426 Some(AnomalySegment {
427 start,
428 end: chars.len(),
429 text: suffix,
430 perplexity,
431 entropy,
432 anomaly_type: AnomalyType::LowEntropy,
433 })
434 } else {
435 None
436 }
437 }
438}
439
440impl Default for PerplexityAnalyzer {
441 fn default() -> Self {
442 Self::new()
443 }
444}
445
#[cfg(test)]
mod tests {
    use super::*;

    // Ordinary English should score under the (generous) perplexity cap;
    // the 50000.0 threshold is tuned to the hand-built n-gram model.
    #[test]
    fn test_normal_english_perplexity() {
        let analyzer = PerplexityAnalyzer::new();

        let normal_text = "The quick brown fox jumps over the lazy dog.";
        let perplexity = analyzer.char_perplexity(normal_text);

        assert!(perplexity > 0.0);
        assert!(
            perplexity < 50000.0,
            "Normal text perplexity too high: {}",
            perplexity
        );
    }

    // Gibberish (rare character sequences) must rank strictly above
    // ordinary English; only the relative ordering is asserted.
    #[test]
    fn test_gibberish_perplexity() {
        let analyzer = PerplexityAnalyzer::new();

        let gibberish = "xyzqkjwfpvbn zxcvqwert yuiopasdfghjkl";
        let perplexity = analyzer.char_perplexity(gibberish);

        let normal_text = "The quick brown fox jumps over the lazy dog.";
        let normal_perplexity = analyzer.char_perplexity(normal_text);

        assert!(
            perplexity > normal_perplexity,
            "Gibberish ({}) should have higher perplexity than normal ({})",
            perplexity,
            normal_perplexity
        );
    }

    // A single repeated character has (near-)zero entropy; varied English
    // should be strictly higher.
    #[test]
    fn test_repetitive_text_entropy() {
        let analyzer = PerplexityAnalyzer::new();

        let repetitive = "aaaaaaaaaaaaaaaaaaaaaaaaaaa";
        let entropy = analyzer.token_entropy(repetitive);

        assert!(
            entropy < 0.5,
            "Repetitive text entropy too high: {}",
            entropy
        );

        let normal_text = "The quick brown fox jumps over the lazy dog.";
        let normal_entropy = analyzer.token_entropy(normal_text);

        assert!(normal_entropy > entropy);
    }

    // unique_char_ratio: 1 distinct char out of 10 vs 10 distinct out of 10.
    #[test]
    fn test_unique_char_ratio() {
        let analyzer = PerplexityAnalyzer::new();

        let repetitive = "aaaaaaaaaa";
        let ratio = analyzer.unique_char_ratio(repetitive);
        assert!(ratio < 0.2, "Repetitive text should have low unique ratio");

        let varied = "abcdefghij";
        let varied_ratio = analyzer.unique_char_ratio(varied);
        assert!(
            varied_ratio > 0.9,
            "Varied text should have high unique ratio"
        );
    }

    // The run of 'x' characters should trip the low-entropy check inside
    // at least one 20-char window.
    #[test]
    fn test_find_anomalous_segments() {
        let analyzer = PerplexityAnalyzer::with_config(PerplexityConfig {
            ngram_order: 3,
            window_size: 20,
            min_segment_length: 10,
        });

        let text = "Normal text here. xxxxxxxxxxxxxxxxxxxxxxx More normal text.";
        let anomalies = analyzer.find_anomalous_segments(text, 1000.0, 1.0, 1.0);

        assert!(!anomalies.is_empty(), "Should detect repetitive segment");
    }

    // Whole-text screen: normal prose passes; a single repeated character
    // fails the entropy threshold.
    #[test]
    fn test_is_suspicious() {
        let analyzer = PerplexityAnalyzer::new();

        let normal = "This is a normal sentence with common words.";
        assert!(!analyzer.is_suspicious(normal, 50000.0, 1.5));

        let repetitive = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(analyzer.is_suspicious(repetitive, 50000.0, 1.5));
    }

    // Suffix analysis may or may not flag the tail here; only the internal
    // consistency of a returned segment is asserted.
    #[test]
    fn test_analyze_suffix() {
        let analyzer = PerplexityAnalyzer::new();

        let text = "Please answer the following question. zde yz q xk wj pv bn zde yz";
        let anomaly = analyzer.analyze_suffix(text, 0.3, 100000.0, 1.0);

        if let Some(a) = anomaly {
            assert!(a.perplexity > 0.0);
        }
    }

    // Inputs shorter than one n-gram fall through to the 0.0 sentinel.
    #[test]
    fn test_short_text() {
        let analyzer = PerplexityAnalyzer::new();

        let short = "Hi";
        let perplexity = analyzer.char_perplexity(short);
        assert_eq!(perplexity, 0.0, "Short text should return 0 perplexity");

        let entropy = analyzer.token_entropy(short);
        assert!(entropy >= 0.0);
    }

    // All metrics define the empty string as 0.0 rather than NaN/panic.
    #[test]
    fn test_empty_text() {
        let analyzer = PerplexityAnalyzer::new();

        assert_eq!(analyzer.char_perplexity(""), 0.0);
        assert_eq!(analyzer.token_entropy(""), 0.0);
        assert_eq!(analyzer.unique_char_ratio(""), 0.0);
    }
}