use std::collections::HashMap;
/// Character-level entropy and similarity heuristics for text and lines.
///
/// Provides Shannon entropy, a length-normalized variant, Jaccard similarity
/// over string slices, and a simple detector for repeated runs of lines.
pub struct EntropyAnalyzer {
    // Stored by `new` but not read by any method in the impl below.
    // NOTE(review): presumably reserved for windowed analysis — confirm.
    #[allow(dead_code)]
    window_size: usize,
    // Stored by `new` but not read by any method in the impl below
    // (`jaccard_similarity` takes no `self`).
    // NOTE(review): presumably a cut-off for similarity checks — confirm.
    #[allow(dead_code)]
    jaccard_threshold: f64,
}

impl EntropyAnalyzer {
    /// Creates an analyzer that stores the given configuration values.
    pub fn new(window_size: usize, jaccard_threshold: f64) -> Self {
        Self {
            window_size,
            jaccard_threshold,
        }
    }

    /// Returns `(entropy_in_bits, char_count)` for `text`.
    ///
    /// Both the frequency table and the total are measured in `char`s so the
    /// probabilities always sum to one, including for multi-byte UTF-8 text
    /// (`str::len` would report bytes instead).
    fn entropy_and_char_count(text: &str) -> (f64, usize) {
        let mut char_counts: HashMap<char, usize> = HashMap::new();
        let mut total = 0_usize;
        for c in text.chars() {
            *char_counts.entry(c).or_insert(0) += 1;
            total += 1;
        }
        if total == 0 {
            return (0.0, 0);
        }

        let total_f = total as f64;
        // Fold from +0.0 and subtract so a single-symbol string yields +0.0.
        let entropy = char_counts.values().fold(0.0_f64, |acc, &count| {
            let p = count as f64 / total_f;
            acc - p * p.log2()
        });
        (entropy, total)
    }

    /// Computes the Shannon entropy of `text` in bits per character.
    ///
    /// Returns `0.0` for an empty string. Frequencies are computed over
    /// Unicode scalar values (`char`), not bytes.
    pub fn shannon_entropy(&self, text: &str) -> f64 {
        Self::entropy_and_char_count(text).0
    }

    /// Computes the Shannon entropy of `text` divided by `log2(n)`, where `n`
    /// is the number of characters in `text`.
    ///
    /// `log2(n)` is the entropy of a string of `n` distinct characters, so the
    /// result lies in `0.0..=1.0` up to floating-point rounding. Strings of
    /// zero or one character have no meaningful maximum; their (zero) entropy
    /// is returned unchanged.
    pub fn normalized_entropy(&self, text: &str) -> f64 {
        let (entropy, char_count) = Self::entropy_and_char_count(text);
        if char_count <= 1 {
            return entropy;
        }
        // char_count >= 2, so the divisor is at least 1.0.
        entropy / (char_count as f64).log2()
    }

    /// Returns the normalized entropy of every line, in input order.
    pub fn line_entropies(&self, lines: &[&str]) -> Vec<f64> {
        lines
            .iter()
            .map(|line| self.normalized_entropy(line))
            .collect()
    }

    /// Computes the Jaccard similarity `|A ∩ B| / |A ∪ B|` of two collections
    /// of strings, treating each slice as a set (duplicates are ignored).
    ///
    /// Returns `1.0` when both inputs are empty and `0.0` when exactly one is.
    pub fn jaccard_similarity(set1: &[&str], set2: &[&str]) -> f64 {
        if set1.is_empty() && set2.is_empty() {
            return 1.0;
        }
        if set1.is_empty() || set2.is_empty() {
            return 0.0;
        }

        let set1: std::collections::HashSet<&str> = set1.iter().copied().collect();
        let set2: std::collections::HashSet<&str> = set2.iter().copied().collect();
        let intersection = set1.intersection(&set2).count();
        // Inclusion–exclusion; non-zero because both sets are non-empty here.
        let union = set1.len() + set2.len() - intersection;
        intersection as f64 / union as f64
    }

    /// Returns the lines whose normalized entropy is at least `threshold`,
    /// preserving input order (i.e. low-entropy lines are dropped).
    pub fn filter_low_entropy_lines<'a>(&self, lines: &[&'a str], threshold: f64) -> Vec<&'a str> {
        lines
            .iter()
            .copied()
            .filter(|line| self.normalized_entropy(line) >= threshold)
            .collect()
    }

    /// Finds runs of lines that recur at multiples of their own length.
    ///
    /// For every pattern width from 2 up to `min(lines.len() / 2, 10)` and
    /// every start index, the slice `lines[start..start + width]` is compared
    /// with each following full chunk at `start + k * width`. When the pattern
    /// occurs at least three times in total (the matches need not be
    /// adjacent), `(start, width)` is recorded.
    ///
    /// Results are ordered by width, then by start index.
    pub fn find_repetitive_patterns(&self, lines: &[&str]) -> Vec<(usize, usize)> {
        let mut patterns = Vec::new();
        let n = lines.len();
        if n < 2 {
            return patterns;
        }

        for width in 2..=(n / 2).min(10) {
            // width <= n / 2, so `n - 2 * width` cannot underflow.
            for start in 0..=(n - 2 * width) {
                let pattern = &lines[start..start + width];
                // `chunks_exact` yields the pattern itself first (skipped),
                // then every complete chunk at start + k * width.
                let repeats = lines[start..]
                    .chunks_exact(width)
                    .skip(1)
                    .filter(|chunk| *chunk == pattern)
                    .count();
                if repeats + 1 >= 3 {
                    patterns.push((start, width));
                }
            }
        }
        patterns
    }

    /// Scales `entropy` by `1 / (1 + ln(complexity))`.
    ///
    /// A `complexity` of 1 leaves `entropy` unchanged; larger values shrink
    /// it. For `complexity == 0`, `ln` yields negative infinity, so the factor
    /// collapses to (negative) zero.
    pub fn kolmogorov_adjustment(&self, entropy: f64, complexity: usize) -> f64 {
        let complexity_factor = 1.0 / (1.0 + (complexity as f64).ln());
        entropy * complexity_factor
    }
}

impl Default for EntropyAnalyzer {
    /// Builds an analyzer with a window size of 256 and a Jaccard threshold
    /// of 0.7.
    fn default() -> Self {
        Self::new(256, 0.7)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A single repeated character and a short string of distinct characters
    /// must both score below a longer string of all-distinct characters.
    #[test]
    fn test_shannon_entropy() {
        let analyzer = EntropyAnalyzer::default();
        let baseline = analyzer.shannon_entropy("abcdefghijklmnop");

        for sample in &["aaaaaaaa", "aZ4!@9#"] {
            assert!(analyzer.shannon_entropy(sample) < baseline);
        }
    }

    /// Only the two lines made of distinct characters meet the 0.5 cut-off.
    #[test]
    fn test_filter_low_entropy() {
        let lines = ["aaaaa", "xxxxx", "abcde", "fghij"];
        let kept = EntropyAnalyzer::default().filter_low_entropy_lines(&lines, 0.5);
        assert_eq!(kept.len(), 2);
    }

    /// Two of four distinct items are shared, so the similarity is 2 / 4.
    #[test]
    fn test_jaccard_similarity() {
        let left = ["x", "y", "z"];
        let right = ["y", "z", "w"];
        let delta = EntropyAnalyzer::jaccard_similarity(&left, &right) - 0.5;
        assert!(delta.abs() < 0.01);
    }
}