//! Entropy-based compression utilities (`lean_ctx/core/entropy.rs`):
//! Shannon/BPE entropy, Jaccard and minhash similarity, a gzip-based
//! Kolmogorov-complexity proxy, and the entropy compression pipeline.

1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
/// Default minimum BPE token entropy (bits) a line must reach to be kept
/// by the compression pipeline; also the "low entropy" cutoff in analysis.
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
/// Default number of independent hash functions for minhash signatures.
#[allow(dead_code)]
const MINHASH_NUM_HASHES: usize = 128;
13
/// Outcome of one entropy-compression pass over a piece of text.
#[derive(Debug)]
pub struct EntropyResult {
    /// The compressed text.
    pub output: String,
    /// BPE token count of the input before compression.
    pub original_tokens: usize,
    /// BPE token count of `output`.
    pub compressed_tokens: usize,
    /// Human-readable descriptions of the techniques that fired.
    pub techniques: Vec<String>,
}

impl EntropyResult {
    /// Token savings as a percentage in [0, 100].
    /// Returns 0.0 when the original was empty (avoids division by zero).
    pub fn savings_percent(&self) -> f64 {
        match self.original_tokens {
            0 => 0.0,
            total => {
                let saved = total.saturating_sub(self.compressed_tokens);
                (saved as f64 / total as f64) * 100.0
            }
        }
    }
}
31
/// Character-level Shannon entropy of `text`, in bits per character.
/// Empty input yields 0.0.
#[allow(dead_code)]
pub fn shannon_entropy(text: &str) -> f64 {
    let total = text.chars().count();
    if total == 0 {
        return 0.0;
    }
    let mut counts: HashMap<char, usize> = HashMap::new();
    for ch in text.chars() {
        *counts.entry(ch).or_default() += 1;
    }
    let n = total as f64;
    counts
        .values()
        .map(|&c| {
            let p = c as f64 / n;
            -p * p.log2()
        })
        .sum()
}
49
50/// Shannon entropy over BPE token IDs (o200k_base).
51/// More LLM-relevant than character entropy since LLMs process BPE tokens.
52pub fn token_entropy(text: &str) -> f64 {
53    let tokens = encode_tokens(text);
54    if tokens.is_empty() {
55        return 0.0;
56    }
57    let total = tokens.len();
58    let mut freq: HashMap<u32, usize> = HashMap::new();
59    for &t in &tokens {
60        *freq.entry(t).or_default() += 1;
61    }
62    freq.values().fold(0.0_f64, |acc, &count| {
63        let p = count as f64 / total as f64;
64        acc - p * p.log2()
65    })
66}
67
68/// Normalized Shannon entropy: H(X) / log₂(n) where n = number of unique symbols.
69/// Returns a value in [0, 1] where 0 = perfectly predictable, 1 = maximum entropy.
70/// This makes thresholds comparable across different alphabet sizes.
71pub fn normalized_token_entropy(text: &str) -> f64 {
72    let tokens = encode_tokens(text);
73    if tokens.is_empty() {
74        return 0.0;
75    }
76    let total = tokens.len();
77    let mut freq: HashMap<u32, usize> = HashMap::new();
78    for &t in &tokens {
79        *freq.entry(t).or_default() += 1;
80    }
81    let n_unique = freq.len();
82    if n_unique <= 1 {
83        return 0.0;
84    }
85    let h = freq.values().fold(0.0_f64, |acc, &count| {
86        let p = count as f64 / total as f64;
87        acc - p * p.log2()
88    });
89    let h_max = (n_unique as f64).log2();
90    h / h_max
91}
92
/// Word-set Jaccard similarity: |A ∩ B| / |A ∪ B| over whitespace-split words.
/// Word order is ignored. NOTE(review): two empty inputs yield 0.0 here,
/// though some definitions treat J(∅, ∅) as 1.0 — callers rely on 0.0.
#[allow(dead_code)]
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let shared = words_a.intersection(&words_b).count();
    let combined = words_a.union(&words_b).count();

    if combined == 0 {
        0.0
    } else {
        shared as f64 / combined as f64
    }
}
106
/// N-gram Jaccard similarity — preserves word order (unlike word-set Jaccard).
pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
    let grams_a = ngram_set(a, n);
    let grams_b = ngram_set(b, n);

    let shared = grams_a.intersection(&grams_b).count();
    let combined = grams_a.union(&grams_b).count();

    if combined == 0 {
        return 0.0;
    }
    shared as f64 / combined as f64
}

/// Word n-grams of `text` as a set. Texts shorter than `n` words contribute
/// a single "gram" holding all their words; empty text contributes nothing.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    if words.len() < n {
        return if words.is_empty() {
            HashSet::new()
        } else {
            let whole: Vec<String> = words.iter().map(|w| w.to_string()).collect();
            std::iter::once(whole).collect()
        };
    }
    words
        .windows(n)
        .map(|w| w.iter().map(|s| s.to_string()).collect())
        .collect()
}
135
136/// Minhash signature for approximate Jaccard via LSH.
137/// Uses k independent hash functions (polynomial hashing with different seeds).
138#[allow(dead_code)]
139pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
140    let ngrams = ngram_set(text, n);
141    if ngrams.is_empty() {
142        return vec![u64::MAX; k];
143    }
144    let mut signature = vec![u64::MAX; k];
145    for ngram in &ngrams {
146        for (i, min) in signature.iter_mut().enumerate() {
147            let h = hash_with_seed(ngram, i as u64);
148            if h < *min {
149                *min = h;
150            }
151        }
152    }
153    signature
154}
155
/// Approximate Jaccard from two minhash signatures: the fraction of
/// positions where the signatures agree. Mismatched lengths or empty
/// signatures yield 0.0.
#[allow(dead_code)]
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut matching = 0usize;
    for (x, y) in sig_a.iter().zip(sig_b) {
        if x == y {
            matching += 1;
        }
    }
    matching as f64 / sig_a.len() as f64
}
169
/// Seeded hash via `DefaultHasher`: the seed is fed into the hasher before
/// the value, so distinct seeds behave as (approximately) independent hash
/// functions. Hashing the `(seed, value)` tuple writes the same byte stream
/// as hashing seed then value separately.
#[allow(dead_code)]
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    let mut state = std::collections::hash_map::DefaultHasher::new();
    (seed, value).hash(&mut state);
    state.finish()
}
177
178/// Kolmogorov complexity proxy: K(x) ≈ len(gzip(x)) / len(x).
179/// Lower values = more compressible = more redundant.
180pub fn kolmogorov_proxy(content: &str) -> f64 {
181    if content.is_empty() {
182        return 1.0;
183    }
184    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
185    encoder.write_all(content.as_bytes()).ok();
186    let compressed = encoder.finish().unwrap_or_default();
187    compressed.len() as f64 / content.len() as f64
188}
189
/// Coarse bucket for the gzip-based compressibility ratio `K`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    /// K < 0.3 — highly redundant content.
    High,
    /// 0.3 ≤ K < 0.6 — moderately redundant.
    Medium,
    /// K ≥ 0.6 — near-incompressible content.
    Low,
}

impl CompressibilityClass {
    /// Human-readable label including the K range that maps to this class.
    pub fn label(&self) -> &'static str {
        match self {
            CompressibilityClass::High => "high (K<0.3)",
            CompressibilityClass::Medium => "medium (0.3≤K<0.6)",
            CompressibilityClass::Low => "low (K≥0.6)",
        }
    }
}
206
207/// Classify how compressible content is based on gzip ratio.
208pub fn compressibility_class(content: &str) -> CompressibilityClass {
209    let k = kolmogorov_proxy(content);
210    if k < 0.3 {
211        CompressibilityClass::High
212    } else if k < 0.6 {
213        CompressibilityClass::Medium
214    } else {
215        CompressibilityClass::Low
216    }
217}
218
/// Compress `content` with the default thresholds: lines whose BPE token
/// entropy falls below `BPE_ENTROPY_THRESHOLD` are dropped, and blocks
/// whose bigram Jaccard similarity is ≥ 0.7 are deduplicated.
pub fn entropy_compress(content: &str) -> EntropyResult {
    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
}
222
223pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
224    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
225    let before_lines = content.lines().count() as u32;
226    let result =
227        entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
228    let after_lines = result.output.lines().count() as u32;
229
230    if before_lines != after_lines {
231        super::events::emit(super::events::EventKind::Compression {
232            path: path.to_string(),
233            before_lines,
234            after_lines,
235            strategy: "entropy_adaptive".to_string(),
236            kept_line_count: after_lines,
237            removed_line_count: before_lines.saturating_sub(after_lines),
238        });
239    }
240
241    result
242}
243
244fn entropy_compress_with_thresholds(
245    content: &str,
246    entropy_threshold: f64,
247    jaccard_threshold: f64,
248) -> EntropyResult {
249    let original_tokens = count_tokens(content);
250    let mut lines: Vec<&str> = content.lines().collect();
251    let mut techniques = Vec::new();
252
253    let original_count = lines.len();
254    lines.retain(|line| {
255        let trimmed = line.trim();
256        super::surprise::should_keep_line(trimmed, entropy_threshold)
257    });
258    let removed = original_count - lines.len();
259    if removed > 0 {
260        techniques.push(format!(
261            "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
262        ));
263    }
264
265    let blocks = extract_blocks(&lines);
266    let groups = find_pattern_groups(&blocks, jaccard_threshold);
267    let mut dedup_count = 0;
268    for group in &groups {
269        if group.len() > 1 {
270            dedup_count += group.len() - 1;
271        }
272    }
273    if dedup_count > 0 {
274        techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
275    }
276
277    let mut result: Vec<String> = Vec::new();
278    let mut skip_indices: HashSet<usize> = HashSet::new();
279    for group in &groups {
280        if group.len() > 1 {
281            for &idx in &group[1..] {
282                skip_indices.insert(idx);
283            }
284        }
285    }
286    for (i, line) in lines.iter().enumerate() {
287        if !skip_indices.contains(&i) {
288            result.push(line.to_string());
289        }
290    }
291
292    let mut collapsed = Vec::new();
293    let mut blank_count = 0;
294    for line in &result {
295        if line.trim().is_empty() {
296            blank_count += 1;
297            if blank_count <= 1 {
298                collapsed.push(line.clone());
299            }
300        } else {
301            blank_count = 0;
302            collapsed.push(line.clone());
303        }
304    }
305
306    let output = collapsed.join("\n");
307    let compressed_tokens = count_tokens(&output);
308
309    EntropyResult {
310        output,
311        original_tokens,
312        compressed_tokens,
313        techniques,
314    }
315}
316
/// Summary statistics produced by `analyze_entropy`.
#[derive(Debug)]
pub struct EntropyAnalysis {
    /// Mean BPE token entropy over non-blank lines (0.0 if there are none).
    pub avg_entropy: f64,
    /// Non-blank lines with entropy below `BPE_ENTROPY_THRESHOLD`.
    pub low_entropy_count: usize,
    /// Non-blank lines with entropy above 3.0 bits.
    pub high_entropy_count: usize,
    /// Total number of lines, blank lines included.
    pub total_lines: usize,
}
324
325pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
326    let lines: Vec<&str> = content.lines().collect();
327    let total = lines.len();
328    let mut sum = 0.0;
329    let mut low = 0;
330    let mut high = 0;
331    let mut counted = 0;
332
333    for line in &lines {
334        let trimmed = line.trim();
335        if trimmed.is_empty() {
336            continue;
337        }
338        let h = token_entropy(trimmed);
339        sum += h;
340        counted += 1;
341        if h < BPE_ENTROPY_THRESHOLD {
342            low += 1;
343        }
344        if h > 3.0 {
345            high += 1;
346        }
347    }
348
349    EntropyAnalysis {
350        avg_entropy: if counted > 0 {
351            sum / counted as f64
352        } else {
353            0.0
354        },
355        low_entropy_count: low,
356        high_entropy_count: high,
357        total_lines: total,
358    }
359}
360
/// A contiguous run of non-blank lines with its starting line index.
#[allow(dead_code)]
struct Block {
    /// Index (into the slice passed to `extract_blocks`) of the first line.
    start: usize,
    /// Trimmed lines of the block, each terminated by '\n'.
    content: String,
}

/// Split `lines` into blank-line-separated blocks of trimmed content.
fn extract_blocks(lines: &[&str]) -> Vec<Block> {
    let mut blocks = Vec::new();
    let mut buffer = String::new();
    let mut block_start = 0;

    for (idx, raw) in lines.iter().enumerate() {
        let text = raw.trim();
        if text.is_empty() {
            // A blank line closes the current block, if one is open.
            if !buffer.is_empty() {
                blocks.push(Block {
                    start: block_start,
                    content: std::mem::take(&mut buffer),
                });
            }
        } else {
            if buffer.is_empty() {
                block_start = idx;
            }
            buffer.push_str(text);
            buffer.push('\n');
        }
    }

    // Flush a block left open at end-of-input.
    if !buffer.is_empty() {
        blocks.push(Block {
            start: block_start,
            content: buffer,
        });
    }

    blocks
}
398
399fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
400    let mut groups: Vec<Vec<usize>> = Vec::new();
401    let mut assigned: HashSet<usize> = HashSet::new();
402
403    for (i, block_a) in blocks.iter().enumerate() {
404        if assigned.contains(&i) {
405            continue;
406        }
407        let mut group = vec![i];
408        for (j, block_b) in blocks.iter().enumerate().skip(i + 1) {
409            if assigned.contains(&j) {
410                continue;
411            }
412            if ngram_jaccard(&block_a.content, &block_b.content, 2) >= threshold {
413                group.push(j);
414                assigned.insert(j);
415            }
416        }
417        if group.len() > 1 {
418            assigned.insert(i);
419        }
420        groups.push(group);
421    }
422
423    groups
424}
425
#[cfg(test)]
mod tests {
    use super::*;

    // --- character-level Shannon entropy ---

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        // A single repeated symbol is perfectly predictable: H = 0.
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    // --- word-set Jaccard similarity ---

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // --- end-to-end compression pipeline ---

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        // Compression must never add tokens.
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        // Zero original tokens must not divide by zero.
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    // --- BPE token entropy ---

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // --- n-gram Jaccard (order-sensitive) ---

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // --- minhash approximation ---

    #[test]
    fn minhash_approximates_jaccard() {
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, MINHASH_NUM_HASHES);
        let sig_b = minhash_signature(b, 2, MINHASH_NUM_HASHES);
        let approx = minhash_similarity(&sig_a, &sig_b);
        // 128 hash functions gives std-dev well under 0.1, so 0.2 is a
        // generous tolerance.
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        // Empty input leaves every slot at the u64::MAX sentinel.
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    // --- Kolmogorov proxy & compressibility ---

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}