// lean_ctx/core/entropy.rs
1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
/// Default per-line BPE token entropy cutoff (bits/token): lines below this are
/// treated as boilerplate. Used as the default in `entropy_compress` and as the
/// "low" bucket boundary in `analyze_entropy`.
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
/// Outcome of one entropy-based compression pass.
#[derive(Debug)]
pub struct EntropyResult {
    /// The compressed text.
    pub output: String,
    /// Token count of the input before compression.
    pub original_tokens: usize,
    /// Token count of `output`.
    pub compressed_tokens: usize,
    /// Human-readable descriptions of the techniques that removed content.
    pub techniques: Vec<String>,
}

impl EntropyResult {
    /// Percentage of tokens saved, in `[0, 100]`; `0.0` when the original
    /// input had no tokens (avoids a divide-by-zero).
    pub fn savings_percent(&self) -> f64 {
        match self.original_tokens {
            0 => 0.0,
            total => {
                let saved = total.saturating_sub(self.compressed_tokens) as f64;
                saved / total as f64 * 100.0
            }
        }
    }
}
29
/// Character-level Shannon entropy of `text`, in bits per character.
/// Returns 0.0 for empty input or a single repeated character.
pub fn shannon_entropy(text: &str) -> f64 {
    let mut counts: HashMap<char, usize> = HashMap::new();
    let mut total = 0usize;
    for ch in text.chars() {
        *counts.entry(ch).or_default() += 1;
        total += 1;
    }
    if total == 0 {
        return 0.0;
    }
    counts
        .values()
        .map(|&n| {
            let p = n as f64 / total as f64;
            -p * p.log2()
        })
        .sum()
}
46
47/// Shannon entropy over BPE token IDs (o200k_base).
48/// More LLM-relevant than character entropy since LLMs process BPE tokens.
49pub fn token_entropy(text: &str) -> f64 {
50    let tokens = encode_tokens(text);
51    if tokens.is_empty() {
52        return 0.0;
53    }
54    let total = tokens.len();
55    let mut freq: HashMap<u32, usize> = HashMap::new();
56    for &t in &tokens {
57        *freq.entry(t).or_default() += 1;
58    }
59    freq.values().fold(0.0_f64, |acc, &count| {
60        let p = count as f64 / total as f64;
61        acc - p * p.log2()
62    })
63}
64
65/// Normalized Shannon entropy: H(X) / log₂(n) where n = number of unique symbols.
66/// Returns a value in [0, 1] where 0 = perfectly predictable, 1 = maximum entropy.
67/// This makes thresholds comparable across different alphabet sizes.
68pub fn normalized_token_entropy(text: &str) -> f64 {
69    let tokens = encode_tokens(text);
70    if tokens.is_empty() {
71        return 0.0;
72    }
73    let total = tokens.len();
74    let mut freq: HashMap<u32, usize> = HashMap::new();
75    for &t in &tokens {
76        *freq.entry(t).or_default() += 1;
77    }
78    let n_unique = freq.len();
79    if n_unique <= 1 {
80        return 0.0;
81    }
82    let h = freq.values().fold(0.0_f64, |acc, &count| {
83        let p = count as f64 / total as f64;
84        acc - p * p.log2()
85    });
86    let h_max = (n_unique as f64).log2();
87    h / h_max
88}
89
/// Jaccard similarity of the whitespace-separated word sets of `a` and `b`.
/// Word order is ignored; returns 0.0 when both inputs have no words.
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let shared = words_a.intersection(&words_b).count();
    // |A ∪ B| = |A| + |B| − |A ∩ B| (inclusion–exclusion).
    let combined = words_a.len() + words_b.len() - shared;

    if combined == 0 {
        0.0
    } else {
        shared as f64 / combined as f64
    }
}
102
103/// N-gram Jaccard similarity — preserves word order (unlike word-set Jaccard).
104pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
105    let set_a = ngram_set(a, n);
106    let set_b = ngram_set(b, n);
107
108    let intersection = set_a.intersection(&set_b).count();
109    let union = set_a.union(&set_b).count();
110
111    if union == 0 {
112        return 0.0;
113    }
114    intersection as f64 / union as f64
115}
116
/// Set of word n-grams of `text`. When the text has fewer than `n` words but
/// at least one, the whole word sequence stands in as a single (short) gram;
/// an empty text yields an empty set.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    match words.len() {
        0 => HashSet::new(),
        len if len < n => {
            std::iter::once(words.iter().map(|w| (*w).to_string()).collect()).collect()
        }
        _ => words
            .windows(n)
            .map(|window| window.iter().map(|w| (*w).to_string()).collect())
            .collect(),
    }
}
131
132/// Minhash signature for approximate Jaccard via LSH.
133/// Uses k independent hash functions (polynomial hashing with different seeds).
134pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
135    let ngrams = ngram_set(text, n);
136    if ngrams.is_empty() {
137        return vec![u64::MAX; k];
138    }
139    let mut signature = vec![u64::MAX; k];
140    for ngram in &ngrams {
141        for (i, min) in signature.iter_mut().enumerate() {
142            let h = hash_with_seed(ngram, i as u64);
143            if h < *min {
144                *min = h;
145            }
146        }
147    }
148    signature
149}
150
/// Approximate Jaccard similarity: the fraction of positions at which two
/// MinHash signatures agree. Returns 0.0 for empty or mismatched-length input.
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut matches = 0usize;
    for (a, b) in sig_a.iter().zip(sig_b) {
        if a == b {
            matches += 1;
        }
    }
    matches as f64 / sig_a.len() as f64
}
163
/// Seeded hash: feeds `seed` then `value` into a fresh `DefaultHasher`,
/// giving a family of distinct hash functions indexed by seed (stable within
/// one process run, which is all MinHash needs).
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    let mut state = DefaultHasher::new();
    seed.hash(&mut state);
    value.hash(&mut state);
    state.finish()
}
170
171/// Kolmogorov complexity proxy: K(x) ≈ len(gzip(x)) / len(x).
172/// Lower values = more compressible = more redundant.
173pub fn kolmogorov_proxy(content: &str) -> f64 {
174    if content.is_empty() {
175        return 1.0;
176    }
177    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
178    encoder.write_all(content.as_bytes()).ok();
179    let compressed = encoder.finish().unwrap_or_default();
180    compressed.len() as f64 / content.len() as f64
181}
182
/// Coarse bucket for how compressible content is under gzip (K = gzip ratio).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    /// K < 0.3 — highly redundant.
    High,
    /// 0.3 ≤ K < 0.6.
    Medium,
    /// K ≥ 0.6 — near-incompressible.
    Low,
}

impl CompressibilityClass {
    /// Human-readable label including the gzip-ratio range for the bucket.
    pub fn label(&self) -> &'static str {
        match *self {
            CompressibilityClass::High => "high (K<0.3)",
            CompressibilityClass::Medium => "medium (0.3≤K<0.6)",
            CompressibilityClass::Low => "low (K≥0.6)",
        }
    }
}
199
200/// Classify how compressible content is based on gzip ratio.
201pub fn compressibility_class(content: &str) -> CompressibilityClass {
202    let k = kolmogorov_proxy(content);
203    if k < 0.3 {
204        CompressibilityClass::High
205    } else if k < 0.6 {
206        CompressibilityClass::Medium
207    } else {
208        CompressibilityClass::Low
209    }
210}
211
/// Compress `content` with the default thresholds: `BPE_ENTROPY_THRESHOLD`
/// for per-line entropy and 0.7 for n-gram Jaccard duplicate detection.
pub fn entropy_compress(content: &str) -> EntropyResult {
    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
}
215
/// Compress `content` using thresholds derived per-file from `path` and the
/// content itself, emitting a `Compression` telemetry event when the pass
/// changed the line count.
pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
    let before_lines = content.lines().count() as u32;
    let result =
        entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
    let after_lines = result.output.lines().count() as u32;

    // Only emit an event when compression actually removed (or added) lines;
    // a no-op pass stays silent.
    if before_lines != after_lines {
        super::events::emit(super::events::EventKind::Compression {
            path: path.to_string(),
            before_lines,
            after_lines,
            strategy: "entropy_adaptive".to_string(),
            kept_line_count: after_lines,
            // saturating_sub: guards the (unexpected) case of output growing.
            removed_line_count: before_lines.saturating_sub(after_lines),
        });
    }

    result
}
236
237fn entropy_compress_with_thresholds(
238    content: &str,
239    entropy_threshold: f64,
240    jaccard_threshold: f64,
241) -> EntropyResult {
242    let original_tokens = count_tokens(content);
243    let mut lines: Vec<&str> = content.lines().collect();
244    let mut techniques = Vec::new();
245
246    let original_count = lines.len();
247    lines.retain(|line| {
248        let trimmed = line.trim();
249        super::surprise::should_keep_line(trimmed, entropy_threshold)
250    });
251    let removed = original_count - lines.len();
252    if removed > 0 {
253        techniques.push(format!(
254            "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
255        ));
256    }
257
258    let blocks = extract_blocks(&lines);
259    let groups = find_pattern_groups(&blocks, jaccard_threshold);
260    let mut dedup_count = 0;
261    for group in &groups {
262        if group.len() > 1 {
263            dedup_count += group.len() - 1;
264        }
265    }
266    if dedup_count > 0 {
267        techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
268    }
269
270    let mut result: Vec<String> = Vec::new();
271    let mut skip_indices: HashSet<usize> = HashSet::new();
272    for group in &groups {
273        if group.len() > 1 {
274            for &idx in &group[1..] {
275                skip_indices.insert(idx);
276            }
277        }
278    }
279    for (i, line) in lines.iter().enumerate() {
280        if !skip_indices.contains(&i) {
281            result.push(line.to_string());
282        }
283    }
284
285    let mut collapsed = Vec::new();
286    let mut blank_count = 0;
287    for line in &result {
288        if line.trim().is_empty() {
289            blank_count += 1;
290            if blank_count <= 1 {
291                collapsed.push(line.clone());
292            }
293        } else {
294            blank_count = 0;
295            collapsed.push(line.clone());
296        }
297    }
298
299    let output = collapsed.join("\n");
300    let compressed_tokens = count_tokens(&output);
301
302    EntropyResult {
303        output,
304        original_tokens,
305        compressed_tokens,
306        techniques,
307    }
308}
309
/// Summary statistics produced by `analyze_entropy`.
#[derive(Debug)]
pub struct EntropyAnalysis {
    // Mean BPE token entropy over non-blank lines (0.0 when there are none).
    pub avg_entropy: f64,
    // Non-blank lines with entropy below BPE_ENTROPY_THRESHOLD.
    pub low_entropy_count: usize,
    // Non-blank lines with entropy above 3.0.
    pub high_entropy_count: usize,
    // Total line count, blank lines included.
    pub total_lines: usize,
}
317
318pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
319    let lines: Vec<&str> = content.lines().collect();
320    let total = lines.len();
321    let mut sum = 0.0;
322    let mut low = 0;
323    let mut high = 0;
324    let mut counted = 0;
325
326    for line in &lines {
327        let trimmed = line.trim();
328        if trimmed.is_empty() {
329            continue;
330        }
331        let h = token_entropy(trimmed);
332        sum += h;
333        counted += 1;
334        if h < BPE_ENTROPY_THRESHOLD {
335            low += 1;
336        }
337        if h > 3.0 {
338            high += 1;
339        }
340    }
341
342    EntropyAnalysis {
343        avg_entropy: if counted > 0 {
344            sum / counted as f64
345        } else {
346            0.0
347        },
348        low_entropy_count: low,
349        high_entropy_count: high,
350        total_lines: total,
351    }
352}
353
/// A contiguous run of non-blank lines; each line is stored trimmed and
/// newline-terminated.
struct Block {
    content: String,
}

/// Split `lines` into `Block`s at blank-line boundaries. Blank lines belong
/// to no block; a trailing unterminated run still becomes a final block.
fn extract_blocks(lines: &[&str]) -> Vec<Block> {
    let mut blocks = Vec::new();
    let mut buffer = String::new();

    for line in lines {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            if !buffer.is_empty() {
                // mem::take hands off the accumulated content and leaves an
                // empty buffer behind — no clone needed.
                blocks.push(Block {
                    content: std::mem::take(&mut buffer),
                });
            }
        } else {
            buffer.push_str(trimmed);
            buffer.push('\n');
        }
    }

    if !buffer.is_empty() {
        blocks.push(Block { content: buffer });
    }

    blocks
}
381
382fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
383    // Exact n-gram Jaccard, but with precomputed n-gram sets per block to avoid
384    // rebuilding allocations per pair. Includes a size-ratio impossibility check
385    // (max possible Jaccard is |A|/|B| for |A|<=|B|).
386    let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
387    let sizes: Vec<usize> = sets.iter().map(|s| s.len()).collect();
388
389    let mut groups: Vec<Vec<usize>> = Vec::new();
390    let mut assigned: HashSet<usize> = HashSet::new();
391
392    for i in 0..blocks.len() {
393        if assigned.contains(&i) {
394            continue;
395        }
396        let mut group = vec![i];
397        for j in (i + 1)..blocks.len() {
398            if assigned.contains(&j) {
399                continue;
400            }
401            let size_i = sizes[i];
402            let size_j = sizes[j];
403            let min_sz = size_i.min(size_j);
404            let max_sz = size_i.max(size_j);
405            if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
406                continue;
407            }
408            let inter = sets[i].intersection(&sets[j]).count();
409            let union = size_i + size_j - inter;
410            if union > 0 && (inter as f64 / union as f64) >= threshold {
411                group.push(j);
412                assigned.insert(j);
413            }
414        }
415        if group.len() > 1 {
416            assigned.insert(i);
417        }
418        groups.push(group);
419    }
420
421    groups
422}
423
// Unit tests for the entropy/similarity/compression helpers.
// NOTE(review): `kolmogorov_diverse_is_higher` and
// `kolmogorov_diverse_higher_than_repetitive` assert essentially the same
// property with different fixtures; consider consolidating.
#[cfg(test)]
mod tests {
    use super::*;

    // --- shannon_entropy (character-level) ---

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    // --- word-set Jaccard ---

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // --- entropy_compress pipeline and EntropyResult ---

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        // Division-by-zero guard: empty original input reports 0% savings.
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    // --- token_entropy (BPE-level) ---

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // --- n-gram Jaccard (order-sensitive) ---

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // --- MinHash approximation ---

    #[test]
    fn minhash_approximates_jaccard() {
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, 128);
        let sig_b = minhash_signature(b, 2, 128);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        // No n-grams → sentinel signature of all u64::MAX.
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    // --- Kolmogorov proxy (gzip ratio) ---

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    // NOTE(review): near-duplicate of `kolmogorov_diverse_is_higher` above.
    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}