// lean_ctx/core/entropy.rs
1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
// Default BPE-token entropy cutoff (bits/token) passed to
// `surprise::should_keep_line`; lines reported as removed are those with
// "BPE H<threshold" (see `entropy_compress_with_thresholds`).
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
// Number of independent hash functions in a minhash signature. With k hashes
// the Jaccard estimate's std error is ~1/sqrt(k) ≈ 0.09 for k = 128.
#[allow(dead_code)]
const MINHASH_NUM_HASHES: usize = 128;
13
/// Outcome of an entropy-based compression pass: the compressed text, the
/// token counts before/after, and a human-readable list of techniques applied.
#[derive(Debug)]
#[allow(dead_code)]
pub struct EntropyResult {
    pub output: String,
    pub original_tokens: usize,
    pub compressed_tokens: usize,
    pub techniques: Vec<String>,
}

#[allow(dead_code)]
impl EntropyResult {
    /// Percentage of tokens removed, in [0, 100]. Returns 0.0 when the
    /// original had no tokens (avoids division by zero).
    pub fn savings_percent(&self) -> f64 {
        match self.original_tokens {
            0 => 0.0,
            total => {
                // Clamp so an (unexpected) compressed > original yields 0%.
                let kept = self.compressed_tokens.min(total);
                ((total - kept) as f64 / total as f64) * 100.0
            }
        }
    }
}
33
/// Character-level Shannon entropy of `text`, in bits per character.
/// Empty input yields 0.0.
#[allow(dead_code)]
pub fn shannon_entropy(text: &str) -> f64 {
    let total = text.chars().count();
    if total == 0 {
        return 0.0;
    }
    let mut counts: HashMap<char, usize> = HashMap::new();
    for ch in text.chars() {
        *counts.entry(ch).or_insert(0) += 1;
    }
    let n = total as f64;
    // H = -Σ p·log2(p) over the character distribution.
    counts
        .values()
        .map(|&c| {
            let p = c as f64 / n;
            -p * p.log2()
        })
        .sum()
}
51
52/// Shannon entropy over BPE token IDs (o200k_base).
53/// More LLM-relevant than character entropy since LLMs process BPE tokens.
54pub fn token_entropy(text: &str) -> f64 {
55    let tokens = encode_tokens(text);
56    if tokens.is_empty() {
57        return 0.0;
58    }
59    let total = tokens.len();
60    let mut freq: HashMap<u32, usize> = HashMap::new();
61    for &t in &tokens {
62        *freq.entry(t).or_default() += 1;
63    }
64    freq.values().fold(0.0_f64, |acc, &count| {
65        let p = count as f64 / total as f64;
66        acc - p * p.log2()
67    })
68}
69
70/// Normalized Shannon entropy: H(X) / log₂(n) where n = number of unique symbols.
71/// Returns a value in [0, 1] where 0 = perfectly predictable, 1 = maximum entropy.
72/// This makes thresholds comparable across different alphabet sizes.
73pub fn normalized_token_entropy(text: &str) -> f64 {
74    let tokens = encode_tokens(text);
75    if tokens.is_empty() {
76        return 0.0;
77    }
78    let total = tokens.len();
79    let mut freq: HashMap<u32, usize> = HashMap::new();
80    for &t in &tokens {
81        *freq.entry(t).or_default() += 1;
82    }
83    let n_unique = freq.len();
84    if n_unique <= 1 {
85        return 0.0;
86    }
87    let h = freq.values().fold(0.0_f64, |acc, &count| {
88        let p = count as f64 / total as f64;
89        acc - p * p.log2()
90    });
91    let h_max = (n_unique as f64).log2();
92    h / h_max
93}
94
/// Jaccard similarity of the whitespace-separated word SETS of `a` and `b`
/// (order-insensitive). Two empty/blank strings yield 0.0.
#[allow(dead_code)]
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let union = words_a.union(&words_b).count();
    if union == 0 {
        return 0.0;
    }
    let shared = words_a.intersection(&words_b).count();
    shared as f64 / union as f64
}
108
/// N-gram Jaccard similarity — preserves word order (unlike word-set Jaccard).
pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
    let grams_a = ngram_set(a, n);
    let grams_b = ngram_set(b, n);

    let union = grams_a.union(&grams_b).count();
    if union == 0 {
        return 0.0;
    }
    let shared = grams_a.intersection(&grams_b).count();
    shared as f64 / union as f64
}

/// Word n-grams of `text`. A text shorter than `n` words contributes its
/// whole word sequence as one (short) gram; blank text yields an empty set.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    if words.is_empty() {
        return HashSet::new();
    }
    if words.len() < n {
        let gram: Vec<String> = words.iter().map(|w| w.to_string()).collect();
        return std::iter::once(gram).collect();
    }
    words
        .windows(n)
        .map(|win| win.iter().map(|s| s.to_string()).collect())
        .collect()
}
137
138/// Minhash signature for approximate Jaccard via LSH.
139/// Uses k independent hash functions (polynomial hashing with different seeds).
140#[allow(dead_code)]
141pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
142    let ngrams = ngram_set(text, n);
143    if ngrams.is_empty() {
144        return vec![u64::MAX; k];
145    }
146    let mut signature = vec![u64::MAX; k];
147    for ngram in &ngrams {
148        for (i, min) in signature.iter_mut().enumerate() {
149            let h = hash_with_seed(ngram, i as u64);
150            if h < *min {
151                *min = h;
152            }
153        }
154    }
155    signature
156}
157
/// Approximate Jaccard from two minhash signatures: the fraction of
/// positions where the signatures agree. Mismatched lengths or empty
/// signatures yield 0.0.
#[allow(dead_code)]
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut agree = 0usize;
    for (a, b) in sig_a.iter().zip(sig_b) {
        if a == b {
            agree += 1;
        }
    }
    agree as f64 / sig_a.len() as f64
}
171
/// Seeded hash: feeds `seed` then `value` into a fresh `DefaultHasher`,
/// giving a family of hash functions indexed by seed (used by minhash).
/// Deterministic within a build; DefaultHasher's algorithm is not
/// guaranteed stable across Rust releases.
#[allow(dead_code)]
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    let mut state = DefaultHasher::new();
    seed.hash(&mut state);
    value.hash(&mut state);
    state.finish()
}
179
180/// Kolmogorov complexity proxy: K(x) ≈ len(gzip(x)) / len(x).
181/// Lower values = more compressible = more redundant.
182pub fn kolmogorov_proxy(content: &str) -> f64 {
183    if content.is_empty() {
184        return 1.0;
185    }
186    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
187    encoder.write_all(content.as_bytes()).ok();
188    let compressed = encoder.finish().unwrap_or_default();
189    compressed.len() as f64 / content.len() as f64
190}
191
/// Coarse gzip-compressibility band of a piece of content; see
/// `compressibility_class` for the cutoffs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    High,
    Medium,
    Low,
}

impl CompressibilityClass {
    /// Human-readable label including the gzip-ratio band it represents.
    pub fn label(&self) -> &'static str {
        match self {
            CompressibilityClass::High => "high (K<0.3)",
            CompressibilityClass::Medium => "medium (0.3≤K<0.6)",
            CompressibilityClass::Low => "low (K≥0.6)",
        }
    }
}
208
209/// Classify how compressible content is based on gzip ratio.
210pub fn compressibility_class(content: &str) -> CompressibilityClass {
211    let k = kolmogorov_proxy(content);
212    if k < 0.3 {
213        CompressibilityClass::High
214    } else if k < 0.6 {
215        CompressibilityClass::Medium
216    } else {
217        CompressibilityClass::Low
218    }
219}
220
/// Entropy compression with the default thresholds: the module-level BPE
/// entropy cutoff and a fixed 0.7 Jaccard duplicate threshold. Use
/// `entropy_compress_adaptive` for per-file thresholds.
pub fn entropy_compress(content: &str) -> EntropyResult {
    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
}
224
225pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
226    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
227    let before_lines = content.lines().count() as u32;
228    let result =
229        entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
230    let after_lines = result.output.lines().count() as u32;
231
232    if before_lines != after_lines {
233        super::events::emit(super::events::EventKind::Compression {
234            path: path.to_string(),
235            before_lines,
236            after_lines,
237            strategy: "entropy_adaptive".to_string(),
238            kept_line_count: after_lines,
239            removed_line_count: before_lines.saturating_sub(after_lines),
240        });
241    }
242
243    result
244}
245
246fn entropy_compress_with_thresholds(
247    content: &str,
248    entropy_threshold: f64,
249    jaccard_threshold: f64,
250) -> EntropyResult {
251    let original_tokens = count_tokens(content);
252    let mut lines: Vec<&str> = content.lines().collect();
253    let mut techniques = Vec::new();
254
255    let original_count = lines.len();
256    lines.retain(|line| {
257        let trimmed = line.trim();
258        super::surprise::should_keep_line(trimmed, entropy_threshold)
259    });
260    let removed = original_count - lines.len();
261    if removed > 0 {
262        techniques.push(format!(
263            "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
264        ));
265    }
266
267    let blocks = extract_blocks(&lines);
268    let groups = find_pattern_groups(&blocks, jaccard_threshold);
269    let mut dedup_count = 0;
270    for group in &groups {
271        if group.len() > 1 {
272            dedup_count += group.len() - 1;
273        }
274    }
275    if dedup_count > 0 {
276        techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
277    }
278
279    let mut result: Vec<String> = Vec::new();
280    let mut skip_indices: HashSet<usize> = HashSet::new();
281    for group in &groups {
282        if group.len() > 1 {
283            for &idx in &group[1..] {
284                skip_indices.insert(idx);
285            }
286        }
287    }
288    for (i, line) in lines.iter().enumerate() {
289        if !skip_indices.contains(&i) {
290            result.push(line.to_string());
291        }
292    }
293
294    let mut collapsed = Vec::new();
295    let mut blank_count = 0;
296    for line in &result {
297        if line.trim().is_empty() {
298            blank_count += 1;
299            if blank_count <= 1 {
300                collapsed.push(line.clone());
301            }
302        } else {
303            blank_count = 0;
304            collapsed.push(line.clone());
305        }
306    }
307
308    let output = collapsed.join("\n");
309    let compressed_tokens = count_tokens(&output);
310
311    EntropyResult {
312        output,
313        original_tokens,
314        compressed_tokens,
315        techniques,
316    }
317}
318
/// Per-line entropy statistics produced by `analyze_entropy`.
#[derive(Debug)]
pub struct EntropyAnalysis {
    // Mean BPE token entropy over non-blank lines (0.0 when there are none).
    pub avg_entropy: f64,
    // Non-blank lines with entropy below BPE_ENTROPY_THRESHOLD.
    pub low_entropy_count: usize,
    // Non-blank lines with entropy above 3.0 bits.
    pub high_entropy_count: usize,
    // Total line count, blank lines included.
    pub total_lines: usize,
}
326
327pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
328    let lines: Vec<&str> = content.lines().collect();
329    let total = lines.len();
330    let mut sum = 0.0;
331    let mut low = 0;
332    let mut high = 0;
333    let mut counted = 0;
334
335    for line in &lines {
336        let trimmed = line.trim();
337        if trimmed.is_empty() {
338            continue;
339        }
340        let h = token_entropy(trimmed);
341        sum += h;
342        counted += 1;
343        if h < BPE_ENTROPY_THRESHOLD {
344            low += 1;
345        }
346        if h > 3.0 {
347            high += 1;
348        }
349    }
350
351    EntropyAnalysis {
352        avg_entropy: if counted > 0 {
353            sum / counted as f64
354        } else {
355            0.0
356        },
357        low_entropy_count: low,
358        high_entropy_count: high,
359        total_lines: total,
360    }
361}
362
/// A contiguous run of non-blank lines.
#[allow(dead_code)]
struct Block {
    /// Index of the block's first line within the slice given to `extract_blocks`.
    start: usize,
    /// The block's trimmed lines joined by '\n' (trailing newline included).
    content: String,
}

/// Split `lines` into blocks separated by blank (whitespace-only) lines.
/// Leading/trailing blank lines produce no blocks.
fn extract_blocks(lines: &[&str]) -> Vec<Block> {
    let mut blocks = Vec::new();
    let mut buffer = String::new();
    let mut block_start = 0usize;

    for (idx, raw) in lines.iter().enumerate() {
        let line = raw.trim();
        if line.is_empty() {
            // A blank line ends the current block, if any.
            if !buffer.is_empty() {
                blocks.push(Block {
                    start: block_start,
                    content: std::mem::take(&mut buffer),
                });
            }
        } else {
            if buffer.is_empty() {
                block_start = idx;
            }
            buffer.push_str(line);
            buffer.push('\n');
        }
    }

    // Flush the final block if the input didn't end with a blank line.
    if !buffer.is_empty() {
        blocks.push(Block {
            start: block_start,
            content: buffer,
        });
    }

    blocks
}
400
401fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
402    let mut groups: Vec<Vec<usize>> = Vec::new();
403    let mut assigned: HashSet<usize> = HashSet::new();
404
405    for (i, block_a) in blocks.iter().enumerate() {
406        if assigned.contains(&i) {
407            continue;
408        }
409        let mut group = vec![i];
410        for (j, block_b) in blocks.iter().enumerate().skip(i + 1) {
411            if assigned.contains(&j) {
412                continue;
413            }
414            if ngram_jaccard(&block_a.content, &block_b.content, 2) >= threshold {
415                group.push(j);
416                assigned.insert(j);
417            }
418        }
419        if group.len() > 1 {
420            assigned.insert(i);
421        }
422        groups.push(group);
423    }
424
425    groups
426}
427
#[cfg(test)]
mod tests {
    use super::*;

    // --- character-level Shannon entropy ---

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    // --- word-set Jaccard ---

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // --- end-to-end compression (sanity only: non-empty, never grows) ---

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        // Division-by-zero guard.
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    // --- BPE token entropy ---

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // --- n-gram Jaccard ---

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // --- minhash ---

    #[test]
    fn minhash_approximates_jaccard() {
        // 128 hashes -> std error ~0.09, so 0.2 is a comfortable bound.
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, MINHASH_NUM_HASHES);
        let sig_b = minhash_signature(b, 2, MINHASH_NUM_HASHES);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    // --- gzip compressibility proxy ---

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}