Skip to main content

lean_ctx/core/
entropy.rs

1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
10const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
12/// Result of entropy-based compression: output text, token counts, and techniques used.
13#[derive(Debug)]
14pub struct EntropyResult {
15    pub output: String,
16    pub original_tokens: usize,
17    pub compressed_tokens: usize,
18    pub techniques: Vec<String>,
19}
20
21impl EntropyResult {
22    /// Returns the percentage of tokens saved by compression.
23    pub fn savings_percent(&self) -> f64 {
24        if self.original_tokens == 0 {
25            return 0.0;
26        }
27        let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
28        (saved as f64 / self.original_tokens as f64) * 100.0
29    }
30}
31
32/// Computes Shannon entropy (bits) over character frequencies in the text.
33pub fn shannon_entropy(text: &str) -> f64 {
34    if text.is_empty() {
35        return 0.0;
36    }
37    let mut freq: HashMap<char, usize> = HashMap::new();
38    let total = text.chars().count();
39
40    for c in text.chars() {
41        *freq.entry(c).or_default() += 1;
42    }
43
44    freq.values().fold(0.0_f64, |acc, &count| {
45        let p = count as f64 / total as f64;
46        acc - p * p.log2()
47    })
48}
49
50/// Shannon entropy over already-encoded BPE token IDs (o200k_base).
51pub fn token_entropy_from_ids(tokens: &[u32]) -> f64 {
52    if tokens.is_empty() {
53        return 0.0;
54    }
55    let total = tokens.len();
56    let mut freq: HashMap<u32, usize> = HashMap::new();
57    for &t in tokens {
58        *freq.entry(t).or_default() += 1;
59    }
60    freq.values().fold(0.0_f64, |acc, &count| {
61        let p = count as f64 / total as f64;
62        acc - p * p.log2()
63    })
64}
65
66/// Shannon entropy over BPE token IDs (o200k_base).
67/// More LLM-relevant than character entropy since LLMs process BPE tokens.
68pub fn token_entropy(text: &str) -> f64 {
69    let tokens = encode_tokens(text);
70    token_entropy_from_ids(&tokens)
71}
72
73/// Normalized Shannon entropy over encoded token IDs: H(X) / log₂(n), n = unique token count.
74pub fn normalized_token_entropy_from_ids(tokens: &[u32]) -> f64 {
75    if tokens.is_empty() {
76        return 0.0;
77    }
78    let total = tokens.len();
79    let mut freq: HashMap<u32, usize> = HashMap::new();
80    for &t in tokens {
81        *freq.entry(t).or_default() += 1;
82    }
83    let n_unique = freq.len();
84    if n_unique <= 1 {
85        return 0.0;
86    }
87    let h = freq.values().fold(0.0_f64, |acc, &count| {
88        let p = count as f64 / total as f64;
89        acc - p * p.log2()
90    });
91    let h_max = (n_unique as f64).log2();
92    h / h_max
93}
94
95/// Normalized Shannon entropy: H(X) / log₂(n) where n = number of unique symbols.
96/// Returns a value in [0, 1] where 0 = perfectly predictable, 1 = maximum entropy.
97/// This makes thresholds comparable across different alphabet sizes.
98pub fn normalized_token_entropy(text: &str) -> f64 {
99    let tokens = encode_tokens(text);
100    normalized_token_entropy_from_ids(&tokens)
101}
102
103/// Computes word-set Jaccard similarity between two strings (0.0–1.0).
104pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
105    let set_a: HashSet<&str> = a.split_whitespace().collect();
106    let set_b: HashSet<&str> = b.split_whitespace().collect();
107
108    let intersection = set_a.intersection(&set_b).count();
109    let union = set_a.union(&set_b).count();
110
111    if union == 0 {
112        return 0.0;
113    }
114    intersection as f64 / union as f64
115}
116
117/// N-gram Jaccard similarity — preserves word order (unlike word-set Jaccard).
118pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
119    let set_a = ngram_set(a, n);
120    let set_b = ngram_set(b, n);
121
122    let intersection = set_a.intersection(&set_b).count();
123    let union = set_a.union(&set_b).count();
124
125    if union == 0 {
126        return 0.0;
127    }
128    intersection as f64 / union as f64
129}
130
131fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
132    let words: Vec<&str> = text.split_whitespace().collect();
133    if words.len() < n {
134        let mut set = HashSet::new();
135        if !words.is_empty() {
136            set.insert(words.iter().map(std::string::ToString::to_string).collect());
137        }
138        return set;
139    }
140    words
141        .windows(n)
142        .map(|w| w.iter().map(std::string::ToString::to_string).collect())
143        .collect()
144}
145
146/// Minhash signature for approximate Jaccard via LSH.
147/// Uses k independent hash functions (polynomial hashing with different seeds).
148pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
149    let ngrams = ngram_set(text, n);
150    if ngrams.is_empty() {
151        return vec![u64::MAX; k];
152    }
153    let mut signature = vec![u64::MAX; k];
154    for ngram in &ngrams {
155        for (i, min) in signature.iter_mut().enumerate() {
156            let h = hash_with_seed(ngram, i as u64);
157            if h < *min {
158                *min = h;
159            }
160        }
161    }
162    signature
163}
164
165/// Approximate Jaccard from two minhash signatures.
166pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
167    if sig_a.len() != sig_b.len() || sig_a.is_empty() {
168        return 0.0;
169    }
170    let matches = sig_a
171        .iter()
172        .zip(sig_b.iter())
173        .filter(|(a, b)| a == b)
174        .count();
175    matches as f64 / sig_a.len() as f64
176}
177
178fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
179    let mut hasher = std::collections::hash_map::DefaultHasher::new();
180    seed.hash(&mut hasher);
181    value.hash(&mut hasher);
182    hasher.finish()
183}
184
185/// Kolmogorov complexity proxy: K(x) ≈ len(gzip(x)) / len(x).
186/// Lower values = more compressible = more redundant.
187pub fn kolmogorov_proxy(content: &str) -> f64 {
188    if content.is_empty() {
189        return 1.0;
190    }
191    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
192    encoder.write_all(content.as_bytes()).ok();
193    let compressed = encoder.finish().unwrap_or_default();
194    compressed.len() as f64 / content.len() as f64
195}
196
197/// Classification of content compressibility based on Kolmogorov proxy (gzip ratio).
198#[derive(Debug, Clone, Copy, PartialEq, Eq)]
199pub enum CompressibilityClass {
200    High,
201    Medium,
202    Low,
203}
204
205impl CompressibilityClass {
206    /// Returns a human-readable label with the Kolmogorov threshold range.
207    pub fn label(&self) -> &'static str {
208        match self {
209            Self::High => "high (K<0.3)",
210            Self::Medium => "medium (0.3≤K<0.6)",
211            Self::Low => "low (K≥0.6)",
212        }
213    }
214}
215
216/// Classify how compressible content is based on gzip ratio.
217pub fn compressibility_class(content: &str) -> CompressibilityClass {
218    let k = kolmogorov_proxy(content);
219    if k < 0.3 {
220        CompressibilityClass::High
221    } else if k < 0.6 {
222        CompressibilityClass::Medium
223    } else {
224        CompressibilityClass::Low
225    }
226}
227
228/// Compresses content by removing low-entropy lines and deduplicating patterns.
229pub fn entropy_compress(content: &str) -> EntropyResult {
230    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
231}
232
233/// Entropy compression with file-type-adaptive thresholds and event emission.
234pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
235    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
236    let before_lines = content.lines().count() as u32;
237    let result =
238        entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
239    let after_lines = result.output.lines().count() as u32;
240
241    if before_lines != after_lines {
242        super::events::emit(super::events::EventKind::Compression {
243            path: path.to_string(),
244            before_lines,
245            after_lines,
246            strategy: "entropy_adaptive".to_string(),
247            kept_line_count: after_lines,
248            removed_line_count: before_lines.saturating_sub(after_lines),
249        });
250    }
251
252    result
253}
254
255fn entropy_compress_with_thresholds(
256    content: &str,
257    entropy_threshold: f64,
258    jaccard_threshold: f64,
259) -> EntropyResult {
260    let original_tokens = count_tokens(content);
261    let mut lines: Vec<&str> = content.lines().collect();
262    let mut techniques = Vec::new();
263
264    let original_count = lines.len();
265    lines.retain(|line| {
266        let trimmed = line.trim();
267        super::surprise::should_keep_line(trimmed, entropy_threshold)
268    });
269    let removed = original_count - lines.len();
270    if removed > 0 {
271        techniques.push(format!(
272            "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
273        ));
274    }
275
276    let blocks = extract_blocks(&lines);
277    let groups = find_pattern_groups(&blocks, jaccard_threshold);
278    let mut dedup_count = 0;
279    for group in &groups {
280        if group.len() > 1 {
281            dedup_count += group.len() - 1;
282        }
283    }
284    if dedup_count > 0 {
285        techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
286    }
287
288    let mut result: Vec<String> = Vec::new();
289    let mut skip_indices: HashSet<usize> = HashSet::new();
290    for group in &groups {
291        if group.len() > 1 {
292            for &idx in &group[1..] {
293                skip_indices.insert(idx);
294            }
295        }
296    }
297    for (i, line) in lines.iter().enumerate() {
298        if !skip_indices.contains(&i) {
299            result.push(line.to_string());
300        }
301    }
302
303    let mut collapsed = Vec::new();
304    let mut blank_count = 0;
305    for line in &result {
306        if line.trim().is_empty() {
307            blank_count += 1;
308            if blank_count <= 1 {
309                collapsed.push(line.clone());
310            }
311        } else {
312            blank_count = 0;
313            collapsed.push(line.clone());
314        }
315    }
316
317    let output = collapsed.join("\n");
318    let compressed_tokens = count_tokens(&output);
319
320    EntropyResult {
321        output,
322        original_tokens,
323        compressed_tokens,
324        techniques,
325    }
326}
327
328/// Per-line entropy statistics for a block of content.
329#[derive(Debug)]
330pub struct EntropyAnalysis {
331    pub avg_entropy: f64,
332    pub low_entropy_count: usize,
333    pub high_entropy_count: usize,
334    pub total_lines: usize,
335}
336
337/// Analyzes per-line BPE token entropy, counting low/high entropy lines.
338pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
339    let lines: Vec<&str> = content.lines().collect();
340    let total = lines.len();
341    let mut sum = 0.0;
342    let mut low = 0;
343    let mut high = 0;
344    let mut counted = 0;
345
346    for line in &lines {
347        let trimmed = line.trim();
348        if trimmed.is_empty() {
349            continue;
350        }
351        let h = token_entropy(trimmed);
352        sum += h;
353        counted += 1;
354        if h < BPE_ENTROPY_THRESHOLD {
355            low += 1;
356        }
357        if h > 3.0 {
358            high += 1;
359        }
360    }
361
362    EntropyAnalysis {
363        avg_entropy: if counted > 0 {
364            sum / counted as f64
365        } else {
366            0.0
367        },
368        low_entropy_count: low,
369        high_entropy_count: high,
370        total_lines: total,
371    }
372}
373
374struct Block {
375    content: String,
376}
377
378fn extract_blocks(lines: &[&str]) -> Vec<Block> {
379    let mut blocks = Vec::new();
380    let mut current = String::new();
381
382    for line in lines {
383        let trimmed = line.trim();
384        if trimmed.is_empty() && !current.is_empty() {
385            blocks.push(Block {
386                content: current.clone(),
387            });
388            current.clear();
389        } else if !trimmed.is_empty() {
390            current.push_str(trimmed);
391            current.push('\n');
392        }
393    }
394
395    if !current.is_empty() {
396        blocks.push(Block { content: current });
397    }
398
399    blocks
400}
401
402fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
403    // Exact n-gram Jaccard, but with precomputed n-gram sets per block to avoid
404    // rebuilding allocations per pair. Includes a size-ratio impossibility check
405    // (max possible Jaccard is |A|/|B| for |A|<=|B|).
406    let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
407    let sizes: Vec<usize> = sets.iter().map(std::collections::HashSet::len).collect();
408
409    let mut groups: Vec<Vec<usize>> = Vec::new();
410    let mut assigned: HashSet<usize> = HashSet::new();
411
412    for i in 0..blocks.len() {
413        if assigned.contains(&i) {
414            continue;
415        }
416        let mut group = vec![i];
417        for j in (i + 1)..blocks.len() {
418            if assigned.contains(&j) {
419                continue;
420            }
421            let size_i = sizes[i];
422            let size_j = sizes[j];
423            let min_sz = size_i.min(size_j);
424            let max_sz = size_i.max(size_j);
425            if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
426                continue;
427            }
428            let inter = sets[i].intersection(&sets[j]).count();
429            let union = size_i + size_j - inter;
430            if union > 0 && (inter as f64 / union as f64) >= threshold {
431                group.push(j);
432                assigned.insert(j);
433            }
434        }
435        if group.len() > 1 {
436            assigned.insert(i);
437        }
438        groups.push(group);
439    }
440
441    groups
442}
443
444#[cfg(test)]
445mod tests {
446    use super::*;
447
448    #[test]
449    fn shannon_entropy_empty_is_zero() {
450        assert_eq!(shannon_entropy(""), 0.0);
451    }
452
453    #[test]
454    fn shannon_entropy_single_char() {
455        assert_eq!(shannon_entropy("aaaa"), 0.0);
456    }
457
458    #[test]
459    fn shannon_entropy_high_for_varied_text() {
460        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
461        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
462        assert!(
463            shannon_entropy(varied) > shannon_entropy(uniform),
464            "varied text should have higher entropy"
465        );
466    }
467
468    #[test]
469    fn jaccard_identical_is_one() {
470        let sim = jaccard_similarity("hello world", "hello world");
471        assert!((sim - 1.0).abs() < f64::EPSILON);
472    }
473
474    #[test]
475    fn jaccard_disjoint_is_zero() {
476        let sim = jaccard_similarity("abc", "xyz");
477        assert_eq!(sim, 0.0);
478    }
479
480    #[test]
481    fn jaccard_partial_overlap() {
482        let sim = jaccard_similarity("hello world", "hello rust");
483        assert!(sim > 0.0 && sim < 1.0);
484    }
485
486    #[test]
487    fn entropy_compress_produces_output() {
488        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
489        let result = entropy_compress(content);
490        assert!(!result.output.is_empty(), "should produce non-empty output");
491        assert!(result.compressed_tokens <= result.original_tokens);
492    }
493
494    #[test]
495    fn entropy_result_savings() {
496        let r = EntropyResult {
497            output: "short".to_string(),
498            original_tokens: 100,
499            compressed_tokens: 60,
500            techniques: vec!["test".to_string()],
501        };
502        assert!((r.savings_percent() - 40.0).abs() < 0.1);
503    }
504
505    #[test]
506    fn entropy_result_zero_original() {
507        let r = EntropyResult {
508            output: String::new(),
509            original_tokens: 0,
510            compressed_tokens: 0,
511            techniques: vec![],
512        };
513        assert_eq!(r.savings_percent(), 0.0);
514    }
515
516    #[test]
517    fn token_entropy_empty_is_zero() {
518        assert_eq!(token_entropy(""), 0.0);
519    }
520
521    #[test]
522    fn token_entropy_single_repeated_token() {
523        assert_eq!(token_entropy("}"), 0.0);
524    }
525
526    #[test]
527    fn token_entropy_higher_for_diverse_code() {
528        let diverse = "let result = compute_something(x, y, z);";
529        let repetitive = "aaaa aaaa aaaa aaaa";
530        assert!(
531            token_entropy(diverse) > token_entropy(repetitive),
532            "diverse code should have higher BPE token entropy"
533        );
534    }
535
536    #[test]
537    fn token_entropy_vs_char_entropy_differ() {
538        let code = "fn main() { println!(\"hello world\"); }";
539        let te = token_entropy(code);
540        let ce = shannon_entropy(code);
541        assert!(te != ce, "BPE and char entropy should differ for code");
542    }
543
544    #[test]
545    fn ngram_jaccard_preserves_order() {
546        let a = "a b c d";
547        let b = "d c b a";
548        let word_j = jaccard_similarity(a, b);
549        let ngram_j = ngram_jaccard(a, b, 2);
550        assert!(
551            ngram_j < word_j,
552            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
553        );
554    }
555
556    #[test]
557    fn ngram_jaccard_identical_is_one() {
558        let text = "fn main() { println!(\"hello\"); }";
559        let j = ngram_jaccard(text, text, 2);
560        assert!((j - 1.0).abs() < f64::EPSILON);
561    }
562
563    #[test]
564    fn ngram_jaccard_disjoint_is_zero() {
565        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
566        assert_eq!(j, 0.0);
567    }
568
569    #[test]
570    fn minhash_approximates_jaccard() {
571        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
572        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
573        let exact = ngram_jaccard(a, b, 2);
574        let sig_a = minhash_signature(a, 2, 128);
575        let sig_b = minhash_signature(b, 2, 128);
576        let approx = minhash_similarity(&sig_a, &sig_b);
577        assert!(
578            (exact - approx).abs() < 0.2,
579            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
580        );
581    }
582
583    #[test]
584    fn minhash_empty_text() {
585        let sig = minhash_signature("", 2, 64);
586        assert!(sig.iter().all(|&v| v == u64::MAX));
587    }
588
589    #[test]
590    fn kolmogorov_empty_is_one() {
591        assert_eq!(kolmogorov_proxy(""), 1.0);
592    }
593
594    #[test]
595    fn kolmogorov_repetitive_is_low() {
596        let repetitive = "aaa\n".repeat(1000);
597        let k = kolmogorov_proxy(&repetitive);
598        assert!(
599            k < 0.1,
600            "highly repetitive text should compress well: K={k}"
601        );
602    }
603
604    #[test]
605    fn kolmogorov_diverse_is_higher() {
606        let repetitive = "aaa\n".repeat(500);
607        let diverse = (0..500)
608            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
609            .collect::<Vec<_>>()
610            .join("\n");
611        assert!(
612            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
613            "diverse content should have higher K than repetitive"
614        );
615    }
616
617    #[test]
618    fn compressibility_class_repetitive_is_high() {
619        let text = "use std::io;\n".repeat(200);
620        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
621    }
622
623    #[test]
624    fn kolmogorov_diverse_higher_than_repetitive() {
625        let rep = "test\n".repeat(500);
626        let diverse = (0..500)
627            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
628            .collect::<Vec<_>>()
629            .join("\n");
630        assert!(
631            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
632            "diverse content should have higher K"
633        );
634    }
635}