lean_ctx/core/
entropy.rs

1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
10const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
12/// Result of entropy-based compression: output text, token counts, and techniques used.
13#[derive(Debug)]
14pub struct EntropyResult {
15    pub output: String,
16    pub original_tokens: usize,
17    pub compressed_tokens: usize,
18    pub techniques: Vec<String>,
19}
20
21impl EntropyResult {
22    /// Returns the percentage of tokens saved by compression.
23    pub fn savings_percent(&self) -> f64 {
24        if self.original_tokens == 0 {
25            return 0.0;
26        }
27        let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
28        (saved as f64 / self.original_tokens as f64) * 100.0
29    }
30}
31
32/// Computes Shannon entropy (bits) over character frequencies in the text.
33pub fn shannon_entropy(text: &str) -> f64 {
34    if text.is_empty() {
35        return 0.0;
36    }
37    let mut freq: HashMap<char, usize> = HashMap::new();
38    let total = text.chars().count();
39
40    for c in text.chars() {
41        *freq.entry(c).or_default() += 1;
42    }
43
44    freq.values().fold(0.0_f64, |acc, &count| {
45        let p = count as f64 / total as f64;
46        acc - p * p.log2()
47    })
48}
49
50/// Shannon entropy over already-encoded BPE token IDs (o200k_base).
51pub fn token_entropy_from_ids(tokens: &[u32]) -> f64 {
52    if tokens.is_empty() {
53        return 0.0;
54    }
55    let total = tokens.len();
56    let mut freq: HashMap<u32, usize> = HashMap::new();
57    for &t in tokens {
58        *freq.entry(t).or_default() += 1;
59    }
60    freq.values().fold(0.0_f64, |acc, &count| {
61        let p = count as f64 / total as f64;
62        acc - p * p.log2()
63    })
64}
65
66/// Shannon entropy over BPE token IDs (o200k_base).
67/// More LLM-relevant than character entropy since LLMs process BPE tokens.
68pub fn token_entropy(text: &str) -> f64 {
69    let tokens = encode_tokens(text);
70    token_entropy_from_ids(&tokens)
71}
72
73/// Normalized Shannon entropy over encoded token IDs: H(X) / log₂(n), n = unique token count.
74pub fn normalized_token_entropy_from_ids(tokens: &[u32]) -> f64 {
75    if tokens.is_empty() {
76        return 0.0;
77    }
78    let total = tokens.len();
79    let mut freq: HashMap<u32, usize> = HashMap::new();
80    for &t in tokens {
81        *freq.entry(t).or_default() += 1;
82    }
83    let n_unique = freq.len();
84    if n_unique <= 1 {
85        return 0.0;
86    }
87    let h = freq.values().fold(0.0_f64, |acc, &count| {
88        let p = count as f64 / total as f64;
89        acc - p * p.log2()
90    });
91    let h_max = (n_unique as f64).log2();
92    h / h_max
93}
94
95/// Normalized Shannon entropy: H(X) / log₂(n) where n = number of unique symbols.
96/// Returns a value in [0, 1] where 0 = perfectly predictable, 1 = maximum entropy.
97/// This makes thresholds comparable across different alphabet sizes.
98pub fn normalized_token_entropy(text: &str) -> f64 {
99    let tokens = encode_tokens(text);
100    normalized_token_entropy_from_ids(&tokens)
101}
102
103/// Computes word-set Jaccard similarity between two strings (0.0–1.0).
104pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
105    let set_a: HashSet<&str> = a.split_whitespace().collect();
106    let set_b: HashSet<&str> = b.split_whitespace().collect();
107
108    let intersection = set_a.intersection(&set_b).count();
109    let union = set_a.union(&set_b).count();
110
111    if union == 0 {
112        return 0.0;
113    }
114    intersection as f64 / union as f64
115}
116
117/// N-gram Jaccard similarity — preserves word order (unlike word-set Jaccard).
118pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
119    let set_a = ngram_set(a, n);
120    let set_b = ngram_set(b, n);
121
122    let intersection = set_a.intersection(&set_b).count();
123    let union = set_a.union(&set_b).count();
124
125    if union == 0 {
126        return 0.0;
127    }
128    intersection as f64 / union as f64
129}
130
131fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
132    let words: Vec<&str> = text.split_whitespace().collect();
133    if words.len() < n {
134        let mut set = HashSet::new();
135        if !words.is_empty() {
136            set.insert(words.iter().map(std::string::ToString::to_string).collect());
137        }
138        return set;
139    }
140    words
141        .windows(n)
142        .map(|w| w.iter().map(std::string::ToString::to_string).collect())
143        .collect()
144}
145
146/// Minhash signature for approximate Jaccard via LSH.
147/// Uses k independent hash functions (polynomial hashing with different seeds).
148pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
149    let ngrams = ngram_set(text, n);
150    if ngrams.is_empty() {
151        return vec![u64::MAX; k];
152    }
153    let mut signature = vec![u64::MAX; k];
154    for ngram in &ngrams {
155        for (i, min) in signature.iter_mut().enumerate() {
156            let h = hash_with_seed(ngram, i as u64);
157            if h < *min {
158                *min = h;
159            }
160        }
161    }
162    signature
163}
164
165/// Approximate Jaccard from two minhash signatures.
166pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
167    if sig_a.len() != sig_b.len() || sig_a.is_empty() {
168        return 0.0;
169    }
170    let matches = sig_a
171        .iter()
172        .zip(sig_b.iter())
173        .filter(|(a, b)| a == b)
174        .count();
175    matches as f64 / sig_a.len() as f64
176}
177
178fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
179    let mut hasher = std::collections::hash_map::DefaultHasher::new();
180    seed.hash(&mut hasher);
181    value.hash(&mut hasher);
182    hasher.finish()
183}
184
185/// Kolmogorov complexity proxy: K(x) ≈ len(gzip(x)) / len(x).
186/// Lower values = more compressible = more redundant.
187pub fn kolmogorov_proxy(content: &str) -> f64 {
188    if content.is_empty() {
189        return 1.0;
190    }
191    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
192    encoder.write_all(content.as_bytes()).ok();
193    let compressed = encoder.finish().unwrap_or_default();
194    compressed.len() as f64 / content.len() as f64
195}
196
197/// Classification of content compressibility based on Kolmogorov proxy (gzip ratio).
198#[derive(Debug, Clone, Copy, PartialEq, Eq)]
199pub enum CompressibilityClass {
200    High,
201    Medium,
202    Low,
203}
204
205impl CompressibilityClass {
206    /// Returns a human-readable label with the Kolmogorov threshold range.
207    pub fn label(&self) -> &'static str {
208        match self {
209            Self::High => "high (K<0.3)",
210            Self::Medium => "medium (0.3≤K<0.6)",
211            Self::Low => "low (K≥0.6)",
212        }
213    }
214}
215
216/// Classify how compressible content is based on gzip ratio.
217pub fn compressibility_class(content: &str) -> CompressibilityClass {
218    let k = kolmogorov_proxy(content);
219    if k < 0.3 {
220        CompressibilityClass::High
221    } else if k < 0.6 {
222        CompressibilityClass::Medium
223    } else {
224        CompressibilityClass::Low
225    }
226}
227
228/// Compresses content by removing low-entropy lines and deduplicating patterns.
229pub fn entropy_compress(content: &str) -> EntropyResult {
230    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
231}
232
233/// Entropy compression with file-type-adaptive thresholds and event emission.
234pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
235    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
236    let before_lines = content.lines().count() as u32;
237    let result =
238        entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
239    let after_lines = result.output.lines().count() as u32;
240
241    if before_lines != after_lines {
242        super::events::emit(super::events::EventKind::Compression {
243            path: path.to_string(),
244            before_lines,
245            after_lines,
246            strategy: "entropy_adaptive".to_string(),
247            kept_line_count: after_lines,
248            removed_line_count: before_lines.saturating_sub(after_lines),
249        });
250    }
251
252    result
253}
254
255fn entropy_compress_with_thresholds(
256    content: &str,
257    entropy_threshold: f64,
258    jaccard_threshold: f64,
259) -> EntropyResult {
260    let original_tokens = count_tokens(content);
261    let mut lines: Vec<&str> = content.lines().collect();
262    let mut techniques = Vec::new();
263
264    let original_count = lines.len();
265    lines.retain(|line| {
266        let trimmed = line.trim();
267        super::surprise::should_keep_line(trimmed, entropy_threshold)
268    });
269    let removed = original_count - lines.len();
270    if removed > 0 {
271        techniques.push(format!(
272            "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
273        ));
274    }
275
276    let blocks = extract_blocks(&lines);
277    let groups = find_pattern_groups(&blocks, jaccard_threshold);
278    let mut dedup_count = 0;
279    for group in &groups {
280        if group.len() > 1 {
281            dedup_count += group.len() - 1;
282        }
283    }
284    if dedup_count > 0 {
285        techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
286    }
287
288    let mut result: Vec<String> = Vec::new();
289    let mut skip_indices: HashSet<usize> = HashSet::new();
290    for group in &groups {
291        if group.len() > 1 {
292            for &idx in &group[1..] {
293                skip_indices.insert(idx);
294            }
295        }
296    }
297    for (i, line) in lines.iter().enumerate() {
298        if !skip_indices.contains(&i) {
299            result.push(line.to_string());
300        }
301    }
302
303    let mut collapsed = Vec::new();
304    let mut blank_count = 0;
305    for line in &result {
306        if line.trim().is_empty() {
307            blank_count += 1;
308            if blank_count <= 1 {
309                collapsed.push(line.clone());
310            }
311        } else {
312            blank_count = 0;
313            collapsed.push(line.clone());
314        }
315    }
316
317    let output = collapsed.join("\n");
318    let compressed_tokens = count_tokens(&output);
319
320    let final_output = if compressed_tokens > original_tokens {
321        content.to_string()
322    } else {
323        output
324    };
325    let final_tokens = if compressed_tokens > original_tokens {
326        original_tokens
327    } else {
328        compressed_tokens
329    };
330
331    EntropyResult {
332        output: final_output,
333        original_tokens,
334        compressed_tokens: final_tokens,
335        techniques,
336    }
337}
338
339/// Per-line entropy statistics for a block of content.
340#[derive(Debug)]
341pub struct EntropyAnalysis {
342    pub avg_entropy: f64,
343    pub low_entropy_count: usize,
344    pub high_entropy_count: usize,
345    pub total_lines: usize,
346}
347
348/// Analyzes per-line BPE token entropy, counting low/high entropy lines.
349pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
350    let lines: Vec<&str> = content.lines().collect();
351    let total = lines.len();
352    let mut sum = 0.0;
353    let mut low = 0;
354    let mut high = 0;
355    let mut counted = 0;
356
357    for line in &lines {
358        let trimmed = line.trim();
359        if trimmed.is_empty() {
360            continue;
361        }
362        let h = token_entropy(trimmed);
363        sum += h;
364        counted += 1;
365        if h < BPE_ENTROPY_THRESHOLD {
366            low += 1;
367        }
368        if h > 3.0 {
369            high += 1;
370        }
371    }
372
373    EntropyAnalysis {
374        avg_entropy: if counted > 0 {
375            sum / counted as f64
376        } else {
377            0.0
378        },
379        low_entropy_count: low,
380        high_entropy_count: high,
381        total_lines: total,
382    }
383}
384
385struct Block {
386    content: String,
387}
388
389fn extract_blocks(lines: &[&str]) -> Vec<Block> {
390    let mut blocks = Vec::new();
391    let mut current = String::new();
392
393    for line in lines {
394        let trimmed = line.trim();
395        if trimmed.is_empty() && !current.is_empty() {
396            blocks.push(Block {
397                content: current.clone(),
398            });
399            current.clear();
400        } else if !trimmed.is_empty() {
401            current.push_str(trimmed);
402            current.push('\n');
403        }
404    }
405
406    if !current.is_empty() {
407        blocks.push(Block { content: current });
408    }
409
410    blocks
411}
412
413fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
414    // Exact n-gram Jaccard, but with precomputed n-gram sets per block to avoid
415    // rebuilding allocations per pair. Includes a size-ratio impossibility check
416    // (max possible Jaccard is |A|/|B| for |A|<=|B|).
417    let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
418    let sizes: Vec<usize> = sets.iter().map(std::collections::HashSet::len).collect();
419
420    let mut groups: Vec<Vec<usize>> = Vec::new();
421    let mut assigned: HashSet<usize> = HashSet::new();
422
423    for i in 0..blocks.len() {
424        if assigned.contains(&i) {
425            continue;
426        }
427        let mut group = vec![i];
428        for j in (i + 1)..blocks.len() {
429            if assigned.contains(&j) {
430                continue;
431            }
432            let size_i = sizes[i];
433            let size_j = sizes[j];
434            let min_sz = size_i.min(size_j);
435            let max_sz = size_i.max(size_j);
436            if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
437                continue;
438            }
439            let inter = sets[i].intersection(&sets[j]).count();
440            let union = size_i + size_j - inter;
441            if union > 0 && (inter as f64 / union as f64) >= threshold {
442                group.push(j);
443                assigned.insert(j);
444            }
445        }
446        if group.len() > 1 {
447            assigned.insert(i);
448        }
449        groups.push(group);
450    }
451
452    groups
453}
454
455#[cfg(test)]
456mod tests {
457    use super::*;
458
459    #[test]
460    fn shannon_entropy_empty_is_zero() {
461        assert_eq!(shannon_entropy(""), 0.0);
462    }
463
464    #[test]
465    fn shannon_entropy_single_char() {
466        assert_eq!(shannon_entropy("aaaa"), 0.0);
467    }
468
469    #[test]
470    fn shannon_entropy_high_for_varied_text() {
471        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
472        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
473        assert!(
474            shannon_entropy(varied) > shannon_entropy(uniform),
475            "varied text should have higher entropy"
476        );
477    }
478
479    #[test]
480    fn jaccard_identical_is_one() {
481        let sim = jaccard_similarity("hello world", "hello world");
482        assert!((sim - 1.0).abs() < f64::EPSILON);
483    }
484
485    #[test]
486    fn jaccard_disjoint_is_zero() {
487        let sim = jaccard_similarity("abc", "xyz");
488        assert_eq!(sim, 0.0);
489    }
490
491    #[test]
492    fn jaccard_partial_overlap() {
493        let sim = jaccard_similarity("hello world", "hello rust");
494        assert!(sim > 0.0 && sim < 1.0);
495    }
496
497    #[test]
498    fn entropy_compress_produces_output() {
499        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
500        let result = entropy_compress(content);
501        assert!(!result.output.is_empty(), "should produce non-empty output");
502        assert!(result.compressed_tokens <= result.original_tokens);
503    }
504
505    #[test]
506    fn entropy_result_savings() {
507        let r = EntropyResult {
508            output: "short".to_string(),
509            original_tokens: 100,
510            compressed_tokens: 60,
511            techniques: vec!["test".to_string()],
512        };
513        assert!((r.savings_percent() - 40.0).abs() < 0.1);
514    }
515
516    #[test]
517    fn entropy_result_zero_original() {
518        let r = EntropyResult {
519            output: String::new(),
520            original_tokens: 0,
521            compressed_tokens: 0,
522            techniques: vec![],
523        };
524        assert_eq!(r.savings_percent(), 0.0);
525    }
526
527    #[test]
528    fn token_entropy_empty_is_zero() {
529        assert_eq!(token_entropy(""), 0.0);
530    }
531
532    #[test]
533    fn token_entropy_single_repeated_token() {
534        assert_eq!(token_entropy("}"), 0.0);
535    }
536
537    #[test]
538    fn token_entropy_higher_for_diverse_code() {
539        let diverse = "let result = compute_something(x, y, z);";
540        let repetitive = "aaaa aaaa aaaa aaaa";
541        assert!(
542            token_entropy(diverse) > token_entropy(repetitive),
543            "diverse code should have higher BPE token entropy"
544        );
545    }
546
547    #[test]
548    fn token_entropy_vs_char_entropy_differ() {
549        let code = "fn main() { println!(\"hello world\"); }";
550        let te = token_entropy(code);
551        let ce = shannon_entropy(code);
552        assert!(te != ce, "BPE and char entropy should differ for code");
553    }
554
555    #[test]
556    fn ngram_jaccard_preserves_order() {
557        let a = "a b c d";
558        let b = "d c b a";
559        let word_j = jaccard_similarity(a, b);
560        let ngram_j = ngram_jaccard(a, b, 2);
561        assert!(
562            ngram_j < word_j,
563            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
564        );
565    }
566
567    #[test]
568    fn ngram_jaccard_identical_is_one() {
569        let text = "fn main() { println!(\"hello\"); }";
570        let j = ngram_jaccard(text, text, 2);
571        assert!((j - 1.0).abs() < f64::EPSILON);
572    }
573
574    #[test]
575    fn ngram_jaccard_disjoint_is_zero() {
576        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
577        assert_eq!(j, 0.0);
578    }
579
580    #[test]
581    fn minhash_approximates_jaccard() {
582        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
583        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
584        let exact = ngram_jaccard(a, b, 2);
585        let sig_a = minhash_signature(a, 2, 128);
586        let sig_b = minhash_signature(b, 2, 128);
587        let approx = minhash_similarity(&sig_a, &sig_b);
588        assert!(
589            (exact - approx).abs() < 0.2,
590            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
591        );
592    }
593
594    #[test]
595    fn minhash_empty_text() {
596        let sig = minhash_signature("", 2, 64);
597        assert!(sig.iter().all(|&v| v == u64::MAX));
598    }
599
600    #[test]
601    fn kolmogorov_empty_is_one() {
602        assert_eq!(kolmogorov_proxy(""), 1.0);
603    }
604
605    #[test]
606    fn kolmogorov_repetitive_is_low() {
607        let repetitive = "aaa\n".repeat(1000);
608        let k = kolmogorov_proxy(&repetitive);
609        assert!(
610            k < 0.1,
611            "highly repetitive text should compress well: K={k}"
612        );
613    }
614
615    #[test]
616    fn kolmogorov_diverse_is_higher() {
617        let repetitive = "aaa\n".repeat(500);
618        let diverse = (0..500)
619            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
620            .collect::<Vec<_>>()
621            .join("\n");
622        assert!(
623            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
624            "diverse content should have higher K than repetitive"
625        );
626    }
627
628    #[test]
629    fn compressibility_class_repetitive_is_high() {
630        let text = "use std::io;\n".repeat(200);
631        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
632    }
633
634    #[test]
635    fn kolmogorov_diverse_higher_than_repetitive() {
636        let rep = "test\n".repeat(500);
637        let diverse = (0..500)
638            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
639            .collect::<Vec<_>>()
640            .join("\n");
641        assert!(
642            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
643            "diverse content should have higher K"
644        );
645    }
646}
lean_ctx/core/entropy.rs

lean_ctx/core/
entropy.rs