Skip to main content

lean_ctx/core/
entropy.rs

1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
10const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
12/// Result of entropy-based compression: output text, token counts, and techniques used.
13#[derive(Debug)]
14pub struct EntropyResult {
15    pub output: String,
16    pub original_tokens: usize,
17    pub compressed_tokens: usize,
18    pub techniques: Vec<String>,
19}
20
21impl EntropyResult {
22    /// Returns the percentage of tokens saved by compression.
23    pub fn savings_percent(&self) -> f64 {
24        if self.original_tokens == 0 {
25            return 0.0;
26        }
27        let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
28        (saved as f64 / self.original_tokens as f64) * 100.0
29    }
30}
31
32/// Computes Shannon entropy (bits) over character frequencies in the text.
33pub fn shannon_entropy(text: &str) -> f64 {
34    if text.is_empty() {
35        return 0.0;
36    }
37    let mut freq: HashMap<char, usize> = HashMap::new();
38    let total = text.chars().count();
39
40    for c in text.chars() {
41        *freq.entry(c).or_default() += 1;
42    }
43
44    freq.values().fold(0.0_f64, |acc, &count| {
45        let p = count as f64 / total as f64;
46        acc - p * p.log2()
47    })
48}
49
50/// Shannon entropy over already-encoded BPE token IDs (o200k_base).
51pub fn token_entropy_from_ids(tokens: &[u32]) -> f64 {
52    if tokens.is_empty() {
53        return 0.0;
54    }
55    let total = tokens.len();
56    let mut freq: HashMap<u32, usize> = HashMap::new();
57    for &t in tokens {
58        *freq.entry(t).or_default() += 1;
59    }
60    freq.values().fold(0.0_f64, |acc, &count| {
61        let p = count as f64 / total as f64;
62        acc - p * p.log2()
63    })
64}
65
66/// Shannon entropy over BPE token IDs (o200k_base).
67/// More LLM-relevant than character entropy since LLMs process BPE tokens.
68pub fn token_entropy(text: &str) -> f64 {
69    let tokens = encode_tokens(text);
70    token_entropy_from_ids(&tokens)
71}
72
73/// Normalized Shannon entropy over encoded token IDs: H(X) / log₂(n), n = unique token count.
74pub fn normalized_token_entropy_from_ids(tokens: &[u32]) -> f64 {
75    if tokens.is_empty() {
76        return 0.0;
77    }
78    let total = tokens.len();
79    let mut freq: HashMap<u32, usize> = HashMap::new();
80    for &t in tokens {
81        *freq.entry(t).or_default() += 1;
82    }
83    let n_unique = freq.len();
84    if n_unique <= 1 {
85        return 0.0;
86    }
87    let h = freq.values().fold(0.0_f64, |acc, &count| {
88        let p = count as f64 / total as f64;
89        acc - p * p.log2()
90    });
91    let h_max = (n_unique as f64).log2();
92    h / h_max
93}
94
95/// Normalized Shannon entropy: H(X) / log₂(n) where n = number of unique symbols.
96/// Returns a value in [0, 1] where 0 = perfectly predictable, 1 = maximum entropy.
97/// This makes thresholds comparable across different alphabet sizes.
98pub fn normalized_token_entropy(text: &str) -> f64 {
99    let tokens = encode_tokens(text);
100    normalized_token_entropy_from_ids(&tokens)
101}
102
103/// Computes word-set Jaccard similarity between two strings (0.0–1.0).
104pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
105    let set_a: HashSet<&str> = a.split_whitespace().collect();
106    let set_b: HashSet<&str> = b.split_whitespace().collect();
107
108    let intersection = set_a.intersection(&set_b).count();
109    let union = set_a.union(&set_b).count();
110
111    if union == 0 {
112        return 0.0;
113    }
114    intersection as f64 / union as f64
115}
116
117/// N-gram Jaccard similarity — preserves word order (unlike word-set Jaccard).
118pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
119    let set_a = ngram_set(a, n);
120    let set_b = ngram_set(b, n);
121
122    let intersection = set_a.intersection(&set_b).count();
123    let union = set_a.union(&set_b).count();
124
125    if union == 0 {
126        return 0.0;
127    }
128    intersection as f64 / union as f64
129}
130
131fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
132    let words: Vec<&str> = text.split_whitespace().collect();
133    if words.len() < n {
134        let mut set = HashSet::new();
135        if !words.is_empty() {
136            set.insert(words.iter().map(std::string::ToString::to_string).collect());
137        }
138        return set;
139    }
140    words
141        .windows(n)
142        .map(|w| w.iter().map(std::string::ToString::to_string).collect())
143        .collect()
144}
145
146/// Minhash signature for approximate Jaccard via LSH.
147/// Uses k independent hash functions (polynomial hashing with different seeds).
148pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
149    let ngrams = ngram_set(text, n);
150    if ngrams.is_empty() {
151        return vec![u64::MAX; k];
152    }
153    let mut signature = vec![u64::MAX; k];
154    for ngram in &ngrams {
155        for (i, min) in signature.iter_mut().enumerate() {
156            let h = hash_with_seed(ngram, i as u64);
157            if h < *min {
158                *min = h;
159            }
160        }
161    }
162    signature
163}
164
165/// Approximate Jaccard from two minhash signatures.
166pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
167    if sig_a.len() != sig_b.len() || sig_a.is_empty() {
168        return 0.0;
169    }
170    let matches = sig_a
171        .iter()
172        .zip(sig_b.iter())
173        .filter(|(a, b)| a == b)
174        .count();
175    matches as f64 / sig_a.len() as f64
176}
177
178fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
179    let mut hasher = std::collections::hash_map::DefaultHasher::new();
180    seed.hash(&mut hasher);
181    value.hash(&mut hasher);
182    hasher.finish()
183}
184
185/// Kolmogorov complexity proxy: K(x) ≈ len(gzip(x)) / len(x).
186/// Lower values = more compressible = more redundant.
187pub fn kolmogorov_proxy(content: &str) -> f64 {
188    if content.is_empty() {
189        return 1.0;
190    }
191    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
192    encoder.write_all(content.as_bytes()).ok();
193    let compressed = encoder.finish().unwrap_or_default();
194    compressed.len() as f64 / content.len() as f64
195}
196
197/// Classification of content compressibility based on Kolmogorov proxy (gzip ratio).
198#[derive(Debug, Clone, Copy, PartialEq, Eq)]
199pub enum CompressibilityClass {
200    High,
201    Medium,
202    Low,
203}
204
205impl CompressibilityClass {
206    /// Returns a human-readable label with the Kolmogorov threshold range.
207    pub fn label(&self) -> &'static str {
208        match self {
209            Self::High => "high (K<0.3)",
210            Self::Medium => "medium (0.3≤K<0.6)",
211            Self::Low => "low (K≥0.6)",
212        }
213    }
214}
215
216/// Classify how compressible content is based on gzip ratio.
217pub fn compressibility_class(content: &str) -> CompressibilityClass {
218    let k = kolmogorov_proxy(content);
219    if k < 0.3 {
220        CompressibilityClass::High
221    } else if k < 0.6 {
222        CompressibilityClass::Medium
223    } else {
224        CompressibilityClass::Low
225    }
226}
227
228/// Compresses content by removing low-entropy lines and deduplicating patterns.
229pub fn entropy_compress(content: &str) -> EntropyResult {
230    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
231}
232
233/// Entropy compression with file-type-adaptive thresholds and event emission.
234pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
235    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
236    let before_lines = content.lines().count() as u32;
237    let result =
238        entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
239    let after_lines = result.output.lines().count() as u32;
240
241    if before_lines != after_lines {
242        super::events::emit(super::events::EventKind::Compression {
243            path: path.to_string(),
244            before_lines,
245            after_lines,
246            strategy: "entropy_adaptive".to_string(),
247            kept_line_count: after_lines,
248            removed_line_count: before_lines.saturating_sub(after_lines),
249        });
250    }
251
252    result
253}
254
255/// Task-conditioned entropy compression: lines that would normally be dropped
256/// for low entropy are kept if they contain task-relevant keywords.  This is
257/// the Information Bottleneck proxy: we compress away only what is neither
258/// surprising (high H) *nor* task-relevant (mentions goal concepts).
259/// Falls back to pure entropy when `task_keywords` is empty.
260pub fn entropy_compress_task_conditioned(
261    content: &str,
262    path: &str,
263    task_keywords: &[String],
264) -> EntropyResult {
265    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
266    let before_lines = content.lines().count() as u32;
267    let result = entropy_compress_with_task(
268        content,
269        thresholds.bpe_entropy,
270        thresholds.jaccard,
271        task_keywords,
272    );
273    let after_lines = result.output.lines().count() as u32;
274    if before_lines != after_lines {
275        super::events::emit(super::events::EventKind::Compression {
276            path: path.to_string(),
277            before_lines,
278            after_lines,
279            strategy: "entropy_task_conditioned".to_string(),
280            kept_line_count: after_lines,
281            removed_line_count: before_lines.saturating_sub(after_lines),
282        });
283    }
284    result
285}
286
287fn entropy_compress_with_task(
288    content: &str,
289    entropy_threshold: f64,
290    jaccard_threshold: f64,
291    task_keywords: &[String],
292) -> EntropyResult {
293    let original_tokens = count_tokens(content);
294    let mut lines: Vec<&str> = content.lines().collect();
295    let mut techniques = Vec::new();
296
297    let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();
298    let original_count = lines.len();
299    let mut task_rescued = 0usize;
300    lines.retain(|line| {
301        let trimmed = line.trim();
302        if super::surprise::should_keep_line(trimmed, entropy_threshold) {
303            return true;
304        }
305        // Task-conditioned rescue: keep low-entropy lines that mention task keywords.
306        if !kw_lower.is_empty() {
307            let lower = trimmed.to_lowercase();
308            if kw_lower.iter().any(|kw| lower.contains(kw.as_str())) {
309                task_rescued += 1;
310                return true;
311            }
312        }
313        false
314    });
315    let removed = original_count - lines.len();
316    if removed > 0 || task_rescued > 0 {
317        let mut msg = format!("⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})");
318        if task_rescued > 0 {
319            msg.push_str(&format!(" [+{task_rescued} task-rescued]"));
320        }
321        techniques.push(msg);
322    }
323
324    let blocks = extract_blocks(&lines);
325    let groups = find_pattern_groups(&blocks, jaccard_threshold);
326    let mut dedup_count = 0;
327    for group in &groups {
328        if group.len() > 1 {
329            dedup_count += group.len() - 1;
330        }
331    }
332    if dedup_count > 0 {
333        techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
334    }
335
336    let mut result: Vec<String> = Vec::new();
337    let mut skip_indices: std::collections::HashSet<usize> = std::collections::HashSet::new();
338    for group in &groups {
339        if group.len() > 1 {
340            for &idx in &group[1..] {
341                skip_indices.insert(idx);
342            }
343        }
344    }
345    for (i, line) in lines.iter().enumerate() {
346        if !skip_indices.contains(&i) {
347            result.push(line.to_string());
348        }
349    }
350
351    let mut collapsed = Vec::new();
352    let mut blank_count = 0;
353    for line in &result {
354        if line.trim().is_empty() {
355            blank_count += 1;
356            if blank_count <= 1 {
357                collapsed.push(line.clone());
358            }
359        } else {
360            blank_count = 0;
361            collapsed.push(line.clone());
362        }
363    }
364    let output = collapsed.join("\n");
365    let compressed_tokens = count_tokens(&output);
366
367    // Safeguard: BPE re-tokenization of a line subset can, for tiny adversarial
368    // inputs, exceed the original token count (e.g. a dropped trailing newline
369    // merges differently). Never inflate — fall back to the original verbatim.
370    let final_output = if compressed_tokens > original_tokens {
371        content.to_string()
372    } else {
373        output
374    };
375    let final_tokens = if compressed_tokens > original_tokens {
376        original_tokens
377    } else {
378        compressed_tokens
379    };
380
381    EntropyResult {
382        output: final_output,
383        original_tokens,
384        compressed_tokens: final_tokens,
385        techniques,
386    }
387}
388
389fn entropy_compress_with_thresholds(
390    content: &str,
391    entropy_threshold: f64,
392    jaccard_threshold: f64,
393) -> EntropyResult {
394    entropy_compress_with_task(content, entropy_threshold, jaccard_threshold, &[])
395}
396
397/// Per-line entropy statistics for a block of content.
398#[derive(Debug)]
399pub struct EntropyAnalysis {
400    pub avg_entropy: f64,
401    pub low_entropy_count: usize,
402    pub high_entropy_count: usize,
403    pub total_lines: usize,
404}
405
406/// Analyzes per-line BPE token entropy, counting low/high entropy lines.
407pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
408    let lines: Vec<&str> = content.lines().collect();
409    let total = lines.len();
410    let mut sum = 0.0;
411    let mut low = 0;
412    let mut high = 0;
413    let mut counted = 0;
414
415    for line in &lines {
416        let trimmed = line.trim();
417        if trimmed.is_empty() {
418            continue;
419        }
420        let h = token_entropy(trimmed);
421        sum += h;
422        counted += 1;
423        if h < BPE_ENTROPY_THRESHOLD {
424            low += 1;
425        }
426        if h > 3.0 {
427            high += 1;
428        }
429    }
430
431    EntropyAnalysis {
432        avg_entropy: if counted > 0 {
433            sum / counted as f64
434        } else {
435            0.0
436        },
437        low_entropy_count: low,
438        high_entropy_count: high,
439        total_lines: total,
440    }
441}
442
443struct Block {
444    content: String,
445}
446
447fn extract_blocks(lines: &[&str]) -> Vec<Block> {
448    let mut blocks = Vec::new();
449    let mut current = String::new();
450
451    for line in lines {
452        let trimmed = line.trim();
453        if trimmed.is_empty() && !current.is_empty() {
454            blocks.push(Block {
455                content: current.clone(),
456            });
457            current.clear();
458        } else if !trimmed.is_empty() {
459            current.push_str(trimmed);
460            current.push('\n');
461        }
462    }
463
464    if !current.is_empty() {
465        blocks.push(Block { content: current });
466    }
467
468    blocks
469}
470
471fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
472    // Exact n-gram Jaccard, but with precomputed n-gram sets per block to avoid
473    // rebuilding allocations per pair. Includes a size-ratio impossibility check
474    // (max possible Jaccard is |A|/|B| for |A|<=|B|).
475    let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
476    let sizes: Vec<usize> = sets.iter().map(std::collections::HashSet::len).collect();
477
478    let mut groups: Vec<Vec<usize>> = Vec::new();
479    let mut assigned: HashSet<usize> = HashSet::new();
480
481    for i in 0..blocks.len() {
482        if assigned.contains(&i) {
483            continue;
484        }
485        let mut group = vec![i];
486        for j in (i + 1)..blocks.len() {
487            if assigned.contains(&j) {
488                continue;
489            }
490            let size_i = sizes[i];
491            let size_j = sizes[j];
492            let min_sz = size_i.min(size_j);
493            let max_sz = size_i.max(size_j);
494            if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
495                continue;
496            }
497            let inter = sets[i].intersection(&sets[j]).count();
498            let union = size_i + size_j - inter;
499            if union > 0 && (inter as f64 / union as f64) >= threshold {
500                group.push(j);
501                assigned.insert(j);
502            }
503        }
504        if group.len() > 1 {
505            assigned.insert(i);
506        }
507        groups.push(group);
508    }
509
510    groups
511}
512
513#[cfg(test)]
514mod tests {
515    use super::*;
516
517    #[test]
518    fn shannon_entropy_empty_is_zero() {
519        assert_eq!(shannon_entropy(""), 0.0);
520    }
521
522    #[test]
523    fn shannon_entropy_single_char() {
524        assert_eq!(shannon_entropy("aaaa"), 0.0);
525    }
526
527    #[test]
528    fn shannon_entropy_high_for_varied_text() {
529        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
530        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
531        assert!(
532            shannon_entropy(varied) > shannon_entropy(uniform),
533            "varied text should have higher entropy"
534        );
535    }
536
537    #[test]
538    fn jaccard_identical_is_one() {
539        let sim = jaccard_similarity("hello world", "hello world");
540        assert!((sim - 1.0).abs() < f64::EPSILON);
541    }
542
543    #[test]
544    fn jaccard_disjoint_is_zero() {
545        let sim = jaccard_similarity("abc", "xyz");
546        assert_eq!(sim, 0.0);
547    }
548
549    #[test]
550    fn jaccard_partial_overlap() {
551        let sim = jaccard_similarity("hello world", "hello rust");
552        assert!(sim > 0.0 && sim < 1.0);
553    }
554
555    #[test]
556    fn entropy_compress_produces_output() {
557        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
558        let result = entropy_compress(content);
559        assert!(!result.output.is_empty(), "should produce non-empty output");
560        assert!(result.compressed_tokens <= result.original_tokens);
561    }
562
563    #[test]
564    fn entropy_result_savings() {
565        let r = EntropyResult {
566            output: "short".to_string(),
567            original_tokens: 100,
568            compressed_tokens: 60,
569            techniques: vec!["test".to_string()],
570        };
571        assert!((r.savings_percent() - 40.0).abs() < 0.1);
572    }
573
574    #[test]
575    fn entropy_result_zero_original() {
576        let r = EntropyResult {
577            output: String::new(),
578            original_tokens: 0,
579            compressed_tokens: 0,
580            techniques: vec![],
581        };
582        assert_eq!(r.savings_percent(), 0.0);
583    }
584
585    #[test]
586    fn token_entropy_empty_is_zero() {
587        assert_eq!(token_entropy(""), 0.0);
588    }
589
590    #[test]
591    fn token_entropy_single_repeated_token() {
592        assert_eq!(token_entropy("}"), 0.0);
593    }
594
595    #[test]
596    fn token_entropy_higher_for_diverse_code() {
597        let diverse = "let result = compute_something(x, y, z);";
598        let repetitive = "aaaa aaaa aaaa aaaa";
599        assert!(
600            token_entropy(diverse) > token_entropy(repetitive),
601            "diverse code should have higher BPE token entropy"
602        );
603    }
604
605    #[test]
606    fn token_entropy_vs_char_entropy_differ() {
607        let code = "fn main() { println!(\"hello world\"); }";
608        let te = token_entropy(code);
609        let ce = shannon_entropy(code);
610        assert!(te != ce, "BPE and char entropy should differ for code");
611    }
612
613    #[test]
614    fn ngram_jaccard_preserves_order() {
615        let a = "a b c d";
616        let b = "d c b a";
617        let word_j = jaccard_similarity(a, b);
618        let ngram_j = ngram_jaccard(a, b, 2);
619        assert!(
620            ngram_j < word_j,
621            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
622        );
623    }
624
625    #[test]
626    fn ngram_jaccard_identical_is_one() {
627        let text = "fn main() { println!(\"hello\"); }";
628        let j = ngram_jaccard(text, text, 2);
629        assert!((j - 1.0).abs() < f64::EPSILON);
630    }
631
632    #[test]
633    fn ngram_jaccard_disjoint_is_zero() {
634        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
635        assert_eq!(j, 0.0);
636    }
637
638    #[test]
639    fn minhash_approximates_jaccard() {
640        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
641        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
642        let exact = ngram_jaccard(a, b, 2);
643        let sig_a = minhash_signature(a, 2, 128);
644        let sig_b = minhash_signature(b, 2, 128);
645        let approx = minhash_similarity(&sig_a, &sig_b);
646        assert!(
647            (exact - approx).abs() < 0.2,
648            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
649        );
650    }
651
652    #[test]
653    fn minhash_empty_text() {
654        let sig = minhash_signature("", 2, 64);
655        assert!(sig.iter().all(|&v| v == u64::MAX));
656    }
657
658    #[test]
659    fn kolmogorov_empty_is_one() {
660        assert_eq!(kolmogorov_proxy(""), 1.0);
661    }
662
663    #[test]
664    fn kolmogorov_repetitive_is_low() {
665        let repetitive = "aaa\n".repeat(1000);
666        let k = kolmogorov_proxy(&repetitive);
667        assert!(
668            k < 0.1,
669            "highly repetitive text should compress well: K={k}"
670        );
671    }
672
673    #[test]
674    fn kolmogorov_diverse_is_higher() {
675        let repetitive = "aaa\n".repeat(500);
676        let diverse = (0..500)
677            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
678            .collect::<Vec<_>>()
679            .join("\n");
680        assert!(
681            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
682            "diverse content should have higher K than repetitive"
683        );
684    }
685
686    #[test]
687    fn compressibility_class_repetitive_is_high() {
688        let text = "use std::io;\n".repeat(200);
689        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
690    }
691
692    #[test]
693    fn kolmogorov_diverse_higher_than_repetitive() {
694        let rep = "test\n".repeat(500);
695        let diverse = (0..500)
696            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
697            .collect::<Vec<_>>()
698            .join("\n");
699        assert!(
700            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
701            "diverse content should have higher K"
702        );
703    }
704}