Skip to main content

lean_ctx/core/
entropy.rs

1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
10const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
12/// Result of entropy-based compression: output text, token counts, and techniques used.
13#[derive(Debug)]
14pub struct EntropyResult {
15    pub output: String,
16    pub original_tokens: usize,
17    pub compressed_tokens: usize,
18    pub techniques: Vec<String>,
19}
20
21impl EntropyResult {
22    /// Returns the percentage of tokens saved by compression.
23    pub fn savings_percent(&self) -> f64 {
24        if self.original_tokens == 0 {
25            return 0.0;
26        }
27        let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
28        (saved as f64 / self.original_tokens as f64) * 100.0
29    }
30}
31
32/// Computes Shannon entropy (bits) over character frequencies in the text.
33pub fn shannon_entropy(text: &str) -> f64 {
34    if text.is_empty() {
35        return 0.0;
36    }
37    let mut freq: HashMap<char, usize> = HashMap::new();
38    let total = text.chars().count();
39
40    for c in text.chars() {
41        *freq.entry(c).or_default() += 1;
42    }
43
44    freq.values().fold(0.0_f64, |acc, &count| {
45        let p = count as f64 / total as f64;
46        acc - p * p.log2()
47    })
48}
49
50/// Shannon entropy over BPE token IDs (o200k_base).
51/// More LLM-relevant than character entropy since LLMs process BPE tokens.
52pub fn token_entropy(text: &str) -> f64 {
53    let tokens = encode_tokens(text);
54    if tokens.is_empty() {
55        return 0.0;
56    }
57    let total = tokens.len();
58    let mut freq: HashMap<u32, usize> = HashMap::new();
59    for &t in &tokens {
60        *freq.entry(t).or_default() += 1;
61    }
62    freq.values().fold(0.0_f64, |acc, &count| {
63        let p = count as f64 / total as f64;
64        acc - p * p.log2()
65    })
66}
67
68/// Normalized Shannon entropy: H(X) / log₂(n) where n = number of unique symbols.
69/// Returns a value in [0, 1] where 0 = perfectly predictable, 1 = maximum entropy.
70/// This makes thresholds comparable across different alphabet sizes.
71pub fn normalized_token_entropy(text: &str) -> f64 {
72    let tokens = encode_tokens(text);
73    if tokens.is_empty() {
74        return 0.0;
75    }
76    let total = tokens.len();
77    let mut freq: HashMap<u32, usize> = HashMap::new();
78    for &t in &tokens {
79        *freq.entry(t).or_default() += 1;
80    }
81    let n_unique = freq.len();
82    if n_unique <= 1 {
83        return 0.0;
84    }
85    let h = freq.values().fold(0.0_f64, |acc, &count| {
86        let p = count as f64 / total as f64;
87        acc - p * p.log2()
88    });
89    let h_max = (n_unique as f64).log2();
90    h / h_max
91}
92
93/// Computes word-set Jaccard similarity between two strings (0.0–1.0).
94pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
95    let set_a: HashSet<&str> = a.split_whitespace().collect();
96    let set_b: HashSet<&str> = b.split_whitespace().collect();
97
98    let intersection = set_a.intersection(&set_b).count();
99    let union = set_a.union(&set_b).count();
100
101    if union == 0 {
102        return 0.0;
103    }
104    intersection as f64 / union as f64
105}
106
107/// N-gram Jaccard similarity — preserves word order (unlike word-set Jaccard).
108pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
109    let set_a = ngram_set(a, n);
110    let set_b = ngram_set(b, n);
111
112    let intersection = set_a.intersection(&set_b).count();
113    let union = set_a.union(&set_b).count();
114
115    if union == 0 {
116        return 0.0;
117    }
118    intersection as f64 / union as f64
119}
120
121fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
122    let words: Vec<&str> = text.split_whitespace().collect();
123    if words.len() < n {
124        let mut set = HashSet::new();
125        if !words.is_empty() {
126            set.insert(words.iter().map(std::string::ToString::to_string).collect());
127        }
128        return set;
129    }
130    words
131        .windows(n)
132        .map(|w| w.iter().map(std::string::ToString::to_string).collect())
133        .collect()
134}
135
136/// Minhash signature for approximate Jaccard via LSH.
137/// Uses k independent hash functions (polynomial hashing with different seeds).
138pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
139    let ngrams = ngram_set(text, n);
140    if ngrams.is_empty() {
141        return vec![u64::MAX; k];
142    }
143    let mut signature = vec![u64::MAX; k];
144    for ngram in &ngrams {
145        for (i, min) in signature.iter_mut().enumerate() {
146            let h = hash_with_seed(ngram, i as u64);
147            if h < *min {
148                *min = h;
149            }
150        }
151    }
152    signature
153}
154
155/// Approximate Jaccard from two minhash signatures.
156pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
157    if sig_a.len() != sig_b.len() || sig_a.is_empty() {
158        return 0.0;
159    }
160    let matches = sig_a
161        .iter()
162        .zip(sig_b.iter())
163        .filter(|(a, b)| a == b)
164        .count();
165    matches as f64 / sig_a.len() as f64
166}
167
168fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
169    let mut hasher = std::collections::hash_map::DefaultHasher::new();
170    seed.hash(&mut hasher);
171    value.hash(&mut hasher);
172    hasher.finish()
173}
174
175/// Kolmogorov complexity proxy: K(x) ≈ len(gzip(x)) / len(x).
176/// Lower values = more compressible = more redundant.
177pub fn kolmogorov_proxy(content: &str) -> f64 {
178    if content.is_empty() {
179        return 1.0;
180    }
181    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
182    encoder.write_all(content.as_bytes()).ok();
183    let compressed = encoder.finish().unwrap_or_default();
184    compressed.len() as f64 / content.len() as f64
185}
186
187/// Classification of content compressibility based on Kolmogorov proxy (gzip ratio).
188#[derive(Debug, Clone, Copy, PartialEq, Eq)]
189pub enum CompressibilityClass {
190    High,
191    Medium,
192    Low,
193}
194
195impl CompressibilityClass {
196    /// Returns a human-readable label with the Kolmogorov threshold range.
197    pub fn label(&self) -> &'static str {
198        match self {
199            Self::High => "high (K<0.3)",
200            Self::Medium => "medium (0.3≤K<0.6)",
201            Self::Low => "low (K≥0.6)",
202        }
203    }
204}
205
206/// Classify how compressible content is based on gzip ratio.
207pub fn compressibility_class(content: &str) -> CompressibilityClass {
208    let k = kolmogorov_proxy(content);
209    if k < 0.3 {
210        CompressibilityClass::High
211    } else if k < 0.6 {
212        CompressibilityClass::Medium
213    } else {
214        CompressibilityClass::Low
215    }
216}
217
218/// Compresses content by removing low-entropy lines and deduplicating patterns.
219pub fn entropy_compress(content: &str) -> EntropyResult {
220    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
221}
222
223/// Entropy compression with file-type-adaptive thresholds and event emission.
224pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
225    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
226    let before_lines = content.lines().count() as u32;
227    let result =
228        entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
229    let after_lines = result.output.lines().count() as u32;
230
231    if before_lines != after_lines {
232        super::events::emit(super::events::EventKind::Compression {
233            path: path.to_string(),
234            before_lines,
235            after_lines,
236            strategy: "entropy_adaptive".to_string(),
237            kept_line_count: after_lines,
238            removed_line_count: before_lines.saturating_sub(after_lines),
239        });
240    }
241
242    result
243}
244
245fn entropy_compress_with_thresholds(
246    content: &str,
247    entropy_threshold: f64,
248    jaccard_threshold: f64,
249) -> EntropyResult {
250    let original_tokens = count_tokens(content);
251    let mut lines: Vec<&str> = content.lines().collect();
252    let mut techniques = Vec::new();
253
254    let original_count = lines.len();
255    lines.retain(|line| {
256        let trimmed = line.trim();
257        super::surprise::should_keep_line(trimmed, entropy_threshold)
258    });
259    let removed = original_count - lines.len();
260    if removed > 0 {
261        techniques.push(format!(
262            "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
263        ));
264    }
265
266    let blocks = extract_blocks(&lines);
267    let groups = find_pattern_groups(&blocks, jaccard_threshold);
268    let mut dedup_count = 0;
269    for group in &groups {
270        if group.len() > 1 {
271            dedup_count += group.len() - 1;
272        }
273    }
274    if dedup_count > 0 {
275        techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
276    }
277
278    let mut result: Vec<String> = Vec::new();
279    let mut skip_indices: HashSet<usize> = HashSet::new();
280    for group in &groups {
281        if group.len() > 1 {
282            for &idx in &group[1..] {
283                skip_indices.insert(idx);
284            }
285        }
286    }
287    for (i, line) in lines.iter().enumerate() {
288        if !skip_indices.contains(&i) {
289            result.push(line.to_string());
290        }
291    }
292
293    let mut collapsed = Vec::new();
294    let mut blank_count = 0;
295    for line in &result {
296        if line.trim().is_empty() {
297            blank_count += 1;
298            if blank_count <= 1 {
299                collapsed.push(line.clone());
300            }
301        } else {
302            blank_count = 0;
303            collapsed.push(line.clone());
304        }
305    }
306
307    let output = collapsed.join("\n");
308    let compressed_tokens = count_tokens(&output);
309
310    EntropyResult {
311        output,
312        original_tokens,
313        compressed_tokens,
314        techniques,
315    }
316}
317
318/// Per-line entropy statistics for a block of content.
319#[derive(Debug)]
320pub struct EntropyAnalysis {
321    pub avg_entropy: f64,
322    pub low_entropy_count: usize,
323    pub high_entropy_count: usize,
324    pub total_lines: usize,
325}
326
327/// Analyzes per-line BPE token entropy, counting low/high entropy lines.
328pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
329    let lines: Vec<&str> = content.lines().collect();
330    let total = lines.len();
331    let mut sum = 0.0;
332    let mut low = 0;
333    let mut high = 0;
334    let mut counted = 0;
335
336    for line in &lines {
337        let trimmed = line.trim();
338        if trimmed.is_empty() {
339            continue;
340        }
341        let h = token_entropy(trimmed);
342        sum += h;
343        counted += 1;
344        if h < BPE_ENTROPY_THRESHOLD {
345            low += 1;
346        }
347        if h > 3.0 {
348            high += 1;
349        }
350    }
351
352    EntropyAnalysis {
353        avg_entropy: if counted > 0 {
354            sum / counted as f64
355        } else {
356            0.0
357        },
358        low_entropy_count: low,
359        high_entropy_count: high,
360        total_lines: total,
361    }
362}
363
364struct Block {
365    content: String,
366}
367
368fn extract_blocks(lines: &[&str]) -> Vec<Block> {
369    let mut blocks = Vec::new();
370    let mut current = String::new();
371
372    for line in lines {
373        let trimmed = line.trim();
374        if trimmed.is_empty() && !current.is_empty() {
375            blocks.push(Block {
376                content: current.clone(),
377            });
378            current.clear();
379        } else if !trimmed.is_empty() {
380            current.push_str(trimmed);
381            current.push('\n');
382        }
383    }
384
385    if !current.is_empty() {
386        blocks.push(Block { content: current });
387    }
388
389    blocks
390}
391
392fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
393    // Exact n-gram Jaccard, but with precomputed n-gram sets per block to avoid
394    // rebuilding allocations per pair. Includes a size-ratio impossibility check
395    // (max possible Jaccard is |A|/|B| for |A|<=|B|).
396    let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
397    let sizes: Vec<usize> = sets.iter().map(std::collections::HashSet::len).collect();
398
399    let mut groups: Vec<Vec<usize>> = Vec::new();
400    let mut assigned: HashSet<usize> = HashSet::new();
401
402    for i in 0..blocks.len() {
403        if assigned.contains(&i) {
404            continue;
405        }
406        let mut group = vec![i];
407        for j in (i + 1)..blocks.len() {
408            if assigned.contains(&j) {
409                continue;
410            }
411            let size_i = sizes[i];
412            let size_j = sizes[j];
413            let min_sz = size_i.min(size_j);
414            let max_sz = size_i.max(size_j);
415            if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
416                continue;
417            }
418            let inter = sets[i].intersection(&sets[j]).count();
419            let union = size_i + size_j - inter;
420            if union > 0 && (inter as f64 / union as f64) >= threshold {
421                group.push(j);
422                assigned.insert(j);
423            }
424        }
425        if group.len() > 1 {
426            assigned.insert(i);
427        }
428        groups.push(group);
429    }
430
431    groups
432}
433
434#[cfg(test)]
435mod tests {
436    use super::*;
437
438    #[test]
439    fn shannon_entropy_empty_is_zero() {
440        assert_eq!(shannon_entropy(""), 0.0);
441    }
442
443    #[test]
444    fn shannon_entropy_single_char() {
445        assert_eq!(shannon_entropy("aaaa"), 0.0);
446    }
447
448    #[test]
449    fn shannon_entropy_high_for_varied_text() {
450        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
451        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
452        assert!(
453            shannon_entropy(varied) > shannon_entropy(uniform),
454            "varied text should have higher entropy"
455        );
456    }
457
458    #[test]
459    fn jaccard_identical_is_one() {
460        let sim = jaccard_similarity("hello world", "hello world");
461        assert!((sim - 1.0).abs() < f64::EPSILON);
462    }
463
464    #[test]
465    fn jaccard_disjoint_is_zero() {
466        let sim = jaccard_similarity("abc", "xyz");
467        assert_eq!(sim, 0.0);
468    }
469
470    #[test]
471    fn jaccard_partial_overlap() {
472        let sim = jaccard_similarity("hello world", "hello rust");
473        assert!(sim > 0.0 && sim < 1.0);
474    }
475
476    #[test]
477    fn entropy_compress_produces_output() {
478        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
479        let result = entropy_compress(content);
480        assert!(!result.output.is_empty(), "should produce non-empty output");
481        assert!(result.compressed_tokens <= result.original_tokens);
482    }
483
484    #[test]
485    fn entropy_result_savings() {
486        let r = EntropyResult {
487            output: "short".to_string(),
488            original_tokens: 100,
489            compressed_tokens: 60,
490            techniques: vec!["test".to_string()],
491        };
492        assert!((r.savings_percent() - 40.0).abs() < 0.1);
493    }
494
495    #[test]
496    fn entropy_result_zero_original() {
497        let r = EntropyResult {
498            output: String::new(),
499            original_tokens: 0,
500            compressed_tokens: 0,
501            techniques: vec![],
502        };
503        assert_eq!(r.savings_percent(), 0.0);
504    }
505
506    #[test]
507    fn token_entropy_empty_is_zero() {
508        assert_eq!(token_entropy(""), 0.0);
509    }
510
511    #[test]
512    fn token_entropy_single_repeated_token() {
513        assert_eq!(token_entropy("}"), 0.0);
514    }
515
516    #[test]
517    fn token_entropy_higher_for_diverse_code() {
518        let diverse = "let result = compute_something(x, y, z);";
519        let repetitive = "aaaa aaaa aaaa aaaa";
520        assert!(
521            token_entropy(diverse) > token_entropy(repetitive),
522            "diverse code should have higher BPE token entropy"
523        );
524    }
525
526    #[test]
527    fn token_entropy_vs_char_entropy_differ() {
528        let code = "fn main() { println!(\"hello world\"); }";
529        let te = token_entropy(code);
530        let ce = shannon_entropy(code);
531        assert!(te != ce, "BPE and char entropy should differ for code");
532    }
533
534    #[test]
535    fn ngram_jaccard_preserves_order() {
536        let a = "a b c d";
537        let b = "d c b a";
538        let word_j = jaccard_similarity(a, b);
539        let ngram_j = ngram_jaccard(a, b, 2);
540        assert!(
541            ngram_j < word_j,
542            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
543        );
544    }
545
546    #[test]
547    fn ngram_jaccard_identical_is_one() {
548        let text = "fn main() { println!(\"hello\"); }";
549        let j = ngram_jaccard(text, text, 2);
550        assert!((j - 1.0).abs() < f64::EPSILON);
551    }
552
553    #[test]
554    fn ngram_jaccard_disjoint_is_zero() {
555        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
556        assert_eq!(j, 0.0);
557    }
558
559    #[test]
560    fn minhash_approximates_jaccard() {
561        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
562        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
563        let exact = ngram_jaccard(a, b, 2);
564        let sig_a = minhash_signature(a, 2, 128);
565        let sig_b = minhash_signature(b, 2, 128);
566        let approx = minhash_similarity(&sig_a, &sig_b);
567        assert!(
568            (exact - approx).abs() < 0.2,
569            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
570        );
571    }
572
573    #[test]
574    fn minhash_empty_text() {
575        let sig = minhash_signature("", 2, 64);
576        assert!(sig.iter().all(|&v| v == u64::MAX));
577    }
578
579    #[test]
580    fn kolmogorov_empty_is_one() {
581        assert_eq!(kolmogorov_proxy(""), 1.0);
582    }
583
584    #[test]
585    fn kolmogorov_repetitive_is_low() {
586        let repetitive = "aaa\n".repeat(1000);
587        let k = kolmogorov_proxy(&repetitive);
588        assert!(
589            k < 0.1,
590            "highly repetitive text should compress well: K={k}"
591        );
592    }
593
594    #[test]
595    fn kolmogorov_diverse_is_higher() {
596        let repetitive = "aaa\n".repeat(500);
597        let diverse = (0..500)
598            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
599            .collect::<Vec<_>>()
600            .join("\n");
601        assert!(
602            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
603            "diverse content should have higher K than repetitive"
604        );
605    }
606
607    #[test]
608    fn compressibility_class_repetitive_is_high() {
609        let text = "use std::io;\n".repeat(200);
610        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
611    }
612
613    #[test]
614    fn kolmogorov_diverse_higher_than_repetitive() {
615        let rep = "test\n".repeat(500);
616        let diverse = (0..500)
617            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
618            .collect::<Vec<_>>()
619            .join("\n");
620        assert!(
621            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
622            "diverse content should have higher K"
623        );
624    }
625}