//! lean_ctx/core/entropy.rs — entropy, similarity, and compressibility
//! metrics used for context compression.

use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
use std::io::Write;

use flate2::write::GzEncoder;
use flate2::Compression;

use super::tokens::{count_tokens, encode_tokens};
9
/// Lines whose BPE token entropy falls below this are considered low-information.
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
/// Number of independent hash functions in a minhash signature.
#[allow(dead_code)]
const MINHASH_NUM_HASHES: usize = 128;
13
/// Outcome of an entropy-compression pass: the compressed text plus
/// token accounting and a human-readable list of applied techniques.
#[derive(Debug)]
#[allow(dead_code)]
pub struct EntropyResult {
    pub output: String,
    pub original_tokens: usize,
    pub compressed_tokens: usize,
    pub techniques: Vec<String>,
}

#[allow(dead_code)]
impl EntropyResult {
    /// Percentage of tokens saved relative to the original.
    /// Returns 0.0 when the original had no tokens (avoids division by zero).
    pub fn savings_percent(&self) -> f64 {
        if self.original_tokens == 0 {
            return 0.0;
        }
        // saturating_sub guards against compressed > original (no negative savings).
        let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
        (saved as f64 / self.original_tokens as f64) * 100.0
    }
}
33
/// Shannon entropy (bits per character) of the character distribution of `text`.
/// Returns 0.0 for empty input and for text made of a single repeated character.
#[allow(dead_code)]
pub fn shannon_entropy(text: &str) -> f64 {
    if text.is_empty() {
        return 0.0;
    }
    let mut freq: HashMap<char, usize> = HashMap::new();
    let total = text.chars().count();

    for c in text.chars() {
        *freq.entry(c).or_default() += 1;
    }

    // H = -Σ p·log2(p) over the observed character frequencies.
    freq.values().fold(0.0_f64, |acc, &count| {
        let p = count as f64 / total as f64;
        acc - p * p.log2()
    })
}
51
52/// Shannon entropy over BPE token IDs (o200k_base).
53/// More LLM-relevant than character entropy since LLMs process BPE tokens.
54pub fn token_entropy(text: &str) -> f64 {
55    let tokens = encode_tokens(text);
56    if tokens.is_empty() {
57        return 0.0;
58    }
59    let total = tokens.len();
60    let mut freq: HashMap<u32, usize> = HashMap::new();
61    for &t in &tokens {
62        *freq.entry(t).or_default() += 1;
63    }
64    freq.values().fold(0.0_f64, |acc, &count| {
65        let p = count as f64 / total as f64;
66        acc - p * p.log2()
67    })
68}
69
/// Word-set Jaccard similarity: |A∩B| / |A∪B| over whitespace-split words.
/// Order-insensitive; returns 0.0 when neither string contains a word.
#[allow(dead_code)]
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let set_a: HashSet<&str> = a.split_whitespace().collect();
    let set_b: HashSet<&str> = b.split_whitespace().collect();

    let intersection = set_a.intersection(&set_b).count();
    let union = set_a.union(&set_b).count();

    if union == 0 {
        return 0.0;
    }
    intersection as f64 / union as f64
}
83
/// N-gram Jaccard similarity — preserves word order (unlike word-set Jaccard).
pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
    let set_a = ngram_set(a, n);
    let set_b = ngram_set(b, n);

    let intersection = set_a.intersection(&set_b).count();
    let union = set_a.union(&set_b).count();

    if union == 0 {
        return 0.0;
    }
    intersection as f64 / union as f64
}

/// Sliding word n-grams of `text`. A text shorter than `n` words yields a
/// single "gram" containing all of its words, so short identical texts still
/// compare as equal; an empty text yields an empty set.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    if words.len() < n {
        let mut set = HashSet::new();
        if !words.is_empty() {
            set.insert(words.iter().map(|w| w.to_string()).collect());
        }
        return set;
    }
    words
        .windows(n)
        .map(|w| w.iter().map(|s| s.to_string()).collect())
        .collect()
}
112
113/// Minhash signature for approximate Jaccard via LSH.
114/// Uses k independent hash functions (polynomial hashing with different seeds).
115#[allow(dead_code)]
116pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
117    let ngrams = ngram_set(text, n);
118    if ngrams.is_empty() {
119        return vec![u64::MAX; k];
120    }
121    let mut signature = vec![u64::MAX; k];
122    for ngram in &ngrams {
123        for (i, min) in signature.iter_mut().enumerate() {
124            let h = hash_with_seed(ngram, i as u64);
125            if h < *min {
126                *min = h;
127            }
128        }
129    }
130    signature
131}
132
/// Approximate Jaccard from two minhash signatures: fraction of matching slots.
/// Returns 0.0 for empty or mismatched-length signatures.
#[allow(dead_code)]
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.len() != sig_b.len() || sig_a.is_empty() {
        return 0.0;
    }
    let matches = sig_a
        .iter()
        .zip(sig_b.iter())
        .filter(|(a, b)| a == b)
        .count();
    matches as f64 / sig_a.len() as f64
}
146
/// Hash `value` with a seed folded into the hasher state, yielding a family of
/// hash functions indexed by `seed` (used by minhash). Deterministic within a
/// process; DefaultHasher's algorithm is not guaranteed stable across Rust versions.
#[allow(dead_code)]
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    seed.hash(&mut hasher);
    value.hash(&mut hasher);
    hasher.finish()
}
154
155/// Kolmogorov complexity proxy: K(x) ≈ len(gzip(x)) / len(x).
156/// Lower values = more compressible = more redundant.
157pub fn kolmogorov_proxy(content: &str) -> f64 {
158    if content.is_empty() {
159        return 1.0;
160    }
161    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
162    encoder.write_all(content.as_bytes()).ok();
163    let compressed = encoder.finish().unwrap_or_default();
164    compressed.len() as f64 / content.len() as f64
165}
166
/// Bucketed compressibility of content based on its gzip ratio K.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    /// K < 0.3 — highly redundant.
    High,
    /// 0.3 ≤ K < 0.6.
    Medium,
    /// K ≥ 0.6 — near-incompressible.
    Low,
}

impl CompressibilityClass {
    /// Human-readable label including the K-range this class covers.
    pub fn label(&self) -> &'static str {
        match self {
            Self::High => "high (K<0.3)",
            Self::Medium => "medium (0.3≤K<0.6)",
            Self::Low => "low (K≥0.6)",
        }
    }
}
183
184/// Classify how compressible content is based on gzip ratio.
185pub fn compressibility_class(content: &str) -> CompressibilityClass {
186    let k = kolmogorov_proxy(content);
187    if k < 0.3 {
188        CompressibilityClass::High
189    } else if k < 0.6 {
190        CompressibilityClass::Medium
191    } else {
192        CompressibilityClass::Low
193    }
194}
195
196pub fn entropy_compress(content: &str) -> EntropyResult {
197    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
198}
199
200pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
201    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
202    entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard)
203}
204
205fn entropy_compress_with_thresholds(
206    content: &str,
207    entropy_threshold: f64,
208    jaccard_threshold: f64,
209) -> EntropyResult {
210    let original_tokens = count_tokens(content);
211    let mut lines: Vec<&str> = content.lines().collect();
212    let mut techniques = Vec::new();
213
214    let original_count = lines.len();
215    lines.retain(|line| {
216        let trimmed = line.trim();
217        if trimmed.is_empty() || trimmed.len() < 3 {
218            return true;
219        }
220        token_entropy(trimmed) >= entropy_threshold
221    });
222    let removed = original_count - lines.len();
223    if removed > 0 {
224        techniques.push(format!(
225            "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
226        ));
227    }
228
229    let blocks = extract_blocks(&lines);
230    let groups = find_pattern_groups(&blocks, jaccard_threshold);
231    let mut dedup_count = 0;
232    for group in &groups {
233        if group.len() > 1 {
234            dedup_count += group.len() - 1;
235        }
236    }
237    if dedup_count > 0 {
238        techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
239    }
240
241    let mut result: Vec<String> = Vec::new();
242    let mut skip_indices: HashSet<usize> = HashSet::new();
243    for group in &groups {
244        if group.len() > 1 {
245            for &idx in &group[1..] {
246                skip_indices.insert(idx);
247            }
248        }
249    }
250    for (i, line) in lines.iter().enumerate() {
251        if !skip_indices.contains(&i) {
252            result.push(line.to_string());
253        }
254    }
255
256    let mut collapsed = Vec::new();
257    let mut blank_count = 0;
258    for line in &result {
259        if line.trim().is_empty() {
260            blank_count += 1;
261            if blank_count <= 1 {
262                collapsed.push(line.clone());
263            }
264        } else {
265            blank_count = 0;
266            collapsed.push(line.clone());
267        }
268    }
269
270    let output = collapsed.join("\n");
271    let compressed_tokens = count_tokens(&output);
272
273    EntropyResult {
274        output,
275        original_tokens,
276        compressed_tokens,
277        techniques,
278    }
279}
280
/// Summary statistics from a per-line entropy scan of a file.
#[derive(Debug)]
pub struct EntropyAnalysis {
    /// Mean BPE token entropy over non-blank lines (0.0 if none).
    pub avg_entropy: f64,
    /// Non-blank lines with entropy below `BPE_ENTROPY_THRESHOLD`.
    pub low_entropy_count: usize,
    /// Non-blank lines with entropy above 3.0 bits.
    pub high_entropy_count: usize,
    /// Total line count, including blank lines.
    pub total_lines: usize,
}
288
289pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
290    let lines: Vec<&str> = content.lines().collect();
291    let total = lines.len();
292    let mut sum = 0.0;
293    let mut low = 0;
294    let mut high = 0;
295    let mut counted = 0;
296
297    for line in &lines {
298        let trimmed = line.trim();
299        if trimmed.is_empty() {
300            continue;
301        }
302        let h = token_entropy(trimmed);
303        sum += h;
304        counted += 1;
305        if h < BPE_ENTROPY_THRESHOLD {
306            low += 1;
307        }
308        if h > 3.0 {
309            high += 1;
310        }
311    }
312
313    EntropyAnalysis {
314        avg_entropy: if counted > 0 {
315            sum / counted as f64
316        } else {
317            0.0
318        },
319        low_entropy_count: low,
320        high_entropy_count: high,
321        total_lines: total,
322    }
323}
324
/// A contiguous run of non-blank lines, joined (trimmed) with '\n'.
#[allow(dead_code)]
struct Block {
    /// Index into the line slice where this block starts.
    start: usize,
    /// Trimmed lines of the block, each terminated by '\n'.
    content: String,
}

/// Split `lines` into blocks separated by blank lines. Leading/trailing
/// whitespace of each line is trimmed; a trailing unterminated block is
/// flushed at the end.
fn extract_blocks(lines: &[&str]) -> Vec<Block> {
    let mut blocks = Vec::new();
    let mut current = String::new();
    let mut start = 0;

    for (i, line) in lines.iter().enumerate() {
        let trimmed = line.trim();
        if trimmed.is_empty() && !current.is_empty() {
            // Blank line ends the current block.
            blocks.push(Block {
                start,
                content: current.clone(),
            });
            current.clear();
        } else if !trimmed.is_empty() {
            if current.is_empty() {
                start = i;
            }
            current.push_str(trimmed);
            current.push('\n');
        }
    }

    if !current.is_empty() {
        blocks.push(Block {
            start,
            content: current,
        });
    }

    blocks
}
362
363fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
364    let mut groups: Vec<Vec<usize>> = Vec::new();
365    let mut assigned: HashSet<usize> = HashSet::new();
366
367    for (i, block_a) in blocks.iter().enumerate() {
368        if assigned.contains(&i) {
369            continue;
370        }
371        let mut group = vec![i];
372        for (j, block_b) in blocks.iter().enumerate().skip(i + 1) {
373            if assigned.contains(&j) {
374                continue;
375            }
376            if ngram_jaccard(&block_a.content, &block_b.content, 2) >= threshold {
377                group.push(j);
378                assigned.insert(j);
379            }
380        }
381        if group.len() > 1 {
382            assigned.insert(i);
383        }
384        groups.push(group);
385    }
386
387    groups
388}
389
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    #[test]
    fn minhash_approximates_jaccard() {
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, MINHASH_NUM_HASHES);
        let sig_b = minhash_signature(b, 2, MINHASH_NUM_HASHES);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}