lean_ctx/core/
information_bottleneck.rs1use super::entropy::normalized_token_entropy;
4use super::tokens::count_tokens;
5
6fn flush_omitted(out: &mut Vec<String>, run: &mut usize) {
7 if *run > 0 {
8 out.push(format!("// ... {} low-info lines omitted", *run));
9 *run = 0;
10 }
11}
12
13fn render_ib(lines: &[&str], scores: &[f64], threshold: f64) -> String {
14 debug_assert_eq!(lines.len(), scores.len());
15 let mut out = Vec::new();
16 let mut omit_run = 0usize;
17 for (&line, &score) in lines.iter().zip(scores.iter()) {
18 if score >= threshold {
19 flush_omitted(&mut out, &mut omit_run);
20 out.push(line.to_string());
21 } else {
22 omit_run += 1;
23 }
24 }
25 flush_omitted(&mut out, &mut omit_run);
26 out.join("\n")
27}
28
29pub fn compress_ib(text: &str, target_ratio: f64) -> String {
32 if text.is_empty() {
33 return String::new();
34 }
35 let input_tokens = count_tokens(text);
36 if input_tokens == 0 {
37 return text.to_string();
38 }
39 let ratio_target = target_ratio.clamp(0.02, 1.0);
40
41 let lines_vec: Vec<&str> = text.lines().collect();
42 let lines: &[&str] = &lines_vec;
43 let scores: Vec<f64> = lines
44 .iter()
45 .map(|ln| normalized_token_entropy(ln))
46 .collect();
47
48 let mut lo = 0.0_f64;
50 let mut hi = 1.0_f64;
51 let mut best = render_ib(lines, &scores, 0.0);
52 let mut best_diff = f64::INFINITY;
53
54 let mut consider = |thr: f64| {
55 let cand = render_ib(lines, &scores, thr);
56 let r = count_tokens(&cand) as f64 / input_tokens as f64;
57 let diff = (r - ratio_target).abs();
58 if diff < best_diff {
59 best_diff = diff;
60 best = cand;
61 }
62 };
63
64 for _ in 0..26 {
65 let mid = (lo + hi) * 0.5;
66 let cand = render_ib(lines, &scores, mid);
67 let r = count_tokens(&cand) as f64 / input_tokens as f64;
68 consider(mid);
69 if r > ratio_target {
70 lo = mid;
71 } else {
72 hi = mid;
73 }
74 }
75
76 for thr in [0.0_f64, 1.0_f64, lo, hi, (lo + hi) * 0.5] {
77 consider(thr);
78 }
79
80 best
81}
82
83#[cfg(test)]
84mod tests {
85 use super::*;
86
87 #[test]
88 fn empty_and_ratio_one_keeps_content() {
89 assert_eq!(compress_ib("", 0.5), "");
90 let s = "fn main() {\n println!(\"hi\");\n}\n";
91 let full = compress_ib(s, 1.0);
92 assert!(full.contains("fn main"));
93 }
94
95 #[test]
96 fn strong_compression_drops_redundant_lines() {
97 let mut boring = String::new();
98 for _ in 0..30 {
99 boring.push_str("aaa bbb aaa bbb\n");
100 }
101 boring.push_str("unique_identifier_xyz_quartz\n");
102 let out = compress_ib(&boring, 0.15);
103 assert!(out.contains("low-info lines omitted"));
104 assert!(out.contains("unique_identifier_xyz_quartz"));
105 assert!(count_tokens(&out) < count_tokens(&boring));
106 }
107
108 #[test]
109 fn placeholder_counts_skipped_lines() {
110 let lines: Vec<String> = (0..5).map(|_| "x x x x".into()).collect();
111 let mut text = lines.join("\n");
112 text.push('\n');
113 text.push_str("serde Deserialize TraitBounds\n");
114 let out = compress_ib(&text, 0.25);
115 assert!(out.contains("low-info lines omitted"));
116 assert!(out.contains("serde"));
117 }
118}