Skip to main content

lean_ctx/core/
information_bottleneck.rs

1//! QUITO-X–style trade-off: compress by dropping low token-entropy lines while targeting an output/input token ratio.
2
3use super::entropy::normalized_token_entropy;
4use super::tokens::count_tokens;
5
6fn flush_omitted(out: &mut Vec<String>, run: &mut usize) {
7    if *run > 0 {
8        out.push(format!("// ... {} low-info lines omitted", *run));
9        *run = 0;
10    }
11}
12
13fn render_ib(lines: &[&str], scores: &[f64], threshold: f64) -> String {
14    debug_assert_eq!(lines.len(), scores.len());
15    let mut out = Vec::new();
16    let mut omit_run = 0usize;
17    for (&line, &score) in lines.iter().zip(scores.iter()) {
18        if score >= threshold {
19            flush_omitted(&mut out, &mut omit_run);
20            out.push(line.to_string());
21        } else {
22            omit_run += 1;
23        }
24    }
25    flush_omitted(&mut out, &mut omit_run);
26    out.join("\n")
27}
28
29/// Compress `text` toward `target_ratio` (output tokens / input tokens) by dropping lines whose
30/// normalized BPE token entropy falls below a dynamically chosen threshold.
31pub fn compress_ib(text: &str, target_ratio: f64) -> String {
32    if text.is_empty() {
33        return String::new();
34    }
35    let input_tokens = count_tokens(text);
36    if input_tokens == 0 {
37        return text.to_string();
38    }
39    let ratio_target = target_ratio.clamp(0.02, 1.0);
40
41    let lines_vec: Vec<&str> = text.lines().collect();
42    let lines: &[&str] = &lines_vec;
43    let scores: Vec<f64> = lines
44        .iter()
45        .map(|ln| normalized_token_entropy(ln))
46        .collect();
47
48    // Higher threshold ⇒ fewer kept lines ⇒ lower output ratio (monotone decreasing in threshold).
49    let mut lo = 0.0_f64;
50    let mut hi = 1.0_f64;
51    let mut best = render_ib(lines, &scores, 0.0);
52    let mut best_diff = f64::INFINITY;
53
54    let mut consider = |thr: f64| {
55        let cand = render_ib(lines, &scores, thr);
56        let r = count_tokens(&cand) as f64 / input_tokens as f64;
57        let diff = (r - ratio_target).abs();
58        if diff < best_diff {
59            best_diff = diff;
60            best = cand;
61        }
62    };
63
64    for _ in 0..26 {
65        let mid = (lo + hi) * 0.5;
66        let cand = render_ib(lines, &scores, mid);
67        let r = count_tokens(&cand) as f64 / input_tokens as f64;
68        consider(mid);
69        if r > ratio_target {
70            lo = mid;
71        } else {
72            hi = mid;
73        }
74    }
75
76    for thr in [0.0_f64, 1.0_f64, lo, hi, (lo + hi) * 0.5] {
77        consider(thr);
78    }
79
80    best
81}
82
83#[cfg(test)]
84mod tests {
85    use super::*;
86
87    #[test]
88    fn empty_and_ratio_one_keeps_content() {
89        assert_eq!(compress_ib("", 0.5), "");
90        let s = "fn main() {\n    println!(\"hi\");\n}\n";
91        let full = compress_ib(s, 1.0);
92        assert!(full.contains("fn main"));
93    }
94
95    #[test]
96    fn strong_compression_drops_redundant_lines() {
97        let mut boring = String::new();
98        for _ in 0..30 {
99            boring.push_str("aaa bbb aaa bbb\n");
100        }
101        boring.push_str("unique_identifier_xyz_quartz\n");
102        let out = compress_ib(&boring, 0.15);
103        assert!(out.contains("low-info lines omitted"));
104        assert!(out.contains("unique_identifier_xyz_quartz"));
105        assert!(count_tokens(&out) < count_tokens(&boring));
106    }
107
108    #[test]
109    fn placeholder_counts_skipped_lines() {
110        let lines: Vec<String> = (0..5).map(|_| "x x x x".into()).collect();
111        let mut text = lines.join("\n");
112        text.push('\n');
113        text.push_str("serde Deserialize TraitBounds\n");
114        let out = compress_ib(&text, 0.25);
115        assert!(out.contains("low-info lines omitted"));
116        assert!(out.contains("serde"));
117    }
118}