lean_ctx/core/
compressor.rs

1use similar::{ChangeTag, TextDiff};
2
/// Compiles `$pattern` into a `regex::Regex` at most once and evaluates to a
/// reference to the cached instance.
///
/// Each expansion declares its own `static` `OnceLock`, so every call site
/// keeps a separate regex that is built on first use.
///
/// # Panics
///
/// Panics on first use if `$pattern` is not a valid regular expression.
/// `$pattern` is also handed to `concat!` to build the panic message, so it
/// has to be a literal.
macro_rules! static_regex {
    ($pattern:expr) => {{
        static RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
        RE.get_or_init(|| {
            regex::Regex::new($pattern).expect(concat!("BUG: invalid static regex: ", $pattern))
        })
    }};
}
11
/// Removes ANSI escape sequences from a string, returning clean text.
///
/// Recognised sequence shapes:
///
/// * CSI: `ESC [`, then parameter/intermediate bytes (`0x20..=0x3F`), then one
///   final byte in `0x40..=0x7E` (so `ESC[31m` and `ESC[3~` both end correctly).
/// * OSC: `ESC ]`, then a payload terminated by BEL (`0x07`) or by `ESC \`.
/// * Other escapes: `ESC`, optional intermediate bytes (`0x20..=0x2F`), then one
///   final byte in `0x30..=0x7E` (for example `ESC ( B` or `ESC 7`).
///
/// A malformed CSI or short escape is abandoned at the first character that
/// cannot belong to it (such as a newline), and that character is kept, so a
/// truncated sequence does not swallow the text that follows it. An OSC payload
/// that is never terminated consumes the rest of the input.
///
/// Input without any `ESC` character is returned unchanged.
pub fn strip_ansi(s: &str) -> String {
    /// Where the scanner currently is relative to an escape sequence.
    #[derive(Clone, Copy)]
    enum State {
        /// Ordinary text; characters are copied to the output.
        Text,
        /// Directly after `ESC` (or after nF intermediate bytes).
        Escape,
        /// Inside `ESC [ ...`, waiting for the final byte.
        Csi,
        /// Inside `ESC ] ...`, waiting for BEL or `ESC \`.
        Osc,
    }

    const ESC: char = '\x1b';
    const BEL: char = '\x07';

    if !s.contains(ESC) {
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len());
    let mut state = State::Text;

    for c in s.chars() {
        state = match state {
            State::Text if c == ESC => State::Escape,
            State::Text => {
                result.push(c);
                State::Text
            }
            State::Escape => match c {
                '[' => State::Csi,
                ']' => State::Osc,
                // A second ESC restarts the sequence.
                ESC => State::Escape,
                // Intermediate bytes of an nF escape; the final byte follows.
                '\x20'..='\x2f' => State::Escape,
                // Final byte of a short escape. This also consumes the `\` of
                // the `ESC \` string terminator that closes an OSC payload.
                '\x30'..='\x7e' => State::Text,
                // Not part of any escape: keep the character.
                _ => {
                    result.push(c);
                    State::Text
                }
            },
            State::Csi => match c {
                '\x20'..='\x3f' => State::Csi,
                '\x40'..='\x7e' => State::Text,
                ESC => State::Escape,
                // Malformed sequence: stop skipping and keep the character.
                _ => {
                    result.push(c);
                    State::Text
                }
            },
            State::Osc => match c {
                BEL => State::Text,
                // Either the start of `ESC \` or of a new sequence.
                ESC => State::Escape,
                _ => State::Osc,
            },
        };
    }

    result
}
34
/// Returns the number of ESC (`0x1b`) characters divided by the string's
/// length in bytes; an empty string yields `0.0`.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            // ESC is ASCII, so counting bytes equals counting ESC characters.
            let esc_count = s.bytes().filter(|&b| b == 0x1b).count();
            esc_count as f64 / total_bytes as f64
        }
    }
}
43
44/// Strips comments, blank lines, and normalizes indentation for maximum token savings.
45pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
46    let mut result: Vec<String> = Vec::new();
47    let is_python = matches!(ext, Some("py"));
48    let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
49    let is_sql = matches!(ext, Some("sql"));
50    let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
51
52    let mut in_block_comment = false;
53
54    for line in content.lines() {
55        let trimmed = line.trim();
56
57        if trimmed.is_empty() {
58            continue;
59        }
60
61        if in_block_comment {
62            if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
63                in_block_comment = false;
64            }
65            continue;
66        }
67
68        if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
69            if !(trimmed.contains("*/") || trimmed.contains("-->")) {
70                in_block_comment = true;
71            }
72            continue;
73        }
74
75        if trimmed.starts_with("//") && !trimmed.starts_with("///") {
76            continue;
77        }
78        if trimmed.starts_with('*') || trimmed.starts_with("*/") {
79            continue;
80        }
81        if is_python && trimmed.starts_with('#') {
82            continue;
83        }
84        if is_sql && trimmed.starts_with("--") {
85            continue;
86        }
87        if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
88            continue;
89        }
90        if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
91            continue;
92        }
93
94        if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
95            if let Some(last) = result.last() {
96                let last_trimmed = last.trim();
97                if matches!(last_trimmed, "}" | "};" | ");" | "});") {
98                    if let Some(last_mut) = result.last_mut() {
99                        last_mut.push_str(trimmed);
100                    }
101                    continue;
102                }
103            }
104            result.push(trimmed.to_string());
105            continue;
106        }
107
108        let normalized = normalize_indentation(line);
109        result.push(normalized);
110    }
111
112    result.join("\n")
113}
114
/// Lightweight post-processing cleanup.
///
/// * Whitespace-only lines become empty lines, and a run of them is reduced
///   to a single empty line.
/// * Lines consisting solely of `}`, `};`, `);`, `});` or `)` are emitted
///   without their indentation. When the input has more than 200 lines and
///   such a run is longer than 5 lines, only the first two are kept, followed
///   by a `[N brace-only lines collapsed]` marker.
/// * All other lines are copied through unchanged.
pub fn lightweight_cleanup(content: &str) -> String {
    const LARGE_INPUT_LINES: usize = 200;
    const LONGEST_KEPT_RUN: usize = 5;

    /// Writes the pending brace-only run to `out` (collapsed if eligible) and empties it.
    fn emit_run(pending: &mut Vec<&str>, out: &mut Vec<String>, large_input: bool) {
        if large_input && pending.len() > LONGEST_KEPT_RUN {
            out.extend(pending[..2].iter().map(|s| (*s).to_string()));
            out.push(format!("[{} brace-only lines collapsed]", pending.len() - 2));
        } else {
            out.extend(pending.iter().map(|s| (*s).to_string()));
        }
        pending.clear();
    }

    let large_input = content.lines().count() > LARGE_INPUT_LINES;

    let mut out: Vec<String> = Vec::new();
    let mut pending: Vec<&str> = Vec::new();
    let mut blanks_in_a_row = 0u32;

    for raw in content.lines() {
        let text = raw.trim();

        if text.is_empty() {
            emit_run(&mut pending, &mut out, large_input);
            blanks_in_a_row += 1;
            if blanks_in_a_row == 1 {
                out.push(String::new());
            }
        } else {
            blanks_in_a_row = 0;
            if matches!(text, "}" | "};" | ");" | "});" | ")") {
                pending.push(text);
            } else {
                emit_run(&mut pending, &mut out, large_input);
                out.push(raw.to_string());
            }
        }
    }
    emit_run(&mut pending, &mut out, large_input);

    out.join("\n")
}
163
164/// Safeguard: prevents compression from inflating output or destroying small outputs.
165/// For small outputs (<2000 tokens), rejects extreme compression (>95% reduction)
166/// that likely lost important content. For large outputs, trusts the pattern.
167pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
168    let orig_tokens = super::tokens::count_tokens(original);
169    let comp_tokens = super::tokens::count_tokens(compressed);
170
171    if orig_tokens == 0 {
172        return compressed.to_string();
173    }
174
175    if comp_tokens > orig_tokens {
176        return original.to_string();
177    }
178
179    let ratio = comp_tokens as f64 / orig_tokens as f64;
180    if ratio < 0.05 && orig_tokens < 2000 {
181        original.to_string()
182    } else {
183        compressed.to_string()
184    }
185}
186
/// Rewrites a line's leading whitespace as spaces with a reduced width.
///
/// The width is the byte length of the leading whitespace when the line
/// starts with a tab, and half of that (rounded down) otherwise.
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();
    let width = if line.starts_with('\t') {
        indent_bytes
    } else {
        indent_bytes / 2
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
194
195/// Produces a compact unified diff between old and new content with line numbers.
196pub fn diff_content(old_content: &str, new_content: &str) -> String {
197    if old_content == new_content {
198        return "(no changes)".to_string();
199    }
200
201    let diff = TextDiff::from_lines(old_content, new_content);
202    let mut changes = Vec::new();
203    let mut additions = 0usize;
204    let mut deletions = 0usize;
205
206    for change in diff.iter_all_changes() {
207        let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
208        let text = change.value().trim_end_matches('\n');
209        match change.tag() {
210            ChangeTag::Insert => {
211                additions += 1;
212                if let Some(n) = line_no {
213                    changes.push(format!("+{n}: {text}"));
214                }
215            }
216            ChangeTag::Delete => {
217                deletions += 1;
218                if let Some(n) = line_no {
219                    changes.push(format!("-{n}: {text}"));
220                }
221            }
222            ChangeTag::Equal => {}
223        }
224    }
225
226    if changes.is_empty() {
227        return "(no changes)".to_string();
228    }
229
230    changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
231    changes.join("\n")
232}
233
234/// Deduplicates repeated lines, strips boilerplate, and normalizes timestamps/hashes.
235pub fn verbatim_compact(text: &str) -> String {
236    let mut lines: Vec<String> = Vec::new();
237    let mut blank_count = 0u32;
238    let mut prev_line: Option<String> = None;
239    let mut repeat_count = 0u32;
240
241    for line in text.lines() {
242        let trimmed = line.trim();
243
244        if trimmed.is_empty() {
245            blank_count += 1;
246            if blank_count <= 1 {
247                flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
248                lines.push(String::new());
249            }
250            continue;
251        }
252        blank_count = 0;
253
254        if is_boilerplate_line(trimmed) {
255            continue;
256        }
257
258        let normalized = normalize_whitespace(trimmed);
259        let stripped = strip_timestamps_hashes(&normalized);
260
261        if let Some(ref prev) = prev_line {
262            if *prev == stripped {
263                repeat_count += 1;
264                continue;
265            }
266        }
267
268        flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
269        prev_line = Some(stripped.clone());
270        repeat_count = 1;
271        lines.push(stripped);
272    }
273
274    flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
275    lines.join("\n")
276}
277
278/// Compresses content using the active task intent to preserve task-relevant sections.
279pub fn task_aware_compress(
280    content: &str,
281    ext: Option<&str>,
282    intent: &super::intent_engine::StructuredIntent,
283) -> String {
284    use super::intent_engine::{IntentScope, TaskType};
285
286    let budget_ratio = match intent.scope {
287        IntentScope::SingleFile => 0.7,
288        IntentScope::MultiFile => 0.5,
289        IntentScope::CrossModule => 0.35,
290        IntentScope::ProjectWide => 0.25,
291    };
292
293    match intent.task_type {
294        TaskType::FixBug | TaskType::Debug => {
295            let filtered = super::task_relevance::information_bottleneck_filter_typed(
296                content,
297                &intent.keywords,
298                budget_ratio,
299                Some(intent.task_type),
300            );
301            safeguard_ratio(content, &filtered)
302        }
303        TaskType::Refactor | TaskType::Review => {
304            let cleaned = lightweight_cleanup(content);
305            let filtered = super::task_relevance::information_bottleneck_filter_typed(
306                &cleaned,
307                &intent.keywords,
308                budget_ratio.max(0.5),
309                Some(intent.task_type),
310            );
311            safeguard_ratio(content, &filtered)
312        }
313        TaskType::Generate | TaskType::Test => {
314            let compressed = aggressive_compress(content, ext);
315            safeguard_ratio(content, &compressed)
316        }
317        TaskType::Explore | TaskType::Config | TaskType::Deploy => {
318            let cleaned = lightweight_cleanup(content);
319            safeguard_ratio(content, &cleaned)
320        }
321    }
322}
323
/// Ends the current repeat run and resets the run state.
///
/// When the run counted more than one occurrence and a previous line is
/// recorded, the last entry of `lines` is overwritten with `[Nx] <line>`.
/// `count` is always reset to 0 and `prev_line` to `None`.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    let repeats = std::mem::take(count);
    let previous = prev_line.take();

    if repeats <= 1 {
        return;
    }
    if let (Some(text), Some(slot)) = (previous, lines.last_mut()) {
        *slot = format!("[{repeats}x] {text}");
    }
}
336
/// Collapses every run of spaces and tabs into a single space; all other
/// characters are copied through unchanged.
fn normalize_whitespace(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    for ch in line.chars() {
        let is_blank = matches!(ch, ' ' | '\t');
        // A trailing space in `out` can only come from a preceding blank.
        if is_blank && out.ends_with(' ') {
            continue;
        }
        out.push(if is_blank { ' ' } else { ch });
    }
    out
}
353
354fn strip_timestamps_hashes(line: &str) -> String {
355    let ts_re =
356        static_regex!(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?");
357    let hash_re = static_regex!(r"\b[0-9a-f]{32,64}\b");
358
359    let s = ts_re.replace_all(line, "[TS]");
360    let s = hash_re.replace_all(&s, "[HASH]");
361    s.into_owned()
362}
363
/// Reports whether an already-trimmed line is boilerplate that can be dropped.
///
/// A line counts as boilerplate when either:
///
/// * its lowercased form starts with a licence/generator notice prefix
///   (`copyright`, `licensed under`, `license:`, `all rights reserved`,
///   `generated by`, `auto-generated`); or
/// * it is a separator rule: at least 4 characters long, starting with one of
///   `=`, `-`, `*`, `─`, `━`, with more than 80% of its characters equal to
///   that first character.
///
/// The length threshold is measured in characters, so short lines made of
/// multi-byte rule characters (for example `──`) are not treated as rules.
fn is_boilerplate_line(trimmed: &str) -> bool {
    const NOTICE_PREFIXES: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];
    const MIN_RULE_CHARS: usize = 4;

    let lower = trimmed.to_lowercase();
    if NOTICE_PREFIXES.iter().any(|prefix| lower.starts_with(*prefix)) {
        return true;
    }

    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    if !matches!(first, '=' | '-' | '*' | '─' | '━') {
        return false;
    }

    // Count characters, not bytes: the box-drawing rule characters are 3 bytes each.
    let total = trimmed.chars().count();
    if total < MIN_RULE_CHARS {
        return false;
    }

    let same = trimmed.chars().filter(|&c| c == first).count();
    same as f64 / total as f64 > 0.8
}
388
// Unit tests for the public compression helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // An inserted line shows up in the diff with a `+` marker and its text.
    #[test]
    fn test_diff_insertion() {
        let old = "line1\nline2\nline3";
        let new = "line1\nline2\nnew_line\nline3";
        let result = diff_content(old, new);
        assert!(result.contains('+'), "should show additions");
        assert!(result.contains("new_line"));
    }

    // A removed line shows up in the diff with a `-` marker and its text.
    #[test]
    fn test_diff_deletion() {
        let old = "line1\nline2\nline3";
        let new = "line1\nline3";
        let result = diff_content(old, new);
        assert!(result.contains('-'), "should show deletions");
        assert!(result.contains("line2"));
    }

    // Identical inputs produce the fixed "(no changes)" message.
    #[test]
    fn test_diff_no_changes() {
        let content = "same\ncontent";
        assert_eq!(diff_content(content, content), "(no changes)");
    }

    // 210 filler lines make the input "large" (> 200 lines); the run of 8 `}`
    // lines that follows must be reduced to two braces plus a marker for the
    // remaining 6.
    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        let mut lines: Vec<String> = (0..210).map(|i| format!("line {i}")).collect();
        lines.extend(
            ["}", "}", "}", "}", "}", "}", "}", "}"]
                .iter()
                .map(std::string::ToString::to_string),
        );
        lines.push("fn next() {}".to_string());
        let input = lines.join("\n");
        let result = lightweight_cleanup(&input);
        assert!(
            result.contains("[6 brace-only lines collapsed]"),
            "should collapse long brace runs in large files"
        );
        assert!(result.contains("fn next()"));
    }

    // Four consecutive blank lines must leave at most two newlines between
    // `line1` and `line2` (i.e. at most one empty line).
    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let input = "line1\n\n\n\n\nline2";
        let result = lightweight_cleanup(input);
        let blank_runs = result.split("line1").nth(1).unwrap();
        let blanks = blank_runs.matches('\n').count();
        assert!(blanks <= 2, "should collapse multiple blank lines");
    }

    // A small original that was compressed to almost nothing is rejected and
    // the original text is returned instead.
    #[test]
    fn test_safeguard_ratio_prevents_over_compression_on_small_output() {
        let original = "a ".repeat(100); // ~100 tokens, < 2000
        let too_compressed = "a";
        let result = safeguard_ratio(&original, too_compressed);
        assert_eq!(
            result, original,
            "should return original when ratio < 0.05 and output is small"
        );
    }

    // For a large original the same extreme ratio is accepted.
    #[test]
    fn test_safeguard_ratio_allows_strong_compression_on_large_output() {
        let original = "line content here\n".repeat(1000); // ~4000 tokens, > 2000
        let compressed = "summary: 1000 lines";
        let result = safeguard_ratio(&original, compressed);
        assert_eq!(
            result, compressed,
            "should allow strong compression for large outputs"
        );
    }

    // `//` line comments are removed while code lines survive.
    #[test]
    fn test_aggressive_strips_comments() {
        let code = "fn main() {\n    // a comment\n    let x = 1;\n}";
        let result = aggressive_compress(code, Some("rs"));
        assert!(!result.contains("// a comment"));
        assert!(result.contains("let x = 1"));
    }

    // With the `py` extension, `#` comment lines are removed.
    #[test]
    fn test_aggressive_python_comments() {
        let code = "def main():\n    # comment\n    x = 1";
        let result = aggressive_compress(code, Some("py"));
        assert!(!result.contains("# comment"));
        assert!(result.contains("x = 1"));
    }

    // `///` doc comments are kept even though `//` comments are stripped.
    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let code = "/// Doc comment\nfn main() {}";
        let result = aggressive_compress(code, Some("rs"));
        assert!(result.contains("/// Doc comment"));
    }

    // A multi-line `/* ... */` comment is dropped, including its closing line.
    #[test]
    fn test_aggressive_block_comment() {
        let code = "/* start\n * middle\n */ end\nfn main() {}";
        let result = aggressive_compress(code, Some("rs"));
        assert!(!result.contains("start"));
        assert!(!result.contains("middle"));
        assert!(result.contains("fn main()"));
    }

    // SGR colour codes are removed and the visible text is left intact.
    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let input = "\x1b[31mERROR\x1b[0m: something failed";
        let result = strip_ansi(input);
        assert_eq!(result, "ERROR: something failed");
        assert!(!result.contains('\x1b'));
    }

    // Text without escape characters is returned unchanged.
    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let input = "clean text without escapes";
        let result = strip_ansi(input);
        assert_eq!(result, input);
    }

    // No ESC characters means a density of exactly zero.
    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    // Any ESC character yields a positive density.
    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        let input = "\x1b[31mred\x1b[0m";
        assert!(ansi_density(input) > 0.0);
    }
}
lean_ctx/core/compressor.rs

lean_ctx/core/
compressor.rs