Skip to main content

lean_ctx/core/
compressor.rs

1use similar::{ChangeTag, TextDiff};
2
/// Removes ANSI/VT escape sequences from `s` and returns the remaining text.
///
/// Recognised forms:
/// - CSI: `ESC [`, then parameter/intermediate bytes (` `..=`?`), then one final byte
///   in `@`..=`~` (so `ESC [ 1 ~` ends at `~`, not at the next letter);
/// - OSC: `ESC ]` followed by a payload terminated by BEL or by ST (`ESC \`);
/// - `ESC` + intermediate bytes (` `..=`/`) + one final byte, e.g. `ESC ( B`;
/// - two-character escapes such as `ESC 7` or `ESC c`.
///
/// A character that cannot belong to the sequence being parsed (for example a newline
/// in the middle of a CSI sequence) aborts the sequence and is kept in the output.
/// Input without any ESC character is returned unchanged.
pub fn strip_ansi(s: &str) -> String {
    enum State {
        /// Ordinary text; characters are copied to the output.
        Text,
        /// An ESC was just seen.
        Escape,
        /// Inside `ESC [ ...`, waiting for the final byte.
        Csi,
        /// Inside `ESC ] ...`, waiting for BEL or ST.
        Osc,
        /// Inside an OSC payload, right after an ESC (possible start of ST).
        OscEscape,
        /// After `ESC` and at least one intermediate byte, waiting for the final byte.
        Intermediate,
    }

    // Decides what the character following an ESC starts (or finishes).
    fn after_escape(c: char, out: &mut String) -> State {
        match c {
            '[' => State::Csi,
            ']' => State::Osc,
            '\x1b' => State::Escape,
            ' '..='/' => State::Intermediate,
            // Final byte of a two-character escape: consumed.
            '0'..='~' => State::Text,
            // Not a valid escape continuation: keep the character.
            _ => {
                out.push(c);
                State::Text
            }
        }
    }

    if !s.contains('\x1b') {
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len());
    let mut state = State::Text;
    for c in s.chars() {
        state = match state {
            State::Text => {
                if c == '\x1b' {
                    State::Escape
                } else {
                    result.push(c);
                    State::Text
                }
            }
            State::Escape => after_escape(c, &mut result),
            State::Csi => match c {
                '\x1b' => State::Escape,
                ' '..='?' => State::Csi,
                '@'..='~' => State::Text,
                _ => {
                    result.push(c);
                    State::Text
                }
            },
            State::Intermediate => match c {
                '\x1b' => State::Escape,
                ' '..='/' => State::Intermediate,
                '0'..='~' => State::Text,
                _ => {
                    result.push(c);
                    State::Text
                }
            },
            State::Osc => match c {
                '\x07' => State::Text,
                '\x1b' => State::OscEscape,
                _ => State::Osc,
            },
            State::OscEscape => {
                if c == '\\' {
                    State::Text
                } else {
                    // The ESC did not start ST; treat it as the start of a new escape.
                    after_escape(c, &mut result)
                }
            }
        };
    }
    result
}
24
/// Returns the share of the input's bytes that are ESC (`0x1b`) bytes.
///
/// An empty string yields `0.0`.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            // ESC is a single-byte code point, so counting bytes equals counting chars.
            let escapes = s.bytes().filter(|b| *b == 0x1b).count();
            escapes as f64 / total_bytes as f64
        }
    }
}
32
33pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
34    let mut result: Vec<String> = Vec::new();
35    let is_python = matches!(ext, Some("py"));
36    let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
37    let is_sql = matches!(ext, Some("sql"));
38    let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
39
40    let mut in_block_comment = false;
41
42    for line in content.lines() {
43        let trimmed = line.trim();
44
45        if trimmed.is_empty() {
46            continue;
47        }
48
49        if in_block_comment {
50            if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
51                in_block_comment = false;
52            }
53            continue;
54        }
55
56        if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
57            if !(trimmed.contains("*/") || trimmed.contains("-->")) {
58                in_block_comment = true;
59            }
60            continue;
61        }
62
63        if trimmed.starts_with("//") && !trimmed.starts_with("///") {
64            continue;
65        }
66        if trimmed.starts_with('*') || trimmed.starts_with("*/") {
67            continue;
68        }
69        if is_python && trimmed.starts_with('#') {
70            continue;
71        }
72        if is_sql && trimmed.starts_with("--") {
73            continue;
74        }
75        if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
76            continue;
77        }
78        if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
79            continue;
80        }
81
82        if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
83            if let Some(last) = result.last() {
84                let last_trimmed = last.trim();
85                if matches!(last_trimmed, "}" | "};" | ");" | "});") {
86                    if let Some(last_mut) = result.last_mut() {
87                        last_mut.push_str(trimmed);
88                    }
89                    continue;
90                }
91            }
92            result.push(trimmed.to_string());
93            continue;
94        }
95
96        let normalized = normalize_indentation(line);
97        result.push(normalized);
98    }
99
100    result.join("\n")
101}
102
/// Light post-processing pass that removes visual noise without touching code lines.
///
/// - A run of blank (or whitespace-only) lines shrinks to a single empty line.
/// - Lines consisting solely of a closing token (`}`, `};`, `);`, `});`, `)`) are
///   emitted without their indentation.
/// - When the input has more than 200 lines, a run of more than 5 such lines is
///   reduced to its first two entries plus a `[N brace-only lines collapsed]` marker.
///
/// All other lines are copied verbatim; the result is joined with `\n`.
pub fn lightweight_cleanup(content: &str) -> String {
    // Moves the pending closing-token lines into `out`, collapsing long runs if allowed.
    fn drain_closers(pending: &mut Vec<&str>, out: &mut Vec<String>, collapse_long_runs: bool) {
        if collapse_long_runs && pending.len() > 5 {
            out.extend(pending.iter().take(2).map(|s| s.to_string()));
            out.push(format!("[{} brace-only lines collapsed]", pending.len() - 2));
        } else {
            out.extend(pending.iter().map(|s| s.to_string()));
        }
        pending.clear();
    }

    let collapse_long_runs = content.lines().count() > 200;
    let mut out: Vec<String> = Vec::new();
    let mut pending: Vec<&str> = Vec::new();
    let mut blank_streak = 0u32;

    for raw in content.lines() {
        let body = raw.trim();

        if matches!(body, "}" | "};" | ");" | "});" | ")") {
            blank_streak = 0;
            pending.push(body);
            continue;
        }

        drain_closers(&mut pending, &mut out, collapse_long_runs);

        if body.is_empty() {
            blank_streak += 1;
            if blank_streak <= 1 {
                out.push(String::new());
            }
        } else {
            blank_streak = 0;
            out.push(raw.to_string());
        }
    }
    drain_closers(&mut pending, &mut out, collapse_long_runs);

    out.join("\n")
}
151
152/// Safeguard: ensures compression ratio stays within safe bounds.
153/// Returns the compressed content if ratio is in [0.15, 1.0], otherwise the original.
154pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
155    let orig_tokens = super::tokens::count_tokens(original);
156    let comp_tokens = super::tokens::count_tokens(compressed);
157
158    if orig_tokens == 0 {
159        return compressed.to_string();
160    }
161
162    let ratio = comp_tokens as f64 / orig_tokens as f64;
163    if ratio < 0.15 || comp_tokens > orig_tokens {
164        original.to_string()
165    } else {
166        compressed.to_string()
167    }
168}
169
/// Rewrites a line's leading whitespace as spaces and shrinks it.
///
/// The indentation width is measured in bytes of leading whitespace. When the line
/// begins with a tab that width is kept as-is (one space per byte); otherwise it is
/// halved (integer division). The rest of the line is left untouched.
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();
    let width = if line.starts_with('\t') {
        indent_bytes
    } else {
        indent_bytes / 2
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
177
178pub fn diff_content(old_content: &str, new_content: &str) -> String {
179    if old_content == new_content {
180        return "(no changes)".to_string();
181    }
182
183    let diff = TextDiff::from_lines(old_content, new_content);
184    let mut changes = Vec::new();
185    let mut additions = 0usize;
186    let mut deletions = 0usize;
187
188    for change in diff.iter_all_changes() {
189        let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
190        let text = change.value().trim_end_matches('\n');
191        match change.tag() {
192            ChangeTag::Insert => {
193                additions += 1;
194                if let Some(n) = line_no {
195                    changes.push(format!("+{n}: {text}"));
196                }
197            }
198            ChangeTag::Delete => {
199                deletions += 1;
200                if let Some(n) = line_no {
201                    changes.push(format!("-{n}: {text}"));
202                }
203            }
204            ChangeTag::Equal => {}
205        }
206    }
207
208    if changes.is_empty() {
209        return "(no changes)".to_string();
210    }
211
212    changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
213    changes.join("\n")
214}
215
216pub fn verbatim_compact(text: &str) -> String {
217    let mut lines: Vec<String> = Vec::new();
218    let mut blank_count = 0u32;
219    let mut prev_line: Option<String> = None;
220    let mut repeat_count = 0u32;
221
222    for line in text.lines() {
223        let trimmed = line.trim();
224
225        if trimmed.is_empty() {
226            blank_count += 1;
227            if blank_count <= 1 {
228                flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
229                lines.push(String::new());
230            }
231            continue;
232        }
233        blank_count = 0;
234
235        if is_boilerplate_line(trimmed) {
236            continue;
237        }
238
239        let normalized = normalize_whitespace(trimmed);
240        let stripped = strip_timestamps_hashes(&normalized);
241
242        if let Some(ref prev) = prev_line {
243            if *prev == stripped {
244                repeat_count += 1;
245                continue;
246            }
247        }
248
249        flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
250        prev_line = Some(stripped.clone());
251        repeat_count = 1;
252        lines.push(stripped);
253    }
254
255    flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
256    lines.join("\n")
257}
258
259pub fn task_aware_compress(
260    content: &str,
261    ext: Option<&str>,
262    intent: &super::intent_engine::StructuredIntent,
263) -> String {
264    use super::intent_engine::{IntentScope, TaskType};
265
266    let budget_ratio = match intent.scope {
267        IntentScope::SingleFile => 0.7,
268        IntentScope::MultiFile => 0.5,
269        IntentScope::CrossModule => 0.35,
270        IntentScope::ProjectWide => 0.25,
271    };
272
273    match intent.task_type {
274        TaskType::FixBug | TaskType::Debug => {
275            let filtered = super::task_relevance::information_bottleneck_filter_typed(
276                content,
277                &intent.keywords,
278                budget_ratio,
279                Some(intent.task_type),
280            );
281            safeguard_ratio(content, &filtered)
282        }
283        TaskType::Refactor | TaskType::Review => {
284            let cleaned = lightweight_cleanup(content);
285            let filtered = super::task_relevance::information_bottleneck_filter_typed(
286                &cleaned,
287                &intent.keywords,
288                budget_ratio.max(0.5),
289                Some(intent.task_type),
290            );
291            safeguard_ratio(content, &filtered)
292        }
293        TaskType::Generate | TaskType::Test => {
294            let compressed = aggressive_compress(content, ext);
295            safeguard_ratio(content, &compressed)
296        }
297        TaskType::Explore => {
298            let cleaned = lightweight_cleanup(content);
299            safeguard_ratio(content, &cleaned)
300        }
301        TaskType::Config | TaskType::Deploy => {
302            let cleaned = lightweight_cleanup(content);
303            safeguard_ratio(content, &cleaned)
304        }
305    }
306}
307
/// Ends the current run of identical lines.
///
/// When the run is longer than one and a previous line is recorded, the last entry of
/// `lines` is overwritten with `[Nx] <line>`. The counter is always reset to zero and
/// the remembered line is always cleared.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    let repeats = std::mem::take(count);
    let previous = prev_line.take();

    if repeats > 1 {
        if let (Some(text), Some(slot)) = (previous, lines.last_mut()) {
            *slot = format!("[{repeats}x] {text}");
        }
    }
}
320
/// Collapses every run of spaces and tabs into a single space.
///
/// All other characters are copied unchanged; leading and trailing runs are collapsed
/// as well, not removed.
fn normalize_whitespace(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    for ch in line.chars() {
        if !matches!(ch, ' ' | '\t') {
            out.push(ch);
        } else if !out.ends_with(' ') {
            // Spaces only reach `out` through this branch, so a trailing space means
            // the current run has already been emitted.
            out.push(' ');
        }
    }
    out
}
337
338fn strip_timestamps_hashes(line: &str) -> String {
339    use regex::Regex;
340    use std::sync::OnceLock;
341
342    static TS_RE: OnceLock<Regex> = OnceLock::new();
343    static HASH_RE: OnceLock<Regex> = OnceLock::new();
344
345    let ts_re = TS_RE.get_or_init(|| {
346        Regex::new(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?")
347            .unwrap()
348    });
349    let hash_re = HASH_RE.get_or_init(|| Regex::new(r"\b[0-9a-f]{32,64}\b").unwrap());
350
351    let s = ts_re.replace_all(line, "[TS]");
352    let s = hash_re.replace_all(&s, "[HASH]");
353    s.into_owned()
354}
355
/// Reports whether a trimmed line is boilerplate that carries no information.
///
/// Two kinds of lines qualify:
/// - legal/generator notices: the line starts (case-insensitively) with `copyright`,
///   `licensed under`, `license:`, `all rights reserved`, `generated by` or
///   `auto-generated`;
/// - separator rules: the line is at least 4 characters long, starts with one of
///   `=`, `-`, `*`, `─`, `━`, and more than 80% of its characters equal that first one.
///
/// The minimum length is measured in characters, not bytes, so multi-byte rule
/// characters follow the same threshold as ASCII ones.
fn is_boilerplate_line(trimmed: &str) -> bool {
    const NOTICE_PREFIXES: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];

    let lower = trimmed.to_lowercase();
    if NOTICE_PREFIXES.iter().any(|prefix| lower.starts_with(prefix)) {
        return true;
    }

    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    if !matches!(first, '=' | '-' | '*' | '─' | '━') {
        return false;
    }

    let total = trimmed.chars().count();
    if total < 4 {
        return false;
    }
    let same = trimmed.chars().filter(|&c| c == first).count();
    same as f64 / total as f64 > 0.8
}
380
#[cfg(test)]
mod tests {
    use super::*;

    /// An inserted line is reported with a `+` marker and its text.
    #[test]
    fn test_diff_insertion() {
        let before = "line1\nline2\nline3";
        let after = "line1\nline2\nnew_line\nline3";
        let report = diff_content(before, after);
        assert!(report.contains("+"), "should show additions");
        assert!(report.contains("new_line"));
    }

    /// A removed line is reported with a `-` marker and its text.
    #[test]
    fn test_diff_deletion() {
        let before = "line1\nline2\nline3";
        let after = "line1\nline3";
        let report = diff_content(before, after);
        assert!(report.contains("-"), "should show deletions");
        assert!(report.contains("line2"));
    }

    /// Identical inputs short-circuit to the fixed "no changes" message.
    #[test]
    fn test_diff_no_changes() {
        let text = "same\ncontent";
        assert_eq!(diff_content(text, text), "(no changes)");
    }

    /// In inputs above 200 lines, a run of eight closing braces is collapsed.
    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        let input = (0..210)
            .map(|i| format!("line {i}"))
            .chain(std::iter::repeat("}".to_string()).take(8))
            .chain(std::iter::once("fn next() {}".to_string()))
            .collect::<Vec<_>>()
            .join("\n");
        let cleaned = lightweight_cleanup(&input);
        assert!(
            cleaned.contains("[6 brace-only lines collapsed]"),
            "should collapse long brace runs in large files"
        );
        assert!(cleaned.contains("fn next()"));
    }

    /// Several blank lines in a row leave at most one empty line behind.
    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let cleaned = lightweight_cleanup("line1\n\n\n\n\nline2");
        let (_, tail) = cleaned.split_once("line1").unwrap();
        let newline_count = tail.chars().filter(|&c| c == '\n').count();
        assert!(newline_count <= 2, "should collapse multiple blank lines");
    }

    /// A result far below the 0.15 ratio is rejected in favour of the original.
    #[test]
    fn test_safeguard_ratio_prevents_over_compression() {
        let original = "a ".repeat(100);
        let guarded = safeguard_ratio(&original, "a");
        assert_eq!(guarded, original, "should return original when ratio < 0.15");
    }

    /// `//` comments are removed while code lines survive.
    #[test]
    fn test_aggressive_strips_comments() {
        let source = "fn main() {\n    // a comment\n    let x = 1;\n}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(!compressed.contains("// a comment"));
        assert!(compressed.contains("let x = 1"));
    }

    /// `#` comments are removed for Python input.
    #[test]
    fn test_aggressive_python_comments() {
        let source = "def main():\n    # comment\n    x = 1";
        let compressed = aggressive_compress(source, Some("py"));
        assert!(!compressed.contains("# comment"));
        assert!(compressed.contains("x = 1"));
    }

    /// `///` doc comments are not treated as ordinary line comments.
    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let source = "/// Doc comment\nfn main() {}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(compressed.contains("/// Doc comment"));
    }

    /// Every line of a multi-line `/* ... */` comment is dropped.
    #[test]
    fn test_aggressive_block_comment() {
        let source = "/* start\n * middle\n */ end\nfn main() {}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(!compressed.contains("start"));
        assert!(!compressed.contains("middle"));
        assert!(compressed.contains("fn main()"));
    }

    /// SGR colour codes are stripped and no ESC character remains.
    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let colored = "\x1b[31mERROR\x1b[0m: something failed";
        let plain = strip_ansi(colored);
        assert_eq!(plain, "ERROR: something failed");
        assert!(!plain.contains('\x1b'));
    }

    /// Text without escapes is returned unchanged.
    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let text = "clean text without escapes";
        assert_eq!(strip_ansi(text), text);
    }

    /// Escape-free text has a density of exactly zero.
    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    /// Coloured text has a positive density.
    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        let colored = "\x1b[31mred\x1b[0m";
        assert!(ansi_density(colored) > 0.0);
    }
}