Skip to main content

lean_ctx/core/
compressor.rs

1use similar::{ChangeTag, TextDiff};
2
/// Removes ANSI/VT escape sequences from `s` and returns the remaining text.
///
/// Recognised forms (ECMA-48 framing):
/// - **CSI**: `ESC [`, then parameter/intermediate bytes (`0x20..=0x3F`),
///   then one final byte (`0x40..=0x7E`), e.g. `ESC [ 3 1 m` or `ESC [ 2 ~`.
/// - **OSC**: `ESC ]`, then a payload terminated by BEL (`0x07`) or ST (`ESC \`).
/// - **Other escapes**: `ESC`, optional intermediates (`0x20..=0x2F`), then one
///   final byte (`0x30..=0x7E`), e.g. `ESC 7` or `ESC ( B`.
///
/// Malformed or truncated sequences never consume more than the bytes that
/// legally belong to them, so ordinary text after a broken escape is kept.
/// Input without any `ESC` is returned as an unchanged copy.
pub fn strip_ansi(s: &str) -> String {
    if !s.contains('\x1b') {
        return s.to_string();
    }
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '\x1b' {
            result.push(c);
            continue;
        }
        match chars.peek().copied() {
            Some('[') => {
                chars.next();
                // Parameters and intermediates, then exactly one final byte.
                while chars.next_if(|p| matches!(*p, '\x20'..='\x3f')).is_some() {}
                let _ = chars.next_if(|p| matches!(*p, '\x40'..='\x7e'));
            }
            Some(']') => {
                chars.next();
                while let Some(&p) = chars.peek() {
                    // An unterminated OSC must not swallow the following lines.
                    if p == '\n' {
                        break;
                    }
                    if p == '\x1b' {
                        // `ESC \` is the string terminator; any other ESC starts
                        // a new sequence and is left for the outer loop.
                        let mut ahead = chars.clone();
                        ahead.next();
                        if ahead.peek() == Some(&'\\') {
                            chars.next();
                            chars.next();
                        }
                        break;
                    }
                    chars.next();
                    if p == '\x07' {
                        break;
                    }
                }
            }
            Some(_) => {
                while chars.next_if(|p| matches!(*p, '\x20'..='\x2f')).is_some() {}
                let _ = chars.next_if(|p| matches!(*p, '\x30'..='\x7e'));
            }
            // A trailing lone ESC is simply dropped.
            None => {}
        }
    }
    result
}
24
/// Ratio of ESC (`0x1B`) characters in `s` to the byte length of `s`.
///
/// Returns `0.0` for an empty string.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            let esc_count = s.chars().filter(|ch| *ch == '\x1b').count();
            esc_count as f64 / total_bytes as f64
        }
    }
}
32
33pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
34    let mut result: Vec<String> = Vec::new();
35    let is_python = matches!(ext, Some("py"));
36    let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
37    let is_sql = matches!(ext, Some("sql"));
38    let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
39
40    let mut in_block_comment = false;
41
42    for line in content.lines() {
43        let trimmed = line.trim();
44
45        if trimmed.is_empty() {
46            continue;
47        }
48
49        if in_block_comment {
50            if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
51                in_block_comment = false;
52            }
53            continue;
54        }
55
56        if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
57            if !(trimmed.contains("*/") || trimmed.contains("-->")) {
58                in_block_comment = true;
59            }
60            continue;
61        }
62
63        if trimmed.starts_with("//") && !trimmed.starts_with("///") {
64            continue;
65        }
66        if trimmed.starts_with('*') || trimmed.starts_with("*/") {
67            continue;
68        }
69        if is_python && trimmed.starts_with('#') {
70            continue;
71        }
72        if is_sql && trimmed.starts_with("--") {
73            continue;
74        }
75        if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
76            continue;
77        }
78        if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
79            continue;
80        }
81
82        if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
83            if let Some(last) = result.last() {
84                let last_trimmed = last.trim();
85                if matches!(last_trimmed, "}" | "};" | ");" | "});") {
86                    if let Some(last_mut) = result.last_mut() {
87                        last_mut.push_str(trimmed);
88                    }
89                    continue;
90                }
91            }
92            result.push(trimmed.to_string());
93            continue;
94        }
95
96        let normalized = normalize_indentation(line);
97        result.push(normalized);
98    }
99
100    result.join("\n")
101}
102
/// Gentle post-processing pass over already-compressed text.
///
/// - A run of blank or whitespace-only lines is reduced to one empty line.
/// - In a run of consecutive closer-only lines (`}`, `};`, `);`, `});`, `)`)
///   the first two are kept, without indentation, and the rest are dropped.
///   A blank line or any other line ends the run.
/// - Every other line is passed through unchanged.
///
/// The result is joined with `\n`.
pub fn lightweight_cleanup(content: &str) -> String {
    let mut kept: Vec<String> = Vec::new();
    let mut blank_run = 0u32;
    let mut closer_run = 0u32;

    for raw in content.lines() {
        let body = raw.trim();

        if body.is_empty() {
            closer_run = 0;
            blank_run += 1;
            if blank_run == 1 {
                kept.push(String::new());
            }
        } else if matches!(body, "}" | "};" | ");" | "});" | ")") {
            blank_run = 0;
            closer_run += 1;
            if closer_run <= 2 {
                kept.push(body.to_owned());
            }
        } else {
            blank_run = 0;
            closer_run = 0;
            kept.push(raw.to_owned());
        }
    }

    kept.join("\n")
}
137
138/// Safeguard: ensures compression ratio stays within safe bounds.
139/// Returns the compressed content if ratio is in [0.15, 1.0], otherwise the original.
140pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
141    let orig_tokens = super::tokens::count_tokens(original);
142    let comp_tokens = super::tokens::count_tokens(compressed);
143
144    if orig_tokens == 0 {
145        return compressed.to_string();
146    }
147
148    let ratio = comp_tokens as f64 / orig_tokens as f64;
149    if ratio < 0.15 || comp_tokens > orig_tokens {
150        original.to_string()
151    } else {
152        compressed.to_string()
153    }
154}
155
/// Rewrites the leading whitespace of `line` as spaces.
///
/// The indent width is measured in bytes of leading whitespace. If the line
/// starts with a tab, that many spaces are emitted; otherwise the width is
/// halved (integer division). The rest of the line is left untouched.
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();
    let width = if line.starts_with('\t') {
        indent_bytes
    } else {
        indent_bytes / 2
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
163
164pub fn diff_content(old_content: &str, new_content: &str) -> String {
165    if old_content == new_content {
166        return "(no changes)".to_string();
167    }
168
169    let diff = TextDiff::from_lines(old_content, new_content);
170    let mut changes = Vec::new();
171    let mut additions = 0usize;
172    let mut deletions = 0usize;
173
174    for change in diff.iter_all_changes() {
175        let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
176        let text = change.value().trim_end_matches('\n');
177        match change.tag() {
178            ChangeTag::Insert => {
179                additions += 1;
180                if let Some(n) = line_no {
181                    changes.push(format!("+{n}: {text}"));
182                }
183            }
184            ChangeTag::Delete => {
185                deletions += 1;
186                if let Some(n) = line_no {
187                    changes.push(format!("-{n}: {text}"));
188                }
189            }
190            ChangeTag::Equal => {}
191        }
192    }
193
194    if changes.is_empty() {
195        return "(no changes)".to_string();
196    }
197
198    changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
199    changes.join("\n")
200}
201
202pub fn verbatim_compact(text: &str) -> String {
203    let mut lines: Vec<String> = Vec::new();
204    let mut blank_count = 0u32;
205    let mut prev_line: Option<String> = None;
206    let mut repeat_count = 0u32;
207
208    for line in text.lines() {
209        let trimmed = line.trim();
210
211        if trimmed.is_empty() {
212            blank_count += 1;
213            if blank_count <= 1 {
214                flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
215                lines.push(String::new());
216            }
217            continue;
218        }
219        blank_count = 0;
220
221        if is_boilerplate_line(trimmed) {
222            continue;
223        }
224
225        let normalized = normalize_whitespace(trimmed);
226        let stripped = strip_timestamps_hashes(&normalized);
227
228        if let Some(ref prev) = prev_line {
229            if *prev == stripped {
230                repeat_count += 1;
231                continue;
232            }
233        }
234
235        flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
236        prev_line = Some(stripped.clone());
237        repeat_count = 1;
238        lines.push(stripped);
239    }
240
241    flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
242    lines.join("\n")
243}
244
/// Ends the current run of identical lines.
///
/// If the run length in `count` is greater than one and both a remembered line
/// and a last element of `lines` exist, that last element is overwritten with
/// `[Nx] line`. Afterwards `count` is reset to `0` and `prev_line` to `None`,
/// whether or not anything was rewritten.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    let run_len = std::mem::take(count);
    let repeated = prev_line.take();

    if run_len > 1 {
        if let (Some(text), Some(slot)) = (repeated, lines.last_mut()) {
            *slot = format!("[{}x] {}", run_len, text);
        }
    }
}
257
/// Collapses every run of spaces and tabs in `line` into a single space.
///
/// Only `' '` and `'\t'` count as whitespace here; all other characters are
/// copied through unchanged. Leading and trailing runs are collapsed, not removed.
fn normalize_whitespace(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    let mut in_gap = false;

    for ch in line.chars() {
        let is_gap = matches!(ch, ' ' | '\t');
        if !is_gap {
            out.push(ch);
        } else if !in_gap {
            out.push(' ');
        }
        in_gap = is_gap;
    }

    out
}
274
275fn strip_timestamps_hashes(line: &str) -> String {
276    use regex::Regex;
277    use std::sync::OnceLock;
278
279    static TS_RE: OnceLock<Regex> = OnceLock::new();
280    static HASH_RE: OnceLock<Regex> = OnceLock::new();
281
282    let ts_re = TS_RE.get_or_init(|| {
283        Regex::new(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?")
284            .unwrap()
285    });
286    let hash_re = HASH_RE.get_or_init(|| Regex::new(r"\b[0-9a-f]{32,64}\b").unwrap());
287
288    let s = ts_re.replace_all(line, "[TS]");
289    let s = hash_re.replace_all(&s, "[HASH]");
290    s.into_owned()
291}
292
/// Reports whether an already-trimmed line is boilerplate that carries no content.
///
/// A line is boilerplate when either:
/// - it starts (case-insensitively) with a licence or generator notice prefix:
///   `copyright`, `licensed under`, `license:`, `all rights reserved`,
///   `generated by`, `auto-generated`; or
/// - it is a separator rule: at least four characters long, starting with one
///   of `=`, `-`, `*`, `─`, `━`, with more than 80% of its characters equal
///   to that first character.
///
/// The four-character minimum is counted in characters, not bytes, so
/// multi-byte rule characters follow the same threshold as ASCII ones.
fn is_boilerplate_line(trimmed: &str) -> bool {
    const NOTICE_PREFIXES: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];

    let lower = trimmed.to_lowercase();
    if NOTICE_PREFIXES.iter().any(|prefix| lower.starts_with(prefix)) {
        return true;
    }

    let mut chars = trimmed.chars();
    let Some(first) = chars.next() else {
        return false;
    };
    if !matches!(first, '=' | '-' | '*' | '─' | '━') {
        return false;
    }

    // Count total characters and how many repeat the leading rule character.
    let (mut total, mut same) = (1usize, 1usize);
    for c in chars {
        total += 1;
        if c == first {
            same += 1;
        }
    }

    total >= 4 && same as f64 / total as f64 > 0.8
}
317
#[cfg(test)]
mod tests {
    use super::*;

    // ---- diff_content ----

    /// An inserted line shows up with a `+` marker and its text.
    #[test]
    fn test_diff_insertion() {
        let before = "line1\nline2\nline3";
        let after = "line1\nline2\nnew_line\nline3";
        let report = diff_content(before, after);
        assert!(report.contains("+"), "should show additions");
        assert!(report.contains("new_line"));
    }

    /// A removed line shows up with a `-` marker and its text.
    #[test]
    fn test_diff_deletion() {
        let before = "line1\nline2\nline3";
        let after = "line1\nline3";
        let report = diff_content(before, after);
        assert!(report.contains("-"), "should show deletions");
        assert!(report.contains("line2"));
    }

    /// Identical inputs yield the fixed "no changes" marker.
    #[test]
    fn test_diff_no_changes() {
        let text = "same\ncontent";
        assert_eq!(diff_content(text, text), "(no changes)");
    }

    // ---- lightweight_cleanup ----

    /// A long run of closing braces is cut down; following code is kept.
    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        let source = "fn main() {\n    inner()\n}\n}\n}\n}\n}\nfn next() {}";
        let cleaned = lightweight_cleanup(source);
        let brace_total = cleaned.matches('}').count();
        assert!(
            brace_total <= 3,
            "should collapse consecutive closing braces"
        );
        assert!(cleaned.contains("fn next()"));
    }

    /// Several blank lines in a row are collapsed.
    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let source = "line1\n\n\n\n\nline2";
        let cleaned = lightweight_cleanup(source);
        let after_first = cleaned.split("line1").nth(1).unwrap();
        let newline_total = after_first.matches('\n').count();
        assert!(newline_total <= 2, "should collapse multiple blank lines");
    }

    // ---- safeguard_ratio ----

    /// Compression far below the allowed ratio falls back to the original.
    #[test]
    fn test_safeguard_ratio_prevents_over_compression() {
        let original = "a ".repeat(100);
        let over_compressed = "a";
        let chosen = safeguard_ratio(&original, over_compressed);
        assert_eq!(chosen, original, "should return original when ratio < 0.15");
    }

    // ---- aggressive_compress ----

    /// `//` line comments are removed, code is kept.
    #[test]
    fn test_aggressive_strips_comments() {
        let source = "fn main() {\n    // a comment\n    let x = 1;\n}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(!compressed.contains("// a comment"));
        assert!(compressed.contains("let x = 1"));
    }

    /// `#` comments are removed for Python sources.
    #[test]
    fn test_aggressive_python_comments() {
        let source = "def main():\n    # comment\n    x = 1";
        let compressed = aggressive_compress(source, Some("py"));
        assert!(!compressed.contains("# comment"));
        assert!(compressed.contains("x = 1"));
    }

    /// `///` doc comments survive compression.
    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let source = "/// Doc comment\nfn main() {}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(compressed.contains("/// Doc comment"));
    }

    /// A multi-line `/* ... */` comment is removed in full.
    #[test]
    fn test_aggressive_block_comment() {
        let source = "/* start\n * middle\n */ end\nfn main() {}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(!compressed.contains("start"));
        assert!(!compressed.contains("middle"));
        assert!(compressed.contains("fn main()"));
    }

    // ---- strip_ansi / ansi_density ----

    /// Colour codes are stripped and no ESC remains.
    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let coloured = "\x1b[31mERROR\x1b[0m: something failed";
        let plain = strip_ansi(coloured);
        assert_eq!(plain, "ERROR: something failed");
        assert!(!plain.contains('\x1b'));
    }

    /// Text without escapes comes back unchanged.
    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let clean = "clean text without escapes";
        let output = strip_ansi(clean);
        assert_eq!(output, clean);
    }

    /// Escape-free text has zero density.
    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    /// Coloured text has a positive density.
    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        let coloured = "\x1b[31mred\x1b[0m";
        assert!(ansi_density(coloured) > 0.0);
    }
}