Skip to main content

lean_ctx/core/
compressor.rs

1use similar::{ChangeTag, TextDiff};
2
/// Expands to a `&'static regex::Regex` built from `$pattern` on first use.
///
/// Every expansion site gets its own `OnceLock`, so the pattern is compiled
/// lazily and the compiled value is reused by later evaluations of that site.
/// `$pattern` must be a string literal because it is spliced into the panic
/// message with `concat!`; an invalid pattern panics on first evaluation.
macro_rules! static_regex {
    ($pattern:expr) => {{
        use std::sync::OnceLock;
        static COMPILED: OnceLock<regex::Regex> = OnceLock::new();
        COMPILED.get_or_init(|| {
            regex::Regex::new($pattern).expect(concat!("BUG: invalid static regex: ", $pattern))
        })
    }};
}
11
/// Removes ANSI escape sequences from a string, returning clean text.
///
/// Recognised forms:
/// - CSI sequences (`ESC [` + parameter/intermediate bytes + a final byte in
///   `@..=~`), including those whose final byte is not a letter (e.g. `ESC[2~`).
/// - OSC strings (`ESC ]` ... terminated by BEL or by `ESC \`).
/// - Short escapes (`ESC` + optional intermediate bytes + one final byte).
///
/// A sequence interrupted by an unexpected character is abandoned and that
/// character is kept, so line breaks inside malformed input survive. An
/// unterminated sequence at the end of the input is dropped.
pub fn strip_ansi(s: &str) -> String {
    /// Position of the scanner relative to an escape sequence.
    enum State {
        /// Ordinary text; characters are copied to the output.
        Text,
        /// Just after `ESC` (and any intermediate bytes).
        Escape,
        /// Inside `ESC [ ...`, waiting for the final byte.
        Csi,
        /// Inside `ESC ] ...`, waiting for BEL or `ESC \`.
        Osc,
    }

    // Fast path: nothing to strip.
    if !s.contains('\x1b') {
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len());
    let mut state = State::Text;

    for c in s.chars() {
        state = match state {
            State::Text => {
                if c == '\x1b' {
                    State::Escape
                } else {
                    result.push(c);
                    State::Text
                }
            }
            State::Escape => match c {
                '[' => State::Csi,
                ']' => State::Osc,
                '\x1b' => State::Escape,
                // Intermediate bytes (0x20..=0x2F) precede the final byte.
                ' '..='/' => State::Escape,
                // Final byte (0x30..=0x7E) ends a short escape; this also
                // consumes the `\` of an `ESC \` string terminator.
                '0'..='~' => State::Text,
                _ => {
                    result.push(c);
                    State::Text
                }
            },
            State::Csi => match c {
                // Parameter (0x30..=0x3F) and intermediate (0x20..=0x2F) bytes.
                ' '..='?' => State::Csi,
                // Final byte (0x40..=0x7E) — not necessarily alphabetic.
                '@'..='~' => State::Text,
                '\x1b' => State::Escape,
                _ => {
                    result.push(c);
                    State::Text
                }
            },
            State::Osc => match c {
                '\x07' => State::Text,
                // Either the start of `ESC \` or of a new sequence.
                '\x1b' => State::Escape,
                _ => State::Osc,
            },
        };
    }

    result
}
34
/// Returns the number of ESC (`0x1B`) characters divided by the string's
/// length in bytes; an empty string yields `0.0`.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            // ESC is a single-byte code point, so a byte scan counts the same
            // occurrences as a char scan.
            let escapes = s.bytes().filter(|&b| b == 0x1b).count();
            escapes as f64 / total_bytes as f64
        }
    }
}
43
44/// Strips comments, blank lines, and normalizes indentation for maximum token savings.
45pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
46    // Structured data (JSON/JSONL) carries no comments and barely compresses via
47    // the line-based path below (~0% measured). Strip insignificant whitespace
48    // losslessly instead — key order, numbers, and string contents are preserved.
49    if let Some(compacted) = crate::core::structured_compact::compact_structured(content, ext) {
50        return compacted;
51    }
52
53    let mut result: Vec<String> = Vec::new();
54    let is_python = matches!(ext, Some("py"));
55    let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
56    let is_sql = matches!(ext, Some("sql"));
57    let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
58
59    let mut in_block_comment = false;
60
61    for line in content.lines() {
62        let trimmed = line.trim();
63
64        if trimmed.is_empty() {
65            continue;
66        }
67
68        if in_block_comment {
69            if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
70                in_block_comment = false;
71            }
72            continue;
73        }
74
75        if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
76            if !(trimmed.contains("*/") || trimmed.contains("-->")) {
77                in_block_comment = true;
78            }
79            continue;
80        }
81
82        if trimmed.starts_with("//") && !trimmed.starts_with("///") {
83            continue;
84        }
85        if trimmed.starts_with('*') || trimmed.starts_with("*/") {
86            continue;
87        }
88        if is_python && trimmed.starts_with('#') {
89            continue;
90        }
91        if is_sql && trimmed.starts_with("--") {
92            continue;
93        }
94        if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
95            continue;
96        }
97        if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
98            continue;
99        }
100
101        if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
102            if let Some(last) = result.last() {
103                let last_trimmed = last.trim();
104                if matches!(last_trimmed, "}" | "};" | ");" | "});") {
105                    if let Some(last_mut) = result.last_mut() {
106                        last_mut.push_str(trimmed);
107                    }
108                    continue;
109                }
110            }
111            result.push(trimmed.to_string());
112            continue;
113        }
114
115        let normalized = normalize_indentation(line);
116        result.push(normalized);
117    }
118
119    result.join("\n")
120}
121
/// Lightweight post-processing cleanup.
///
/// - Runs of blank or whitespace-only lines are reduced to one empty line.
/// - Lines consisting solely of a closing token (`}`, `};`, `);`, `});`, `)`)
///   are emitted without their indentation.
/// - When the input has more than 200 lines, a run of more than 5 such lines
///   is replaced by its first two lines plus a `[N brace-only lines collapsed]`
///   marker.
pub fn lightweight_cleanup(content: &str) -> String {
    /// Moves the pending closing-token run into `out`, collapsing it when the
    /// input is large and the run is long.
    fn drain_run(pending: &mut Vec<&str>, out: &mut Vec<String>, large_input: bool) {
        if large_input && pending.len() > 5 {
            out.extend(pending[..2].iter().map(|l| (*l).to_string()));
            out.push(format!("[{} brace-only lines collapsed]", pending.len() - 2));
        } else {
            out.extend(pending.iter().map(|l| (*l).to_string()));
        }
        pending.clear();
    }

    let large_input = content.lines().count() > 200;

    let mut out: Vec<String> = Vec::new();
    let mut pending: Vec<&str> = Vec::new();
    let mut blanks_seen = 0u32;

    for raw in content.lines() {
        let text = raw.trim();

        if text.is_empty() {
            drain_run(&mut pending, &mut out, large_input);
            blanks_seen += 1;
            // Only the first blank of a run is kept.
            if blanks_seen == 1 {
                out.push(String::new());
            }
        } else if matches!(text, "}" | "};" | ");" | "});" | ")") {
            blanks_seen = 0;
            pending.push(text);
        } else {
            blanks_seen = 0;
            drain_run(&mut pending, &mut out, large_input);
            out.push(raw.to_string());
        }
    }
    drain_run(&mut pending, &mut out, large_input);

    out.join("\n")
}
170
171/// Safeguard: prevents compression from inflating output or destroying small outputs.
172/// For small outputs (<2000 tokens), rejects extreme compression (>95% reduction)
173/// that likely lost important content. For large outputs, trusts the pattern.
174pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
175    let orig_tokens = super::tokens::count_tokens(original);
176    let comp_tokens = super::tokens::count_tokens(compressed);
177
178    if orig_tokens == 0 {
179        return compressed.to_string();
180    }
181
182    if comp_tokens > orig_tokens {
183        return original.to_string();
184    }
185
186    let ratio = comp_tokens as f64 / orig_tokens as f64;
187    if ratio < 0.05 && orig_tokens < 2000 {
188        original.to_string()
189    } else {
190        compressed.to_string()
191    }
192}
193
/// Rewrites a line's leading whitespace as spaces with a reduced width.
///
/// The width is the byte length of the leading whitespace when the line starts
/// with a tab, and half of that (rounded down) otherwise.
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();
    let width = if line.starts_with('\t') {
        indent_bytes
    } else {
        indent_bytes / 2
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
201
202/// Produces a compact unified diff between old and new content with line numbers.
203pub fn diff_content(old_content: &str, new_content: &str) -> String {
204    if old_content == new_content {
205        return "(no changes)".to_string();
206    }
207
208    let diff = TextDiff::from_lines(old_content, new_content);
209    let mut changes = Vec::new();
210    let mut additions = 0usize;
211    let mut deletions = 0usize;
212
213    for change in diff.iter_all_changes() {
214        let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
215        let text = change.value().trim_end_matches('\n');
216        match change.tag() {
217            ChangeTag::Insert => {
218                additions += 1;
219                if let Some(n) = line_no {
220                    changes.push(format!("+{n}: {text}"));
221                }
222            }
223            ChangeTag::Delete => {
224                deletions += 1;
225                if let Some(n) = line_no {
226                    changes.push(format!("-{n}: {text}"));
227                }
228            }
229            ChangeTag::Equal => {}
230        }
231    }
232
233    if changes.is_empty() {
234        return "(no changes)".to_string();
235    }
236
237    changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
238    changes.join("\n")
239}
240
241/// Deduplicates repeated lines, strips boilerplate, and normalizes timestamps/hashes.
242pub fn verbatim_compact(text: &str) -> String {
243    let mut lines: Vec<String> = Vec::new();
244    let mut blank_count = 0u32;
245    let mut prev_line: Option<String> = None;
246    let mut repeat_count = 0u32;
247
248    for line in text.lines() {
249        let trimmed = line.trim();
250
251        if trimmed.is_empty() {
252            blank_count += 1;
253            if blank_count <= 1 {
254                flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
255                lines.push(String::new());
256            }
257            continue;
258        }
259        blank_count = 0;
260
261        if is_boilerplate_line(trimmed) {
262            continue;
263        }
264
265        let normalized = normalize_whitespace(trimmed);
266        let stripped = strip_timestamps_hashes(&normalized);
267
268        if let Some(ref prev) = prev_line {
269            if *prev == stripped {
270                repeat_count += 1;
271                continue;
272            }
273        }
274
275        flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
276        prev_line = Some(stripped.clone());
277        repeat_count = 1;
278        lines.push(stripped);
279    }
280
281    flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
282    lines.join("\n")
283}
284
285/// Compresses content using the active task intent to preserve task-relevant sections.
286pub fn task_aware_compress(
287    content: &str,
288    ext: Option<&str>,
289    intent: &super::intent_engine::StructuredIntent,
290) -> String {
291    use super::intent_engine::{IntentScope, TaskType};
292
293    let budget_ratio = match intent.scope {
294        IntentScope::SingleFile => 0.7,
295        IntentScope::MultiFile => 0.5,
296        IntentScope::CrossModule => 0.35,
297        IntentScope::ProjectWide => 0.25,
298    };
299
300    match intent.task_type {
301        TaskType::FixBug | TaskType::Debug => {
302            let filtered = super::task_relevance::information_bottleneck_filter_typed(
303                content,
304                &intent.keywords,
305                budget_ratio,
306                Some(intent.task_type),
307            );
308            safeguard_ratio(content, &filtered)
309        }
310        TaskType::Refactor | TaskType::Review => {
311            let cleaned = lightweight_cleanup(content);
312            let filtered = super::task_relevance::information_bottleneck_filter_typed(
313                &cleaned,
314                &intent.keywords,
315                budget_ratio.max(0.5),
316                Some(intent.task_type),
317            );
318            safeguard_ratio(content, &filtered)
319        }
320        TaskType::Generate | TaskType::Test => {
321            let compressed = aggressive_compress(content, ext);
322            safeguard_ratio(content, &compressed)
323        }
324        TaskType::Explore | TaskType::Config | TaskType::Deploy => {
325            let cleaned = lightweight_cleanup(content);
326            safeguard_ratio(content, &cleaned)
327        }
328    }
329}
330
/// Ends the current repeat run.
///
/// When the run is longer than one line and a previous line is recorded, the
/// last entry of `lines` is overwritten with `[Nx] <line>`. In every case
/// `count` is reset to zero and `prev_line` to `None`.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    let repeats = std::mem::take(count);
    let Some(prev) = prev_line.take() else {
        return;
    };

    if repeats > 1 {
        if let Some(slot) = lines.last_mut() {
            *slot = format!("[{repeats}x] {prev}");
        }
    }
}
343
/// Collapses every run of spaces and tabs into a single space; all other
/// characters are copied unchanged.
fn normalize_whitespace(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    for ch in line.chars() {
        match ch {
            ' ' | '\t' => {
                // A trailing space in `out` can only come from a previous
                // space/tab, so it marks an already-emitted run.
                if !out.ends_with(' ') {
                    out.push(' ');
                }
            }
            other => out.push(other),
        }
    }
    out
}
360
361fn strip_timestamps_hashes(line: &str) -> String {
362    let ts_re =
363        static_regex!(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?");
364    let hash_re = static_regex!(r"\b[0-9a-f]{32,64}\b");
365
366    let s = ts_re.replace_all(line, "[TS]");
367    let s = hash_re.replace_all(&s, "[HASH]");
368    s.into_owned()
369}
370
/// Reports whether a trimmed line is boilerplate that carries no content.
///
/// A line is boilerplate when, compared case-insensitively, it starts with a
/// licence/copyright or "generated" marker, or when it is a horizontal rule:
/// at least 4 characters long, starting with one of `=`, `-`, `*`, `─`, `━`,
/// with more than 80% of its characters equal to that first character.
///
/// The minimum length is measured in characters rather than bytes, so
/// multi-byte rule characters follow the same threshold as ASCII ones.
fn is_boilerplate_line(trimmed: &str) -> bool {
    const MARKERS: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];

    let lower = trimmed.to_lowercase();
    if MARKERS.iter().any(|marker| lower.starts_with(*marker)) {
        return true;
    }

    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    if !matches!(first, '=' | '-' | '*' | '─' | '━') {
        return false;
    }

    let total = trimmed.chars().count();
    if total < 4 {
        return false;
    }

    let same = trimmed.chars().filter(|&c| c == first).count();
    same as f64 / total as f64 > 0.8
}
395
#[cfg(test)]
mod tests {
    use super::*;

    // --- diff_content ---

    #[test]
    fn test_diff_insertion() {
        let diff = diff_content("line1\nline2\nline3", "line1\nline2\nnew_line\nline3");
        assert!(diff.contains('+'), "should show additions");
        assert!(diff.contains("new_line"));
    }

    #[test]
    fn test_diff_deletion() {
        let diff = diff_content("line1\nline2\nline3", "line1\nline3");
        assert!(diff.contains('-'), "should show deletions");
        assert!(diff.contains("line2"));
    }

    #[test]
    fn test_diff_no_changes() {
        let same = "same\ncontent";
        assert_eq!(diff_content(same, same), "(no changes)");
    }

    // --- lightweight_cleanup ---

    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        // More than 200 lines so that long brace runs become collapsible.
        let body = (0..210).map(|i| format!("line {i}"));
        let braces = std::iter::repeat("}".to_string()).take(8);
        let tail = std::iter::once("fn next() {}".to_string());
        let input = body.chain(braces).chain(tail).collect::<Vec<_>>().join("\n");

        let cleaned = lightweight_cleanup(&input);
        assert!(
            cleaned.contains("[6 brace-only lines collapsed]"),
            "should collapse long brace runs in large files"
        );
        assert!(cleaned.contains("fn next()"));
    }

    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let cleaned = lightweight_cleanup("line1\n\n\n\n\nline2");
        let after_first = cleaned.split("line1").nth(1).unwrap();
        let newline_count = after_first.matches('\n').count();
        assert!(newline_count <= 2, "should collapse multiple blank lines");
    }

    // --- safeguard_ratio ---

    #[test]
    fn test_safeguard_ratio_prevents_over_compression_on_small_output() {
        let original = "a ".repeat(100); // ~100 tokens, < 2000
        let kept = safeguard_ratio(&original, "a");
        assert_eq!(
            kept, original,
            "should return original when ratio < 0.05 and output is small"
        );
    }

    #[test]
    fn test_safeguard_ratio_allows_strong_compression_on_large_output() {
        let original = "line content here\n".repeat(1000); // ~4000 tokens, > 2000
        let summary = "summary: 1000 lines";
        let kept = safeguard_ratio(&original, summary);
        assert_eq!(
            kept, summary,
            "should allow strong compression for large outputs"
        );
    }

    // --- aggressive_compress ---

    #[test]
    fn test_aggressive_strips_comments() {
        let out = aggressive_compress(
            "fn main() {\n    // a comment\n    let x = 1;\n}",
            Some("rs"),
        );
        assert!(!out.contains("// a comment"));
        assert!(out.contains("let x = 1"));
    }

    #[test]
    fn test_aggressive_python_comments() {
        let out = aggressive_compress("def main():\n    # comment\n    x = 1", Some("py"));
        assert!(!out.contains("# comment"));
        assert!(out.contains("x = 1"));
    }

    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let out = aggressive_compress("/// Doc comment\nfn main() {}", Some("rs"));
        assert!(out.contains("/// Doc comment"));
    }

    #[test]
    fn test_aggressive_block_comment() {
        let out = aggressive_compress("/* start\n * middle\n */ end\nfn main() {}", Some("rs"));
        assert!(!out.contains("start"));
        assert!(!out.contains("middle"));
        assert!(out.contains("fn main()"));
    }

    // --- strip_ansi / ansi_density ---

    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let cleaned = strip_ansi("\x1b[31mERROR\x1b[0m: something failed");
        assert_eq!(cleaned, "ERROR: something failed");
        assert!(!cleaned.contains('\x1b'));
    }

    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let plain = "clean text without escapes";
        assert_eq!(strip_ansi(plain), plain);
    }

    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        assert!(ansi_density("\x1b[31mred\x1b[0m") > 0.0);
    }
}