// lean_ctx/core/compressor.rs
use similar::{ChangeTag, TextDiff};
2
/// Expands to an expression yielding a reference to a lazily compiled regex.
///
/// Each expansion owns its own `OnceLock`, so the pattern is compiled the
/// first time that call site runs and reused afterwards. The pattern must be
/// a string literal (it is fed to `concat!`); an invalid pattern panics on
/// first use with a "BUG: invalid static regex" message.
macro_rules! static_regex {
    ($pattern:expr) => {{
        use std::sync::OnceLock;

        static COMPILED: OnceLock<regex::Regex> = OnceLock::new();
        COMPILED.get_or_init(|| {
            let parsed = regex::Regex::new($pattern);
            parsed.expect(concat!("BUG: invalid static regex: ", $pattern))
        })
    }};
}
11
/// Removes ANSI/VT escape sequences from `s` and returns the remaining text.
///
/// Recognised forms:
/// - CSI: `ESC [` ... up to a final byte in `@`..=`~` (colours, cursor moves, `ESC[3~`).
/// - OSC: `ESC ]` ... up to BEL or the string terminator `ESC \` (titles, hyperlinks).
/// - nF: `ESC`, intermediate bytes (space..=`/`), then one final byte (e.g. `ESC ( B`).
/// - Any other `ESC x` pair is dropped as a two-character escape.
///
/// Input without an ESC character is returned unchanged. An unterminated
/// sequence swallows the rest of the input; a trailing lone ESC is dropped.
pub fn strip_ansi(s: &str) -> String {
    #[derive(Clone, Copy)]
    enum State {
        Text,
        Escape,
        Csi,
        Osc,
        OscEscape,
        Intermediate,
    }

    // Decides which kind of sequence starts with the character following ESC.
    fn after_escape(c: char) -> State {
        match c {
            '[' => State::Csi,
            ']' => State::Osc,
            '\x1b' => State::Escape,
            ' '..='/' => State::Intermediate,
            _ => State::Text,
        }
    }

    if !s.contains('\x1b') {
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len());
    let mut state = State::Text;
    for c in s.chars() {
        state = match state {
            State::Text => {
                if c == '\x1b' {
                    State::Escape
                } else {
                    result.push(c);
                    State::Text
                }
            }
            State::Escape => after_escape(c),
            State::Csi => match c {
                // Any byte in 0x40..=0x7E terminates a control sequence, not just letters.
                '@'..='~' => State::Text,
                '\x1b' => State::Escape,
                _ => State::Csi,
            },
            State::Osc => match c {
                '\x07' => State::Text,
                '\x1b' => State::OscEscape,
                _ => State::Osc,
            },
            State::OscEscape => {
                if c == '\\' {
                    State::Text
                } else {
                    // The ESC did not form a string terminator: it starts a new escape.
                    after_escape(c)
                }
            }
            State::Intermediate => match c {
                '0'..='~' => State::Text,
                '\x1b' => State::Escape,
                _ => State::Intermediate,
            },
        };
    }
    result
}
34
/// Returns the number of ESC (0x1b) characters divided by the byte length of
/// `s`, or `0.0` for an empty string.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            // ESC is a single-byte character and 0x1b never occurs inside a
            // multi-byte UTF-8 sequence, so counting bytes equals counting chars.
            let escapes = s.bytes().filter(|b| *b == 0x1b).count();
            escapes as f64 / total_bytes as f64
        }
    }
}
43
44pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
46 let mut result: Vec<String> = Vec::new();
47 let is_python = matches!(ext, Some("py"));
48 let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
49 let is_sql = matches!(ext, Some("sql"));
50 let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
51
52 let mut in_block_comment = false;
53
54 for line in content.lines() {
55 let trimmed = line.trim();
56
57 if trimmed.is_empty() {
58 continue;
59 }
60
61 if in_block_comment {
62 if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
63 in_block_comment = false;
64 }
65 continue;
66 }
67
68 if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
69 if !(trimmed.contains("*/") || trimmed.contains("-->")) {
70 in_block_comment = true;
71 }
72 continue;
73 }
74
75 if trimmed.starts_with("//") && !trimmed.starts_with("///") {
76 continue;
77 }
78 if trimmed.starts_with('*') || trimmed.starts_with("*/") {
79 continue;
80 }
81 if is_python && trimmed.starts_with('#') {
82 continue;
83 }
84 if is_sql && trimmed.starts_with("--") {
85 continue;
86 }
87 if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
88 continue;
89 }
90 if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
91 continue;
92 }
93
94 if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
95 if let Some(last) = result.last() {
96 let last_trimmed = last.trim();
97 if matches!(last_trimmed, "}" | "};" | ");" | "});") {
98 if let Some(last_mut) = result.last_mut() {
99 last_mut.push_str(trimmed);
100 }
101 continue;
102 }
103 }
104 result.push(trimmed.to_string());
105 continue;
106 }
107
108 let normalized = normalize_indentation(line);
109 result.push(normalized);
110 }
111
112 result.join("\n")
113}
114
/// Conservative cleanup that keeps ordinary lines exactly as written.
///
/// - Consecutive blank lines collapse into a single empty line.
/// - Lines consisting only of a closer (`}`, `};`, `);`, `});`, `)`) are
///   emitted trimmed. When the input has more than 200 lines and a run of
///   such lines is longer than 5, only the first two are kept, followed by a
///   `[N brace-only lines collapsed]` marker.
///
/// Output lines are joined with `\n`.
pub fn lightweight_cleanup(content: &str) -> String {
    // Emits the pending closer lines (collapsed when allowed) and empties the buffer.
    fn drain_closers(pending: &mut Vec<&str>, out: &mut Vec<String>, large_input: bool) {
        if large_input && pending.len() > 5 {
            out.extend(pending[..2].iter().map(|s| (*s).to_string()));
            out.push(format!(
                "[{} brace-only lines collapsed]",
                pending.len() - 2
            ));
        } else {
            out.extend(pending.iter().map(|s| (*s).to_string()));
        }
        pending.clear();
    }

    let large_input = content.lines().count() > 200;

    let mut out: Vec<String> = Vec::new();
    let mut pending: Vec<&str> = Vec::new();
    let mut previous_blank = false;

    for raw in content.lines() {
        let core = raw.trim();

        if core.is_empty() {
            drain_closers(&mut pending, &mut out, large_input);
            if !previous_blank {
                out.push(String::new());
            }
            previous_blank = true;
            continue;
        }
        previous_blank = false;

        match core {
            "}" | "};" | ");" | "});" | ")" => pending.push(core),
            _ => {
                drain_closers(&mut pending, &mut out, large_input);
                out.push(raw.to_string());
            }
        }
    }
    drain_closers(&mut pending, &mut out, large_input);

    out.join("\n")
}
163
164pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
168 let orig_tokens = super::tokens::count_tokens(original);
169 let comp_tokens = super::tokens::count_tokens(compressed);
170
171 if orig_tokens == 0 {
172 return compressed.to_string();
173 }
174
175 if comp_tokens > orig_tokens {
176 return original.to_string();
177 }
178
179 let ratio = comp_tokens as f64 / orig_tokens as f64;
180 if ratio < 0.05 && orig_tokens < 2000 {
181 original.to_string()
182 } else {
183 compressed.to_string()
184 }
185}
186
/// Rewrites the leading whitespace of `line` as spaces.
///
/// The indent width is the byte length of the leading whitespace. It is kept
/// as is when the line starts with a tab and halved (rounding down) otherwise.
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();
    let width = if line.starts_with('\t') {
        indent_bytes
    } else {
        indent_bytes / 2
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
194
195pub fn diff_content(old_content: &str, new_content: &str) -> String {
197 if old_content == new_content {
198 return "(no changes)".to_string();
199 }
200
201 let diff = TextDiff::from_lines(old_content, new_content);
202 let mut changes = Vec::new();
203 let mut additions = 0usize;
204 let mut deletions = 0usize;
205
206 for change in diff.iter_all_changes() {
207 let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
208 let text = change.value().trim_end_matches('\n');
209 match change.tag() {
210 ChangeTag::Insert => {
211 additions += 1;
212 if let Some(n) = line_no {
213 changes.push(format!("+{n}: {text}"));
214 }
215 }
216 ChangeTag::Delete => {
217 deletions += 1;
218 if let Some(n) = line_no {
219 changes.push(format!("-{n}: {text}"));
220 }
221 }
222 ChangeTag::Equal => {}
223 }
224 }
225
226 if changes.is_empty() {
227 return "(no changes)".to_string();
228 }
229
230 changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
231 changes.join("\n")
232}
233
234pub fn verbatim_compact(text: &str) -> String {
236 let mut lines: Vec<String> = Vec::new();
237 let mut blank_count = 0u32;
238 let mut prev_line: Option<String> = None;
239 let mut repeat_count = 0u32;
240
241 for line in text.lines() {
242 let trimmed = line.trim();
243
244 if trimmed.is_empty() {
245 blank_count += 1;
246 if blank_count <= 1 {
247 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
248 lines.push(String::new());
249 }
250 continue;
251 }
252 blank_count = 0;
253
254 if is_boilerplate_line(trimmed) {
255 continue;
256 }
257
258 let normalized = normalize_whitespace(trimmed);
259 let stripped = strip_timestamps_hashes(&normalized);
260
261 if let Some(ref prev) = prev_line {
262 if *prev == stripped {
263 repeat_count += 1;
264 continue;
265 }
266 }
267
268 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
269 prev_line = Some(stripped.clone());
270 repeat_count = 1;
271 lines.push(stripped);
272 }
273
274 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
275 lines.join("\n")
276}
277
278pub fn task_aware_compress(
280 content: &str,
281 ext: Option<&str>,
282 intent: &super::intent_engine::StructuredIntent,
283) -> String {
284 use super::intent_engine::{IntentScope, TaskType};
285
286 let budget_ratio = match intent.scope {
287 IntentScope::SingleFile => 0.7,
288 IntentScope::MultiFile => 0.5,
289 IntentScope::CrossModule => 0.35,
290 IntentScope::ProjectWide => 0.25,
291 };
292
293 match intent.task_type {
294 TaskType::FixBug | TaskType::Debug => {
295 let filtered = super::task_relevance::information_bottleneck_filter_typed(
296 content,
297 &intent.keywords,
298 budget_ratio,
299 Some(intent.task_type),
300 );
301 safeguard_ratio(content, &filtered)
302 }
303 TaskType::Refactor | TaskType::Review => {
304 let cleaned = lightweight_cleanup(content);
305 let filtered = super::task_relevance::information_bottleneck_filter_typed(
306 &cleaned,
307 &intent.keywords,
308 budget_ratio.max(0.5),
309 Some(intent.task_type),
310 );
311 safeguard_ratio(content, &filtered)
312 }
313 TaskType::Generate | TaskType::Test => {
314 let compressed = aggressive_compress(content, ext);
315 safeguard_ratio(content, &compressed)
316 }
317 TaskType::Explore | TaskType::Config | TaskType::Deploy => {
318 let cleaned = lightweight_cleanup(content);
319 safeguard_ratio(content, &cleaned)
320 }
321 }
322}
323
/// Ends the current run of repeated lines.
///
/// When the run counter exceeds 1 and both a previous line and a last output
/// line exist, the last entry of `lines` is replaced by `[Nx] <previous line>`.
/// The counter is always reset to 0 and `prev_line` to `None`.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    // Both `take` calls perform the unconditional reset while handing back the old values.
    let repeats = std::mem::take(count);
    let previous = prev_line.take();

    if repeats > 1 {
        if let (Some(text), Some(slot)) = (previous, lines.last_mut()) {
            *slot = format!("[{repeats}x] {text}");
        }
    }
}
336
/// Collapses every run of spaces and tabs in `line` into a single space.
///
/// Leading and trailing runs are collapsed too, not removed; all other
/// characters are copied unchanged.
fn normalize_whitespace(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    for ch in line.chars() {
        if !matches!(ch, ' ' | '\t') {
            out.push(ch);
        } else if !out.ends_with(' ') {
            // The output ends with a space only when a blank run is already
            // open, because spaces are emitted nowhere else.
            out.push(' ');
        }
    }
    out
}
353
354fn strip_timestamps_hashes(line: &str) -> String {
355 let ts_re =
356 static_regex!(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?");
357 let hash_re = static_regex!(r"\b[0-9a-f]{32,64}\b");
358
359 let s = ts_re.replace_all(line, "[TS]");
360 let s = hash_re.replace_all(&s, "[HASH]");
361 s.into_owned()
362}
363
/// Reports whether an already trimmed line is boilerplate that carries no
/// content: a licence/copyright notice, a "generated by" banner, or a
/// horizontal rule.
///
/// - Notices and banners are matched case-insensitively by prefix.
/// - A rule is a line of at least 4 characters that starts with one of
///   `=`, `-`, `*`, `─`, `━` and in which more than 80% of the characters
///   equal that first character.
///
/// The minimum length is measured in characters, not bytes, so the threshold
/// is the same for ASCII rules and for multi-byte box-drawing rules.
fn is_boilerplate_line(trimmed: &str) -> bool {
    const NOTICE_PREFIXES: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];

    let lower = trimmed.to_lowercase();
    if NOTICE_PREFIXES.iter().any(|prefix| lower.starts_with(prefix)) {
        return true;
    }

    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    if !matches!(first, '=' | '-' | '*' | '─' | '━') {
        return false;
    }

    let total = trimmed.chars().count();
    if total < 4 {
        return false;
    }
    let same = trimmed.chars().filter(|&c| c == first).count();
    same as f64 / total as f64 > 0.8
}
388
#[cfg(test)]
mod tests {
    use super::*;

    /// An inserted line is reported with a `+` marker and its text.
    #[test]
    fn test_diff_insertion() {
        let report = diff_content("line1\nline2\nline3", "line1\nline2\nnew_line\nline3");
        assert!(report.contains('+'), "should show additions");
        assert!(report.contains("new_line"));
    }

    /// A removed line is reported with a `-` marker and its text.
    #[test]
    fn test_diff_deletion() {
        let report = diff_content("line1\nline2\nline3", "line1\nline3");
        assert!(report.contains('-'), "should show deletions");
        assert!(report.contains("line2"));
    }

    /// Identical inputs produce the fixed "(no changes)" marker.
    #[test]
    fn test_diff_no_changes() {
        let same = "same\ncontent";
        assert_eq!(diff_content(same, same), "(no changes)");
    }

    /// In an input of more than 200 lines, a run of eight closers keeps two
    /// and collapses the other six.
    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        let input = (0..210)
            .map(|i| format!("line {i}"))
            .chain(std::iter::repeat("}".to_string()).take(8))
            .chain(std::iter::once("fn next() {}".to_string()))
            .collect::<Vec<String>>()
            .join("\n");

        let cleaned = lightweight_cleanup(&input);
        assert!(
            cleaned.contains("[6 brace-only lines collapsed]"),
            "should collapse long brace runs in large files"
        );
        assert!(cleaned.contains("fn next()"));
    }

    /// Four consecutive blank lines leave at most two newlines after `line1`.
    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let cleaned = lightweight_cleanup("line1\n\n\n\n\nline2");
        let after_first = cleaned.split("line1").nth(1).unwrap();
        let newline_count = after_first.matches('\n').count();
        assert!(newline_count <= 2, "should collapse multiple blank lines");
    }

    /// A tiny result for a small original is rejected in favour of the original.
    #[test]
    fn test_safeguard_ratio_prevents_over_compression_on_small_output() {
        let original = "a ".repeat(100);
        let chosen = safeguard_ratio(&original, "a");
        assert_eq!(
            chosen, original,
            "should return original when ratio < 0.05 and output is small"
        );
    }

    /// The same tiny ratio is accepted when the original is large.
    #[test]
    fn test_safeguard_ratio_allows_strong_compression_on_large_output() {
        let original = "line content here\n".repeat(1000);
        let summary = "summary: 1000 lines";
        let chosen = safeguard_ratio(&original, summary);
        assert_eq!(
            chosen, summary,
            "should allow strong compression for large outputs"
        );
    }

    /// `//` comments are removed while code lines survive.
    #[test]
    fn test_aggressive_strips_comments() {
        let compressed =
            aggressive_compress("fn main() {\n // a comment\n let x = 1;\n}", Some("rs"));
        assert!(!compressed.contains("// a comment"));
        assert!(compressed.contains("let x = 1"));
    }

    /// `#` comments are removed for the `py` extension.
    #[test]
    fn test_aggressive_python_comments() {
        let compressed = aggressive_compress("def main():\n # comment\n x = 1", Some("py"));
        assert!(!compressed.contains("# comment"));
        assert!(compressed.contains("x = 1"));
    }

    /// `///` doc comments are kept.
    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let compressed = aggressive_compress("/// Doc comment\nfn main() {}", Some("rs"));
        assert!(compressed.contains("/// Doc comment"));
    }

    /// A multi-line block comment is dropped through its closing line.
    #[test]
    fn test_aggressive_block_comment() {
        let compressed =
            aggressive_compress("/* start\n * middle\n */ end\nfn main() {}", Some("rs"));
        assert!(!compressed.contains("start"));
        assert!(!compressed.contains("middle"));
        assert!(compressed.contains("fn main()"));
    }

    /// Colour codes are stripped and the visible text is preserved.
    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let stripped = strip_ansi("\x1b[31mERROR\x1b[0m: something failed");
        assert_eq!(stripped, "ERROR: something failed");
        assert!(!stripped.contains('\x1b'));
    }

    /// Text without escapes is returned unchanged.
    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let clean = "clean text without escapes";
        assert_eq!(strip_ansi(clean), clean);
    }

    /// Escape-free text has zero density.
    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    /// Coloured text has a positive density.
    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        let colored = "\x1b[31mred\x1b[0m";
        assert!(ansi_density(colored) > 0.0);
    }
}