lean_ctx/core/
compressor.rs1use similar::{ChangeTag, TextDiff};
2
/// Removes ANSI/VT escape sequences from `s` and returns the remaining text.
///
/// Three sequence shapes are recognised, each introduced by ESC (`0x1b`):
///
/// * **CSI** (`ESC [` ...): skipped up to and including the final byte, which
///   is any character in `0x40..=0x7E` (so `ESC[2~` ends at `~`, not at the
///   next letter).
/// * **OSC** (`ESC ]` ...): skipped up to and including a BEL (`0x07`), or up
///   to a following ESC (the `ESC \` string terminator is then consumed as a
///   plain escape).
/// * **Plain escapes** (`ESC 7`, `ESC ( B`, ...): any intermediate bytes in
///   `0x20..=0x2F` followed by one final byte in `0x30..=0x7E`.
///
/// Input without any ESC character is returned unchanged.
pub fn strip_ansi(s: &str) -> String {
    // Fast path: nothing to strip.
    if !s.contains('\x1b') {
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\x1b' {
            result.push(c);
            continue;
        }

        match chars.peek().copied() {
            Some('[') => {
                chars.next();
                // Parameter and intermediate bytes are dropped; the sequence
                // ends at the first final byte.
                for p in chars.by_ref() {
                    if matches!(p, '\x40'..='\x7e') {
                        break;
                    }
                }
            }
            Some(']') => {
                chars.next();
                while let Some(&p) = chars.peek() {
                    // Leave ESC in place so the outer loop consumes `ESC \`
                    // (or whatever escape follows) as its own sequence.
                    if p == '\x1b' {
                        break;
                    }
                    chars.next();
                    if p == '\x07' {
                        break;
                    }
                }
            }
            Some(_) => {
                while let Some(&p) = chars.peek() {
                    if !matches!(p, '\x20'..='\x2f') {
                        break;
                    }
                    chars.next();
                }
                // Only swallow a genuine final byte; anything else (newline,
                // another ESC, ...) is left for normal processing.
                if let Some(&p) = chars.peek() {
                    if matches!(p, '\x30'..='\x7e') {
                        chars.next();
                    }
                }
            }
            // A trailing lone ESC is simply dropped.
            None => {}
        }
    }

    result
}
24
/// Returns the number of ESC (`0x1b`) characters in `s` divided by the byte
/// length of `s`.
///
/// An empty string yields `0.0`.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            // ESC is a single-byte code point, so scanning bytes finds exactly
            // the same occurrences as scanning chars.
            let escapes = s.bytes().filter(|byte| *byte == 0x1b).count();
            escapes as f64 / total_bytes as f64
        }
    }
}
32
33pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
34 let mut result: Vec<String> = Vec::new();
35 let is_python = matches!(ext, Some("py"));
36 let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
37 let is_sql = matches!(ext, Some("sql"));
38 let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
39
40 let mut in_block_comment = false;
41
42 for line in content.lines() {
43 let trimmed = line.trim();
44
45 if trimmed.is_empty() {
46 continue;
47 }
48
49 if in_block_comment {
50 if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
51 in_block_comment = false;
52 }
53 continue;
54 }
55
56 if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
57 if !(trimmed.contains("*/") || trimmed.contains("-->")) {
58 in_block_comment = true;
59 }
60 continue;
61 }
62
63 if trimmed.starts_with("//") && !trimmed.starts_with("///") {
64 continue;
65 }
66 if trimmed.starts_with('*') || trimmed.starts_with("*/") {
67 continue;
68 }
69 if is_python && trimmed.starts_with('#') {
70 continue;
71 }
72 if is_sql && trimmed.starts_with("--") {
73 continue;
74 }
75 if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
76 continue;
77 }
78 if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
79 continue;
80 }
81
82 if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
83 if let Some(last) = result.last() {
84 let last_trimmed = last.trim();
85 if matches!(last_trimmed, "}" | "};" | ");" | "});") {
86 if let Some(last_mut) = result.last_mut() {
87 last_mut.push_str(trimmed);
88 }
89 continue;
90 }
91 }
92 result.push(trimmed.to_string());
93 continue;
94 }
95
96 let normalized = normalize_indentation(line);
97 result.push(normalized);
98 }
99
100 result.join("\n")
101}
102
/// Conservative cleanup: collapses runs of blank lines to a single empty line
/// and, for inputs longer than 200 lines, shortens long runs of
/// closing-delimiter lines.
///
/// A "closer" line is one whose trimmed text is exactly `}`, `};`, `);`,
/// `});` or `)`. Consecutive closers are buffered (in trimmed form) and
/// emitted when the run ends: verbatim when the input has at most 200 lines
/// or the run has at most 5 entries, otherwise as the first two closers
/// followed by a `[N brace-only lines collapsed]` marker. All other lines are
/// passed through untouched.
pub fn lightweight_cleanup(content: &str) -> String {
    /// Moves the buffered closers into `out` and empties the buffer.
    fn drain_closers(pending: &mut Vec<&str>, out: &mut Vec<String>, large_input: bool) {
        if large_input && pending.len() > 5 {
            out.push(pending[0].to_string());
            out.push(pending[1].to_string());
            out.push(format!("[{} brace-only lines collapsed]", pending.len() - 2));
        } else {
            out.extend(pending.iter().map(|closer| closer.to_string()));
        }
        pending.clear();
    }

    let large_input = content.lines().count() > 200;

    let mut out: Vec<String> = Vec::new();
    let mut pending: Vec<&str> = Vec::new();
    let mut previous_blank = false;

    for raw in content.lines() {
        let stripped = raw.trim();

        if stripped.is_empty() {
            drain_closers(&mut pending, &mut out, large_input);
            // Only the first blank line of a run is kept.
            if !previous_blank {
                out.push(String::new());
            }
            previous_blank = true;
        } else if matches!(stripped, "}" | "};" | ");" | "});" | ")") {
            previous_blank = false;
            pending.push(stripped);
        } else {
            previous_blank = false;
            drain_closers(&mut pending, &mut out, large_input);
            out.push(raw.to_string());
        }
    }
    drain_closers(&mut pending, &mut out, large_input);

    out.join("\n")
}
151
152pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
155 let orig_tokens = super::tokens::count_tokens(original);
156 let comp_tokens = super::tokens::count_tokens(compressed);
157
158 if orig_tokens == 0 {
159 return compressed.to_string();
160 }
161
162 let ratio = comp_tokens as f64 / orig_tokens as f64;
163 if ratio < 0.15 || comp_tokens > orig_tokens {
164 original.to_string()
165 } else {
166 compressed.to_string()
167 }
168}
169
/// Rewrites the leading whitespace of `line` as spaces.
///
/// The indent width is measured in bytes of leading whitespace. If the line
/// starts with a tab the output keeps that many spaces; otherwise the width
/// is halved (integer division).
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();

    let width = match line.as_bytes().first() {
        Some(b'\t') => indent_bytes,
        _ => indent_bytes / 2,
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
177
178pub fn diff_content(old_content: &str, new_content: &str) -> String {
179 if old_content == new_content {
180 return "(no changes)".to_string();
181 }
182
183 let diff = TextDiff::from_lines(old_content, new_content);
184 let mut changes = Vec::new();
185 let mut additions = 0usize;
186 let mut deletions = 0usize;
187
188 for change in diff.iter_all_changes() {
189 let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
190 let text = change.value().trim_end_matches('\n');
191 match change.tag() {
192 ChangeTag::Insert => {
193 additions += 1;
194 if let Some(n) = line_no {
195 changes.push(format!("+{n}: {text}"));
196 }
197 }
198 ChangeTag::Delete => {
199 deletions += 1;
200 if let Some(n) = line_no {
201 changes.push(format!("-{n}: {text}"));
202 }
203 }
204 ChangeTag::Equal => {}
205 }
206 }
207
208 if changes.is_empty() {
209 return "(no changes)".to_string();
210 }
211
212 changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
213 changes.join("\n")
214}
215
216pub fn verbatim_compact(text: &str) -> String {
217 let mut lines: Vec<String> = Vec::new();
218 let mut blank_count = 0u32;
219 let mut prev_line: Option<String> = None;
220 let mut repeat_count = 0u32;
221
222 for line in text.lines() {
223 let trimmed = line.trim();
224
225 if trimmed.is_empty() {
226 blank_count += 1;
227 if blank_count <= 1 {
228 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
229 lines.push(String::new());
230 }
231 continue;
232 }
233 blank_count = 0;
234
235 if is_boilerplate_line(trimmed) {
236 continue;
237 }
238
239 let normalized = normalize_whitespace(trimmed);
240 let stripped = strip_timestamps_hashes(&normalized);
241
242 if let Some(ref prev) = prev_line {
243 if *prev == stripped {
244 repeat_count += 1;
245 continue;
246 }
247 }
248
249 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
250 prev_line = Some(stripped.clone());
251 repeat_count = 1;
252 lines.push(stripped);
253 }
254
255 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
256 lines.join("\n")
257}
258
259pub fn task_aware_compress(
260 content: &str,
261 ext: Option<&str>,
262 intent: &super::intent_engine::StructuredIntent,
263) -> String {
264 use super::intent_engine::{IntentScope, TaskType};
265
266 let budget_ratio = match intent.scope {
267 IntentScope::SingleFile => 0.7,
268 IntentScope::MultiFile => 0.5,
269 IntentScope::CrossModule => 0.35,
270 IntentScope::ProjectWide => 0.25,
271 };
272
273 match intent.task_type {
274 TaskType::FixBug | TaskType::Debug => {
275 let filtered = super::task_relevance::information_bottleneck_filter_typed(
276 content,
277 &intent.keywords,
278 budget_ratio,
279 Some(intent.task_type),
280 );
281 safeguard_ratio(content, &filtered)
282 }
283 TaskType::Refactor | TaskType::Review => {
284 let cleaned = lightweight_cleanup(content);
285 let filtered = super::task_relevance::information_bottleneck_filter_typed(
286 &cleaned,
287 &intent.keywords,
288 budget_ratio.max(0.5),
289 Some(intent.task_type),
290 );
291 safeguard_ratio(content, &filtered)
292 }
293 TaskType::Generate | TaskType::Test => {
294 let compressed = aggressive_compress(content, ext);
295 safeguard_ratio(content, &compressed)
296 }
297 TaskType::Explore => {
298 let cleaned = lightweight_cleanup(content);
299 safeguard_ratio(content, &cleaned)
300 }
301 TaskType::Config | TaskType::Deploy => {
302 let cleaned = lightweight_cleanup(content);
303 safeguard_ratio(content, &cleaned)
304 }
305 }
306}
307
/// Finalises the current run of repeated lines and resets the run state.
///
/// When `count` is greater than 1 and both a previous line and a last entry
/// in `lines` exist, that last entry is replaced by `[{count}x] {prev_line}`.
/// In all cases `count` is reset to 0 and `prev_line` to `None`.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    // Take both values up front so the state is reset on every path.
    let repeats = std::mem::take(count);
    let previous = prev_line.take();

    if repeats <= 1 {
        return;
    }
    if let (Some(text), Some(slot)) = (previous, lines.last_mut()) {
        *slot = format!("[{}x] {}", repeats, text);
    }
}
320
/// Collapses every run of spaces and tabs in `line` into a single space.
///
/// Leading and trailing runs are collapsed too (not removed); all other
/// characters are copied unchanged.
fn normalize_whitespace(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    for ch in line.chars() {
        match ch {
            ' ' | '\t' => {
                // A space is only ever appended for a space/tab, so a trailing
                // space in `out` means we are already inside a run.
                if !out.ends_with(' ') {
                    out.push(' ');
                }
            }
            other => out.push(other),
        }
    }
    out
}
337
338fn strip_timestamps_hashes(line: &str) -> String {
339 use regex::Regex;
340 use std::sync::OnceLock;
341
342 static TS_RE: OnceLock<Regex> = OnceLock::new();
343 static HASH_RE: OnceLock<Regex> = OnceLock::new();
344
345 let ts_re = TS_RE.get_or_init(|| {
346 Regex::new(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?")
347 .unwrap()
348 });
349 let hash_re = HASH_RE.get_or_init(|| Regex::new(r"\b[0-9a-f]{32,64}\b").unwrap());
350
351 let s = ts_re.replace_all(line, "[TS]");
352 let s = hash_re.replace_all(&s, "[HASH]");
353 s.into_owned()
354}
355
/// Reports whether an already-trimmed line is boilerplate that can be dropped.
///
/// A line counts as boilerplate when either:
///
/// * its lowercased text starts with a licence/generator notice prefix
///   (`copyright`, `licensed under`, `license:`, `all rights reserved`,
///   `generated by`, `auto-generated`), or
/// * it is a separator rule: at least 4 characters long, starting with one of
///   `=`, `-`, `*`, `─`, `━`, with more than 80% of its characters equal to
///   that first character.
///
/// The length threshold is measured in characters, not bytes, so multi-byte
/// rule characters are held to the same minimum as ASCII ones.
fn is_boilerplate_line(trimmed: &str) -> bool {
    const NOTICE_PREFIXES: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];

    let lower = trimmed.to_lowercase();
    if NOTICE_PREFIXES
        .iter()
        .any(|prefix| lower.starts_with(*prefix))
    {
        return true;
    }

    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    if !matches!(first, '=' | '-' | '*' | '─' | '━') {
        return false;
    }

    let total = trimmed.chars().count();
    if total < 4 {
        return false;
    }

    let same = trimmed.chars().filter(|&c| c == first).count();
    same as f64 / total as f64 > 0.8
}
380
#[cfg(test)]
mod tests {
    use super::*;

    // ---- diff_content ----

    #[test]
    fn test_diff_insertion() {
        let rendered = diff_content("line1\nline2\nline3", "line1\nline2\nnew_line\nline3");
        assert!(rendered.contains("+"), "should show additions");
        assert!(rendered.contains("new_line"));
    }

    #[test]
    fn test_diff_deletion() {
        let rendered = diff_content("line1\nline2\nline3", "line1\nline3");
        assert!(rendered.contains("-"), "should show deletions");
        assert!(rendered.contains("line2"));
    }

    #[test]
    fn test_diff_no_changes() {
        let text = "same\ncontent";
        assert_eq!(diff_content(text, text), "(no changes)");
    }

    // ---- lightweight_cleanup ----

    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        // 210 ordinary lines make the input "large"; 8 closers exceed the
        // run limit, so all but the first two must be collapsed.
        let input = (0..210)
            .map(|i| format!("line {i}"))
            .chain(std::iter::repeat("}".to_string()).take(8))
            .chain(std::iter::once("fn next() {}".to_string()))
            .collect::<Vec<String>>()
            .join("\n");

        let cleaned = lightweight_cleanup(&input);
        assert!(
            cleaned.contains("[6 brace-only lines collapsed]"),
            "should collapse long brace runs in large files"
        );
        assert!(cleaned.contains("fn next()"));
    }

    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let cleaned = lightweight_cleanup("line1\n\n\n\n\nline2");
        let after_first = cleaned.split("line1").nth(1).unwrap();
        let newline_count = after_first.matches('\n').count();
        assert!(newline_count <= 2, "should collapse multiple blank lines");
    }

    // ---- safeguard_ratio ----

    #[test]
    fn test_safeguard_ratio_prevents_over_compression() {
        let original = "a ".repeat(100);
        let kept = safeguard_ratio(&original, "a");
        assert_eq!(kept, original, "should return original when ratio < 0.15");
    }

    // ---- aggressive_compress ----

    #[test]
    fn test_aggressive_strips_comments() {
        let compressed =
            aggressive_compress("fn main() {\n // a comment\n let x = 1;\n}", Some("rs"));
        assert!(!compressed.contains("// a comment"));
        assert!(compressed.contains("let x = 1"));
    }

    #[test]
    fn test_aggressive_python_comments() {
        let compressed = aggressive_compress("def main():\n # comment\n x = 1", Some("py"));
        assert!(!compressed.contains("# comment"));
        assert!(compressed.contains("x = 1"));
    }

    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let compressed = aggressive_compress("/// Doc comment\nfn main() {}", Some("rs"));
        assert!(compressed.contains("/// Doc comment"));
    }

    #[test]
    fn test_aggressive_block_comment() {
        let compressed =
            aggressive_compress("/* start\n * middle\n */ end\nfn main() {}", Some("rs"));
        assert!(!compressed.contains("start"));
        assert!(!compressed.contains("middle"));
        assert!(compressed.contains("fn main()"));
    }

    // ---- strip_ansi / ansi_density ----

    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let plain = strip_ansi("\x1b[31mERROR\x1b[0m: something failed");
        assert_eq!(plain, "ERROR: something failed");
        assert!(!plain.contains('\x1b'));
    }

    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let text = "clean text without escapes";
        assert_eq!(strip_ansi(text), text);
    }

    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        assert!(ansi_density("\x1b[31mred\x1b[0m") > 0.0);
    }
}