lean_ctx/core/
compressor.rs1use similar::{ChangeTag, TextDiff};
2
/// Expands to a reference to a [`regex::Regex`] that is compiled at most once.
///
/// Each expansion owns its own `OnceLock`, so every call site caches its own
/// compiled pattern. The pattern must be a literal because it is spliced into
/// the panic message with `concat!`; an invalid pattern is treated as a
/// programming error and panics on first use.
macro_rules! static_regex {
    ($pattern:expr) => {{
        use std::sync::OnceLock;

        static COMPILED: OnceLock<regex::Regex> = OnceLock::new();
        COMPILED.get_or_init(|| {
            regex::Regex::new($pattern).expect(concat!("BUG: invalid static regex: ", $pattern))
        })
    }};
}
11
/// Removes ANSI/VT escape sequences from `s` and returns the remaining text.
///
/// The following forms are recognised and dropped:
///
/// * CSI: `ESC [`, parameter/intermediate bytes (`0x20..=0x3F`), then one final
///   byte in `0x40..=0x7E`. The final byte need not be a letter (`ESC[2~`).
/// * OSC: `ESC ]` followed by a payload terminated by BEL or by ST (`ESC \`).
/// * nF: `ESC`, one or more intermediates (`0x20..=0x2F`), one final byte
///   (for example `ESC ( B`).
/// * Two-character escapes: `ESC` followed by one byte in `0x30..=0x7E`
///   (for example `ESC 7`, `ESC c`).
///
/// A malformed or truncated sequence never swallows unrelated text: scanning
/// stops at the first byte that cannot belong to the sequence. A lone `ESC`
/// is dropped on its own.
pub fn strip_ansi(s: &str) -> String {
    const ESC: char = '\x1b';
    const BEL: char = '\x07';

    // Fast path: nothing to strip, avoid the per-character scan.
    if !s.contains(ESC) {
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != ESC {
            result.push(c);
            continue;
        }

        match chars.peek().copied() {
            // CSI: parameters and intermediates, then exactly one final byte.
            Some('[') => {
                chars.next();
                while let Some(p) = chars.peek().copied() {
                    match p {
                        '\x20'..='\x3f' => {
                            chars.next();
                        }
                        '\x40'..='\x7e' => {
                            chars.next();
                            break;
                        }
                        // Not part of a CSI sequence: leave it for the output.
                        _ => break,
                    }
                }
            }
            // OSC: payload up to BEL. An ESC is left unconsumed so the outer
            // loop handles it, either as ST (`ESC \`) or as a new sequence.
            Some(']') => {
                chars.next();
                while let Some(p) = chars.peek().copied() {
                    if p == ESC {
                        break;
                    }
                    chars.next();
                    if p == BEL {
                        break;
                    }
                }
            }
            // nF: intermediates followed by a single final byte.
            Some('\x20'..='\x2f') => {
                while let Some('\x20'..='\x2f') = chars.peek().copied() {
                    chars.next();
                }
                if let Some('\x30'..='\x7e') = chars.peek().copied() {
                    chars.next();
                }
            }
            // Two-character escape (also covers the `\` of an ST terminator).
            Some('\x30'..='\x7e') => {
                chars.next();
            }
            // Lone ESC at end of input or before a non-escape character.
            _ => {}
        }
    }

    result
}
34
/// Returns the fraction of `s` (measured in bytes) made up of ESC (`0x1B`)
/// bytes, or `0.0` for an empty string.
///
/// Only the ESC byte that introduces each sequence is counted, not the whole
/// sequence.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            let escapes = s.bytes().filter(|b| *b == 0x1b).count();
            escapes as f64 / total_bytes as f64
        }
    }
}
43
44pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
46 if let Some(compacted) = crate::core::structured_compact::compact_structured(content, ext) {
50 return compacted;
51 }
52
53 let mut result: Vec<String> = Vec::new();
54 let is_python = matches!(ext, Some("py"));
55 let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
56 let is_sql = matches!(ext, Some("sql"));
57 let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
58
59 let mut in_block_comment = false;
60
61 for line in content.lines() {
62 let trimmed = line.trim();
63
64 if trimmed.is_empty() {
65 continue;
66 }
67
68 if in_block_comment {
69 if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
70 in_block_comment = false;
71 }
72 continue;
73 }
74
75 if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
76 if !(trimmed.contains("*/") || trimmed.contains("-->")) {
77 in_block_comment = true;
78 }
79 continue;
80 }
81
82 if trimmed.starts_with("//") && !trimmed.starts_with("///") {
83 continue;
84 }
85 if trimmed.starts_with('*') || trimmed.starts_with("*/") {
86 continue;
87 }
88 if is_python && trimmed.starts_with('#') {
89 continue;
90 }
91 if is_sql && trimmed.starts_with("--") {
92 continue;
93 }
94 if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
95 continue;
96 }
97 if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
98 continue;
99 }
100
101 if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
102 if let Some(last) = result.last() {
103 let last_trimmed = last.trim();
104 if matches!(last_trimmed, "}" | "};" | ");" | "});") {
105 if let Some(last_mut) = result.last_mut() {
106 last_mut.push_str(trimmed);
107 }
108 continue;
109 }
110 }
111 result.push(trimmed.to_string());
112 continue;
113 }
114
115 let normalized = normalize_indentation(line);
116 result.push(normalized);
117 }
118
119 result.join("\n")
120}
121
/// Conservative cleanup that keeps code lines verbatim.
///
/// * Runs of blank lines collapse to a single empty line.
/// * Lines consisting solely of `}`, `};`, `);`, `});` or `)` are emitted
///   trimmed. When the input has more than 200 lines and such a run is longer
///   than 5 lines, only the first two are kept, followed by a
///   `[N brace-only lines collapsed]` marker.
/// * All other lines are copied unchanged, indentation included.
pub fn lightweight_cleanup(content: &str) -> String {
    /// Writes the pending closing-token run to `out` (collapsed if eligible)
    /// and empties it.
    fn emit_closers(pending: &mut Vec<&str>, out: &mut Vec<String>, large_input: bool) {
        if large_input && pending.len() > 5 {
            out.extend(pending.iter().take(2).map(|s| s.to_string()));
            out.push(format!("[{} brace-only lines collapsed]", pending.len() - 2));
        } else {
            out.extend(pending.iter().map(|s| s.to_string()));
        }
        pending.clear();
    }

    let large_input = content.lines().count() > 200;

    let mut out: Vec<String> = Vec::new();
    let mut pending: Vec<&str> = Vec::new();
    let mut previous_blank = false;

    for raw in content.lines() {
        let text = raw.trim();

        if text.is_empty() {
            emit_closers(&mut pending, &mut out, large_input);
            // Only the first blank line of a run is kept.
            if !previous_blank {
                out.push(String::new());
            }
            previous_blank = true;
            continue;
        }
        previous_blank = false;

        match text {
            "}" | "};" | ");" | "});" | ")" => pending.push(text),
            _ => {
                emit_closers(&mut pending, &mut out, large_input);
                out.push(raw.to_string());
            }
        }
    }
    emit_closers(&mut pending, &mut out, large_input);

    out.join("\n")
}
170
171pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
175 let orig_tokens = super::tokens::count_tokens(original);
176 let comp_tokens = super::tokens::count_tokens(compressed);
177
178 if orig_tokens == 0 {
179 return compressed.to_string();
180 }
181
182 if comp_tokens > orig_tokens {
183 return original.to_string();
184 }
185
186 let ratio = comp_tokens as f64 / orig_tokens as f64;
187 if ratio < 0.05 && orig_tokens < 2000 {
188 original.to_string()
189 } else {
190 compressed.to_string()
191 }
192}
193
/// Shrinks the leading whitespace of `line` and re-emits it as spaces.
///
/// The indent width is the byte length of the leading whitespace. If the line
/// starts with a tab that width is kept as-is (one space per byte); otherwise
/// it is halved, rounding down.
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();
    let width = if line.starts_with('\t') {
        indent_bytes
    } else {
        indent_bytes / 2
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
201
202pub fn diff_content(old_content: &str, new_content: &str) -> String {
204 if old_content == new_content {
205 return "(no changes)".to_string();
206 }
207
208 let diff = TextDiff::from_lines(old_content, new_content);
209 let mut changes = Vec::new();
210 let mut additions = 0usize;
211 let mut deletions = 0usize;
212
213 for change in diff.iter_all_changes() {
214 let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
215 let text = change.value().trim_end_matches('\n');
216 match change.tag() {
217 ChangeTag::Insert => {
218 additions += 1;
219 if let Some(n) = line_no {
220 changes.push(format!("+{n}: {text}"));
221 }
222 }
223 ChangeTag::Delete => {
224 deletions += 1;
225 if let Some(n) = line_no {
226 changes.push(format!("-{n}: {text}"));
227 }
228 }
229 ChangeTag::Equal => {}
230 }
231 }
232
233 if changes.is_empty() {
234 return "(no changes)".to_string();
235 }
236
237 changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
238 changes.join("\n")
239}
240
241pub fn verbatim_compact(text: &str) -> String {
243 let mut lines: Vec<String> = Vec::new();
244 let mut blank_count = 0u32;
245 let mut prev_line: Option<String> = None;
246 let mut repeat_count = 0u32;
247
248 for line in text.lines() {
249 let trimmed = line.trim();
250
251 if trimmed.is_empty() {
252 blank_count += 1;
253 if blank_count <= 1 {
254 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
255 lines.push(String::new());
256 }
257 continue;
258 }
259 blank_count = 0;
260
261 if is_boilerplate_line(trimmed) {
262 continue;
263 }
264
265 let normalized = normalize_whitespace(trimmed);
266 let stripped = strip_timestamps_hashes(&normalized);
267
268 if let Some(ref prev) = prev_line {
269 if *prev == stripped {
270 repeat_count += 1;
271 continue;
272 }
273 }
274
275 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
276 prev_line = Some(stripped.clone());
277 repeat_count = 1;
278 lines.push(stripped);
279 }
280
281 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
282 lines.join("\n")
283}
284
285pub fn task_aware_compress(
287 content: &str,
288 ext: Option<&str>,
289 intent: &super::intent_engine::StructuredIntent,
290) -> String {
291 use super::intent_engine::{IntentScope, TaskType};
292
293 let budget_ratio = match intent.scope {
294 IntentScope::SingleFile => 0.7,
295 IntentScope::MultiFile => 0.5,
296 IntentScope::CrossModule => 0.35,
297 IntentScope::ProjectWide => 0.25,
298 };
299
300 match intent.task_type {
301 TaskType::FixBug | TaskType::Debug => {
302 let filtered = super::task_relevance::information_bottleneck_filter_typed(
303 content,
304 &intent.keywords,
305 budget_ratio,
306 Some(intent.task_type),
307 );
308 safeguard_ratio(content, &filtered)
309 }
310 TaskType::Refactor | TaskType::Review => {
311 let cleaned = lightweight_cleanup(content);
312 let filtered = super::task_relevance::information_bottleneck_filter_typed(
313 &cleaned,
314 &intent.keywords,
315 budget_ratio.max(0.5),
316 Some(intent.task_type),
317 );
318 safeguard_ratio(content, &filtered)
319 }
320 TaskType::Generate | TaskType::Test => {
321 let compressed = aggressive_compress(content, ext);
322 safeguard_ratio(content, &compressed)
323 }
324 TaskType::Explore | TaskType::Config | TaskType::Deploy => {
325 let cleaned = lightweight_cleanup(content);
326 safeguard_ratio(content, &cleaned)
327 }
328 }
329}
330
/// Ends the current run of repeated lines.
///
/// If the run contains more than one occurrence, the last entry of `lines` is
/// rewritten as `[Nx] <line>`. `count` is reset to `0` and `prev_line` to
/// `None` in every case. Nothing is rewritten when `lines` is empty or no
/// previous line is recorded.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    // Take both values up front so the state is always reset.
    let count = std::mem::replace(count, 0);
    let pending = prev_line.take();

    if count <= 1 {
        return;
    }
    if let (Some(prev), Some(slot)) = (pending, lines.last_mut()) {
        *slot = format!("[{count}x] {prev}");
    }
}
343
/// Collapses every run of spaces and tabs in `line` into a single space.
///
/// Leading and trailing runs are collapsed too, not removed. Other whitespace
/// characters are left untouched.
fn normalize_whitespace(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    for ch in line.chars() {
        match ch {
            ' ' | '\t' => {
                // `out` ends with a space exactly when the previous input
                // character was a space or tab.
                if !out.ends_with(' ') {
                    out.push(' ');
                }
            }
            other => out.push(other),
        }
    }
    out
}
360
361fn strip_timestamps_hashes(line: &str) -> String {
362 let ts_re =
363 static_regex!(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?");
364 let hash_re = static_regex!(r"\b[0-9a-f]{32,64}\b");
365
366 let s = ts_re.replace_all(line, "[TS]");
367 let s = hash_re.replace_all(&s, "[HASH]");
368 s.into_owned()
369}
370
/// Reports whether an already-trimmed line is boilerplate that carries no
/// information: a licence/copyright notice, a "generated by" banner, or a
/// horizontal rule.
///
/// * Notices and banners are matched case-insensitively by prefix.
/// * A rule is a line of at least four characters that starts with `=`, `-`,
///   `*`, `─` or `━` and in which more than 80% of the characters equal that
///   first character.
///
/// The minimum length is measured in characters, not bytes, so multi-byte
/// rule characters follow the same threshold as ASCII ones (`━━━` is no more
/// a rule than `---`).
fn is_boilerplate_line(trimmed: &str) -> bool {
    const NOTICE_PREFIXES: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];

    let lower = trimmed.to_lowercase();
    if NOTICE_PREFIXES.iter().any(|prefix| lower.starts_with(prefix)) {
        return true;
    }

    let mut chars = trimmed.chars();
    let first = match chars.next() {
        Some(c) => c,
        None => return false,
    };
    if !matches!(first, '=' | '-' | '*' | '─' | '━') {
        return false;
    }

    // Count total characters and those equal to the leading rule character in
    // a single pass; the leading character itself is already counted.
    let (mut total, mut same) = (1usize, 1usize);
    for c in chars {
        total += 1;
        if c == first {
            same += 1;
        }
    }

    total >= 4 && same as f64 / total as f64 > 0.8
}
395
#[cfg(test)]
mod tests {
    use super::*;

    /// An inserted line is reported with a `+` marker and its text.
    #[test]
    fn test_diff_insertion() {
        let before = "line1\nline2\nline3";
        let after = "line1\nline2\nnew_line\nline3";
        let report = diff_content(before, after);
        assert!(report.contains('+'), "should show additions");
        assert!(report.contains("new_line"));
    }

    /// A removed line is reported with a `-` marker and its text.
    #[test]
    fn test_diff_deletion() {
        let before = "line1\nline2\nline3";
        let after = "line1\nline3";
        let report = diff_content(before, after);
        assert!(report.contains('-'), "should show deletions");
        assert!(report.contains("line2"));
    }

    /// Identical inputs short-circuit to the fixed marker string.
    #[test]
    fn test_diff_no_changes() {
        let text = "same\ncontent";
        assert_eq!(diff_content(text, text), "(no changes)");
    }

    /// In inputs over 200 lines, a run of 8 closers keeps 2 and collapses 6.
    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        let mut source: Vec<String> = (0..210).map(|i| format!("line {i}")).collect();
        source.extend(std::iter::repeat("}").take(8).map(String::from));
        source.push("fn next() {}".to_string());

        let cleaned = lightweight_cleanup(&source.join("\n"));
        assert!(
            cleaned.contains("[6 brace-only lines collapsed]"),
            "should collapse long brace runs in large files"
        );
        assert!(cleaned.contains("fn next()"));
    }

    /// Several consecutive blank lines shrink to at most one.
    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let cleaned = lightweight_cleanup("line1\n\n\n\n\nline2");
        let after_first = cleaned.split("line1").nth(1).unwrap();
        let newline_count = after_first.matches('\n').count();
        assert!(newline_count <= 2, "should collapse multiple blank lines");
    }

    /// A small original squeezed below 5% of its tokens is restored.
    #[test]
    fn test_safeguard_ratio_prevents_over_compression_on_small_output() {
        let original = "a ".repeat(100);
        let too_compressed = "a";
        let chosen = safeguard_ratio(&original, too_compressed);
        assert_eq!(
            chosen, original,
            "should return original when ratio < 0.05 and output is small"
        );
    }

    /// A large original may be compressed very strongly.
    #[test]
    fn test_safeguard_ratio_allows_strong_compression_on_large_output() {
        let original = "line content here\n".repeat(1000);
        let compressed = "summary: 1000 lines";
        let chosen = safeguard_ratio(&original, compressed);
        assert_eq!(
            chosen, compressed,
            "should allow strong compression for large outputs"
        );
    }

    /// `//` line comments are removed while code is kept.
    #[test]
    fn test_aggressive_strips_comments() {
        let source = "fn main() {\n    // a comment\n    let x = 1;\n}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(!compressed.contains("// a comment"));
        assert!(compressed.contains("let x = 1"));
    }

    /// `#` comments are removed for Python sources.
    #[test]
    fn test_aggressive_python_comments() {
        let source = "def main():\n    # comment\n    x = 1";
        let compressed = aggressive_compress(source, Some("py"));
        assert!(!compressed.contains("# comment"));
        assert!(compressed.contains("x = 1"));
    }

    /// `///` doc comments survive aggressive compression.
    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let source = "/// Doc comment\nfn main() {}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(compressed.contains("/// Doc comment"));
    }

    /// Every line of a multi-line `/* ... */` comment is dropped.
    #[test]
    fn test_aggressive_block_comment() {
        let source = "/* start\n * middle\n */ end\nfn main() {}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(!compressed.contains("start"));
        assert!(!compressed.contains("middle"));
        assert!(compressed.contains("fn main()"));
    }

    /// Colour codes are stripped and no ESC byte remains.
    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let colored = "\x1b[31mERROR\x1b[0m: something failed";
        let plain = strip_ansi(colored);
        assert_eq!(plain, "ERROR: something failed");
        assert!(!plain.contains('\x1b'));
    }

    /// Text without escapes is returned unchanged.
    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let clean = "clean text without escapes";
        assert_eq!(strip_ansi(clean), clean);
    }

    /// Density is exactly zero when no ESC byte is present.
    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    /// Density is positive when escapes are present.
    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        let colored = "\x1b[31mred\x1b[0m";
        assert!(ansi_density(colored) > 0.0);
    }
}