lean_ctx/core/
compressor.rs1use similar::{ChangeTag, TextDiff};
2
/// Expands to a `&'static regex::Regex` built from `$pattern`.
///
/// Every expansion site gets its own `OnceLock`, so the pattern is compiled
/// at most once per site and reused afterwards. The pattern has to be a
/// string literal because it is also spliced into the panic message through
/// `concat!`.
///
/// # Panics
///
/// Panics on first use if `$pattern` is not a valid regular expression.
macro_rules! static_regex {
    ($pattern:expr) => {{
        use std::sync::OnceLock;

        static COMPILED: OnceLock<regex::Regex> = OnceLock::new();
        COMPILED.get_or_init(|| {
            let built = regex::Regex::new($pattern);
            built.expect(concat!("BUG: invalid static regex: ", $pattern))
        })
    }};
}
11
/// Removes ANSI/ECMA-48 escape sequences from `s` and returns the visible text.
///
/// Recognised forms:
/// - CSI sequences (`ESC [` parameters, then a final byte in `@`..=`~`), e.g.
///   colours and cursor movement;
/// - command strings (OSC `ESC ]`, DCS `ESC P`, SOS `ESC X`, PM `ESC ^`,
///   APC `ESC _`), terminated by BEL or by ST (`ESC \`), e.g. window titles
///   and hyperlinks;
/// - plain escapes: `ESC`, optional intermediate bytes (0x20..=0x2F), then a
///   single final byte (0x30..=0x7E), e.g. `ESC ( B` or `ESC 7`.
///
/// A character that cannot belong to the sequence being parsed aborts that
/// sequence and is kept as ordinary text. An unterminated command string
/// consumes the remainder of the input. Input without any `ESC` byte is
/// returned unchanged.
pub fn strip_ansi(s: &str) -> String {
    /// Parser position relative to the escape sequence being skipped.
    enum State {
        /// Ordinary visible text.
        Text,
        /// Just saw `ESC`.
        Escape,
        /// Inside a plain escape, after at least one intermediate byte.
        Intermediate,
        /// Inside a CSI sequence (`ESC [`).
        Csi,
        /// Inside an OSC/DCS/SOS/PM/APC command string.
        Command,
        /// Saw `ESC` inside a command string; `\` would complete ST.
        CommandEscape,
    }

    /// Transition taken for the character that directly follows `ESC`.
    fn after_escape(c: char, out: &mut String) -> State {
        match c {
            '[' => State::Csi,
            ']' | 'P' | 'X' | '^' | '_' => State::Command,
            '\x20'..='\x2f' => State::Intermediate,
            '\x30'..='\x7e' => State::Text,
            '\x1b' => State::Escape,
            _ => {
                // Not a valid escape continuation: drop the stray ESC only.
                out.push(c);
                State::Text
            }
        }
    }

    if !s.contains('\x1b') {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len());
    let mut state = State::Text;

    for c in s.chars() {
        state = match state {
            State::Text => {
                if c == '\x1b' {
                    State::Escape
                } else {
                    out.push(c);
                    State::Text
                }
            }
            State::Escape => after_escape(c, &mut out),
            State::Intermediate => match c {
                '\x20'..='\x2f' => State::Intermediate,
                '\x30'..='\x7e' => State::Text,
                '\x1b' => State::Escape,
                _ => {
                    out.push(c);
                    State::Text
                }
            },
            State::Csi => match c {
                // Final byte ends the sequence.
                '\x40'..='\x7e' => State::Text,
                // Parameter and intermediate bytes.
                '\x20'..='\x3f' => State::Csi,
                '\x1b' => State::Escape,
                _ => {
                    out.push(c);
                    State::Text
                }
            },
            State::Command => match c {
                '\x07' => State::Text,
                '\x1b' => State::CommandEscape,
                _ => State::Command,
            },
            State::CommandEscape => {
                if c == '\\' {
                    State::Text
                } else {
                    // The ESC started a new sequence rather than ST.
                    after_escape(c, &mut out)
                }
            }
        };
    }

    out
}
34
/// Returns the number of `ESC` (0x1B) bytes in `s` divided by the byte length
/// of `s`.
///
/// An empty string yields `0.0`.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            // ESC is ASCII, so it can never appear inside a multi-byte
            // UTF-8 sequence; counting bytes equals counting ESC chars.
            let escapes = s.bytes().filter(|b| *b == 0x1b).count();
            escapes as f64 / total_bytes as f64
        }
    }
}
43
44pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
46 let mut result: Vec<String> = Vec::new();
47 let is_python = matches!(ext, Some("py"));
48 let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
49 let is_sql = matches!(ext, Some("sql"));
50 let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
51
52 let mut in_block_comment = false;
53
54 for line in content.lines() {
55 let trimmed = line.trim();
56
57 if trimmed.is_empty() {
58 continue;
59 }
60
61 if in_block_comment {
62 if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
63 in_block_comment = false;
64 }
65 continue;
66 }
67
68 if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
69 if !(trimmed.contains("*/") || trimmed.contains("-->")) {
70 in_block_comment = true;
71 }
72 continue;
73 }
74
75 if trimmed.starts_with("//") && !trimmed.starts_with("///") {
76 continue;
77 }
78 if trimmed.starts_with('*') || trimmed.starts_with("*/") {
79 continue;
80 }
81 if is_python && trimmed.starts_with('#') {
82 continue;
83 }
84 if is_sql && trimmed.starts_with("--") {
85 continue;
86 }
87 if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
88 continue;
89 }
90 if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
91 continue;
92 }
93
94 if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
95 if let Some(last) = result.last() {
96 let last_trimmed = last.trim();
97 if matches!(last_trimmed, "}" | "};" | ");" | "});") {
98 if let Some(last_mut) = result.last_mut() {
99 last_mut.push_str(trimmed);
100 }
101 continue;
102 }
103 }
104 result.push(trimmed.to_string());
105 continue;
106 }
107
108 let normalized = normalize_indentation(line);
109 result.push(normalized);
110 }
111
112 result.join("\n")
113}
114
/// Light-touch cleanup: collapses blank-line runs and long runs of
/// closing-delimiter lines.
///
/// - Consecutive blank (whitespace-only) lines are reduced to a single empty
///   line.
/// - Lines whose trimmed text is `}`, `};`, `);`, `});` or `)` are buffered
///   and emitted trimmed. When the input has more than 200 lines and a run
///   holds more than 5 such lines, only the first two are emitted, followed
///   by a `[N brace-only lines collapsed]` marker.
/// - Every other line is emitted unchanged.
///
/// The result is joined with `\n`.
pub fn lightweight_cleanup(content: &str) -> String {
    /// Moves the buffered closer lines into `out`, collapsing long runs when
    /// the input is large.
    fn emit_closers(pending: &mut Vec<&str>, out: &mut Vec<String>, large_input: bool) {
        if large_input && pending.len() > 5 {
            let hidden = pending.len() - 2;
            out.extend(pending.iter().take(2).map(|closer| (*closer).to_string()));
            out.push(format!("[{hidden} brace-only lines collapsed]"));
            pending.clear();
        } else {
            out.extend(pending.drain(..).map(str::to_string));
        }
    }

    let large_input = content.lines().count() > 200;

    let mut out: Vec<String> = Vec::new();
    let mut pending: Vec<&str> = Vec::new();
    let mut previous_blank = false;

    for raw in content.lines() {
        let body = raw.trim();

        if body.is_empty() {
            emit_closers(&mut pending, &mut out, large_input);
            // Only the first blank line of a run is kept.
            if !previous_blank {
                out.push(String::new());
            }
            previous_blank = true;
            continue;
        }
        previous_blank = false;

        match body {
            "}" | "};" | ");" | "});" | ")" => pending.push(body),
            _ => {
                emit_closers(&mut pending, &mut out, large_input);
                out.push(raw.to_string());
            }
        }
    }
    emit_closers(&mut pending, &mut out, large_input);

    out.join("\n")
}
163
164pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
167 let orig_tokens = super::tokens::count_tokens(original);
168 let comp_tokens = super::tokens::count_tokens(compressed);
169
170 if orig_tokens == 0 {
171 return compressed.to_string();
172 }
173
174 let ratio = comp_tokens as f64 / orig_tokens as f64;
175 if ratio < 0.15 || comp_tokens > orig_tokens {
176 original.to_string()
177 } else {
178 compressed.to_string()
179 }
180}
181
/// Re-indents `line` with spaces only.
///
/// The leading whitespace is measured in bytes. If the line starts with a
/// tab, that many spaces are emitted; otherwise half as many (rounded down).
/// The rest of the line is kept as is.
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();
    let width = if line.starts_with('\t') {
        indent_bytes
    } else {
        indent_bytes / 2
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
189
190pub fn diff_content(old_content: &str, new_content: &str) -> String {
192 if old_content == new_content {
193 return "(no changes)".to_string();
194 }
195
196 let diff = TextDiff::from_lines(old_content, new_content);
197 let mut changes = Vec::new();
198 let mut additions = 0usize;
199 let mut deletions = 0usize;
200
201 for change in diff.iter_all_changes() {
202 let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
203 let text = change.value().trim_end_matches('\n');
204 match change.tag() {
205 ChangeTag::Insert => {
206 additions += 1;
207 if let Some(n) = line_no {
208 changes.push(format!("+{n}: {text}"));
209 }
210 }
211 ChangeTag::Delete => {
212 deletions += 1;
213 if let Some(n) = line_no {
214 changes.push(format!("-{n}: {text}"));
215 }
216 }
217 ChangeTag::Equal => {}
218 }
219 }
220
221 if changes.is_empty() {
222 return "(no changes)".to_string();
223 }
224
225 changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
226 changes.join("\n")
227}
228
229pub fn verbatim_compact(text: &str) -> String {
231 let mut lines: Vec<String> = Vec::new();
232 let mut blank_count = 0u32;
233 let mut prev_line: Option<String> = None;
234 let mut repeat_count = 0u32;
235
236 for line in text.lines() {
237 let trimmed = line.trim();
238
239 if trimmed.is_empty() {
240 blank_count += 1;
241 if blank_count <= 1 {
242 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
243 lines.push(String::new());
244 }
245 continue;
246 }
247 blank_count = 0;
248
249 if is_boilerplate_line(trimmed) {
250 continue;
251 }
252
253 let normalized = normalize_whitespace(trimmed);
254 let stripped = strip_timestamps_hashes(&normalized);
255
256 if let Some(ref prev) = prev_line {
257 if *prev == stripped {
258 repeat_count += 1;
259 continue;
260 }
261 }
262
263 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
264 prev_line = Some(stripped.clone());
265 repeat_count = 1;
266 lines.push(stripped);
267 }
268
269 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
270 lines.join("\n")
271}
272
273pub fn task_aware_compress(
275 content: &str,
276 ext: Option<&str>,
277 intent: &super::intent_engine::StructuredIntent,
278) -> String {
279 use super::intent_engine::{IntentScope, TaskType};
280
281 let budget_ratio = match intent.scope {
282 IntentScope::SingleFile => 0.7,
283 IntentScope::MultiFile => 0.5,
284 IntentScope::CrossModule => 0.35,
285 IntentScope::ProjectWide => 0.25,
286 };
287
288 match intent.task_type {
289 TaskType::FixBug | TaskType::Debug => {
290 let filtered = super::task_relevance::information_bottleneck_filter_typed(
291 content,
292 &intent.keywords,
293 budget_ratio,
294 Some(intent.task_type),
295 );
296 safeguard_ratio(content, &filtered)
297 }
298 TaskType::Refactor | TaskType::Review => {
299 let cleaned = lightweight_cleanup(content);
300 let filtered = super::task_relevance::information_bottleneck_filter_typed(
301 &cleaned,
302 &intent.keywords,
303 budget_ratio.max(0.5),
304 Some(intent.task_type),
305 );
306 safeguard_ratio(content, &filtered)
307 }
308 TaskType::Generate | TaskType::Test => {
309 let compressed = aggressive_compress(content, ext);
310 safeguard_ratio(content, &compressed)
311 }
312 TaskType::Explore | TaskType::Config | TaskType::Deploy => {
313 let cleaned = lightweight_cleanup(content);
314 safeguard_ratio(content, &cleaned)
315 }
316 }
317}
318
/// Ends the current run of repeated lines.
///
/// If the run counted more than one occurrence and a previous line is
/// recorded, the last entry of `lines` is overwritten with `[Nx] line`.
/// `count` is always reset to 0 and `prev_line` to `None`.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    let repeats = std::mem::take(count);
    let previous = prev_line.take();

    if repeats > 1 {
        if let (Some(text), Some(slot)) = (previous, lines.last_mut()) {
            *slot = format!("[{repeats}x] {text}");
        }
    }
}
331
/// Collapses every run of spaces and tabs in `line` into a single space.
///
/// Leading and trailing runs are collapsed too, not removed. All other
/// characters are copied unchanged.
fn normalize_whitespace(line: &str) -> String {
    line.chars()
        .fold(String::with_capacity(line.len()), |mut acc, ch| {
            match ch {
                // A space at the end of `acc` can only come from this arm,
                // so it marks "already inside a blank run".
                ' ' | '\t' => {
                    if !acc.ends_with(' ') {
                        acc.push(' ');
                    }
                }
                other => acc.push(other),
            }
            acc
        })
}
348
349fn strip_timestamps_hashes(line: &str) -> String {
350 let ts_re =
351 static_regex!(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?");
352 let hash_re = static_regex!(r"\b[0-9a-f]{32,64}\b");
353
354 let s = ts_re.replace_all(line, "[TS]");
355 let s = hash_re.replace_all(&s, "[HASH]");
356 s.into_owned()
357}
358
/// Reports whether an already-trimmed line is boilerplate that carries no
/// information.
///
/// A line is boilerplate when:
/// - it starts (case-insensitively) with a licence or generator phrase such
///   as `copyright`, `licensed under` or `generated by`; or
/// - it is a separator rule: at least four characters long, starting with one
///   of `=`, `-`, `*`, `─`, `━`, with that character making up more than 80%
///   of the line.
///
/// The length of a rule is measured in characters, not bytes, so short runs
/// of multi-byte box-drawing characters are treated like their ASCII
/// counterparts.
fn is_boilerplate_line(trimmed: &str) -> bool {
    const BOILERPLATE_PREFIXES: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];
    const RULE_CHARS: [char; 5] = ['=', '-', '*', '─', '━'];
    const MIN_RULE_CHARS: usize = 4;

    let lower = trimmed.to_lowercase();
    if BOILERPLATE_PREFIXES
        .iter()
        .any(|prefix| lower.starts_with(prefix))
    {
        return true;
    }

    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    if !RULE_CHARS.contains(&first) {
        return false;
    }

    let total = trimmed.chars().count();
    if total < MIN_RULE_CHARS {
        return false;
    }

    let same = trimmed.chars().filter(|&c| c == first).count();
    same as f64 / total as f64 > 0.8
}
383
#[cfg(test)]
mod tests {
    use super::*;

    // --- diff_content -----------------------------------------------------

    #[test]
    fn test_diff_insertion() {
        let rendered = diff_content("line1\nline2\nline3", "line1\nline2\nnew_line\nline3");
        assert!(rendered.contains('+'), "should show additions");
        assert!(rendered.contains("new_line"));
    }

    #[test]
    fn test_diff_deletion() {
        let rendered = diff_content("line1\nline2\nline3", "line1\nline3");
        assert!(rendered.contains('-'), "should show deletions");
        assert!(rendered.contains("line2"));
    }

    #[test]
    fn test_diff_no_changes() {
        let unchanged = "same\ncontent";
        assert_eq!(diff_content(unchanged, unchanged), "(no changes)");
    }

    // --- lightweight_cleanup ----------------------------------------------

    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        // More than 200 lines in total and a run of 8 closers: both
        // thresholds for collapsing are crossed.
        let source = (0..210)
            .map(|i| format!("line {i}"))
            .chain(std::iter::repeat("}".to_string()).take(8))
            .chain(std::iter::once("fn next() {}".to_string()))
            .collect::<Vec<String>>()
            .join("\n");

        let cleaned = lightweight_cleanup(&source);
        assert!(
            cleaned.contains("[6 brace-only lines collapsed]"),
            "should collapse long brace runs in large files"
        );
        assert!(cleaned.contains("fn next()"));
    }

    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let cleaned = lightweight_cleanup("line1\n\n\n\n\nline2");
        let after_first = cleaned.split("line1").nth(1).unwrap();
        let newline_count = after_first.matches('\n').count();
        assert!(newline_count <= 2, "should collapse multiple blank lines");
    }

    // --- safeguard_ratio --------------------------------------------------

    #[test]
    fn test_safeguard_ratio_prevents_over_compression() {
        let original = "a ".repeat(100);
        let kept = safeguard_ratio(&original, "a");
        assert_eq!(kept, original, "should return original when ratio < 0.15");
    }

    // --- aggressive_compress ----------------------------------------------

    #[test]
    fn test_aggressive_strips_comments() {
        let compressed =
            aggressive_compress("fn main() {\n // a comment\n let x = 1;\n}", Some("rs"));
        assert!(!compressed.contains("// a comment"));
        assert!(compressed.contains("let x = 1"));
    }

    #[test]
    fn test_aggressive_python_comments() {
        let compressed = aggressive_compress("def main():\n # comment\n x = 1", Some("py"));
        assert!(!compressed.contains("# comment"));
        assert!(compressed.contains("x = 1"));
    }

    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let compressed = aggressive_compress("/// Doc comment\nfn main() {}", Some("rs"));
        assert!(compressed.contains("/// Doc comment"));
    }

    #[test]
    fn test_aggressive_block_comment() {
        let compressed =
            aggressive_compress("/* start\n * middle\n */ end\nfn main() {}", Some("rs"));
        assert!(!compressed.contains("start"));
        assert!(!compressed.contains("middle"));
        assert!(compressed.contains("fn main()"));
    }

    // --- strip_ansi / ansi_density ----------------------------------------

    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let plain = strip_ansi("\x1b[31mERROR\x1b[0m: something failed");
        assert_eq!(plain, "ERROR: something failed");
        assert!(!plain.contains('\x1b'));
    }

    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let clean = "clean text without escapes";
        assert_eq!(strip_ansi(clean), clean);
    }

    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        assert!(ansi_density("\x1b[31mred\x1b[0m") > 0.0);
    }
}