lean_ctx/core/
compressor.rs1use similar::{ChangeTag, TextDiff};
2
/// Removes ANSI/ECMA-48 escape sequences from `s` and returns the visible text.
///
/// Text without an ESC (`0x1b`) character is returned unchanged. Otherwise each
/// ESC starts one of three sequence shapes, all of which are dropped:
///
/// - **CSI** (`ESC [`): parameter/intermediate bytes `0x20..=0x3f` followed by
///   one final byte in `0x40..=0x7e` (so `ESC[31m` and `ESC[200~` both end
///   where they should).
/// - **OSC** (`ESC ]`): everything up to and including BEL (`0x07`), or up to
///   the next ESC. The usual `ESC \` string terminator is then consumed as a
///   generic escape. An unterminated OSC runs to the end of the input.
/// - **Any other escape**: optional intermediate bytes `0x20..=0x2f` followed
///   by one final byte in `0x30..=0x7e` (e.g. `ESC 7`, `ESC ( B`, `ESC \`).
///
/// A malformed sequence stops at the first byte that does not fit its shape,
/// and that byte is kept as ordinary text.
pub fn strip_ansi(s: &str) -> String {
    if !s.contains('\x1b') {
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\x1b' {
            result.push(c);
            continue;
        }

        if chars.next_if_eq(&'[').is_some() {
            // CSI: parameters and intermediates, then exactly one final byte.
            while chars.next_if(|p| (' '..='?').contains(p)).is_some() {}
            let _ = chars.next_if(|p| ('@'..='~').contains(p));
        } else if chars.next_if_eq(&']').is_some() {
            // OSC: payload ends at BEL. An ESC is left in place so the outer
            // loop can consume `ESC \` (ST) or start the next sequence.
            while let Some(p) = chars.next_if(|&p| p != '\x1b') {
                if p == '\x07' {
                    break;
                }
            }
        } else {
            // Generic escape: intermediates, then one final byte.
            while chars.next_if(|p| (' '..='/').contains(p)).is_some() {}
            let _ = chars.next_if(|p| ('0'..='~').contains(p));
        }
    }

    result
}
24
/// Returns the number of ESC (`0x1b`) bytes in `s` divided by the byte length
/// of `s`.
///
/// An empty string yields `0.0`.
pub fn ansi_density(s: &str) -> f64 {
    match s.len() {
        0 => 0.0,
        total_bytes => {
            // In UTF-8 the byte 0x1b only ever encodes the ESC character itself.
            let escapes = s.bytes().filter(|&b| b == 0x1b).count();
            escapes as f64 / total_bytes as f64
        }
    }
}
32
33pub fn aggressive_compress(content: &str, ext: Option<&str>) -> String {
34 let mut result: Vec<String> = Vec::new();
35 let is_python = matches!(ext, Some("py"));
36 let is_html = matches!(ext, Some("html" | "htm" | "xml" | "svg"));
37 let is_sql = matches!(ext, Some("sql"));
38 let is_shell = matches!(ext, Some("sh" | "bash" | "zsh" | "fish"));
39
40 let mut in_block_comment = false;
41
42 for line in content.lines() {
43 let trimmed = line.trim();
44
45 if trimmed.is_empty() {
46 continue;
47 }
48
49 if in_block_comment {
50 if trimmed.contains("*/") || (is_html && trimmed.contains("-->")) {
51 in_block_comment = false;
52 }
53 continue;
54 }
55
56 if trimmed.starts_with("/*") || (is_html && trimmed.starts_with("<!--")) {
57 if !(trimmed.contains("*/") || trimmed.contains("-->")) {
58 in_block_comment = true;
59 }
60 continue;
61 }
62
63 if trimmed.starts_with("//") && !trimmed.starts_with("///") {
64 continue;
65 }
66 if trimmed.starts_with('*') || trimmed.starts_with("*/") {
67 continue;
68 }
69 if is_python && trimmed.starts_with('#') {
70 continue;
71 }
72 if is_sql && trimmed.starts_with("--") {
73 continue;
74 }
75 if is_shell && trimmed.starts_with('#') && !trimmed.starts_with("#!") {
76 continue;
77 }
78 if !is_python && trimmed.starts_with('#') && trimmed.contains('[') {
79 continue;
80 }
81
82 if trimmed == "}" || trimmed == "};" || trimmed == ");" || trimmed == "});" {
83 if let Some(last) = result.last() {
84 let last_trimmed = last.trim();
85 if matches!(last_trimmed, "}" | "};" | ");" | "});") {
86 if let Some(last_mut) = result.last_mut() {
87 last_mut.push_str(trimmed);
88 }
89 continue;
90 }
91 }
92 result.push(trimmed.to_string());
93 continue;
94 }
95
96 let normalized = normalize_indentation(line);
97 result.push(normalized);
98 }
99
100 result.join("\n")
101}
102
/// Light-touch cleanup that keeps code intact while trimming vertical noise.
///
/// - Runs of blank (whitespace-only) lines are reduced to one empty line.
/// - In a run of consecutive lines that are only a closer (`}`, `};`, `);`,
///   `});`, `)`), the first two are kept (trimmed) and the rest are dropped.
///   A blank line or any other line starts a new run.
/// - Every other line is kept exactly as written.
pub fn lightweight_cleanup(content: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    let mut blank_run = 0u32;
    let mut closer_run = 0u32;

    for raw in content.lines() {
        let core = raw.trim();

        if core.is_empty() {
            closer_run = 0;
            blank_run += 1;
            if blank_run == 1 {
                kept.push("");
            }
        } else if matches!(core, "}" | "};" | ");" | "});" | ")") {
            blank_run = 0;
            closer_run += 1;
            if closer_run <= 2 {
                kept.push(core);
            }
        } else {
            blank_run = 0;
            closer_run = 0;
            kept.push(raw);
        }
    }

    kept.join("\n")
}
137
138pub fn safeguard_ratio(original: &str, compressed: &str) -> String {
141 let orig_tokens = super::tokens::count_tokens(original);
142 let comp_tokens = super::tokens::count_tokens(compressed);
143
144 if orig_tokens == 0 {
145 return compressed.to_string();
146 }
147
148 let ratio = comp_tokens as f64 / orig_tokens as f64;
149 if ratio < 0.15 || comp_tokens > orig_tokens {
150 original.to_string()
151 } else {
152 compressed.to_string()
153 }
154}
155
/// Rewrites the leading whitespace of `line` as a shorter run of spaces.
///
/// The indent width is measured in bytes of leading whitespace. If the line
/// starts with a tab the width is kept as-is; otherwise it is halved
/// (rounding down). The result is that many spaces followed by the rest of
/// the line.
fn normalize_indentation(line: &str) -> String {
    let body = line.trim_start();
    let indent_bytes = line.len() - body.len();
    let width = if line.starts_with('\t') {
        indent_bytes
    } else {
        indent_bytes / 2
    };

    let mut out = String::with_capacity(width + body.len());
    out.extend(std::iter::repeat(' ').take(width));
    out.push_str(body);
    out
}
163
164pub fn diff_content(old_content: &str, new_content: &str) -> String {
165 if old_content == new_content {
166 return "(no changes)".to_string();
167 }
168
169 let diff = TextDiff::from_lines(old_content, new_content);
170 let mut changes = Vec::new();
171 let mut additions = 0usize;
172 let mut deletions = 0usize;
173
174 for change in diff.iter_all_changes() {
175 let line_no = change.new_index().or(change.old_index()).map(|i| i + 1);
176 let text = change.value().trim_end_matches('\n');
177 match change.tag() {
178 ChangeTag::Insert => {
179 additions += 1;
180 if let Some(n) = line_no {
181 changes.push(format!("+{n}: {text}"));
182 }
183 }
184 ChangeTag::Delete => {
185 deletions += 1;
186 if let Some(n) = line_no {
187 changes.push(format!("-{n}: {text}"));
188 }
189 }
190 ChangeTag::Equal => {}
191 }
192 }
193
194 if changes.is_empty() {
195 return "(no changes)".to_string();
196 }
197
198 changes.push(format!("\ndiff +{additions}/-{deletions} lines"));
199 changes.join("\n")
200}
201
202pub fn verbatim_compact(text: &str) -> String {
203 let mut lines: Vec<String> = Vec::new();
204 let mut blank_count = 0u32;
205 let mut prev_line: Option<String> = None;
206 let mut repeat_count = 0u32;
207
208 for line in text.lines() {
209 let trimmed = line.trim();
210
211 if trimmed.is_empty() {
212 blank_count += 1;
213 if blank_count <= 1 {
214 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
215 lines.push(String::new());
216 }
217 continue;
218 }
219 blank_count = 0;
220
221 if is_boilerplate_line(trimmed) {
222 continue;
223 }
224
225 let normalized = normalize_whitespace(trimmed);
226 let stripped = strip_timestamps_hashes(&normalized);
227
228 if let Some(ref prev) = prev_line {
229 if *prev == stripped {
230 repeat_count += 1;
231 continue;
232 }
233 }
234
235 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
236 prev_line = Some(stripped.clone());
237 repeat_count = 1;
238 lines.push(stripped);
239 }
240
241 flush_repeats(&mut lines, &mut prev_line, &mut repeat_count);
242 lines.join("\n")
243}
244
/// Ends the current repeat run.
///
/// When `count` is greater than 1 and a previous line is recorded, the last
/// entry of `lines` (if any) is overwritten with `[<count>x] <previous line>`.
/// In all cases `count` is reset to 0 and `prev_line` to `None`.
fn flush_repeats(lines: &mut [String], prev_line: &mut Option<String>, count: &mut u32) {
    let repeats = std::mem::take(count);
    let previous = prev_line.take();

    if repeats > 1 {
        if let (Some(text), Some(slot)) = (previous, lines.last_mut()) {
            *slot = format!("[{repeats}x] {text}");
        }
    }
}
257
/// Collapses every run of spaces and tabs in `line` into a single space.
///
/// Leading and trailing runs are collapsed too (not removed); all other
/// characters are copied unchanged.
fn normalize_whitespace(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    let mut in_gap = false;

    for ch in line.chars() {
        let is_gap = matches!(ch, ' ' | '\t');
        // Skip only the second and later members of a gap.
        if !(is_gap && in_gap) {
            out.push(if is_gap { ' ' } else { ch });
        }
        in_gap = is_gap;
    }

    out
}
274
275fn strip_timestamps_hashes(line: &str) -> String {
276 use regex::Regex;
277 use std::sync::OnceLock;
278
279 static TS_RE: OnceLock<Regex> = OnceLock::new();
280 static HASH_RE: OnceLock<Regex> = OnceLock::new();
281
282 let ts_re = TS_RE.get_or_init(|| {
283 Regex::new(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?")
284 .unwrap()
285 });
286 let hash_re = HASH_RE.get_or_init(|| Regex::new(r"\b[0-9a-f]{32,64}\b").unwrap());
287
288 let s = ts_re.replace_all(line, "[TS]");
289 let s = hash_re.replace_all(&s, "[HASH]");
290 s.into_owned()
291}
292
/// Reports whether an already-trimmed line is boilerplate that can be dropped.
///
/// A line counts as boilerplate when either:
///
/// - its lowercased text starts with a licence/generator notice prefix
///   (`copyright`, `licensed under`, `license:`, `all rights reserved`,
///   `generated by`, `auto-generated`), or
/// - it is a horizontal rule: at least 4 characters long, starting with one of
///   `=`, `-`, `*`, `─`, `━`, with more than 80% of its characters equal to
///   that first character.
///
/// The length gate counts characters rather than bytes, so multi-byte rule
/// characters need the same four-character minimum as ASCII ones.
fn is_boilerplate_line(trimmed: &str) -> bool {
    const NOTICE_PREFIXES: [&str; 6] = [
        "copyright",
        "licensed under",
        "license:",
        "all rights reserved",
        "generated by",
        "auto-generated",
    ];

    let lower = trimmed.to_lowercase();
    if NOTICE_PREFIXES
        .iter()
        .any(|prefix| lower.starts_with(*prefix))
    {
        return true;
    }

    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    if !matches!(first, '=' | '-' | '*' | '─' | '━') {
        return false;
    }

    // Single pass: total characters and how many equal the leading one.
    let (total, same) = trimmed
        .chars()
        .fold((0usize, 0usize), |(total, same), c| {
            (total + 1, same + usize::from(c == first))
        });

    total >= 4 && same as f64 / total as f64 > 0.8
}
317
#[cfg(test)]
mod tests {
    use super::*;

    // ---- diff_content ----

    /// An inserted line is reported with a `+` marker and its text.
    #[test]
    fn test_diff_insertion() {
        let before = "line1\nline2\nline3";
        let after = "line1\nline2\nnew_line\nline3";
        let report = diff_content(before, after);
        assert!(report.contains("+"), "should show additions");
        assert!(report.contains("new_line"));
    }

    /// A removed line is reported with a `-` marker and its text.
    #[test]
    fn test_diff_deletion() {
        let before = "line1\nline2\nline3";
        let after = "line1\nline3";
        let report = diff_content(before, after);
        assert!(report.contains("-"), "should show deletions");
        assert!(report.contains("line2"));
    }

    /// Identical inputs produce the fixed "no changes" marker.
    #[test]
    fn test_diff_no_changes() {
        let text = "same\ncontent";
        assert_eq!(diff_content(text, text), "(no changes)");
    }

    // ---- lightweight_cleanup ----

    /// A long run of closing braces is shortened; following code survives.
    #[test]
    fn test_lightweight_cleanup_collapses_braces() {
        let source = "fn main() {\n inner()\n}\n}\n}\n}\n}\nfn next() {}";
        let cleaned = lightweight_cleanup(source);
        let brace_total = cleaned.matches('}').count();
        assert!(
            brace_total <= 3,
            "should collapse consecutive closing braces"
        );
        assert!(cleaned.contains("fn next()"));
    }

    /// Several blank lines in a row shrink to at most one.
    #[test]
    fn test_lightweight_cleanup_blank_lines() {
        let source = "line1\n\n\n\n\nline2";
        let cleaned = lightweight_cleanup(source);
        let after_first = cleaned.split("line1").nth(1).unwrap();
        let newline_total = after_first.matches('\n').count();
        assert!(newline_total <= 2, "should collapse multiple blank lines");
    }

    // ---- safeguard_ratio ----

    /// Over-aggressive compression falls back to the original text.
    #[test]
    fn test_safeguard_ratio_prevents_over_compression() {
        let original = "a ".repeat(100);
        let too_compressed = "a";
        let chosen = safeguard_ratio(&original, too_compressed);
        assert_eq!(chosen, original, "should return original when ratio < 0.15");
    }

    // ---- aggressive_compress ----

    /// `//` comments are removed while code is kept.
    #[test]
    fn test_aggressive_strips_comments() {
        let source = "fn main() {\n // a comment\n let x = 1;\n}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(!compressed.contains("// a comment"));
        assert!(compressed.contains("let x = 1"));
    }

    /// `#` comments are removed for Python input.
    #[test]
    fn test_aggressive_python_comments() {
        let source = "def main():\n # comment\n x = 1";
        let compressed = aggressive_compress(source, Some("py"));
        assert!(!compressed.contains("# comment"));
        assert!(compressed.contains("x = 1"));
    }

    /// `///` doc comments are not treated as ordinary comments.
    #[test]
    fn test_aggressive_preserves_doc_comments() {
        let source = "/// Doc comment\nfn main() {}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(compressed.contains("/// Doc comment"));
    }

    /// A multi-line `/* ... */` block is removed entirely.
    #[test]
    fn test_aggressive_block_comment() {
        let source = "/* start\n * middle\n */ end\nfn main() {}";
        let compressed = aggressive_compress(source, Some("rs"));
        assert!(!compressed.contains("start"));
        assert!(!compressed.contains("middle"));
        assert!(compressed.contains("fn main()"));
    }

    // ---- strip_ansi / ansi_density ----

    /// Colour codes are removed and the visible text is preserved.
    #[test]
    fn test_strip_ansi_removes_escape_codes() {
        let colored = "\x1b[31mERROR\x1b[0m: something failed";
        let plain = strip_ansi(colored);
        assert_eq!(plain, "ERROR: something failed");
        assert!(!plain.contains('\x1b'));
    }

    /// Text without escapes passes through unchanged.
    #[test]
    fn test_strip_ansi_passthrough_clean_text() {
        let plain = "clean text without escapes";
        assert_eq!(strip_ansi(plain), plain);
    }

    /// Escape-free text has zero density.
    #[test]
    fn test_ansi_density_zero_for_clean() {
        assert_eq!(ansi_density("hello world"), 0.0);
    }

    /// Coloured text has a positive density.
    #[test]
    fn test_ansi_density_nonzero_for_colored() {
        let colored = "\x1b[31mred\x1b[0m";
        assert!(ansi_density(colored) > 0.0);
    }
}