sqz_engine/
rle_compressor.rs1use crate::error::Result;
11
12#[derive(Debug, Clone)]
14pub struct RleResult {
15 pub text: String,
17 pub runs_collapsed: usize,
19 pub tokens_saved: u32,
21}
22
23pub fn rle_compress(text: &str, min_run_length: usize) -> Result<RleResult> {
28 let lines: Vec<&str> = text.lines().collect();
29 if lines.len() < 2 {
30 return Ok(RleResult {
31 text: text.to_string(),
32 runs_collapsed: 0,
33 tokens_saved: 0,
34 });
35 }
36
37 let mut output = Vec::new();
38 let mut runs_collapsed = 0usize;
39 let mut tokens_saved = 0u32;
40 let mut i = 0;
41
42 while i < lines.len() {
43 let mut run_len = 1;
45 while i + run_len < lines.len() && lines[i] == lines[i + run_len] {
46 run_len += 1;
47 }
48
49 if run_len >= min_run_length {
50 output.push(format!("{} [×{}]", lines[i], run_len));
53 runs_collapsed += 1;
54 let line_tokens = (lines[i].len() as u32 + 3) / 4;
55 tokens_saved += line_tokens * (run_len as u32 - 1);
56 i += run_len;
57 } else {
58 output.push(lines[i].to_string());
65 i += 1;
66 }
67 }
68
69 let trailing_newline = text.ends_with('\n');
70 let mut result = output.join("\n");
71 if trailing_newline && !result.ends_with('\n') {
72 result.push('\n');
73 }
74
75 Ok(RleResult {
76 text: result,
77 runs_collapsed,
78 tokens_saved,
79 })
80}
81
82pub fn sliding_window_dedup(text: &str, min_match_words: usize) -> Result<SlidingWindowResult> {
93 let lines: Vec<&str> = text.lines().collect();
94 if lines.len() < 2 {
95 return Ok(SlidingWindowResult {
96 text: text.to_string(),
97 dedup_count: 0,
98 tokens_saved: 0,
99 });
100 }
101
102 let mut phrase_index: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
104 let mut output = Vec::new();
105 let mut dedup_count = 0usize;
106 let mut tokens_saved = 0u32;
107
108 for (line_idx, &line) in lines.iter().enumerate() {
109 let words: Vec<&str> = line.split_whitespace().collect();
110
111 let phrase = words.join(" ");
113 if phrase.len() >= min_match_words * 3 {
114 if let Some(&first_line) = phrase_index.get(&phrase) {
115 output.push(format!("[→L{}]", first_line + 1));
117 dedup_count += 1;
118 tokens_saved += (phrase.len() as u32 + 3) / 4;
119 continue;
120 }
121 }
122
123 if phrase.len() >= min_match_words * 3 {
125 phrase_index.insert(phrase, line_idx);
126 }
127 output.push(line.to_string());
128 }
129
130 let trailing_newline = text.ends_with('\n');
131 let mut result = output.join("\n");
132 if trailing_newline && !result.ends_with('\n') {
133 result.push('\n');
134 }
135
136 Ok(SlidingWindowResult {
137 text: result,
138 dedup_count,
139 tokens_saved,
140 })
141}
142
143#[derive(Debug, Clone)]
145pub struct SlidingWindowResult {
146 pub text: String,
147 pub dedup_count: usize,
148 pub tokens_saved: u32,
149}
150
151#[cfg(test)]
154mod tests {
155 use super::*;
156
157 #[test]
160 fn test_rle_no_repetition() {
161 let result = rle_compress("alpha\nbeta\ngamma\n", 2).unwrap();
162 assert_eq!(result.runs_collapsed, 0);
163 }
164
165 #[test]
166 fn test_rle_exact_repetition() {
167 let input = "ok\nok\nok\nok\nok\n";
168 let result = rle_compress(input, 2).unwrap();
169 assert!(result.runs_collapsed > 0);
170 assert!(result.text.contains("[×5]"), "output: {}", result.text);
171 }
172
173 #[test]
174 fn test_rle_pattern_no_longer_collapses() {
175 let input = "Compiling foo v1.0\nCompiling bar v2.0\nCompiling baz v3.0\ndone\n";
180 let result = rle_compress(input, 2).unwrap();
181 assert_eq!(result.runs_collapsed, 0, "different lines must not be collapsed");
182 assert!(result.text.contains("foo"), "filename 'foo' must survive: {}", result.text);
183 assert!(result.text.contains("bar"), "filename 'bar' must survive: {}", result.text);
184 assert!(result.text.contains("baz"), "filename 'baz' must survive: {}", result.text);
185 }
186
187 #[test]
188 fn test_rle_ls_l_output_preserves_filenames() {
189 let input = "drwxr-xr-x 6 user user 192 Apr 18 10:00 packages\n\
193 drwxr-xr-x 3 user user 96 Apr 18 10:00 configuration\n\
194 drwxr-xr-x 4 user user 128 Apr 18 10:00 documentation\n\
195 drwxr-xr-x 2 user user 64 Apr 18 10:00 environment\n";
196 let result = rle_compress(input, 2).unwrap();
197 for name in &["packages", "configuration", "documentation", "environment"] {
198 assert!(result.text.contains(name),
199 "filename '{}' must be preserved — got:\n{}", name, result.text);
200 }
201 }
202
203 #[test]
204 fn test_rle_mixed_content() {
205 let input = "header\ntest a ... ok\ntest b ... ok\ntest c ... ok\nfooter\n";
206 let result = rle_compress(input, 2).unwrap();
207 assert!(result.text.contains("header"));
208 assert!(result.text.contains("footer"));
209 }
210
211 #[test]
212 fn test_rle_preserves_trailing_newline() {
213 let result = rle_compress("a\na\na\n", 2).unwrap();
214 assert!(result.text.ends_with('\n'));
215 }
216
217 #[test]
218 fn test_rle_empty_input() {
219 let result = rle_compress("", 2).unwrap();
220 assert_eq!(result.text, "");
221 assert_eq!(result.runs_collapsed, 0);
222 }
223
224 #[test]
225 fn test_rle_single_line() {
226 let result = rle_compress("hello\n", 2).unwrap();
227 assert_eq!(result.runs_collapsed, 0);
228 }
229
230 #[test]
231 fn test_rle_min_run_length_respected() {
232 let input = "ok\nok\ndone\n";
233 let result_2 = rle_compress(input, 2).unwrap();
234 let result_3 = rle_compress(input, 3).unwrap();
235 assert!(result_2.runs_collapsed > 0, "run of 2 should collapse with min=2");
236 assert_eq!(result_3.runs_collapsed, 0, "run of 2 should NOT collapse with min=3");
237 }
238
239 #[test]
242 fn test_sliding_window_no_duplicates() {
243 let result = sliding_window_dedup("line 1\nline 2\nline 3\n", 3).unwrap();
244 assert_eq!(result.dedup_count, 0);
245 }
246
247 #[test]
248 fn test_sliding_window_exact_duplicate_lines() {
249 let input = "error: connection refused at port 8080\nsome other line\nerror: connection refused at port 8080\n";
250 let result = sliding_window_dedup(input, 3).unwrap();
251 assert!(result.dedup_count > 0, "should detect duplicate line");
252 assert!(result.text.contains("[→L1]"), "output: {}", result.text);
253 }
254
255 #[test]
256 fn test_sliding_window_preserves_first_occurrence() {
257 let input = "the error message here\nother stuff\nthe error message here\n";
258 let result = sliding_window_dedup(input, 3).unwrap();
259 assert!(result.text.starts_with("the error message here\n"));
261 }
262
263 #[test]
264 fn test_sliding_window_short_lines_ignored() {
265 let input = "ok\nok\nok\n";
266 let result = sliding_window_dedup(input, 3).unwrap();
267 assert_eq!(result.dedup_count, 0);
269 }
270
271 #[test]
272 fn test_sliding_window_empty_input() {
273 let result = sliding_window_dedup("", 3).unwrap();
274 assert_eq!(result.text, "");
275 }
276
277 use proptest::prelude::*;
278
279 proptest! {
280 #[test]
282 fn prop_rle_tokens_saved_non_negative(
283 text in "[a-z]{3,10}(\n[a-z]{3,10}){2,10}\n"
284 ) {
285 let result = rle_compress(&text, 2).unwrap();
286 let _ = result.tokens_saved;
288 prop_assert!(!result.text.is_empty());
290 }
291
292 #[test]
294 fn prop_sliding_window_bounded(
295 text in "[a-z ]{10,50}(\n[a-z ]{10,50}){2,8}\n"
296 ) {
297 let result = sliding_window_dedup(&text, 3).unwrap();
298 let line_count = text.lines().count();
299 prop_assert!(
300 result.dedup_count <= line_count,
301 "dedup count {} exceeds line count {}",
302 result.dedup_count, line_count
303 );
304 }
305 }
306}