Skip to main content

sqz_engine/
rle_compressor.rs

1/// Run-Length Encoding (RLE) for structured repetition patterns.
2///
3/// Generalizes the condense stage to catch repeated patterns within lines
4/// and repeated structured blocks, not just consecutive identical lines.
5///
6/// RLE is provably optimal for data with runs of identical symbols
7/// (Shannon 1948). This implementation works at the token/phrase level
8/// rather than the byte level, making it effective for CLI output patterns.
9
10use crate::error::Result;
11
12/// Result of RLE compression.
13#[derive(Debug, Clone)]
14pub struct RleResult {
15    /// The compressed text.
16    pub text: String,
17    /// Number of runs collapsed.
18    pub runs_collapsed: usize,
19    /// Tokens saved (estimated).
20    pub tokens_saved: u32,
21}
22
23/// Compress text using token-level run-length encoding.
24///
25/// Detects repeated phrases/tokens and replaces runs with compact notation:
26/// `token [×N]` where N is the repeat count.
27pub fn rle_compress(text: &str, min_run_length: usize) -> Result<RleResult> {
28    let lines: Vec<&str> = text.lines().collect();
29    if lines.len() < 2 {
30        return Ok(RleResult {
31            text: text.to_string(),
32            runs_collapsed: 0,
33            tokens_saved: 0,
34        });
35    }
36
37    let mut output = Vec::new();
38    let mut runs_collapsed = 0usize;
39    let mut tokens_saved = 0u32;
40    let mut i = 0;
41
42    while i < lines.len() {
43        // Try to find a run of identical lines (lossless — safe to collapse)
44        let mut run_len = 1;
45        while i + run_len < lines.len() && lines[i] == lines[i + run_len] {
46            run_len += 1;
47        }
48
49        if run_len >= min_run_length {
50            // Collapse the run of identical lines. Lossless: N copies of the
51            // same text can be recovered from "{text} [×N]".
52            output.push(format!("{} [×{}]", lines[i], run_len));
53            runs_collapsed += 1;
54            let line_tokens = (lines[i].len() as u32 + 3) / 4;
55            tokens_saved += line_tokens * (run_len as u32 - 1);
56            i += run_len;
57        } else {
58            // Pattern-run detection (lines with shared word-prefix but
59            // different suffixes) was removed because it was LOSSY: it
60            // replaced the varying parts with "{count} unique values",
61            // discarding real filenames from `ls -l` output. See Reddit
62            // report #1 and the test_ls_output_preserves_all_filenames
63            // regression test. Exact duplicates above are still collapsed.
64            output.push(lines[i].to_string());
65            i += 1;
66        }
67    }
68
69    let trailing_newline = text.ends_with('\n');
70    let mut result = output.join("\n");
71    if trailing_newline && !result.ends_with('\n') {
72        result.push('\n');
73    }
74
75    Ok(RleResult {
76        text: result,
77        runs_collapsed,
78        tokens_saved,
79    })
80}
81
82// ── Sliding Window Dedup ──────────────────────────────────────────────────────
83
84/// Token-level sliding window deduplication.
85///
86/// Finds repeated substrings within a text using a sliding window approach.
87/// When the same phrase appears multiple times, subsequent occurrences are
88/// replaced with back-references: `[→L{line}]`.
89///
90/// This catches repeated substrings that aren't line-aligned — e.g., the
91/// same error message appearing in two different stack frames.
92pub fn sliding_window_dedup(text: &str, min_match_words: usize) -> Result<SlidingWindowResult> {
93    let lines: Vec<&str> = text.lines().collect();
94    if lines.len() < 2 {
95        return Ok(SlidingWindowResult {
96            text: text.to_string(),
97            dedup_count: 0,
98            tokens_saved: 0,
99        });
100    }
101
102    // Build a phrase index: map each n-word phrase to its first occurrence line
103    let mut phrase_index: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
104    let mut output = Vec::new();
105    let mut dedup_count = 0usize;
106    let mut tokens_saved = 0u32;
107
108    for (line_idx, &line) in lines.iter().enumerate() {
109        let words: Vec<&str> = line.split_whitespace().collect();
110
111        // Check if this line (as a phrase) was seen before
112        let phrase = words.join(" ");
113        if phrase.len() >= min_match_words * 3 {
114            if let Some(&first_line) = phrase_index.get(&phrase) {
115                // This exact line appeared before — replace with back-reference
116                output.push(format!("[→L{}]", first_line + 1));
117                dedup_count += 1;
118                tokens_saved += (phrase.len() as u32 + 3) / 4;
119                continue;
120            }
121        }
122
123        // Index this line's phrase
124        if phrase.len() >= min_match_words * 3 {
125            phrase_index.insert(phrase, line_idx);
126        }
127        output.push(line.to_string());
128    }
129
130    let trailing_newline = text.ends_with('\n');
131    let mut result = output.join("\n");
132    if trailing_newline && !result.ends_with('\n') {
133        result.push('\n');
134    }
135
136    Ok(SlidingWindowResult {
137        text: result,
138        dedup_count,
139        tokens_saved,
140    })
141}
142
143/// Result of sliding window deduplication.
144#[derive(Debug, Clone)]
145pub struct SlidingWindowResult {
146    pub text: String,
147    pub dedup_count: usize,
148    pub tokens_saved: u32,
149}
150
151// ── Tests ─────────────────────────────────────────────────────────────────────
152
153#[cfg(test)]
154mod tests {
155    use super::*;
156
157    // --- RLE tests ---
158
159    #[test]
160    fn test_rle_no_repetition() {
161        let result = rle_compress("alpha\nbeta\ngamma\n", 2).unwrap();
162        assert_eq!(result.runs_collapsed, 0);
163    }
164
165    #[test]
166    fn test_rle_exact_repetition() {
167        let input = "ok\nok\nok\nok\nok\n";
168        let result = rle_compress(input, 2).unwrap();
169        assert!(result.runs_collapsed > 0);
170        assert!(result.text.contains("[×5]"), "output: {}", result.text);
171    }
172
173    #[test]
174    fn test_rle_pattern_no_longer_collapses() {
175        // Pattern-run collapsing was removed because it was lossy — it
176        // replaced varying filename/identifier suffixes with "N unique
177        // values", dropping data the LLM needs. Different lines that
178        // merely share a prefix must pass through unchanged.
179        let input = "Compiling foo v1.0\nCompiling bar v2.0\nCompiling baz v3.0\ndone\n";
180        let result = rle_compress(input, 2).unwrap();
181        assert_eq!(result.runs_collapsed, 0, "different lines must not be collapsed");
182        assert!(result.text.contains("foo"), "filename 'foo' must survive: {}", result.text);
183        assert!(result.text.contains("bar"), "filename 'bar' must survive: {}", result.text);
184        assert!(result.text.contains("baz"), "filename 'baz' must survive: {}", result.text);
185    }
186
187    #[test]
188    fn test_rle_ls_l_output_preserves_filenames() {
189        // Regression for https://github.com/ojuschugh1/sqz/issues/1
190        // ls -l lines share the 'drwxr-xr-x' prefix but have distinct
191        // filenames. Pattern-run collapsing used to delete them.
192        let input = "drwxr-xr-x  6 user user  192 Apr 18 10:00 packages\n\
193                     drwxr-xr-x  3 user user   96 Apr 18 10:00 configuration\n\
194                     drwxr-xr-x  4 user user  128 Apr 18 10:00 documentation\n\
195                     drwxr-xr-x  2 user user   64 Apr 18 10:00 environment\n";
196        let result = rle_compress(input, 2).unwrap();
197        for name in &["packages", "configuration", "documentation", "environment"] {
198            assert!(result.text.contains(name),
199                "filename '{}' must be preserved — got:\n{}", name, result.text);
200        }
201    }
202
203    #[test]
204    fn test_rle_mixed_content() {
205        let input = "header\ntest a ... ok\ntest b ... ok\ntest c ... ok\nfooter\n";
206        let result = rle_compress(input, 2).unwrap();
207        assert!(result.text.contains("header"));
208        assert!(result.text.contains("footer"));
209    }
210
211    #[test]
212    fn test_rle_preserves_trailing_newline() {
213        let result = rle_compress("a\na\na\n", 2).unwrap();
214        assert!(result.text.ends_with('\n'));
215    }
216
217    #[test]
218    fn test_rle_empty_input() {
219        let result = rle_compress("", 2).unwrap();
220        assert_eq!(result.text, "");
221        assert_eq!(result.runs_collapsed, 0);
222    }
223
224    #[test]
225    fn test_rle_single_line() {
226        let result = rle_compress("hello\n", 2).unwrap();
227        assert_eq!(result.runs_collapsed, 0);
228    }
229
230    #[test]
231    fn test_rle_min_run_length_respected() {
232        let input = "ok\nok\ndone\n";
233        let result_2 = rle_compress(input, 2).unwrap();
234        let result_3 = rle_compress(input, 3).unwrap();
235        assert!(result_2.runs_collapsed > 0, "run of 2 should collapse with min=2");
236        assert_eq!(result_3.runs_collapsed, 0, "run of 2 should NOT collapse with min=3");
237    }
238
239    // --- Sliding window dedup tests ---
240
241    #[test]
242    fn test_sliding_window_no_duplicates() {
243        let result = sliding_window_dedup("line 1\nline 2\nline 3\n", 3).unwrap();
244        assert_eq!(result.dedup_count, 0);
245    }
246
247    #[test]
248    fn test_sliding_window_exact_duplicate_lines() {
249        let input = "error: connection refused at port 8080\nsome other line\nerror: connection refused at port 8080\n";
250        let result = sliding_window_dedup(input, 3).unwrap();
251        assert!(result.dedup_count > 0, "should detect duplicate line");
252        assert!(result.text.contains("[→L1]"), "output: {}", result.text);
253    }
254
255    #[test]
256    fn test_sliding_window_preserves_first_occurrence() {
257        let input = "the error message here\nother stuff\nthe error message here\n";
258        let result = sliding_window_dedup(input, 3).unwrap();
259        // First occurrence should be preserved verbatim
260        assert!(result.text.starts_with("the error message here\n"));
261    }
262
263    #[test]
264    fn test_sliding_window_short_lines_ignored() {
265        let input = "ok\nok\nok\n";
266        let result = sliding_window_dedup(input, 3).unwrap();
267        // "ok" is too short (< min_match_words * 3 = 9 chars)
268        assert_eq!(result.dedup_count, 0);
269    }
270
271    #[test]
272    fn test_sliding_window_empty_input() {
273        let result = sliding_window_dedup("", 3).unwrap();
274        assert_eq!(result.text, "");
275    }
276
277    use proptest::prelude::*;
278
279    proptest! {
280        /// RLE never produces output longer than input (excluding the run markers).
281        #[test]
282        fn prop_rle_tokens_saved_non_negative(
283            text in "[a-z]{3,10}(\n[a-z]{3,10}){2,10}\n"
284        ) {
285            let result = rle_compress(&text, 2).unwrap();
286            // tokens_saved is u32, always >= 0
287            let _ = result.tokens_saved;
288            // Output should not be empty if input isn't
289            prop_assert!(!result.text.is_empty());
290        }
291
292        /// Sliding window dedup count is non-negative and bounded.
293        #[test]
294        fn prop_sliding_window_bounded(
295            text in "[a-z ]{10,50}(\n[a-z ]{10,50}){2,8}\n"
296        ) {
297            let result = sliding_window_dedup(&text, 3).unwrap();
298            let line_count = text.lines().count();
299            prop_assert!(
300                result.dedup_count <= line_count,
301                "dedup count {} exceeds line count {}",
302                result.dedup_count, line_count
303            );
304        }
305    }
306}