use crate::error::Result;
#[derive(Debug, Clone)]
pub struct RleResult {
pub text: String,
pub runs_collapsed: usize,
pub tokens_saved: u32,
}
pub fn rle_compress(text: &str, min_run_length: usize) -> Result<RleResult> {
let lines: Vec<&str> = text.lines().collect();
if lines.len() < 2 {
return Ok(RleResult {
text: text.to_string(),
runs_collapsed: 0,
tokens_saved: 0,
});
}
let mut output = Vec::new();
let mut runs_collapsed = 0usize;
let mut tokens_saved = 0u32;
let mut i = 0;
while i < lines.len() {
let mut run_len = 1;
while i + run_len < lines.len() && lines[i] == lines[i + run_len] {
run_len += 1;
}
if run_len >= min_run_length {
output.push(format!("{} [×{}]", lines[i], run_len));
runs_collapsed += 1;
let line_tokens = (lines[i].len() as u32 + 3) / 4;
tokens_saved += line_tokens * (run_len as u32 - 1);
i += run_len;
} else {
output.push(lines[i].to_string());
i += 1;
}
}
let trailing_newline = text.ends_with('\n');
let mut result = output.join("\n");
if trailing_newline && !result.ends_with('\n') {
result.push('\n');
}
Ok(RleResult {
text: result,
runs_collapsed,
tokens_saved,
})
}
pub fn sliding_window_dedup(text: &str, min_match_words: usize) -> Result<SlidingWindowResult> {
let lines: Vec<&str> = text.lines().collect();
if lines.len() < 2 {
return Ok(SlidingWindowResult {
text: text.to_string(),
dedup_count: 0,
tokens_saved: 0,
});
}
let mut phrase_index: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
let mut output = Vec::new();
let mut dedup_count = 0usize;
let mut tokens_saved = 0u32;
for (line_idx, &line) in lines.iter().enumerate() {
let words: Vec<&str> = line.split_whitespace().collect();
let phrase = words.join(" ");
if phrase.len() >= min_match_words * 3 {
if let Some(&first_line) = phrase_index.get(&phrase) {
output.push(format!("[→L{}]", first_line + 1));
dedup_count += 1;
tokens_saved += (phrase.len() as u32 + 3) / 4;
continue;
}
}
if phrase.len() >= min_match_words * 3 {
phrase_index.insert(phrase, line_idx);
}
output.push(line.to_string());
}
let trailing_newline = text.ends_with('\n');
let mut result = output.join("\n");
if trailing_newline && !result.ends_with('\n') {
result.push('\n');
}
Ok(SlidingWindowResult {
text: result,
dedup_count,
tokens_saved,
})
}
#[derive(Debug, Clone)]
pub struct SlidingWindowResult {
pub text: String,
pub dedup_count: usize,
pub tokens_saved: u32,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_rle_no_repetition() {
let result = rle_compress("alpha\nbeta\ngamma\n", 2).unwrap();
assert_eq!(result.runs_collapsed, 0);
}
#[test]
fn test_rle_exact_repetition() {
let input = "ok\nok\nok\nok\nok\n";
let result = rle_compress(input, 2).unwrap();
assert!(result.runs_collapsed > 0);
assert!(result.text.contains("[×5]"), "output: {}", result.text);
}
#[test]
fn test_rle_pattern_no_longer_collapses() {
let input = "Compiling foo v1.0\nCompiling bar v2.0\nCompiling baz v3.0\ndone\n";
let result = rle_compress(input, 2).unwrap();
assert_eq!(result.runs_collapsed, 0, "different lines must not be collapsed");
assert!(result.text.contains("foo"), "filename 'foo' must survive: {}", result.text);
assert!(result.text.contains("bar"), "filename 'bar' must survive: {}", result.text);
assert!(result.text.contains("baz"), "filename 'baz' must survive: {}", result.text);
}
#[test]
fn test_rle_ls_l_output_preserves_filenames() {
let input = "drwxr-xr-x 6 user user 192 Apr 18 10:00 packages\n\
drwxr-xr-x 3 user user 96 Apr 18 10:00 configuration\n\
drwxr-xr-x 4 user user 128 Apr 18 10:00 documentation\n\
drwxr-xr-x 2 user user 64 Apr 18 10:00 environment\n";
let result = rle_compress(input, 2).unwrap();
for name in &["packages", "configuration", "documentation", "environment"] {
assert!(result.text.contains(name),
"filename '{}' must be preserved — got:\n{}", name, result.text);
}
}
#[test]
fn test_rle_mixed_content() {
let input = "header\ntest a ... ok\ntest b ... ok\ntest c ... ok\nfooter\n";
let result = rle_compress(input, 2).unwrap();
assert!(result.text.contains("header"));
assert!(result.text.contains("footer"));
}
#[test]
fn test_rle_preserves_trailing_newline() {
let result = rle_compress("a\na\na\n", 2).unwrap();
assert!(result.text.ends_with('\n'));
}
#[test]
fn test_rle_empty_input() {
let result = rle_compress("", 2).unwrap();
assert_eq!(result.text, "");
assert_eq!(result.runs_collapsed, 0);
}
#[test]
fn test_rle_single_line() {
let result = rle_compress("hello\n", 2).unwrap();
assert_eq!(result.runs_collapsed, 0);
}
#[test]
fn test_rle_min_run_length_respected() {
let input = "ok\nok\ndone\n";
let result_2 = rle_compress(input, 2).unwrap();
let result_3 = rle_compress(input, 3).unwrap();
assert!(result_2.runs_collapsed > 0, "run of 2 should collapse with min=2");
assert_eq!(result_3.runs_collapsed, 0, "run of 2 should NOT collapse with min=3");
}
#[test]
fn test_sliding_window_no_duplicates() {
let result = sliding_window_dedup("line 1\nline 2\nline 3\n", 3).unwrap();
assert_eq!(result.dedup_count, 0);
}
#[test]
fn test_sliding_window_exact_duplicate_lines() {
let input = "error: connection refused at port 8080\nsome other line\nerror: connection refused at port 8080\n";
let result = sliding_window_dedup(input, 3).unwrap();
assert!(result.dedup_count > 0, "should detect duplicate line");
assert!(result.text.contains("[→L1]"), "output: {}", result.text);
}
#[test]
fn test_sliding_window_preserves_first_occurrence() {
let input = "the error message here\nother stuff\nthe error message here\n";
let result = sliding_window_dedup(input, 3).unwrap();
assert!(result.text.starts_with("the error message here\n"));
}
#[test]
fn test_sliding_window_short_lines_ignored() {
let input = "ok\nok\nok\n";
let result = sliding_window_dedup(input, 3).unwrap();
assert_eq!(result.dedup_count, 0);
}
#[test]
fn test_sliding_window_empty_input() {
let result = sliding_window_dedup("", 3).unwrap();
assert_eq!(result.text, "");
}
use proptest::prelude::*;
proptest! {
#[test]
fn prop_rle_tokens_saved_non_negative(
text in "[a-z]{3,10}(\n[a-z]{3,10}){2,10}\n"
) {
let result = rle_compress(&text, 2).unwrap();
let _ = result.tokens_saved;
prop_assert!(!result.text.is_empty());
}
#[test]
fn prop_sliding_window_bounded(
text in "[a-z ]{10,50}(\n[a-z ]{10,50}){2,8}\n"
) {
let result = sliding_window_dedup(&text, 3).unwrap();
let line_count = text.lines().count();
prop_assert!(
result.dedup_count <= line_count,
"dedup count {} exceeds line count {}",
result.dedup_count, line_count
);
}
}
}