use crate::error::Result;
#[derive(Debug, Clone)]
pub struct RleResult {
pub text: String,
pub runs_collapsed: usize,
pub tokens_saved: u32,
}
pub fn rle_compress(text: &str, min_run_length: usize) -> Result<RleResult> {
let lines: Vec<&str> = text.lines().collect();
if lines.len() < 2 {
return Ok(RleResult {
text: text.to_string(),
runs_collapsed: 0,
tokens_saved: 0,
});
}
let mut output = Vec::new();
let mut runs_collapsed = 0usize;
let mut tokens_saved = 0u32;
let mut i = 0;
while i < lines.len() {
let mut run_len = 1;
while i + run_len < lines.len() && lines[i] == lines[i + run_len] {
run_len += 1;
}
if run_len >= min_run_length {
output.push(format!("{} [×{}]", lines[i], run_len));
runs_collapsed += 1;
let line_tokens = (lines[i].len() as u32 + 3) / 4;
tokens_saved += line_tokens * (run_len as u32 - 1);
i += run_len;
} else {
let pattern_run = detect_pattern_run(&lines[i..]);
if pattern_run.count >= min_run_length {
output.push(format!(
"{} [×{}, varying: {}]",
pattern_run.template, pattern_run.count, pattern_run.varying_part
));
runs_collapsed += 1;
let line_tokens = (lines[i].len() as u32 + 3) / 4;
tokens_saved += line_tokens * (pattern_run.count as u32 - 1);
i += pattern_run.count;
} else {
output.push(lines[i].to_string());
i += 1;
}
}
}
let trailing_newline = text.ends_with('\n');
let mut result = output.join("\n");
if trailing_newline && !result.ends_with('\n') {
result.push('\n');
}
Ok(RleResult {
text: result,
runs_collapsed,
tokens_saved,
})
}
struct PatternRun {
template: String,
count: usize,
varying_part: String,
}
fn detect_pattern_run(lines: &[&str]) -> PatternRun {
if lines.len() < 2 {
return PatternRun {
template: lines.first().unwrap_or(&"").to_string(),
count: 1,
varying_part: String::new(),
};
}
let first = lines[0];
let words_first: Vec<&str> = first.split_whitespace().collect();
if words_first.is_empty() {
return PatternRun {
template: first.to_string(),
count: 1,
varying_part: String::new(),
};
}
let second_words: Vec<&str> = lines[1].split_whitespace().collect();
let mut prefix_len = 0;
for w in 0..words_first.len().min(second_words.len()) {
if words_first[w] == second_words[w] {
prefix_len = w + 1;
} else {
break;
}
}
if prefix_len == 0 {
return PatternRun {
template: first.to_string(),
count: 1,
varying_part: String::new(),
};
}
let prefix: String = words_first[..prefix_len].join(" ");
let mut count = 1;
for line in &lines[1..] {
let words: Vec<&str> = line.split_whitespace().collect();
if words.len() >= prefix_len && words[..prefix_len].join(" ") == prefix {
count += 1;
} else {
break;
}
}
if count < 2 {
return PatternRun {
template: first.to_string(),
count: 1,
varying_part: String::new(),
};
}
let varying: Vec<String> = lines[..count]
.iter()
.map(|l| {
let words: Vec<&str> = l.split_whitespace().collect();
words[prefix_len..].join(" ")
})
.collect();
PatternRun {
template: format!("{prefix} ..."),
count,
varying_part: format!("{} unique values", varying.len()),
}
}
pub fn sliding_window_dedup(text: &str, min_match_words: usize) -> Result<SlidingWindowResult> {
let lines: Vec<&str> = text.lines().collect();
if lines.len() < 2 {
return Ok(SlidingWindowResult {
text: text.to_string(),
dedup_count: 0,
tokens_saved: 0,
});
}
let mut phrase_index: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
let mut output = Vec::new();
let mut dedup_count = 0usize;
let mut tokens_saved = 0u32;
for (line_idx, &line) in lines.iter().enumerate() {
let words: Vec<&str> = line.split_whitespace().collect();
let phrase = words.join(" ");
if phrase.len() >= min_match_words * 3 {
if let Some(&first_line) = phrase_index.get(&phrase) {
output.push(format!("[→L{}]", first_line + 1));
dedup_count += 1;
tokens_saved += (phrase.len() as u32 + 3) / 4;
continue;
}
}
if phrase.len() >= min_match_words * 3 {
phrase_index.insert(phrase, line_idx);
}
output.push(line.to_string());
}
let trailing_newline = text.ends_with('\n');
let mut result = output.join("\n");
if trailing_newline && !result.ends_with('\n') {
result.push('\n');
}
Ok(SlidingWindowResult {
text: result,
dedup_count,
tokens_saved,
})
}
#[derive(Debug, Clone)]
pub struct SlidingWindowResult {
pub text: String,
pub dedup_count: usize,
pub tokens_saved: u32,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_rle_no_repetition() {
let result = rle_compress("alpha\nbeta\ngamma\n", 2).unwrap();
assert_eq!(result.runs_collapsed, 0);
}
#[test]
fn test_rle_exact_repetition() {
let input = "ok\nok\nok\nok\nok\n";
let result = rle_compress(input, 2).unwrap();
assert!(result.runs_collapsed > 0);
assert!(result.text.contains("[×5]"), "output: {}", result.text);
}
#[test]
fn test_rle_pattern_repetition() {
let input = "Compiling foo v1.0\nCompiling bar v2.0\nCompiling baz v3.0\ndone\n";
let result = rle_compress(input, 2).unwrap();
assert!(result.runs_collapsed > 0, "should detect pattern run");
assert!(result.text.contains("×3"), "output: {}", result.text);
}
#[test]
fn test_rle_mixed_content() {
let input = "header\ntest a ... ok\ntest b ... ok\ntest c ... ok\nfooter\n";
let result = rle_compress(input, 2).unwrap();
assert!(result.text.contains("header"));
assert!(result.text.contains("footer"));
}
#[test]
fn test_rle_preserves_trailing_newline() {
let result = rle_compress("a\na\na\n", 2).unwrap();
assert!(result.text.ends_with('\n'));
}
#[test]
fn test_rle_empty_input() {
let result = rle_compress("", 2).unwrap();
assert_eq!(result.text, "");
assert_eq!(result.runs_collapsed, 0);
}
#[test]
fn test_rle_single_line() {
let result = rle_compress("hello\n", 2).unwrap();
assert_eq!(result.runs_collapsed, 0);
}
#[test]
fn test_rle_min_run_length_respected() {
let input = "ok\nok\ndone\n";
let result_2 = rle_compress(input, 2).unwrap();
let result_3 = rle_compress(input, 3).unwrap();
assert!(result_2.runs_collapsed > 0, "run of 2 should collapse with min=2");
assert_eq!(result_3.runs_collapsed, 0, "run of 2 should NOT collapse with min=3");
}
#[test]
fn test_sliding_window_no_duplicates() {
let result = sliding_window_dedup("line 1\nline 2\nline 3\n", 3).unwrap();
assert_eq!(result.dedup_count, 0);
}
#[test]
fn test_sliding_window_exact_duplicate_lines() {
let input = "error: connection refused at port 8080\nsome other line\nerror: connection refused at port 8080\n";
let result = sliding_window_dedup(input, 3).unwrap();
assert!(result.dedup_count > 0, "should detect duplicate line");
assert!(result.text.contains("[→L1]"), "output: {}", result.text);
}
#[test]
fn test_sliding_window_preserves_first_occurrence() {
let input = "the error message here\nother stuff\nthe error message here\n";
let result = sliding_window_dedup(input, 3).unwrap();
assert!(result.text.starts_with("the error message here\n"));
}
#[test]
fn test_sliding_window_short_lines_ignored() {
let input = "ok\nok\nok\n";
let result = sliding_window_dedup(input, 3).unwrap();
assert_eq!(result.dedup_count, 0);
}
#[test]
fn test_sliding_window_empty_input() {
let result = sliding_window_dedup("", 3).unwrap();
assert_eq!(result.text, "");
}
use proptest::prelude::*;
proptest! {
#[test]
fn prop_rle_tokens_saved_non_negative(
text in "[a-z]{3,10}(\n[a-z]{3,10}){2,10}\n"
) {
let result = rle_compress(&text, 2).unwrap();
let _ = result.tokens_saved;
prop_assert!(!result.text.is_empty());
}
#[test]
fn prop_sliding_window_bounded(
text in "[a-z ]{10,50}(\n[a-z ]{10,50}){2,8}\n"
) {
let result = sliding_window_dedup(&text, 3).unwrap();
let line_count = text.lines().count();
prop_assert!(
result.dedup_count <= line_count,
"dedup count {} exceeds line count {}",
result.dedup_count, line_count
);
}
}
}