use std::collections::hash_map::DefaultHasher;
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
/// Outcome of a repetition analysis over a block of text.
#[derive(Debug, Clone, PartialEq)]
pub struct RepetitionResult {
    /// Share of lines, expressed on a 0–100 scale, that belong to at least
    /// one window of consecutive lines occurring more than once.
    pub percentage: f64,
    /// One hash per input line, computed over the trimmed line text. Empty
    /// when the analysis was skipped (empty input, zero window, or fewer
    /// lines than the window size).
    pub hashes: Vec<u64>,
}
/// Measures how much of `content` is made up of repeated runs of lines.
///
/// Every line is trimmed and hashed. Each run of `window_size` consecutive
/// line hashes that appears at more than one position marks all of its lines
/// as duplicated; the returned percentage is the number of distinct
/// duplicated lines divided by the total line count, times 100.
///
/// A result with `percentage == 0.0` and no hashes is returned when `content`
/// is empty, `window_size` is zero, or `content` has fewer lines than
/// `window_size`.
#[must_use]
pub fn analyze_repetition(content: &str, window_size: usize) -> RepetitionResult {
    // Shared shape of every "nothing to analyze" early return.
    let no_result = || RepetitionResult {
        percentage: 0.0,
        hashes: Vec::new(),
    };

    if window_size == 0 || content.is_empty() {
        return no_result();
    }

    let lines: Vec<&str> = content.lines().collect();
    let total_lines = lines.len();
    if total_lines < window_size {
        return no_result();
    }

    // Hash the trimmed text so lines differing only in surrounding
    // whitespace compare equal.
    let hashes: Vec<u64> = lines
        .iter()
        .map(|line| {
            let mut hasher = DefaultHasher::new();
            line.trim().hash(&mut hasher);
            hasher.finish()
        })
        .collect();

    let chunks = get_chunks(&hashes, window_size);

    // Collect every line index covered by a window that occurs more than
    // once. Only the number of distinct indices is used below, so the base
    // of the positions reported by `get_chunks` does not affect the result.
    let mut duplicated_lines: HashSet<usize> = HashSet::new();
    for starts in chunks.values().filter(|starts| starts.len() > 1) {
        for &start in starts {
            duplicated_lines.extend(start..start + window_size);
        }
    }

    // Line counts are far below 2^53, so the usize -> f64 casts are exact in
    // practice; the lint is silenced deliberately.
    #[allow(clippy::cast_precision_loss)]
    let percentage = {
        let repeated = duplicated_lines.len() as f64;
        let total = total_lines as f64;
        (repeated / total) * 100.0
    };

    RepetitionResult { percentage, hashes }
}
/// Groups every run of `window_size` consecutive hashes by its contents.
///
/// Returns a map from each distinct window to the start positions at which it
/// occurs. Positions are 1-based (the window starting at `hashes[0]` is
/// reported as position `1`) and appear in ascending order.
///
/// An empty map is returned when `window_size` is zero or larger than
/// `hashes.len()`.
#[must_use]
pub fn get_chunks(hashes: &[u64], window_size: usize) -> HashMap<Vec<u64>, Vec<usize>> {
    // A zero-width window carries no information, and `slice::windows`
    // panics on a size of zero, so bail out before iterating.
    if window_size == 0 {
        return HashMap::new();
    }

    let mut chunks: HashMap<Vec<u64>, Vec<usize>> = HashMap::new();
    // `windows` yields nothing when the slice is shorter than the window, so
    // no separate length check is required.
    for (index, window) in hashes.windows(window_size).enumerate() {
        chunks.entry(window.to_vec()).or_default().push(index + 1);
    }
    chunks
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Seven lines in which the three-line run `a, b, c` occurs twice.
    const SAMPLE: &str = "a\nb\nc\na\nb\nc\nd";

    /// A repeated three-line run must register as repetition, and one hash
    /// must be produced per input line.
    #[test]
    fn test_repetition() {
        let outcome = analyze_repetition(SAMPLE, 3);

        assert!(outcome.percentage > 0.0);
        assert_eq!(outcome.hashes.len(), 7);
    }
}