mod extension;
mod groups;
mod hashing;
mod validation;
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use extension::{extend_backward, extend_forward, verify_extended_block};
use groups::build_group;
use hashing::{hash_location_set, hash_window};
use serde::Serialize;
use validation::validate_hashes;
/// How serious a detected duplication is.
///
/// The derived `Ord` follows declaration order, so `Critical < Tolerable`;
/// `detect_duplicates` relies on this when sorting groups (critical first).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize)]
pub enum DuplicationSeverity {
    Critical,
    Tolerable,
}
/// A single source line after normalization, together with its position
/// in the original file.
///
/// Derives added for consistency with the other public types in this file
/// (`DuplicateLocation`, `DuplicateGroup` are `Debug + Clone`).
#[derive(Debug, Clone)]
pub struct NormalizedLine {
    // Line number in the original (pre-normalization) file.
    // NOTE(review): presumably 1-based — confirm against the normalizer.
    pub original_line_number: usize,
    // The normalized text of the line.
    pub content: String,
}
/// A source file after normalization: its path plus the normalized lines.
pub struct NormalizedFile {
    // Path of the file on disk.
    pub path: PathBuf,
    // Normalized lines, indexed by the window offsets used in `hash_all_windows`.
    pub lines: Vec<NormalizedLine>,
}
/// One occurrence of a duplicated block: a file plus a line span.
#[derive(Debug, Clone, Serialize)]
pub struct DuplicateLocation {
    // File containing this occurrence.
    pub file_path: PathBuf,
    // Span of the duplicated block.
    // NOTE(review): whether these are 0- or 1-based and whether `end_line`
    // is inclusive is decided in `groups::build_group` — confirm there.
    pub start_line: usize,
    pub end_line: usize,
}
/// A set of locations that all contain the same (normalized) block of code.
#[derive(Debug, Clone, Serialize)]
pub struct DuplicateGroup {
    // Every place the block occurs (presumably at least two for a real group).
    pub locations: Vec<DuplicateLocation>,
    // Length of the duplicated block, in lines.
    pub line_count: usize,
    // Excerpt of the duplicated content, for display.
    pub sample: Vec<String>,
    // Severity classification; drives the sort order in `detect_duplicates`.
    pub severity: DuplicationSeverity,
}
impl DuplicateGroup {
    /// Number of lines that could be removed by deduplicating this group:
    /// every occurrence beyond the first contributes `line_count` lines.
    ///
    /// Uses `saturating_sub` so a degenerate group with zero locations
    /// yields 0 instead of panicking on underflow in debug builds (or
    /// wrapping to a huge count in release builds).
    pub fn duplicated_lines(&self) -> usize {
        self.line_count * self.locations.len().saturating_sub(1)
    }
}
// Cap on how many occurrences of a single hash are tracked.
// NOTE(review): not referenced in this file; presumably consumed by the
// `validation`/`groups` submodules via `super::` — confirm before removing.
const MAX_OCCURRENCES: usize = 100;
// All positions sharing one window hash, as (file index, line offset) pairs
// (indices into the `files` slice and into that file's `lines`, respectively).
type LocationSet = Vec<(usize, usize)>;
/// Hash every `min_lines`-long window of every file and bucket the window
/// positions by hash value.
///
/// Returns a map from window hash to the `(file index, start offset)` pairs
/// at which that window occurs. Files shorter than one window contribute
/// nothing. (Deliberately not `slice::windows`, which panics on size 0.)
fn hash_all_windows(files: &[NormalizedFile], min_lines: usize) -> HashMap<u64, LocationSet> {
    let mut buckets: HashMap<u64, LocationSet> = HashMap::new();
    for (file_idx, file) in files.iter().enumerate() {
        let total = file.lines.len();
        // Skip files too short to hold even a single window.
        if total < min_lines {
            continue;
        }
        let last_start = total - min_lines;
        for start in 0..=last_start {
            let digest = hash_window(&file.lines[start..start + min_lines]);
            buckets.entry(digest).or_default().push((file_idx, start));
        }
    }
    buckets
}
/// Find groups of duplicated line blocks across `files`.
///
/// Pipeline: hash every `min_lines` window, validate the candidate hashes,
/// then greedily extend each surviving match backward and forward before
/// re-verifying and packaging it as a [`DuplicateGroup`]. Results are sorted
/// by severity (critical first), then by lines saved, descending.
pub fn detect_duplicates(
    files: &[NormalizedFile],
    min_lines: usize,
    quiet: bool,
) -> Vec<DuplicateGroup> {
    let window_hashes = hash_all_windows(files, min_lines);
    let (location_to_hash, valid_hashes) =
        validate_hashes(window_hashes, files, min_lines, quiet);

    // Location sets already absorbed into an earlier (possibly extended) group.
    let mut seen_location_sets: HashSet<u64> = HashSet::new();
    let mut results: Vec<DuplicateGroup> = Vec::new();
    for (_hash, locations) in &valid_hashes {
        let set_key = hash_location_set(locations);
        // `insert` returns false when the key was already present.
        if !seen_location_sets.insert(set_key) {
            continue;
        }
        // Grow the match in both directions (each extension marks the sets it
        // consumes), then confirm the merged block really matches everywhere.
        let (start_locs, grown_back) =
            extend_backward(locations, &location_to_hash, &mut seen_location_sets);
        let grown_forward =
            extend_forward(locations, &location_to_hash, &mut seen_location_sets);
        let candidate_size = min_lines + grown_back + grown_forward;
        let confirmed_size = verify_extended_block(files, &start_locs, candidate_size);
        results.push(build_group(files, &start_locs, confirmed_size));
    }

    // Critical groups first; within a severity, biggest savings first.
    results.sort_by(|a, b| {
        a.severity
            .cmp(&b.severity)
            .then_with(|| b.duplicated_lines().cmp(&a.duplicated_lines()))
    });
    results
}
#[cfg(test)]
#[path = "../detector_test.rs"]
mod tests;