use std::collections::HashMap;
use crate::code_unit::{CodeUnit, CodeUnitKind};
use crate::fingerprint::Fingerprint;
use crate::similarity;
#[derive(Debug, Clone)]
pub struct DuplicateGroup {
pub fingerprint: Fingerprint,
pub members: Vec<CodeUnit>,
pub similarity: f64,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct DuplicationStats {
pub total_code_units: usize,
pub total_lines: usize,
pub exact_duplicate_groups: usize,
pub exact_duplicate_units: usize,
pub near_duplicate_groups: usize,
pub near_duplicate_units: usize,
pub exact_duplicate_lines: usize,
pub near_duplicate_lines: usize,
pub sub_exact_groups: usize,
pub sub_exact_units: usize,
pub sub_near_groups: usize,
pub sub_near_units: usize,
}
impl DuplicationStats {
fn percent_of_total(&self, lines: usize) -> f64 {
if self.total_lines == 0 {
0.0
} else {
lines as f64 / self.total_lines as f64 * 100.0
}
}
#[must_use]
pub fn exact_duplicate_percent(&self) -> f64 {
self.percent_of_total(self.exact_duplicate_lines)
}
#[must_use]
pub fn near_duplicate_percent(&self) -> f64 {
self.percent_of_total(self.near_duplicate_lines)
}
}
#[must_use]
pub fn group_exact_duplicates(units: &[CodeUnit]) -> Vec<DuplicateGroup> {
let mut groups: HashMap<Fingerprint, Vec<CodeUnit>> = HashMap::new();
for unit in units {
groups
.entry(unit.fingerprint)
.or_default()
.push(unit.clone());
}
let mut result: Vec<DuplicateGroup> = groups
.into_iter()
.filter(|(_, members)| members.len() > 1)
.map(|(fp, members)| DuplicateGroup {
fingerprint: fp,
members,
similarity: 1.0,
})
.collect();
result.sort_by(|a, b| {
b.members
.len()
.cmp(&a.members.len())
.then_with(|| a.fingerprint.cmp(&b.fingerprint))
});
result
}
#[must_use]
pub fn find_near_duplicates(
units: &[CodeUnit],
threshold: f64,
exact_fingerprints: &[Fingerprint],
) -> Vec<DuplicateGroup> {
let exact_set: std::collections::HashSet<Fingerprint> =
exact_fingerprints.iter().copied().collect();
let candidates: Vec<&CodeUnit> = units
.iter()
.filter(|u| !exact_set.contains(&u.fingerprint))
.collect();
if candidates.len() < 2 {
return Vec::new();
}
let mut buckets: HashMap<(CodeUnitKind, usize), Vec<&CodeUnit>> = HashMap::new();
for unit in &candidates {
let size_bucket = if unit.node_count == 0 {
0
} else {
(unit.node_count as f64).log2().floor() as usize
};
buckets
.entry((unit.kind.clone(), size_bucket))
.or_default()
.push(unit);
}
let mut pairs: Vec<(usize, usize, f64)> = Vec::new();
let unit_indices: HashMap<*const CodeUnit, usize> = candidates
.iter()
.enumerate()
.map(|(i, u)| (std::ptr::from_ref::<CodeUnit>(*u), i))
.collect();
for bucket in buckets.values() {
if bucket.len() < 2 {
continue;
}
for i in 0..bucket.len() {
for j in (i + 1)..bucket.len() {
let score = similarity::similarity_score(&bucket[i].body, &bucket[j].body);
if score >= threshold {
let idx_i = unit_indices[&std::ptr::from_ref::<CodeUnit>(bucket[i])];
let idx_j = unit_indices[&std::ptr::from_ref::<CodeUnit>(bucket[j])];
pairs.push((idx_i, idx_j, score));
}
}
}
}
let mut parent: Vec<usize> = (0..candidates.len()).collect();
let mut scores: HashMap<(usize, usize), f64> = HashMap::new();
for &(i, j, score) in &pairs {
union(&mut parent, i, j);
let key = (i.min(j), i.max(j));
scores.insert(key, score);
}
let mut group_map: HashMap<usize, Vec<usize>> = HashMap::new();
for i in 0..candidates.len() {
let root = find(&mut parent, i);
group_map.entry(root).or_default().push(i);
}
let mut result: Vec<DuplicateGroup> = group_map
.into_values()
.filter(|members| members.len() > 1)
.map(|member_indices| {
let mut min_score = f64::INFINITY;
for &i in &member_indices {
for &j in &member_indices {
if i < j
&& let Some(&s) = scores.get(&(i, j))
&& s < min_score
{
min_score = s;
}
}
}
let members: Vec<CodeUnit> = member_indices
.iter()
.map(|&i| candidates[i].clone())
.collect();
let member_fps: Vec<Fingerprint> = members.iter().map(|m| m.fingerprint).collect();
let composite_fp = Fingerprint::from_fingerprints(&member_fps);
DuplicateGroup {
fingerprint: composite_fp,
members,
similarity: if min_score.is_infinite() {
threshold
} else {
min_score
},
}
})
.collect();
result.sort_by(|a, b| {
b.members
.len()
.cmp(&a.members.len())
.then_with(|| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
})
.then_with(|| a.fingerprint.cmp(&b.fingerprint))
});
result
}
fn group_line_count(group: &DuplicateGroup) -> usize {
group
.members
.iter()
.map(|m| m.line_end.saturating_sub(m.line_start) + 1)
.sum()
}
pub fn compute_stats(
units: &[CodeUnit],
exact_groups: &[DuplicateGroup],
near_groups: &[DuplicateGroup],
) -> DuplicationStats {
let total_lines: usize = units
.iter()
.map(|u| u.line_end.saturating_sub(u.line_start) + 1)
.sum();
DuplicationStats {
total_code_units: units.len(),
total_lines,
exact_duplicate_groups: exact_groups.len(),
exact_duplicate_units: exact_groups.iter().map(|g| g.members.len()).sum(),
near_duplicate_groups: near_groups.len(),
near_duplicate_units: near_groups.iter().map(|g| g.members.len()).sum(),
exact_duplicate_lines: exact_groups.iter().map(group_line_count).sum(),
near_duplicate_lines: near_groups.iter().map(group_line_count).sum(),
sub_exact_groups: 0,
sub_exact_units: 0,
sub_near_groups: 0,
sub_near_units: 0,
}
}
#[must_use]
pub fn compute_stats_with_sub(
units: &[CodeUnit],
exact_groups: &[DuplicateGroup],
near_groups: &[DuplicateGroup],
sub_exact_groups: &[DuplicateGroup],
sub_near_groups: &[DuplicateGroup],
) -> DuplicationStats {
let mut stats = compute_stats(units, exact_groups, near_groups);
stats.sub_exact_groups = sub_exact_groups.len();
stats.sub_exact_units = sub_exact_groups.iter().map(|g| g.members.len()).sum();
stats.sub_near_groups = sub_near_groups.len();
stats.sub_near_units = sub_near_groups.iter().map(|g| g.members.len()).sum();
stats
}
fn find(parent: &mut [usize], i: usize) -> usize {
if parent[i] != i {
parent[i] = find(parent, parent[i]);
}
parent[i]
}
fn union(parent: &mut [usize], i: usize, j: usize) {
let ri = find(parent, i);
let rj = find(parent, j);
if ri != rj {
parent[ri] = rj;
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn empty_input_no_groups() {
let groups = group_exact_duplicates(&[]);
assert!(groups.is_empty());
}
#[test]
fn percentage_helpers() {
let stats = DuplicationStats {
total_code_units: 10,
total_lines: 200,
exact_duplicate_groups: 2,
exact_duplicate_units: 4,
near_duplicate_groups: 1,
near_duplicate_units: 3,
exact_duplicate_lines: 50,
near_duplicate_lines: 30,
sub_exact_groups: 0,
sub_exact_units: 0,
sub_near_groups: 0,
sub_near_units: 0,
};
assert!((stats.exact_duplicate_percent() - 25.0).abs() < f64::EPSILON);
assert!((stats.near_duplicate_percent() - 15.0).abs() < f64::EPSILON);
}
#[test]
fn percentage_helpers_zero_total() {
let stats = DuplicationStats {
total_code_units: 0,
total_lines: 0,
exact_duplicate_groups: 0,
exact_duplicate_units: 0,
near_duplicate_groups: 0,
near_duplicate_units: 0,
exact_duplicate_lines: 0,
near_duplicate_lines: 0,
sub_exact_groups: 0,
sub_exact_units: 0,
sub_near_groups: 0,
sub_near_units: 0,
};
assert!((stats.exact_duplicate_percent() - 0.0).abs() < f64::EPSILON);
assert!((stats.near_duplicate_percent() - 0.0).abs() < f64::EPSILON);
}
}