use std::collections::HashMap;
use syn::spanned::Spanned;
use syn::visit::Visit;
use super::{has_cfg_test, has_test_attr, qualify_name, FileVisitor, FunctionHashEntry};
use crate::config::sections::DuplicatesConfig;
pub(crate) struct FunctionCollector<'a> {
pub(crate) config: &'a DuplicatesConfig,
pub(crate) file: String,
pub(crate) entries: Vec<FunctionHashEntry>,
in_test: bool,
parent_type: Option<String>,
is_trait_impl: bool,
}
impl<'a> FunctionCollector<'a> {
pub(crate) fn new(config: &'a DuplicatesConfig) -> Self {
Self {
config,
file: String::new(),
entries: Vec::new(),
in_test: false,
parent_type: None,
is_trait_impl: false,
}
}
}
impl FileVisitor for FunctionCollector<'_> {
fn reset_for_file(&mut self, file_path: &str) {
self.file = file_path.to_string();
self.in_test = false;
self.parent_type = None;
self.is_trait_impl = false;
}
}
impl FunctionCollector<'_> {
fn build_hash_entry(
&self,
name: &str,
line: usize,
body: &syn::Block,
is_test_fn: bool,
is_trait_impl: bool,
) -> Option<FunctionHashEntry> {
let is_test = self.in_test || is_test_fn;
if self.config.ignore_tests && is_test {
return None;
}
if self.config.ignore_trait_impls && is_trait_impl {
return None;
}
let compute = |b: &syn::Block| {
let tokens = crate::normalize::normalize_body(b);
let hash = crate::normalize::structural_hash(&tokens);
(tokens, hash)
};
let (tokens, hash) = compute(body);
if tokens.len() < self.config.min_tokens {
return None;
}
let span = body.span();
let line_count = span.end().line.saturating_sub(span.start().line) + 1;
if line_count < self.config.min_lines {
return None;
}
let qualify = |parent: &Option<String>, n: &str| qualify_name(parent, n);
let qualified_name = qualify(&self.parent_type, name);
Some(FunctionHashEntry {
name: name.to_string(),
qualified_name,
file: self.file.clone(),
line,
hash,
token_count: tokens.len(),
tokens,
})
}
}
impl<'ast> Visit<'ast> for FunctionCollector<'_> {
fn visit_item_fn(&mut self, node: &'ast syn::ItemFn) {
let name = node.sig.ident.to_string();
let line = node.sig.ident.span().start().line;
let is_test = has_test_attr(&node.attrs);
if let Some(entry) = self.build_hash_entry(&name, line, &node.block, is_test, false) {
self.entries.push(entry);
}
syn::visit::visit_item_fn(self, node);
}
fn visit_item_impl(&mut self, node: &'ast syn::ItemImpl) {
let prev_parent = self.parent_type.take();
let prev_is_trait = self.is_trait_impl;
let prev_in_test = self.in_test;
if has_cfg_test(&node.attrs) {
self.in_test = true;
}
self.is_trait_impl = node.trait_.is_some();
if let syn::Type::Path(tp) = &*node.self_ty {
if let Some(seg) = tp.path.segments.last() {
self.parent_type = Some(seg.ident.to_string());
}
}
syn::visit::visit_item_impl(self, node);
self.parent_type = prev_parent;
self.is_trait_impl = prev_is_trait;
self.in_test = prev_in_test;
}
fn visit_impl_item_fn(&mut self, node: &'ast syn::ImplItemFn) {
let name = node.sig.ident.to_string();
let line = node.sig.ident.span().start().line;
let is_test = has_test_attr(&node.attrs);
if let Some(entry) =
self.build_hash_entry(&name, line, &node.block, is_test, self.is_trait_impl)
{
self.entries.push(entry);
}
}
fn visit_trait_item_fn(&mut self, node: &'ast syn::TraitItemFn) {
if let Some(ref block) = node.default {
let name = node.sig.ident.to_string();
let line = node.sig.ident.span().start().line;
if let Some(entry) = self.build_hash_entry(&name, line, block, false, true) {
self.entries.push(entry);
}
}
}
fn visit_item_mod(&mut self, node: &'ast syn::ItemMod) {
let prev_in_test = self.in_test;
if has_cfg_test(&node.attrs) {
self.in_test = true;
}
syn::visit::visit_item_mod(self, node);
self.in_test = prev_in_test;
}
}
const NEAR_DUP_BUCKET_SIZE: usize = 10;
const MAX_BUCKET_SIZE: usize = 50;
#[derive(Debug, Clone)]
pub struct DuplicateGroup {
pub entries: Vec<DuplicateEntry>,
pub kind: DuplicateKind,
pub suppressed: bool,
}
#[derive(Debug, Clone)]
pub struct DuplicateEntry {
pub name: String,
pub qualified_name: String,
pub file: String,
pub line: usize,
}
#[derive(Debug, Clone)]
pub enum DuplicateKind {
Exact,
NearDuplicate { similarity: f64 },
}
pub fn detect_duplicates(
parsed: &[(String, String, syn::File)],
config: &DuplicatesConfig,
) -> Vec<DuplicateGroup> {
let entries = super::collect_function_hashes(parsed, config);
let (exact, remaining_indices) = group_exact_duplicates(&entries);
let near = find_near_duplicates(&entries, &remaining_indices, config.similarity_threshold);
merge_groups(exact, near)
}
fn merge_groups(mut exact: Vec<DuplicateGroup>, near: Vec<DuplicateGroup>) -> Vec<DuplicateGroup> {
exact.extend(near);
exact
}
fn group_exact_duplicates(entries: &[FunctionHashEntry]) -> (Vec<DuplicateGroup>, Vec<usize>) {
let mut hash_groups: HashMap<u64, Vec<usize>> = HashMap::new();
for (i, entry) in entries.iter().enumerate() {
hash_groups.entry(entry.hash).or_default().push(i);
}
let mut groups = Vec::new();
let mut remaining = Vec::new();
for indices in hash_groups.values() {
if indices.len() >= 2 {
let group_entries: Vec<DuplicateEntry> = indices
.iter()
.map(|&i| DuplicateEntry {
name: entries[i].name.clone(),
qualified_name: entries[i].qualified_name.clone(),
file: entries[i].file.clone(),
line: entries[i].line,
})
.collect();
groups.push(DuplicateGroup {
entries: group_entries,
kind: DuplicateKind::Exact,
suppressed: false,
});
} else {
remaining.extend(indices);
}
}
(groups, remaining)
}
fn find_near_duplicates(
entries: &[FunctionHashEntry],
candidate_indices: &[usize],
threshold: f64,
) -> Vec<DuplicateGroup> {
let mut buckets: HashMap<usize, Vec<usize>> = HashMap::new();
for &idx in candidate_indices {
let bucket_key = entries[idx].token_count / NEAR_DUP_BUCKET_SIZE;
buckets.entry(bucket_key).or_default().push(idx);
}
let compute_sim = |a: &[crate::normalize::NormalizedToken],
b: &[crate::normalize::NormalizedToken]|
-> f64 { crate::normalize::jaccard_similarity(a, b) };
let mut groups = Vec::new();
for indices in buckets.values() {
if indices.len() < 2 || indices.len() > MAX_BUCKET_SIZE {
continue;
}
for i in 0..indices.len() {
for j in (i + 1)..indices.len() {
let a = indices[i];
let b = indices[j];
let sim = compute_sim(&entries[a].tokens, &entries[b].tokens);
if sim >= threshold {
groups.push(DuplicateGroup {
entries: vec![
DuplicateEntry {
name: entries[a].name.clone(),
qualified_name: entries[a].qualified_name.clone(),
file: entries[a].file.clone(),
line: entries[a].line,
},
DuplicateEntry {
name: entries[b].name.clone(),
qualified_name: entries[b].qualified_name.clone(),
file: entries[b].file.clone(),
line: entries[b].line,
},
],
kind: DuplicateKind::NearDuplicate { similarity: sim },
suppressed: false,
});
}
}
}
}
groups
}
#[cfg(test)]
mod tests {
use super::*;
fn parse(code: &str) -> Vec<(String, String, syn::File)> {
let syntax = syn::parse_file(code).expect("parse failed");
vec![("test.rs".to_string(), code.to_string(), syntax)]
}
fn parse_multi(files: &[(&str, &str)]) -> Vec<(String, String, syn::File)> {
files
.iter()
.map(|(name, code)| {
let syntax = syn::parse_file(code).expect("parse failed");
(name.to_string(), code.to_string(), syntax)
})
.collect()
}
const TEST_MIN_TOKENS: usize = 3;
fn low_threshold_config() -> DuplicatesConfig {
DuplicatesConfig {
min_tokens: TEST_MIN_TOKENS,
min_lines: 1,
..DuplicatesConfig::default()
}
}
#[test]
fn test_detect_duplicates_no_functions() {
let parsed = parse("");
let config = low_threshold_config();
let groups = detect_duplicates(&parsed, &config);
assert!(groups.is_empty());
}
#[test]
fn test_detect_duplicates_no_duplicates() {
let code = r#"
fn foo() { let x = 1; let y = x + 2; let z = y * x; }
fn bar() { let a = "hello"; let b = a.len(); if b > 0 { return; } }
"#;
let parsed = parse(code);
let config = low_threshold_config();
let groups = detect_duplicates(&parsed, &config);
assert!(
groups.is_empty(),
"Different functions should not be duplicates"
);
}
#[test]
fn test_detect_exact_duplicates_same_structure() {
let parsed = parse_multi(&[
(
"a.rs",
"fn process_a() { let x = 1; let y = x + 2; let z = y * x; }",
),
(
"b.rs",
"fn process_b() { let a = 1; let b = a + 2; let c = b * a; }",
),
]);
let config = low_threshold_config();
let groups = detect_duplicates(&parsed, &config);
assert_eq!(groups.len(), 1, "Should detect one exact duplicate group");
assert!(matches!(groups[0].kind, DuplicateKind::Exact));
assert_eq!(groups[0].entries.len(), 2);
}
#[test]
fn test_detect_duplicates_different_structure() {
let parsed = parse_multi(&[
(
"a.rs",
"fn add() { let x = 1; let y = x + 2; let z = y + x; }",
),
(
"b.rs",
"fn mul() { let a = 1; let b = a * 2; let c = b * a; }",
),
]);
let config = low_threshold_config();
let groups = detect_duplicates(&parsed, &config);
let exact_groups: Vec<_> = groups
.iter()
.filter(|g| matches!(g.kind, DuplicateKind::Exact))
.collect();
assert!(
exact_groups.is_empty(),
"Different operators should not be exact duplicates"
);
}
#[test]
fn test_detect_duplicates_below_min_tokens_excluded() {
let parsed = parse_multi(&[
("a.rs", "fn tiny_a() { let x = 1; }"),
("b.rs", "fn tiny_b() { let y = 1; }"),
]);
let config = DuplicatesConfig::default(); let groups = detect_duplicates(&parsed, &config);
assert!(
groups.is_empty(),
"Small functions below min_tokens should be excluded"
);
}
#[test]
fn test_detect_duplicates_test_functions_excluded() {
let code = r#"
#[cfg(test)]
mod tests {
fn helper_a() { let x = 1; let y = x + 2; let z = y * x; }
fn helper_b() { let a = 1; let b = a + 2; let c = b * a; }
}
"#;
let parsed = parse(code);
let mut config = low_threshold_config();
config.ignore_tests = true;
let groups = detect_duplicates(&parsed, &config);
assert!(
groups.is_empty(),
"Test functions should be excluded when ignore_tests=true"
);
}
#[test]
fn test_detect_duplicates_test_functions_included() {
let code = r#"
#[cfg(test)]
mod tests {
fn helper_a() { let x = 1; let y = x + 2; let z = y * x; }
fn helper_b() { let a = 1; let b = a + 2; let c = b * a; }
}
"#;
let parsed = parse(code);
let mut config = low_threshold_config();
config.ignore_tests = false;
let groups = detect_duplicates(&parsed, &config);
assert_eq!(
groups.len(),
1,
"Test functions should be included when ignore_tests=false"
);
}
#[test]
fn test_detect_duplicates_three_way() {
let parsed = parse_multi(&[
(
"a.rs",
"fn func_a() { let x = 1; let y = x + 2; let z = y * x; }",
),
(
"b.rs",
"fn func_b() { let a = 1; let b = a + 2; let c = b * a; }",
),
(
"c.rs",
"fn func_c() { let p = 1; let q = p + 2; let r = q * p; }",
),
]);
let config = low_threshold_config();
let groups = detect_duplicates(&parsed, &config);
assert_eq!(groups.len(), 1);
assert_eq!(groups[0].entries.len(), 3, "Should detect 3-way duplicate");
}
#[test]
fn test_detect_duplicates_config_disabled() {
let parsed = parse_multi(&[
(
"a.rs",
"fn func_a() { let x = 1; let y = x + 2; let z = y * x; }",
),
(
"b.rs",
"fn func_b() { let a = 1; let b = a + 2; let c = b * a; }",
),
]);
let mut config = low_threshold_config();
config.enabled = false;
let groups = detect_duplicates(&parsed, &config);
assert!(!groups.is_empty());
}
#[test]
fn test_group_exact_duplicates_returns_remaining() {
let entries = vec![
FunctionHashEntry {
name: "a".into(),
qualified_name: "a".into(),
file: "a.rs".into(),
line: 1,
hash: 100,
token_count: 10,
tokens: vec![],
},
FunctionHashEntry {
name: "b".into(),
qualified_name: "b".into(),
file: "b.rs".into(),
line: 1,
hash: 200,
token_count: 10,
tokens: vec![],
},
FunctionHashEntry {
name: "c".into(),
qualified_name: "c".into(),
file: "c.rs".into(),
line: 1,
hash: 100,
token_count: 10,
tokens: vec![],
},
];
let (groups, remaining) = group_exact_duplicates(&entries);
assert_eq!(groups.len(), 1); assert_eq!(remaining.len(), 1); assert_eq!(remaining[0], 1); }
#[test]
fn test_detect_near_duplicates_high_similarity() {
let parsed = parse_multi(&[
(
"a.rs",
"fn func_a() { let x = 1; let y = x + 2; let z = y * x; let w = z + 1; }",
),
(
"b.rs",
"fn func_b() { let x = 1; let y = x + 2; let z = y * x; let w = z - 1; }",
),
]);
let mut config = low_threshold_config();
config.similarity_threshold = 0.80;
let groups = detect_duplicates(&parsed, &config);
let near_groups: Vec<_> = groups
.iter()
.filter(|g| matches!(g.kind, DuplicateKind::NearDuplicate { .. }))
.collect();
if !near_groups.is_empty() {
let DuplicateKind::NearDuplicate { similarity } = near_groups[0].kind else {
panic!("expected near duplicate");
};
assert!(similarity >= 0.80);
}
}
#[test]
fn test_duplicate_entry_has_file_and_line() {
let parsed = parse_multi(&[
(
"module_a.rs",
"fn func_a() { let x = 1; let y = x + 2; let z = y * x; }",
),
(
"module_b.rs",
"fn func_b() { let a = 1; let b = a + 2; let c = b * a; }",
),
]);
let config = low_threshold_config();
let groups = detect_duplicates(&parsed, &config);
assert!(!groups.is_empty());
for entry in &groups[0].entries {
assert!(!entry.file.is_empty());
assert!(!entry.name.is_empty());
}
}
}