use crate::analysis::Workspace;
use crate::config::DuplicateLogicConfig;
use crate::emit::Emitter;
use crate::report::Finding;
use crate::rules::{Rule, RuleBackend, RuleContext, RuleFamily, RuleInfo};
use crate::span::Span;
use quote::ToTokens;
use similar::TextDiff;
use std::collections::{HashMap, HashSet, hash_map::DefaultHasher};
use std::hash::{Hash, Hasher};
use std::path::Path;
use syn::spanned::Spanned;
pub struct DuplicateLogicRule;
impl DuplicateLogicRule {
pub fn static_info() -> RuleInfo {
RuleInfo {
id: "shape.duplicate_logic",
family: RuleFamily::Shape,
backend: RuleBackend::Syntax,
summary: "Reports similar function bodies with token fingerprints.",
default_level: DuplicateLogicConfig::default().level,
schema: "level, min_tokens, threshold, max_results, exclude_globs, kgram",
config_example: "[rules.\"shape.duplicate_logic\"]\nlevel = \"warn\"\nmin_tokens = 80\nthreshold = 0.8",
fixable: false,
}
}
}
impl Rule for DuplicateLogicRule {
fn info(&self) -> RuleInfo {
Self::static_info()
}
fn run(&self, ws: &Workspace, ctx: &RuleContext<'_>, out: &mut dyn Emitter) {
let cfg = match ctx
.policy
.decode_rule::<DuplicateLogicConfig>(Self::static_info().id, None)
{
Ok(cfg) => cfg,
Err(_) => return,
};
let severity = cfg.level.to_severity();
let exclude = build_exclude(&cfg.exclude_globs);
let mut functions = Vec::new();
for file in &ws.files {
if exclude.is_match(file.path.to_string_lossy().as_ref()) {
continue;
}
let Some(ast) = &file.ast else { continue };
collect_functions(&file.path, ast, &mut functions);
}
let mut fingerprints: Vec<HashSet<u64>> = Vec::with_capacity(functions.len());
for f in &functions {
let set = fingerprint(&f.norm_tokens, cfg.kgram);
fingerprints.push(set);
}
let mut index: HashMap<u64, Vec<usize>> = HashMap::new();
for (i, set) in fingerprints.iter().enumerate() {
for h in set {
index.entry(*h).or_default().push(i);
}
}
let mut shared: HashMap<(usize, usize), u32> = HashMap::new();
for (_h, ids) in index {
if ids.len() < 2 {
continue;
}
for a in 0..ids.len() {
for b in (a + 1)..ids.len() {
let i = ids[a];
let j = ids[b];
let key = if i < j { (i, j) } else { (j, i) };
*shared.entry(key).or_insert(0) += 1;
}
}
}
let mut matches = Vec::new();
for ((i, j), _shared) in shared {
let a = &functions[i];
let b = &functions[j];
if a.norm_tokens.len() < cfg.min_tokens || b.norm_tokens.len() < cfg.min_tokens {
continue;
}
let sa = &fingerprints[i];
let sb = &fingerprints[j];
if sa.is_empty() || sb.is_empty() {
continue;
}
let inter = sa.intersection(sb).count() as f32;
let union = (sa.len() + sb.len()) as f32 - inter;
let sim = if union <= 0.0 { 0.0 } else { inter / union };
if sim >= cfg.threshold {
matches.push((sim, i, j));
}
}
matches.sort_by(|a, b| b.0.total_cmp(&a.0));
matches.truncate(cfg.max_results);
for (sim, i, j) in matches {
let a = &functions[i];
let b = &functions[j];
let evidence = side_by_side(&a.source, &b.source);
out.emit(Finding {
rule_id: Self::static_info().id.to_string(),
family: Some(Self::static_info().family),
engine: Some(Self::static_info().backend),
severity,
message: format!(
"duplicate logic: {:.0}% similarity between `{}` and `{}`",
sim * 100.0,
a.name,
b.name
),
primary: Some(a.span.clone()),
secondary: vec![b.span.clone()],
help: Some("Extract shared code or delete one copy.".to_string()),
evidence: Some(evidence),
confidence: None,
tags: vec!["duplication".to_string()],
labels: Vec::new(),
notes: Vec::new(),
fixes: Vec::new(),
});
}
}
}
fn build_exclude(globs: &[String]) -> globset::GlobSet {
let mut b = globset::GlobSetBuilder::new();
for g in globs {
if let Ok(glob) = globset::Glob::new(g) {
b.add(glob);
}
}
b.build()
.unwrap_or_else(|_| globset::GlobSetBuilder::new().build().unwrap())
}
#[derive(Clone)]
struct FnBody {
name: String,
source: String,
norm_tokens: Vec<String>,
span: Span,
}
fn collect_functions(path: &Path, ast: &syn::File, out: &mut Vec<FnBody>) {
for item in &ast.items {
match item {
syn::Item::Fn(f) => {
let name = f.sig.ident.to_string();
let source = f.to_token_stream().to_string();
let norm_tokens = normalize_tokens(f.block.to_token_stream());
let span = Span::from_pm_span(path, f.span());
out.push(FnBody {
name,
source,
norm_tokens,
span,
});
}
syn::Item::Impl(imp) => {
for it in &imp.items {
let syn::ImplItem::Fn(f) = it else { continue };
let name = f.sig.ident.to_string();
let source = f.to_token_stream().to_string();
let norm_tokens = normalize_tokens(f.block.to_token_stream());
let span = Span::from_pm_span(path, f.span());
out.push(FnBody {
name,
source,
norm_tokens,
span,
});
}
}
_ => {}
}
}
}
fn normalize_tokens(ts: proc_macro2::TokenStream) -> Vec<String> {
fn walk(out: &mut Vec<String>, stream: proc_macro2::TokenStream) {
for tt in stream {
match tt {
proc_macro2::TokenTree::Group(g) => {
let (l, r) = match g.delimiter() {
proc_macro2::Delimiter::Parenthesis => ("(", ")"),
proc_macro2::Delimiter::Brace => ("{", "}"),
proc_macro2::Delimiter::Bracket => ("[", "]"),
proc_macro2::Delimiter::None => ("", ""),
};
if !l.is_empty() {
out.push(l.to_string());
}
walk(out, g.stream());
if !r.is_empty() {
out.push(r.to_string());
}
}
proc_macro2::TokenTree::Ident(_) => out.push("ID".to_string()),
proc_macro2::TokenTree::Punct(p) => out.push(p.as_char().to_string()),
proc_macro2::TokenTree::Literal(l) => out.push(classify_literal(&l.to_string())),
}
}
}
let mut out = Vec::new();
walk(&mut out, ts);
out
}
fn classify_literal(s: &str) -> String {
if s.starts_with('"') || s.starts_with("r\"") || s.starts_with("r#") || s.starts_with("b\"") {
return "STR".to_string();
}
if s.chars()
.next()
.is_some_and(|c| c.is_ascii_digit() || c == '-')
{
return "INT".to_string();
}
"LIT".to_string()
}
fn fingerprint(tokens: &[String], k: usize) -> HashSet<u64> {
if k == 0 || tokens.len() < k {
return HashSet::new();
}
let mut out = HashSet::new();
for win in tokens.windows(k) {
let mut hasher = DefaultHasher::new();
for t in win {
t.hash(&mut hasher);
}
out.insert(hasher.finish());
}
out
}
fn side_by_side(a: &str, b: &str) -> String {
let diff = TextDiff::from_lines(a, b);
let mut left = Vec::new();
let mut right = Vec::new();
for change in diff.iter_all_changes() {
match change.tag() {
similar::ChangeTag::Delete => {
left.push(change.to_string());
right.push(String::new());
}
similar::ChangeTag::Insert => {
left.push(String::new());
right.push(change.to_string());
}
similar::ChangeTag::Equal => {
left.push(change.to_string());
right.push(change.to_string());
}
}
}
let width = left.iter().map(|s| s.len()).max().unwrap_or(0).min(120);
let mut out = String::new();
out.push_str("A | B\n");
out.push_str("---|---\n");
for (l, r) in left.iter().zip(right.iter()) {
let l = trim_line(l);
let r = trim_line(r);
out.push_str(&format!("{:width$} | {}\n", l, r, width = width));
}
out
}
fn trim_line(s: &str) -> String {
let mut t = s.replace('\t', " ");
if t.ends_with('\n') {
t.pop();
}
t
}
#[cfg(test)]
mod tests;