//! `shape.duplicate_logic` — reports near-duplicate function bodies by
//! comparing k-gram fingerprints of normalized token streams.
//! (rscheck_cli/rules/duplicate_logic/mod.rs)
1use crate::analysis::Workspace;
2use crate::config::DuplicateLogicConfig;
3use crate::emit::Emitter;
4use crate::report::Finding;
5use crate::rules::{Rule, RuleBackend, RuleContext, RuleFamily, RuleInfo};
6use crate::span::Span;
7use quote::ToTokens;
8use similar::TextDiff;
9use std::collections::{HashMap, HashSet, hash_map::DefaultHasher};
10use std::hash::{Hash, Hasher};
11use std::path::Path;
12use syn::spanned::Spanned;
13
14pub struct DuplicateLogicRule;
15
16impl DuplicateLogicRule {
17    pub fn static_info() -> RuleInfo {
18        RuleInfo {
19            id: "shape.duplicate_logic",
20            family: RuleFamily::Shape,
21            backend: RuleBackend::Syntax,
22            summary: "Reports similar function bodies with token fingerprints.",
23            default_level: DuplicateLogicConfig::default().level,
24            schema: "level, min_tokens, threshold, max_results, exclude_globs, kgram",
25            config_example: "[rules.\"shape.duplicate_logic\"]\nlevel = \"warn\"\nmin_tokens = 80\nthreshold = 0.8",
26            fixable: false,
27        }
28    }
29}
30
31impl Rule for DuplicateLogicRule {
32    fn info(&self) -> RuleInfo {
33        Self::static_info()
34    }
35
36    fn run(&self, ws: &Workspace, ctx: &RuleContext<'_>, out: &mut dyn Emitter) {
37        let cfg = match ctx
38            .policy
39            .decode_rule::<DuplicateLogicConfig>(Self::static_info().id, None)
40        {
41            Ok(cfg) => cfg,
42            Err(_) => return,
43        };
44        let severity = cfg.level.to_severity();
45        let exclude = build_exclude(&cfg.exclude_globs);
46
47        let mut functions = Vec::new();
48        for file in &ws.files {
49            if exclude.is_match(file.path.to_string_lossy().as_ref()) {
50                continue;
51            }
52            let Some(ast) = &file.ast else { continue };
53            collect_functions(&file.path, ast, &mut functions);
54        }
55
56        let mut fingerprints: Vec<HashSet<u64>> = Vec::with_capacity(functions.len());
57        for f in &functions {
58            let set = fingerprint(&f.norm_tokens, cfg.kgram);
59            fingerprints.push(set);
60        }
61
62        let mut index: HashMap<u64, Vec<usize>> = HashMap::new();
63        for (i, set) in fingerprints.iter().enumerate() {
64            for h in set {
65                index.entry(*h).or_default().push(i);
66            }
67        }
68
69        let mut shared: HashMap<(usize, usize), u32> = HashMap::new();
70        for (_h, ids) in index {
71            if ids.len() < 2 {
72                continue;
73            }
74            for a in 0..ids.len() {
75                for b in (a + 1)..ids.len() {
76                    let i = ids[a];
77                    let j = ids[b];
78                    let key = if i < j { (i, j) } else { (j, i) };
79                    *shared.entry(key).or_insert(0) += 1;
80                }
81            }
82        }
83
84        let mut matches = Vec::new();
85        for ((i, j), _shared) in shared {
86            let a = &functions[i];
87            let b = &functions[j];
88            if a.norm_tokens.len() < cfg.min_tokens || b.norm_tokens.len() < cfg.min_tokens {
89                continue;
90            }
91            let sa = &fingerprints[i];
92            let sb = &fingerprints[j];
93            if sa.is_empty() || sb.is_empty() {
94                continue;
95            }
96            let inter = sa.intersection(sb).count() as f32;
97            let union = (sa.len() + sb.len()) as f32 - inter;
98            let sim = if union <= 0.0 { 0.0 } else { inter / union };
99            if sim >= cfg.threshold {
100                matches.push((sim, i, j));
101            }
102        }
103
104        matches.sort_by(|a, b| b.0.total_cmp(&a.0));
105        matches.truncate(cfg.max_results);
106
107        for (sim, i, j) in matches {
108            let a = &functions[i];
109            let b = &functions[j];
110            let evidence = side_by_side(&a.source, &b.source);
111            out.emit(Finding {
112                rule_id: Self::static_info().id.to_string(),
113                family: Some(Self::static_info().family),
114                engine: Some(Self::static_info().backend),
115                severity,
116                message: format!(
117                    "duplicate logic: {:.0}% similarity between `{}` and `{}`",
118                    sim * 100.0,
119                    a.name,
120                    b.name
121                ),
122                primary: Some(a.span.clone()),
123                secondary: vec![b.span.clone()],
124                help: Some("Extract shared code or delete one copy.".to_string()),
125                evidence: Some(evidence),
126                confidence: None,
127                tags: vec!["duplication".to_string()],
128                labels: Vec::new(),
129                notes: Vec::new(),
130                fixes: Vec::new(),
131            });
132        }
133    }
134}
135
136fn build_exclude(globs: &[String]) -> globset::GlobSet {
137    let mut b = globset::GlobSetBuilder::new();
138    for g in globs {
139        if let Ok(glob) = globset::Glob::new(g) {
140            b.add(glob);
141        }
142    }
143    b.build()
144        .unwrap_or_else(|_| globset::GlobSetBuilder::new().build().unwrap())
145}
146
147#[derive(Clone)]
148struct FnBody {
149    name: String,
150    source: String,
151    norm_tokens: Vec<String>,
152    span: Span,
153}
154
155fn collect_functions(path: &Path, ast: &syn::File, out: &mut Vec<FnBody>) {
156    for item in &ast.items {
157        match item {
158            syn::Item::Fn(f) => {
159                let name = f.sig.ident.to_string();
160                let source = f.to_token_stream().to_string();
161                let norm_tokens = normalize_tokens(f.block.to_token_stream());
162                let span = Span::from_pm_span(path, f.span());
163                out.push(FnBody {
164                    name,
165                    source,
166                    norm_tokens,
167                    span,
168                });
169            }
170            syn::Item::Impl(imp) => {
171                for it in &imp.items {
172                    let syn::ImplItem::Fn(f) = it else { continue };
173                    let name = f.sig.ident.to_string();
174                    let source = f.to_token_stream().to_string();
175                    let norm_tokens = normalize_tokens(f.block.to_token_stream());
176                    let span = Span::from_pm_span(path, f.span());
177                    out.push(FnBody {
178                        name,
179                        source,
180                        norm_tokens,
181                        span,
182                    });
183                }
184            }
185            _ => {}
186        }
187    }
188}
189
190fn normalize_tokens(ts: proc_macro2::TokenStream) -> Vec<String> {
191    fn walk(out: &mut Vec<String>, stream: proc_macro2::TokenStream) {
192        for tt in stream {
193            match tt {
194                proc_macro2::TokenTree::Group(g) => {
195                    let (l, r) = match g.delimiter() {
196                        proc_macro2::Delimiter::Parenthesis => ("(", ")"),
197                        proc_macro2::Delimiter::Brace => ("{", "}"),
198                        proc_macro2::Delimiter::Bracket => ("[", "]"),
199                        proc_macro2::Delimiter::None => ("", ""),
200                    };
201                    if !l.is_empty() {
202                        out.push(l.to_string());
203                    }
204                    walk(out, g.stream());
205                    if !r.is_empty() {
206                        out.push(r.to_string());
207                    }
208                }
209                proc_macro2::TokenTree::Ident(_) => out.push("ID".to_string()),
210                proc_macro2::TokenTree::Punct(p) => out.push(p.as_char().to_string()),
211                proc_macro2::TokenTree::Literal(l) => out.push(classify_literal(&l.to_string())),
212            }
213        }
214    }
215
216    let mut out = Vec::new();
217    walk(&mut out, ts);
218    out
219}
220
/// Buckets a literal's textual form into a coarse class for fingerprinting:
/// "STR" for string-like literals, "INT" for numeric-looking ones (floats
/// included — the name is historical), "LIT" for everything else (chars,
/// byte chars, `true`/`false` if ever passed).
fn classify_literal(s: &str) -> String {
    // All Rust string-literal spellings: plain, raw, byte, and raw-byte.
    // `br"…"` / `br#"…"#` were previously missed and fell through to "LIT".
    const STRING_PREFIXES: [&str; 6] = ["\"", "r\"", "r#", "b\"", "br\"", "br#"];
    if STRING_PREFIXES.iter().any(|p| s.starts_with(p)) {
        return "STR".to_string();
    }
    // Leading '-' kept for safety even though token streams normally carry
    // the sign as separate punctuation.
    if s.chars()
        .next()
        .is_some_and(|c| c.is_ascii_digit() || c == '-')
    {
        return "INT".to_string();
    }
    "LIT".to_string()
}
233
/// Hashes every k-length window of `tokens` into a set of u64 fingerprints.
///
/// Returns an empty set when `k` is zero or exceeds the token count.
/// Each token in a window is hashed individually (not the slice as a whole,
/// which would also fold in the length) to keep values stable.
fn fingerprint(tokens: &[String], k: usize) -> HashSet<u64> {
    if k == 0 || tokens.len() < k {
        return HashSet::new();
    }
    tokens
        .windows(k)
        .map(|window| {
            let mut hasher = DefaultHasher::new();
            window.iter().for_each(|tok| tok.hash(&mut hasher));
            hasher.finish()
        })
        .collect()
}
248
249fn side_by_side(a: &str, b: &str) -> String {
250    let diff = TextDiff::from_lines(a, b);
251    let mut left = Vec::new();
252    let mut right = Vec::new();
253    for change in diff.iter_all_changes() {
254        match change.tag() {
255            similar::ChangeTag::Delete => {
256                left.push(change.to_string());
257                right.push(String::new());
258            }
259            similar::ChangeTag::Insert => {
260                left.push(String::new());
261                right.push(change.to_string());
262            }
263            similar::ChangeTag::Equal => {
264                left.push(change.to_string());
265                right.push(change.to_string());
266            }
267        }
268    }
269
270    let width = left.iter().map(|s| s.len()).max().unwrap_or(0).min(120);
271    let mut out = String::new();
272    out.push_str("A | B\n");
273    out.push_str("---|---\n");
274    for (l, r) in left.iter().zip(right.iter()) {
275        let l = trim_line(l);
276        let r = trim_line(r);
277        out.push_str(&format!("{:width$} | {}\n", l, r, width = width));
278    }
279    out
280}
281
/// Prepares one diff line for table display: strips a single trailing line
/// terminator (LF or CRLF) and expands tabs to four spaces.
fn trim_line(s: &str) -> String {
    // The old version only popped '\n', so CRLF input left a stray '\r'
    // in the rendered table.
    let stripped = s
        .strip_suffix("\r\n")
        .or_else(|| s.strip_suffix('\n'))
        .unwrap_or(s);
    stripped.replace('\t', "    ")
}
289
290#[cfg(test)]
291mod tests;