// rscheck_cli/rules/duplicate_logic/mod.rs

use crate::analysis::Workspace;
use crate::config::DuplicateLogicConfig;
use crate::emit::Emitter;
use crate::report::Finding;
use crate::rules::{Rule, RuleBackend, RuleContext, RuleFamily, RuleInfo};
use crate::span::Span;
use quote::ToTokens;
use similar::TextDiff;
use std::collections::{HashMap, HashSet, hash_map::DefaultHasher};
use std::hash::{Hash, Hasher};
use std::path::Path;
use syn::spanned::Spanned;

/// Syntax-level rule that flags pairs of functions whose bodies are
/// near-duplicates of each other, using token-fingerprint similarity.
pub struct DuplicateLogicRule;

impl DuplicateLogicRule {
    /// Static metadata for this rule: id, family, backend, config schema,
    /// and the default severity taken from `DuplicateLogicConfig::default()`.
    pub fn static_info() -> RuleInfo {
        RuleInfo {
            id: "shape.duplicate_logic",
            family: RuleFamily::Shape,
            backend: RuleBackend::Syntax,
            summary: "Reports similar function bodies with token fingerprints.",
            default_level: DuplicateLogicConfig::default().level,
            schema: "level, min_tokens, threshold, max_results, exclude_globs, kgram",
            config_example: "[rules.\"shape.duplicate_logic\"]\nlevel = \"warn\"\nmin_tokens = 80\nthreshold = 0.8",
            // No automatic fix: deduplication requires a human judgement call.
            fixable: false,
        }
    }
}
30
31impl Rule for DuplicateLogicRule {
32 fn info(&self) -> RuleInfo {
33 Self::static_info()
34 }
35
36 fn run(&self, ws: &Workspace, ctx: &RuleContext<'_>, out: &mut dyn Emitter) {
37 let cfg = match ctx
38 .policy
39 .decode_rule::<DuplicateLogicConfig>(Self::static_info().id, None)
40 {
41 Ok(cfg) => cfg,
42 Err(_) => return,
43 };
44 let severity = cfg.level.to_severity();
45 let exclude = build_exclude(&cfg.exclude_globs);
46
47 let mut functions = Vec::new();
48 for file in &ws.files {
49 if exclude.is_match(file.path.to_string_lossy().as_ref()) {
50 continue;
51 }
52 let Some(ast) = &file.ast else { continue };
53 collect_functions(&file.path, ast, &mut functions);
54 }
55
56 let mut fingerprints: Vec<HashSet<u64>> = Vec::with_capacity(functions.len());
57 for f in &functions {
58 let set = fingerprint(&f.norm_tokens, cfg.kgram);
59 fingerprints.push(set);
60 }
61
62 let mut index: HashMap<u64, Vec<usize>> = HashMap::new();
63 for (i, set) in fingerprints.iter().enumerate() {
64 for h in set {
65 index.entry(*h).or_default().push(i);
66 }
67 }
68
69 let mut shared: HashMap<(usize, usize), u32> = HashMap::new();
70 for (_h, ids) in index {
71 if ids.len() < 2 {
72 continue;
73 }
74 for a in 0..ids.len() {
75 for b in (a + 1)..ids.len() {
76 let i = ids[a];
77 let j = ids[b];
78 let key = if i < j { (i, j) } else { (j, i) };
79 *shared.entry(key).or_insert(0) += 1;
80 }
81 }
82 }
83
84 let mut matches = Vec::new();
85 for ((i, j), _shared) in shared {
86 let a = &functions[i];
87 let b = &functions[j];
88 if a.norm_tokens.len() < cfg.min_tokens || b.norm_tokens.len() < cfg.min_tokens {
89 continue;
90 }
91 let sa = &fingerprints[i];
92 let sb = &fingerprints[j];
93 if sa.is_empty() || sb.is_empty() {
94 continue;
95 }
96 let inter = sa.intersection(sb).count() as f32;
97 let union = (sa.len() + sb.len()) as f32 - inter;
98 let sim = if union <= 0.0 { 0.0 } else { inter / union };
99 if sim >= cfg.threshold {
100 matches.push((sim, i, j));
101 }
102 }
103
104 matches.sort_by(|a, b| b.0.total_cmp(&a.0));
105 matches.truncate(cfg.max_results);
106
107 for (sim, i, j) in matches {
108 let a = &functions[i];
109 let b = &functions[j];
110 let evidence = side_by_side(&a.source, &b.source);
111 out.emit(Finding {
112 rule_id: Self::static_info().id.to_string(),
113 family: Some(Self::static_info().family),
114 engine: Some(Self::static_info().backend),
115 severity,
116 message: format!(
117 "duplicate logic: {:.0}% similarity between `{}` and `{}`",
118 sim * 100.0,
119 a.name,
120 b.name
121 ),
122 primary: Some(a.span.clone()),
123 secondary: vec![b.span.clone()],
124 help: Some("Extract shared code or delete one copy.".to_string()),
125 evidence: Some(evidence),
126 confidence: None,
127 tags: vec!["duplication".to_string()],
128 labels: Vec::new(),
129 notes: Vec::new(),
130 fixes: Vec::new(),
131 });
132 }
133 }
134}
135
136fn build_exclude(globs: &[String]) -> globset::GlobSet {
137 let mut b = globset::GlobSetBuilder::new();
138 for g in globs {
139 if let Ok(glob) = globset::Glob::new(g) {
140 b.add(glob);
141 }
142 }
143 b.build()
144 .unwrap_or_else(|_| globset::GlobSetBuilder::new().build().unwrap())
145}
146
/// One extracted function body plus everything needed to report on it.
#[derive(Clone)]
struct FnBody {
    // Identifier of the function (no path or impl qualification).
    name: String,
    // Token-stream rendering of the whole item; used for the evidence diff.
    source: String,
    // Normalized body tokens (identifiers/literals collapsed) for matching.
    norm_tokens: Vec<String>,
    // Source span of the whole function; used as the finding location.
    span: Span,
}
154
155fn collect_functions(path: &Path, ast: &syn::File, out: &mut Vec<FnBody>) {
156 for item in &ast.items {
157 match item {
158 syn::Item::Fn(f) => {
159 let name = f.sig.ident.to_string();
160 let source = f.to_token_stream().to_string();
161 let norm_tokens = normalize_tokens(f.block.to_token_stream());
162 let span = Span::from_pm_span(path, f.span());
163 out.push(FnBody {
164 name,
165 source,
166 norm_tokens,
167 span,
168 });
169 }
170 syn::Item::Impl(imp) => {
171 for it in &imp.items {
172 let syn::ImplItem::Fn(f) = it else { continue };
173 let name = f.sig.ident.to_string();
174 let source = f.to_token_stream().to_string();
175 let norm_tokens = normalize_tokens(f.block.to_token_stream());
176 let span = Span::from_pm_span(path, f.span());
177 out.push(FnBody {
178 name,
179 source,
180 norm_tokens,
181 span,
182 });
183 }
184 }
185 _ => {}
186 }
187 }
188}
189
190fn normalize_tokens(ts: proc_macro2::TokenStream) -> Vec<String> {
191 fn walk(out: &mut Vec<String>, stream: proc_macro2::TokenStream) {
192 for tt in stream {
193 match tt {
194 proc_macro2::TokenTree::Group(g) => {
195 let (l, r) = match g.delimiter() {
196 proc_macro2::Delimiter::Parenthesis => ("(", ")"),
197 proc_macro2::Delimiter::Brace => ("{", "}"),
198 proc_macro2::Delimiter::Bracket => ("[", "]"),
199 proc_macro2::Delimiter::None => ("", ""),
200 };
201 if !l.is_empty() {
202 out.push(l.to_string());
203 }
204 walk(out, g.stream());
205 if !r.is_empty() {
206 out.push(r.to_string());
207 }
208 }
209 proc_macro2::TokenTree::Ident(_) => out.push("ID".to_string()),
210 proc_macro2::TokenTree::Punct(p) => out.push(p.as_char().to_string()),
211 proc_macro2::TokenTree::Literal(l) => out.push(classify_literal(&l.to_string())),
212 }
213 }
214 }
215
216 let mut out = Vec::new();
217 walk(&mut out, ts);
218 out
219}
220
/// Collapses a literal's source text into a coarse class so that duplicate
/// detection ignores the concrete value:
/// - string-like literals (plain, raw `r"`/`r#`, byte `b"`, and raw-byte
///   `br"`/`br#` forms) -> "STR"
/// - numeric literals (integers and floats alike)  -> "INT"
/// - everything else (char, byte-char, ...)        -> "LIT"
///
/// Fix over the previous version: raw byte strings (`br"…"`, `br#…`) were
/// falling through to "LIT" while the other string forms mapped to "STR".
fn classify_literal(s: &str) -> String {
    const STR_PREFIXES: [&str; 6] = ["\"", "r\"", "r#", "b\"", "br\"", "br#"];
    if STR_PREFIXES.iter().any(|p| s.starts_with(p)) {
        return "STR".to_string();
    }
    // Numeric literals in a token stream never start with '-' (negation is a
    // separate punct token), but accept it defensively for pre-joined text.
    if s.chars()
        .next()
        .is_some_and(|c| c.is_ascii_digit() || c == '-')
    {
        return "INT".to_string();
    }
    "LIT".to_string()
}
233
/// Hashes every contiguous `k`-token window of `tokens` into a set of u64
/// fingerprints. Returns an empty set when `k` is zero or the token list is
/// shorter than one full window.
fn fingerprint(tokens: &[String], k: usize) -> HashSet<u64> {
    if k == 0 || tokens.len() < k {
        return HashSet::new();
    }
    tokens
        .windows(k)
        .map(|window| {
            let mut hasher = DefaultHasher::new();
            for token in window {
                token.hash(&mut hasher);
            }
            hasher.finish()
        })
        .collect()
}
248
249fn side_by_side(a: &str, b: &str) -> String {
250 let diff = TextDiff::from_lines(a, b);
251 let mut left = Vec::new();
252 let mut right = Vec::new();
253 for change in diff.iter_all_changes() {
254 match change.tag() {
255 similar::ChangeTag::Delete => {
256 left.push(change.to_string());
257 right.push(String::new());
258 }
259 similar::ChangeTag::Insert => {
260 left.push(String::new());
261 right.push(change.to_string());
262 }
263 similar::ChangeTag::Equal => {
264 left.push(change.to_string());
265 right.push(change.to_string());
266 }
267 }
268 }
269
270 let width = left.iter().map(|s| s.len()).max().unwrap_or(0).min(120);
271 let mut out = String::new();
272 out.push_str("A | B\n");
273 out.push_str("---|---\n");
274 for (l, r) in left.iter().zip(right.iter()) {
275 let l = trim_line(l);
276 let r = trim_line(r);
277 out.push_str(&format!("{:width$} | {}\n", l, r, width = width));
278 }
279 out
280}
281
/// Prepares one diff line for table output: tabs become single spaces and a
/// single trailing line terminator ("\n" or "\r\n") is removed.
///
/// Fix: previously only '\n' was popped, so a CRLF-terminated line left a
/// stray '\r' that leaked into the rendered table.
fn trim_line(s: &str) -> String {
    let mut t = s.replace('\t', " ");
    if t.ends_with('\n') {
        t.pop();
        // Drop the '\r' half of a CRLF terminator as well.
        if t.ends_with('\r') {
            t.pop();
        }
    }
    t
}
289
// Unit tests for this rule live in `tests.rs` next to this module.
#[cfg(test)]
mod tests;