scute_core/code_similarity/mod.rs

mod check;
mod detect;
pub mod language;
mod tokenize;

use crate::parser::TreeSitterParser;

pub use check::{CHECK_NAME, Definition, check};
pub use detect::{CloneGroup, Occurrence, detect_clones};
pub use language::{LanguageConfig, NodeRole};
pub use tokenize::{Token, TokenizeError, tokenize};

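/// One source file to be scanned for clones: the text itself, an identifier
/// that is echoed back in reported occurrences, and the language
/// configuration used to tokenize it.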
pub struct SourceEntry<'a> {
    pub source: &'a str,
    pub source_id: &'a str,
    pub language: &'a LanguageConfig,
}

impl<'a> SourceEntry<'a> {
    #[must_use]
    pub fn new(source: &'a str, source_id: &'a str, language: &'a LanguageConfig) -> Self {
        Self {
            source,
            source_id,
            language,
        }
    }
}

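/// The token stream produced from one source, kept together with that
/// source's identifier so that matches can be reported against it.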
#[derive(Debug, Clone)]
pub struct SourceTokens {
    pub source_id: String,
    pub tokens: Vec<Token>,
}

impl SourceTokens {
    #[must_use]
    pub fn new(source_id: String, tokens: Vec<Token>) -> Self {
        Self { source_id, tokens }
    }
}

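/// Tokenizes every entry and hands the token streams to [`detect_clones`],
/// which reports duplicated regions of at least `min_tokens` tokens within
/// and across the given sources (the tests below pin down the threshold edge
/// cases). Returns a [`TokenizeError`] if any entry fails to tokenize.
///
/// Illustrative sketch (mirrors the fixtures used in the tests below; the
/// threshold of 5 is arbitrary):
///
/// ```ignore
/// let rust = language::rust();
/// let entries = [
///     SourceEntry::new("fn f(x: i32) -> i32 { x + 1 }", "a.rs", &rust),
///     SourceEntry::new("fn g(y: u32) -> u32 { y + 1 }", "b.rs", &rust),
/// ];
/// let groups = find_clones(&entries, 5).unwrap();
/// assert_eq!(groups.len(), 1);
/// ```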
pub fn find_clones(
    entries: &[SourceEntry<'_>],
    min_tokens: usize,
) -> Result<Vec<CloneGroup>, TokenizeError> {
    let mut parser = TreeSitterParser::new();
    let sources: Vec<SourceTokens> = entries
        .iter()
        .map(|entry| {
            let tokens = tokenize(&mut parser, entry.source, entry.language)?;
            Ok(SourceTokens::new(entry.source_id.to_string(), tokens))
        })
        .collect::<Result<_, TokenizeError>>()?;

    Ok(detect_clones(&sources, min_tokens))
}

#[cfg(test)]
mod tests {
    use super::*;

    const LOW_TOKEN_THRESHOLD: usize = 5;
    const IMPOSSIBLY_HIGH_THRESHOLD: usize = 1000;

    fn tokenize_rust(source: &str, source_id: &str) -> SourceTokens {
        let mut parser = TreeSitterParser::new();
        let tokens = tokenize(&mut parser, source, &language::rust()).unwrap();
        SourceTokens::new(source_id.to_string(), tokens)
    }

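    // The two snippets below differ only in identifier names and integer
    // types, and `detect_clones` reports them as a single 14-token match
    // (see `clone_at_exact_min_tokens_is_detected`), which makes them a
    // convenient fixture for the threshold tests.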
    fn rust_clone_pair() -> [SourceTokens; 2] {
        [
            tokenize_rust("fn f(x: i32) -> i32 { x + 1 }", "a.rs"),
            tokenize_rust("fn g(y: u32) -> u32 { y + 1 }", "b.rs"),
        ]
    }

    #[test]
    fn detects_within_file_duplication() {
        let source = "fn foo(x: i32) -> i32 { x + 1 }\nfn bar(y: i32) -> i32 { y + 1 }";

        let a = tokenize_rust(source, "same.rs");
        let groups = detect_clones(&[a], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].occurrences.len(), 2);
        assert_eq!(groups[0].occurrences[0].source_id, "same.rs");
        assert_eq!(groups[0].occurrences[1].source_id, "same.rs");
    }

    #[test]
    fn detects_cross_file_duplication() {
        let a = tokenize_rust("fn calc(x: f64, y: f64) -> f64 { x + y }", "a.rs");
        let b = tokenize_rust("fn add(a: i32, b: i32) -> i32 { a + b }", "b.rs");

        let groups = detect_clones(&[a, b], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].occurrences[0].source_id, "a.rs");
        assert_eq!(groups[0].occurrences[1].source_id, "b.rs");
    }

    #[test]
    fn groups_three_identical_regions_into_one_group() {
        let [a, b] = rust_clone_pair();
        let c = tokenize_rust("fn h(z: f64) -> f64 { z + 1 }", "c.rs");

        let groups = detect_clones(&[a, b, c], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].occurrences.len(), 3);
    }

    #[test]
    fn no_clones_in_distinct_code() {
        let a = tokenize_rust("let x = 1 + 2;", "a.rs");
        let b = tokenize_rust("if true { return false; }", "b.rs");

        let groups = detect_clones(&[a, b], LOW_TOKEN_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn filters_matches_below_min_tokens() {
        let [a, b] = rust_clone_pair();

        let groups = detect_clones(&[a, b], IMPOSSIBLY_HIGH_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn reports_token_count_at_least_min_tokens() {
        let [a, b] = rust_clone_pair();

        let groups = detect_clones(&[a, b], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].token_count, 14);
    }

    #[test]
    fn occurrence_lines_are_coherent() {
        let [a, b] = rust_clone_pair();

        let groups = detect_clones(&[a, b], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups[0].occurrences[0].start_line, 1);
        assert_eq!(groups[0].occurrences[0].end_line, 1);
        assert_eq!(groups[0].occurrences[1].start_line, 1);
        assert_eq!(groups[0].occurrences[1].end_line, 1);
    }

    #[test]
    fn same_input_produces_identical_output() {
        let run = || {
            let [a, b] = rust_clone_pair();
            detect_clones(&[a, b], LOW_TOKEN_THRESHOLD)
        };

        assert_eq!(run(), run());
    }

    #[test]
    fn empty_source_produces_no_clones() {
        let a = tokenize_rust("", "a.rs");
        let b = tokenize_rust("fn f(x: i32) -> i32 { x + 1 }", "b.rs");

        let groups = detect_clones(&[a, b], LOW_TOKEN_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn min_tokens_zero_returns_empty() {
        let [a, b] = rust_clone_pair();

        let groups = detect_clones(&[a, b], 0);

        assert!(groups.is_empty());
    }

    #[test]
    fn syntax_errors_do_not_panic() {
        let mut parser = TreeSitterParser::new();
        let broken = tokenize(&mut parser, "fn f(x: i32 -> { x + }", &language::rust());

        assert!(broken.is_ok());
    }

    #[test]
    fn single_source_without_duplication_produces_no_clones() {
        let a = tokenize_rust("fn f(x: i32) -> i32 { x + 1 }", "a.rs");

        let groups = detect_clones(&[a], LOW_TOKEN_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn comment_only_source_produces_no_clones() {
        let a = tokenize_rust("// just a comment\n/* block comment */", "a.rs");
        let b = tokenize_rust("// another comment\n/* block */", "b.rs");

        let groups = detect_clones(&[a, b], LOW_TOKEN_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn clone_at_exact_min_tokens_is_detected() {
        let [a, b] = rust_clone_pair();

        let groups = detect_clones(&[a, b], 14);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].token_count, 14);
    }

    #[test]
    fn clone_one_below_min_tokens_is_not_detected() {
        let [a, b] = rust_clone_pair();

        let groups = detect_clones(&[a, b], 15);

        assert!(groups.is_empty());
    }

    #[test]
    fn multi_line_clone_tracks_correct_line_range() {
        let source_a = "\
fn f(x: i32) -> i32 {
    let result = x + 1;
    result * 2
}";
        let source_b = "\
fn g(y: u32) -> u32 {
    let result = y + 1;
    result * 2
}";
        let a = tokenize_rust(source_a, "a.rs");
        let b = tokenize_rust(source_b, "b.rs");

        let groups = detect_clones(&[a, b], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].occurrences[0].start_line, 1);
        assert_eq!(groups[0].occurrences[0].end_line, 4);
    }

    #[test]
    fn discards_groups_subsumed_by_a_longer_match() {
        let a = tokenize_rust("fn f(x: i32, y: i32) -> i32 { x + y + 1 }", "a.rs");
        let b = tokenize_rust("fn g(a: u32, b: u32) -> u32 { a + b + 1 }", "b.rs");

        let groups = detect_clones(&[a, b], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
    }
}