// scute_core/code_similarity/mod.rs

1mod check;
2mod detect;
3pub mod language;
4mod tokenize;
5
6use crate::parser::TreeSitterParser;
7
8pub use check::{CHECK_NAME, Definition, check};
9pub use detect::{CloneGroup, Occurrence, detect_clones};
10pub use language::{LanguageConfig, NodeRole};
11pub use tokenize::{Token, TokenizeError, tokenize};
12
13/// A source entry for clone detection: raw source code + metadata.
14pub struct SourceEntry<'a> {
15    pub source: &'a str,
16    pub source_id: &'a str,
17    pub language: &'a LanguageConfig,
18}
19
20impl<'a> SourceEntry<'a> {
21    #[must_use]
22    pub fn new(source: &'a str, source_id: &'a str, language: &'a LanguageConfig) -> Self {
23        Self {
24            source,
25            source_id,
26            language,
27        }
28    }
29}
30
31/// Tokens from a single source file, ready for clone detection.
32#[derive(Debug, Clone)]
33pub struct SourceTokens {
34    pub source_id: String,
35    pub tokens: Vec<Token>,
36}
37
38impl SourceTokens {
39    #[must_use]
40    pub fn new(source_id: String, tokens: Vec<Token>) -> Self {
41        Self { source_id, tokens }
42    }
43}
44
45/// Detect clones in a set of source files.
46///
47/// Tokenizes each source entry, then runs clone detection over the
48/// normalized token sequences. This is the main entry point for the
49/// code similarity engine.
50///
51/// # Errors
52///
53/// Returns `TokenizeError` if any source entry fails to parse.
54pub fn find_clones(
55    entries: &[SourceEntry<'_>],
56    min_tokens: usize,
57) -> Result<Vec<CloneGroup>, TokenizeError> {
58    let mut parser = TreeSitterParser::new();
59    let sources: Vec<SourceTokens> = entries
60        .iter()
61        .map(|entry| {
62            let tokens = tokenize(&mut parser, entry.source, entry.language)?;
63            Ok(SourceTokens::new(entry.source_id.to_string(), tokens))
64        })
65        .collect::<Result<_, TokenizeError>>()?;
66
67    Ok(detect_clones(&sources, min_tokens))
68}
69
#[cfg(test)]
mod tests {
    use super::*;

    /// Low enough that the short fixtures below clear the threshold.
    const LOW_TOKEN_THRESHOLD: usize = 5;
    /// Higher than any fixture's token count, so every match is filtered.
    const IMPOSSIBLY_HIGH_THRESHOLD: usize = 1000;

    /// Tokenize `source` as Rust and tag the result with `source_id`.
    fn rust_tokens(source: &str, source_id: &str) -> SourceTokens {
        let mut parser = TreeSitterParser::new();
        let tokens = tokenize(&mut parser, source, &language::rust()).unwrap();
        SourceTokens::new(source_id.to_string(), tokens)
    }

    /// Two single-line functions with identical structure but different names/types.
    /// Produces 14 normalized tokens each: fn $ID ( $ID : $ID ) -> $ID { $ID + $LIT }
    fn clone_fixture() -> [SourceTokens; 2] {
        let first = rust_tokens("fn f(x: i32) -> i32 { x + 1 }", "a.rs");
        let second = rust_tokens("fn g(y: u32) -> u32 { y + 1 }", "b.rs");
        [first, second]
    }

    #[test]
    fn detects_within_file_duplication() {
        let source = "fn foo(x: i32) -> i32 { x + 1 }\nfn bar(y: i32) -> i32 { y + 1 }";
        let tokens = rust_tokens(source, "same.rs");

        let groups = detect_clones(&[tokens], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        let occurrences = &groups[0].occurrences;
        assert_eq!(occurrences.len(), 2);
        assert_eq!(occurrences[0].source_id, "same.rs");
        assert_eq!(occurrences[1].source_id, "same.rs");
    }

    #[test]
    fn detects_cross_file_duplication() {
        let left = rust_tokens("fn calc(x: f64, y: f64) -> f64 { x + y }", "a.rs");
        let right = rust_tokens("fn add(a: i32, b: i32) -> i32 { a + b }", "b.rs");

        let groups = detect_clones(&[left, right], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].occurrences[0].source_id, "a.rs");
        assert_eq!(groups[0].occurrences[1].source_id, "b.rs");
    }

    #[test]
    fn groups_three_identical_regions_into_one_group() {
        let [first, second] = clone_fixture();
        let third = rust_tokens("fn h(z: f64) -> f64 { z + 1 }", "c.rs");

        let groups = detect_clones(&[first, second, third], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].occurrences.len(), 3);
    }

    #[test]
    fn no_clones_in_distinct_code() {
        let left = rust_tokens("let x = 1 + 2;", "a.rs");
        let right = rust_tokens("if true { return false; }", "b.rs");

        let groups = detect_clones(&[left, right], LOW_TOKEN_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn filters_matches_below_min_tokens() {
        let [first, second] = clone_fixture();

        let groups = detect_clones(&[first, second], IMPOSSIBLY_HIGH_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn reports_token_count_at_least_min_tokens() {
        let [first, second] = clone_fixture();

        let groups = detect_clones(&[first, second], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        // The fixture functions each normalize to exactly 14 tokens.
        assert_eq!(groups[0].token_count, 14);
    }

    #[test]
    fn occurrence_lines_are_coherent() {
        let [first, second] = clone_fixture();

        let groups = detect_clones(&[first, second], LOW_TOKEN_THRESHOLD);

        // Both fixture sources are single-line, so every range is 1..=1.
        let occurrences = &groups[0].occurrences;
        assert_eq!(occurrences[0].start_line, 1);
        assert_eq!(occurrences[0].end_line, 1);
        assert_eq!(occurrences[1].start_line, 1);
        assert_eq!(occurrences[1].end_line, 1);
    }

    #[test]
    fn same_input_produces_identical_output() {
        let execute = || {
            let [first, second] = clone_fixture();
            detect_clones(&[first, second], LOW_TOKEN_THRESHOLD)
        };

        assert_eq!(execute(), execute());
    }

    #[test]
    fn empty_source_produces_no_clones() {
        let empty = rust_tokens("", "a.rs");
        let nonempty = rust_tokens("fn f(x: i32) -> i32 { x + 1 }", "b.rs");

        let groups = detect_clones(&[empty, nonempty], LOW_TOKEN_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn min_tokens_zero_returns_empty() {
        let [first, second] = clone_fixture();

        let groups = detect_clones(&[first, second], 0);

        assert!(groups.is_empty());
    }

    #[test]
    fn syntax_errors_do_not_panic() {
        let mut parser = TreeSitterParser::new();
        let broken = tokenize(&mut parser, "fn f(x: i32 -> { x + }", &language::rust());

        assert!(broken.is_ok()); // tree-sitter recovers, never errors
    }

    #[test]
    fn single_source_without_duplication_produces_no_clones() {
        let only = rust_tokens("fn f(x: i32) -> i32 { x + 1 }", "a.rs");

        let groups = detect_clones(&[only], LOW_TOKEN_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn comment_only_source_produces_no_clones() {
        let left = rust_tokens("// just a comment\n/* block comment */", "a.rs");
        let right = rust_tokens("// another comment\n/* block */", "b.rs");

        let groups = detect_clones(&[left, right], LOW_TOKEN_THRESHOLD);

        assert!(groups.is_empty());
    }

    #[test]
    fn clone_at_exact_min_tokens_is_detected() {
        let [first, second] = clone_fixture();

        // Fixture clones are exactly 14 tokens; the boundary is inclusive.
        let groups = detect_clones(&[first, second], 14);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].token_count, 14);
    }

    #[test]
    fn clone_one_below_min_tokens_is_not_detected() {
        let [first, second] = clone_fixture();

        // One above the fixture's 14-token clone length: no match survives.
        let groups = detect_clones(&[first, second], 15);

        assert!(groups.is_empty());
    }

    #[test]
    fn multi_line_clone_tracks_correct_line_range() {
        let source_a = "\
fn f(x: i32) -> i32 {
    let result = x + 1;
    result * 2
}";
        let source_b = "\
fn g(y: u32) -> u32 {
    let result = y + 1;
    result * 2
}";
        let left = rust_tokens(source_a, "a.rs");
        let right = rust_tokens(source_b, "b.rs");

        let groups = detect_clones(&[left, right], LOW_TOKEN_THRESHOLD);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].occurrences[0].start_line, 1);
        assert_eq!(groups[0].occurrences[0].end_line, 4);
    }

    #[test]
    fn discards_groups_subsumed_by_a_longer_match() {
        let left = rust_tokens("fn f(x: i32, y: i32) -> i32 { x + y + 1 }", "a.rs");
        let right = rust_tokens("fn g(a: u32, b: u32) -> u32 { a + b + 1 }", "b.rs");

        let groups = detect_clones(&[left, right], LOW_TOKEN_THRESHOLD);

        // The suffix array finds many overlapping sub-sequences, but only
        // the longest match should survive — shorter ones are fully contained.
        assert_eq!(groups.len(), 1);
    }
}