//! Clone detection over normalized token streams.
//!
//! Builds a generalized suffix array across all sources and walks its LCP
//! array to find maximal repeated token regions (`scute_core/code_similarity/detect.rs`).

use super::SourceTokens;
use std::collections::HashMap;
3
/// A group of code regions that share the same normalized token sequence.
#[derive(Debug, Clone, PartialEq)]
pub struct CloneGroup {
    /// Length of the shared sequence, in normalized tokens.
    pub token_count: usize,
    /// Every place the sequence appears; always at least two entries.
    pub occurrences: Vec<Occurrence>,
}
10
/// A single occurrence of a clone within a source file.
#[derive(Debug, Clone, PartialEq)]
pub struct Occurrence {
    /// Identifier of the source file containing this occurrence.
    pub source_id: String,
    /// Line where the cloned region starts (start line of its first token).
    pub start_line: usize,
    /// Line where the cloned region ends (end line of its last token).
    pub end_line: usize,
}
18
/// Concatenation of every source's token stream, separated by per-file
/// sentinels, ready for suffix-array construction.
struct TokenSequence {
    /// Interned token IDs for all files, with a unique sentinel after each file.
    concat: Vec<usize>,
    /// Maps each position in `concat` to `(file_index, token_index)`.
    /// `None` for sentinel positions.
    pos_map: Vec<Option<(usize, usize)>>,
}
25
26/// Detect clone groups across the given source token sequences.
27///
28/// Builds a generalized suffix array over the concatenated (normalized) token
29/// streams, then walks the LCP array to find all maximal repeated regions
30/// of at least `min_tokens` length.
31#[must_use]
32pub fn detect_clones(sources: &[SourceTokens], min_tokens: usize) -> Vec<CloneGroup> {
33    if sources.is_empty() || min_tokens == 0 {
34        return vec![];
35    }
36
37    let seq = build_token_sequence(sources);
38    if seq.concat.len() < 2 {
39        return vec![];
40    }
41
42    let sa = build_suffix_array(&seq.concat);
43    let lcp = build_lcp_array(&seq.concat, &sa);
44    let intervals = extract_lcp_intervals(&sa, &lcp, min_tokens);
45
46    let groups = intervals_to_groups(&intervals, &sa, &seq.pos_map, sources);
47    filter_maximal_groups(groups)
48}
49
50/// Concatenate all token streams with unique sentinels between them,
51/// mapping each position back to its source file and token index.
52fn build_token_sequence(sources: &[SourceTokens]) -> TokenSequence {
53    // Real token IDs start at 0. Sentinels use the high end of usize
54    // (usize::MAX, usize::MAX-1, …) so they never collide with real IDs.
55    let mut vocab: HashMap<&str, usize> = HashMap::new();
56    let mut concat: Vec<usize> = Vec::new();
57    let mut pos_map: Vec<Option<(usize, usize)>> = Vec::new();
58
59    for (file_idx, source) in sources.iter().enumerate() {
60        for (tok_idx, token) in source.tokens.iter().enumerate() {
61            let next_id = vocab.len();
62            let id = *vocab.entry(token.text.as_str()).or_insert(next_id);
63            concat.push(id);
64            pos_map.push(Some((file_idx, tok_idx)));
65        }
66        concat.push(usize::MAX - file_idx); // unique sentinel per file
67        pos_map.push(None);
68    }
69
70    TokenSequence { concat, pos_map }
71}
72
73/// Convert LCP intervals into clone groups by resolving positions back to
74/// source files and line numbers.
75fn intervals_to_groups(
76    intervals: &[(usize, usize, usize)],
77    sa: &[usize],
78    pos_map: &[Option<(usize, usize)>],
79    sources: &[SourceTokens],
80) -> Vec<CloneGroup> {
81    let mut groups: Vec<CloneGroup> = Vec::new();
82
83    for &(depth, left, right) in intervals {
84        let mut occurrences: Vec<Occurrence> = Vec::new();
85
86        for &pos in &sa[left..=right] {
87            let Some((file_idx, tok_idx)) = pos_map[pos] else {
88                continue; // sentinel
89            };
90
91            let source = &sources[file_idx];
92            let end_tok = (tok_idx + depth - 1).min(source.tokens.len() - 1);
93            occurrences.push(Occurrence {
94                source_id: source.source_id.clone(),
95                start_line: source.tokens[tok_idx].start_line,
96                end_line: source.tokens[end_tok].end_line,
97            });
98        }
99
100        occurrences.sort_by(|a, b| {
101            a.source_id
102                .cmp(&b.source_id)
103                .then(a.start_line.cmp(&b.start_line))
104        });
105        occurrences.dedup();
106
107        if occurrences.len() >= 2 {
108            groups.push(CloneGroup {
109                token_count: depth,
110                occurrences,
111            });
112        }
113    }
114
115    groups
116}
117
118/// Check if every occurrence in `candidate` is spatially contained within
119/// some occurrence of `accepted`.
120fn is_subsumed_by(candidate: &CloneGroup, accepted: &[CloneGroup]) -> bool {
121    accepted.iter().any(|prev| {
122        candidate.occurrences.iter().all(|occ| {
123            prev.occurrences.iter().any(|p| {
124                p.source_id == occ.source_id
125                    && p.start_line <= occ.start_line
126                    && p.end_line >= occ.end_line
127            })
128        })
129    })
130}
131
132/// Keep only maximal matches: discard groups where every occurrence is
133/// spatially contained within an already-accepted longer group.
134fn filter_maximal_groups(mut groups: Vec<CloneGroup>) -> Vec<CloneGroup> {
135    // Deterministic output: longest matches first, then by occurrence count
136    groups.sort_by(|a, b| {
137        b.token_count
138            .cmp(&a.token_count)
139            .then(a.occurrences.len().cmp(&b.occurrences.len()))
140    });
141
142    let mut accepted: Vec<CloneGroup> = Vec::new();
143    for group in groups {
144        if !is_subsumed_by(&group, &accepted) {
145            accepted.push(group);
146        }
147    }
148
149    accepted
150}
151
/// Build a suffix array by directly comparison-sorting all suffix indices.
///
/// O(n² log n) worst case, which is adequate for token-stream sizes. All
/// suffixes have distinct lengths and are therefore pairwise distinct as
/// sort keys, so an unstable sort is deterministic here — and it avoids the
/// stable sort's temporary allocation.
fn build_suffix_array(text: &[usize]) -> Vec<usize> {
    let mut sa: Vec<usize> = (0..text.len()).collect();
    sa.sort_unstable_by(|&a, &b| text[a..].cmp(&text[b..]));
    sa
}
157
/// Count how many tokens match between `text[i+start..]` and `text[j+start..]`.
fn count_common_prefix(text: &[usize], i: usize, j: usize, start: usize) -> usize {
    // `get` (rather than slicing) keeps an out-of-range `i + start` or
    // `j + start` a harmless empty comparison instead of a panic.
    let lhs = text.get(i + start..).unwrap_or(&[]);
    let rhs = text.get(j + start..).unwrap_or(&[]);
    lhs.iter()
        .zip(rhs)
        .take_while(|(a, b)| a == b)
        .count()
}
170
171fn build_lcp_array(text: &[usize], sa: &[usize]) -> Vec<usize> {
172    let n = text.len();
173    let mut rank = vec![0usize; n];
174    for (i, &s) in sa.iter().enumerate() {
175        rank[s] = i;
176    }
177
178    let mut lcp = vec![0usize; n];
179    let mut h: usize = 0;
180
181    for i in 0..n {
182        if rank[i] == 0 {
183            h = 0;
184            continue;
185        }
186        let j = sa[rank[i] - 1];
187        h += count_common_prefix(text, i, j, h);
188        lcp[rank[i]] = h;
189        h = h.saturating_sub(1);
190    }
191
192    lcp
193}
194
/// Pop stack entries with depth > `cur`, recording valid intervals.
/// Returns the leftmost bound seen during popping.
fn pop_and_record(
    stack: &mut Vec<(usize, usize)>,
    intervals: &mut Vec<(usize, usize, usize)>,
    cur: usize,
    i: usize,
    min_tokens: usize,
) -> usize {
    let right = i - 1;
    let mut leftmost = right;

    while let Some(&(depth, left)) = stack.last() {
        if depth <= cur {
            break;
        }
        stack.pop();
        leftmost = left;
        // Record only intervals that are deep enough and span >= 2 suffixes.
        if depth >= min_tokens && right > left {
            intervals.push((depth, left, right));
        }
    }

    leftmost
}
214
215/// Enumerate all maximal LCP intervals with depth >= `min_tokens`.
216/// Returns `(depth, left_bound, right_bound)` for each interval.
217fn extract_lcp_intervals(
218    sa: &[usize],
219    lcp: &[usize],
220    min_tokens: usize,
221) -> Vec<(usize, usize, usize)> {
222    let n = sa.len();
223    let mut intervals = Vec::new();
224    let mut stack: Vec<(usize, usize)> = Vec::new(); // (depth, left_bound)
225
226    // Standard LCP interval traversal — `i` tracks position for boundary
227    // arithmetic, not just array indexing. Rewriting as an iterator obscures
228    // the algorithm.
229    #[allow(clippy::needless_range_loop)]
230    for i in 1..=n {
231        let cur = lcp.get(i).copied().unwrap_or(0);
232        let lb = pop_and_record(&mut stack, &mut intervals, cur, i, min_tokens);
233
234        if should_push_interval(cur, min_tokens, &stack) {
235            stack.push((cur, lb));
236        }
237    }
238
239    intervals
240}
241
/// A new interval opens only when it meets the depth threshold and is
/// strictly deeper than the interval currently on top of the stack.
fn should_push_interval(cur: usize, min_tokens: usize, stack: &[(usize, usize)]) -> bool {
    if cur < min_tokens {
        return false;
    }
    match stack.last() {
        Some(&(depth, _)) => cur > depth,
        None => true,
    }
}