// scute_core/code_similarity/detect.rs
use super::SourceTokens;
2use std::collections::HashMap;
3
/// A group of identical token runs detected across one or more sources.
#[derive(Debug, Clone, PartialEq)]
pub struct CloneGroup {
    /// Number of consecutive matching tokens shared by every occurrence.
    pub token_count: usize,
    /// All locations where this token run appears; always at least two.
    pub occurrences: Vec<Occurrence>,
}
10
/// A single location of a cloned token run, expressed as a line span.
#[derive(Debug, Clone, PartialEq)]
pub struct Occurrence {
    /// Identifier of the source file/unit the clone was found in.
    pub source_id: String,
    /// Start line of the run's first token.
    pub start_line: usize,
    /// End line of the run's last token.
    pub end_line: usize,
}
18
/// All source token streams concatenated into one id sequence, with a unique
/// separator value between files so matches cannot cross file boundaries.
struct TokenSequence {
    /// Interned token ids; each file is followed by a unique sentinel
    /// (`usize::MAX - file_idx`) that matches no vocabulary id.
    concat: Vec<usize>,
    /// For each position in `concat`: `Some((file index, token index))`,
    /// or `None` at a separator position.
    pos_map: Vec<Option<(usize, usize)>>,
}
25
26#[must_use]
32pub fn detect_clones(sources: &[SourceTokens], min_tokens: usize) -> Vec<CloneGroup> {
33 if sources.is_empty() || min_tokens == 0 {
34 return vec![];
35 }
36
37 let seq = build_token_sequence(sources);
38 if seq.concat.len() < 2 {
39 return vec![];
40 }
41
42 let sa = build_suffix_array(&seq.concat);
43 let lcp = build_lcp_array(&seq.concat, &sa);
44 let intervals = extract_lcp_intervals(&sa, &lcp, min_tokens);
45
46 let groups = intervals_to_groups(&intervals, &sa, &seq.pos_map, sources);
47 filter_maximal_groups(groups)
48}
49
50fn build_token_sequence(sources: &[SourceTokens]) -> TokenSequence {
53 let mut vocab: HashMap<&str, usize> = HashMap::new();
56 let mut concat: Vec<usize> = Vec::new();
57 let mut pos_map: Vec<Option<(usize, usize)>> = Vec::new();
58
59 for (file_idx, source) in sources.iter().enumerate() {
60 for (tok_idx, token) in source.tokens.iter().enumerate() {
61 let next_id = vocab.len();
62 let id = *vocab.entry(token.text.as_str()).or_insert(next_id);
63 concat.push(id);
64 pos_map.push(Some((file_idx, tok_idx)));
65 }
66 concat.push(usize::MAX - file_idx); pos_map.push(None);
68 }
69
70 TokenSequence { concat, pos_map }
71}
72
73fn intervals_to_groups(
76 intervals: &[(usize, usize, usize)],
77 sa: &[usize],
78 pos_map: &[Option<(usize, usize)>],
79 sources: &[SourceTokens],
80) -> Vec<CloneGroup> {
81 let mut groups: Vec<CloneGroup> = Vec::new();
82
83 for &(depth, left, right) in intervals {
84 let mut occurrences: Vec<Occurrence> = Vec::new();
85
86 for &pos in &sa[left..=right] {
87 let Some((file_idx, tok_idx)) = pos_map[pos] else {
88 continue; };
90
91 let source = &sources[file_idx];
92 let end_tok = (tok_idx + depth - 1).min(source.tokens.len() - 1);
93 occurrences.push(Occurrence {
94 source_id: source.source_id.clone(),
95 start_line: source.tokens[tok_idx].start_line,
96 end_line: source.tokens[end_tok].end_line,
97 });
98 }
99
100 occurrences.sort_by(|a, b| {
101 a.source_id
102 .cmp(&b.source_id)
103 .then(a.start_line.cmp(&b.start_line))
104 });
105 occurrences.dedup();
106
107 if occurrences.len() >= 2 {
108 groups.push(CloneGroup {
109 token_count: depth,
110 occurrences,
111 });
112 }
113 }
114
115 groups
116}
117
118fn is_subsumed_by(candidate: &CloneGroup, accepted: &[CloneGroup]) -> bool {
121 accepted.iter().any(|prev| {
122 candidate.occurrences.iter().all(|occ| {
123 prev.occurrences.iter().any(|p| {
124 p.source_id == occ.source_id
125 && p.start_line <= occ.start_line
126 && p.end_line >= occ.end_line
127 })
128 })
129 })
130}
131
132fn filter_maximal_groups(mut groups: Vec<CloneGroup>) -> Vec<CloneGroup> {
135 groups.sort_by(|a, b| {
137 b.token_count
138 .cmp(&a.token_count)
139 .then(a.occurrences.len().cmp(&b.occurrences.len()))
140 });
141
142 let mut accepted: Vec<CloneGroup> = Vec::new();
143 for group in groups {
144 if !is_subsumed_by(&group, &accepted) {
145 accepted.push(group);
146 }
147 }
148
149 accepted
150}
151
/// Builds a suffix array by sorting suffix start positions with direct slice
/// comparison. O(n^2 log n) worst case, acceptable for token-level inputs.
/// `sort_unstable_by` is safe here: distinct suffixes of one text never
/// compare equal, so stability cannot matter.
fn build_suffix_array(text: &[usize]) -> Vec<usize> {
    let mut suffixes: Vec<usize> = (0..text.len()).collect();
    suffixes.sort_unstable_by(|&a, &b| text[a..].cmp(&text[b..]));
    suffixes
}
157
/// Length of the longest common run of equal tokens between `text[i+start..]`
/// and `text[j+start..]`; callers guarantee the first `start` tokens already
/// match (Kasai invariant), so comparison resumes at offset `start`.
fn count_common_prefix(text: &[usize], i: usize, j: usize, start: usize) -> usize {
    let a = text.get(i + start..).unwrap_or(&[]);
    let b = text.get(j + start..).unwrap_or(&[]);
    a.iter().zip(b).take_while(|&(x, y)| x == y).count()
}
170
/// Kasai's algorithm: `lcp[k]` is the longest common prefix of the suffixes
/// at `sa[k-1]` and `sa[k]` (`lcp[0]` is 0). Walking text positions in order
/// lets the matched length shrink by at most one per step, giving O(n) total
/// comparison work.
fn build_lcp_array(text: &[usize], sa: &[usize]) -> Vec<usize> {
    let n = text.len();

    // rank[p] = position of suffix p in the suffix array.
    let mut rank = vec![0usize; n];
    for (order, &start) in sa.iter().enumerate() {
        rank[start] = order;
    }

    let mut lcp = vec![0usize; n];
    let mut matched: usize = 0;

    for pos in 0..n {
        let r = rank[pos];
        if r == 0 {
            // First suffix in sorted order has no predecessor.
            matched = 0;
            continue;
        }
        let prev = sa[r - 1];
        // Extend the match; the first `matched` tokens are known equal.
        while pos + matched < n && prev + matched < n && text[pos + matched] == text[prev + matched]
        {
            matched += 1;
        }
        lcp[r] = matched;
        matched = matched.saturating_sub(1);
    }

    lcp
}
194
/// Pops every stack entry strictly deeper than `cur`, recording each popped
/// entry as an interval `(depth, left, i-1)` when it meets the length
/// threshold and spans at least two suffixes. Returns the leftmost boundary
/// released by the pops (or `i-1` when nothing was popped), which becomes the
/// left edge for any entry pushed next.
fn pop_and_record(
    stack: &mut Vec<(usize, usize)>,
    intervals: &mut Vec<(usize, usize, usize)>,
    cur: usize,
    i: usize,
    min_tokens: usize,
) -> usize {
    let right = i - 1;
    let mut left_bound = right;

    while let Some(&(depth, left)) = stack.last() {
        if depth <= cur {
            break;
        }
        stack.pop();
        left_bound = left;
        // Only intervals with >= 2 suffixes (left < right) are clones.
        if depth >= min_tokens && left < right {
            intervals.push((depth, left, right));
        }
    }

    left_bound
}
214
215fn extract_lcp_intervals(
218 sa: &[usize],
219 lcp: &[usize],
220 min_tokens: usize,
221) -> Vec<(usize, usize, usize)> {
222 let n = sa.len();
223 let mut intervals = Vec::new();
224 let mut stack: Vec<(usize, usize)> = Vec::new(); #[allow(clippy::needless_range_loop)]
230 for i in 1..=n {
231 let cur = lcp.get(i).copied().unwrap_or(0);
232 let lb = pop_and_record(&mut stack, &mut intervals, cur, i, min_tokens);
233
234 if should_push_interval(cur, min_tokens, &stack) {
235 stack.push((cur, lb));
236 }
237 }
238
239 intervals
240}
241
/// A new stack entry is warranted only for runs meeting the length threshold
/// that are strictly deeper than the current stack top (or any depth when the
/// stack is empty).
fn should_push_interval(cur: usize, min_tokens: usize, stack: &[(usize, usize)]) -> bool {
    if cur < min_tokens {
        return false;
    }
    match stack.last() {
        None => true,
        Some(&(depth, _)) => cur > depth,
    }
}