fallow_core/duplicates/detect/
mod.rs1mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28struct FileData {
30 path: PathBuf,
31 hashed_tokens: Vec<HashedToken>,
32 file_tokens: FileTokens,
33 atomic_invocation_spans: Vec<Span>,
34}
35
36pub struct CloneDetector {
42 min_tokens: usize,
44 min_lines: usize,
46 skip_local: bool,
48}
49
50impl CloneDetector {
51 #[must_use]
53 pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
54 Self {
55 min_tokens,
56 min_lines,
57 skip_local,
58 }
59 }
60
61 pub fn detect(
66 &self,
67 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
68 ) -> DuplicationReport {
69 self.detect_inner(file_data, None)
70 }
71
72 pub fn detect_touching_files(
79 &self,
80 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
81 focus_files: &FxHashSet<PathBuf>,
82 ) -> DuplicationReport {
83 self.detect_inner(file_data, Some(focus_files))
84 }
85
86 fn detect_inner(
87 &self,
88 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
89 focus_files: Option<&FxHashSet<PathBuf>>,
90 ) -> DuplicationReport {
91 let _span = tracing::info_span!("clone_detect").entered();
92
93 if file_data.is_empty() || self.min_tokens == 0 {
94 return empty_report(0);
95 }
96
97 let files: Vec<FileData> = file_data
98 .into_iter()
99 .map(|(path, hashed_tokens, file_tokens)| FileData {
100 atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
101 path,
102 hashed_tokens,
103 file_tokens,
104 })
105 .collect();
106
107 let total_files = files.len();
109 let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
110 let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
111 let focus_file_ids = focus_files.map(|focus| {
116 let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus
117 .iter()
118 .map(|p| dunce::simplified(p).to_path_buf())
119 .collect();
120 files
121 .iter()
122 .map(|file| normalized.contains(dunce::simplified(&file.path)))
123 .collect::<Vec<_>>()
124 });
125
126 tracing::debug!(
127 total_files,
128 total_tokens,
129 total_lines,
130 focused_files = focus_file_ids
131 .as_ref()
132 .map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
133 "clone detection input"
134 );
135
136 let t0 = std::time::Instant::now();
138 let ranked_files = ranking::rank_reduce(&files);
139 let rank_time = t0.elapsed();
140 let unique_ranks: usize = ranked_files
141 .iter()
142 .flat_map(|f| f.iter())
143 .copied()
144 .max()
145 .map_or(0, |m| m as usize + 1);
146 tracing::debug!(
147 elapsed_us = rank_time.as_micros(),
148 unique_ranks,
149 "step1_rank_reduce"
150 );
151
152 let t0 = std::time::Instant::now();
154 let (text, file_of, file_offsets) =
155 concatenation::concatenate_with_sentinels(&ranked_files);
156 let concat_time = t0.elapsed();
157 tracing::debug!(
158 elapsed_us = concat_time.as_micros(),
159 concat_len = text.len(),
160 "step2_concatenate"
161 );
162
163 if text.is_empty() {
164 return empty_report(total_files);
165 }
166
167 let t0 = std::time::Instant::now();
169 let sa = suffix_array::build_suffix_array(&text);
170 let sa_time = t0.elapsed();
171 tracing::debug!(
172 elapsed_us = sa_time.as_micros(),
173 n = text.len(),
174 "step3_suffix_array"
175 );
176
177 let t0 = std::time::Instant::now();
179 let lcp_arr = lcp::build_lcp(&text, &sa);
180 let lcp_time = t0.elapsed();
181 tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
182
183 let t0 = std::time::Instant::now();
185 let raw_groups = extraction::extract_clone_groups(
186 &sa,
187 &lcp_arr,
188 &file_of,
189 &file_offsets,
190 self.min_tokens,
191 &files,
192 focus_file_ids.as_deref(),
193 );
194 let extract_time = t0.elapsed();
195 tracing::debug!(
196 elapsed_us = extract_time.as_micros(),
197 raw_groups = raw_groups.len(),
198 "step5_extract_groups"
199 );
200
201 let t0 = std::time::Instant::now();
203 let clone_groups =
204 filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
205 let build_time = t0.elapsed();
206 tracing::debug!(
207 elapsed_us = build_time.as_micros(),
208 final_groups = clone_groups.len(),
209 "step6_build_groups"
210 );
211
212 let t0 = std::time::Instant::now();
214 let stats =
215 statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
216 let stats_time = t0.elapsed();
217 tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
218
219 tracing::info!(
220 total_us = (rank_time
221 + concat_time
222 + sa_time
223 + lcp_time
224 + extract_time
225 + build_time
226 + stats_time)
227 .as_micros(),
228 rank_us = rank_time.as_micros(),
229 sa_us = sa_time.as_micros(),
230 lcp_us = lcp_time.as_micros(),
231 extract_us = extract_time.as_micros(),
232 build_us = build_time.as_micros(),
233 stats_us = stats_time.as_micros(),
234 total_tokens,
235 clone_groups = clone_groups.len(),
236 "clone detection complete"
237 );
238
239 DuplicationReport {
240 clone_groups,
241 clone_families: vec![], mirrored_directories: vec![],
243 stats,
244 }
245 }
246}
247
248const fn empty_report(total_files: usize) -> DuplicationReport {
250 DuplicationReport {
251 clone_groups: Vec::new(),
252 clone_families: Vec::new(),
253 mirrored_directories: Vec::new(),
254 stats: DuplicationStats {
255 total_files,
256 files_with_clones: 0,
257 total_lines: 0,
258 duplicated_lines: 0,
259 total_tokens: 0,
260 duplicated_tokens: 0,
261 clone_groups: 0,
262 clone_instances: 0,
263 duplication_percentage: 0.0,
264 clone_groups_below_min_occurrences: 0,
265 },
266 }
267}