fallow_core/duplicates/detect/
mod.rs1mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28struct FileData {
30 path: PathBuf,
31 hashed_tokens: Vec<HashedToken>,
32 file_tokens: FileTokens,
33 atomic_invocation_spans: Vec<Span>,
34}
35
36pub struct CloneDetector {
42 min_tokens: usize,
44 min_lines: usize,
46 skip_local: bool,
48}
49
50impl CloneDetector {
51 #[must_use]
53 pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
54 Self {
55 min_tokens,
56 min_lines,
57 skip_local,
58 }
59 }
60
61 pub fn detect(
66 &self,
67 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
68 ) -> DuplicationReport {
69 self.detect_inner(file_data, None)
70 }
71
72 pub fn detect_touching_files(
79 &self,
80 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
81 focus_files: &FxHashSet<PathBuf>,
82 ) -> DuplicationReport {
83 self.detect_inner(file_data, Some(focus_files))
84 }
85
86 fn detect_inner(
87 &self,
88 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
89 focus_files: Option<&FxHashSet<PathBuf>>,
90 ) -> DuplicationReport {
91 let _span = tracing::info_span!("clone_detect").entered();
92
93 if file_data.is_empty() || self.min_tokens == 0 {
94 return empty_report(0);
95 }
96
97 let files: Vec<FileData> = file_data
98 .into_iter()
99 .map(|(path, hashed_tokens, file_tokens)| FileData {
100 atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
101 path,
102 hashed_tokens,
103 file_tokens,
104 })
105 .collect();
106
107 let total_files = files.len();
108 let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
109 let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
110 let focus_file_ids = focus_files.map(|focus| build_focus_file_ids(&files, focus));
111 trace_clone_detection_input(
112 total_files,
113 total_tokens,
114 total_lines,
115 focus_file_ids.as_deref(),
116 );
117
118 let t0 = std::time::Instant::now();
119 let ranked_files = ranking::rank_reduce(&files);
120 let rank_time = t0.elapsed();
121 let unique_ranks: usize = ranked_files
122 .iter()
123 .flat_map(|f| f.iter())
124 .copied()
125 .max()
126 .map_or(0, |m| m as usize + 1);
127 tracing::debug!(
128 elapsed_us = rank_time.as_micros(),
129 unique_ranks,
130 "step1_rank_reduce"
131 );
132
133 let t0 = std::time::Instant::now();
134 let (text, file_of, file_offsets) =
135 concatenation::concatenate_with_sentinels(&ranked_files);
136 let concat_time = t0.elapsed();
137 tracing::debug!(
138 elapsed_us = concat_time.as_micros(),
139 concat_len = text.len(),
140 "step2_concatenate"
141 );
142
143 if text.is_empty() {
144 return empty_report(total_files);
145 }
146
147 let t0 = std::time::Instant::now();
148 let sa = suffix_array::build_suffix_array(&text);
149 let sa_time = t0.elapsed();
150 tracing::debug!(
151 elapsed_us = sa_time.as_micros(),
152 n = text.len(),
153 "step3_suffix_array"
154 );
155
156 let t0 = std::time::Instant::now();
157 let lcp_arr = lcp::build_lcp(&text, &sa);
158 let lcp_time = t0.elapsed();
159 tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
160
161 let t0 = std::time::Instant::now();
162 let raw_groups = extraction::extract_clone_groups(
163 &sa,
164 &lcp_arr,
165 &file_of,
166 &file_offsets,
167 self.min_tokens,
168 &files,
169 focus_file_ids.as_deref(),
170 );
171 let extract_time = t0.elapsed();
172 tracing::debug!(
173 elapsed_us = extract_time.as_micros(),
174 raw_groups = raw_groups.len(),
175 "step5_extract_groups"
176 );
177
178 let t0 = std::time::Instant::now();
179 let clone_groups =
180 filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
181 let build_time = t0.elapsed();
182 tracing::debug!(
183 elapsed_us = build_time.as_micros(),
184 final_groups = clone_groups.len(),
185 "step6_build_groups"
186 );
187
188 let t0 = std::time::Instant::now();
189 let stats =
190 statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
191 let stats_time = t0.elapsed();
192 tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
193
194 trace_clone_detection_complete(
195 &CloneDetectionTimings {
196 rank: rank_time,
197 concat: concat_time,
198 suffix_array: sa_time,
199 lcp: lcp_time,
200 extract: extract_time,
201 build: build_time,
202 stats: stats_time,
203 },
204 total_tokens,
205 clone_groups.len(),
206 );
207
208 DuplicationReport {
209 clone_groups,
210 clone_families: vec![], mirrored_directories: vec![],
212 stats,
213 }
214 }
215}
216
217struct CloneDetectionTimings {
218 rank: std::time::Duration,
219 concat: std::time::Duration,
220 suffix_array: std::time::Duration,
221 lcp: std::time::Duration,
222 extract: std::time::Duration,
223 build: std::time::Duration,
224 stats: std::time::Duration,
225}
226
227fn build_focus_file_ids(files: &[FileData], focus_files: &FxHashSet<PathBuf>) -> Vec<bool> {
228 let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus_files
229 .iter()
230 .map(|p| dunce::simplified(p).to_path_buf())
231 .collect();
232 files
233 .iter()
234 .map(|file| normalized.contains(dunce::simplified(&file.path)))
235 .collect()
236}
237
238fn trace_clone_detection_input(
239 total_files: usize,
240 total_tokens: usize,
241 total_lines: usize,
242 focus_file_ids: Option<&[bool]>,
243) {
244 tracing::debug!(
245 total_files,
246 total_tokens,
247 total_lines,
248 focused_files =
249 focus_file_ids.map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
250 "clone detection input"
251 );
252}
253
254fn trace_clone_detection_complete(
255 timings: &CloneDetectionTimings,
256 total_tokens: usize,
257 clone_groups: usize,
258) {
259 tracing::info!(
260 total_us = (timings.rank
261 + timings.concat
262 + timings.suffix_array
263 + timings.lcp
264 + timings.extract
265 + timings.build
266 + timings.stats)
267 .as_micros(),
268 rank_us = timings.rank.as_micros(),
269 sa_us = timings.suffix_array.as_micros(),
270 lcp_us = timings.lcp.as_micros(),
271 extract_us = timings.extract.as_micros(),
272 build_us = timings.build.as_micros(),
273 stats_us = timings.stats.as_micros(),
274 total_tokens,
275 clone_groups,
276 "clone detection complete"
277 );
278}
279
280const fn empty_report(total_files: usize) -> DuplicationReport {
282 DuplicationReport {
283 clone_groups: Vec::new(),
284 clone_families: Vec::new(),
285 mirrored_directories: Vec::new(),
286 stats: DuplicationStats {
287 total_files,
288 files_with_clones: 0,
289 total_lines: 0,
290 duplicated_lines: 0,
291 total_tokens: 0,
292 duplicated_tokens: 0,
293 clone_groups: 0,
294 clone_instances: 0,
295 duplication_percentage: 0.0,
296 clone_groups_below_min_occurrences: 0,
297 },
298 }
299}