fallow_core/duplicates/detect/
mod.rs1mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28struct FileData {
30 path: PathBuf,
31 hashed_tokens: Vec<HashedToken>,
32 file_tokens: FileTokens,
33 atomic_invocation_spans: Vec<Span>,
34}
35
36#[derive(Clone, Copy)]
37pub(super) struct CorpusTotals {
38 pub(super) files: usize,
39 pub(super) lines: usize,
40 pub(super) tokens: usize,
41}
42
43pub struct CloneDetector {
49 min_tokens: usize,
51 min_lines: usize,
53 skip_local: bool,
55}
56
57impl CloneDetector {
58 #[must_use]
60 pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
61 Self {
62 min_tokens,
63 min_lines,
64 skip_local,
65 }
66 }
67
68 pub fn detect(
73 &self,
74 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
75 ) -> DuplicationReport {
76 self.detect_inner(file_data, None, None)
77 }
78
79 pub(super) fn detect_with_totals(
80 &self,
81 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
82 totals: CorpusTotals,
83 ) -> DuplicationReport {
84 self.detect_inner(file_data, None, Some(totals))
85 }
86
87 pub fn detect_touching_files(
94 &self,
95 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
96 focus_files: &FxHashSet<PathBuf>,
97 ) -> DuplicationReport {
98 self.detect_inner(file_data, Some(focus_files), None)
99 }
100
101 fn detect_inner(
102 &self,
103 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
104 focus_files: Option<&FxHashSet<PathBuf>>,
105 corpus_totals: Option<CorpusTotals>,
106 ) -> DuplicationReport {
107 let _span = tracing::info_span!("clone_detect").entered();
108
109 if file_data.is_empty() || self.min_tokens == 0 {
110 return empty_report(corpus_totals.unwrap_or(CorpusTotals {
111 files: 0,
112 lines: 0,
113 tokens: 0,
114 }));
115 }
116
117 let files: Vec<FileData> = file_data
118 .into_iter()
119 .map(|(path, hashed_tokens, file_tokens)| FileData {
120 atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
121 path,
122 hashed_tokens,
123 file_tokens,
124 })
125 .collect();
126
127 let totals = corpus_totals.unwrap_or_else(|| CorpusTotals {
128 files: files.len(),
129 lines: files.iter().map(|f| f.file_tokens.line_count).sum(),
130 tokens: files.iter().map(|f| f.hashed_tokens.len()).sum(),
131 });
132 let focus_file_ids = focus_files.map(|focus| build_focus_file_ids(&files, focus));
133 trace_clone_detection_input(
134 totals.files,
135 totals.tokens,
136 totals.lines,
137 focus_file_ids.as_deref(),
138 );
139
140 let t0 = std::time::Instant::now();
141 let ranked_files = ranking::rank_reduce(&files);
142 let rank_time = t0.elapsed();
143 let unique_ranks: usize = ranked_files
144 .iter()
145 .flat_map(|f| f.iter())
146 .copied()
147 .max()
148 .map_or(0, |m| m as usize + 1);
149 tracing::debug!(
150 elapsed_us = rank_time.as_micros(),
151 unique_ranks,
152 "step1_rank_reduce"
153 );
154
155 let t0 = std::time::Instant::now();
156 let (text, file_of, file_offsets) =
157 concatenation::concatenate_with_sentinels(&ranked_files);
158 let concat_time = t0.elapsed();
159 tracing::debug!(
160 elapsed_us = concat_time.as_micros(),
161 concat_len = text.len(),
162 "step2_concatenate"
163 );
164
165 if text.is_empty() {
166 return empty_report(totals);
167 }
168
169 let t0 = std::time::Instant::now();
170 let sa = suffix_array::build_suffix_array(&text);
171 let sa_time = t0.elapsed();
172 tracing::debug!(
173 elapsed_us = sa_time.as_micros(),
174 n = text.len(),
175 "step3_suffix_array"
176 );
177
178 let t0 = std::time::Instant::now();
179 let lcp_arr = lcp::build_lcp(&text, &sa);
180 let lcp_time = t0.elapsed();
181 tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
182
183 let t0 = std::time::Instant::now();
184 let raw_groups = extraction::extract_clone_groups(&extraction::CloneGroupExtractionInput {
185 sa: &sa,
186 lcp: &lcp_arr,
187 file_of: &file_of,
188 file_offsets: &file_offsets,
189 min_tokens: self.min_tokens,
190 files: &files,
191 focus_file_ids: focus_file_ids.as_deref(),
192 });
193 let extract_time = t0.elapsed();
194 tracing::debug!(
195 elapsed_us = extract_time.as_micros(),
196 raw_groups = raw_groups.len(),
197 "step5_extract_groups"
198 );
199
200 let t0 = std::time::Instant::now();
201 let clone_groups =
202 filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
203 let build_time = t0.elapsed();
204 tracing::debug!(
205 elapsed_us = build_time.as_micros(),
206 final_groups = clone_groups.len(),
207 "step6_build_groups"
208 );
209
210 let t0 = std::time::Instant::now();
211 let stats =
212 statistics::compute_stats(&clone_groups, totals.files, totals.lines, totals.tokens);
213 let stats_time = t0.elapsed();
214 tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
215
216 trace_clone_detection_complete(
217 &CloneDetectionTimings {
218 rank: rank_time,
219 concat: concat_time,
220 suffix_array: sa_time,
221 lcp: lcp_time,
222 extract: extract_time,
223 build: build_time,
224 stats: stats_time,
225 },
226 totals.tokens,
227 clone_groups.len(),
228 );
229
230 DuplicationReport {
231 clone_groups,
232 clone_families: vec![], mirrored_directories: vec![],
234 stats,
235 }
236 }
237}
238
239struct CloneDetectionTimings {
240 rank: std::time::Duration,
241 concat: std::time::Duration,
242 suffix_array: std::time::Duration,
243 lcp: std::time::Duration,
244 extract: std::time::Duration,
245 build: std::time::Duration,
246 stats: std::time::Duration,
247}
248
249fn build_focus_file_ids(files: &[FileData], focus_files: &FxHashSet<PathBuf>) -> Vec<bool> {
250 let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus_files
251 .iter()
252 .map(|p| dunce::simplified(p).to_path_buf())
253 .collect();
254 files
255 .iter()
256 .map(|file| normalized.contains(dunce::simplified(&file.path)))
257 .collect()
258}
259
260fn trace_clone_detection_input(
261 total_files: usize,
262 total_tokens: usize,
263 total_lines: usize,
264 focus_file_ids: Option<&[bool]>,
265) {
266 tracing::debug!(
267 total_files,
268 total_tokens,
269 total_lines,
270 focused_files =
271 focus_file_ids.map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
272 "clone detection input"
273 );
274}
275
276fn trace_clone_detection_complete(
277 timings: &CloneDetectionTimings,
278 total_tokens: usize,
279 clone_groups: usize,
280) {
281 tracing::info!(
282 total_us = (timings.rank
283 + timings.concat
284 + timings.suffix_array
285 + timings.lcp
286 + timings.extract
287 + timings.build
288 + timings.stats)
289 .as_micros(),
290 rank_us = timings.rank.as_micros(),
291 sa_us = timings.suffix_array.as_micros(),
292 lcp_us = timings.lcp.as_micros(),
293 extract_us = timings.extract.as_micros(),
294 build_us = timings.build.as_micros(),
295 stats_us = timings.stats.as_micros(),
296 total_tokens,
297 clone_groups,
298 "clone detection complete"
299 );
300}
301
302const fn empty_report(totals: CorpusTotals) -> DuplicationReport {
304 DuplicationReport {
305 clone_groups: Vec::new(),
306 clone_families: Vec::new(),
307 mirrored_directories: Vec::new(),
308 stats: DuplicationStats {
309 total_files: totals.files,
310 files_with_clones: 0,
311 total_lines: totals.lines,
312 duplicated_lines: 0,
313 total_tokens: totals.tokens,
314 duplicated_tokens: 0,
315 clone_groups: 0,
316 clone_instances: 0,
317 duplication_percentage: 0.0,
318 clone_groups_below_min_occurrences: 0,
319 },
320 }
321}