1mod boundary;
8mod concatenation;
9mod extraction;
10mod filtering;
11mod lcp;
12mod ranking;
13mod rolling;
14mod statistics;
15mod suffix_array;
16mod utils;
17
18#[cfg(test)]
19mod tests;
20
21use std::path::PathBuf;
22
23use oxc_span::Span;
24use rustc_hash::FxHashSet;
25
26use super::normalize::HashedToken;
27use super::tokenize::{FileTokens, TokenKind};
28use super::types::{DuplicationReport, DuplicationStats};
29
30const ROLLING_DETECTOR_ENV: &str = "FALLOW_DUPES_ROLLING";
31const ROLLING_BOUNDARY_FALLBACK_FILE_RATIO: f64 = 0.20;
32
33struct FileData {
35 path: PathBuf,
36 hashed_tokens: Vec<HashedToken>,
37 file_tokens: FileTokens,
38 atomic_invocation_spans: Vec<Span>,
39}
40
41#[derive(Clone, Copy)]
42pub(super) struct CorpusTotals {
43 pub(super) files: usize,
44 pub(super) lines: usize,
45 pub(super) tokens: usize,
46}
47
48pub struct CloneDetector {
54 min_tokens: usize,
56 min_lines: usize,
58 skip_local: bool,
60}
61
62impl CloneDetector {
63 #[must_use]
65 pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
66 Self {
67 min_tokens,
68 min_lines,
69 skip_local,
70 }
71 }
72
73 pub fn detect(
78 &self,
79 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
80 ) -> DuplicationReport {
81 self.detect_inner(file_data, None, None)
82 }
83
84 pub(super) fn detect_with_totals(
85 &self,
86 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
87 totals: CorpusTotals,
88 ) -> DuplicationReport {
89 self.detect_inner(file_data, None, Some(totals))
90 }
91
92 pub fn detect_touching_files(
99 &self,
100 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
101 focus_files: &FxHashSet<PathBuf>,
102 ) -> DuplicationReport {
103 self.detect_inner(file_data, Some(focus_files), None)
104 }
105
106 fn detect_inner(
107 &self,
108 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
109 focus_files: Option<&FxHashSet<PathBuf>>,
110 corpus_totals: Option<CorpusTotals>,
111 ) -> DuplicationReport {
112 let _span = tracing::info_span!("clone_detect").entered();
113
114 if file_data.is_empty() || self.min_tokens == 0 {
115 return empty_report(corpus_totals.unwrap_or(CorpusTotals {
116 files: 0,
117 lines: 0,
118 tokens: 0,
119 }));
120 }
121
122 let (files, totals, focus_file_ids) =
123 build_detection_inputs(file_data, focus_files, corpus_totals);
124 self.run_clone_pipeline(&files, totals, focus_file_ids.as_deref())
125 }
126
127 fn run_clone_pipeline(
130 &self,
131 files: &[FileData],
132 totals: CorpusTotals,
133 focus_file_ids: Option<&[bool]>,
134 ) -> DuplicationReport {
135 if std::env::var_os(ROLLING_DETECTOR_ENV).is_some() {
136 let boundary_summary = summarize_boundaries(files);
137 if !boundary_summary.is_component_heavy {
138 return rolling::detect(
139 files,
140 self.min_tokens,
141 self.min_lines,
142 self.skip_local,
143 totals,
144 boundary_summary.has_any_boundary,
145 );
146 }
147 }
148
149 let (text, file_of, file_offsets, rank_time, concat_time) = rank_and_concatenate(files);
150
151 if text.is_empty() {
152 return empty_report(totals);
153 }
154
155 let (raw_groups, sa_time, lcp_time, extract_time) =
156 self.extract_raw_groups(&text, &file_of, &file_offsets, files, focus_file_ids);
157
158 let t0 = std::time::Instant::now();
159 let clone_groups =
160 filtering::build_groups(raw_groups, files, self.min_lines, self.skip_local);
161 let build_time = t0.elapsed();
162 tracing::debug!(
163 elapsed_us = build_time.as_micros(),
164 final_groups = clone_groups.len(),
165 "step6_build_groups"
166 );
167
168 let t0 = std::time::Instant::now();
169 let stats =
170 statistics::compute_stats(&clone_groups, totals.files, totals.lines, totals.tokens);
171 let stats_time = t0.elapsed();
172 tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
173
174 trace_clone_detection_complete(
175 &CloneDetectionTimings {
176 rank: rank_time,
177 concat: concat_time,
178 suffix_array: sa_time,
179 lcp: lcp_time,
180 extract: extract_time,
181 build: build_time,
182 stats: stats_time,
183 },
184 totals.tokens,
185 clone_groups.len(),
186 );
187
188 DuplicationReport {
189 clone_groups,
190 clone_families: vec![], mirrored_directories: vec![],
192 stats,
193 }
194 }
195
196 fn extract_raw_groups(
200 &self,
201 text: &[i64],
202 file_of: &[usize],
203 file_offsets: &[usize],
204 files: &[FileData],
205 focus_file_ids: Option<&[bool]>,
206 ) -> (
207 Vec<extraction::RawGroup>,
208 std::time::Duration,
209 std::time::Duration,
210 std::time::Duration,
211 ) {
212 let t0 = std::time::Instant::now();
213 let sa = suffix_array::build_suffix_array(text);
214 let sa_time = t0.elapsed();
215 tracing::debug!(
216 elapsed_us = sa_time.as_micros(),
217 n = text.len(),
218 "step3_suffix_array"
219 );
220
221 let t0 = std::time::Instant::now();
222 let lcp_arr = lcp::build_lcp(text, &sa);
223 let lcp_time = t0.elapsed();
224 tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
225
226 let t0 = std::time::Instant::now();
227 let raw_groups = extraction::extract_clone_groups(&extraction::CloneGroupExtractionInput {
228 sa: &sa,
229 lcp: &lcp_arr,
230 file_of,
231 file_offsets,
232 min_tokens: self.min_tokens,
233 files,
234 focus_file_ids,
235 may_have_boundaries: files_may_have_boundaries(files),
236 });
237 let extract_time = t0.elapsed();
238 tracing::debug!(
239 elapsed_us = extract_time.as_micros(),
240 raw_groups = raw_groups.len(),
241 "step5_extract_groups"
242 );
243
244 (raw_groups, sa_time, lcp_time, extract_time)
245 }
246}
247
248fn build_detection_inputs(
251 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
252 focus_files: Option<&FxHashSet<PathBuf>>,
253 corpus_totals: Option<CorpusTotals>,
254) -> (Vec<FileData>, CorpusTotals, Option<Vec<bool>>) {
255 let files: Vec<FileData> = file_data
256 .into_iter()
257 .map(|(path, hashed_tokens, file_tokens)| FileData {
258 atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
259 path,
260 hashed_tokens,
261 file_tokens,
262 })
263 .collect();
264
265 let totals = corpus_totals.unwrap_or_else(|| CorpusTotals {
266 files: files.len(),
267 lines: files.iter().map(|f| f.file_tokens.line_count).sum(),
268 tokens: files.iter().map(|f| f.hashed_tokens.len()).sum(),
269 });
270 let focus_file_ids = focus_files.map(|focus| build_focus_file_ids(&files, focus));
271 trace_clone_detection_input(
272 totals.files,
273 totals.tokens,
274 totals.lines,
275 focus_file_ids.as_deref(),
276 );
277 (files, totals, focus_file_ids)
278}
279
280fn rank_and_concatenate(
284 files: &[FileData],
285) -> (
286 Vec<i64>,
287 Vec<usize>,
288 Vec<usize>,
289 std::time::Duration,
290 std::time::Duration,
291) {
292 let t0 = std::time::Instant::now();
293 let ranked_files = ranking::rank_reduce(files);
294 let rank_time = t0.elapsed();
295 let unique_ranks: usize = ranked_files
296 .iter()
297 .flat_map(|f| f.iter())
298 .copied()
299 .max()
300 .map_or(0, |m| m as usize + 1);
301 tracing::debug!(
302 elapsed_us = rank_time.as_micros(),
303 unique_ranks,
304 "step1_rank_reduce"
305 );
306
307 let t0 = std::time::Instant::now();
308 let (text, file_of, file_offsets) = concatenation::concatenate_with_sentinels(&ranked_files);
309 let concat_time = t0.elapsed();
310 tracing::debug!(
311 elapsed_us = concat_time.as_micros(),
312 concat_len = text.len(),
313 "step2_concatenate"
314 );
315
316 (text, file_of, file_offsets, rank_time, concat_time)
317}
318
319struct CloneDetectionTimings {
320 rank: std::time::Duration,
321 concat: std::time::Duration,
322 suffix_array: std::time::Duration,
323 lcp: std::time::Duration,
324 extract: std::time::Duration,
325 build: std::time::Duration,
326 stats: std::time::Duration,
327}
328
329fn build_focus_file_ids(files: &[FileData], focus_files: &FxHashSet<PathBuf>) -> Vec<bool> {
330 let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus_files
331 .iter()
332 .map(|p| dunce::simplified(p).to_path_buf())
333 .collect();
334 files
335 .iter()
336 .map(|file| normalized.contains(dunce::simplified(&file.path)))
337 .collect()
338}
339
340#[derive(Clone, Copy)]
341struct BoundarySummary {
342 is_component_heavy: bool,
343 has_any_boundary: bool,
344}
345
346fn summarize_boundaries(files: &[FileData]) -> BoundarySummary {
347 let mut files_with_tokens = 0_usize;
348 let mut files_with_boundaries = 0_usize;
349
350 for file in files {
351 if file.hashed_tokens.is_empty() {
352 continue;
353 }
354
355 files_with_tokens += 1;
356 if file_has_boundary(file) {
357 files_with_boundaries += 1;
358 }
359 }
360
361 BoundarySummary {
362 is_component_heavy: files_with_tokens > 0
363 && (files_with_boundaries as f64 / files_with_tokens as f64)
364 >= ROLLING_BOUNDARY_FALLBACK_FILE_RATIO,
365 has_any_boundary: files_with_boundaries > 0,
366 }
367}
368
369fn file_has_boundary(file: &FileData) -> bool {
370 file.hashed_tokens.iter().any(|token| {
371 file.file_tokens
372 .tokens
373 .get(token.original_index)
374 .is_some_and(|source| matches!(source.kind, TokenKind::Boundary(_)))
375 })
376}
377
378fn files_may_have_boundaries(files: &[FileData]) -> bool {
379 files.iter().any(file_may_have_boundaries)
380}
381
382fn file_may_have_boundaries(file: &FileData) -> bool {
383 matches!(
384 file.path
385 .extension()
386 .and_then(|extension| extension.to_str()),
387 Some("astro" | "css" | "less" | "sass" | "scss" | "svelte" | "vue")
388 )
389}
390
391fn trace_clone_detection_input(
392 total_files: usize,
393 total_tokens: usize,
394 total_lines: usize,
395 focus_file_ids: Option<&[bool]>,
396) {
397 tracing::debug!(
398 total_files,
399 total_tokens,
400 total_lines,
401 focused_files =
402 focus_file_ids.map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
403 "clone detection input"
404 );
405}
406
407fn trace_clone_detection_complete(
408 timings: &CloneDetectionTimings,
409 total_tokens: usize,
410 clone_groups: usize,
411) {
412 tracing::info!(
413 total_us = (timings.rank
414 + timings.concat
415 + timings.suffix_array
416 + timings.lcp
417 + timings.extract
418 + timings.build
419 + timings.stats)
420 .as_micros(),
421 rank_us = timings.rank.as_micros(),
422 sa_us = timings.suffix_array.as_micros(),
423 lcp_us = timings.lcp.as_micros(),
424 extract_us = timings.extract.as_micros(),
425 build_us = timings.build.as_micros(),
426 stats_us = timings.stats.as_micros(),
427 total_tokens,
428 clone_groups,
429 "clone detection complete"
430 );
431}
432
433const fn empty_report(totals: CorpusTotals) -> DuplicationReport {
435 DuplicationReport {
436 clone_groups: Vec::new(),
437 clone_families: Vec::new(),
438 mirrored_directories: Vec::new(),
439 stats: DuplicationStats {
440 total_files: totals.files,
441 files_with_clones: 0,
442 total_lines: totals.lines,
443 duplicated_lines: 0,
444 total_tokens: totals.tokens,
445 duplicated_tokens: 0,
446 clone_groups: 0,
447 clone_instances: 0,
448 duplication_percentage: 0.0,
449 clone_groups_below_min_occurrences: 0,
450 },
451 }
452}