Skip to main content

fallow_core/duplicates/detect/
mod.rs

1//! Suffix Array + LCP based clone detection engine.
2//!
3//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
4//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
5//! comparisons and naturally finds all maximal clones in a single linear pass.
6
7mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28/// Data for a single file being analyzed.
29struct FileData {
30    path: PathBuf,
31    hashed_tokens: Vec<HashedToken>,
32    file_tokens: FileTokens,
33    atomic_invocation_spans: Vec<Span>,
34}
35
36#[derive(Clone, Copy)]
37pub(super) struct CorpusTotals {
38    pub(super) files: usize,
39    pub(super) lines: usize,
40    pub(super) tokens: usize,
41}
42
43/// Suffix Array + LCP based clone detection engine.
44///
45/// Concatenates all files' token sequences (separated by unique sentinels),
46/// builds a suffix array and LCP array, then extracts maximal clone groups
47/// from contiguous LCP intervals.
48pub struct CloneDetector {
49    /// Minimum clone size in tokens.
50    min_tokens: usize,
51    /// Minimum clone size in lines.
52    min_lines: usize,
53    /// Only report cross-directory duplicates.
54    skip_local: bool,
55}
56
57impl CloneDetector {
58    /// Create a new detector with the given thresholds.
59    #[must_use]
60    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
61        Self {
62            min_tokens,
63            min_lines,
64            skip_local,
65        }
66    }
67
68    /// Run clone detection across all files.
69    ///
70    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
71    /// one per analyzed file.
72    pub fn detect(
73        &self,
74        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
75    ) -> DuplicationReport {
76        self.detect_inner(file_data, None, None)
77    }
78
79    pub(super) fn detect_with_totals(
80        &self,
81        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
82        totals: CorpusTotals,
83    ) -> DuplicationReport {
84        self.detect_inner(file_data, None, Some(totals))
85    }
86
87    /// Run clone detection while only materializing groups that touch one of the
88    /// given files.
89    ///
90    /// All files still participate in matching, so focused files can be reported
91    /// as duplicated against unchanged files. Groups that only involve
92    /// non-focused files are dropped before expensive result building.
93    pub fn detect_touching_files(
94        &self,
95        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
96        focus_files: &FxHashSet<PathBuf>,
97    ) -> DuplicationReport {
98        self.detect_inner(file_data, Some(focus_files), None)
99    }
100
101    fn detect_inner(
102        &self,
103        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
104        focus_files: Option<&FxHashSet<PathBuf>>,
105        corpus_totals: Option<CorpusTotals>,
106    ) -> DuplicationReport {
107        let _span = tracing::info_span!("clone_detect").entered();
108
109        if file_data.is_empty() || self.min_tokens == 0 {
110            return empty_report(corpus_totals.unwrap_or(CorpusTotals {
111                files: 0,
112                lines: 0,
113                tokens: 0,
114            }));
115        }
116
117        let files: Vec<FileData> = file_data
118            .into_iter()
119            .map(|(path, hashed_tokens, file_tokens)| FileData {
120                atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
121                path,
122                hashed_tokens,
123                file_tokens,
124            })
125            .collect();
126
127        let totals = corpus_totals.unwrap_or_else(|| CorpusTotals {
128            files: files.len(),
129            lines: files.iter().map(|f| f.file_tokens.line_count).sum(),
130            tokens: files.iter().map(|f| f.hashed_tokens.len()).sum(),
131        });
132        let focus_file_ids = focus_files.map(|focus| build_focus_file_ids(&files, focus));
133        trace_clone_detection_input(
134            totals.files,
135            totals.tokens,
136            totals.lines,
137            focus_file_ids.as_deref(),
138        );
139
140        let t0 = std::time::Instant::now();
141        let ranked_files = ranking::rank_reduce(&files);
142        let rank_time = t0.elapsed();
143        let unique_ranks: usize = ranked_files
144            .iter()
145            .flat_map(|f| f.iter())
146            .copied()
147            .max()
148            .map_or(0, |m| m as usize + 1);
149        tracing::debug!(
150            elapsed_us = rank_time.as_micros(),
151            unique_ranks,
152            "step1_rank_reduce"
153        );
154
155        let t0 = std::time::Instant::now();
156        let (text, file_of, file_offsets) =
157            concatenation::concatenate_with_sentinels(&ranked_files);
158        let concat_time = t0.elapsed();
159        tracing::debug!(
160            elapsed_us = concat_time.as_micros(),
161            concat_len = text.len(),
162            "step2_concatenate"
163        );
164
165        if text.is_empty() {
166            return empty_report(totals);
167        }
168
169        let t0 = std::time::Instant::now();
170        let sa = suffix_array::build_suffix_array(&text);
171        let sa_time = t0.elapsed();
172        tracing::debug!(
173            elapsed_us = sa_time.as_micros(),
174            n = text.len(),
175            "step3_suffix_array"
176        );
177
178        let t0 = std::time::Instant::now();
179        let lcp_arr = lcp::build_lcp(&text, &sa);
180        let lcp_time = t0.elapsed();
181        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
182
183        let t0 = std::time::Instant::now();
184        let raw_groups = extraction::extract_clone_groups(&extraction::CloneGroupExtractionInput {
185            sa: &sa,
186            lcp: &lcp_arr,
187            file_of: &file_of,
188            file_offsets: &file_offsets,
189            min_tokens: self.min_tokens,
190            files: &files,
191            focus_file_ids: focus_file_ids.as_deref(),
192        });
193        let extract_time = t0.elapsed();
194        tracing::debug!(
195            elapsed_us = extract_time.as_micros(),
196            raw_groups = raw_groups.len(),
197            "step5_extract_groups"
198        );
199
200        let t0 = std::time::Instant::now();
201        let clone_groups =
202            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
203        let build_time = t0.elapsed();
204        tracing::debug!(
205            elapsed_us = build_time.as_micros(),
206            final_groups = clone_groups.len(),
207            "step6_build_groups"
208        );
209
210        let t0 = std::time::Instant::now();
211        let stats =
212            statistics::compute_stats(&clone_groups, totals.files, totals.lines, totals.tokens);
213        let stats_time = t0.elapsed();
214        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
215
216        trace_clone_detection_complete(
217            &CloneDetectionTimings {
218                rank: rank_time,
219                concat: concat_time,
220                suffix_array: sa_time,
221                lcp: lcp_time,
222                extract: extract_time,
223                build: build_time,
224                stats: stats_time,
225            },
226            totals.tokens,
227            clone_groups.len(),
228        );
229
230        DuplicationReport {
231            clone_groups,
232            clone_families: vec![], // Populated by the caller after suppression filtering
233            mirrored_directories: vec![],
234            stats,
235        }
236    }
237}
238
239struct CloneDetectionTimings {
240    rank: std::time::Duration,
241    concat: std::time::Duration,
242    suffix_array: std::time::Duration,
243    lcp: std::time::Duration,
244    extract: std::time::Duration,
245    build: std::time::Duration,
246    stats: std::time::Duration,
247}
248
249fn build_focus_file_ids(files: &[FileData], focus_files: &FxHashSet<PathBuf>) -> Vec<bool> {
250    let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus_files
251        .iter()
252        .map(|p| dunce::simplified(p).to_path_buf())
253        .collect();
254    files
255        .iter()
256        .map(|file| normalized.contains(dunce::simplified(&file.path)))
257        .collect()
258}
259
260fn trace_clone_detection_input(
261    total_files: usize,
262    total_tokens: usize,
263    total_lines: usize,
264    focus_file_ids: Option<&[bool]>,
265) {
266    tracing::debug!(
267        total_files,
268        total_tokens,
269        total_lines,
270        focused_files =
271            focus_file_ids.map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
272        "clone detection input"
273    );
274}
275
276fn trace_clone_detection_complete(
277    timings: &CloneDetectionTimings,
278    total_tokens: usize,
279    clone_groups: usize,
280) {
281    tracing::info!(
282        total_us = (timings.rank
283            + timings.concat
284            + timings.suffix_array
285            + timings.lcp
286            + timings.extract
287            + timings.build
288            + timings.stats)
289            .as_micros(),
290        rank_us = timings.rank.as_micros(),
291        sa_us = timings.suffix_array.as_micros(),
292        lcp_us = timings.lcp.as_micros(),
293        extract_us = timings.extract.as_micros(),
294        build_us = timings.build.as_micros(),
295        stats_us = timings.stats.as_micros(),
296        total_tokens,
297        clone_groups,
298        "clone detection complete"
299    );
300}
301
302/// Create an empty report when there are no files to analyze.
303const fn empty_report(totals: CorpusTotals) -> DuplicationReport {
304    DuplicationReport {
305        clone_groups: Vec::new(),
306        clone_families: Vec::new(),
307        mirrored_directories: Vec::new(),
308        stats: DuplicationStats {
309            total_files: totals.files,
310            files_with_clones: 0,
311            total_lines: totals.lines,
312            duplicated_lines: 0,
313            total_tokens: totals.tokens,
314            duplicated_tokens: 0,
315            clone_groups: 0,
316            clone_instances: 0,
317            duplication_percentage: 0.0,
318            clone_groups_below_min_occurrences: 0,
319        },
320    }
321}