Skip to main content

fallow_core/duplicates/detect/
mod.rs

1//! Suffix Array + LCP based clone detection engine.
2//!
3//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
4//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
5//! comparisons and naturally finds all maximal clones in a single linear pass.
6
7mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28/// Data for a single file being analyzed.
29struct FileData {
30    path: PathBuf,
31    hashed_tokens: Vec<HashedToken>,
32    file_tokens: FileTokens,
33    atomic_invocation_spans: Vec<Span>,
34}
35
36/// Suffix Array + LCP based clone detection engine.
37///
38/// Concatenates all files' token sequences (separated by unique sentinels),
39/// builds a suffix array and LCP array, then extracts maximal clone groups
40/// from contiguous LCP intervals.
41pub struct CloneDetector {
42    /// Minimum clone size in tokens.
43    min_tokens: usize,
44    /// Minimum clone size in lines.
45    min_lines: usize,
46    /// Only report cross-directory duplicates.
47    skip_local: bool,
48}
49
50impl CloneDetector {
51    /// Create a new detector with the given thresholds.
52    #[must_use]
53    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
54        Self {
55            min_tokens,
56            min_lines,
57            skip_local,
58        }
59    }
60
61    /// Run clone detection across all files.
62    ///
63    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
64    /// one per analyzed file.
65    pub fn detect(
66        &self,
67        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
68    ) -> DuplicationReport {
69        self.detect_inner(file_data, None)
70    }
71
72    /// Run clone detection while only materializing groups that touch one of the
73    /// given files.
74    ///
75    /// All files still participate in matching, so focused files can be reported
76    /// as duplicated against unchanged files. Groups that only involve
77    /// non-focused files are dropped before expensive result building.
78    pub fn detect_touching_files(
79        &self,
80        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
81        focus_files: &FxHashSet<PathBuf>,
82    ) -> DuplicationReport {
83        self.detect_inner(file_data, Some(focus_files))
84    }
85
86    fn detect_inner(
87        &self,
88        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
89        focus_files: Option<&FxHashSet<PathBuf>>,
90    ) -> DuplicationReport {
91        let _span = tracing::info_span!("clone_detect").entered();
92
93        if file_data.is_empty() || self.min_tokens == 0 {
94            return empty_report(0);
95        }
96
97        let files: Vec<FileData> = file_data
98            .into_iter()
99            .map(|(path, hashed_tokens, file_tokens)| FileData {
100                atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
101                path,
102                hashed_tokens,
103                file_tokens,
104            })
105            .collect();
106
107        let total_files = files.len();
108        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
109        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
110        let focus_file_ids = focus_files.map(|focus| {
111            let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus
112                .iter()
113                .map(|p| dunce::simplified(p).to_path_buf())
114                .collect();
115            files
116                .iter()
117                .map(|file| normalized.contains(dunce::simplified(&file.path)))
118                .collect::<Vec<_>>()
119        });
120
121        tracing::debug!(
122            total_files,
123            total_tokens,
124            total_lines,
125            focused_files = focus_file_ids
126                .as_ref()
127                .map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
128            "clone detection input"
129        );
130
131        let t0 = std::time::Instant::now();
132        let ranked_files = ranking::rank_reduce(&files);
133        let rank_time = t0.elapsed();
134        let unique_ranks: usize = ranked_files
135            .iter()
136            .flat_map(|f| f.iter())
137            .copied()
138            .max()
139            .map_or(0, |m| m as usize + 1);
140        tracing::debug!(
141            elapsed_us = rank_time.as_micros(),
142            unique_ranks,
143            "step1_rank_reduce"
144        );
145
146        let t0 = std::time::Instant::now();
147        let (text, file_of, file_offsets) =
148            concatenation::concatenate_with_sentinels(&ranked_files);
149        let concat_time = t0.elapsed();
150        tracing::debug!(
151            elapsed_us = concat_time.as_micros(),
152            concat_len = text.len(),
153            "step2_concatenate"
154        );
155
156        if text.is_empty() {
157            return empty_report(total_files);
158        }
159
160        let t0 = std::time::Instant::now();
161        let sa = suffix_array::build_suffix_array(&text);
162        let sa_time = t0.elapsed();
163        tracing::debug!(
164            elapsed_us = sa_time.as_micros(),
165            n = text.len(),
166            "step3_suffix_array"
167        );
168
169        let t0 = std::time::Instant::now();
170        let lcp_arr = lcp::build_lcp(&text, &sa);
171        let lcp_time = t0.elapsed();
172        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
173
174        let t0 = std::time::Instant::now();
175        let raw_groups = extraction::extract_clone_groups(
176            &sa,
177            &lcp_arr,
178            &file_of,
179            &file_offsets,
180            self.min_tokens,
181            &files,
182            focus_file_ids.as_deref(),
183        );
184        let extract_time = t0.elapsed();
185        tracing::debug!(
186            elapsed_us = extract_time.as_micros(),
187            raw_groups = raw_groups.len(),
188            "step5_extract_groups"
189        );
190
191        let t0 = std::time::Instant::now();
192        let clone_groups =
193            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
194        let build_time = t0.elapsed();
195        tracing::debug!(
196            elapsed_us = build_time.as_micros(),
197            final_groups = clone_groups.len(),
198            "step6_build_groups"
199        );
200
201        let t0 = std::time::Instant::now();
202        let stats =
203            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
204        let stats_time = t0.elapsed();
205        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
206
207        tracing::info!(
208            total_us = (rank_time
209                + concat_time
210                + sa_time
211                + lcp_time
212                + extract_time
213                + build_time
214                + stats_time)
215                .as_micros(),
216            rank_us = rank_time.as_micros(),
217            sa_us = sa_time.as_micros(),
218            lcp_us = lcp_time.as_micros(),
219            extract_us = extract_time.as_micros(),
220            build_us = build_time.as_micros(),
221            stats_us = stats_time.as_micros(),
222            total_tokens,
223            clone_groups = clone_groups.len(),
224            "clone detection complete"
225        );
226
227        DuplicationReport {
228            clone_groups,
229            clone_families: vec![], // Populated by the caller after suppression filtering
230            mirrored_directories: vec![],
231            stats,
232        }
233    }
234}
235
236/// Create an empty report when there are no files to analyze.
237const fn empty_report(total_files: usize) -> DuplicationReport {
238    DuplicationReport {
239        clone_groups: Vec::new(),
240        clone_families: Vec::new(),
241        mirrored_directories: Vec::new(),
242        stats: DuplicationStats {
243            total_files,
244            files_with_clones: 0,
245            total_lines: 0,
246            duplicated_lines: 0,
247            total_tokens: 0,
248            duplicated_tokens: 0,
249            clone_groups: 0,
250            clone_instances: 0,
251            duplication_percentage: 0.0,
252            clone_groups_below_min_occurrences: 0,
253        },
254    }
255}