Skip to main content

fallow_core/duplicates/detect/
mod.rs

1//! Suffix Array + LCP based clone detection engine.
2//!
3//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
4//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
5//! comparisons and naturally finds all maximal clones in a single linear pass.
6
7mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28/// Data for a single file being analyzed.
29struct FileData {
30    path: PathBuf,
31    hashed_tokens: Vec<HashedToken>,
32    file_tokens: FileTokens,
33    atomic_invocation_spans: Vec<Span>,
34}
35
36/// Suffix Array + LCP based clone detection engine.
37///
38/// Concatenates all files' token sequences (separated by unique sentinels),
39/// builds a suffix array and LCP array, then extracts maximal clone groups
40/// from contiguous LCP intervals.
41pub struct CloneDetector {
42    /// Minimum clone size in tokens.
43    min_tokens: usize,
44    /// Minimum clone size in lines.
45    min_lines: usize,
46    /// Only report cross-directory duplicates.
47    skip_local: bool,
48}
49
50impl CloneDetector {
51    /// Create a new detector with the given thresholds.
52    #[must_use]
53    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
54        Self {
55            min_tokens,
56            min_lines,
57            skip_local,
58        }
59    }
60
61    /// Run clone detection across all files.
62    ///
63    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
64    /// one per analyzed file.
65    pub fn detect(
66        &self,
67        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
68    ) -> DuplicationReport {
69        self.detect_inner(file_data, None)
70    }
71
72    /// Run clone detection while only materializing groups that touch one of the
73    /// given files.
74    ///
75    /// All files still participate in matching, so focused files can be reported
76    /// as duplicated against unchanged files. Groups that only involve
77    /// non-focused files are dropped before expensive result building.
78    pub fn detect_touching_files(
79        &self,
80        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
81        focus_files: &FxHashSet<PathBuf>,
82    ) -> DuplicationReport {
83        self.detect_inner(file_data, Some(focus_files))
84    }
85
86    fn detect_inner(
87        &self,
88        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
89        focus_files: Option<&FxHashSet<PathBuf>>,
90    ) -> DuplicationReport {
91        let _span = tracing::info_span!("clone_detect").entered();
92
93        if file_data.is_empty() || self.min_tokens == 0 {
94            return empty_report(0);
95        }
96
97        let files: Vec<FileData> = file_data
98            .into_iter()
99            .map(|(path, hashed_tokens, file_tokens)| FileData {
100                atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
101                path,
102                hashed_tokens,
103                file_tokens,
104            })
105            .collect();
106
107        let total_files = files.len();
108        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
109        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
110        let focus_file_ids = focus_files.map(|focus| build_focus_file_ids(&files, focus));
111        trace_clone_detection_input(
112            total_files,
113            total_tokens,
114            total_lines,
115            focus_file_ids.as_deref(),
116        );
117
118        let t0 = std::time::Instant::now();
119        let ranked_files = ranking::rank_reduce(&files);
120        let rank_time = t0.elapsed();
121        let unique_ranks: usize = ranked_files
122            .iter()
123            .flat_map(|f| f.iter())
124            .copied()
125            .max()
126            .map_or(0, |m| m as usize + 1);
127        tracing::debug!(
128            elapsed_us = rank_time.as_micros(),
129            unique_ranks,
130            "step1_rank_reduce"
131        );
132
133        let t0 = std::time::Instant::now();
134        let (text, file_of, file_offsets) =
135            concatenation::concatenate_with_sentinels(&ranked_files);
136        let concat_time = t0.elapsed();
137        tracing::debug!(
138            elapsed_us = concat_time.as_micros(),
139            concat_len = text.len(),
140            "step2_concatenate"
141        );
142
143        if text.is_empty() {
144            return empty_report(total_files);
145        }
146
147        let t0 = std::time::Instant::now();
148        let sa = suffix_array::build_suffix_array(&text);
149        let sa_time = t0.elapsed();
150        tracing::debug!(
151            elapsed_us = sa_time.as_micros(),
152            n = text.len(),
153            "step3_suffix_array"
154        );
155
156        let t0 = std::time::Instant::now();
157        let lcp_arr = lcp::build_lcp(&text, &sa);
158        let lcp_time = t0.elapsed();
159        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
160
161        let t0 = std::time::Instant::now();
162        let raw_groups = extraction::extract_clone_groups(
163            &sa,
164            &lcp_arr,
165            &file_of,
166            &file_offsets,
167            self.min_tokens,
168            &files,
169            focus_file_ids.as_deref(),
170        );
171        let extract_time = t0.elapsed();
172        tracing::debug!(
173            elapsed_us = extract_time.as_micros(),
174            raw_groups = raw_groups.len(),
175            "step5_extract_groups"
176        );
177
178        let t0 = std::time::Instant::now();
179        let clone_groups =
180            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
181        let build_time = t0.elapsed();
182        tracing::debug!(
183            elapsed_us = build_time.as_micros(),
184            final_groups = clone_groups.len(),
185            "step6_build_groups"
186        );
187
188        let t0 = std::time::Instant::now();
189        let stats =
190            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
191        let stats_time = t0.elapsed();
192        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
193
194        trace_clone_detection_complete(
195            &CloneDetectionTimings {
196                rank: rank_time,
197                concat: concat_time,
198                suffix_array: sa_time,
199                lcp: lcp_time,
200                extract: extract_time,
201                build: build_time,
202                stats: stats_time,
203            },
204            total_tokens,
205            clone_groups.len(),
206        );
207
208        DuplicationReport {
209            clone_groups,
210            clone_families: vec![], // Populated by the caller after suppression filtering
211            mirrored_directories: vec![],
212            stats,
213        }
214    }
215}
216
217struct CloneDetectionTimings {
218    rank: std::time::Duration,
219    concat: std::time::Duration,
220    suffix_array: std::time::Duration,
221    lcp: std::time::Duration,
222    extract: std::time::Duration,
223    build: std::time::Duration,
224    stats: std::time::Duration,
225}
226
227fn build_focus_file_ids(files: &[FileData], focus_files: &FxHashSet<PathBuf>) -> Vec<bool> {
228    let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus_files
229        .iter()
230        .map(|p| dunce::simplified(p).to_path_buf())
231        .collect();
232    files
233        .iter()
234        .map(|file| normalized.contains(dunce::simplified(&file.path)))
235        .collect()
236}
237
238fn trace_clone_detection_input(
239    total_files: usize,
240    total_tokens: usize,
241    total_lines: usize,
242    focus_file_ids: Option<&[bool]>,
243) {
244    tracing::debug!(
245        total_files,
246        total_tokens,
247        total_lines,
248        focused_files =
249            focus_file_ids.map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
250        "clone detection input"
251    );
252}
253
254fn trace_clone_detection_complete(
255    timings: &CloneDetectionTimings,
256    total_tokens: usize,
257    clone_groups: usize,
258) {
259    tracing::info!(
260        total_us = (timings.rank
261            + timings.concat
262            + timings.suffix_array
263            + timings.lcp
264            + timings.extract
265            + timings.build
266            + timings.stats)
267            .as_micros(),
268        rank_us = timings.rank.as_micros(),
269        sa_us = timings.suffix_array.as_micros(),
270        lcp_us = timings.lcp.as_micros(),
271        extract_us = timings.extract.as_micros(),
272        build_us = timings.build.as_micros(),
273        stats_us = timings.stats.as_micros(),
274        total_tokens,
275        clone_groups,
276        "clone detection complete"
277    );
278}
279
280/// Create an empty report when there are no files to analyze.
281const fn empty_report(total_files: usize) -> DuplicationReport {
282    DuplicationReport {
283        clone_groups: Vec::new(),
284        clone_families: Vec::new(),
285        mirrored_directories: Vec::new(),
286        stats: DuplicationStats {
287            total_files,
288            files_with_clones: 0,
289            total_lines: 0,
290            duplicated_lines: 0,
291            total_tokens: 0,
292            duplicated_tokens: 0,
293            clone_groups: 0,
294            clone_instances: 0,
295            duplication_percentage: 0.0,
296            clone_groups_below_min_occurrences: 0,
297        },
298    }
299}