Skip to main content

fallow_core/duplicates/detect/
mod.rs

1//! Suffix Array + LCP based clone detection engine.
2//!
3//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
4//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
5//! comparisons and naturally finds all maximal clones in a single linear pass.
6
7mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28/// Data for a single file being analyzed.
29struct FileData {
30    path: PathBuf,
31    hashed_tokens: Vec<HashedToken>,
32    file_tokens: FileTokens,
33    atomic_invocation_spans: Vec<Span>,
34}
35
36/// Suffix Array + LCP based clone detection engine.
37///
38/// Concatenates all files' token sequences (separated by unique sentinels),
39/// builds a suffix array and LCP array, then extracts maximal clone groups
40/// from contiguous LCP intervals.
41pub struct CloneDetector {
42    /// Minimum clone size in tokens.
43    min_tokens: usize,
44    /// Minimum clone size in lines.
45    min_lines: usize,
46    /// Only report cross-directory duplicates.
47    skip_local: bool,
48}
49
50impl CloneDetector {
51    /// Create a new detector with the given thresholds.
52    #[must_use]
53    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
54        Self {
55            min_tokens,
56            min_lines,
57            skip_local,
58        }
59    }
60
61    /// Run clone detection across all files.
62    ///
63    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
64    /// one per analyzed file.
65    pub fn detect(
66        &self,
67        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
68    ) -> DuplicationReport {
69        self.detect_inner(file_data, None)
70    }
71
72    /// Run clone detection while only materializing groups that touch one of the
73    /// given files.
74    ///
75    /// All files still participate in matching, so focused files can be reported
76    /// as duplicated against unchanged files. Groups that only involve
77    /// non-focused files are dropped before expensive result building.
78    pub fn detect_touching_files(
79        &self,
80        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
81        focus_files: &FxHashSet<PathBuf>,
82    ) -> DuplicationReport {
83        self.detect_inner(file_data, Some(focus_files))
84    }
85
86    fn detect_inner(
87        &self,
88        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
89        focus_files: Option<&FxHashSet<PathBuf>>,
90    ) -> DuplicationReport {
91        let _span = tracing::info_span!("clone_detect").entered();
92
93        if file_data.is_empty() || self.min_tokens == 0 {
94            return empty_report(0);
95        }
96
97        let files: Vec<FileData> = file_data
98            .into_iter()
99            .map(|(path, hashed_tokens, file_tokens)| FileData {
100                atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
101                path,
102                hashed_tokens,
103                file_tokens,
104            })
105            .collect();
106
107        // Compute total stats.
108        let total_files = files.len();
109        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
110        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
111        // See shingle_filter::filter_to_focus_candidates for the rationale:
112        // normalise both sides through `dunce::simplified` so the Windows
113        // verbatim-vs-non-verbatim prefix mismatch does not silently mark
114        // every file as non-focus.
115        let focus_file_ids = focus_files.map(|focus| {
116            let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus
117                .iter()
118                .map(|p| dunce::simplified(p).to_path_buf())
119                .collect();
120            files
121                .iter()
122                .map(|file| normalized.contains(dunce::simplified(&file.path)))
123                .collect::<Vec<_>>()
124        });
125
126        tracing::debug!(
127            total_files,
128            total_tokens,
129            total_lines,
130            focused_files = focus_file_ids
131                .as_ref()
132                .map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
133            "clone detection input"
134        );
135
136        // Step 1: Rank reduction — map u64 hashes to consecutive u32 ranks.
137        let t0 = std::time::Instant::now();
138        let ranked_files = ranking::rank_reduce(&files);
139        let rank_time = t0.elapsed();
140        let unique_ranks: usize = ranked_files
141            .iter()
142            .flat_map(|f| f.iter())
143            .copied()
144            .max()
145            .map_or(0, |m| m as usize + 1);
146        tracing::debug!(
147            elapsed_us = rank_time.as_micros(),
148            unique_ranks,
149            "step1_rank_reduce"
150        );
151
152        // Step 2: Concatenate with sentinels.
153        let t0 = std::time::Instant::now();
154        let (text, file_of, file_offsets) =
155            concatenation::concatenate_with_sentinels(&ranked_files);
156        let concat_time = t0.elapsed();
157        tracing::debug!(
158            elapsed_us = concat_time.as_micros(),
159            concat_len = text.len(),
160            "step2_concatenate"
161        );
162
163        if text.is_empty() {
164            return empty_report(total_files);
165        }
166
167        // Step 3: Build suffix array.
168        let t0 = std::time::Instant::now();
169        let sa = suffix_array::build_suffix_array(&text);
170        let sa_time = t0.elapsed();
171        tracing::debug!(
172            elapsed_us = sa_time.as_micros(),
173            n = text.len(),
174            "step3_suffix_array"
175        );
176
177        // Step 4: Build LCP array (Kasai's algorithm, sentinel-aware).
178        let t0 = std::time::Instant::now();
179        let lcp_arr = lcp::build_lcp(&text, &sa);
180        let lcp_time = t0.elapsed();
181        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
182
183        // Step 5: Extract clone groups from LCP intervals.
184        let t0 = std::time::Instant::now();
185        let raw_groups = extraction::extract_clone_groups(
186            &sa,
187            &lcp_arr,
188            &file_of,
189            &file_offsets,
190            self.min_tokens,
191            &files,
192            focus_file_ids.as_deref(),
193        );
194        let extract_time = t0.elapsed();
195        tracing::debug!(
196            elapsed_us = extract_time.as_micros(),
197            raw_groups = raw_groups.len(),
198            "step5_extract_groups"
199        );
200
201        // Step 6: Build CloneGroup structs with line info, apply filters.
202        let t0 = std::time::Instant::now();
203        let clone_groups =
204            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
205        let build_time = t0.elapsed();
206        tracing::debug!(
207            elapsed_us = build_time.as_micros(),
208            final_groups = clone_groups.len(),
209            "step6_build_groups"
210        );
211
212        // Step 7: Compute stats.
213        let t0 = std::time::Instant::now();
214        let stats =
215            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
216        let stats_time = t0.elapsed();
217        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
218
219        tracing::info!(
220            total_us = (rank_time
221                + concat_time
222                + sa_time
223                + lcp_time
224                + extract_time
225                + build_time
226                + stats_time)
227                .as_micros(),
228            rank_us = rank_time.as_micros(),
229            sa_us = sa_time.as_micros(),
230            lcp_us = lcp_time.as_micros(),
231            extract_us = extract_time.as_micros(),
232            build_us = build_time.as_micros(),
233            stats_us = stats_time.as_micros(),
234            total_tokens,
235            clone_groups = clone_groups.len(),
236            "clone detection complete"
237        );
238
239        DuplicationReport {
240            clone_groups,
241            clone_families: vec![], // Populated by the caller after suppression filtering
242            mirrored_directories: vec![],
243            stats,
244        }
245    }
246}
247
248/// Create an empty report when there are no files to analyze.
249const fn empty_report(total_files: usize) -> DuplicationReport {
250    DuplicationReport {
251        clone_groups: Vec::new(),
252        clone_families: Vec::new(),
253        mirrored_directories: Vec::new(),
254        stats: DuplicationStats {
255            total_files,
256            files_with_clones: 0,
257            total_lines: 0,
258            duplicated_lines: 0,
259            total_tokens: 0,
260            duplicated_tokens: 0,
261            clone_groups: 0,
262            clone_instances: 0,
263            duplication_percentage: 0.0,
264            clone_groups_below_min_occurrences: 0,
265        },
266    }
267}