Skip to main content

fallow_core/duplicates/detect/
mod.rs

1//! Suffix Array + LCP based clone detection engine.
2//!
3//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
4//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
5//! comparisons and naturally finds all maximal clones in a single linear pass.
6
7mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28/// Data for a single file being analyzed.
29struct FileData {
30    path: PathBuf,
31    hashed_tokens: Vec<HashedToken>,
32    file_tokens: FileTokens,
33    atomic_invocation_spans: Vec<Span>,
34}
35
36/// Suffix Array + LCP based clone detection engine.
37///
38/// Concatenates all files' token sequences (separated by unique sentinels),
39/// builds a suffix array and LCP array, then extracts maximal clone groups
40/// from contiguous LCP intervals.
41pub struct CloneDetector {
42    /// Minimum clone size in tokens.
43    min_tokens: usize,
44    /// Minimum clone size in lines.
45    min_lines: usize,
46    /// Only report cross-directory duplicates.
47    skip_local: bool,
48}
49
50impl CloneDetector {
51    /// Create a new detector with the given thresholds.
52    #[must_use]
53    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
54        Self {
55            min_tokens,
56            min_lines,
57            skip_local,
58        }
59    }
60
61    /// Run clone detection across all files.
62    ///
63    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
64    /// one per analyzed file.
65    pub fn detect(
66        &self,
67        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
68    ) -> DuplicationReport {
69        self.detect_inner(file_data, None)
70    }
71
72    /// Run clone detection while only materializing groups that touch one of the
73    /// given files.
74    ///
75    /// All files still participate in matching, so focused files can be reported
76    /// as duplicated against unchanged files. Groups that only involve
77    /// non-focused files are dropped before expensive result building.
78    pub fn detect_touching_files(
79        &self,
80        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
81        focus_files: &FxHashSet<PathBuf>,
82    ) -> DuplicationReport {
83        self.detect_inner(file_data, Some(focus_files))
84    }
85
86    fn detect_inner(
87        &self,
88        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
89        focus_files: Option<&FxHashSet<PathBuf>>,
90    ) -> DuplicationReport {
91        let _span = tracing::info_span!("clone_detect").entered();
92
93        if file_data.is_empty() || self.min_tokens == 0 {
94            return empty_report(0);
95        }
96
97        let files: Vec<FileData> = file_data
98            .into_iter()
99            .map(|(path, hashed_tokens, file_tokens)| FileData {
100                atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
101                path,
102                hashed_tokens,
103                file_tokens,
104            })
105            .collect();
106
107        // Compute total stats.
108        let total_files = files.len();
109        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
110        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
111        let focus_file_ids = focus_files.map(|focus| {
112            files
113                .iter()
114                .map(|file| focus.contains(&file.path))
115                .collect::<Vec<_>>()
116        });
117
118        tracing::debug!(
119            total_files,
120            total_tokens,
121            total_lines,
122            focused_files = focus_file_ids
123                .as_ref()
124                .map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
125            "clone detection input"
126        );
127
128        // Step 1: Rank reduction — map u64 hashes to consecutive u32 ranks.
129        let t0 = std::time::Instant::now();
130        let ranked_files = ranking::rank_reduce(&files);
131        let rank_time = t0.elapsed();
132        let unique_ranks: usize = ranked_files
133            .iter()
134            .flat_map(|f| f.iter())
135            .copied()
136            .max()
137            .map_or(0, |m| m as usize + 1);
138        tracing::debug!(
139            elapsed_us = rank_time.as_micros(),
140            unique_ranks,
141            "step1_rank_reduce"
142        );
143
144        // Step 2: Concatenate with sentinels.
145        let t0 = std::time::Instant::now();
146        let (text, file_of, file_offsets) =
147            concatenation::concatenate_with_sentinels(&ranked_files);
148        let concat_time = t0.elapsed();
149        tracing::debug!(
150            elapsed_us = concat_time.as_micros(),
151            concat_len = text.len(),
152            "step2_concatenate"
153        );
154
155        if text.is_empty() {
156            return empty_report(total_files);
157        }
158
159        // Step 3: Build suffix array.
160        let t0 = std::time::Instant::now();
161        let sa = suffix_array::build_suffix_array(&text);
162        let sa_time = t0.elapsed();
163        tracing::debug!(
164            elapsed_us = sa_time.as_micros(),
165            n = text.len(),
166            "step3_suffix_array"
167        );
168
169        // Step 4: Build LCP array (Kasai's algorithm, sentinel-aware).
170        let t0 = std::time::Instant::now();
171        let lcp_arr = lcp::build_lcp(&text, &sa);
172        let lcp_time = t0.elapsed();
173        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
174
175        // Step 5: Extract clone groups from LCP intervals.
176        let t0 = std::time::Instant::now();
177        let raw_groups = extraction::extract_clone_groups(
178            &sa,
179            &lcp_arr,
180            &file_of,
181            &file_offsets,
182            self.min_tokens,
183            &files,
184            focus_file_ids.as_deref(),
185        );
186        let extract_time = t0.elapsed();
187        tracing::debug!(
188            elapsed_us = extract_time.as_micros(),
189            raw_groups = raw_groups.len(),
190            "step5_extract_groups"
191        );
192
193        // Step 6: Build CloneGroup structs with line info, apply filters.
194        let t0 = std::time::Instant::now();
195        let clone_groups =
196            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
197        let build_time = t0.elapsed();
198        tracing::debug!(
199            elapsed_us = build_time.as_micros(),
200            final_groups = clone_groups.len(),
201            "step6_build_groups"
202        );
203
204        // Step 7: Compute stats.
205        let t0 = std::time::Instant::now();
206        let stats =
207            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
208        let stats_time = t0.elapsed();
209        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
210
211        tracing::info!(
212            total_us = (rank_time
213                + concat_time
214                + sa_time
215                + lcp_time
216                + extract_time
217                + build_time
218                + stats_time)
219                .as_micros(),
220            rank_us = rank_time.as_micros(),
221            sa_us = sa_time.as_micros(),
222            lcp_us = lcp_time.as_micros(),
223            extract_us = extract_time.as_micros(),
224            build_us = build_time.as_micros(),
225            stats_us = stats_time.as_micros(),
226            total_tokens,
227            clone_groups = clone_groups.len(),
228            "clone detection complete"
229        );
230
231        DuplicationReport {
232            clone_groups,
233            clone_families: vec![], // Populated by the caller after suppression filtering
234            mirrored_directories: vec![],
235            stats,
236        }
237    }
238}
239
240/// Create an empty report when there are no files to analyze.
241const fn empty_report(total_files: usize) -> DuplicationReport {
242    DuplicationReport {
243        clone_groups: Vec::new(),
244        clone_families: Vec::new(),
245        mirrored_directories: Vec::new(),
246        stats: DuplicationStats {
247            total_files,
248            files_with_clones: 0,
249            total_lines: 0,
250            duplicated_lines: 0,
251            total_tokens: 0,
252            duplicated_tokens: 0,
253            clone_groups: 0,
254            clone_instances: 0,
255            duplication_percentage: 0.0,
256            clone_groups_below_min_occurrences: 0,
257        },
258    }
259}