
fallow_core/duplicates/detect/mod.rs

//! Suffix Array + LCP based clone detection engine.
//!
//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
//! comparisons and naturally finds all maximal clones in a single linear pass.
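//!
//! Illustrative sketch (toy two-symbol ranks, not real token hashes): for the
//! text `[a, b, a, b]` the sorted suffixes are `ab`, `abab`, `b`, `bab`, giving
//! suffix array `[2, 0, 3, 1]` and LCP array `[0, 2, 0, 1]`. The LCP value `2`
//! between the suffixes at positions 2 and 0 exposes the repeated two-token run
//! `ab` at offsets 0 and 2; every clone surfaces as such an LCP interval.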

mod concatenation;
mod extraction;
mod filtering;
mod lcp;
mod ranking;
mod statistics;
mod suffix_array;
mod utils;

#[cfg(test)]
mod tests;

use std::path::PathBuf;

use rustc_hash::FxHashSet;

use super::normalize::HashedToken;
use super::tokenize::FileTokens;
use super::types::{DuplicationReport, DuplicationStats};

/// Data for a single file being analyzed.
struct FileData {
    path: PathBuf,
    hashed_tokens: Vec<HashedToken>,
    file_tokens: FileTokens,
}

/// Suffix Array + LCP based clone detection engine.
///
/// Concatenates all files' token sequences (separated by unique sentinels),
/// builds a suffix array and LCP array, then extracts maximal clone groups
/// from contiguous LCP intervals.
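///
/// Sketch of the concatenation (hypothetical ranks): files `[a, b]` and
/// `[a, b, c]` become the single text `[a, b, S0, a, b, c, S1]`, where `S0` and
/// `S1` are sentinels unique to each file; since a sentinel matches nothing
/// else, no common prefix (and hence no clone) can span a file boundary.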
pub struct CloneDetector {
    /// Minimum clone size in tokens.
    min_tokens: usize,
    /// Minimum clone size in lines.
    min_lines: usize,
    /// Only report cross-directory duplicates.
    skip_local: bool,
}

impl CloneDetector {
    /// Create a new detector with the given thresholds.
    #[must_use]
    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
        Self {
            min_tokens,
            min_lines,
            skip_local,
        }
    }

    /// Run clone detection across all files.
    ///
    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
    /// one per analyzed file.
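    ///
    /// ```ignore
    /// // Hypothetical call site; the thresholds here are illustrative, not
    /// // project defaults.
    /// let detector = CloneDetector::new(50, 5, false);
    /// let report = detector.detect(file_data);
    /// println!("{} clone groups found", report.clone_groups.len());
    /// ```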
    pub fn detect(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
    ) -> DuplicationReport {
        self.detect_inner(file_data, None)
    }

    /// Run clone detection while only materializing groups that touch one of the
    /// given files.
    ///
    /// All files still participate in matching, so focused files can be reported
    /// as duplicated against unchanged files. Groups that only involve
    /// non-focused files are dropped before expensive result building.
    pub fn detect_touching_files(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
        focus_files: &FxHashSet<PathBuf>,
    ) -> DuplicationReport {
        self.detect_inner(file_data, Some(focus_files))
    }

    fn detect_inner(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
        focus_files: Option<&FxHashSet<PathBuf>>,
    ) -> DuplicationReport {
        let _span = tracing::info_span!("clone_detect").entered();

        if file_data.is_empty() || self.min_tokens == 0 {
            return empty_report(file_data.len());
        }

        let files: Vec<FileData> = file_data
            .into_iter()
            .map(|(path, hashed_tokens, file_tokens)| FileData {
                path,
                hashed_tokens,
                file_tokens,
            })
            .collect();

        // Compute total stats.
        let total_files = files.len();
        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
        let focus_file_ids = focus_files.map(|focus| {
            files
                .iter()
                .map(|file| focus.contains(&file.path))
                .collect::<Vec<_>>()
        });

        tracing::debug!(
            total_files,
            total_tokens,
            total_lines,
            focused_files = focus_file_ids
                .as_ref()
                .map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
            "clone detection input"
        );

        // Step 1: Rank reduction — map u64 hashes to consecutive u32 ranks.
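        // Sketch: hashes [h, g, h] (hypothetical values) collapse to ranks like
        // [1, 0, 1]; equal hashes get equal ranks, shrinking the alphabet to
        // `unique_ranks` symbols so the radix sort in step 3 buckets on dense
        // u32 keys instead of sparse u64 hashes.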
        let t0 = std::time::Instant::now();
        let ranked_files = ranking::rank_reduce(&files);
        let rank_time = t0.elapsed();
        let unique_ranks: usize = ranked_files
            .iter()
            .flat_map(|f| f.iter())
            .copied()
            .max()
            .map_or(0, |m| m as usize + 1);
        tracing::debug!(
            elapsed_us = rank_time.as_micros(),
            unique_ranks,
            "step1_rank_reduce"
        );

        // Step 2: Concatenate with sentinels.
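        // Sketch: ranked files [0, 1] and [0, 1, 2] become one text such as
        // [0, 1, S0, 0, 1, 2, S1], with per-file sentinels S0, S1 above every
        // real rank; `file_of` and `file_offsets` map text positions back to
        // the files (and in-file offsets) they came from.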
        let t0 = std::time::Instant::now();
        let (text, file_of, file_offsets) =
            concatenation::concatenate_with_sentinels(&ranked_files);
        let concat_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = concat_time.as_micros(),
            concat_len = text.len(),
            "step2_concatenate"
        );

        if text.is_empty() {
            return empty_report(total_files);
        }

        // Step 3: Build suffix array.
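        // Prefix doubling, sketched: sort suffixes by their first 1 token, then
        // 2, 4, ... tokens, radix-sorting (rank[i], rank[i + k]) pairs in each
        // round; log N doubling rounds over N suffixes give the O(N log N)
        // bound quoted in the module docs.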
        let t0 = std::time::Instant::now();
        let sa = suffix_array::build_suffix_array(&text);
        let sa_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = sa_time.as_micros(),
            n = text.len(),
            "step3_suffix_array"
        );

        // Step 4: Build LCP array (Kasai's algorithm, sentinel-aware).
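        // Kasai's key fact: if the suffix starting at position i shares an LCP
        // of h with its suffix-array neighbor, the suffix at i + 1 shares at
        // least h - 1 with its own neighbor, so total rescanning is O(N). And
        // because sentinels are unique, matches stop at them, never crossing files.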
        let t0 = std::time::Instant::now();
        let lcp_arr = lcp::build_lcp(&text, &sa);
        let lcp_time = t0.elapsed();
        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");

        // Step 5: Extract clone groups from LCP intervals.
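        // Sketch, reusing the toy example from the module docs: with
        // SA = [2, 0, 3, 1] and LCP = [0, 2, 0, 1] over [a, b, a, b], a run of
        // LCP values >= min_tokens marks adjacent suffixes sharing a prefix of
        // that length; LCP[1] = 2 groups the occurrences of `ab` at offsets
        // 0 and 2 into one raw clone group.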
        let t0 = std::time::Instant::now();
        let raw_groups = extraction::extract_clone_groups(
            &sa,
            &lcp_arr,
            &file_of,
            &file_offsets,
            self.min_tokens,
            &files,
            focus_file_ids.as_deref(),
        );
        let extract_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = extract_time.as_micros(),
            raw_groups = raw_groups.len(),
            "step5_extract_groups"
        );

        // Step 6: Build CloneGroup structs with line info, apply filters.
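        // Illustrative effect of the filters: a clone that clears min_tokens
        // but spans fewer than `min_lines` source lines is dropped, and with
        // `skip_local` set, groups whose instances all live under the same
        // directory are dropped as purely local duplication.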
        let t0 = std::time::Instant::now();
        let clone_groups =
            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
        let build_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = build_time.as_micros(),
            final_groups = clone_groups.len(),
            "step6_build_groups"
        );

        // Step 7: Compute stats.
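        // Aggregates per-group figures into corpus-wide totals; see the
        // `DuplicationStats` fields in `empty_report` below (duplicated lines
        // and tokens, instance counts, overall duplication percentage).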
        let t0 = std::time::Instant::now();
        let stats =
            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
        let stats_time = t0.elapsed();
        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");

        tracing::info!(
            total_us = (rank_time
                + concat_time
                + sa_time
                + lcp_time
                + extract_time
                + build_time
                + stats_time)
                .as_micros(),
            rank_us = rank_time.as_micros(),
            sa_us = sa_time.as_micros(),
            lcp_us = lcp_time.as_micros(),
            extract_us = extract_time.as_micros(),
            build_us = build_time.as_micros(),
            stats_us = stats_time.as_micros(),
            total_tokens,
            clone_groups = clone_groups.len(),
            "clone detection complete"
        );

        DuplicationReport {
            clone_groups,
            clone_families: vec![], // Populated by the caller after suppression filtering
            mirrored_directories: vec![],
            stats,
        }
    }
}

/// Create an empty report when there is nothing to analyze (no files, or a
/// zero `min_tokens` threshold).
const fn empty_report(total_files: usize) -> DuplicationReport {
    DuplicationReport {
        clone_groups: Vec::new(),
        clone_families: Vec::new(),
        mirrored_directories: Vec::new(),
        stats: DuplicationStats {
            total_files,
            files_with_clones: 0,
            total_lines: 0,
            duplicated_lines: 0,
            total_tokens: 0,
            duplicated_tokens: 0,
            clone_groups: 0,
            clone_instances: 0,
            duplication_percentage: 0.0,
        },
    }
}