Skip to main content

fallow_core/duplicates/detect/
mod.rs

1//! Suffix Array + LCP based clone detection engine.
2//!
3//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
4//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
5//! comparisons and naturally finds all maximal clones in a single linear pass.
6
7mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use super::normalize::HashedToken;
22use super::tokenize::FileTokens;
23use super::types::{DuplicationReport, DuplicationStats};
24
/// Data for a single file being analyzed.
struct FileData {
    /// Path of the file on disk (reported in clone groups).
    path: PathBuf,
    /// Normalized token hashes for this file; these are what the clone
    /// matching (rank reduction / suffix array) operates on.
    hashed_tokens: Vec<HashedToken>,
    /// Original tokenization metadata (e.g. `line_count`), used to map
    /// token-level matches back to source lines.
    file_tokens: FileTokens,
}
31
/// Suffix Array + LCP based clone detection engine.
///
/// Concatenates all files' token sequences (separated by unique sentinels),
/// builds a suffix array and LCP array, then extracts maximal clone groups
/// from contiguous LCP intervals.
pub struct CloneDetector {
    /// Minimum clone size in tokens; shorter matches are not extracted.
    /// A value of 0 disables detection (see `detect`).
    min_tokens: usize,
    /// Minimum clone size in lines; enforced when building groups.
    min_lines: usize,
    /// Only report cross-directory duplicates (filter out same-directory ones).
    skip_local: bool,
}
45
46impl CloneDetector {
47    /// Create a new detector with the given thresholds.
48    #[must_use]
49    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
50        Self {
51            min_tokens,
52            min_lines,
53            skip_local,
54        }
55    }
56
57    /// Run clone detection across all files.
58    ///
59    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
60    /// one per analyzed file.
61    pub fn detect(
62        &self,
63        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
64    ) -> DuplicationReport {
65        let _span = tracing::info_span!("clone_detect").entered();
66
67        if file_data.is_empty() || self.min_tokens == 0 {
68            return empty_report(0);
69        }
70
71        let files: Vec<FileData> = file_data
72            .into_iter()
73            .map(|(path, hashed_tokens, file_tokens)| FileData {
74                path,
75                hashed_tokens,
76                file_tokens,
77            })
78            .collect();
79
80        // Compute total stats.
81        let total_files = files.len();
82        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
83        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
84
85        tracing::debug!(
86            total_files,
87            total_tokens,
88            total_lines,
89            "clone detection input"
90        );
91
92        // Step 1: Rank reduction — map u64 hashes to consecutive u32 ranks.
93        let t0 = std::time::Instant::now();
94        let ranked_files = ranking::rank_reduce(&files);
95        let rank_time = t0.elapsed();
96        let unique_ranks: usize = ranked_files
97            .iter()
98            .flat_map(|f| f.iter())
99            .copied()
100            .max()
101            .map_or(0, |m| m as usize + 1);
102        tracing::debug!(
103            elapsed_us = rank_time.as_micros(),
104            unique_ranks,
105            "step1_rank_reduce"
106        );
107
108        // Step 2: Concatenate with sentinels.
109        let t0 = std::time::Instant::now();
110        let (text, file_of, file_offsets) =
111            concatenation::concatenate_with_sentinels(&ranked_files);
112        let concat_time = t0.elapsed();
113        tracing::debug!(
114            elapsed_us = concat_time.as_micros(),
115            concat_len = text.len(),
116            "step2_concatenate"
117        );
118
119        if text.is_empty() {
120            return empty_report(total_files);
121        }
122
123        // Step 3: Build suffix array.
124        let t0 = std::time::Instant::now();
125        let sa = suffix_array::build_suffix_array(&text);
126        let sa_time = t0.elapsed();
127        tracing::debug!(
128            elapsed_us = sa_time.as_micros(),
129            n = text.len(),
130            "step3_suffix_array"
131        );
132
133        // Step 4: Build LCP array (Kasai's algorithm, sentinel-aware).
134        let t0 = std::time::Instant::now();
135        let lcp_arr = lcp::build_lcp(&text, &sa);
136        let lcp_time = t0.elapsed();
137        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
138
139        // Step 5: Extract clone groups from LCP intervals.
140        let t0 = std::time::Instant::now();
141        let raw_groups = extraction::extract_clone_groups(
142            &sa,
143            &lcp_arr,
144            &file_of,
145            &file_offsets,
146            self.min_tokens,
147            &files,
148        );
149        let extract_time = t0.elapsed();
150        tracing::debug!(
151            elapsed_us = extract_time.as_micros(),
152            raw_groups = raw_groups.len(),
153            "step5_extract_groups"
154        );
155
156        // Step 6: Build CloneGroup structs with line info, apply filters.
157        let t0 = std::time::Instant::now();
158        let clone_groups =
159            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
160        let build_time = t0.elapsed();
161        tracing::debug!(
162            elapsed_us = build_time.as_micros(),
163            final_groups = clone_groups.len(),
164            "step6_build_groups"
165        );
166
167        // Step 7: Compute stats.
168        let t0 = std::time::Instant::now();
169        let stats =
170            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
171        let stats_time = t0.elapsed();
172        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
173
174        tracing::info!(
175            total_us = (rank_time
176                + concat_time
177                + sa_time
178                + lcp_time
179                + extract_time
180                + build_time
181                + stats_time)
182                .as_micros(),
183            rank_us = rank_time.as_micros(),
184            sa_us = sa_time.as_micros(),
185            lcp_us = lcp_time.as_micros(),
186            extract_us = extract_time.as_micros(),
187            build_us = build_time.as_micros(),
188            stats_us = stats_time.as_micros(),
189            total_tokens,
190            clone_groups = clone_groups.len(),
191            "clone detection complete"
192        );
193
194        DuplicationReport {
195            clone_groups,
196            clone_families: vec![], // Populated by the caller after suppression filtering
197            mirrored_directories: vec![],
198            stats,
199        }
200    }
201}
202
203/// Create an empty report when there are no files to analyze.
204const fn empty_report(total_files: usize) -> DuplicationReport {
205    DuplicationReport {
206        clone_groups: Vec::new(),
207        clone_families: Vec::new(),
208        mirrored_directories: Vec::new(),
209        stats: DuplicationStats {
210            total_files,
211            files_with_clones: 0,
212            total_lines: 0,
213            duplicated_lines: 0,
214            total_tokens: 0,
215            duplicated_tokens: 0,
216            clone_groups: 0,
217            clone_instances: 0,
218            duplication_percentage: 0.0,
219        },
220    }
221}