// fallow_core/duplicates/detect/mod.rs
//! Suffix Array + LCP based clone detection engine.
//!
//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
//! comparisons and naturally finds all maximal clones in a single linear pass.

mod concatenation;
mod extraction;
mod filtering;
mod lcp;
mod ranking;
mod statistics;
mod suffix_array;
mod utils;

#[cfg(test)]
mod tests;

use std::path::PathBuf;

use super::normalize::HashedToken;
use super::tokenize::FileTokens;
use super::types::{DuplicationReport, DuplicationStats};
/// Data for a single file being analyzed.
struct FileData {
    /// Path of the analyzed file, used to identify it in the report.
    path: PathBuf,
    /// Normalized token hashes for this file; these feed rank reduction and
    /// the concatenated text the suffix array is built over.
    hashed_tokens: Vec<HashedToken>,
    /// Original tokenization metadata (e.g. `line_count`) used to map clone
    /// spans back to source lines and to compute totals.
    file_tokens: FileTokens,
}

/// Suffix Array + LCP based clone detection engine.
///
/// Concatenates all files' token sequences (separated by unique sentinels),
/// builds a suffix array and LCP array, then extracts maximal clone groups
/// from contiguous LCP intervals.
pub struct CloneDetector {
    /// Minimum clone size in tokens. A value of 0 disables detection entirely
    /// (see `detect`'s early return).
    min_tokens: usize,
    /// Minimum clone size in lines.
    min_lines: usize,
    /// Only report cross-directory duplicates.
    skip_local: bool,
}

46impl CloneDetector {
47    /// Create a new detector with the given thresholds.
48    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
49        Self {
50            min_tokens,
51            min_lines,
52            skip_local,
53        }
54    }
55
56    /// Run clone detection across all files.
57    ///
58    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
59    /// one per analyzed file.
60    pub fn detect(
61        &self,
62        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
63    ) -> DuplicationReport {
64        let _span = tracing::info_span!("clone_detect").entered();
65
66        if file_data.is_empty() || self.min_tokens == 0 {
67            return empty_report(0);
68        }
69
70        let files: Vec<FileData> = file_data
71            .into_iter()
72            .map(|(path, hashed_tokens, file_tokens)| FileData {
73                path,
74                hashed_tokens,
75                file_tokens,
76            })
77            .collect();
78
79        // Compute total stats.
80        let total_files = files.len();
81        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
82        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
83
84        tracing::debug!(
85            total_files,
86            total_tokens,
87            total_lines,
88            "clone detection input"
89        );
90
91        // Step 1: Rank reduction — map u64 hashes to consecutive u32 ranks.
92        let t0 = std::time::Instant::now();
93        let ranked_files = ranking::rank_reduce(&files);
94        let rank_time = t0.elapsed();
95        let unique_ranks: usize = ranked_files
96            .iter()
97            .flat_map(|f| f.iter())
98            .copied()
99            .max()
100            .map_or(0, |m| m as usize + 1);
101        tracing::debug!(
102            elapsed_us = rank_time.as_micros(),
103            unique_ranks,
104            "step1_rank_reduce"
105        );
106
107        // Step 2: Concatenate with sentinels.
108        let t0 = std::time::Instant::now();
109        let (text, file_of, file_offsets) =
110            concatenation::concatenate_with_sentinels(&ranked_files);
111        let concat_time = t0.elapsed();
112        tracing::debug!(
113            elapsed_us = concat_time.as_micros(),
114            concat_len = text.len(),
115            "step2_concatenate"
116        );
117
118        if text.is_empty() {
119            return empty_report(total_files);
120        }
121
122        // Step 3: Build suffix array.
123        let t0 = std::time::Instant::now();
124        let sa = suffix_array::build_suffix_array(&text);
125        let sa_time = t0.elapsed();
126        tracing::debug!(
127            elapsed_us = sa_time.as_micros(),
128            n = text.len(),
129            "step3_suffix_array"
130        );
131
132        // Step 4: Build LCP array (Kasai's algorithm, sentinel-aware).
133        let t0 = std::time::Instant::now();
134        let lcp_arr = lcp::build_lcp(&text, &sa);
135        let lcp_time = t0.elapsed();
136        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
137
138        // Step 5: Extract clone groups from LCP intervals.
139        let t0 = std::time::Instant::now();
140        let raw_groups = extraction::extract_clone_groups(
141            &sa,
142            &lcp_arr,
143            &file_of,
144            &file_offsets,
145            self.min_tokens,
146            &files,
147        );
148        let extract_time = t0.elapsed();
149        tracing::debug!(
150            elapsed_us = extract_time.as_micros(),
151            raw_groups = raw_groups.len(),
152            "step5_extract_groups"
153        );
154
155        // Step 6: Build CloneGroup structs with line info, apply filters.
156        let t0 = std::time::Instant::now();
157        let clone_groups =
158            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
159        let build_time = t0.elapsed();
160        tracing::debug!(
161            elapsed_us = build_time.as_micros(),
162            final_groups = clone_groups.len(),
163            "step6_build_groups"
164        );
165
166        // Step 7: Compute stats.
167        let t0 = std::time::Instant::now();
168        let stats =
169            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
170        let stats_time = t0.elapsed();
171        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
172
173        tracing::info!(
174            total_us = (rank_time
175                + concat_time
176                + sa_time
177                + lcp_time
178                + extract_time
179                + build_time
180                + stats_time)
181                .as_micros(),
182            rank_us = rank_time.as_micros(),
183            sa_us = sa_time.as_micros(),
184            lcp_us = lcp_time.as_micros(),
185            extract_us = extract_time.as_micros(),
186            build_us = build_time.as_micros(),
187            stats_us = stats_time.as_micros(),
188            total_tokens,
189            clone_groups = clone_groups.len(),
190            "clone detection complete"
191        );
192
193        DuplicationReport {
194            clone_groups,
195            clone_families: vec![], // Populated by the caller after suppression filtering
196            stats,
197        }
198    }
199}
200
201/// Create an empty report when there are no files to analyze.
202const fn empty_report(total_files: usize) -> DuplicationReport {
203    DuplicationReport {
204        clone_groups: Vec::new(),
205        clone_families: Vec::new(),
206        stats: DuplicationStats {
207            total_files,
208            files_with_clones: 0,
209            total_lines: 0,
210            duplicated_lines: 0,
211            total_tokens: 0,
212            duplicated_tokens: 0,
213            clone_groups: 0,
214            clone_instances: 0,
215            duplication_percentage: 0.0,
216        },
217    }
218}