Skip to main content

fallow_core/duplicates/detect/
mod.rs

1//! Suffix Array + LCP based clone detection engine.
2//!
3//! Uses a linear-time SA-IS (suffix array by induced sorting) construction
4//! followed by an O(N) LCP scan. This avoids quadratic pairwise comparisons
5//! and naturally finds all maximal clones in a single linear pass.
6
7mod boundary;
8mod concatenation;
9mod extraction;
10mod filtering;
11mod lcp;
12mod ranking;
13mod rolling;
14mod statistics;
15mod suffix_array;
16mod utils;
17
18#[cfg(test)]
19mod tests;
20
21use std::path::PathBuf;
22
23use oxc_span::Span;
24use rustc_hash::FxHashSet;
25
26use super::normalize::HashedToken;
27use super::tokenize::{FileTokens, TokenKind};
28use super::types::{DuplicationReport, DuplicationStats};
29
30const ROLLING_DETECTOR_ENV: &str = "FALLOW_DUPES_ROLLING";
31const ROLLING_BOUNDARY_FALLBACK_FILE_RATIO: f64 = 0.20;
32
33/// Data for a single file being analyzed.
34struct FileData {
35    path: PathBuf,
36    hashed_tokens: Vec<HashedToken>,
37    file_tokens: FileTokens,
38    atomic_invocation_spans: Vec<Span>,
39}
40
41#[derive(Clone, Copy)]
42pub(super) struct CorpusTotals {
43    pub(super) files: usize,
44    pub(super) lines: usize,
45    pub(super) tokens: usize,
46}
47
48/// Suffix Array + LCP based clone detection engine.
49///
50/// Concatenates all files' token sequences (separated by unique sentinels),
51/// builds a suffix array and LCP array, then extracts maximal clone groups
52/// from contiguous LCP intervals.
53pub struct CloneDetector {
54    /// Minimum clone size in tokens.
55    min_tokens: usize,
56    /// Minimum clone size in lines.
57    min_lines: usize,
58    /// Only report cross-directory duplicates.
59    skip_local: bool,
60}
61
62impl CloneDetector {
63    /// Create a new detector with the given thresholds.
64    #[must_use]
65    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
66        Self {
67            min_tokens,
68            min_lines,
69            skip_local,
70        }
71    }
72
73    /// Run clone detection across all files.
74    ///
75    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
76    /// one per analyzed file.
77    pub fn detect(
78        &self,
79        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
80    ) -> DuplicationReport {
81        self.detect_inner(file_data, None, None)
82    }
83
84    pub(super) fn detect_with_totals(
85        &self,
86        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
87        totals: CorpusTotals,
88    ) -> DuplicationReport {
89        self.detect_inner(file_data, None, Some(totals))
90    }
91
92    /// Run clone detection while only materializing groups that touch one of the
93    /// given files.
94    ///
95    /// All files still participate in matching, so focused files can be reported
96    /// as duplicated against unchanged files. Groups that only involve
97    /// non-focused files are dropped before expensive result building.
98    pub fn detect_touching_files(
99        &self,
100        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
101        focus_files: &FxHashSet<PathBuf>,
102    ) -> DuplicationReport {
103        self.detect_inner(file_data, Some(focus_files), None)
104    }
105
106    fn detect_inner(
107        &self,
108        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
109        focus_files: Option<&FxHashSet<PathBuf>>,
110        corpus_totals: Option<CorpusTotals>,
111    ) -> DuplicationReport {
112        let _span = tracing::info_span!("clone_detect").entered();
113
114        if file_data.is_empty() || self.min_tokens == 0 {
115            return empty_report(corpus_totals.unwrap_or(CorpusTotals {
116                files: 0,
117                lines: 0,
118                tokens: 0,
119            }));
120        }
121
122        let (files, totals, focus_file_ids) =
123            build_detection_inputs(file_data, focus_files, corpus_totals);
124        self.run_clone_pipeline(&files, totals, focus_file_ids.as_deref())
125    }
126
127    /// Run the timed clone-detection pipeline (rank, concatenate, suffix array,
128    /// LCP, extract, build, stats) over prepared inputs and assemble the report.
129    fn run_clone_pipeline(
130        &self,
131        files: &[FileData],
132        totals: CorpusTotals,
133        focus_file_ids: Option<&[bool]>,
134    ) -> DuplicationReport {
135        if std::env::var_os(ROLLING_DETECTOR_ENV).is_some() {
136            let boundary_summary = summarize_boundaries(files);
137            if !boundary_summary.is_component_heavy {
138                return rolling::detect(
139                    files,
140                    self.min_tokens,
141                    self.min_lines,
142                    self.skip_local,
143                    totals,
144                    boundary_summary.has_any_boundary,
145                );
146            }
147        }
148
149        let (text, file_of, file_offsets, rank_time, concat_time) = rank_and_concatenate(files);
150
151        if text.is_empty() {
152            return empty_report(totals);
153        }
154
155        let (raw_groups, sa_time, lcp_time, extract_time) =
156            self.extract_raw_groups(&text, &file_of, &file_offsets, files, focus_file_ids);
157
158        let t0 = std::time::Instant::now();
159        let clone_groups =
160            filtering::build_groups(raw_groups, files, self.min_lines, self.skip_local);
161        let build_time = t0.elapsed();
162        tracing::debug!(
163            elapsed_us = build_time.as_micros(),
164            final_groups = clone_groups.len(),
165            "step6_build_groups"
166        );
167
168        let t0 = std::time::Instant::now();
169        let stats =
170            statistics::compute_stats(&clone_groups, totals.files, totals.lines, totals.tokens);
171        let stats_time = t0.elapsed();
172        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
173
174        trace_clone_detection_complete(
175            &CloneDetectionTimings {
176                rank: rank_time,
177                concat: concat_time,
178                suffix_array: sa_time,
179                lcp: lcp_time,
180                extract: extract_time,
181                build: build_time,
182                stats: stats_time,
183            },
184            totals.tokens,
185            clone_groups.len(),
186        );
187
188        DuplicationReport {
189            clone_groups,
190            clone_families: vec![], // Populated by the caller after suppression filtering
191            mirrored_directories: vec![],
192            stats,
193        }
194    }
195
196    /// Run the suffix-array clone-extraction core (steps 3-5: suffix array, LCP,
197    /// extract) over the concatenated text, returning the raw groups and each
198    /// step's elapsed time for the completion trace.
199    fn extract_raw_groups(
200        &self,
201        text: &[i64],
202        file_of: &[usize],
203        file_offsets: &[usize],
204        files: &[FileData],
205        focus_file_ids: Option<&[bool]>,
206    ) -> (
207        Vec<extraction::RawGroup>,
208        std::time::Duration,
209        std::time::Duration,
210        std::time::Duration,
211    ) {
212        let t0 = std::time::Instant::now();
213        let sa = suffix_array::build_suffix_array(text);
214        let sa_time = t0.elapsed();
215        tracing::debug!(
216            elapsed_us = sa_time.as_micros(),
217            n = text.len(),
218            "step3_suffix_array"
219        );
220
221        let t0 = std::time::Instant::now();
222        let lcp_arr = lcp::build_lcp(text, &sa);
223        let lcp_time = t0.elapsed();
224        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
225
226        let t0 = std::time::Instant::now();
227        let raw_groups = extraction::extract_clone_groups(&extraction::CloneGroupExtractionInput {
228            sa: &sa,
229            lcp: &lcp_arr,
230            file_of,
231            file_offsets,
232            min_tokens: self.min_tokens,
233            files,
234            focus_file_ids,
235            may_have_boundaries: files_may_have_boundaries(files),
236        });
237        let extract_time = t0.elapsed();
238        tracing::debug!(
239            elapsed_us = extract_time.as_micros(),
240            raw_groups = raw_groups.len(),
241            "step5_extract_groups"
242        );
243
244        (raw_groups, sa_time, lcp_time, extract_time)
245    }
246}
247
248/// Build the prepared clone-detection inputs (`FileData` vec, corpus totals,
249/// optional focus-file id mask) from raw file data and emit the input trace.
250fn build_detection_inputs(
251    file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
252    focus_files: Option<&FxHashSet<PathBuf>>,
253    corpus_totals: Option<CorpusTotals>,
254) -> (Vec<FileData>, CorpusTotals, Option<Vec<bool>>) {
255    let files: Vec<FileData> = file_data
256        .into_iter()
257        .map(|(path, hashed_tokens, file_tokens)| FileData {
258            atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
259            path,
260            hashed_tokens,
261            file_tokens,
262        })
263        .collect();
264
265    let totals = corpus_totals.unwrap_or_else(|| CorpusTotals {
266        files: files.len(),
267        lines: files.iter().map(|f| f.file_tokens.line_count).sum(),
268        tokens: files.iter().map(|f| f.hashed_tokens.len()).sum(),
269    });
270    let focus_file_ids = focus_files.map(|focus| build_focus_file_ids(&files, focus));
271    trace_clone_detection_input(
272        totals.files,
273        totals.tokens,
274        totals.lines,
275        focus_file_ids.as_deref(),
276    );
277    (files, totals, focus_file_ids)
278}
279
280/// Run the clone-detection front half (steps 1-2: rank-reduce then concatenate
281/// with sentinels), returning the concatenated text, file index maps, and each
282/// step's elapsed time for the completion trace.
283fn rank_and_concatenate(
284    files: &[FileData],
285) -> (
286    Vec<i64>,
287    Vec<usize>,
288    Vec<usize>,
289    std::time::Duration,
290    std::time::Duration,
291) {
292    let t0 = std::time::Instant::now();
293    let ranked_files = ranking::rank_reduce(files);
294    let rank_time = t0.elapsed();
295    let unique_ranks: usize = ranked_files
296        .iter()
297        .flat_map(|f| f.iter())
298        .copied()
299        .max()
300        .map_or(0, |m| m as usize + 1);
301    tracing::debug!(
302        elapsed_us = rank_time.as_micros(),
303        unique_ranks,
304        "step1_rank_reduce"
305    );
306
307    let t0 = std::time::Instant::now();
308    let (text, file_of, file_offsets) = concatenation::concatenate_with_sentinels(&ranked_files);
309    let concat_time = t0.elapsed();
310    tracing::debug!(
311        elapsed_us = concat_time.as_micros(),
312        concat_len = text.len(),
313        "step2_concatenate"
314    );
315
316    (text, file_of, file_offsets, rank_time, concat_time)
317}
318
319struct CloneDetectionTimings {
320    rank: std::time::Duration,
321    concat: std::time::Duration,
322    suffix_array: std::time::Duration,
323    lcp: std::time::Duration,
324    extract: std::time::Duration,
325    build: std::time::Duration,
326    stats: std::time::Duration,
327}
328
329fn build_focus_file_ids(files: &[FileData], focus_files: &FxHashSet<PathBuf>) -> Vec<bool> {
330    let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus_files
331        .iter()
332        .map(|p| dunce::simplified(p).to_path_buf())
333        .collect();
334    files
335        .iter()
336        .map(|file| normalized.contains(dunce::simplified(&file.path)))
337        .collect()
338}
339
340#[derive(Clone, Copy)]
341struct BoundarySummary {
342    is_component_heavy: bool,
343    has_any_boundary: bool,
344}
345
346fn summarize_boundaries(files: &[FileData]) -> BoundarySummary {
347    let mut files_with_tokens = 0_usize;
348    let mut files_with_boundaries = 0_usize;
349
350    for file in files {
351        if file.hashed_tokens.is_empty() {
352            continue;
353        }
354
355        files_with_tokens += 1;
356        if file_has_boundary(file) {
357            files_with_boundaries += 1;
358        }
359    }
360
361    BoundarySummary {
362        is_component_heavy: files_with_tokens > 0
363            && (files_with_boundaries as f64 / files_with_tokens as f64)
364                >= ROLLING_BOUNDARY_FALLBACK_FILE_RATIO,
365        has_any_boundary: files_with_boundaries > 0,
366    }
367}
368
369fn file_has_boundary(file: &FileData) -> bool {
370    file.hashed_tokens.iter().any(|token| {
371        file.file_tokens
372            .tokens
373            .get(token.original_index)
374            .is_some_and(|source| matches!(source.kind, TokenKind::Boundary(_)))
375    })
376}
377
378fn files_may_have_boundaries(files: &[FileData]) -> bool {
379    files.iter().any(file_may_have_boundaries)
380}
381
382fn file_may_have_boundaries(file: &FileData) -> bool {
383    matches!(
384        file.path
385            .extension()
386            .and_then(|extension| extension.to_str()),
387        Some("astro" | "css" | "less" | "sass" | "scss" | "svelte" | "vue")
388    )
389}
390
391fn trace_clone_detection_input(
392    total_files: usize,
393    total_tokens: usize,
394    total_lines: usize,
395    focus_file_ids: Option<&[bool]>,
396) {
397    tracing::debug!(
398        total_files,
399        total_tokens,
400        total_lines,
401        focused_files =
402            focus_file_ids.map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
403        "clone detection input"
404    );
405}
406
407fn trace_clone_detection_complete(
408    timings: &CloneDetectionTimings,
409    total_tokens: usize,
410    clone_groups: usize,
411) {
412    tracing::info!(
413        total_us = (timings.rank
414            + timings.concat
415            + timings.suffix_array
416            + timings.lcp
417            + timings.extract
418            + timings.build
419            + timings.stats)
420            .as_micros(),
421        rank_us = timings.rank.as_micros(),
422        sa_us = timings.suffix_array.as_micros(),
423        lcp_us = timings.lcp.as_micros(),
424        extract_us = timings.extract.as_micros(),
425        build_us = timings.build.as_micros(),
426        stats_us = timings.stats.as_micros(),
427        total_tokens,
428        clone_groups,
429        "clone detection complete"
430    );
431}
432
433/// Create an empty report when there are no files to analyze.
434const fn empty_report(totals: CorpusTotals) -> DuplicationReport {
435    DuplicationReport {
436        clone_groups: Vec::new(),
437        clone_families: Vec::new(),
438        mirrored_directories: Vec::new(),
439        stats: DuplicationStats {
440            total_files: totals.files,
441            files_with_clones: 0,
442            total_lines: totals.lines,
443            duplicated_lines: 0,
444            total_tokens: totals.tokens,
445            duplicated_tokens: 0,
446            clone_groups: 0,
447            clone_instances: 0,
448            duplication_percentage: 0.0,
449            clone_groups_below_min_occurrences: 0,
450        },
451    }
452}