// fallow_core/duplicates/detect.rs

//! Suffix Array + LCP based clone detection engine.
//!
//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
//! comparisons and naturally finds all maximal clones in a single linear pass.
6
7use rustc_hash::{FxHashMap, FxHashSet};
8use std::path::PathBuf;
9
10use super::normalize::HashedToken;
11use super::tokenize::FileTokens;
12use super::types::{CloneGroup, CloneInstance, DuplicationReport, DuplicationStats};
13
/// Data for a single file being analyzed.
struct FileData {
    /// Path of the analyzed file; cloned into every reported `CloneInstance`.
    path: PathBuf,
    /// Normalized token hashes used for matching (one entry per surviving token).
    hashed_tokens: Vec<HashedToken>,
    /// Raw tokenization output; provides the source text, token spans, and
    /// line count used when mapping token windows back to source locations.
    file_tokens: FileTokens,
}
20
/// Suffix Array + LCP based clone detection engine.
///
/// Concatenates all files' token sequences (separated by unique sentinels),
/// builds a suffix array and LCP array, then extracts maximal clone groups
/// from contiguous LCP intervals.
pub struct CloneDetector {
    /// Minimum clone size in tokens; shorter shared prefixes are ignored.
    min_tokens: usize,
    /// Minimum clone size in lines (measured as the widest instance's line span).
    min_lines: usize,
    /// Only report cross-directory duplicates: groups whose instances all
    /// share one parent directory are dropped.
    skip_local: bool,
}
34
impl CloneDetector {
    /// Create a new detector with the given thresholds.
    ///
    /// * `min_tokens` / `min_lines` — minimum clone size in tokens / lines.
    /// * `skip_local` — when true, only cross-directory clone groups are kept.
    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
        Self {
            min_tokens,
            min_lines,
            skip_local,
        }
    }

    /// Run clone detection across all files.
    ///
    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
    /// one per analyzed file.
    ///
    /// Returns a `DuplicationReport`; `clone_families` is left empty here and
    /// populated by the caller after suppression filtering.
    pub fn detect(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
    ) -> DuplicationReport {
        let _span = tracing::info_span!("clone_detect").entered();

        // Degenerate inputs: nothing to scan, or a zero token threshold
        // (which would make every position trivially "cloned").
        if file_data.is_empty() || self.min_tokens == 0 {
            return empty_report(0);
        }

        let files: Vec<FileData> = file_data
            .into_iter()
            .map(|(path, hashed_tokens, file_tokens)| FileData {
                path,
                hashed_tokens,
                file_tokens,
            })
            .collect();

        // Compute total stats.
        let total_files = files.len();
        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();

        tracing::debug!(
            total_files,
            total_tokens,
            total_lines,
            "clone detection input"
        );

        // Step 1: Rank reduction — map u64 hashes to consecutive u32 ranks.
        let t0 = std::time::Instant::now();
        let ranked_files = rank_reduce(&files);
        let rank_time = t0.elapsed();
        // Ranks are assigned consecutively from 0, so max + 1 = distinct count.
        let unique_ranks: usize = ranked_files
            .iter()
            .flat_map(|f| f.iter())
            .copied()
            .max()
            .map_or(0, |m| m as usize + 1);
        tracing::debug!(
            elapsed_us = rank_time.as_micros(),
            unique_ranks,
            "step1_rank_reduce"
        );

        // Step 2: Concatenate with sentinels.
        let t0 = std::time::Instant::now();
        let (text, file_of, file_offsets) = concatenate_with_sentinels(&ranked_files);
        let concat_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = concat_time.as_micros(),
            concat_len = text.len(),
            "step2_concatenate"
        );

        if text.is_empty() {
            return empty_report(total_files);
        }

        // Step 3: Build suffix array.
        let t0 = std::time::Instant::now();
        let sa = build_suffix_array(&text);
        let sa_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = sa_time.as_micros(),
            n = text.len(),
            "step3_suffix_array"
        );

        // Step 4: Build LCP array (Kasai's algorithm, sentinel-aware).
        let t0 = std::time::Instant::now();
        let lcp = build_lcp(&text, &sa);
        let lcp_time = t0.elapsed();
        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");

        // Step 5: Extract clone groups from LCP intervals.
        let t0 = std::time::Instant::now();
        let raw_groups =
            extract_clone_groups(&sa, &lcp, &file_of, &file_offsets, self.min_tokens, &files);
        let extract_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = extract_time.as_micros(),
            raw_groups = raw_groups.len(),
            "step5_extract_groups"
        );

        // Step 6: Build CloneGroup structs with line info, apply filters.
        let t0 = std::time::Instant::now();
        let clone_groups = self.build_groups(raw_groups, &files);
        let build_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = build_time.as_micros(),
            final_groups = clone_groups.len(),
            "step6_build_groups"
        );

        // Step 7: Compute stats.
        let t0 = std::time::Instant::now();
        let stats = compute_stats(&clone_groups, total_files, total_lines, total_tokens);
        let stats_time = t0.elapsed();
        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");

        tracing::info!(
            total_us = (rank_time
                + concat_time
                + sa_time
                + lcp_time
                + extract_time
                + build_time
                + stats_time)
                .as_micros(),
            rank_us = rank_time.as_micros(),
            sa_us = sa_time.as_micros(),
            lcp_us = lcp_time.as_micros(),
            extract_us = extract_time.as_micros(),
            build_us = build_time.as_micros(),
            stats_us = stats_time.as_micros(),
            total_tokens,
            clone_groups = clone_groups.len(),
            "clone detection complete"
        );

        DuplicationReport {
            clone_groups,
            clone_families: vec![], // Populated by the caller after suppression filtering
            stats,
        }
    }

    /// Convert raw groups into `CloneGroup` structs, applying `min_lines` and
    /// `skip_local` filters, deduplication, and subset removal.
    fn build_groups(&self, mut raw_groups: Vec<RawGroup>, files: &[FileData]) -> Vec<CloneGroup> {
        if raw_groups.is_empty() {
            return Vec::new();
        }

        // ── Token-level subset removal (cheap) ────────────────
        //
        // Sort raw groups by length desc. For each instance (file_id, offset),
        // track non-overlapping covered intervals per file. A smaller group is
        // skipped if all its instances fall within already-covered intervals.
        // This eliminates the vast majority of raw groups before the expensive
        // line-calculation step.
        let raw_count = raw_groups.len();
        raw_groups.sort_by(|a, b| b.length.cmp(&a.length));

        // covered[file_id] is a sorted vec of non-overlapping (start, end)
        // intervals. Kept sorted by start for binary search.
        let mut covered: Vec<Vec<(usize, usize)>> = vec![Vec::new(); files.len()];
        let mut surviving_groups: Vec<RawGroup> = Vec::new();

        for rg in raw_groups {
            let len = rg.length;
            // Check if all instances are fully covered by existing intervals.
            let all_covered = rg.instances.iter().all(|&(fid, offset)| {
                let intervals = &covered[fid];
                // Binary search for the interval that could contain [offset, offset+len).
                let idx = intervals.partition_point(|&(s, _)| s <= offset);
                if idx > 0 {
                    let (s, e) = intervals[idx - 1];
                    offset >= s && offset + len <= e
                } else {
                    false
                }
            });

            if !all_covered {
                // Insert covered intervals for this group's instances.
                for &(fid, offset) in &rg.instances {
                    let end = offset + len;
                    let intervals = &mut covered[fid];
                    let idx = intervals.partition_point(|&(s, _)| s < offset);
                    // Check if the new interval merges with an existing one.
                    // NOTE(review): only the *left* neighbor is merged; an
                    // interval starting after `offset` is never folded in, so
                    // `covered` may accumulate overlapping entries. Since the
                    // containment check above also inspects only the left
                    // neighbor, this is merely conservative (it can keep a
                    // subsumed group) and never drops a non-subsumed one —
                    // confirm this is the intended trade-off.
                    if idx > 0 {
                        let prev = &mut intervals[idx - 1];
                        if prev.1 >= offset {
                            // Extend previous interval if needed.
                            if end > prev.1 {
                                prev.1 = end;
                            }
                            continue;
                        }
                    }
                    intervals.insert(idx, (offset, end));
                }
                surviving_groups.push(rg);
            }
        }

        tracing::trace!(
            raw = raw_count,
            surviving = surviving_groups.len(),
            "token-level subset removal"
        );

        // ── Pre-compute line offset tables ────────────────────
        //
        // For each file, build a sorted vec of newline byte positions so that
        // byte_offset_to_line_col can use binary search (O(log L)) instead of
        // linear scan (O(L)).
        let line_tables: Vec<Vec<usize>> = files
            .iter()
            .map(|f| {
                f.file_tokens
                    .source
                    .bytes()
                    .enumerate()
                    .filter_map(|(i, b)| if b == b'\n' { Some(i) } else { None })
                    .collect()
            })
            .collect();

        // ── Build CloneGroups for survivors ────────────────────
        let mut clone_groups: Vec<CloneGroup> = Vec::new();

        for rg in surviving_groups {
            // Build instances, deduplicating by (file_id, offset).
            let mut seen: FxHashSet<(usize, usize)> = FxHashSet::default();
            let mut group_instances: Vec<CloneInstance> = Vec::new();

            for &(file_id, offset) in &rg.instances {
                if !seen.insert((file_id, offset)) {
                    continue;
                }

                let file = &files[file_id];
                // `build_clone_instance_fast` returns None for out-of-range or
                // inverted spans; those instances are silently dropped.
                if let Some(inst) =
                    build_clone_instance_fast(file, offset, rg.length, &line_tables[file_id])
                {
                    group_instances.push(inst);
                }
            }

            // Apply skip_local: only keep cross-directory clones.
            if self.skip_local && group_instances.len() >= 2 {
                let dirs: FxHashSet<_> = group_instances
                    .iter()
                    .filter_map(|inst| inst.file.parent().map(|p| p.to_path_buf()))
                    .collect();
                if dirs.len() < 2 {
                    continue;
                }
            }

            if group_instances.len() < 2 {
                continue;
            }

            // Calculate line count from the instances.
            // The group's line_count is the WIDEST instance span (max, not min).
            let line_count = group_instances
                .iter()
                .map(|inst| inst.end_line.saturating_sub(inst.start_line) + 1)
                .max()
                .unwrap_or(0);

            // Apply minimum line filter.
            if line_count < self.min_lines {
                continue;
            }

            // Sort instances by file path then start line for stable output.
            group_instances
                .sort_by(|a, b| a.file.cmp(&b.file).then(a.start_line.cmp(&b.start_line)));

            // Deduplicate instances that map to overlapping line ranges within
            // the same file (different token offsets can resolve to overlapping
            // source spans). When two instances overlap, keep the wider one.
            //
            // `Vec::dedup_by` passes (later, earlier-kept) — here (b, a) — and
            // removes `b` when the closure returns true, so `a` is extended in
            // place to absorb `b`'s span.
            group_instances.dedup_by(|b, a| {
                if a.file != b.file {
                    return false;
                }
                // Instances are sorted by start_line. `b` starts at or after `a`.
                // If b's start overlaps with a's range, merge by extending a.
                if b.start_line <= a.end_line {
                    // Keep the wider range in `a`.
                    if b.end_line > a.end_line {
                        a.end_line = b.end_line;
                        a.end_col = b.end_col;
                    }
                    true
                } else {
                    false
                }
            });

            if group_instances.len() < 2 {
                continue;
            }

            clone_groups.push(CloneGroup {
                instances: group_instances,
                token_count: rg.length,
                line_count,
            });
        }

        // Sort groups by token count (largest first), breaking ties by instance
        // count (most instances first). This ensures that for equal token counts,
        // N-way groups come before M-way (M<N) subsets, so subset removal works
        // correctly regardless of the suffix array's extraction order.
        clone_groups.sort_by(|a, b| {
            b.token_count
                .cmp(&a.token_count)
                .then(b.instances.len().cmp(&a.instances.len()))
        });

        // Remove groups whose line ranges are fully contained within another
        // group's line ranges. Uses a per-file interval index to avoid O(g²×m×n).
        //
        // Strategy: iterate groups from largest to smallest. For each kept group,
        // register its (file, start_line, end_line) spans into a spatial index.
        // Smaller groups are checked against this index in O(instances × log(intervals)).

        // Build file path → index mapping for interval tracking
        let mut path_to_idx: FxHashMap<PathBuf, usize> = FxHashMap::default();
        let mut next_idx = 0usize;
        for group in &clone_groups {
            for inst in &group.instances {
                path_to_idx.entry(inst.file.clone()).or_insert_with(|| {
                    let idx = next_idx;
                    next_idx += 1;
                    idx
                });
            }
        }
        let mut file_intervals: Vec<Vec<(usize, usize)>> = vec![Vec::new(); next_idx];
        let mut kept_groups: Vec<CloneGroup> = Vec::new();

        for group in clone_groups {
            // Check if ALL instances of this group are contained within existing intervals
            let all_contained = group.instances.iter().all(|inst| {
                let fidx = path_to_idx[&inst.file];
                let intervals = &file_intervals[fidx];
                let idx = intervals.partition_point(|&(s, _)| s <= inst.start_line);
                idx > 0 && {
                    let (s, e) = intervals[idx - 1];
                    inst.start_line >= s && inst.end_line <= e
                }
            });

            if !all_contained {
                // Register this group's instances into the spatial index.
                // NOTE(review): same left-neighbor-only merge as the token-level
                // pass above — conservative, never lossy.
                for inst in &group.instances {
                    let fidx = path_to_idx[&inst.file];
                    let intervals = &mut file_intervals[fidx];
                    let idx = intervals.partition_point(|&(s, _)| s < inst.start_line);
                    if idx > 0 && intervals[idx - 1].1 >= inst.start_line {
                        if inst.end_line > intervals[idx - 1].1 {
                            intervals[idx - 1].1 = inst.end_line;
                        }
                    } else {
                        intervals.insert(idx, (inst.start_line, inst.end_line));
                    }
                }
                kept_groups.push(group);
            }
        }

        kept_groups
    }
}
412
413// ── Raw group from LCP extraction ──────────────────────────
414
/// A raw clone group before conversion to `CloneGroup`.
///
/// Produced by the LCP-interval extraction step; instances are expressed as
/// token offsets and have not yet been mapped back to source lines.
struct RawGroup {
    /// List of (`file_id`, `token_offset`) instances.
    instances: Vec<(usize, usize)>,
    /// Clone length in tokens.
    length: usize,
}
422
423// ── Step 1: Rank reduction ─────────────────────────────────
424
425/// Map all unique token hashes (u64) to consecutive integer ranks (u32).
426///
427/// Returns `ranked_files` where `ranked_files[i]` contains the rank
428/// sequence for `files[i]`.
429fn rank_reduce(files: &[FileData]) -> Vec<Vec<u32>> {
430    // Single-pass: assign ranks on first encounter. The exact rank values
431    // don't matter as long as equal hashes get equal ranks. Skipping the
432    // sort+dedup saves O(N log N) and a second allocation.
433    let total: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
434    let mut hash_to_rank: FxHashMap<u64, u32> =
435        FxHashMap::with_capacity_and_hasher(total / 2, rustc_hash::FxBuildHasher);
436    let mut next_rank: u32 = 0;
437
438    files
439        .iter()
440        .map(|file| {
441            file.hashed_tokens
442                .iter()
443                .map(|ht| {
444                    *hash_to_rank.entry(ht.hash).or_insert_with(|| {
445                        let r = next_rank;
446                        next_rank += 1;
447                        r
448                    })
449                })
450                .collect()
451        })
452        .collect()
453}
454
455// ── Step 2: Concatenation with sentinels ───────────────────
456
/// Concatenate all ranked token sequences into a single `Vec<i64>`,
/// inserting unique negative sentinel values between files.
///
/// Returns `(text, file_of, file_offsets)` where:
/// - `text` is the concatenated sequence
/// - `file_of[pos]` maps a position in `text` to a file index
///   (`usize::MAX` for sentinel positions)
/// - `file_offsets[file_id]` is the starting position of file `file_id`
///   in `text`
fn concatenate_with_sentinels(ranked_files: &[Vec<u32>]) -> (Vec<i64>, Vec<usize>, Vec<usize>) {
    // One sentinel between each adjacent file pair (none after the last file).
    let sentinel_count = ranked_files.len().saturating_sub(1);
    let token_count: usize = ranked_files.iter().map(Vec::len).sum();
    let total_len = token_count + sentinel_count;

    let mut text: Vec<i64> = Vec::with_capacity(total_len);
    let mut file_of: Vec<usize> = Vec::with_capacity(total_len);
    let mut file_offsets: Vec<usize> = Vec::with_capacity(ranked_files.len());

    for (file_id, ranks) in ranked_files.iter().enumerate() {
        // A sentinel precedes every file except the first. Each sentinel is a
        // distinct negative value (-1, -2, ...) so no two ever compare equal,
        // which prevents suffix matches from crossing file boundaries.
        if file_id > 0 {
            text.push(-(file_id as i64));
            file_of.push(usize::MAX);
        }

        file_offsets.push(text.len());
        text.extend(ranks.iter().map(|&r| i64::from(r)));
        file_of.extend(std::iter::repeat(file_id).take(ranks.len()));
    }

    (text, file_of, file_offsets)
}
494
495// ── Step 3: Suffix array construction (prefix doubling) ────
496
497/// Build a suffix array using the O(N log N) prefix-doubling algorithm with
498/// radix sort.
499///
500/// Returns `sa` where `sa[i]` is the starting position of the i-th
501/// lexicographically smallest suffix in `text`.
502fn build_suffix_array(text: &[i64]) -> Vec<usize> {
503    let n = text.len();
504    if n == 0 {
505        return vec![];
506    }
507
508    // Initial ranks based on raw values. Shift so sentinels (negative) sort
509    // before all real tokens.
510    let min_val = text.iter().copied().min().unwrap_or(0);
511    let mut rank: Vec<i64> = text.iter().map(|&v| v - min_val).collect();
512    let mut sa: Vec<usize> = (0..n).collect();
513    let mut tmp: Vec<i64> = vec![0; n];
514    let mut k: usize = 1;
515    let mut iterations = 0u32;
516
517    // Scratch buffers for radix sort (reused across iterations).
518    let mut sa_tmp: Vec<usize> = vec![0; n];
519
520    while k < n {
521        iterations += 1;
522        let max_rank = rank.iter().copied().max().unwrap_or(0) as usize;
523
524        // Two-pass radix sort: sort by secondary key (rank[i+k]) first,
525        // then by primary key (rank[i]). Each pass is O(N + K) where
526        // K = max_rank + 2 (including the -1 sentinel rank).
527        let bucket_count = max_rank + 2; // ranks 0..=max_rank plus -1 mapped to 0
528
529        // Pass 1: sort by secondary key (rank at offset k).
530        let mut counts = vec![0usize; bucket_count + 1];
531        for &i in &sa {
532            let r2 = if i + k < n {
533                rank[i + k] as usize + 1
534            } else {
535                0
536            };
537            counts[r2] += 1;
538        }
539        // Prefix sum.
540        let mut sum = 0;
541        for c in &mut counts {
542            let v = *c;
543            *c = sum;
544            sum += v;
545        }
546        for &i in &sa {
547            let r2 = if i + k < n {
548                rank[i + k] as usize + 1
549            } else {
550                0
551            };
552            sa_tmp[counts[r2]] = i;
553            counts[r2] += 1;
554        }
555
556        // Pass 2: sort by primary key (rank[i]), stable.
557        // No +1 offset needed here: rank[i] is always >= 0 because the
558        // initial ranks are shifted by min_val, and subsequent iterations
559        // assign ranks starting from 0.
560        counts.fill(0);
561        counts.resize(bucket_count + 1, 0);
562        for &i in &sa_tmp {
563            let r1 = rank[i] as usize;
564            counts[r1] += 1;
565        }
566        sum = 0;
567        for c in &mut counts {
568            let v = *c;
569            *c = sum;
570            sum += v;
571        }
572        for &i in &sa_tmp {
573            let r1 = rank[i] as usize;
574            sa[counts[r1]] = i;
575            counts[r1] += 1;
576        }
577
578        // Compute new ranks.
579        tmp[sa[0]] = 0;
580        for i in 1..n {
581            let prev = sa[i - 1];
582            let curr = sa[i];
583            let same = rank[prev] == rank[curr] && {
584                let rp2 = if prev + k < n { rank[prev + k] } else { -1 };
585                let rc2 = if curr + k < n { rank[curr + k] } else { -1 };
586                rp2 == rc2
587            };
588            tmp[curr] = tmp[prev] + i64::from(!same);
589        }
590
591        // Early exit when all ranks are unique.
592        let new_max_rank = tmp[sa[n - 1]];
593        std::mem::swap(&mut rank, &mut tmp);
594
595        if new_max_rank as usize == n - 1 {
596            break;
597        }
598
599        k *= 2;
600    }
601
602    tracing::trace!(n, iterations, "suffix array constructed");
603    sa
604}
605
606// ── Step 4: LCP array (Kasai's algorithm) ──────────────────
607
/// Build the LCP (Longest Common Prefix) array using Kasai's algorithm.
///
/// `lcp[i]` is the length of the longest common prefix between suffixes
/// `sa[i]` and `sa[i-1]`. `lcp[0]` is always 0.
///
/// The LCP computation stops at sentinel boundaries (negative values in
/// `text`) to prevent matches from crossing file boundaries.
fn build_lcp(text: &[i64], sa: &[usize]) -> Vec<usize> {
    let n = sa.len();
    if n == 0 {
        return vec![];
    }

    // Inverse permutation: rank[p] = position of suffix p in the sorted order.
    let mut rank = vec![0usize; n];
    for (sorted_pos, &suffix) in sa.iter().enumerate() {
        rank[suffix] = sorted_pos;
    }

    let mut lcp = vec![0usize; n];
    // Number of tokens currently known to match; Kasai's invariant lets it
    // drop by at most one when moving from suffix `pos` to `pos + 1`.
    let mut matched = 0usize;

    for pos in 0..n {
        let sorted_pos = rank[pos];
        if sorted_pos == 0 {
            // First suffix in sorted order has no predecessor to compare with.
            matched = 0;
            continue;
        }
        let neighbor = sa[sorted_pos - 1];
        while pos + matched < n && neighbor + matched < n {
            let (a, b) = (text[pos + matched], text[neighbor + matched]);
            // Sentinels (negative) never extend a match; mismatches end it.
            if a < 0 || b < 0 || a != b {
                break;
            }
            matched += 1;
        }
        lcp[sorted_pos] = matched;
        matched = matched.saturating_sub(1);
    }

    lcp
}
651
652// ── Step 5: Clone group extraction ─────────────────────────
653
/// Extract clone groups from the suffix array and LCP array.
///
/// Uses a stack-based approach to find all maximal LCP intervals where the
/// minimum LCP value is >= `min_tokens`, and the interval contains suffixes
/// from at least two different positions (cross-file or non-overlapping
/// same-file).
fn extract_clone_groups(
    sa: &[usize],
    lcp: &[usize],
    file_of: &[usize],
    file_offsets: &[usize],
    min_tokens: usize,
    files: &[FileData],
) -> Vec<RawGroup> {
    let n = sa.len();
    if n < 2 {
        return vec![];
    }

    // Stack-based LCP interval extraction.
    // Each stack entry: (lcp_value, start_index_in_sa).
    // Invariant: lcp values are strictly non-decreasing from bottom to top;
    // an entry (v, s) means suffixes sa[s-1..] up to the current position all
    // share a common prefix of at least v tokens.
    let mut stack: Vec<(usize, usize)> = Vec::new();
    let mut groups: Vec<RawGroup> = Vec::new();

    #[allow(clippy::needless_range_loop)] // `i` is used as a value, not just as an index
    for i in 1..=n {
        // A virtual trailing LCP of 0 at i == n flushes every open interval.
        let cur_lcp = if i < n { lcp[i] } else { 0 };
        let mut start = i;

        while let Some(&(top_lcp, top_start)) = stack.last() {
            if top_lcp <= cur_lcp {
                break;
            }
            stack.pop();
            // The popped interval's start is inherited: any entry pushed for
            // `cur_lcp` must begin where the taller popped interval began.
            start = top_start;

            if top_lcp >= min_tokens {
                // The interval [start-1 .. i-1] shares a common prefix of
                // length `top_lcp`.
                let interval_begin = start - 1;
                let interval_end = i; // exclusive

                if let Some(group) = build_raw_group(
                    sa,
                    file_of,
                    file_offsets,
                    files,
                    interval_begin,
                    interval_end,
                    top_lcp,
                ) {
                    groups.push(group);
                }
            }
        }

        if i < n {
            stack.push((cur_lcp, start));
        }
    }

    groups
}
717
718/// Build a `RawGroup` from an LCP interval, filtering to non-overlapping
719/// instances.
720fn build_raw_group(
721    sa: &[usize],
722    file_of: &[usize],
723    file_offsets: &[usize],
724    files: &[FileData],
725    begin: usize,
726    end: usize,
727    length: usize,
728) -> Option<RawGroup> {
729    let mut instances: Vec<(usize, usize)> = Vec::new();
730
731    for &pos in &sa[begin..end] {
732        let fid = file_of[pos];
733        if fid == usize::MAX {
734            continue; // sentinel position
735        }
736        let offset_in_file = pos - file_offsets[fid];
737
738        // Verify the clone doesn't extend beyond the file boundary.
739        if offset_in_file + length > files[fid].hashed_tokens.len() {
740            continue;
741        }
742
743        instances.push((fid, offset_in_file));
744    }
745
746    if instances.len() < 2 {
747        return None;
748    }
749
750    // Remove overlapping instances within the same file.
751    // Sort by (file_id, offset) and remove overlaps.
752    instances.sort_unstable();
753    let mut deduped: Vec<(usize, usize)> = Vec::with_capacity(instances.len());
754    for &(fid, offset) in &instances {
755        if let Some(&(last_fid, last_offset)) = deduped.last()
756            && fid == last_fid
757            && offset < last_offset + length
758        {
759            continue; // overlapping within the same file
760        }
761        deduped.push((fid, offset));
762    }
763
764    if deduped.len() < 2 {
765        return None;
766    }
767
768    Some(RawGroup {
769        instances: deduped,
770        length,
771    })
772}
773
774// ── Utility functions ──────────────────────────────────────
775
/// Build a `CloneInstance` using a pre-computed line offset table for fast lookup.
///
/// `token_offset`/`token_length` index into `file.hashed_tokens`; the window
/// is mapped back to source-token spans via `original_index`, then to
/// line/column positions via `line_table`. Returns `None` when the window
/// exceeds the token stream or resolves to an inverted byte span.
fn build_clone_instance_fast(
    file: &FileData,
    token_offset: usize,
    token_length: usize,
    line_table: &[usize],
) -> Option<CloneInstance> {
    let tokens = &file.hashed_tokens;
    let source_tokens = &file.file_tokens.tokens;

    if token_offset + token_length > tokens.len() {
        return None;
    }

    // Map from hashed token indices back to source token spans.
    let first_hashed = &tokens[token_offset];
    let last_hashed = &tokens[token_offset + token_length - 1];

    let first_source = &source_tokens[first_hashed.original_index];
    let last_source = &source_tokens[last_hashed.original_index];

    let start_byte = first_source.span.start as usize;
    let end_byte = last_source.span.end as usize;

    // Guard against inverted spans that can occur when normalization reorders
    // token original_index values for very small windows.
    if start_byte > end_byte {
        return None;
    }

    let source = &file.file_tokens.source;
    let (start_line, start_col) = byte_offset_to_line_col_fast(source, start_byte, line_table);
    let (end_line, end_col) = byte_offset_to_line_col_fast(source, end_byte, line_table);

    // Extract the fragment, snapping to valid char boundaries (spans may
    // land mid-way through a multi-byte UTF-8 character).
    let fragment = if end_byte <= source.len() {
        let mut sb = start_byte;
        while sb > 0 && !source.is_char_boundary(sb) {
            sb -= 1;
        }
        let mut eb = end_byte;
        while eb < source.len() && !source.is_char_boundary(eb) {
            eb += 1;
        }
        source[sb..eb].to_string()
    } else {
        // Span points past the source text; emit an empty fragment rather
        // than panicking on an out-of-range slice.
        String::new()
    };

    Some(CloneInstance {
        file: file.path.clone(),
        start_line,
        end_line,
        start_col,
        end_col,
        fragment,
    })
}
835
/// Convert a byte offset into a 1-based line number and 0-based character column
/// using a pre-computed table of newline positions for O(log L) lookup.
fn byte_offset_to_line_col_fast(
    source: &str,
    byte_offset: usize,
    line_table: &[usize],
) -> (usize, usize) {
    // Clamp into range, then snap down to a valid char boundary (the offset
    // may land inside a multi-byte UTF-8 character).
    let mut pos = byte_offset.min(source.len());
    while pos > 0 && !source.is_char_boundary(pos) {
        pos -= 1;
    }

    // Count newlines strictly before `pos` via binary search.
    let newlines_before = line_table.partition_point(|&nl_pos| nl_pos < pos);
    let line_start = match newlines_before {
        0 => 0,
        k => line_table[k - 1] + 1,
    };

    // Column counts characters (not bytes) from the start of the line.
    let col = source[line_start..pos].chars().count();
    (newlines_before + 1, col)
}
859
/// Convert a byte offset into a 1-based line number and 0-based character column.
///
/// Linear-scan reference implementation; kept for tests as a cross-check
/// against the table-based fast path.
#[cfg(test)]
fn byte_offset_to_line_col(source: &str, byte_offset: usize) -> (usize, usize) {
    // Clamp and snap down to a char boundary, mirroring the fast variant.
    let mut pos = byte_offset.min(source.len());
    while pos > 0 && !source.is_char_boundary(pos) {
        pos -= 1;
    }
    let prefix = &source[..pos];
    let line = prefix.matches('\n').count() + 1;
    let line_start = match prefix.rfind('\n') {
        Some(nl) => nl + 1,
        None => 0,
    };
    let col = prefix[line_start..].chars().count();
    (line, col)
}
873
874/// Compute aggregate duplication statistics.
875fn compute_stats(
876    clone_groups: &[CloneGroup],
877    total_files: usize,
878    total_lines: usize,
879    total_tokens: usize,
880) -> DuplicationStats {
881    use std::path::Path;
882
883    let mut files_with_clones: FxHashSet<&Path> = FxHashSet::default();
884    // Group duplicated lines by file to avoid cloning PathBuf per line.
885    let mut file_dup_lines: FxHashMap<&Path, FxHashSet<usize>> = FxHashMap::default();
886    let mut duplicated_tokens = 0usize;
887    let mut clone_instances = 0usize;
888
889    for group in clone_groups {
890        for instance in &group.instances {
891            files_with_clones.insert(&instance.file);
892            clone_instances += 1;
893            let lines = file_dup_lines.entry(&instance.file).or_default();
894            for line in instance.start_line..=instance.end_line {
895                lines.insert(line);
896            }
897        }
898        // Each instance contributes token_count duplicated tokens,
899        // but only count duplicates (all instances beyond the first).
900        if group.instances.len() > 1 {
901            duplicated_tokens += group.token_count * (group.instances.len() - 1);
902        }
903    }
904
905    let dup_line_count: usize = file_dup_lines.values().map(|s| s.len()).sum();
906    let duplication_percentage = if total_lines > 0 {
907        (dup_line_count as f64 / total_lines as f64) * 100.0
908    } else {
909        0.0
910    };
911
912    // Cap duplicated_tokens to total_tokens to avoid impossible values
913    // when overlapping clone groups double-count the same token positions.
914    let duplicated_tokens = duplicated_tokens.min(total_tokens);
915
916    DuplicationStats {
917        total_files,
918        files_with_clones: files_with_clones.len(),
919        total_lines,
920        duplicated_lines: dup_line_count,
921        total_tokens,
922        duplicated_tokens,
923        clone_groups: clone_groups.len(),
924        clone_instances,
925        duplication_percentage,
926    }
927}
928
929/// Create an empty report when there are no files to analyze.
930const fn empty_report(total_files: usize) -> DuplicationReport {
931    DuplicationReport {
932        clone_groups: Vec::new(),
933        clone_families: Vec::new(),
934        stats: DuplicationStats {
935            total_files,
936            files_with_clones: 0,
937            total_lines: 0,
938            duplicated_lines: 0,
939            total_tokens: 0,
940            duplicated_tokens: 0,
941            clone_groups: 0,
942            clone_instances: 0,
943            duplication_percentage: 0.0,
944        },
945    }
946}
947
#[cfg(test)]
#[allow(clippy::disallowed_types)]
mod tests {
    use std::collections::HashMap;

    use super::*;
    use crate::duplicates::normalize::HashedToken;
    use crate::duplicates::tokenize::{FileTokens, SourceToken, TokenKind};
    use oxc_span::Span;

    /// Build `HashedToken`s from raw hash values; `original_index` is
    /// each hash's position in the input slice.
    fn make_hashed_tokens(hashes: &[u64]) -> Vec<HashedToken> {
        hashes
            .iter()
            .enumerate()
            .map(|(i, &hash)| HashedToken {
                hash,
                original_index: i,
            })
            .collect()
    }

    /// Build `count` synthetic source tokens with 2-byte spans laid out
    /// every 3 bytes: token `i` spans bytes `(i*3, i*3 + 2)`.
    fn make_source_tokens(count: usize) -> Vec<SourceToken> {
        (0..count)
            .map(|i| SourceToken {
                kind: TokenKind::Identifier(format!("t{i}")),
                span: Span::new((i * 3) as u32, (i * 3 + 2) as u32),
            })
            .collect()
    }

    /// Wrap `source` and `count` synthetic tokens into a `FileTokens`
    /// fixture; line_count is at least 1 even for empty sources.
    fn make_file_tokens(source: &str, count: usize) -> FileTokens {
        FileTokens {
            tokens: make_source_tokens(count),
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        }
    }

    // ── Existing tests (adapted for CloneDetector) ─────────

    #[test]
    fn empty_input_produces_empty_report() {
        let detector = CloneDetector::new(5, 1, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_no_clones() {
        // All-distinct hashes within one file: nothing to group.
        let detector = CloneDetector::new(3, 1, false);
        let hashed = make_hashed_tokens(&[1, 2, 3, 4, 5]);
        let ft = make_file_tokens("a b c d e", 5);
        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);
        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn detects_exact_duplicate_across_files() {
        let detector = CloneDetector::new(3, 1, false);

        // Same token sequence in two files.
        let hashes = vec![10, 20, 30, 40, 50];
        let source_a = "a\nb\nc\nd\ne";
        let source_b = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source_a, 5);
        let ft_b = make_file_tokens(source_b, 5);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect at least one clone group"
        );
    }

    #[test]
    fn no_detection_below_min_tokens() {
        let detector = CloneDetector::new(10, 1, false);

        let hashes = vec![10, 20, 30]; // Only 3 tokens, min is 10
        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens("abc", 3);
        let ft_b = make_file_tokens("abc", 3);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn byte_offset_to_line_col_basic() {
        let source = "abc\ndef\nghi";
        assert_eq!(byte_offset_to_line_col(source, 0), (1, 0));
        assert_eq!(byte_offset_to_line_col(source, 4), (2, 0));
        assert_eq!(byte_offset_to_line_col(source, 5), (2, 1));
        assert_eq!(byte_offset_to_line_col(source, 8), (3, 0));
    }

    #[test]
    fn byte_offset_beyond_source() {
        let source = "abc";
        // Should clamp to end of source.
        let (line, col) = byte_offset_to_line_col(source, 100);
        assert_eq!(line, 1);
        assert_eq!(col, 3);
    }

    #[test]
    fn skip_local_filters_same_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Same directory -> should be filtered with skip_local.
        let report = detector.detect(vec![
            (PathBuf::from("src/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            report.clone_groups.is_empty(),
            "Same-directory clones should be filtered with skip_local"
        );
    }

    #[test]
    fn skip_local_keeps_cross_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Different directories -> should be kept.
        let report = detector.detect(vec![
            (PathBuf::from("src/components/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/utils/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Cross-directory clones should be kept with skip_local"
        );
    }

    #[test]
    fn stats_computation() {
        // One group, two instances of 5 lines / 50 tokens each.
        let groups = vec![CloneGroup {
            instances: vec![
                CloneInstance {
                    file: PathBuf::from("a.ts"),
                    start_line: 1,
                    end_line: 5,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
                CloneInstance {
                    file: PathBuf::from("b.ts"),
                    start_line: 10,
                    end_line: 14,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
            ],
            token_count: 50,
            line_count: 5,
        }];

        let stats = compute_stats(&groups, 10, 200, 1000);
        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.files_with_clones, 2);
        assert_eq!(stats.clone_groups, 1);
        assert_eq!(stats.clone_instances, 2);
        assert_eq!(stats.duplicated_lines, 10); // 5 lines in each of 2 instances
        assert!(stats.duplication_percentage > 0.0);
    }

    // ── New suffix array / LCP tests ───────────────────────

    #[test]
    fn sa_construction_basic() {
        // "banana" encoded as integers: b=1, a=0, n=2
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);

        // Suffixes sorted lexicographically:
        // SA[0] = 5: "a"           (0)
        // SA[1] = 3: "ana"         (0,2,0)
        // SA[2] = 1: "anana"       (0,2,0,2,0)
        // SA[3] = 0: "banana"      (1,0,2,0,2,0)
        // SA[4] = 4: "na"          (2,0)
        // SA[5] = 2: "nana"        (2,0,2,0)
        assert_eq!(sa, vec![5, 3, 1, 0, 4, 2]);
    }

    #[test]
    fn lcp_construction_basic() {
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        // LCP values for "banana":
        // lcp[0] = 0 (by definition)
        // lcp[1] = 1 (LCP of "a" and "ana" = "a" = 1)
        // lcp[2] = 3 (LCP of "ana" and "anana" = "ana" = 3)
        // lcp[3] = 0 (LCP of "anana" and "banana" = "" = 0)
        // lcp[4] = 0 (LCP of "banana" and "na" = "" = 0)
        // lcp[5] = 2 (LCP of "na" and "nana" = "na" = 2)
        assert_eq!(lcp, vec![0, 1, 3, 0, 0, 2]);
    }

    #[test]
    fn lcp_stops_at_sentinels() {
        // Two "files": [0, 1, 2] sentinel [-1] [0, 1, 2]
        let text: Vec<i64> = vec![0, 1, 2, -1, 0, 1, 2];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        // Find the SA positions corresponding to text positions 0 and 4
        // (both start "0 1 2 ..."). LCP should be exactly 3.
        let rank_0 = sa.iter().position(|&s| s == 0).expect("pos 0 in SA");
        let rank_4 = sa.iter().position(|&s| s == 4).expect("pos 4 in SA");
        let (lo, hi) = if rank_0 < rank_4 {
            (rank_0, rank_4)
        } else {
            (rank_4, rank_0)
        };

        // The minimum LCP in the range (lo, hi] gives the LCP between them.
        let min_lcp = lcp[(lo + 1)..=hi].iter().copied().min().unwrap_or(0);
        assert_eq!(
            min_lcp, 3,
            "LCP between identical sequences across sentinel should be 3"
        );
    }

    #[test]
    fn rank_reduction_maps_correctly() {
        // Two files sharing hashes 200 and 300; ranks are assigned over
        // the union of all hashes.
        let files = vec![
            FileData {
                path: PathBuf::from("a.ts"),
                hashed_tokens: make_hashed_tokens(&[100, 200, 300]),
                file_tokens: make_file_tokens("a b c", 3),
            },
            FileData {
                path: PathBuf::from("b.ts"),
                hashed_tokens: make_hashed_tokens(&[200, 300, 400]),
                file_tokens: make_file_tokens("d e f", 3),
            },
        ];

        let ranked = rank_reduce(&files);

        // Unique hashes: 100, 200, 300, 400 -> ranks 0, 1, 2, 3
        assert_eq!(ranked[0], vec![0, 1, 2]);
        assert_eq!(ranked[1], vec![1, 2, 3]);
    }

    #[test]
    fn three_file_grouping() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        // Three files with identical token sequences.
        let data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)> = (0..3)
            .map(|i| {
                (
                    PathBuf::from(format!("file{i}.ts")),
                    make_hashed_tokens(&hashes),
                    make_file_tokens(source, 5),
                )
            })
            .collect();

        let report = detector.detect(data);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones across 3 identical files"
        );

        // The largest group should contain 3 instances.
        let max_instances = report
            .clone_groups
            .iter()
            .map(|g| g.instances.len())
            .max()
            .unwrap_or(0);
        assert_eq!(
            max_instances, 3,
            "3 identical files should produce a group with 3 instances"
        );
    }

    #[test]
    fn overlapping_clones_largest_wins() {
        let detector = CloneDetector::new(3, 1, false);

        // File A and B: identical 10-token sequences.
        let hashes: Vec<u64> = (1..=10).collect();
        let source = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 10);
        let ft_b = make_file_tokens(source, 10);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(!report.clone_groups.is_empty());
        // The first group (sorted by token_count desc) should cover all 10.
        assert_eq!(
            report.clone_groups[0].token_count, 10,
            "Maximal clone should cover all 10 tokens"
        );
    }

    #[test]
    fn no_self_overlap() {
        let detector = CloneDetector::new(3, 1, false);

        // File with repeated pattern: [1,2,3,1,2,3]
        // The pattern [1,2,3] appears at offset 0 and offset 3.
        let hashes = vec![1, 2, 3, 1, 2, 3];
        // Source must be long enough for synthetic spans: token i has span (i*3, i*3+2).
        // Last token (5) has span (15, 17), so source must be >= 17 bytes.
        // Use a source with enough content spread across distinct lines.
        let source = "aa\nbb\ncc\ndd\nee\nff\ngg";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 6);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        // Verify that no clone instance overlaps with another in the same file.
        for group in &report.clone_groups {
            let mut file_instances: HashMap<&PathBuf, Vec<(usize, usize)>> = HashMap::new();
            for inst in &group.instances {
                file_instances
                    .entry(&inst.file)
                    .or_default()
                    .push((inst.start_line, inst.end_line));
            }
            for (_file, mut ranges) in file_instances {
                ranges.sort();
                // Adjacent sorted ranges must not share any lines.
                for w in ranges.windows(2) {
                    assert!(
                        w[1].0 > w[0].1,
                        "Clone instances in the same file should not overlap: {:?} and {:?}",
                        w[0],
                        w[1]
                    );
                }
            }
        }
    }

    #[test]
    fn empty_input_edge_case() {
        // min_tokens == 0 combined with no files must still yield an
        // empty report rather than panicking.
        let detector = CloneDetector::new(0, 0, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_internal_duplication() {
        let detector = CloneDetector::new(3, 1, false);

        // File with a repeated block separated by a different token.
        // [10, 20, 30, 99, 10, 20, 30]
        let hashes = vec![10, 20, 30, 99, 10, 20, 30];
        let source = "a\nb\nc\nx\na\nb\nc";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 7);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        // Should detect the [10, 20, 30] clone at offsets 0 and 4.
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect internal duplication within a single file"
        );
    }
}