Skip to main content

fallow_core/duplicates/
detect.rs

1//! Suffix Array + LCP based clone detection engine.
2//!
3//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
4//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
5//! comparisons and naturally finds all maximal clones in a single linear pass.
6
7use std::collections::HashMap;
8use std::path::PathBuf;
9
10use super::normalize::HashedToken;
11use super::tokenize::FileTokens;
12use super::types::{CloneGroup, CloneInstance, DuplicationReport, DuplicationStats};
13
/// Data for a single file being analyzed.
struct FileData {
    /// Path of the analyzed file; carried into `CloneInstance`s and used for
    /// the cross-directory (`skip_local`) filter.
    path: PathBuf,
    /// Hashed tokens produced by normalization (see `normalize::HashedToken`);
    /// each one maps back to a source token via `original_index`.
    hashed_tokens: Vec<HashedToken>,
    /// Raw tokenization output; provides `source`, `tokens`, and `line_count`.
    file_tokens: FileTokens,
}
20
/// Suffix Array + LCP based clone detection engine.
///
/// Concatenates all files' token sequences (separated by unique sentinels),
/// builds a suffix array and LCP array, then extracts maximal clone groups
/// from contiguous LCP intervals.
///
/// Construct with [`CloneDetector::new`], then run [`CloneDetector::detect`].
pub struct CloneDetector {
    /// Minimum clone size in tokens.
    min_tokens: usize,
    /// Minimum clone size in lines.
    min_lines: usize,
    /// Only report cross-directory duplicates.
    skip_local: bool,
}
34
impl CloneDetector {
    /// Create a new detector with the given thresholds.
    ///
    /// - `min_tokens`: smallest clone length (in normalized tokens) to report.
    /// - `min_lines`: smallest clone height (in source lines) to report.
    /// - `skip_local`: when true, drop groups whose instances all share one
    ///   parent directory.
    pub fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
        Self {
            min_tokens,
            min_lines,
            skip_local,
        }
    }

    /// Run clone detection across all files.
    ///
    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
    /// one per analyzed file.
    ///
    /// Pipeline (each step timed and logged at debug level): rank-reduce
    /// hashes → concatenate with sentinels → suffix array → LCP array →
    /// extract raw groups from LCP intervals → build/filter `CloneGroup`s →
    /// aggregate stats.
    pub fn detect(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
    ) -> DuplicationReport {
        let _span = tracing::info_span!("clone_detect").entered();

        // No input, or a zero token threshold, can never yield clones.
        if file_data.is_empty() || self.min_tokens == 0 {
            return empty_report(0);
        }

        let files: Vec<FileData> = file_data
            .into_iter()
            .map(|(path, hashed_tokens, file_tokens)| FileData {
                path,
                hashed_tokens,
                file_tokens,
            })
            .collect();

        // Compute total stats.
        let total_files = files.len();
        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();

        tracing::debug!(
            total_files,
            total_tokens,
            total_lines,
            "clone detection input"
        );

        // Step 1: Rank reduction — map u64 hashes to consecutive u32 ranks.
        let t0 = std::time::Instant::now();
        let ranked_files = rank_reduce(&files);
        let rank_time = t0.elapsed();
        // Highest rank + 1 == number of distinct token hashes seen.
        let unique_ranks: usize = ranked_files
            .iter()
            .flat_map(|f| f.iter())
            .copied()
            .max()
            .map_or(0, |m| m as usize + 1);
        tracing::debug!(
            elapsed_us = rank_time.as_micros(),
            unique_ranks,
            "step1_rank_reduce"
        );

        // Step 2: Concatenate with sentinels.
        let t0 = std::time::Instant::now();
        let (text, file_of, file_offsets) = concatenate_with_sentinels(&ranked_files);
        let concat_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = concat_time.as_micros(),
            concat_len = text.len(),
            "step2_concatenate"
        );

        if text.is_empty() {
            return empty_report(total_files);
        }

        // Step 3: Build suffix array.
        let t0 = std::time::Instant::now();
        let sa = build_suffix_array(&text);
        let sa_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = sa_time.as_micros(),
            n = text.len(),
            "step3_suffix_array"
        );

        // Step 4: Build LCP array (Kasai's algorithm, sentinel-aware).
        let t0 = std::time::Instant::now();
        let lcp = build_lcp(&text, &sa);
        let lcp_time = t0.elapsed();
        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");

        // Step 5: Extract clone groups from LCP intervals.
        let t0 = std::time::Instant::now();
        let raw_groups =
            extract_clone_groups(&sa, &lcp, &file_of, &file_offsets, self.min_tokens, &files);
        let extract_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = extract_time.as_micros(),
            raw_groups = raw_groups.len(),
            "step5_extract_groups"
        );

        // Step 6: Build CloneGroup structs with line info, apply filters.
        let t0 = std::time::Instant::now();
        let clone_groups = self.build_groups(raw_groups, &files);
        let build_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = build_time.as_micros(),
            final_groups = clone_groups.len(),
            "step6_build_groups"
        );

        // Step 7: Compute stats.
        let t0 = std::time::Instant::now();
        let stats = compute_stats(&clone_groups, total_files, total_lines, total_tokens);
        let stats_time = t0.elapsed();
        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");

        // NOTE(review): concat_time is included in total_us but has no
        // dedicated `concat_us` field below — add one if the per-step
        // breakdown should be complete.
        tracing::info!(
            total_us = (rank_time
                + concat_time
                + sa_time
                + lcp_time
                + extract_time
                + build_time
                + stats_time)
                .as_micros(),
            rank_us = rank_time.as_micros(),
            sa_us = sa_time.as_micros(),
            lcp_us = lcp_time.as_micros(),
            extract_us = extract_time.as_micros(),
            build_us = build_time.as_micros(),
            stats_us = stats_time.as_micros(),
            total_tokens,
            clone_groups = clone_groups.len(),
            "clone detection complete"
        );

        DuplicationReport {
            clone_groups,
            stats,
        }
    }

    /// Convert raw groups into `CloneGroup` structs, applying min_lines and
    /// skip_local filters, deduplication, and subset removal.
    fn build_groups(&self, mut raw_groups: Vec<RawGroup>, files: &[FileData]) -> Vec<CloneGroup> {
        if raw_groups.is_empty() {
            return Vec::new();
        }

        // ── Token-level subset removal (cheap) ────────────────
        //
        // Sort raw groups by length desc. For each instance (file_id, offset),
        // track non-overlapping covered intervals per file. A smaller group is
        // skipped if all its instances fall within already-covered intervals.
        // This eliminates the vast majority of raw groups before the expensive
        // line-calculation step.
        let raw_count = raw_groups.len();
        raw_groups.sort_by(|a, b| b.length.cmp(&a.length));

        // covered[file_id] is a sorted vec of non-overlapping (start, end)
        // intervals. Kept sorted by start for binary search.
        let mut covered: Vec<Vec<(usize, usize)>> = vec![Vec::new(); files.len()];
        let mut surviving_groups: Vec<RawGroup> = Vec::new();

        for rg in raw_groups {
            let len = rg.length;
            // Check if all instances are fully covered by existing intervals.
            let all_covered = rg.instances.iter().all(|&(fid, offset)| {
                let intervals = &covered[fid];
                // Binary search for the interval that could contain [offset, offset+len).
                let idx = intervals.partition_point(|&(s, _)| s <= offset);
                if idx > 0 {
                    let (s, e) = intervals[idx - 1];
                    offset >= s && offset + len <= e
                } else {
                    false
                }
            });

            if !all_covered {
                // Insert covered intervals for this group's instances.
                for &(fid, offset) in &rg.instances {
                    let end = offset + len;
                    let intervals = &mut covered[fid];
                    let idx = intervals.partition_point(|&(s, _)| s < offset);
                    // Check if the new interval merges with an existing one.
                    if idx > 0 {
                        let prev = &mut intervals[idx - 1];
                        if prev.1 >= offset {
                            // Extend previous interval if needed.
                            if end > prev.1 {
                                prev.1 = end;
                            }
                            continue;
                        }
                    }
                    // NOTE(review): only the preceding interval is merged; the
                    // inserted interval may still overlap the one after it, so
                    // `covered` can end up with overlapping entries. That only
                    // makes the containment check above conservative (extra
                    // groups survive into the line-level pass) — confirm if
                    // this pass ever needs to be exact.
                    intervals.insert(idx, (offset, end));
                }
                surviving_groups.push(rg);
            }
        }

        tracing::trace!(
            raw = raw_count,
            surviving = surviving_groups.len(),
            "token-level subset removal"
        );

        // ── Pre-compute line offset tables ────────────────────
        //
        // For each file, build a sorted vec of newline byte positions so that
        // byte_offset_to_line_col can use binary search (O(log L)) instead of
        // linear scan (O(L)).
        let line_tables: Vec<Vec<usize>> = files
            .iter()
            .map(|f| {
                f.file_tokens
                    .source
                    .bytes()
                    .enumerate()
                    .filter_map(|(i, b)| if b == b'\n' { Some(i) } else { None })
                    .collect()
            })
            .collect();

        // ── Build CloneGroups for survivors ────────────────────
        let mut clone_groups: Vec<CloneGroup> = Vec::new();

        for rg in surviving_groups {
            // Build instances, deduplicating by (file_id, offset).
            // NOTE(review): the bool value is never read — a HashSet would
            // express the intent more directly.
            let mut seen: HashMap<(usize, usize), bool> = HashMap::new();
            let mut group_instances: Vec<CloneInstance> = Vec::new();

            for &(file_id, offset) in &rg.instances {
                if seen.contains_key(&(file_id, offset)) {
                    continue;
                }
                seen.insert((file_id, offset), true);

                let file = &files[file_id];
                // build_clone_instance_fast returns None for out-of-range or
                // inverted spans; such instances are silently dropped.
                if let Some(inst) =
                    build_clone_instance_fast(file, offset, rg.length, &line_tables[file_id])
                {
                    group_instances.push(inst);
                }
            }

            // Apply skip_local: only keep cross-directory clones.
            if self.skip_local && group_instances.len() >= 2 {
                let dirs: std::collections::HashSet<_> = group_instances
                    .iter()
                    .filter_map(|inst| inst.file.parent().map(|p| p.to_path_buf()))
                    .collect();
                if dirs.len() < 2 {
                    continue;
                }
            }

            if group_instances.len() < 2 {
                continue;
            }

            // Calculate line count from the instances.
            let line_count = group_instances
                .iter()
                .map(|inst| inst.end_line.saturating_sub(inst.start_line) + 1)
                .max()
                .unwrap_or(0);

            // Apply minimum line filter.
            if line_count < self.min_lines {
                continue;
            }

            // Sort instances by file path then start line for stable output.
            group_instances
                .sort_by(|a, b| a.file.cmp(&b.file).then(a.start_line.cmp(&b.start_line)));

            // Deduplicate instances that map to overlapping line ranges within
            // the same file (different token offsets can resolve to overlapping
            // source spans). When two instances overlap, keep the wider one.
            group_instances.dedup_by(|b, a| {
                if a.file != b.file {
                    return false;
                }
                // Instances are sorted by start_line. `b` starts at or after `a`.
                // If b's start overlaps with a's range, merge by extending a.
                if b.start_line <= a.end_line {
                    // Keep the wider range in `a`.
                    if b.end_line > a.end_line {
                        a.end_line = b.end_line;
                        a.end_col = b.end_col;
                    }
                    true
                } else {
                    false
                }
            });

            if group_instances.len() < 2 {
                continue;
            }

            clone_groups.push(CloneGroup {
                instances: group_instances,
                token_count: rg.length,
                line_count,
            });
        }

        // Sort groups by token count (largest first), breaking ties by instance
        // count (most instances first). This ensures that for equal token counts,
        // N-way groups come before M-way (M<N) subsets, so subset removal works
        // correctly regardless of the suffix array's extraction order.
        clone_groups.sort_by(|a, b| {
            b.token_count
                .cmp(&a.token_count)
                .then(b.instances.len().cmp(&a.instances.len()))
        });

        // Remove groups whose line ranges are fully contained within another
        // group's line ranges. Check bidirectionally: a group with more tokens
        // but a narrower line range can still be a subset of a group with fewer
        // tokens but a wider line range.
        let mut keep = vec![true; clone_groups.len()];
        for i in 0..clone_groups.len() {
            if !keep[i] {
                continue;
            }
            for j in (i + 1)..clone_groups.len() {
                if !keep[j] {
                    continue;
                }
                if is_subset_group(&clone_groups[j], &clone_groups[i]) {
                    keep[j] = false;
                } else if is_subset_group(&clone_groups[i], &clone_groups[j]) {
                    keep[i] = false;
                    break;
                }
            }
        }

        clone_groups
            .into_iter()
            .enumerate()
            .filter(|(i, _)| keep[*i])
            .map(|(_, g)| g)
            .collect()
    }
}
387
388// ── Raw group from LCP extraction ──────────────────────────
389
/// A raw clone group before conversion to `CloneGroup`.
///
/// Produced by the LCP-interval extraction step; carries only token-level
/// coordinates (no line/column information yet).
struct RawGroup {
    /// List of (file_id, token_offset) instances.
    instances: Vec<(usize, usize)>,
    /// Clone length in tokens.
    length: usize,
}
397
398// ── Step 1: Rank reduction ─────────────────────────────────
399
400/// Map all unique token hashes (u64) to consecutive integer ranks (u32).
401///
402/// Returns `ranked_files` where `ranked_files[i]` contains the rank
403/// sequence for `files[i]`.
404fn rank_reduce(files: &[FileData]) -> Vec<Vec<u32>> {
405    // Single-pass: assign ranks on first encounter. The exact rank values
406    // don't matter as long as equal hashes get equal ranks. Skipping the
407    // sort+dedup saves O(N log N) and a second allocation.
408    let total: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
409    let mut hash_to_rank: HashMap<u64, u32> = HashMap::with_capacity(total / 2);
410    let mut next_rank: u32 = 0;
411
412    files
413        .iter()
414        .map(|file| {
415            file.hashed_tokens
416                .iter()
417                .map(|ht| {
418                    *hash_to_rank.entry(ht.hash).or_insert_with(|| {
419                        let r = next_rank;
420                        next_rank += 1;
421                        r
422                    })
423                })
424                .collect()
425        })
426        .collect()
427}
428
429// ── Step 2: Concatenation with sentinels ───────────────────
430
/// Concatenate all ranked token sequences into a single `Vec<i64>`,
/// inserting unique negative sentinel values between files.
///
/// Returns `(text, file_of, file_offsets)` where:
/// - `text` is the concatenated sequence
/// - `file_of[pos]` maps a position in `text` to a file index
///   (`usize::MAX` for sentinel positions)
/// - `file_offsets[file_id]` is the starting position of file `file_id`
///   in `text`
fn concatenate_with_sentinels(ranked_files: &[Vec<u32>]) -> (Vec<i64>, Vec<usize>, Vec<usize>) {
    let file_count = ranked_files.len();
    // One sentinel between each pair of adjacent files, none after the last.
    let sentinel_count = file_count.saturating_sub(1);
    let total_len: usize =
        ranked_files.iter().map(|ranks| ranks.len()).sum::<usize>() + sentinel_count;

    let mut text: Vec<i64> = Vec::with_capacity(total_len);
    let mut file_of: Vec<usize> = Vec::with_capacity(total_len);
    let mut file_offsets: Vec<usize> = Vec::with_capacity(file_count);

    for (file_id, ranks) in ranked_files.iter().enumerate() {
        file_offsets.push(text.len());

        text.extend(ranks.iter().map(|&r| i64::from(r)));
        file_of.extend(std::iter::repeat(file_id).take(ranks.len()));

        // Sentinels run -1, -2, … so every file boundary is distinct and no
        // suffix comparison can match across two different boundaries.
        if file_id + 1 < file_count {
            text.push(-1 - file_id as i64);
            file_of.push(usize::MAX);
        }
    }

    (text, file_of, file_offsets)
}
468
469// ── Step 3: Suffix array construction (prefix doubling) ────
470
471/// Build a suffix array using the O(N log N) prefix-doubling algorithm with
472/// radix sort.
473///
474/// Returns `sa` where `sa[i]` is the starting position of the i-th
475/// lexicographically smallest suffix in `text`.
476fn build_suffix_array(text: &[i64]) -> Vec<usize> {
477    let n = text.len();
478    if n == 0 {
479        return vec![];
480    }
481
482    // Initial ranks based on raw values. Shift so sentinels (negative) sort
483    // before all real tokens.
484    let min_val = text.iter().copied().min().unwrap_or(0);
485    let mut rank: Vec<i64> = text.iter().map(|&v| v - min_val).collect();
486    let mut sa: Vec<usize> = (0..n).collect();
487    let mut tmp: Vec<i64> = vec![0; n];
488    let mut k: usize = 1;
489    let mut iterations = 0u32;
490
491    // Scratch buffers for radix sort (reused across iterations).
492    let mut sa_tmp: Vec<usize> = vec![0; n];
493
494    while k < n {
495        iterations += 1;
496        let max_rank = rank.iter().copied().max().unwrap_or(0) as usize;
497
498        // Two-pass radix sort: sort by secondary key (rank[i+k]) first,
499        // then by primary key (rank[i]). Each pass is O(N + K) where
500        // K = max_rank + 2 (including the -1 sentinel rank).
501        let bucket_count = max_rank + 2; // ranks 0..=max_rank plus -1 mapped to 0
502
503        // Pass 1: sort by secondary key (rank at offset k).
504        let mut counts = vec![0usize; bucket_count + 1];
505        for &i in &sa {
506            let r2 = if i + k < n {
507                rank[i + k] as usize + 1
508            } else {
509                0
510            };
511            counts[r2] += 1;
512        }
513        // Prefix sum.
514        let mut sum = 0;
515        for c in counts.iter_mut() {
516            let v = *c;
517            *c = sum;
518            sum += v;
519        }
520        for &i in &sa {
521            let r2 = if i + k < n {
522                rank[i + k] as usize + 1
523            } else {
524                0
525            };
526            sa_tmp[counts[r2]] = i;
527            counts[r2] += 1;
528        }
529
530        // Pass 2: sort by primary key (rank[i]), stable.
531        // No +1 offset needed here: rank[i] is always >= 0 because the
532        // initial ranks are shifted by min_val, and subsequent iterations
533        // assign ranks starting from 0.
534        counts.iter_mut().for_each(|c| *c = 0);
535        counts.resize(bucket_count + 1, 0);
536        for &i in &sa_tmp {
537            let r1 = rank[i] as usize;
538            counts[r1] += 1;
539        }
540        sum = 0;
541        for c in counts.iter_mut() {
542            let v = *c;
543            *c = sum;
544            sum += v;
545        }
546        for &i in &sa_tmp {
547            let r1 = rank[i] as usize;
548            sa[counts[r1]] = i;
549            counts[r1] += 1;
550        }
551
552        // Compute new ranks.
553        tmp[sa[0]] = 0;
554        for i in 1..n {
555            let prev = sa[i - 1];
556            let curr = sa[i];
557            let same = rank[prev] == rank[curr] && {
558                let rp2 = if prev + k < n { rank[prev + k] } else { -1 };
559                let rc2 = if curr + k < n { rank[curr + k] } else { -1 };
560                rp2 == rc2
561            };
562            tmp[curr] = tmp[prev] + if same { 0 } else { 1 };
563        }
564
565        // Early exit when all ranks are unique.
566        let new_max_rank = tmp[sa[n - 1]];
567        std::mem::swap(&mut rank, &mut tmp);
568
569        if new_max_rank as usize == n - 1 {
570            break;
571        }
572
573        k *= 2;
574    }
575
576    tracing::trace!(n, iterations, "suffix array constructed");
577    sa
578}
579
580// ── Step 4: LCP array (Kasai's algorithm) ──────────────────
581
/// Build the LCP (Longest Common Prefix) array using Kasai's algorithm.
///
/// `lcp[i]` is the length of the longest common prefix between suffixes
/// `sa[i]` and `sa[i-1]`. `lcp[0]` is always 0.
///
/// The LCP computation stops at sentinel boundaries (negative values in
/// `text`) to prevent matches from crossing file boundaries.
fn build_lcp(text: &[i64], sa: &[usize]) -> Vec<usize> {
    let n = sa.len();
    if n == 0 {
        return vec![];
    }

    // Inverse permutation of the suffix array: rank[p] is the position of
    // suffix p within `sa`.
    let mut rank = vec![0usize; n];
    for (i, &p) in sa.iter().enumerate() {
        rank[p] = i;
    }

    let mut lcp = vec![0usize; n];
    // Kasai invariant: the match length for suffix p+1 is at least the match
    // length for suffix p minus one, so `common` is carried between rounds.
    let mut common = 0usize;

    for pos in 0..n {
        let r = rank[pos];
        if r == 0 {
            // Lexicographically smallest suffix has no predecessor.
            common = 0;
            continue;
        }
        let neighbor = sa[r - 1];
        // Extend the match while both symbols exist, are non-sentinel
        // (>= 0), and agree.
        while pos + common < n
            && neighbor + common < n
            && text[pos + common] >= 0
            && text[neighbor + common] >= 0
            && text[pos + common] == text[neighbor + common]
        {
            common += 1;
        }
        lcp[r] = common;
        common = common.saturating_sub(1);
    }

    lcp
}
625
626// ── Step 5: Clone group extraction ─────────────────────────
627
/// Extract clone groups from the suffix array and LCP array.
///
/// Uses a stack-based approach to find all maximal LCP intervals where the
/// minimum LCP value is >= `min_tokens`, and the interval contains suffixes
/// from at least two different positions (cross-file or non-overlapping
/// same-file).
fn extract_clone_groups(
    sa: &[usize],
    lcp: &[usize],
    file_of: &[usize],
    file_offsets: &[usize],
    min_tokens: usize,
    files: &[FileData],
) -> Vec<RawGroup> {
    let n = sa.len();
    if n < 2 {
        return vec![];
    }

    // Stack-based LCP interval extraction.
    // Each stack entry: (lcp_value, start_index_in_sa).
    let mut stack: Vec<(usize, usize)> = Vec::new();
    let mut groups: Vec<RawGroup> = Vec::new();

    #[allow(clippy::needless_range_loop)] // `i` is used as a value, not just as an index
    for i in 1..=n {
        // The virtual `lcp[n] = 0` at i == n flushes the whole stack so every
        // open interval is closed before the loop ends.
        let cur_lcp = if i < n { lcp[i] } else { 0 };
        let mut start = i;

        // Pop every interval whose LCP exceeds the current value: each popped
        // entry is a maximal run of suffixes sharing a prefix of `top_lcp`.
        while let Some(&(top_lcp, top_start)) = stack.last() {
            if top_lcp <= cur_lcp {
                break;
            }
            stack.pop();
            start = top_start;

            if top_lcp >= min_tokens {
                // The interval [start-1 .. i-1] shares a common prefix of
                // length `top_lcp`.
                let interval_begin = start - 1;
                let interval_end = i; // exclusive

                // build_raw_group returns None if fewer than two usable
                // (non-sentinel, non-overlapping) instances remain.
                if let Some(group) = build_raw_group(
                    sa,
                    file_of,
                    file_offsets,
                    files,
                    interval_begin,
                    interval_end,
                    top_lcp,
                ) {
                    groups.push(group);
                }
            }
        }

        if i < n {
            stack.push((cur_lcp, start));
        }
    }

    groups
}
691
692/// Build a `RawGroup` from an LCP interval, filtering to non-overlapping
693/// instances.
694fn build_raw_group(
695    sa: &[usize],
696    file_of: &[usize],
697    file_offsets: &[usize],
698    files: &[FileData],
699    begin: usize,
700    end: usize,
701    length: usize,
702) -> Option<RawGroup> {
703    let mut instances: Vec<(usize, usize)> = Vec::new();
704
705    for &pos in &sa[begin..end] {
706        let fid = file_of[pos];
707        if fid == usize::MAX {
708            continue; // sentinel position
709        }
710        let offset_in_file = pos - file_offsets[fid];
711
712        // Verify the clone doesn't extend beyond the file boundary.
713        if offset_in_file + length > files[fid].hashed_tokens.len() {
714            continue;
715        }
716
717        instances.push((fid, offset_in_file));
718    }
719
720    if instances.len() < 2 {
721        return None;
722    }
723
724    // Remove overlapping instances within the same file.
725    // Sort by (file_id, offset) and remove overlaps.
726    instances.sort_unstable();
727    let mut deduped: Vec<(usize, usize)> = Vec::with_capacity(instances.len());
728    for &(fid, offset) in &instances {
729        if let Some(&(last_fid, last_offset)) = deduped.last()
730            && fid == last_fid
731            && offset < last_offset + length
732        {
733            continue; // overlapping within the same file
734        }
735        deduped.push((fid, offset));
736    }
737
738    if deduped.len() < 2 {
739        return None;
740    }
741
742    Some(RawGroup {
743        instances: deduped,
744        length,
745    })
746}
747
748// ── Utility functions ──────────────────────────────────────
749
750/// Check if all instances of `smaller` overlap with instances of `larger`.
751fn is_subset_group(smaller: &CloneGroup, larger: &CloneGroup) -> bool {
752    smaller.instances.iter().all(|s_inst| {
753        larger.instances.iter().any(|l_inst| {
754            s_inst.file == l_inst.file
755                && s_inst.start_line >= l_inst.start_line
756                && s_inst.end_line <= l_inst.end_line
757        })
758    })
759}
760
761/// Build a `CloneInstance` using a pre-computed line offset table for fast lookup.
762fn build_clone_instance_fast(
763    file: &FileData,
764    token_offset: usize,
765    token_length: usize,
766    line_table: &[usize],
767) -> Option<CloneInstance> {
768    let tokens = &file.hashed_tokens;
769    let source_tokens = &file.file_tokens.tokens;
770
771    if token_offset + token_length > tokens.len() {
772        return None;
773    }
774
775    // Map from hashed token indices back to source token spans.
776    let first_hashed = &tokens[token_offset];
777    let last_hashed = &tokens[token_offset + token_length - 1];
778
779    let first_source = &source_tokens[first_hashed.original_index];
780    let last_source = &source_tokens[last_hashed.original_index];
781
782    let start_byte = first_source.span.start as usize;
783    let end_byte = last_source.span.end as usize;
784
785    // Guard against inverted spans that can occur when normalization reorders
786    // token original_index values for very small windows.
787    if start_byte > end_byte {
788        return None;
789    }
790
791    let source = &file.file_tokens.source;
792    let (start_line, start_col) = byte_offset_to_line_col_fast(source, start_byte, line_table);
793    let (end_line, end_col) = byte_offset_to_line_col_fast(source, end_byte, line_table);
794
795    // Extract the fragment.
796    let fragment = if end_byte <= source.len() {
797        source[start_byte..end_byte].to_string()
798    } else {
799        String::new()
800    };
801
802    Some(CloneInstance {
803        file: file.path.clone(),
804        start_line,
805        end_line,
806        start_col,
807        end_col,
808        fragment,
809    })
810}
811
/// Build a `CloneInstance` from file data and token offset/length.
///
/// Test-only slow-path variant of the fast builder: it recomputes line and
/// column positions with a linear scan instead of a precomputed newline
/// table. Returns `None` for out-of-range windows or inverted spans.
#[cfg(test)]
fn build_clone_instance(
    file: &FileData,
    token_offset: usize,
    token_length: usize,
) -> Option<CloneInstance> {
    let tokens = &file.hashed_tokens;
    let source_tokens = &file.file_tokens.tokens;

    // Reject windows that run past the end of the hashed-token list.
    if token_offset + token_length > tokens.len() {
        return None;
    }

    // Map from hashed token indices back to source token spans.
    let first_hashed = &tokens[token_offset];
    let last_hashed = &tokens[token_offset + token_length - 1];

    let first_source = &source_tokens[first_hashed.original_index];
    let last_source = &source_tokens[last_hashed.original_index];

    let start_byte = first_source.span.start as usize;
    let end_byte = last_source.span.end as usize;

    // Guard against inverted spans that can occur when normalization reorders
    // token original_index values for very small windows.
    if start_byte > end_byte {
        return None;
    }

    let source = &file.file_tokens.source;
    let (start_line, start_col) = byte_offset_to_line_col(source, start_byte);
    let (end_line, end_col) = byte_offset_to_line_col(source, end_byte);

    // Extract the fragment. `str::get` returns None both for an out-of-range
    // span and for offsets inside a multi-byte UTF-8 sequence, so this cannot
    // panic on malformed spans (direct indexing `source[a..b]` could).
    let fragment = source.get(start_byte..end_byte).unwrap_or_default().to_string();

    Some(CloneInstance {
        file: file.path.clone(),
        start_line,
        end_line,
        start_col,
        end_col,
        fragment,
    })
}
862
/// Convert a byte offset into a 1-based line number and 0-based character column
/// using a pre-computed table of newline positions for O(log L) lookup.
///
/// `line_table` holds the byte positions of every `'\n'` in `source`, in
/// ascending order. Offsets past the end of `source` are clamped to its length.
fn byte_offset_to_line_col_fast(
    source: &str,
    byte_offset: usize,
    line_table: &[usize],
) -> (usize, usize) {
    let offset = byte_offset.min(source.len());

    // Newlines strictly before `offset` give the 0-based line index.
    let newlines_before = line_table.partition_point(|&nl| nl < offset);

    // The current line starts right after the previous newline (or at 0).
    let line_start = match newlines_before {
        0 => 0,
        i => line_table[i - 1] + 1,
    };

    // Column counts characters (not bytes) within the current line.
    let column = source[line_start..offset].chars().count();

    (newlines_before + 1, column)
}
882
/// Convert a byte offset into a 1-based line number and 0-based character column.
///
/// Test-only linear-scan fallback; offsets past the end of `source` are
/// clamped to its length.
#[cfg(test)]
fn byte_offset_to_line_col(source: &str, byte_offset: usize) -> (usize, usize) {
    let offset = byte_offset.min(source.len());
    let prefix = &source[..offset];

    // Count newline bytes: `'\n'` never occurs inside a multi-byte UTF-8
    // sequence, so a byte scan matches a char scan.
    let line = prefix.bytes().filter(|&b| b == b'\n').count() + 1;

    // Start of the current line: just past the last newline, or 0.
    let line_start = prefix.rfind('\n').map_or(0, |p| p + 1);

    let col = prefix[line_start..].chars().count();
    (line, col)
}
893
894/// Compute aggregate duplication statistics.
895fn compute_stats(
896    clone_groups: &[CloneGroup],
897    total_files: usize,
898    total_lines: usize,
899    total_tokens: usize,
900) -> DuplicationStats {
901    use std::collections::HashSet;
902    use std::path::Path;
903
904    let mut files_with_clones: HashSet<&Path> = HashSet::new();
905    // Group duplicated lines by file to avoid cloning PathBuf per line.
906    let mut file_dup_lines: HashMap<&Path, HashSet<usize>> = HashMap::new();
907    let mut duplicated_tokens = 0usize;
908    let mut clone_instances = 0usize;
909
910    for group in clone_groups {
911        for instance in &group.instances {
912            files_with_clones.insert(&instance.file);
913            clone_instances += 1;
914            let lines = file_dup_lines.entry(&instance.file).or_default();
915            for line in instance.start_line..=instance.end_line {
916                lines.insert(line);
917            }
918        }
919        // Each instance contributes token_count duplicated tokens,
920        // but only count duplicates (all instances beyond the first).
921        if group.instances.len() > 1 {
922            duplicated_tokens += group.token_count * (group.instances.len() - 1);
923        }
924    }
925
926    let dup_line_count: usize = file_dup_lines.values().map(|s| s.len()).sum();
927    let duplication_percentage = if total_lines > 0 {
928        (dup_line_count as f64 / total_lines as f64) * 100.0
929    } else {
930        0.0
931    };
932
933    // Cap duplicated_tokens to total_tokens to avoid impossible values
934    // when overlapping clone groups double-count the same token positions.
935    let duplicated_tokens = duplicated_tokens.min(total_tokens);
936
937    DuplicationStats {
938        total_files,
939        files_with_clones: files_with_clones.len(),
940        total_lines,
941        duplicated_lines: dup_line_count,
942        total_tokens,
943        duplicated_tokens,
944        clone_groups: clone_groups.len(),
945        clone_instances,
946        duplication_percentage,
947    }
948}
949
950/// Create an empty report when there are no files to analyze.
951fn empty_report(total_files: usize) -> DuplicationReport {
952    DuplicationReport {
953        clone_groups: vec![],
954        stats: DuplicationStats {
955            total_files,
956            files_with_clones: 0,
957            total_lines: 0,
958            duplicated_lines: 0,
959            total_tokens: 0,
960            duplicated_tokens: 0,
961            clone_groups: 0,
962            clone_instances: 0,
963            duplication_percentage: 0.0,
964        },
965    }
966}
967
/// Unit tests for suffix-array construction, LCP computation, rank reduction,
/// clone grouping/filtering, and the statistics and position helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::normalize::HashedToken;
    use crate::duplicates::tokenize::{FileTokens, SourceToken, TokenKind};
    use oxc_span::Span;

    /// Build a `HashedToken` list where token `i` carries `hashes[i]` and
    /// keeps its original position (`original_index == i`).
    fn make_hashed_tokens(hashes: &[u64]) -> Vec<HashedToken> {
        hashes
            .iter()
            .enumerate()
            .map(|(i, &hash)| HashedToken {
                hash,
                original_index: i,
            })
            .collect()
    }

    /// Build `count` synthetic source tokens; token `i` spans bytes
    /// `[i*3, i*3+2)` so spans are disjoint and strictly increasing.
    fn make_source_tokens(count: usize) -> Vec<SourceToken> {
        (0..count)
            .map(|i| SourceToken {
                kind: TokenKind::Identifier(format!("t{i}")),
                span: Span::new((i * 3) as u32, (i * 3 + 2) as u32),
            })
            .collect()
    }

    /// Wrap `source` plus `count` synthetic tokens into a `FileTokens`.
    fn make_file_tokens(source: &str, count: usize) -> FileTokens {
        FileTokens {
            tokens: make_source_tokens(count),
            source: source.to_string(),
            // `lines()` yields 0 for an empty string, but a file always has
            // at least one line.
            line_count: source.lines().count().max(1),
        }
    }

    // ── Existing tests (adapted for CloneDetector) ─────────

    #[test]
    fn empty_input_produces_empty_report() {
        let detector = CloneDetector::new(5, 1, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_no_clones() {
        // All-distinct hashes in a single file: nothing to detect.
        let detector = CloneDetector::new(3, 1, false);
        let hashed = make_hashed_tokens(&[1, 2, 3, 4, 5]);
        let ft = make_file_tokens("a b c d e", 5);
        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);
        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn detects_exact_duplicate_across_files() {
        let detector = CloneDetector::new(3, 1, false);

        // Same token sequence in two files.
        let hashes = vec![10, 20, 30, 40, 50];
        let source_a = "a\nb\nc\nd\ne";
        let source_b = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source_a, 5);
        let ft_b = make_file_tokens(source_b, 5);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect at least one clone group"
        );
    }

    #[test]
    fn no_detection_below_min_tokens() {
        let detector = CloneDetector::new(10, 1, false);

        let hashes = vec![10, 20, 30]; // Only 3 tokens, min is 10
        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens("abc", 3);
        let ft_b = make_file_tokens("abc", 3);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn byte_offset_to_line_col_basic() {
        // Newlines at byte offsets 3 and 7.
        let source = "abc\ndef\nghi";
        assert_eq!(byte_offset_to_line_col(source, 0), (1, 0));
        assert_eq!(byte_offset_to_line_col(source, 4), (2, 0));
        assert_eq!(byte_offset_to_line_col(source, 5), (2, 1));
        assert_eq!(byte_offset_to_line_col(source, 8), (3, 0));
    }

    #[test]
    fn byte_offset_beyond_source() {
        let source = "abc";
        // Should clamp to end of source.
        let (line, col) = byte_offset_to_line_col(source, 100);
        assert_eq!(line, 1);
        assert_eq!(col, 3);
    }

    #[test]
    fn skip_local_filters_same_directory() {
        // skip_local = true: same-directory duplicates must be dropped.
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Same directory -> should be filtered with skip_local.
        let report = detector.detect(vec![
            (PathBuf::from("src/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            report.clone_groups.is_empty(),
            "Same-directory clones should be filtered with skip_local"
        );
    }

    #[test]
    fn skip_local_keeps_cross_directory() {
        // skip_local = true: duplicates in different directories survive.
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Different directories -> should be kept.
        let report = detector.detect(vec![
            (PathBuf::from("src/components/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/utils/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Cross-directory clones should be kept with skip_local"
        );
    }

    #[test]
    fn stats_computation() {
        // One group, two instances of 5 lines / 50 tokens each.
        let groups = vec![CloneGroup {
            instances: vec![
                CloneInstance {
                    file: PathBuf::from("a.ts"),
                    start_line: 1,
                    end_line: 5,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
                CloneInstance {
                    file: PathBuf::from("b.ts"),
                    start_line: 10,
                    end_line: 14,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
            ],
            token_count: 50,
            line_count: 5,
        }];

        let stats = compute_stats(&groups, 10, 200, 1000);
        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.files_with_clones, 2);
        assert_eq!(stats.clone_groups, 1);
        assert_eq!(stats.clone_instances, 2);
        assert_eq!(stats.duplicated_lines, 10); // 5 lines in each of 2 instances
        assert!(stats.duplication_percentage > 0.0);
    }

    // ── New suffix array / LCP tests ───────────────────────

    #[test]
    fn sa_construction_basic() {
        // "banana" encoded as integers: b=1, a=0, n=2
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);

        // Suffixes sorted lexicographically:
        // SA[0] = 5: "a"           (0)
        // SA[1] = 3: "ana"         (0,2,0)
        // SA[2] = 1: "anana"       (0,2,0,2,0)
        // SA[3] = 0: "banana"      (1,0,2,0,2,0)
        // SA[4] = 4: "na"          (2,0)
        // SA[5] = 2: "nana"        (2,0,2,0)
        assert_eq!(sa, vec![5, 3, 1, 0, 4, 2]);
    }

    #[test]
    fn lcp_construction_basic() {
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        // LCP values for "banana":
        // lcp[0] = 0 (by definition)
        // lcp[1] = 1 (LCP of "a" and "ana" = "a" = 1)
        // lcp[2] = 3 (LCP of "ana" and "anana" = "ana" = 3)
        // lcp[3] = 0 (LCP of "anana" and "banana" = "" = 0)
        // lcp[4] = 0 (LCP of "banana" and "na" = "" = 0)
        // lcp[5] = 2 (LCP of "na" and "nana" = "na" = 2)
        assert_eq!(lcp, vec![0, 1, 3, 0, 0, 2]);
    }

    #[test]
    fn lcp_stops_at_sentinels() {
        // Two "files": [0, 1, 2] sentinel [-1] [0, 1, 2]
        let text: Vec<i64> = vec![0, 1, 2, -1, 0, 1, 2];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        // Find the SA positions corresponding to text positions 0 and 4
        // (both start "0 1 2 ..."). LCP should be exactly 3.
        let rank_0 = sa.iter().position(|&s| s == 0).expect("pos 0 in SA");
        let rank_4 = sa.iter().position(|&s| s == 4).expect("pos 4 in SA");
        let (lo, hi) = if rank_0 < rank_4 {
            (rank_0, rank_4)
        } else {
            (rank_4, rank_0)
        };

        // The minimum LCP in the range (lo, hi] gives the LCP between them.
        let min_lcp = lcp[(lo + 1)..=hi].iter().copied().min().unwrap_or(0);
        assert_eq!(
            min_lcp, 3,
            "LCP between identical sequences across sentinel should be 3"
        );
    }

    #[test]
    fn rank_reduction_maps_correctly() {
        // Two files with partially overlapping hash vocabularies.
        let files = vec![
            FileData {
                path: PathBuf::from("a.ts"),
                hashed_tokens: make_hashed_tokens(&[100, 200, 300]),
                file_tokens: make_file_tokens("a b c", 3),
            },
            FileData {
                path: PathBuf::from("b.ts"),
                hashed_tokens: make_hashed_tokens(&[200, 300, 400]),
                file_tokens: make_file_tokens("d e f", 3),
            },
        ];

        let ranked = rank_reduce(&files);

        // Unique hashes: 100, 200, 300, 400 -> ranks 0, 1, 2, 3
        assert_eq!(ranked[0], vec![0, 1, 2]);
        assert_eq!(ranked[1], vec![1, 2, 3]);
    }

    #[test]
    fn three_file_grouping() {
        // Three identical files should be merged into a single 3-instance group.
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)> = (0..3)
            .map(|i| {
                (
                    PathBuf::from(format!("file{i}.ts")),
                    make_hashed_tokens(&hashes),
                    make_file_tokens(source, 5),
                )
            })
            .collect();

        let report = detector.detect(data);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones across 3 identical files"
        );

        // The largest group should contain 3 instances.
        let max_instances = report
            .clone_groups
            .iter()
            .map(|g| g.instances.len())
            .max()
            .unwrap_or(0);
        assert_eq!(
            max_instances, 3,
            "3 identical files should produce a group with 3 instances"
        );
    }

    #[test]
    fn overlapping_clones_largest_wins() {
        let detector = CloneDetector::new(3, 1, false);

        // File A and B: identical 10-token sequences.
        let hashes: Vec<u64> = (1..=10).collect();
        let source = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 10);
        let ft_b = make_file_tokens(source, 10);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(!report.clone_groups.is_empty());
        // The first group (sorted by token_count desc) should cover all 10.
        assert_eq!(
            report.clone_groups[0].token_count, 10,
            "Maximal clone should cover all 10 tokens"
        );
    }

    #[test]
    fn no_self_overlap() {
        let detector = CloneDetector::new(3, 1, false);

        // File with repeated pattern: [1,2,3,1,2,3]
        // The pattern [1,2,3] appears at offset 0 and offset 3.
        let hashes = vec![1, 2, 3, 1, 2, 3];
        // Source must be long enough for synthetic spans: token i has span (i*3, i*3+2).
        // Last token (5) has span (15, 17), so source must be >= 17 bytes.
        // Use a source with enough content spread across distinct lines.
        let source = "aa\nbb\ncc\ndd\nee\nff\ngg";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 6);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        // Verify that no clone instance overlaps with another in the same file.
        for group in &report.clone_groups {
            // Collect (start_line, end_line) ranges per file.
            let mut file_instances: HashMap<&PathBuf, Vec<(usize, usize)>> = HashMap::new();
            for inst in &group.instances {
                file_instances
                    .entry(&inst.file)
                    .or_default()
                    .push((inst.start_line, inst.end_line));
            }
            // After sorting, adjacent ranges must be strictly disjoint.
            for (_file, mut ranges) in file_instances {
                ranges.sort();
                for w in ranges.windows(2) {
                    assert!(
                        w[1].0 > w[0].1,
                        "Clone instances in the same file should not overlap: {:?} and {:?}",
                        w[0],
                        w[1]
                    );
                }
            }
        }
    }

    #[test]
    fn empty_input_edge_case() {
        // Degenerate thresholds (0 tokens / 0 lines) with no input files.
        let detector = CloneDetector::new(0, 0, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_internal_duplication() {
        let detector = CloneDetector::new(3, 1, false);

        // File with a repeated block separated by a different token.
        // [10, 20, 30, 99, 10, 20, 30]
        let hashes = vec![10, 20, 30, 99, 10, 20, 30];
        let source = "a\nb\nc\nx\na\nb\nc";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 7);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        // Should detect the [10, 20, 30] clone at offsets 0 and 4.
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect internal duplication within a single file"
        );
    }
}