// fallow_core/duplicates/detect.rs
//! Suffix Array + LCP based clone detection engine.
//!
//! Uses an O(N log N) prefix-doubling suffix array construction (with radix
//! sort) followed by an O(N) LCP scan. This avoids quadratic pairwise
//! comparisons and naturally finds all maximal clones in a single linear pass.

7use std::collections::{HashMap, HashSet};
8use std::path::PathBuf;
9
10use super::normalize::HashedToken;
11use super::tokenize::FileTokens;
12use super::types::{CloneGroup, CloneInstance, DuplicationReport, DuplicationStats};
13
14/// Data for a single file being analyzed.
15struct FileData {
16    path: PathBuf,
17    hashed_tokens: Vec<HashedToken>,
18    file_tokens: FileTokens,
19}
20
21/// Suffix Array + LCP based clone detection engine.
22///
23/// Concatenates all files' token sequences (separated by unique sentinels),
24/// builds a suffix array and LCP array, then extracts maximal clone groups
25/// from contiguous LCP intervals.
26pub struct CloneDetector {
27    /// Minimum clone size in tokens.
28    min_tokens: usize,
29    /// Minimum clone size in lines.
30    min_lines: usize,
31    /// Only report cross-directory duplicates.
32    skip_local: bool,
33}
34
35impl CloneDetector {
36    /// Create a new detector with the given thresholds.
37    pub fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
38        Self {
39            min_tokens,
40            min_lines,
41            skip_local,
42        }
43    }
44
45    /// Run clone detection across all files.
46    ///
47    /// `file_data` is a list of `(path, hashed_tokens, file_tokens)` tuples,
48    /// one per analyzed file.
49    pub fn detect(
50        &self,
51        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
52    ) -> DuplicationReport {
53        let _span = tracing::info_span!("clone_detect").entered();
54
55        if file_data.is_empty() || self.min_tokens == 0 {
56            return empty_report(0);
57        }
58
59        let files: Vec<FileData> = file_data
60            .into_iter()
61            .map(|(path, hashed_tokens, file_tokens)| FileData {
62                path,
63                hashed_tokens,
64                file_tokens,
65            })
66            .collect();
67
68        // Compute total stats.
69        let total_files = files.len();
70        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
71        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
72
73        tracing::debug!(
74            total_files,
75            total_tokens,
76            total_lines,
77            "clone detection input"
78        );
79
80        // Step 1: Rank reduction — map u64 hashes to consecutive u32 ranks.
81        let t0 = std::time::Instant::now();
82        let ranked_files = rank_reduce(&files);
83        let rank_time = t0.elapsed();
84        let unique_ranks: usize = ranked_files
85            .iter()
86            .flat_map(|f| f.iter())
87            .copied()
88            .max()
89            .map_or(0, |m| m as usize + 1);
90        tracing::debug!(
91            elapsed_us = rank_time.as_micros(),
92            unique_ranks,
93            "step1_rank_reduce"
94        );
95
96        // Step 2: Concatenate with sentinels.
97        let t0 = std::time::Instant::now();
98        let (text, file_of, file_offsets) = concatenate_with_sentinels(&ranked_files);
99        let concat_time = t0.elapsed();
100        tracing::debug!(
101            elapsed_us = concat_time.as_micros(),
102            concat_len = text.len(),
103            "step2_concatenate"
104        );
105
106        if text.is_empty() {
107            return empty_report(total_files);
108        }
109
110        // Step 3: Build suffix array.
111        let t0 = std::time::Instant::now();
112        let sa = build_suffix_array(&text);
113        let sa_time = t0.elapsed();
114        tracing::debug!(
115            elapsed_us = sa_time.as_micros(),
116            n = text.len(),
117            "step3_suffix_array"
118        );
119
120        // Step 4: Build LCP array (Kasai's algorithm, sentinel-aware).
121        let t0 = std::time::Instant::now();
122        let lcp = build_lcp(&text, &sa);
123        let lcp_time = t0.elapsed();
124        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
125
126        // Step 5: Extract clone groups from LCP intervals.
127        let t0 = std::time::Instant::now();
128        let raw_groups =
129            extract_clone_groups(&sa, &lcp, &file_of, &file_offsets, self.min_tokens, &files);
130        let extract_time = t0.elapsed();
131        tracing::debug!(
132            elapsed_us = extract_time.as_micros(),
133            raw_groups = raw_groups.len(),
134            "step5_extract_groups"
135        );
136
137        // Step 6: Build CloneGroup structs with line info, apply filters.
138        let t0 = std::time::Instant::now();
139        let clone_groups = self.build_groups(raw_groups, &files);
140        let build_time = t0.elapsed();
141        tracing::debug!(
142            elapsed_us = build_time.as_micros(),
143            final_groups = clone_groups.len(),
144            "step6_build_groups"
145        );
146
147        // Step 7: Compute stats.
148        let t0 = std::time::Instant::now();
149        let stats = compute_stats(&clone_groups, total_files, total_lines, total_tokens);
150        let stats_time = t0.elapsed();
151        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
152
153        tracing::info!(
154            total_us = (rank_time
155                + concat_time
156                + sa_time
157                + lcp_time
158                + extract_time
159                + build_time
160                + stats_time)
161                .as_micros(),
162            rank_us = rank_time.as_micros(),
163            sa_us = sa_time.as_micros(),
164            lcp_us = lcp_time.as_micros(),
165            extract_us = extract_time.as_micros(),
166            build_us = build_time.as_micros(),
167            stats_us = stats_time.as_micros(),
168            total_tokens,
169            clone_groups = clone_groups.len(),
170            "clone detection complete"
171        );
172
173        DuplicationReport {
174            clone_groups,
175            clone_families: vec![], // Populated by the caller after suppression filtering
176            stats,
177        }
178    }
179
180    /// Convert raw groups into `CloneGroup` structs, applying min_lines and
181    /// skip_local filters, deduplication, and subset removal.
182    fn build_groups(&self, mut raw_groups: Vec<RawGroup>, files: &[FileData]) -> Vec<CloneGroup> {
183        if raw_groups.is_empty() {
184            return Vec::new();
185        }
186
187        // ── Token-level subset removal (cheap) ────────────────
188        //
189        // Sort raw groups by length desc. For each instance (file_id, offset),
190        // track non-overlapping covered intervals per file. A smaller group is
191        // skipped if all its instances fall within already-covered intervals.
192        // This eliminates the vast majority of raw groups before the expensive
193        // line-calculation step.
194        let raw_count = raw_groups.len();
195        raw_groups.sort_by(|a, b| b.length.cmp(&a.length));
196
197        // covered[file_id] is a sorted vec of non-overlapping (start, end)
198        // intervals. Kept sorted by start for binary search.
199        let mut covered: Vec<Vec<(usize, usize)>> = vec![Vec::new(); files.len()];
200        let mut surviving_groups: Vec<RawGroup> = Vec::new();
201
202        for rg in raw_groups {
203            let len = rg.length;
204            // Check if all instances are fully covered by existing intervals.
205            let all_covered = rg.instances.iter().all(|&(fid, offset)| {
206                let intervals = &covered[fid];
207                // Binary search for the interval that could contain [offset, offset+len).
208                let idx = intervals.partition_point(|&(s, _)| s <= offset);
209                if idx > 0 {
210                    let (s, e) = intervals[idx - 1];
211                    offset >= s && offset + len <= e
212                } else {
213                    false
214                }
215            });
216
217            if !all_covered {
218                // Insert covered intervals for this group's instances.
219                for &(fid, offset) in &rg.instances {
220                    let end = offset + len;
221                    let intervals = &mut covered[fid];
222                    let idx = intervals.partition_point(|&(s, _)| s < offset);
223                    // Check if the new interval merges with an existing one.
224                    if idx > 0 {
225                        let prev = &mut intervals[idx - 1];
226                        if prev.1 >= offset {
227                            // Extend previous interval if needed.
228                            if end > prev.1 {
229                                prev.1 = end;
230                            }
231                            continue;
232                        }
233                    }
234                    intervals.insert(idx, (offset, end));
235                }
236                surviving_groups.push(rg);
237            }
238        }
239
240        tracing::trace!(
241            raw = raw_count,
242            surviving = surviving_groups.len(),
243            "token-level subset removal"
244        );
245
246        // ── Pre-compute line offset tables ────────────────────
247        //
248        // For each file, build a sorted vec of newline byte positions so that
249        // byte_offset_to_line_col can use binary search (O(log L)) instead of
250        // linear scan (O(L)).
251        let line_tables: Vec<Vec<usize>> = files
252            .iter()
253            .map(|f| {
254                f.file_tokens
255                    .source
256                    .bytes()
257                    .enumerate()
258                    .filter_map(|(i, b)| if b == b'\n' { Some(i) } else { None })
259                    .collect()
260            })
261            .collect();
262
263        // ── Build CloneGroups for survivors ────────────────────
264        let mut clone_groups: Vec<CloneGroup> = Vec::new();
265
266        for rg in surviving_groups {
267            // Build instances, deduplicating by (file_id, offset).
268            let mut seen: HashSet<(usize, usize)> = HashSet::new();
269            let mut group_instances: Vec<CloneInstance> = Vec::new();
270
271            for &(file_id, offset) in &rg.instances {
272                if !seen.insert((file_id, offset)) {
273                    continue;
274                }
275
276                let file = &files[file_id];
277                if let Some(inst) =
278                    build_clone_instance_fast(file, offset, rg.length, &line_tables[file_id])
279                {
280                    group_instances.push(inst);
281                }
282            }
283
284            // Apply skip_local: only keep cross-directory clones.
285            if self.skip_local && group_instances.len() >= 2 {
286                let dirs: std::collections::HashSet<_> = group_instances
287                    .iter()
288                    .filter_map(|inst| inst.file.parent().map(|p| p.to_path_buf()))
289                    .collect();
290                if dirs.len() < 2 {
291                    continue;
292                }
293            }
294
295            if group_instances.len() < 2 {
296                continue;
297            }
298
299            // Calculate line count from the instances.
300            let line_count = group_instances
301                .iter()
302                .map(|inst| inst.end_line.saturating_sub(inst.start_line) + 1)
303                .max()
304                .unwrap_or(0);
305
306            // Apply minimum line filter.
307            if line_count < self.min_lines {
308                continue;
309            }
310
311            // Sort instances by file path then start line for stable output.
312            group_instances
313                .sort_by(|a, b| a.file.cmp(&b.file).then(a.start_line.cmp(&b.start_line)));
314
315            // Deduplicate instances that map to overlapping line ranges within
316            // the same file (different token offsets can resolve to overlapping
317            // source spans). When two instances overlap, keep the wider one.
318            group_instances.dedup_by(|b, a| {
319                if a.file != b.file {
320                    return false;
321                }
322                // Instances are sorted by start_line. `b` starts at or after `a`.
323                // If b's start overlaps with a's range, merge by extending a.
324                if b.start_line <= a.end_line {
325                    // Keep the wider range in `a`.
326                    if b.end_line > a.end_line {
327                        a.end_line = b.end_line;
328                        a.end_col = b.end_col;
329                    }
330                    true
331                } else {
332                    false
333                }
334            });
335
336            if group_instances.len() < 2 {
337                continue;
338            }
339
340            clone_groups.push(CloneGroup {
341                instances: group_instances,
342                token_count: rg.length,
343                line_count,
344            });
345        }
346
347        // Sort groups by token count (largest first), breaking ties by instance
348        // count (most instances first). This ensures that for equal token counts,
349        // N-way groups come before M-way (M<N) subsets, so subset removal works
350        // correctly regardless of the suffix array's extraction order.
351        clone_groups.sort_by(|a, b| {
352            b.token_count
353                .cmp(&a.token_count)
354                .then(b.instances.len().cmp(&a.instances.len()))
355        });
356
357        // Remove groups whose line ranges are fully contained within another
358        // group's line ranges. Uses a per-file interval index to avoid O(g²×m×n).
359        //
360        // Strategy: iterate groups from largest to smallest. For each kept group,
361        // register its (file, start_line, end_line) spans into a spatial index.
362        // Smaller groups are checked against this index in O(instances × log(intervals)).
363
364        // Build file path → index mapping for interval tracking
365        let mut path_to_idx: HashMap<PathBuf, usize> = HashMap::new();
366        let mut next_idx = 0usize;
367        for group in &clone_groups {
368            for inst in &group.instances {
369                path_to_idx.entry(inst.file.clone()).or_insert_with(|| {
370                    let idx = next_idx;
371                    next_idx += 1;
372                    idx
373                });
374            }
375        }
376        let mut file_intervals: Vec<Vec<(usize, usize)>> = vec![Vec::new(); next_idx];
377        let mut kept_groups: Vec<CloneGroup> = Vec::new();
378
379        for group in clone_groups {
380            // Check if ALL instances of this group are contained within existing intervals
381            let all_contained = group.instances.iter().all(|inst| {
382                let fidx = path_to_idx[&inst.file];
383                let intervals = &file_intervals[fidx];
384                let idx = intervals.partition_point(|&(s, _)| s <= inst.start_line);
385                idx > 0 && {
386                    let (s, e) = intervals[idx - 1];
387                    inst.start_line >= s && inst.end_line <= e
388                }
389            });
390
391            if !all_contained {
392                // Register this group's instances into the spatial index
393                for inst in &group.instances {
394                    let fidx = path_to_idx[&inst.file];
395                    let intervals = &mut file_intervals[fidx];
396                    let idx = intervals.partition_point(|&(s, _)| s < inst.start_line);
397                    if idx > 0 && intervals[idx - 1].1 >= inst.start_line {
398                        if inst.end_line > intervals[idx - 1].1 {
399                            intervals[idx - 1].1 = inst.end_line;
400                        }
401                    } else {
402                        intervals.insert(idx, (inst.start_line, inst.end_line));
403                    }
404                }
405                kept_groups.push(group);
406            }
407        }
408
409        kept_groups
410    }
411}
412
413// ── Raw group from LCP extraction ──────────────────────────
414
415/// A raw clone group before conversion to `CloneGroup`.
416struct RawGroup {
417    /// List of (file_id, token_offset) instances.
418    instances: Vec<(usize, usize)>,
419    /// Clone length in tokens.
420    length: usize,
421}
422
423// ── Step 1: Rank reduction ─────────────────────────────────
424
425/// Map all unique token hashes (u64) to consecutive integer ranks (u32).
426///
427/// Returns `ranked_files` where `ranked_files[i]` contains the rank
428/// sequence for `files[i]`.
429fn rank_reduce(files: &[FileData]) -> Vec<Vec<u32>> {
430    // Single-pass: assign ranks on first encounter. The exact rank values
431    // don't matter as long as equal hashes get equal ranks. Skipping the
432    // sort+dedup saves O(N log N) and a second allocation.
433    let total: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
434    let mut hash_to_rank: HashMap<u64, u32> = HashMap::with_capacity(total / 2);
435    let mut next_rank: u32 = 0;
436
437    files
438        .iter()
439        .map(|file| {
440            file.hashed_tokens
441                .iter()
442                .map(|ht| {
443                    *hash_to_rank.entry(ht.hash).or_insert_with(|| {
444                        let r = next_rank;
445                        next_rank += 1;
446                        r
447                    })
448                })
449                .collect()
450        })
451        .collect()
452}
453
454// ── Step 2: Concatenation with sentinels ───────────────────
455
456/// Concatenate all ranked token sequences into a single `Vec<i64>`,
457/// inserting unique negative sentinel values between files.
458///
459/// Returns `(text, file_of, file_offsets)` where:
460/// - `text` is the concatenated sequence
461/// - `file_of[pos]` maps a position in `text` to a file index
462///   (`usize::MAX` for sentinel positions)
463/// - `file_offsets[file_id]` is the starting position of file `file_id`
464///   in `text`
465fn concatenate_with_sentinels(ranked_files: &[Vec<u32>]) -> (Vec<i64>, Vec<usize>, Vec<usize>) {
466    let sentinel_count = ranked_files.len().saturating_sub(1);
467    let total_len: usize = ranked_files.iter().map(|f| f.len()).sum::<usize>() + sentinel_count;
468
469    let mut text = Vec::with_capacity(total_len);
470    let mut file_of = Vec::with_capacity(total_len);
471    let mut file_offsets = Vec::with_capacity(ranked_files.len());
472
473    let mut sentinel: i64 = -1;
474
475    for (file_id, ranks) in ranked_files.iter().enumerate() {
476        file_offsets.push(text.len());
477
478        for &r in ranks {
479            text.push(r as i64);
480            file_of.push(file_id);
481        }
482
483        // Insert sentinel between files (not after the last one).
484        if file_id + 1 < ranked_files.len() {
485            text.push(sentinel);
486            file_of.push(usize::MAX);
487            sentinel -= 1;
488        }
489    }
490
491    (text, file_of, file_offsets)
492}
493
494// ── Step 3: Suffix array construction (prefix doubling) ────
495
496/// Build a suffix array using the O(N log N) prefix-doubling algorithm with
497/// radix sort.
498///
499/// Returns `sa` where `sa[i]` is the starting position of the i-th
500/// lexicographically smallest suffix in `text`.
501fn build_suffix_array(text: &[i64]) -> Vec<usize> {
502    let n = text.len();
503    if n == 0 {
504        return vec![];
505    }
506
507    // Initial ranks based on raw values. Shift so sentinels (negative) sort
508    // before all real tokens.
509    let min_val = text.iter().copied().min().unwrap_or(0);
510    let mut rank: Vec<i64> = text.iter().map(|&v| v - min_val).collect();
511    let mut sa: Vec<usize> = (0..n).collect();
512    let mut tmp: Vec<i64> = vec![0; n];
513    let mut k: usize = 1;
514    let mut iterations = 0u32;
515
516    // Scratch buffers for radix sort (reused across iterations).
517    let mut sa_tmp: Vec<usize> = vec![0; n];
518
519    while k < n {
520        iterations += 1;
521        let max_rank = rank.iter().copied().max().unwrap_or(0) as usize;
522
523        // Two-pass radix sort: sort by secondary key (rank[i+k]) first,
524        // then by primary key (rank[i]). Each pass is O(N + K) where
525        // K = max_rank + 2 (including the -1 sentinel rank).
526        let bucket_count = max_rank + 2; // ranks 0..=max_rank plus -1 mapped to 0
527
528        // Pass 1: sort by secondary key (rank at offset k).
529        let mut counts = vec![0usize; bucket_count + 1];
530        for &i in &sa {
531            let r2 = if i + k < n {
532                rank[i + k] as usize + 1
533            } else {
534                0
535            };
536            counts[r2] += 1;
537        }
538        // Prefix sum.
539        let mut sum = 0;
540        for c in counts.iter_mut() {
541            let v = *c;
542            *c = sum;
543            sum += v;
544        }
545        for &i in &sa {
546            let r2 = if i + k < n {
547                rank[i + k] as usize + 1
548            } else {
549                0
550            };
551            sa_tmp[counts[r2]] = i;
552            counts[r2] += 1;
553        }
554
555        // Pass 2: sort by primary key (rank[i]), stable.
556        // No +1 offset needed here: rank[i] is always >= 0 because the
557        // initial ranks are shifted by min_val, and subsequent iterations
558        // assign ranks starting from 0.
559        counts.iter_mut().for_each(|c| *c = 0);
560        counts.resize(bucket_count + 1, 0);
561        for &i in &sa_tmp {
562            let r1 = rank[i] as usize;
563            counts[r1] += 1;
564        }
565        sum = 0;
566        for c in counts.iter_mut() {
567            let v = *c;
568            *c = sum;
569            sum += v;
570        }
571        for &i in &sa_tmp {
572            let r1 = rank[i] as usize;
573            sa[counts[r1]] = i;
574            counts[r1] += 1;
575        }
576
577        // Compute new ranks.
578        tmp[sa[0]] = 0;
579        for i in 1..n {
580            let prev = sa[i - 1];
581            let curr = sa[i];
582            let same = rank[prev] == rank[curr] && {
583                let rp2 = if prev + k < n { rank[prev + k] } else { -1 };
584                let rc2 = if curr + k < n { rank[curr + k] } else { -1 };
585                rp2 == rc2
586            };
587            tmp[curr] = tmp[prev] + if same { 0 } else { 1 };
588        }
589
590        // Early exit when all ranks are unique.
591        let new_max_rank = tmp[sa[n - 1]];
592        std::mem::swap(&mut rank, &mut tmp);
593
594        if new_max_rank as usize == n - 1 {
595            break;
596        }
597
598        k *= 2;
599    }
600
601    tracing::trace!(n, iterations, "suffix array constructed");
602    sa
603}
604
605// ── Step 4: LCP array (Kasai's algorithm) ──────────────────
606
607/// Build the LCP (Longest Common Prefix) array using Kasai's algorithm.
608///
609/// `lcp[i]` is the length of the longest common prefix between suffixes
610/// `sa[i]` and `sa[i-1]`. `lcp[0]` is always 0.
611///
612/// The LCP computation stops at sentinel boundaries (negative values in
613/// `text`) to prevent matches from crossing file boundaries.
614fn build_lcp(text: &[i64], sa: &[usize]) -> Vec<usize> {
615    let n = sa.len();
616    if n == 0 {
617        return vec![];
618    }
619
620    let mut rank = vec![0usize; n];
621    for i in 0..n {
622        rank[sa[i]] = i;
623    }
624
625    let mut lcp = vec![0usize; n];
626    let mut k: usize = 0;
627
628    for i in 0..n {
629        if rank[i] == 0 {
630            k = 0;
631            continue;
632        }
633        let j = sa[rank[i] - 1];
634        while i + k < n && j + k < n {
635            // Stop at sentinels (negative values).
636            if text[i + k] < 0 || text[j + k] < 0 {
637                break;
638            }
639            if text[i + k] != text[j + k] {
640                break;
641            }
642            k += 1;
643        }
644        lcp[rank[i]] = k;
645        k = k.saturating_sub(1);
646    }
647
648    lcp
649}
650
651// ── Step 5: Clone group extraction ─────────────────────────
652
653/// Extract clone groups from the suffix array and LCP array.
654///
655/// Uses a stack-based approach to find all maximal LCP intervals where the
656/// minimum LCP value is >= `min_tokens`, and the interval contains suffixes
657/// from at least two different positions (cross-file or non-overlapping
658/// same-file).
659fn extract_clone_groups(
660    sa: &[usize],
661    lcp: &[usize],
662    file_of: &[usize],
663    file_offsets: &[usize],
664    min_tokens: usize,
665    files: &[FileData],
666) -> Vec<RawGroup> {
667    let n = sa.len();
668    if n < 2 {
669        return vec![];
670    }
671
672    // Stack-based LCP interval extraction.
673    // Each stack entry: (lcp_value, start_index_in_sa).
674    let mut stack: Vec<(usize, usize)> = Vec::new();
675    let mut groups: Vec<RawGroup> = Vec::new();
676
677    #[allow(clippy::needless_range_loop)] // `i` is used as a value, not just as an index
678    for i in 1..=n {
679        let cur_lcp = if i < n { lcp[i] } else { 0 };
680        let mut start = i;
681
682        while let Some(&(top_lcp, top_start)) = stack.last() {
683            if top_lcp <= cur_lcp {
684                break;
685            }
686            stack.pop();
687            start = top_start;
688
689            if top_lcp >= min_tokens {
690                // The interval [start-1 .. i-1] shares a common prefix of
691                // length `top_lcp`.
692                let interval_begin = start - 1;
693                let interval_end = i; // exclusive
694
695                if let Some(group) = build_raw_group(
696                    sa,
697                    file_of,
698                    file_offsets,
699                    files,
700                    interval_begin,
701                    interval_end,
702                    top_lcp,
703                ) {
704                    groups.push(group);
705                }
706            }
707        }
708
709        if i < n {
710            stack.push((cur_lcp, start));
711        }
712    }
713
714    groups
715}
716
717/// Build a `RawGroup` from an LCP interval, filtering to non-overlapping
718/// instances.
719fn build_raw_group(
720    sa: &[usize],
721    file_of: &[usize],
722    file_offsets: &[usize],
723    files: &[FileData],
724    begin: usize,
725    end: usize,
726    length: usize,
727) -> Option<RawGroup> {
728    let mut instances: Vec<(usize, usize)> = Vec::new();
729
730    for &pos in &sa[begin..end] {
731        let fid = file_of[pos];
732        if fid == usize::MAX {
733            continue; // sentinel position
734        }
735        let offset_in_file = pos - file_offsets[fid];
736
737        // Verify the clone doesn't extend beyond the file boundary.
738        if offset_in_file + length > files[fid].hashed_tokens.len() {
739            continue;
740        }
741
742        instances.push((fid, offset_in_file));
743    }
744
745    if instances.len() < 2 {
746        return None;
747    }
748
749    // Remove overlapping instances within the same file.
750    // Sort by (file_id, offset) and remove overlaps.
751    instances.sort_unstable();
752    let mut deduped: Vec<(usize, usize)> = Vec::with_capacity(instances.len());
753    for &(fid, offset) in &instances {
754        if let Some(&(last_fid, last_offset)) = deduped.last()
755            && fid == last_fid
756            && offset < last_offset + length
757        {
758            continue; // overlapping within the same file
759        }
760        deduped.push((fid, offset));
761    }
762
763    if deduped.len() < 2 {
764        return None;
765    }
766
767    Some(RawGroup {
768        instances: deduped,
769        length,
770    })
771}
772
773// ── Utility functions ──────────────────────────────────────
774
775/// Check if all instances of `smaller` overlap with instances of `larger`.
776/// Build a `CloneInstance` using a pre-computed line offset table for fast lookup.
777fn build_clone_instance_fast(
778    file: &FileData,
779    token_offset: usize,
780    token_length: usize,
781    line_table: &[usize],
782) -> Option<CloneInstance> {
783    let tokens = &file.hashed_tokens;
784    let source_tokens = &file.file_tokens.tokens;
785
786    if token_offset + token_length > tokens.len() {
787        return None;
788    }
789
790    // Map from hashed token indices back to source token spans.
791    let first_hashed = &tokens[token_offset];
792    let last_hashed = &tokens[token_offset + token_length - 1];
793
794    let first_source = &source_tokens[first_hashed.original_index];
795    let last_source = &source_tokens[last_hashed.original_index];
796
797    let start_byte = first_source.span.start as usize;
798    let end_byte = last_source.span.end as usize;
799
800    // Guard against inverted spans that can occur when normalization reorders
801    // token original_index values for very small windows.
802    if start_byte > end_byte {
803        return None;
804    }
805
806    let source = &file.file_tokens.source;
807    let (start_line, start_col) = byte_offset_to_line_col_fast(source, start_byte, line_table);
808    let (end_line, end_col) = byte_offset_to_line_col_fast(source, end_byte, line_table);
809
810    // Extract the fragment.
811    let fragment = if end_byte <= source.len() {
812        source[start_byte..end_byte].to_string()
813    } else {
814        String::new()
815    };
816
817    Some(CloneInstance {
818        file: file.path.clone(),
819        start_line,
820        end_line,
821        start_col,
822        end_col,
823        fragment,
824    })
825}
826
827/// Convert a byte offset into a 1-based line number and 0-based character column
828/// using a pre-computed table of newline positions for O(log L) lookup.
829fn byte_offset_to_line_col_fast(
830    source: &str,
831    byte_offset: usize,
832    line_table: &[usize],
833) -> (usize, usize) {
834    let offset = byte_offset.min(source.len());
835    // Binary search: find the number of newlines before this offset.
836    let line_idx = line_table.partition_point(|&nl_pos| nl_pos < offset);
837    let line = line_idx + 1; // 1-based
838    let line_start = if line_idx == 0 {
839        0
840    } else {
841        line_table[line_idx - 1] + 1
842    };
843    let col = source[line_start..offset].chars().count();
844    (line, col)
845}
846
847/// Convert a byte offset into a 1-based line number and 0-based character column.
848#[cfg(test)]
849fn byte_offset_to_line_col(source: &str, byte_offset: usize) -> (usize, usize) {
850    let offset = byte_offset.min(source.len());
851    let before = &source[..offset];
852    let line = before.matches('\n').count() + 1;
853    let line_start = before.rfind('\n').map_or(0, |pos| pos + 1);
854    let col = before[line_start..].chars().count();
855    (line, col)
856}
857
858/// Compute aggregate duplication statistics.
859fn compute_stats(
860    clone_groups: &[CloneGroup],
861    total_files: usize,
862    total_lines: usize,
863    total_tokens: usize,
864) -> DuplicationStats {
865    use std::collections::HashSet;
866    use std::path::Path;
867
868    let mut files_with_clones: HashSet<&Path> = HashSet::new();
869    // Group duplicated lines by file to avoid cloning PathBuf per line.
870    let mut file_dup_lines: HashMap<&Path, HashSet<usize>> = HashMap::new();
871    let mut duplicated_tokens = 0usize;
872    let mut clone_instances = 0usize;
873
874    for group in clone_groups {
875        for instance in &group.instances {
876            files_with_clones.insert(&instance.file);
877            clone_instances += 1;
878            let lines = file_dup_lines.entry(&instance.file).or_default();
879            for line in instance.start_line..=instance.end_line {
880                lines.insert(line);
881            }
882        }
883        // Each instance contributes token_count duplicated tokens,
884        // but only count duplicates (all instances beyond the first).
885        if group.instances.len() > 1 {
886            duplicated_tokens += group.token_count * (group.instances.len() - 1);
887        }
888    }
889
890    let dup_line_count: usize = file_dup_lines.values().map(|s| s.len()).sum();
891    let duplication_percentage = if total_lines > 0 {
892        (dup_line_count as f64 / total_lines as f64) * 100.0
893    } else {
894        0.0
895    };
896
897    // Cap duplicated_tokens to total_tokens to avoid impossible values
898    // when overlapping clone groups double-count the same token positions.
899    let duplicated_tokens = duplicated_tokens.min(total_tokens);
900
901    DuplicationStats {
902        total_files,
903        files_with_clones: files_with_clones.len(),
904        total_lines,
905        duplicated_lines: dup_line_count,
906        total_tokens,
907        duplicated_tokens,
908        clone_groups: clone_groups.len(),
909        clone_instances,
910        duplication_percentage,
911    }
912}
913
914/// Create an empty report when there are no files to analyze.
915fn empty_report(total_files: usize) -> DuplicationReport {
916    DuplicationReport {
917        clone_groups: vec![],
918        clone_families: vec![],
919        stats: DuplicationStats {
920            total_files,
921            files_with_clones: 0,
922            total_lines: 0,
923            duplicated_lines: 0,
924            total_tokens: 0,
925            duplicated_tokens: 0,
926            clone_groups: 0,
927            clone_instances: 0,
928            duplication_percentage: 0.0,
929        },
930    }
931}
932
#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::normalize::HashedToken;
    use crate::duplicates::tokenize::{FileTokens, SourceToken, TokenKind};
    use oxc_span::Span;

    /// Build `HashedToken`s from raw hash values; `original_index` is the
    /// position of each hash in the input slice.
    fn make_hashed_tokens(hashes: &[u64]) -> Vec<HashedToken> {
        hashes
            .iter()
            .enumerate()
            .map(|(i, &hash)| HashedToken {
                hash,
                original_index: i,
            })
            .collect()
    }

    /// Build `count` synthetic source tokens; token `i` gets the span
    /// `(i*3, i*3+2)`, so a source backing `count` tokens must be at least
    /// `(count-1)*3 + 2` bytes long.
    fn make_source_tokens(count: usize) -> Vec<SourceToken> {
        (0..count)
            .map(|i| SourceToken {
                kind: TokenKind::Identifier(format!("t{i}")),
                span: Span::new((i * 3) as u32, (i * 3 + 2) as u32),
            })
            .collect()
    }

    /// Bundle a source string with `count` synthetic tokens into `FileTokens`.
    fn make_file_tokens(source: &str, count: usize) -> FileTokens {
        FileTokens {
            tokens: make_source_tokens(count),
            source: source.to_string(),
            // `line_count` is at least 1 even for an empty source.
            line_count: source.lines().count().max(1),
        }
    }

    // ── Existing tests (adapted for CloneDetector) ─────────

    #[test]
    fn empty_input_produces_empty_report() {
        let detector = CloneDetector::new(5, 1, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_no_clones() {
        let detector = CloneDetector::new(3, 1, false);
        // All-distinct hashes: no repeated subsequence, so no clones.
        let hashed = make_hashed_tokens(&[1, 2, 3, 4, 5]);
        let ft = make_file_tokens("a b c d e", 5);
        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);
        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn detects_exact_duplicate_across_files() {
        let detector = CloneDetector::new(3, 1, false);

        // Same token sequence in two files.
        let hashes = vec![10, 20, 30, 40, 50];
        let source_a = "a\nb\nc\nd\ne";
        let source_b = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source_a, 5);
        let ft_b = make_file_tokens(source_b, 5);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect at least one clone group"
        );
    }

    #[test]
    fn no_detection_below_min_tokens() {
        let detector = CloneDetector::new(10, 1, false);

        let hashes = vec![10, 20, 30]; // Only 3 tokens, min is 10
        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens("abc", 3);
        let ft_b = make_file_tokens("abc", 3);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn byte_offset_to_line_col_basic() {
        // Line boundaries: 'a'=0..3, '\n'=3, 'd'=4..7, '\n'=7, 'g'=8..11.
        let source = "abc\ndef\nghi";
        assert_eq!(byte_offset_to_line_col(source, 0), (1, 0));
        assert_eq!(byte_offset_to_line_col(source, 4), (2, 0));
        assert_eq!(byte_offset_to_line_col(source, 5), (2, 1));
        assert_eq!(byte_offset_to_line_col(source, 8), (3, 0));
    }

    #[test]
    fn byte_offset_beyond_source() {
        let source = "abc";
        // Should clamp to end of source.
        let (line, col) = byte_offset_to_line_col(source, 100);
        assert_eq!(line, 1);
        assert_eq!(col, 3);
    }

    #[test]
    fn skip_local_filters_same_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Same directory -> should be filtered with skip_local.
        let report = detector.detect(vec![
            (PathBuf::from("src/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            report.clone_groups.is_empty(),
            "Same-directory clones should be filtered with skip_local"
        );
    }

    #[test]
    fn skip_local_keeps_cross_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Different directories -> should be kept.
        let report = detector.detect(vec![
            (PathBuf::from("src/components/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/utils/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Cross-directory clones should be kept with skip_local"
        );
    }

    #[test]
    fn stats_computation() {
        // One group, two instances of 5 lines / 50 tokens each.
        let groups = vec![CloneGroup {
            instances: vec![
                CloneInstance {
                    file: PathBuf::from("a.ts"),
                    start_line: 1,
                    end_line: 5,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
                CloneInstance {
                    file: PathBuf::from("b.ts"),
                    start_line: 10,
                    end_line: 14,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
            ],
            token_count: 50,
            line_count: 5,
        }];

        let stats = compute_stats(&groups, 10, 200, 1000);
        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.files_with_clones, 2);
        assert_eq!(stats.clone_groups, 1);
        assert_eq!(stats.clone_instances, 2);
        assert_eq!(stats.duplicated_lines, 10); // 5 lines in each of 2 instances
        assert!(stats.duplication_percentage > 0.0);
    }

    // ── New suffix array / LCP tests ───────────────────────

    #[test]
    fn sa_construction_basic() {
        // "banana" encoded as integers: b=1, a=0, n=2
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);

        // Suffixes sorted lexicographically:
        // SA[0] = 5: "a"           (0)
        // SA[1] = 3: "ana"         (0,2,0)
        // SA[2] = 1: "anana"       (0,2,0,2,0)
        // SA[3] = 0: "banana"      (1,0,2,0,2,0)
        // SA[4] = 4: "na"          (2,0)
        // SA[5] = 2: "nana"        (2,0,2,0)
        assert_eq!(sa, vec![5, 3, 1, 0, 4, 2]);
    }

    #[test]
    fn lcp_construction_basic() {
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        // LCP values for "banana":
        // lcp[0] = 0 (by definition)
        // lcp[1] = 1 (LCP of "a" and "ana" = "a" = 1)
        // lcp[2] = 3 (LCP of "ana" and "anana" = "ana" = 3)
        // lcp[3] = 0 (LCP of "anana" and "banana" = "" = 0)
        // lcp[4] = 0 (LCP of "banana" and "na" = "" = 0)
        // lcp[5] = 2 (LCP of "na" and "nana" = "na" = 2)
        assert_eq!(lcp, vec![0, 1, 3, 0, 0, 2]);
    }

    #[test]
    fn lcp_stops_at_sentinels() {
        // Two "files": [0, 1, 2] sentinel [-1] [0, 1, 2]
        let text: Vec<i64> = vec![0, 1, 2, -1, 0, 1, 2];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        // Find the SA positions corresponding to text positions 0 and 4
        // (both start "0 1 2 ..."). LCP should be exactly 3.
        let rank_0 = sa.iter().position(|&s| s == 0).expect("pos 0 in SA");
        let rank_4 = sa.iter().position(|&s| s == 4).expect("pos 4 in SA");
        let (lo, hi) = if rank_0 < rank_4 {
            (rank_0, rank_4)
        } else {
            (rank_4, rank_0)
        };

        // The minimum LCP in the range (lo, hi] gives the LCP between them.
        let min_lcp = lcp[(lo + 1)..=hi].iter().copied().min().unwrap_or(0);
        assert_eq!(
            min_lcp, 3,
            "LCP between identical sequences across sentinel should be 3"
        );
    }

    #[test]
    fn rank_reduction_maps_correctly() {
        // Two files with overlapping hash vocabularies; rank_reduce should
        // assign the same rank to the same hash in both files.
        let files = vec![
            FileData {
                path: PathBuf::from("a.ts"),
                hashed_tokens: make_hashed_tokens(&[100, 200, 300]),
                file_tokens: make_file_tokens("a b c", 3),
            },
            FileData {
                path: PathBuf::from("b.ts"),
                hashed_tokens: make_hashed_tokens(&[200, 300, 400]),
                file_tokens: make_file_tokens("d e f", 3),
            },
        ];

        let ranked = rank_reduce(&files);

        // Unique hashes: 100, 200, 300, 400 -> ranks 0, 1, 2, 3
        assert_eq!(ranked[0], vec![0, 1, 2]);
        assert_eq!(ranked[1], vec![1, 2, 3]);
    }

    #[test]
    fn three_file_grouping() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        // Three identical files -> one clone group with three instances.
        let data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)> = (0..3)
            .map(|i| {
                (
                    PathBuf::from(format!("file{i}.ts")),
                    make_hashed_tokens(&hashes),
                    make_file_tokens(source, 5),
                )
            })
            .collect();

        let report = detector.detect(data);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones across 3 identical files"
        );

        // The largest group should contain 3 instances.
        let max_instances = report
            .clone_groups
            .iter()
            .map(|g| g.instances.len())
            .max()
            .unwrap_or(0);
        assert_eq!(
            max_instances, 3,
            "3 identical files should produce a group with 3 instances"
        );
    }

    #[test]
    fn overlapping_clones_largest_wins() {
        let detector = CloneDetector::new(3, 1, false);

        // File A and B: identical 10-token sequences.
        let hashes: Vec<u64> = (1..=10).collect();
        let source = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 10);
        let ft_b = make_file_tokens(source, 10);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(!report.clone_groups.is_empty());
        // The first group (sorted by token_count desc) should cover all 10.
        assert_eq!(
            report.clone_groups[0].token_count, 10,
            "Maximal clone should cover all 10 tokens"
        );
    }

    #[test]
    fn no_self_overlap() {
        let detector = CloneDetector::new(3, 1, false);

        // File with repeated pattern: [1,2,3,1,2,3]
        // The pattern [1,2,3] appears at offset 0 and offset 3.
        let hashes = vec![1, 2, 3, 1, 2, 3];
        // Source must be long enough for synthetic spans: token i has span (i*3, i*3+2).
        // Last token (5) has span (15, 17), so source must be >= 17 bytes.
        // Use a source with enough content spread across distinct lines.
        let source = "aa\nbb\ncc\ndd\nee\nff\ngg";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 6);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        // Verify that no clone instance overlaps with another in the same file.
        for group in &report.clone_groups {
            let mut file_instances: HashMap<&PathBuf, Vec<(usize, usize)>> = HashMap::new();
            for inst in &group.instances {
                file_instances
                    .entry(&inst.file)
                    .or_default()
                    .push((inst.start_line, inst.end_line));
            }
            for (_file, mut ranges) in file_instances {
                ranges.sort();
                // Adjacent sorted ranges must not intersect.
                for w in ranges.windows(2) {
                    assert!(
                        w[1].0 > w[0].1,
                        "Clone instances in the same file should not overlap: {:?} and {:?}",
                        w[0],
                        w[1]
                    );
                }
            }
        }
    }

    #[test]
    fn empty_input_edge_case() {
        // min_tokens == 0 takes the early-return path in detect().
        let detector = CloneDetector::new(0, 0, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_internal_duplication() {
        let detector = CloneDetector::new(3, 1, false);

        // File with a repeated block separated by a different token.
        // [10, 20, 30, 99, 10, 20, 30]
        let hashes = vec![10, 20, 30, 99, 10, 20, 30];
        let source = "a\nb\nc\nx\na\nb\nc";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 7);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        // Should detect the [10, 20, 30] clone at offsets 0 and 4.
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect internal duplication within a single file"
        );
    }
}