1use rustc_hash::{FxHashMap, FxHashSet};
8use std::path::PathBuf;
9
10use super::normalize::HashedToken;
11use super::tokenize::FileTokens;
12use super::types::{CloneGroup, CloneInstance, DuplicationReport, DuplicationStats};
13
/// Per-file input bundle carried through the clone-detection pipeline.
struct FileData {
    // Path of the analyzed file (used for reporting and skip_local grouping).
    path: PathBuf,
    // Normalized token hashes for this file, in token-stream order.
    hashed_tokens: Vec<HashedToken>,
    // Original tokens plus source text and line count, for span resolution.
    file_tokens: FileTokens,
}
20
/// Token-based clone detector built on a suffix array over the concatenated,
/// rank-reduced token streams of all input files.
pub struct CloneDetector {
    // Minimum number of consecutive matching tokens for a clone candidate.
    min_tokens: usize,
    // Minimum number of source lines a reported clone group must span.
    min_lines: usize,
    // When true, drop groups whose instances all share one parent directory.
    skip_local: bool,
}
34
impl CloneDetector {
    /// Creates a detector with the given thresholds.
    ///
    /// `min_tokens` of 0 disables detection entirely (see [`Self::detect`]).
    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
        Self {
            min_tokens,
            min_lines,
            skip_local,
        }
    }

    /// Runs the full clone-detection pipeline over pre-tokenized files.
    ///
    /// Steps: rank-reduce token hashes to a dense alphabet, concatenate all
    /// files with unique sentinels, build the suffix array and LCP array,
    /// extract repeated token runs, then filter/merge them into the final
    /// report. Each step is timed and logged via `tracing`.
    pub fn detect(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
    ) -> DuplicationReport {
        let _span = tracing::info_span!("clone_detect").entered();

        // min_tokens == 0 would make every position a "clone"; treat it as off.
        if file_data.is_empty() || self.min_tokens == 0 {
            return empty_report(0);
        }

        let files: Vec<FileData> = file_data
            .into_iter()
            .map(|(path, hashed_tokens, file_tokens)| FileData {
                path,
                hashed_tokens,
                file_tokens,
            })
            .collect();

        let total_files = files.len();
        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();

        tracing::debug!(
            total_files,
            total_tokens,
            total_lines,
            "clone detection input"
        );

        // Step 1: map 64-bit token hashes to dense u32 ranks.
        let t0 = std::time::Instant::now();
        let ranked_files = rank_reduce(&files);
        let rank_time = t0.elapsed();
        // Ranks are assigned densely from 0, so max + 1 == distinct count.
        let unique_ranks: usize = ranked_files
            .iter()
            .flat_map(|f| f.iter())
            .copied()
            .max()
            .map_or(0, |m| m as usize + 1);
        tracing::debug!(
            elapsed_us = rank_time.as_micros(),
            unique_ranks,
            "step1_rank_reduce"
        );

        // Step 2: join files into one text, separated by unique sentinels so
        // no match can span a file boundary.
        let t0 = std::time::Instant::now();
        let (text, file_of, file_offsets) = concatenate_with_sentinels(&ranked_files);
        let concat_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = concat_time.as_micros(),
            concat_len = text.len(),
            "step2_concatenate"
        );

        if text.is_empty() {
            return empty_report(total_files);
        }

        // Step 3: suffix array over the concatenated rank text.
        let t0 = std::time::Instant::now();
        let sa = build_suffix_array(&text);
        let sa_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = sa_time.as_micros(),
            n = text.len(),
            "step3_suffix_array"
        );

        // Step 4: LCP array (common-prefix lengths between adjacent suffixes).
        let t0 = std::time::Instant::now();
        let lcp = build_lcp(&text, &sa);
        let lcp_time = t0.elapsed();
        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");

        // Step 5: turn LCP intervals >= min_tokens into raw candidate groups.
        let t0 = std::time::Instant::now();
        let raw_groups =
            extract_clone_groups(&sa, &lcp, &file_of, &file_offsets, self.min_tokens, &files);
        let extract_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = extract_time.as_micros(),
            raw_groups = raw_groups.len(),
            "step5_extract_groups"
        );

        // Step 6: filter, merge, and convert raw groups to reportable ones.
        let t0 = std::time::Instant::now();
        let clone_groups = self.build_groups(raw_groups, &files);
        let build_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = build_time.as_micros(),
            final_groups = clone_groups.len(),
            "step6_build_groups"
        );

        // Step 7: summary statistics.
        let t0 = std::time::Instant::now();
        let stats = compute_stats(&clone_groups, total_files, total_lines, total_tokens);
        let stats_time = t0.elapsed();
        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");

        tracing::info!(
            total_us = (rank_time
                + concat_time
                + sa_time
                + lcp_time
                + extract_time
                + build_time
                + stats_time)
                .as_micros(),
            rank_us = rank_time.as_micros(),
            sa_us = sa_time.as_micros(),
            lcp_us = lcp_time.as_micros(),
            extract_us = extract_time.as_micros(),
            build_us = build_time.as_micros(),
            stats_us = stats_time.as_micros(),
            total_tokens,
            clone_groups = clone_groups.len(),
            "clone detection complete"
        );

        DuplicationReport {
            clone_groups,
            // NOTE(review): families are apparently populated elsewhere (or
            // not yet implemented) — this stage always emits an empty list.
            clone_families: vec![],
            stats,
        }
    }

    /// Converts raw token-coordinate groups into final `CloneGroup`s:
    /// removes groups subsumed by longer ones (token-level, then line-level),
    /// resolves token offsets to line/column spans, merges overlapping
    /// instances, and applies the `min_lines` / `skip_local` filters.
    fn build_groups(&self, mut raw_groups: Vec<RawGroup>, files: &[FileData]) -> Vec<CloneGroup> {
        if raw_groups.is_empty() {
            return Vec::new();
        }

        let raw_count = raw_groups.len();
        // Longest groups first, so a shorter group fully covered by an
        // already-kept longer one can be discarded.
        raw_groups.sort_by(|a, b| b.length.cmp(&a.length));

        // Per-file sorted token intervals already claimed by kept groups.
        let mut covered: Vec<Vec<(usize, usize)>> = vec![Vec::new(); files.len()];
        let mut surviving_groups: Vec<RawGroup> = Vec::new();

        for rg in raw_groups {
            let len = rg.length;
            // A group is redundant only if EVERY instance lies inside an
            // interval claimed by a previously kept group.
            let all_covered = rg.instances.iter().all(|&(fid, offset)| {
                let intervals = &covered[fid];
                let idx = intervals.partition_point(|&(s, _)| s <= offset);
                if idx > 0 {
                    let (s, e) = intervals[idx - 1];
                    offset >= s && offset + len <= e
                } else {
                    false
                }
            });

            if !all_covered {
                for &(fid, offset) in &rg.instances {
                    let end = offset + len;
                    let intervals = &mut covered[fid];
                    let idx = intervals.partition_point(|&(s, _)| s < offset);
                    if idx > 0 {
                        // Extend the predecessor when it touches/overlaps.
                        let prev = &mut intervals[idx - 1];
                        if prev.1 >= offset {
                            if end > prev.1 {
                                prev.1 = end;
                            }
                            continue;
                        }
                    }
                    intervals.insert(idx, (offset, end));
                }
                surviving_groups.push(rg);
            }
        }

        tracing::trace!(
            raw = raw_count,
            surviving = surviving_groups.len(),
            "token-level subset removal"
        );

        // Byte positions of every '\n' per file, for fast line/col lookup.
        let line_tables: Vec<Vec<usize>> = files
            .iter()
            .map(|f| {
                f.file_tokens
                    .source
                    .bytes()
                    .enumerate()
                    .filter_map(|(i, b)| if b == b'\n' { Some(i) } else { None })
                    .collect()
            })
            .collect();

        let mut clone_groups: Vec<CloneGroup> = Vec::new();

        for rg in surviving_groups {
            let mut seen: FxHashSet<(usize, usize)> = FxHashSet::default();
            let mut group_instances: Vec<CloneInstance> = Vec::new();

            for &(file_id, offset) in &rg.instances {
                // Skip exact duplicate (file, offset) pairs.
                if !seen.insert((file_id, offset)) {
                    continue;
                }

                let file = &files[file_id];
                if let Some(inst) =
                    build_clone_instance_fast(file, offset, rg.length, &line_tables[file_id])
                {
                    group_instances.push(inst);
                }
            }

            // Optionally drop groups confined to a single parent directory.
            if self.skip_local && group_instances.len() >= 2 {
                let dirs: FxHashSet<_> = group_instances
                    .iter()
                    .filter_map(|inst| inst.file.parent().map(|p| p.to_path_buf()))
                    .collect();
                if dirs.len() < 2 {
                    continue;
                }
            }

            if group_instances.len() < 2 {
                continue;
            }

            // The group's line count is that of its longest instance.
            let line_count = group_instances
                .iter()
                .map(|inst| inst.end_line.saturating_sub(inst.start_line) + 1)
                .max()
                .unwrap_or(0);

            if line_count < self.min_lines {
                continue;
            }

            group_instances
                .sort_by(|a, b| a.file.cmp(&b.file).then(a.start_line.cmp(&b.start_line)));

            // Merge same-file instances whose line ranges overlap. Note the
            // dedup_by argument order: `b` is the later element (removed on
            // true), `a` is the kept earlier one.
            group_instances.dedup_by(|b, a| {
                if a.file != b.file {
                    return false;
                }
                if b.start_line <= a.end_line {
                    if b.end_line > a.end_line {
                        a.end_line = b.end_line;
                        a.end_col = b.end_col;
                    }
                    true
                } else {
                    false
                }
            });

            if group_instances.len() < 2 {
                continue;
            }

            clone_groups.push(CloneGroup {
                instances: group_instances,
                token_count: rg.length,
                line_count,
            });
        }

        // Prefer longer groups, then groups with more instances.
        clone_groups.sort_by(|a, b| {
            b.token_count
                .cmp(&a.token_count)
                .then(b.instances.len().cmp(&a.instances.len()))
        });

        // Second subset-removal pass, now in LINE coordinates: drop groups
        // whose every instance is contained in a line range already claimed
        // by a higher-priority group.
        let mut path_to_idx: FxHashMap<PathBuf, usize> = FxHashMap::default();
        let mut next_idx = 0usize;
        for group in &clone_groups {
            for inst in &group.instances {
                path_to_idx.entry(inst.file.clone()).or_insert_with(|| {
                    let idx = next_idx;
                    next_idx += 1;
                    idx
                });
            }
        }
        let mut file_intervals: Vec<Vec<(usize, usize)>> = vec![Vec::new(); next_idx];
        let mut kept_groups: Vec<CloneGroup> = Vec::new();

        for group in clone_groups {
            let all_contained = group.instances.iter().all(|inst| {
                let fidx = path_to_idx[&inst.file];
                let intervals = &file_intervals[fidx];
                let idx = intervals.partition_point(|&(s, _)| s <= inst.start_line);
                idx > 0 && {
                    let (s, e) = intervals[idx - 1];
                    inst.start_line >= s && inst.end_line <= e
                }
            });

            if !all_contained {
                for inst in &group.instances {
                    let fidx = path_to_idx[&inst.file];
                    let intervals = &mut file_intervals[fidx];
                    let idx = intervals.partition_point(|&(s, _)| s < inst.start_line);
                    if idx > 0 && intervals[idx - 1].1 >= inst.start_line {
                        if inst.end_line > intervals[idx - 1].1 {
                            intervals[idx - 1].1 = inst.end_line;
                        }
                    } else {
                        intervals.insert(idx, (inst.start_line, inst.end_line));
                    }
                }
                kept_groups.push(group);
            }
        }

        kept_groups
    }
}
412
/// Intermediate clone group expressed in token coordinates, before
/// line/column resolution and filtering.
struct RawGroup {
    // `(file_index, token_offset)` of each occurrence.
    instances: Vec<(usize, usize)>,
    // Match length in tokens.
    length: usize,
}
422
423fn rank_reduce(files: &[FileData]) -> Vec<Vec<u32>> {
430 let total: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
434 let mut hash_to_rank: FxHashMap<u64, u32> =
435 FxHashMap::with_capacity_and_hasher(total / 2, rustc_hash::FxBuildHasher);
436 let mut next_rank: u32 = 0;
437
438 files
439 .iter()
440 .map(|file| {
441 file.hashed_tokens
442 .iter()
443 .map(|ht| {
444 *hash_to_rank.entry(ht.hash).or_insert_with(|| {
445 let r = next_rank;
446 next_rank += 1;
447 r
448 })
449 })
450 .collect()
451 })
452 .collect()
453}
454
/// Joins all per-file rank sequences into one `i64` text, separating
/// consecutive files with unique negative sentinel values so that no suffix
/// match can cross a file boundary.
///
/// Returns `(text, file_of, file_offsets)`:
/// - `file_of[i]` is the file id owning position `i` (`usize::MAX` for
///   sentinel positions),
/// - `file_offsets[f]` is the start position of file `f` within `text`.
fn concatenate_with_sentinels(ranked_files: &[Vec<u32>]) -> (Vec<i64>, Vec<usize>, Vec<usize>) {
    // One sentinel between each adjacent pair of files (none after the last).
    let sentinel_count = ranked_files.len().saturating_sub(1);
    let total_len = ranked_files.iter().map(Vec::len).sum::<usize>() + sentinel_count;

    let mut text: Vec<i64> = Vec::with_capacity(total_len);
    let mut file_of: Vec<usize> = Vec::with_capacity(total_len);
    let mut file_offsets: Vec<usize> = Vec::with_capacity(ranked_files.len());

    for (file_id, ranks) in ranked_files.iter().enumerate() {
        file_offsets.push(text.len());

        text.extend(ranks.iter().map(|&r| i64::from(r)));
        file_of.extend(std::iter::repeat(file_id).take(ranks.len()));

        if file_id + 1 < ranked_files.len() {
            // Boundaries get distinct values -1, -2, ... so two sentinels
            // never compare equal during suffix sorting.
            text.push(-1 - (file_id as i64));
            file_of.push(usize::MAX);
        }
    }

    (text, file_of, file_offsets)
}
494
/// Builds a suffix array over `text` by prefix doubling, with two stable
/// counting-sort passes per round (O(n log n) overall).
///
/// `text` may contain negative sentinel values; they sort like any other
/// value here and are excluded from matches later by the LCP computation.
fn build_suffix_array(text: &[i64]) -> Vec<usize> {
    let n = text.len();
    if n == 0 {
        return vec![];
    }

    // Shift all values to be non-negative so they can index count buckets.
    let min_val = text.iter().copied().min().unwrap_or(0);
    let mut rank: Vec<i64> = text.iter().map(|&v| v - min_val).collect();
    let mut sa: Vec<usize> = (0..n).collect();
    let mut tmp: Vec<i64> = vec![0; n];
    let mut k: usize = 1;
    let mut iterations = 0u32;

    // Scratch array reused across rounds to avoid re-allocation.
    let mut sa_tmp: Vec<usize> = vec![0; n];

    while k < n {
        iterations += 1;
        let max_rank = rank.iter().copied().max().unwrap_or(0) as usize;

        // Pass 1: stable counting sort by the secondary key rank[i + k].
        // Suffixes with no position i + k use bucket 0 (they sort first).
        let bucket_count = max_rank + 2;
        let mut counts = vec![0usize; bucket_count + 1];
        for &i in &sa {
            let r2 = if i + k < n {
                rank[i + k] as usize + 1
            } else {
                0
            };
            counts[r2] += 1;
        }
        // Exclusive prefix sums turn counts into bucket start positions.
        let mut sum = 0;
        for c in &mut counts {
            let v = *c;
            *c = sum;
            sum += v;
        }
        for &i in &sa {
            let r2 = if i + k < n {
                rank[i + k] as usize + 1
            } else {
                0
            };
            sa_tmp[counts[r2]] = i;
            counts[r2] += 1;
        }

        // Pass 2: stable counting sort by the primary key rank[i]; stability
        // preserves the secondary ordering established in pass 1.
        counts.fill(0);
        counts.resize(bucket_count + 1, 0);
        for &i in &sa_tmp {
            let r1 = rank[i] as usize;
            counts[r1] += 1;
        }
        sum = 0;
        for c in &mut counts {
            let v = *c;
            *c = sum;
            sum += v;
        }
        for &i in &sa_tmp {
            let r1 = rank[i] as usize;
            sa[counts[r1]] = i;
            counts[r1] += 1;
        }

        // Recompute ranks for length-2k prefixes: adjacent suffixes share a
        // rank iff both their primary and secondary keys match.
        tmp[sa[0]] = 0;
        for i in 1..n {
            let prev = sa[i - 1];
            let curr = sa[i];
            let same = rank[prev] == rank[curr] && {
                let rp2 = if prev + k < n { rank[prev + k] } else { -1 };
                let rc2 = if curr + k < n { rank[curr + k] } else { -1 };
                rp2 == rc2
            };
            tmp[curr] = tmp[prev] + i64::from(!same);
        }

        let new_max_rank = tmp[sa[n - 1]];
        std::mem::swap(&mut rank, &mut tmp);

        // All ranks distinct: the order is fully determined; stop early.
        if new_max_rank as usize == n - 1 {
            break;
        }

        k *= 2;
    }

    tracing::trace!(n, iterations, "suffix array constructed");
    sa
}
605
/// Computes the LCP array for `sa` over `text` using Kasai's algorithm.
///
/// `lcp[i]` is the length of the longest common prefix of the suffixes at
/// `sa[i - 1]` and `sa[i]` (`lcp[0]` is 0). Negative values in `text` are
/// file-separator sentinels and never participate in a common prefix, so
/// matches cannot cross file boundaries.
fn build_lcp(text: &[i64], sa: &[usize]) -> Vec<usize> {
    let n = sa.len();
    if n == 0 {
        return vec![];
    }

    // Inverse permutation: slot of each suffix within the suffix array.
    let mut inv = vec![0usize; n];
    for (slot, &suffix) in sa.iter().enumerate() {
        inv[suffix] = slot;
    }

    let mut lcp = vec![0usize; n];
    let mut matched: usize = 0;

    // Process suffixes in text order; the match length shrinks by at most
    // one per step (Kasai invariant), giving overall O(n).
    for suffix in 0..n {
        let slot = inv[suffix];
        if slot == 0 {
            // First suffix in sorted order has no predecessor.
            matched = 0;
            continue;
        }
        let prev = sa[slot - 1];
        while suffix + matched < n
            && prev + matched < n
            && text[suffix + matched] >= 0
            && text[prev + matched] >= 0
            && text[suffix + matched] == text[prev + matched]
        {
            matched += 1;
        }
        lcp[slot] = matched;
        matched = matched.saturating_sub(1);
    }

    lcp
}
651
/// Extracts repeated token runs from the LCP array using a monotonic stack
/// over LCP intervals.
///
/// Each popped stack entry `(top_lcp, top_start)` describes a run of
/// suffix-array slots `[top_start - 1, i)` whose suffixes all share a common
/// prefix of `top_lcp` tokens; runs of at least `min_tokens` become
/// `RawGroup` candidates.
fn extract_clone_groups(
    sa: &[usize],
    lcp: &[usize],
    file_of: &[usize],
    file_offsets: &[usize],
    min_tokens: usize,
    files: &[FileData],
) -> Vec<RawGroup> {
    let n = sa.len();
    if n < 2 {
        return vec![];
    }

    // Stack of (lcp_value, interval_start); lcp values strictly increase
    // from bottom to top.
    let mut stack: Vec<(usize, usize)> = Vec::new();
    let mut groups: Vec<RawGroup> = Vec::new();

    // The virtual trailing position (i == n, lcp 0) flushes the stack.
    #[allow(clippy::needless_range_loop)]
    for i in 1..=n {
        let cur_lcp = if i < n { lcp[i] } else { 0 };
        let mut start = i;

        while let Some(&(top_lcp, top_start)) = stack.last() {
            if top_lcp <= cur_lcp {
                break;
            }
            stack.pop();
            start = top_start;

            if top_lcp >= min_tokens {
                // lcp[j] relates sa[j - 1] and sa[j], so the interval of
                // suffixes sharing the prefix starts one slot earlier.
                let interval_begin = start - 1;
                let interval_end = i;
                if let Some(group) = build_raw_group(
                    sa,
                    file_of,
                    file_offsets,
                    files,
                    interval_begin,
                    interval_end,
                    top_lcp,
                ) {
                    groups.push(group);
                }
            }
        }

        if i < n {
            stack.push((cur_lcp, start));
        }
    }

    groups
}
717
718fn build_raw_group(
721 sa: &[usize],
722 file_of: &[usize],
723 file_offsets: &[usize],
724 files: &[FileData],
725 begin: usize,
726 end: usize,
727 length: usize,
728) -> Option<RawGroup> {
729 let mut instances: Vec<(usize, usize)> = Vec::new();
730
731 for &pos in &sa[begin..end] {
732 let fid = file_of[pos];
733 if fid == usize::MAX {
734 continue; }
736 let offset_in_file = pos - file_offsets[fid];
737
738 if offset_in_file + length > files[fid].hashed_tokens.len() {
740 continue;
741 }
742
743 instances.push((fid, offset_in_file));
744 }
745
746 if instances.len() < 2 {
747 return None;
748 }
749
750 instances.sort_unstable();
753 let mut deduped: Vec<(usize, usize)> = Vec::with_capacity(instances.len());
754 for &(fid, offset) in &instances {
755 if let Some(&(last_fid, last_offset)) = deduped.last()
756 && fid == last_fid
757 && offset < last_offset + length
758 {
759 continue; }
761 deduped.push((fid, offset));
762 }
763
764 if deduped.len() < 2 {
765 return None;
766 }
767
768 Some(RawGroup {
769 instances: deduped,
770 length,
771 })
772}
773
774fn build_clone_instance_fast(
779 file: &FileData,
780 token_offset: usize,
781 token_length: usize,
782 line_table: &[usize],
783) -> Option<CloneInstance> {
784 let tokens = &file.hashed_tokens;
785 let source_tokens = &file.file_tokens.tokens;
786
787 if token_offset + token_length > tokens.len() {
788 return None;
789 }
790
791 let first_hashed = &tokens[token_offset];
793 let last_hashed = &tokens[token_offset + token_length - 1];
794
795 let first_source = &source_tokens[first_hashed.original_index];
796 let last_source = &source_tokens[last_hashed.original_index];
797
798 let start_byte = first_source.span.start as usize;
799 let end_byte = last_source.span.end as usize;
800
801 if start_byte > end_byte {
804 return None;
805 }
806
807 let source = &file.file_tokens.source;
808 let (start_line, start_col) = byte_offset_to_line_col_fast(source, start_byte, line_table);
809 let (end_line, end_col) = byte_offset_to_line_col_fast(source, end_byte, line_table);
810
811 let fragment = if end_byte <= source.len() {
813 let mut sb = start_byte;
814 while sb > 0 && !source.is_char_boundary(sb) {
815 sb -= 1;
816 }
817 let mut eb = end_byte;
818 while eb < source.len() && !source.is_char_boundary(eb) {
819 eb += 1;
820 }
821 source[sb..eb].to_string()
822 } else {
823 String::new()
824 };
825
826 Some(CloneInstance {
827 file: file.path.clone(),
828 start_line,
829 end_line,
830 start_col,
831 end_col,
832 fragment,
833 })
834}
835
/// Translates a byte offset into a 1-based line and 0-based character column
/// using a precomputed table of newline byte positions.
///
/// Offsets past the end of `source`, or inside a multi-byte character, are
/// clamped back to the nearest valid boundary first.
fn byte_offset_to_line_col_fast(
    source: &str,
    byte_offset: usize,
    line_table: &[usize],
) -> (usize, usize) {
    let mut at = byte_offset.min(source.len());
    while at > 0 && !source.is_char_boundary(at) {
        at -= 1;
    }
    // Newlines strictly before `at` give the zero-based line index.
    let line_idx = line_table.partition_point(|&nl| nl < at);
    let line_start = match line_idx {
        0 => 0,
        i => line_table[i - 1] + 1,
    };
    // Columns count chars, not bytes, so a multi-byte char is one column.
    let col = source[line_start..at].chars().count();
    (line_idx + 1, col)
}
859
/// Reference (table-free) byte-offset → line/column conversion, used only by
/// tests to cross-check the fast table-based path.
#[cfg(test)]
fn byte_offset_to_line_col(source: &str, byte_offset: usize) -> (usize, usize) {
    let mut at = byte_offset.min(source.len());
    while at > 0 && !source.is_char_boundary(at) {
        at -= 1;
    }
    let prefix = &source[..at];
    let line = prefix.matches('\n').count() + 1;
    // Column is the char count since the last newline (or start of text).
    let col = match prefix.rfind('\n') {
        Some(nl) => prefix[nl + 1..].chars().count(),
        None => prefix.chars().count(),
    };
    (line, col)
}
873
874fn compute_stats(
876 clone_groups: &[CloneGroup],
877 total_files: usize,
878 total_lines: usize,
879 total_tokens: usize,
880) -> DuplicationStats {
881 use std::path::Path;
882
883 let mut files_with_clones: FxHashSet<&Path> = FxHashSet::default();
884 let mut file_dup_lines: FxHashMap<&Path, FxHashSet<usize>> = FxHashMap::default();
886 let mut duplicated_tokens = 0usize;
887 let mut clone_instances = 0usize;
888
889 for group in clone_groups {
890 for instance in &group.instances {
891 files_with_clones.insert(&instance.file);
892 clone_instances += 1;
893 let lines = file_dup_lines.entry(&instance.file).or_default();
894 for line in instance.start_line..=instance.end_line {
895 lines.insert(line);
896 }
897 }
898 if group.instances.len() > 1 {
901 duplicated_tokens += group.token_count * (group.instances.len() - 1);
902 }
903 }
904
905 let dup_line_count: usize = file_dup_lines.values().map(|s| s.len()).sum();
906 let duplication_percentage = if total_lines > 0 {
907 (dup_line_count as f64 / total_lines as f64) * 100.0
908 } else {
909 0.0
910 };
911
912 let duplicated_tokens = duplicated_tokens.min(total_tokens);
915
916 DuplicationStats {
917 total_files,
918 files_with_clones: files_with_clones.len(),
919 total_lines,
920 duplicated_lines: dup_line_count,
921 total_tokens,
922 duplicated_tokens,
923 clone_groups: clone_groups.len(),
924 clone_instances,
925 duplication_percentage,
926 }
927}
928
/// Builds a report with zeroed statistics, used when there is no input or
/// detection is effectively disabled (`min_tokens == 0`).
const fn empty_report(total_files: usize) -> DuplicationReport {
    DuplicationReport {
        clone_groups: Vec::new(),
        clone_families: Vec::new(),
        stats: DuplicationStats {
            total_files,
            files_with_clones: 0,
            total_lines: 0,
            duplicated_lines: 0,
            total_tokens: 0,
            duplicated_tokens: 0,
            clone_groups: 0,
            clone_instances: 0,
            duplication_percentage: 0.0,
        },
    }
}
947
#[cfg(test)]
#[allow(clippy::disallowed_types)]
mod tests {
    use std::collections::HashMap;

    use super::*;
    use crate::duplicates::normalize::HashedToken;
    use crate::duplicates::tokenize::{FileTokens, SourceToken, TokenKind};
    use oxc_span::Span;

    /// Wraps raw hash values as `HashedToken`s with sequential indices.
    fn make_hashed_tokens(hashes: &[u64]) -> Vec<HashedToken> {
        hashes
            .iter()
            .enumerate()
            .map(|(i, &hash)| HashedToken {
                hash,
                original_index: i,
            })
            .collect()
    }

    /// Builds `count` synthetic identifier tokens with 3-byte-spaced spans.
    fn make_source_tokens(count: usize) -> Vec<SourceToken> {
        (0..count)
            .map(|i| SourceToken {
                kind: TokenKind::Identifier(format!("t{i}")),
                span: Span::new((i * 3) as u32, (i * 3 + 2) as u32),
            })
            .collect()
    }

    /// Bundles synthetic tokens with the given source text.
    fn make_file_tokens(source: &str, count: usize) -> FileTokens {
        FileTokens {
            tokens: make_source_tokens(count),
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        }
    }

    #[test]
    fn empty_input_produces_empty_report() {
        let detector = CloneDetector::new(5, 1, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_no_clones() {
        let detector = CloneDetector::new(3, 1, false);
        let hashed = make_hashed_tokens(&[1, 2, 3, 4, 5]);
        let ft = make_file_tokens("a b c d e", 5);
        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);
        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn detects_exact_duplicate_across_files() {
        let detector = CloneDetector::new(3, 1, false);

        // Two files with identical token streams.
        let hashes = vec![10, 20, 30, 40, 50];
        let source_a = "a\nb\nc\nd\ne";
        let source_b = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source_a, 5);
        let ft_b = make_file_tokens(source_b, 5);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect at least one clone group"
        );
    }

    #[test]
    fn no_detection_below_min_tokens() {
        let detector = CloneDetector::new(10, 1, false);

        // Only 3 matching tokens, below the min_tokens threshold of 10.
        let hashes = vec![10, 20, 30];
        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens("abc", 3);
        let ft_b = make_file_tokens("abc", 3);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn byte_offset_to_line_col_basic() {
        let source = "abc\ndef\nghi";
        assert_eq!(byte_offset_to_line_col(source, 0), (1, 0));
        assert_eq!(byte_offset_to_line_col(source, 4), (2, 0));
        assert_eq!(byte_offset_to_line_col(source, 5), (2, 1));
        assert_eq!(byte_offset_to_line_col(source, 8), (3, 0));
    }

    #[test]
    fn byte_offset_beyond_source() {
        // Out-of-range offsets clamp to the end of the source.
        let source = "abc";
        let (line, col) = byte_offset_to_line_col(source, 100);
        assert_eq!(line, 1);
        assert_eq!(col, 3);
    }

    #[test]
    fn skip_local_filters_same_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Both files live in src/, so skip_local should drop the group.
        let report = detector.detect(vec![
            (PathBuf::from("src/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            report.clone_groups.is_empty(),
            "Same-directory clones should be filtered with skip_local"
        );
    }

    #[test]
    fn skip_local_keeps_cross_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Different parent directories: skip_local must keep the group.
        let report = detector.detect(vec![
            (PathBuf::from("src/components/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/utils/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Cross-directory clones should be kept with skip_local"
        );
    }

    #[test]
    fn stats_computation() {
        let groups = vec![CloneGroup {
            instances: vec![
                CloneInstance {
                    file: PathBuf::from("a.ts"),
                    start_line: 1,
                    end_line: 5,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
                CloneInstance {
                    file: PathBuf::from("b.ts"),
                    start_line: 10,
                    end_line: 14,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
            ],
            token_count: 50,
            line_count: 5,
        }];

        let stats = compute_stats(&groups, 10, 200, 1000);
        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.files_with_clones, 2);
        assert_eq!(stats.clone_groups, 1);
        assert_eq!(stats.clone_instances, 2);
        // 5 lines per instance in two files.
        assert_eq!(stats.duplicated_lines, 10);
        assert!(stats.duplication_percentage > 0.0);
    }

    #[test]
    fn sa_construction_basic() {
        // Suffixes of [1,0,2,0,2,0] in lexicographic order.
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);

        assert_eq!(sa, vec![5, 3, 1, 0, 4, 2]);
    }

    #[test]
    fn lcp_construction_basic() {
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        assert_eq!(lcp, vec![0, 1, 3, 0, 0, 2]);
    }

    #[test]
    fn lcp_stops_at_sentinels() {
        // Identical token runs separated by a -1 sentinel.
        let text: Vec<i64> = vec![0, 1, 2, -1, 0, 1, 2];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        // The LCP between the two identical runs is the minimum LCP over the
        // suffix-array range separating them.
        let rank_0 = sa.iter().position(|&s| s == 0).expect("pos 0 in SA");
        let rank_4 = sa.iter().position(|&s| s == 4).expect("pos 4 in SA");
        let (lo, hi) = if rank_0 < rank_4 {
            (rank_0, rank_4)
        } else {
            (rank_4, rank_0)
        };

        let min_lcp = lcp[(lo + 1)..=hi].iter().copied().min().unwrap_or(0);
        assert_eq!(
            min_lcp, 3,
            "LCP between identical sequences across sentinel should be 3"
        );
    }

    #[test]
    fn rank_reduction_maps_correctly() {
        let files = vec![
            FileData {
                path: PathBuf::from("a.ts"),
                hashed_tokens: make_hashed_tokens(&[100, 200, 300]),
                file_tokens: make_file_tokens("a b c", 3),
            },
            FileData {
                path: PathBuf::from("b.ts"),
                hashed_tokens: make_hashed_tokens(&[200, 300, 400]),
                file_tokens: make_file_tokens("d e f", 3),
            },
        ];

        let ranked = rank_reduce(&files);

        // Shared hashes (200, 300) must map to the same ranks in both files.
        assert_eq!(ranked[0], vec![0, 1, 2]);
        assert_eq!(ranked[1], vec![1, 2, 3]);
    }

    #[test]
    fn three_file_grouping() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)> = (0..3)
            .map(|i| {
                (
                    PathBuf::from(format!("file{i}.ts")),
                    make_hashed_tokens(&hashes),
                    make_file_tokens(source, 5),
                )
            })
            .collect();

        let report = detector.detect(data);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones across 3 identical files"
        );

        let max_instances = report
            .clone_groups
            .iter()
            .map(|g| g.instances.len())
            .max()
            .unwrap_or(0);
        assert_eq!(
            max_instances, 3,
            "3 identical files should produce a group with 3 instances"
        );
    }

    #[test]
    fn overlapping_clones_largest_wins() {
        let detector = CloneDetector::new(3, 1, false);

        // Every sub-run of the 10 tokens matches; only the maximal one
        // should survive subset removal.
        let hashes: Vec<u64> = (1..=10).collect();
        let source = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 10);
        let ft_b = make_file_tokens(source, 10);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(!report.clone_groups.is_empty());
        assert_eq!(
            report.clone_groups[0].token_count, 10,
            "Maximal clone should cover all 10 tokens"
        );
    }

    #[test]
    fn no_self_overlap() {
        let detector = CloneDetector::new(3, 1, false);

        // One file repeating the pattern 1,2,3 twice.
        let hashes = vec![1, 2, 3, 1, 2, 3];
        let source = "aa\nbb\ncc\ndd\nee\nff\ngg";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 6);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        // Instances of a group within one file must have disjoint line ranges.
        for group in &report.clone_groups {
            let mut file_instances: HashMap<&PathBuf, Vec<(usize, usize)>> = HashMap::new();
            for inst in &group.instances {
                file_instances
                    .entry(&inst.file)
                    .or_default()
                    .push((inst.start_line, inst.end_line));
            }
            for (_file, mut ranges) in file_instances {
                ranges.sort();
                for w in ranges.windows(2) {
                    assert!(
                        w[1].0 > w[0].1,
                        "Clone instances in the same file should not overlap: {:?} and {:?}",
                        w[0],
                        w[1]
                    );
                }
            }
        }
    }

    #[test]
    fn empty_input_edge_case() {
        let detector = CloneDetector::new(0, 0, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_internal_duplication() {
        let detector = CloneDetector::new(3, 1, false);

        // Pattern 10,20,30 repeats within one file, separated by 99.
        let hashes = vec![10, 20, 30, 99, 10, 20, 30];
        let source = "a\nb\nc\nx\na\nb\nc";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 7);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect internal duplication within a single file"
        );
    }
}