1use std::collections::HashMap;
8use std::path::PathBuf;
9
10use super::normalize::HashedToken;
11use super::tokenize::FileTokens;
12use super::types::{CloneGroup, CloneInstance, DuplicationReport, DuplicationStats};
13
/// Per-file input to clone detection: the file's path, its normalized token
/// hashes, and the original tokenization (including source text).
struct FileData {
    // Path as supplied by the caller; copied into each reported instance.
    path: PathBuf,
    // Normalized tokens; each carries a hash and an `original_index` back
    // into `file_tokens.tokens`.
    hashed_tokens: Vec<HashedToken>,
    // Raw tokenizer output: source tokens with spans, the source text, and
    // its line count.
    file_tokens: FileTokens,
}
20
/// Token-based clone detector built on a suffix array over the concatenated
/// rank-reduced token streams of all input files.
pub struct CloneDetector {
    // Minimum clone length, measured in normalized tokens.
    min_tokens: usize,
    // Minimum clone length, measured in source lines.
    min_lines: usize,
    // When true, drop groups whose instances all live in one directory.
    skip_local: bool,
}
34
35impl CloneDetector {
36 pub fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
38 Self {
39 min_tokens,
40 min_lines,
41 skip_local,
42 }
43 }
44
45 pub fn detect(
50 &self,
51 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
52 ) -> DuplicationReport {
53 let _span = tracing::info_span!("clone_detect").entered();
54
55 if file_data.is_empty() || self.min_tokens == 0 {
56 return empty_report(0);
57 }
58
59 let files: Vec<FileData> = file_data
60 .into_iter()
61 .map(|(path, hashed_tokens, file_tokens)| FileData {
62 path,
63 hashed_tokens,
64 file_tokens,
65 })
66 .collect();
67
68 let total_files = files.len();
70 let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
71 let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
72
73 tracing::debug!(
74 total_files,
75 total_tokens,
76 total_lines,
77 "clone detection input"
78 );
79
80 let t0 = std::time::Instant::now();
82 let ranked_files = rank_reduce(&files);
83 let rank_time = t0.elapsed();
84 let unique_ranks: usize = ranked_files
85 .iter()
86 .flat_map(|f| f.iter())
87 .copied()
88 .max()
89 .map_or(0, |m| m as usize + 1);
90 tracing::debug!(
91 elapsed_us = rank_time.as_micros(),
92 unique_ranks,
93 "step1_rank_reduce"
94 );
95
96 let t0 = std::time::Instant::now();
98 let (text, file_of, file_offsets) = concatenate_with_sentinels(&ranked_files);
99 let concat_time = t0.elapsed();
100 tracing::debug!(
101 elapsed_us = concat_time.as_micros(),
102 concat_len = text.len(),
103 "step2_concatenate"
104 );
105
106 if text.is_empty() {
107 return empty_report(total_files);
108 }
109
110 let t0 = std::time::Instant::now();
112 let sa = build_suffix_array(&text);
113 let sa_time = t0.elapsed();
114 tracing::debug!(
115 elapsed_us = sa_time.as_micros(),
116 n = text.len(),
117 "step3_suffix_array"
118 );
119
120 let t0 = std::time::Instant::now();
122 let lcp = build_lcp(&text, &sa);
123 let lcp_time = t0.elapsed();
124 tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
125
126 let t0 = std::time::Instant::now();
128 let raw_groups =
129 extract_clone_groups(&sa, &lcp, &file_of, &file_offsets, self.min_tokens, &files);
130 let extract_time = t0.elapsed();
131 tracing::debug!(
132 elapsed_us = extract_time.as_micros(),
133 raw_groups = raw_groups.len(),
134 "step5_extract_groups"
135 );
136
137 let t0 = std::time::Instant::now();
139 let clone_groups = self.build_groups(raw_groups, &files);
140 let build_time = t0.elapsed();
141 tracing::debug!(
142 elapsed_us = build_time.as_micros(),
143 final_groups = clone_groups.len(),
144 "step6_build_groups"
145 );
146
147 let t0 = std::time::Instant::now();
149 let stats = compute_stats(&clone_groups, total_files, total_lines, total_tokens);
150 let stats_time = t0.elapsed();
151 tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
152
153 tracing::info!(
154 total_us = (rank_time
155 + concat_time
156 + sa_time
157 + lcp_time
158 + extract_time
159 + build_time
160 + stats_time)
161 .as_micros(),
162 rank_us = rank_time.as_micros(),
163 sa_us = sa_time.as_micros(),
164 lcp_us = lcp_time.as_micros(),
165 extract_us = extract_time.as_micros(),
166 build_us = build_time.as_micros(),
167 stats_us = stats_time.as_micros(),
168 total_tokens,
169 clone_groups = clone_groups.len(),
170 "clone detection complete"
171 );
172
173 DuplicationReport {
174 clone_groups,
175 stats,
176 }
177 }
178
179 fn build_groups(&self, mut raw_groups: Vec<RawGroup>, files: &[FileData]) -> Vec<CloneGroup> {
182 if raw_groups.is_empty() {
183 return Vec::new();
184 }
185
186 let raw_count = raw_groups.len();
194 raw_groups.sort_by(|a, b| b.length.cmp(&a.length));
195
196 let mut covered: Vec<Vec<(usize, usize)>> = vec![Vec::new(); files.len()];
199 let mut surviving_groups: Vec<RawGroup> = Vec::new();
200
201 for rg in raw_groups {
202 let len = rg.length;
203 let all_covered = rg.instances.iter().all(|&(fid, offset)| {
205 let intervals = &covered[fid];
206 let idx = intervals.partition_point(|&(s, _)| s <= offset);
208 if idx > 0 {
209 let (s, e) = intervals[idx - 1];
210 offset >= s && offset + len <= e
211 } else {
212 false
213 }
214 });
215
216 if !all_covered {
217 for &(fid, offset) in &rg.instances {
219 let end = offset + len;
220 let intervals = &mut covered[fid];
221 let idx = intervals.partition_point(|&(s, _)| s < offset);
222 if idx > 0 {
224 let prev = &mut intervals[idx - 1];
225 if prev.1 >= offset {
226 if end > prev.1 {
228 prev.1 = end;
229 }
230 continue;
231 }
232 }
233 intervals.insert(idx, (offset, end));
234 }
235 surviving_groups.push(rg);
236 }
237 }
238
239 tracing::trace!(
240 raw = raw_count,
241 surviving = surviving_groups.len(),
242 "token-level subset removal"
243 );
244
245 let line_tables: Vec<Vec<usize>> = files
251 .iter()
252 .map(|f| {
253 f.file_tokens
254 .source
255 .bytes()
256 .enumerate()
257 .filter_map(|(i, b)| if b == b'\n' { Some(i) } else { None })
258 .collect()
259 })
260 .collect();
261
262 let mut clone_groups: Vec<CloneGroup> = Vec::new();
264
265 for rg in surviving_groups {
266 let mut seen: HashMap<(usize, usize), bool> = HashMap::new();
268 let mut group_instances: Vec<CloneInstance> = Vec::new();
269
270 for &(file_id, offset) in &rg.instances {
271 if seen.contains_key(&(file_id, offset)) {
272 continue;
273 }
274 seen.insert((file_id, offset), true);
275
276 let file = &files[file_id];
277 if let Some(inst) =
278 build_clone_instance_fast(file, offset, rg.length, &line_tables[file_id])
279 {
280 group_instances.push(inst);
281 }
282 }
283
284 if self.skip_local && group_instances.len() >= 2 {
286 let dirs: std::collections::HashSet<_> = group_instances
287 .iter()
288 .filter_map(|inst| inst.file.parent().map(|p| p.to_path_buf()))
289 .collect();
290 if dirs.len() < 2 {
291 continue;
292 }
293 }
294
295 if group_instances.len() < 2 {
296 continue;
297 }
298
299 let line_count = group_instances
301 .iter()
302 .map(|inst| inst.end_line.saturating_sub(inst.start_line) + 1)
303 .max()
304 .unwrap_or(0);
305
306 if line_count < self.min_lines {
308 continue;
309 }
310
311 group_instances
313 .sort_by(|a, b| a.file.cmp(&b.file).then(a.start_line.cmp(&b.start_line)));
314
315 group_instances.dedup_by(|b, a| {
319 if a.file != b.file {
320 return false;
321 }
322 if b.start_line <= a.end_line {
325 if b.end_line > a.end_line {
327 a.end_line = b.end_line;
328 a.end_col = b.end_col;
329 }
330 true
331 } else {
332 false
333 }
334 });
335
336 if group_instances.len() < 2 {
337 continue;
338 }
339
340 clone_groups.push(CloneGroup {
341 instances: group_instances,
342 token_count: rg.length,
343 line_count,
344 });
345 }
346
347 clone_groups.sort_by(|a, b| {
352 b.token_count
353 .cmp(&a.token_count)
354 .then(b.instances.len().cmp(&a.instances.len()))
355 });
356
357 let mut keep = vec![true; clone_groups.len()];
362 for i in 0..clone_groups.len() {
363 if !keep[i] {
364 continue;
365 }
366 for j in (i + 1)..clone_groups.len() {
367 if !keep[j] {
368 continue;
369 }
370 if is_subset_group(&clone_groups[j], &clone_groups[i]) {
371 keep[j] = false;
372 } else if is_subset_group(&clone_groups[i], &clone_groups[j]) {
373 keep[i] = false;
374 break;
375 }
376 }
377 }
378
379 clone_groups
380 .into_iter()
381 .enumerate()
382 .filter(|(i, _)| keep[*i])
383 .map(|(_, g)| g)
384 .collect()
385 }
386}
387
/// Intermediate clone group in token space, before token offsets are
/// resolved to line/column spans.
struct RawGroup {
    // (file_id, token_offset) of each occurrence.
    instances: Vec<(usize, usize)>,
    // Clone length in normalized tokens.
    length: usize,
}
397
398fn rank_reduce(files: &[FileData]) -> Vec<Vec<u32>> {
405 let total: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
409 let mut hash_to_rank: HashMap<u64, u32> = HashMap::with_capacity(total / 2);
410 let mut next_rank: u32 = 0;
411
412 files
413 .iter()
414 .map(|file| {
415 file.hashed_tokens
416 .iter()
417 .map(|ht| {
418 *hash_to_rank.entry(ht.hash).or_insert_with(|| {
419 let r = next_rank;
420 next_rank += 1;
421 r
422 })
423 })
424 .collect()
425 })
426 .collect()
427}
428
/// Concatenates per-file rank sequences into one text, inserting a unique
/// negative sentinel between consecutive files.
///
/// Returns `(text, file_of, file_offsets)`:
/// * `text[i]` — the rank (as `i64`) or a sentinel (`-1`, `-2`, ...),
/// * `file_of[i]` — owning file id, or `usize::MAX` at sentinel positions,
/// * `file_offsets[f]` — start position of file `f` within `text`.
fn concatenate_with_sentinels(ranked_files: &[Vec<u32>]) -> (Vec<i64>, Vec<usize>, Vec<usize>) {
    let sentinel_count = ranked_files.len().saturating_sub(1);
    let total_len: usize = ranked_files.iter().map(Vec::len).sum::<usize>() + sentinel_count;

    let mut text: Vec<i64> = Vec::with_capacity(total_len);
    let mut file_of: Vec<usize> = Vec::with_capacity(total_len);
    let mut file_offsets: Vec<usize> = Vec::with_capacity(ranked_files.len());

    let last = ranked_files.len().saturating_sub(1);
    for (file_id, ranks) in ranked_files.iter().enumerate() {
        file_offsets.push(text.len());
        text.extend(ranks.iter().map(|&r| i64::from(r)));
        file_of.extend(std::iter::repeat(file_id).take(ranks.len()));

        // Sentinels are all distinct and negative, so no suffix comparison
        // can match across a file boundary. After file k the sentinel is
        // -(k + 1), matching a counter that starts at -1 and decrements.
        if file_id < last {
            text.push(-1 - file_id as i64);
            file_of.push(usize::MAX);
        }
    }

    (text, file_of, file_offsets)
}
468
469fn build_suffix_array(text: &[i64]) -> Vec<usize> {
477 let n = text.len();
478 if n == 0 {
479 return vec![];
480 }
481
482 let min_val = text.iter().copied().min().unwrap_or(0);
485 let mut rank: Vec<i64> = text.iter().map(|&v| v - min_val).collect();
486 let mut sa: Vec<usize> = (0..n).collect();
487 let mut tmp: Vec<i64> = vec![0; n];
488 let mut k: usize = 1;
489 let mut iterations = 0u32;
490
491 let mut sa_tmp: Vec<usize> = vec![0; n];
493
494 while k < n {
495 iterations += 1;
496 let max_rank = rank.iter().copied().max().unwrap_or(0) as usize;
497
498 let bucket_count = max_rank + 2; let mut counts = vec![0usize; bucket_count + 1];
505 for &i in &sa {
506 let r2 = if i + k < n {
507 rank[i + k] as usize + 1
508 } else {
509 0
510 };
511 counts[r2] += 1;
512 }
513 let mut sum = 0;
515 for c in counts.iter_mut() {
516 let v = *c;
517 *c = sum;
518 sum += v;
519 }
520 for &i in &sa {
521 let r2 = if i + k < n {
522 rank[i + k] as usize + 1
523 } else {
524 0
525 };
526 sa_tmp[counts[r2]] = i;
527 counts[r2] += 1;
528 }
529
530 counts.iter_mut().for_each(|c| *c = 0);
535 counts.resize(bucket_count + 1, 0);
536 for &i in &sa_tmp {
537 let r1 = rank[i] as usize;
538 counts[r1] += 1;
539 }
540 sum = 0;
541 for c in counts.iter_mut() {
542 let v = *c;
543 *c = sum;
544 sum += v;
545 }
546 for &i in &sa_tmp {
547 let r1 = rank[i] as usize;
548 sa[counts[r1]] = i;
549 counts[r1] += 1;
550 }
551
552 tmp[sa[0]] = 0;
554 for i in 1..n {
555 let prev = sa[i - 1];
556 let curr = sa[i];
557 let same = rank[prev] == rank[curr] && {
558 let rp2 = if prev + k < n { rank[prev + k] } else { -1 };
559 let rc2 = if curr + k < n { rank[curr + k] } else { -1 };
560 rp2 == rc2
561 };
562 tmp[curr] = tmp[prev] + if same { 0 } else { 1 };
563 }
564
565 let new_max_rank = tmp[sa[n - 1]];
567 std::mem::swap(&mut rank, &mut tmp);
568
569 if new_max_rank as usize == n - 1 {
570 break;
571 }
572
573 k *= 2;
574 }
575
576 tracing::trace!(n, iterations, "suffix array constructed");
577 sa
578}
579
/// Builds the LCP array for `sa` using Kasai's O(n) algorithm.
///
/// `lcp[r]` is the longest common prefix length between the suffixes at
/// ranks `r` and `r - 1`. Matching stops at negative (sentinel) values so a
/// common prefix never spans a file boundary.
fn build_lcp(text: &[i64], sa: &[usize]) -> Vec<usize> {
    let n = sa.len();
    if n == 0 {
        return Vec::new();
    }

    // Inverse permutation: rank[pos] = index of suffix `pos` within `sa`.
    let mut rank = vec![0usize; n];
    for (r, &pos) in sa.iter().enumerate() {
        rank[pos] = r;
    }

    let mut lcp = vec![0usize; n];
    let mut common = 0usize;

    for pos in 0..n {
        let r = rank[pos];
        if r == 0 {
            // The lexicographically smallest suffix has no predecessor.
            common = 0;
            continue;
        }
        let prev = sa[r - 1];
        // Extend the overlap carried over from the previous position; stop
        // at any sentinel or mismatch.
        while pos + common < n
            && prev + common < n
            && text[pos + common] >= 0
            && text[prev + common] >= 0
            && text[pos + common] == text[prev + common]
        {
            common += 1;
        }
        lcp[r] = common;
        common = common.saturating_sub(1);
    }

    lcp
}
625
/// Walks the LCP array with a monotonic stack to enumerate LCP intervals —
/// maximal runs of suffix-array ranks sharing a common prefix — and emits a
/// `RawGroup` for every interval whose prefix length reaches `min_tokens`.
fn extract_clone_groups(
    sa: &[usize],
    lcp: &[usize],
    file_of: &[usize],
    file_offsets: &[usize],
    min_tokens: usize,
    files: &[FileData],
) -> Vec<RawGroup> {
    let n = sa.len();
    if n < 2 {
        return vec![];
    }

    // Stack of (lcp value, interval start rank), strictly increasing in lcp.
    let mut stack: Vec<(usize, usize)> = Vec::new();
    let mut groups: Vec<RawGroup> = Vec::new();

    // The extra iteration at i == n (treated as lcp 0) flushes the stack.
    #[allow(clippy::needless_range_loop)]
    for i in 1..=n {
        let cur_lcp = if i < n { lcp[i] } else { 0 };
        let mut start = i;

        while let Some(&(top_lcp, top_start)) = stack.last() {
            if top_lcp <= cur_lcp {
                break;
            }
            stack.pop();
            start = top_start;

            if top_lcp >= min_tokens {
                // lcp[r] relates sa[r - 1] and sa[r], so the interval's
                // first member sits one rank before `start`.
                let interval_begin = start - 1;
                let interval_end = i;
                if let Some(group) = build_raw_group(
                    sa,
                    file_of,
                    file_offsets,
                    files,
                    interval_begin,
                    interval_end,
                    top_lcp,
                ) {
                    groups.push(group);
                }
            }
        }

        if i < n {
            stack.push((cur_lcp, start));
        }
    }

    groups
}
691
692fn build_raw_group(
695 sa: &[usize],
696 file_of: &[usize],
697 file_offsets: &[usize],
698 files: &[FileData],
699 begin: usize,
700 end: usize,
701 length: usize,
702) -> Option<RawGroup> {
703 let mut instances: Vec<(usize, usize)> = Vec::new();
704
705 for &pos in &sa[begin..end] {
706 let fid = file_of[pos];
707 if fid == usize::MAX {
708 continue; }
710 let offset_in_file = pos - file_offsets[fid];
711
712 if offset_in_file + length > files[fid].hashed_tokens.len() {
714 continue;
715 }
716
717 instances.push((fid, offset_in_file));
718 }
719
720 if instances.len() < 2 {
721 return None;
722 }
723
724 instances.sort_unstable();
727 let mut deduped: Vec<(usize, usize)> = Vec::with_capacity(instances.len());
728 for &(fid, offset) in &instances {
729 if let Some(&(last_fid, last_offset)) = deduped.last()
730 && fid == last_fid
731 && offset < last_offset + length
732 {
733 continue; }
735 deduped.push((fid, offset));
736 }
737
738 if deduped.len() < 2 {
739 return None;
740 }
741
742 Some(RawGroup {
743 instances: deduped,
744 length,
745 })
746}
747
748fn is_subset_group(smaller: &CloneGroup, larger: &CloneGroup) -> bool {
752 smaller.instances.iter().all(|s_inst| {
753 larger.instances.iter().any(|l_inst| {
754 s_inst.file == l_inst.file
755 && s_inst.start_line >= l_inst.start_line
756 && s_inst.end_line <= l_inst.end_line
757 })
758 })
759}
760
761fn build_clone_instance_fast(
763 file: &FileData,
764 token_offset: usize,
765 token_length: usize,
766 line_table: &[usize],
767) -> Option<CloneInstance> {
768 let tokens = &file.hashed_tokens;
769 let source_tokens = &file.file_tokens.tokens;
770
771 if token_offset + token_length > tokens.len() {
772 return None;
773 }
774
775 let first_hashed = &tokens[token_offset];
777 let last_hashed = &tokens[token_offset + token_length - 1];
778
779 let first_source = &source_tokens[first_hashed.original_index];
780 let last_source = &source_tokens[last_hashed.original_index];
781
782 let start_byte = first_source.span.start as usize;
783 let end_byte = last_source.span.end as usize;
784
785 if start_byte > end_byte {
788 return None;
789 }
790
791 let source = &file.file_tokens.source;
792 let (start_line, start_col) = byte_offset_to_line_col_fast(source, start_byte, line_table);
793 let (end_line, end_col) = byte_offset_to_line_col_fast(source, end_byte, line_table);
794
795 let fragment = if end_byte <= source.len() {
797 source[start_byte..end_byte].to_string()
798 } else {
799 String::new()
800 };
801
802 Some(CloneInstance {
803 file: file.path.clone(),
804 start_line,
805 end_line,
806 start_col,
807 end_col,
808 fragment,
809 })
810}
811
/// Test-only variant of [`build_clone_instance_fast`] that resolves line
/// and column positions by scanning the source directly.
#[cfg(test)]
fn build_clone_instance(
    file: &FileData,
    token_offset: usize,
    token_length: usize,
) -> Option<CloneInstance> {
    let hashed = &file.hashed_tokens;
    if token_offset + token_length > hashed.len() {
        return None;
    }

    // Map the first and last hashed tokens back to source tokens to recover
    // the clone's byte span.
    let source_tokens = &file.file_tokens.tokens;
    let first = &source_tokens[hashed[token_offset].original_index];
    let last = &source_tokens[hashed[token_offset + token_length - 1].original_index];

    let start_byte = first.span.start as usize;
    let end_byte = last.span.end as usize;
    if start_byte > end_byte {
        return None;
    }

    let source = &file.file_tokens.source;
    let (start_line, start_col) = byte_offset_to_line_col(source, start_byte);
    let (end_line, end_col) = byte_offset_to_line_col(source, end_byte);

    // Out-of-range spans yield an empty fragment rather than slicing.
    let fragment = if end_byte <= source.len() {
        source[start_byte..end_byte].to_string()
    } else {
        String::new()
    };

    Some(CloneInstance {
        file: file.path.clone(),
        start_line,
        end_line,
        start_col,
        end_col,
        fragment,
    })
}
862
/// Converts a byte offset (clamped to the source length) into a 1-based
/// line and 0-based column counted in chars, using a precomputed table of
/// newline byte positions for an O(log n) line lookup.
fn byte_offset_to_line_col_fast(
    source: &str,
    byte_offset: usize,
    line_table: &[usize],
) -> (usize, usize) {
    let offset = byte_offset.min(source.len());
    // Newlines strictly before `offset` give the zero-based line index.
    let newlines_before = line_table.partition_point(|&nl| nl < offset);
    let line_start = match newlines_before {
        0 => 0,
        i => line_table[i - 1] + 1,
    };
    let column = source[line_start..offset].chars().count();
    (newlines_before + 1, column)
}
882
/// Test-only slow path: derives the 1-based line and 0-based char column by
/// scanning the source prefix up to the (clamped) byte offset.
#[cfg(test)]
fn byte_offset_to_line_col(source: &str, byte_offset: usize) -> (usize, usize) {
    let clamped = byte_offset.min(source.len());
    let prefix = &source[..clamped];
    // '\n' is a single byte and never part of a multi-byte UTF-8 sequence,
    // so a byte scan counts exactly the newline chars.
    let line = prefix.bytes().filter(|&b| b == b'\n').count() + 1;
    let line_start = match prefix.rfind('\n') {
        Some(nl) => nl + 1,
        None => 0,
    };
    let col = prefix[line_start..].chars().count();
    (line, col)
}
893
894fn compute_stats(
896 clone_groups: &[CloneGroup],
897 total_files: usize,
898 total_lines: usize,
899 total_tokens: usize,
900) -> DuplicationStats {
901 use std::collections::HashSet;
902 use std::path::Path;
903
904 let mut files_with_clones: HashSet<&Path> = HashSet::new();
905 let mut file_dup_lines: HashMap<&Path, HashSet<usize>> = HashMap::new();
907 let mut duplicated_tokens = 0usize;
908 let mut clone_instances = 0usize;
909
910 for group in clone_groups {
911 for instance in &group.instances {
912 files_with_clones.insert(&instance.file);
913 clone_instances += 1;
914 let lines = file_dup_lines.entry(&instance.file).or_default();
915 for line in instance.start_line..=instance.end_line {
916 lines.insert(line);
917 }
918 }
919 if group.instances.len() > 1 {
922 duplicated_tokens += group.token_count * (group.instances.len() - 1);
923 }
924 }
925
926 let dup_line_count: usize = file_dup_lines.values().map(|s| s.len()).sum();
927 let duplication_percentage = if total_lines > 0 {
928 (dup_line_count as f64 / total_lines as f64) * 100.0
929 } else {
930 0.0
931 };
932
933 let duplicated_tokens = duplicated_tokens.min(total_tokens);
936
937 DuplicationStats {
938 total_files,
939 files_with_clones: files_with_clones.len(),
940 total_lines,
941 duplicated_lines: dup_line_count,
942 total_tokens,
943 duplicated_tokens,
944 clone_groups: clone_groups.len(),
945 clone_instances,
946 duplication_percentage,
947 }
948}
949
950fn empty_report(total_files: usize) -> DuplicationReport {
952 DuplicationReport {
953 clone_groups: vec![],
954 stats: DuplicationStats {
955 total_files,
956 files_with_clones: 0,
957 total_lines: 0,
958 duplicated_lines: 0,
959 total_tokens: 0,
960 duplicated_tokens: 0,
961 clone_groups: 0,
962 clone_instances: 0,
963 duplication_percentage: 0.0,
964 },
965 }
966}
967
#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::normalize::HashedToken;
    use crate::duplicates::tokenize::{FileTokens, SourceToken, TokenKind};
    use oxc_span::Span;

    /// Builds hashed tokens where token `i` maps back to source token `i`.
    fn make_hashed_tokens(hashes: &[u64]) -> Vec<HashedToken> {
        hashes
            .iter()
            .enumerate()
            .map(|(i, &hash)| HashedToken {
                hash,
                original_index: i,
            })
            .collect()
    }

    /// Builds `count` synthetic identifier tokens with 2-byte spans spaced
    /// 3 bytes apart.
    fn make_source_tokens(count: usize) -> Vec<SourceToken> {
        (0..count)
            .map(|i| SourceToken {
                kind: TokenKind::Identifier(format!("t{i}")),
                span: Span::new((i * 3) as u32, (i * 3 + 2) as u32),
            })
            .collect()
    }

    /// Wraps a source string and `count` synthetic tokens into `FileTokens`.
    fn make_file_tokens(source: &str, count: usize) -> FileTokens {
        FileTokens {
            tokens: make_source_tokens(count),
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        }
    }

    #[test]
    fn empty_input_produces_empty_report() {
        let detector = CloneDetector::new(5, 1, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_no_clones() {
        let detector = CloneDetector::new(3, 1, false);
        // All hashes distinct, so no repeated token run exists.
        let hashed = make_hashed_tokens(&[1, 2, 3, 4, 5]);
        let ft = make_file_tokens("a b c d e", 5);
        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);
        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn detects_exact_duplicate_across_files() {
        let detector = CloneDetector::new(3, 1, false);

        // Two files with identical 5-token hash sequences.
        let hashes = vec![10, 20, 30, 40, 50];
        let source_a = "a\nb\nc\nd\ne";
        let source_b = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source_a, 5);
        let ft_b = make_file_tokens(source_b, 5);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect at least one clone group"
        );
    }

    #[test]
    fn no_detection_below_min_tokens() {
        let detector = CloneDetector::new(10, 1, false);

        // Only 3 shared tokens, below the min_tokens threshold of 10.
        let hashes = vec![10, 20, 30]; let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens("abc", 3);
        let ft_b = make_file_tokens("abc", 3);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(report.clone_groups.is_empty());
    }

    #[test]
    fn byte_offset_to_line_col_basic() {
        let source = "abc\ndef\nghi";
        assert_eq!(byte_offset_to_line_col(source, 0), (1, 0));
        assert_eq!(byte_offset_to_line_col(source, 4), (2, 0));
        assert_eq!(byte_offset_to_line_col(source, 5), (2, 1));
        assert_eq!(byte_offset_to_line_col(source, 8), (3, 0));
    }

    #[test]
    fn byte_offset_beyond_source() {
        let source = "abc";
        // Offsets past the end are clamped to the source length.
        let (line, col) = byte_offset_to_line_col(source, 100);
        assert_eq!(line, 1);
        assert_eq!(col, 3);
    }

    #[test]
    fn skip_local_filters_same_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Both files share the parent directory "src".
        let report = detector.detect(vec![
            (PathBuf::from("src/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            report.clone_groups.is_empty(),
            "Same-directory clones should be filtered with skip_local"
        );
    }

    #[test]
    fn skip_local_keeps_cross_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        // Parent directories differ, so the group must survive.
        let report = detector.detect(vec![
            (PathBuf::from("src/components/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/utils/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Cross-directory clones should be kept with skip_local"
        );
    }

    #[test]
    fn stats_computation() {
        let groups = vec![CloneGroup {
            instances: vec![
                CloneInstance {
                    file: PathBuf::from("a.ts"),
                    start_line: 1,
                    end_line: 5,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
                CloneInstance {
                    file: PathBuf::from("b.ts"),
                    start_line: 10,
                    end_line: 14,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
            ],
            token_count: 50,
            line_count: 5,
        }];

        let stats = compute_stats(&groups, 10, 200, 1000);
        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.files_with_clones, 2);
        assert_eq!(stats.clone_groups, 1);
        assert_eq!(stats.clone_instances, 2);
        // 5 lines in a.ts plus 5 lines in b.ts.
        assert_eq!(stats.duplicated_lines, 10); assert!(stats.duplication_percentage > 0.0);
    }

    #[test]
    fn sa_construction_basic() {
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);

        // Expected order obtained by sorting all suffixes of `text`.
        assert_eq!(sa, vec![5, 3, 1, 0, 4, 2]);
    }

    #[test]
    fn lcp_construction_basic() {
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        assert_eq!(lcp, vec![0, 1, 3, 0, 0, 2]);
    }

    #[test]
    fn lcp_stops_at_sentinels() {
        // Two identical runs separated by a negative sentinel.
        let text: Vec<i64> = vec![0, 1, 2, -1, 0, 1, 2];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        // The minimum LCP between the ranks of suffixes 0 and 4 is the
        // length of their common prefix.
        let rank_0 = sa.iter().position(|&s| s == 0).expect("pos 0 in SA");
        let rank_4 = sa.iter().position(|&s| s == 4).expect("pos 4 in SA");
        let (lo, hi) = if rank_0 < rank_4 {
            (rank_0, rank_4)
        } else {
            (rank_4, rank_0)
        };

        let min_lcp = lcp[(lo + 1)..=hi].iter().copied().min().unwrap_or(0);
        assert_eq!(
            min_lcp, 3,
            "LCP between identical sequences across sentinel should be 3"
        );
    }

    #[test]
    fn rank_reduction_maps_correctly() {
        let files = vec![
            FileData {
                path: PathBuf::from("a.ts"),
                hashed_tokens: make_hashed_tokens(&[100, 200, 300]),
                file_tokens: make_file_tokens("a b c", 3),
            },
            FileData {
                path: PathBuf::from("b.ts"),
                hashed_tokens: make_hashed_tokens(&[200, 300, 400]),
                file_tokens: make_file_tokens("d e f", 3),
            },
        ];

        let ranked = rank_reduce(&files);

        // Shared hashes 200/300 reuse the ranks assigned in the first file.
        assert_eq!(ranked[0], vec![0, 1, 2]);
        assert_eq!(ranked[1], vec![1, 2, 3]);
    }

    #[test]
    fn three_file_grouping() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)> = (0..3)
            .map(|i| {
                (
                    PathBuf::from(format!("file{i}.ts")),
                    make_hashed_tokens(&hashes),
                    make_file_tokens(source, 5),
                )
            })
            .collect();

        let report = detector.detect(data);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones across 3 identical files"
        );

        let max_instances = report
            .clone_groups
            .iter()
            .map(|g| g.instances.len())
            .max()
            .unwrap_or(0);
        assert_eq!(
            max_instances, 3,
            "3 identical files should produce a group with 3 instances"
        );
    }

    #[test]
    fn overlapping_clones_largest_wins() {
        let detector = CloneDetector::new(3, 1, false);

        // Every sub-run of the 10-token sequence repeats; only the maximal
        // one should be reported first.
        let hashes: Vec<u64> = (1..=10).collect();
        let source = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 10);
        let ft_b = make_file_tokens(source, 10);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(!report.clone_groups.is_empty());
        assert_eq!(
            report.clone_groups[0].token_count, 10,
            "Maximal clone should cover all 10 tokens"
        );
    }

    #[test]
    fn no_self_overlap() {
        let detector = CloneDetector::new(3, 1, false);

        // The 3-token pattern repeats back-to-back within one file.
        let hashes = vec![1, 2, 3, 1, 2, 3];
        let source = "aa\nbb\ncc\ndd\nee\nff\ngg";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 6);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        // Reported instances within the same file must have disjoint line
        // ranges.
        for group in &report.clone_groups {
            let mut file_instances: HashMap<&PathBuf, Vec<(usize, usize)>> = HashMap::new();
            for inst in &group.instances {
                file_instances
                    .entry(&inst.file)
                    .or_default()
                    .push((inst.start_line, inst.end_line));
            }
            for (_file, mut ranges) in file_instances {
                ranges.sort();
                for w in ranges.windows(2) {
                    assert!(
                        w[1].0 > w[0].1,
                        "Clone instances in the same file should not overlap: {:?} and {:?}",
                        w[0],
                        w[1]
                    );
                }
            }
        }
    }

    #[test]
    fn empty_input_edge_case() {
        // min_tokens == 0 disables detection entirely.
        let detector = CloneDetector::new(0, 0, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn single_file_internal_duplication() {
        let detector = CloneDetector::new(3, 1, false);

        // Pattern [10, 20, 30] appears twice, separated by token 99.
        let hashes = vec![10, 20, 30, 99, 10, 20, 30];
        let source = "a\nb\nc\nx\na\nb\nc";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 7);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect internal duplication within a single file"
        );
    }
}
1377}