1use std::collections::{HashMap, HashSet};
8use std::path::PathBuf;
9
10use super::normalize::HashedToken;
11use super::tokenize::FileTokens;
12use super::types::{CloneGroup, CloneInstance, DuplicationReport, DuplicationStats};
13
/// Per-file input to clone detection, bundled after tokenization and
/// hash normalization.
struct FileData {
    /// Path of the analyzed file; reported back in `CloneInstance`s.
    path: PathBuf,
    /// Normalized token hashes; indices into this vec are the "token
    /// offsets" used throughout the detector.
    hashed_tokens: Vec<HashedToken>,
    /// Original tokens plus raw source text, used to map token ranges
    /// back to byte spans, lines, and columns.
    file_tokens: FileTokens,
}
20
/// Token-based clone detector built on a suffix-array + LCP pipeline.
pub struct CloneDetector {
    /// Minimum clone length measured in normalized tokens.
    min_tokens: usize,
    /// Minimum clone length measured in source lines.
    min_lines: usize,
    /// When set, drop groups whose instances all live in one directory.
    skip_local: bool,
}
34
impl CloneDetector {
    /// Creates a detector with the given thresholds.
    ///
    /// `min_tokens` / `min_lines` are minimum clone sizes; `skip_local`
    /// filters out groups confined to a single directory.
    pub fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
        Self {
            min_tokens,
            min_lines,
            skip_local,
        }
    }

    /// Runs the full clone-detection pipeline over pre-tokenized files.
    ///
    /// Steps (each timed and traced):
    /// 1. rank-reduce token hashes to a dense alphabet,
    /// 2. concatenate files with unique negative sentinels,
    /// 3. build a suffix array over the combined text,
    /// 4. build the LCP array (Kasai),
    /// 5. extract repeated intervals of at least `min_tokens`,
    /// 6. map them to source-level groups and drop subsets,
    /// 7. compute summary statistics.
    ///
    /// Returns an empty report when there is no input or `min_tokens`
    /// is zero (which would make every position a "clone").
    pub fn detect(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
    ) -> DuplicationReport {
        let _span = tracing::info_span!("clone_detect").entered();

        if file_data.is_empty() || self.min_tokens == 0 {
            return empty_report(0);
        }

        let files: Vec<FileData> = file_data
            .into_iter()
            .map(|(path, hashed_tokens, file_tokens)| FileData {
                path,
                hashed_tokens,
                file_tokens,
            })
            .collect();

        let total_files = files.len();
        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();

        tracing::debug!(
            total_files,
            total_tokens,
            total_lines,
            "clone detection input"
        );

        let t0 = std::time::Instant::now();
        let ranked_files = rank_reduce(&files);
        let rank_time = t0.elapsed();
        // Highest rank + 1 == number of distinct token hashes seen.
        let unique_ranks: usize = ranked_files
            .iter()
            .flat_map(|f| f.iter())
            .copied()
            .max()
            .map_or(0, |m| m as usize + 1);
        tracing::debug!(
            elapsed_us = rank_time.as_micros(),
            unique_ranks,
            "step1_rank_reduce"
        );

        let t0 = std::time::Instant::now();
        let (text, file_of, file_offsets) = concatenate_with_sentinels(&ranked_files);
        let concat_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = concat_time.as_micros(),
            concat_len = text.len(),
            "step2_concatenate"
        );

        if text.is_empty() {
            return empty_report(total_files);
        }

        let t0 = std::time::Instant::now();
        let sa = build_suffix_array(&text);
        let sa_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = sa_time.as_micros(),
            n = text.len(),
            "step3_suffix_array"
        );

        let t0 = std::time::Instant::now();
        let lcp = build_lcp(&text, &sa);
        let lcp_time = t0.elapsed();
        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");

        let t0 = std::time::Instant::now();
        let raw_groups =
            extract_clone_groups(&sa, &lcp, &file_of, &file_offsets, self.min_tokens, &files);
        let extract_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = extract_time.as_micros(),
            raw_groups = raw_groups.len(),
            "step5_extract_groups"
        );

        let t0 = std::time::Instant::now();
        let clone_groups = self.build_groups(raw_groups, &files);
        let build_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = build_time.as_micros(),
            final_groups = clone_groups.len(),
            "step6_build_groups"
        );

        let t0 = std::time::Instant::now();
        let stats = compute_stats(&clone_groups, total_files, total_lines, total_tokens);
        let stats_time = t0.elapsed();
        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");

        tracing::info!(
            total_us = (rank_time
                + concat_time
                + sa_time
                + lcp_time
                + extract_time
                + build_time
                + stats_time)
                .as_micros(),
            rank_us = rank_time.as_micros(),
            sa_us = sa_time.as_micros(),
            lcp_us = lcp_time.as_micros(),
            extract_us = extract_time.as_micros(),
            build_us = build_time.as_micros(),
            stats_us = stats_time.as_micros(),
            total_tokens,
            clone_groups = clone_groups.len(),
            "clone detection complete"
        );

        DuplicationReport {
            clone_groups,
            clone_families: vec![],
            stats,
        }
    }

    /// Converts token-space `RawGroup`s into final, source-level
    /// `CloneGroup`s.
    ///
    /// Pipeline: (1) drop groups whose every occurrence is already
    /// covered by a longer group's token interval; (2) map surviving
    /// occurrences to line/column instances; (3) apply the `skip_local`
    /// directory filter, the two-instance minimum, and `min_lines`;
    /// (4) merge overlapping instances within a file; (5) sort groups
    /// by size and drop groups fully contained (line-wise) in earlier,
    /// larger ones.
    fn build_groups(&self, mut raw_groups: Vec<RawGroup>, files: &[FileData]) -> Vec<CloneGroup> {
        if raw_groups.is_empty() {
            return Vec::new();
        }

        let raw_count = raw_groups.len();
        // Longest first, so shorter groups can be tested against the
        // coverage laid down by longer ones.
        raw_groups.sort_by(|a, b| b.length.cmp(&a.length));

        // Per file: sorted, disjoint token intervals already claimed by
        // surviving (longer) groups.
        let mut covered: Vec<Vec<(usize, usize)>> = vec![Vec::new(); files.len()];
        let mut surviving_groups: Vec<RawGroup> = Vec::new();

        for rg in raw_groups {
            let len = rg.length;
            // A group is redundant only if EVERY occurrence lies inside
            // an already-covered interval.
            let all_covered = rg.instances.iter().all(|&(fid, offset)| {
                let intervals = &covered[fid];
                let idx = intervals.partition_point(|&(s, _)| s <= offset);
                if idx > 0 {
                    let (s, e) = intervals[idx - 1];
                    offset >= s && offset + len <= e
                } else {
                    false
                }
            });

            if !all_covered {
                // Record this group's token ranges, merging with an
                // overlapping predecessor interval when possible.
                for &(fid, offset) in &rg.instances {
                    let end = offset + len;
                    let intervals = &mut covered[fid];
                    let idx = intervals.partition_point(|&(s, _)| s < offset);
                    if idx > 0 {
                        let prev = &mut intervals[idx - 1];
                        if prev.1 >= offset {
                            if end > prev.1 {
                                prev.1 = end;
                            }
                            continue;
                        }
                    }
                    intervals.insert(idx, (offset, end));
                }
                surviving_groups.push(rg);
            }
        }

        tracing::trace!(
            raw = raw_count,
            surviving = surviving_groups.len(),
            "token-level subset removal"
        );

        // Newline byte positions per file, for fast line/col lookups.
        let line_tables: Vec<Vec<usize>> = files
            .iter()
            .map(|f| {
                f.file_tokens
                    .source
                    .bytes()
                    .enumerate()
                    .filter_map(|(i, b)| if b == b'\n' { Some(i) } else { None })
                    .collect()
            })
            .collect();

        let mut clone_groups: Vec<CloneGroup> = Vec::new();

        for rg in surviving_groups {
            // Guard against duplicate (file, offset) pairs in a group.
            let mut seen: HashSet<(usize, usize)> = HashSet::new();
            let mut group_instances: Vec<CloneInstance> = Vec::new();

            for &(file_id, offset) in &rg.instances {
                if !seen.insert((file_id, offset)) {
                    continue;
                }

                let file = &files[file_id];
                if let Some(inst) =
                    build_clone_instance_fast(file, offset, rg.length, &line_tables[file_id])
                {
                    group_instances.push(inst);
                }
            }

            // skip_local: require instances in at least two directories.
            if self.skip_local && group_instances.len() >= 2 {
                let dirs: std::collections::HashSet<_> = group_instances
                    .iter()
                    .filter_map(|inst| inst.file.parent().map(|p| p.to_path_buf()))
                    .collect();
                if dirs.len() < 2 {
                    continue;
                }
            }

            if group_instances.len() < 2 {
                continue;
            }

            // Line span of the group = widest span among its instances.
            let line_count = group_instances
                .iter()
                .map(|inst| inst.end_line.saturating_sub(inst.start_line) + 1)
                .max()
                .unwrap_or(0);

            if line_count < self.min_lines {
                continue;
            }

            group_instances
                .sort_by(|a, b| a.file.cmp(&b.file).then(a.start_line.cmp(&b.start_line)));

            // Merge instances in the same file whose line ranges overlap;
            // dedup_by passes (later, earlier) and keeps the earlier one,
            // so the kept instance `a` is widened in place.
            group_instances.dedup_by(|b, a| {
                if a.file != b.file {
                    return false;
                }
                if b.start_line <= a.end_line {
                    if b.end_line > a.end_line {
                        a.end_line = b.end_line;
                        a.end_col = b.end_col;
                    }
                    true
                } else {
                    false
                }
            });

            if group_instances.len() < 2 {
                continue;
            }

            clone_groups.push(CloneGroup {
                instances: group_instances,
                token_count: rg.length,
                line_count,
            });
        }

        // Biggest clones first; ties broken by occurrence count.
        clone_groups.sort_by(|a, b| {
            b.token_count
                .cmp(&a.token_count)
                .then(b.instances.len().cmp(&a.instances.len()))
        });

        // Dense path -> index map so line coverage can use Vec storage.
        let mut path_to_idx: HashMap<PathBuf, usize> = HashMap::new();
        let mut next_idx = 0usize;
        for group in &clone_groups {
            for inst in &group.instances {
                path_to_idx.entry(inst.file.clone()).or_insert_with(|| {
                    let idx = next_idx;
                    next_idx += 1;
                    idx
                });
            }
        }
        let mut file_intervals: Vec<Vec<(usize, usize)>> = vec![Vec::new(); next_idx];
        let mut kept_groups: Vec<CloneGroup> = Vec::new();

        // Second subset pass, now in line space: drop a group whose
        // every instance is fully inside lines claimed by larger groups.
        for group in clone_groups {
            let all_contained = group.instances.iter().all(|inst| {
                let fidx = path_to_idx[&inst.file];
                let intervals = &file_intervals[fidx];
                let idx = intervals.partition_point(|&(s, _)| s <= inst.start_line);
                idx > 0 && {
                    let (s, e) = intervals[idx - 1];
                    inst.start_line >= s && inst.end_line <= e
                }
            });

            if !all_contained {
                for inst in &group.instances {
                    let fidx = path_to_idx[&inst.file];
                    let intervals = &mut file_intervals[fidx];
                    let idx = intervals.partition_point(|&(s, _)| s < inst.start_line);
                    if idx > 0 && intervals[idx - 1].1 >= inst.start_line {
                        if inst.end_line > intervals[idx - 1].1 {
                            intervals[idx - 1].1 = inst.end_line;
                        }
                    } else {
                        intervals.insert(idx, (inst.start_line, inst.end_line));
                    }
                }
                kept_groups.push(group);
            }
        }

        kept_groups
    }
}
412
/// Intermediate clone group in token space, before any mapping to
/// source lines or filtering by line thresholds.
struct RawGroup {
    /// Occurrences as (index into `files`, token offset in that file).
    instances: Vec<(usize, usize)>,
    /// Clone length in tokens (the LCP value of the emitting interval).
    length: usize,
}
422
423fn rank_reduce(files: &[FileData]) -> Vec<Vec<u32>> {
430 let total: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
434 let mut hash_to_rank: HashMap<u64, u32> = HashMap::with_capacity(total / 2);
435 let mut next_rank: u32 = 0;
436
437 files
438 .iter()
439 .map(|file| {
440 file.hashed_tokens
441 .iter()
442 .map(|ht| {
443 *hash_to_rank.entry(ht.hash).or_insert_with(|| {
444 let r = next_rank;
445 next_rank += 1;
446 r
447 })
448 })
449 .collect()
450 })
451 .collect()
452}
453
/// Joins the per-file rank sequences into one text, separated by unique
/// negative sentinels so no repeated substring can span a file boundary.
///
/// Returns `(text, file_of, file_offsets)`: `file_of[i]` is the owning
/// file of position `i` (`usize::MAX` on sentinels) and
/// `file_offsets[f]` is where file `f` begins inside `text`.
fn concatenate_with_sentinels(ranked_files: &[Vec<u32>]) -> (Vec<i64>, Vec<usize>, Vec<usize>) {
    // One sentinel between each adjacent pair of files, none after the last.
    let sentinels = ranked_files.len().saturating_sub(1);
    let capacity = sentinels + ranked_files.iter().map(Vec::len).sum::<usize>();

    let mut text: Vec<i64> = Vec::with_capacity(capacity);
    let mut file_of: Vec<usize> = Vec::with_capacity(capacity);
    let mut file_offsets: Vec<usize> = Vec::with_capacity(ranked_files.len());

    for (file_id, ranks) in ranked_files.iter().enumerate() {
        file_offsets.push(text.len());

        text.extend(ranks.iter().map(|&r| i64::from(r)));
        file_of.extend(std::iter::repeat(file_id).take(ranks.len()));

        if file_id + 1 != ranked_files.len() {
            // Sentinels are -1, -2, ... so two sentinels never compare
            // equal to each other either.
            text.push(-1 - file_id as i64);
            file_of.push(usize::MAX);
        }
    }

    (text, file_of, file_offsets)
}
493
/// Builds a suffix array over `text` by prefix doubling with a two-pass
/// counting (radix) sort per round — O(n log n) overall.
///
/// `text` may contain negative sentinel values; they take part in the
/// ordering like any other symbol (initial ranks are shifted so the
/// smallest value maps to 0).
fn build_suffix_array(text: &[i64]) -> Vec<usize> {
    let n = text.len();
    if n == 0 {
        return vec![];
    }

    // Initial ranks: shift values so the minimum becomes 0.
    let min_val = text.iter().copied().min().unwrap_or(0);
    let mut rank: Vec<i64> = text.iter().map(|&v| v - min_val).collect();
    let mut sa: Vec<usize> = (0..n).collect();
    let mut tmp: Vec<i64> = vec![0; n];
    let mut k: usize = 1;
    let mut iterations = 0u32;

    let mut sa_tmp: Vec<usize> = vec![0; n];

    // Each round sorts suffixes by their first 2k symbols, using the
    // ranks of the previous round as sort keys.
    while k < n {
        iterations += 1;
        let max_rank = rank.iter().copied().max().unwrap_or(0) as usize;

        // +2 because secondary keys are shifted up by one: bucket 0 is
        // reserved for "no second half" (i + k >= n).
        let bucket_count = max_rank + 2;
        let mut counts = vec![0usize; bucket_count + 1];
        // Pass 1: stable counting sort by the secondary key rank[i + k].
        for &i in &sa {
            let r2 = if i + k < n {
                rank[i + k] as usize + 1
            } else {
                0
            };
            counts[r2] += 1;
        }
        // Exclusive prefix sums turn counts into bucket start offsets.
        let mut sum = 0;
        for c in counts.iter_mut() {
            let v = *c;
            *c = sum;
            sum += v;
        }
        for &i in &sa {
            let r2 = if i + k < n {
                rank[i + k] as usize + 1
            } else {
                0
            };
            sa_tmp[counts[r2]] = i;
            counts[r2] += 1;
        }

        // Pass 2: stable counting sort by the primary key rank[i];
        // stability preserves the secondary ordering from pass 1.
        counts.iter_mut().for_each(|c| *c = 0);
        // NOTE(review): `counts` already has bucket_count + 1 elements,
        // so this resize is a defensive no-op.
        counts.resize(bucket_count + 1, 0);
        for &i in &sa_tmp {
            let r1 = rank[i] as usize;
            counts[r1] += 1;
        }
        sum = 0;
        for c in counts.iter_mut() {
            let v = *c;
            *c = sum;
            sum += v;
        }
        for &i in &sa_tmp {
            let r1 = rank[i] as usize;
            sa[counts[r1]] = i;
            counts[r1] += 1;
        }

        // Re-rank: adjacent suffixes share a new rank iff both their
        // (first k, second k) rank pairs match; -1 marks "past the end".
        tmp[sa[0]] = 0;
        for i in 1..n {
            let prev = sa[i - 1];
            let curr = sa[i];
            let same = rank[prev] == rank[curr] && {
                let rp2 = if prev + k < n { rank[prev + k] } else { -1 };
                let rc2 = if curr + k < n { rank[curr + k] } else { -1 };
                rp2 == rc2
            };
            tmp[curr] = tmp[prev] + if same { 0 } else { 1 };
        }

        let new_max_rank = tmp[sa[n - 1]];
        std::mem::swap(&mut rank, &mut tmp);

        // All ranks distinct => the order is fully determined; stop early.
        if new_max_rank as usize == n - 1 {
            break;
        }

        k *= 2;
    }

    tracing::trace!(n, iterations, "suffix array constructed");
    sa
}
604
/// Builds the LCP array for `sa` with Kasai's algorithm in O(n).
///
/// `lcp[i]` is the longest common prefix length of the suffixes at
/// `sa[i - 1]` and `sa[i]` (`lcp[0]` is 0). Comparison never extends
/// across negative values, so file sentinels break every match.
fn build_lcp(text: &[i64], sa: &[usize]) -> Vec<usize> {
    let n = sa.len();
    if n == 0 {
        return Vec::new();
    }

    // Inverse permutation: rank[p] = row of suffix p within `sa`.
    let mut rank = vec![0usize; n];
    for (row, &suffix) in sa.iter().enumerate() {
        rank[suffix] = row;
    }

    let mut lcp = vec![0usize; n];
    // Kasai invariant: walking suffixes in text order, the match length
    // can shrink by at most one per step, so it is carried over.
    let mut common = 0usize;

    for suffix in 0..n {
        let row = rank[suffix];
        if row == 0 {
            // First row has no predecessor to compare against.
            common = 0;
            continue;
        }
        let prev_suffix = sa[row - 1];
        loop {
            let a = suffix + common;
            let b = prev_suffix + common;
            if a >= n || b >= n {
                break;
            }
            // Sentinels are negative and unique — never match through them.
            if text[a] < 0 || text[b] < 0 || text[a] != text[b] {
                break;
            }
            common += 1;
        }
        lcp[row] = common;
        common = common.saturating_sub(1);
    }

    lcp
}
650
/// Walks the LCP array with a monotonic stack to enumerate repeated
/// runs of at least `min_tokens` tokens, emitting each as a `RawGroup`
/// of (file, offset) occurrences.
///
/// Each stack entry `(lcp, start)` represents an open run of
/// suffix-array rows sharing a common prefix of length `lcp`; when the
/// incoming LCP drops below that value the run closes and is emitted.
fn extract_clone_groups(
    sa: &[usize],
    lcp: &[usize],
    file_of: &[usize],
    file_offsets: &[usize],
    min_tokens: usize,
    files: &[FileData],
) -> Vec<RawGroup> {
    let n = sa.len();
    if n < 2 {
        return vec![];
    }

    // Stack of open intervals: (shared LCP value, leftmost SA row of the run).
    let mut stack: Vec<(usize, usize)> = Vec::new();
    let mut groups: Vec<RawGroup> = Vec::new();

    // i == n acts as a virtual final row with LCP 0 so every interval
    // still open at the end gets closed and emitted.
    #[allow(clippy::needless_range_loop)]
    for i in 1..=n {
        let cur_lcp = if i < n { lcp[i] } else { 0 };
        let mut start = i;

        while let Some(&(top_lcp, top_start)) = stack.last() {
            if top_lcp <= cur_lcp {
                break;
            }
            stack.pop();
            // The closed interval inherits the popped entry's start.
            start = top_start;

            if top_lcp >= min_tokens {
                // The run covers SA rows [start - 1, i): the row that
                // introduced the prefix plus every row sharing it.
                let interval_begin = start - 1;
                let interval_end = i;
                if let Some(group) = build_raw_group(
                    sa,
                    file_of,
                    file_offsets,
                    files,
                    interval_begin,
                    interval_end,
                    top_lcp,
                ) {
                    groups.push(group);
                }
            }
        }

        if i < n {
            stack.push((cur_lcp, start));
        }
    }

    groups
}
716
717fn build_raw_group(
720 sa: &[usize],
721 file_of: &[usize],
722 file_offsets: &[usize],
723 files: &[FileData],
724 begin: usize,
725 end: usize,
726 length: usize,
727) -> Option<RawGroup> {
728 let mut instances: Vec<(usize, usize)> = Vec::new();
729
730 for &pos in &sa[begin..end] {
731 let fid = file_of[pos];
732 if fid == usize::MAX {
733 continue; }
735 let offset_in_file = pos - file_offsets[fid];
736
737 if offset_in_file + length > files[fid].hashed_tokens.len() {
739 continue;
740 }
741
742 instances.push((fid, offset_in_file));
743 }
744
745 if instances.len() < 2 {
746 return None;
747 }
748
749 instances.sort_unstable();
752 let mut deduped: Vec<(usize, usize)> = Vec::with_capacity(instances.len());
753 for &(fid, offset) in &instances {
754 if let Some(&(last_fid, last_offset)) = deduped.last()
755 && fid == last_fid
756 && offset < last_offset + length
757 {
758 continue; }
760 deduped.push((fid, offset));
761 }
762
763 if deduped.len() < 2 {
764 return None;
765 }
766
767 Some(RawGroup {
768 instances: deduped,
769 length,
770 })
771}
772
773fn build_clone_instance_fast(
778 file: &FileData,
779 token_offset: usize,
780 token_length: usize,
781 line_table: &[usize],
782) -> Option<CloneInstance> {
783 let tokens = &file.hashed_tokens;
784 let source_tokens = &file.file_tokens.tokens;
785
786 if token_offset + token_length > tokens.len() {
787 return None;
788 }
789
790 let first_hashed = &tokens[token_offset];
792 let last_hashed = &tokens[token_offset + token_length - 1];
793
794 let first_source = &source_tokens[first_hashed.original_index];
795 let last_source = &source_tokens[last_hashed.original_index];
796
797 let start_byte = first_source.span.start as usize;
798 let end_byte = last_source.span.end as usize;
799
800 if start_byte > end_byte {
803 return None;
804 }
805
806 let source = &file.file_tokens.source;
807 let (start_line, start_col) = byte_offset_to_line_col_fast(source, start_byte, line_table);
808 let (end_line, end_col) = byte_offset_to_line_col_fast(source, end_byte, line_table);
809
810 let fragment = if end_byte <= source.len() {
812 source[start_byte..end_byte].to_string()
813 } else {
814 String::new()
815 };
816
817 Some(CloneInstance {
818 file: file.path.clone(),
819 start_line,
820 end_line,
821 start_col,
822 end_col,
823 fragment,
824 })
825}
826
/// Converts a byte offset into a (1-based line, 0-based char column)
/// pair using a precomputed table of newline byte positions: a binary
/// search for the line plus a scan of the current line for the column.
fn byte_offset_to_line_col_fast(
    source: &str,
    byte_offset: usize,
    line_table: &[usize],
) -> (usize, usize) {
    let clamped = byte_offset.min(source.len());
    // Newlines strictly before the offset == zero-based line index.
    let newlines_before = line_table.partition_point(|&nl| nl < clamped);
    let line_start = match newlines_before {
        0 => 0,
        i => line_table[i - 1] + 1,
    };
    // Column counts chars, not bytes, so multi-byte UTF-8 is handled.
    let column = source[line_start..clamped].chars().count();
    (newlines_before + 1, column)
}
846
/// Reference implementation used only by tests: scans the whole prefix
/// instead of consulting a newline table (O(n) per call).
#[cfg(test)]
fn byte_offset_to_line_col(source: &str, byte_offset: usize) -> (usize, usize) {
    let clamped = byte_offset.min(source.len());
    let prefix = &source[..clamped];
    // 1-based line: one more than the number of newlines in the prefix.
    let line = 1 + prefix.bytes().filter(|&b| b == b'\n').count();
    let line_start = prefix.rfind('\n').map_or(0, |nl| nl + 1);
    (line, prefix[line_start..].chars().count())
}
857
858fn compute_stats(
860 clone_groups: &[CloneGroup],
861 total_files: usize,
862 total_lines: usize,
863 total_tokens: usize,
864) -> DuplicationStats {
865 use std::collections::HashSet;
866 use std::path::Path;
867
868 let mut files_with_clones: HashSet<&Path> = HashSet::new();
869 let mut file_dup_lines: HashMap<&Path, HashSet<usize>> = HashMap::new();
871 let mut duplicated_tokens = 0usize;
872 let mut clone_instances = 0usize;
873
874 for group in clone_groups {
875 for instance in &group.instances {
876 files_with_clones.insert(&instance.file);
877 clone_instances += 1;
878 let lines = file_dup_lines.entry(&instance.file).or_default();
879 for line in instance.start_line..=instance.end_line {
880 lines.insert(line);
881 }
882 }
883 if group.instances.len() > 1 {
886 duplicated_tokens += group.token_count * (group.instances.len() - 1);
887 }
888 }
889
890 let dup_line_count: usize = file_dup_lines.values().map(|s| s.len()).sum();
891 let duplication_percentage = if total_lines > 0 {
892 (dup_line_count as f64 / total_lines as f64) * 100.0
893 } else {
894 0.0
895 };
896
897 let duplicated_tokens = duplicated_tokens.min(total_tokens);
900
901 DuplicationStats {
902 total_files,
903 files_with_clones: files_with_clones.len(),
904 total_lines,
905 duplicated_lines: dup_line_count,
906 total_tokens,
907 duplicated_tokens,
908 clone_groups: clone_groups.len(),
909 clone_instances,
910 duplication_percentage,
911 }
912}
913
914fn empty_report(total_files: usize) -> DuplicationReport {
916 DuplicationReport {
917 clone_groups: vec![],
918 clone_families: vec![],
919 stats: DuplicationStats {
920 total_files,
921 files_with_clones: 0,
922 total_lines: 0,
923 duplicated_lines: 0,
924 total_tokens: 0,
925 duplicated_tokens: 0,
926 clone_groups: 0,
927 clone_instances: 0,
928 duplication_percentage: 0.0,
929 },
930 }
931}
932
#[cfg(test)]
mod tests {
    use super::*;
    use crate::duplicates::normalize::HashedToken;
    use crate::duplicates::tokenize::{FileTokens, SourceToken, TokenKind};
    use oxc_span::Span;

    // Builds hashed tokens whose original_index is just their position.
    fn make_hashed_tokens(hashes: &[u64]) -> Vec<HashedToken> {
        hashes
            .iter()
            .enumerate()
            .map(|(i, &hash)| HashedToken {
                hash,
                original_index: i,
            })
            .collect()
    }

    // Synthetic source tokens with non-overlapping 2-byte spans at
    // stride 3 (token i covers bytes [3i, 3i + 2)).
    fn make_source_tokens(count: usize) -> Vec<SourceToken> {
        (0..count)
            .map(|i| SourceToken {
                kind: TokenKind::Identifier(format!("t{i}")),
                span: Span::new((i * 3) as u32, (i * 3 + 2) as u32),
            })
            .collect()
    }

    // Wraps a source string and `count` synthetic tokens into FileTokens.
    fn make_file_tokens(source: &str, count: usize) -> FileTokens {
        FileTokens {
            tokens: make_source_tokens(count),
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        }
    }

    // No input files -> empty report with zero totals.
    #[test]
    fn empty_input_produces_empty_report() {
        let detector = CloneDetector::new(5, 1, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    // A lone file with all-distinct tokens has nothing to clone.
    #[test]
    fn single_file_no_clones() {
        let detector = CloneDetector::new(3, 1, false);
        let hashed = make_hashed_tokens(&[1, 2, 3, 4, 5]);
        let ft = make_file_tokens("a b c d e", 5);
        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);
        assert!(report.clone_groups.is_empty());
    }

    // Two identical files must produce at least one clone group.
    #[test]
    fn detects_exact_duplicate_across_files() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![10, 20, 30, 40, 50];
        let source_a = "a\nb\nc\nd\ne";
        let source_b = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source_a, 5);
        let ft_b = make_file_tokens(source_b, 5);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect at least one clone group"
        );
    }

    // Shared runs shorter than min_tokens are ignored.
    #[test]
    fn no_detection_below_min_tokens() {
        let detector = CloneDetector::new(10, 1, false);

        let hashes = vec![10, 20, 30];
        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens("abc", 3);
        let ft_b = make_file_tokens("abc", 3);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(report.clone_groups.is_empty());
    }

    // Reference converter: 1-based lines, 0-based columns.
    #[test]
    fn byte_offset_to_line_col_basic() {
        let source = "abc\ndef\nghi";
        assert_eq!(byte_offset_to_line_col(source, 0), (1, 0));
        assert_eq!(byte_offset_to_line_col(source, 4), (2, 0));
        assert_eq!(byte_offset_to_line_col(source, 5), (2, 1));
        assert_eq!(byte_offset_to_line_col(source, 8), (3, 0));
    }

    // Out-of-range offsets clamp to the end of the source.
    #[test]
    fn byte_offset_beyond_source() {
        let source = "abc";
        let (line, col) = byte_offset_to_line_col(source, 100);
        assert_eq!(line, 1);
        assert_eq!(col, 3);
    }

    // skip_local drops groups whose instances share one directory.
    #[test]
    fn skip_local_filters_same_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        let report = detector.detect(vec![
            (PathBuf::from("src/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            report.clone_groups.is_empty(),
            "Same-directory clones should be filtered with skip_local"
        );
    }

    // skip_local keeps groups spanning at least two directories.
    #[test]
    fn skip_local_keeps_cross_directory() {
        let detector = CloneDetector::new(3, 1, true);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 5);
        let ft_b = make_file_tokens(source, 5);

        let report = detector.detect(vec![
            (PathBuf::from("src/components/a.ts"), hashed_a, ft_a),
            (PathBuf::from("src/utils/b.ts"), hashed_b, ft_b),
        ]);

        assert!(
            !report.clone_groups.is_empty(),
            "Cross-directory clones should be kept with skip_local"
        );
    }

    // compute_stats aggregates counts across a hand-built group.
    #[test]
    fn stats_computation() {
        let groups = vec![CloneGroup {
            instances: vec![
                CloneInstance {
                    file: PathBuf::from("a.ts"),
                    start_line: 1,
                    end_line: 5,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
                CloneInstance {
                    file: PathBuf::from("b.ts"),
                    start_line: 10,
                    end_line: 14,
                    start_col: 0,
                    end_col: 10,
                    fragment: "...".to_string(),
                },
            ],
            token_count: 50,
            line_count: 5,
        }];

        let stats = compute_stats(&groups, 10, 200, 1000);
        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.files_with_clones, 2);
        assert_eq!(stats.clone_groups, 1);
        assert_eq!(stats.clone_instances, 2);
        // 5 lines in a.ts + 5 lines in b.ts = 10 duplicated lines.
        assert_eq!(stats.duplicated_lines, 10);
        assert!(stats.duplication_percentage > 0.0);
    }

    // Suffix array of a small text with a known expected ordering.
    #[test]
    fn sa_construction_basic() {
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);

        assert_eq!(sa, vec![5, 3, 1, 0, 4, 2]);
    }

    // LCP array matching the suffix array above.
    #[test]
    fn lcp_construction_basic() {
        let text: Vec<i64> = vec![1, 0, 2, 0, 2, 0];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        assert_eq!(lcp, vec![0, 1, 3, 0, 0, 2]);
    }

    // Negative sentinels must not terminate a cross-file match early:
    // the two [0, 1, 2] runs around the sentinel share an LCP of 3.
    #[test]
    fn lcp_stops_at_sentinels() {
        let text: Vec<i64> = vec![0, 1, 2, -1, 0, 1, 2];
        let sa = build_suffix_array(&text);
        let lcp = build_lcp(&text, &sa);

        let rank_0 = sa.iter().position(|&s| s == 0).expect("pos 0 in SA");
        let rank_4 = sa.iter().position(|&s| s == 4).expect("pos 4 in SA");
        let (lo, hi) = if rank_0 < rank_4 {
            (rank_0, rank_4)
        } else {
            (rank_4, rank_0)
        };

        // LCP between two SA rows is the minimum over the range between them.
        let min_lcp = lcp[(lo + 1)..=hi].iter().copied().min().unwrap_or(0);
        assert_eq!(
            min_lcp, 3,
            "LCP between identical sequences across sentinel should be 3"
        );
    }

    // Ranks are dense, first-seen-ordered, and shared across files.
    #[test]
    fn rank_reduction_maps_correctly() {
        let files = vec![
            FileData {
                path: PathBuf::from("a.ts"),
                hashed_tokens: make_hashed_tokens(&[100, 200, 300]),
                file_tokens: make_file_tokens("a b c", 3),
            },
            FileData {
                path: PathBuf::from("b.ts"),
                hashed_tokens: make_hashed_tokens(&[200, 300, 400]),
                file_tokens: make_file_tokens("d e f", 3),
            },
        ];

        let ranked = rank_reduce(&files);

        assert_eq!(ranked[0], vec![0, 1, 2]);
        assert_eq!(ranked[1], vec![1, 2, 3]);
    }

    // Three identical files should land in one 3-instance group.
    #[test]
    fn three_file_grouping() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![10, 20, 30, 40, 50];
        let source = "a\nb\nc\nd\ne";

        let data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)> = (0..3)
            .map(|i| {
                (
                    PathBuf::from(format!("file{i}.ts")),
                    make_hashed_tokens(&hashes),
                    make_file_tokens(source, 5),
                )
            })
            .collect();

        let report = detector.detect(data);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones across 3 identical files"
        );

        let max_instances = report
            .clone_groups
            .iter()
            .map(|g| g.instances.len())
            .max()
            .unwrap_or(0);
        assert_eq!(
            max_instances, 3,
            "3 identical files should produce a group with 3 instances"
        );
    }

    // Subset removal should leave the maximal 10-token clone on top.
    #[test]
    fn overlapping_clones_largest_wins() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes: Vec<u64> = (1..=10).collect();
        let source = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj";

        let hashed_a = make_hashed_tokens(&hashes);
        let hashed_b = make_hashed_tokens(&hashes);
        let ft_a = make_file_tokens(source, 10);
        let ft_b = make_file_tokens(source, 10);

        let report = detector.detect(vec![
            (PathBuf::from("a.ts"), hashed_a, ft_a),
            (PathBuf::from("b.ts"), hashed_b, ft_b),
        ]);

        assert!(!report.clone_groups.is_empty());
        // Groups are sorted by token_count descending.
        assert_eq!(
            report.clone_groups[0].token_count, 10,
            "Maximal clone should cover all 10 tokens"
        );
    }

    // An internal repeat ([1,2,3] twice) must not yield instances whose
    // line ranges overlap within the same file.
    #[test]
    fn no_self_overlap() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![1, 2, 3, 1, 2, 3];
        let source = "aa\nbb\ncc\ndd\nee\nff\ngg";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 6);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        for group in &report.clone_groups {
            let mut file_instances: HashMap<&PathBuf, Vec<(usize, usize)>> = HashMap::new();
            for inst in &group.instances {
                file_instances
                    .entry(&inst.file)
                    .or_default()
                    .push((inst.start_line, inst.end_line));
            }
            for (_file, mut ranges) in file_instances {
                ranges.sort();
                for w in ranges.windows(2) {
                    assert!(
                        w[1].0 > w[0].1,
                        "Clone instances in the same file should not overlap: {:?} and {:?}",
                        w[0],
                        w[1]
                    );
                }
            }
        }
    }

    // Zero thresholds with no input still short-circuit cleanly.
    #[test]
    fn empty_input_edge_case() {
        let detector = CloneDetector::new(0, 0, false);
        let report = detector.detect(vec![]);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    // The same file repeating [10,20,30] twice is a clone with itself.
    #[test]
    fn single_file_internal_duplication() {
        let detector = CloneDetector::new(3, 1, false);

        let hashes = vec![10, 20, 30, 99, 10, 20, 30];
        let source = "a\nb\nc\nx\na\nb\nc";

        let hashed = make_hashed_tokens(&hashes);
        let ft = make_file_tokens(source, 7);

        let report = detector.detect(vec![(PathBuf::from("a.ts"), hashed, ft)]);

        assert!(
            !report.clone_groups.is_empty(),
            "Should detect internal duplication within a single file"
        );
    }
}