mod hashing;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    detect_duplicates_with_extension, detect_type3_clones, normalize, CloneMatch, RollingHash,
    Token,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

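/// Clone classification following the standard taxonomy: Type-1 clones are
/// identical up to whitespace, Type-2 clones differ only in identifiers and
/// literals, and Type-3 clones tolerate small edits within a tolerance.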
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    #[serde(rename = "type-1")]
    Type1,
    #[serde(rename = "type-2")]
    Type2,
    #[serde(rename = "type-3")]
    Type3,
}

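/// Half-open interval overlap test: true when [start1, end1) and [start2, end2) intersect.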
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

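/// Builds an order-independent key for a match between two functions so the
/// same pair is deduplicated regardless of which function is the "source".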
fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}

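/// A reported clone: two locations, the matched token length, a similarity
/// score, a hash of the matched token window, and the classified clone type.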
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
}

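/// Per-function scan artifact: the normalized token stream plus the raw body
/// text kept around for Type-1/Type-2 classification.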
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>,
    #[allow(dead_code)]
    function_name: Option<String>,
    #[allow(dead_code)]
    start_byte: usize,
    #[allow(dead_code)]
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)]
    end_line: usize,
    tokens: Vec<Token>,
    raw_body: String,
}

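/// Snapshot of previously accepted duplicates; used to report only clones
/// introduced since the baseline was recorded.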
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    pub version: String,
    pub created_at: String,
    pub duplicates: Vec<DuplicateMatch>,
}

impl Baseline {
    pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
        Self {
            version: env!("CARGO_PKG_VERSION").to_string(),
            created_at: chrono::Utc::now().to_rfc3339(),
            duplicates,
        }
    }

    pub fn save_to_file(&self, path: &Path) -> Result<()> {
        let json =
            serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
        fs::write(path, json).context("Failed to write baseline file")?;
        Ok(())
    }

    pub fn load_from_file(path: &Path) -> Result<Self> {
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
        let baseline: Baseline =
            serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
        Ok(baseline)
    }

    pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
        let baseline_set: std::collections::HashSet<_> =
            self.duplicates.iter().map(duplicate_key).collect();

        current
            .iter()
            .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
            .cloned()
            .collect()
    }
}

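/// Order-independent identity of a match, used when diffing against a baseline.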
fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
    let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
        (
            dup.file1.clone(),
            dup.file2.clone(),
            dup.start_line1,
            dup.start_line2,
        )
    } else {
        (
            dup.file2.clone(),
            dup.file1.clone(),
            dup.start_line2,
            dup.start_line1,
        )
    };
    (file1, file2, line1, line2, dup.length)
}

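/// Result of a scan: what was analyzed, the duplicates found, and summary stats.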
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    pub files_scanned: usize,
    pub functions_analyzed: usize,
    pub duplicates: Vec<DuplicateMatch>,
    pub stats: ScanStats,
}

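/// Scan settings optionally embedded in a report.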
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    pub threshold: usize,
    pub similarity: f64,
    pub type3_enabled: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}

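/// Aggregate counters collected during a scan.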
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    pub total_lines: usize,
    pub total_tokens: usize,
    pub unique_hashes: usize,
    pub duration_ms: u64,
}

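/// Token-window clone scanner. Functions are tokenized and normalized, then
/// windows of `min_block_size` tokens are hashed; equal hashes are verified
/// token-by-token and greedily extended. Type-3 (near-miss) detection is
/// opt-in via `with_type3_detection`.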
#[allow(dead_code)]
pub struct Scanner {
    min_block_size: usize,
    similarity_threshold: f64,
    exclude_patterns: Vec<String>,
    enable_type3: bool,
    type3_tolerance: f64,
}

impl Scanner {
    pub fn new() -> Self {
        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: Self::default_exclude_patterns(),
            enable_type3: false,
            type3_tolerance: 0.85,
        }
    }

    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: Self::default_exclude_patterns(),
            enable_type3: false,
            type3_tolerance: 0.85,
        })
    }

    /// Glob patterns excluded by default: test files and test directories.
    fn default_exclude_patterns() -> Vec<String> {
        vec![
            "**/*.test.ts".to_string(),
            "**/*.test.js".to_string(),
            "**/*.test.tsx".to_string(),
            "**/*.test.jsx".to_string(),
            "**/*.spec.ts".to_string(),
            "**/*.spec.js".to_string(),
            "**/*.spec.tsx".to_string(),
            "**/*.spec.jsx".to_string(),
            "**/__tests__/**".to_string(),
            "**/*.test.py".to_string(),
        ]
    }

    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(anyhow!("Type-3 tolerance must be between 0.0 and 1.0"));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

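    /// Scans the given paths end-to-end: collect source files, extract and
    /// tokenize functions in parallel, then detect duplicates and build stats.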
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        let source_files = self.collect_source_files(paths)?;

        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        let duplicates = self.find_duplicate_hashes(&function_hashes);

        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            version: None,
            scan_time: None,
            config: None,
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

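    /// Walks the input paths, honoring .gitignore and related ignore files,
    /// and keeps only supported, non-excluded source files.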
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true)
                    .git_global(true)
                    .git_exclude(true)
                    .ignore(true)
                    .hidden(false)
                    .parents(true)
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

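    /// A file is supported if its extension maps to one of the bundled grammars.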
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

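    /// Matches the path against the configured exclusion globs.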
    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

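    /// Reads one file, extracts its functions, and tokenizes each body.
    /// Functions shorter than `min_block_size` tokens are skipped.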
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path).context(format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            let raw_body = func.body.clone();
            let tokens = normalize(&func.body);

            if tokens.len() < self.min_block_size {
                continue;
            }

            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

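    /// Maps a file extension to its tree-sitter grammar.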
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

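    /// Pairwise Type-1/Type-2 detection across all functions, followed by an
    /// optional Type-3 pass; `seen_pairs` keeps the passes from double-reporting.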
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();
        let mut seen_pairs: std::collections::HashSet<(
            &str,
            &str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )> = std::collections::HashSet::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    use std::collections::hash_map::DefaultHasher;
                    use std::hash::{Hash, Hasher};
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let actual_start1 = func1.start_line + clone_match.source_start;
                    let actual_start2 = func2.start_line + clone_match.target_start;

                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                    });
                }
            }
        }

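        // Optional second pass: collect near-miss (Type-3) candidates, then
        // deduplicate overlapping candidates before reporting.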
        if self.enable_type3 {
            let mut type3_candidates = Vec::new();

            for i in 0..function_hashes.len() {
                for j in (i + 1)..function_hashes.len() {
                    let func1 = &function_hashes[i];
                    let func2 = &function_hashes[j];

                    let type3_matches = detect_type3_clones(
                        &func1.tokens,
                        &func2.tokens,
                        self.min_block_size,
                        self.type3_tolerance,
                    );

                    for clone_match in type3_matches {
                        let pair_key = canonical_pair_key(
                            func1,
                            func2,
                            clone_match.source_start,
                            clone_match.target_start,
                            clone_match.length,
                        );

                        if seen_pairs.contains(&pair_key) {
                            continue;
                        }

                        type3_candidates.push((func1, func2, clone_match));
                    }
                }
            }

            let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

            for (func1, func2, clone_match) in deduplicated {
                let window1 = &func1.tokens
                    [clone_match.source_start..clone_match.source_start + clone_match.length];
                let window2 = &func2.tokens[clone_match.target_start
                    ..clone_match.target_start + clone_match.target_length];
                let edit_dist = hashing::compute_token_edit_distance(window1, window2);

                use std::collections::hash_map::DefaultHasher;
                use std::hash::{Hash, Hasher};
                let mut hasher = DefaultHasher::new();
                window1.hash(&mut hasher);
                let match_hash = hasher.finish();

                duplicates.push(DuplicateMatch {
                    file1: func1.file_path.to_string(),
                    file2: func2.file_path.to_string(),
                    start_line1: func1.start_line,
                    start_line2: func2.start_line,
                    length: clone_match.length,
                    similarity: clone_match.similarity,
                    hash: match_hash,
                    clone_type: CloneType::Type3,
                    edit_distance: Some(edit_dist),
                });
            }
        }

        duplicates
    }

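    /// Greedy deduplication of Type-3 candidates: among overlapping matches for
    /// the same function pair, keep the longest span (similarity breaks ties).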
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true;
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

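    /// Whitespace-insensitive comparison of raw bodies: identical text is
    /// Type-1, anything else that matched on normalized tokens is Type-2.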
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            CloneType::Type2
        }
    }

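    /// Rabin-Karp style matching between two token streams: hash every window
    /// in `func1`, probe with windows from `func2`, verify on hash hits, and
    /// greedily extend each verified match.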
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        // Guard: a window of `min_block_size` tokens cannot be sliced out of a
        // shorter token stream, so bail out before indexing would panic.
        if func1.tokens.len() < self.min_block_size || func2.tokens.len() < self.min_block_size {
            return Vec::new();
        }

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    if self.verify_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        let mut extension = 0;
                        while (func1_pos + self.min_block_size + extension < func1.tokens.len())
                            && (j + self.min_block_size + extension < func2.tokens.len())
                            && (func1.tokens[func1_pos + self.min_block_size + extension]
                                == func2.tokens[j + self.min_block_size + extension])
                        {
                            extension += 1;
                        }

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0,
                        });

                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

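    /// Polynomial hash of a token window (base 257, modulo 1e9+7), accumulated
    /// in u128 so the intermediate product cannot overflow.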
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        const BASE: u64 = 257;
        const MODULUS: u64 = 1_000_000_007;

        let mut hash: u64 = 0;
        for token in window {
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            let token_hash = hasher.finish();
            let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
            hash = wide_hash as u64;
        }
        hash
    }

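    /// Exact token-by-token comparison to rule out hash collisions.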
    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
            return false;
        }
        tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new()
    }
}

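/// Scans `paths` with default settings.
///
/// A minimal usage sketch; the crate name `duplicate_scanner` is a placeholder
/// for this crate's actual name:
///
/// ```ignore
/// let report = duplicate_scanner::find_duplicates(vec!["src".to_string()])?;
/// for dup in &report.duplicates {
///     println!("{}:{} ~ {}:{}", dup.file1, dup.start_line1, dup.file2, dup.start_line2);
/// }
/// ```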
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

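/// Like [`find_duplicates`], but with a caller-supplied window size and
/// similarity threshold.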
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new();
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        fn make_function(
            file: &str,
            start_line: usize,
            tokens: Vec<Token>,
            raw_body: &str,
        ) -> FunctionHash {
            FunctionHash {
                file_path: Arc::<str>::from(file),
                function_name: None,
                start_byte: 0,
                end_byte: 0,
                start_line,
                end_line: start_line + tokens.len(),
                tokens,
                raw_body: raw_body.to_string(),
            }
        }

        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}