mod hashing;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    detect_duplicates_with_extension, detect_type3_clones, normalize, CloneMatch, RollingHash,
    Token,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    #[serde(rename = "type-1")]
    Type1,
    #[serde(rename = "type-2")]
    Type2,
    #[serde(rename = "type-3")]
    Type3,
}

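/// Half-open interval overlap test: `[start1, end1)` and `[start2, end2)`
/// overlap iff each range starts before the other ends. For example,
/// `(0, 5)` and `(4, 9)` overlap, while `(0, 5)` and `(5, 9)` do not.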
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

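/// Builds a deterministic key for a clone between two functions by ordering
/// the tuple on file path, so the same clone discovered in either direction
/// deduplicates to a single entry.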
fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
}

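/// A parsed function's normalized token stream plus enough provenance
/// (file path, line numbers, byte offsets) to report matches against the
/// original source.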
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>,
    #[allow(dead_code)]
    function_name: Option<String>,
    #[allow(dead_code)]
    start_byte: usize,
    #[allow(dead_code)]
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)]
    end_line: usize,
    tokens: Vec<Token>,
    raw_body: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    pub files_scanned: usize,
    pub functions_analyzed: usize,
    pub duplicates: Vec<DuplicateMatch>,
    pub stats: ScanStats,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    pub total_lines: usize,
    pub total_tokens: usize,
    pub unique_hashes: usize,
    pub duration_ms: u64,
}

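/// Orchestrates file discovery, function extraction, tokenization, and clone
/// detection. A minimal usage sketch, with error handling elided (marked
/// `ignore` because the enclosing crate path is assumed):
///
/// ```ignore
/// let report = Scanner::new()
///     .with_type3_detection(0.85)?
///     .scan(vec![PathBuf::from("src")])?;
/// println!("{} duplicates found", report.duplicates.len());
/// ```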
#[allow(dead_code)]
pub struct Scanner {
    min_block_size: usize,
    similarity_threshold: f64,
    exclude_patterns: Vec<String>,
    enable_type3: bool,
    type3_tolerance: f64,
}

impl Scanner {
    pub fn new() -> Self {
        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: vec![
                "**/*.test.ts".to_string(),
                "**/*.test.js".to_string(),
                "**/*.test.tsx".to_string(),
                "**/*.test.jsx".to_string(),
                "**/*.spec.ts".to_string(),
                "**/*.spec.js".to_string(),
                "**/*.spec.tsx".to_string(),
                "**/*.spec.jsx".to_string(),
                "**/__tests__/**".to_string(),
                "**/*.test.py".to_string(),
            ],
            enable_type3: false,
            type3_tolerance: 0.85,
        }
    }

    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: vec![
                "**/*.test.ts".to_string(),
                "**/*.test.js".to_string(),
                "**/*.test.tsx".to_string(),
                "**/*.test.jsx".to_string(),
                "**/*.spec.ts".to_string(),
                "**/*.spec.js".to_string(),
                "**/*.spec.tsx".to_string(),
                "**/*.spec.jsx".to_string(),
                "**/__tests__/**".to_string(),
                "**/*.test.py".to_string(),
            ],
            enable_type3: false,
            type3_tolerance: 0.85,
        })
    }

    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(anyhow!("Type-3 tolerance must be between 0.0 and 1.0"));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

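    /// Runs the full pipeline: collect candidate files (respecting ignore
    /// rules), extract and tokenize functions in parallel via rayon, detect
    /// duplicate blocks, and assemble summary statistics.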
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        let source_files = self.collect_source_files(paths)?;

        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        let duplicates = self.find_duplicate_hashes(&function_hashes);

        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

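    /// Walks directories with the `ignore` crate so `.gitignore`, global git
    /// excludes, and `.ignore` files are honored; explicit file paths are
    /// accepted directly when supported and not excluded.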
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true)
                    .git_global(true)
                    .git_exclude(true)
                    .ignore(true)
                    .hidden(false)
                    .parents(true)
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path).context(format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            let raw_body = func.body.clone();
            let tokens = normalize(&func.body);

            if tokens.len() < self.min_block_size {
                continue;
            }

            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

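    /// Two passes: an exact-token pass that compares every function pair via
    /// windowed hashing and classifies hits as Type-1/Type-2 from the raw
    /// bodies, then an optional Type-3 pass whose near-miss candidates are
    /// deduplicated before token edit distances are computed.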
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();
        let mut seen_pairs: std::collections::HashSet<(
            &str,
            &str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )> = std::collections::HashSet::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    use std::collections::hash_map::DefaultHasher;
                    use std::hash::{Hash, Hasher};
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let actual_start1 = func1.start_line + clone_match.source_start;
                    let actual_start2 = func2.start_line + clone_match.target_start;

                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                    });
                }
            }
        }

        if self.enable_type3 {
            let mut type3_candidates = Vec::new();

            for i in 0..function_hashes.len() {
                for j in (i + 1)..function_hashes.len() {
                    let func1 = &function_hashes[i];
                    let func2 = &function_hashes[j];

                    let type3_matches = detect_type3_clones(
                        &func1.tokens,
                        &func2.tokens,
                        self.min_block_size,
                        self.type3_tolerance,
                    );

                    for clone_match in type3_matches {
                        let pair_key = canonical_pair_key(
                            func1,
                            func2,
                            clone_match.source_start,
                            clone_match.target_start,
                            clone_match.length,
                        );

                        if seen_pairs.contains(&pair_key) {
                            continue;
                        }

                        type3_candidates.push((func1, func2, clone_match));
                    }
                }
            }

            let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

            for (func1, func2, clone_match) in deduplicated {
                let window1 = &func1.tokens
                    [clone_match.source_start..clone_match.source_start + clone_match.length];
                let window2 = &func2.tokens[clone_match.target_start
                    ..clone_match.target_start + clone_match.target_length];
                let edit_dist = hashing::compute_token_edit_distance(window1, window2);

                use std::collections::hash_map::DefaultHasher;
                use std::hash::{Hash, Hasher};
                let mut hasher = DefaultHasher::new();
                window1.hash(&mut hasher);
                let match_hash = hasher.finish();

                duplicates.push(DuplicateMatch {
                    file1: func1.file_path.to_string(),
                    file2: func2.file_path.to_string(),
                    start_line1: func1.start_line,
                    start_line2: func2.start_line,
                    length: clone_match.length,
                    similarity: clone_match.similarity,
                    hash: match_hash,
                    clone_type: CloneType::Type3,
                    edit_distance: Some(edit_dist),
                });
            }
        }

        duplicates
    }

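    /// Greedy overlap resolution: for each unclaimed candidate, repeatedly
    /// absorb any later candidate from the same function pair whose source
    /// and target ranges both overlap the current best, keeping the longest
    /// span (ties broken by higher similarity) and discarding the rest.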
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true;
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

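    /// Heuristic classification: if the two raw bodies are identical once all
    /// whitespace is stripped (e.g. `fn f(){x}` vs. `fn f() { x }`), the clone
    /// is Type-1; otherwise the token-level match implies only identifier or
    /// literal changes, i.e. Type-2.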
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            CloneType::Type2
        }
    }

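    /// Hash-indexed window matching between two token streams: index every
    /// `min_block_size` window of `func1` by hash, probe `func2`'s windows
    /// against that index, verify token equality on hash hits to rule out
    /// collisions, and greedily extend each verified match token by token.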
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    if self.verify_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        let mut extension = 0;
                        while (func1_pos + self.min_block_size + extension < func1.tokens.len())
                            && (j + self.min_block_size + extension < func2.tokens.len())
                            && (func1.tokens[func1_pos + self.min_block_size + extension]
                                == func2.tokens[j + self.min_block_size + extension])
                        {
                            extension += 1;
                        }

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0,
                        });

                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

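    /// Polynomial hash of a token window, folded left to right as
    /// `h = (h * BASE + token_hash) mod MODULUS`, which expands to
    /// `sum(t_i * BASE^(n-1-i)) mod MODULUS`. The multiply is widened to
    /// `u128` so `hash * 257 + token_hash` cannot overflow before reduction.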
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        const BASE: u64 = 257;
        const MODULUS: u64 = 1_000_000_007;

        let mut hash: u64 = 0;
        for token in window {
            use std::collections::hash_map::DefaultHasher;
            use std::hash::{Hash, Hasher};
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            let token_hash = hasher.finish();
            let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
            hash = wide_hash as u64;
        }
        hash
    }

    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
            return false;
        }
        tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new()
    }
}

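/// Convenience entry point that scans `paths` with default settings. A call
/// sketch (marked `ignore` because the enclosing crate path is assumed):
///
/// ```ignore
/// let report = find_duplicates(vec!["src".to_string()])?;
/// for dup in &report.duplicates {
///     println!("{}:{} ~ {}:{}", dup.file1, dup.start_line1, dup.file2, dup.start_line2);
/// }
/// ```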
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new();
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        fn make_function(
            file: &str,
            start_line: usize,
            tokens: Vec<Token>,
            raw_body: &str,
        ) -> FunctionHash {
            FunctionHash {
                file_path: Arc::<str>::from(file),
                function_name: None,
                start_byte: 0,
                end_byte: 0,
                start_line,
                end_line: start_line + tokens.len(),
                tokens,
                raw_body: raw_body.to_string(),
            }
        }

        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}