mod directives;
mod hashing;
mod ignore_rules;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

pub use directives::{detect_directives, detect_directives_in_file, Directive, FileDirectives};
pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    compute_window_hash, detect_duplicates_with_extension, detect_type3_clones, extend_match,
    normalize, normalize_with_line_numbers, verify_cross_window_match, CloneMatch, RollingHash,
    Token,
};
pub use ignore_rules::{
    compute_duplicate_id, compute_symmetric_duplicate_id, FileRange, IgnoreEntry, IgnoreManager,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;
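/// Classification of a detected clone, following the usual clone-detection
/// taxonomy as used by this scanner:
///
/// * `Type1` - textually identical code apart from whitespace and layout.
/// * `Type2` - structurally identical code where identifiers or literals differ.
/// * `Type3` - near-miss clones with small insertions, deletions, or edits,
///   reported only when Type-3 detection is enabled.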
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    #[serde(rename = "type-1")]
    Type1,
    #[serde(rename = "type-2")]
    Type2,
    #[serde(rename = "type-3")]
    Type3,
}
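/// Returns `true` when the half-open token ranges `[start1, end1)` and
/// `[start2, end2)` overlap.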
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}
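/// A single detected clone pair between two locations in the scanned code,
/// as emitted in reports and baselines.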
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    #[serde(skip)]
    pub end_line1: Option<usize>,
    #[serde(skip)]
    pub end_line2: Option<usize>,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suppressed_by_directive: Option<bool>,
    #[serde(skip)]
    token_offset1: Option<usize>,
    #[serde(skip)]
    token_offset2: Option<usize>,
    #[serde(skip)]
    target_length: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duplicate_id: Option<String>,
}

#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>,
    #[allow(dead_code)]
    function_name: Option<String>,
    #[allow(dead_code)]
    start_byte: usize,
    #[allow(dead_code)]
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)]
    end_line: usize,
    tokens: Vec<Token>,
    token_line_offsets: Vec<usize>,
    raw_body: String,
}
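/// A saved snapshot of known duplicates, used to report only clones that are
/// new relative to an earlier scan.
///
/// A minimal usage sketch (the file name below is illustrative, not part of
/// the API):
///
/// ```ignore
/// let report = find_duplicates(vec!["src".to_string()])?;
/// let baseline = Baseline::from_duplicates(report.duplicates);
/// baseline.save_to_file(Path::new("dupes-baseline.json"))?;
///
/// // Later: compare a fresh scan against the stored baseline.
/// let baseline = Baseline::load_from_file(Path::new("dupes-baseline.json"))?;
/// let current = find_duplicates(vec!["src".to_string()])?;
/// let new_dupes = baseline.find_new_duplicates(&current.duplicates);
/// ```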
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    pub version: String,
    pub created_at: String,
    pub duplicates: Vec<DuplicateMatch>,
}

impl Baseline {
    pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
        Self {
            version: env!("CARGO_PKG_VERSION").to_string(),
            created_at: chrono::Utc::now().to_rfc3339(),
            duplicates,
        }
    }

    pub fn save_to_file(&self, path: &Path) -> Result<()> {
        let json =
            serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
        fs::write(path, json).context("Failed to write baseline file")?;
        Ok(())
    }

    pub fn load_from_file(path: &Path) -> Result<Self> {
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
        let baseline: Baseline =
            serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
        Ok(baseline)
    }

    pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
        let baseline_set: std::collections::HashSet<_> =
            self.duplicates.iter().map(duplicate_key).collect();

        current
            .iter()
            .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
            .cloned()
            .collect()
    }
}

fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
    let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
        (
            dup.file1.clone(),
            dup.file2.clone(),
            dup.start_line1,
            dup.start_line2,
        )
    } else {
        (
            dup.file2.clone(),
            dup.file1.clone(),
            dup.start_line2,
            dup.start_line1,
        )
    };
    (file1, file2, line1, line2, dup.length)
}
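/// The result of a scan: the detected duplicates plus summary statistics.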
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    pub files_scanned: usize,
    pub functions_analyzed: usize,
    pub duplicates: Vec<DuplicateMatch>,
    pub stats: ScanStats,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    pub threshold: usize,
    pub similarity: f64,
    pub type3_enabled: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    pub total_lines: usize,
    pub total_tokens: usize,
    pub unique_hashes: usize,
    pub duration_ms: u64,
}
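/// Token-based duplicate-code scanner with a builder-style configuration API.
///
/// A minimal sketch of how the builder methods below compose (paths and
/// thresholds are illustrative):
///
/// ```ignore
/// let report = Scanner::with_config(50, 0.85)?
///     .with_type3_detection(0.85)?
///     .with_directives(true)
///     .scan(vec![PathBuf::from("src")])?;
/// println!("{} duplicates found", report.duplicates.len());
/// ```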
#[allow(dead_code)]
pub struct Scanner {
    min_block_size: usize,
    similarity_threshold: f64,
    exclude_patterns: Vec<String>,
    enable_type3: bool,
    type3_tolerance: f64,
    ignore_manager: Option<IgnoreManager>,
    enable_directives: bool,
    include_tests: bool,
}
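/// Glob patterns for test files that are excluded by default; see
/// `Scanner::with_test_files` to opt back in.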
fn default_exclude_patterns() -> Vec<String> {
    vec![
        "**/*.test.ts".to_string(),
        "**/*.test.js".to_string(),
        "**/*.test.tsx".to_string(),
        "**/*.test.jsx".to_string(),
        "**/*.spec.ts".to_string(),
        "**/*.spec.js".to_string(),
        "**/*.spec.tsx".to_string(),
        "**/*.spec.jsx".to_string(),
        "**/__tests__/**".to_string(),
        "**/*.test.py".to_string(),
    ]
}
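/// Glob patterns for build artifacts and VCS metadata that are excluded by
/// default.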
fn build_artifact_patterns() -> Vec<String> {
    vec![
        "**/node_modules/**".to_string(),
        "**/target/**".to_string(),
        "**/dist/**".to_string(),
        "**/build/**".to_string(),
        "**/.git/**".to_string(),
    ]
}

impl Scanner {
    pub fn new() -> Self {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        }
    }

    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        })
    }

    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    pub fn with_test_files(mut self, include: bool) -> Self {
        self.include_tests = include;
        if include {
            let test_patterns = default_exclude_patterns();
            self.exclude_patterns.retain(|p| !test_patterns.contains(p));
        }
        self
    }

    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(anyhow!("Type-3 tolerance must be between 0.0 and 1.0"));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

    pub fn with_ignore_manager(mut self, manager: IgnoreManager) -> Self {
        self.ignore_manager = Some(manager);
        self
    }

    pub fn with_directives(mut self, enabled: bool) -> Self {
        self.enable_directives = enabled;
        self
    }
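    /// Runs the full pipeline over `paths`: collect supported source files,
    /// extract and tokenize functions, detect Type-1/2 (and optionally
    /// Type-3) clones, apply suppression directives and ignore rules, and
    /// assemble a `Report`.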
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        let source_files = self.collect_source_files(paths.clone())?;

        let directives_map: HashMap<PathBuf, crate::directives::FileDirectives> =
            if self.enable_directives {
                source_files
                    .par_iter()
                    .filter_map(|path| {
                        crate::directives::detect_directives_in_file(path)
                            .ok()
                            .map(|d| (path.clone(), d))
                    })
                    .collect()
            } else {
                HashMap::new()
            };

        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        let mut duplicates = self.find_duplicate_hashes(&function_hashes);

        if self.enable_directives && !directives_map.is_empty() {
            for dup in &mut duplicates {
                let suppressed =
                    self.is_suppressed_by_directive(dup, &directives_map, &function_hashes);
                if suppressed {
                    dup.suppressed_by_directive = Some(true);
                }
            }

            duplicates.retain(|dup| dup.suppressed_by_directive != Some(true));
        }

        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            version: None,
            scan_time: None,
            config: None,
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true)
                    .git_global(true)
                    .git_exclude(true)
                    .ignore(true)
                    .hidden(false)
                    .parents(true)
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path).context(format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            let raw_body = func.body.clone();
            let (tokens, token_line_offsets) = normalize_with_line_numbers(&func.body);

            if tokens.len() < self.min_block_size {
                continue;
            }

            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                token_line_offsets,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

    fn compute_line_span(
        &self,
        func: &FunctionHash,
        start_offset: usize,
        length: usize,
    ) -> (usize, usize) {
        let start_line = func
            .token_line_offsets
            .get(start_offset)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + start_offset);

        let end_index = start_offset + length.saturating_sub(1);
        let end_line = func
            .token_line_offsets
            .get(end_index)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + end_index);

        (start_line, end_line)
    }
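    /// Core detection pass over the extracted functions: exact/renamed
    /// (Type-1/2) matches first, then optional Type-3 matches, followed by
    /// duplicate-ID computation and ignore filtering.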
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

        let mut seen_pairs: std::collections::HashSet<SeenPairKey<'_>> =
            std::collections::HashSet::new();

        let mut duplicates = self.find_type12_duplicates(function_hashes, &mut seen_pairs);

        if self.enable_type3 {
            self.find_type3_duplicates(function_hashes, &seen_pairs, &mut duplicates);
        }

        self.compute_duplicate_ids(function_hashes, &mut duplicates);

        self.filter_ignored_duplicates(&mut duplicates);

        duplicates
    }
    fn find_type12_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &mut std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
    ) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    let match_hash = Self::compute_match_hash(
                        &func1.tokens[clone_match.source_start
                            ..clone_match.source_start + clone_match.length],
                    );

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) =
                        self.compute_line_span(func2, clone_match.target_start, clone_match.length);

                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.length),
                        duplicate_id: None,
                    });
                }
            }
        }

        duplicates
    }
    fn find_type3_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
        duplicates: &mut Vec<DuplicateMatch>,
    ) {
        let mut type3_candidates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let type3_matches = detect_type3_clones(
                    &func1.tokens,
                    &func2.tokens,
                    self.min_block_size,
                    self.type3_tolerance,
                );

                for clone_match in type3_matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }

                    type3_candidates.push((func1, func2, clone_match));
                }
            }
        }

        let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

        for (func1, func2, clone_match) in deduplicated {
            let (actual_start1, actual_end1) =
                self.compute_line_span(func1, clone_match.source_start, clone_match.length);
            let (actual_start2, actual_end2) =
                self.compute_line_span(func2, clone_match.target_start, clone_match.target_length);

            let window1 = &func1.tokens
                [clone_match.source_start..clone_match.source_start + clone_match.length];
            let window2 = &func2.tokens
                [clone_match.target_start..clone_match.target_start + clone_match.target_length];
            let edit_dist = hashing::compute_token_edit_distance(window1, window2);

            let match_hash = Self::compute_match_hash(window1);

            duplicates.push(DuplicateMatch {
                file1: func1.file_path.to_string(),
                file2: func2.file_path.to_string(),
                start_line1: actual_start1,
                start_line2: actual_start2,
                end_line1: Some(actual_end1),
                end_line2: Some(actual_end2),
                length: clone_match.length,
                similarity: clone_match.similarity,
                hash: match_hash,
                clone_type: CloneType::Type3,
                edit_distance: Some(edit_dist),
                suppressed_by_directive: None,
                token_offset1: Some(clone_match.source_start),
                token_offset2: Some(clone_match.target_start),
                target_length: Some(clone_match.target_length),
                duplicate_id: None,
            });
        }
    }
    fn compute_duplicate_ids(
        &self,
        function_hashes: &[FunctionHash],
        duplicates: &mut [DuplicateMatch],
    ) {
        for dup in duplicates.iter_mut() {
            if dup.duplicate_id.is_some() {
                continue;
            }

            let tokens1 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file1,
                dup.start_line1,
                dup.token_offset1,
                dup.length,
            );

            let tokens2 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file2,
                dup.start_line2,
                dup.token_offset2,
                dup.target_length.unwrap_or(dup.length),
            );

            if let Some(tokens1) = tokens1 {
                let id = if let Some(tokens2) = tokens2 {
                    ignore_rules::compute_symmetric_duplicate_id(&tokens1, &tokens2)
                } else {
                    ignore_rules::compute_duplicate_id(&tokens1)
                };
                dup.duplicate_id = Some(id);
            }
        }
    }

    fn extract_duplicate_tokens(
        &self,
        function_hashes: &[FunctionHash],
        file: &str,
        reported_start: usize,
        token_offset: Option<usize>,
        length: usize,
    ) -> Option<Vec<String>> {
        let token_offset = token_offset?;
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file
                    && fh.start_line <= reported_start
                    && reported_start <= fh.end_line
            })
            .and_then(|fh| {
                if token_offset + length <= fh.tokens.len() {
                    Some(
                        fh.tokens
                            .iter()
                            .skip(token_offset)
                            .take(length)
                            .map(|t| t.as_hash_string().to_string())
                            .collect(),
                    )
                } else {
                    None
                }
            })
    }
    fn filter_ignored_duplicates(&self, duplicates: &mut Vec<DuplicateMatch>) {
        if let Some(ref ignore_manager) = self.ignore_manager {
            duplicates.retain(|dup| {
                if let Some(ref id) = dup.duplicate_id {
                    !ignore_manager.is_ignored(id)
                } else {
                    true
                }
            });
        }
    }

    fn compute_match_hash(tokens: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        tokens.hash(&mut hasher);
        hasher.finish()
    }
    fn is_suppressed_by_directive(
        &self,
        dup: &DuplicateMatch,
        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
        function_hashes: &[FunctionHash],
    ) -> bool {
        let file1_path = PathBuf::from(&dup.file1);
        let file2_path = PathBuf::from(&dup.file2);

        if let Some(directives) = directives_map.get(&file1_path) {
            let func_start =
                self.find_owning_function_start(&dup.file1, dup.start_line1, function_hashes);
            let check_line = func_start.unwrap_or(dup.start_line1);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        if let Some(directives) = directives_map.get(&file2_path) {
            let func_start =
                self.find_owning_function_start(&dup.file2, dup.start_line2, function_hashes);
            let check_line = func_start.unwrap_or(dup.start_line2);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        false
    }

    fn find_owning_function_start(
        &self,
        file: &str,
        line: usize,
        function_hashes: &[FunctionHash],
    ) -> Option<usize> {
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file && fh.start_line <= line && line <= fh.end_line
            })
            .map(|fh| fh.start_line)
    }
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true;
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            CloneType::Type2
        }
    }
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    if hashing::verify_cross_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        let extension = hashing::extend_match(
                            &func1.tokens,
                            &func2.tokens,
                            func1_pos,
                            j,
                            self.min_block_size,
                        );

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0,
                        });

                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new()
    }
}
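/// Convenience entry point that scans `paths` with the default `Scanner`
/// configuration.
///
/// A minimal sketch (the path is illustrative):
///
/// ```ignore
/// let report = find_duplicates(vec!["src".to_string()])?;
/// for dup in &report.duplicates {
///     println!("{}:{} ~ {}:{}", dup.file1, dup.start_line1, dup.file2, dup.start_line2);
/// }
/// ```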
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}
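/// Like [`find_duplicates`], but with an explicit minimum block size (in
/// tokens) and similarity threshold.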
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}
#[cfg(test)]
mod tests {
    use super::*;

    fn make_test_function(
        file: &str,
        start_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line: start_line + tokens.len(),
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    fn make_test_function_same_line(
        file: &str,
        start_line: usize,
        end_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = vec![0; tokens.len()];
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line,
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    fn make_expr_tokens(keyword: &str, op: &str) -> Vec<Token> {
        vec![
            Token::Keyword(keyword.into()),
            Token::Identifier,
            Token::Operator(op.into()),
            Token::Identifier,
            Token::Punctuation(";".into()),
        ]
    }
    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new();
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        fn make_function(
            file: &str,
            start_line: usize,
            tokens: Vec<Token>,
            raw_body: &str,
        ) -> FunctionHash {
            let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
            FunctionHash {
                file_path: Arc::<str>::from(file),
                function_name: None,
                start_byte: 0,
                end_byte: 0,
                start_line,
                end_line: start_line + tokens.len(),
                tokens,
                token_line_offsets,
                raw_body: raw_body.to_string(),
            }
        }

        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }
    #[test]
    fn test_type3_reports_token_offsets_in_start_lines() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let functions = vec![
            make_test_function_same_line(
                "file_a.rs",
                100,
                105,
                make_expr_tokens("let", "+"),
                "let a = b + c;",
            ),
            make_test_function_same_line(
                "file_b.rs",
                200,
                205,
                make_expr_tokens("mut", "-"),
                "let a = b - c;",
            ),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type3 = duplicates
            .iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .expect("expected a Type-3 duplicate match");

        assert_eq!(
            type3.start_line1, 100,
            "should report the actual source line even when tokens share a line"
        );
        assert_eq!(
            type3.start_line2, 200,
            "should report the actual target line even when tokens share a line"
        );
        assert_eq!(type3.token_offset1, Some(1));
        assert_eq!(type3.token_offset2, Some(1));
    }

    #[test]
    fn type3_duplicate_ids_are_symmetric() {
        use tempfile::TempDir;

        let tokens_a = make_expr_tokens("let", "+");
        let mut tokens_b = make_expr_tokens("let", "-");
        tokens_b.push(Token::Identifier);

        let func_a = make_test_function("file_a.rs", 10, tokens_a.clone(), "fn file_a.rs() {}");
        let func_b = make_test_function("file_b.rs", 20, tokens_b.clone(), "fn file_b.rs() {}");

        let temp_dir = TempDir::new().unwrap();
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap()
            .with_ignore_manager(IgnoreManager::new(temp_dir.path()));

        let forward = scanner.find_duplicate_hashes(&[func_a.clone(), func_b.clone()]);
        let reverse = scanner.find_duplicate_hashes(&[func_b, func_a]);

        let id_forward = forward
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        let id_reverse = reverse
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        assert_eq!(
            id_forward, id_reverse,
            "Type-3 IDs should not depend on function order"
        );
    }
    #[test]
    fn duplicate_matches_store_actual_end_lines() {
        let scanner = Scanner::with_config(2, 0.85).unwrap();

        let tokens = vec![
            Token::Keyword("fn".into()),
            Token::Identifier,
            Token::Identifier,
            Token::Punctuation("{".into()),
            Token::Punctuation("}".into()),
        ];

        let func1 = FunctionHash {
            file_path: Arc::<str>::from("file_a.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 10,
            end_line: 14,
            tokens: tokens.clone(),
            token_line_offsets: vec![0, 0, 1, 1, 2],
            raw_body: "fn a() {}".to_string(),
        };

        let func2 = FunctionHash {
            file_path: Arc::<str>::from("file_b.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 20,
            end_line: 24,
            tokens,
            token_line_offsets: vec![0, 1, 1, 2, 2],
            raw_body: "fn b() {}".to_string(),
        };

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);
        let dup = duplicates.first().expect("expected a duplicate match");

        assert_eq!(dup.start_line1, 10);
        assert_eq!(dup.start_line2, 20);
        assert_eq!(dup.end_line1, Some(12));
        assert_eq!(dup.end_line2, Some(22));
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}