//! Cross-language duplicate-code detection: parses source files with
//! tree-sitter, tokenizes function bodies, and reports Type-1/2/3 clones.

mod directives;
mod error;
mod hashing;
mod ignore_rules;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

pub use directives::{detect_directives, detect_directives_in_file, Directive, FileDirectives};
pub use error::{PolyDupError, Result};
pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    compute_window_hash, detect_duplicates_with_extension, detect_type3_clones, extend_match,
    normalize, normalize_with_line_numbers, verify_cross_window_match, CloneMatch, RollingHash,
    Token,
};
pub use ignore_rules::{
    compute_duplicate_id, compute_symmetric_duplicate_id, FileRange, IgnoreEntry, IgnoreManager,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::Context;
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

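/// Clone classification used in reports: Type-1 clones are textually
/// identical apart from whitespace, Type-2 clones match after token
/// normalization (e.g. renamed identifiers or literals), and Type-3 clones
/// are near-misses that tolerate small edits.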
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    #[serde(rename = "type-1")]
    Type1,
    #[serde(rename = "type-2")]
    Type2,
    #[serde(rename = "type-3")]
    Type3,
}

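/// Returns `true` when the half-open ranges `[start1, end1)` and
/// `[start2, end2)` overlap.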
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

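/// Builds an order-normalized key (ordered by file path) for a specific match
/// between two functions, so the same match is only recorded once.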
fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}

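/// A reported duplicate between two code locations: files, line spans, token
/// length, similarity, and clone type. Fields marked `#[serde(skip)]` are
/// internal bookkeeping and never serialized.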
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line1: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line2: Option<usize>,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suppressed_by_directive: Option<bool>,
    #[serde(skip)]
    token_offset1: Option<usize>,
    #[serde(skip)]
    token_offset2: Option<usize>,
    #[serde(skip)]
    target_length: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duplicate_id: Option<String>,
}

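/// Tokenized body of a single function plus its location metadata, used
/// internally while matching token windows between functions.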
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>,
    #[allow(dead_code)]
    function_name: Option<String>,
    #[allow(dead_code)]
    start_byte: usize,
    #[allow(dead_code)]
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)]
    end_line: usize,
    tokens: Vec<Token>,
    token_line_offsets: Vec<usize>,
    raw_body: String,
}

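/// A saved snapshot of known duplicates; later scans can diff against it to
/// report only newly introduced duplicates.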
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    pub version: String,
    pub created_at: String,
    pub duplicates: Vec<DuplicateMatch>,
}

impl Baseline {
    pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
        Self {
            version: env!("CARGO_PKG_VERSION").to_string(),
            created_at: chrono::Utc::now().to_rfc3339(),
            duplicates,
        }
    }

    pub fn save_to_file(&self, path: &Path) -> Result<()> {
        let json =
            serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
        fs::write(path, json).context("Failed to write baseline file")?;
        Ok(())
    }

    pub fn load_from_file(path: &Path) -> Result<Self> {
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
        let baseline: Baseline =
            serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
        Ok(baseline)
    }

    pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
        let baseline_set: std::collections::HashSet<_> =
            self.duplicates.iter().map(duplicate_key).collect();

        current
            .iter()
            .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
            .cloned()
            .collect()
    }
}

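/// Order-independent identity of a duplicate (sorted files and start lines
/// plus length), used when comparing against a baseline.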
fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
    let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
        (
            dup.file1.clone(),
            dup.file2.clone(),
            dup.start_line1,
            dup.start_line2,
        )
    } else {
        (
            dup.file2.clone(),
            dup.file1.clone(),
            dup.start_line2,
            dup.start_line1,
        )
    };
    (file1, file2, line1, line2, dup.length)
}

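/// Result of a scan: optional metadata, counters, the duplicate list, and
/// aggregate statistics.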
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    pub files_scanned: usize,
    pub functions_analyzed: usize,
    pub duplicates: Vec<DuplicateMatch>,
    pub stats: ScanStats,
}

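/// Scanner settings optionally embedded in a report for reproducibility.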
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    pub threshold: usize,
    pub similarity: f64,
    pub type3_enabled: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}

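/// Aggregate counters gathered during a scan.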
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    pub total_lines: usize,
    pub total_tokens: usize,
    pub unique_hashes: usize,
    pub duration_ms: u64,
}

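/// Token-window duplicate-code scanner with configurable block size,
/// similarity threshold, exclude globs, and optional Type-3 detection.
///
/// A minimal usage sketch (the crate name `polydup` and the scanned path are
/// assumptions for illustration):
///
/// ```ignore
/// use polydup::Scanner;
/// use std::path::PathBuf;
///
/// let scanner = Scanner::with_config(50, 0.85).expect("valid config");
/// let report = scanner.scan(vec![PathBuf::from("src")]).expect("scan succeeds");
/// println!("{} duplicates across {} files", report.duplicates.len(), report.files_scanned);
/// ```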
#[allow(dead_code)]
pub struct Scanner {
    min_block_size: usize,
    similarity_threshold: f64,
    exclude_patterns: Vec<String>,
    enable_type3: bool,
    type3_tolerance: f64,
    ignore_manager: Option<IgnoreManager>,
    enable_directives: bool,
    include_tests: bool,
}

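/// Glob patterns for test files that are excluded by default; they can be
/// re-included via `Scanner::with_test_files(true)`.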
fn default_exclude_patterns() -> Vec<String> {
    vec![
        "**/*.test.ts".to_string(),
        "**/*.test.js".to_string(),
        "**/*.test.tsx".to_string(),
        "**/*.test.jsx".to_string(),
        "**/*.spec.ts".to_string(),
        "**/*.spec.js".to_string(),
        "**/*.spec.tsx".to_string(),
        "**/*.spec.jsx".to_string(),
        "**/__tests__/**".to_string(),
        "**/*.test.py".to_string(),
    ]
}

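/// Glob patterns for build artifacts and VCS directories that are always
/// excluded.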
fn build_artifact_patterns() -> Vec<String> {
    vec![
        "**/node_modules/**".to_string(),
        "**/target/**".to_string(),
        "**/dist/**".to_string(),
        "**/build/**".to_string(),
        "**/.git/**".to_string(),
    ]
}

impl Scanner {
    pub fn new() -> Self {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        }
    }

    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        })
    }

    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    pub fn with_test_files(mut self, include: bool) -> Self {
        self.include_tests = include;
        if include {
            let test_patterns = default_exclude_patterns();
            self.exclude_patterns.retain(|p| !test_patterns.contains(p));
        }
        self
    }

    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(PolyDupError::Config(
                "Type-3 tolerance must be between 0.0 and 1.0".to_string(),
            ));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

    pub fn with_ignore_manager(mut self, manager: IgnoreManager) -> Self {
        self.ignore_manager = Some(manager);
        self
    }

    pub fn with_directives(mut self, enabled: bool) -> Self {
        self.enable_directives = enabled;
        self
    }

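    /// Runs the full pipeline: collect source files, parse and tokenize
    /// functions, find duplicate token windows, then apply directive and
    /// ignore-list filtering before assembling the report.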
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        let source_files = self.collect_source_files(paths)?;

        let directives_map = self.collect_directives(&source_files);

        let (function_hashes, total_lines) = self.analyze_files(&source_files)?;

        let mut duplicates = self.find_duplicate_hashes(&function_hashes);

        if self.enable_directives && !directives_map.is_empty() {
            self.apply_directive_filtering(&mut duplicates, &directives_map, &function_hashes);
        }

        let stats = self.compute_stats(&function_hashes, total_lines, start_time);

        Ok(Report {
            version: None,
            scan_time: None,
            config: None,
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats,
        })
    }

    fn collect_directives(
        &self,
        source_files: &[PathBuf],
    ) -> HashMap<PathBuf, crate::directives::FileDirectives> {
        if self.enable_directives {
            source_files
                .par_iter()
                .filter_map(|path| {
                    crate::directives::detect_directives_in_file(path)
                        .ok()
                        .map(|d| (path.clone(), d))
                })
                .collect()
        } else {
            HashMap::new()
        }
    }

    fn analyze_files(&self, source_files: &[PathBuf]) -> Result<(Vec<FunctionHash>, usize)> {
        let results: Vec<Result<(Vec<FunctionHash>, usize)>> = source_files
            .par_iter()
            .map(|path| {
                let content = std::fs::read_to_string(path).map_err(PolyDupError::Io)?;
                let line_count = content.lines().count();

                let hashes = self.process_file_content(path, &content)?;
                Ok((hashes, line_count))
            })
            .collect();

        let mut all_hashes = Vec::new();
        let mut total_lines = 0;

        for res in results {
            match res {
                Ok((hashes, lines)) => {
                    all_hashes.extend(hashes);
                    total_lines += lines;
                }
                Err(_) => {
                    // Files that fail to read or parse are skipped; they simply
                    // contribute no function hashes to the scan.
                }
            }
        }

        Ok((all_hashes, total_lines))
    }

    fn apply_directive_filtering(
        &self,
        duplicates: &mut Vec<DuplicateMatch>,
        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
        function_hashes: &[FunctionHash],
    ) {
        for dup in duplicates.iter_mut() {
            let suppressed = self.is_suppressed_by_directive(dup, directives_map, function_hashes);
            if suppressed {
                dup.suppressed_by_directive = Some(true);
            }
        }

        duplicates.retain(|dup| dup.suppressed_by_directive != Some(true));
    }

    fn compute_stats(
        &self,
        function_hashes: &[FunctionHash],
        total_lines: usize,
        start_time: std::time::Instant,
    ) -> ScanStats {
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in function_hashes {
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        ScanStats {
            total_lines,
            total_tokens,
            unique_hashes,
            duration_ms,
        }
    }

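    /// Gathers supported source files from the given paths, honoring
    /// `.gitignore`-style rules via the `ignore` walker and the scanner's
    /// exclude globs.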
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true)
                    .git_global(true)
                    .git_exclude(true)
                    .ignore(true)
                    .hidden(false)
                    .parents(true)
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

    fn process_file_content(&self, path: &Path, code: &str) -> Result<Vec<FunctionHash>> {
        let lang = self.detect_language(path)?;
        let functions = extract_functions(code, lang)?;

        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            let raw_body = func.body.clone();
            let (tokens, token_line_offsets) = normalize_with_line_numbers(&func.body);

            if tokens.len() < self.min_block_size {
                continue;
            }

            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                token_line_offsets,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| PolyDupError::LanguageDetection(path.to_path_buf()))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(PolyDupError::LanguageNotSupported(ext.to_string())),
        }
    }

    fn compute_line_span(
        &self,
        func: &FunctionHash,
        start_offset: usize,
        length: usize,
    ) -> (usize, usize) {
        let start_line = func
            .token_line_offsets
            .get(start_offset)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + start_offset);

        let end_index = start_offset + length.saturating_sub(1);
        let end_line = func
            .token_line_offsets
            .get(end_index)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + end_index);

        (start_line, end_line)
    }

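    /// Core matching step: finds Type-1/2 duplicates, optionally Type-3
    /// duplicates, assigns content-based duplicate IDs, and drops matches on
    /// the ignore list.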
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

        let mut seen_pairs: std::collections::HashSet<SeenPairKey<'_>> =
            std::collections::HashSet::new();

        let mut duplicates = self.find_type12_duplicates(function_hashes, &mut seen_pairs);

        if self.enable_type3 {
            self.find_type3_duplicates(function_hashes, &seen_pairs, &mut duplicates);
        }

        self.compute_duplicate_ids(function_hashes, &mut duplicates);

        self.filter_ignored_duplicates(&mut duplicates);

        duplicates
    }

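    /// Pairwise comparison of functions using exact window hashes; every
    /// match found here is recorded in `seen_pairs` so Type-3 detection does
    /// not report the same span again.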
    fn find_type12_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &mut std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
    ) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    let match_hash = Self::compute_match_hash(
                        &func1.tokens[clone_match.source_start
                            ..clone_match.source_start + clone_match.length],
                    );

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) =
                        self.compute_line_span(func2, clone_match.target_start, clone_match.length);

                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.length),
                        duplicate_id: None,
                    });
                }
            }
        }

        duplicates
    }

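    /// Near-miss (Type-3) detection: collects candidate matches between every
    /// function pair, skips spans already reported as Type-1/2, deduplicates
    /// overlapping candidates, and appends the survivors to `duplicates`.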
    fn find_type3_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
        duplicates: &mut Vec<DuplicateMatch>,
    ) {
        let mut type3_candidates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let type3_matches = detect_type3_clones(
                    &func1.tokens,
                    &func2.tokens,
                    self.min_block_size,
                    self.type3_tolerance,
                );

                for clone_match in type3_matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }

                    type3_candidates.push((func1, func2, clone_match));
                }
            }
        }

        let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

        for (func1, func2, clone_match) in deduplicated {
            let (actual_start1, actual_end1) =
                self.compute_line_span(func1, clone_match.source_start, clone_match.length);
            let (actual_start2, actual_end2) =
                self.compute_line_span(func2, clone_match.target_start, clone_match.target_length);

            if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                continue;
            }

            let window1 = &func1.tokens
                [clone_match.source_start..clone_match.source_start + clone_match.length];
            let window2 = &func2.tokens
                [clone_match.target_start..clone_match.target_start + clone_match.target_length];
            let edit_dist = hashing::compute_token_edit_distance(window1, window2);

            let match_hash = Self::compute_match_hash(window1);

            duplicates.push(DuplicateMatch {
                file1: func1.file_path.to_string(),
                file2: func2.file_path.to_string(),
                start_line1: actual_start1,
                start_line2: actual_start2,
                end_line1: Some(actual_end1),
                end_line2: Some(actual_end2),
                length: clone_match.length,
                similarity: clone_match.similarity,
                hash: match_hash,
                clone_type: CloneType::Type3,
                edit_distance: Some(edit_dist),
                suppressed_by_directive: None,
                token_offset1: Some(clone_match.source_start),
                token_offset2: Some(clone_match.target_start),
                target_length: Some(clone_match.target_length),
                duplicate_id: None,
            });
        }
    }

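    /// Derives a content-based ID for each duplicate so it can be matched
    /// against the ignore list on later runs; the ID is symmetric when both
    /// token windows can be recovered.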
    fn compute_duplicate_ids(
        &self,
        function_hashes: &[FunctionHash],
        duplicates: &mut [DuplicateMatch],
    ) {
        for dup in duplicates.iter_mut() {
            if dup.duplicate_id.is_some() {
                continue;
            }

            let tokens1 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file1,
                dup.start_line1,
                dup.token_offset1,
                dup.length,
            );

            let tokens2 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file2,
                dup.start_line2,
                dup.token_offset2,
                dup.target_length.unwrap_or(dup.length),
            );

            if let Some(tokens1) = tokens1 {
                let id = if let Some(tokens2) = tokens2 {
                    ignore_rules::compute_symmetric_duplicate_id(&tokens1, &tokens2)
                } else {
                    ignore_rules::compute_duplicate_id(&tokens1)
                };
                dup.duplicate_id = Some(id);
            }
        }
    }

    fn extract_duplicate_tokens(
        &self,
        function_hashes: &[FunctionHash],
        file: &str,
        reported_start: usize,
        token_offset: Option<usize>,
        length: usize,
    ) -> Option<Vec<String>> {
        let token_offset = token_offset?;
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file
                    && fh.start_line <= reported_start
                    && reported_start <= fh.end_line
            })
            .and_then(|fh| {
                if token_offset + length <= fh.tokens.len() {
                    Some(
                        fh.tokens
                            .iter()
                            .skip(token_offset)
                            .take(length)
                            .map(|t| t.as_hash_string().to_string())
                            .collect(),
                    )
                } else {
                    None
                }
            })
    }

    fn filter_ignored_duplicates(&self, duplicates: &mut Vec<DuplicateMatch>) {
        if let Some(ref ignore_manager) = self.ignore_manager {
            duplicates.retain(|dup| {
                if let Some(ref id) = dup.duplicate_id {
                    !ignore_manager.is_ignored(id)
                } else {
                    true
                }
            });
        }
    }

    fn compute_match_hash(tokens: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        tokens.hash(&mut hasher);
        hasher.finish()
    }

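    /// Checks whether either side of a duplicate is covered by a suppression
    /// directive in its file; the lookup uses the start line of the owning
    /// function when one can be found, otherwise the reported line.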
    fn is_suppressed_by_directive(
        &self,
        dup: &DuplicateMatch,
        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
        function_hashes: &[FunctionHash],
    ) -> bool {
        let file1_path = PathBuf::from(&dup.file1);
        let file2_path = PathBuf::from(&dup.file2);

        if let Some(directives) = directives_map.get(&file1_path) {
            let func_start =
                self.find_owning_function_start(&dup.file1, dup.start_line1, function_hashes);
            let check_line = func_start.unwrap_or(dup.start_line1);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        if let Some(directives) = directives_map.get(&file2_path) {
            let func_start =
                self.find_owning_function_start(&dup.file2, dup.start_line2, function_hashes);
            let check_line = func_start.unwrap_or(dup.start_line2);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        false
    }

    fn find_owning_function_start(
        &self,
        file: &str,
        line: usize,
        function_hashes: &[FunctionHash],
    ) -> Option<usize> {
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file && fh.start_line <= line && line <= fh.end_line
            })
            .map(|fh| fh.start_line)
    }

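    /// Collapses overlapping Type-3 candidates for the same function pair,
    /// keeping the candidate with the largest span (ties broken by higher
    /// similarity).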
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true;
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

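    /// Distinguishes Type-1 from Type-2: if the two raw bodies are identical
    /// once all whitespace is removed, the clone is Type-1, otherwise Type-2.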
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            CloneType::Type2
        }
    }

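    /// Window-hash matching between two token streams: hash every window of
    /// `min_block_size` tokens in `func1`, probe `func2`'s windows against
    /// that table, verify token-by-token, then greedily extend each match.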
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    if hashing::verify_cross_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        let extension = hashing::extend_match(
                            &func1.tokens,
                            &func2.tokens,
                            func1_pos,
                            j,
                            self.min_block_size,
                        );

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0,
                        });

                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new()
    }
}

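/// Convenience entry point: scans the given paths with default settings and
/// returns the resulting report.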
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

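/// Like [`find_duplicates`], but with an explicit minimum block size and
/// similarity threshold.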
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn make_test_function(
        file: &str,
        start_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line: start_line + tokens.len(),
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    fn make_test_function_same_line(
        file: &str,
        start_line: usize,
        end_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = vec![0; tokens.len()];
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line,
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    fn make_expr_tokens(keyword: &str, op: &str) -> Vec<Token> {
        vec![
            Token::Keyword(keyword.into()),
            Token::Identifier,
            Token::Operator(op.into()),
            Token::Identifier,
            Token::Punctuation(";".into()),
        ]
    }

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new();
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        fn make_function(
            file: &str,
            start_line: usize,
            tokens: Vec<Token>,
            raw_body: &str,
        ) -> FunctionHash {
            let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
            FunctionHash {
                file_path: Arc::<str>::from(file),
                function_name: None,
                start_byte: 0,
                end_byte: 0,
                start_line,
                end_line: start_line + tokens.len(),
                tokens,
                token_line_offsets,
                raw_body: raw_body.to_string(),
            }
        }

        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }

    #[test]
    fn test_type3_reports_token_offsets_in_start_lines() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let functions = vec![
            make_test_function_same_line(
                "file_a.rs",
                100,
                105,
                make_expr_tokens("let", "+"),
                "let a = b + c;",
            ),
            make_test_function_same_line(
                "file_b.rs",
                200,
                205,
                make_expr_tokens("mut", "-"),
                "let a = b - c;",
            ),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type3 = duplicates
            .iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .expect("expected a Type-3 duplicate match");

        assert_eq!(
            type3.start_line1, 100,
            "should report the actual source line even when tokens share a line"
        );
        assert_eq!(
            type3.start_line2, 200,
            "should report the actual target line even when tokens share a line"
        );
        assert_eq!(type3.token_offset1, Some(1));
        assert_eq!(type3.token_offset2, Some(1));
    }

    #[test]
    fn type3_duplicate_ids_are_symmetric() {
        use tempfile::TempDir;

        let tokens_a = make_expr_tokens("let", "+");
        let mut tokens_b = make_expr_tokens("let", "-");
        tokens_b.push(Token::Identifier);

        let func_a = make_test_function("file_a.rs", 10, tokens_a.clone(), "fn file_a.rs() {}");
        let func_b = make_test_function("file_b.rs", 20, tokens_b.clone(), "fn file_b.rs() {}");

        let temp_dir = TempDir::new().unwrap();
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap()
            .with_ignore_manager(IgnoreManager::new(temp_dir.path()));

        let forward = scanner.find_duplicate_hashes(&[func_a.clone(), func_b.clone()]);
        let reverse = scanner.find_duplicate_hashes(&[func_b, func_a]);

        let id_forward = forward
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        let id_reverse = reverse
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        assert_eq!(
            id_forward, id_reverse,
            "Type-3 IDs should not depend on function order"
        );
    }

    #[test]
    fn type3_does_not_report_self_matches() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let tokens = make_expr_tokens("let", "+");
        let func1 = make_test_function_same_line("same_file.rs", 28, 35, tokens.clone(), "fn a()");
        let func2 = make_test_function_same_line("same_file.rs", 28, 35, tokens, "fn a()");

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);

        let self_matches: Vec<_> = duplicates
            .iter()
            .filter(|d| d.file1 == d.file2 && d.start_line1 == d.start_line2)
            .collect();

        assert!(
            self_matches.is_empty(),
            "Type-3 should never report self-matches (same file and line). Found: {:?}",
            self_matches
        );
    }

    #[test]
    fn type3_still_detects_same_file_different_line_duplicates() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let tokens1 = make_expr_tokens("let", "+");
        let mut tokens2 = make_expr_tokens("let", "-");
        tokens2.push(Token::Identifier);

        let func1 = make_test_function_same_line("same_file.rs", 10, 15, tokens1, "fn first()");
        let func2 = make_test_function_same_line("same_file.rs", 50, 55, tokens2, "fn second()");

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);

        let same_file_different_line: Vec<_> = duplicates
            .iter()
            .filter(|d| d.file1 == d.file2 && d.start_line1 != d.start_line2)
            .collect();

        assert!(
            !same_file_different_line.is_empty(),
            "Type-3 should still detect duplicates in the same file at different lines"
        );
    }

    #[test]
    fn duplicate_matches_store_actual_end_lines() {
        let scanner = Scanner::with_config(2, 0.85).unwrap();

        let tokens = vec![
            Token::Keyword("fn".into()),
            Token::Identifier,
            Token::Identifier,
            Token::Punctuation("{".into()),
            Token::Punctuation("}".into()),
        ];

        let func1 = FunctionHash {
            file_path: Arc::<str>::from("file_a.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 10,
            end_line: 14,
            tokens: tokens.clone(),
            token_line_offsets: vec![0, 0, 1, 1, 2],
            raw_body: "fn a() {}".to_string(),
        };

        let func2 = FunctionHash {
            file_path: Arc::<str>::from("file_b.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 20,
            end_line: 24,
            tokens,
            token_line_offsets: vec![0, 1, 1, 2, 2],
            raw_body: "fn b() {}".to_string(),
        };

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);
        let dup = duplicates.first().expect("expected a duplicate match");

        assert_eq!(dup.start_line1, 10);
        assert_eq!(dup.start_line2, 20);
        assert_eq!(dup.end_line1, Some(12));
        assert_eq!(dup.end_line2, Some(22));
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}