1mod cache;
8mod directives;
9mod error;
10mod hashing;
11mod ignore_rules;
12mod parsing;
13mod queries;
14
15#[cfg(test)]
16mod proptest_fuzzing;
17
18#[cfg(test)]
19mod snapshot_tests;
20
21pub use cache::{CacheStats, CodeLocation, FileCacheMetadata, HashCache};
23pub use directives::{detect_directives, detect_directives_in_file, Directive, FileDirectives};
24pub use error::{PolyDupError, Result};
25pub use hashing::{
26 compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
27 compute_window_hash, detect_duplicates_with_extension, detect_type3_clones, extend_match,
28 normalize, normalize_with_line_numbers, verify_cross_window_match, CloneMatch, RollingHash,
29 Token,
30};
31pub use ignore_rules::{
32 compute_duplicate_id, compute_symmetric_duplicate_id, FileRange, IgnoreEntry, IgnoreManager,
33};
34pub use parsing::{
35 extract_functions, extract_javascript_functions, extract_python_functions,
36 extract_rust_functions, FunctionNode,
37};
38
39use anyhow::Context;
40use ignore::WalkBuilder;
41use rayon::prelude::*;
42use serde::{Deserialize, Serialize};
43use std::collections::{HashMap, HashSet};
44use std::fs;
45use std::path::{Path, PathBuf};
46use std::sync::Arc;
47use tree_sitter::Language;
48
/// Classification of a detected clone, following the standard clone
/// taxonomy: type-1 = identical code, type-2 = identical structure after
/// normalization, type-3 = near-miss clones with gaps/edits.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Bodies are identical once all whitespace is stripped.
    #[serde(rename = "type-1")]
    Type1,
    /// Token streams match but raw bodies differ (e.g. renamed identifiers).
    #[serde(rename = "type-2")]
    Type2,
    /// Similar-but-not-identical windows; only reported when type-3
    /// detection is enabled.
    #[serde(rename = "type-3")]
    Type3,
}
62
/// True when the half-open ranges `[start1, end1)` and `[start2, end2)`
/// share at least one position; empty ranges never overlap.
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    // Equivalent to `start1 < end2 && start2 < end1` by De Morgan.
    !(end1 <= start2 || end2 <= start1)
}
67
68fn canonical_pair_key<'a>(
70 func1: &'a FunctionHash,
71 func2: &'a FunctionHash,
72 source_start: usize,
73 target_start: usize,
74 length: usize,
75) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
76 if func1.file_path.as_ref() < func2.file_path.as_ref() {
77 (
78 func1.file_path.as_ref(),
79 func2.file_path.as_ref(),
80 func1.start_line,
81 func2.start_line,
82 source_start,
83 target_start,
84 length,
85 )
86 } else {
87 (
88 func2.file_path.as_ref(),
89 func1.file_path.as_ref(),
90 func2.start_line,
91 func1.start_line,
92 target_start,
93 source_start,
94 length,
95 )
96 }
97}
98
/// One reported pair of duplicated code regions.
///
/// The `token_offset*`/`target_length` fields are internal bookkeeping
/// (used to recover token windows for stable duplicate IDs) and are never
/// serialized.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    // Start lines of the duplicated region in each file.
    pub start_line1: usize,
    pub start_line2: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line1: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line2: Option<usize>,
    /// Match length in normalized tokens (file1 side).
    pub length: usize,
    /// Token similarity; 1.0 for exact window matches.
    pub similarity: f64,
    /// Fingerprint of the matched token window.
    pub hash: u64,
    pub clone_type: CloneType,
    /// Token edit distance; only populated for type-3 matches.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
    /// Set when an in-source directive suppresses this match.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suppressed_by_directive: Option<bool>,
    // Token offset of the window within its owning function (file1 side).
    #[serde(skip)]
    token_offset1: Option<usize>,
    // Token offset of the window within its owning function (file2 side).
    #[serde(skip)]
    token_offset2: Option<usize>,
    // Token length on the file2 side; may differ from `length` for type-3.
    #[serde(skip)]
    target_length: Option<usize>,
    /// Stable, content-derived ID referenced by the ignore list.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duplicate_id: Option<String>,
}
133
/// Tokenized representation of a single extracted function — the unit over
/// which clone detection runs.
#[derive(Debug, Clone)]
struct FunctionHash {
    // Owning source file; `Arc<str>` makes clones cheap.
    file_path: Arc<str>,
    #[allow(dead_code)]
    function_name: Option<String>,
    #[allow(dead_code)]
    start_byte: usize,
    #[allow(dead_code)]
    end_byte: usize,
    // Start line of the function in its source file; base for line-span
    // calculations.
    start_line: usize,
    #[allow(dead_code)]
    end_line: usize,
    // Normalized token stream of the function body.
    tokens: Vec<Token>,
    // For each token, its line offset relative to `start_line`.
    token_line_offsets: Vec<usize>,
    // Verbatim body text; used for type-1 vs type-2 classification.
    raw_body: String,
}
152
/// Snapshot of known duplicates, persisted as JSON so subsequent scans can
/// report only newly introduced duplication.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    /// Crate version that produced the snapshot.
    pub version: String,
    /// RFC 3339 creation timestamp (UTC).
    pub created_at: String,
    /// Duplicates known at snapshot time.
    pub duplicates: Vec<DuplicateMatch>,
}
163
164impl Baseline {
165 pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
167 Self {
168 version: env!("CARGO_PKG_VERSION").to_string(),
169 created_at: chrono::Utc::now().to_rfc3339(),
170 duplicates,
171 }
172 }
173
174 pub fn save_to_file(&self, path: &Path) -> Result<()> {
176 let json =
177 serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
178 fs::write(path, json).context("Failed to write baseline file")?;
179 Ok(())
180 }
181
182 pub fn load_from_file(path: &Path) -> Result<Self> {
184 let content = fs::read_to_string(path)
185 .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
186 let baseline: Baseline =
187 serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
188 Ok(baseline)
189 }
190
191 pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
193 let baseline_set: std::collections::HashSet<_> =
194 self.duplicates.iter().map(duplicate_key).collect();
195
196 current
197 .iter()
198 .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
199 .cloned()
200 .collect()
201 }
202}
203
204fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
206 let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
208 (
209 dup.file1.clone(),
210 dup.file2.clone(),
211 dup.start_line1,
212 dup.start_line2,
213 )
214 } else {
215 (
216 dup.file2.clone(),
217 dup.file1.clone(),
218 dup.start_line2,
219 dup.start_line1,
220 )
221 };
222 (file1, file2, line1, line2, dup.length)
223}
224
/// Result of a scan: the duplicates found plus summary statistics.
///
/// The optional metadata fields are left as `None` by `Scanner::scan` and
/// may be filled in by the caller before serialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    /// Number of source files that passed filtering and were analyzed.
    pub files_scanned: usize,
    /// Number of functions large enough to enter clone detection.
    pub functions_analyzed: usize,
    pub duplicates: Vec<DuplicateMatch>,
    pub stats: ScanStats,
}
246
/// Scanner settings echoed into a report for reproducibility.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    // Minimum clone window size in tokens — presumably mirrors the
    // scanner's `min_block_size`; confirm at the call site that fills this.
    pub threshold: usize,
    /// Similarity threshold in `[0.0, 1.0]`.
    pub similarity: f64,
    /// Whether type-3 (gapped) clone detection was enabled.
    pub type3_enabled: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}
260
/// Aggregate counters collected during a scan.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total source lines read across all scanned files.
    pub total_lines: usize,
    /// Total normalized tokens across all analyzed functions.
    pub total_tokens: usize,
    /// Count of distinct rolling-hash windows seen.
    pub unique_hashes: usize,
    /// Wall-clock scan duration in milliseconds.
    pub duration_ms: u64,
}
273
/// Configurable clone-detection engine.
///
/// Construct with `new`/`with_config` and refine via the `with_*` builder
/// methods, then run `scan`, `build_cache`, or `scan_with_cache`.
#[allow(dead_code)]
pub struct Scanner {
    // Minimum clone window size, in normalized tokens.
    min_block_size: usize,
    // Minimum token similarity for cache-based matches.
    similarity_threshold: f64,
    // Glob patterns for files to skip.
    exclude_patterns: Vec<String>,
    // Whether to run the type-3 (gapped) detection pass.
    enable_type3: bool,
    // Similarity tolerance for type-3 detection, in [0.0, 1.0].
    type3_tolerance: f64,
    // Optional ignore list keyed by stable duplicate IDs.
    ignore_manager: Option<IgnoreManager>,
    // Whether to honor in-source suppression directives.
    enable_directives: bool,
    // Whether test files are included in scans.
    include_tests: bool,
}
294
/// Glob patterns matching common test-file naming conventions; excluded
/// from scans unless test files are explicitly included.
fn default_exclude_patterns() -> Vec<String> {
    const TEST_GLOBS: [&str; 10] = [
        "**/*.test.ts",
        "**/*.test.js",
        "**/*.test.tsx",
        "**/*.test.jsx",
        "**/*.spec.ts",
        "**/*.spec.js",
        "**/*.spec.tsx",
        "**/*.spec.jsx",
        "**/__tests__/**",
        "**/*.test.py",
    ];

    TEST_GLOBS.iter().map(|glob| glob.to_string()).collect()
}
311
/// Glob patterns for build output and VCS directories that should never be
/// scanned.
fn build_artifact_patterns() -> Vec<String> {
    const ARTIFACT_GLOBS: [&str; 5] = [
        "**/node_modules/**",
        "**/target/**",
        "**/dist/**",
        "**/build/**",
        "**/.git/**",
    ];

    ARTIFACT_GLOBS.iter().map(|glob| glob.to_string()).collect()
}
322
323impl Scanner {
324 pub fn new() -> Self {
328 let mut exclude = build_artifact_patterns();
329 exclude.extend(default_exclude_patterns());
330
331 Self {
332 min_block_size: 50,
333 similarity_threshold: 0.85,
334 exclude_patterns: exclude,
335 enable_type3: false,
336 type3_tolerance: 0.85,
337 ignore_manager: None,
338 enable_directives: false,
339 include_tests: false,
340 }
341 }
342
343 pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
345 let mut exclude = build_artifact_patterns();
346 exclude.extend(default_exclude_patterns());
347
348 Ok(Self {
349 min_block_size,
350 similarity_threshold,
351 exclude_patterns: exclude,
352 enable_type3: false,
353 type3_tolerance: 0.85,
354 ignore_manager: None,
355 enable_directives: false,
356 include_tests: false,
357 })
358 }
359
    /// Builder: replace the exclusion pattern list wholesale.
    ///
    /// Note this overwrites the defaults from `new`/`with_config`,
    /// including the build-artifact patterns.
    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }
365
366 pub fn with_test_files(mut self, include: bool) -> Self {
368 self.include_tests = include;
369 if include {
370 let test_patterns = default_exclude_patterns();
372 self.exclude_patterns.retain(|p| !test_patterns.contains(p));
373 }
374 self
375 }
376
377 pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
379 if !(0.0..=1.0).contains(&tolerance) {
380 return Err(PolyDupError::Config(
381 "Type-3 tolerance must be between 0.0 and 1.0".to_string(),
382 ));
383 }
384 self.enable_type3 = true;
385 self.type3_tolerance = tolerance;
386 Ok(self)
387 }
388
    /// Builder: attach an ignore list; duplicates whose stable ID it
    /// contains are dropped from scan results.
    pub fn with_ignore_manager(mut self, manager: IgnoreManager) -> Self {
        self.ignore_manager = Some(manager);
        self
    }
394
    /// Builder: enable or disable honoring in-source suppression directives.
    pub fn with_directives(mut self, enabled: bool) -> Self {
        self.enable_directives = enabled;
        self
    }
400
    /// Run a full duplicate-detection pass over `paths`.
    ///
    /// Pipeline: collect source files → (optionally) gather suppression
    /// directives → tokenize and hash every function → find duplicate
    /// windows → drop directive-suppressed matches → assemble statistics
    /// into a `Report`.
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        let source_files = self.collect_source_files(paths)?;

        // Empty unless `enable_directives` is set.
        let directives_map = self.collect_directives(&source_files);

        let (function_hashes, total_lines) = self.analyze_files(&source_files)?;

        let mut duplicates = self.find_duplicate_hashes(&function_hashes);

        if self.enable_directives && !directives_map.is_empty() {
            self.apply_directive_filtering(&mut duplicates, &directives_map, &function_hashes);
        }

        let stats = self.compute_stats(&function_hashes, total_lines, start_time);

        Ok(Report {
            // Metadata fields are filled in by the caller when needed.
            version: None,
            scan_time: None,
            config: None,
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats,
        })
    }
443
444 fn collect_directives(
446 &self,
447 source_files: &[PathBuf],
448 ) -> HashMap<PathBuf, crate::directives::FileDirectives> {
449 if self.enable_directives {
450 source_files
451 .par_iter()
452 .filter_map(|path| {
453 crate::directives::detect_directives_in_file(path)
454 .ok()
455 .map(|d| (path.clone(), d))
456 })
457 .collect()
458 } else {
459 HashMap::new()
460 }
461 }
462
463 fn analyze_files(&self, source_files: &[PathBuf]) -> Result<(Vec<FunctionHash>, usize)> {
465 let results: Vec<Result<(Vec<FunctionHash>, usize)>> = source_files
467 .par_iter()
468 .map(|path| {
469 let content = std::fs::read_to_string(path).map_err(PolyDupError::Io)?;
471 let line_count = content.lines().count();
472
473 let hashes = self.process_file_content(path, &content)?;
475 Ok((hashes, line_count))
476 })
477 .collect();
478
479 let mut all_hashes = Vec::new();
481 let mut total_lines = 0;
482
483 for res in results {
484 match res {
485 Ok((hashes, lines)) => {
486 all_hashes.extend(hashes);
487 total_lines += lines;
488 }
489 Err(_) => {
490 }
493 }
494 }
495
496 Ok((all_hashes, total_lines))
497 }
498
499 fn apply_directive_filtering(
501 &self,
502 duplicates: &mut Vec<DuplicateMatch>,
503 directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
504 function_hashes: &[FunctionHash],
505 ) {
506 for dup in duplicates.iter_mut() {
507 let suppressed = self.is_suppressed_by_directive(dup, directives_map, function_hashes);
508 if suppressed {
509 dup.suppressed_by_directive = Some(true);
510 }
511 }
512
513 duplicates.retain(|dup| dup.suppressed_by_directive != Some(true));
515 }
516
517 fn compute_stats(
519 &self,
520 function_hashes: &[FunctionHash],
521 total_lines: usize,
522 start_time: std::time::Instant,
523 ) -> ScanStats {
524 let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();
525
526 let unique_hashes: usize = {
527 let mut hash_set = std::collections::HashSet::new();
528 for fh in function_hashes {
529 let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
531 for (hash, _) in hashes {
532 hash_set.insert(hash);
533 }
534 }
535 hash_set.len()
536 };
537
538 let duration_ms = start_time.elapsed().as_millis() as u64;
539
540 ScanStats {
541 total_lines,
542 total_tokens,
543 unique_hashes,
544 duration_ms,
545 }
546 }
547
548 fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
553 let mut files = Vec::new();
554
555 for path in paths {
556 if path.is_file() {
557 if self.is_supported_file(&path) && !self.is_excluded(&path) {
558 files.push(path);
559 }
560 } else if path.is_dir() {
561 let walker = WalkBuilder::new(&path)
563 .git_ignore(true) .git_global(true) .git_exclude(true) .ignore(true) .hidden(false) .parents(true) .build();
570
571 for entry in walker {
572 match entry {
573 Ok(entry) => {
574 let path = entry.path();
575 if path.is_file()
576 && self.is_supported_file(path)
577 && !self.is_excluded(path)
578 {
579 files.push(path.to_path_buf());
580 }
581 }
582 Err(err) => {
583 eprintln!("Warning: Failed to access path: {}", err);
585 }
586 }
587 }
588 }
589 }
590
591 Ok(files)
592 }
593
594 fn is_supported_file(&self, path: &Path) -> bool {
596 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
597 matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
598 } else {
599 false
600 }
601 }
602
603 fn is_excluded(&self, path: &Path) -> bool {
605 use globset::{Glob, GlobSetBuilder};
606
607 let mut builder = GlobSetBuilder::new();
609 for pattern in &self.exclude_patterns {
610 if let Ok(glob) = Glob::new(pattern) {
611 builder.add(glob);
612 }
613 }
614
615 if let Ok(glob_set) = builder.build() {
616 glob_set.is_match(path)
617 } else {
618 false
619 }
620 }
621
622 fn process_file_content(&self, path: &Path, code: &str) -> Result<Vec<FunctionHash>> {
624 let lang = self.detect_language(path)?;
625 let functions = extract_functions(code, lang)?;
626
627 let file_path: Arc<str> = path.to_string_lossy().to_string().into();
629 let mut function_hashes = Vec::new();
630
631 for func in functions {
632 let raw_body = func.body.clone();
634 let (tokens, token_line_offsets) = normalize_with_line_numbers(&func.body);
635
636 if tokens.len() < self.min_block_size {
638 continue;
639 }
640
641 function_hashes.push(FunctionHash {
643 file_path: Arc::clone(&file_path), function_name: func.name.clone(),
645 start_byte: func.start_byte,
646 end_byte: func.end_byte,
647 start_line: func.start_line,
648 end_line: func.end_line,
649 tokens,
650 token_line_offsets,
651 raw_body,
652 });
653 }
654
655 Ok(function_hashes)
656 }
657
658 fn detect_language(&self, path: &Path) -> Result<Language> {
660 let ext = path
661 .extension()
662 .and_then(|e| e.to_str())
663 .ok_or_else(|| PolyDupError::LanguageDetection(path.to_path_buf()))?;
664
665 match ext {
666 "rs" => Ok(tree_sitter_rust::language()),
667 "py" => Ok(tree_sitter_python::language()),
668 "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
669 _ => Err(PolyDupError::LanguageNotSupported(ext.to_string())),
670 }
671 }
672
673 fn compute_line_span(
675 &self,
676 func: &FunctionHash,
677 start_offset: usize,
678 length: usize,
679 ) -> (usize, usize) {
680 let start_line = func
681 .token_line_offsets
682 .get(start_offset)
683 .map(|offset| func.start_line + offset)
684 .unwrap_or(func.start_line + start_offset);
685
686 let end_index = start_offset + length.saturating_sub(1);
687 let end_line = func
688 .token_line_offsets
689 .get(end_index)
690 .map(|offset| func.start_line + offset)
691 .unwrap_or(func.start_line + end_index);
692
693 (start_line, end_line)
694 }
695
    /// Top-level duplicate detection over all analyzed functions.
    ///
    /// Runs the exact (type-1/2) pass first, optionally layers the gapped
    /// (type-3) pass on top — `seen_pairs` keeps it from re-reporting
    /// windows the first pass already found — then assigns stable duplicate
    /// IDs and drops entries on the ignore list.
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        // (file1, file2, start_line1, start_line2, source_start,
        //  target_start, length) — see canonical_pair_key.
        type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

        let mut seen_pairs: std::collections::HashSet<SeenPairKey<'_>> =
            std::collections::HashSet::new();

        let mut duplicates = self.find_type12_duplicates(function_hashes, &mut seen_pairs);

        if self.enable_type3 {
            self.find_type3_duplicates(function_hashes, &seen_pairs, &mut duplicates);
        }

        self.compute_duplicate_ids(function_hashes, &mut duplicates);

        self.filter_ignored_duplicates(&mut duplicates);

        duplicates
    }
727
    /// Exact-window (type-1/2) detection: every function pair is compared
    /// and each verified window becomes a `DuplicateMatch`.
    ///
    /// Each reported window's canonical key is recorded in `seen_pairs` so
    /// the later type-3 pass can skip windows already reported here.
    fn find_type12_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &mut std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
    ) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();

        // O(n^2) over function pairs; each pair is visited once (i < j).
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Fingerprint from the source-side token window.
                    let match_hash = Self::compute_match_hash(
                        &func1.tokens[clone_match.source_start
                            ..clone_match.source_start + clone_match.length],
                    );

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) =
                        self.compute_line_span(func2, clone_match.target_start, clone_match.length);

                    // A window matching itself at the same location is not
                    // a duplicate.
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.length),
                        duplicate_id: None,
                    });
                }
            }
        }

        duplicates
    }
809
    /// Gapped (type-3) detection pass.
    ///
    /// Compares every function pair for similar-but-not-identical windows,
    /// skips windows already reported by the type-1/2 pass (`seen_pairs`),
    /// collapses overlapping candidates, and appends the survivors —
    /// annotated with their token edit distance — to `duplicates`.
    fn find_type3_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
        duplicates: &mut Vec<DuplicateMatch>,
    ) {
        let mut type3_candidates = Vec::new();

        // Pairwise comparison; each pair visited once (i < j).
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let type3_matches = detect_type3_clones(
                    &func1.tokens,
                    &func2.tokens,
                    self.min_block_size,
                    self.type3_tolerance,
                );

                for clone_match in type3_matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    // Already reported as a type-1/2 match.
                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }

                    type3_candidates.push((func1, func2, clone_match));
                }
            }
        }

        // Overlapping candidate windows for the same function pair collapse
        // to a single representative.
        let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

        for (func1, func2, clone_match) in deduplicated {
            // The two sides of a type-3 match may differ in token length
            // (`length` vs `target_length`).
            let (actual_start1, actual_end1) =
                self.compute_line_span(func1, clone_match.source_start, clone_match.length);
            let (actual_start2, actual_end2) =
                self.compute_line_span(func2, clone_match.target_start, clone_match.target_length);

            // A window matching itself at the same location is not a clone.
            if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                continue;
            }

            let window1 = &func1.tokens
                [clone_match.source_start..clone_match.source_start + clone_match.length];
            let window2 = &func2.tokens
                [clone_match.target_start..clone_match.target_start + clone_match.target_length];
            let edit_dist = hashing::compute_token_edit_distance(window1, window2);

            let match_hash = Self::compute_match_hash(window1);

            duplicates.push(DuplicateMatch {
                file1: func1.file_path.to_string(),
                file2: func2.file_path.to_string(),
                start_line1: actual_start1,
                start_line2: actual_start2,
                end_line1: Some(actual_end1),
                end_line2: Some(actual_end2),
                length: clone_match.length,
                similarity: clone_match.similarity,
                hash: match_hash,
                clone_type: CloneType::Type3,
                edit_distance: Some(edit_dist),
                suppressed_by_directive: None,
                token_offset1: Some(clone_match.source_start),
                token_offset2: Some(clone_match.target_start),
                target_length: Some(clone_match.target_length),
                duplicate_id: None,
            });
        }
    }
903
    /// Assign a stable, content-derived ID to every duplicate that lacks
    /// one, so matches can be referenced from the ignore list.
    ///
    /// When both sides' tokens can be recovered the ID is symmetric (same
    /// regardless of side order); otherwise it is derived from side one
    /// alone. Matches whose tokens can't be located keep `None`.
    fn compute_duplicate_ids(
        &self,
        function_hashes: &[FunctionHash],
        duplicates: &mut [DuplicateMatch],
    ) {
        for dup in duplicates.iter_mut() {
            if dup.duplicate_id.is_some() {
                continue;
            }

            let tokens1 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file1,
                dup.start_line1,
                dup.end_line1,
                dup.token_offset1,
                dup.length,
            );

            let tokens2 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file2,
                dup.start_line2,
                dup.end_line2,
                dup.token_offset2,
                // Side two may have a different length for type-3 matches.
                dup.target_length.unwrap_or(dup.length),
            );

            if let Some(tokens1) = tokens1 {
                let id = if let Some(tokens2) = tokens2 {
                    ignore_rules::compute_symmetric_duplicate_id(&tokens1, &tokens2)
                } else {
                    ignore_rules::compute_duplicate_id(&tokens1)
                };
                dup.duplicate_id = Some(id);
            }
        }
    }
945
    /// Recover the normalized token window behind one side of a duplicate.
    ///
    /// Finds the function in `file` whose line span contains
    /// `reported_start`, then resolves the window's token offset — using
    /// the recorded offset when still in range, else inferring it from the
    /// reported line span. Returns the tokens rendered as hash strings, or
    /// `None` when the window can't be located.
    fn extract_duplicate_tokens(
        &self,
        function_hashes: &[FunctionHash],
        file: &str,
        reported_start: usize,
        reported_end: Option<usize>,
        token_offset: Option<usize>,
        length: usize,
    ) -> Option<Vec<String>> {
        function_hashes.iter().find_map(|fh| {
            // Skip functions in other files or whose span doesn't cover the
            // reported start line.
            if fh.file_path.as_ref() != file
                || fh.start_line > reported_start
                || reported_start > fh.end_line
            {
                return None;
            }

            let start_offset = match token_offset {
                Some(offset) if offset + length <= fh.tokens.len() => Some(offset),
                _ => self.infer_token_offset(fh, reported_start, reported_end, length),
            }?;

            if start_offset + length > fh.tokens.len() {
                return None;
            }

            Some(
                fh.tokens
                    .iter()
                    .skip(start_offset)
                    .take(length)
                    .map(|t| t.as_hash_string().to_string())
                    .collect(),
            )
        })
    }
983
984 fn infer_token_offset(
986 &self,
987 func_hash: &FunctionHash,
988 reported_start: usize,
989 reported_end: Option<usize>,
990 length: usize,
991 ) -> Option<usize> {
992 let start_line_offset = reported_start.checked_sub(func_hash.start_line)?;
993 let end_line = reported_end.unwrap_or(reported_start);
994
995 func_hash
996 .token_line_offsets
997 .iter()
998 .enumerate()
999 .filter_map(|(idx, line_offset)| {
1000 if *line_offset != start_line_offset {
1001 return None;
1002 }
1003
1004 let end_idx = idx.checked_add(length.checked_sub(1)?)?;
1005 let end_offset = func_hash.token_line_offsets.get(end_idx)?;
1006 if func_hash.start_line + *end_offset == end_line {
1007 Some(idx)
1008 } else {
1009 None
1010 }
1011 })
1012 .next()
1013 }
1014
1015 fn filter_ignored_duplicates(&self, duplicates: &mut Vec<DuplicateMatch>) {
1017 if let Some(ref ignore_manager) = self.ignore_manager {
1018 duplicates.retain(|dup| {
1019 if let Some(ref id) = dup.duplicate_id {
1020 !ignore_manager.is_ignored(id)
1021 } else {
1022 true
1024 }
1025 });
1026 }
1027 }
1028
1029 fn compute_match_hash(tokens: &[Token]) -> u64 {
1031 use std::collections::hash_map::DefaultHasher;
1032 use std::hash::{Hash, Hasher};
1033 let mut hasher = DefaultHasher::new();
1034 tokens.hash(&mut hasher);
1035 hasher.finish()
1036 }
1037
1038 fn is_suppressed_by_directive(
1043 &self,
1044 dup: &DuplicateMatch,
1045 directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
1046 function_hashes: &[FunctionHash],
1047 ) -> bool {
1048 let file1_path = PathBuf::from(&dup.file1);
1050 let file2_path = PathBuf::from(&dup.file2);
1051
1052 if let Some(directives) = directives_map.get(&file1_path) {
1054 let func_start =
1055 self.find_owning_function_start(&dup.file1, dup.start_line1, function_hashes);
1056 let check_line = func_start.unwrap_or(dup.start_line1);
1058
1059 if directives.is_suppressed(check_line, check_line).is_some() {
1060 return true;
1061 }
1062 }
1063
1064 if let Some(directives) = directives_map.get(&file2_path) {
1066 let func_start =
1067 self.find_owning_function_start(&dup.file2, dup.start_line2, function_hashes);
1068 let check_line = func_start.unwrap_or(dup.start_line2);
1070
1071 if directives.is_suppressed(check_line, check_line).is_some() {
1072 return true;
1073 }
1074 }
1075
1076 false
1077 }
1078
1079 fn find_owning_function_start(
1081 &self,
1082 file: &str,
1083 line: usize,
1084 function_hashes: &[FunctionHash],
1085 ) -> Option<usize> {
1086 function_hashes
1087 .iter()
1088 .find(|fh| {
1089 fh.file_path.as_ref() == file && fh.start_line <= line && line <= fh.end_line
1090 })
1091 .map(|fh| fh.start_line)
1092 }
1093
    /// Collapse overlapping type-3 candidate windows for the same function
    /// pair down to one representative each.
    ///
    /// Greedy: each unused candidate seeds a group; any unused candidate
    /// for the same pair whose source AND target ranges both overlap the
    /// current representative is absorbed, with the longest span (ties
    /// broken by higher similarity) becoming the new representative. The
    /// inner scan restarts while the representative keeps changing, since
    /// overlap is judged against it.
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            // Keep re-scanning while the representative grows.
            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    // Same function pair, in either orientation.
                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true;
                        }
                        // Absorbed either way — overlapping candidates are
                        // never reported separately.
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }
1184
1185 fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
1187 let normalized1 = raw1.split_whitespace().collect::<String>();
1189 let normalized2 = raw2.split_whitespace().collect::<String>();
1190
1191 if normalized1 == normalized2 {
1193 CloneType::Type1
1194 } else {
1195 CloneType::Type2
1197 }
1198 }
1199
    /// Exact window matching between two functions.
    ///
    /// Indexes every `min_block_size`-token window of `func1` by hash, then
    /// slides over `func2` probing the index: hash hits are verified
    /// token-by-token (guarding against hash collisions) and greedily
    /// extended for as long as the tokens keep agreeing.
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index all of func1's windows by hash.
        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        // Slide over func2, probing the index at each position.
        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    if hashing::verify_cross_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        let extension = hashing::extend_match(
                            &func1.tokens,
                            &func2.tokens,
                            func1_pos,
                            j,
                            self.min_block_size,
                        );

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            // Exact token match by construction.
                            similarity: 1.0,
                        });

                        // Skip ahead past the extension so overlapping
                        // windows of the same match aren't re-reported (the
                        // unconditional `j += 1` below adds one more).
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }
1265
1266 fn add_hashes_to_cache(&self, function_hashes: &[FunctionHash], cache: &mut HashCache) {
1267 for func_hash in function_hashes {
1268 let hashes = compute_rolling_hashes(&func_hash.tokens, self.min_block_size);
1269
1270 for (hash, offset) in hashes {
1271 let end_token_idx = offset + self.min_block_size;
1272 let (start_line, end_line) =
1273 self.compute_line_span(func_hash, offset, self.min_block_size);
1274
1275 let location = CodeLocation {
1276 file_path: func_hash.file_path.to_string(),
1277 start_line,
1278 end_line,
1279 token_offset: Some(offset),
1280 token_length: self.min_block_size,
1281 tokens: func_hash.tokens[offset..end_token_idx].to_vec(),
1282 raw_source: func_hash.raw_body.clone(),
1283 };
1284
1285 cache.add_hash(hash, location);
1286 }
1287 }
1288 }
1289
1290 pub fn build_cache(&self, paths: Vec<PathBuf>) -> Result<HashCache> {
1295 let mut cache = HashCache::new(self.min_block_size);
1296
1297 let source_files = self.collect_source_files(paths)?;
1299
1300 for file_path in source_files {
1302 let content = match std::fs::read_to_string(&file_path) {
1303 Ok(c) => c,
1304 Err(_) => continue, };
1306
1307 let function_hashes = match self.process_file_content(&file_path, &content) {
1308 Ok(fh) => fh,
1309 Err(_) => continue, };
1311
1312 self.add_hashes_to_cache(&function_hashes, &mut cache);
1313 }
1314
1315 Ok(cache)
1316 }
1317
    /// Incremental scan: analyzes only `changed_files` and compares their
    /// rolling-window hashes against previously cached results in `cache`.
    ///
    /// Pipeline:
    /// 1. invalidate stale cache entries and re-analyze stale-but-unchanged
    ///    files so their hashes stay available for lookup;
    /// 2. hash the changed files and look each window up in the cache
    ///    (yields Type-1/Type-2 matches);
    /// 3. if enabled, run Type-3 detection between changed functions and the
    ///    cached functions that produced hits;
    /// 4. add intra-set duplicates, sort + dedup, compute duplicate IDs,
    ///    apply ignore rules and suppression directives;
    /// 5. write the fresh hashes back into `cache` and build the `Report`.
    pub fn scan_with_cache(
        &self,
        changed_files: Vec<PathBuf>,
        cache: &mut HashCache,
    ) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Drop cache entries whose backing files changed on disk; the stale
        // paths are returned so unchanged ones can be re-hydrated below.
        let stale_files = cache.invalidate_stale_files();
        // Canonicalize when possible, falling back to the raw path (e.g. for
        // files that no longer exist) so path comparisons never fail.
        let normalize_path =
            |path: &Path| path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
        let changed_set: HashSet<PathBuf> =
            changed_files.iter().map(|p| normalize_path(p)).collect();

        if !stale_files.is_empty() {
            // Re-analyze stale files that still exist and are NOT part of the
            // changed set (those are analyzed anyway below).
            let stale_paths: Vec<PathBuf> = stale_files
                .into_iter()
                .filter_map(|path| {
                    let raw_path = PathBuf::from(&path);
                    let normalized = normalize_path(&raw_path);

                    if !normalized.exists() || changed_set.contains(&normalized) {
                        return None;
                    }

                    // Keep the raw (non-canonicalized) path so cache keys stay
                    // consistent with how the file was originally recorded.
                    Some(raw_path)
                })
                .collect();

            if !stale_paths.is_empty() {
                let (stale_hashes, _) = self.analyze_files(&stale_paths)?;
                self.add_hashes_to_cache(&stale_hashes, cache);
            }
        }

        let function_hashes_result = self.analyze_files(&changed_files)?;
        let (function_hashes, total_lines) = function_hashes_result;

        let mut duplicates = Vec::new();
        // Cache locations that matched at least one changed window, grouped
        // by file; used later to build Type-3 candidates.
        let mut cached_hits_by_file: HashMap<String, Vec<CodeLocation>> = HashMap::new();
        let mut cached_function_hashes: Vec<FunctionHash> = Vec::new();

        // --- Type-1/2: rolling-hash lookup of changed windows against cache.
        for func_hash in &function_hashes {
            let hashes = compute_rolling_hashes(&func_hash.tokens, self.min_block_size);

            for (hash, offset) in hashes {
                if let Some(cached_locations) = cache.lookup(hash) {
                    for cached_loc in cached_locations {
                        // Compare canonicalized paths so a changed file never
                        // matches its own (pre-change) cache entries.
                        let changed_file_path = Path::new(func_hash.file_path.as_ref())
                            .canonicalize()
                            .unwrap_or_else(|_| {
                                Path::new(func_hash.file_path.as_ref()).to_path_buf()
                            });
                        let cached_file_path = Path::new(&cached_loc.file_path)
                            .canonicalize()
                            .unwrap_or_else(|_| Path::new(&cached_loc.file_path).to_path_buf());

                        if changed_file_path == cached_file_path {
                            continue;
                        }

                        // Remember this location even if the similarity check
                        // below fails: it is still a Type-3 candidate.
                        cached_hits_by_file
                            .entry(cached_loc.file_path.clone())
                            .or_default()
                            .push(cached_loc.clone());

                        let start_token_idx = offset;
                        let end_token_idx =
                            (offset + self.min_block_size).min(func_hash.tokens.len());

                        // Map token indices back to line offsets, defensively
                        // guarding against out-of-range indices.
                        let start_line_offset =
                            if start_token_idx < func_hash.token_line_offsets.len() {
                                func_hash.token_line_offsets[start_token_idx]
                            } else {
                                0
                            };

                        let end_line_offset = if end_token_idx > 0
                            && end_token_idx - 1 < func_hash.token_line_offsets.len()
                        {
                            func_hash.token_line_offsets[end_token_idx - 1]
                        } else {
                            start_line_offset
                        };

                        // Hash equality can collide; verify with a token-level
                        // similarity check before reporting.
                        let similarity = compute_token_similarity(
                            &func_hash.tokens[start_token_idx..end_token_idx],
                            &cached_loc.tokens,
                        );

                        if similarity >= self.similarity_threshold {
                            // Identical raw source => Type-1 (exact clone);
                            // otherwise Type-2 (renamed/normalized clone).
                            let clone_type = if func_hash.raw_body == cached_loc.raw_source {
                                CloneType::Type1
                            } else {
                                CloneType::Type2
                            };

                            duplicates.push(DuplicateMatch {
                                file1: func_hash.file_path.to_string(),
                                file2: cached_loc.file_path.clone(),
                                start_line1: func_hash.start_line + start_line_offset,
                                start_line2: cached_loc.start_line,
                                end_line1: Some(func_hash.start_line + end_line_offset),
                                end_line2: Some(cached_loc.end_line),
                                length: self.min_block_size,
                                similarity,
                                hash,
                                clone_type,
                                edit_distance: None,
                                suppressed_by_directive: None,
                                token_offset1: Some(offset),
                                token_offset2: cached_loc.token_offset,
                                target_length: Some(cached_loc.token_length),
                                duplicate_id: None,
                            });
                        }
                    }
                }
            }
        }

        // --- Type-3: near-miss detection against cached functions that hit.
        if self.enable_type3 && !cached_hits_by_file.is_empty() {
            // Rebuild at most one FunctionHash per (file, start_line) from the
            // cached locations' raw source.
            let mut seen_functions: HashSet<(String, usize)> = HashSet::new();

            for locations in cached_hits_by_file.values() {
                for loc in locations {
                    let token_offset = match loc.token_offset {
                        Some(offset) => offset,
                        None => continue,
                    };

                    // Changed files are already represented in
                    // `function_hashes`; skip them here.
                    let normalized_path = normalize_path(Path::new(&loc.file_path));
                    if changed_set.contains(&normalized_path) {
                        continue;
                    }

                    let (tokens, token_line_offsets) = normalize_with_line_numbers(&loc.raw_source);
                    if tokens.len() < self.min_block_size
                        || token_offset >= token_line_offsets.len()
                    {
                        continue;
                    }

                    // The cached location points at a window inside the
                    // function; back out the function's own start line.
                    let line_offset = token_line_offsets[token_offset];
                    let start_line = loc.start_line.saturating_sub(line_offset);
                    let key = (loc.file_path.clone(), start_line);

                    if !seen_functions.insert(key.clone()) {
                        continue;
                    }

                    let end_line =
                        start_line + token_line_offsets.last().copied().unwrap_or_default();

                    // start_byte/end_byte are unknown for rebuilt entries and
                    // left at 0 (presumably unused downstream — NOTE(review):
                    // confirm nothing reads byte spans from these).
                    cached_function_hashes.push(FunctionHash {
                        file_path: Arc::<str>::from(key.0),
                        function_name: None,
                        start_byte: 0,
                        end_byte: 0,
                        start_line,
                        end_line,
                        tokens,
                        token_line_offsets,
                        raw_body: loc.raw_source.clone(),
                    });
                }
            }

            if !cached_function_hashes.is_empty() {
                // (file1, file2, start1, start2, src_off, tgt_off, length)
                type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

                let mut seen_pairs: HashSet<SeenPairKey<'_>> = HashSet::new();

                // Seed with the Type-1/2 matches already found so Type-3 does
                // not re-report the same spans.
                for dup in &duplicates {
                    if let (Some(offset1), Some(offset2)) = (dup.token_offset1, dup.token_offset2) {
                        if let (Some(func1), Some(func2)) = (
                            function_hashes.iter().find(|fh| {
                                fh.file_path.as_ref() == dup.file1.as_str()
                                    && fh.start_line <= dup.start_line1
                                    && dup.start_line1 <= fh.end_line
                            }),
                            cached_function_hashes.iter().find(|fh| {
                                fh.file_path.as_ref() == dup.file2.as_str()
                                    && fh.start_line <= dup.start_line2
                                    && dup.start_line2 <= fh.end_line
                            }),
                        ) {
                            seen_pairs.insert(canonical_pair_key(
                                func1, func2, offset1, offset2, dup.length,
                            ));
                        }
                    }
                }

                let mut type3_candidates = Vec::new();

                // All-pairs comparison: changed functions x cached functions.
                for func1 in &function_hashes {
                    for func2 in &cached_function_hashes {
                        let type3_matches = detect_type3_clones(
                            &func1.tokens,
                            &func2.tokens,
                            self.min_block_size,
                            self.type3_tolerance,
                        );

                        for clone_match in type3_matches {
                            let pair_key = canonical_pair_key(
                                func1,
                                func2,
                                clone_match.source_start,
                                clone_match.target_start,
                                clone_match.length,
                            );

                            if seen_pairs.contains(&pair_key) {
                                continue;
                            }

                            type3_candidates.push((func1, func2, clone_match));
                        }
                    }
                }

                let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

                for (func1, func2, clone_match) in deduplicated {
                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) = self.compute_line_span(
                        func2,
                        clone_match.target_start,
                        clone_match.target_length,
                    );

                    // Never report a span as a clone of itself.
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    let window1 = &func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length];
                    let window2 = &func2.tokens[clone_match.target_start
                        ..clone_match.target_start + clone_match.target_length];

                    let edit_dist = hashing::compute_token_edit_distance(window1, window2);
                    let match_hash = Self::compute_match_hash(window1);

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type: CloneType::Type3,
                        edit_distance: Some(edit_dist),
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.target_length),
                        duplicate_id: None,
                    });
                }
            }
        }

        // Duplicates among the changed files themselves.
        let intra_duplicates = self.find_duplicate_hashes(&function_hashes);
        duplicates.extend(intra_duplicates);

        // Deterministic order, then drop exact positional repeats
        // (dedup_by requires adjacency, hence the sort first).
        duplicates.sort_by(|a, b| {
            (&a.file1, &a.file2, a.start_line1, a.start_line2).cmp(&(
                &b.file1,
                &b.file2,
                b.start_line1,
                b.start_line2,
            ))
        });
        duplicates.dedup_by(|a, b| {
            a.file1 == b.file1
                && a.file2 == b.file2
                && a.start_line1 == b.start_line1
                && a.start_line2 == b.start_line2
        });

        // Build the lookup set used for ID computation / directive filtering:
        // changed functions plus any cached functions rebuilt above.
        let mut lookup_function_hashes = function_hashes.clone();
        if !cached_function_hashes.is_empty() {
            lookup_function_hashes.extend(cached_function_hashes.clone());
        }
        let hashed_files: HashSet<&str> = lookup_function_hashes
            .iter()
            .map(|fh| fh.file_path.as_ref())
            .collect();

        // Some duplicate endpoints may reference files we never hashed this
        // pass; collect and re-analyze them so both sides of every duplicate
        // are available when computing IDs.
        let mut missing_files: HashSet<String> = HashSet::new();
        for dup in &duplicates {
            if !hashed_files.contains(dup.file1.as_str()) {
                missing_files.insert(dup.file1.clone());
            }
            if !hashed_files.contains(dup.file2.as_str()) {
                missing_files.insert(dup.file2.clone());
            }
        }

        if !missing_files.is_empty() {
            let missing_paths: Vec<PathBuf> = missing_files.iter().map(PathBuf::from).collect();
            let (mut extra_hashes, _) = self.analyze_files(&missing_paths)?;
            lookup_function_hashes.append(&mut extra_hashes);
        }

        self.compute_duplicate_ids(&lookup_function_hashes, &mut duplicates);
        self.filter_ignored_duplicates(&mut duplicates);

        // Honor in-source suppression directives (changed AND cached files).
        if self.enable_directives && !duplicates.is_empty() {
            let directive_paths: HashSet<PathBuf> = lookup_function_hashes
                .iter()
                .map(|fh| PathBuf::from(fh.file_path.as_ref()))
                .collect();
            let directives_map =
                self.collect_directives(&directive_paths.into_iter().collect::<Vec<_>>());

            if !directives_map.is_empty() {
                self.apply_directive_filtering(
                    &mut duplicates,
                    &directives_map,
                    &lookup_function_hashes,
                );
            }
        }

        // Refresh the cache with this pass's hashes for the next run.
        self.add_hashes_to_cache(&function_hashes, cache);

        let stats = self.compute_stats(&function_hashes, total_lines, start_time);

        Ok(Report {
            version: None,
            scan_time: None,
            config: None,
            files_scanned: changed_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats,
        })
    }
1687}
1688
1689impl Default for Scanner {
1690 fn default() -> Self {
1691 Self::new() }
1693}
1694
1695pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
1703 let scanner = Scanner::new();
1704 let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
1705 scanner.scan(path_bufs)
1706}
1707
1708pub fn find_duplicates_with_config(
1710 paths: Vec<String>,
1711 min_block_size: usize,
1712 similarity_threshold: f64,
1713) -> Result<Report> {
1714 let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
1715 let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
1716 scanner.scan(path_bufs)
1717}
1718
1719#[cfg(test)]
1720mod tests {
1721 use super::*;
1722
1723 fn make_test_function(
1725 file: &str,
1726 start_line: usize,
1727 tokens: Vec<Token>,
1728 raw_body: &str,
1729 ) -> FunctionHash {
1730 let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
1731 FunctionHash {
1732 file_path: Arc::<str>::from(file),
1733 function_name: None,
1734 start_byte: 0,
1735 end_byte: 0,
1736 start_line,
1737 end_line: start_line + tokens.len(),
1738 tokens,
1739 token_line_offsets,
1740 raw_body: raw_body.to_string(),
1741 }
1742 }
1743
1744 fn make_test_function_same_line(
1746 file: &str,
1747 start_line: usize,
1748 end_line: usize,
1749 tokens: Vec<Token>,
1750 raw_body: &str,
1751 ) -> FunctionHash {
1752 let token_line_offsets: Vec<usize> = vec![0; tokens.len()];
1753 FunctionHash {
1754 file_path: Arc::<str>::from(file),
1755 function_name: None,
1756 start_byte: 0,
1757 end_byte: 0,
1758 start_line,
1759 end_line,
1760 tokens,
1761 token_line_offsets,
1762 raw_body: raw_body.to_string(),
1763 }
1764 }
1765
1766 fn make_expr_tokens(keyword: &str, op: &str) -> Vec<Token> {
1768 vec![
1769 Token::Keyword(keyword.into()),
1770 Token::Identifier,
1771 Token::Operator(op.into()),
1772 Token::Identifier,
1773 Token::Punctuation(";".into()),
1774 ]
1775 }
1776
1777 #[test]
1778 fn test_scanner_creation() {
1779 let _scanner = Scanner::new(); }
1781
1782 #[test]
1783 fn test_scanner_with_config() {
1784 let scanner = Scanner::with_config(30, 0.9);
1785 assert!(scanner.is_ok());
1786 let s = scanner.unwrap();
1787 assert_eq!(s.min_block_size, 30);
1788 assert_eq!(s.similarity_threshold, 0.9);
1789 }
1790
1791 #[test]
1792 fn test_type3_tolerance_validation() {
1793 assert!(Scanner::new().with_type3_detection(0.9).is_ok());
1794 assert!(Scanner::new().with_type3_detection(1.2).is_err());
1795 assert!(Scanner::new().with_type3_detection(-0.1).is_err());
1796 }
1797
1798 #[test]
1799 fn test_type3_not_dropped_when_functions_share_offsets() {
1800 fn make_function(
1801 file: &str,
1802 start_line: usize,
1803 tokens: Vec<Token>,
1804 raw_body: &str,
1805 ) -> FunctionHash {
1806 let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
1807 FunctionHash {
1808 file_path: Arc::<str>::from(file),
1809 function_name: None,
1810 start_byte: 0,
1811 end_byte: 0,
1812 start_line,
1813 end_line: start_line + tokens.len(),
1814 tokens,
1815 token_line_offsets,
1816 raw_body: raw_body.to_string(),
1817 }
1818 }
1819
1820 let scanner = Scanner::with_config(3, 0.85)
1821 .unwrap()
1822 .with_type3_detection(0.6)
1823 .unwrap();
1824
1825 let type1_tokens = vec![
1826 Token::Keyword("return".into()),
1827 Token::NumberLiteral,
1828 Token::Punctuation(";".into()),
1829 ];
1830 let near_tokens_a = vec![
1831 Token::Keyword("compute".into()),
1832 Token::Identifier,
1833 Token::Identifier,
1834 ];
1835 let near_tokens_b = vec![
1836 Token::Keyword("compute".into()),
1837 Token::Identifier,
1838 Token::NumberLiteral,
1839 ];
1840
1841 let functions = vec![
1842 make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
1843 make_function("file_b.rs", 20, type1_tokens, "return 1;"),
1844 make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
1845 make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
1846 ];
1847
1848 let duplicates = scanner.find_duplicate_hashes(&functions);
1849
1850 let type1_present = duplicates.iter().any(|d| {
1851 matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
1852 && d.start_line1 == 10
1853 && d.start_line2 == 20
1854 });
1855 assert!(
1856 type1_present,
1857 "expected Type-1/2 match for the first function pair"
1858 );
1859
1860 let type3_present = duplicates.iter().any(|d| {
1861 matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
1862 });
1863 assert!(
1864 type3_present,
1865 "Type-3 match between later functions should not be deduped"
1866 );
1867
1868 assert_eq!(
1869 duplicates.len(),
1870 2,
1871 "should keep both the Type-1/2 and Type-3 matches"
1872 );
1873 }
1874
1875 #[test]
1876 fn test_type3_reports_token_offsets_in_start_lines() {
1877 let scanner = Scanner::with_config(3, 0.85)
1878 .unwrap()
1879 .with_type3_detection(0.75)
1880 .unwrap();
1881
1882 let functions = vec![
1883 make_test_function_same_line(
1884 "file_a.rs",
1885 100,
1886 105,
1887 make_expr_tokens("let", "+"),
1888 "let a = b + c;",
1889 ),
1890 make_test_function_same_line(
1891 "file_b.rs",
1892 200,
1893 205,
1894 make_expr_tokens("mut", "-"),
1895 "let a = b - c;",
1896 ),
1897 ];
1898
1899 let duplicates = scanner.find_duplicate_hashes(&functions);
1900
1901 let type3 = duplicates
1902 .iter()
1903 .find(|d| matches!(d.clone_type, CloneType::Type3))
1904 .expect("expected a Type-3 duplicate match");
1905
1906 assert_eq!(
1907 type3.start_line1, 100,
1908 "should report the actual source line even when tokens share a line"
1909 );
1910 assert_eq!(
1911 type3.start_line2, 200,
1912 "should report the actual target line even when tokens share a line"
1913 );
1914 assert_eq!(type3.token_offset1, Some(1));
1915 assert_eq!(type3.token_offset2, Some(1));
1916 }
1917
1918 #[test]
1919 fn type3_duplicate_ids_are_symmetric() {
1920 use tempfile::TempDir;
1921
1922 let tokens_a = make_expr_tokens("let", "+");
1923 let mut tokens_b = make_expr_tokens("let", "-");
1925 tokens_b.push(Token::Identifier);
1926
1927 let func_a = make_test_function("file_a.rs", 10, tokens_a.clone(), "fn file_a.rs() {}");
1928 let func_b = make_test_function("file_b.rs", 20, tokens_b.clone(), "fn file_b.rs() {}");
1929
1930 let temp_dir = TempDir::new().unwrap();
1931 let scanner = Scanner::with_config(3, 0.85)
1932 .unwrap()
1933 .with_type3_detection(0.75)
1934 .unwrap()
1935 .with_ignore_manager(IgnoreManager::new(temp_dir.path()));
1936
1937 let forward = scanner.find_duplicate_hashes(&[func_a.clone(), func_b.clone()]);
1938 let reverse = scanner.find_duplicate_hashes(&[func_b, func_a]);
1939
1940 let id_forward = forward
1941 .into_iter()
1942 .find(|d| matches!(d.clone_type, CloneType::Type3))
1943 .and_then(|d| d.duplicate_id)
1944 .expect("expected a Type-3 duplicate ID");
1945
1946 let id_reverse = reverse
1947 .into_iter()
1948 .find(|d| matches!(d.clone_type, CloneType::Type3))
1949 .and_then(|d| d.duplicate_id)
1950 .expect("expected a Type-3 duplicate ID");
1951
1952 assert_eq!(
1953 id_forward, id_reverse,
1954 "Type-3 IDs should not depend on function order"
1955 );
1956 }
1957
1958 #[test]
1959 fn type3_does_not_report_self_matches() {
1960 let scanner = Scanner::with_config(3, 0.85)
1963 .unwrap()
1964 .with_type3_detection(0.75)
1965 .unwrap();
1966
1967 let tokens = make_expr_tokens("let", "+");
1970 let func1 = make_test_function_same_line("same_file.rs", 28, 35, tokens.clone(), "fn a()");
1971 let func2 = make_test_function_same_line("same_file.rs", 28, 35, tokens, "fn a()");
1972
1973 let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);
1974
1975 let self_matches: Vec<_> = duplicates
1977 .iter()
1978 .filter(|d| d.file1 == d.file2 && d.start_line1 == d.start_line2)
1979 .collect();
1980
1981 assert!(
1982 self_matches.is_empty(),
1983 "Type-3 should never report self-matches (same file and line). Found: {:?}",
1984 self_matches
1985 );
1986 }
1987
1988 #[test]
1989 fn type3_still_detects_same_file_different_line_duplicates() {
1990 let scanner = Scanner::with_config(3, 0.85)
1992 .unwrap()
1993 .with_type3_detection(0.75)
1994 .unwrap();
1995
1996 let tokens1 = make_expr_tokens("let", "+");
1998 let mut tokens2 = make_expr_tokens("let", "-");
1999 tokens2.push(Token::Identifier); let func1 = make_test_function_same_line("same_file.rs", 10, 15, tokens1, "fn first()");
2002 let func2 = make_test_function_same_line("same_file.rs", 50, 55, tokens2, "fn second()");
2003
2004 let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);
2005
2006 let same_file_different_line: Vec<_> = duplicates
2007 .iter()
2008 .filter(|d| d.file1 == d.file2 && d.start_line1 != d.start_line2)
2009 .collect();
2010
2011 assert!(
2012 !same_file_different_line.is_empty(),
2013 "Type-3 should still detect duplicates in the same file at different lines"
2014 );
2015 }
2016
2017 #[test]
2018 fn duplicate_matches_store_actual_end_lines() {
2019 let scanner = Scanner::with_config(2, 0.85).unwrap();
2020
2021 let tokens = vec![
2022 Token::Keyword("fn".into()),
2023 Token::Identifier,
2024 Token::Identifier,
2025 Token::Punctuation("{".into()),
2026 Token::Punctuation("}".into()),
2027 ];
2028
2029 let func1 = FunctionHash {
2030 file_path: Arc::<str>::from("file_a.rs"),
2031 function_name: None,
2032 start_byte: 0,
2033 end_byte: 0,
2034 start_line: 10,
2035 end_line: 14,
2036 tokens: tokens.clone(),
2037 token_line_offsets: vec![0, 0, 1, 1, 2],
2038 raw_body: "fn a() {}".to_string(),
2039 };
2040
2041 let func2 = FunctionHash {
2042 file_path: Arc::<str>::from("file_b.rs"),
2043 function_name: None,
2044 start_byte: 0,
2045 end_byte: 0,
2046 start_line: 20,
2047 end_line: 24,
2048 tokens,
2049 token_line_offsets: vec![0, 1, 1, 2, 2],
2050 raw_body: "fn b() {}".to_string(),
2051 };
2052
2053 let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);
2054 let dup = duplicates.first().expect("expected a duplicate match");
2055
2056 assert_eq!(dup.start_line1, 10);
2057 assert_eq!(dup.start_line2, 20);
2058 assert_eq!(dup.end_line1, Some(12));
2059 assert_eq!(dup.end_line2, Some(22));
2060 }
2061
2062 #[test]
2063 fn scan_with_cache_prunes_stale_entries() {
2064 let temp_dir = tempfile::tempdir().unwrap();
2065 let file_a = temp_dir.path().join("a.js");
2066 let file_b = temp_dir.path().join("b.js");
2067
2068 let shared_fn = r#"
2069 function shared() {
2070 return 1 + 1;
2071 }
2072 "#;
2073 std::fs::write(&file_a, shared_fn).unwrap();
2074 std::fs::write(&file_b, shared_fn).unwrap();
2075
2076 let scanner = Scanner::with_config(3, 0.85).unwrap();
2077 let mut cache = scanner
2078 .build_cache(vec![file_a.clone(), file_b.clone()])
2079 .unwrap();
2080
2081 std::thread::sleep(std::time::Duration::from_millis(1100));
2083 std::fs::write(&file_b, "const unrelated = 42;\n").unwrap();
2084
2085 let report = scanner
2086 .scan_with_cache(vec![file_a.clone()], &mut cache)
2087 .unwrap();
2088
2089 assert!(
2090 report.duplicates.is_empty(),
2091 "stale cache entries should be invalidated before lookup"
2092 );
2093 }
2094
2095 #[test]
2096 fn scan_with_cache_repopulates_changed_entries() {
2097 let temp_dir = tempfile::tempdir().unwrap();
2098 let file_a = temp_dir.path().join("a.js");
2099
2100 let original = r#"
2101 function shared() {
2102 return 1 + 1;
2103 }
2104 "#;
2105
2106 let updated = r#"
2107 function shared() {
2108 return 7 + 8;
2109 }
2110 "#;
2111
2112 std::fs::write(&file_a, original).unwrap();
2113
2114 let scanner = Scanner::with_config(3, 0.85).unwrap();
2115 let mut cache = scanner.build_cache(vec![file_a.clone()]).unwrap();
2116
2117 std::thread::sleep(std::time::Duration::from_millis(1100));
2118 std::fs::write(&file_a, updated).unwrap();
2119
2120 let file_a_str = file_a.to_string_lossy().to_string();
2121 assert!(
2122 cache.file_needs_rescan(&file_a_str),
2123 "modified files should be considered stale before cache lookup"
2124 );
2125
2126 scanner
2127 .scan_with_cache(vec![file_a.clone()], &mut cache)
2128 .unwrap();
2129
2130 let cached_entries: Vec<&CodeLocation> = cache
2131 .hash_index
2132 .values()
2133 .flat_map(|locs| locs.iter())
2134 .filter(|loc| loc.file_path == file_a_str)
2135 .collect();
2136
2137 assert!(
2138 !cached_entries.is_empty(),
2139 "changed files should be added back into the cache after rescan"
2140 );
2141 assert!(
2142 cached_entries
2143 .iter()
2144 .any(|loc| loc.raw_source.contains("return 7 + 8;")),
2145 "cache should contain hashes for the refreshed file contents"
2146 );
2147 assert!(
2148 cache.file_metadata.contains_key(&file_a_str),
2149 "file metadata should be refreshed after rescanning changed files"
2150 );
2151 }
2152
2153 #[test]
2154 fn scan_with_cache_rehydrates_stale_unchanged_files() {
2155 let temp_dir = tempfile::tempdir().unwrap();
2156 let changed_file = temp_dir.path().join("changed.js");
2157 let unchanged_file = temp_dir.path().join("unchanged.js");
2158
2159 let shared_fn = r#"
2160 function shared() {
2161 return 1 + 1;
2162 }
2163 "#;
2164
2165 std::fs::write(&changed_file, shared_fn).unwrap();
2166 std::fs::write(&unchanged_file, shared_fn).unwrap();
2167
2168 let scanner = Scanner::with_config(3, 0.85).unwrap();
2169 let mut cache = scanner
2170 .build_cache(vec![temp_dir.path().to_path_buf()])
2171 .unwrap();
2172
2173 std::thread::sleep(std::time::Duration::from_millis(1100));
2175 std::fs::write(
2176 &changed_file,
2177 r#"
2178 function shared() {
2179 return 1 + 1;
2180 }
2181 function another() {
2182 return 1 + 1;
2183 }
2184 "#,
2185 )
2186 .unwrap();
2187 std::fs::write(&unchanged_file, shared_fn).unwrap();
2188
2189 let report = scanner
2190 .scan_with_cache(vec![changed_file.clone()], &mut cache)
2191 .unwrap();
2192
2193 assert!(
2194 report.duplicates.iter().any(|dup| {
2195 (dup.file1.ends_with("changed.js") && dup.file2.ends_with("unchanged.js"))
2196 || (dup.file1.ends_with("unchanged.js") && dup.file2.ends_with("changed.js"))
2197 }),
2198 "invalidated entries should be rebuilt so unchanged files still match against diffs"
2199 );
2200 }
2201
2202 #[test]
2203 fn scan_with_cache_respects_ignore_file() {
2204 let temp_dir = tempfile::tempdir().unwrap();
2205 let file_a = temp_dir.path().join("a.js");
2206 let file_b = temp_dir.path().join("b.js");
2207
2208 let shared_fn = r#"
2209 function shared() {
2210 return 1 + 1;
2211 }
2212 "#;
2213 std::fs::write(&file_a, shared_fn).unwrap();
2214 std::fs::write(&file_b, shared_fn).unwrap();
2215
2216 let base_scanner = Scanner::with_config(3, 0.85).unwrap();
2217 let mut cache = base_scanner
2218 .build_cache(vec![temp_dir.path().to_path_buf()])
2219 .unwrap();
2220
2221 let initial_report = base_scanner
2222 .scan_with_cache(vec![file_a.clone()], &mut cache)
2223 .unwrap();
2224 assert!(
2225 !initial_report.duplicates.is_empty(),
2226 "expected an initial duplicate to seed ignore entries"
2227 );
2228 let ignored_ids: Vec<String> = initial_report
2229 .duplicates
2230 .iter()
2231 .map(|d| {
2232 d.duplicate_id
2233 .clone()
2234 .expect("expected cache path to compute duplicate IDs")
2235 })
2236 .collect();
2237
2238 let mut manager = IgnoreManager::new(temp_dir.path());
2239 for id in ignored_ids {
2240 manager.add_ignore(IgnoreEntry::new(
2241 id,
2242 vec![],
2243 "test ignore".to_string(),
2244 "tester".to_string(),
2245 ));
2246 }
2247
2248 let scanner = base_scanner.with_ignore_manager(manager);
2249 let report = scanner
2250 .scan_with_cache(vec![file_a.clone()], &mut cache)
2251 .unwrap();
2252
2253 assert!(
2254 report.duplicates.is_empty(),
2255 "duplicates present in .polydup-ignore should be filtered when using cache"
2256 );
2257 }
2258
2259 #[test]
2260 fn scan_with_cache_uses_symmetric_ids_for_existing_ignores() {
2261 let temp_dir = tempfile::tempdir().unwrap();
2262 let file_a = temp_dir.path().join("a.js");
2263 let file_b = temp_dir.path().join("b.js");
2264
2265 let shared_fn = r#"
2266 function shared() {
2267 return 1 + 1;
2268 }
2269 "#;
2270 std::fs::write(&file_a, shared_fn).unwrap();
2271 std::fs::write(&file_b, shared_fn).unwrap();
2272
2273 let base_scanner = Scanner::with_config(7, 0.85).unwrap();
2274 let mut cache = base_scanner
2275 .build_cache(vec![temp_dir.path().to_path_buf()])
2276 .unwrap();
2277
2278 let baseline_report = base_scanner
2279 .scan(vec![temp_dir.path().to_path_buf()])
2280 .unwrap();
2281 let baseline_id = baseline_report
2282 .duplicates
2283 .first()
2284 .and_then(|dup| dup.duplicate_id.clone())
2285 .expect("expected duplicate IDs from full scans");
2286 let baseline_id_for_ignore = baseline_id.clone();
2287
2288 let mut manager = IgnoreManager::new(temp_dir.path());
2289 manager.add_ignore(IgnoreEntry::new(
2290 baseline_id_for_ignore,
2291 vec![],
2292 "test ignore".to_string(),
2293 "tester".to_string(),
2294 ));
2295
2296 let scanner = base_scanner.with_ignore_manager(manager);
2297 let report = scanner
2298 .scan_with_cache(vec![file_a.clone()], &mut cache)
2299 .unwrap();
2300
2301 assert!(
2302 report.duplicates.is_empty(),
2303 "cached scans should honor ignores generated from full scans"
2304 );
2305 }
2306
2307 #[test]
2308 fn scan_with_cache_respects_directives_from_cached_files() {
2309 let temp_dir = tempfile::tempdir().unwrap();
2310 let changed_file = temp_dir.path().join("changed.js");
2311 let cached_file = temp_dir.path().join("cached.js");
2312
2313 let suppressed_fn = r#"
2314 // polydup-ignore: generated code
2315 function shared() {
2316 return 1 + 1;
2317 }
2318 "#;
2319
2320 let changed_fn = r#"
2321 function shared() {
2322 return 1 + 1;
2323 }
2324 "#;
2325
2326 std::fs::write(&cached_file, suppressed_fn).unwrap();
2327 std::fs::write(&changed_file, changed_fn).unwrap();
2328
2329 let scanner = Scanner::with_config(3, 0.85).unwrap().with_directives(true);
2330 let mut cache = scanner
2331 .build_cache(vec![temp_dir.path().to_path_buf()])
2332 .unwrap();
2333
2334 let report = scanner
2335 .scan_with_cache(vec![changed_file.clone()], &mut cache)
2336 .unwrap();
2337
2338 assert!(
2339 report.duplicates.is_empty(),
2340 "duplicates suppressed by directives in cached files should stay suppressed when using cache"
2341 );
2342 }
2343
2344 #[test]
2345 fn scan_with_cache_runs_type3_detection_against_cached_files() {
2346 let temp_dir = tempfile::tempdir().unwrap();
2347 let changed_file = temp_dir.path().join("changed.js");
2348 let cached_file = temp_dir.path().join("cached.js");
2349
2350 let cached_fn = r#"
2351 function cached() {
2352 step1();
2353 step2();
2354 step3();
2355 step4();
2356 step5();
2357 }
2358 "#;
2359
2360 let changed_fn = r#"
2361 function cached() {
2362 step1();
2363 step2();
2364 insert_gap();
2365 step3();
2366 step4();
2367 step5();
2368 }
2369 "#;
2370
2371 std::fs::write(&cached_file, cached_fn).unwrap();
2372 std::fs::write(&changed_file, changed_fn).unwrap();
2373
2374 let scanner = Scanner::with_config(3, 0.8)
2375 .unwrap()
2376 .with_type3_detection(0.8)
2377 .unwrap();
2378 let mut cache = scanner
2379 .build_cache(vec![temp_dir.path().to_path_buf()])
2380 .unwrap();
2381
2382 let report = scanner
2383 .scan_with_cache(vec![changed_file.clone()], &mut cache)
2384 .unwrap();
2385
2386 assert!(
2387 report.duplicates.iter().any(|dup| {
2388 matches!(dup.clone_type, CloneType::Type3)
2389 && dup.file1.ends_with("changed.js")
2390 && dup.file2.ends_with("cached.js")
2391 }),
2392 "Type-3 should run for cached comparisons so near-miss clones surface in git-diff mode"
2393 );
2394 }
2395
2396 #[test]
2397 fn test_find_duplicates_empty() {
2398 let result = find_duplicates(vec![]);
2399 assert!(result.is_ok());
2400 let report = result.unwrap();
2401 assert_eq!(report.duplicates.len(), 0);
2402 }
2403
2404 #[test]
2405 fn test_is_supported_file() {
2406 let scanner = Scanner::new();
2407
2408 assert!(scanner.is_supported_file(Path::new("test.rs")));
2409 assert!(scanner.is_supported_file(Path::new("test.py")));
2410 assert!(scanner.is_supported_file(Path::new("test.js")));
2411 assert!(scanner.is_supported_file(Path::new("test.ts")));
2412 assert!(!scanner.is_supported_file(Path::new("test.txt")));
2413 assert!(!scanner.is_supported_file(Path::new("test.md")));
2414 }
2415
2416 #[test]
2417 fn test_detect_language() {
2418 let scanner = Scanner::new();
2419
2420 assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
2421 assert!(scanner.detect_language(Path::new("test.py")).is_ok());
2422 assert!(scanner.detect_language(Path::new("test.js")).is_ok());
2423 assert!(scanner.detect_language(Path::new("test.txt")).is_err());
2424 }
2425}