1mod cache;
8mod directives;
9mod error;
10mod hashing;
11mod ignore_rules;
12mod parsing;
13mod queries;
14
15#[cfg(test)]
16mod proptest_fuzzing;
17
18#[cfg(test)]
19mod snapshot_tests;
20
21pub use cache::{CacheStats, CodeLocation, FileCacheMetadata, HashCache};
23pub use directives::{detect_directives, detect_directives_in_file, Directive, FileDirectives};
24pub use error::{PolyDupError, Result};
25pub use hashing::{
26 compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
27 compute_window_hash, detect_duplicates_with_extension, detect_type3_clones, extend_match,
28 normalize, normalize_with_line_numbers, verify_cross_window_match, CloneMatch, RollingHash,
29 Token,
30};
31pub use ignore_rules::{
32 compute_duplicate_id, compute_symmetric_duplicate_id, FileRange, IgnoreEntry, IgnoreManager,
33};
34pub use parsing::{
35 extract_functions, extract_javascript_functions, extract_python_functions,
36 extract_rust_functions, FunctionNode,
37};
38
39use anyhow::Context;
40use globset::GlobSet;
41use ignore::WalkBuilder;
42use once_cell::sync::OnceCell;
43use rayon::prelude::*;
44use serde::{Deserialize, Serialize};
45use std::collections::{HashMap, HashSet};
46use std::fs;
47use std::path::{Path, PathBuf};
48use std::sync::Arc;
49use tree_sitter::Language;
50
/// Metadata describing one language the scanner knows about.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LanguageInfo {
    /// Human-readable language name (e.g. "Rust").
    pub name: &'static str,
    /// File extensions (without the dot) mapped to this language.
    pub extensions: &'static [&'static str],
    /// Name of the tree-sitter grammar used to parse it.
    pub parser: &'static str,
    /// Whether Type-3 (near-miss) clone detection is available for it.
    pub type3_support: bool,
    /// Current implementation status for this language.
    pub status: LanguageStatus,
}
65
/// How complete the support for a language currently is.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LanguageStatus {
    /// Fully supported.
    Full,
    /// Works, but with known gaps (e.g. framework single-file components
    /// parsed with the plain JavaScript grammar).
    Partial,
    /// On the roadmap; not yet implemented.
    Planned,
}
76
77pub fn get_supported_languages() -> Vec<LanguageInfo> {
79 vec![
80 LanguageInfo {
81 name: "Rust",
82 extensions: &["rs"],
83 parser: "tree-sitter-rust",
84 type3_support: true,
85 status: LanguageStatus::Full,
86 },
87 LanguageInfo {
88 name: "Python",
89 extensions: &["py", "pyi"],
90 parser: "tree-sitter-python",
91 type3_support: true,
92 status: LanguageStatus::Full,
93 },
94 LanguageInfo {
95 name: "JavaScript",
96 extensions: &["js", "mjs", "cjs"],
97 parser: "tree-sitter-javascript",
98 type3_support: true,
99 status: LanguageStatus::Full,
100 },
101 LanguageInfo {
102 name: "TypeScript",
103 extensions: &["ts", "mts", "cts"],
104 parser: "tree-sitter-javascript",
105 type3_support: true,
106 status: LanguageStatus::Full,
107 },
108 LanguageInfo {
109 name: "JSX",
110 extensions: &["jsx"],
111 parser: "tree-sitter-javascript",
112 type3_support: true,
113 status: LanguageStatus::Full,
114 },
115 LanguageInfo {
116 name: "TSX",
117 extensions: &["tsx"],
118 parser: "tree-sitter-javascript",
119 type3_support: true,
120 status: LanguageStatus::Full,
121 },
122 LanguageInfo {
123 name: "Vue",
124 extensions: &["vue"],
125 parser: "tree-sitter-javascript",
126 type3_support: true,
127 status: LanguageStatus::Partial,
128 },
129 LanguageInfo {
130 name: "Svelte",
131 extensions: &["svelte"],
132 parser: "tree-sitter-javascript",
133 type3_support: true,
134 status: LanguageStatus::Partial,
135 },
136 LanguageInfo {
137 name: "Go",
138 extensions: &["go"],
139 parser: "tree-sitter-go",
140 type3_support: true,
141 status: LanguageStatus::Planned,
142 },
143 LanguageInfo {
144 name: "Java",
145 extensions: &["java"],
146 parser: "tree-sitter-java",
147 type3_support: true,
148 status: LanguageStatus::Planned,
149 },
150 LanguageInfo {
151 name: "C/C++",
152 extensions: &["c", "cc", "cpp", "cxx", "h", "hpp"],
153 parser: "tree-sitter-cpp",
154 type3_support: true,
155 status: LanguageStatus::Planned,
156 },
157 ]
158}
159
/// Classification of a detected clone; serialized as `"type-1"` / `"type-2"`
/// / `"type-3"`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Exact copy (identical when whitespace is ignored).
    #[serde(rename = "type-1")]
    Type1,
    /// Identical only after token normalization (renamed identifiers etc.).
    #[serde(rename = "type-2")]
    Type2,
    /// Near-miss clone with small edits between the two windows.
    #[serde(rename = "type-3")]
    Type3,
}
173
/// True when the half-open ranges `[start1, end1)` and `[start2, end2)`
/// intersect; empty ranges never overlap anything.
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    // Two half-open intervals are disjoint iff one ends before the other
    // starts; overlap is the negation of that.
    !(end1 <= start2 || end2 <= start1)
}
178
179fn canonical_pair_key<'a>(
181 func1: &'a FunctionHash,
182 func2: &'a FunctionHash,
183 source_start: usize,
184 target_start: usize,
185 length: usize,
186) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
187 if func1.file_path.as_ref() < func2.file_path.as_ref() {
188 (
189 func1.file_path.as_ref(),
190 func2.file_path.as_ref(),
191 func1.start_line,
192 func2.start_line,
193 source_start,
194 target_start,
195 length,
196 )
197 } else {
198 (
199 func2.file_path.as_ref(),
200 func1.file_path.as_ref(),
201 func2.start_line,
202 func1.start_line,
203 target_start,
204 source_start,
205 length,
206 )
207 }
208}
209
/// One reported clone between two code locations.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    /// Path of the first file.
    pub file1: String,
    /// Path of the second file (may equal `file1` for intra-file clones).
    pub file2: String,
    /// Start line of the clone in `file1`.
    pub start_line1: usize,
    /// Start line of the clone in `file2`.
    pub start_line2: usize,
    /// End line in `file1`, when known (omitted from JSON otherwise).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line1: Option<usize>,
    /// End line in `file2`, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line2: Option<usize>,
    /// Clone length in normalized tokens (source side).
    pub length: usize,
    /// Similarity score of the match.
    pub similarity: f64,
    /// Hash of the matched token window, used as a grouping key.
    pub hash: u64,
    /// Type-1/2/3 classification.
    pub clone_type: CloneType,
    /// Token edit distance between the two windows (set for Type-3 matches).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
    /// Set when an inline directive suppressed this match.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suppressed_by_directive: Option<bool>,
    // Token offset of the match inside the first function (internal only,
    // never serialized; used to recover the token window for id computation).
    #[serde(skip)]
    token_offset1: Option<usize>,
    // Token offset inside the second function (internal only).
    #[serde(skip)]
    token_offset2: Option<usize>,
    // Window length on the target side; differs from `length` for Type-3.
    #[serde(skip)]
    target_length: Option<usize>,
    /// Stable content-based id referenced by the ignore file.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duplicate_id: Option<String>,
}
244
245#[derive(Debug, Clone)]
247struct FunctionHash {
248 file_path: Arc<str>, #[allow(dead_code)] function_name: Option<String>,
251 #[allow(dead_code)] start_byte: usize,
253 #[allow(dead_code)] end_byte: usize,
255 start_line: usize,
256 #[allow(dead_code)] end_line: usize,
258 tokens: Vec<Token>, token_line_offsets: Vec<usize>,
261 raw_body: String, }
263
/// Snapshot of accepted duplicates, used to report only newly-introduced
/// ones on later scans.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    /// Version of the tool that produced this baseline.
    pub version: String,
    /// RFC 3339 creation timestamp.
    pub created_at: String,
    /// The duplicates accepted at baseline time.
    pub duplicates: Vec<DuplicateMatch>,
}
274
275impl Baseline {
276 pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
278 Self {
279 version: env!("CARGO_PKG_VERSION").to_string(),
280 created_at: chrono::Utc::now().to_rfc3339(),
281 duplicates,
282 }
283 }
284
285 pub fn save_to_file(&self, path: &Path) -> Result<()> {
287 let json =
288 serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
289 fs::write(path, json).context("Failed to write baseline file")?;
290 Ok(())
291 }
292
293 pub fn load_from_file(path: &Path) -> Result<Self> {
295 let content = fs::read_to_string(path)
296 .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
297 let baseline: Baseline =
298 serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
299 Ok(baseline)
300 }
301
302 pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
304 let baseline_set: std::collections::HashSet<_> =
305 self.duplicates.iter().map(duplicate_key).collect();
306
307 current
308 .iter()
309 .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
310 .cloned()
311 .collect()
312 }
313}
314
315fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
317 let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
319 (
320 dup.file1.clone(),
321 dup.file2.clone(),
322 dup.start_line1,
323 dup.start_line2,
324 )
325 } else {
326 (
327 dup.file2.clone(),
328 dup.file1.clone(),
329 dup.start_line2,
330 dup.start_line1,
331 )
332 };
333 (file1, file2, line1, line2, dup.length)
334}
335
/// A file that could not be processed, with a human-readable reason.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SkippedFile {
    /// Display path of the skipped file.
    pub path: String,
    /// Why it was skipped (I/O error, parse failure, …).
    pub reason: String,
}
344
/// Result of a full scan, ready for serialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// Tool version; filled in by the caller, omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    /// Timestamp of the scan; filled in by the caller.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    /// Configuration used; filled in by the caller.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    /// Number of files successfully processed (collected minus skipped).
    pub files_scanned: usize,
    /// Number of functions extracted and analyzed.
    pub functions_analyzed: usize,
    /// All surviving duplicate matches.
    pub duplicates: Vec<DuplicateMatch>,
    /// Files that could not be processed; an empty list is omitted from JSON.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub skipped_files: Vec<SkippedFile>,
    /// Aggregate statistics for the scan.
    pub stats: ScanStats,
}
369
/// Scan settings echoed into the report for reproducibility.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    /// Minimum clone size in tokens.
    pub threshold: usize,
    /// Similarity threshold used for matching.
    pub similarity: f64,
    /// Whether Type-3 detection was enabled.
    pub type3_enabled: bool,
    /// Paths that were scanned, when recorded.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}
383
/// Serde helper for `skip_serializing_if`: omits counters that are zero.
fn is_zero(n: &usize) -> bool {
    matches!(*n, 0)
}
388
/// Aggregate counters collected during a scan.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total source lines read across all processed files.
    pub total_lines: usize,
    /// Total normalized tokens across all analyzed functions.
    pub total_tokens: usize,
    /// Number of distinct rolling-window hashes observed.
    pub unique_hashes: usize,
    /// Wall-clock duration of the scan, in milliseconds.
    pub duration_ms: u64,
    /// Matches removed because the ignore file listed them (0 is omitted
    /// from JSON).
    #[serde(default, skip_serializing_if = "is_zero")]
    pub suppressed_by_ignore_file: usize,
    /// Matches removed by inline directives (0 is omitted from JSON).
    #[serde(default, skip_serializing_if = "is_zero")]
    pub suppressed_by_directive: usize,
}
407
/// Duplicate-code scanner. Configure via `new`/`with_config` plus the
/// `with_*` builder methods, then call [`Scanner::scan`].
#[allow(dead_code)] pub struct Scanner {
    /// Minimum match size in normalized tokens.
    min_block_size: usize,
    /// Similarity threshold setting (see `with_config`).
    similarity_threshold: f64,
    /// Glob patterns of files/directories to skip.
    exclude_patterns: Vec<String>,
    /// Compiled form of `exclude_patterns`, built lazily on first use.
    exclude_glob_set: OnceCell<GlobSet>,
    /// Whether Type-3 (near-miss) detection runs in addition to Type-1/2.
    enable_type3: bool,
    /// Tolerance used by Type-3 matching (validated to 0.0..=1.0).
    type3_tolerance: f64,
    /// Optional filter dropping duplicates listed in an ignore file.
    ignore_manager: Option<IgnoreManager>,
    /// Whether inline suppression directives are honored.
    enable_directives: bool,
    /// Whether test files are included (see `with_test_files`).
    include_tests: bool,
}
430
/// Glob patterns that exclude common test files by default; removed again
/// when the user opts in via `with_test_files(true)`.
fn default_exclude_patterns() -> Vec<String> {
    [
        "**/*.test.ts",
        "**/*.test.js",
        "**/*.test.tsx",
        "**/*.test.jsx",
        "**/*.spec.ts",
        "**/*.spec.js",
        "**/*.spec.tsx",
        "**/*.spec.jsx",
        "**/__tests__/**",
        "**/*.test.py",
    ]
    .iter()
    .map(|pattern| pattern.to_string())
    .collect()
}
447
/// Glob patterns that exclude build output and VCS metadata; always active.
fn build_artifact_patterns() -> Vec<String> {
    [
        "**/node_modules/**",
        "**/target/**",
        "**/dist/**",
        "**/build/**",
        "**/.git/**",
    ]
    .iter()
    .map(|pattern| pattern.to_string())
    .collect()
}
458
459impl Scanner {
460 pub fn new() -> Self {
464 let mut exclude = build_artifact_patterns();
465 exclude.extend(default_exclude_patterns());
466
467 Self {
468 min_block_size: 50,
469 similarity_threshold: 0.85,
470 exclude_patterns: exclude,
471 exclude_glob_set: OnceCell::new(),
472 enable_type3: false,
473 type3_tolerance: 0.85,
474 ignore_manager: None,
475 enable_directives: false,
476 include_tests: false,
477 }
478 }
479
480 pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
482 let mut exclude = build_artifact_patterns();
483 exclude.extend(default_exclude_patterns());
484
485 Ok(Self {
486 min_block_size,
487 similarity_threshold,
488 exclude_patterns: exclude,
489 exclude_glob_set: OnceCell::new(),
490 enable_type3: false,
491 type3_tolerance: 0.85,
492 ignore_manager: None,
493 enable_directives: false,
494 include_tests: false,
495 })
496 }
497
498 pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
500 self.exclude_patterns = patterns;
501 self
502 }
503
504 pub fn with_test_files(mut self, include: bool) -> Self {
506 self.include_tests = include;
507 if include {
508 let test_patterns = default_exclude_patterns();
510 self.exclude_patterns.retain(|p| !test_patterns.contains(p));
511 }
512 self
513 }
514
515 pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
517 if !(0.0..=1.0).contains(&tolerance) {
518 return Err(PolyDupError::Config(
519 "Type-3 tolerance must be between 0.0 and 1.0".to_string(),
520 ));
521 }
522 self.enable_type3 = true;
523 self.type3_tolerance = tolerance;
524 Ok(self)
525 }
526
527 pub fn with_ignore_manager(mut self, manager: IgnoreManager) -> Self {
529 self.ignore_manager = Some(manager);
530 self
531 }
532
533 pub fn with_directives(mut self, enabled: bool) -> Self {
535 self.enable_directives = enabled;
536 self
537 }
538
    /// Public wrapper around the internal file walker: expands `paths` into
    /// the list of supported, non-excluded source files that a scan would
    /// process.
    pub fn collect_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        self.collect_source_files(paths)
    }
546
547 pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
556 use std::time::Instant;
557 let start_time = Instant::now();
558
559 let source_files = self.collect_source_files(paths)?;
561
562 let directives_map = self.collect_directives(&source_files);
564
565 let (function_hashes, total_lines, skipped_files) = self.analyze_files(&source_files)?;
567
568 let (mut duplicates, suppressed_by_ignore_file) =
570 self.find_duplicate_hashes(&function_hashes);
571
572 let suppressed_by_directive = if self.enable_directives && !directives_map.is_empty() {
574 self.apply_directive_filtering(&mut duplicates, &directives_map, &function_hashes)
575 } else {
576 0
577 };
578
579 let stats = self.compute_stats(
581 &function_hashes,
582 total_lines,
583 start_time,
584 suppressed_by_ignore_file,
585 suppressed_by_directive,
586 );
587
588 let files_scanned = source_files.len().saturating_sub(skipped_files.len());
590
591 Ok(Report {
592 version: None, scan_time: None, config: None, files_scanned,
596 functions_analyzed: function_hashes.len(),
597 duplicates,
598 skipped_files,
599 stats,
600 })
601 }
602
603 fn collect_directives(
605 &self,
606 source_files: &[PathBuf],
607 ) -> HashMap<PathBuf, crate::directives::FileDirectives> {
608 if self.enable_directives {
609 source_files
610 .par_iter()
611 .filter_map(|path| {
612 crate::directives::detect_directives_in_file(path)
613 .ok()
614 .map(|d| (path.clone(), d))
615 })
616 .collect()
617 } else {
618 HashMap::new()
619 }
620 }
621
622 #[allow(clippy::type_complexity)]
625 fn analyze_files(
626 &self,
627 source_files: &[PathBuf],
628 ) -> Result<(Vec<FunctionHash>, usize, Vec<SkippedFile>)> {
629 let results: Vec<(PathBuf, Result<(Vec<FunctionHash>, usize)>)> = source_files
631 .par_iter()
632 .map(|path| {
633 let result = (|| {
634 let content = std::fs::read_to_string(path).map_err(PolyDupError::Io)?;
636 let line_count = content.lines().count();
637
638 let hashes = self.process_file_content(path, &content)?;
640 Ok((hashes, line_count))
641 })();
642 (path.clone(), result)
643 })
644 .collect();
645
646 let mut all_hashes = Vec::new();
648 let mut total_lines = 0;
649 let mut skipped_files = Vec::new();
650
651 for (path, res) in results {
652 match res {
653 Ok((hashes, lines)) => {
654 all_hashes.extend(hashes);
655 total_lines += lines;
656 }
657 Err(e) => {
658 let reason = match &e {
660 PolyDupError::Io(io_err) => {
661 if io_err.kind() == std::io::ErrorKind::PermissionDenied {
662 "Permission denied".to_string()
663 } else {
664 format!("IO error: {}", io_err)
665 }
666 }
667 PolyDupError::Parsing(msg) => format!("Parse error: {}", msg),
668 PolyDupError::Config(msg) => format!("Config error: {}", msg),
669 PolyDupError::LanguageNotSupported(lang) => {
670 format!("Language not supported: {}", lang)
671 }
672 PolyDupError::LanguageDetection(_) => {
673 "Could not detect language".to_string()
674 }
675 PolyDupError::ParallelExecution(msg) => {
676 format!("Parallel execution error: {}", msg)
677 }
678 PolyDupError::IgnoreRule(msg) => format!("Ignore rule error: {}", msg),
679 PolyDupError::Other(e) => format!("Error: {}", e),
680 };
681 skipped_files.push(SkippedFile {
682 path: path.display().to_string(),
683 reason,
684 });
685 }
686 }
687 }
688
689 Ok((all_hashes, total_lines, skipped_files))
690 }
691
692 fn apply_directive_filtering(
695 &self,
696 duplicates: &mut Vec<DuplicateMatch>,
697 directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
698 function_hashes: &[FunctionHash],
699 ) -> usize {
700 let original_count = duplicates.len();
701 for dup in duplicates.iter_mut() {
702 let suppressed = self.is_suppressed_by_directive(dup, directives_map, function_hashes);
703 if suppressed {
704 dup.suppressed_by_directive = Some(true);
705 }
706 }
707
708 duplicates.retain(|dup| dup.suppressed_by_directive != Some(true));
710 original_count - duplicates.len()
711 }
712
    /// Builds the final scan statistics.
    ///
    /// `unique_hashes` is recomputed here by re-rolling hashes over every
    /// function's token stream; this repeats work done during matching but
    /// keeps the stats independent of the detection path.
    fn compute_stats(
        &self,
        function_hashes: &[FunctionHash],
        total_lines: usize,
        start_time: std::time::Instant,
        suppressed_by_ignore_file: usize,
        suppressed_by_directive: usize,
    ) -> ScanStats {
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        // Count distinct rolling-window hashes across all functions.
        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in function_hashes {
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        ScanStats {
            total_lines,
            total_tokens,
            unique_hashes,
            duration_ms,
            suppressed_by_ignore_file,
            suppressed_by_directive,
        }
    }
747
748 fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
753 let mut files = Vec::new();
754
755 for path in paths {
756 if path.is_file() {
757 if self.is_supported_file(&path) && !self.is_excluded(&path) {
758 files.push(path);
759 }
760 } else if path.is_dir() {
761 let walker = WalkBuilder::new(&path)
763 .git_ignore(true) .git_global(true) .git_exclude(true) .ignore(true) .hidden(false) .parents(true) .build();
770
771 for entry in walker {
772 match entry {
773 Ok(entry) => {
774 let path = entry.path();
775 if path.is_file()
776 && self.is_supported_file(path)
777 && !self.is_excluded(path)
778 {
779 files.push(path.to_path_buf());
780 }
781 }
782 Err(err) => {
783 eprintln!("Warning: Failed to access path: {}", err);
785 }
786 }
787 }
788 }
789 }
790
791 Ok(files)
792 }
793
794 fn is_supported_file(&self, path: &Path) -> bool {
796 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
797 matches!(
798 ext,
799 "rs" | "py"
800 | "pyi"
801 | "js"
802 | "mjs"
803 | "cjs"
804 | "ts"
805 | "mts"
806 | "cts"
807 | "jsx"
808 | "tsx"
809 | "vue"
810 | "svelte"
811 )
812 } else {
813 false
814 }
815 }
816
    /// True when `path` matches any exclude glob.
    ///
    /// The glob set is compiled once on first use and cached in the
    /// `OnceCell`. Patterns that fail to parse are dropped, and a failed
    /// build degrades to an empty set (nothing excluded) rather than
    /// aborting the scan.
    fn is_excluded(&self, path: &Path) -> bool {
        let glob_set = self.exclude_glob_set.get_or_init(|| {
            use globset::{Glob, GlobSetBuilder};

            let mut builder = GlobSetBuilder::new();
            for pattern in &self.exclude_patterns {
                // Invalid patterns are silently ignored.
                if let Ok(glob) = Glob::new(pattern) {
                    builder.add(glob);
                }
            }

            builder.build().unwrap_or_else(|_| GlobSet::empty())
        });

        glob_set.is_match(path)
    }
835
836 fn process_file_content(&self, path: &Path, code: &str) -> Result<Vec<FunctionHash>> {
838 let lang = self.detect_language(path)?;
839 let functions = extract_functions(code, lang)?;
840
841 let file_path: Arc<str> = path.to_string_lossy().to_string().into();
843 let mut function_hashes = Vec::new();
844
845 for func in functions {
846 let raw_body = func.body.clone();
848 let (tokens, token_line_offsets) = normalize_with_line_numbers(&func.body);
849
850 if tokens.len() < self.min_block_size {
852 continue;
853 }
854
855 function_hashes.push(FunctionHash {
857 file_path: Arc::clone(&file_path), function_name: func.name.clone(),
859 start_byte: func.start_byte,
860 end_byte: func.end_byte,
861 start_line: func.start_line,
862 end_line: func.end_line,
863 tokens,
864 token_line_offsets,
865 raw_body,
866 });
867 }
868
869 Ok(function_hashes)
870 }
871
872 fn detect_language(&self, path: &Path) -> Result<Language> {
874 let ext = path
875 .extension()
876 .and_then(|e| e.to_str())
877 .ok_or_else(|| PolyDupError::LanguageDetection(path.to_path_buf()))?;
878
879 match ext {
880 "rs" => Ok(tree_sitter_rust::language()),
881 "py" | "pyi" => Ok(tree_sitter_python::language()),
882 "js" | "mjs" | "cjs" | "jsx" | "ts" | "mts" | "cts" | "tsx" | "vue" | "svelte" => {
883 Ok(tree_sitter_javascript::language())
884 }
885 _ => Err(PolyDupError::LanguageNotSupported(ext.to_string())),
886 }
887 }
888
    /// Translates a token window (offset + length within a function) into
    /// absolute `(start_line, end_line)` file coordinates.
    ///
    /// Falls back to `start_line + index` when a token index has no recorded
    /// line offset (defensive; the offsets should cover every token).
    fn compute_line_span(
        &self,
        func: &FunctionHash,
        start_offset: usize,
        length: usize,
    ) -> (usize, usize) {
        let start_line = func
            .token_line_offsets
            .get(start_offset)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + start_offset);

        // Index of the window's last token; saturates so length 0 is safe.
        let end_index = start_offset + length.saturating_sub(1);
        let end_line = func
            .token_line_offsets
            .get(end_index)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + end_index);

        (start_line, end_line)
    }
911
    /// Full duplicate-detection pipeline over the extracted functions:
    /// Type-1/2 exact-hash matching, optional Type-3 near-miss matching,
    /// content-id assignment, then ignore-file filtering.
    ///
    /// Returns the surviving duplicates and the number suppressed by the
    /// ignore file.
    fn find_duplicate_hashes(
        &self,
        function_hashes: &[FunctionHash],
    ) -> (Vec<DuplicateMatch>, usize) {
        // Canonical (fileA, fileB, lines, offsets, length) keys already
        // reported; lets the Type-3 pass skip windows the Type-1/2 pass
        // already emitted.
        type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

        let mut seen_pairs: std::collections::HashSet<SeenPairKey<'_>> =
            std::collections::HashSet::new();

        let mut duplicates = self.find_type12_duplicates(function_hashes, &mut seen_pairs);

        if self.enable_type3 {
            self.find_type3_duplicates(function_hashes, &seen_pairs, &mut duplicates);
        }

        self.compute_duplicate_ids(function_hashes, &mut duplicates);

        let suppressed_count = self.filter_ignored_duplicates(&mut duplicates);

        (duplicates, suppressed_count)
    }
948
    /// Finds exact (Type-1/2) clone matches between every pair of functions.
    ///
    /// Each accepted match's canonical key is recorded in `seen_pairs` so
    /// the later Type-3 pass does not re-report the same window.
    fn find_type12_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &mut std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
    ) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();

        // Pairwise comparison; j > i avoids self-pairs and mirror pairs.
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    // Skip windows already reported in either orientation.
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Grouping hash over the matched source-side window.
                    let match_hash = Self::compute_match_hash(
                        &func1.tokens[clone_match.source_start
                            ..clone_match.source_start + clone_match.length],
                    );

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) =
                        self.compute_line_span(func2, clone_match.target_start, clone_match.length);

                    // A window matching itself at the same location is noise.
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.length),
                        duplicate_id: None,
                    });
                }
            }
        }

        duplicates
    }
1030
    /// Finds Type-3 (near-miss) clones between every pair of functions,
    /// skips windows already reported by the Type-1/2 pass, collapses
    /// overlapping candidates, and appends the survivors to `duplicates`.
    fn find_type3_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
        duplicates: &mut Vec<DuplicateMatch>,
    ) {
        let mut type3_candidates = Vec::new();

        // Gather all candidates first; overlap dedup happens afterwards so
        // the best candidate per cluster can be chosen globally.
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let type3_matches = detect_type3_clones(
                    &func1.tokens,
                    &func2.tokens,
                    self.min_block_size,
                    self.type3_tolerance,
                );

                for clone_match in type3_matches {
                    // Already reported as an exact match — skip.
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }

                    type3_candidates.push((func1, func2, clone_match));
                }
            }
        }

        let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

        for (func1, func2, clone_match) in deduplicated {
            // Type-3 windows may differ in length between the two sides:
            // `length` on the source side, `target_length` on the target.
            let (actual_start1, actual_end1) =
                self.compute_line_span(func1, clone_match.source_start, clone_match.length);
            let (actual_start2, actual_end2) =
                self.compute_line_span(func2, clone_match.target_start, clone_match.target_length);

            // A window matching itself at the same location is noise.
            if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                continue;
            }

            let window1 = &func1.tokens
                [clone_match.source_start..clone_match.source_start + clone_match.length];
            let window2 = &func2.tokens
                [clone_match.target_start..clone_match.target_start + clone_match.target_length];
            let edit_dist = hashing::compute_token_edit_distance(window1, window2);

            let match_hash = Self::compute_match_hash(window1);

            duplicates.push(DuplicateMatch {
                file1: func1.file_path.to_string(),
                file2: func2.file_path.to_string(),
                start_line1: actual_start1,
                start_line2: actual_start2,
                end_line1: Some(actual_end1),
                end_line2: Some(actual_end2),
                length: clone_match.length,
                similarity: clone_match.similarity,
                hash: match_hash,
                clone_type: CloneType::Type3,
                edit_distance: Some(edit_dist),
                suppressed_by_directive: None,
                token_offset1: Some(clone_match.source_start),
                token_offset2: Some(clone_match.target_start),
                target_length: Some(clone_match.target_length),
                duplicate_id: None,
            });
        }
    }
1124
    /// Assigns a stable, content-based id to each duplicate that lacks one,
    /// so the ignore file can reference matches across runs.
    ///
    /// A symmetric id is used when both sides' tokens can be recovered
    /// (same id regardless of side order); otherwise the id derives from
    /// side one alone. When even side one's tokens cannot be located, the
    /// id stays `None`.
    fn compute_duplicate_ids(
        &self,
        function_hashes: &[FunctionHash],
        duplicates: &mut [DuplicateMatch],
    ) {
        for dup in duplicates.iter_mut() {
            if dup.duplicate_id.is_some() {
                continue;
            }

            let tokens1 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file1,
                dup.start_line1,
                dup.end_line1,
                dup.token_offset1,
                dup.length,
            );

            let tokens2 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file2,
                dup.start_line2,
                dup.end_line2,
                dup.token_offset2,
                // Type-3 matches can have a different length on side two.
                dup.target_length.unwrap_or(dup.length),
            );

            if let Some(tokens1) = tokens1 {
                let id = if let Some(tokens2) = tokens2 {
                    ignore_rules::compute_symmetric_duplicate_id(&tokens1, &tokens2)
                } else {
                    ignore_rules::compute_duplicate_id(&tokens1)
                };
                dup.duplicate_id = Some(id);
            }
        }
    }
1166
    /// Recovers the normalized token strings for one side of a duplicate.
    ///
    /// Finds the function in `file` whose line range contains
    /// `reported_start`, then takes `length` tokens starting at
    /// `token_offset` — or, when that offset is missing or out of range, at
    /// an offset inferred from the reported line span. Returns `None` when
    /// no owning function or valid token window is found.
    fn extract_duplicate_tokens(
        &self,
        function_hashes: &[FunctionHash],
        file: &str,
        reported_start: usize,
        reported_end: Option<usize>,
        token_offset: Option<usize>,
        length: usize,
    ) -> Option<Vec<String>> {
        function_hashes.iter().find_map(|fh| {
            // Owning function: same file, line range covers the start line.
            if fh.file_path.as_ref() != file
                || fh.start_line > reported_start
                || reported_start > fh.end_line
            {
                return None;
            }

            // Prefer the recorded offset; fall back to line-based inference.
            let start_offset = match token_offset {
                Some(offset) if offset + length <= fh.tokens.len() => Some(offset),
                _ => self.infer_token_offset(fh, reported_start, reported_end, length),
            }?;

            if start_offset + length > fh.tokens.len() {
                return None;
            }

            Some(
                fh.tokens
                    .iter()
                    .skip(start_offset)
                    .take(length)
                    .map(|t| t.as_hash_string().to_string())
                    .collect(),
            )
        })
    }
1204
1205 fn infer_token_offset(
1207 &self,
1208 func_hash: &FunctionHash,
1209 reported_start: usize,
1210 reported_end: Option<usize>,
1211 length: usize,
1212 ) -> Option<usize> {
1213 let start_line_offset = reported_start.checked_sub(func_hash.start_line)?;
1214 let end_line = reported_end.unwrap_or(reported_start);
1215
1216 func_hash
1217 .token_line_offsets
1218 .iter()
1219 .enumerate()
1220 .filter_map(|(idx, line_offset)| {
1221 if *line_offset != start_line_offset {
1222 return None;
1223 }
1224
1225 let end_idx = idx.checked_add(length.checked_sub(1)?)?;
1226 let end_offset = func_hash.token_line_offsets.get(end_idx)?;
1227 if func_hash.start_line + *end_offset == end_line {
1228 Some(idx)
1229 } else {
1230 None
1231 }
1232 })
1233 .next()
1234 }
1235
1236 fn filter_ignored_duplicates(&self, duplicates: &mut Vec<DuplicateMatch>) -> usize {
1238 let original_count = duplicates.len();
1239 if let Some(ref ignore_manager) = self.ignore_manager {
1240 duplicates.retain(|dup| {
1241 if let Some(ref id) = dup.duplicate_id {
1242 !ignore_manager.is_ignored(id)
1243 } else {
1244 true
1246 }
1247 });
1248 }
1249 original_count - duplicates.len()
1250 }
1251
    /// Hashes a token window with the std default hasher. Used only as a
    /// grouping/reporting key for matches, not for detection itself.
    fn compute_match_hash(tokens: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        tokens.hash(&mut hasher);
        hasher.finish()
    }
1260
1261 fn is_suppressed_by_directive(
1266 &self,
1267 dup: &DuplicateMatch,
1268 directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
1269 function_hashes: &[FunctionHash],
1270 ) -> bool {
1271 let file1_path = PathBuf::from(&dup.file1);
1273 let file2_path = PathBuf::from(&dup.file2);
1274
1275 if let Some(directives) = directives_map.get(&file1_path) {
1277 let func_start =
1278 self.find_owning_function_start(&dup.file1, dup.start_line1, function_hashes);
1279 let check_line = func_start.unwrap_or(dup.start_line1);
1281
1282 if directives.is_suppressed(check_line, check_line).is_some() {
1283 return true;
1284 }
1285 }
1286
1287 if let Some(directives) = directives_map.get(&file2_path) {
1289 let func_start =
1290 self.find_owning_function_start(&dup.file2, dup.start_line2, function_hashes);
1291 let check_line = func_start.unwrap_or(dup.start_line2);
1293
1294 if directives.is_suppressed(check_line, check_line).is_some() {
1295 return true;
1296 }
1297 }
1298
1299 false
1300 }
1301
1302 fn find_owning_function_start(
1304 &self,
1305 file: &str,
1306 line: usize,
1307 function_hashes: &[FunctionHash],
1308 ) -> Option<usize> {
1309 function_hashes
1310 .iter()
1311 .find(|fh| {
1312 fh.file_path.as_ref() == file && fh.start_line <= line && line <= fh.end_line
1313 })
1314 .map(|fh| fh.start_line)
1315 }
1316
    /// Collapses overlapping Type-3 candidates for the same function pair,
    /// keeping per overlap-cluster the candidate with the longest span
    /// (ties broken by higher similarity).
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        // Marks candidates that have been emitted or absorbed.
        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            // Re-scan until no unused candidate overlaps the current best:
            // adopting a longer candidate changes the span and may create
            // new overlaps with candidates examined earlier in the pass.
            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    // Only merge candidates for the same function pair,
                    // in either orientation.
                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    // Candidates must overlap on BOTH sides to be merged.
                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        // Prefer the longer span; equal spans fall back to
                        // similarity.
                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true;
                        }
                        // The overlapping candidate is absorbed either way
                        // and never reported separately.
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }
1407
1408 fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
1410 let normalized1 = raw1.split_whitespace().collect::<String>();
1412 let normalized2 = raw2.split_whitespace().collect::<String>();
1413
1414 if normalized1 == normalized2 {
1416 CloneType::Type1
1417 } else {
1418 CloneType::Type2
1420 }
1421 }
1422
1423 fn find_clones_between_functions(
1425 &self,
1426 func1: &FunctionHash,
1427 func2: &FunctionHash,
1428 ) -> Vec<CloneMatch> {
1429 use std::collections::HashMap;
1430
1431 let mut matches = Vec::new();
1432 let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
1433
1434 let mut i = 0;
1436 while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
1437 let hash = hashing::compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
1438 hash_map.entry(hash).or_default().push(i);
1439 i += 1;
1440 }
1441
1442 let mut j = 0;
1444 while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
1445 let hash = hashing::compute_window_hash(&func2.tokens[j..j + self.min_block_size]);
1446
1447 if let Some(func1_positions) = hash_map.get(&hash) {
1448 for &func1_pos in func1_positions {
1449 if hashing::verify_cross_window_match(
1451 &func1.tokens,
1452 &func2.tokens,
1453 func1_pos,
1454 j,
1455 self.min_block_size,
1456 ) {
1457 let extension = hashing::extend_match(
1459 &func1.tokens,
1460 &func2.tokens,
1461 func1_pos,
1462 j,
1463 self.min_block_size,
1464 );
1465
1466 let total_length = self.min_block_size + extension;
1467
1468 matches.push(CloneMatch {
1469 source_start: func1_pos,
1470 target_start: j,
1471 length: total_length,
1472 target_length: total_length,
1473 similarity: 1.0, });
1475
1476 j += extension.max(1);
1478 break;
1479 }
1480 }
1481 }
1482
1483 j += 1;
1484 }
1485
1486 matches
1487 }
1488
1489 fn add_hashes_to_cache(&self, function_hashes: &[FunctionHash], cache: &mut HashCache) {
1490 for func_hash in function_hashes {
1491 let hashes = compute_rolling_hashes(&func_hash.tokens, self.min_block_size);
1492
1493 for (hash, offset) in hashes {
1494 let end_token_idx = offset + self.min_block_size;
1495 let (start_line, end_line) =
1496 self.compute_line_span(func_hash, offset, self.min_block_size);
1497
1498 let location = CodeLocation {
1499 file_path: func_hash.file_path.to_string(),
1500 start_line,
1501 end_line,
1502 token_offset: Some(offset),
1503 token_length: self.min_block_size,
1504 tokens: func_hash.tokens[offset..end_token_idx].to_vec(),
1505 raw_source: func_hash.raw_body.clone(),
1506 };
1507
1508 cache.add_hash(hash, location);
1509 }
1510 }
1511 }
1512
1513 pub fn build_cache(&self, paths: Vec<PathBuf>) -> Result<HashCache> {
1518 let mut cache = HashCache::new(self.min_block_size);
1519
1520 let source_files = self.collect_source_files(paths)?;
1522
1523 for file_path in source_files {
1525 let content = match std::fs::read_to_string(&file_path) {
1526 Ok(c) => c,
1527 Err(_) => continue, };
1529
1530 let function_hashes = match self.process_file_content(&file_path, &content) {
1531 Ok(fh) => fh,
1532 Err(_) => continue, };
1534
1535 self.add_hashes_to_cache(&function_hashes, &mut cache);
1536 }
1537
1538 Ok(cache)
1539 }
1540
    /// Incrementally scans `changed_files` against a prewarmed `cache`,
    /// reporting duplicates both between changed files and cached code and
    /// among the changed files themselves.
    ///
    /// Phases:
    /// 1. Invalidate stale cache entries; re-hash stale-but-unchanged files
    ///    so they can still match against the diff.
    /// 2. Probe the changed files' rolling hashes against the cache for
    ///    Type-1/Type-2 duplicates.
    /// 3. If enabled, run Type-3 (near-miss) detection between changed
    ///    functions and functions reconstructed from cache hits.
    /// 4. Merge intra-changed-set duplicates, sort/dedup, compute stable
    ///    duplicate IDs, then apply ignore-file and directive suppression.
    /// 5. Refresh the cache with the changed files' hashes and build stats.
    pub fn scan_with_cache(
        &self,
        changed_files: Vec<PathBuf>,
        cache: &mut HashCache,
    ) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Phase 1: drop cache entries whose files changed on disk.
        let stale_files = cache.invalidate_stale_files();
        // Canonicalize when possible; fall back to the raw path for files
        // that no longer exist or cannot be resolved.
        let normalize_path =
            |path: &Path| path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
        let changed_set: HashSet<PathBuf> =
            changed_files.iter().map(|p| normalize_path(p)).collect();

        if !stale_files.is_empty() {
            // Re-hash stale files that still exist and are NOT part of the
            // changed set (those get re-hashed below anyway).
            let stale_paths: Vec<PathBuf> = stale_files
                .into_iter()
                .filter_map(|path| {
                    let raw_path = PathBuf::from(&path);
                    let normalized = normalize_path(&raw_path);

                    if !normalized.exists() || changed_set.contains(&normalized) {
                        return None;
                    }

                    Some(raw_path)
                })
                .collect();

            if !stale_paths.is_empty() {
                let (stale_hashes, _, _) = self.analyze_files(&stale_paths)?;
                self.add_hashes_to_cache(&stale_hashes, cache);
            }
        }

        // Phase 2: tokenize the changed files.
        let (function_hashes, total_lines, skipped_files) = self.analyze_files(&changed_files)?;

        let mut duplicates = Vec::new();
        // Cache hits grouped by originating file; feeds Type-3 below.
        let mut cached_hits_by_file: HashMap<String, Vec<CodeLocation>> = HashMap::new();
        let mut cached_function_hashes: Vec<FunctionHash> = Vec::new();

        for func_hash in &function_hashes {
            let hashes = compute_rolling_hashes(&func_hash.tokens, self.min_block_size);

            for (hash, offset) in hashes {
                if let Some(cached_locations) = cache.lookup(hash) {
                    for cached_loc in cached_locations {
                        // Compare canonicalized paths so a relative changed
                        // path never "matches" its own cached entry.
                        let changed_file_path = Path::new(func_hash.file_path.as_ref())
                            .canonicalize()
                            .unwrap_or_else(|_| {
                                Path::new(func_hash.file_path.as_ref()).to_path_buf()
                            });
                        let cached_file_path = Path::new(&cached_loc.file_path)
                            .canonicalize()
                            .unwrap_or_else(|_| Path::new(&cached_loc.file_path).to_path_buf());

                        if changed_file_path == cached_file_path {
                            continue;
                        }

                        // Remember the hit even if similarity fails below:
                        // Type-3 may still find a near-miss in this file.
                        cached_hits_by_file
                            .entry(cached_loc.file_path.clone())
                            .or_default()
                            .push(cached_loc.clone());

                        let start_token_idx = offset;
                        let end_token_idx =
                            (offset + self.min_block_size).min(func_hash.tokens.len());

                        // Map token offsets back to line offsets, defending
                        // against out-of-range indices.
                        let start_line_offset =
                            if start_token_idx < func_hash.token_line_offsets.len() {
                                func_hash.token_line_offsets[start_token_idx]
                            } else {
                                0
                            };

                        let end_line_offset = if end_token_idx > 0
                            && end_token_idx - 1 < func_hash.token_line_offsets.len()
                        {
                            func_hash.token_line_offsets[end_token_idx - 1]
                        } else {
                            start_line_offset
                        };

                        let similarity = compute_token_similarity(
                            &func_hash.tokens[start_token_idx..end_token_idx],
                            &cached_loc.tokens,
                        );

                        if similarity >= self.similarity_threshold {
                            // Identical raw bodies -> Type-1, else Type-2.
                            let clone_type = if func_hash.raw_body == cached_loc.raw_source {
                                CloneType::Type1
                            } else {
                                CloneType::Type2
                            };

                            duplicates.push(DuplicateMatch {
                                file1: func_hash.file_path.to_string(),
                                file2: cached_loc.file_path.clone(),
                                start_line1: func_hash.start_line + start_line_offset,
                                start_line2: cached_loc.start_line,
                                end_line1: Some(func_hash.start_line + end_line_offset),
                                end_line2: Some(cached_loc.end_line),
                                length: self.min_block_size,
                                similarity,
                                hash,
                                clone_type,
                                edit_distance: None,
                                suppressed_by_directive: None,
                                token_offset1: Some(offset),
                                token_offset2: cached_loc.token_offset,
                                target_length: Some(cached_loc.token_length),
                                // IDs are assigned in one batch later.
                                duplicate_id: None,
                            });
                        }
                    }
                }
            }
        }

        // Phase 3: Type-3 detection against cached files that produced hits.
        if self.enable_type3 && !cached_hits_by_file.is_empty() {
            // Reconstruct each cached function at most once, keyed by
            // (file, inferred function start line).
            let mut seen_functions: HashSet<(String, usize)> = HashSet::new();

            for locations in cached_hits_by_file.values() {
                for loc in locations {
                    let token_offset = match loc.token_offset {
                        Some(offset) => offset,
                        None => continue,
                    };

                    // Changed files are already represented first-hand.
                    let normalized_path = normalize_path(Path::new(&loc.file_path));
                    if changed_set.contains(&normalized_path) {
                        continue;
                    }

                    // Re-tokenize the cached raw body to rebuild a full
                    // FunctionHash for the clone-pair comparison.
                    let (tokens, token_line_offsets) = normalize_with_line_numbers(&loc.raw_source);
                    if tokens.len() < self.min_block_size
                        || token_offset >= token_line_offsets.len()
                    {
                        continue;
                    }

                    // The cached location points at a window inside the
                    // function; back out the function's own start line.
                    let line_offset = token_line_offsets[token_offset];
                    let start_line = loc.start_line.saturating_sub(line_offset);
                    let key = (loc.file_path.clone(), start_line);

                    if !seen_functions.insert(key.clone()) {
                        continue;
                    }

                    let end_line =
                        start_line + token_line_offsets.last().copied().unwrap_or_default();

                    // Byte offsets are unknown for reconstructed functions;
                    // zero is a sentinel here.
                    cached_function_hashes.push(FunctionHash {
                        file_path: Arc::<str>::from(key.0),
                        function_name: None,
                        start_byte: 0,
                        end_byte: 0,
                        start_line,
                        end_line,
                        tokens,
                        token_line_offsets,
                        raw_body: loc.raw_source.clone(),
                    });
                }
            }

            if !cached_function_hashes.is_empty() {
                // Key identifying a (pair, offsets, length) already reported
                // by the Type-1/2 pass, to avoid double-reporting as Type-3.
                type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

                let mut seen_pairs: HashSet<SeenPairKey<'_>> = HashSet::new();

                for dup in &duplicates {
                    if let (Some(offset1), Some(offset2)) = (dup.token_offset1, dup.token_offset2) {
                        // Resolve each side back to its owning function.
                        if let (Some(func1), Some(func2)) = (
                            function_hashes.iter().find(|fh| {
                                fh.file_path.as_ref() == dup.file1.as_str()
                                    && fh.start_line <= dup.start_line1
                                    && dup.start_line1 <= fh.end_line
                            }),
                            cached_function_hashes.iter().find(|fh| {
                                fh.file_path.as_ref() == dup.file2.as_str()
                                    && fh.start_line <= dup.start_line2
                                    && dup.start_line2 <= fh.end_line
                            }),
                        ) {
                            seen_pairs.insert(canonical_pair_key(
                                func1, func2, offset1, offset2, dup.length,
                            ));
                        }
                    }
                }

                let mut type3_candidates = Vec::new();

                // All-pairs near-miss detection: changed x cached.
                for func1 in &function_hashes {
                    for func2 in &cached_function_hashes {
                        let type3_matches = detect_type3_clones(
                            &func1.tokens,
                            &func2.tokens,
                            self.min_block_size,
                            self.type3_tolerance,
                        );

                        for clone_match in type3_matches {
                            let pair_key = canonical_pair_key(
                                func1,
                                func2,
                                clone_match.source_start,
                                clone_match.target_start,
                                clone_match.length,
                            );

                            // Skip matches already reported as Type-1/2.
                            if seen_pairs.contains(&pair_key) {
                                continue;
                            }

                            type3_candidates.push((func1, func2, clone_match));
                        }
                    }
                }

                // Collapse overlapping candidates to the best per cluster.
                let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

                for (func1, func2, clone_match) in deduplicated {
                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) = self.compute_line_span(
                        func2,
                        clone_match.target_start,
                        clone_match.target_length,
                    );

                    // Never report a function as a clone of itself.
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    let window1 = &func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length];
                    let window2 = &func2.tokens[clone_match.target_start
                        ..clone_match.target_start + clone_match.target_length];

                    let edit_dist = hashing::compute_token_edit_distance(window1, window2);
                    let match_hash = Self::compute_match_hash(window1);

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type: CloneType::Type3,
                        edit_distance: Some(edit_dist),
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.target_length),
                        duplicate_id: None,
                    });
                }
            }
        }

        // Phase 4: duplicates among the changed files themselves.
        let (intra_duplicates, _) = self.find_duplicate_hashes(&function_hashes);
        duplicates.extend(intra_duplicates);

        // Deterministic ordering, then drop exact positional duplicates
        // (dedup_by requires adjacent equals, hence the sort first).
        duplicates.sort_by(|a, b| {
            (&a.file1, &a.file2, a.start_line1, a.start_line2).cmp(&(
                &b.file1,
                &b.file2,
                b.start_line1,
                b.start_line2,
            ))
        });
        duplicates.dedup_by(|a, b| {
            a.file1 == b.file1
                && a.file2 == b.file2
                && a.start_line1 == b.start_line1
                && a.start_line2 == b.start_line2
        });

        // Build a lookup table covering every file referenced by a
        // duplicate, so duplicate IDs / directives resolve both sides.
        let mut lookup_function_hashes = function_hashes.clone();
        if !cached_function_hashes.is_empty() {
            lookup_function_hashes.extend(cached_function_hashes.clone());
        }
        let hashed_files: HashSet<&str> = lookup_function_hashes
            .iter()
            .map(|fh| fh.file_path.as_ref())
            .collect();

        // Duplicates may reference cached files we never re-tokenized.
        let mut missing_files: HashSet<String> = HashSet::new();
        for dup in &duplicates {
            if !hashed_files.contains(dup.file1.as_str()) {
                missing_files.insert(dup.file1.clone());
            }
            if !hashed_files.contains(dup.file2.as_str()) {
                missing_files.insert(dup.file2.clone());
            }
        }

        if !missing_files.is_empty() {
            let missing_paths: Vec<PathBuf> = missing_files.iter().map(PathBuf::from).collect();
            let (mut extra_hashes, _, _) = self.analyze_files(&missing_paths)?;
            lookup_function_hashes.append(&mut extra_hashes);
        }

        // Stable IDs first, then suppression: ignore entries match on ID.
        self.compute_duplicate_ids(&lookup_function_hashes, &mut duplicates);
        let suppressed_by_ignore_file = self.filter_ignored_duplicates(&mut duplicates);

        // Inline suppression directives (e.g. "polydup-ignore" comments)
        // from every file involved in a surviving duplicate.
        let suppressed_by_directive = if self.enable_directives && !duplicates.is_empty() {
            let directive_paths: HashSet<PathBuf> = lookup_function_hashes
                .iter()
                .map(|fh| PathBuf::from(fh.file_path.as_ref()))
                .collect();
            let directives_map =
                self.collect_directives(&directive_paths.into_iter().collect::<Vec<_>>());

            if !directives_map.is_empty() {
                self.apply_directive_filtering(
                    &mut duplicates,
                    &directives_map,
                    &lookup_function_hashes,
                )
            } else {
                0
            }
        } else {
            0
        };

        // Phase 5: write the changed files' hashes back into the cache so
        // the next incremental scan sees current contents.
        self.add_hashes_to_cache(&function_hashes, cache);

        let stats = self.compute_stats(
            &function_hashes,
            total_lines,
            start_time,
            suppressed_by_ignore_file,
            suppressed_by_directive,
        );

        let files_scanned = changed_files.len().saturating_sub(skipped_files.len());

        Ok(Report {
            version: None,
            scan_time: None,
            config: None,
            files_scanned,
            functions_analyzed: function_hashes.len(),
            duplicates,
            skipped_files,
            stats,
        })
    }
1923}
1924
1925impl Default for Scanner {
1926 fn default() -> Self {
1927 Self::new() }
1929}
1930
1931pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
1939 let scanner = Scanner::new();
1940 let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
1941 scanner.scan(path_bufs)
1942}
1943
1944pub fn find_duplicates_with_config(
1946 paths: Vec<String>,
1947 min_block_size: usize,
1948 similarity_threshold: f64,
1949) -> Result<Report> {
1950 let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
1951 let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
1952 scanner.scan(path_bufs)
1953}
1954
1955#[cfg(test)]
1956mod tests {
1957 use super::*;
1958
    /// Builds a `FunctionHash` fixture whose tokens each sit on their own
    /// line (line offsets 0, 1, 2, ...), spanning
    /// `start_line..=start_line + tokens.len()`.
    fn make_test_function(
        file: &str,
        start_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line: start_line + tokens.len(),
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }
1979
    /// Builds a `FunctionHash` fixture where every token shares line
    /// offset 0 (all tokens on the function's first line), with an
    /// explicitly supplied end line.
    fn make_test_function_same_line(
        file: &str,
        start_line: usize,
        end_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = vec![0; tokens.len()];
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line,
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }
2001
2002 fn make_expr_tokens(keyword: &str, op: &str) -> Vec<Token> {
2004 vec![
2005 Token::Keyword(keyword.into()),
2006 Token::Identifier,
2007 Token::Operator(op.into()),
2008 Token::Identifier,
2009 Token::Punctuation(";".into()),
2010 ]
2011 }
2012
2013 #[test]
2014 fn test_scanner_creation() {
2015 let _scanner = Scanner::new(); }
2017
    /// `with_config` should accept valid settings and store them verbatim.
    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }
2026
    /// Type-3 tolerance must be a ratio; values outside [0, 1] are rejected.
    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }
2033
    /// When an exact (Type-1/2) pair and a near-miss (Type-3) pair happen to
    /// share token offsets, deduplication must keep BOTH matches.
    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        // Local fixture builder: one token per line.
        fn make_function(
            file: &str,
            start_line: usize,
            tokens: Vec<Token>,
            raw_body: &str,
        ) -> FunctionHash {
            let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
            FunctionHash {
                file_path: Arc::<str>::from(file),
                function_name: None,
                start_byte: 0,
                end_byte: 0,
                start_line,
                end_line: start_line + tokens.len(),
                tokens,
                token_line_offsets,
                raw_body: raw_body.to_string(),
            }
        }

        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        // Identical token streams -> Type-1/2 pair.
        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        // Near-identical streams (last token differs) -> Type-3 pair.
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let (duplicates, _) = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }
2110
    /// Even when every token of a function shares one physical line, Type-3
    /// matches must report the function's real start lines and the token
    /// offsets of the matched windows.
    #[test]
    fn test_type3_reports_token_offsets_in_start_lines() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        // Same structure, different keyword/operator -> near-miss clone.
        let functions = vec![
            make_test_function_same_line(
                "file_a.rs",
                100,
                105,
                make_expr_tokens("let", "+"),
                "let a = b + c;",
            ),
            make_test_function_same_line(
                "file_b.rs",
                200,
                205,
                make_expr_tokens("mut", "-"),
                "let a = b - c;",
            ),
        ];

        let (duplicates, _) = scanner.find_duplicate_hashes(&functions);

        let type3 = duplicates
            .iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .expect("expected a Type-3 duplicate match");

        assert_eq!(
            type3.start_line1, 100,
            "should report the actual source line even when tokens share a line"
        );
        assert_eq!(
            type3.start_line2, 200,
            "should report the actual target line even when tokens share a line"
        );
        assert_eq!(type3.token_offset1, Some(1));
        assert_eq!(type3.token_offset2, Some(1));
    }
2153
    /// Type-3 duplicate IDs must be order-independent: scanning (A, B) and
    /// (B, A) has to yield the same stable ID.
    #[test]
    fn type3_duplicate_ids_are_symmetric() {
        use tempfile::TempDir;

        // Near-miss pair: second body differs by operator and extra token.
        let tokens_a = make_expr_tokens("let", "+");
        let mut tokens_b = make_expr_tokens("let", "-");
        tokens_b.push(Token::Identifier);

        let func_a = make_test_function("file_a.rs", 10, tokens_a.clone(), "fn file_a.rs() {}");
        let func_b = make_test_function("file_b.rs", 20, tokens_b.clone(), "fn file_b.rs() {}");

        let temp_dir = TempDir::new().unwrap();
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap()
            .with_ignore_manager(IgnoreManager::new(temp_dir.path()));

        // Run the same pair in both orders.
        let (forward, _) = scanner.find_duplicate_hashes(&[func_a.clone(), func_b.clone()]);
        let (reverse, _) = scanner.find_duplicate_hashes(&[func_b, func_a]);

        let id_forward = forward
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        let id_reverse = reverse
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        assert_eq!(
            id_forward, id_reverse,
            "Type-3 IDs should not depend on function order"
        );
    }
2193
    /// Two FunctionHash entries for the same file and identical line span
    /// represent the same function; Type-3 must not report that as a clone.
    #[test]
    fn type3_does_not_report_self_matches() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let tokens = make_expr_tokens("let", "+");
        let func1 = make_test_function_same_line("same_file.rs", 28, 35, tokens.clone(), "fn a()");
        let func2 = make_test_function_same_line("same_file.rs", 28, 35, tokens, "fn a()");

        let (duplicates, _) = scanner.find_duplicate_hashes(&[func1, func2]);

        let self_matches: Vec<_> = duplicates
            .iter()
            .filter(|d| d.file1 == d.file2 && d.start_line1 == d.start_line2)
            .collect();

        assert!(
            self_matches.is_empty(),
            "Type-3 should never report self-matches (same file and line). Found: {:?}",
            self_matches
        );
    }
2223
2224 #[test]
2225 fn type3_still_detects_same_file_different_line_duplicates() {
2226 let scanner = Scanner::with_config(3, 0.85)
2228 .unwrap()
2229 .with_type3_detection(0.75)
2230 .unwrap();
2231
2232 let tokens1 = make_expr_tokens("let", "+");
2234 let mut tokens2 = make_expr_tokens("let", "-");
2235 tokens2.push(Token::Identifier); let func1 = make_test_function_same_line("same_file.rs", 10, 15, tokens1, "fn first()");
2238 let func2 = make_test_function_same_line("same_file.rs", 50, 55, tokens2, "fn second()");
2239
2240 let (duplicates, _) = scanner.find_duplicate_hashes(&[func1, func2]);
2241
2242 let same_file_different_line: Vec<_> = duplicates
2243 .iter()
2244 .filter(|d| d.file1 == d.file2 && d.start_line1 != d.start_line2)
2245 .collect();
2246
2247 assert!(
2248 !same_file_different_line.is_empty(),
2249 "Type-3 should still detect duplicates in the same file at different lines"
2250 );
2251 }
2252
    /// End lines in a DuplicateMatch must be derived from each function's own
    /// token_line_offsets, not assumed symmetric between the two sides.
    #[test]
    fn duplicate_matches_store_actual_end_lines() {
        let scanner = Scanner::with_config(2, 0.85).unwrap();

        let tokens = vec![
            Token::Keyword("fn".into()),
            Token::Identifier,
            Token::Identifier,
            Token::Punctuation("{".into()),
            Token::Punctuation("}".into()),
        ];

        // Same tokens, different line layouts.
        let func1 = FunctionHash {
            file_path: Arc::<str>::from("file_a.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 10,
            end_line: 14,
            tokens: tokens.clone(),
            token_line_offsets: vec![0, 0, 1, 1, 2],
            raw_body: "fn a() {}".to_string(),
        };

        let func2 = FunctionHash {
            file_path: Arc::<str>::from("file_b.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 20,
            end_line: 24,
            tokens,
            token_line_offsets: vec![0, 1, 1, 2, 2],
            raw_body: "fn b() {}".to_string(),
        };

        let (duplicates, _) = scanner.find_duplicate_hashes(&[func1, func2]);
        let dup = duplicates.first().expect("expected a duplicate match");

        assert_eq!(dup.start_line1, 10);
        assert_eq!(dup.start_line2, 20);
        assert_eq!(dup.end_line1, Some(12));
        assert_eq!(dup.end_line2, Some(22));
    }
2297
    /// Entries whose on-disk file changed after cache build must be
    /// invalidated before lookup, so no phantom duplicates are reported.
    #[test]
    fn scan_with_cache_prunes_stale_entries() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");
        let file_b = temp_dir.path().join("b.js");

        let shared_fn = r#"
function shared() {
    return 1 + 1;
}
"#;
        std::fs::write(&file_a, shared_fn).unwrap();
        std::fs::write(&file_b, shared_fn).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = scanner
            .build_cache(vec![file_a.clone(), file_b.clone()])
            .unwrap();

        // Sleep past filesystem mtime granularity so the rewrite is stale.
        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(&file_b, "const unrelated = 42;\n").unwrap();

        let report = scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.is_empty(),
            "stale cache entries should be invalidated before lookup"
        );
    }
2330
    /// After a file changes on disk, an incremental scan must mark it stale,
    /// rescan it, and write fresh hashes plus metadata back into the cache.
    #[test]
    fn scan_with_cache_repopulates_changed_entries() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");

        let original = r#"
function shared() {
    return 1 + 1;
}
"#;

        let updated = r#"
function shared() {
    return 7 + 8;
}
"#;

        std::fs::write(&file_a, original).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = scanner.build_cache(vec![file_a.clone()]).unwrap();

        // Sleep past filesystem mtime granularity before rewriting.
        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(&file_a, updated).unwrap();

        let file_a_str = file_a.to_string_lossy().to_string();
        assert!(
            cache.file_needs_rescan(&file_a_str),
            "modified files should be considered stale before cache lookup"
        );

        scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        let cached_entries: Vec<&CodeLocation> = cache
            .hash_index
            .values()
            .flat_map(|locs| locs.iter())
            .filter(|loc| loc.file_path == file_a_str)
            .collect();

        assert!(
            !cached_entries.is_empty(),
            "changed files should be added back into the cache after rescan"
        );
        assert!(
            cached_entries
                .iter()
                .any(|loc| loc.raw_source.contains("return 7 + 8;")),
            "cache should contain hashes for the refreshed file contents"
        );
        assert!(
            cache.file_metadata.contains_key(&file_a_str),
            "file metadata should be refreshed after rescanning changed files"
        );
    }
2388
    /// Files whose cache entries were invalidated (e.g. by an mtime bump)
    /// but whose content is unchanged must be re-hashed so they still match
    /// against the changed files.
    #[test]
    fn scan_with_cache_rehydrates_stale_unchanged_files() {
        let temp_dir = tempfile::tempdir().unwrap();
        let changed_file = temp_dir.path().join("changed.js");
        let unchanged_file = temp_dir.path().join("unchanged.js");

        let shared_fn = r#"
function shared() {
    return 1 + 1;
}
"#;

        std::fs::write(&changed_file, shared_fn).unwrap();
        std::fs::write(&unchanged_file, shared_fn).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        // Rewrite both files after the mtime granularity window: one with
        // new content, one with identical content (stale but unchanged).
        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(
            &changed_file,
            r#"
function shared() {
    return 1 + 1;
}
function another() {
    return 1 + 1;
}
"#,
        )
        .unwrap();
        std::fs::write(&unchanged_file, shared_fn).unwrap();

        let report = scanner
            .scan_with_cache(vec![changed_file.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.iter().any(|dup| {
                (dup.file1.ends_with("changed.js") && dup.file2.ends_with("unchanged.js"))
                    || (dup.file1.ends_with("unchanged.js") && dup.file2.ends_with("changed.js"))
            }),
            "invalidated entries should be rebuilt so unchanged files still match against diffs"
        );
    }
2437
    /// IDs produced by a cached scan must match the ignore file: ignoring
    /// every reported ID and rescanning should suppress all duplicates.
    #[test]
    fn scan_with_cache_respects_ignore_file() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");
        let file_b = temp_dir.path().join("b.js");

        let shared_fn = r#"
function shared() {
    return 1 + 1;
}
"#;
        std::fs::write(&file_a, shared_fn).unwrap();
        std::fs::write(&file_b, shared_fn).unwrap();

        let base_scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = base_scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        // First pass collects the duplicate IDs to ignore.
        let initial_report = base_scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();
        assert!(
            !initial_report.duplicates.is_empty(),
            "expected an initial duplicate to seed ignore entries"
        );
        let ignored_ids: Vec<String> = initial_report
            .duplicates
            .iter()
            .map(|d| {
                d.duplicate_id
                    .clone()
                    .expect("expected cache path to compute duplicate IDs")
            })
            .collect();

        let mut manager = IgnoreManager::new(temp_dir.path());
        for id in ignored_ids {
            manager.add_ignore(IgnoreEntry::new(
                id,
                vec![],
                "test ignore".to_string(),
                "tester".to_string(),
            ));
        }

        // Second pass with the ignore manager installed.
        let scanner = base_scanner.with_ignore_manager(manager);
        let report = scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.is_empty(),
            "duplicates present in .polydup-ignore should be filtered when using cache"
        );
    }
2494
    /// An ignore entry created from a FULL scan's duplicate ID must also
    /// suppress the same duplicate when found via an incremental (cached)
    /// scan — i.e. the two paths compute compatible, symmetric IDs.
    #[test]
    fn scan_with_cache_uses_symmetric_ids_for_existing_ignores() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");
        let file_b = temp_dir.path().join("b.js");

        let shared_fn = r#"
function shared() {
    return 1 + 1;
}
"#;
        std::fs::write(&file_a, shared_fn).unwrap();
        std::fs::write(&file_b, shared_fn).unwrap();

        let base_scanner = Scanner::with_config(7, 0.85).unwrap();
        let mut cache = base_scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        // Obtain the ID via a full (non-cached) scan.
        let baseline_report = base_scanner
            .scan(vec![temp_dir.path().to_path_buf()])
            .unwrap();
        let baseline_id = baseline_report
            .duplicates
            .first()
            .and_then(|dup| dup.duplicate_id.clone())
            .expect("expected duplicate IDs from full scans");
        let baseline_id_for_ignore = baseline_id.clone();

        let mut manager = IgnoreManager::new(temp_dir.path());
        manager.add_ignore(IgnoreEntry::new(
            baseline_id_for_ignore,
            vec![],
            "test ignore".to_string(),
            "tester".to_string(),
        ));

        let scanner = base_scanner.with_ignore_manager(manager);
        let report = scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.is_empty(),
            "cached scans should honor ignores generated from full scans"
        );
    }
2542
    /// Inline suppression directives that live in a CACHED (not rescanned)
    /// file must still suppress duplicates found via incremental scans.
    #[test]
    fn scan_with_cache_respects_directives_from_cached_files() {
        let temp_dir = tempfile::tempdir().unwrap();
        let changed_file = temp_dir.path().join("changed.js");
        let cached_file = temp_dir.path().join("cached.js");

        // The suppressing directive is only in the cached file.
        let suppressed_fn = r#"
// polydup-ignore: generated code
function shared() {
    return 1 + 1;
}
"#;

        let changed_fn = r#"
function shared() {
    return 1 + 1;
}
"#;

        std::fs::write(&cached_file, suppressed_fn).unwrap();
        std::fs::write(&changed_file, changed_fn).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap().with_directives(true);
        let mut cache = scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        let report = scanner
            .scan_with_cache(vec![changed_file.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.is_empty(),
            "duplicates suppressed by directives in cached files should stay suppressed when using cache"
        );
    }
2579
2580 #[test]
2581 fn scan_with_cache_runs_type3_detection_against_cached_files() {
2582 let temp_dir = tempfile::tempdir().unwrap();
2583 let changed_file = temp_dir.path().join("changed.js");
2584 let cached_file = temp_dir.path().join("cached.js");
2585
2586 let cached_fn = r#"
2587 function cached() {
2588 step1();
2589 step2();
2590 step3();
2591 step4();
2592 step5();
2593 }
2594 "#;
2595
2596 let changed_fn = r#"
2597 function cached() {
2598 step1();
2599 step2();
2600 insert_gap();
2601 step3();
2602 step4();
2603 step5();
2604 }
2605 "#;
2606
2607 std::fs::write(&cached_file, cached_fn).unwrap();
2608 std::fs::write(&changed_file, changed_fn).unwrap();
2609
2610 let scanner = Scanner::with_config(3, 0.8)
2611 .unwrap()
2612 .with_type3_detection(0.8)
2613 .unwrap();
2614 let mut cache = scanner
2615 .build_cache(vec![temp_dir.path().to_path_buf()])
2616 .unwrap();
2617
2618 let report = scanner
2619 .scan_with_cache(vec![changed_file.clone()], &mut cache)
2620 .unwrap();
2621
2622 assert!(
2623 report.duplicates.iter().any(|dup| {
2624 matches!(dup.clone_type, CloneType::Type3)
2625 && dup.file1.ends_with("changed.js")
2626 && dup.file2.ends_with("cached.js")
2627 }),
2628 "Type-3 should run for cached comparisons so near-miss clones surface in git-diff mode"
2629 );
2630 }
2631
2632 #[test]
2633 fn test_find_duplicates_empty() {
2634 let result = find_duplicates(vec![]);
2635 assert!(result.is_ok());
2636 let report = result.unwrap();
2637 assert_eq!(report.duplicates.len(), 0);
2638 }
2639
2640 #[test]
2641 fn test_is_supported_file() {
2642 let scanner = Scanner::new();
2643
2644 assert!(scanner.is_supported_file(Path::new("test.rs")));
2645 assert!(scanner.is_supported_file(Path::new("test.py")));
2646 assert!(scanner.is_supported_file(Path::new("test.js")));
2647 assert!(scanner.is_supported_file(Path::new("test.ts")));
2648 assert!(!scanner.is_supported_file(Path::new("test.txt")));
2649 assert!(!scanner.is_supported_file(Path::new("test.md")));
2650 }
2651
2652 #[test]
2653 fn test_detect_language() {
2654 let scanner = Scanner::new();
2655
2656 assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
2657 assert!(scanner.detect_language(Path::new("test.py")).is_ok());
2658 assert!(scanner.detect_language(Path::new("test.js")).is_ok());
2659 assert!(scanner.detect_language(Path::new("test.txt")).is_err());
2660 }
2661}