Skip to main content

dupe_core/
lib.rs

1//! PolyDup Core - Cross-language duplicate code detection engine
2//!
3//! This library provides the core functionality for detecting duplicate code
4//! across Node.js, Python, and Rust codebases using Tree-sitter parsing,
5//! Rabin-Karp/MinHash algorithms, and parallel processing.
6
7mod cache;
8mod directives;
9mod error;
10mod hashing;
11mod ignore_rules;
12mod parsing;
13mod queries;
14
15#[cfg(test)]
16mod proptest_fuzzing;
17
18#[cfg(test)]
19mod snapshot_tests;
20
21// Re-export public types
22pub use cache::{CacheStats, CodeLocation, FileCacheMetadata, HashCache};
23pub use directives::{detect_directives, detect_directives_in_file, Directive, FileDirectives};
24pub use error::{PolyDupError, Result};
25pub use hashing::{
26    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
27    compute_window_hash, detect_duplicates_with_extension, detect_type3_clones, extend_match,
28    normalize, normalize_with_line_numbers, verify_cross_window_match, CloneMatch, RollingHash,
29    Token,
30};
31pub use ignore_rules::{
32    compute_duplicate_id, compute_symmetric_duplicate_id, FileRange, IgnoreEntry, IgnoreManager,
33};
34pub use parsing::{
35    extract_functions, extract_javascript_functions, extract_python_functions,
36    extract_rust_functions, FunctionNode,
37};
38
39use anyhow::Context;
40use globset::GlobSet;
41use ignore::WalkBuilder;
42use once_cell::sync::OnceCell;
43use rayon::prelude::*;
44use serde::{Deserialize, Serialize};
45use std::collections::{HashMap, HashSet};
46use std::fs;
47use std::path::{Path, PathBuf};
48use std::sync::Arc;
49use tree_sitter::Language;
50
/// Information about a supported programming language
///
/// Returned by [`get_supported_languages`] so front-ends can display the
/// support matrix (extensions, parser, Type-3 capability, status).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LanguageInfo {
    /// Display name of the language (e.g. "Rust", "Python")
    pub name: &'static str,
    /// File extensions supported (without leading dot)
    pub extensions: &'static [&'static str],
    /// Tree-sitter parser name (e.g. "tree-sitter-rust")
    pub parser: &'static str,
    /// Whether Type-3 (near-miss) clone detection is supported
    pub type3_support: bool,
    /// Current support status (full, partial, or planned)
    pub status: LanguageStatus,
}
65
/// Support status for a language
///
/// Used by [`LanguageInfo::status`] to distinguish dedicated parsers from
/// borrowed ones and from languages that are only on the roadmap.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LanguageStatus {
    /// Full support with dedicated Tree-sitter parser
    Full,
    /// Partial support (e.g., uses another language's parser)
    Partial,
    /// Planned but not yet implemented
    Planned,
}
76
77/// Returns information about all supported programming languages
78pub fn get_supported_languages() -> Vec<LanguageInfo> {
79    vec![
80        LanguageInfo {
81            name: "Rust",
82            extensions: &["rs"],
83            parser: "tree-sitter-rust",
84            type3_support: true,
85            status: LanguageStatus::Full,
86        },
87        LanguageInfo {
88            name: "Python",
89            extensions: &["py", "pyi"],
90            parser: "tree-sitter-python",
91            type3_support: true,
92            status: LanguageStatus::Full,
93        },
94        LanguageInfo {
95            name: "JavaScript",
96            extensions: &["js", "mjs", "cjs"],
97            parser: "tree-sitter-javascript",
98            type3_support: true,
99            status: LanguageStatus::Full,
100        },
101        LanguageInfo {
102            name: "TypeScript",
103            extensions: &["ts", "mts", "cts"],
104            parser: "tree-sitter-javascript",
105            type3_support: true,
106            status: LanguageStatus::Full,
107        },
108        LanguageInfo {
109            name: "JSX",
110            extensions: &["jsx"],
111            parser: "tree-sitter-javascript",
112            type3_support: true,
113            status: LanguageStatus::Full,
114        },
115        LanguageInfo {
116            name: "TSX",
117            extensions: &["tsx"],
118            parser: "tree-sitter-javascript",
119            type3_support: true,
120            status: LanguageStatus::Full,
121        },
122        LanguageInfo {
123            name: "Vue",
124            extensions: &["vue"],
125            parser: "tree-sitter-javascript",
126            type3_support: true,
127            status: LanguageStatus::Partial,
128        },
129        LanguageInfo {
130            name: "Svelte",
131            extensions: &["svelte"],
132            parser: "tree-sitter-javascript",
133            type3_support: true,
134            status: LanguageStatus::Partial,
135        },
136        LanguageInfo {
137            name: "Go",
138            extensions: &["go"],
139            parser: "tree-sitter-go",
140            type3_support: true,
141            status: LanguageStatus::Planned,
142        },
143        LanguageInfo {
144            name: "Java",
145            extensions: &["java"],
146            parser: "tree-sitter-java",
147            type3_support: true,
148            status: LanguageStatus::Planned,
149        },
150        LanguageInfo {
151            name: "C/C++",
152            extensions: &["c", "cc", "cpp", "cxx", "h", "hpp"],
153            parser: "tree-sitter-cpp",
154            type3_support: true,
155            status: LanguageStatus::Planned,
156        },
157    ]
158}
159
/// Clone type classification
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Type-1: Exact copies (only whitespace/comments differ)
    #[serde(rename = "type-1")]
    Type1,
    /// Type-2: Structurally identical but renamed identifiers/literals
    #[serde(rename = "type-2")]
    Type2,
    /// Type-3: Near-miss clones with modifications (gap-tolerant detection,
    /// opt-in via `Scanner::with_type3_detection`)
    #[serde(rename = "type-3")]
    Type3,
}
173
/// Helper function to check if two half-open ranges `[start, end)` overlap
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    // Phrased as "neither range ends before the other begins"; equivalent to
    // `start1 < end2 && start2 < end1` by De Morgan.
    !(end1 <= start2 || end2 <= start1)
}
178
179// Stable key for deduplicating matches within the same file pair.
180fn canonical_pair_key<'a>(
181    func1: &'a FunctionHash,
182    func2: &'a FunctionHash,
183    source_start: usize,
184    target_start: usize,
185    length: usize,
186) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
187    if func1.file_path.as_ref() < func2.file_path.as_ref() {
188        (
189            func1.file_path.as_ref(),
190            func2.file_path.as_ref(),
191            func1.start_line,
192            func2.start_line,
193            source_start,
194            target_start,
195            length,
196        )
197    } else {
198        (
199            func2.file_path.as_ref(),
200            func1.file_path.as_ref(),
201            func2.start_line,
202            func1.start_line,
203            target_start,
204            source_start,
205            length,
206        )
207    }
208}
209
/// Represents a detected duplicate code fragment
///
/// Serialized into reports; fields marked `#[serde(skip)]` are internal
/// bookkeeping used only during detection and ignore-ID computation.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    /// Path of the first file in the pair
    pub file1: String,
    /// Path of the second file in the pair
    pub file2: String,
    /// Line where the match starts in `file1`
    pub start_line1: usize,
    /// Line where the match starts in `file2`
    pub start_line2: usize,
    /// Line where the match ends in `file1`, when known
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line1: Option<usize>,
    /// Line where the match ends in `file2`, when known
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line2: Option<usize>,
    /// Match length in tokens (the first window; Type-3's second window may
    /// differ — see `target_length`)
    pub length: usize,
    /// Similarity score — presumably 0.0..=1.0; confirm against the detector
    pub similarity: f64,
    /// Hash of the matched token window
    pub hash: u64,
    /// Classification of the clone (Type-1/2/3)
    pub clone_type: CloneType,
    /// Edit distance (Type-3 only). None for Type-1/2
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
    /// Indicates if this duplicate is suppressed by an inline directive
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suppressed_by_directive: Option<bool>,
    /// Token offset within function for file1 (used for ignore ID computation)
    #[serde(skip)]
    token_offset1: Option<usize>,
    /// Token offset within function for file2 (used for ignore ID computation)
    #[serde(skip)]
    token_offset2: Option<usize>,
    /// Token length of the second window (Type-3 may differ from `length`)
    #[serde(skip)]
    target_length: Option<usize>,
    /// Content-based ID for this duplicate (SHA256 of normalized tokens)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duplicate_id: Option<String>,
}
244
/// Represents a function with its tokens for duplicate detection
///
/// Internal per-function record produced by `process_file_content`; one is
/// emitted for every extracted function whose normalized token count meets
/// the scanner's minimum block size.
#[derive(Debug, Clone)]
struct FunctionHash {
    // Shared ownership so every function in a file reuses one allocation;
    // cloning is a refcount bump.
    file_path: Arc<str>,
    #[allow(dead_code)] // Kept for potential future reporting improvements
    function_name: Option<String>,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    start_byte: usize,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    end_byte: usize,
    // Line where the function starts; token line offsets below are relative
    // to this.
    start_line: usize,
    #[allow(dead_code)] // Kept for future detailed reporting
    end_line: usize,
    tokens: Vec<Token>, // Normalized token sequence
    /// Zero-based line offset for each token relative to start_line
    token_line_offsets: Vec<usize>,
    raw_body: String, // Original (unnormalized) function body for Type-1 detection
}
263
/// Baseline snapshot for comparing duplicate detection across runs
///
/// Persisted as JSON via [`Baseline::save_to_file`] /
/// [`Baseline::load_from_file`]; used to report only duplicates introduced
/// since the snapshot was taken.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    /// Version of the baseline format (set from the crate version)
    pub version: String,
    /// Timestamp when baseline was created (RFC 3339)
    pub created_at: String,
    /// Duplicates that existed at baseline time
    pub duplicates: Vec<DuplicateMatch>,
}
274
275impl Baseline {
276    /// Create a new baseline from scan results
277    pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
278        Self {
279            version: env!("CARGO_PKG_VERSION").to_string(),
280            created_at: chrono::Utc::now().to_rfc3339(),
281            duplicates,
282        }
283    }
284
285    /// Save baseline to a JSON file
286    pub fn save_to_file(&self, path: &Path) -> Result<()> {
287        let json =
288            serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
289        fs::write(path, json).context("Failed to write baseline file")?;
290        Ok(())
291    }
292
293    /// Load baseline from a JSON file
294    pub fn load_from_file(path: &Path) -> Result<Self> {
295        let content = fs::read_to_string(path)
296            .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
297        let baseline: Baseline =
298            serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
299        Ok(baseline)
300    }
301
302    /// Compare current duplicates against baseline and return only new ones
303    pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
304        let baseline_set: std::collections::HashSet<_> =
305            self.duplicates.iter().map(duplicate_key).collect();
306
307        current
308            .iter()
309            .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
310            .cloned()
311            .collect()
312    }
313}
314
315/// Create a unique key for a duplicate match for comparison
316fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
317    // Normalize file order for consistent comparison
318    let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
319        (
320            dup.file1.clone(),
321            dup.file2.clone(),
322            dup.start_line1,
323            dup.start_line2,
324        )
325    } else {
326        (
327            dup.file2.clone(),
328            dup.file1.clone(),
329            dup.start_line2,
330            dup.start_line1,
331        )
332    };
333    (file1, file2, line1, line2, dup.length)
334}
335
/// A file that was skipped during scanning
///
/// Skips are non-fatal: the scan continues and each skipped file is
/// surfaced in `Report::skipped_files` with a human-readable reason.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SkippedFile {
    /// Path to the skipped file
    pub path: String,
    /// Reason the file was skipped (e.g. "Permission denied", parse errors)
    pub reason: String,
}
344
/// Report containing scan results
///
/// Produced by [`Scanner::scan`]. The `version`, `scan_time`, and `config`
/// fields are left as `None` by the core library and filled in by the CLI.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// PolyDup version
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    /// Scan start time (ISO 8601)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    /// Configuration used for the scan
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    /// Total number of files scanned (collected files minus skipped ones)
    pub files_scanned: usize,
    /// Total number of functions analyzed
    pub functions_analyzed: usize,
    /// Detected duplicate matches
    pub duplicates: Vec<DuplicateMatch>,
    /// Files that were skipped during scanning (parse errors, permission issues, etc.)
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub skipped_files: Vec<SkippedFile>,
    /// Scan statistics
    pub stats: ScanStats,
}
369
/// Configuration used for scanning
///
/// Embedded in [`Report`] so consumers can see which settings produced
/// the results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    /// Minimum block size in tokens
    pub threshold: usize,
    /// Similarity threshold (0.0 - 1.0)
    pub similarity: f64,
    /// Type-3 detection enabled
    pub type3_enabled: bool,
    /// Paths scanned
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}
383
/// Helper for serde skip_serializing_if
///
/// Serde calls this with a reference to the counter field; returning
/// `true` omits the field from serialized output.
fn is_zero(n: &usize) -> bool {
    matches!(*n, 0)
}
388
/// Statistics from the scanning process
///
/// Zero-valued suppression counters are omitted from serialized output
/// via the `is_zero` helper.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines of code scanned
    pub total_lines: usize,
    /// Total tokens processed
    pub total_tokens: usize,
    /// Number of unique hashes computed
    pub unique_hashes: usize,
    /// Scan duration in milliseconds
    pub duration_ms: u64,
    /// Number of duplicates suppressed by .polydup-ignore file
    #[serde(default, skip_serializing_if = "is_zero")]
    pub suppressed_by_ignore_file: usize,
    /// Number of duplicates suppressed by inline directives
    #[serde(default, skip_serializing_if = "is_zero")]
    pub suppressed_by_directive: usize,
}
407
/// Main scanner for detecting duplicates
///
/// Construct with [`Scanner::new`] or [`Scanner::with_config`], refine via
/// the builder-style `with_*` methods, then call [`Scanner::scan`].
#[allow(dead_code)] // similarity_threshold reserved for future use
pub struct Scanner {
    /// Minimum code block size to consider (in tokens)
    min_block_size: usize,
    /// Similarity threshold (0.0 - 1.0)
    similarity_threshold: f64,
    /// Glob patterns to exclude from scanning
    exclude_patterns: Vec<String>,
    /// Cached compiled GlobSet for efficient exclude pattern matching
    /// (built lazily on first `is_excluded` call)
    exclude_glob_set: OnceCell<GlobSet>,
    /// Enable Type-3 (gap-tolerant) clone detection
    enable_type3: bool,
    /// Type-3 similarity tolerance (0.0 - 1.0)
    type3_tolerance: f64,
    /// Ignore manager for filtering false positives
    ignore_manager: Option<IgnoreManager>,
    /// Enable inline directive detection
    enable_directives: bool,
    /// Include test files in scanning (*.test.*, *.spec.*, etc.)
    include_tests: bool,
}
430
/// Default exclude patterns for test files
///
/// These cover common test-file naming conventions; they are removed again
/// when the caller opts in via `with_test_files(true)`. Build artifacts are
/// handled separately by `build_artifact_patterns`.
fn default_exclude_patterns() -> Vec<String> {
    const TEST_GLOBS: [&str; 10] = [
        "**/*.test.ts",
        "**/*.test.js",
        "**/*.test.tsx",
        "**/*.test.jsx",
        "**/*.spec.ts",
        "**/*.spec.js",
        "**/*.spec.tsx",
        "**/*.spec.jsx",
        "**/__tests__/**",
        "**/*.test.py",
    ];

    TEST_GLOBS.iter().map(|glob| glob.to_string()).collect()
}
447
/// Exclude patterns for build artifacts (always excluded)
fn build_artifact_patterns() -> Vec<String> {
    const ARTIFACT_GLOBS: [&str; 5] = [
        "**/node_modules/**",
        "**/target/**",
        "**/dist/**",
        "**/build/**",
        "**/.git/**",
    ];

    ARTIFACT_GLOBS.iter().map(|glob| glob.to_string()).collect()
}
458
459impl Scanner {
460    /// Creates a new Scanner with default settings
461    ///
462    /// This is now infallible as there are no I/O or allocation failures.
463    pub fn new() -> Self {
464        let mut exclude = build_artifact_patterns();
465        exclude.extend(default_exclude_patterns());
466
467        Self {
468            min_block_size: 50,
469            similarity_threshold: 0.85,
470            exclude_patterns: exclude,
471            exclude_glob_set: OnceCell::new(),
472            enable_type3: false,
473            type3_tolerance: 0.85,
474            ignore_manager: None,
475            enable_directives: false,
476            include_tests: false,
477        }
478    }
479
480    /// Creates a new Scanner with custom settings
481    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
482        let mut exclude = build_artifact_patterns();
483        exclude.extend(default_exclude_patterns());
484
485        Ok(Self {
486            min_block_size,
487            similarity_threshold,
488            exclude_patterns: exclude,
489            exclude_glob_set: OnceCell::new(),
490            enable_type3: false,
491            type3_tolerance: 0.85,
492            ignore_manager: None,
493            enable_directives: false,
494            include_tests: false,
495        })
496    }
497
498    /// Sets custom exclude patterns, replacing the defaults
499    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
500        self.exclude_patterns = patterns;
501        self
502    }
503
504    /// Enables test file scanning (removes test file patterns from exclusions)
505    pub fn with_test_files(mut self, include: bool) -> Self {
506        self.include_tests = include;
507        if include {
508            // Remove test file patterns from exclusions
509            let test_patterns = default_exclude_patterns();
510            self.exclude_patterns.retain(|p| !test_patterns.contains(p));
511        }
512        self
513    }
514
515    /// Enables Type-3 clone detection with the specified tolerance
516    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
517        if !(0.0..=1.0).contains(&tolerance) {
518            return Err(PolyDupError::Config(
519                "Type-3 tolerance must be between 0.0 and 1.0".to_string(),
520            ));
521        }
522        self.enable_type3 = true;
523        self.type3_tolerance = tolerance;
524        Ok(self)
525    }
526
527    /// Sets the ignore manager for filtering false positives
528    pub fn with_ignore_manager(mut self, manager: IgnoreManager) -> Self {
529        self.ignore_manager = Some(manager);
530        self
531    }
532
533    /// Enables inline directive detection (// polydup-ignore comments)
534    pub fn with_directives(mut self, enabled: bool) -> Self {
535        self.enable_directives = enabled;
536        self
537    }
538
    /// Collect source files that would be scanned (for dry-run mode)
    ///
    /// Returns a list of file paths that match the scanner's configuration
    /// (supported languages, not excluded, respecting .gitignore, etc.)
    ///
    /// Thin public wrapper over the private walker so callers can preview
    /// the file set without running detection.
    pub fn collect_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        self.collect_source_files(paths)
    }
546
    /// Scans the given paths and returns a Report with detected duplicates
    ///
    /// Uses Rayon for parallel file processing:
    /// 1. Read and parse files
    /// 2. Extract functions
    /// 3. Normalize and hash function bodies
    /// 4. Compare hashes to find duplicates
    /// 5. Apply directive-based filtering if enabled
    ///
    /// # Errors
    ///
    /// Propagates failures from file collection and analysis; individually
    /// unreadable files do not abort the scan (they are reported in
    /// `Report::skipped_files` instead).
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Collect all source files (respects .gitignore and exclude globs)
        let source_files = self.collect_source_files(paths)?;

        // Detect inline suppression directives if enabled (empty map otherwise)
        let directives_map = self.collect_directives(&source_files);

        // Process files in parallel to extract functions and compute hashes
        let (function_hashes, total_lines, skipped_files) = self.analyze_files(&source_files)?;

        // Find duplicates by comparing hashes; also returns how many matches
        // the .polydup-ignore file suppressed
        let (mut duplicates, suppressed_by_ignore_file) =
            self.find_duplicate_hashes(&function_hashes);

        // Apply directive-based filtering (removes suppressed matches in place)
        let suppressed_by_directive = if self.enable_directives && !directives_map.is_empty() {
            self.apply_directive_filtering(&mut duplicates, &directives_map, &function_hashes)
        } else {
            0
        };

        // Calculate statistics
        let stats = self.compute_stats(
            &function_hashes,
            total_lines,
            start_time,
            suppressed_by_ignore_file,
            suppressed_by_directive,
        );

        // files_scanned is the count of successfully scanned files (total - skipped)
        let files_scanned = source_files.len().saturating_sub(skipped_files.len());

        Ok(Report {
            version: None,   // Will be set by CLI
            scan_time: None, // Will be set by CLI
            config: None,    // Will be set by CLI
            files_scanned,
            functions_analyzed: function_hashes.len(),
            duplicates,
            skipped_files,
            stats,
        })
    }
602
603    /// Parallel collection of directives from source files
604    fn collect_directives(
605        &self,
606        source_files: &[PathBuf],
607    ) -> HashMap<PathBuf, crate::directives::FileDirectives> {
608        if self.enable_directives {
609            source_files
610                .par_iter()
611                .filter_map(|path| {
612                    crate::directives::detect_directives_in_file(path)
613                        .ok()
614                        .map(|d| (path.clone(), d))
615                })
616                .collect()
617        } else {
618            HashMap::new()
619        }
620    }
621
622    /// Analyze files in parallel to extract functions and metadata
623    /// Returns (function_hashes, total_lines, skipped_files)
624    #[allow(clippy::type_complexity)]
625    fn analyze_files(
626        &self,
627        source_files: &[PathBuf],
628    ) -> Result<(Vec<FunctionHash>, usize, Vec<SkippedFile>)> {
629        // Collect function hashes, line counts, and track errors
630        let results: Vec<(PathBuf, Result<(Vec<FunctionHash>, usize)>)> = source_files
631            .par_iter()
632            .map(|path| {
633                let result = (|| {
634                    // Count lines first
635                    let content = std::fs::read_to_string(path).map_err(PolyDupError::Io)?;
636                    let line_count = content.lines().count();
637
638                    // Process file for functions
639                    let hashes = self.process_file_content(path, &content)?;
640                    Ok((hashes, line_count))
641                })();
642                (path.clone(), result)
643            })
644            .collect();
645
646        // Aggregate results
647        let mut all_hashes = Vec::new();
648        let mut total_lines = 0;
649        let mut skipped_files = Vec::new();
650
651        for (path, res) in results {
652            match res {
653                Ok((hashes, lines)) => {
654                    all_hashes.extend(hashes);
655                    total_lines += lines;
656                }
657                Err(e) => {
658                    // Track skipped files with their error reason
659                    let reason = match &e {
660                        PolyDupError::Io(io_err) => {
661                            if io_err.kind() == std::io::ErrorKind::PermissionDenied {
662                                "Permission denied".to_string()
663                            } else {
664                                format!("IO error: {}", io_err)
665                            }
666                        }
667                        PolyDupError::Parsing(msg) => format!("Parse error: {}", msg),
668                        PolyDupError::Config(msg) => format!("Config error: {}", msg),
669                        PolyDupError::LanguageNotSupported(lang) => {
670                            format!("Language not supported: {}", lang)
671                        }
672                        PolyDupError::LanguageDetection(_) => {
673                            "Could not detect language".to_string()
674                        }
675                        PolyDupError::ParallelExecution(msg) => {
676                            format!("Parallel execution error: {}", msg)
677                        }
678                        PolyDupError::IgnoreRule(msg) => format!("Ignore rule error: {}", msg),
679                        PolyDupError::Other(e) => format!("Error: {}", e),
680                    };
681                    skipped_files.push(SkippedFile {
682                        path: path.display().to_string(),
683                        reason,
684                    });
685                }
686            }
687        }
688
689        Ok((all_hashes, total_lines, skipped_files))
690    }
691
692    /// Filter duplicates based on directives
693    /// Returns the count of duplicates that were suppressed
694    fn apply_directive_filtering(
695        &self,
696        duplicates: &mut Vec<DuplicateMatch>,
697        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
698        function_hashes: &[FunctionHash],
699    ) -> usize {
700        let original_count = duplicates.len();
701        for dup in duplicates.iter_mut() {
702            let suppressed = self.is_suppressed_by_directive(dup, directives_map, function_hashes);
703            if suppressed {
704                dup.suppressed_by_directive = Some(true);
705            }
706        }
707
708        // Filter out suppressed duplicates (they shouldn't appear in reports or fail CI)
709        duplicates.retain(|dup| dup.suppressed_by_directive != Some(true));
710        original_count - duplicates.len()
711    }
712
713    /// Compute scan statistics
714    fn compute_stats(
715        &self,
716        function_hashes: &[FunctionHash],
717        total_lines: usize,
718        start_time: std::time::Instant,
719        suppressed_by_ignore_file: usize,
720        suppressed_by_directive: usize,
721    ) -> ScanStats {
722        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();
723
724        let unique_hashes: usize = {
725            let mut hash_set = std::collections::HashSet::new();
726            for fh in function_hashes {
727                // Compute rolling hashes just for statistics
728                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
729                for (hash, _) in hashes {
730                    hash_set.insert(hash);
731                }
732            }
733            hash_set.len()
734        };
735
736        let duration_ms = start_time.elapsed().as_millis() as u64;
737
738        ScanStats {
739            total_lines,
740            total_tokens,
741            unique_hashes,
742            duration_ms,
743            suppressed_by_ignore_file,
744            suppressed_by_directive,
745        }
746    }
747
748    /// Collects all source files from the given paths
749    ///
750    /// Uses the `ignore` crate to respect .gitignore, .ignore files,
751    /// and common ignore patterns (node_modules, target, etc.)
752    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
753        let mut files = Vec::new();
754
755        for path in paths {
756            if path.is_file() {
757                if self.is_supported_file(&path) && !self.is_excluded(&path) {
758                    files.push(path);
759                }
760            } else if path.is_dir() {
761                // Use ignore crate's WalkBuilder to respect .gitignore
762                let walker = WalkBuilder::new(&path)
763                    .git_ignore(true) // Respect .gitignore
764                    .git_global(true) // Respect global gitignore
765                    .git_exclude(true) // Respect .git/info/exclude
766                    .ignore(true) // Respect .ignore files
767                    .hidden(false) // Don't skip hidden files (e.g., .config/)
768                    .parents(true) // Respect parent .gitignore files
769                    .build();
770
771                for entry in walker {
772                    match entry {
773                        Ok(entry) => {
774                            let path = entry.path();
775                            if path.is_file()
776                                && self.is_supported_file(path)
777                                && !self.is_excluded(path)
778                            {
779                                files.push(path.to_path_buf());
780                            }
781                        }
782                        Err(err) => {
783                            // Log but don't fail on individual entry errors
784                            eprintln!("Warning: Failed to access path: {}", err);
785                        }
786                    }
787                }
788            }
789        }
790
791        Ok(files)
792    }
793
794    /// Checks if a file is a supported source file
795    fn is_supported_file(&self, path: &Path) -> bool {
796        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
797            matches!(
798                ext,
799                "rs" | "py"
800                    | "pyi"
801                    | "js"
802                    | "mjs"
803                    | "cjs"
804                    | "ts"
805                    | "mts"
806                    | "cts"
807                    | "jsx"
808                    | "tsx"
809                    | "vue"
810                    | "svelte"
811            )
812        } else {
813            false
814        }
815    }
816
817    /// Checks if a file matches any exclude patterns
818    fn is_excluded(&self, path: &Path) -> bool {
819        // Get or build the cached GlobSet
820        let glob_set = self.exclude_glob_set.get_or_init(|| {
821            use globset::{Glob, GlobSetBuilder};
822
823            let mut builder = GlobSetBuilder::new();
824            for pattern in &self.exclude_patterns {
825                if let Ok(glob) = Glob::new(pattern) {
826                    builder.add(glob);
827                }
828            }
829
830            builder.build().unwrap_or_else(|_| GlobSet::empty())
831        });
832
833        glob_set.is_match(path)
834    }
835
836    /// Processes a single file content and returns function hashes
837    fn process_file_content(&self, path: &Path, code: &str) -> Result<Vec<FunctionHash>> {
838        let lang = self.detect_language(path)?;
839        let functions = extract_functions(code, lang)?;
840
841        // Use Arc<str> for efficient sharing across all functions in this file
842        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
843        let mut function_hashes = Vec::new();
844
845        for func in functions {
846            // Store both raw body (for Type-1) and normalized tokens (for Type-2)
847            let raw_body = func.body.clone();
848            let (tokens, token_line_offsets) = normalize_with_line_numbers(&func.body);
849
850            // Skip if too small
851            if tokens.len() < self.min_block_size {
852                continue;
853            }
854
855            // Store the full token sequence for extension-based detection
856            function_hashes.push(FunctionHash {
857                file_path: Arc::clone(&file_path), // Cheap pointer clone
858                function_name: func.name.clone(),
859                start_byte: func.start_byte,
860                end_byte: func.end_byte,
861                start_line: func.start_line,
862                end_line: func.end_line,
863                tokens,
864                token_line_offsets,
865                raw_body,
866            });
867        }
868
869        Ok(function_hashes)
870    }
871
872    /// Detects the Tree-sitter Language from file extension
873    fn detect_language(&self, path: &Path) -> Result<Language> {
874        let ext = path
875            .extension()
876            .and_then(|e| e.to_str())
877            .ok_or_else(|| PolyDupError::LanguageDetection(path.to_path_buf()))?;
878
879        match ext {
880            "rs" => Ok(tree_sitter_rust::language()),
881            "py" | "pyi" => Ok(tree_sitter_python::language()),
882            "js" | "mjs" | "cjs" | "jsx" | "ts" | "mts" | "cts" | "tsx" | "vue" | "svelte" => {
883                Ok(tree_sitter_javascript::language())
884            }
885            _ => Err(PolyDupError::LanguageNotSupported(ext.to_string())),
886        }
887    }
888
889    /// Computes the inclusive line span for a token window within a function
890    fn compute_line_span(
891        &self,
892        func: &FunctionHash,
893        start_offset: usize,
894        length: usize,
895    ) -> (usize, usize) {
896        let start_line = func
897            .token_line_offsets
898            .get(start_offset)
899            .map(|offset| func.start_line + offset)
900            .unwrap_or(func.start_line + start_offset);
901
902        let end_index = start_offset + length.saturating_sub(1);
903        let end_line = func
904            .token_line_offsets
905            .get(end_index)
906            .map(|offset| func.start_line + offset)
907            .unwrap_or(func.start_line + end_index);
908
909        (start_line, end_line)
910    }
911
912    /// Finds duplicate code using greedy extension algorithm
913    ///
914    /// Orchestrates the detection pipeline:
915    /// 1. Type-1/2 detection (exact and renamed clones)
916    /// 2. Type-3 detection (near-miss clones with gaps)
917    /// 3. Duplicate ID computation
918    /// 4. Ignore filtering
919    ///
920    /// Returns (duplicates, suppressed_by_ignore_file_count)
921    fn find_duplicate_hashes(
922        &self,
923        function_hashes: &[FunctionHash],
924    ) -> (Vec<DuplicateMatch>, usize) {
925        // Type alias for pair deduplication keys
926        type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);
927
928        // Shared state for deduplication across Type-1/2 and Type-3
929        let mut seen_pairs: std::collections::HashSet<SeenPairKey<'_>> =
930            std::collections::HashSet::new();
931
932        // Phase 1: Type-1/2 detection
933        let mut duplicates = self.find_type12_duplicates(function_hashes, &mut seen_pairs);
934
935        // Phase 2: Type-3 detection (if enabled)
936        if self.enable_type3 {
937            self.find_type3_duplicates(function_hashes, &seen_pairs, &mut duplicates);
938        }
939
940        // Phase 3: Compute IDs for all duplicates
941        self.compute_duplicate_ids(function_hashes, &mut duplicates);
942
943        // Phase 4: Filter out ignored duplicates
944        let suppressed_count = self.filter_ignored_duplicates(&mut duplicates);
945
946        (duplicates, suppressed_count)
947    }
948
    /// Detects Type-1 (exact) and Type-2 (renamed) clones
    ///
    /// Compares all function pairs using hash-based detection with greedy extension.
    ///
    /// `seen_pairs` is shared with the later Type-3 pass: every key recorded
    /// here is skipped there, so the same region is never reported twice.
    /// Keys are recorded even for matches subsequently discarded by the
    /// same-location check below.
    fn find_type12_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &mut std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
    ) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();

        // Exhaustively compare every unordered function pair (i < j): O(n²) pairs.
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    // NOTE(review): `canonical_pair_key` is defined elsewhere in this
                    // file — presumably it normalizes pair order so (A, B) and (B, A)
                    // produce the same key; confirm against its definition.
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    // Inserted before the same-location skip: discarded self-matches
                    // still occupy the set and stay suppressed in the Type-3 pass.
                    seen_pairs.insert(pair_key);

                    // Compute hash for reporting
                    let match_hash = Self::compute_match_hash(
                        &func1.tokens[clone_match.source_start
                            ..clone_match.source_start + clone_match.length],
                    );

                    // Type-1 vs Type-2 is decided on the full raw function bodies,
                    // not just the matched window.
                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    // Map token offsets back to absolute source lines for reporting.
                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) =
                        self.compute_line_span(func2, clone_match.target_start, clone_match.length);

                    // Skip same location (overlapping function boundaries)
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        // Type-1/2 matches are token-exact, so no edit distance applies.
                        edit_distance: None,
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.length),
                        // Filled in later by `compute_duplicate_ids`.
                        duplicate_id: None,
                    });
                }
            }
        }

        duplicates
    }
1030
    /// Detects Type-3 (gap-tolerant) clones using edit distance
    ///
    /// Finds near-miss clones that have insertions, deletions, or modifications.
    ///
    /// Runs in three phases: (1) collect candidates across all function
    /// pairs, skipping anything already reported by the Type-1/2 pass via
    /// `seen_pairs`; (2) deduplicate overlapping candidates; (3) convert the
    /// survivors into `DuplicateMatch` entries appended to `duplicates`.
    fn find_type3_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
        duplicates: &mut Vec<DuplicateMatch>,
    ) {
        let mut type3_candidates = Vec::new();

        // Phase 1: gather candidate matches for every unordered pair (i < j).
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let type3_matches = detect_type3_clones(
                    &func1.tokens,
                    &func2.tokens,
                    self.min_block_size,
                    self.type3_tolerance,
                );

                for clone_match in type3_matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    // Already reported by the Type-1/2 pass; don't re-report as Type-3.
                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }

                    type3_candidates.push((func1, func2, clone_match));
                }
            }
        }

        // Deduplicate overlapping Type-3 matches
        let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

        // Convert to DuplicateMatch
        for (func1, func2, clone_match) in deduplicated {
            // Unlike Type-1/2, the source and target windows can differ in
            // token count (`length` vs `target_length`) because of gaps.
            let (actual_start1, actual_end1) =
                self.compute_line_span(func1, clone_match.source_start, clone_match.length);
            let (actual_start2, actual_end2) =
                self.compute_line_span(func2, clone_match.target_start, clone_match.target_length);

            // Skip self-matches: same file and same starting line indicates
            // the algorithm matched a code block against itself
            if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                continue;
            }

            let window1 = &func1.tokens
                [clone_match.source_start..clone_match.source_start + clone_match.length];
            let window2 = &func2.tokens
                [clone_match.target_start..clone_match.target_start + clone_match.target_length];
            // Token-level edit distance between the two windows, reported to users.
            let edit_dist = hashing::compute_token_edit_distance(window1, window2);

            // Reporting hash is derived from the source-side window only.
            let match_hash = Self::compute_match_hash(window1);

            duplicates.push(DuplicateMatch {
                file1: func1.file_path.to_string(),
                file2: func2.file_path.to_string(),
                start_line1: actual_start1,
                start_line2: actual_start2,
                end_line1: Some(actual_end1),
                end_line2: Some(actual_end2),
                length: clone_match.length,
                similarity: clone_match.similarity,
                hash: match_hash,
                clone_type: CloneType::Type3,
                edit_distance: Some(edit_dist),
                suppressed_by_directive: None,
                token_offset1: Some(clone_match.source_start),
                token_offset2: Some(clone_match.target_start),
                target_length: Some(clone_match.target_length),
                // Filled in later by `compute_duplicate_ids`.
                duplicate_id: None,
            });
        }
    }
1124
1125    /// Computes content-based IDs for all duplicates
1126    ///
1127    /// IDs are SHA256 hashes of normalized tokens, enabling persistent ignore rules.
1128    fn compute_duplicate_ids(
1129        &self,
1130        function_hashes: &[FunctionHash],
1131        duplicates: &mut [DuplicateMatch],
1132    ) {
1133        for dup in duplicates.iter_mut() {
1134            if dup.duplicate_id.is_some() {
1135                continue;
1136            }
1137
1138            let tokens1 = self.extract_duplicate_tokens(
1139                function_hashes,
1140                &dup.file1,
1141                dup.start_line1,
1142                dup.end_line1,
1143                dup.token_offset1,
1144                dup.length,
1145            );
1146
1147            let tokens2 = self.extract_duplicate_tokens(
1148                function_hashes,
1149                &dup.file2,
1150                dup.start_line2,
1151                dup.end_line2,
1152                dup.token_offset2,
1153                dup.target_length.unwrap_or(dup.length),
1154            );
1155
1156            if let Some(tokens1) = tokens1 {
1157                let id = if let Some(tokens2) = tokens2 {
1158                    ignore_rules::compute_symmetric_duplicate_id(&tokens1, &tokens2)
1159                } else {
1160                    ignore_rules::compute_duplicate_id(&tokens1)
1161                };
1162                dup.duplicate_id = Some(id);
1163            }
1164        }
1165    }
1166
1167    /// Extracts normalized token strings for a duplicate region
1168    fn extract_duplicate_tokens(
1169        &self,
1170        function_hashes: &[FunctionHash],
1171        file: &str,
1172        reported_start: usize,
1173        reported_end: Option<usize>,
1174        token_offset: Option<usize>,
1175        length: usize,
1176    ) -> Option<Vec<String>> {
1177        function_hashes.iter().find_map(|fh| {
1178            if fh.file_path.as_ref() != file
1179                || fh.start_line > reported_start
1180                || reported_start > fh.end_line
1181            {
1182                return None;
1183            }
1184
1185            let start_offset = match token_offset {
1186                Some(offset) if offset + length <= fh.tokens.len() => Some(offset),
1187                _ => self.infer_token_offset(fh, reported_start, reported_end, length),
1188            }?;
1189
1190            if start_offset + length > fh.tokens.len() {
1191                return None;
1192            }
1193
1194            Some(
1195                fh.tokens
1196                    .iter()
1197                    .skip(start_offset)
1198                    .take(length)
1199                    .map(|t| t.as_hash_string().to_string())
1200                    .collect(),
1201            )
1202        })
1203    }
1204
1205    /// Attempts to derive the token offset from reported lines when it's missing (e.g., older caches).
1206    fn infer_token_offset(
1207        &self,
1208        func_hash: &FunctionHash,
1209        reported_start: usize,
1210        reported_end: Option<usize>,
1211        length: usize,
1212    ) -> Option<usize> {
1213        let start_line_offset = reported_start.checked_sub(func_hash.start_line)?;
1214        let end_line = reported_end.unwrap_or(reported_start);
1215
1216        func_hash
1217            .token_line_offsets
1218            .iter()
1219            .enumerate()
1220            .filter_map(|(idx, line_offset)| {
1221                if *line_offset != start_line_offset {
1222                    return None;
1223                }
1224
1225                let end_idx = idx.checked_add(length.checked_sub(1)?)?;
1226                let end_offset = func_hash.token_line_offsets.get(end_idx)?;
1227                if func_hash.start_line + *end_offset == end_line {
1228                    Some(idx)
1229                } else {
1230                    None
1231                }
1232            })
1233            .next()
1234    }
1235
1236    /// Filters out duplicates that are in the ignore list
1237    fn filter_ignored_duplicates(&self, duplicates: &mut Vec<DuplicateMatch>) -> usize {
1238        let original_count = duplicates.len();
1239        if let Some(ref ignore_manager) = self.ignore_manager {
1240            duplicates.retain(|dup| {
1241                if let Some(ref id) = dup.duplicate_id {
1242                    !ignore_manager.is_ignored(id)
1243                } else {
1244                    // If we couldn't compute an ID, keep the duplicate (fail open)
1245                    true
1246                }
1247            });
1248        }
1249        original_count - duplicates.len()
1250    }
1251
1252    /// Computes a hash for a token slice (used for match reporting)
1253    fn compute_match_hash(tokens: &[Token]) -> u64 {
1254        use std::collections::hash_map::DefaultHasher;
1255        use std::hash::{Hash, Hasher};
1256        let mut hasher = DefaultHasher::new();
1257        tokens.hash(&mut hasher);
1258        hasher.finish()
1259    }
1260
1261    /// Checks if a duplicate is suppressed by an inline directive
1262    ///
1263    /// Directives suppress the entire function they're placed before, so we check
1264    /// if the owning function has a directive, not the duplicate's specific lines.
1265    fn is_suppressed_by_directive(
1266        &self,
1267        dup: &DuplicateMatch,
1268        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
1269        function_hashes: &[FunctionHash],
1270    ) -> bool {
1271        // Check if either file has a directive suppressing this duplicate
1272        let file1_path = PathBuf::from(&dup.file1);
1273        let file2_path = PathBuf::from(&dup.file2);
1274
1275        // Check file1 - use the owning function's start line for directive lookup
1276        if let Some(directives) = directives_map.get(&file1_path) {
1277            let func_start =
1278                self.find_owning_function_start(&dup.file1, dup.start_line1, function_hashes);
1279            // Use function start for directive check (directives apply to whole function)
1280            let check_line = func_start.unwrap_or(dup.start_line1);
1281
1282            if directives.is_suppressed(check_line, check_line).is_some() {
1283                return true;
1284            }
1285        }
1286
1287        // Check file2 - use the owning function's start line for directive lookup
1288        if let Some(directives) = directives_map.get(&file2_path) {
1289            let func_start =
1290                self.find_owning_function_start(&dup.file2, dup.start_line2, function_hashes);
1291            // Use function start for directive check (directives apply to whole function)
1292            let check_line = func_start.unwrap_or(dup.start_line2);
1293
1294            if directives.is_suppressed(check_line, check_line).is_some() {
1295                return true;
1296            }
1297        }
1298
1299        false
1300    }
1301
1302    /// Finds the start line of the function containing a given line
1303    fn find_owning_function_start(
1304        &self,
1305        file: &str,
1306        line: usize,
1307        function_hashes: &[FunctionHash],
1308    ) -> Option<usize> {
1309        function_hashes
1310            .iter()
1311            .find(|fh| {
1312                fh.file_path.as_ref() == file && fh.start_line <= line && line <= fh.end_line
1313            })
1314            .map(|fh| fh.start_line)
1315    }
1316
    /// Deduplicates overlapping Type-3 matches by keeping only the longest match per region
    ///
    /// Groups matches by (file1, file2, func1_line, func2_line) to handle same-file clones properly.
    /// Merges overlapping regions, keeping the longest match with the highest similarity score.
    /// Overlap requires BOTH source AND target ranges to overlap.
    ///
    /// Worst case O(n²) over candidates, with repeated passes until the best
    /// match for a cluster stabilizes. Smaller overlapping candidates are
    /// consumed (marked used) even when they lose, so each candidate appears
    /// in at most one output entry.
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        // Track which matches have been merged
        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            // Seed the cluster with candidate i; `best_match` may be replaced
            // by a larger overlapping candidate below.
            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            // Find all overlapping matches (iterate until no more overlaps found)
            // This handles transitive overlaps: A overlaps B, B overlaps C
            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    // Only merge if same function pair (by file path and line number)
                    // — checked in both orientations, since the pair may have been
                    // recorded with source/target swapped.
                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    // Check if overlapping with CURRENT best_match (not original)
                    // This ensures transitive overlaps are handled correctly
                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        // Keep the match that covers more tokens overall, breaking ties by similarity
                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true; // Need another pass to check against new best
                        }
                        // Consumed either way: losers of the comparison are
                        // dropped, never emitted on their own.
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }
1407
1408    /// Classifies a clone as Type-1 (exact) or Type-2 (renamed)
1409    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
1410        // Normalize whitespace for comparison (avoid intermediate Vec allocation)
1411        let normalized1 = raw1.split_whitespace().collect::<String>();
1412        let normalized2 = raw2.split_whitespace().collect::<String>();
1413
1414        // If raw code is identical (ignoring whitespace), it's Type-1 (exact copy)
1415        if normalized1 == normalized2 {
1416            CloneType::Type1
1417        } else {
1418            // Otherwise, it's Type-2 (renamed identifiers/literals)
1419            CloneType::Type2
1420        }
1421    }
1422
1423    /// Finds clone matches between two functions using extension algorithm
1424    fn find_clones_between_functions(
1425        &self,
1426        func1: &FunctionHash,
1427        func2: &FunctionHash,
1428    ) -> Vec<CloneMatch> {
1429        use std::collections::HashMap;
1430
1431        let mut matches = Vec::new();
1432        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();
1433
1434        // Index all windows in func1
1435        let mut i = 0;
1436        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
1437            let hash = hashing::compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
1438            hash_map.entry(hash).or_default().push(i);
1439            i += 1;
1440        }
1441
1442        // Search for matches in func2
1443        let mut j = 0;
1444        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
1445            let hash = hashing::compute_window_hash(&func2.tokens[j..j + self.min_block_size]);
1446
1447            if let Some(func1_positions) = hash_map.get(&hash) {
1448                for &func1_pos in func1_positions {
1449                    // Verify exact match using shared utility
1450                    if hashing::verify_cross_window_match(
1451                        &func1.tokens,
1452                        &func2.tokens,
1453                        func1_pos,
1454                        j,
1455                        self.min_block_size,
1456                    ) {
1457                        // Greedy extension using shared utility
1458                        let extension = hashing::extend_match(
1459                            &func1.tokens,
1460                            &func2.tokens,
1461                            func1_pos,
1462                            j,
1463                            self.min_block_size,
1464                        );
1465
1466                        let total_length = self.min_block_size + extension;
1467
1468                        matches.push(CloneMatch {
1469                            source_start: func1_pos,
1470                            target_start: j,
1471                            length: total_length,
1472                            target_length: total_length,
1473                            similarity: 1.0, // Exact match
1474                        });
1475
1476                        // Skip ahead
1477                        j += extension.max(1);
1478                        break;
1479                    }
1480                }
1481            }
1482
1483            j += 1;
1484        }
1485
1486        matches
1487    }
1488
1489    fn add_hashes_to_cache(&self, function_hashes: &[FunctionHash], cache: &mut HashCache) {
1490        for func_hash in function_hashes {
1491            let hashes = compute_rolling_hashes(&func_hash.tokens, self.min_block_size);
1492
1493            for (hash, offset) in hashes {
1494                let end_token_idx = offset + self.min_block_size;
1495                let (start_line, end_line) =
1496                    self.compute_line_span(func_hash, offset, self.min_block_size);
1497
1498                let location = CodeLocation {
1499                    file_path: func_hash.file_path.to_string(),
1500                    start_line,
1501                    end_line,
1502                    token_offset: Some(offset),
1503                    token_length: self.min_block_size,
1504                    tokens: func_hash.tokens[offset..end_token_idx].to_vec(),
1505                    raw_source: func_hash.raw_body.clone(),
1506                };
1507
1508                cache.add_hash(hash, location);
1509            }
1510        }
1511    }
1512
1513    /// Build a hash cache from the given paths
1514    ///
1515    /// Scans all files and builds a persistent cache of rolling hashes.
1516    /// This enables fast incremental scanning and git-diff mode.
1517    pub fn build_cache(&self, paths: Vec<PathBuf>) -> Result<HashCache> {
1518        let mut cache = HashCache::new(self.min_block_size);
1519
1520        // Collect all source files
1521        let source_files = self.collect_source_files(paths)?;
1522
1523        // Process each file and add to cache
1524        for file_path in source_files {
1525            let content = match std::fs::read_to_string(&file_path) {
1526                Ok(c) => c,
1527                Err(_) => continue, // Skip files we can't read
1528            };
1529
1530            let function_hashes = match self.process_file_content(&file_path, &content) {
1531                Ok(fh) => fh,
1532                Err(_) => continue, // Skip files we can't parse
1533            };
1534
1535            self.add_hashes_to_cache(&function_hashes, &mut cache);
1536        }
1537
1538        Ok(cache)
1539    }
1540
1541    /// Scan with cache lookup (for git-diff mode)
1542    ///
1543    /// Scans only the changed files, then looks up their hashes in the cache
1544    /// to find duplicates against the entire codebase.
1545    pub fn scan_with_cache(
1546        &self,
1547        changed_files: Vec<PathBuf>,
1548        cache: &mut HashCache,
1549    ) -> Result<Report> {
1550        use std::time::Instant;
1551        let start_time = Instant::now();
1552
1553        // Ensure we don't match against stale cache entries
1554        let stale_files = cache.invalidate_stale_files();
1555        let normalize_path =
1556            |path: &Path| path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
1557        let changed_set: HashSet<PathBuf> =
1558            changed_files.iter().map(|p| normalize_path(p)).collect();
1559
1560        if !stale_files.is_empty() {
1561            // Rebuild cache entries that were invalidated so unchanged files
1562            // remain available for lookups.
1563            let stale_paths: Vec<PathBuf> = stale_files
1564                .into_iter()
1565                .filter_map(|path| {
1566                    let raw_path = PathBuf::from(&path);
1567                    let normalized = normalize_path(&raw_path);
1568
1569                    if !normalized.exists() || changed_set.contains(&normalized) {
1570                        return None;
1571                    }
1572
1573                    Some(raw_path)
1574                })
1575                .collect();
1576
1577            if !stale_paths.is_empty() {
1578                let (stale_hashes, _, _) = self.analyze_files(&stale_paths)?;
1579                self.add_hashes_to_cache(&stale_hashes, cache);
1580            }
1581        }
1582
1583        // Only scan the changed files
1584        let (function_hashes, total_lines, skipped_files) = self.analyze_files(&changed_files)?;
1585
1586        // Find duplicates by looking up in cache
1587        let mut duplicates = Vec::new();
1588        let mut cached_hits_by_file: HashMap<String, Vec<CodeLocation>> = HashMap::new();
1589        let mut cached_function_hashes: Vec<FunctionHash> = Vec::new();
1590
1591        for func_hash in &function_hashes {
1592            let hashes = compute_rolling_hashes(&func_hash.tokens, self.min_block_size);
1593
1594            for (hash, offset) in hashes {
1595                // Look up this hash in the cache
1596                if let Some(cached_locations) = cache.lookup(hash) {
1597                    for cached_loc in cached_locations {
1598                        // Normalize both paths for comparison (handle relative vs absolute)
1599                        let changed_file_path = Path::new(func_hash.file_path.as_ref())
1600                            .canonicalize()
1601                            .unwrap_or_else(|_| {
1602                                Path::new(func_hash.file_path.as_ref()).to_path_buf()
1603                            });
1604                        let cached_file_path = Path::new(&cached_loc.file_path)
1605                            .canonicalize()
1606                            .unwrap_or_else(|_| Path::new(&cached_loc.file_path).to_path_buf());
1607
1608                        // Skip if same file (we'll find those via normal duplicate detection)
1609                        if changed_file_path == cached_file_path {
1610                            continue;
1611                        }
1612
1613                        cached_hits_by_file
1614                            .entry(cached_loc.file_path.clone())
1615                            .or_default()
1616                            .push(cached_loc.clone());
1617
1618                        // Calculate line numbers for the match in changed file
1619                        let start_token_idx = offset;
1620                        let end_token_idx =
1621                            (offset + self.min_block_size).min(func_hash.tokens.len());
1622
1623                        let start_line_offset =
1624                            if start_token_idx < func_hash.token_line_offsets.len() {
1625                                func_hash.token_line_offsets[start_token_idx]
1626                            } else {
1627                                0
1628                            };
1629
1630                        let end_line_offset = if end_token_idx > 0
1631                            && end_token_idx - 1 < func_hash.token_line_offsets.len()
1632                        {
1633                            func_hash.token_line_offsets[end_token_idx - 1]
1634                        } else {
1635                            start_line_offset
1636                        };
1637
1638                        // Create duplicate match
1639                        let similarity = compute_token_similarity(
1640                            &func_hash.tokens[start_token_idx..end_token_idx],
1641                            &cached_loc.tokens,
1642                        );
1643
1644                        if similarity >= self.similarity_threshold {
1645                            let clone_type = if func_hash.raw_body == cached_loc.raw_source {
1646                                CloneType::Type1
1647                            } else {
1648                                CloneType::Type2
1649                            };
1650
1651                            duplicates.push(DuplicateMatch {
1652                                file1: func_hash.file_path.to_string(),
1653                                file2: cached_loc.file_path.clone(),
1654                                start_line1: func_hash.start_line + start_line_offset,
1655                                start_line2: cached_loc.start_line,
1656                                end_line1: Some(func_hash.start_line + end_line_offset),
1657                                end_line2: Some(cached_loc.end_line),
1658                                length: self.min_block_size,
1659                                similarity,
1660                                hash,
1661                                clone_type,
1662                                edit_distance: None,
1663                                suppressed_by_directive: None,
1664                                token_offset1: Some(offset),
1665                                token_offset2: cached_loc.token_offset,
1666                                target_length: Some(cached_loc.token_length),
1667                                duplicate_id: None,
1668                            });
1669                        }
1670                    }
1671                }
1672            }
1673        }
1674
1675        // Run Type-3 detection between changed files and any cached functions that matched hashes
1676        if self.enable_type3 && !cached_hits_by_file.is_empty() {
1677            let mut seen_functions: HashSet<(String, usize)> = HashSet::new();
1678
1679            for locations in cached_hits_by_file.values() {
1680                for loc in locations {
1681                    let token_offset = match loc.token_offset {
1682                        Some(offset) => offset,
1683                        None => continue,
1684                    };
1685
1686                    let normalized_path = normalize_path(Path::new(&loc.file_path));
1687                    if changed_set.contains(&normalized_path) {
1688                        continue;
1689                    }
1690
1691                    let (tokens, token_line_offsets) = normalize_with_line_numbers(&loc.raw_source);
1692                    if tokens.len() < self.min_block_size
1693                        || token_offset >= token_line_offsets.len()
1694                    {
1695                        continue;
1696                    }
1697
1698                    let line_offset = token_line_offsets[token_offset];
1699                    let start_line = loc.start_line.saturating_sub(line_offset);
1700                    let key = (loc.file_path.clone(), start_line);
1701
1702                    if !seen_functions.insert(key.clone()) {
1703                        continue;
1704                    }
1705
1706                    let end_line =
1707                        start_line + token_line_offsets.last().copied().unwrap_or_default();
1708
1709                    cached_function_hashes.push(FunctionHash {
1710                        file_path: Arc::<str>::from(key.0),
1711                        function_name: None,
1712                        start_byte: 0,
1713                        end_byte: 0,
1714                        start_line,
1715                        end_line,
1716                        tokens,
1717                        token_line_offsets,
1718                        raw_body: loc.raw_source.clone(),
1719                    });
1720                }
1721            }
1722
1723            if !cached_function_hashes.is_empty() {
1724                // Type alias for pair deduplication keys (shared with main detection path)
1725                type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);
1726
1727                let mut seen_pairs: HashSet<SeenPairKey<'_>> = HashSet::new();
1728
1729                for dup in &duplicates {
1730                    if let (Some(offset1), Some(offset2)) = (dup.token_offset1, dup.token_offset2) {
1731                        if let (Some(func1), Some(func2)) = (
1732                            function_hashes.iter().find(|fh| {
1733                                fh.file_path.as_ref() == dup.file1.as_str()
1734                                    && fh.start_line <= dup.start_line1
1735                                    && dup.start_line1 <= fh.end_line
1736                            }),
1737                            cached_function_hashes.iter().find(|fh| {
1738                                fh.file_path.as_ref() == dup.file2.as_str()
1739                                    && fh.start_line <= dup.start_line2
1740                                    && dup.start_line2 <= fh.end_line
1741                            }),
1742                        ) {
1743                            seen_pairs.insert(canonical_pair_key(
1744                                func1, func2, offset1, offset2, dup.length,
1745                            ));
1746                        }
1747                    }
1748                }
1749
1750                let mut type3_candidates = Vec::new();
1751
1752                for func1 in &function_hashes {
1753                    for func2 in &cached_function_hashes {
1754                        let type3_matches = detect_type3_clones(
1755                            &func1.tokens,
1756                            &func2.tokens,
1757                            self.min_block_size,
1758                            self.type3_tolerance,
1759                        );
1760
1761                        for clone_match in type3_matches {
1762                            let pair_key = canonical_pair_key(
1763                                func1,
1764                                func2,
1765                                clone_match.source_start,
1766                                clone_match.target_start,
1767                                clone_match.length,
1768                            );
1769
1770                            if seen_pairs.contains(&pair_key) {
1771                                continue;
1772                            }
1773
1774                            type3_candidates.push((func1, func2, clone_match));
1775                        }
1776                    }
1777                }
1778
1779                let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);
1780
1781                for (func1, func2, clone_match) in deduplicated {
1782                    let (actual_start1, actual_end1) =
1783                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
1784                    let (actual_start2, actual_end2) = self.compute_line_span(
1785                        func2,
1786                        clone_match.target_start,
1787                        clone_match.target_length,
1788                    );
1789
1790                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
1791                        continue;
1792                    }
1793
1794                    let window1 = &func1.tokens
1795                        [clone_match.source_start..clone_match.source_start + clone_match.length];
1796                    let window2 = &func2.tokens[clone_match.target_start
1797                        ..clone_match.target_start + clone_match.target_length];
1798
1799                    let edit_dist = hashing::compute_token_edit_distance(window1, window2);
1800                    let match_hash = Self::compute_match_hash(window1);
1801
1802                    duplicates.push(DuplicateMatch {
1803                        file1: func1.file_path.to_string(),
1804                        file2: func2.file_path.to_string(),
1805                        start_line1: actual_start1,
1806                        start_line2: actual_start2,
1807                        end_line1: Some(actual_end1),
1808                        end_line2: Some(actual_end2),
1809                        length: clone_match.length,
1810                        similarity: clone_match.similarity,
1811                        hash: match_hash,
1812                        clone_type: CloneType::Type3,
1813                        edit_distance: Some(edit_dist),
1814                        suppressed_by_directive: None,
1815                        token_offset1: Some(clone_match.source_start),
1816                        token_offset2: Some(clone_match.target_start),
1817                        target_length: Some(clone_match.target_length),
1818                        duplicate_id: None,
1819                    });
1820                }
1821            }
1822        }
1823
1824        // Also find duplicates within the changed files themselves
1825        let (intra_duplicates, _) = self.find_duplicate_hashes(&function_hashes);
1826        duplicates.extend(intra_duplicates);
1827
1828        // Deduplicate
1829        duplicates.sort_by(|a, b| {
1830            (&a.file1, &a.file2, a.start_line1, a.start_line2).cmp(&(
1831                &b.file1,
1832                &b.file2,
1833                b.start_line1,
1834                b.start_line2,
1835            ))
1836        });
1837        duplicates.dedup_by(|a, b| {
1838            a.file1 == b.file1
1839                && a.file2 == b.file2
1840                && a.start_line1 == b.start_line1
1841                && a.start_line2 == b.start_line2
1842        });
1843
1844        // Hydrate function metadata for any files that only exist in the cache so we
1845        // can compute duplicate IDs and apply directive-based suppression.
1846        let mut lookup_function_hashes = function_hashes.clone();
1847        if !cached_function_hashes.is_empty() {
1848            lookup_function_hashes.extend(cached_function_hashes.clone());
1849        }
1850        let hashed_files: HashSet<&str> = lookup_function_hashes
1851            .iter()
1852            .map(|fh| fh.file_path.as_ref())
1853            .collect();
1854
1855        let mut missing_files: HashSet<String> = HashSet::new();
1856        for dup in &duplicates {
1857            if !hashed_files.contains(dup.file1.as_str()) {
1858                missing_files.insert(dup.file1.clone());
1859            }
1860            if !hashed_files.contains(dup.file2.as_str()) {
1861                missing_files.insert(dup.file2.clone());
1862            }
1863        }
1864
1865        if !missing_files.is_empty() {
1866            let missing_paths: Vec<PathBuf> = missing_files.iter().map(PathBuf::from).collect();
1867            let (mut extra_hashes, _, _) = self.analyze_files(&missing_paths)?;
1868            lookup_function_hashes.append(&mut extra_hashes);
1869        }
1870
1871        // Compute IDs and filter against .polydup-ignore
1872        self.compute_duplicate_ids(&lookup_function_hashes, &mut duplicates);
1873        let suppressed_by_ignore_file = self.filter_ignored_duplicates(&mut duplicates);
1874
1875        // Apply inline directive filtering for both changed and cached files
1876        let suppressed_by_directive = if self.enable_directives && !duplicates.is_empty() {
1877            let directive_paths: HashSet<PathBuf> = lookup_function_hashes
1878                .iter()
1879                .map(|fh| PathBuf::from(fh.file_path.as_ref()))
1880                .collect();
1881            let directives_map =
1882                self.collect_directives(&directive_paths.into_iter().collect::<Vec<_>>());
1883
1884            if !directives_map.is_empty() {
1885                self.apply_directive_filtering(
1886                    &mut duplicates,
1887                    &directives_map,
1888                    &lookup_function_hashes,
1889                )
1890            } else {
1891                0
1892            }
1893        } else {
1894            0
1895        };
1896
1897        // Refresh cache with the newly scanned files so future runs stay incremental
1898        self.add_hashes_to_cache(&function_hashes, cache);
1899
1900        // Calculate statistics
1901        let stats = self.compute_stats(
1902            &function_hashes,
1903            total_lines,
1904            start_time,
1905            suppressed_by_ignore_file,
1906            suppressed_by_directive,
1907        );
1908
1909        // files_scanned is the count of successfully scanned files (total - skipped)
1910        let files_scanned = changed_files.len().saturating_sub(skipped_files.len());
1911
1912        Ok(Report {
1913            version: None,
1914            scan_time: None,
1915            config: None,
1916            files_scanned,
1917            functions_analyzed: function_hashes.len(),
1918            duplicates,
1919            skipped_files,
1920            stats,
1921        })
1922    }
1923}
1924
1925impl Default for Scanner {
1926    fn default() -> Self {
1927        Self::new() // Infallible now, no panic possible
1928    }
1929}
1930
1931/// Public API: Find duplicates in the given file paths
1932///
1933/// # Arguments
1934/// * `paths` - Vector of file paths to scan
1935///
1936/// # Returns
1937/// * `Result<Report>` - Scan report with detected duplicates
1938pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
1939    let scanner = Scanner::new();
1940    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
1941    scanner.scan(path_bufs)
1942}
1943
1944/// Public API with custom configuration
1945pub fn find_duplicates_with_config(
1946    paths: Vec<String>,
1947    min_block_size: usize,
1948    similarity_threshold: f64,
1949) -> Result<Report> {
1950    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
1951    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
1952    scanner.scan(path_bufs)
1953}
1954
1955#[cfg(test)]
1956mod tests {
1957    use super::*;
1958
1959    /// Helper to create a FunctionHash for testing with sequential line offsets
1960    fn make_test_function(
1961        file: &str,
1962        start_line: usize,
1963        tokens: Vec<Token>,
1964        raw_body: &str,
1965    ) -> FunctionHash {
1966        let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
1967        FunctionHash {
1968            file_path: Arc::<str>::from(file),
1969            function_name: None,
1970            start_byte: 0,
1971            end_byte: 0,
1972            start_line,
1973            end_line: start_line + tokens.len(),
1974            tokens,
1975            token_line_offsets,
1976            raw_body: raw_body.to_string(),
1977        }
1978    }
1979
1980    /// Helper to create a FunctionHash with all tokens on the same line
1981    fn make_test_function_same_line(
1982        file: &str,
1983        start_line: usize,
1984        end_line: usize,
1985        tokens: Vec<Token>,
1986        raw_body: &str,
1987    ) -> FunctionHash {
1988        let token_line_offsets: Vec<usize> = vec![0; tokens.len()];
1989        FunctionHash {
1990            file_path: Arc::<str>::from(file),
1991            function_name: None,
1992            start_byte: 0,
1993            end_byte: 0,
1994            start_line,
1995            end_line,
1996            tokens,
1997            token_line_offsets,
1998            raw_body: raw_body.to_string(),
1999        }
2000    }
2001
2002    /// Helper to create simple expression tokens for testing: keyword id op id ;
2003    fn make_expr_tokens(keyword: &str, op: &str) -> Vec<Token> {
2004        vec![
2005            Token::Keyword(keyword.into()),
2006            Token::Identifier,
2007            Token::Operator(op.into()),
2008            Token::Identifier,
2009            Token::Punctuation(";".into()),
2010        ]
2011    }
2012
2013    #[test]
2014    fn test_scanner_creation() {
2015        let _scanner = Scanner::new(); // Infallible
2016    }
2017
2018    #[test]
2019    fn test_scanner_with_config() {
2020        let scanner = Scanner::with_config(30, 0.9);
2021        assert!(scanner.is_ok());
2022        let s = scanner.unwrap();
2023        assert_eq!(s.min_block_size, 30);
2024        assert_eq!(s.similarity_threshold, 0.9);
2025    }
2026
2027    #[test]
2028    fn test_type3_tolerance_validation() {
2029        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
2030        assert!(Scanner::new().with_type3_detection(1.2).is_err());
2031        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
2032    }
2033
2034    #[test]
2035    fn test_type3_not_dropped_when_functions_share_offsets() {
2036        fn make_function(
2037            file: &str,
2038            start_line: usize,
2039            tokens: Vec<Token>,
2040            raw_body: &str,
2041        ) -> FunctionHash {
2042            let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
2043            FunctionHash {
2044                file_path: Arc::<str>::from(file),
2045                function_name: None,
2046                start_byte: 0,
2047                end_byte: 0,
2048                start_line,
2049                end_line: start_line + tokens.len(),
2050                tokens,
2051                token_line_offsets,
2052                raw_body: raw_body.to_string(),
2053            }
2054        }
2055
2056        let scanner = Scanner::with_config(3, 0.85)
2057            .unwrap()
2058            .with_type3_detection(0.6)
2059            .unwrap();
2060
2061        let type1_tokens = vec![
2062            Token::Keyword("return".into()),
2063            Token::NumberLiteral,
2064            Token::Punctuation(";".into()),
2065        ];
2066        let near_tokens_a = vec![
2067            Token::Keyword("compute".into()),
2068            Token::Identifier,
2069            Token::Identifier,
2070        ];
2071        let near_tokens_b = vec![
2072            Token::Keyword("compute".into()),
2073            Token::Identifier,
2074            Token::NumberLiteral,
2075        ];
2076
2077        let functions = vec![
2078            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
2079            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
2080            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
2081            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
2082        ];
2083
2084        let (duplicates, _) = scanner.find_duplicate_hashes(&functions);
2085
2086        let type1_present = duplicates.iter().any(|d| {
2087            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
2088                && d.start_line1 == 10
2089                && d.start_line2 == 20
2090        });
2091        assert!(
2092            type1_present,
2093            "expected Type-1/2 match for the first function pair"
2094        );
2095
2096        let type3_present = duplicates.iter().any(|d| {
2097            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
2098        });
2099        assert!(
2100            type3_present,
2101            "Type-3 match between later functions should not be deduped"
2102        );
2103
2104        assert_eq!(
2105            duplicates.len(),
2106            2,
2107            "should keep both the Type-1/2 and Type-3 matches"
2108        );
2109    }
2110
2111    #[test]
2112    fn test_type3_reports_token_offsets_in_start_lines() {
2113        let scanner = Scanner::with_config(3, 0.85)
2114            .unwrap()
2115            .with_type3_detection(0.75)
2116            .unwrap();
2117
2118        let functions = vec![
2119            make_test_function_same_line(
2120                "file_a.rs",
2121                100,
2122                105,
2123                make_expr_tokens("let", "+"),
2124                "let a = b + c;",
2125            ),
2126            make_test_function_same_line(
2127                "file_b.rs",
2128                200,
2129                205,
2130                make_expr_tokens("mut", "-"),
2131                "let a = b - c;",
2132            ),
2133        ];
2134
2135        let (duplicates, _) = scanner.find_duplicate_hashes(&functions);
2136
2137        let type3 = duplicates
2138            .iter()
2139            .find(|d| matches!(d.clone_type, CloneType::Type3))
2140            .expect("expected a Type-3 duplicate match");
2141
2142        assert_eq!(
2143            type3.start_line1, 100,
2144            "should report the actual source line even when tokens share a line"
2145        );
2146        assert_eq!(
2147            type3.start_line2, 200,
2148            "should report the actual target line even when tokens share a line"
2149        );
2150        assert_eq!(type3.token_offset1, Some(1));
2151        assert_eq!(type3.token_offset2, Some(1));
2152    }
2153
2154    #[test]
2155    fn type3_duplicate_ids_are_symmetric() {
2156        use tempfile::TempDir;
2157
2158        let tokens_a = make_expr_tokens("let", "+");
2159        // tokens_b has an extra identifier to create a Type-3 (near-miss) clone
2160        let mut tokens_b = make_expr_tokens("let", "-");
2161        tokens_b.push(Token::Identifier);
2162
2163        let func_a = make_test_function("file_a.rs", 10, tokens_a.clone(), "fn file_a.rs() {}");
2164        let func_b = make_test_function("file_b.rs", 20, tokens_b.clone(), "fn file_b.rs() {}");
2165
2166        let temp_dir = TempDir::new().unwrap();
2167        let scanner = Scanner::with_config(3, 0.85)
2168            .unwrap()
2169            .with_type3_detection(0.75)
2170            .unwrap()
2171            .with_ignore_manager(IgnoreManager::new(temp_dir.path()));
2172
2173        let (forward, _) = scanner.find_duplicate_hashes(&[func_a.clone(), func_b.clone()]);
2174        let (reverse, _) = scanner.find_duplicate_hashes(&[func_b, func_a]);
2175
2176        let id_forward = forward
2177            .into_iter()
2178            .find(|d| matches!(d.clone_type, CloneType::Type3))
2179            .and_then(|d| d.duplicate_id)
2180            .expect("expected a Type-3 duplicate ID");
2181
2182        let id_reverse = reverse
2183            .into_iter()
2184            .find(|d| matches!(d.clone_type, CloneType::Type3))
2185            .and_then(|d| d.duplicate_id)
2186            .expect("expected a Type-3 duplicate ID");
2187
2188        assert_eq!(
2189            id_forward, id_reverse,
2190            "Type-3 IDs should not depend on function order"
2191        );
2192    }
2193
2194    #[test]
2195    fn type3_does_not_report_self_matches() {
2196        // Regression test for issue #71: Type-3 detection was reporting functions
2197        // as duplicates of themselves (same file, same line on both sides)
2198        let scanner = Scanner::with_config(3, 0.85)
2199            .unwrap()
2200            .with_type3_detection(0.75)
2201            .unwrap();
2202
2203        // Create two functions in the SAME file with the SAME starting line
2204        // This simulates the bug where Type-3 matched a function against itself
2205        let tokens = make_expr_tokens("let", "+");
2206        let func1 = make_test_function_same_line("same_file.rs", 28, 35, tokens.clone(), "fn a()");
2207        let func2 = make_test_function_same_line("same_file.rs", 28, 35, tokens, "fn a()");
2208
2209        let (duplicates, _) = scanner.find_duplicate_hashes(&[func1, func2]);
2210
2211        // Should NOT report any duplicates since both map to the same file:line
2212        let self_matches: Vec<_> = duplicates
2213            .iter()
2214            .filter(|d| d.file1 == d.file2 && d.start_line1 == d.start_line2)
2215            .collect();
2216
2217        assert!(
2218            self_matches.is_empty(),
2219            "Type-3 should never report self-matches (same file and line). Found: {:?}",
2220            self_matches
2221        );
2222    }
2223
2224    #[test]
2225    fn type3_still_detects_same_file_different_line_duplicates() {
2226        // Ensure the self-match fix doesn't break legitimate same-file duplicates
2227        let scanner = Scanner::with_config(3, 0.85)
2228            .unwrap()
2229            .with_type3_detection(0.75)
2230            .unwrap();
2231
2232        // Two similar functions in the SAME file but DIFFERENT lines
2233        let tokens1 = make_expr_tokens("let", "+");
2234        let mut tokens2 = make_expr_tokens("let", "-");
2235        tokens2.push(Token::Identifier); // Make it Type-3 (not exact)
2236
2237        let func1 = make_test_function_same_line("same_file.rs", 10, 15, tokens1, "fn first()");
2238        let func2 = make_test_function_same_line("same_file.rs", 50, 55, tokens2, "fn second()");
2239
2240        let (duplicates, _) = scanner.find_duplicate_hashes(&[func1, func2]);
2241
2242        let same_file_different_line: Vec<_> = duplicates
2243            .iter()
2244            .filter(|d| d.file1 == d.file2 && d.start_line1 != d.start_line2)
2245            .collect();
2246
2247        assert!(
2248            !same_file_different_line.is_empty(),
2249            "Type-3 should still detect duplicates in the same file at different lines"
2250        );
2251    }
2252
2253    #[test]
2254    fn duplicate_matches_store_actual_end_lines() {
2255        let scanner = Scanner::with_config(2, 0.85).unwrap();
2256
2257        let tokens = vec![
2258            Token::Keyword("fn".into()),
2259            Token::Identifier,
2260            Token::Identifier,
2261            Token::Punctuation("{".into()),
2262            Token::Punctuation("}".into()),
2263        ];
2264
2265        let func1 = FunctionHash {
2266            file_path: Arc::<str>::from("file_a.rs"),
2267            function_name: None,
2268            start_byte: 0,
2269            end_byte: 0,
2270            start_line: 10,
2271            end_line: 14,
2272            tokens: tokens.clone(),
2273            token_line_offsets: vec![0, 0, 1, 1, 2],
2274            raw_body: "fn a() {}".to_string(),
2275        };
2276
2277        let func2 = FunctionHash {
2278            file_path: Arc::<str>::from("file_b.rs"),
2279            function_name: None,
2280            start_byte: 0,
2281            end_byte: 0,
2282            start_line: 20,
2283            end_line: 24,
2284            tokens,
2285            token_line_offsets: vec![0, 1, 1, 2, 2],
2286            raw_body: "fn b() {}".to_string(),
2287        };
2288
2289        let (duplicates, _) = scanner.find_duplicate_hashes(&[func1, func2]);
2290        let dup = duplicates.first().expect("expected a duplicate match");
2291
2292        assert_eq!(dup.start_line1, 10);
2293        assert_eq!(dup.start_line2, 20);
2294        assert_eq!(dup.end_line1, Some(12));
2295        assert_eq!(dup.end_line2, Some(22));
2296    }
2297
2298    #[test]
2299    fn scan_with_cache_prunes_stale_entries() {
2300        let temp_dir = tempfile::tempdir().unwrap();
2301        let file_a = temp_dir.path().join("a.js");
2302        let file_b = temp_dir.path().join("b.js");
2303
2304        let shared_fn = r#"
2305        function shared() {
2306          return 1 + 1;
2307        }
2308        "#;
2309        std::fs::write(&file_a, shared_fn).unwrap();
2310        std::fs::write(&file_b, shared_fn).unwrap();
2311
2312        let scanner = Scanner::with_config(3, 0.85).unwrap();
2313        let mut cache = scanner
2314            .build_cache(vec![file_a.clone(), file_b.clone()])
2315            .unwrap();
2316
2317        // Change the non-diff file so its cached hashes are outdated
2318        std::thread::sleep(std::time::Duration::from_millis(1100));
2319        std::fs::write(&file_b, "const unrelated = 42;\n").unwrap();
2320
2321        let report = scanner
2322            .scan_with_cache(vec![file_a.clone()], &mut cache)
2323            .unwrap();
2324
2325        assert!(
2326            report.duplicates.is_empty(),
2327            "stale cache entries should be invalidated before lookup"
2328        );
2329    }
2330
    #[test]
    // A file is cached, then modified on disk. The scan must (1) flag the file
    // as stale before lookup and (2) write fresh hashes and metadata for the
    // new contents back into the cache.
    fn scan_with_cache_repopulates_changed_entries() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");

        let original = r#"
        function shared() {
          return 1 + 1;
        }
        "#;

        let updated = r#"
        function shared() {
          return 7 + 8;
        }
        "#;

        std::fs::write(&file_a, original).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = scanner.build_cache(vec![file_a.clone()]).unwrap();

        // Sleep before rewriting — presumably so the mtime differs even on
        // filesystems with 1s timestamp resolution (TODO confirm).
        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(&file_a, updated).unwrap();

        let file_a_str = file_a.to_string_lossy().to_string();
        assert!(
            cache.file_needs_rescan(&file_a_str),
            "modified files should be considered stale before cache lookup"
        );

        scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        // Collect every cached location that points at the rewritten file.
        let cached_entries: Vec<&CodeLocation> = cache
            .hash_index
            .values()
            .flat_map(|locs| locs.iter())
            .filter(|loc| loc.file_path == file_a_str)
            .collect();

        assert!(
            !cached_entries.is_empty(),
            "changed files should be added back into the cache after rescan"
        );
        // The refreshed entries must carry the NEW source, not the original.
        assert!(
            cached_entries
                .iter()
                .any(|loc| loc.raw_source.contains("return 7 + 8;")),
            "cache should contain hashes for the refreshed file contents"
        );
        assert!(
            cache.file_metadata.contains_key(&file_a_str),
            "file metadata should be refreshed after rescanning changed files"
        );
    }
2388
2389    #[test]
2390    fn scan_with_cache_rehydrates_stale_unchanged_files() {
2391        let temp_dir = tempfile::tempdir().unwrap();
2392        let changed_file = temp_dir.path().join("changed.js");
2393        let unchanged_file = temp_dir.path().join("unchanged.js");
2394
2395        let shared_fn = r#"
2396        function shared() {
2397          return 1 + 1;
2398        }
2399        "#;
2400
2401        std::fs::write(&changed_file, shared_fn).unwrap();
2402        std::fs::write(&unchanged_file, shared_fn).unwrap();
2403
2404        let scanner = Scanner::with_config(3, 0.85).unwrap();
2405        let mut cache = scanner
2406            .build_cache(vec![temp_dir.path().to_path_buf()])
2407            .unwrap();
2408
2409        // Simulate a restored cache where file mtimes no longer match.
2410        std::thread::sleep(std::time::Duration::from_millis(1100));
2411        std::fs::write(
2412            &changed_file,
2413            r#"
2414        function shared() {
2415          return 1 + 1;
2416        }
2417        function another() {
2418          return 1 + 1;
2419        }
2420        "#,
2421        )
2422        .unwrap();
2423        std::fs::write(&unchanged_file, shared_fn).unwrap();
2424
2425        let report = scanner
2426            .scan_with_cache(vec![changed_file.clone()], &mut cache)
2427            .unwrap();
2428
2429        assert!(
2430            report.duplicates.iter().any(|dup| {
2431                (dup.file1.ends_with("changed.js") && dup.file2.ends_with("unchanged.js"))
2432                    || (dup.file1.ends_with("unchanged.js") && dup.file2.ends_with("changed.js"))
2433            }),
2434            "invalidated entries should be rebuilt so unchanged files still match against diffs"
2435        );
2436    }
2437
2438    #[test]
2439    fn scan_with_cache_respects_ignore_file() {
2440        let temp_dir = tempfile::tempdir().unwrap();
2441        let file_a = temp_dir.path().join("a.js");
2442        let file_b = temp_dir.path().join("b.js");
2443
2444        let shared_fn = r#"
2445        function shared() {
2446          return 1 + 1;
2447        }
2448        "#;
2449        std::fs::write(&file_a, shared_fn).unwrap();
2450        std::fs::write(&file_b, shared_fn).unwrap();
2451
2452        let base_scanner = Scanner::with_config(3, 0.85).unwrap();
2453        let mut cache = base_scanner
2454            .build_cache(vec![temp_dir.path().to_path_buf()])
2455            .unwrap();
2456
2457        let initial_report = base_scanner
2458            .scan_with_cache(vec![file_a.clone()], &mut cache)
2459            .unwrap();
2460        assert!(
2461            !initial_report.duplicates.is_empty(),
2462            "expected an initial duplicate to seed ignore entries"
2463        );
2464        let ignored_ids: Vec<String> = initial_report
2465            .duplicates
2466            .iter()
2467            .map(|d| {
2468                d.duplicate_id
2469                    .clone()
2470                    .expect("expected cache path to compute duplicate IDs")
2471            })
2472            .collect();
2473
2474        let mut manager = IgnoreManager::new(temp_dir.path());
2475        for id in ignored_ids {
2476            manager.add_ignore(IgnoreEntry::new(
2477                id,
2478                vec![],
2479                "test ignore".to_string(),
2480                "tester".to_string(),
2481            ));
2482        }
2483
2484        let scanner = base_scanner.with_ignore_manager(manager);
2485        let report = scanner
2486            .scan_with_cache(vec![file_a.clone()], &mut cache)
2487            .unwrap();
2488
2489        assert!(
2490            report.duplicates.is_empty(),
2491            "duplicates present in .polydup-ignore should be filtered when using cache"
2492        );
2493    }
2494
2495    #[test]
2496    fn scan_with_cache_uses_symmetric_ids_for_existing_ignores() {
2497        let temp_dir = tempfile::tempdir().unwrap();
2498        let file_a = temp_dir.path().join("a.js");
2499        let file_b = temp_dir.path().join("b.js");
2500
2501        let shared_fn = r#"
2502        function shared() {
2503          return 1 + 1;
2504        }
2505        "#;
2506        std::fs::write(&file_a, shared_fn).unwrap();
2507        std::fs::write(&file_b, shared_fn).unwrap();
2508
2509        let base_scanner = Scanner::with_config(7, 0.85).unwrap();
2510        let mut cache = base_scanner
2511            .build_cache(vec![temp_dir.path().to_path_buf()])
2512            .unwrap();
2513
2514        let baseline_report = base_scanner
2515            .scan(vec![temp_dir.path().to_path_buf()])
2516            .unwrap();
2517        let baseline_id = baseline_report
2518            .duplicates
2519            .first()
2520            .and_then(|dup| dup.duplicate_id.clone())
2521            .expect("expected duplicate IDs from full scans");
2522        let baseline_id_for_ignore = baseline_id.clone();
2523
2524        let mut manager = IgnoreManager::new(temp_dir.path());
2525        manager.add_ignore(IgnoreEntry::new(
2526            baseline_id_for_ignore,
2527            vec![],
2528            "test ignore".to_string(),
2529            "tester".to_string(),
2530        ));
2531
2532        let scanner = base_scanner.with_ignore_manager(manager);
2533        let report = scanner
2534            .scan_with_cache(vec![file_a.clone()], &mut cache)
2535            .unwrap();
2536
2537        assert!(
2538            report.duplicates.is_empty(),
2539            "cached scans should honor ignores generated from full scans"
2540        );
2541    }
2542
2543    #[test]
2544    fn scan_with_cache_respects_directives_from_cached_files() {
2545        let temp_dir = tempfile::tempdir().unwrap();
2546        let changed_file = temp_dir.path().join("changed.js");
2547        let cached_file = temp_dir.path().join("cached.js");
2548
2549        let suppressed_fn = r#"
2550        // polydup-ignore: generated code
2551        function shared() {
2552          return 1 + 1;
2553        }
2554        "#;
2555
2556        let changed_fn = r#"
2557        function shared() {
2558          return 1 + 1;
2559        }
2560        "#;
2561
2562        std::fs::write(&cached_file, suppressed_fn).unwrap();
2563        std::fs::write(&changed_file, changed_fn).unwrap();
2564
2565        let scanner = Scanner::with_config(3, 0.85).unwrap().with_directives(true);
2566        let mut cache = scanner
2567            .build_cache(vec![temp_dir.path().to_path_buf()])
2568            .unwrap();
2569
2570        let report = scanner
2571            .scan_with_cache(vec![changed_file.clone()], &mut cache)
2572            .unwrap();
2573
2574        assert!(
2575            report.duplicates.is_empty(),
2576            "duplicates suppressed by directives in cached files should stay suppressed when using cache"
2577        );
2578    }
2579
2580    #[test]
2581    fn scan_with_cache_runs_type3_detection_against_cached_files() {
2582        let temp_dir = tempfile::tempdir().unwrap();
2583        let changed_file = temp_dir.path().join("changed.js");
2584        let cached_file = temp_dir.path().join("cached.js");
2585
2586        let cached_fn = r#"
2587        function cached() {
2588          step1();
2589          step2();
2590          step3();
2591          step4();
2592          step5();
2593        }
2594        "#;
2595
2596        let changed_fn = r#"
2597        function cached() {
2598          step1();
2599          step2();
2600          insert_gap();
2601          step3();
2602          step4();
2603          step5();
2604        }
2605        "#;
2606
2607        std::fs::write(&cached_file, cached_fn).unwrap();
2608        std::fs::write(&changed_file, changed_fn).unwrap();
2609
2610        let scanner = Scanner::with_config(3, 0.8)
2611            .unwrap()
2612            .with_type3_detection(0.8)
2613            .unwrap();
2614        let mut cache = scanner
2615            .build_cache(vec![temp_dir.path().to_path_buf()])
2616            .unwrap();
2617
2618        let report = scanner
2619            .scan_with_cache(vec![changed_file.clone()], &mut cache)
2620            .unwrap();
2621
2622        assert!(
2623            report.duplicates.iter().any(|dup| {
2624                matches!(dup.clone_type, CloneType::Type3)
2625                    && dup.file1.ends_with("changed.js")
2626                    && dup.file2.ends_with("cached.js")
2627            }),
2628            "Type-3 should run for cached comparisons so near-miss clones surface in git-diff mode"
2629        );
2630    }
2631
2632    #[test]
2633    fn test_find_duplicates_empty() {
2634        let result = find_duplicates(vec![]);
2635        assert!(result.is_ok());
2636        let report = result.unwrap();
2637        assert_eq!(report.duplicates.len(), 0);
2638    }
2639
2640    #[test]
2641    fn test_is_supported_file() {
2642        let scanner = Scanner::new();
2643
2644        assert!(scanner.is_supported_file(Path::new("test.rs")));
2645        assert!(scanner.is_supported_file(Path::new("test.py")));
2646        assert!(scanner.is_supported_file(Path::new("test.js")));
2647        assert!(scanner.is_supported_file(Path::new("test.ts")));
2648        assert!(!scanner.is_supported_file(Path::new("test.txt")));
2649        assert!(!scanner.is_supported_file(Path::new("test.md")));
2650    }
2651
2652    #[test]
2653    fn test_detect_language() {
2654        let scanner = Scanner::new();
2655
2656        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
2657        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
2658        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
2659        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
2660    }
2661}