dupe_core/lib.rs

//! PolyDup Core - Cross-language duplicate code detection engine
//!
//! This library provides the core functionality for detecting duplicate code
//! across Node.js, Python, and Rust codebases using Tree-sitter parsing,
//! Rabin-Karp/MinHash algorithms, and parallel processing.
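//!
//! # Example
//!
//! A minimal usage sketch (illustrative only; the crate name `dupe_core` and
//! the scanned path are assumptions):
//!
//! ```no_run
//! use dupe_core::Scanner;
//! use std::path::PathBuf;
//!
//! let scanner = Scanner::new().with_directives(true);
//! let report = scanner.scan(vec![PathBuf::from("src/")]).unwrap();
//! println!("{} duplicates found", report.duplicates.len());
//! ```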

mod cache;
mod directives;
mod error;
mod hashing;
mod ignore_rules;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

// Re-export public types
pub use cache::{CacheStats, CodeLocation, FileCacheMetadata, HashCache};
pub use directives::{detect_directives, detect_directives_in_file, Directive, FileDirectives};
pub use error::{PolyDupError, Result};
pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    compute_window_hash, detect_duplicates_with_extension, detect_type3_clones, extend_match,
    normalize, normalize_with_line_numbers, verify_cross_window_match, CloneMatch, RollingHash,
    Token,
};
pub use ignore_rules::{
    compute_duplicate_id, compute_symmetric_duplicate_id, FileRange, IgnoreEntry, IgnoreManager,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::Context;
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

/// Clone type classification
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Type-1: Exact copies (only whitespace/comments differ)
    #[serde(rename = "type-1")]
    Type1,
    /// Type-2: Structurally identical but renamed identifiers/literals
    #[serde(rename = "type-2")]
    Type2,
    /// Type-3: Near-miss clones with gaps and modifications (gap-tolerant detection)
    #[serde(rename = "type-3")]
    Type3,
}

/// Checks whether two half-open ranges `[start1, end1)` and `[start2, end2)` overlap
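///
/// A quick illustration of the half-open semantics (fenced as `ignore` since
/// this helper is private and not collected as a doc-test):
///
/// ```ignore
/// assert!(ranges_overlap(0, 10, 5, 15));  // partial overlap
/// assert!(!ranges_overlap(0, 5, 5, 10));  // adjacent ranges do not overlap
/// ```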
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

// Stable key for deduplicating matches within the same file pair.
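// Ordering the two sides by file path makes the key symmetric: swapping
// (func1, func2) together with (source_start, target_start) yields the same key.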
fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}

/// Represents a detected duplicate code fragment
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line1: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line2: Option<usize>,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    /// Edit distance (Type-3 only). None for Type-1/2
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
    /// Indicates if this duplicate is suppressed by an inline directive
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suppressed_by_directive: Option<bool>,
    /// Token offset within function for file1 (used for ignore ID computation)
    #[serde(skip)]
    token_offset1: Option<usize>,
    /// Token offset within function for file2 (used for ignore ID computation)
    #[serde(skip)]
    token_offset2: Option<usize>,
    /// Token length of the second window (Type-3 may differ from `length`)
    #[serde(skip)]
    target_length: Option<usize>,
    /// Content-based ID for this duplicate (SHA256 of normalized tokens)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duplicate_id: Option<String>,
}

/// Represents a function with its tokens for duplicate detection
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>, // Shared ownership, cheap to clone
    #[allow(dead_code)] // Kept for potential future reporting improvements
    function_name: Option<String>,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    start_byte: usize,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)] // Kept for future detailed reporting
    end_line: usize,
    tokens: Vec<Token>, // Normalized token sequence
    /// Zero-based line offset for each token relative to start_line
    token_line_offsets: Vec<usize>,
    raw_body: String, // Original (unnormalized) function body for Type-1 detection
}

/// Baseline snapshot for comparing duplicate detection across runs
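///
/// A minimal workflow sketch (the crate name `dupe_core`, the scanned path,
/// and the baseline file name are assumptions):
///
/// ```no_run
/// use dupe_core::{Baseline, Scanner};
/// use std::path::Path;
///
/// let report = Scanner::new().scan(vec!["src".into()]).unwrap();
/// let baseline = Baseline::from_duplicates(report.duplicates.clone());
/// baseline.save_to_file(Path::new("polydup-baseline.json")).unwrap();
///
/// // Later runs report only duplicates absent from the baseline.
/// let new_dups = baseline.find_new_duplicates(&report.duplicates);
/// assert!(new_dups.is_empty());
/// ```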
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    /// Version of the baseline format
    pub version: String,
    /// Timestamp when baseline was created
    pub created_at: String,
    /// Duplicates that existed at baseline time
    pub duplicates: Vec<DuplicateMatch>,
}

impl Baseline {
    /// Create a new baseline from scan results
    pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
        Self {
            version: env!("CARGO_PKG_VERSION").to_string(),
            created_at: chrono::Utc::now().to_rfc3339(),
            duplicates,
        }
    }

    /// Save baseline to a JSON file
    pub fn save_to_file(&self, path: &Path) -> Result<()> {
        let json =
            serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
        fs::write(path, json).context("Failed to write baseline file")?;
        Ok(())
    }

    /// Load baseline from a JSON file
    pub fn load_from_file(path: &Path) -> Result<Self> {
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
        let baseline: Baseline =
            serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
        Ok(baseline)
    }

    /// Compare current duplicates against baseline and return only new ones
    pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
        let baseline_set: std::collections::HashSet<_> =
            self.duplicates.iter().map(duplicate_key).collect();

        current
            .iter()
            .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
            .cloned()
            .collect()
    }
}

/// Creates an order-normalized key used to compare duplicate matches across runs
fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
    // Normalize file order for consistent comparison
    let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
        (
            dup.file1.clone(),
            dup.file2.clone(),
            dup.start_line1,
            dup.start_line2,
        )
    } else {
        (
            dup.file2.clone(),
            dup.file1.clone(),
            dup.start_line2,
            dup.start_line1,
        )
    };
    (file1, file2, line1, line2, dup.length)
}

/// Report containing scan results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// PolyDup version
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    /// Scan start time (ISO 8601)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    /// Configuration used for the scan
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    /// Total number of files scanned
    pub files_scanned: usize,
    /// Total number of functions analyzed
    pub functions_analyzed: usize,
    /// Detected duplicate matches
    pub duplicates: Vec<DuplicateMatch>,
    /// Scan statistics
    pub stats: ScanStats,
}

/// Configuration used for scanning
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    /// Minimum block size in tokens
    pub threshold: usize,
    /// Similarity threshold (0.0 - 1.0)
    pub similarity: f64,
    /// Type-3 detection enabled
    pub type3_enabled: bool,
    /// Paths scanned
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}

/// Statistics from the scanning process
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines of code scanned
    pub total_lines: usize,
    /// Total tokens processed
    pub total_tokens: usize,
    /// Number of unique hashes computed
    pub unique_hashes: usize,
    /// Scan duration in milliseconds
    pub duration_ms: u64,
}

/// Main scanner for detecting duplicates
#[allow(dead_code)] // similarity_threshold reserved for future use
pub struct Scanner {
    /// Minimum code block size to consider (in tokens)
    min_block_size: usize,
    /// Similarity threshold (0.0 - 1.0)
    similarity_threshold: f64,
    /// Glob patterns to exclude from scanning
    exclude_patterns: Vec<String>,
    /// Enable Type-3 (gap-tolerant) clone detection
    enable_type3: bool,
    /// Type-3 similarity tolerance (0.0 - 1.0)
    type3_tolerance: f64,
    /// Ignore manager for filtering false positives
    ignore_manager: Option<IgnoreManager>,
    /// Enable inline directive detection
    enable_directives: bool,
    /// Include test files in scanning (*.test.*, *.spec.*, etc.)
    include_tests: bool,
}

/// Default exclude patterns for test files (build artifacts are excluded separately)
fn default_exclude_patterns() -> Vec<String> {
    vec![
        // Test files (excluded by default, enable with --include-tests)
        "**/*.test.ts".to_string(),
        "**/*.test.js".to_string(),
        "**/*.test.tsx".to_string(),
        "**/*.test.jsx".to_string(),
        "**/*.spec.ts".to_string(),
        "**/*.spec.js".to_string(),
        "**/*.spec.tsx".to_string(),
        "**/*.spec.jsx".to_string(),
        "**/__tests__/**".to_string(),
        "**/*.test.py".to_string(),
    ]
}

/// Exclude patterns for build artifacts (always excluded)
fn build_artifact_patterns() -> Vec<String> {
    vec![
        "**/node_modules/**".to_string(),
        "**/target/**".to_string(),
        "**/dist/**".to_string(),
        "**/build/**".to_string(),
        "**/.git/**".to_string(),
    ]
}

impl Scanner {
    /// Creates a new Scanner with default settings
    ///
    /// This constructor is infallible: it performs no I/O and has no other
    /// failure modes.
    pub fn new() -> Self {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        }
    }

    /// Creates a new Scanner with custom settings
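    ///
    /// A configuration sketch (the threshold and tolerance values here are
    /// illustrative, not recommendations):
    ///
    /// ```no_run
    /// # use dupe_core::Scanner;
    /// let scanner = Scanner::with_config(30, 0.90)
    ///     .unwrap()
    ///     .with_test_files(true)
    ///     .with_type3_detection(0.8)
    ///     .unwrap();
    /// ```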
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        })
    }

    /// Sets custom exclude patterns, replacing the defaults
    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    /// Controls whether test files are scanned; when enabled, test-file
    /// patterns are removed from the exclusion list
    pub fn with_test_files(mut self, include: bool) -> Self {
        self.include_tests = include;
        if include {
            // Remove test file patterns from exclusions
            let test_patterns = default_exclude_patterns();
            self.exclude_patterns.retain(|p| !test_patterns.contains(p));
        }
        self
    }

    /// Enables Type-3 clone detection with the specified tolerance
    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(PolyDupError::Config(
                "Type-3 tolerance must be between 0.0 and 1.0".to_string(),
            ));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

    /// Sets the ignore manager for filtering false positives
    pub fn with_ignore_manager(mut self, manager: IgnoreManager) -> Self {
        self.ignore_manager = Some(manager);
        self
    }

    /// Enables inline directive detection (// polydup-ignore comments)
    pub fn with_directives(mut self, enabled: bool) -> Self {
        self.enable_directives = enabled;
        self
    }

    /// Scans the given paths and returns a Report with detected duplicates
    ///
    /// Uses Rayon for parallel file processing:
    /// 1. Read and parse files
    /// 2. Extract functions
    /// 3. Normalize and hash function bodies
    /// 4. Compare hashes to find duplicates
    /// 5. Apply directive-based filtering if enabled
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Collect all source files
        let source_files = self.collect_source_files(paths)?;

        // Detect directives if enabled
        let directives_map = self.collect_directives(&source_files);

        // Process files in parallel to extract functions and compute hashes
        let (function_hashes, total_lines) = self.analyze_files(&source_files)?;

        // Find duplicates by comparing hashes
        let mut duplicates = self.find_duplicate_hashes(&function_hashes);

        // Apply directive-based filtering
        if self.enable_directives && !directives_map.is_empty() {
            self.apply_directive_filtering(&mut duplicates, &directives_map, &function_hashes);
        }

        // Calculate statistics
        let stats = self.compute_stats(&function_hashes, total_lines, start_time);

        Ok(Report {
            version: None,   // Will be set by CLI
            scan_time: None, // Will be set by CLI
            config: None,    // Will be set by CLI
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats,
        })
    }

    /// Parallel collection of directives from source files
    fn collect_directives(
        &self,
        source_files: &[PathBuf],
    ) -> HashMap<PathBuf, crate::directives::FileDirectives> {
        if self.enable_directives {
            source_files
                .par_iter()
                .filter_map(|path| {
                    crate::directives::detect_directives_in_file(path)
                        .ok()
                        .map(|d| (path.clone(), d))
                })
                .collect()
        } else {
            HashMap::new()
        }
    }

    /// Analyze files in parallel to extract functions and metadata
    fn analyze_files(&self, source_files: &[PathBuf]) -> Result<(Vec<FunctionHash>, usize)> {
        // Collect function hashes and line counts
        let results: Vec<Result<(Vec<FunctionHash>, usize)>> = source_files
            .par_iter()
            .map(|path| {
                // Count lines first
                let content = std::fs::read_to_string(path).map_err(PolyDupError::Io)?;
                let line_count = content.lines().count();

                // Process file for functions
                let hashes = self.process_file_content(path, &content)?;
                Ok((hashes, line_count))
            })
            .collect();

        // Aggregate results
        let mut all_hashes = Vec::new();
        let mut total_lines = 0;

        for res in results {
            match res {
                Ok((hashes, lines)) => {
                    all_hashes.extend(hashes);
                    total_lines += lines;
                }
                Err(_) => {
                    // Silently skip files that fail to read or parse (e.g., binary
                    // files or permission errors); this matches the original
                    // `.filter_map(|path| self.process_file(path).ok())` behavior.
                }
            }
        }

        Ok((all_hashes, total_lines))
    }

    /// Filter duplicates based on directives
    fn apply_directive_filtering(
        &self,
        duplicates: &mut Vec<DuplicateMatch>,
        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
        function_hashes: &[FunctionHash],
    ) {
        for dup in duplicates.iter_mut() {
            let suppressed = self.is_suppressed_by_directive(dup, directives_map, function_hashes);
            if suppressed {
                dup.suppressed_by_directive = Some(true);
            }
        }

        // Filter out suppressed duplicates (they shouldn't appear in reports or fail CI)
        duplicates.retain(|dup| dup.suppressed_by_directive != Some(true));
    }

    /// Compute scan statistics
    fn compute_stats(
        &self,
        function_hashes: &[FunctionHash],
        total_lines: usize,
        start_time: std::time::Instant,
    ) -> ScanStats {
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in function_hashes {
                // Compute rolling hashes just for statistics
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        ScanStats {
            total_lines,
            total_tokens,
            unique_hashes,
            duration_ms,
        }
    }

    /// Collects all source files from the given paths
    ///
    /// Uses the `ignore` crate to respect .gitignore, .ignore files,
    /// and common ignore patterns (node_modules, target, etc.)
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                // Use ignore crate's WalkBuilder to respect .gitignore
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true) // Respect .gitignore
                    .git_global(true) // Respect global gitignore
                    .git_exclude(true) // Respect .git/info/exclude
                    .ignore(true) // Respect .ignore files
                    .hidden(false) // Don't skip hidden files (e.g., .config/)
                    .parents(true) // Respect parent .gitignore files
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            // Log but don't fail on individual entry errors
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    /// Checks if a file is a supported source file
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    /// Checks if a file matches any exclude patterns
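    ///
    /// Note: the glob set is rebuilt on every call; a hot path could cache the
    /// compiled set, but it is kept inline here so the check stays self-contained.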
    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        // Build glob set from exclude patterns
        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

    /// Processes a single file's content and returns function hashes
    fn process_file_content(&self, path: &Path, code: &str) -> Result<Vec<FunctionHash>> {
        let lang = self.detect_language(path)?;
        let functions = extract_functions(code, lang)?;

        // Use Arc<str> for efficient sharing across all functions in this file
        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            // Store both raw body (for Type-1) and normalized tokens (for Type-2)
            let raw_body = func.body.clone();
            let (tokens, token_line_offsets) = normalize_with_line_numbers(&func.body);

            // Skip if too small
            if tokens.len() < self.min_block_size {
                continue;
            }

            // Store the full token sequence for extension-based detection
            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path), // Cheap pointer clone
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                token_line_offsets,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    /// Detects the Tree-sitter Language from file extension
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| PolyDupError::LanguageDetection(path.to_path_buf()))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(PolyDupError::LanguageNotSupported(ext.to_string())),
        }
    }

    /// Computes the inclusive line span for a token window within a function
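    ///
    /// When a token index has no recorded line offset, this falls back to
    /// treating the token index itself as a line offset (a rough approximation
    /// that keeps the span well-defined).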
    fn compute_line_span(
        &self,
        func: &FunctionHash,
        start_offset: usize,
        length: usize,
    ) -> (usize, usize) {
        let start_line = func
            .token_line_offsets
            .get(start_offset)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + start_offset);

        let end_index = start_offset + length.saturating_sub(1);
        let end_line = func
            .token_line_offsets
            .get(end_index)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + end_index);

        (start_line, end_line)
    }

    /// Finds duplicate code using a greedy extension algorithm
    ///
    /// Orchestrates the detection pipeline:
    /// 1. Type-1/2 detection (exact and renamed clones)
    /// 2. Type-3 detection (near-miss clones with gaps)
    /// 3. Duplicate ID computation
    /// 4. Ignore filtering
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        // Type alias for pair deduplication keys
        type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

        // Shared state for deduplication across Type-1/2 and Type-3
        let mut seen_pairs: std::collections::HashSet<SeenPairKey<'_>> =
            std::collections::HashSet::new();

        // Phase 1: Type-1/2 detection
        let mut duplicates = self.find_type12_duplicates(function_hashes, &mut seen_pairs);

        // Phase 2: Type-3 detection (if enabled)
        if self.enable_type3 {
            self.find_type3_duplicates(function_hashes, &seen_pairs, &mut duplicates);
        }

        // Phase 3: Compute IDs for all duplicates
        self.compute_duplicate_ids(function_hashes, &mut duplicates);

        // Phase 4: Filter out ignored duplicates
        self.filter_ignored_duplicates(&mut duplicates);

        duplicates
    }

    /// Detects Type-1 (exact) and Type-2 (renamed) clones
    ///
    /// Compares all function pairs using hash-based detection with greedy extension.
    fn find_type12_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &mut std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
    ) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Compute hash for reporting
                    let match_hash = Self::compute_match_hash(
                        &func1.tokens[clone_match.source_start
                            ..clone_match.source_start + clone_match.length],
                    );

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) =
                        self.compute_line_span(func2, clone_match.target_start, clone_match.length);

                    // Skip same location (overlapping function boundaries)
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.length),
                        duplicate_id: None,
                    });
                }
            }
        }

        duplicates
    }

    /// Detects Type-3 (gap-tolerant) clones using edit distance
    ///
    /// Finds near-miss clones that have insertions, deletions, or modifications.
    fn find_type3_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
        duplicates: &mut Vec<DuplicateMatch>,
    ) {
        let mut type3_candidates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let type3_matches = detect_type3_clones(
                    &func1.tokens,
                    &func2.tokens,
                    self.min_block_size,
                    self.type3_tolerance,
                );

                for clone_match in type3_matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }

                    type3_candidates.push((func1, func2, clone_match));
                }
            }
        }

        // Deduplicate overlapping Type-3 matches
        let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

        // Convert to DuplicateMatch
        for (func1, func2, clone_match) in deduplicated {
            let (actual_start1, actual_end1) =
                self.compute_line_span(func1, clone_match.source_start, clone_match.length);
            let (actual_start2, actual_end2) =
                self.compute_line_span(func2, clone_match.target_start, clone_match.target_length);

            // Skip self-matches: same file and same starting line indicates
            // the algorithm matched a code block against itself
            if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                continue;
            }

            let window1 = &func1.tokens
                [clone_match.source_start..clone_match.source_start + clone_match.length];
            let window2 = &func2.tokens
                [clone_match.target_start..clone_match.target_start + clone_match.target_length];
            let edit_dist = hashing::compute_token_edit_distance(window1, window2);

            let match_hash = Self::compute_match_hash(window1);

            duplicates.push(DuplicateMatch {
                file1: func1.file_path.to_string(),
                file2: func2.file_path.to_string(),
                start_line1: actual_start1,
                start_line2: actual_start2,
                end_line1: Some(actual_end1),
                end_line2: Some(actual_end2),
                length: clone_match.length,
                similarity: clone_match.similarity,
                hash: match_hash,
                clone_type: CloneType::Type3,
                edit_distance: Some(edit_dist),
                suppressed_by_directive: None,
                token_offset1: Some(clone_match.source_start),
                token_offset2: Some(clone_match.target_start),
                target_length: Some(clone_match.target_length),
                duplicate_id: None,
            });
        }
    }

    /// Computes content-based IDs for all duplicates
    ///
    /// IDs are SHA256 hashes of normalized tokens, enabling persistent ignore rules.
    fn compute_duplicate_ids(
        &self,
        function_hashes: &[FunctionHash],
        duplicates: &mut [DuplicateMatch],
    ) {
        for dup in duplicates.iter_mut() {
            if dup.duplicate_id.is_some() {
                continue;
            }

            let tokens1 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file1,
                dup.start_line1,
                dup.end_line1,
                dup.token_offset1,
                dup.length,
            );

            let tokens2 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file2,
                dup.start_line2,
                dup.end_line2,
                dup.token_offset2,
                dup.target_length.unwrap_or(dup.length),
            );

            if let Some(tokens1) = tokens1 {
                let id = if let Some(tokens2) = tokens2 {
                    ignore_rules::compute_symmetric_duplicate_id(&tokens1, &tokens2)
                } else {
                    ignore_rules::compute_duplicate_id(&tokens1)
                };
                dup.duplicate_id = Some(id);
            }
        }
    }

    /// Extracts normalized token strings for a duplicate region
    fn extract_duplicate_tokens(
        &self,
        function_hashes: &[FunctionHash],
        file: &str,
        reported_start: usize,
        reported_end: Option<usize>,
        token_offset: Option<usize>,
        length: usize,
    ) -> Option<Vec<String>> {
        function_hashes.iter().find_map(|fh| {
            if fh.file_path.as_ref() != file
                || fh.start_line > reported_start
                || reported_start > fh.end_line
            {
                return None;
            }

            let start_offset = match token_offset {
                Some(offset) if offset + length <= fh.tokens.len() => Some(offset),
                _ => self.infer_token_offset(fh, reported_start, reported_end, length),
            }?;

            if start_offset + length > fh.tokens.len() {
                return None;
            }

            Some(
                fh.tokens
                    .iter()
                    .skip(start_offset)
                    .take(length)
                    .map(|t| t.as_hash_string().to_string())
                    .collect(),
            )
        })
    }

    /// Attempts to derive the token offset from reported lines when it's missing (e.g., older caches).
    fn infer_token_offset(
        &self,
        func_hash: &FunctionHash,
        reported_start: usize,
        reported_end: Option<usize>,
        length: usize,
    ) -> Option<usize> {
        let start_line_offset = reported_start.checked_sub(func_hash.start_line)?;
        let end_line = reported_end.unwrap_or(reported_start);

        func_hash
            .token_line_offsets
            .iter()
            .enumerate()
            .filter_map(|(idx, line_offset)| {
                if *line_offset != start_line_offset {
                    return None;
                }

                let end_idx = idx.checked_add(length.checked_sub(1)?)?;
                let end_offset = func_hash.token_line_offsets.get(end_idx)?;
                if func_hash.start_line + *end_offset == end_line {
                    Some(idx)
                } else {
                    None
                }
            })
            .next()
    }

    /// Filters out duplicates that are in the ignore list
    fn filter_ignored_duplicates(&self, duplicates: &mut Vec<DuplicateMatch>) {
        if let Some(ref ignore_manager) = self.ignore_manager {
            duplicates.retain(|dup| {
                if let Some(ref id) = dup.duplicate_id {
                    !ignore_manager.is_ignored(id)
                } else {
                    // If we couldn't compute an ID, keep the duplicate (fail open)
                    true
                }
            });
        }
    }

    /// Computes a hash for a token slice (used for match reporting)
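    ///
    /// Note: `DefaultHasher`'s internal algorithm is not guaranteed to stay
    /// stable across Rust releases, so this value is suited to within-run
    /// reporting rather than persistent storage.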
    fn compute_match_hash(tokens: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        tokens.hash(&mut hasher);
        hasher.finish()
    }

    /// Checks if a duplicate is suppressed by an inline directive
    ///
    /// Directives suppress the entire function they're placed before, so we check
    /// if the owning function has a directive, not the duplicate's specific lines.
    fn is_suppressed_by_directive(
        &self,
        dup: &DuplicateMatch,
        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
        function_hashes: &[FunctionHash],
    ) -> bool {
        // Check if either file has a directive suppressing this duplicate
        let file1_path = PathBuf::from(&dup.file1);
        let file2_path = PathBuf::from(&dup.file2);

        // Check file1 - use the owning function's start line for directive lookup
        if let Some(directives) = directives_map.get(&file1_path) {
            let func_start =
                self.find_owning_function_start(&dup.file1, dup.start_line1, function_hashes);
            // Use function start for directive check (directives apply to whole function)
            let check_line = func_start.unwrap_or(dup.start_line1);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        // Check file2 - use the owning function's start line for directive lookup
        if let Some(directives) = directives_map.get(&file2_path) {
            let func_start =
                self.find_owning_function_start(&dup.file2, dup.start_line2, function_hashes);
            // Use function start for directive check (directives apply to whole function)
            let check_line = func_start.unwrap_or(dup.start_line2);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        false
    }

    /// Finds the start line of the function containing a given line
    fn find_owning_function_start(
        &self,
        file: &str,
        line: usize,
        function_hashes: &[FunctionHash],
    ) -> Option<usize> {
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file && fh.start_line <= line && line <= fh.end_line
            })
            .map(|fh| fh.start_line)
    }

    /// Deduplicates overlapping Type-3 matches by keeping only the longest match per region
    ///
    /// Groups matches by (file1, file2, func1_line, func2_line) to handle same-file clones properly.
    /// Merges overlapping regions, keeping the longest match with the highest similarity score.
    /// Overlap requires BOTH source AND target ranges to overlap.
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        // Track which matches have been merged
        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            // Find all overlapping matches (iterate until no more overlaps found)
            // This handles transitive overlaps: A overlaps B, B overlaps C
            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    // Only merge if same function pair (by file path and line number)
                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    // Check if overlapping with CURRENT best_match (not original)
                    // This ensures transitive overlaps are handled correctly
                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        // Keep the match that covers more tokens overall, breaking ties by similarity
                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true; // Need another pass to check against new best
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

    /// Classifies a clone as Type-1 (exact) or Type-2 (renamed)
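    ///
    /// For example, bodies that differ only in spacing (`fn f ( ) { }` vs
    /// `fn f() {}`) collapse to the same string and classify as Type-1, while
    /// renamed identifiers survive the comparison and classify as Type-2.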
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        // Normalize whitespace for comparison (avoid intermediate Vec allocation)
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        // If raw code is identical (ignoring whitespace), it's Type-1 (exact copy)
        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            // Otherwise, it's Type-2 (renamed identifiers/literals)
            CloneType::Type2
        }
    }

    /// Finds clone matches between two functions using a greedy extension algorithm
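    ///
    /// Sketch of the approach: every `min_block_size` window of `func1` is
    /// indexed by its window hash; each window of `func2` probes that index,
    /// verifies candidates token-by-token (hash collisions are possible), and
    /// greedily extends confirmed matches to the right.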
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index all windows in func1
        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        // Search for matches in func2
        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    // Verify exact match using shared utility
                    if hashing::verify_cross_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        // Greedy extension using shared utility
                        let extension = hashing::extend_match(
                            &func1.tokens,
                            &func2.tokens,
                            func1_pos,
                            j,
                            self.min_block_size,
                        );

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0, // Exact match
                        });

                        // Skip ahead
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

    fn add_hashes_to_cache(&self, function_hashes: &[FunctionHash], cache: &mut HashCache) {
        for func_hash in function_hashes {
            let hashes = compute_rolling_hashes(&func_hash.tokens, self.min_block_size);

            for (hash, offset) in hashes {
                let end_token_idx = offset + self.min_block_size;
                let (start_line, end_line) =
                    self.compute_line_span(func_hash, offset, self.min_block_size);

                let location = CodeLocation {
                    file_path: func_hash.file_path.to_string(),
                    start_line,
                    end_line,
                    token_offset: Some(offset),
                    token_length: self.min_block_size,
                    tokens: func_hash.tokens[offset..end_token_idx].to_vec(),
                    raw_source: func_hash.raw_body.clone(),
                };

                cache.add_hash(hash, location);
            }
        }
    }

    /// Build a hash cache from the given paths
    ///
    /// Scans all files and builds a persistent cache of rolling hashes.
    /// This enables fast incremental scanning and git-diff mode.
    pub fn build_cache(&self, paths: Vec<PathBuf>) -> Result<HashCache> {
        let mut cache = HashCache::new(self.min_block_size);

        // Collect all source files
        let source_files = self.collect_source_files(paths)?;

        // Process each file and add to cache
        for file_path in source_files {
            let content = match std::fs::read_to_string(&file_path) {
                Ok(c) => c,
                Err(_) => continue, // Skip files we can't read
            };

            let function_hashes = match self.process_file_content(&file_path, &content) {
                Ok(fh) => fh,
                Err(_) => continue, // Skip files we can't parse
            };

            self.add_hashes_to_cache(&function_hashes, &mut cache);
        }

        Ok(cache)
    }

    /// Scan with cache lookup (for git-diff mode)
    ///
    /// Scans only the changed files, then looks up their hashes in the cache
    /// to find duplicates against the entire codebase.
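    ///
    /// A git-diff-mode sketch (the crate name `dupe_core` and the paths are
    /// illustrative assumptions):
    ///
    /// ```no_run
    /// # use dupe_core::Scanner;
    /// let scanner = Scanner::new();
    /// let mut cache = scanner.build_cache(vec!["src".into()]).unwrap();
    /// let report = scanner
    ///     .scan_with_cache(vec!["src/changed.rs".into()], &mut cache)
    ///     .unwrap();
    /// println!("{} cross-file duplicates", report.duplicates.len());
    /// ```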
1322    pub fn scan_with_cache(
1323        &self,
1324        changed_files: Vec<PathBuf>,
1325        cache: &mut HashCache,
1326    ) -> Result<Report> {
1327        use std::time::Instant;
1328        let start_time = Instant::now();
1329
1330        // Ensure we don't match against stale cache entries
1331        let stale_files = cache.invalidate_stale_files();
1332        let normalize_path =
1333            |path: &Path| path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
1334        let changed_set: HashSet<PathBuf> =
1335            changed_files.iter().map(|p| normalize_path(p)).collect();
1336
1337        if !stale_files.is_empty() {
1338            // Rebuild cache entries that were invalidated so unchanged files
1339            // remain available for lookups.
1340            let stale_paths: Vec<PathBuf> = stale_files
1341                .into_iter()
1342                .filter_map(|path| {
1343                    let raw_path = PathBuf::from(&path);
1344                    let normalized = normalize_path(&raw_path);
1345
1346                    if !normalized.exists() || changed_set.contains(&normalized) {
1347                        return None;
1348                    }
1349
1350                    Some(raw_path)
1351                })
1352                .collect();
1353
1354            if !stale_paths.is_empty() {
1355                let (stale_hashes, _) = self.analyze_files(&stale_paths)?;
1356                self.add_hashes_to_cache(&stale_hashes, cache);
1357            }
1358        }
1359
1360        // Only scan the changed files
1361        let function_hashes_result = self.analyze_files(&changed_files)?;
1362        let (function_hashes, total_lines) = function_hashes_result;
1363
1364        // Find duplicates by looking up in cache
1365        let mut duplicates = Vec::new();
1366        let mut cached_hits_by_file: HashMap<String, Vec<CodeLocation>> = HashMap::new();
1367        let mut cached_function_hashes: Vec<FunctionHash> = Vec::new();
1368
        for func_hash in &function_hashes {
            let hashes = compute_rolling_hashes(&func_hash.tokens, self.min_block_size);

            for (hash, offset) in hashes {
                // Look up this hash in the cache
                if let Some(cached_locations) = cache.lookup(hash) {
                    for cached_loc in cached_locations {
                        // Normalize both paths for comparison (handle relative vs absolute)
                        let changed_file_path = Path::new(func_hash.file_path.as_ref())
                            .canonicalize()
                            .unwrap_or_else(|_| {
                                Path::new(func_hash.file_path.as_ref()).to_path_buf()
                            });
                        let cached_file_path = Path::new(&cached_loc.file_path)
                            .canonicalize()
                            .unwrap_or_else(|_| Path::new(&cached_loc.file_path).to_path_buf());

                        // Skip if same file (we'll find those via normal duplicate detection)
                        if changed_file_path == cached_file_path {
                            continue;
                        }

                        cached_hits_by_file
                            .entry(cached_loc.file_path.clone())
                            .or_default()
                            .push(cached_loc.clone());

                        // Calculate line numbers for the match in the changed file
                        let start_token_idx = offset;
                        let end_token_idx =
                            (offset + self.min_block_size).min(func_hash.tokens.len());

                        let start_line_offset =
                            if start_token_idx < func_hash.token_line_offsets.len() {
                                func_hash.token_line_offsets[start_token_idx]
                            } else {
                                0
                            };

                        let end_line_offset = if end_token_idx > 0
                            && end_token_idx - 1 < func_hash.token_line_offsets.len()
                        {
                            func_hash.token_line_offsets[end_token_idx - 1]
                        } else {
                            start_line_offset
                        };

                        // Record a match only if the windows clear the similarity threshold
                        let similarity = compute_token_similarity(
                            &func_hash.tokens[start_token_idx..end_token_idx],
                            &cached_loc.tokens,
                        );

                        if similarity >= self.similarity_threshold {
                            let clone_type = if func_hash.raw_body == cached_loc.raw_source {
                                CloneType::Type1
                            } else {
                                CloneType::Type2
                            };

                            duplicates.push(DuplicateMatch {
                                file1: func_hash.file_path.to_string(),
                                file2: cached_loc.file_path.clone(),
                                start_line1: func_hash.start_line + start_line_offset,
                                start_line2: cached_loc.start_line,
                                end_line1: Some(func_hash.start_line + end_line_offset),
                                end_line2: Some(cached_loc.end_line),
                                length: self.min_block_size,
                                similarity,
                                hash,
                                clone_type,
                                edit_distance: None,
                                suppressed_by_directive: None,
                                token_offset1: Some(offset),
                                token_offset2: cached_loc.token_offset,
                                target_length: Some(cached_loc.token_length),
                                duplicate_id: None,
                            });
                        }
                    }
                }
            }
        }

        // Run Type-3 detection between changed files and any cached functions that matched hashes
        if self.enable_type3 && !cached_hits_by_file.is_empty() {
            let mut seen_functions: HashSet<(String, usize)> = HashSet::new();

            for locations in cached_hits_by_file.values() {
                for loc in locations {
                    let token_offset = match loc.token_offset {
                        Some(offset) => offset,
                        None => continue,
                    };

                    let normalized_path = normalize_path(Path::new(&loc.file_path));
                    if changed_set.contains(&normalized_path) {
                        continue;
                    }

                    let (tokens, token_line_offsets) = normalize_with_line_numbers(&loc.raw_source);
                    if tokens.len() < self.min_block_size
                        || token_offset >= token_line_offsets.len()
                    {
                        continue;
                    }

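                    // `loc.start_line` points at the matched window, not the
                    // enclosing function; subtracting the window's line offset
                    // within the re-tokenized source recovers the function's
                    // start line, which keeps the (path, line) dedup key stable
                    // no matter which window produced the hit.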
                    let line_offset = token_line_offsets[token_offset];
                    let start_line = loc.start_line.saturating_sub(line_offset);
                    let key = (loc.file_path.clone(), start_line);

                    if !seen_functions.insert(key.clone()) {
                        continue;
                    }

                    let end_line =
                        start_line + token_line_offsets.last().copied().unwrap_or_default();

                    cached_function_hashes.push(FunctionHash {
                        file_path: Arc::<str>::from(key.0),
                        function_name: None,
                        start_byte: 0,
                        end_byte: 0,
                        start_line,
                        end_line,
                        tokens,
                        token_line_offsets,
                        raw_body: loc.raw_source.clone(),
                    });
                }
            }

            if !cached_function_hashes.is_empty() {
                // Type alias for pair deduplication keys (shared with main detection path)
                type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

                let mut seen_pairs: HashSet<SeenPairKey<'_>> = HashSet::new();

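                // Seed the set with the Type-1/2 matches recorded above so the
                // Type-3 pass below does not re-report the same token spans.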
                for dup in &duplicates {
                    if let (Some(offset1), Some(offset2)) = (dup.token_offset1, dup.token_offset2) {
                        if let (Some(func1), Some(func2)) = (
                            function_hashes.iter().find(|fh| {
                                fh.file_path.as_ref() == dup.file1.as_str()
                                    && fh.start_line <= dup.start_line1
                                    && dup.start_line1 <= fh.end_line
                            }),
                            cached_function_hashes.iter().find(|fh| {
                                fh.file_path.as_ref() == dup.file2.as_str()
                                    && fh.start_line <= dup.start_line2
                                    && dup.start_line2 <= fh.end_line
                            }),
                        ) {
                            seen_pairs.insert(canonical_pair_key(
                                func1, func2, offset1, offset2, dup.length,
                            ));
                        }
                    }
                }

                let mut type3_candidates = Vec::new();

                for func1 in &function_hashes {
                    for func2 in &cached_function_hashes {
                        let type3_matches = detect_type3_clones(
                            &func1.tokens,
                            &func2.tokens,
                            self.min_block_size,
                            self.type3_tolerance,
                        );

                        for clone_match in type3_matches {
                            let pair_key = canonical_pair_key(
                                func1,
                                func2,
                                clone_match.source_start,
                                clone_match.target_start,
                                clone_match.length,
                            );

                            if seen_pairs.contains(&pair_key) {
                                continue;
                            }

                            type3_candidates.push((func1, func2, clone_match));
                        }
                    }
                }

                let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

                for (func1, func2, clone_match) in deduplicated {
                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) = self.compute_line_span(
                        func2,
                        clone_match.target_start,
                        clone_match.target_length,
                    );

                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    let window1 = &func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length];
                    let window2 = &func2.tokens[clone_match.target_start
                        ..clone_match.target_start + clone_match.target_length];

                    let edit_dist = hashing::compute_token_edit_distance(window1, window2);
                    let match_hash = Self::compute_match_hash(window1);

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type: CloneType::Type3,
                        edit_distance: Some(edit_dist),
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.target_length),
                        duplicate_id: None,
                    });
                }
            }
        }

        // Also find duplicates within the changed files themselves
        let intra_duplicates = self.find_duplicate_hashes(&function_hashes);
        duplicates.extend(intra_duplicates);

        // Deduplicate matches that repeat the same file pair and start lines
        duplicates.sort_by(|a, b| {
            (&a.file1, &a.file2, a.start_line1, a.start_line2).cmp(&(
                &b.file1,
                &b.file2,
                b.start_line1,
                b.start_line2,
            ))
        });
        duplicates.dedup_by(|a, b| {
            a.file1 == b.file1
                && a.file2 == b.file2
                && a.start_line1 == b.start_line1
                && a.start_line2 == b.start_line2
        });

        // Hydrate function metadata for any files that only exist in the cache so we
        // can compute duplicate IDs and apply directive-based suppression.
        let mut lookup_function_hashes = function_hashes.clone();
        if !cached_function_hashes.is_empty() {
            lookup_function_hashes.extend(cached_function_hashes.clone());
        }
        let hashed_files: HashSet<&str> = lookup_function_hashes
            .iter()
            .map(|fh| fh.file_path.as_ref())
            .collect();

        let mut missing_files: HashSet<String> = HashSet::new();
        for dup in &duplicates {
            if !hashed_files.contains(dup.file1.as_str()) {
                missing_files.insert(dup.file1.clone());
            }
            if !hashed_files.contains(dup.file2.as_str()) {
                missing_files.insert(dup.file2.clone());
            }
        }

        if !missing_files.is_empty() {
            let missing_paths: Vec<PathBuf> = missing_files.iter().map(PathBuf::from).collect();
            let (mut extra_hashes, _) = self.analyze_files(&missing_paths)?;
            lookup_function_hashes.append(&mut extra_hashes);
        }

        // Compute IDs and filter against .polydup-ignore
        self.compute_duplicate_ids(&lookup_function_hashes, &mut duplicates);
        self.filter_ignored_duplicates(&mut duplicates);

        // Apply inline directive filtering for both changed and cached files
        if self.enable_directives && !duplicates.is_empty() {
            let directive_paths: HashSet<PathBuf> = lookup_function_hashes
                .iter()
                .map(|fh| PathBuf::from(fh.file_path.as_ref()))
                .collect();
            let directives_map =
                self.collect_directives(&directive_paths.into_iter().collect::<Vec<_>>());

            if !directives_map.is_empty() {
                self.apply_directive_filtering(
                    &mut duplicates,
                    &directives_map,
                    &lookup_function_hashes,
                );
            }
        }

        // Refresh cache with the newly scanned files so future runs stay incremental
        self.add_hashes_to_cache(&function_hashes, cache);

        // Calculate statistics
        let stats = self.compute_stats(&function_hashes, total_lines, start_time);

        Ok(Report {
            version: None,
            scan_time: None,
            config: None,
            files_scanned: changed_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats,
        })
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new() // Scanner::new is infallible, so this cannot panic
    }
}

/// Public API: Find duplicates in the given file paths
///
/// # Arguments
/// * `paths` - Vector of file paths to scan
///
/// # Returns
/// * `Result<Report>` - Scan report with detected duplicates
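///
/// # Examples
///
/// A minimal usage sketch (marked `ignore` because the crate name is an
/// assumption here, taken from the `dupe_core` directory):
///
/// ```ignore
/// let report = dupe_core::find_duplicates(vec!["src/".to_string()])?;
/// for dup in &report.duplicates {
///     println!(
///         "{}:{} ~ {}:{} ({:.0}% similar)",
///         dup.file1, dup.start_line1, dup.file2, dup.start_line2,
///         dup.similarity * 100.0
///     );
/// }
/// ```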
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

/// Public API: Find duplicates with a custom minimum block size and similarity threshold
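///
/// # Arguments
/// * `paths` - Vector of file paths to scan
/// * `min_block_size` - Minimum window length, in normalized tokens
/// * `similarity_threshold` - Minimum similarity required for a match (e.g. `0.9`)
///
/// # Returns
/// * `Result<Report>` - Scan report with detected duplicates
///
/// # Examples
///
/// Illustrative sketch (marked `ignore`; assumes the crate is named `dupe_core`):
///
/// ```ignore
/// let report = dupe_core::find_duplicates_with_config(
///     vec!["src/".to_string()],
///     30,  // min_block_size
///     0.9, // similarity_threshold
/// )?;
/// println!("{} duplicates found", report.duplicates.len());
/// ```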
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Helper to create a FunctionHash for testing with sequential line offsets
    fn make_test_function(
        file: &str,
        start_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line: start_line + tokens.len(),
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    /// Helper to create a FunctionHash with all tokens on the same line
    fn make_test_function_same_line(
        file: &str,
        start_line: usize,
        end_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = vec![0; tokens.len()];
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line,
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    /// Helper to create simple expression tokens for testing: `keyword id op id ;`
    fn make_expr_tokens(keyword: &str, op: &str) -> Vec<Token> {
        vec![
            Token::Keyword(keyword.into()),
            Token::Identifier,
            Token::Operator(op.into()),
            Token::Identifier,
            Token::Punctuation(";".into()),
        ]
    }

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new(); // Infallible
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_test_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_test_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_test_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_test_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }

    #[test]
    fn test_type3_reports_actual_start_lines_and_token_offsets() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let functions = vec![
            make_test_function_same_line(
                "file_a.rs",
                100,
                105,
                make_expr_tokens("let", "+"),
                "let a = b + c;",
            ),
            make_test_function_same_line(
                "file_b.rs",
                200,
                205,
                make_expr_tokens("mut", "-"),
                "let a = b - c;",
            ),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type3 = duplicates
            .iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .expect("expected a Type-3 duplicate match");

        assert_eq!(
            type3.start_line1, 100,
            "should report the actual source line even when tokens share a line"
        );
        assert_eq!(
            type3.start_line2, 200,
            "should report the actual target line even when tokens share a line"
        );
        assert_eq!(type3.token_offset1, Some(1));
        assert_eq!(type3.token_offset2, Some(1));
    }

    #[test]
    fn type3_duplicate_ids_are_symmetric() {
        use tempfile::TempDir;

        let tokens_a = make_expr_tokens("let", "+");
        // tokens_b has an extra identifier to create a Type-3 (near-miss) clone
        let mut tokens_b = make_expr_tokens("let", "-");
        tokens_b.push(Token::Identifier);

        let func_a = make_test_function("file_a.rs", 10, tokens_a.clone(), "fn file_a.rs() {}");
        let func_b = make_test_function("file_b.rs", 20, tokens_b.clone(), "fn file_b.rs() {}");

        let temp_dir = TempDir::new().unwrap();
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap()
            .with_ignore_manager(IgnoreManager::new(temp_dir.path()));

        let forward = scanner.find_duplicate_hashes(&[func_a.clone(), func_b.clone()]);
        let reverse = scanner.find_duplicate_hashes(&[func_b, func_a]);

        let id_forward = forward
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        let id_reverse = reverse
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        assert_eq!(
            id_forward, id_reverse,
            "Type-3 IDs should not depend on function order"
        );
    }

    #[test]
    fn type3_does_not_report_self_matches() {
        // Regression test for issue #71: Type-3 detection was reporting functions
        // as duplicates of themselves (same file, same line on both sides)
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        // Create two functions in the SAME file with the SAME starting line
        // This simulates the bug where Type-3 matched a function against itself
        let tokens = make_expr_tokens("let", "+");
        let func1 = make_test_function_same_line("same_file.rs", 28, 35, tokens.clone(), "fn a()");
        let func2 = make_test_function_same_line("same_file.rs", 28, 35, tokens, "fn a()");

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);

        // Should NOT report any duplicates since both map to the same file:line
        let self_matches: Vec<_> = duplicates
            .iter()
            .filter(|d| d.file1 == d.file2 && d.start_line1 == d.start_line2)
            .collect();

        assert!(
            self_matches.is_empty(),
            "Type-3 should never report self-matches (same file and line). Found: {:?}",
            self_matches
        );
    }

    #[test]
    fn type3_still_detects_same_file_different_line_duplicates() {
        // Ensure the self-match fix doesn't break legitimate same-file duplicates
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        // Two similar functions in the SAME file but DIFFERENT lines
        let tokens1 = make_expr_tokens("let", "+");
        let mut tokens2 = make_expr_tokens("let", "-");
        tokens2.push(Token::Identifier); // Make it Type-3 (not exact)

        let func1 = make_test_function_same_line("same_file.rs", 10, 15, tokens1, "fn first()");
        let func2 = make_test_function_same_line("same_file.rs", 50, 55, tokens2, "fn second()");

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);

        let same_file_different_line: Vec<_> = duplicates
            .iter()
            .filter(|d| d.file1 == d.file2 && d.start_line1 != d.start_line2)
            .collect();

        assert!(
            !same_file_different_line.is_empty(),
            "Type-3 should still detect duplicates in the same file at different lines"
        );
    }

    #[test]
    fn duplicate_matches_store_actual_end_lines() {
        let scanner = Scanner::with_config(2, 0.85).unwrap();

        let tokens = vec![
            Token::Keyword("fn".into()),
            Token::Identifier,
            Token::Identifier,
            Token::Punctuation("{".into()),
            Token::Punctuation("}".into()),
        ];

        let func1 = FunctionHash {
            file_path: Arc::<str>::from("file_a.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 10,
            end_line: 14,
            tokens: tokens.clone(),
            token_line_offsets: vec![0, 0, 1, 1, 2],
            raw_body: "fn a() {}".to_string(),
        };

        let func2 = FunctionHash {
            file_path: Arc::<str>::from("file_b.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 20,
            end_line: 24,
            tokens,
            token_line_offsets: vec![0, 1, 1, 2, 2],
            raw_body: "fn b() {}".to_string(),
        };

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);
        let dup = duplicates.first().expect("expected a duplicate match");

        assert_eq!(dup.start_line1, 10);
        assert_eq!(dup.start_line2, 20);
        assert_eq!(dup.end_line1, Some(12));
        assert_eq!(dup.end_line2, Some(22));
    }

    #[test]
    fn scan_with_cache_prunes_stale_entries() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");
        let file_b = temp_dir.path().join("b.js");

        let shared_fn = r#"
        function shared() {
          return 1 + 1;
        }
        "#;
        std::fs::write(&file_a, shared_fn).unwrap();
        std::fs::write(&file_b, shared_fn).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = scanner
            .build_cache(vec![file_a.clone(), file_b.clone()])
            .unwrap();

        // Change the non-diff file so its cached hashes are outdated; the sleep
        // pushes past one-second mtime granularity so the rewrite is detectable
        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(&file_b, "const unrelated = 42;\n").unwrap();

        let report = scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.is_empty(),
            "stale cache entries should be invalidated before lookup"
        );
    }

    #[test]
    fn scan_with_cache_repopulates_changed_entries() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");

        let original = r#"
        function shared() {
          return 1 + 1;
        }
        "#;

        let updated = r#"
        function shared() {
          return 7 + 8;
        }
        "#;

        std::fs::write(&file_a, original).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = scanner.build_cache(vec![file_a.clone()]).unwrap();

        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(&file_a, updated).unwrap();

        let file_a_str = file_a.to_string_lossy().to_string();
        assert!(
            cache.file_needs_rescan(&file_a_str),
            "modified files should be considered stale before cache lookup"
        );

        scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        let cached_entries: Vec<&CodeLocation> = cache
            .hash_index
            .values()
            .flat_map(|locs| locs.iter())
            .filter(|loc| loc.file_path == file_a_str)
            .collect();

        assert!(
            !cached_entries.is_empty(),
            "changed files should be added back into the cache after rescan"
        );
        assert!(
            cached_entries
                .iter()
                .any(|loc| loc.raw_source.contains("return 7 + 8;")),
            "cache should contain hashes for the refreshed file contents"
        );
        assert!(
            cache.file_metadata.contains_key(&file_a_str),
            "file metadata should be refreshed after rescanning changed files"
        );
    }

    #[test]
    fn scan_with_cache_rehydrates_stale_unchanged_files() {
        let temp_dir = tempfile::tempdir().unwrap();
        let changed_file = temp_dir.path().join("changed.js");
        let unchanged_file = temp_dir.path().join("unchanged.js");

        let shared_fn = r#"
        function shared() {
          return 1 + 1;
        }
        "#;

        std::fs::write(&changed_file, shared_fn).unwrap();
        std::fs::write(&unchanged_file, shared_fn).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        // Simulate a restored cache where file mtimes no longer match.
        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(
            &changed_file,
            r#"
        function shared() {
          return 1 + 1;
        }
        function another() {
          return 1 + 1;
        }
        "#,
        )
        .unwrap();
        std::fs::write(&unchanged_file, shared_fn).unwrap();

        let report = scanner
            .scan_with_cache(vec![changed_file.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.iter().any(|dup| {
                (dup.file1.ends_with("changed.js") && dup.file2.ends_with("unchanged.js"))
                    || (dup.file1.ends_with("unchanged.js") && dup.file2.ends_with("changed.js"))
            }),
            "invalidated entries should be rebuilt so unchanged files still match against diffs"
        );
    }

    #[test]
    fn scan_with_cache_respects_ignore_file() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");
        let file_b = temp_dir.path().join("b.js");

        let shared_fn = r#"
        function shared() {
          return 1 + 1;
        }
        "#;
        std::fs::write(&file_a, shared_fn).unwrap();
        std::fs::write(&file_b, shared_fn).unwrap();

        let base_scanner = Scanner::with_config(3, 0.85).unwrap();
        let mut cache = base_scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        let initial_report = base_scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();
        assert!(
            !initial_report.duplicates.is_empty(),
            "expected an initial duplicate to seed ignore entries"
        );
        let ignored_ids: Vec<String> = initial_report
            .duplicates
            .iter()
            .map(|d| {
                d.duplicate_id
                    .clone()
                    .expect("expected cache path to compute duplicate IDs")
            })
            .collect();

        let mut manager = IgnoreManager::new(temp_dir.path());
        for id in ignored_ids {
            manager.add_ignore(IgnoreEntry::new(
                id,
                vec![],
                "test ignore".to_string(),
                "tester".to_string(),
            ));
        }

        let scanner = base_scanner.with_ignore_manager(manager);
        let report = scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.is_empty(),
            "duplicates present in .polydup-ignore should be filtered when using cache"
        );
    }

    #[test]
    fn scan_with_cache_uses_symmetric_ids_for_existing_ignores() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file_a = temp_dir.path().join("a.js");
        let file_b = temp_dir.path().join("b.js");

        let shared_fn = r#"
        function shared() {
          return 1 + 1;
        }
        "#;
        std::fs::write(&file_a, shared_fn).unwrap();
        std::fs::write(&file_b, shared_fn).unwrap();

        let base_scanner = Scanner::with_config(7, 0.85).unwrap();
        let mut cache = base_scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        let baseline_report = base_scanner
            .scan(vec![temp_dir.path().to_path_buf()])
            .unwrap();
        let baseline_id = baseline_report
            .duplicates
            .first()
            .and_then(|dup| dup.duplicate_id.clone())
            .expect("expected duplicate IDs from full scans");

        let mut manager = IgnoreManager::new(temp_dir.path());
        manager.add_ignore(IgnoreEntry::new(
            baseline_id,
            vec![],
            "test ignore".to_string(),
            "tester".to_string(),
        ));

        let scanner = base_scanner.with_ignore_manager(manager);
        let report = scanner
            .scan_with_cache(vec![file_a.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.is_empty(),
            "cached scans should honor ignores generated from full scans"
        );
    }

    #[test]
    fn scan_with_cache_respects_directives_from_cached_files() {
        let temp_dir = tempfile::tempdir().unwrap();
        let changed_file = temp_dir.path().join("changed.js");
        let cached_file = temp_dir.path().join("cached.js");

        let suppressed_fn = r#"
        // polydup-ignore: generated code
        function shared() {
          return 1 + 1;
        }
        "#;

        let changed_fn = r#"
        function shared() {
          return 1 + 1;
        }
        "#;

        std::fs::write(&cached_file, suppressed_fn).unwrap();
        std::fs::write(&changed_file, changed_fn).unwrap();

        let scanner = Scanner::with_config(3, 0.85).unwrap().with_directives(true);
        let mut cache = scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        let report = scanner
            .scan_with_cache(vec![changed_file.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.is_empty(),
            "duplicates suppressed by directives in cached files should stay suppressed when using cache"
        );
    }

    #[test]
    fn scan_with_cache_runs_type3_detection_against_cached_files() {
        let temp_dir = tempfile::tempdir().unwrap();
        let changed_file = temp_dir.path().join("changed.js");
        let cached_file = temp_dir.path().join("cached.js");

        let cached_fn = r#"
        function cached() {
          step1();
          step2();
          step3();
          step4();
          step5();
        }
        "#;

        let changed_fn = r#"
        function cached() {
          step1();
          step2();
          insert_gap();
          step3();
          step4();
          step5();
        }
        "#;

        std::fs::write(&cached_file, cached_fn).unwrap();
        std::fs::write(&changed_file, changed_fn).unwrap();

        let scanner = Scanner::with_config(3, 0.8)
            .unwrap()
            .with_type3_detection(0.8)
            .unwrap();
        let mut cache = scanner
            .build_cache(vec![temp_dir.path().to_path_buf()])
            .unwrap();

        let report = scanner
            .scan_with_cache(vec![changed_file.clone()], &mut cache)
            .unwrap();

        assert!(
            report.duplicates.iter().any(|dup| {
                matches!(dup.clone_type, CloneType::Type3)
                    && dup.file1.ends_with("changed.js")
                    && dup.file2.ends_with("cached.js")
            }),
            "Type-3 should run for cached comparisons so near-miss clones surface in git-diff mode"
        );
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}