//! Cross-language duplicate-code detection: parses source files with
//! tree-sitter, tokenizes function bodies, and reports Type-1/2/3 clones.

mod directives;
mod error;
mod hashing;
mod ignore_rules;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

pub use directives::{detect_directives, detect_directives_in_file, Directive, FileDirectives};
pub use error::{PolyDupError, Result};
pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    compute_window_hash, detect_duplicates_with_extension, detect_type3_clones, extend_match,
    normalize, normalize_with_line_numbers, verify_cross_window_match, CloneMatch, RollingHash,
    Token,
};
pub use ignore_rules::{
    compute_duplicate_id, compute_symmetric_duplicate_id, FileRange, IgnoreEntry, IgnoreManager,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::Context;
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

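/// Clone classification used in reports: Type-1 clones are textually
/// identical apart from whitespace, Type-2 clones match after token
/// normalization (e.g. renamed identifiers or literals), and Type-3 clones
/// are near-misses that tolerate small edits.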
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    #[serde(rename = "type-1")]
    Type1,
    #[serde(rename = "type-2")]
    Type2,
    #[serde(rename = "type-3")]
    Type3,
}

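/// Returns `true` when the half-open ranges `[start1, end1)` and
/// `[start2, end2)` overlap.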
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

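/// Builds an order-normalized key (ordered by file path) for a specific match
/// between two functions, so the same match is only recorded once.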
fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}

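/// A reported duplicate between two code locations: files, line spans, token
/// length, similarity, and clone type. Fields marked `#[serde(skip)]` are
/// internal bookkeeping and never serialized.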
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line1: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_line2: Option<usize>,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suppressed_by_directive: Option<bool>,
    #[serde(skip)]
    token_offset1: Option<usize>,
    #[serde(skip)]
    token_offset2: Option<usize>,
    #[serde(skip)]
    target_length: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duplicate_id: Option<String>,
}

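/// Tokenized body of a single function plus its location metadata, used
/// internally while matching token windows between functions.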
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>,
    #[allow(dead_code)]
    function_name: Option<String>,
    #[allow(dead_code)]
    start_byte: usize,
    #[allow(dead_code)]
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)]
    end_line: usize,
    tokens: Vec<Token>,
    token_line_offsets: Vec<usize>,
    raw_body: String,
}

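/// A saved snapshot of known duplicates; later scans can diff against it to
/// report only newly introduced duplicates.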
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    pub version: String,
    pub created_at: String,
    pub duplicates: Vec<DuplicateMatch>,
}

impl Baseline {
    pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
        Self {
            version: env!("CARGO_PKG_VERSION").to_string(),
            created_at: chrono::Utc::now().to_rfc3339(),
            duplicates,
        }
    }

    pub fn save_to_file(&self, path: &Path) -> Result<()> {
        let json =
            serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
        fs::write(path, json).context("Failed to write baseline file")?;
        Ok(())
    }

    pub fn load_from_file(path: &Path) -> Result<Self> {
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
        let baseline: Baseline =
            serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
        Ok(baseline)
    }

    pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
        let baseline_set: std::collections::HashSet<_> =
            self.duplicates.iter().map(duplicate_key).collect();

        current
            .iter()
            .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
            .cloned()
            .collect()
    }
}

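/// Order-independent identity of a duplicate (sorted files and start lines
/// plus length), used when comparing against a baseline.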
fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
    let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
        (
            dup.file1.clone(),
            dup.file2.clone(),
            dup.start_line1,
            dup.start_line2,
        )
    } else {
        (
            dup.file2.clone(),
            dup.file1.clone(),
            dup.start_line2,
            dup.start_line1,
        )
    };
    (file1, file2, line1, line2, dup.length)
}

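/// Result of a scan: optional metadata, counters, the duplicate list, and
/// aggregate statistics.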
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    pub files_scanned: usize,
    pub functions_analyzed: usize,
    pub duplicates: Vec<DuplicateMatch>,
    pub stats: ScanStats,
}

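/// Scanner settings optionally embedded in a report for reproducibility.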
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    pub threshold: usize,
    pub similarity: f64,
    pub type3_enabled: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}

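/// Aggregate counters gathered during a scan.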
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    pub total_lines: usize,
    pub total_tokens: usize,
    pub unique_hashes: usize,
    pub duration_ms: u64,
}

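/// Token-window duplicate-code scanner with configurable block size,
/// similarity threshold, exclude globs, and optional Type-3 detection.
///
/// A minimal usage sketch (the crate name `polydup` and the scanned path are
/// assumptions for illustration):
///
/// ```ignore
/// use polydup::Scanner;
/// use std::path::PathBuf;
///
/// let scanner = Scanner::with_config(50, 0.85).expect("valid config");
/// let report = scanner.scan(vec![PathBuf::from("src")]).expect("scan succeeds");
/// println!("{} duplicates across {} files", report.duplicates.len(), report.files_scanned);
/// ```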
#[allow(dead_code)]
pub struct Scanner {
    min_block_size: usize,
    similarity_threshold: f64,
    exclude_patterns: Vec<String>,
    enable_type3: bool,
    type3_tolerance: f64,
    ignore_manager: Option<IgnoreManager>,
    enable_directives: bool,
    include_tests: bool,
}

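/// Glob patterns for test files that are excluded by default; they can be
/// re-included via `Scanner::with_test_files(true)`.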
fn default_exclude_patterns() -> Vec<String> {
    vec![
        "**/*.test.ts".to_string(),
        "**/*.test.js".to_string(),
        "**/*.test.tsx".to_string(),
        "**/*.test.jsx".to_string(),
        "**/*.spec.ts".to_string(),
        "**/*.spec.js".to_string(),
        "**/*.spec.tsx".to_string(),
        "**/*.spec.jsx".to_string(),
        "**/__tests__/**".to_string(),
        "**/*.test.py".to_string(),
    ]
}

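/// Glob patterns for build artifacts and VCS directories that are always
/// excluded.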
fn build_artifact_patterns() -> Vec<String> {
    vec![
        "**/node_modules/**".to_string(),
        "**/target/**".to_string(),
        "**/dist/**".to_string(),
        "**/build/**".to_string(),
        "**/.git/**".to_string(),
    ]
}

impl Scanner {
    pub fn new() -> Self {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        }
    }

    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        })
    }

    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    pub fn with_test_files(mut self, include: bool) -> Self {
        self.include_tests = include;
        if include {
            let test_patterns = default_exclude_patterns();
            self.exclude_patterns.retain(|p| !test_patterns.contains(p));
        }
        self
    }

    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(PolyDupError::Config(
                "Type-3 tolerance must be between 0.0 and 1.0".to_string(),
            ));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

    pub fn with_ignore_manager(mut self, manager: IgnoreManager) -> Self {
        self.ignore_manager = Some(manager);
        self
    }

    pub fn with_directives(mut self, enabled: bool) -> Self {
        self.enable_directives = enabled;
        self
    }

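    /// Runs the full pipeline: collect source files, parse and tokenize
    /// functions, find duplicate token windows, then apply directive and
    /// ignore-list filtering before assembling the report.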
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        let source_files = self.collect_source_files(paths)?;

        let directives_map = self.collect_directives(&source_files);

        let (function_hashes, total_lines) = self.analyze_files(&source_files)?;

        let mut duplicates = self.find_duplicate_hashes(&function_hashes);

        if self.enable_directives && !directives_map.is_empty() {
            self.apply_directive_filtering(&mut duplicates, &directives_map, &function_hashes);
        }

        let stats = self.compute_stats(&function_hashes, total_lines, start_time);

        Ok(Report {
            version: None,
            scan_time: None,
            config: None,
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats,
        })
    }

    fn collect_directives(
        &self,
        source_files: &[PathBuf],
    ) -> HashMap<PathBuf, crate::directives::FileDirectives> {
        if self.enable_directives {
            source_files
                .par_iter()
                .filter_map(|path| {
                    crate::directives::detect_directives_in_file(path)
                        .ok()
                        .map(|d| (path.clone(), d))
                })
                .collect()
        } else {
            HashMap::new()
        }
    }

    fn analyze_files(&self, source_files: &[PathBuf]) -> Result<(Vec<FunctionHash>, usize)> {
        let results: Vec<Result<(Vec<FunctionHash>, usize)>> = source_files
            .par_iter()
            .map(|path| {
                let content = std::fs::read_to_string(path).map_err(PolyDupError::Io)?;
                let line_count = content.lines().count();

                let hashes = self.process_file_content(path, &content)?;
                Ok((hashes, line_count))
            })
            .collect();

        let mut all_hashes = Vec::new();
        let mut total_lines = 0;

        for res in results {
            match res {
                Ok((hashes, lines)) => {
                    all_hashes.extend(hashes);
                    total_lines += lines;
                }
                Err(_) => {
                    // Files that fail to read or parse are skipped; they simply
                    // contribute no function hashes to the scan.
                }
            }
        }

        Ok((all_hashes, total_lines))
    }

    fn apply_directive_filtering(
        &self,
        duplicates: &mut Vec<DuplicateMatch>,
        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
        function_hashes: &[FunctionHash],
    ) {
        for dup in duplicates.iter_mut() {
            let suppressed = self.is_suppressed_by_directive(dup, directives_map, function_hashes);
            if suppressed {
                dup.suppressed_by_directive = Some(true);
            }
        }

        duplicates.retain(|dup| dup.suppressed_by_directive != Some(true));
    }

    fn compute_stats(
        &self,
        function_hashes: &[FunctionHash],
        total_lines: usize,
        start_time: std::time::Instant,
    ) -> ScanStats {
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in function_hashes {
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        ScanStats {
            total_lines,
            total_tokens,
            unique_hashes,
            duration_ms,
        }
    }

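    /// Gathers supported source files from the given paths, honoring
    /// `.gitignore`-style rules via the `ignore` walker and the scanner's
    /// exclude globs.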
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true)
                    .git_global(true)
                    .git_exclude(true)
                    .ignore(true)
                    .hidden(false)
                    .parents(true)
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

    fn process_file_content(&self, path: &Path, code: &str) -> Result<Vec<FunctionHash>> {
        let lang = self.detect_language(path)?;
        let functions = extract_functions(code, lang)?;

        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            let raw_body = func.body.clone();
            let (tokens, token_line_offsets) = normalize_with_line_numbers(&func.body);

            if tokens.len() < self.min_block_size {
                continue;
            }

            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                token_line_offsets,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| PolyDupError::LanguageDetection(path.to_path_buf()))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(PolyDupError::LanguageNotSupported(ext.to_string())),
        }
    }

    fn compute_line_span(
        &self,
        func: &FunctionHash,
        start_offset: usize,
        length: usize,
    ) -> (usize, usize) {
        let start_line = func
            .token_line_offsets
            .get(start_offset)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + start_offset);

        let end_index = start_offset + length.saturating_sub(1);
        let end_line = func
            .token_line_offsets
            .get(end_index)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + end_index);

        (start_line, end_line)
    }

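    /// Core matching step: finds Type-1/2 duplicates, optionally Type-3
    /// duplicates, assigns content-based duplicate IDs, and drops matches on
    /// the ignore list.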
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

        let mut seen_pairs: std::collections::HashSet<SeenPairKey<'_>> =
            std::collections::HashSet::new();

        let mut duplicates = self.find_type12_duplicates(function_hashes, &mut seen_pairs);

        if self.enable_type3 {
            self.find_type3_duplicates(function_hashes, &seen_pairs, &mut duplicates);
        }

        self.compute_duplicate_ids(function_hashes, &mut duplicates);

        self.filter_ignored_duplicates(&mut duplicates);

        duplicates
    }

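    /// Pairwise comparison of functions using exact window hashes; every
    /// match found here is recorded in `seen_pairs` so Type-3 detection does
    /// not report the same span again.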
    fn find_type12_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &mut std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
    ) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    let match_hash = Self::compute_match_hash(
                        &func1.tokens[clone_match.source_start
                            ..clone_match.source_start + clone_match.length],
                    );

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) =
                        self.compute_line_span(func2, clone_match.target_start, clone_match.length);

                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.length),
                        duplicate_id: None,
                    });
                }
            }
        }

        duplicates
    }

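    /// Near-miss (Type-3) detection: collects candidate matches between every
    /// function pair, skips spans already reported as Type-1/2, deduplicates
    /// overlapping candidates, and appends the survivors to `duplicates`.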
    fn find_type3_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &std::collections::HashSet<(
            &'a str,
            &'a str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )>,
        duplicates: &mut Vec<DuplicateMatch>,
    ) {
        let mut type3_candidates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let type3_matches = detect_type3_clones(
                    &func1.tokens,
                    &func2.tokens,
                    self.min_block_size,
                    self.type3_tolerance,
                );

                for clone_match in type3_matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }

                    type3_candidates.push((func1, func2, clone_match));
                }
            }
        }

        let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

        for (func1, func2, clone_match) in deduplicated {
            let (actual_start1, actual_end1) =
                self.compute_line_span(func1, clone_match.source_start, clone_match.length);
            let (actual_start2, actual_end2) =
                self.compute_line_span(func2, clone_match.target_start, clone_match.target_length);

            if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                continue;
            }

            let window1 = &func1.tokens
                [clone_match.source_start..clone_match.source_start + clone_match.length];
            let window2 = &func2.tokens
                [clone_match.target_start..clone_match.target_start + clone_match.target_length];
            let edit_dist = hashing::compute_token_edit_distance(window1, window2);

            let match_hash = Self::compute_match_hash(window1);

            duplicates.push(DuplicateMatch {
                file1: func1.file_path.to_string(),
                file2: func2.file_path.to_string(),
                start_line1: actual_start1,
                start_line2: actual_start2,
                end_line1: Some(actual_end1),
                end_line2: Some(actual_end2),
                length: clone_match.length,
                similarity: clone_match.similarity,
                hash: match_hash,
                clone_type: CloneType::Type3,
                edit_distance: Some(edit_dist),
                suppressed_by_directive: None,
                token_offset1: Some(clone_match.source_start),
                token_offset2: Some(clone_match.target_start),
                target_length: Some(clone_match.target_length),
                duplicate_id: None,
            });
        }
    }

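    /// Derives a content-based ID for each duplicate so it can be matched
    /// against the ignore list on later runs; the ID is symmetric when both
    /// token windows can be recovered.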
    fn compute_duplicate_ids(
        &self,
        function_hashes: &[FunctionHash],
        duplicates: &mut [DuplicateMatch],
    ) {
        for dup in duplicates.iter_mut() {
            if dup.duplicate_id.is_some() {
                continue;
            }

            let tokens1 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file1,
                dup.start_line1,
                dup.token_offset1,
                dup.length,
            );

            let tokens2 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file2,
                dup.start_line2,
                dup.token_offset2,
                dup.target_length.unwrap_or(dup.length),
            );

            if let Some(tokens1) = tokens1 {
                let id = if let Some(tokens2) = tokens2 {
                    ignore_rules::compute_symmetric_duplicate_id(&tokens1, &tokens2)
                } else {
                    ignore_rules::compute_duplicate_id(&tokens1)
                };
                dup.duplicate_id = Some(id);
            }
        }
    }

    fn extract_duplicate_tokens(
        &self,
        function_hashes: &[FunctionHash],
        file: &str,
        reported_start: usize,
        token_offset: Option<usize>,
        length: usize,
    ) -> Option<Vec<String>> {
        let token_offset = token_offset?;
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file
                    && fh.start_line <= reported_start
                    && reported_start <= fh.end_line
            })
            .and_then(|fh| {
                if token_offset + length <= fh.tokens.len() {
                    Some(
                        fh.tokens
                            .iter()
                            .skip(token_offset)
                            .take(length)
                            .map(|t| t.as_hash_string().to_string())
                            .collect(),
                    )
                } else {
                    None
                }
            })
    }

    fn filter_ignored_duplicates(&self, duplicates: &mut Vec<DuplicateMatch>) {
        if let Some(ref ignore_manager) = self.ignore_manager {
            duplicates.retain(|dup| {
                if let Some(ref id) = dup.duplicate_id {
                    !ignore_manager.is_ignored(id)
                } else {
                    true
                }
            });
        }
    }

    fn compute_match_hash(tokens: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        tokens.hash(&mut hasher);
        hasher.finish()
    }

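    /// Checks whether either side of a duplicate is covered by a suppression
    /// directive in its file; the lookup uses the start line of the owning
    /// function when one can be found, otherwise the reported line.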
    fn is_suppressed_by_directive(
        &self,
        dup: &DuplicateMatch,
        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
        function_hashes: &[FunctionHash],
    ) -> bool {
        let file1_path = PathBuf::from(&dup.file1);
        let file2_path = PathBuf::from(&dup.file2);

        if let Some(directives) = directives_map.get(&file1_path) {
            let func_start =
                self.find_owning_function_start(&dup.file1, dup.start_line1, function_hashes);
            let check_line = func_start.unwrap_or(dup.start_line1);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        if let Some(directives) = directives_map.get(&file2_path) {
            let func_start =
                self.find_owning_function_start(&dup.file2, dup.start_line2, function_hashes);
            let check_line = func_start.unwrap_or(dup.start_line2);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        false
    }

    fn find_owning_function_start(
        &self,
        file: &str,
        line: usize,
        function_hashes: &[FunctionHash],
    ) -> Option<usize> {
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file && fh.start_line <= line && line <= fh.end_line
            })
            .map(|fh| fh.start_line)
    }

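    /// Collapses overlapping Type-3 candidates for the same function pair,
    /// keeping the candidate with the largest span (ties broken by higher
    /// similarity).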
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true;
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

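    /// Distinguishes Type-1 from Type-2: if the two raw bodies are identical
    /// once all whitespace is removed, the clone is Type-1, otherwise Type-2.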
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            CloneType::Type2
        }
    }

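    /// Window-hash matching between two token streams: hash every window of
    /// `min_block_size` tokens in `func1`, probe `func2`'s windows against
    /// that table, verify token-by-token, then greedily extend each match.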
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = hashing::compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    if hashing::verify_cross_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        let extension = hashing::extend_match(
                            &func1.tokens,
                            &func2.tokens,
                            func1_pos,
                            j,
                            self.min_block_size,
                        );

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0,
                        });

                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new()
    }
}

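/// Convenience entry point: scans the given paths with default settings and
/// returns the resulting report.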
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

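/// Like [`find_duplicates`], but with an explicit minimum block size and
/// similarity threshold.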
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn make_test_function(
        file: &str,
        start_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line: start_line + tokens.len(),
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    fn make_test_function_same_line(
        file: &str,
        start_line: usize,
        end_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = vec![0; tokens.len()];
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line,
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    fn make_expr_tokens(keyword: &str, op: &str) -> Vec<Token> {
        vec![
            Token::Keyword(keyword.into()),
            Token::Identifier,
            Token::Operator(op.into()),
            Token::Identifier,
            Token::Punctuation(";".into()),
        ]
    }

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new();
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        fn make_function(
            file: &str,
            start_line: usize,
            tokens: Vec<Token>,
            raw_body: &str,
        ) -> FunctionHash {
            let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
            FunctionHash {
                file_path: Arc::<str>::from(file),
                function_name: None,
                start_byte: 0,
                end_byte: 0,
                start_line,
                end_line: start_line + tokens.len(),
                tokens,
                token_line_offsets,
                raw_body: raw_body.to_string(),
            }
        }

        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }

    #[test]
    fn test_type3_reports_token_offsets_in_start_lines() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let functions = vec![
            make_test_function_same_line(
                "file_a.rs",
                100,
                105,
                make_expr_tokens("let", "+"),
                "let a = b + c;",
            ),
            make_test_function_same_line(
                "file_b.rs",
                200,
                205,
                make_expr_tokens("mut", "-"),
                "let a = b - c;",
            ),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type3 = duplicates
            .iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .expect("expected a Type-3 duplicate match");

        assert_eq!(
            type3.start_line1, 100,
            "should report the actual source line even when tokens share a line"
        );
        assert_eq!(
            type3.start_line2, 200,
            "should report the actual target line even when tokens share a line"
        );
        assert_eq!(type3.token_offset1, Some(1));
        assert_eq!(type3.token_offset2, Some(1));
    }

    #[test]
    fn type3_duplicate_ids_are_symmetric() {
        use tempfile::TempDir;

        let tokens_a = make_expr_tokens("let", "+");
        let mut tokens_b = make_expr_tokens("let", "-");
        tokens_b.push(Token::Identifier);

        let func_a = make_test_function("file_a.rs", 10, tokens_a.clone(), "fn file_a.rs() {}");
        let func_b = make_test_function("file_b.rs", 20, tokens_b.clone(), "fn file_b.rs() {}");

        let temp_dir = TempDir::new().unwrap();
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap()
            .with_ignore_manager(IgnoreManager::new(temp_dir.path()));

        let forward = scanner.find_duplicate_hashes(&[func_a.clone(), func_b.clone()]);
        let reverse = scanner.find_duplicate_hashes(&[func_b, func_a]);

        let id_forward = forward
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        let id_reverse = reverse
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        assert_eq!(
            id_forward, id_reverse,
            "Type-3 IDs should not depend on function order"
        );
    }

    #[test]
    fn type3_does_not_report_self_matches() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let tokens = make_expr_tokens("let", "+");
        let func1 = make_test_function_same_line("same_file.rs", 28, 35, tokens.clone(), "fn a()");
        let func2 = make_test_function_same_line("same_file.rs", 28, 35, tokens, "fn a()");

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);

        let self_matches: Vec<_> = duplicates
            .iter()
            .filter(|d| d.file1 == d.file2 && d.start_line1 == d.start_line2)
            .collect();

        assert!(
            self_matches.is_empty(),
            "Type-3 should never report self-matches (same file and line). Found: {:?}",
            self_matches
        );
    }

    #[test]
    fn type3_still_detects_same_file_different_line_duplicates() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let tokens1 = make_expr_tokens("let", "+");
        let mut tokens2 = make_expr_tokens("let", "-");
        tokens2.push(Token::Identifier);

        let func1 = make_test_function_same_line("same_file.rs", 10, 15, tokens1, "fn first()");
        let func2 = make_test_function_same_line("same_file.rs", 50, 55, tokens2, "fn second()");

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);

        let same_file_different_line: Vec<_> = duplicates
            .iter()
            .filter(|d| d.file1 == d.file2 && d.start_line1 != d.start_line2)
            .collect();

        assert!(
            !same_file_different_line.is_empty(),
            "Type-3 should still detect duplicates in the same file at different lines"
        );
    }

    #[test]
    fn duplicate_matches_store_actual_end_lines() {
        let scanner = Scanner::with_config(2, 0.85).unwrap();

        let tokens = vec![
            Token::Keyword("fn".into()),
            Token::Identifier,
            Token::Identifier,
            Token::Punctuation("{".into()),
            Token::Punctuation("}".into()),
        ];

        let func1 = FunctionHash {
            file_path: Arc::<str>::from("file_a.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 10,
            end_line: 14,
            tokens: tokens.clone(),
            token_line_offsets: vec![0, 0, 1, 1, 2],
            raw_body: "fn a() {}".to_string(),
        };

        let func2 = FunctionHash {
            file_path: Arc::<str>::from("file_b.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 20,
            end_line: 24,
            tokens,
            token_line_offsets: vec![0, 1, 1, 2, 2],
            raw_body: "fn b() {}".to_string(),
        };

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);
        let dup = duplicates.first().expect("expected a duplicate match");

        assert_eq!(dup.start_line1, 10);
        assert_eq!(dup.start_line2, 20);
        assert_eq!(dup.end_line1, Some(12));
        assert_eq!(dup.end_line2, Some(22));
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}