dupe_core/lib.rs

//! PolyDup Core - Cross-language duplicate code detection engine
//!
//! This library provides the core functionality for detecting duplicate code
//! across Node.js, Python, and Rust codebases using Tree-sitter parsing,
//! Rabin-Karp/MinHash algorithms, and parallel processing.
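//!
//! A minimal quick-start sketch (assuming the crate is named `dupe_core`,
//! per the file path; the scanned path is a placeholder):
//!
//! ```no_run
//! use dupe_core::find_duplicates;
//!
//! # fn main() -> anyhow::Result<()> {
//! // Scan one or more paths; returns a Report with matches and stats.
//! let report = find_duplicates(vec!["src".to_string()])?;
//! println!("{} duplicates found", report.duplicates.len());
//! # Ok(())
//! # }
//! ```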

mod directives;
mod hashing;
mod ignore_rules;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

// Re-export public types
pub use directives::{detect_directives, detect_directives_in_file, Directive, FileDirectives};
pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    compute_window_hash, detect_duplicates_with_extension, detect_type3_clones, extend_match,
    normalize, normalize_with_line_numbers, verify_cross_window_match, CloneMatch, RollingHash,
    Token,
};
pub use ignore_rules::{
    compute_duplicate_id, compute_symmetric_duplicate_id, FileRange, IgnoreEntry, IgnoreManager,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

/// Clone type classification
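///
/// Variants serialize with kebab-case tags, per the `serde(rename)` attributes
/// below. A small sketch using `serde_json`, which this crate already depends on:
///
/// ```no_run
/// # use dupe_core::CloneType;
/// assert_eq!(serde_json::to_string(&CloneType::Type1).unwrap(), "\"type-1\"");
/// ```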
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Type-1: Exact copies (only whitespace/comments differ)
    #[serde(rename = "type-1")]
    Type1,
    /// Type-2: Structurally identical but renamed identifiers/literals
    #[serde(rename = "type-2")]
    Type2,
    /// Type-3: Near-miss clones with gaps or edits (enabled via `with_type3_detection`)
    #[serde(rename = "type-3")]
    Type3,
}

/// Returns `true` when the half-open ranges `[start1, end1)` and `[start2, end2)` overlap
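///
/// A quick illustration (token windows use half-open bounds throughout):
///
/// ```ignore
/// assert!(ranges_overlap(0, 5, 4, 10));  // [0, 5) and [4, 10) share index 4
/// assert!(!ranges_overlap(0, 5, 5, 10)); // adjacent ranges do not overlap
/// ```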
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

/// Key used to deduplicate matches within a file pair:
/// (file1, file2, func1_line, func2_line, source_start, target_start, length).
type SeenPairKey<'a> = (&'a str, &'a str, usize, usize, usize, usize, usize);

// Stable key for deduplicating matches within the same file pair.
fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> SeenPairKey<'a> {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}

/// Represents a detected duplicate code fragment
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    #[serde(skip)]
    pub end_line1: Option<usize>,
    #[serde(skip)]
    pub end_line2: Option<usize>,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    /// Edit distance (Type-3 only). None for Type-1/2
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
    /// Indicates if this duplicate is suppressed by an inline directive
    #[serde(skip_serializing_if = "Option::is_none")]
    pub suppressed_by_directive: Option<bool>,
    /// Token offset within function for file1 (used for ignore ID computation)
    #[serde(skip)]
    token_offset1: Option<usize>,
    /// Token offset within function for file2 (used for ignore ID computation)
    #[serde(skip)]
    token_offset2: Option<usize>,
    /// Token length of the second window (Type-3 may differ from `length`)
    #[serde(skip)]
    target_length: Option<usize>,
    /// Content-based ID for this duplicate (SHA256 of normalized tokens)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duplicate_id: Option<String>,
}

/// Represents a function with its tokens for duplicate detection
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>, // Shared ownership, cheap to clone
    #[allow(dead_code)] // Kept for potential future reporting improvements
    function_name: Option<String>,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    start_byte: usize,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)] // Kept for future detailed reporting
    end_line: usize,
    tokens: Vec<Token>, // Normalized token sequence
    /// Zero-based line offset for each token relative to start_line
    token_line_offsets: Vec<usize>,
    raw_body: String, // Original (unnormalized) function body for Type-1 detection
}

/// Baseline snapshot for comparing duplicate detection across runs
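///
/// Typical round trip (a sketch; the baseline path is a placeholder):
///
/// ```no_run
/// # use dupe_core::{find_duplicates, Baseline};
/// # use std::path::Path;
/// # fn main() -> anyhow::Result<()> {
/// let report = find_duplicates(vec!["src".to_string()])?;
/// let baseline = Baseline::from_duplicates(report.duplicates);
/// baseline.save_to_file(Path::new("baseline.json"))?;
///
/// // Later: load the snapshot and surface only regressions.
/// let baseline = Baseline::load_from_file(Path::new("baseline.json"))?;
/// let current = find_duplicates(vec!["src".to_string()])?;
/// let new_dups = baseline.find_new_duplicates(&current.duplicates);
/// println!("{} new duplicates since baseline", new_dups.len());
/// # Ok(())
/// # }
/// ```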
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    /// Version of the baseline format
    pub version: String,
    /// Timestamp when baseline was created
    pub created_at: String,
    /// Duplicates that existed at baseline time
    pub duplicates: Vec<DuplicateMatch>,
}

impl Baseline {
    /// Create a new baseline from scan results
    pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
        Self {
            version: env!("CARGO_PKG_VERSION").to_string(),
            created_at: chrono::Utc::now().to_rfc3339(),
            duplicates,
        }
    }

    /// Save baseline to a JSON file
    pub fn save_to_file(&self, path: &Path) -> Result<()> {
        let json =
            serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
        fs::write(path, json).context("Failed to write baseline file")?;
        Ok(())
    }

    /// Load baseline from a JSON file
    pub fn load_from_file(path: &Path) -> Result<Self> {
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
        let baseline: Baseline =
            serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
        Ok(baseline)
    }

    /// Compare current duplicates against baseline and return only new ones
    pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
        let baseline_set: std::collections::HashSet<_> =
            self.duplicates.iter().map(duplicate_key).collect();

        current
            .iter()
            .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
            .cloned()
            .collect()
    }
}

/// Create a unique key for a duplicate match for comparison
fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
    // Normalize file order for consistent comparison
    let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
        (
            dup.file1.clone(),
            dup.file2.clone(),
            dup.start_line1,
            dup.start_line2,
        )
    } else {
        (
            dup.file2.clone(),
            dup.file1.clone(),
            dup.start_line2,
            dup.start_line1,
        )
    };
    (file1, file2, line1, line2, dup.length)
}

/// Report containing scan results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// PolyDup version
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    /// Scan start time (ISO 8601)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    /// Configuration used for the scan
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    /// Total number of files scanned
    pub files_scanned: usize,
    /// Total number of functions analyzed
    pub functions_analyzed: usize,
    /// Detected duplicate matches
    pub duplicates: Vec<DuplicateMatch>,
    /// Scan statistics
    pub stats: ScanStats,
}

/// Configuration used for scanning
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    /// Minimum block size in tokens
    pub threshold: usize,
    /// Similarity threshold (0.0 - 1.0)
    pub similarity: f64,
    /// Type-3 detection enabled
    pub type3_enabled: bool,
    /// Paths scanned
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}

/// Statistics from the scanning process
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines of code scanned
    pub total_lines: usize,
    /// Total tokens processed
    pub total_tokens: usize,
    /// Number of unique hashes computed
    pub unique_hashes: usize,
    /// Scan duration in milliseconds
    pub duration_ms: u64,
}

/// Main scanner for detecting duplicates
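///
/// Scanners are configured with a builder-style chain. A sketch (the numeric
/// values are illustrative, not recommendations):
///
/// ```no_run
/// # use dupe_core::Scanner;
/// # fn main() -> anyhow::Result<()> {
/// let _scanner = Scanner::with_config(50, 0.85)? // min tokens, similarity
///     .with_type3_detection(0.8)?                // enable gap-tolerant clones
///     .with_directives(true)                     // honor // polydup-ignore
///     .with_test_files(false);                   // keep test files excluded
/// # Ok(())
/// # }
/// ```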
#[allow(dead_code)] // similarity_threshold reserved for future use
pub struct Scanner {
    /// Minimum code block size to consider (in tokens)
    min_block_size: usize,
    /// Similarity threshold (0.0 - 1.0)
    similarity_threshold: f64,
    /// Glob patterns to exclude from scanning
    exclude_patterns: Vec<String>,
    /// Enable Type-3 (gap-tolerant) clone detection
    enable_type3: bool,
    /// Type-3 similarity tolerance (0.0 - 1.0)
    type3_tolerance: f64,
    /// Ignore manager for filtering false positives
    ignore_manager: Option<IgnoreManager>,
    /// Enable inline directive detection
    enable_directives: bool,
    /// Include test files in scanning (*.test.*, *.spec.*, etc.)
    include_tests: bool,
}

/// Default exclude patterns for test files (excluded unless test scanning is enabled)
fn default_exclude_patterns() -> Vec<String> {
    vec![
        // Test files (excluded by default, enable with --include-tests)
        "**/*.test.ts".to_string(),
        "**/*.test.js".to_string(),
        "**/*.test.tsx".to_string(),
        "**/*.test.jsx".to_string(),
        "**/*.spec.ts".to_string(),
        "**/*.spec.js".to_string(),
        "**/*.spec.tsx".to_string(),
        "**/*.spec.jsx".to_string(),
        "**/__tests__/**".to_string(),
        "**/*.test.py".to_string(),
    ]
}

/// Exclude patterns for build artifacts (always excluded)
fn build_artifact_patterns() -> Vec<String> {
    vec![
        "**/node_modules/**".to_string(),
        "**/target/**".to_string(),
        "**/dist/**".to_string(),
        "**/build/**".to_string(),
        "**/.git/**".to_string(),
    ]
}

impl Scanner {
    /// Creates a new Scanner with default settings
    ///
    /// Construction is infallible: no I/O or fallible setup is performed.
    pub fn new() -> Self {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        }
    }

    /// Creates a new Scanner with custom settings
    ///
    /// Returns `Result` for forward compatibility; construction itself
    /// currently cannot fail.
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        let mut exclude = build_artifact_patterns();
        exclude.extend(default_exclude_patterns());

        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: exclude,
            enable_type3: false,
            type3_tolerance: 0.85,
            ignore_manager: None,
            enable_directives: false,
            include_tests: false,
        })
    }

    /// Sets custom exclude patterns, replacing the defaults
    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    /// Enables test file scanning (removes test file patterns from exclusions)
    pub fn with_test_files(mut self, include: bool) -> Self {
        self.include_tests = include;
        if include {
            // Remove test file patterns from exclusions
            let test_patterns = default_exclude_patterns();
            self.exclude_patterns.retain(|p| !test_patterns.contains(p));
        }
        self
    }

    /// Enables Type-3 clone detection with the specified tolerance
    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(anyhow!("Type-3 tolerance must be between 0.0 and 1.0"));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

    /// Sets the ignore manager for filtering false positives
    pub fn with_ignore_manager(mut self, manager: IgnoreManager) -> Self {
        self.ignore_manager = Some(manager);
        self
    }

    /// Enables inline directive detection (// polydup-ignore comments)
    pub fn with_directives(mut self, enabled: bool) -> Self {
        self.enable_directives = enabled;
        self
    }

    /// Scans the given paths and returns a Report with detected duplicates
    ///
    /// Uses Rayon for parallel file processing:
    /// 1. Read and parse files
    /// 2. Extract functions
    /// 3. Normalize and hash function bodies
    /// 4. Compare hashes to find duplicates
    /// 5. Apply directive-based filtering if enabled
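    ///
    /// A sketch of a direct call (the path is a placeholder):
    ///
    /// ```no_run
    /// # use dupe_core::Scanner;
    /// # use std::path::PathBuf;
    /// # fn main() -> anyhow::Result<()> {
    /// let report = Scanner::new().scan(vec![PathBuf::from("src")])?;
    /// eprintln!(
    ///     "scanned {} files in {} ms",
    ///     report.files_scanned, report.stats.duration_ms
    /// );
    /// # Ok(())
    /// # }
    /// ```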
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Collect all source files (`paths` is not used afterwards, so it is moved)
        let source_files = self.collect_source_files(paths)?;

        // Detect directives if enabled
        let directives_map: HashMap<PathBuf, crate::directives::FileDirectives> =
            if self.enable_directives {
                source_files
                    .par_iter()
                    .filter_map(|path| {
                        crate::directives::detect_directives_in_file(path)
                            .ok()
                            .map(|d| (path.clone(), d))
                    })
                    .collect()
            } else {
                HashMap::new()
            };

        // Process files in parallel using Rayon
        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        // Find duplicates by comparing hashes
        let mut duplicates = self.find_duplicate_hashes(&function_hashes);

        // Apply directive-based filtering
        if self.enable_directives && !directives_map.is_empty() {
            for dup in &mut duplicates {
                let suppressed =
                    self.is_suppressed_by_directive(dup, &directives_map, &function_hashes);
                if suppressed {
                    dup.suppressed_by_directive = Some(true);
                }
            }

            // Filter out suppressed duplicates (they shouldn't appear in reports or fail CI)
            duplicates.retain(|dup| dup.suppressed_by_directive != Some(true));
        }

        // Calculate statistics
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                // Compute rolling hashes just for statistics
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        // Count total lines across all source files
        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            version: None,   // Will be set by CLI
            scan_time: None, // Will be set by CLI
            config: None,    // Will be set by CLI
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

    /// Collects all source files from the given paths
    ///
    /// Uses the `ignore` crate to respect .gitignore, .ignore files,
    /// and common ignore patterns (node_modules, target, etc.)
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                // Use ignore crate's WalkBuilder to respect .gitignore
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true) // Respect .gitignore
                    .git_global(true) // Respect global gitignore
                    .git_exclude(true) // Respect .git/info/exclude
                    .ignore(true) // Respect .ignore files
                    .hidden(false) // Don't skip hidden files (e.g., .config/)
                    .parents(true) // Respect parent .gitignore files
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            // Log but don't fail on individual entry errors
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    /// Checks if a file is a supported source file
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    /// Checks if a file matches any exclude patterns
    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        // Build glob set from exclude patterns (rebuilt on each call)
        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

    /// Processes a single file and returns function hashes
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path).context(format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        // Use Arc<str> for efficient sharing across all functions in this file
        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            // Store both raw body (for Type-1) and normalized tokens (for Type-2)
            let raw_body = func.body.clone();
            let (tokens, token_line_offsets) = normalize_with_line_numbers(&func.body);

            // Skip if too small
            if tokens.len() < self.min_block_size {
                continue;
            }

            // Store the full token sequence for extension-based detection
            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path), // Cheap pointer clone
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                token_line_offsets,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    /// Detects the Tree-sitter Language from file extension
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

    /// Computes the inclusive line span for a token window within a function
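    ///
    /// A worked illustration (offsets as produced by `normalize_with_line_numbers`):
    ///
    /// ```ignore
    /// // start_line = 10, token_line_offsets = [0, 0, 1, 2]
    /// // A window at offset 1 of length 3 covers offsets [0, 1, 2],
    /// // so it spans lines 10 through 12 inclusive.
    /// assert_eq!(scanner.compute_line_span(&func, 1, 3), (10, 12));
    /// ```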
    fn compute_line_span(
        &self,
        func: &FunctionHash,
        start_offset: usize,
        length: usize,
    ) -> (usize, usize) {
        let start_line = func
            .token_line_offsets
            .get(start_offset)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + start_offset);

        let end_index = start_offset + length.saturating_sub(1);
        let end_line = func
            .token_line_offsets
            .get(end_index)
            .map(|offset| func.start_line + offset)
            .unwrap_or(func.start_line + end_index);

        (start_line, end_line)
    }

    /// Finds duplicate code using greedy extension algorithm
    ///
    /// Orchestrates the detection pipeline:
    /// 1. Type-1/2 detection (exact and renamed clones)
    /// 2. Type-3 detection (near-miss clones with gaps)
    /// 3. Duplicate ID computation
    /// 4. Ignore filtering
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        // Shared state for deduplication across Type-1/2 and Type-3
        let mut seen_pairs: std::collections::HashSet<SeenPairKey<'_>> =
            std::collections::HashSet::new();

        // Phase 1: Type-1/2 detection
        let mut duplicates = self.find_type12_duplicates(function_hashes, &mut seen_pairs);

        // Phase 2: Type-3 detection (if enabled)
        if self.enable_type3 {
            self.find_type3_duplicates(function_hashes, &seen_pairs, &mut duplicates);
        }

        // Phase 3: Compute IDs for all duplicates
        self.compute_duplicate_ids(function_hashes, &mut duplicates);

        // Phase 4: Filter out ignored duplicates
        self.filter_ignored_duplicates(&mut duplicates);

        duplicates
    }

    /// Detects Type-1 (exact) and Type-2 (renamed) clones
    ///
    /// Compares all function pairs using hash-based detection with greedy extension.
    fn find_type12_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &mut std::collections::HashSet<SeenPairKey<'a>>,
    ) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Compute hash for reporting
                    let match_hash = Self::compute_match_hash(
                        &func1.tokens[clone_match.source_start
                            ..clone_match.source_start + clone_match.length],
                    );

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    let (actual_start1, actual_end1) =
                        self.compute_line_span(func1, clone_match.source_start, clone_match.length);
                    let (actual_start2, actual_end2) =
                        self.compute_line_span(func2, clone_match.target_start, clone_match.length);

                    // Skip same location (overlapping function boundaries)
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        end_line1: Some(actual_end1),
                        end_line2: Some(actual_end2),
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None,
                        suppressed_by_directive: None,
                        token_offset1: Some(clone_match.source_start),
                        token_offset2: Some(clone_match.target_start),
                        target_length: Some(clone_match.length),
                        duplicate_id: None,
                    });
                }
            }
        }

        duplicates
    }

    /// Detects Type-3 (gap-tolerant) clones using edit distance
    ///
    /// Finds near-miss clones that have insertions, deletions, or modifications.
    fn find_type3_duplicates<'a>(
        &self,
        function_hashes: &'a [FunctionHash],
        seen_pairs: &std::collections::HashSet<SeenPairKey<'a>>,
        duplicates: &mut Vec<DuplicateMatch>,
    ) {
        let mut type3_candidates = Vec::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                let type3_matches = detect_type3_clones(
                    &func1.tokens,
                    &func2.tokens,
                    self.min_block_size,
                    self.type3_tolerance,
                );

                for clone_match in type3_matches {
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }

                    type3_candidates.push((func1, func2, clone_match));
                }
            }
        }

        // Deduplicate overlapping Type-3 matches
        let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

        // Convert to DuplicateMatch
        for (func1, func2, clone_match) in deduplicated {
            let (actual_start1, actual_end1) =
                self.compute_line_span(func1, clone_match.source_start, clone_match.length);
            let (actual_start2, actual_end2) =
                self.compute_line_span(func2, clone_match.target_start, clone_match.target_length);

            let window1 = &func1.tokens
                [clone_match.source_start..clone_match.source_start + clone_match.length];
            let window2 = &func2.tokens
                [clone_match.target_start..clone_match.target_start + clone_match.target_length];
            let edit_dist = hashing::compute_token_edit_distance(window1, window2);

            let match_hash = Self::compute_match_hash(window1);

            duplicates.push(DuplicateMatch {
                file1: func1.file_path.to_string(),
                file2: func2.file_path.to_string(),
                start_line1: actual_start1,
                start_line2: actual_start2,
                end_line1: Some(actual_end1),
                end_line2: Some(actual_end2),
                length: clone_match.length,
                similarity: clone_match.similarity,
                hash: match_hash,
                clone_type: CloneType::Type3,
                edit_distance: Some(edit_dist),
                suppressed_by_directive: None,
                token_offset1: Some(clone_match.source_start),
                token_offset2: Some(clone_match.target_start),
                target_length: Some(clone_match.target_length),
                duplicate_id: None,
            });
        }
    }

    /// Computes content-based IDs for all duplicates
    ///
    /// IDs are SHA256 hashes of normalized tokens, enabling persistent ignore rules.
    fn compute_duplicate_ids(
        &self,
        function_hashes: &[FunctionHash],
        duplicates: &mut [DuplicateMatch],
    ) {
        for dup in duplicates.iter_mut() {
            if dup.duplicate_id.is_some() {
                continue;
            }

            let tokens1 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file1,
                dup.start_line1,
                dup.token_offset1,
                dup.length,
            );

            let tokens2 = self.extract_duplicate_tokens(
                function_hashes,
                &dup.file2,
                dup.start_line2,
                dup.token_offset2,
                dup.target_length.unwrap_or(dup.length),
            );

            if let Some(tokens1) = tokens1 {
                let id = if let Some(tokens2) = tokens2 {
                    ignore_rules::compute_symmetric_duplicate_id(&tokens1, &tokens2)
                } else {
                    ignore_rules::compute_duplicate_id(&tokens1)
                };
                dup.duplicate_id = Some(id);
            }
        }
    }

    /// Extracts normalized token strings for a duplicate region
    fn extract_duplicate_tokens(
        &self,
        function_hashes: &[FunctionHash],
        file: &str,
        reported_start: usize,
        token_offset: Option<usize>,
        length: usize,
    ) -> Option<Vec<String>> {
        let token_offset = token_offset?;
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file
                    && fh.start_line <= reported_start
                    && reported_start <= fh.end_line
            })
            .and_then(|fh| {
                if token_offset + length <= fh.tokens.len() {
                    Some(
                        fh.tokens
                            .iter()
                            .skip(token_offset)
                            .take(length)
                            .map(|t| t.as_hash_string().to_string())
                            .collect(),
                    )
                } else {
                    None
                }
            })
    }

    /// Filters out duplicates that are in the ignore list
    fn filter_ignored_duplicates(&self, duplicates: &mut Vec<DuplicateMatch>) {
        if let Some(ref ignore_manager) = self.ignore_manager {
            duplicates.retain(|dup| {
                if let Some(ref id) = dup.duplicate_id {
                    !ignore_manager.is_ignored(id)
                } else {
                    // If we couldn't compute an ID, keep the duplicate (fail open)
                    true
                }
            });
        }
    }

    /// Computes a hash for a token slice (used for match reporting)
    fn compute_match_hash(tokens: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        tokens.hash(&mut hasher);
        hasher.finish()
    }

    /// Checks if a duplicate is suppressed by an inline directive
    ///
    /// Directives suppress the entire function they're placed before, so we check
    /// if the owning function has a directive, not the duplicate's specific lines.
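    ///
    /// For example, a directive comment in a scanned source file (syntax handled
    /// by the `directives` module) suppresses matches anywhere in the function
    /// below it:
    ///
    /// ```ignore
    /// // polydup-ignore
    /// fn intentional_duplicate() {
    ///     // matches inside this body are suppressed
    /// }
    /// ```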
    fn is_suppressed_by_directive(
        &self,
        dup: &DuplicateMatch,
        directives_map: &HashMap<PathBuf, crate::directives::FileDirectives>,
        function_hashes: &[FunctionHash],
    ) -> bool {
        // Check if either file has a directive suppressing this duplicate
        let file1_path = PathBuf::from(&dup.file1);
        let file2_path = PathBuf::from(&dup.file2);

        // Check file1 - use the owning function's start line for directive lookup
        if let Some(directives) = directives_map.get(&file1_path) {
            let func_start =
                self.find_owning_function_start(&dup.file1, dup.start_line1, function_hashes);
            // Use function start for directive check (directives apply to whole function)
            let check_line = func_start.unwrap_or(dup.start_line1);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        // Check file2 - use the owning function's start line for directive lookup
        if let Some(directives) = directives_map.get(&file2_path) {
            let func_start =
                self.find_owning_function_start(&dup.file2, dup.start_line2, function_hashes);
            // Use function start for directive check (directives apply to whole function)
            let check_line = func_start.unwrap_or(dup.start_line2);

            if directives.is_suppressed(check_line, check_line).is_some() {
                return true;
            }
        }

        false
    }

    /// Finds the start line of the function containing a given line
    fn find_owning_function_start(
        &self,
        file: &str,
        line: usize,
        function_hashes: &[FunctionHash],
    ) -> Option<usize> {
        function_hashes
            .iter()
            .find(|fh| {
                fh.file_path.as_ref() == file && fh.start_line <= line && line <= fh.end_line
            })
            .map(|fh| fh.start_line)
    }

    /// Deduplicates overlapping Type-3 matches by keeping only the longest match per region
    ///
    /// Groups matches by (file1, file2, func1_line, func2_line) to handle same-file clones properly.
    /// Merges overlapping regions, keeping the longest match with the highest similarity score.
    /// Overlap requires BOTH source AND target ranges to overlap.
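    ///
    /// A sketch of the merge rule (candidate values are illustrative):
    ///
    /// ```ignore
    /// // Candidate A: source [10, 60),  target [5, 55),  similarity 0.90
    /// // Candidate B: source [20, 80),  target [15, 75), similarity 0.85
    /// // A and B overlap in both source and target, so they merge;
    /// // B spans more tokens (60 > 50), so B survives despite lower similarity.
    /// ```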
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        // Track which matches have been merged
        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            // Find all overlapping matches (iterate until no more overlaps found)
            // This handles transitive overlaps: A overlaps B, B overlaps C
            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    // Only merge if same function pair (by file path and line number)
                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    // Check if overlapping with CURRENT best_match (not original)
                    // This ensures transitive overlaps are handled correctly
                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        // Keep the match that covers more tokens overall, breaking ties by similarity
                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true; // Need another pass to check against new best
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

    /// Classifies a clone as Type-1 (exact) or Type-2 (renamed)
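    ///
    /// A sketch of the distinction:
    ///
    /// ```ignore
    /// // Type-1: identical once whitespace is removed
    /// classify_clone_type("let a = b;", "let a   =  b;"); // CloneType::Type1
    /// // Type-2: same token structure, different identifiers
    /// classify_clone_type("let a = b;", "let x = y;");    // CloneType::Type2
    /// ```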
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        // Normalize whitespace for comparison (avoid intermediate Vec allocation)
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        // If raw code is identical (ignoring whitespace), it's Type-1 (exact copy)
        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            // Otherwise, it's Type-2 (renamed identifiers/literals)
            CloneType::Type2
        }
    }

    /// Finds clone matches between two functions using extension algorithm
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        let mut matches = Vec::new();

        // Guard: a window of `min_block_size` tokens must fit in both functions,
        // otherwise the window slices below would be out of bounds.
        if func1.tokens.len() < self.min_block_size || func2.tokens.len() < self.min_block_size {
            return matches;
        }

        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index all windows in func1
        let mut i = 0;
        while i <= func1.tokens.len() - self.min_block_size {
            let hash = hashing::compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        // Search for matches in func2
        let mut j = 0;
        while j <= func2.tokens.len() - self.min_block_size {
            let hash = hashing::compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    // Verify exact match using shared utility
                    if hashing::verify_cross_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        // Greedy extension using shared utility
                        let extension = hashing::extend_match(
                            &func1.tokens,
                            &func2.tokens,
                            func1_pos,
                            j,
                            self.min_block_size,
                        );

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0, // Exact match
                        });

                        // Skip ahead
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new() // Infallible; no panic possible
    }
}

/// Public API: Find duplicates in the given file paths
///
/// # Arguments
/// * `paths` - Vector of file paths to scan
///
/// # Returns
/// * `Result<Report>` - Scan report with detected duplicates
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

/// Public API with custom configuration
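///
/// A sketch with a stricter threshold (values are illustrative; crate name
/// assumed to be `dupe_core` per the file path):
///
/// ```no_run
/// # fn main() -> anyhow::Result<()> {
/// let report = dupe_core::find_duplicates_with_config(
///     vec!["src".to_string()],
///     30,  // min block size in tokens
///     0.9, // similarity threshold
/// )?;
/// println!("{} duplicates", report.duplicates.len());
/// # Ok(())
/// # }
/// ```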
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Helper to create a FunctionHash for testing with sequential line offsets
    fn make_test_function(
        file: &str,
        start_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = (0..tokens.len()).collect();
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line: start_line + tokens.len(),
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    /// Helper to create a FunctionHash with all tokens on the same line
    fn make_test_function_same_line(
        file: &str,
        start_line: usize,
        end_line: usize,
        tokens: Vec<Token>,
        raw_body: &str,
    ) -> FunctionHash {
        let token_line_offsets: Vec<usize> = vec![0; tokens.len()];
        FunctionHash {
            file_path: Arc::<str>::from(file),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line,
            end_line,
            tokens,
            token_line_offsets,
            raw_body: raw_body.to_string(),
        }
    }

    /// Helper to create simple expression tokens for testing: keyword id op id ;
    fn make_expr_tokens(keyword: &str, op: &str) -> Vec<Token> {
        vec![
            Token::Keyword(keyword.into()),
            Token::Identifier,
            Token::Operator(op.into()),
            Token::Identifier,
            Token::Punctuation(";".into()),
        ]
    }

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new(); // Infallible
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        // Uses the shared `make_test_function` helper (sequential line offsets).
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_test_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_test_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_test_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_test_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }

    #[test]
    fn test_type3_reports_token_offsets_in_start_lines() {
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap();

        let functions = vec![
            make_test_function_same_line(
                "file_a.rs",
                100,
                105,
                make_expr_tokens("let", "+"),
                "let a = b + c;",
            ),
            make_test_function_same_line(
                "file_b.rs",
                200,
                205,
                make_expr_tokens("mut", "-"),
                "let a = b - c;",
            ),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type3 = duplicates
            .iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .expect("expected a Type-3 duplicate match");

        assert_eq!(
            type3.start_line1, 100,
            "should report the actual source line even when tokens share a line"
        );
        assert_eq!(
            type3.start_line2, 200,
            "should report the actual target line even when tokens share a line"
        );
        assert_eq!(type3.token_offset1, Some(1));
        assert_eq!(type3.token_offset2, Some(1));
    }

    #[test]
    fn type3_duplicate_ids_are_symmetric() {
        use tempfile::TempDir;

        let tokens_a = make_expr_tokens("let", "+");
        // tokens_b has an extra identifier to create a Type-3 (near-miss) clone
        let mut tokens_b = make_expr_tokens("let", "-");
        tokens_b.push(Token::Identifier);

        let func_a = make_test_function("file_a.rs", 10, tokens_a.clone(), "fn file_a.rs() {}");
        let func_b = make_test_function("file_b.rs", 20, tokens_b.clone(), "fn file_b.rs() {}");

        let temp_dir = TempDir::new().unwrap();
        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.75)
            .unwrap()
            .with_ignore_manager(IgnoreManager::new(temp_dir.path()));

        let forward = scanner.find_duplicate_hashes(&[func_a.clone(), func_b.clone()]);
        let reverse = scanner.find_duplicate_hashes(&[func_b, func_a]);

        let id_forward = forward
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        let id_reverse = reverse
            .into_iter()
            .find(|d| matches!(d.clone_type, CloneType::Type3))
            .and_then(|d| d.duplicate_id)
            .expect("expected a Type-3 duplicate ID");

        assert_eq!(
            id_forward, id_reverse,
            "Type-3 IDs should not depend on function order"
        );
    }

    #[test]
    fn duplicate_matches_store_actual_end_lines() {
        let scanner = Scanner::with_config(2, 0.85).unwrap();

        let tokens = vec![
            Token::Keyword("fn".into()),
            Token::Identifier,
            Token::Identifier,
            Token::Punctuation("{".into()),
            Token::Punctuation("}".into()),
        ];

        let func1 = FunctionHash {
            file_path: Arc::<str>::from("file_a.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 10,
            end_line: 14,
            tokens: tokens.clone(),
            token_line_offsets: vec![0, 0, 1, 1, 2],
            raw_body: "fn a() {}".to_string(),
        };

        let func2 = FunctionHash {
            file_path: Arc::<str>::from("file_b.rs"),
            function_name: None,
            start_byte: 0,
            end_byte: 0,
            start_line: 20,
            end_line: 24,
            tokens,
            token_line_offsets: vec![0, 1, 1, 2, 2],
            raw_body: "fn b() {}".to_string(),
        };

        let duplicates = scanner.find_duplicate_hashes(&[func1, func2]);
        let dup = duplicates.first().expect("expected a duplicate match");

        assert_eq!(dup.start_line1, 10);
        assert_eq!(dup.start_line2, 20);
        assert_eq!(dup.end_line1, Some(12));
        assert_eq!(dup.end_line2, Some(22));
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}