dupe_core/
lib.rs

//! PolyDup Core - Cross-language duplicate code detection engine
//!
//! This library provides the core functionality for detecting duplicate code
//! across JavaScript/TypeScript, Python, and Rust codebases using Tree-sitter
//! parsing, Rabin-Karp/MinHash algorithms, and parallel processing.
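//!
//! # Example
//!
//! A minimal sketch of the high-level API (assumes this crate is built as
//! `dupe_core`; the scanned path is hypothetical):
//!
//! ```no_run
//! let report = dupe_core::find_duplicates(vec!["src".to_string()])
//!     .expect("scan failed");
//! println!("{} duplicate(s) found", report.duplicates.len());
//! ```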

mod hashing;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

// Re-export public types
pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    detect_duplicates_with_extension, detect_type3_clones, normalize, CloneMatch, RollingHash,
    Token,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

/// Clone type classification
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Type-1: Exact copies (only whitespace/comments differ)
    #[serde(rename = "type-1")]
    Type1,
    /// Type-2: Structurally identical but renamed identifiers/literals
    #[serde(rename = "type-2")]
    Type2,
    /// Type-3: Near-miss clones with small modifications (opt-in, gap-tolerant detection)
    #[serde(rename = "type-3")]
    Type3,
}

/// Checks whether two ranges overlap
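/// (ranges are half-open: `[0, 5)` and `[4, 8)` overlap, `[0, 5)` and `[5, 8)` do not)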
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

// Stable key for deduplicating matches within the same file pair.
fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}

/// Represents a detected duplicate code fragment
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    /// Edit distance (Type-3 only); `None` for Type-1/2
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
}
102
103/// Represents a function with its tokens for duplicate detection
104#[derive(Debug, Clone)]
105struct FunctionHash {
106    file_path: Arc<str>, // Shared ownership, cheap to clone
107    #[allow(dead_code)] // Kept for potential future reporting improvements
108    function_name: Option<String>,
109    #[allow(dead_code)] // Kept for byte-level analysis in future
110    start_byte: usize,
111    #[allow(dead_code)] // Kept for byte-level analysis in future
112    end_byte: usize,
113    start_line: usize,
114    #[allow(dead_code)] // Kept for future detailed reporting
115    end_line: usize,
116    tokens: Vec<Token>, // Normalized token sequence
117    raw_body: String,   // Original (unnormalized) function body for Type-1 detection
118}

/// Baseline snapshot for comparing duplicate detection across runs
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Baseline {
    /// Version of the baseline format
    pub version: String,
    /// Timestamp when baseline was created
    pub created_at: String,
    /// Duplicates that existed at baseline time
    pub duplicates: Vec<DuplicateMatch>,
}

impl Baseline {
    /// Creates a new baseline from scan results
    pub fn from_duplicates(duplicates: Vec<DuplicateMatch>) -> Self {
        Self {
            version: env!("CARGO_PKG_VERSION").to_string(),
            created_at: chrono::Utc::now().to_rfc3339(),
            duplicates,
        }
    }

    /// Saves the baseline to a JSON file
    pub fn save_to_file(&self, path: &Path) -> Result<()> {
        let json =
            serde_json::to_string_pretty(self).context("Failed to serialize baseline to JSON")?;
        fs::write(path, json).context("Failed to write baseline file")?;
        Ok(())
    }

    /// Loads a baseline from a JSON file
    pub fn load_from_file(path: &Path) -> Result<Self> {
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read baseline file: {}", path.display()))?;
        let baseline: Baseline =
            serde_json::from_str(&content).context("Failed to parse baseline JSON")?;
        Ok(baseline)
    }

    /// Compares current duplicates against the baseline and returns only new ones
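    ///
    /// A minimal sketch (the paths are hypothetical):
    ///
    /// ```no_run
    /// use dupe_core::{find_duplicates, Baseline};
    /// use std::path::Path;
    ///
    /// let baseline = Baseline::load_from_file(Path::new("baseline.json")).unwrap();
    /// let report = find_duplicates(vec!["src".to_string()]).unwrap();
    /// let new_dups = baseline.find_new_duplicates(&report.duplicates);
    /// assert!(new_dups.len() <= report.duplicates.len());
    /// ```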
    pub fn find_new_duplicates(&self, current: &[DuplicateMatch]) -> Vec<DuplicateMatch> {
        let baseline_set: std::collections::HashSet<_> =
            self.duplicates.iter().map(duplicate_key).collect();

        current
            .iter()
            .filter(|dup| !baseline_set.contains(&duplicate_key(dup)))
            .cloned()
            .collect()
    }
}

/// Creates a unique key for a duplicate match for comparison
fn duplicate_key(dup: &DuplicateMatch) -> (String, String, usize, usize, usize) {
    // Normalize file order for consistent comparison
    let (file1, file2, line1, line2) = if dup.file1 < dup.file2 {
        (
            dup.file1.clone(),
            dup.file2.clone(),
            dup.start_line1,
            dup.start_line2,
        )
    } else {
        (
            dup.file2.clone(),
            dup.file1.clone(),
            dup.start_line2,
            dup.start_line1,
        )
    };
    (file1, file2, line1, line2, dup.length)
}

/// Report containing scan results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// PolyDup version
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    /// Scan start time (ISO 8601)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scan_time: Option<String>,
    /// Configuration used for the scan
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config: Option<ScanConfig>,
    /// Total number of files scanned
    pub files_scanned: usize,
    /// Total number of functions analyzed
    pub functions_analyzed: usize,
    /// Detected duplicate matches
    pub duplicates: Vec<DuplicateMatch>,
    /// Scan statistics
    pub stats: ScanStats,
}

/// Configuration used for scanning
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
    /// Minimum block size in tokens
    pub threshold: usize,
    /// Similarity threshold (0.0 - 1.0)
    pub similarity: f64,
    /// Type-3 detection enabled
    pub type3_enabled: bool,
    /// Paths scanned
    #[serde(skip_serializing_if = "Option::is_none")]
    pub paths: Option<Vec<String>>,
}

/// Statistics from the scanning process
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines of code scanned
    pub total_lines: usize,
    /// Total tokens processed
    pub total_tokens: usize,
    /// Number of unique hashes computed
    pub unique_hashes: usize,
    /// Scan duration in milliseconds
    pub duration_ms: u64,
}

/// Main scanner for detecting duplicates
#[allow(dead_code)] // similarity_threshold reserved for future use
pub struct Scanner {
    /// Minimum code block size to consider (in tokens)
    min_block_size: usize,
    /// Similarity threshold (0.0 - 1.0)
    similarity_threshold: f64,
    /// Glob patterns to exclude from scanning
    exclude_patterns: Vec<String>,
    /// Enable Type-3 (gap-tolerant) clone detection
    enable_type3: bool,
    /// Type-3 similarity tolerance (0.0 - 1.0)
    type3_tolerance: f64,
}

impl Scanner {
    /// Creates a new Scanner with default settings
    ///
    /// Construction is infallible: there is no I/O and no fallible allocation involved.
    pub fn new() -> Self {
        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: vec![
                "**/*.test.ts".to_string(),
                "**/*.test.js".to_string(),
                "**/*.test.tsx".to_string(),
                "**/*.test.jsx".to_string(),
                "**/*.spec.ts".to_string(),
                "**/*.spec.js".to_string(),
                "**/*.spec.tsx".to_string(),
                "**/*.spec.jsx".to_string(),
                "**/__tests__/**".to_string(),
                "**/*.test.py".to_string(),
            ],
            enable_type3: false,
            type3_tolerance: 0.85,
        }
    }

    /// Creates a new Scanner with custom settings
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: vec![
                "**/*.test.ts".to_string(),
                "**/*.test.js".to_string(),
                "**/*.test.tsx".to_string(),
                "**/*.test.jsx".to_string(),
                "**/*.spec.ts".to_string(),
                "**/*.spec.js".to_string(),
                "**/*.spec.tsx".to_string(),
                "**/*.spec.jsx".to_string(),
                "**/__tests__/**".to_string(),
                "**/*.test.py".to_string(),
            ],
            enable_type3: false,
            type3_tolerance: 0.85,
        })
    }

    /// Sets custom exclude patterns, replacing the defaults
    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    /// Enables Type-3 clone detection with the specified tolerance
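    ///
    /// A minimal sketch of the builder-style configuration (the pattern and
    /// tolerance below are illustrative):
    ///
    /// ```
    /// use dupe_core::Scanner;
    ///
    /// let scanner = Scanner::new()
    ///     .with_exclude_patterns(vec!["**/generated/**".to_string()])
    ///     .with_type3_detection(0.8)
    ///     .expect("tolerance must be within 0.0..=1.0");
    /// ```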
    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(anyhow!("Type-3 tolerance must be between 0.0 and 1.0"));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

    /// Scans the given paths and returns a Report with detected duplicates
    ///
    /// Uses Rayon for parallel file processing:
    /// 1. Read and parse files
    /// 2. Extract functions
    /// 3. Normalize and hash function bodies
    /// 4. Compare hashes to find duplicates
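    ///
    /// A minimal sketch (the path is hypothetical):
    ///
    /// ```no_run
    /// use dupe_core::Scanner;
    /// use std::path::PathBuf;
    ///
    /// let report = Scanner::new()
    ///     .scan(vec![PathBuf::from("src")])
    ///     .expect("scan failed");
    /// println!("scanned {} files", report.files_scanned);
    /// ```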
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Collect all source files
        let source_files = self.collect_source_files(paths)?;

        // Process files in parallel using Rayon
        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        // Find duplicates by comparing hashes
        let duplicates = self.find_duplicate_hashes(&function_hashes);

        // Calculate statistics
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                // Compute rolling hashes just for statistics
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        // Count total lines across all source files
        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            version: None,   // Will be set by CLI
            scan_time: None, // Will be set by CLI
            config: None,    // Will be set by CLI
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

    /// Collects all source files from the given paths
    ///
    /// Uses the `ignore` crate to respect .gitignore, .ignore files,
    /// and common ignore patterns (node_modules, target, etc.)
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                // Use ignore crate's WalkBuilder to respect .gitignore
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true) // Respect .gitignore
                    .git_global(true) // Respect global gitignore
                    .git_exclude(true) // Respect .git/info/exclude
                    .ignore(true) // Respect .ignore files
                    .hidden(false) // Don't skip hidden files (e.g., .config/)
                    .parents(true) // Respect parent .gitignore files
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            // Log but don't fail on individual entry errors
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    /// Checks if a file is a supported source file
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    /// Checks if a file matches any exclude patterns
    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        // Build glob set from exclude patterns
        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

    /// Processes a single file and returns function hashes
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path).context(format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        // Use Arc<str> for efficient sharing across all functions in this file
        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            // Store both raw body (for Type-1) and normalized tokens (for Type-2)
            let raw_body = func.body.clone();
            let tokens = normalize(&func.body);

            // Skip if too small
            if tokens.len() < self.min_block_size {
                continue;
            }

            // Store the full token sequence for extension-based detection
            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path), // Cheap pointer clone
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    /// Detects the Tree-sitter Language from file extension
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

    /// Finds duplicate code using a greedy extension algorithm
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();
        let mut seen_pairs: std::collections::HashSet<(
            &str,
            &str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )> = std::collections::HashSet::new();

        // Compare each pair of functions
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                // Use extension-based detection on each function's tokens
                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    // Create pair key for deduplication using string slices (no cloning)
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Compute a hash for this match for reporting
                    use std::collections::hash_map::DefaultHasher;
                    use std::hash::{Hash, Hasher};
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    // Determine clone type by comparing raw function bodies
                    // (simplified: compares entire bodies, not just the clone region)
                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    // Calculate actual line numbers within the code
                    let actual_start1 = func1.start_line + clone_match.source_start;
                    let actual_start2 = func2.start_line + clone_match.target_start;

                    // Skip if this is the exact same location (same file, same line)
                    // This can happen with overlapping function boundaries
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None, // Type-1/2 don't have edit distance
                    });
                }
            }
        }

        // Type-3 detection (gap-tolerant) if enabled
        if self.enable_type3 {
            // Collect all Type-3 matches first
            let mut type3_candidates = Vec::new();

            for i in 0..function_hashes.len() {
                for j in (i + 1)..function_hashes.len() {
                    let func1 = &function_hashes[i];
                    let func2 = &function_hashes[j];

                    // Type-3 detection works for both same-file and cross-file clones
                    // (Type-1/2 already handled same-file detection)

                    // Detect Type-3 clones
                    let type3_matches = detect_type3_clones(
                        &func1.tokens,
                        &func2.tokens,
                        self.min_block_size,
                        self.type3_tolerance,
                    );

                    for clone_match in type3_matches {
                        // Check if this overlaps with existing Type-1/2 matches
                        let pair_key = canonical_pair_key(
                            func1,
                            func2,
                            clone_match.source_start,
                            clone_match.target_start,
                            clone_match.length,
                        );

                        if seen_pairs.contains(&pair_key) {
                            continue;
                        }

                        type3_candidates.push((func1, func2, clone_match));
                    }
                }
            }

            // Deduplicate overlapping Type-3 matches
            let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

            // Convert deduplicated matches to DuplicateMatch
            for (func1, func2, clone_match) in deduplicated {
                // Calculate edit distance for Type-3 using our custom implementation
                let window1 = &func1.tokens
                    [clone_match.source_start..clone_match.source_start + clone_match.length];
                let window2 = &func2.tokens[clone_match.target_start
                    ..clone_match.target_start + clone_match.target_length];
                let edit_dist = hashing::compute_token_edit_distance(window1, window2);

                // Compute hash for Type-3 match
                use std::collections::hash_map::DefaultHasher;
                use std::hash::{Hash, Hasher};
                let mut hasher = DefaultHasher::new();
                window1.hash(&mut hasher);
                let match_hash = hasher.finish();

                duplicates.push(DuplicateMatch {
                    file1: func1.file_path.to_string(),
                    file2: func2.file_path.to_string(),
                    start_line1: func1.start_line,
                    start_line2: func2.start_line,
                    length: clone_match.length,
                    similarity: clone_match.similarity,
                    hash: match_hash,
                    clone_type: CloneType::Type3,
                    edit_distance: Some(edit_dist),
                });
            }
        }

        duplicates
    }

    /// Deduplicates overlapping Type-3 matches by keeping only the longest match per region
    ///
    /// Groups matches by (file1, file2, func1_line, func2_line) to handle same-file clones properly.
    /// Merges overlapping regions, keeping the longest match with the highest similarity score.
    /// Overlap requires BOTH source AND target ranges to overlap.
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        // Track which matches have been merged
        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            // Find all overlapping matches (iterate until no more overlaps found)
            // This handles transitive overlaps: A overlaps B, B overlaps C
            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    // Only merge if same function pair (by file path and line number)
                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    // Check if overlapping with CURRENT best_match (not original)
                    // This ensures transitive overlaps are handled correctly
                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        // Keep the match that covers more tokens overall, breaking ties by similarity
                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true; // Need another pass to check against new best
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

    /// Classifies a clone as Type-1 (exact) or Type-2 (renamed)
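    ///
    /// For example, `let x = 1;` vs `let  x  =  1;` classifies as Type-1
    /// (whitespace-only difference), while `let y = 1;` vs `let x = 1;`
    /// falls through to Type-2.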
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        // Normalize whitespace for comparison (avoid intermediate Vec allocation)
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        // If raw code is identical (ignoring whitespace), it's Type-1 (exact copy)
        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            // Otherwise, it's Type-2 (renamed identifiers/literals)
            CloneType::Type2
        }
    }

    /// Finds clone matches between two functions using the extension algorithm
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index all windows in func1
        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        // Search for matches in func2
        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    // Verify exact match
                    if self.verify_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        // Greedy extension
                        let mut extension = 0;
                        while (func1_pos + self.min_block_size + extension < func1.tokens.len())
                            && (j + self.min_block_size + extension < func2.tokens.len())
                            && (func1.tokens[func1_pos + self.min_block_size + extension]
                                == func2.tokens[j + self.min_block_size + extension])
                        {
                            extension += 1;
                        }

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0, // Exact match
                        });

                        // Skip ahead past the extended region
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

    /// Computes the hash for a token window
    ///
    /// Uses a Rabin-Karp polynomial rolling hash with:
    /// - BASE = 257 (prime > 256 to minimize collisions)
    /// - MODULUS = 1e9+7 (large prime for hash space)
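    ///
    /// For per-token hashes `t_0..t_{n-1}`, this evaluates the polynomial in
    /// Horner form: `(((t_0 * BASE + t_1) * BASE + t_2) * BASE + ... + t_{n-1}) mod MODULUS`.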
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        /// Prime base for polynomial rolling hash
        const BASE: u64 = 257;
        /// Large prime modulus to reduce hash collisions
        const MODULUS: u64 = 1_000_000_007;

        let mut hash: u64 = 0;
        for token in window {
            use std::collections::hash_map::DefaultHasher;
            use std::hash::{Hash, Hasher};
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            let token_hash = hasher.finish();
            // Use u128 to prevent overflow before the modulo operation
            let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
            hash = wide_hash as u64;
        }
        hash
    }

    /// Verifies that two token windows are exactly identical
    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
            return false;
        }
        tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new() // Infallible; no panic possible
    }
}

/// Public API: Find duplicates in the given file paths
///
/// # Arguments
/// * `paths` - Vector of file paths to scan
///
/// # Returns
/// * `Result<Report>` - Scan report with detected duplicates
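///
/// # Examples
///
/// A minimal sketch (the path is hypothetical):
///
/// ```no_run
/// let report = dupe_core::find_duplicates(vec!["src".to_string()])
///     .expect("scan failed");
/// for dup in &report.duplicates {
///     println!("{}:{} ~ {}:{}", dup.file1, dup.start_line1, dup.file2, dup.start_line2);
/// }
/// ```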
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

/// Public API with custom configuration
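///
/// A sketch with illustrative values (a 30-token minimum block and a 0.9
/// similarity threshold):
///
/// ```no_run
/// let report = dupe_core::find_duplicates_with_config(
///     vec!["src".to_string()],
///     30,  // min_block_size in tokens
///     0.9, // similarity threshold
/// )
/// .expect("scan failed");
/// ```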
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new(); // Infallible
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        fn make_function(
            file: &str,
            start_line: usize,
            tokens: Vec<Token>,
            raw_body: &str,
        ) -> FunctionHash {
            FunctionHash {
                file_path: Arc::<str>::from(file),
                function_name: None,
                start_byte: 0,
                end_byte: 0,
                start_line,
                end_line: start_line + tokens.len(),
                tokens,
                raw_body: raw_body.to_string(),
            }
        }

        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}
1033}