dupe_core/lib.rs

//! PolyDup Core - Cross-language duplicate code detection engine
//!
//! This library provides the core functionality for detecting duplicate code
//! across Node.js, Python, and Rust codebases using Tree-sitter parsing,
//! Rabin-Karp/MinHash algorithms, and parallel processing.
mod hashing;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

// Re-export public types
pub use hashing::{
    compute_rolling_hashes, detect_duplicates_with_extension, normalize, CloneMatch, RollingHash,
    Token,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

/// Clone type classification
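///
/// Variants serialize to kebab-case tags via serde. A small sketch (hedged:
/// assumes `serde_json` is available as a dependency, hence `ignore`):
///
/// ```ignore
/// let tag = serde_json::to_string(&dupe_core::CloneType::Type1).unwrap();
/// assert_eq!(tag, "\"type-1\"");
/// ```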
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Type-1: Exact copies (only whitespace/comments differ)
    #[serde(rename = "type-1")]
    Type1,
    /// Type-2: Structurally identical but renamed identifiers/literals
    #[serde(rename = "type-2")]
    Type2,
    /// Type-3: Near-miss clones with modifications (not yet implemented)
    #[serde(rename = "type-3")]
    Type3,
}

/// Represents a detected duplicate code fragment
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
}

/// Represents a function with its tokens for duplicate detection
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>, // Shared ownership, cheap to clone
    #[allow(dead_code)] // Kept for potential future reporting improvements
    function_name: Option<String>,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    start_byte: usize,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)] // Kept for future detailed reporting
    end_line: usize,
    tokens: Vec<Token>, // Normalized token sequence
    raw_body: String,   // Original (unnormalized) function body for Type-1 detection
}

/// Report containing scan results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// Total number of files scanned
    pub files_scanned: usize,
    /// Total number of functions analyzed
    pub functions_analyzed: usize,
    /// Detected duplicate matches
    pub duplicates: Vec<DuplicateMatch>,
    /// Scan statistics
    pub stats: ScanStats,
}

/// Statistics from the scanning process
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines of code scanned
    pub total_lines: usize,
    /// Total tokens processed
    pub total_tokens: usize,
    /// Number of unique hashes computed
    pub unique_hashes: usize,
    /// Scan duration in milliseconds
    pub duration_ms: u64,
}

/// Main scanner for detecting duplicates
#[allow(dead_code)] // similarity_threshold reserved for future use
pub struct Scanner {
    /// Minimum code block size to consider (in tokens)
    min_block_size: usize,
    /// Similarity threshold (0.0 - 1.0)
    similarity_threshold: f64,
    /// Glob patterns to exclude from scanning
    exclude_patterns: Vec<String>,
}

impl Scanner {
    /// Creates a new Scanner with default settings
    ///
    /// Infallible: construction performs no I/O and cannot fail.
    pub fn new() -> Self {
        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: vec![
                "**/*.test.ts".to_string(),
                "**/*.test.js".to_string(),
                "**/*.test.tsx".to_string(),
                "**/*.test.jsx".to_string(),
                "**/*.spec.ts".to_string(),
                "**/*.spec.js".to_string(),
                "**/*.spec.tsx".to_string(),
                "**/*.spec.jsx".to_string(),
                "**/__tests__/**".to_string(),
                "**/*.test.py".to_string(),
            ],
        }
    }

    /// Creates a new Scanner with custom settings
    ///
    /// Returns an error if `similarity_threshold` falls outside the
    /// documented 0.0..=1.0 range.
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        anyhow::ensure!(
            (0.0..=1.0).contains(&similarity_threshold),
            "similarity_threshold must be within 0.0..=1.0, got {similarity_threshold}"
        );
        Ok(Self {
            min_block_size,
            similarity_threshold,
            // Reuse the default exclude patterns instead of duplicating the list
            ..Self::new()
        })
    }

    /// Sets custom exclude patterns, replacing the defaults
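    ///
    /// A minimal sketch (hedged: crate name assumed from the directory; the
    /// vendor pattern is illustrative):
    ///
    /// ```
    /// let _scanner = dupe_core::Scanner::new()
    ///     .with_exclude_patterns(vec!["**/vendor/**".to_string()]);
    /// ```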
    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    /// Scans the given paths and returns a Report with detected duplicates
    ///
    /// Uses Rayon for parallel file processing:
    /// 1. Read and parse files
    /// 2. Extract functions
    /// 3. Normalize and hash function bodies
    /// 4. Compare hashes to find duplicates
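    ///
    /// A minimal sketch (hedged: the path is illustrative; `no_run` since
    /// it reads the filesystem):
    ///
    /// ```no_run
    /// use std::path::PathBuf;
    /// let report = dupe_core::Scanner::new()
    ///     .scan(vec![PathBuf::from("src/")])
    ///     .expect("scan failed");
    /// eprintln!("scanned {} files", report.files_scanned);
    /// ```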
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Collect all source files
        let source_files = self.collect_source_files(paths)?;

        // Process files in parallel using Rayon; files that fail to read or
        // parse are skipped rather than aborting the whole scan
        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        // Find duplicates by comparing hashes
        let duplicates = self.find_duplicate_hashes(&function_hashes);

        // Calculate statistics
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                // Compute rolling hashes just for statistics
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        // Count total lines across all source files
        // (note: this re-reads each file after process_file already did)
        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

    /// Collects all source files from the given paths
    ///
    /// Uses the `ignore` crate to respect .gitignore, .ignore files,
    /// and common ignore patterns (node_modules, target, etc.)
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                // Use ignore crate's WalkBuilder to respect .gitignore
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true) // Respect .gitignore
                    .git_global(true) // Respect global gitignore
                    .git_exclude(true) // Respect .git/info/exclude
                    .ignore(true) // Respect .ignore files
                    .hidden(false) // Don't skip hidden files (e.g., .config/)
                    .parents(true) // Respect parent .gitignore files
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            // Log but don't fail on individual entry errors
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    /// Checks if a file is a supported source file
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    /// Checks if a file matches any exclude patterns
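    ///
    /// For example, with the default patterns `src/foo.test.ts` and
    /// `src/__tests__/bar.js` are excluded, while `src/foo.ts` is not.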
    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        // Build glob set from exclude patterns
        // (note: rebuilt on every call; invalid patterns are skipped silently)
        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            // If the set cannot be built, treat nothing as excluded
            false
        }
    }

    /// Processes a single file and returns function hashes
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path)
            .with_context(|| format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        // Use Arc<str> for efficient sharing across all functions in this file
        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            // Store both raw body (for Type-1) and normalized tokens (for Type-2)
            let raw_body = func.body.clone();
            let tokens = normalize(&func.body);

            // Skip if too small
            if tokens.len() < self.min_block_size {
                continue;
            }

            // Store the full token sequence for extension-based detection
            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path), // Cheap pointer clone
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    /// Detects the Tree-sitter Language from file extension
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            // TypeScript variants are parsed with the JavaScript grammar
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

    /// Finds duplicate code using greedy extension algorithm
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();
        let mut seen_pairs = std::collections::HashSet::new();

        // Compare each pair of functions (O(n^2) over extracted functions)
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                // Skip if same file
                if func1.file_path == func2.file_path {
                    continue;
                }

                // Use extension-based detection on each function's tokens
                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    // Create pair key for deduplication using string slices (no cloning)
                    let pair_key = if func1.file_path.as_ref() < func2.file_path.as_ref() {
                        (
                            func1.file_path.as_ref(),
                            func2.file_path.as_ref(),
                            clone_match.source_start,
                            clone_match.target_start,
                            clone_match.length,
                        )
                    } else {
                        (
                            func2.file_path.as_ref(),
                            func1.file_path.as_ref(),
                            clone_match.target_start,
                            clone_match.source_start,
                            clone_match.length,
                        )
                    };

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Compute a hash for this match for reporting
                    use std::collections::hash_map::DefaultHasher;
                    use std::hash::{Hash, Hasher};
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    // Determine clone type by comparing raw function bodies
                    // (simplified: compares entire bodies, not just the clone region)
                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        // Reported lines are the functions' start lines, not the
                        // offset of the matched region within them
                        start_line1: func1.start_line,
                        start_line2: func2.start_line,
                        length: clone_match.length,
                        similarity: 1.0, // Token-for-token exact match
                        hash: match_hash,
                        clone_type,
                    });
                }
            }
        }

        duplicates
    }

    /// Classifies a clone as Type-1 (exact) or Type-2 (renamed)
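    ///
    /// For example, two copies of `fn add(a: i32, b: i32) -> i32 { a + b }`
    /// that differ only in spacing are Type-1; rename `a`/`b` to `x`/`y`
    /// and the pair becomes Type-2.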
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        // Compare whitespace-delimited chunks directly: no intermediate
        // allocation, and `a b` is not conflated with `ab`
        if raw1.split_whitespace().eq(raw2.split_whitespace()) {
            // Raw code is identical ignoring whitespace: Type-1 (exact copy)
            CloneType::Type1
        } else {
            // Otherwise, it's Type-2 (renamed identifiers/literals)
            CloneType::Type2
        }
    }

    /// Finds clone matches between two functions using extension algorithm
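    ///
    /// Worked example (hedged, with an illustrative `min_block_size` of 3):
    /// tokens `[a, b, c, d]` vs `[x, a, b, c, d]` - the window `[a, b, c]`
    /// hashes equally at source position 0 and target position 1,
    /// verification confirms the match, and greedy extension grows it by
    /// one token (`d`) to a total length of 4.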
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index all windows in func1
        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        // Search for matches in func2
        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    // Verify exact match (guards against hash collisions)
                    if self.verify_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        // Greedy extension: grow the match token by token
                        let mut extension = 0;
                        while (func1_pos + self.min_block_size + extension < func1.tokens.len())
                            && (j + self.min_block_size + extension < func2.tokens.len())
                            && (func1.tokens[func1_pos + self.min_block_size + extension]
                                == func2.tokens[j + self.min_block_size + extension])
                        {
                            extension += 1;
                        }

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                        });

                        // Skip ahead past the extension (the trailing `j += 1`
                        // below advances one step further)
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

    /// Computes hash for a token window
    ///
    /// Uses a Rabin-Karp-style polynomial hash (computed per window here,
    /// not rolled incrementally) with:
    /// - BASE = 257 (prime > 256 to minimize collisions)
    /// - MODULUS = 1e9+7 (large prime for hash space)
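    ///
    /// For a window `t_0..t_{n-1}` the hash is the polynomial
    /// `(t_0 * BASE^(n-1) + t_1 * BASE^(n-2) + ... + t_{n-1}) mod MODULUS`,
    /// accumulated left-to-right as `h = (h * BASE + t_i) mod MODULUS`.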
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        /// Prime base for polynomial rolling hash
        const BASE: u64 = 257;
        /// Large prime modulus to reduce hash collisions
        const MODULUS: u64 = 1_000_000_007;

        let mut hash: u64 = 0;
        for token in window {
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            let token_hash = hasher.finish();
            // Use u128 to prevent overflow before modulo operation
            let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
            hash = wide_hash as u64;
        }
        hash
    }

    /// Verifies that two token windows are exactly identical
    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
            return false;
        }
        tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new() // Infallible; cannot panic
    }
}

/// Public API: Find duplicates in the given file paths
///
/// # Arguments
/// * `paths` - Vector of file paths to scan
///
/// # Returns
/// * `Result<Report>` - Scan report with detected duplicates
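///
/// # Examples
///
/// A minimal sketch (hedged: crate name assumed from the directory; the
/// path is illustrative, and `no_run` since it reads the filesystem):
///
/// ```no_run
/// let report = dupe_core::find_duplicates(vec!["src/".to_string()]).unwrap();
/// for dup in &report.duplicates {
///     println!("{} <-> {} ({} tokens)", dup.file1, dup.file2, dup.length);
/// }
/// ```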
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

/// Public API with custom configuration
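///
/// A hedged sketch (values are illustrative):
///
/// ```no_run
/// let report = dupe_core::find_duplicates_with_config(
///     vec!["src/".to_string()],
///     30,  // min_block_size, in tokens
///     0.9, // similarity_threshold
/// )
/// .unwrap();
/// println!("{} functions analyzed", report.functions_analyzed);
/// ```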
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new(); // Infallible
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}