dupe_core/lib.rs

//! PolyDup Core - Cross-language duplicate code detection engine
//!
//! This library provides the core functionality for detecting duplicate code
//! across Node.js, Python, and Rust codebases using Tree-sitter parsing,
//! Rabin-Karp/MinHash algorithms, and parallel processing.
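//!
//! # Example
//!
//! A minimal usage sketch (crate name assumed from the directory, `dupe_core`;
//! the path is illustrative):
//!
//! ```no_run
//! let report = dupe_core::find_duplicates(vec!["./src".to_string()]).unwrap();
//! println!("{} duplicate fragments found", report.duplicates.len());
//! ```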

mod hashing;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

// Re-export public types
pub use hashing::{
    compute_rolling_hashes, detect_duplicates_with_extension, normalize, CloneMatch, RollingHash,
    Token,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use tree_sitter::Language;

/// Represents a detected duplicate code fragment
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    /// Path of the first file containing the fragment
    pub file1: String,
    /// Path of the second file containing the fragment
    pub file2: String,
    /// Start line of the enclosing function in `file1`
    pub start_line1: usize,
    /// Start line of the enclosing function in `file2`
    pub start_line2: usize,
    /// Length of the match in normalized tokens
    pub length: usize,
    /// Similarity score in [0.0, 1.0]; currently always 1.0 (exact token match)
    pub similarity: f64,
    /// Hash of the matched token sequence, used for reporting
    pub hash: u64,
}

/// Represents a function with its tokens for duplicate detection
#[derive(Debug, Clone)]
#[allow(dead_code)] // Some fields used for debugging/future features
struct FunctionHash {
    file_path: String,
    function_name: Option<String>,
    start_byte: usize,
    end_byte: usize,
    start_line: usize,
    end_line: usize,
    tokens: Vec<Token>, // Full token sequence for extension
}

/// Report containing scan results
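///
/// Serializes with serde; a sketch assuming `serde_json` is available as a
/// dependency (not confirmed by this file):
///
/// ```ignore
/// let json = serde_json::to_string_pretty(&report)?;
/// ```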
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// Total number of files scanned
    pub files_scanned: usize,
    /// Total number of functions analyzed
    pub functions_analyzed: usize,
    /// Detected duplicate matches
    pub duplicates: Vec<DuplicateMatch>,
    /// Scan statistics
    pub stats: ScanStats,
}

/// Statistics from the scanning process
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines of code scanned
    pub total_lines: usize,
    /// Total tokens processed
    pub total_tokens: usize,
    /// Number of unique hashes computed
    pub unique_hashes: usize,
    /// Scan duration in milliseconds
    pub duration_ms: u64,
}

/// Main scanner for detecting duplicates
#[allow(dead_code)] // similarity_threshold reserved for future use
pub struct Scanner {
    /// Minimum code block size to consider (in tokens)
    min_block_size: usize,
    /// Similarity threshold (0.0 - 1.0)
    similarity_threshold: f64,
}

impl Scanner {
    /// Creates a new Scanner with default settings
    /// (min block size of 50 tokens, similarity threshold of 0.85)
    pub fn new() -> Result<Self> {
        Ok(Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
        })
    }

    /// Creates a new Scanner with custom settings
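    ///
    /// A usage sketch (values are illustrative, not recommendations):
    ///
    /// ```no_run
    /// // Crate name assumed from the directory, `dupe_core`.
    /// let scanner = dupe_core::Scanner::with_config(30, 0.9).unwrap();
    /// ```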
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        Ok(Self {
            min_block_size,
            similarity_threshold,
        })
    }

    /// Scans the given paths and returns a Report with detected duplicates
    ///
    /// Uses Rayon for parallel file processing:
    /// 1. Read and parse files
    /// 2. Extract functions
    /// 3. Normalize and hash function bodies
    /// 4. Compare hashes to find duplicates
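    ///
    /// A minimal driving sketch (the directory path is illustrative):
    ///
    /// ```no_run
    /// # use std::path::PathBuf;
    /// // Crate name assumed from the directory, `dupe_core`.
    /// let scanner = dupe_core::Scanner::new().unwrap();
    /// let report = scanner.scan(vec![PathBuf::from("./src")]).unwrap();
    /// println!("{} functions analyzed", report.functions_analyzed);
    /// ```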
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Collect all source files
        let source_files = self.collect_source_files(paths)?;

        // Process files in parallel using Rayon; files that fail to read or
        // parse are skipped rather than aborting the scan
        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        // Find duplicates by comparing hashes
        let duplicates = self.find_duplicate_hashes(&function_hashes);

        // Calculate statistics
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                // Compute rolling hashes just for statistics
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        // Count total lines across all source files (re-reads each file from
        // disk; contents are not cached from the parallel pass above)
        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

    /// Collects all source files from the given paths
    ///
    /// Uses the `ignore` crate to respect .gitignore and .ignore files, and
    /// hence commonly ignored directories (node_modules, target, etc.) when
    /// they are listed there.
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                // Use ignore crate's WalkBuilder to respect .gitignore
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true) // Respect .gitignore
                    .git_global(true) // Respect global gitignore
                    .git_exclude(true) // Respect .git/info/exclude
                    .ignore(true) // Respect .ignore files
                    .hidden(false) // Don't skip hidden files (e.g., .config/)
                    .parents(true) // Respect parent .gitignore files
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file() && self.is_supported_file(path) {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            // Log but don't fail on individual entry errors
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    /// Checks if a file is a supported source file
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    /// Processes a single file and returns function hashes
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code =
            fs::read_to_string(path).with_context(|| format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        let file_path = path.to_string_lossy().to_string();
        let mut function_hashes = Vec::new();

        for func in functions {
            // Normalize the function body
            let tokens = normalize(&func.body);

            // Skip if too small
            if tokens.len() < self.min_block_size {
                continue;
            }

            // Store the full token sequence for extension-based detection
            function_hashes.push(FunctionHash {
                file_path: file_path.clone(),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
            });
        }

        Ok(function_hashes)
    }

    /// Detects the Tree-sitter Language from file extension
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            // TypeScript and JSX variants are parsed with the JavaScript grammar
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

    /// Finds duplicate code using a greedy extension algorithm
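    ///
    /// Every cross-file pair of functions is compared, which is O(n^2) in the
    /// number of functions; symmetric pairs are de-duplicated by ordering the
    /// pair key by file path before inserting it into `seen_pairs`.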
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();
        let mut seen_pairs = std::collections::HashSet::new();

        // Compare each pair of functions
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                // Skip if same file
                if func1.file_path == func2.file_path {
                    continue;
                }

                // Use extension-based detection on each function's tokens
                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    // Normalize the pair key by file path so (a, b) and (b, a)
                    // deduplicate to the same entry
                    let pair_key = if func1.file_path < func2.file_path {
                        (
                            func1.file_path.clone(),
                            func2.file_path.clone(),
                            clone_match.source_start,
                            clone_match.target_start,
                            clone_match.length,
                        )
                    } else {
                        (
                            func2.file_path.clone(),
                            func1.file_path.clone(),
                            clone_match.target_start,
                            clone_match.source_start,
                            clone_match.length,
                        )
                    };

                    // `insert` returns false if the key was already present
                    if !seen_pairs.insert(pair_key) {
                        continue;
                    }

                    // Compute a hash for this match for reporting
                    use std::collections::hash_map::DefaultHasher;
                    use std::hash::{Hash, Hasher};
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.clone(),
                        file2: func2.file_path.clone(),
                        // Reported at function granularity, not at the exact
                        // token offset of the match within each function
                        start_line1: func1.start_line,
                        start_line2: func2.start_line,
                        length: clone_match.length,
                        similarity: 1.0, // Exact token match
                        hash: match_hash,
                    });
                }
            }
        }

        duplicates
    }

    /// Finds clone matches between two functions using an extension algorithm
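    ///
    /// Every window of `min_block_size` tokens in `func1` is indexed by hash;
    /// each window of `func2` is probed against that index, candidates are
    /// verified token-by-token (hash collisions are possible), and verified
    /// seeds are extended greedily while the token sequences keep matching.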
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();

        // Guard: process_file only emits functions with at least
        // `min_block_size` tokens, but bail out defensively so the window
        // slices below cannot go out of bounds
        if func1.tokens.len() < self.min_block_size || func2.tokens.len() < self.min_block_size {
            return matches;
        }

        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index all windows in func1
        for i in 0..=(func1.tokens.len() - self.min_block_size) {
            let hash = self.compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
        }

        // Search for matches in func2
        let mut j = 0;
        while j <= func2.tokens.len() - self.min_block_size {
            let hash = self.compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    // Verify the exact match (hash collisions are possible)
                    if self.verify_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        // Greedy extension: grow the seed while tokens keep matching
                        let mut extension = 0;
                        while (func1_pos + self.min_block_size + extension < func1.tokens.len())
                            && (j + self.min_block_size + extension < func2.tokens.len())
                            && (func1.tokens[func1_pos + self.min_block_size + extension]
                                == func2.tokens[j + self.min_block_size + extension])
                        {
                            extension += 1;
                        }

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                        });

                        // Skip past the extended region to limit overlapping reports
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

    /// Computes a polynomial hash for a token window
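    ///
    /// Accumulates `h = (h * BASE + token_hash) % MODULUS` over the window,
    /// i.e. a polynomial hash of the per-token `DefaultHasher` values. Note
    /// that this recomputes each window from scratch rather than rolling the
    /// previous hash forward in O(1).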
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        const BASE: u64 = 257;
        const MODULUS: u64 = 1_000_000_007;

        let mut hash: u64 = 0;
        for token in window {
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            let token_hash = hasher.finish();
            hash = (hash.wrapping_mul(BASE).wrapping_add(token_hash)) % MODULUS;
        }
        hash
    }

    /// Verifies that two token windows are exactly identical
    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
            return false;
        }
        tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new().expect("Failed to initialize default Scanner")
    }
}

/// Public API: Find duplicates in the given paths
///
/// # Arguments
/// * `paths` - Files or directories to scan
///
/// # Returns
/// * `Result<Report>` - Scan report with detected duplicates
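///
/// # Example
///
/// A minimal sketch (crate name assumed from the directory, `dupe_core`):
///
/// ```no_run
/// let report = dupe_core::find_duplicates(vec!["./src".to_string()]).unwrap();
/// for dup in &report.duplicates {
///     println!("{} <-> {} ({} tokens)", dup.file1, dup.file2, dup.length);
/// }
/// ```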
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new()?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

/// Public API with custom configuration
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let scanner = Scanner::new();
        assert!(scanner.is_ok());
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new().unwrap();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new().unwrap();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
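
    // A hedged end-to-end sketch: writes two identical Rust files into a
    // temporary directory and scans them. It only asserts that both files
    // were picked up; whether a duplicate is also reported depends on
    // `normalize` producing at least the default 50-token minimum for this
    // fixture, which this file alone cannot confirm.
    #[test]
    fn test_scan_smoke_two_identical_files() {
        let dir = std::env::temp_dir().join("polydup_scan_smoke");
        let _ = fs::remove_dir_all(&dir);
        fs::create_dir_all(&dir).unwrap();

        let body =
            "fn sample() { let mut total = 0; for i in 0..100 { total += i * i; } println!(\"{}\", total); }";
        fs::write(dir.join("a.rs"), body).unwrap();
        fs::write(dir.join("b.rs"), body).unwrap();

        let report = Scanner::new().unwrap().scan(vec![dir]).unwrap();
        assert_eq!(report.files_scanned, 2);
    }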
}