dupe_core/lib.rs

//! PolyDup Core - Cross-language duplicate code detection engine
//!
//! This library provides the core functionality for detecting duplicate code
//! across Node.js, Python, and Rust codebases using Tree-sitter parsing,
//! Rabin-Karp/MinHash algorithms, and parallel processing.
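//!
//! # Example
//!
//! A minimal sketch of the high-level API (the crate name `dupe_core` is
//! assumed from the file path; adjust it to the actual package name):
//!
//! ```no_run
//! let report = dupe_core::find_duplicates(vec!["src".to_string()]).expect("scan failed");
//! println!("{} duplicate(s) found", report.duplicates.len());
//! ```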

mod hashing;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

// Re-export public types
pub use hashing::{
    compute_rolling_hashes, compute_token_edit_distance, compute_token_similarity,
    detect_duplicates_with_extension, detect_type3_clones, normalize, CloneMatch, RollingHash,
    Token,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

/// Clone type classification
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Type-1: Exact copies (only whitespace/comments differ)
    #[serde(rename = "type-1")]
    Type1,
    /// Type-2: Structurally identical but renamed identifiers/literals
    #[serde(rename = "type-2")]
    Type2,
    /// Type-3: Near-miss clones with small modifications (detected when
    /// enabled via `Scanner::with_type3_detection`)
    #[serde(rename = "type-3")]
    Type3,
}

/// Returns `true` if the half-open ranges `[start1, end1)` and `[start2, end2)` overlap
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    start1 < end2 && start2 < end1
}

/// Stable key for deduplicating matches within the same file pair.
fn canonical_pair_key<'a>(
    func1: &'a FunctionHash,
    func2: &'a FunctionHash,
    source_start: usize,
    target_start: usize,
    length: usize,
) -> (&'a str, &'a str, usize, usize, usize, usize, usize) {
    if func1.file_path.as_ref() < func2.file_path.as_ref() {
        (
            func1.file_path.as_ref(),
            func2.file_path.as_ref(),
            func1.start_line,
            func2.start_line,
            source_start,
            target_start,
            length,
        )
    } else {
        (
            func2.file_path.as_ref(),
            func1.file_path.as_ref(),
            func2.start_line,
            func1.start_line,
            target_start,
            source_start,
            length,
        )
    }
}

/// Represents a detected duplicate code fragment
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
    /// Edit distance (Type-3 only); `None` for Type-1/2
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_distance: Option<usize>,
}

/// Represents a function with its tokens for duplicate detection
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>, // Shared ownership, cheap to clone
    #[allow(dead_code)] // Kept for potential future reporting improvements
    function_name: Option<String>,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    start_byte: usize,
    #[allow(dead_code)] // Kept for byte-level analysis in future
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)] // Kept for future detailed reporting
    end_line: usize,
    tokens: Vec<Token>, // Normalized token sequence
    raw_body: String,   // Original (unnormalized) function body for Type-1 detection
}

/// Report containing scan results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// Total number of files scanned
    pub files_scanned: usize,
    /// Total number of functions analyzed
    pub functions_analyzed: usize,
    /// Detected duplicate matches
    pub duplicates: Vec<DuplicateMatch>,
    /// Scan statistics
    pub stats: ScanStats,
}

/// Statistics from the scanning process
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines of code scanned
    pub total_lines: usize,
    /// Total tokens processed
    pub total_tokens: usize,
    /// Number of unique hashes computed
    pub unique_hashes: usize,
    /// Scan duration in milliseconds
    pub duration_ms: u64,
}

/// Main scanner for detecting duplicates
#[allow(dead_code)] // similarity_threshold reserved for future use
pub struct Scanner {
    /// Minimum code block size to consider (in tokens)
    min_block_size: usize,
    /// Similarity threshold (0.0 - 1.0)
    similarity_threshold: f64,
    /// Glob patterns to exclude from scanning
    exclude_patterns: Vec<String>,
    /// Enable Type-3 (gap-tolerant) clone detection
    enable_type3: bool,
    /// Type-3 similarity tolerance (0.0 - 1.0)
    type3_tolerance: f64,
}

impl Scanner {
    /// Creates a new Scanner with default settings
    ///
    /// Infallible: construction performs no I/O and cannot fail.
    pub fn new() -> Self {
        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: vec![
                "**/*.test.ts".to_string(),
                "**/*.test.js".to_string(),
                "**/*.test.tsx".to_string(),
                "**/*.test.jsx".to_string(),
                "**/*.spec.ts".to_string(),
                "**/*.spec.js".to_string(),
                "**/*.spec.tsx".to_string(),
                "**/*.spec.jsx".to_string(),
                "**/__tests__/**".to_string(),
                "**/*.test.py".to_string(),
            ],
            enable_type3: false,
            type3_tolerance: 0.85,
        }
    }

    /// Creates a new Scanner with custom settings
    ///
    /// Returns an error if `similarity_threshold` is outside `0.0..=1.0`.
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&similarity_threshold) {
            return Err(anyhow!("Similarity threshold must be between 0.0 and 1.0"));
        }
        // Reuse the defaults from `new()` instead of duplicating the pattern list
        Ok(Self {
            min_block_size,
            similarity_threshold,
            ..Self::new()
        })
    }

    /// Sets custom exclude patterns, replacing the defaults
    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    /// Enables Type-3 clone detection with the specified tolerance
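    ///
    /// # Example
    ///
    /// A minimal sketch; the tolerance value and path are illustrative:
    ///
    /// ```no_run
    /// # use std::path::PathBuf;
    /// let scanner = dupe_core::Scanner::new()
    ///     .with_type3_detection(0.85)
    ///     .expect("tolerance must be within 0.0..=1.0");
    /// let report = scanner.scan(vec![PathBuf::from("src")]).expect("scan failed");
    /// ```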
    pub fn with_type3_detection(mut self, tolerance: f64) -> Result<Self> {
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(anyhow!("Type-3 tolerance must be between 0.0 and 1.0"));
        }
        self.enable_type3 = true;
        self.type3_tolerance = tolerance;
        Ok(self)
    }

    /// Scans the given paths and returns a Report with detected duplicates
    ///
    /// Uses Rayon for parallel file processing:
    /// 1. Read and parse files
    /// 2. Extract functions
    /// 3. Normalize and hash function bodies
    /// 4. Compare hashes to find duplicates
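    ///
    /// # Example
    ///
    /// A minimal sketch; the directory name is illustrative:
    ///
    /// ```no_run
    /// # use std::path::PathBuf;
    /// let scanner = dupe_core::Scanner::new();
    /// let report = scanner.scan(vec![PathBuf::from("src")]).expect("scan failed");
    /// println!("scanned {} files in {} ms", report.files_scanned, report.stats.duration_ms);
    /// ```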
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        // Collect all source files
        let source_files = self.collect_source_files(paths)?;

        // Process files in parallel using Rayon
        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        // Find duplicates by comparing hashes
        let duplicates = self.find_duplicate_hashes(&function_hashes);

        // Calculate statistics
        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                // Compute rolling hashes just for statistics
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        // Count total lines across all source files
        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

    /// Collects all source files from the given paths
    ///
    /// Uses the `ignore` crate to respect .gitignore, .ignore files,
    /// and common ignore patterns (node_modules, target, etc.)
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                // Use ignore crate's WalkBuilder to respect .gitignore
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true) // Respect .gitignore
                    .git_global(true) // Respect global gitignore
                    .git_exclude(true) // Respect .git/info/exclude
                    .ignore(true) // Respect .ignore files
                    .hidden(false) // Don't skip hidden files (e.g., .config/)
                    .parents(true) // Respect parent .gitignore files
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            // Log but don't fail on individual entry errors
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    /// Checks if a file is a supported source file
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    /// Checks if a file matches any exclude patterns
    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        // Build glob set from exclude patterns
        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

    /// Processes a single file and returns function hashes
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path)
            .with_context(|| format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        // Use Arc<str> for efficient sharing across all functions in this file
        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            // Store both raw body (for Type-1) and normalized tokens (for Type-2)
            let raw_body = func.body.clone();
            let tokens = normalize(&func.body);

            // Skip if too small
            if tokens.len() < self.min_block_size {
                continue;
            }

            // Store the full token sequence for extension-based detection
            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path), // Cheap pointer clone
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    /// Detects the Tree-sitter Language from file extension
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

    /// Finds duplicate code using greedy extension algorithm
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();
        let mut seen_pairs: std::collections::HashSet<(
            &str,
            &str,
            usize,
            usize,
            usize,
            usize,
            usize,
        )> = std::collections::HashSet::new();

        // Compare each pair of functions
        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                // Use extension-based detection on each function's tokens
                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    // Create pair key for deduplication using string slices (no cloning)
                    let pair_key = canonical_pair_key(
                        func1,
                        func2,
                        clone_match.source_start,
                        clone_match.target_start,
                        clone_match.length,
                    );

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Compute a hash for this match for reporting
                    use std::collections::hash_map::DefaultHasher;
                    use std::hash::{Hash, Hasher};
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    // Determine clone type by comparing raw function bodies
                    // (simplified: compares entire bodies, not just the clone region)
                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    // Approximate line numbers by offsetting the function's start
                    // line with the match's token offset (an exact token-to-line
                    // mapping is not tracked)
                    let actual_start1 = func1.start_line + clone_match.source_start;
                    let actual_start2 = func2.start_line + clone_match.target_start;

                    // Skip if this is the exact same location (same file, same line)
                    // This can happen with overlapping function boundaries
                    if func1.file_path == func2.file_path && actual_start1 == actual_start2 {
                        continue;
                    }

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: actual_start1,
                        start_line2: actual_start2,
                        length: clone_match.length,
                        similarity: clone_match.similarity,
                        hash: match_hash,
                        clone_type,
                        edit_distance: None, // Type-1/2 don't have edit distance
                    });
                }
            }
        }

        // Type-3 detection (gap-tolerant) if enabled
        if self.enable_type3 {
            // Collect all Type-3 matches first
            let mut type3_candidates = Vec::new();

            for i in 0..function_hashes.len() {
                for j in (i + 1)..function_hashes.len() {
                    let func1 = &function_hashes[i];
                    let func2 = &function_hashes[j];

                    // Type-3 detection works for both same-file and cross-file clones
                    // (Type-1/2 already handled same-file detection)

                    // Detect Type-3 clones
                    let type3_matches = detect_type3_clones(
                        &func1.tokens,
                        &func2.tokens,
                        self.min_block_size,
                        self.type3_tolerance,
                    );

                    for clone_match in type3_matches {
                        // Check if this overlaps with existing Type-1/2 matches
                        let pair_key = canonical_pair_key(
                            func1,
                            func2,
                            clone_match.source_start,
                            clone_match.target_start,
                            clone_match.length,
                        );

                        if seen_pairs.contains(&pair_key) {
                            continue;
                        }

                        type3_candidates.push((func1, func2, clone_match));
                    }
                }
            }

            // Deduplicate overlapping Type-3 matches
            let deduplicated = self.deduplicate_overlapping_matches(type3_candidates);

            // Convert deduplicated matches to DuplicateMatch
            for (func1, func2, clone_match) in deduplicated {
                // Calculate edit distance for Type-3 using our custom implementation
                let window1 = &func1.tokens
                    [clone_match.source_start..clone_match.source_start + clone_match.length];
                let window2 = &func2.tokens[clone_match.target_start
                    ..clone_match.target_start + clone_match.target_length];
                let edit_dist = hashing::compute_token_edit_distance(window1, window2);

                // Compute hash for Type-3 match
                use std::collections::hash_map::DefaultHasher;
                use std::hash::{Hash, Hasher};
                let mut hasher = DefaultHasher::new();
                window1.hash(&mut hasher);
                let match_hash = hasher.finish();

                duplicates.push(DuplicateMatch {
                    file1: func1.file_path.to_string(),
                    file2: func2.file_path.to_string(),
                    // Type-3 matches are reported at function granularity:
                    // the line numbers point at the start of each function
                    start_line1: func1.start_line,
                    start_line2: func2.start_line,
                    length: clone_match.length,
                    similarity: clone_match.similarity,
                    hash: match_hash,
                    clone_type: CloneType::Type3,
                    edit_distance: Some(edit_dist),
                });
            }
        }

        duplicates
    }

    /// Deduplicates overlapping Type-3 matches by keeping only the longest match per region
    ///
    /// Groups matches by (file1, file2, func1_line, func2_line) to handle same-file clones properly.
    /// Merges overlapping regions, keeping the longest match with the highest similarity score.
    /// Overlap requires BOTH source AND target ranges to overlap.
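    ///
    /// For example, if candidate A covers source tokens 0..60 and target tokens
    /// 0..60 while candidate B covers source 10..80 and target 10..80, both the
    /// source and target ranges overlap, so only B (the longer span) is kept.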
    fn deduplicate_overlapping_matches<'a>(
        &self,
        candidates: Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)>,
    ) -> Vec<(&'a FunctionHash, &'a FunctionHash, CloneMatch)> {
        if candidates.is_empty() {
            return Vec::new();
        }

        // Track which matches have been merged
        let mut used = vec![false; candidates.len()];
        let mut deduplicated = Vec::new();

        for i in 0..candidates.len() {
            if used[i] {
                continue;
            }

            let (func1, func2, current) = &candidates[i];
            let mut best_match = (*func1, *func2, current.clone());
            used[i] = true;

            // Find all overlapping matches (iterate until no more overlaps found)
            // This handles transitive overlaps: A overlaps B, B overlaps C
            let mut found_overlap = true;
            while found_overlap {
                found_overlap = false;

                for j in (i + 1)..candidates.len() {
                    if used[j] {
                        continue;
                    }

                    let (f1, f2, candidate) = &candidates[j];

                    // Only merge if same function pair (by file path and line number)
                    let same_pair = (func1.file_path == f1.file_path
                        && func2.file_path == f2.file_path
                        && func1.start_line == f1.start_line
                        && func2.start_line == f2.start_line)
                        || (func1.file_path == f2.file_path
                            && func2.file_path == f1.file_path
                            && func1.start_line == f2.start_line
                            && func2.start_line == f1.start_line);

                    if !same_pair {
                        continue;
                    }

                    // Check if overlapping with CURRENT best_match (not original)
                    // This ensures transitive overlaps are handled correctly
                    let source_overlap = ranges_overlap(
                        best_match.2.source_start,
                        best_match.2.source_start + best_match.2.length,
                        candidate.source_start,
                        candidate.source_start + candidate.length,
                    );
                    let target_overlap = ranges_overlap(
                        best_match.2.target_start,
                        best_match.2.target_start + best_match.2.target_length,
                        candidate.target_start,
                        candidate.target_start + candidate.target_length,
                    );

                    if source_overlap && target_overlap {
                        let best_span = best_match.2.length.max(best_match.2.target_length);
                        let candidate_span = candidate.length.max(candidate.target_length);

                        // Keep the match that covers more tokens overall, breaking ties by similarity
                        if candidate_span > best_span
                            || (candidate_span == best_span
                                && candidate.similarity > best_match.2.similarity)
                        {
                            best_match = (*f1, *f2, candidate.clone());
                            found_overlap = true; // Need another pass to check against new best
                        }
                        used[j] = true;
                    }
                }
            }

            deduplicated.push(best_match);
        }

        deduplicated
    }

    /// Classifies a clone as Type-1 (exact) or Type-2 (renamed)
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        // Compare whitespace-delimited chunks directly: no intermediate
        // allocation, and token boundaries are preserved (concatenating into
        // a single String would make "ab c" compare equal to "a bc")
        if raw1.split_whitespace().eq(raw2.split_whitespace()) {
            // Raw code is identical up to whitespace: Type-1 (exact copy)
            CloneType::Type1
        } else {
            // Otherwise, Type-2 (renamed identifiers/literals)
            CloneType::Type2
        }
    }

    /// Finds clone matches between two functions using a greedy extension algorithm:
    /// every `min_block_size` token window of `func1` is indexed by its Rabin-Karp
    /// hash, then `func2`'s windows are scanned, hash hits are verified
    /// token-by-token, and each verified match is extended while tokens stay equal.
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index all windows in func1
        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        // Search for matches in func2
        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    // Verify exact match
                    if self.verify_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        // Greedy extension
                        let mut extension = 0;
                        while (func1_pos + self.min_block_size + extension < func1.tokens.len())
                            && (j + self.min_block_size + extension < func2.tokens.len())
                            && (func1.tokens[func1_pos + self.min_block_size + extension]
                                == func2.tokens[j + self.min_block_size + extension])
                        {
                            extension += 1;
                        }

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                            target_length: total_length,
                            similarity: 1.0, // Exact match
                        });

                        // Skip ahead
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

    /// Computes hash for a token window
    ///
    /// Uses Rabin-Karp polynomial rolling hash with:
    /// - BASE = 257 (prime > 256 to minimize collisions)
    /// - MODULUS = 1e9+7 (large prime for hash space)
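    ///
    /// For a window of tokens `t_0..t_{n-1}` with per-token hashes `h_0..h_{n-1}`,
    /// the result is `(h_0 * BASE^(n-1) + h_1 * BASE^(n-2) + ... + h_{n-1}) % MODULUS`,
    /// accumulated one token at a time as `hash = (hash * BASE + h_i) % MODULUS`.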
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        /// Prime base for polynomial rolling hash
        const BASE: u64 = 257;
        /// Large prime modulus to reduce hash collisions
        const MODULUS: u64 = 1_000_000_007;

        let mut hash: u64 = 0;
        for token in window {
            use std::collections::hash_map::DefaultHasher;
            use std::hash::{Hash, Hasher};
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            let token_hash = hasher.finish();
            // Use u128 to prevent overflow before modulo operation
            let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
            hash = wide_hash as u64;
        }
        hash
    }

    /// Verifies that two token windows are exactly identical
    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
            return false;
        }
        tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new() // Infallible; cannot panic
    }
}

/// Public API: Find duplicates in the given paths
///
/// # Arguments
/// * `paths` - Files or directories to scan; directories are walked
///   recursively, respecting `.gitignore`
///
/// # Returns
/// * `Result<Report>` - Scan report with detected duplicates
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

/// Public API with custom configuration
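///
/// # Example
///
/// A minimal sketch; the block size and threshold are illustrative:
///
/// ```no_run
/// let report = dupe_core::find_duplicates_with_config(
///     vec!["src".to_string()],
///     30,  // min_block_size, in tokens
///     0.9, // similarity_threshold
/// )
/// .expect("scan failed");
/// println!("{} duplicate(s) found", report.duplicates.len());
/// ```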
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new(); // Infallible
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_type3_tolerance_validation() {
        assert!(Scanner::new().with_type3_detection(0.9).is_ok());
        assert!(Scanner::new().with_type3_detection(1.2).is_err());
        assert!(Scanner::new().with_type3_detection(-0.1).is_err());
    }

    #[test]
    fn test_type3_not_dropped_when_functions_share_offsets() {
        fn make_function(
            file: &str,
            start_line: usize,
            tokens: Vec<Token>,
            raw_body: &str,
        ) -> FunctionHash {
            FunctionHash {
                file_path: Arc::<str>::from(file),
                function_name: None,
                start_byte: 0,
                end_byte: 0,
                start_line,
                end_line: start_line + tokens.len(),
                tokens,
                raw_body: raw_body.to_string(),
            }
        }

        let scanner = Scanner::with_config(3, 0.85)
            .unwrap()
            .with_type3_detection(0.6)
            .unwrap();

        let type1_tokens = vec![
            Token::Keyword("return".into()),
            Token::NumberLiteral,
            Token::Punctuation(";".into()),
        ];
        let near_tokens_a = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::Identifier,
        ];
        let near_tokens_b = vec![
            Token::Keyword("compute".into()),
            Token::Identifier,
            Token::NumberLiteral,
        ];

        let functions = vec![
            make_function("file_a.rs", 10, type1_tokens.clone(), "return 1;"),
            make_function("file_b.rs", 20, type1_tokens, "return 1;"),
            make_function("file_a.rs", 200, near_tokens_a, "compute(x, y)"),
            make_function("file_b.rs", 300, near_tokens_b, "compute(x, 1)"),
        ];

        let duplicates = scanner.find_duplicate_hashes(&functions);

        let type1_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type1 | CloneType::Type2)
                && d.start_line1 == 10
                && d.start_line2 == 20
        });
        assert!(
            type1_present,
            "expected Type-1/2 match for the first function pair"
        );

        let type3_present = duplicates.iter().any(|d| {
            matches!(d.clone_type, CloneType::Type3) && d.start_line1 == 200 && d.start_line2 == 300
        });
        assert!(
            type3_present,
            "Type-3 match between later functions should not be deduped"
        );

        assert_eq!(
            duplicates.len(),
            2,
            "should keep both the Type-1/2 and Type-3 matches"
        );
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}
935}