mod hashing;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

pub use hashing::{
    compute_rolling_hashes, detect_duplicates_with_extension, normalize, CloneMatch, RollingHash,
    Token,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::Language;

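/// The kind of code clone detected, following the standard clone taxonomy.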
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum CloneType {
    /// Textually identical code, differing only in whitespace and comments.
    #[serde(rename = "type-1")]
    Type1,
    /// Structurally identical code with renamed identifiers or changed literals.
    #[serde(rename = "type-2")]
    Type2,
    /// Near-miss clones with statements added, removed, or modified.
    #[serde(rename = "type-3")]
    Type3,
}

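/// A single duplicated region reported between two files.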
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
    pub clone_type: CloneType,
}

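/// Per-function token stream and location metadata used during matching.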
#[derive(Debug, Clone)]
struct FunctionHash {
    file_path: Arc<str>,
    #[allow(dead_code)]
    function_name: Option<String>,
    #[allow(dead_code)]
    start_byte: usize,
    #[allow(dead_code)]
    end_byte: usize,
    start_line: usize,
    #[allow(dead_code)]
    end_line: usize,
    tokens: Vec<Token>,
    raw_body: String,
}

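/// Summary of a completed scan.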
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// Number of source files scanned.
    pub files_scanned: usize,
    /// Number of functions whose token streams were analyzed.
    pub functions_analyzed: usize,
    /// All cross-file duplicate matches found.
    pub duplicates: Vec<DuplicateMatch>,
    /// Aggregate scan statistics.
    pub stats: ScanStats,
}

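/// Aggregate counters collected while scanning.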
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    /// Total lines across all scanned files.
    pub total_lines: usize,
    /// Total normalized tokens across all analyzed functions.
    pub total_tokens: usize,
    /// Number of distinct rolling-hash window values seen.
    pub unique_hashes: usize,
    /// Wall-clock scan duration in milliseconds.
    pub duration_ms: u64,
}

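/// Scans files and directories for duplicated code across functions.
///
/// A minimal usage sketch (marked `ignore`: the path shown is a placeholder
/// and the example is not compiled as a doctest):
///
/// ```ignore
/// use std::path::PathBuf;
///
/// let scanner = Scanner::new();
/// let report = scanner.scan(vec![PathBuf::from("src")]).unwrap();
/// println!("{} duplicate regions found", report.duplicates.len());
/// ```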
#[allow(dead_code)]
pub struct Scanner {
    /// Minimum clone length, in normalized tokens.
    min_block_size: usize,
    /// Similarity threshold for near-miss matching (currently unused: the
    /// exact-match pipeline always reports similarity 1.0).
    similarity_threshold: f64,
    /// Glob patterns for files to skip.
    exclude_patterns: Vec<String>,
}

impl Scanner {
    /// Creates a scanner with the default configuration: 50-token minimum
    /// blocks, a 0.85 similarity threshold, and test files excluded.
    pub fn new() -> Self {
        Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
            exclude_patterns: Self::default_exclude_patterns(),
        }
    }

    /// Glob patterns for test files, excluded from scans by default. Shared
    /// by `new` and `with_config` so the two constructors stay in sync.
    fn default_exclude_patterns() -> Vec<String> {
        [
            "**/*.test.ts",
            "**/*.test.js",
            "**/*.test.tsx",
            "**/*.test.jsx",
            "**/*.spec.ts",
            "**/*.spec.js",
            "**/*.spec.tsx",
            "**/*.spec.jsx",
            "**/__tests__/**",
            "**/*.test.py",
        ]
        .iter()
        .map(|s| s.to_string())
        .collect()
    }

    /// Creates a scanner with a custom minimum block size (in tokens) and
    /// similarity threshold, keeping the default exclude patterns.
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        Ok(Self {
            min_block_size,
            similarity_threshold,
            exclude_patterns: Self::default_exclude_patterns(),
        })
    }

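    /// Replaces the default exclude patterns with `patterns`.
    ///
    /// A sketch with an illustrative glob (not compiled as a doctest):
    ///
    /// ```ignore
    /// let scanner = Scanner::new()
    ///     .with_exclude_patterns(vec!["**/vendor/**".to_string()]);
    /// ```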
    pub fn with_exclude_patterns(mut self, patterns: Vec<String>) -> Self {
        self.exclude_patterns = patterns;
        self
    }

    /// Walks `paths`, extracts functions from supported source files in
    /// parallel, and reports duplicated token sequences between them.
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        let source_files = self.collect_source_files(paths)?;

        // Tokenize every function in every file, in parallel; unreadable or
        // unparsable files are skipped rather than failing the whole scan.
        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        let duplicates = self.find_duplicate_hashes(&function_hashes);

        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        // Count distinct rolling-hash window values across all functions.
        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        // Re-read the files to count lines; unreadable files are skipped.
        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

    /// Expands the given paths into a flat list of supported source files,
    /// honoring .gitignore rules and the configured exclude patterns.
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) && !self.is_excluded(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true)
                    .git_global(true)
                    .git_exclude(true)
                    .ignore(true)
                    .hidden(false)
                    .parents(true)
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file()
                                && self.is_supported_file(path)
                                && !self.is_excluded(path)
                            {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    /// Returns true for file extensions the scanner can parse.
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

    /// Returns true if `path` matches any configured exclude pattern. The
    /// glob set is rebuilt on every call; patterns that fail to parse are
    /// silently skipped.
    fn is_excluded(&self, path: &Path) -> bool {
        use globset::{Glob, GlobSetBuilder};

        let mut builder = GlobSetBuilder::new();
        for pattern in &self.exclude_patterns {
            if let Ok(glob) = Glob::new(pattern) {
                builder.add(glob);
            }
        }

        if let Ok(glob_set) = builder.build() {
            glob_set.is_match(path)
        } else {
            false
        }
    }

    /// Parses a single file and returns a `FunctionHash` for every function
    /// whose normalized token stream meets the minimum block size.
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path).context(format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        let file_path: Arc<str> = path.to_string_lossy().to_string().into();
        let mut function_hashes = Vec::new();

        for func in functions {
            let raw_body = func.body.clone();
            let tokens = normalize(&func.body);

            // Functions shorter than the minimum block size can never match.
            if tokens.len() < self.min_block_size {
                continue;
            }

            function_hashes.push(FunctionHash {
                file_path: Arc::clone(&file_path),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
                raw_body,
            });
        }

        Ok(function_hashes)
    }

    /// Maps a file extension to its tree-sitter grammar. TypeScript and the
    /// JSX variants are parsed with the JavaScript grammar.
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

    /// Compares every pair of functions from different files and collects
    /// the clone matches between them, deduplicating by (file, offset,
    /// length).
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        let mut duplicates = Vec::new();
        let mut seen_pairs = std::collections::HashSet::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                // Only report cross-file duplication.
                if func1.file_path == func2.file_path {
                    continue;
                }

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    // Order the pair key canonically so (a, b) and (b, a)
                    // dedupe to the same entry.
                    let pair_key = if func1.file_path.as_ref() < func2.file_path.as_ref() {
                        (
                            func1.file_path.as_ref(),
                            func2.file_path.as_ref(),
                            clone_match.source_start,
                            clone_match.target_start,
                            clone_match.length,
                        )
                    } else {
                        (
                            func2.file_path.as_ref(),
                            func1.file_path.as_ref(),
                            clone_match.target_start,
                            clone_match.source_start,
                            clone_match.length,
                        )
                    };

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Hash the matched token slice so identical clones share
                    // a stable identifier in the report.
                    use std::collections::hash_map::DefaultHasher;
                    use std::hash::{Hash, Hasher};
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    let clone_type = self.classify_clone_type(&func1.raw_body, &func2.raw_body);

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.to_string(),
                        file2: func2.file_path.to_string(),
                        start_line1: func1.start_line,
                        start_line2: func2.start_line,
                        length: clone_match.length,
                        // Matches are exact token sequences, so similarity
                        // is always 1.0 here.
                        similarity: 1.0,
                        hash: match_hash,
                        clone_type,
                    });
                }
            }
        }

        duplicates
    }

    /// Distinguishes Type-1 from Type-2 clones: if the two bodies are equal
    /// after stripping whitespace, the copy is exact (Type-1); otherwise the
    /// token streams matched but the text differs (e.g. renamed
    /// identifiers), making it Type-2. Type-3 clones are never produced by
    /// the exact token matching used here.
    fn classify_clone_type(&self, raw1: &str, raw2: &str) -> CloneType {
        let normalized1 = raw1.split_whitespace().collect::<String>();
        let normalized2 = raw2.split_whitespace().collect::<String>();

        if normalized1 == normalized2 {
            CloneType::Type1
        } else {
            CloneType::Type2
        }
    }

    /// Finds matching token windows between two functions. Every window of
    /// `min_block_size` tokens in `func1` is indexed by hash; windows of
    /// `func2` are then probed against the index, verified token-by-token
    /// to rule out hash collisions, and greedily extended to maximal length.
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index every window start in func1 by its window hash.
        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        // Probe func2's windows against the index.
        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    if self.verify_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        // Extend the match past the initial window while the
                        // token streams keep agreeing.
                        let mut extension = 0;
                        while (func1_pos + self.min_block_size + extension < func1.tokens.len())
                            && (j + self.min_block_size + extension < func2.tokens.len())
                            && (func1.tokens[func1_pos + self.min_block_size + extension]
                                == func2.tokens[j + self.min_block_size + extension])
                        {
                            extension += 1;
                        }

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                        });

                        // Jump ahead by the extension so the same clone is
                        // not re-reported window by window.
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

    /// Polynomial hash of a token window: each token is hashed individually,
    /// then folded in as `hash * BASE + token_hash (mod MODULUS)`, using a
    /// u128 intermediate to avoid overflow.
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        const BASE: u64 = 257;
        const MODULUS: u64 = 1_000_000_007;

        let mut hash: u64 = 0;
        for token in window {
            use std::collections::hash_map::DefaultHasher;
            use std::hash::{Hash, Hasher};
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            let token_hash = hasher.finish();
            let wide_hash = (hash as u128 * BASE as u128 + token_hash as u128) % MODULUS as u128;
            hash = wide_hash as u64;
        }
        hash
    }

    /// Confirms a hash match by comparing the token slices directly,
    /// guarding against hash collisions and out-of-bounds windows.
    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
            return false;
        }
        tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new()
    }
}

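/// Scans `paths` with the default `Scanner` configuration.
///
/// A usage sketch (the path is a placeholder; not compiled as a doctest):
///
/// ```ignore
/// let report = find_duplicates(vec!["src".to_string()]).unwrap();
/// for m in &report.duplicates {
///     println!("{}:{} ~ {}:{}", m.file1, m.start_line1, m.file2, m.start_line2);
/// }
/// ```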
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new();
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

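/// Like [`find_duplicates`], but with a custom minimum block size (in
/// tokens) and similarity threshold.
///
/// A sketch (not compiled as a doctest; note the threshold is stored but
/// the current pipeline only reports exact matches with similarity 1.0):
///
/// ```ignore
/// let report = find_duplicates_with_config(vec!["src".to_string()], 30, 0.9).unwrap();
/// ```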
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let _scanner = Scanner::new();
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}