mod hashing;
mod parsing;
mod queries;

#[cfg(test)]
mod proptest_fuzzing;

#[cfg(test)]
mod snapshot_tests;

pub use hashing::{
    compute_rolling_hashes, detect_duplicates_with_extension, normalize, CloneMatch, RollingHash,
    Token,
};
pub use parsing::{
    extract_functions, extract_javascript_functions, extract_python_functions,
    extract_rust_functions, FunctionNode,
};

use anyhow::{anyhow, Context, Result};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use tree_sitter::Language;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
    pub file1: String,
    pub file2: String,
    pub start_line1: usize,
    pub start_line2: usize,
    pub length: usize,
    pub similarity: f64,
    pub hash: u64,
}

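/// Normalized token stream for a single extracted function, along with the
/// location metadata needed to report matches against it.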
#[derive(Debug, Clone)]
#[allow(dead_code)]
struct FunctionHash {
    file_path: String,
    function_name: Option<String>,
    start_byte: usize,
    end_byte: usize,
    start_line: usize,
    end_line: usize,
    tokens: Vec<Token>,
}

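/// Summary of a completed scan: what was looked at and what was found.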
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    pub files_scanned: usize,
    pub functions_analyzed: usize,
    pub duplicates: Vec<DuplicateMatch>,
    pub stats: ScanStats,
}

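/// Aggregate counters collected during a scan.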
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
    pub total_lines: usize,
    pub total_tokens: usize,
    pub unique_hashes: usize,
    pub duration_ms: u64,
}

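/// Duplicate-code scanner. `min_block_size` is the minimum match length in
/// normalized tokens; `similarity_threshold` is stored as configuration but
/// not yet consulted (reported matches are exact, hence `allow(dead_code)`).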
#[allow(dead_code)]
pub struct Scanner {
    min_block_size: usize,
    similarity_threshold: f64,
}

impl Scanner {
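    /// Creates a scanner with the default configuration: blocks of at least
    /// 50 normalized tokens and a similarity threshold of 0.85.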
    pub fn new() -> Result<Self> {
        Ok(Self {
            min_block_size: 50,
            similarity_threshold: 0.85,
        })
    }

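    /// Creates a scanner with an explicit minimum block size (in normalized
    /// tokens) and similarity threshold.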
    pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
        Ok(Self {
            min_block_size,
            similarity_threshold,
        })
    }

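    /// Walks `paths`, tokenizes every supported source file, and reports all
    /// cross-file duplicate spans together with scan statistics.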
    pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
        use std::time::Instant;
        let start_time = Instant::now();

        let source_files = self.collect_source_files(paths)?;

        // Parse and tokenize files in parallel; files that fail to parse are skipped.
        let function_hashes: Vec<FunctionHash> = source_files
            .par_iter()
            .filter_map(|path| self.process_file(path).ok())
            .flatten()
            .collect();

        let duplicates = self.find_duplicate_hashes(&function_hashes);

        let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

        // Count distinct rolling-window hashes across all functions.
        let unique_hashes: usize = {
            let mut hash_set = std::collections::HashSet::new();
            for fh in &function_hashes {
                let hashes = compute_rolling_hashes(&fh.tokens, self.min_block_size);
                for (hash, _) in hashes {
                    hash_set.insert(hash);
                }
            }
            hash_set.len()
        };

        let duration_ms = start_time.elapsed().as_millis() as u64;

        let total_lines: usize = source_files
            .iter()
            .filter_map(|path| std::fs::read_to_string(path).ok())
            .map(|content| content.lines().count())
            .sum();

        Ok(Report {
            files_scanned: source_files.len(),
            functions_analyzed: function_hashes.len(),
            duplicates,
            stats: ScanStats {
                total_lines,
                total_tokens,
                unique_hashes,
                duration_ms,
            },
        })
    }

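    /// Expands the given paths into a flat list of supported source files,
    /// honoring `.gitignore` and related ignore rules when walking directories.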
    fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for path in paths {
            if path.is_file() {
                if self.is_supported_file(&path) {
                    files.push(path);
                }
            } else if path.is_dir() {
                let walker = WalkBuilder::new(&path)
                    .git_ignore(true)
                    .git_global(true)
                    .git_exclude(true)
                    .ignore(true)
                    .hidden(false)
                    .parents(true)
                    .build();

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            let path = entry.path();
                            if path.is_file() && self.is_supported_file(path) {
                                files.push(path.to_path_buf());
                            }
                        }
                        Err(err) => {
                            eprintln!("Warning: Failed to access path: {}", err);
                        }
                    }
                }
            }
        }

        Ok(files)
    }

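    /// Returns true for file extensions this scanner can parse.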
    fn is_supported_file(&self, path: &Path) -> bool {
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            matches!(ext, "rs" | "py" | "js" | "ts" | "jsx" | "tsx")
        } else {
            false
        }
    }

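    /// Reads and parses one file, returning a `FunctionHash` for every
    /// function whose normalized token stream meets `min_block_size`.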
    fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
        let code = fs::read_to_string(path)
            .with_context(|| format!("Failed to read file: {:?}", path))?;

        let lang = self.detect_language(path)?;
        let functions = extract_functions(&code, lang)?;

        let file_path = path.to_string_lossy().to_string();
        let mut function_hashes = Vec::new();

        for func in functions {
            let tokens = normalize(&func.body);

            // Functions shorter than one window can never produce a match.
            if tokens.len() < self.min_block_size {
                continue;
            }

            function_hashes.push(FunctionHash {
                file_path: file_path.clone(),
                function_name: func.name.clone(),
                start_byte: func.start_byte,
                end_byte: func.end_byte,
                start_line: func.start_line,
                end_line: func.end_line,
                tokens,
            });
        }

        Ok(function_hashes)
    }

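    /// Maps a file extension to its tree-sitter grammar.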
    fn detect_language(&self, path: &Path) -> Result<Language> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| anyhow!("No file extension"))?;

        match ext {
            "rs" => Ok(tree_sitter_rust::language()),
            "py" => Ok(tree_sitter_python::language()),
            // Note: TS/TSX are parsed with the JavaScript grammar, so
            // TypeScript-only syntax may not parse cleanly.
            "js" | "jsx" | "ts" | "tsx" => Ok(tree_sitter_javascript::language()),
            _ => Err(anyhow!("Unsupported file extension: {}", ext)),
        }
    }

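    /// Compares every pair of functions from different files and collects the
    /// verified clone spans, deduplicating symmetric pairs.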
    fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut duplicates = Vec::new();
        let mut seen_pairs = std::collections::HashSet::new();

        for i in 0..function_hashes.len() {
            for j in (i + 1)..function_hashes.len() {
                let func1 = &function_hashes[i];
                let func2 = &function_hashes[j];

                // Only report cross-file duplicates.
                if func1.file_path == func2.file_path {
                    continue;
                }

                let matches = self.find_clones_between_functions(func1, func2);

                for clone_match in matches {
                    // Canonicalize the pair by file path so (A, B) and (B, A)
                    // dedupe to the same key.
                    let pair_key = if func1.file_path < func2.file_path {
                        (
                            func1.file_path.clone(),
                            func2.file_path.clone(),
                            clone_match.source_start,
                            clone_match.target_start,
                            clone_match.length,
                        )
                    } else {
                        (
                            func2.file_path.clone(),
                            func1.file_path.clone(),
                            clone_match.target_start,
                            clone_match.source_start,
                            clone_match.length,
                        )
                    };

                    if seen_pairs.contains(&pair_key) {
                        continue;
                    }
                    seen_pairs.insert(pair_key);

                    // Fingerprint the matched token slice for reporting.
                    let mut hasher = DefaultHasher::new();
                    func1.tokens
                        [clone_match.source_start..clone_match.source_start + clone_match.length]
                        .hash(&mut hasher);
                    let match_hash = hasher.finish();

                    duplicates.push(DuplicateMatch {
                        file1: func1.file_path.clone(),
                        file2: func2.file_path.clone(),
                        start_line1: func1.start_line,
                        start_line2: func2.start_line,
                        length: clone_match.length,
                        // Matches are exact over normalized tokens.
                        similarity: 1.0,
                        hash: match_hash,
                    });
                }
            }
        }

        duplicates
    }

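    /// Finds matching token windows between two functions via hash lookup
    /// plus exact verification, then greedily extends each match forward.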
    fn find_clones_between_functions(
        &self,
        func1: &FunctionHash,
        func2: &FunctionHash,
    ) -> Vec<CloneMatch> {
        use std::collections::HashMap;

        let mut matches = Vec::new();
        let mut hash_map: HashMap<u64, Vec<usize>> = HashMap::new();

        // Index every window of `min_block_size` tokens in func1 by its hash.
        let mut i = 0;
        while i <= func1.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func1.tokens[i..i + self.min_block_size]);
            hash_map.entry(hash).or_default().push(i);
            i += 1;
        }

        // Slide a window over func2 and probe the index.
        let mut j = 0;
        while j <= func2.tokens.len().saturating_sub(self.min_block_size) {
            let hash = self.compute_window_hash(&func2.tokens[j..j + self.min_block_size]);

            if let Some(func1_positions) = hash_map.get(&hash) {
                for &func1_pos in func1_positions {
                    // Guard against hash collisions with an exact comparison.
                    if self.verify_window_match(
                        &func1.tokens,
                        &func2.tokens,
                        func1_pos,
                        j,
                        self.min_block_size,
                    ) {
                        // Extend the match token by token past the window.
                        let mut extension = 0;
                        while (func1_pos + self.min_block_size + extension < func1.tokens.len())
                            && (j + self.min_block_size + extension < func2.tokens.len())
                            && (func1.tokens[func1_pos + self.min_block_size + extension]
                                == func2.tokens[j + self.min_block_size + extension])
                        {
                            extension += 1;
                        }

                        let total_length = self.min_block_size + extension;

                        matches.push(CloneMatch {
                            source_start: func1_pos,
                            target_start: j,
                            length: total_length,
                        });

                        // Skip past the extended region to avoid re-reporting
                        // overlapping windows of the same clone.
                        j += extension.max(1);
                        break;
                    }
                }
            }

            j += 1;
        }

        matches
    }

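    /// Hashes a token window with a polynomial hash (base 257, modulus
    /// 1_000_000_007) over per-token `DefaultHasher` fingerprints.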
    fn compute_window_hash(&self, window: &[Token]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        const BASE: u64 = 257;
        const MODULUS: u64 = 1_000_000_007;

        let mut hash: u64 = 0;
        for token in window {
            let mut hasher = DefaultHasher::new();
            token.as_hash_string().hash(&mut hasher);
            let token_hash = hasher.finish();
            hash = (hash.wrapping_mul(BASE).wrapping_add(token_hash)) % MODULUS;
        }
        hash
    }

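    /// Confirms that two windows of length `len` are token-for-token equal,
    /// ruling out hash collisions.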
    fn verify_window_match(
        &self,
        tokens1: &[Token],
        tokens2: &[Token],
        idx1: usize,
        idx2: usize,
        len: usize,
    ) -> bool {
        if idx1 + len > tokens1.len() || idx2 + len > tokens2.len() {
            return false;
        }
        tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
    }
}

impl Default for Scanner {
    fn default() -> Self {
        Self::new().expect("Failed to initialize default Scanner")
    }
}

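/// Scans `paths` with the default configuration and returns a [`Report`].
///
/// ```ignore
/// // Illustrative only; substitute this crate's actual name for `my_crate`.
/// let report = my_crate::find_duplicates(vec!["src".to_string()])?;
/// println!("{} duplicate spans found", report.duplicates.len());
/// ```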
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let scanner = Scanner::new()?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

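/// Like [`find_duplicates`], but with an explicit minimum block size and
/// similarity threshold.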
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let scanner = Scanner::with_config(min_block_size, similarity_threshold)?;
    let path_bufs: Vec<PathBuf> = paths.into_iter().map(PathBuf::from).collect();
    scanner.scan(path_bufs)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let scanner = Scanner::new();
        assert!(scanner.is_ok());
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new().unwrap();

        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new().unwrap();

        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
    }
}