mod queries;
mod parsing;
mod hashing;
#[cfg(test)]
mod proptest_fuzzing;
#[cfg(test)]
mod snapshot_tests;
pub use parsing::{
extract_functions, extract_javascript_functions, extract_python_functions,
extract_rust_functions, FunctionNode,
};
pub use hashing::{normalize, Token, RollingHash, compute_rolling_hashes, CloneMatch, detect_duplicates_with_extension};
use anyhow::{Context, Result, anyhow};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use tree_sitter::Language;
/// One duplicated token run shared by two functions that live in different files.
///
/// Instances are produced by `Scanner::find_duplicate_hashes`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DuplicateMatch {
/// Path of the file that contains the first function of the compared pair.
pub file1: String,
/// Path of the file that contains the second function of the compared pair.
pub file2: String,
/// `start_line` of the function in `file1` that contains the run
/// (the containing function's line, not the line of the run itself).
pub start_line1: usize,
/// `start_line` of the function in `file2` that contains the run.
pub start_line2: usize,
/// Length of the duplicated run, counted in normalized tokens.
pub length: usize,
/// Similarity score; the scanner in this file only reports exact token
/// matches and always stores `1.0`.
pub similarity: f64,
/// Hash of the matched token run, taken from the first function's tokens
/// with the standard library's `DefaultHasher`.
pub hash: u64,
}
/// Per-function record built by `Scanner::process_file` and compared pairwise
/// by `Scanner::find_duplicate_hashes`.
#[derive(Debug, Clone)]
struct FunctionHash {
/// Path of the source file, converted lossily to a `String`.
file_path: String,
/// `name` copied from the extracted `FunctionNode`, if any.
function_name: Option<String>,
/// `start_byte` copied from the extracted `FunctionNode`.
start_byte: usize,
/// `end_byte` copied from the extracted `FunctionNode`.
end_byte: usize,
/// `start_line` copied from the extracted `FunctionNode`.
start_line: usize,
/// `end_line` copied from the extracted `FunctionNode`.
end_line: usize,
/// Tokens returned by `normalize` for the function body.
tokens: Vec<Token>, }
/// Result of one `Scanner::scan` run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
/// Number of supported source files that were collected for scanning.
pub files_scanned: usize,
/// Number of functions kept for comparison, i.e. those whose normalized
/// body has at least `min_block_size` tokens.
pub functions_analyzed: usize,
/// Every duplicate that was found between functions of different files.
pub duplicates: Vec<DuplicateMatch>,
/// Aggregate counters and timing for the run.
pub stats: ScanStats,
}
/// Aggregate numbers attached to a [`Report`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanStats {
/// Total line count.
/// NOTE(review): `Scanner::scan` fills this with a constant `0`; confirm
/// whether it is meant to be computed.
pub total_lines: usize,
/// Sum of the normalized token counts of all analyzed functions.
pub total_tokens: usize,
/// Number of distinct hash values returned by `compute_rolling_hashes`
/// across all analyzed functions.
pub unique_hashes: usize,
/// Wall-clock duration of the scan in milliseconds.
pub duration_ms: u64,
}
/// Duplicate-code scanner configuration.
///
/// Build one with [`Scanner::new`] (defaults) or [`Scanner::with_config`],
/// then call [`Scanner::scan`].
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Minimum number of normalized tokens a function (and a matching window)
    /// must have to be considered.
    min_block_size: usize,
    /// Configured similarity threshold. It is stored here but not consulted by
    /// the matching code in this file, which only reports exact matches.
    similarity_threshold: f64,
}
impl Scanner {
/// Creates a scanner with the default settings: a minimum block of 50 tokens
/// and a similarity threshold of 0.85.
///
/// # Errors
///
/// The current implementation always returns `Ok`.
pub fn new() -> Result<Self> {
    const DEFAULT_MIN_BLOCK_SIZE: usize = 50;
    const DEFAULT_SIMILARITY_THRESHOLD: f64 = 0.85;

    let scanner = Self {
        min_block_size: DEFAULT_MIN_BLOCK_SIZE,
        similarity_threshold: DEFAULT_SIMILARITY_THRESHOLD,
    };
    Ok(scanner)
}
pub fn with_config(min_block_size: usize, similarity_threshold: f64) -> Result<Self> {
Ok(Self {
min_block_size,
similarity_threshold,
})
}
/// Scans `paths` (files and/or directories) and reports duplicated code.
///
/// Steps: collect the supported source files, extract and normalize their
/// functions in parallel, compare functions from different files pairwise,
/// then gather statistics.
///
/// A file that cannot be read or parsed is skipped with a warning on stderr
/// instead of being dropped silently; it still counts in `files_scanned`.
///
/// # Errors
///
/// Propagates any error returned by `collect_source_files`.
pub fn scan(&self, paths: Vec<PathBuf>) -> Result<Report> {
    use std::collections::HashSet;
    use std::time::Instant;

    let start_time = Instant::now();
    let source_files = self.collect_source_files(paths)?;

    let function_hashes: Vec<FunctionHash> = source_files
        .par_iter()
        .filter_map(|path| match self.process_file(path) {
            Ok(hashes) => Some(hashes),
            Err(err) => {
                // Best effort: one bad file must not abort the whole scan,
                // but the user should learn that it was skipped.
                eprintln!("Warning: Skipping {}: {:#}", path.display(), err);
                None
            }
        })
        .flatten()
        .collect();

    let duplicates = self.find_duplicate_hashes(&function_hashes);

    let total_tokens: usize = function_hashes.iter().map(|fh| fh.tokens.len()).sum();

    let unique_hashes = function_hashes
        .iter()
        .flat_map(|fh| compute_rolling_hashes(&fh.tokens, self.min_block_size))
        .map(|(hash, _)| hash)
        .collect::<HashSet<_>>()
        .len();

    let duration_ms = start_time.elapsed().as_millis() as u64;

    Ok(Report {
        files_scanned: source_files.len(),
        functions_analyzed: function_hashes.len(),
        duplicates,
        stats: ScanStats {
            // NOTE: line counting is not implemented; the value is always 0.
            total_lines: 0,
            total_tokens,
            unique_hashes,
            duration_ms,
        },
    })
}
/// Expands `paths` into the list of supported source files to scan.
///
/// A file path is kept when its extension is supported. A directory is walked
/// recursively with the `ignore` crate. A path that is neither a file nor a
/// directory (for example a typo) is reported on stderr instead of being
/// ignored silently. The same file reached through overlapping inputs is
/// returned only once; the first occurrence keeps its position.
///
/// # Errors
///
/// Currently always returns `Ok`; walk errors are reported as warnings.
fn collect_source_files(&self, paths: Vec<PathBuf>) -> Result<Vec<PathBuf>> {
    let mut files = Vec::new();
    for path in paths {
        if path.is_file() {
            if self.is_supported_file(&path) {
                files.push(path);
            }
        } else if path.is_dir() {
            let walker = WalkBuilder::new(&path)
                .git_ignore(true) // honour .gitignore files
                .git_global(true) // honour the global gitignore
                .git_exclude(true) // honour .git/info/exclude
                .ignore(true) // honour .ignore files
                .hidden(false) // do not skip hidden files and directories
                .parents(true) // also read ignore files from parent directories
                .build();
            for entry in walker {
                match entry {
                    Ok(entry) => {
                        let candidate = entry.path();
                        if candidate.is_file() && self.is_supported_file(candidate) {
                            files.push(candidate.to_path_buf());
                        }
                    }
                    Err(err) => {
                        eprintln!("Warning: Failed to access path: {}", err);
                    }
                }
            }
        } else {
            eprintln!(
                "Warning: Path is neither a file nor a directory: {}",
                path.display()
            );
        }
    }

    // Overlapping inputs (e.g. `src` and `src/lib.rs`) would otherwise make
    // the same file be parsed and compared twice.
    let mut seen = std::collections::HashSet::new();
    files.retain(|file| seen.insert(file.clone()));

    Ok(files)
}
/// Returns `true` when `path` has one of the extensions the scanner handles
/// (`rs`, `py`, `js`, `ts`, `jsx`, `tsx`). The comparison is case-sensitive;
/// paths without a UTF-8 extension are not supported.
fn is_supported_file(&self, path: &Path) -> bool {
    let extension = path.extension().and_then(|e| e.to_str());
    matches!(
        extension,
        Some("rs") | Some("py") | Some("js") | Some("ts") | Some("jsx") | Some("tsx")
    )
}
fn process_file(&self, path: &Path) -> Result<Vec<FunctionHash>> {
let code = fs::read_to_string(path)
.context(format!("Failed to read file: {:?}", path))?;
let lang = self.detect_language(path)?;
let functions = extract_functions(&code, lang)?;
let file_path = path.to_string_lossy().to_string();
let mut function_hashes = Vec::new();
for func in functions {
let tokens = normalize(&func.body);
if tokens.len() < self.min_block_size {
continue;
}
function_hashes.push(FunctionHash {
file_path: file_path.clone(),
function_name: func.name.clone(),
start_byte: func.start_byte,
end_byte: func.end_byte,
start_line: func.start_line,
end_line: func.end_line,
tokens,
});
}
Ok(function_hashes)
}
/// Maps the extension of `path` to the tree-sitter grammar used to parse it.
///
/// # Errors
///
/// Fails with "No file extension" when the path has no UTF-8 extension and
/// with "Unsupported file extension: ..." for any extension not listed below.
fn detect_language(&self, path: &Path) -> Result<Language> {
    match path.extension().and_then(|e| e.to_str()) {
        None => Err(anyhow!("No file extension")),
        Some("rs") => Ok(tree_sitter_rust::language()),
        Some("py") => Ok(tree_sitter_python::language()),
        // NOTE(review): TypeScript sources are parsed with the JavaScript
        // grammar here; confirm that this is intended.
        Some("js") | Some("jsx") | Some("ts") | Some("tsx") => {
            Ok(tree_sitter_javascript::language())
        }
        Some(other) => Err(anyhow!("Unsupported file extension: {}", other)),
    }
}
/// Compares every pair of functions that come from different files and
/// returns one `DuplicateMatch` per shared token run.
///
/// Functions of the same file are never compared with each other. The
/// reported lines are the start lines of the containing functions, and
/// `similarity` is always `1.0` because only exact token runs are matched.
///
/// A match is identified by both functions (file path and start byte) plus
/// the token offsets and length of the run. The function identity is part of
/// the key on purpose: token offsets are relative to each function, so two
/// different function pairs in the same two files can have equal offsets and
/// must not be collapsed into a single match.
fn find_duplicate_hashes(&self, function_hashes: &[FunctionHash]) -> Vec<DuplicateMatch> {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    let mut duplicates = Vec::new();
    // Keys borrow the paths from `function_hashes`, so no strings are cloned.
    let mut seen_pairs = HashSet::new();

    for (i, func1) in function_hashes.iter().enumerate() {
        for func2 in &function_hashes[i + 1..] {
            if func1.file_path == func2.file_path {
                continue;
            }
            for clone_match in self.find_clones_between_functions(func1, func2) {
                let side1 = (
                    func1.file_path.as_str(),
                    func1.start_byte,
                    clone_match.source_start,
                );
                let side2 = (
                    func2.file_path.as_str(),
                    func2.start_byte,
                    clone_match.target_start,
                );
                // Order the two sides by file path so that the key does not
                // depend on which function happened to come first.
                let (first, second) = if side1.0 < side2.0 {
                    (side1, side2)
                } else {
                    (side2, side1)
                };
                // `insert` returns false when the key was already present.
                if !seen_pairs.insert((first, second, clone_match.length)) {
                    continue;
                }

                let mut hasher = DefaultHasher::new();
                func1.tokens
                    [clone_match.source_start..clone_match.source_start + clone_match.length]
                    .hash(&mut hasher);
                let match_hash = hasher.finish();

                duplicates.push(DuplicateMatch {
                    file1: func1.file_path.clone(),
                    file2: func2.file_path.clone(),
                    start_line1: func1.start_line,
                    start_line2: func2.start_line,
                    length: clone_match.length,
                    similarity: 1.0,
                    hash: match_hash,
                });
            }
        }
    }
    duplicates
}
/// Finds runs of identical tokens shared by `func1` and `func2`.
///
/// Every window of `min_block_size` tokens in `func1` is indexed by its hash.
/// `func2` is then scanned from left to right: when a window matches (hash
/// hit confirmed by a token-by-token comparison) the match is extended to the
/// right for as long as the tokens stay equal, recorded, and the scan resumes
/// at the first window that starts after the extended positions.
///
/// Returns an empty list when `min_block_size` is 0 or when either function
/// is shorter than one window, so short input can never index out of bounds.
fn find_clones_between_functions(
    &self,
    func1: &FunctionHash,
    func2: &FunctionHash,
) -> Vec<CloneMatch> {
    use std::collections::HashMap;

    let window = self.min_block_size;
    let mut matches = Vec::new();
    if window == 0 || func1.tokens.len() < window || func2.tokens.len() < window {
        return matches;
    }

    // Hash of each window in `func1` -> start positions having that hash.
    let mut positions_by_hash: HashMap<u64, Vec<usize>> = HashMap::new();
    for (pos, tokens) in func1.tokens.windows(window).enumerate() {
        positions_by_hash
            .entry(self.compute_window_hash(tokens))
            .or_default()
            .push(pos);
    }

    let last_start = func2.tokens.len() - window;
    let mut j = 0;
    while j <= last_start {
        let hash = self.compute_window_hash(&func2.tokens[j..j + window]);
        // Only the first verified candidate is used for a given `j`.
        let hit = positions_by_hash.get(&hash).and_then(|candidates| {
            candidates.iter().copied().find(|&pos| {
                self.verify_window_match(&func1.tokens, &func2.tokens, pos, j, window)
            })
        });

        if let Some(func1_pos) = hit {
            // Grow the match to the right while both sides keep agreeing.
            let extension = func1.tokens[func1_pos + window..]
                .iter()
                .zip(&func2.tokens[j + window..])
                .take_while(|(a, b)| a == b)
                .count();
            matches.push(CloneMatch {
                source_start: func1_pos,
                target_start: j,
                length: window + extension,
            });
            // Skip the start positions that lie inside the extension; they
            // would only rediscover suffixes of the same run. With no
            // extension nothing is skipped, so the next window is examined.
            j += extension;
        }
        j += 1;
    }
    matches
}
/// Folds the tokens of `window` into a single polynomial-style hash.
///
/// Each token contributes the `DefaultHasher` hash of its `as_hash_string()`;
/// the accumulator is multiplied by 257, the token hash is added (both with
/// wrapping arithmetic) and the sum is reduced modulo 1_000_000_007. An empty
/// window hashes to 0.
fn compute_window_hash(&self, window: &[Token]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    const BASE: u64 = 257;
    const MODULUS: u64 = 1_000_000_007;

    window.iter().fold(0u64, |acc, token| {
        let mut state = DefaultHasher::new();
        token.as_hash_string().hash(&mut state);
        acc.wrapping_mul(BASE).wrapping_add(state.finish()) % MODULUS
    })
}
/// Confirms a hash hit by comparing the tokens themselves.
///
/// Returns `true` only when `len` tokens starting at `idx1` in `tokens1` and
/// at `idx2` in `tokens2` both fit inside their slices and are equal.
fn verify_window_match(
    &self,
    tokens1: &[Token],
    tokens2: &[Token],
    idx1: usize,
    idx2: usize,
    len: usize,
) -> bool {
    let fits = |tokens: &[Token], start: usize| start + len <= tokens.len();

    fits(tokens1, idx1)
        && fits(tokens2, idx2)
        && tokens1[idx1..idx1 + len] == tokens2[idx2..idx2 + len]
}
}
impl Default for Scanner {
fn default() -> Self {
Self::new().expect("Failed to initialize default Scanner")
}
}
/// Scans `paths` (files and/or directories) with the default [`Scanner`]
/// settings and returns the resulting [`Report`].
///
/// # Errors
///
/// Propagates any error from [`Scanner::new`] or [`Scanner::scan`].
pub fn find_duplicates(paths: Vec<String>) -> Result<Report> {
    let targets = paths.into_iter().map(PathBuf::from).collect();
    Scanner::new()?.scan(targets)
}
/// Scans `paths` (files and/or directories) with a [`Scanner`] built from
/// `min_block_size` and `similarity_threshold` and returns the [`Report`].
///
/// # Errors
///
/// Propagates any error from [`Scanner::with_config`] or [`Scanner::scan`].
pub fn find_duplicates_with_config(
    paths: Vec<String>,
    min_block_size: usize,
    similarity_threshold: f64,
) -> Result<Report> {
    let targets = paths.into_iter().map(PathBuf::from).collect();
    Scanner::with_config(min_block_size, similarity_threshold)?.scan(targets)
}
// Unit tests for scanner construction and the extension-based file filters.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scanner_creation() {
        let scanner = Scanner::new();
        assert!(scanner.is_ok());
    }

    #[test]
    fn test_scanner_default_matches_new() {
        let from_default = Scanner::default();
        let from_new = Scanner::new().unwrap();
        assert_eq!(from_default.min_block_size, from_new.min_block_size);
        assert_eq!(
            from_default.similarity_threshold,
            from_new.similarity_threshold
        );
        assert_eq!(from_default.min_block_size, 50);
    }

    #[test]
    fn test_scanner_with_config() {
        let scanner = Scanner::with_config(30, 0.9);
        assert!(scanner.is_ok());
        let s = scanner.unwrap();
        assert_eq!(s.min_block_size, 30);
        assert_eq!(s.similarity_threshold, 0.9);
    }

    #[test]
    fn test_find_duplicates_empty() {
        let result = find_duplicates(vec![]);
        assert!(result.is_ok());
        let report = result.unwrap();
        assert_eq!(report.duplicates.len(), 0);
        assert_eq!(report.files_scanned, 0);
        assert_eq!(report.functions_analyzed, 0);
    }

    #[test]
    fn test_is_supported_file() {
        let scanner = Scanner::new().unwrap();
        assert!(scanner.is_supported_file(Path::new("test.rs")));
        assert!(scanner.is_supported_file(Path::new("test.py")));
        assert!(scanner.is_supported_file(Path::new("test.js")));
        assert!(scanner.is_supported_file(Path::new("test.ts")));
        assert!(scanner.is_supported_file(Path::new("test.jsx")));
        assert!(scanner.is_supported_file(Path::new("test.tsx")));
        assert!(!scanner.is_supported_file(Path::new("test.txt")));
        assert!(!scanner.is_supported_file(Path::new("test.md")));
        // A path without an extension is never supported.
        assert!(!scanner.is_supported_file(Path::new("Makefile")));
    }

    #[test]
    fn test_detect_language() {
        let scanner = Scanner::new().unwrap();
        assert!(scanner.detect_language(Path::new("test.rs")).is_ok());
        assert!(scanner.detect_language(Path::new("test.py")).is_ok());
        assert!(scanner.detect_language(Path::new("test.js")).is_ok());
        assert!(scanner.detect_language(Path::new("test.jsx")).is_ok());
        assert!(scanner.detect_language(Path::new("test.ts")).is_ok());
        assert!(scanner.detect_language(Path::new("test.tsx")).is_ok());
        assert!(scanner.detect_language(Path::new("test.txt")).is_err());
        // No extension at all is an error as well.
        assert!(scanner.detect_language(Path::new("Makefile")).is_err());
    }
}