use std::path::Path;
use super::clustering::{cluster_clone_groups, CloneClass};
use super::cross_language::CrossLanguageDetector;
use super::minhash::{FunctionSignature, MinHashDetector};
use super::simhash::SimHashFingerprinter;
use super::types::CloneGroup;
/// Tuning knobs for a clone-detection run.
#[derive(Debug, Clone)]
pub struct CloneConfig {
    /// Minimum number of source lines a fragment must span to be considered.
    pub min_lines: usize,
    /// Minimum number of nodes a fragment must contain.
    /// NOTE(review): not consulted anywhere in this file — confirm where
    /// this floor is enforced.
    pub min_nodes: usize,
    /// Similarity threshold; pairs scoring at or above it are reported.
    pub similarity_threshold: f64,
    /// Enable Type-1 (exact) clone detection.
    /// NOTE(review): never read by `CloneDetector::detect_in_sources` —
    /// verify this flag is honored elsewhere.
    pub detect_type1: bool,
    /// Enable Type-2 (renamed) clone detection.
    /// NOTE(review): never read by `CloneDetector::detect_in_sources` —
    /// verify this flag is honored elsewhere.
    pub detect_type2: bool,
    /// Enable Type-3 (near-miss) detection; gates both the function-level
    /// MinHash phase and the file-level SimHash phase below.
    pub detect_type3: bool,
    /// Enable the cross-language detection phase.
    pub detect_cross_language: bool,
}
impl Default for CloneConfig {
fn default() -> Self {
Self {
min_lines: 6,
min_nodes: 5,
similarity_threshold: 0.8,
detect_type1: true,
detect_type2: true,
detect_type3: true,
detect_cross_language: true,
}
}
}
/// Outcome of a clone-detection run.
#[derive(Debug)]
pub struct CloneResult {
    /// Clone groups that survived all post-filters (trivial, trait-impl,
    /// test-only groups are removed before this is populated).
    pub groups: Vec<CloneGroup>,
    /// Number of `(path, source)` pairs that were analyzed.
    pub files_analyzed: usize,
    /// Sum of `duplicated_lines()` over all reported groups.
    pub total_duplicated_lines: usize,
}
/// Orchestrates the MinHash, SimHash and cross-language clone-detection
/// phases according to a [`CloneConfig`].
pub struct CloneDetector {
    // Immutable configuration captured at construction time.
    config: CloneConfig,
}
impl CloneDetector {
    /// Builds a detector with an explicit configuration.
    pub fn new(config: CloneConfig) -> Self {
        Self { config }
    }

    /// Builds a detector using `CloneConfig::default()`.
    pub fn with_defaults() -> Self {
        Self {
            config: CloneConfig::default(),
        }
    }

    /// Runs clone detection over already-loaded `(path, source)` pairs.
    ///
    /// Phases, each gated by the config:
    /// 1. Function-level near-miss clones via MinHash sketches.
    /// 2. File-level clones: SimHash fingerprints pre-filter candidate
    ///    pairs, then MinHash Jaccard similarity confirms each pair.
    /// 3. Cross-language clones via language-neutral IR token shingles.
    ///
    /// Surviving groups are then filtered: trivial fragments, same-name
    /// trait-impl duplication, and groups confined to test code are dropped.
    ///
    /// NOTE(review): `detect_type1` and `detect_type2` are never consulted
    /// here — phases 1 and 2 are both gated on `detect_type3`. Confirm
    /// whether Type-1/Type-2 detection lives elsewhere or the gates are
    /// incomplete.
    pub fn detect_in_sources(
        &self,
        files: &[(String, String)],
    ) -> CloneResult {
        let mut all_groups = Vec::new();
        // 128 permutations over 3-token shingles; shared by phases 1 and 2.
        let minhash = MinHashDetector::new(128, 3, self.config.similarity_threshold);
        // Phase 1: function-granularity clones (within and across files).
        if self.config.detect_type3 {
            let mut func_signatures: Vec<FunctionSignature> = Vec::new();
            for (path, source) in files {
                let functions = extract_functions_from_source(source);
                for (name, start_line, end_line, body) in functions {
                    let line_count = end_line - start_line + 1;
                    // Skip fragments below the configured size floor.
                    if line_count < self.config.min_lines {
                        continue;
                    }
                    let shingle_hashes = minhash.compute_shingles(&body);
                    if shingle_hashes.is_empty() {
                        continue;
                    }
                    let sketch = minhash.build_sketch(&shingle_hashes);
                    func_signatures.push(FunctionSignature {
                        file: path.clone(),
                        name,
                        start_line,
                        end_line,
                        sketch,
                        shingle_hashes,
                    });
                }
            }
            // Pairwise comparison needs at least two candidates.
            if func_signatures.len() >= 2 {
                let func_groups = minhash.detect_clones(&func_signatures);
                all_groups.extend(func_groups);
            }
        }
        // Phase 2: whole-file clones. SimHash narrows candidate pairs
        // cheaply; MinHash similarity then verifies each surviving pair.
        if self.config.detect_type3 && files.len() >= 2 {
            let simhash_threshold =
                similarity_to_hamming_distance(self.config.similarity_threshold);
            let simhasher = SimHashFingerprinter::new(simhash_threshold);
            let fingerprints = simhasher.fingerprint_files(files);
            let candidates = simhasher.find_candidates(&fingerprints);
            if !candidates.is_empty() {
                // Build MinHash sketches only for files that occur in at
                // least one candidate pair.
                let mut needed: std::collections::HashSet<usize> = std::collections::HashSet::new();
                for &(i, j) in &candidates {
                    needed.insert(i);
                    needed.insert(j);
                }
                let file_sketches: std::collections::HashMap<usize, FunctionSignature> = needed
                    .into_iter()
                    .map(|idx| {
                        let (path, source) = &files[idx];
                        let shingle_hashes = minhash.compute_shingles(source);
                        let sketch = minhash.build_sketch(&shingle_hashes);
                        (
                            idx,
                            // Whole file is modeled as one pseudo-function.
                            FunctionSignature {
                                file: path.clone(),
                                name: format!("[file] {path}"),
                                start_line: 1,
                                end_line: source.lines().count(),
                                sketch,
                                shingle_hashes,
                            },
                        )
                    })
                    .collect();
                for (i, j) in candidates {
                    if let (Some(sig_a), Some(sig_b)) =
                        (file_sketches.get(&i), file_sketches.get(&j))
                    {
                        let lines_a = sig_a.end_line.saturating_sub(sig_a.start_line) + 1;
                        let lines_b = sig_b.end_line.saturating_sub(sig_b.start_line) + 1;
                        if lines_a < self.config.min_lines || lines_b < self.config.min_lines {
                            continue;
                        }
                        let similarity =
                            MinHashDetector::jaccard_similarity(&sig_a.sketch, &sig_b.sketch);
                        if similarity >= self.config.similarity_threshold {
                            // Byte offsets are unknown at file granularity,
                            // hence the zero placeholders.
                            let instance_a = crate::clones::types::CloneInstance {
                                file: sig_a.file.clone(),
                                start_line: sig_a.start_line,
                                end_line: sig_a.end_line,
                                start_byte: 0,
                                end_byte: 0,
                                function_name: Some(sig_a.name.clone()),
                            };
                            let instance_b = crate::clones::types::CloneInstance {
                                file: sig_b.file.clone(),
                                start_line: sig_b.start_line,
                                end_line: sig_b.end_line,
                                start_byte: 0,
                                end_byte: 0,
                                function_name: Some(sig_b.name.clone()),
                            };
                            all_groups.push(
                                CloneGroup::new(
                                    crate::clones::types::CloneType::Type3,
                                    vec![instance_a, instance_b],
                                )
                                .with_similarity(similarity),
                            );
                        }
                    }
                }
            }
        }
        // Phase 3: cross-language clones via language-neutral IR tokens.
        if self.config.detect_cross_language && files.len() >= 2 {
            let cross_lang = CrossLanguageDetector::new(self.config.similarity_threshold);
            let mut signatures = Vec::new();
            // Only worthwhile when at least two languages are present.
            let mut languages_seen = std::collections::HashSet::new();
            for (path, _) in files {
                if let Some(lang) = crate::core::Language::from_path(std::path::Path::new(path)) {
                    languages_seen.insert(lang);
                }
            }
            if languages_seen.len() >= 2 {
                for (path, source) in files {
                    let language =
                        match crate::core::Language::from_path(std::path::Path::new(path)) {
                            Some(lang) => lang,
                            None => continue,
                        };
                    let functions = extract_functions_from_source(source);
                    for (name, start_line, end_line, _body) in &functions {
                        let line_count = end_line - start_line + 1;
                        if line_count < self.config.min_lines {
                            continue;
                        }
                        let ir_tokens = crate::clones::ir_tokenizer::extract_ir_tokens_from_source(
                            source,
                            *start_line,
                            *end_line,
                        );
                        // Very short token streams produce noisy matches.
                        if ir_tokens.len() < 8 {
                            continue;
                        }
                        let shingles =
                            crate::clones::ir_tokenizer::ir_tokens_to_shingles(&ir_tokens, 4);
                        if shingles.is_empty() {
                            continue;
                        }
                        // NOTE(review): a fresh MinHashDetector is built for
                        // every function here; hoisting it above the loops
                        // looks safe — confirm and lift if so.
                        let minhash_det = crate::clones::minhash::MinHashDetector::new(
                            128,
                            3,
                            self.config.similarity_threshold,
                        );
                        let sketch = minhash_det.build_sketch(&shingles);
                        signatures.push(crate::clones::cross_language::CrossLanguageSignature {
                            file: path.clone(),
                            name: name.clone(),
                            language,
                            start_line: *start_line,
                            end_line: *end_line,
                            ir_tokens,
                            sketch,
                        });
                    }
                }
            }
            if signatures.len() >= 2 {
                let cross_groups = cross_lang.detect_clones(&signatures);
                all_groups.extend(cross_groups);
            }
        }
        // Post-filters: drop boilerplate-sized groups, same-name trait
        // implementations, and groups confined entirely to test code.
        all_groups.retain(|g| !is_trivial_clone_group(g, files));
        all_groups.retain(|g| !is_trait_impl_clone_group(g, files));
        all_groups.retain(|g| is_any_instance_in_non_test_context(g, files));
        let total_duplicated_lines: usize = all_groups.iter().map(|g| g.duplicated_lines()).sum();
        CloneResult {
            groups: all_groups,
            files_analyzed: files.len(),
            total_duplicated_lines,
        }
    }

    /// Detects clones and additionally clusters the resulting groups into
    /// clone classes.
    pub fn detect_and_cluster(&self, files: &[(String, String)]) -> (CloneResult, Vec<CloneClass>) {
        let result = self.detect_in_sources(files);
        let classes = cluster_clone_groups(&result.groups);
        (result, classes)
    }

    /// Scans `root` for source files, loads every file readable as UTF-8
    /// (unreadable files are silently skipped), and runs detection.
    pub fn detect(&self, root: &Path) -> Result<CloneResult, crate::core::Error> {
        let scanner = crate::analysis::FileScanner::new();
        let source_files = scanner.scan(root)?;
        let files: Vec<(String, String)> = source_files
            .iter()
            .filter_map(|f| {
                let source = std::fs::read_to_string(&f.path).ok()?;
                Some((f.path.to_string_lossy().to_string(), source))
            })
            .collect();
        Ok(self.detect_in_sources(&files))
    }
}
/// Maps a similarity threshold in `[0, 1]` onto a maximum Hamming distance
/// for 64-bit SimHash fingerprints. The threshold is loosened by 0.1 so
/// SimHash acts as a permissive pre-filter (exact similarity is verified
/// later); the result is clamped to the 64 available bits.
fn similarity_to_hamming_distance(similarity_threshold: f64) -> u32 {
    let loosened = f64::max(similarity_threshold - 0.1, 0.0);
    let raw_distance = ((1.0 - loosened) * 64.0).ceil() as u32;
    u32::min(raw_distance, 64)
}
/// Heuristically extracts function definitions from raw source text by
/// matching per-language signature regexes line by line (no real parsing).
///
/// Returns `(name, start_line, end_line, body)` tuples: `start_line` is
/// 1-based, `end_line` is the 1-based last line of the function (which is
/// also the exclusive 0-based index used to slice the body), and `body` is
/// the function's source joined with `\n`.
fn extract_functions_from_source(source: &str) -> Vec<(String, usize, usize, String)> {
    // Fix: compile the signature patterns once per process instead of on
    // every call — regex compilation is far more expensive than matching,
    // and this function runs per file in the detection pipeline.
    static PATTERNS: std::sync::OnceLock<Vec<regex::Regex>> = std::sync::OnceLock::new();
    let patterns = PATTERNS.get_or_init(|| {
        vec![
            // Python `def` / `async def`
            regex::Regex::new(r"^\s*(?:async\s+)?def\s+(\w+)\s*\(").unwrap(),
            // JavaScript/TypeScript `function`
            regex::Regex::new(r"^\s*(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(").unwrap(),
            // Modifier-prefixed methods (Java/C#-like)
            regex::Regex::new(
                r"^\s*(?:public|private|protected|static|final|abstract|\s)*\w+\s+(\w+)\s*\(",
            )
            .unwrap(),
            // Go methods with receivers
            regex::Regex::new(r"^\s*func\s+\([^)]+\)\s+(\w+)\s*\(").unwrap(),
            // Go free functions
            regex::Regex::new(r"^\s*func\s+(\w+)\s*\(").unwrap(),
            // Ruby `def` (parentheses optional)
            regex::Regex::new(r"^\s*def\s+(\w+)").unwrap(),
            // Rust `fn`, optionally `pub`/`async`, allowing generics
            regex::Regex::new(r"^\s*(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*[\(<]").unwrap(),
            // PHP methods
            regex::Regex::new(r"^\s*(?:public|private|protected|static|\s)*function\s+(\w+)\s*\(")
                .unwrap(),
        ]
    });

    let lines: Vec<&str> = source.lines().collect();
    let mut functions = Vec::new();
    let mut i = 0;
    while i < lines.len() {
        // The first capture group of the first matching pattern is the name.
        let matched_name = patterns.iter().find_map(|pattern| {
            pattern
                .captures(lines[i])
                .and_then(|caps| caps.get(1))
                .map(|m| m.as_str().to_string())
        });
        if let Some(name) = matched_name {
            let start_line = i + 1;
            let end_line = find_function_end(&lines, i);
            let body: String = lines[i..end_line].join("\n");
            functions.push((name, start_line, end_line, body));
            // `find_function_end` always returns an index > i (its scans
            // start at i+1 or return at least i+1), so the loop advances.
            i = end_line;
        } else {
            i += 1;
        }
    }
    functions
}
/// Finds the end of the function whose header is at `start_idx`, returning
/// a 0-based exclusive index (equivalently the 1-based number of its last
/// line). Three layout heuristics are tried in order: brace matching,
/// Ruby-style `def`/`end`, and Python-style indentation.
fn find_function_end(lines: &[&str], start_idx: usize) -> usize {
    let header = lines[start_idx];
    let header_indent = header.len() - header.trim_start().len();

    // Brace-delimited languages: a `{` appears within the first three lines.
    if lines[start_idx..].iter().take(3).any(|l| l.contains('{')) {
        let mut depth = 0i32;
        for (offset, line) in lines[start_idx..].iter().enumerate() {
            for ch in line.chars() {
                match ch {
                    '{' => depth += 1,
                    '}' => {
                        depth -= 1;
                        // The brace that returns depth to zero closes the function.
                        if depth == 0 {
                            return start_idx + offset + 1;
                        }
                    }
                    _ => {}
                }
            }
        }
        return lines.len();
    }

    let header_trimmed = header.trim();
    let ruby_style = header_trimmed.starts_with("def ")
        && !header_trimmed.ends_with(':')
        && !header_trimmed.contains('{');

    if ruby_style {
        // Ruby: the function ends at the first `end` keyword that is not
        // indented deeper than the header.
        for (idx, line) in lines.iter().enumerate().skip(start_idx + 1) {
            if line.trim() == "end" && line.len() - line.trim_start().len() <= header_indent {
                return idx + 1;
            }
        }
        return lines.len();
    }

    // Indentation-based (Python-like): the function ends at the first
    // non-blank, non-comment line that dedents back to the header level,
    // excluding decorators that belong to the next definition.
    for (idx, line) in lines.iter().enumerate().skip(start_idx + 1) {
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with('#') {
            continue;
        }
        let indent = line.len() - line.trim_start().len();
        if indent <= header_indent && !trimmed.starts_with('@') {
            return idx;
        }
    }
    lines.len()
}
/// A clone group is "trivial" when every instance is a short (at most five
/// line) fragment whose flattened body contains no control-flow keyword —
/// e.g. boilerplate accessors that are not worth reporting.
fn is_trivial_clone_group(group: &CloneGroup, files: &[(String, String)]) -> bool {
    // Keywords (with trailing delimiter) whose presence marks a body as
    // containing real control flow.
    const CONTROL_FLOW_MARKERS: [&str; 28] = [
        "if ", "for ", "while ", "match ", "switch ", "loop ", "elif ",
        "except ", "except:", "with ", "yield ", "unless ", "until ",
        "case ", "when ", "rescue ", "rescue:", "ensure ", "ensure:",
        "try ", "try{", "catch ", "catch(", "else ", "else{", "else:",
        "select ", "select{",
    ];

    group.instances.iter().all(|inst| {
        // Anything longer than five lines is substantial regardless of content.
        if inst.end_line.saturating_sub(inst.start_line) + 1 > 5 {
            return false;
        }
        let Some(source) = files
            .iter()
            .find(|(path, _)| *path == inst.file)
            .map(|(_, s)| s.as_str())
        else {
            // Instance's file is absent from the input set: treat as trivial.
            return true;
        };
        let lines: Vec<&str> = source.lines().collect();
        let start = inst.start_line.saturating_sub(1);
        let end = inst.end_line.min(lines.len());
        // Flatten the fragment, dropping blanks and pure delimiter lines.
        let flattened: String = lines[start..end]
            .iter()
            .map(|l| l.trim())
            .filter(|l| {
                !l.is_empty()
                    && !l.starts_with('{')
                    && !l.starts_with('}')
                    && !l.starts_with("end")
            })
            .collect::<Vec<_>>()
            .join(" ");
        !CONTROL_FLOW_MARKERS.iter().any(|kw| flattened.contains(kw))
    })
}
/// Returns `true` when every instance in the group is a same-named method
/// that sits in a trait/interface-implementation context (per
/// `is_trait_impl_function`). Such duplication is expected and filtered out.
fn is_trait_impl_clone_group(group: &CloneGroup, files: &[(String, String)]) -> bool {
    if group.instances.len() < 2 {
        return false;
    }
    let first_name = group.instances[0].function_name.as_deref().unwrap_or("");
    if first_name.is_empty() {
        return false;
    }
    // All instances must carry the same (non-empty) function name...
    let all_same_name = group
        .instances
        .iter()
        .all(|inst| inst.function_name.as_deref().unwrap_or("") == first_name);
    if !all_same_name {
        return false;
    }
    // ...and each one must look like a trait/interface implementation.
    group.instances.iter().all(|inst| {
        match files.iter().find(|(path, _)| *path == inst.file) {
            Some((_, source)) => is_trait_impl_function(source, inst.start_line, first_name),
            // Missing source: cannot confirm, so do not classify as impl.
            None => false,
        }
    })
}
/// Heuristically decides whether the function named `fn_name` starting at
/// 1-based `start_line` implements a trait/interface contract: a Python
/// dunder, a Java `@Override` method, or a method inside a Rust
/// `impl Trait for Type` block.
fn is_trait_impl_function(source: &str, start_line: usize, fn_name: &str) -> bool {
    // Python dunder methods (`__init__`, `__eq__`, ...) always qualify.
    if fn_name.len() > 4 && fn_name.starts_with("__") && fn_name.ends_with("__") {
        return true;
    }
    let lines: Vec<&str> = source.lines().collect();
    let fn_idx = start_line.saturating_sub(1);

    // Scan up to three annotation/comment lines above for Java's @Override.
    for look_back in 1..=3 {
        if fn_idx < look_back {
            continue;
        }
        let above = lines[fn_idx - look_back].trim();
        if above == "@Override" {
            return true;
        }
        // Any other substantive line ends the annotation scan.
        if !above.is_empty() && !above.starts_with('@') && !above.starts_with("//") {
            break;
        }
    }

    // Walk upward looking for an enclosing `impl Trait for Type` header,
    // stopping at an earlier item or once we leave the enclosing braces.
    let mut brace_depth: i32 = 0;
    for i in (0..fn_idx).rev() {
        let trimmed = lines[i].trim();
        if trimmed.starts_with("impl ") && trimmed.contains(" for ") {
            return true;
        }
        if trimmed.starts_with("fn ")
            || trimmed.starts_with("pub fn ")
            || trimmed.starts_with("mod ")
        {
            break;
        }
        for ch in lines[i].chars().rev() {
            match ch {
                '}' => brace_depth += 1,
                '{' => brace_depth -= 1,
                _ => {}
            }
        }
        // Depth below zero means we have escaped the enclosing block.
        if brace_depth < 0 {
            break;
        }
    }
    false
}
/// Keeps a clone group only if at least one instance lives in production
/// (non-test) code; groups confined entirely to tests are considered noise.
fn is_any_instance_in_non_test_context(group: &CloneGroup, files: &[(String, String)]) -> bool {
    group.instances.iter().any(|inst| {
        // Instances inside test files never count as production code.
        if is_test_file(&inst.file) {
            return false;
        }
        match files.iter().find(|(path, _)| *path == inst.file) {
            // Unknown source: be conservative and treat it as production.
            None => true,
            Some((_, source)) => !is_in_test_context(source, inst.start_line),
        }
    })
}
fn is_in_test_context(source: &str, start_line: usize) -> bool {
let lines: Vec<&str> = source.lines().collect();
let fn_idx = start_line.saturating_sub(1);
for look_back in 1..=5 {
if fn_idx < look_back {
break;
}
let above = lines[fn_idx - look_back].trim();
if (above.starts_with('#') || above.starts_with('@'))
&& above.to_lowercase().contains("test")
{
return true;
}
if !above.is_empty()
&& !above.starts_with('#')
&& !above.starts_with('@')
&& !above.starts_with("//")
&& !above.starts_with("///")
&& !above.starts_with("*")
{
break;
}
}
let mut brace_depth: i32 = 0;
for i in (0..fn_idx).rev() {
let trimmed = lines[i].trim();
if (trimmed.starts_with("mod tests")
|| trimmed.starts_with("mod test")
|| trimmed.starts_with("pub mod tests"))
&& brace_depth == 0
{
for j in 1..=3 {
if i >= j {
let attr = lines[i - j].trim();
if attr == "#[cfg(test)]" {
return true;
}
if !attr.is_empty() && !attr.starts_with('#') && !attr.starts_with("//") {
break;
}
}
}
}
for ch in lines[i].chars().rev() {
if ch == '}' {
brace_depth += 1;
}
if ch == '{' {
brace_depth -= 1;
}
}
if brace_depth < 0 {
break;
}
}
false
}
/// Heuristically classifies a path (compared case-insensitively) as a test
/// file using well-known directory names (`tests/`, `test/`, `__tests__/`,
/// `spec/`) and filename conventions (`test_*`, `*_test.*`, `*.spec.*`...).
fn is_test_file(path: &str) -> bool {
    let lowered = path.to_lowercase();
    let segments: Vec<&str> = lowered.split('/').collect();
    if segments
        .iter()
        .any(|seg| matches!(*seg, "tests" | "test" | "__tests__" | "spec"))
    {
        return true;
    }
    // `split` always yields at least one segment, so `last` is present.
    let Some(filename) = segments.last() else {
        return false;
    };
    if filename.starts_with("test_") {
        return true;
    }
    // Check the stem (everything before the last dot) for test suffixes.
    match filename.rsplit_once('.') {
        Some((stem, _)) => {
            stem.ends_with("_test")
                || stem.ends_with("_tests")
                || stem.ends_with(".test")
                || stem.ends_with(".spec")
                || stem.ends_with("_spec")
        }
        None => false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Defaults expose the documented values and enable all clone types.
    #[test]
    fn test_clone_detector_config() {
        let config = CloneConfig::default();
        assert_eq!(config.min_lines, 6);
        assert!(config.detect_type1);
        assert!(config.detect_type2);
        assert!(config.detect_type3);
    }

    // End-to-end smoke test over two near-identical Python sources; only
    // asserts the file count, not specific groups.
    #[test]
    fn test_detect_similar_files() {
        let source_a = "def foo(x):\n y = x + 1\n z = y * 2\n return z\n\ndef bar(a):\n b = a + 1\n c = b * 2\n return c\n";
        let source_b = "def foo(x):\n y = x + 1\n z = y * 2\n return z\n\ndef baz(a):\n b = a + 1\n c = b * 2\n return c\n";
        let detector = CloneDetector::with_defaults();
        let result = detector.detect_in_sources(&[
            ("a.py".to_string(), source_a.to_string()),
            ("b.py".to_string(), source_b.to_string()),
        ]);
        assert_eq!(result.files_analyzed, 2);
    }

    // The regex-based extractor finds both Python function names.
    #[test]
    fn test_extract_functions_from_source() {
        let source = "def foo(x):\n y = x + 1\n z = y * 2\n return z\n\ndef bar(a):\n b = a + 1\n c = b * 2\n return c\n";
        let functions = extract_functions_from_source(source);
        let names: Vec<&str> = functions.iter().map(|(n, _, _, _)| n.as_str()).collect();
        assert!(names.contains(&"foo"));
        assert!(names.contains(&"bar"));
    }

    // Two renamed copies of the same function inside one file are detected
    // once the similarity threshold is loosened.
    #[test]
    fn test_function_level_clones_within_file() {
        let source = r#"
def format_bytes(size):
for unit in ['B', 'KB', 'MB', 'GB']:
if size < 1024:
return f"{size:.1f} {unit}"
size /= 1024
return f"{size:.1f} TB"
def format_file_size(bytes_count):
for unit in ['B', 'KB', 'MB', 'GB']:
if bytes_count < 1024:
return f"{bytes_count:.1f} {unit}"
bytes_count /= 1024
return f"{bytes_count:.1f} TB"
"#;
        let config = CloneConfig {
            similarity_threshold: 0.4,
            ..CloneConfig::default()
        };
        let detector = CloneDetector::new(config);
        let result = detector.detect_in_sources(&[("utils.py".to_string(), source.to_string())]);
        assert!(
            !result.groups.is_empty(),
            "Should detect function-level clones within a single file"
        );
    }

    // Renamed-identifier (Type-2/3 style) clones across two JS files.
    #[test]
    fn test_function_level_clones_across_files() {
        let source_a = r#"
function formatFileSize(bytes) {
if (bytes === 0) return '0 B';
const units = ['B', 'KB', 'MB', 'GB', 'TB'];
const factor = 1024;
let idx = 0;
let size = bytes;
while (size >= factor && idx < units.length - 1) {
size /= factor;
idx++;
}
return size.toFixed(1) + ' ' + units[idx];
}
"#;
        let source_b = r#"
function formatDataSize(numBytes) {
if (numBytes === 0) return '0 B';
const suffixes = ['B', 'KB', 'MB', 'GB', 'TB'];
const base = 1024;
let index = 0;
let value = numBytes;
while (value >= base && index < suffixes.length - 1) {
value /= base;
index++;
}
return value.toFixed(1) + ' ' + suffixes[index];
}
"#;
        let config = CloneConfig {
            similarity_threshold: 0.3,
            ..CloneConfig::default()
        };
        let detector = CloneDetector::new(config);
        let result = detector.detect_in_sources(&[
            ("a.js".to_string(), source_a.to_string()),
            ("b.js".to_string(), source_b.to_string()),
        ]);
        assert!(
            !result.groups.is_empty(),
            "Should detect function-level clones across files"
        );
    }

    // Cross-language detection defaults to enabled.
    #[test]
    fn test_clone_config_has_cross_language_field() {
        let config = CloneConfig::default();
        assert!(
            config.detect_cross_language,
            "cross-language detection should be enabled by default"
        );
    }

    // With both cross-language and type-3 phases disabled, a Python/JS pair
    // produces no groups at all.
    #[test]
    fn test_cross_language_disabled_skips_phase3() {
        let source_a = "def foo(x):\n y = x + 1\n z = y * 2\n return z\n";
        let source_b =
            "function foo(x) {\n let y = x + 1;\n let z = y * 2;\n return z;\n}\n";
        let config = CloneConfig {
            detect_cross_language: false,
            detect_type3: false,
            ..CloneConfig::default()
        };
        let detector = CloneDetector::new(config);
        let result = detector.detect_in_sources(&[
            ("a.py".to_string(), source_a.to_string()),
            ("b.js".to_string(), source_b.to_string()),
        ]);
        assert!(
            result.groups.is_empty(),
            "Disabling cross-language should skip cross-language detection"
        );
    }

    // Clustering never produces more classes than there are groups.
    #[test]
    fn test_detect_and_cluster_returns_classes() {
        let source_a = "def foo(x):\n y = x + 1\n z = y * 2\n return z\n\ndef bar(a):\n b = a + 1\n c = b * 2\n return c\n";
        let source_b = "def foo(x):\n y = x + 1\n z = y * 2\n return z\n\ndef baz(a):\n b = a + 1\n c = b * 2\n return c\n";
        let config = CloneConfig {
            similarity_threshold: 0.4,
            detect_cross_language: false,
            ..CloneConfig::default()
        };
        let detector = CloneDetector::new(config);
        let (result, classes) = detector.detect_and_cluster(&[
            ("a.py".to_string(), source_a.to_string()),
            ("b.py".to_string(), source_b.to_string()),
        ]);
        assert_eq!(result.files_analyzed, 2);
        assert!(classes.len() <= result.groups.len() || result.groups.is_empty());
    }
}