impl SimilarityDetector {
/// Builds a detector from `config`.
///
/// The winnowing component is constructed from `config.window_size` and
/// `config.k_gram_size`; the token analyzer and entropy calculator are created
/// through their own `new()` constructors. The configuration itself is stored
/// on the detector.
#[must_use]
pub fn new(config: SimilarityConfig) -> Self {
    // Read the sizing fields before `config` is moved into the struct.
    let winnower = Winnowing::new(config.window_size, config.k_gram_size);
    let token_analyzer = TokenAnalyzer::new();
    let entropy_calculator = EntropyCalculator::new();
    Self {
        winnower,
        token_analyzer,
        entropy_calculator,
        config,
    }
}
/// Finds blocks that are identical once whitespace is collapsed.
///
/// Every candidate block of each `(path, content)` pair is whitespace-normalized
/// and hashed; blocks sharing a hash are grouped and reported as
/// `CloneType::Type1` via `build_duplicate_blocks`.
#[must_use]
pub fn detect_exact_duplicates(&self, files: &[(PathBuf, String)]) -> Vec<SimilarBlock> {
    let mut buckets: HashMap<u64, Vec<(PathBuf, usize, usize, String)>> = HashMap::new();
    for (path, content) in files {
        for block in self.extract_code_blocks(content, self.config.min_lines) {
            let key = self.hash_content(&self.normalize_whitespace(&block.content));
            // Keep the original (un-normalized) text so previews show real code.
            let occurrence = (path.clone(), block.start_line, block.end_line, block.content);
            buckets.entry(key).or_default().push(occurrence);
        }
    }
    self.build_duplicate_blocks(buckets, CloneType::Type1)
}
/// Finds blocks that look alike after identifiers are replaced by placeholders.
///
/// Each candidate block is passed through `normalize_identifiers`, then all
/// blocks are compared pairwise by `find_similar_blocks`; pairs scoring at
/// least `threshold` are reported as `CloneType::Type2`.
#[must_use]
pub fn detect_structural_similarity(
    &self,
    files: &[(PathBuf, String)],
    threshold: f64,
) -> Vec<SimilarBlock> {
    let mut candidates = Vec::new();
    for (path, content) in files {
        let blocks = self.extract_code_blocks(content, self.config.min_lines);
        candidates.extend(blocks.into_iter().map(|block| {
            let normalized = self.normalize_identifiers(&block.content);
            (path.clone(), block, normalized)
        }));
    }
    self.find_similar_blocks(candidates, threshold, CloneType::Type2)
}
/// Finds blocks whose token vectors are close to each other.
///
/// Each candidate block is tokenized and converted to a vector by the token
/// analyzer; `find_semantic_matches` then compares all vectors pairwise and
/// reports pairs scoring at least `threshold` as `CloneType::Type4`.
#[must_use]
pub fn detect_semantic_similarity(
    &self,
    files: &[(PathBuf, String)],
    threshold: f64,
) -> Vec<SimilarBlock> {
    let analyzer = &self.token_analyzer;
    let mut candidates = Vec::new();
    for (path, content) in files {
        let blocks = self.extract_code_blocks(content, self.config.min_lines);
        candidates.extend(blocks.into_iter().map(|block| {
            let tokens = analyzer.tokenize(&block.content);
            let vector = analyzer.to_vector(&tokens);
            (path.clone(), block, vector)
        }));
    }
    self.find_semantic_matches(candidates, threshold, CloneType::Type4)
}
/// Computes an entropy report over every candidate block in `files`.
///
/// Blocks with entropy above `4.0` are listed as "Complex", blocks below `2.0`
/// as "Repetitive"; everything in between only contributes to the average.
/// The average is `0.0` when no block was extracted.
#[must_use]
pub fn analyze_entropy(&self, files: &[(PathBuf, String)]) -> EntropyReport {
    let mut samples = Vec::new();
    let mut high_entropy_blocks = Vec::new();
    let mut low_entropy_patterns = Vec::new();

    for (path, content) in files {
        for block in self.extract_code_blocks(content, self.config.min_lines) {
            let entropy = self.calculate_entropy(&block.content);
            samples.push(entropy);

            // Pick the destination list and its labels; mid-range blocks are skipped.
            let (bucket, category, suggestion) = if entropy > 4.0 {
                (
                    &mut high_entropy_blocks,
                    "Complex",
                    "Consider breaking down this complex code",
                )
            } else if entropy < 2.0 {
                (
                    &mut low_entropy_patterns,
                    "Repetitive",
                    "Extract repeated pattern into reusable function",
                )
            } else {
                continue;
            };

            bucket.push(EntropyBlock {
                location: Location {
                    file: path.clone(),
                    start_line: block.start_line,
                    end_line: block.end_line,
                    start_column: None,
                    end_column: None,
                },
                entropy,
                category: category.to_string(),
                suggestion: suggestion.to_string(),
            });
        }
    }

    let average_entropy = match samples.len() {
        0 => 0.0,
        count => samples.iter().sum::<f64>() / count as f64,
    };
    let recommendations =
        self.generate_recommendations(&high_entropy_blocks, &low_entropy_patterns);

    EntropyReport {
        average_entropy,
        high_entropy_blocks,
        low_entropy_patterns,
        recommendations,
    }
}
/// Turns similarity findings into refactoring hints.
///
/// Structural matches (threshold `0.8`) become `Priority::High` hints and
/// semantic matches (threshold `0.7`) become `Priority::Medium` hints, in that
/// order.
#[must_use]
pub fn find_refactoring_opportunities(
    &self,
    files: &[(PathBuf, String)],
) -> Vec<RefactoringHint> {
    // `find_similar_blocks` reports matches pairwise, so every result carries
    // exactly two locations. The previous `> 2` test could therefore never
    // succeed and structural hints were silently dropped; a pair is already a
    // refactoring candidate.
    let mut hints: Vec<RefactoringHint> = self
        .detect_structural_similarity(files, 0.8)
        .into_iter()
        .filter(|similar| similar.locations.len() >= 2)
        .map(|similar| RefactoringHint {
            locations: similar.locations,
            pattern: "Repeated code structure".to_string(),
            suggestion: "Extract common pattern into shared function".to_string(),
            priority: Priority::High,
        })
        .collect();

    hints.extend(
        self.detect_semantic_similarity(files, 0.7)
            .into_iter()
            .map(|similar| RefactoringHint {
                locations: similar.locations,
                pattern: "Semantically equivalent code".to_string(),
                suggestion: "Consolidate implementations".to_string(),
                priority: Priority::Medium,
            }),
    );
    hints
}
/// Runs every analysis and bundles the results into one report.
///
/// Structural and semantic detection use `config.similarity_threshold`.
/// Entropy analysis only runs when `config.enable_entropy` is set; otherwise
/// `entropy_analysis` is `None` and the reported average entropy is `0.0`.
/// `total_clones` is the sum of the exact, structural and semantic counts.
#[must_use]
pub fn comprehensive_analysis(&self, files: &[(PathBuf, String)]) -> ComprehensiveReport {
    let threshold = self.config.similarity_threshold;

    let exact_duplicates = self.detect_exact_duplicates(files);
    let structural_similarities = self.detect_structural_similarity(files, threshold);
    let semantic_similarities = self.detect_semantic_similarity(files, threshold);
    let entropy_analysis = self
        .config
        .enable_entropy
        .then(|| self.analyze_entropy(files));
    let refactoring_opportunities = self.find_refactoring_opportunities(files);

    // Derive the summary numbers before the vectors are moved into the report.
    let metrics = Metrics {
        duplication_percentage: self.calculate_duplication_percentage(files, &exact_duplicates),
        average_entropy: entropy_analysis
            .as_ref()
            .map_or(0.0, |report| report.average_entropy),
        total_clones: exact_duplicates.len()
            + structural_similarities.len()
            + semantic_similarities.len(),
    };

    ComprehensiveReport {
        exact_duplicates,
        structural_similarities,
        semantic_similarities,
        entropy_analysis,
        refactoring_opportunities,
        metrics,
    }
}
/// Returns the entropy of `text` as computed by the detector's entropy
/// calculator.
#[must_use]
pub fn calculate_entropy(&self, text: &str) -> f64 {
    let calculator = &self.entropy_calculator;
    calculator.calculate(text)
}
/// Slides a window of `min_lines` consecutive lines over `content` and returns
/// every window that contains at least `config.min_tokens` whitespace-separated
/// tokens.
///
/// `start_line` / `end_line` are 1-based and inclusive. An empty vector is
/// returned when `min_lines` is zero or when `content` has fewer than
/// `min_lines` lines.
fn extract_code_blocks(&self, content: &str, min_lines: usize) -> Vec<CodeBlock> {
    // A zero-sized window is meaningless; the previous `min_lines - 1` would
    // underflow here (a panic in debug builds), and `windows(0)` panics too.
    if min_lines == 0 {
        return Vec::new();
    }

    let lines: Vec<&str> = content.lines().collect();
    lines
        .windows(min_lines)
        .enumerate()
        .filter_map(|(offset, window)| {
            let block_content = window.join("\n");
            (self.count_tokens(&block_content) >= self.config.min_tokens).then(|| CodeBlock {
                start_line: offset + 1,
                end_line: offset + min_lines,
                content: block_content,
            })
        })
        .collect()
}
/// Collapses every run of whitespace in `text` to a single space and drops
/// leading and trailing whitespace.
fn normalize_whitespace(&self, text: &str) -> String {
    let mut words = text.split_whitespace();
    let mut collapsed = String::with_capacity(text.len());
    if let Some(first) = words.next() {
        collapsed.push_str(first);
        for word in words {
            collapsed.push(' ');
            collapsed.push_str(word);
        }
    }
    collapsed
}
fn normalize_identifiers(&self, text: &str) -> String {
let mut result = text.to_string();
let ident_pattern =
regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b").expect("internal error");
let mut counter = 0;
for mat in ident_pattern.find_iter(text) {
if !self.is_keyword(mat.as_str()) {
counter += 1;
result = result.replace(mat.as_str(), &format!("VAR{counter}"));
}
}
result
}
/// Hashes `content` with the standard library's `DefaultHasher` and returns
/// the 64-bit digest.
fn hash_content(&self, content: &str) -> u64 {
    use std::hash::{Hash, Hasher};
    let mut state = std::collections::hash_map::DefaultHasher::new();
    Hash::hash(content, &mut state);
    state.finish()
}
/// Counts the whitespace-separated tokens in `text`.
fn count_tokens(&self, text: &str) -> usize {
    let tokens = text.split_whitespace();
    tokens.count()
}
/// Returns `true` when `word` is a Rust keyword that identifier normalization
/// must leave untouched.
///
/// The list covers the Rust strict keywords (including `async`, `await` and
/// `dyn`). The earlier list omitted many of them (for example `in`, `const`,
/// `break`, `continue`, `self`, `true`), which caused those words to be
/// renamed to placeholders as if they were user identifiers.
fn is_keyword(&self, word: &str) -> bool {
    matches!(
        word,
        "as" | "async"
            | "await"
            | "break"
            | "const"
            | "continue"
            | "crate"
            | "dyn"
            | "else"
            | "enum"
            | "extern"
            | "false"
            | "fn"
            | "for"
            | "if"
            | "impl"
            | "in"
            | "let"
            | "loop"
            | "match"
            | "mod"
            | "move"
            | "mut"
            | "pub"
            | "ref"
            | "return"
            | "self"
            | "Self"
            | "static"
            | "struct"
            | "super"
            | "trait"
            | "true"
            | "type"
            | "unsafe"
            | "use"
            | "where"
            | "while"
    )
}
/// Converts hash buckets into `SimilarBlock`s.
///
/// Only buckets holding more than one occurrence are reported. The first
/// occurrence supplies the line count, token count and a three-line preview;
/// `similarity` is always `1.0` and the id is the bucket hash in lowercase hex.
/// Results follow the map's iteration order.
fn build_duplicate_blocks(
    &self,
    hash_map: HashMap<u64, Vec<(PathBuf, usize, usize, String)>>,
    clone_type: CloneType,
) -> Vec<SimilarBlock> {
    hash_map
        .into_iter()
        .filter(|(_, occurrences)| occurrences.len() > 1)
        .map(|(hash, occurrences)| {
            let sample = &occurrences[0].3;
            let locations = occurrences
                .iter()
                .map(|(path, start, end, _)| Location {
                    file: path.clone(),
                    start_line: *start,
                    end_line: *end,
                    start_column: None,
                    end_column: None,
                })
                .collect();
            SimilarBlock {
                id: format!("{hash:x}"),
                locations,
                similarity: 1.0,
                clone_type,
                lines: sample.lines().count(),
                tokens: self.count_tokens(sample),
                content_preview: sample.lines().take(3).collect::<Vec<_>>().join("\n"),
            }
        })
        .collect()
}
/// Compares every unordered pair of normalized blocks and reports the pairs
/// whose similarity is at least `threshold`.
///
/// Each result holds the two locations of the pair; line count, token count
/// and preview are taken from the first block of the pair. Ids are
/// `sim_<index>` in discovery order.
fn find_similar_blocks(
    &self,
    normalized: Vec<(PathBuf, CodeBlock, String)>,
    threshold: f64,
    clone_type: CloneType,
) -> Vec<SimilarBlock> {
    let locate = |path: &PathBuf, block: &CodeBlock| Location {
        file: path.clone(),
        start_line: block.start_line,
        end_line: block.end_line,
        start_column: None,
        end_column: None,
    };

    let mut similar = Vec::new();
    for (idx, (left_path, left_block, left_text)) in normalized.iter().enumerate() {
        // Only look at later entries so each pair is visited once.
        for (right_path, right_block, right_text) in &normalized[idx + 1..] {
            let sim = self.calculate_similarity(left_text, right_text);
            if sim >= threshold {
                let id = format!("sim_{}", similar.len());
                let preview_lines: Vec<&str> = left_block.content.lines().take(3).collect();
                similar.push(SimilarBlock {
                    id,
                    locations: vec![
                        locate(left_path, left_block),
                        locate(right_path, right_block),
                    ],
                    similarity: sim,
                    clone_type,
                    lines: left_block.content.lines().count(),
                    tokens: self.count_tokens(&left_block.content),
                    content_preview: preview_lines.join("\n"),
                });
            }
        }
    }
    similar
}
/// Compares every unordered pair of token vectors with the token analyzer's
/// cosine similarity and reports the pairs scoring at least `threshold`.
///
/// Each result holds the two locations of the pair; line count, token count
/// and preview are taken from the first block of the pair. Ids are
/// `sem_<index>` in discovery order.
fn find_semantic_matches(
    &self,
    vectors: Vec<(PathBuf, CodeBlock, TokenVector)>,
    threshold: f64,
    clone_type: CloneType,
) -> Vec<SimilarBlock> {
    let locate = |path: &PathBuf, block: &CodeBlock| Location {
        file: path.clone(),
        start_line: block.start_line,
        end_line: block.end_line,
        start_column: None,
        end_column: None,
    };

    let mut matches = Vec::new();
    for (idx, (left_path, left_block, left_vector)) in vectors.iter().enumerate() {
        // Only look at later entries so each pair is visited once.
        for (right_path, right_block, right_vector) in &vectors[idx + 1..] {
            let sim = self
                .token_analyzer
                .cosine_similarity(left_vector, right_vector);
            if sim >= threshold {
                let id = format!("sem_{}", matches.len());
                let preview_lines: Vec<&str> = left_block.content.lines().take(3).collect();
                matches.push(SimilarBlock {
                    id,
                    locations: vec![
                        locate(left_path, left_block),
                        locate(right_path, right_block),
                    ],
                    similarity: sim,
                    clone_type,
                    lines: left_block.content.lines().count(),
                    tokens: self.count_tokens(&left_block.content),
                    content_preview: preview_lines.join("\n"),
                });
            }
        }
    }
    matches
}
/// Scores two texts with `aprender`'s edit-distance similarity, falling back
/// to `0.0` when that call yields no value.
fn calculate_similarity(&self, text1: &str, text2: &str) -> f64 {
    let outcome = aprender::text::similarity::edit_distance_similarity(text1, text2);
    outcome.unwrap_or(0.0)
}
/// Returns the share of lines in `files` that are covered by at least one
/// duplicate location, as a percentage in `0.0..=100.0`.
///
/// Returns `0.0` when `files` contain no lines at all.
fn calculate_duplication_percentage(
    &self,
    files: &[(PathBuf, String)],
    duplicates: &[SimilarBlock],
) -> f64 {
    let total_lines: usize = files
        .iter()
        .map(|(_, content)| content.lines().count())
        .sum();
    if total_lines == 0 {
        return 0.0;
    }

    // Candidate blocks come from a sliding window, so neighbouring duplicates
    // overlap heavily. Summing `lines * locations.len()` counted the same
    // source line many times and could report well over 100%; instead each
    // (file, line) pair is counted once.
    let mut duplicated_lines = std::collections::HashSet::new();
    for location in duplicates.iter().flat_map(|dup| dup.locations.iter()) {
        for line in location.start_line..=location.end_line {
            duplicated_lines.insert((&location.file, line));
        }
    }

    let percentage = (duplicated_lines.len() as f64 / total_lines as f64) * 100.0;
    // Guard against locations that point outside the supplied files.
    percentage.min(100.0)
}
/// Produces human-readable recommendations from the entropy findings.
///
/// One message is emitted per non-empty category, plus an extra utility-function
/// suggestion when more than five repetitive patterns were found.
fn generate_recommendations(
    &self,
    high_entropy: &[EntropyBlock],
    low_entropy: &[EntropyBlock],
) -> Vec<String> {
    let complex_count = high_entropy.len();
    let repetitive_count = low_entropy.len();

    let mut notes = Vec::with_capacity(3);
    if complex_count > 0 {
        notes.push(format!(
            "Found {} complex code blocks that should be simplified",
            complex_count
        ));
    }
    if repetitive_count > 0 {
        notes.push(format!(
            "Found {} repetitive patterns that could be extracted",
            repetitive_count
        ));
    }
    if repetitive_count > 5 {
        notes.push(String::from(
            "Consider creating utility functions for common patterns",
        ));
    }
    notes
}
}