use super::types::{CloneType, CodeBlock};
use std::collections::HashSet;
/// Similarity scores for a pair of code blocks, as assembled by
/// `calculate_similarity`.
#[derive(Debug, Clone)]
pub struct SimilarityMetrics {
/// Result of `token_similarity` on the two blocks' token lists.
pub token_similarity: f64,
/// Result of `structural_similarity` on the two blocks' normalized text.
pub structural_similarity: f64,
/// Combined score; how the two scores above are weighted depends on
/// `clone_type`.
pub overall_similarity: f64,
/// Clone category assigned to the pair.
pub clone_type: CloneType,
}
/// Compares two code blocks and reports how similar they are.
///
/// The token score and the structural score are computed independently, the
/// pair is classified into a [`CloneType`], and that classification selects
/// the weighting used for the overall score:
///
/// * `Type1` — fixed at `1.0`
/// * `Type2` — 30% token score, 70% structural score
/// * `Type3` — equal mix of both scores
/// * `Type4` — 80% of the token score only
pub fn calculate_similarity(block1: &CodeBlock, block2: &CodeBlock) -> SimilarityMetrics {
    let token_score = token_similarity(&block1.tokens, &block2.tokens);
    let structure_score = structural_similarity(&block1.normalized, &block2.normalized);

    let kind = determine_clone_type(
        &block1.content,
        &block2.content,
        &block1.normalized,
        &block2.normalized,
        block1.hash,
        block2.hash,
        block1.normalized_hash,
        block2.normalized_hash,
    );

    // The weighting of the two scores is chosen per clone category.
    let combined = match kind {
        CloneType::Type1 => 1.0,
        CloneType::Type2 => 0.3 * token_score + 0.7 * structure_score,
        CloneType::Type3 => 0.5 * token_score + 0.5 * structure_score,
        CloneType::Type4 => 0.8 * token_score,
    };

    SimilarityMetrics {
        token_similarity: token_score,
        structural_similarity: structure_score,
        overall_similarity: combined,
        clone_type: kind,
    }
}
/// Jaccard index of the two token lists, treating each list as a set.
///
/// Duplicate tokens within a list are ignored. Returns `1.0` when both lists
/// are empty, `0.0` when exactly one is empty, and otherwise
/// `|A ∩ B| / |A ∪ B|` over the distinct tokens.
pub fn token_similarity(tokens1: &[String], tokens2: &[String]) -> f64 {
    match (tokens1.is_empty(), tokens2.is_empty()) {
        (true, true) => return 1.0,
        (true, false) | (false, true) => return 0.0,
        (false, false) => {}
    }

    let left: HashSet<&String> = tokens1.iter().collect();
    let right: HashSet<&String> = tokens2.iter().collect();

    let shared = left.iter().filter(|token| right.contains(*token)).count();
    // Inclusion–exclusion; both sets are non-empty here, so this is >= 1.
    let combined = left.len() + right.len() - shared;

    shared as f64 / combined as f64
}
/// Normalized edit-distance similarity between two normalized source strings.
///
/// Returns `1.0` for identical inputs (including two empty strings).
/// Otherwise returns `1.0 - distance / max_len`, where `distance` is the
/// Levenshtein distance and `max_len` is the length of the longer input.
/// Both quantities are measured in `char`s, so the result lies in
/// `[0.0, 1.0]` for non-ASCII text as well.
pub fn structural_similarity(normalized1: &str, normalized2: &str) -> f64 {
    if normalized1 == normalized2 {
        return 1.0;
    }

    // The distance is counted in chars, so the divisor must be too:
    // `str::len()` is a byte count and would overstate the similarity of
    // text containing multi-byte characters.
    let max_len = normalized1
        .chars()
        .count()
        .max(normalized2.chars().count());

    // The inputs differ, so at least one is non-empty and `max_len >= 1`.
    let distance = levenshtein_distance(normalized1, normalized2);
    1.0 - (distance as f64 / max_len as f64)
}
/// Levenshtein (edit) distance between `s1` and `s2`, counted in `char`s.
///
/// This is the minimum number of single-character insertions, deletions and
/// substitutions that turn one string into the other.
///
/// Only two rows of the dynamic-programming table are kept, sized by the
/// shorter input, so memory use is proportional to the shorter string
/// rather than to the product of both lengths.
fn levenshtein_distance(s1: &str, s2: &str) -> usize {
    let chars1: Vec<char> = s1.chars().collect();
    let chars2: Vec<char> = s2.chars().collect();

    if chars1.is_empty() {
        return chars2.len();
    }
    if chars2.is_empty() {
        return chars1.len();
    }

    // The distance is symmetric, so lay the shorter string along the row.
    let (outer, inner) = if chars1.len() >= chars2.len() {
        (&chars1, &chars2)
    } else {
        (&chars2, &chars1)
    };

    // `prev[j]` is the distance from the empty prefix of `outer` to the
    // first `j` chars of `inner`.
    let mut prev: Vec<usize> = (0..=inner.len()).collect();
    let mut curr: Vec<usize> = vec![0; inner.len() + 1];

    for (i, &a) in outer.iter().enumerate() {
        // Deleting the first `i + 1` chars of `outer` reaches the empty string.
        curr[0] = i + 1;
        for (j, &b) in inner.iter().enumerate() {
            let substitution = prev[j] + usize::from(a != b);
            let deletion = prev[j + 1] + 1;
            let insertion = curr[j] + 1;
            curr[j + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut prev, &mut curr);
    }

    // After the final swap, `prev` holds the last completed row.
    prev[inner.len()]
}
/// Classifies a pair of blocks into a [`CloneType`].
///
/// Rules are tried in order and the first match wins:
///
/// 1. equal normalized text → `Type1`
/// 2. equal normalized hashes → `Type2`
/// 3. structural similarity above `0.7` → `Type3`
/// 4. anything else → `Type4`
///
/// The raw contents and raw hashes are accepted but not consulted.
///
/// NOTE(review): rule 2 is only reached when the normalized strings differ
/// yet their hashes are equal. Whether `Type1` was meant to be decided on the
/// raw content/hash instead cannot be told from this function; confirm
/// against the definition of `CloneType`.
fn determine_clone_type(
    _content1: &str,
    _content2: &str,
    normalized1: &str,
    normalized2: &str,
    _hash1: u64,
    _hash2: u64,
    norm_hash1: u64,
    norm_hash2: u64,
) -> CloneType {
    if normalized1 == normalized2 {
        CloneType::Type1
    } else if norm_hash1 == norm_hash2 {
        CloneType::Type2
    } else if structural_similarity(normalized1, normalized2) > 0.7 {
        // The edit-distance comparison runs only once the cheaper checks fail.
        CloneType::Type3
    } else {
        CloneType::Type4
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used when comparing floating-point scores.
    const EPSILON: f64 = 0.001;

    /// Builds an owned token list from string literals.
    fn owned(words: &[&str]) -> Vec<String> {
        words.iter().map(|w| w.to_string()).collect()
    }

    /// Identical token lists score 1.0.
    #[test]
    fn test_token_similarity() {
        let left = owned(&["hello", "world"]);
        let right = owned(&["hello", "world"]);
        let score = token_similarity(&left, &right);
        assert!((score - 1.0).abs() < EPSILON);
    }

    /// Partially overlapping token lists score strictly between 0 and 1.
    #[test]
    fn test_token_similarity_partial() {
        let left = owned(&["hello", "world"]);
        let right = owned(&["hello", "there"]);
        let score = token_similarity(&left, &right);
        assert!(score > 0.0 && score < 1.0);
    }

    /// Identical strings are structurally identical.
    #[test]
    fn test_structural_similarity_identical() {
        let score = structural_similarity("hello world", "hello world");
        assert!((score - 1.0).abs() < EPSILON);
    }

    /// Differing strings score below 1.0.
    #[test]
    fn test_structural_similarity_different() {
        let score = structural_similarity("hello world", "goodbye world");
        assert!(score < 1.0);
    }

    /// Spot checks of the edit distance, including empty inputs.
    #[test]
    fn test_levenshtein_distance() {
        let cases = [
            ("", "", 0),
            ("hello", "hello", 0),
            ("hello", "hallo", 1),
            ("hello", "", 5),
        ];
        for (a, b, expected) in cases {
            assert_eq!(levenshtein_distance(a, b), expected);
        }
    }
}