use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use crate::analysis::clones::{normalize_tokens, NormalizationMode, NormalizedToken};
/// Full result of comparing two code fragments for similarity.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimilarityReport {
    /// First fragment of the comparison pair.
    pub fragment1: SimilarityFragment,
    /// Second fragment of the comparison pair.
    pub fragment2: SimilarityFragment,
    /// Computed similarity scores plus a human-readable interpretation.
    pub similarity: SimilarityScores,
    /// Shared / unique token counts backing the scores.
    pub token_breakdown: TokenBreakdown,
    /// Configuration the comparison was run with.
    pub config: SimilarityConfig,
}
impl SimilarityReport {
    /// Assembles a report from its already-computed parts.
    pub fn new(
        fragment1: SimilarityFragment,
        fragment2: SimilarityFragment,
        similarity: SimilarityScores,
        token_breakdown: TokenBreakdown,
        config: SimilarityConfig,
    ) -> Self {
        Self {
            config,
            token_breakdown,
            similarity,
            fragment2,
            fragment1,
        }
    }
}
/// One side of a similarity comparison: a file, optionally narrowed to a
/// function or line range.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimilarityFragment {
    pub file: PathBuf,
    /// Function name when comparing at function granularity.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub function: Option<String>,
    /// Inclusive (start, end) line span when narrowed to a range.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub line_range: Option<(usize, usize)>,
    /// Number of normalized tokens in the fragment.
    pub tokens: usize,
    /// Number of source lines in the fragment.
    pub lines: usize,
}
impl SimilarityFragment {
    /// Fragment covering a whole file, with no function or line-range focus.
    pub fn new(file: PathBuf, tokens: usize, lines: usize) -> Self {
        Self {
            file,
            tokens,
            lines,
            function: None,
            line_range: None,
        }
    }
    /// Builder-style setter recording the function name.
    pub fn with_function(mut self, function: String) -> Self {
        self.function = Some(function);
        self
    }
    /// Builder-style setter recording an inclusive line range.
    pub fn with_line_range(mut self, start: usize, end: usize) -> Self {
        self.line_range = Some((start, end));
        self
    }
}
/// Similarity scores with a human-readable interpretation string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimilarityScores {
    pub dice: f64,
    pub jaccard: f64,
    /// Present only when the cosine metric was requested.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cosine: Option<f64>,
    /// Text bucket derived from the dice score (see `interpret_similarity_score`).
    pub interpretation: String,
}
impl SimilarityScores {
    /// Creates scores from dice and jaccard values; the interpretation
    /// string is derived from the dice score.
    pub fn new(dice: f64, jaccard: f64) -> Self {
        Self {
            interpretation: interpret_similarity_score(dice),
            dice,
            jaccard,
            cosine: None,
        }
    }
    /// Builder-style setter attaching a cosine score.
    pub fn with_cosine(mut self, cosine: f64) -> Self {
        self.cosine = Some(cosine);
        self
    }
}
impl Default for SimilarityScores {
fn default() -> Self {
Self::new(0.0, 0.0)
}
}
/// Multiset token counts: intersection size and per-side differences.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct TokenBreakdown {
    pub shared_tokens: usize,
    pub unique_to_fragment1: usize,
    pub unique_to_fragment2: usize,
    /// Multiset union size: shared + unique1 + unique2.
    pub total_unique: usize,
}
impl TokenBreakdown {
    /// Builds a breakdown; `total_unique` is derived as the multiset union
    /// size (shared plus both per-side unique counts).
    pub fn new(shared: usize, unique1: usize, unique2: usize) -> Self {
        let total_unique = shared + unique1 + unique2;
        Self {
            shared_tokens: shared,
            unique_to_fragment1: unique1,
            unique_to_fragment2: unique2,
            total_unique,
        }
    }
}
/// Configuration echoed back into similarity reports.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimilarityConfig {
    pub metric: SimilarityMetric,
    /// N-gram size used for tokenization.
    pub ngram_size: usize,
    /// Language the sources were normalized as, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
}
impl Default for SimilarityConfig {
    /// Defaults: every metric, unigram comparison, language auto-detected.
    fn default() -> Self {
        Self {
            metric: SimilarityMetric::default(),
            ngram_size: 1,
            language: None,
        }
    }
}
/// Which similarity metric(s) to compute; serialized in lowercase.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "lowercase")]
pub enum SimilarityMetric {
    Dice,
    Jaccard,
    Cosine,
    /// Compute dice, jaccard, and cosine together (the default).
    #[default]
    All,
}
impl SimilarityMetric {
pub fn as_str(&self) -> &'static str {
match self {
SimilarityMetric::Dice => "dice",
SimilarityMetric::Jaccard => "jaccard",
SimilarityMetric::Cosine => "cosine",
SimilarityMetric::All => "all",
}
}
}
/// Caller-supplied options controlling how similarity is computed.
#[derive(Debug, Clone)]
pub struct SimilarityOptions {
    /// Which metric(s) to compute.
    pub metric: SimilarityMetric,
    /// N-gram window size used for comparison (1 = single tokens).
    pub ngram_size: usize,
    /// Explicit language override; when `None`, detected from file extensions.
    pub language: Option<String>,
    /// Optional comparison granularity (file / function / block).
    pub level: Option<ComparisonLevel>,
}
impl SimilarityOptions {
    /// Creates options with the documented defaults: all metrics, unigrams.
    pub fn new() -> Self {
        Self {
            metric: SimilarityMetric::All,
            ngram_size: 1,
            language: None,
            level: None,
        }
    }
}
// BUG FIX: `#[derive(Default)]` produced `ngram_size: 0`, disagreeing with
// `new()` (which uses 1). Implement `Default` manually so the two
// constructors always agree.
impl Default for SimilarityOptions {
    fn default() -> Self {
        Self::new()
    }
}
/// Granularity at which two targets are compared.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ComparisonLevel {
    File,
    Function,
    Block,
}
/// All-pairs similarity results across a set of files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PairwiseSimilarityReport {
    /// One entry per compared file pair.
    pub pairs: Vec<PairwiseSimilarityEntry>,
    /// Number of files that took part in the pairwise comparison.
    pub files_analyzed: usize,
    pub config: SimilarityConfig,
}
/// Dice/jaccard scores for one file pair.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PairwiseSimilarityEntry {
    pub file1: PathBuf,
    pub file2: PathBuf,
    pub dice: f64,
    pub jaccard: f64,
}
/// A comparison target parsed from CLI-style syntax: `file`,
/// `file::function`, or `file:start:end` (see `parse_target`).
#[derive(Debug, Clone)]
pub struct ParsedTarget {
    pub file: PathBuf,
    /// Set for the `file::function` form.
    pub function: Option<String>,
    /// Set for the `file:start:end` form; inclusive (start, end).
    pub line_range: Option<(usize, usize)>,
}
impl ParsedTarget {
    /// Target referring to a whole file.
    pub fn file_only(file: PathBuf) -> Self {
        Self {
            file,
            function: None,
            line_range: None,
        }
    }
    /// Target referring to a named function inside `file`.
    pub fn with_function(file: PathBuf, function: String) -> Self {
        Self {
            function: Some(function),
            ..Self::file_only(file)
        }
    }
    /// Target referring to an inclusive line range inside `file`.
    pub fn with_line_range(file: PathBuf, start: usize, end: usize) -> Self {
        Self {
            line_range: Some((start, end)),
            ..Self::file_only(file)
        }
    }
}
/// Maps a similarity score onto a human-readable clone-likelihood bucket.
///
/// NaN (all comparisons false) and scores below 0.30 fall through to the
/// lowest bucket.
pub fn interpret_similarity_score(score: f64) -> String {
    let label = if score >= 0.95 {
        "Near-identical (Type-1/2 clone)"
    } else if score >= 0.85 {
        "High similarity (likely derived from same source)"
    } else if score >= 0.70 {
        "Moderate similarity (possible Type-3 clone)"
    } else if score >= 0.50 {
        "Some similarity (shared patterns/idioms)"
    } else if score >= 0.30 {
        "Low similarity"
    } else {
        "Very different (minimal shared code)"
    };
    label.to_string()
}
/// Sørensen–Dice coefficient from pre-computed multiset sizes.
///
/// `shared` is the multiset-intersection size; `total_a` / `total_b` are the
/// two multiset sizes. Two empty multisets are defined as identical (1.0).
pub fn dice_coefficient(shared: usize, total_a: usize, total_b: usize) -> f64 {
    if total_a == 0 && total_b == 0 {
        return 1.0;
    }
    // Removed the unreachable `total == 0` guard: total_a + total_b can only
    // be zero when both inputs are zero, which the branch above handles.
    (2.0 * shared as f64) / (total_a + total_b) as f64
}
/// Jaccard coefficient from pre-computed multiset sizes.
///
/// `shared` is the multiset-intersection size; the union size is derived as
/// `total_a + total_b - shared`. Two empty multisets are defined as
/// identical (1.0).
pub fn jaccard_coefficient(shared: usize, total_a: usize, total_b: usize) -> f64 {
    if total_a == 0 && total_b == 0 {
        return 1.0;
    }
    // BUG FIX: the plain subtraction underflowed (panicking in debug builds)
    // when a caller passed shared > total_a + total_b; saturate instead.
    let union = (total_a + total_b).saturating_sub(shared);
    if union == 0 {
        return 0.0;
    }
    shared as f64 / union as f64
}
/// Converts a Dice coefficient into the equivalent Jaccard value
/// (`J = D / (2 - D)`), clamping out-of-range inputs at or above 2.0.
pub fn dice_to_jaccard(dice: f64) -> f64 {
    match dice {
        d if d >= 2.0 => 1.0,
        d => d / (2.0 - d),
    }
}
/// Converts a Jaccard coefficient into the equivalent Dice value
/// (`D = 2J / (1 + J)`).
pub fn jaccard_to_dice(jaccard: f64) -> f64 {
    let doubled = 2.0 * jaccard;
    doubled / (jaccard + 1.0)
}
/// Compares two files on disk and produces a full similarity report.
///
/// The language used for token normalization is chosen in this order of
/// precedence: the caller's explicit `options.language`, then the extension
/// of `path1`, then the extension of `path2`. Note that a single language is
/// used for both files even when their extensions differ.
///
/// # Errors
/// Fails when no language can be determined, either file cannot be read, or
/// token normalization fails.
pub fn compute_similarity(
    path1: &Path,
    path2: &Path,
    options: &SimilarityOptions,
) -> anyhow::Result<SimilarityReport> {
    // BUG FIX: an explicit `options.language` was previously ignored whenever
    // either file extension was recognized; the caller's override now wins.
    let language = options
        .language
        .clone()
        .or_else(|| detect_language_from_path(path1))
        .or_else(|| detect_language_from_path(path2))
        .ok_or_else(|| anyhow::anyhow!("Could not detect language from file extension"))?;
    let source1 = std::fs::read_to_string(path1)
        .map_err(|e| anyhow::anyhow!("Failed to read {}: {}", path1.display(), e))?;
    let source2 = std::fs::read_to_string(path2)
        .map_err(|e| anyhow::anyhow!("Failed to read {}: {}", path2.display(), e))?;
    let normalization = NormalizationMode::All;
    let tokens1 = normalize_tokens(&source1, &language, normalization)?;
    let tokens2 = normalize_tokens(&source2, &language, normalization)?;
    let fragment1 =
        SimilarityFragment::new(path1.to_path_buf(), tokens1.len(), source1.lines().count());
    let fragment2 =
        SimilarityFragment::new(path2.to_path_buf(), tokens2.len(), source2.lines().count());
    let results = compute_similarity_from_tokens(&tokens1, &tokens2, options);
    let breakdown = compute_token_breakdown(&tokens1, &tokens2);
    let config = SimilarityConfig {
        metric: options.metric,
        // Normalize a nonsensical n-gram size of 0 to 1 for reporting.
        ngram_size: options.ngram_size.max(1),
        language: Some(language),
    };
    Ok(SimilarityReport::new(
        fragment1,
        fragment2,
        SimilarityScores::from(results),
        breakdown,
        config,
    ))
}
/// Maps a file extension to a normalization-language name.
///
/// Matching is case-insensitive, so `foo.RS` and `foo.rs` both detect as
/// "rust" (previously only lowercase extensions were recognized). Returns
/// `None` for unknown or missing extensions.
fn detect_language_from_path(path: &Path) -> Option<String> {
    let ext = path.extension()?.to_str()?.to_ascii_lowercase();
    let lang = match ext.as_str() {
        "py" => "python",
        "ts" | "tsx" => "typescript",
        "js" | "jsx" => "javascript",
        "go" => "go",
        "rs" => "rust",
        "java" => "java",
        _ => return None,
    };
    Some(lang.to_string())
}
/// Placeholder for all-pairs similarity across files under a path.
///
/// # Errors
/// Always returns an error: the feature is not implemented yet. Returning
/// `Err` instead of the previous `todo!()` panic lets callers of this
/// `Result`-returning API handle the condition gracefully.
pub fn compute_pairwise_similarity(
    _path: &Path,
    _options: &SimilarityOptions,
) -> anyhow::Result<PairwiseSimilarityReport> {
    anyhow::bail!("Pairwise similarity computation not yet implemented")
}
/// Parses a comparison target of the form `file`, `file::function`, or
/// `file:start:end` (line range).
///
/// Anything that matches neither the function form nor a numeric line-range
/// form is treated as a plain file path.
///
/// # Errors
/// Fails when a line range has `start > end`.
pub fn parse_target(target: &str) -> anyhow::Result<ParsedTarget> {
    // The `file::function` form takes precedence over the colon-range form.
    if let Some((file, function)) = target.split_once("::") {
        return Ok(ParsedTarget::with_function(
            PathBuf::from(file),
            function.to_string(),
        ));
    }
    // `file:start:end` — split from the right so colons in the path survive;
    // rsplitn yields the pieces in reverse order (end, start, file).
    let parts: Vec<&str> = target.rsplitn(3, ':').collect();
    if let [end_str, start_str, file] = parts.as_slice() {
        if let (Ok(end), Ok(start)) = (end_str.parse::<usize>(), start_str.parse::<usize>()) {
            if start > end {
                anyhow::bail!("Invalid line range: start ({}) > end ({})", start, end);
            }
            return Ok(ParsedTarget::with_line_range(
                PathBuf::from(*file),
                start,
                end,
            ));
        }
    }
    Ok(ParsedTarget::file_only(PathBuf::from(target)))
}
/// Computes Dice/Jaccard (and optionally cosine) similarity between two
/// normalized token streams.
///
/// Degenerate cases: two empty streams score 1.0 (identical); exactly one
/// empty stream scores 0.0. Cosine is only computed when the selected metric
/// is `Cosine` or `All`.
pub fn compute_similarity_from_tokens(
    tokens1: &[NormalizedToken],
    tokens2: &[NormalizedToken],
    options: &SimilarityOptions,
) -> SimilarityResults {
    let wants_cosine = matches!(
        options.metric,
        SimilarityMetric::Cosine | SimilarityMetric::All
    );
    // Collapses the two previously duplicated empty-input branches.
    let degenerate = |score: f64| SimilarityResults {
        dice: score,
        jaccard: score,
        cosine: wants_cosine.then_some(score),
        interpretation: interpret_similarity_score(score),
    };
    if tokens1.is_empty() && tokens2.is_empty() {
        return degenerate(1.0);
    }
    if tokens1.is_empty() || tokens2.is_empty() {
        return degenerate(0.0);
    }
    let bag1 = build_token_bag(tokens1);
    let bag2 = build_token_bag(tokens2);
    let dice = compute_dice(&bag1, &bag2);
    // Build the result directly; the previous round-trip through
    // `SimilarityScores` only copied the same fields straight back out.
    SimilarityResults {
        dice,
        jaccard: compute_jaccard(&bag1, &bag2),
        cosine: wants_cosine.then(|| compute_cosine_tf(&bag1, &bag2)),
        interpretation: interpret_similarity_score(dice),
    }
}
/// Builds a multiset (token value -> occurrence count) from a token slice.
fn build_token_bag(tokens: &[NormalizedToken]) -> HashMap<&str, usize> {
    tokens.iter().fold(HashMap::new(), |mut bag, token| {
        *bag.entry(token.value.as_str()).or_default() += 1;
        bag
    })
}
/// Multiset Dice similarity over two token-frequency bags.
///
/// Two empty bags are defined as identical (1.0); exactly one empty bag
/// yields 0.0.
fn compute_dice(bag1: &HashMap<&str, usize>, bag2: &HashMap<&str, usize>) -> f64 {
    let size1: usize = bag1.values().sum();
    let size2: usize = bag2.values().sum();
    match (size1, size2) {
        (0, 0) => return 1.0,
        (0, _) | (_, 0) => return 0.0,
        _ => {}
    }
    // Multiset intersection: per-token minimum of the two counts.
    let intersection: usize = bag1
        .iter()
        .filter_map(|(token, &c1)| bag2.get(token).map(|&c2| c1.min(c2)))
        .sum();
    (2.0 * intersection as f64) / (size1 + size2) as f64
}
/// Multiset Jaccard similarity over two token-frequency bags.
///
/// Two empty bags are defined as identical (1.0); exactly one empty bag
/// yields 0.0.
fn compute_jaccard(bag1: &HashMap<&str, usize>, bag2: &HashMap<&str, usize>) -> f64 {
    let size1: usize = bag1.values().sum();
    let size2: usize = bag2.values().sum();
    match (size1, size2) {
        (0, 0) => return 1.0,
        (0, _) | (_, 0) => return 0.0,
        _ => {}
    }
    // Multiset intersection: per-token minimum of the two counts.
    let intersection: usize = bag1
        .iter()
        .filter_map(|(token, &c1)| bag2.get(token).map(|&c2| c1.min(c2)))
        .sum();
    let union = size1 + size2 - intersection;
    // Defensive guard; for valid bags the union is only zero when both bags
    // are empty, which was handled above.
    if union == 0 {
        return 1.0;
    }
    intersection as f64 / union as f64
}
/// Cosine similarity over raw term-frequency vectors.
///
/// Two empty bags are defined as identical (1.0); exactly one empty bag
/// yields 0.0.
fn compute_cosine_tf(bag1: &HashMap<&str, usize>, bag2: &HashMap<&str, usize>) -> f64 {
    if bag1.is_empty() && bag2.is_empty() {
        return 1.0;
    }
    if bag1.is_empty() || bag2.is_empty() {
        return 0.0;
    }
    // Squared L2 norm of a frequency vector, accumulated in f64.
    fn norm_sq(bag: &HashMap<&str, usize>) -> f64 {
        bag.values().map(|&c| (c as f64) * (c as f64)).sum()
    }
    // BUG FIX: multiply in f64 rather than usize — `count1 * count2`
    // previously overflowed usize for very large token counts (panic in
    // debug builds, wraparound in release).
    let dot_product: f64 = bag1
        .iter()
        .filter_map(|(token, &c1)| bag2.get(token).map(|&c2| c1 as f64 * c2 as f64))
        .sum();
    let norm1 = norm_sq(bag1);
    let norm2 = norm_sq(bag2);
    if norm1 == 0.0 || norm2 == 0.0 {
        // Unreachable for bags built from real tokens (counts >= 1); kept as
        // a defensive guard against zero-count entries.
        return 0.0;
    }
    dot_product / (norm1.sqrt() * norm2.sqrt())
}
/// Splits two token streams into shared / unique-per-side multiset counts.
pub fn compute_token_breakdown(
    tokens1: &[NormalizedToken],
    tokens2: &[NormalizedToken],
) -> TokenBreakdown {
    let bag1 = build_token_bag(tokens1);
    let bag2 = build_token_bag(tokens2);
    // Multiset intersection: per-token minimum of the two counts.
    let shared: usize = bag1
        .iter()
        .filter_map(|(token, &c1)| bag2.get(token).map(|&c2| c1.min(c2)))
        .sum();
    let size1: usize = bag1.values().sum();
    let size2: usize = bag2.values().sum();
    TokenBreakdown::new(
        shared,
        size1.saturating_sub(shared),
        size2.saturating_sub(shared),
    )
}
/// Internal, non-serialized result of a token-level similarity computation.
#[derive(Debug, Clone)]
pub struct SimilarityResults {
    pub dice: f64,
    pub jaccard: f64,
    /// Present only when the cosine metric was requested.
    pub cosine: Option<f64>,
    /// Text bucket derived from the dice score.
    pub interpretation: String,
}
impl From<SimilarityResults> for SimilarityScores {
    /// Converts internal computation results into the serializable score type.
    fn from(results: SimilarityResults) -> Self {
        let base = SimilarityScores::new(results.dice, results.jaccard);
        match results.cosine {
            Some(cosine) => base.with_cosine(cosine),
            None => base,
        }
    }
}
#[cfg(test)]
mod similarity_unit_tests {
    //! Unit tests for the token-level similarity metrics and free-standing
    //! coefficient helpers.
    use super::*;
    use crate::analysis::clones::{NormalizedToken, TokenCategory};
    // Builds one token whose normalized and original text are both `value`.
    fn make_token(value: &str) -> NormalizedToken {
        NormalizedToken {
            value: value.to_string(),
            original: value.to_string(),
            category: TokenCategory::Other,
        }
    }
    // Builds a token stream from plain string values.
    fn make_tokens(values: &[&str]) -> Vec<NormalizedToken> {
        values.iter().map(|v| make_token(v)).collect()
    }
    // --- Dice via compute_similarity_from_tokens ---
    #[test]
    fn test_dice_identical_tokens() {
        let tokens = make_tokens(&["a", "b", "c"]);
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens, &tokens, &options);
        assert!(
            (result.dice - 1.0).abs() < 0.001,
            "Identical tokens should have dice=1.0, got {}",
            result.dice
        );
    }
    #[test]
    fn test_dice_completely_different() {
        let tokens1 = make_tokens(&["a", "b", "c"]);
        let tokens2 = make_tokens(&["x", "y", "z"]);
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        assert!(
            (result.dice - 0.0).abs() < 0.001,
            "Disjoint tokens should have dice=0.0, got {}",
            result.dice
        );
    }
    #[test]
    fn test_dice_partial_overlap() {
        let tokens1 = make_tokens(&["a", "b", "c"]);
        let tokens2 = make_tokens(&["b", "c", "d"]);
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        // 2 shared tokens of 3 + 3 total: dice = 2*2 / 6.
        let expected = 4.0 / 6.0;
        assert!(
            (result.dice - expected).abs() < 0.001,
            "Expected dice={:.3}, got {:.3}",
            expected,
            result.dice
        );
    }
    #[test]
    fn test_dice_with_duplicates() {
        // Multiset semantics: the per-token minimum of the counts is shared.
        let tokens1 = make_tokens(&["a", "a", "b"]);
        let tokens2 = make_tokens(&["a", "b", "b"]);
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        let expected = 4.0 / 6.0;
        assert!(
            (result.dice - expected).abs() < 0.001,
            "Expected dice={:.3}, got {:.3}",
            expected,
            result.dice
        );
    }
    #[test]
    fn test_dice_symmetry() {
        let tokens1 = make_tokens(&["a", "b", "c"]);
        let tokens2 = make_tokens(&["b", "c", "d"]);
        let options = SimilarityOptions::default();
        let result_ab = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        let result_ba = compute_similarity_from_tokens(&tokens2, &tokens1, &options);
        assert!(
            (result_ab.dice - result_ba.dice).abs() < 0.001,
            "Dice should be symmetric"
        );
    }
    #[test]
    fn test_dice_both_empty() {
        // Two empty streams are defined as identical.
        let tokens1: Vec<NormalizedToken> = vec![];
        let tokens2: Vec<NormalizedToken> = vec![];
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        assert!(
            (result.dice - 1.0).abs() < 0.001,
            "Both empty should have dice=1.0, got {}",
            result.dice
        );
    }
    #[test]
    fn test_dice_one_empty() {
        let tokens1 = make_tokens(&["a", "b", "c"]);
        let tokens2: Vec<NormalizedToken> = vec![];
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        assert!(
            (result.dice - 0.0).abs() < 0.001,
            "One empty should have dice=0.0, got {}",
            result.dice
        );
    }
    // --- Jaccard via compute_similarity_from_tokens ---
    #[test]
    fn test_jaccard_identical_tokens() {
        let tokens = make_tokens(&["a", "b", "c"]);
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens, &tokens, &options);
        assert!(
            (result.jaccard - 1.0).abs() < 0.001,
            "Identical tokens should have jaccard=1.0, got {}",
            result.jaccard
        );
    }
    #[test]
    fn test_jaccard_completely_different() {
        let tokens1 = make_tokens(&["a", "b", "c"]);
        let tokens2 = make_tokens(&["x", "y", "z"]);
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        assert!(
            (result.jaccard - 0.0).abs() < 0.001,
            "Disjoint tokens should have jaccard=0.0, got {}",
            result.jaccard
        );
    }
    #[test]
    fn test_jaccard_partial_overlap() {
        let tokens1 = make_tokens(&["a", "b", "c"]);
        let tokens2 = make_tokens(&["b", "c", "d"]);
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        // Intersection 2, union 4.
        let expected = 0.5;
        assert!(
            (result.jaccard - expected).abs() < 0.001,
            "Expected jaccard={:.3}, got {:.3}",
            expected,
            result.jaccard
        );
    }
    #[test]
    fn test_jaccard_less_than_or_equal_dice() {
        // Property of the two metrics: J = D / (2 - D) <= D for D in [0, 1].
        let tokens1 = make_tokens(&["a", "b", "c", "d"]);
        let tokens2 = make_tokens(&["b", "c", "d", "e"]);
        let options = SimilarityOptions::default();
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        assert!(
            result.jaccard <= result.dice + 0.001,
            "Jaccard ({}) should be <= Dice ({})",
            result.jaccard,
            result.dice
        );
    }
    // --- Cosine (only computed for metric Cosine or All) ---
    #[test]
    fn test_cosine_identical_tokens() {
        let tokens = make_tokens(&["a", "b", "c"]);
        let options = SimilarityOptions {
            metric: SimilarityMetric::All,
            ..Default::default()
        };
        let result = compute_similarity_from_tokens(&tokens, &tokens, &options);
        assert!(result.cosine.is_some(), "Cosine should be computed");
        assert!(
            (result.cosine.unwrap() - 1.0).abs() < 0.001,
            "Identical tokens should have cosine=1.0"
        );
    }
    #[test]
    fn test_cosine_completely_different() {
        let tokens1 = make_tokens(&["a", "b", "c"]);
        let tokens2 = make_tokens(&["x", "y", "z"]);
        let options = SimilarityOptions {
            metric: SimilarityMetric::All,
            ..Default::default()
        };
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        assert!(result.cosine.is_some(), "Cosine should be computed");
        assert!(
            (result.cosine.unwrap() - 0.0).abs() < 0.001,
            "Disjoint tokens should have cosine=0.0"
        );
    }
    #[test]
    fn test_cosine_with_duplicates() {
        let tokens1 = make_tokens(&["a", "a", "b"]);
        let tokens2 = make_tokens(&["a", "b", "b"]);
        let options = SimilarityOptions {
            metric: SimilarityMetric::All,
            ..Default::default()
        };
        let result = compute_similarity_from_tokens(&tokens1, &tokens2, &options);
        // dot = 2*1 + 1*2 = 4; both norms are sqrt(5) => cosine = 4/5.
        let expected = 4.0 / 5.0;
        assert!(result.cosine.is_some(), "Cosine should be computed");
        assert!(
            (result.cosine.unwrap() - expected).abs() < 0.001,
            "Expected cosine={:.3}, got {:.3}",
            expected,
            result.cosine.unwrap()
        );
    }
    // --- Token breakdown ---
    #[test]
    fn test_token_breakdown_identical() {
        let tokens = make_tokens(&["a", "b", "c"]);
        let breakdown = compute_token_breakdown(&tokens, &tokens);
        assert_eq!(breakdown.shared_tokens, 3, "All tokens should be shared");
        assert_eq!(
            breakdown.unique_to_fragment1, 0,
            "No unique tokens in fragment1"
        );
        assert_eq!(
            breakdown.unique_to_fragment2, 0,
            "No unique tokens in fragment2"
        );
    }
    #[test]
    fn test_token_breakdown_disjoint() {
        let tokens1 = make_tokens(&["a", "b"]);
        let tokens2 = make_tokens(&["x", "y", "z"]);
        let breakdown = compute_token_breakdown(&tokens1, &tokens2);
        assert_eq!(breakdown.shared_tokens, 0, "No shared tokens");
        assert_eq!(
            breakdown.unique_to_fragment1, 2,
            "All tokens unique to fragment1"
        );
        assert_eq!(
            breakdown.unique_to_fragment2, 3,
            "All tokens unique to fragment2"
        );
    }
    #[test]
    fn test_token_breakdown_partial_overlap() {
        let tokens1 = make_tokens(&["a", "b", "c"]);
        let tokens2 = make_tokens(&["b", "c", "d"]);
        let breakdown = compute_token_breakdown(&tokens1, &tokens2);
        assert_eq!(breakdown.shared_tokens, 2, "Two shared tokens");
        assert_eq!(breakdown.unique_to_fragment1, 1, "One unique to fragment1");
        assert_eq!(breakdown.unique_to_fragment2, 1, "One unique to fragment2");
    }
    // --- Free-standing coefficient helpers ---
    #[test]
    fn test_dice_coefficient_helper() {
        let dice = dice_coefficient(2, 3, 3);
        assert!((dice - 0.667).abs() < 0.001);
    }
    #[test]
    fn test_jaccard_coefficient_helper() {
        let jaccard = jaccard_coefficient(2, 3, 3);
        assert!((jaccard - 0.5).abs() < 0.001);
    }
    #[test]
    fn test_dice_jaccard_conversion() {
        // Round-trip between the two coefficient spaces.
        let dice = 0.8;
        let jaccard = dice_to_jaccard(dice);
        let expected = 0.8 / 1.2;
        assert!((jaccard - expected).abs() < 0.001);
        let dice_back = jaccard_to_dice(jaccard);
        assert!((dice_back - dice).abs() < 0.001);
    }
    #[test]
    fn test_dice_coefficient_both_empty() {
        let dice = dice_coefficient(0, 0, 0);
        assert!(
            (dice - 1.0).abs() < 0.001,
            "Both empty should give dice=1.0"
        );
    }
    #[test]
    fn test_jaccard_coefficient_both_empty() {
        let jaccard = jaccard_coefficient(0, 0, 0);
        assert!(
            (jaccard - 1.0).abs() < 0.001,
            "Both empty should give jaccard=1.0"
        );
    }
}