use std::collections::HashSet;
/// Computes the Jaccard similarity between the token sets of two names.
///
/// Each name is split on `-`, `_`, `.`, `@` and `/`. Empty fragments are
/// dropped and repeated tokens collapse because tokens are stored in a set,
/// so the comparison ignores token order and multiplicity. Tokens are
/// compared exactly (case-sensitively).
///
/// # Returns
///
/// * `1.0` when neither name yields any token,
/// * `0.0` when exactly one of the names yields no token,
/// * otherwise the number of shared tokens divided by the number of
///   distinct tokens across both names.
#[must_use]
pub fn compute_token_similarity(name_a: &str, name_b: &str) -> f64 {
    /// Splits `name` into its set of non-empty tokens.
    fn tokenize(name: &str) -> HashSet<&str> {
        name.split(['-', '_', '.', '@', '/'])
            .filter(|token| !token.is_empty())
            .collect()
    }

    let left = tokenize(name_a);
    let right = tokenize(name_b);

    match (left.is_empty(), right.is_empty()) {
        (true, true) => 1.0,
        (true, false) | (false, true) => 0.0,
        (false, false) => {
            let shared = left.intersection(&right).count();
            // Size of the union by inclusion-exclusion; it is at least 1 here
            // because both sets are non-empty, so the division is safe.
            let combined = left.len() + right.len() - shared;
            shared as f64 / combined as f64
        }
    }
}
/// Scores how closely two optional version strings agree.
///
/// # Returns
///
/// * `0.10` when both versions are present and the strings are identical,
/// * `0.07` when both parse to the same major and the same minor number,
/// * `0.04` when only the major number is known to agree,
/// * `0.0` otherwise, including when either version is `None` or a major
///   number cannot be parsed.
///
/// Versions are split on `.`. The first component must parse as a `u32` as-is.
/// The second component is cut at the first `-` before parsing, so `1.2-beta`
/// has minor `2`.
#[must_use]
pub fn compute_version_similarity(va: Option<&String>, vb: Option<&String>) -> f64 {
    /// Extracts the `(major, minor)` numbers from a dotted version string.
    /// Each part is `None` when it is missing or not a valid `u32`.
    fn major_minor(version: &str) -> (Option<u32>, Option<u32>) {
        let mut components = version.split('.');
        let major = components.next().and_then(|part| part.parse::<u32>().ok());
        let minor = components
            .next()
            .and_then(|part| part.split('-').next())
            .and_then(|part| part.parse::<u32>().ok());
        (major, minor)
    }

    let (a, b) = match (va, vb) {
        (Some(a), Some(b)) => (a, b),
        _ => return 0.0,
    };

    // Identical strings score highest, even if they are not parseable.
    if a == b {
        return 0.10;
    }

    let (major_a, minor_a) = major_minor(a);
    let (major_b, minor_b) = major_minor(b);

    // Without two parseable, equal major numbers there is nothing to credit.
    if major_a.is_none() || major_a != major_b {
        return 0.0;
    }

    if minor_a.is_some() && minor_a == minor_b {
        0.07
    } else {
        0.04
    }
}
/// Encodes `name` as a four-character Soundex-style code.
///
/// The input is uppercased and everything except ASCII letters is discarded.
/// The first remaining letter is kept verbatim, followed by up to three digits
/// from [`soundex_digit`], padded with `'0'` to a length of four.
///
/// A digit is skipped when it equals the previously seen non-zero digit.
/// Letters that map to `'0'` are ignored entirely and do not reset that
/// memory, so equal-coded consonants separated only by such letters collapse
/// into one digit (for example `"Tymczak"` encodes as `"T520"`).
///
/// Returns an empty string when `name` contains no ASCII letter after
/// uppercasing.
pub fn soundex(name: &str) -> String {
    // Uppercase character by character, then keep ASCII letters only.
    let mut letters = name
        .chars()
        .flat_map(char::to_uppercase)
        .filter(char::is_ascii_alphabetic);

    let initial = match letters.next() {
        Some(letter) => letter,
        None => return String::new(),
    };

    let mut code = String::with_capacity(4);
    code.push(initial);

    // The leading letter's own digit suppresses an immediately repeated code.
    let mut previous = soundex_digit(initial);
    for letter in letters {
        let digit = soundex_digit(letter);
        if digit == '0' {
            // Uncoded letter: contributes nothing and leaves `previous` intact.
            continue;
        }
        if digit != previous {
            code.push(digit);
            if code.len() == 4 {
                break;
            }
        }
        previous = digit;
    }

    // Right-pad short codes with zeros.
    for _ in code.len()..4 {
        code.push('0');
    }
    code
}
/// Maps an uppercase ASCII letter to its Soundex digit.
///
/// Returns a character from `'1'` to `'6'` for the coded consonants and `'0'`
/// for every other input, including vowels, `H`, `W`, `Y`, lowercase letters
/// and non-letters. Callers are expected to uppercase the input first.
#[must_use]
pub const fn soundex_digit(c: char) -> char {
    if matches!(c, 'B' | 'F' | 'P' | 'V') {
        '1'
    } else if matches!(c, 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z') {
        '2'
    } else if matches!(c, 'D' | 'T') {
        '3'
    } else if matches!(c, 'L') {
        '4'
    } else if matches!(c, 'M' | 'N') {
        '5'
    } else if matches!(c, 'R') {
        '6'
    } else {
        // Not a coded consonant.
        '0'
    }
}
#[must_use]
pub fn compute_phonetic_similarity(name_a: &str, name_b: &str) -> f64 {
let soundex_a = soundex(name_a);
let soundex_b = soundex(name_b);
if !soundex_a.is_empty() && soundex_a == soundex_b {
return 1.0;
}
let tokens_a: Vec<&str> = name_a
.split(|c: char| !c.is_alphanumeric())
.filter(|t| !t.is_empty())
.collect();
let tokens_b: Vec<&str> = name_b
.split(|c: char| !c.is_alphanumeric())
.filter(|t| !t.is_empty())
.collect();
if tokens_a.is_empty() || tokens_b.is_empty() {
return 0.0;
}
let mut matches = 0;
let total = tokens_a.len().max(tokens_b.len());
for ta in &tokens_a {
let sa = soundex(ta);
if sa.is_empty() {
continue;
}
for tb in &tokens_b {
let sb = soundex(tb);
if sa == sb {
matches += 1;
break;
}
}
}
if total == 0 {
0.0
} else {
f64::from(matches) / total as f64
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned version string, since the API takes `Option<&String>`.
    fn version(text: &str) -> String {
        String::from(text)
    }

    // ---- compute_token_similarity ----

    /// Identical names share every token.
    #[test]
    fn test_token_similarity_identical() {
        let score = compute_token_similarity("react-dom", "react-dom");
        assert_eq!(score, 1.0);
    }

    /// Token order does not affect the score.
    #[test]
    fn test_token_similarity_reordered() {
        let score = compute_token_similarity("react-dom", "dom-react");
        assert_eq!(score, 1.0);
    }

    /// Sharing only some tokens gives a score strictly between 0 and 1.
    #[test]
    fn test_token_similarity_partial() {
        let score = compute_token_similarity("react-dom", "react-native");
        assert!(score > 0.0);
        assert!(score < 1.0);
    }

    /// Two empty names are identical; one empty name matches nothing.
    #[test]
    fn test_token_similarity_empty() {
        let both_empty = compute_token_similarity("", "");
        let one_empty = compute_token_similarity("foo", "");
        assert_eq!(both_empty, 1.0);
        assert_eq!(one_empty, 0.0);
    }

    // ---- soundex ----

    /// Two known reference encodings.
    #[test]
    fn test_soundex_basic() {
        let expectations = [("Robert", "R163"), ("Smith", "S530")];
        for (input, expected) in expectations {
            assert_eq!(soundex(input), expected);
        }
    }

    /// Inputs without any ASCII letter produce an empty code.
    #[test]
    fn test_soundex_empty() {
        for input in ["", "123"] {
            assert_eq!(soundex(input), "");
        }
    }

    // ---- compute_phonetic_similarity ----

    /// Names with equal Soundex codes are a perfect match.
    #[test]
    fn test_phonetic_similarity_match() {
        let score = compute_phonetic_similarity("smith", "smyth");
        assert_eq!(score, 1.0);
    }

    /// Names with different Soundex codes do not match at all.
    #[test]
    fn test_phonetic_similarity_no_match() {
        let score = compute_phonetic_similarity("react", "angular");
        assert_eq!(score, 0.0);
    }

    // ---- compute_version_similarity ----

    /// Identical version strings get the highest score.
    #[test]
    fn test_version_similarity_exact() {
        let left = version("1.2.3");
        let right = version("1.2.3");
        let score = compute_version_similarity(Some(&left), Some(&right));
        assert_eq!(score, 0.10);
    }

    /// Same major and minor, different patch.
    #[test]
    fn test_version_similarity_same_major_minor() {
        let left = version("1.2.3");
        let right = version("1.2.5");
        let score = compute_version_similarity(Some(&left), Some(&right));
        assert_eq!(score, 0.07);
    }

    /// Same major, different minor.
    #[test]
    fn test_version_similarity_same_major() {
        let left = version("1.2.3");
        let right = version("1.5.0");
        let score = compute_version_similarity(Some(&left), Some(&right));
        assert_eq!(score, 0.04);
    }

    /// Missing versions contribute nothing.
    #[test]
    fn test_version_similarity_none() {
        let score = compute_version_similarity(None, None);
        assert_eq!(score, 0.0);
    }
}