pub fn levenshtein_distance(s1: &str, s2: &str) -> usize {
let len1 = s1.chars().count();
let len2 = s2.chars().count();
if len1 == 0 {
return len2;
}
if len2 == 0 {
return len1;
}
let chars1: Vec<char> = s1.chars().collect();
let chars2: Vec<char> = s2.chars().collect();
let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
for (i, row) in matrix.iter_mut().enumerate().take(len1 + 1) {
row[0] = i;
}
for (j, cell) in matrix[0].iter_mut().enumerate().take(len2 + 1) {
*cell = j;
}
for i in 1..=len1 {
for j in 1..=len2 {
let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };
matrix[i][j] = std::cmp::min(
std::cmp::min(
matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, ),
matrix[i - 1][j - 1] + cost, );
}
}
matrix[len1][len2]
}
pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
let distance = levenshtein_distance(s1, s2);
let max_len = s1.chars().count().max(s2.chars().count());
if max_len == 0 {
return 1.0;
}
1.0 - (distance as f64 / max_len as f64)
}
pub fn word_levenshtein_distance(s1: &str, s2: &str) -> usize {
let has_japanese = |text: &str| {
text.chars().any(|c| {
('\u{3040}'..='\u{309F}').contains(&c) ||
('\u{30A0}'..='\u{30FF}').contains(&c) ||
('\u{4E00}'..='\u{9FAF}').contains(&c)
})
};
if has_japanese(s1) || has_japanese(s2) {
return levenshtein_distance(s1, s2);
}
let words1: Vec<&str> = s1.split_whitespace().collect();
let words2: Vec<&str> = s2.split_whitespace().collect();
let len1 = words1.len();
let len2 = words2.len();
if len1 == 0 {
return len2;
}
if len2 == 0 {
return len1;
}
let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
for (i, row) in matrix.iter_mut().enumerate().take(len1 + 1) {
row[0] = i;
}
for (j, cell) in matrix[0].iter_mut().enumerate().take(len2 + 1) {
*cell = j;
}
for i in 1..=len1 {
for j in 1..=len2 {
let cost = if words1[i - 1] == words2[j - 1] { 0 } else { 1 };
matrix[i][j] = std::cmp::min(
std::cmp::min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1),
matrix[i - 1][j - 1] + cost,
);
}
}
matrix[len1][len2]
}
pub fn word_levenshtein_similarity(s1: &str, s2: &str) -> f64 {
let distance = word_levenshtein_distance(s1, s2);
let has_japanese = |text: &str| {
text.chars().any(|c| {
('\u{3040}'..='\u{309F}').contains(&c) ||
('\u{30A0}'..='\u{30FF}').contains(&c) ||
('\u{4E00}'..='\u{9FAF}').contains(&c)
})
};
let max_len = if has_japanese(s1) || has_japanese(s2) {
s1.chars().count().max(s2.chars().count())
} else {
let words1_count = s1.split_whitespace().count();
let words2_count = s2.split_whitespace().count();
words1_count.max(words2_count)
};
if max_len == 0 {
return 1.0;
}
1.0 - (distance as f64 / max_len as f64)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_levenshtein_distance() {
assert_eq!(levenshtein_distance("", ""), 0);
assert_eq!(levenshtein_distance("abc", ""), 3);
assert_eq!(levenshtein_distance("", "abc"), 3);
assert_eq!(levenshtein_distance("abc", "abc"), 0);
assert_eq!(levenshtein_distance("abc", "ab"), 1);
assert_eq!(levenshtein_distance("abc", "abcd"), 1);
assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
}
#[test]
fn test_levenshtein_similarity() {
assert_eq!(levenshtein_similarity("abc", "abc"), 1.0);
assert_eq!(levenshtein_similarity("", ""), 1.0);
assert!((levenshtein_similarity("abc", "ab") - 0.6666666666666667).abs() < 0.0001);
}
#[test]
fn test_word_levenshtein_distance() {
assert_eq!(word_levenshtein_distance("hello world", "hello world"), 0);
assert_eq!(word_levenshtein_distance("hello world", "hello"), 1);
assert_eq!(word_levenshtein_distance("hello world", "world hello"), 2);
}
#[test]
fn test_word_levenshtein_similarity() {
assert_eq!(word_levenshtein_similarity("hello world", "hello world"), 1.0);
assert_eq!(word_levenshtein_similarity("hello world", "hello"), 0.5);
}
}