use std::cmp::min;
/// Levenshtein edit distance between two UTF-8 strings, counted in
/// Unicode scalar values (`char`s), not bytes.
///
/// Convenience wrapper around [`levenshtein_chars`].
#[must_use]
pub fn levenshtein(a: &str, b: &str) -> usize {
let left: Vec<char> = a.chars().collect();
let right: Vec<char> = b.chars().collect();
levenshtein_chars(&left, &right)
}
/// Levenshtein edit distance between two `char` slices.
///
/// Two-row dynamic-programming formulation: O(m·n) time and
/// O(min(m, n)) space — the shorter slice is used as the row
/// dimension so the buffers stay small.
#[must_use]
pub fn levenshtein_chars(a: &[char], b: &[char]) -> usize {
let m = a.len();
let n = b.len();
// Distance to or from the empty string is the other string's length.
if m == 0 {
return n;
}
if n == 0 {
return m;
}
// Swap so the row buffers are sized by the shorter slice.
let (a, b, m, n) = if m > n { (b, a, n, m) } else { (a, b, m, n) };
// prev_row[i] = distance from a[..i] to the current prefix of b.
let mut prev_row: Vec<usize> = (0..=m).collect();
let mut curr_row: Vec<usize> = vec![0; m + 1];
for j in 1..=n {
curr_row[0] = j;
for i in 1..=m {
// 0 on a character match, 1 for a substitution.
let cost = usize::from(a[i - 1] != b[j - 1]);
curr_row[i] = min(
min(prev_row[i] + 1, curr_row[i - 1] + 1),
prev_row[i - 1] + cost,
);
}
std::mem::swap(&mut prev_row, &mut curr_row);
}
prev_row[m]
}
/// Per-operation costs for [`weighted_levenshtein`].
#[derive(Debug, Clone, Copy)]
pub struct EditWeights {
/// Cost of inserting one character (a character of the target `b`).
pub insert: f64,
/// Cost of deleting one character (a character of the source `a`).
pub delete: f64,
/// Cost of replacing one character of `a` with one of `b`.
pub substitute: f64,
}
impl Default for EditWeights {
fn default() -> Self {
Self {
insert: 1.0,
delete: 1.0,
substitute: 1.0,
}
}
}
/// Edit distance with separately weighted insert/delete/substitute
/// costs, counted over `char`s.
///
/// The distance is directional: it is the cheapest way to turn `a`
/// into `b`, where inserting a character of `b` costs
/// `weights.insert`, deleting a character of `a` costs
/// `weights.delete`, and a mismatched pair costs `weights.substitute`
/// (matching characters are free). O(m·n) time, O(m) space.
#[must_use]
pub fn weighted_levenshtein(a: &str, b: &str, weights: EditWeights) -> f64 {
let source: Vec<char> = a.chars().collect();
let target: Vec<char> = b.chars().collect();
let m = source.len();
let n = target.len();
// Empty source: build b from scratch with inserts only.
if m == 0 {
return n as f64 * weights.insert;
}
// Empty target: erase a entirely with deletes only.
if n == 0 {
return m as f64 * weights.delete;
}
// prev[i] = cost of turning a[..i] into the previous prefix of b.
let mut prev: Vec<f64> = (0..=m).map(|i| i as f64 * weights.delete).collect();
let mut curr: Vec<f64> = vec![0.0; m + 1];
for j in 1..=n {
curr[0] = j as f64 * weights.insert;
for i in 1..=m {
let diag = if source[i - 1] == target[j - 1] {
prev[i - 1]
} else {
prev[i - 1] + weights.substitute
};
let via_insert = prev[i] + weights.insert;
let via_delete = curr[i - 1] + weights.delete;
curr[i] = diag.min(via_insert).min(via_delete);
}
std::mem::swap(&mut prev, &mut curr);
}
prev[m]
}
/// Length-normalized edit distance in `[0, 1]`, computed as
/// `2·ed / (|a| + |b| + ed)` with lengths counted in `char`s.
///
/// Returns 0.0 when both strings are empty. This normalization
/// (Yujian & Bo, 2007) keeps the result a proper metric, unlike
/// dividing by `max(|a|, |b|)`.
#[must_use]
pub fn normalized_edit_distance(a: &str, b: &str) -> f64 {
let a_len = a.chars().count();
let b_len = b.chars().count();
if a_len == 0 && b_len == 0 {
return 0.0;
}
let ed = levenshtein(a, b);
// At least one string is non-empty here, so the denominator is >= 1
// and the division is always well-defined.
(2 * ed) as f64 / (a_len + b_len + ed) as f64
}
/// Similarity score in `[0, 1]`: the complement of
/// [`normalized_edit_distance`] (1.0 means identical strings).
#[must_use]
#[inline]
pub fn edit_similarity(a: &str, b: &str) -> f64 {
let distance = normalized_edit_distance(a, b);
1.0 - distance
}
/// Edit distance where `pattern` may contain wildcards: `?` matches
/// exactly one character for free and `*` matches any run of
/// characters (including none) for free. All other characters behave
/// as in plain Levenshtein. `text` is taken literally.
#[must_use]
pub fn edit_distance_wildcards(pattern: &str, text: &str) -> usize {
let pat: Vec<char> = pattern.chars().collect();
let txt: Vec<char> = text.chars().collect();
edit_distance_wildcards_chars(&pat, &txt)
}
/// DP core for [`edit_distance_wildcards`]: full (m+1)×(n+1) table.
/// Every cell is assigned below, so a zero initializer suffices.
fn edit_distance_wildcards_chars(pattern: &[char], text: &[char]) -> usize {
let m = pattern.len();
let n = text.len();
let mut dp = vec![vec![0usize; n + 1]; m + 1];
// Empty pattern vs. text[..j]: j insertions.
for j in 0..=n {
dp[0][j] = j;
}
// pattern[..i] vs. empty text: '*' matches nothing for free; every
// other pattern character must be deleted.
for i in 1..=m {
dp[i][0] = dp[i - 1][0] + usize::from(pattern[i - 1] != '*');
}
for i in 1..=m {
let p = pattern[i - 1];
for j in 1..=n {
dp[i][j] = match p {
// '?' consumes exactly one text character at zero cost.
'?' => dp[i - 1][j - 1],
// '*' matches nothing (take the cell above) or absorbs one
// more text character (take the cell to the left), free.
'*' => min(dp[i - 1][j], dp[i][j - 1]),
// Literal character: standard Levenshtein recurrence.
_ => {
let cost = usize::from(p != text[j - 1]);
min(
min(dp[i - 1][j] + 1, dp[i][j - 1] + 1),
dp[i - 1][j - 1] + cost,
)
}
};
}
}
dp[m][n]
}
/// Length-normalized wildcard edit distance in `[0, 1]`, computed as
/// `2·ed / (|pattern| + |text| + ed)` with lengths counted in `char`s.
///
/// Returns 0.0 when both inputs are empty. Uses the same
/// normalization as [`normalized_edit_distance`], with
/// [`edit_distance_wildcards`] as the underlying distance.
#[must_use]
pub fn normalized_edit_distance_wildcards(pattern: &str, text: &str) -> f64 {
let p_len = pattern.chars().count();
let t_len = text.chars().count();
if p_len == 0 && t_len == 0 {
return 0.0;
}
let ed = edit_distance_wildcards(pattern, text);
// At least one input is non-empty here, so the denominator is >= 1
// and the division is always well-defined.
(2 * ed) as f64 / (p_len + t_len + ed) as f64
}
/// Wildcard similarity score in `[0, 1]`: the complement of
/// [`normalized_edit_distance_wildcards`] (1.0 means a perfect match).
#[must_use]
#[inline]
pub fn edit_similarity_wildcards(pattern: &str, text: &str) -> f64 {
let distance = normalized_edit_distance_wildcards(pattern, text);
1.0 - distance
}
/// Damerau-Levenshtein distance, restricted ("optimal string
/// alignment") variant: like Levenshtein, but a swap of two adjacent
/// characters counts as a single edit. Counted over `char`s.
///
/// NOTE(review): the recurrence edits each position at most once, so
/// this is the OSA variant, not the unrestricted Damerau distance.
/// O(m·n) time and space (full table, needed for the i-2/j-2 lookback).
#[must_use]
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let m = a_chars.len();
let n = b_chars.len();
if m == 0 {
return n;
}
if n == 0 {
return m;
}
let mut dp = vec![vec![0usize; n + 1]; m + 1];
// First column: delete all of a[..i].
for (i, row) in dp.iter_mut().enumerate() {
row[0] = i;
}
// First row: insert all of b[..j].
for (j, cell) in dp[0].iter_mut().enumerate() {
*cell = j;
}
for i in 1..=m {
for j in 1..=n {
let cost = usize::from(a_chars[i - 1] != b_chars[j - 1]);
// Standard Levenshtein step: delete, insert, or substitute.
let mut best = min(
min(dp[i - 1][j] + 1, dp[i][j - 1] + 1),
dp[i - 1][j - 1] + cost,
);
// Adjacent transposition: the last two chars of each prefix
// are each other's swap.
if i > 1
&& j > 1
&& a_chars[i - 1] == b_chars[j - 2]
&& a_chars[i - 2] == b_chars[j - 1]
{
best = min(best, dp[i - 2][j - 2] + cost);
}
dp[i][j] = best;
}
}
dp[m][n]
}
/// Index and Levenshtein distance of the candidate closest to `query`.
///
/// Ties go to the earliest candidate. Returns `None` only when
/// `candidates` is empty. Scanning stops early once an exact match
/// (distance 0) is found, since no later candidate can beat it — the
/// first-minimum tie-break is unaffected.
#[must_use]
pub fn find_closest(query: &str, candidates: &[&str]) -> Option<(usize, usize)> {
if candidates.is_empty() {
return None;
}
let mut best_idx = 0;
let mut best_dist = levenshtein(query, candidates[0]);
for (i, candidate) in candidates.iter().enumerate().skip(1) {
// Distance 0 is a global minimum; nothing later can be smaller.
if best_dist == 0 {
break;
}
let dist = levenshtein(query, candidate);
if dist < best_dist {
best_dist = dist;
best_idx = i;
}
}
Some((best_idx, best_dist))
}
/// All `(index, distance)` pairs whose Levenshtein distance from
/// `query` is at most `max_distance`, in candidate order.
#[must_use]
pub fn find_within_distance(
query: &str,
candidates: &[&str],
max_distance: usize,
) -> Vec<(usize, usize)> {
let mut hits = Vec::new();
for (i, candidate) in candidates.iter().enumerate() {
let dist = levenshtein(query, candidate);
if dist <= max_distance {
hits.push((i, dist));
}
}
hits
}
// Unit tests: exact-value checks for every public function, with wide
// Unicode coverage (CJK, Arabic, Cyrillic, diacritics, Devanagari,
// polytonic Greek, emoji). Distances are counted in `char`s.
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_levenshtein_identical() {
assert_eq!(levenshtein("hello", "hello"), 0);
assert_eq!(levenshtein("", ""), 0);
}
#[test]
fn test_levenshtein_empty() {
assert_eq!(levenshtein("", "abc"), 3);
assert_eq!(levenshtein("abc", ""), 3);
}
#[test]
fn test_levenshtein_classic() {
assert_eq!(levenshtein("kitten", "sitting"), 3);
assert_eq!(levenshtein("saturday", "sunday"), 3);
}
#[test]
fn test_levenshtein_single_char() {
assert_eq!(levenshtein("a", "b"), 1);
assert_eq!(levenshtein("a", "a"), 0);
assert_eq!(levenshtein("a", "ab"), 1);
}
// Each CJK ideograph is a single `char`, so it counts as one unit.
#[test]
fn test_levenshtein_cjk() {
assert_eq!(levenshtein("北京", "北平"), 1);
assert_eq!(levenshtein("東京", "東京都"), 1);
assert_eq!(levenshtein("서울", "서울시"), 1);
}
#[test]
fn test_levenshtein_arabic() {
assert_eq!(levenshtein("محمد", "أحمد"), 1);
assert_eq!(levenshtein("الرياض", "الرياض"), 0);
}
#[test]
fn test_levenshtein_cyrillic() {
assert_eq!(levenshtein("Москва", "Москве"), 1);
assert_eq!(levenshtein("Путин", "Путин"), 0);
}
// Precomposed accented chars differ from their ASCII base by one
// substitution; no Unicode normalization is applied anywhere.
#[test]
fn test_levenshtein_diacritics() {
assert_eq!(levenshtein("François", "Francois"), 1);
assert_eq!(levenshtein("José", "Jose"), 1);
assert_eq!(levenshtein("München", "Munchen"), 1);
}
// Devanagari uses combining signs, so only loose bounds are asserted.
#[test]
fn test_levenshtein_devanagari() {
assert!(levenshtein("नमस्ते", "नमस्कार") > 0);
assert_eq!(levenshtein("नमस्ते", "नमस्ते"), 0);
}
// Polytonic vs. monotonic Greek: each dropped diacritic is at most
// one substitution per affected char.
#[test]
fn test_levenshtein_classical_greek() {
let dist = levenshtein("ἐπιστήμη", "επιστημη");
assert!(dist > 0 && dist <= 4);
}
#[test]
fn test_levenshtein_mixed_script() {
let dist = levenshtein("Dr. 田中", "Dr. Tanaka");
assert!(dist > 0); }
#[test]
fn test_normalized_identical() {
assert!((normalized_edit_distance("hello", "hello") - 0.0).abs() < 0.001);
assert!((normalized_edit_distance("", "") - 0.0).abs() < 0.001);
}
// The normalized distance must stay within [0, 1] for any pair.
#[test]
fn test_normalized_bounds() {
let pairs = [
("a", "b"),
("hello", "world"),
("abc", "xyz"),
("", "test"),
("北京", "東京"),
];
for (a, b) in pairs {
let d = normalized_edit_distance(a, b);
assert!(
(0.0..=1.0).contains(&d),
"Distance {} out of bounds for ({}, {})",
d,
a,
b
);
}
}
#[test]
fn test_normalized_symmetry() {
let pairs = [
("Einstein", "Einstien"),
("hello", "hallo"),
("北京", "北平"),
];
for (a, b) in pairs {
let d1 = normalized_edit_distance(a, b);
let d2 = normalized_edit_distance(b, a);
assert!(
(d1 - d2).abs() < 0.001,
"Asymmetric: {} vs {} for ({}, {})",
d1,
d2,
a,
b
);
}
}
#[test]
fn test_edit_similarity() {
assert!((edit_similarity("hello", "hello") - 1.0).abs() < 0.001);
assert!(edit_similarity("Einstein", "Einstien") > 0.7);
assert!(edit_similarity("abc", "xyz") < 0.5);
}
// '?' matches exactly one character for free.
#[test]
fn test_wildcard_question_mark() {
assert_eq!(edit_distance_wildcards("R?ma", "Roma"), 0);
assert_eq!(edit_distance_wildcards("R?ma", "Rama"), 0);
assert_eq!(edit_distance_wildcards("R?ma", "Rima"), 0);
assert!(edit_distance_wildcards("R?ma", "Rooma") > 0);
}
// '*' matches any run of characters (including none) for free.
#[test]
fn test_wildcard_star() {
assert_eq!(edit_distance_wildcards("Ein*", "Einstein"), 0);
assert_eq!(edit_distance_wildcards("*stein", "Einstein"), 0);
assert_eq!(edit_distance_wildcards("*", "anything"), 0);
assert_eq!(edit_distance_wildcards("*", ""), 0);
assert_eq!(edit_distance_wildcards("Ein*ein", "Einstein"), 0);
assert_eq!(edit_distance_wildcards("M*e", "Marie"), 0);
}
#[test]
fn test_wildcard_combined() {
assert_eq!(edit_distance_wildcards("M?r?e C*", "Marie Curie"), 0);
assert_eq!(edit_distance_wildcards("?lbert *", "Albert Einstein"), 0);
}
#[test]
fn test_wildcard_no_match() {
assert!(edit_distance_wildcards("R?ma", "Paris") > 0);
assert!(edit_distance_wildcards("Ein*", "Newton") > 0);
}
// Without wildcards the function degenerates to plain Levenshtein.
#[test]
fn test_wildcard_exact_no_wildcards() {
assert_eq!(edit_distance_wildcards("hello", "hello"), 0);
assert_eq!(edit_distance_wildcards("hello", "hallo"), 1);
}
#[test]
fn test_wildcard_cjk() {
assert_eq!(edit_distance_wildcards("北?", "北京"), 0);
assert_eq!(edit_distance_wildcards("*京", "北京"), 0);
assert_eq!(edit_distance_wildcards("東京*", "東京都"), 0);
}
// Epigraphy-style use case: wildcards stand in for illegible letters.
#[test]
fn test_wildcard_damaged_inscription() {
assert_eq!(edit_distance_wildcards("???TOR", "CASTOR"), 0);
assert_eq!(edit_distance_wildcards("???TOR", "NESTOR"), 0);
assert_eq!(edit_distance_wildcards("CA*R", "CASTOR"), 0);
assert_eq!(edit_distance_wildcards("CA*R", "CAR"), 0);
}
#[test]
fn test_normalized_wildcard() {
assert!((normalized_edit_distance_wildcards("R?ma", "Roma") - 0.0).abs() < 0.001);
assert!(normalized_edit_distance_wildcards("R?ma", "Paris") > 0.0);
}
// Damerau counts an adjacent swap as one edit; plain Levenshtein
// needs two.
#[test]
fn test_damerau_transposition() {
assert_eq!(damerau_levenshtein("ab", "ba"), 1);
assert_eq!(levenshtein("ab", "ba"), 2);
assert_eq!(damerau_levenshtein("abc", "bac"), 1);
}
// With no transpositions involved the two distances agree.
#[test]
fn test_damerau_vs_levenshtein() {
assert_eq!(
damerau_levenshtein("kitten", "sitting"),
levenshtein("kitten", "sitting")
);
}
#[test]
fn test_damerau_common_typos() {
assert_eq!(damerau_levenshtein("teh", "the"), 1);
assert_eq!(damerau_levenshtein("recieve", "receive"), 1);
}
#[test]
fn test_find_closest() {
let candidates = vec!["Einstein", "Newton", "Curie", "Darwin"];
let (idx, dist) = find_closest("Einstien", &candidates).unwrap();
assert_eq!(idx, 0);
assert_eq!(dist, 2);
let (idx, _) = find_closest("Neuton", &candidates).unwrap();
assert_eq!(idx, 1); }
#[test]
fn test_find_closest_empty() {
let candidates: Vec<&str> = vec![];
assert!(find_closest("query", &candidates).is_none());
}
#[test]
fn test_find_within_distance() {
let candidates = vec!["cat", "car", "cart", "dog", "rat"];
let matches = find_within_distance("cat", &candidates, 1);
assert!(matches.contains(&(0, 0))); assert!(matches.contains(&(1, 1))); assert!(matches.contains(&(4, 1))); assert!(!matches.iter().any(|(i, _)| *i == 3)); }
// Documentation-only placeholder: composed "é" and "e" + combining
// acute are distinct `char` sequences here — no NFC/NFD is applied.
#[test]
fn test_unicode_normalization_awareness() {
let _composed = "café";
let _decomposed = "cafe\u{0301}";
}
// Each emoji below is a single Unicode scalar, hence one `char`.
#[test]
fn test_emoji() {
assert_eq!(levenshtein("🎉", "🎉"), 0);
assert_eq!(levenshtein("🎉🎊", "🎉"), 1);
let _family = "👨👩👧"; }
// Smoke test for the O(m·n) DP on larger inputs.
#[test]
fn test_very_long_strings() {
let a: String = "a".repeat(1000);
let b: String = "b".repeat(1000);
let dist = levenshtein(&a, &b);
assert_eq!(dist, 1000); }
// Unit weights must reproduce the unweighted distance exactly.
#[test]
fn test_weighted_default() {
let weights = EditWeights::default();
let d1 = levenshtein("kitten", "sitting");
let d2 = weighted_levenshtein("kitten", "sitting", weights);
assert!((d1 as f64 - d2).abs() < 0.001);
}
// Asymmetric weights make the distance directional: the insert-heavy
// direction must cost more than the delete-cheap one.
#[test]
fn test_weighted_asymmetric() {
let weights = EditWeights {
insert: 2.0,
delete: 0.5,
substitute: 1.0,
};
let d1 = weighted_levenshtein("abc", "abcd", weights); let d2 = weighted_levenshtein("abcd", "abc", weights);
assert!(d1 > d2, "Insertion should cost more");
}
}
// Property-based tests (proptest crate): metric axioms and other
// invariants checked against randomly generated strings.
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
// Random strings over ASCII plus several non-Latin scripts.
// NOTE(review): the filter label says "non-empty or short" but the
// predicate only bounds the byte length — empty strings ARE allowed;
// confirm the label is just stale.
fn arb_unicode_string() -> impl Strategy<Value = String> {
prop::string::string_regex("[a-zA-Z0-9 \\p{Han}\\p{Arabic}\\p{Cyrillic}\\p{Greek}]*")
.unwrap()
.prop_filter("non-empty or short", |s| s.len() < 200)
}
// Short ASCII-only strings keep the O(m·n) DP cheap across many cases.
fn arb_short_string() -> impl Strategy<Value = String> {
prop::string::string_regex("[a-zA-Z0-9]*")
.unwrap()
.prop_filter("reasonable length", |s| s.len() < 50)
}
proptest! {
// d(s, s) == 0.
#[test]
fn prop_levenshtein_identity(s in arb_unicode_string()) {
prop_assert_eq!(levenshtein(&s, &s), 0);
}
// d(a, b) == d(b, a) (symmetry).
#[test]
fn prop_levenshtein_symmetry(a in arb_short_string(), b in arb_short_string()) {
prop_assert_eq!(levenshtein(&a, &b), levenshtein(&b, &a));
}
// d(a, b) <= max(|a|, |b|): substitute everything, insert the rest.
#[test]
fn prop_levenshtein_upper_bound(a in arb_short_string(), b in arb_short_string()) {
let dist = levenshtein(&a, &b);
let max_len = a.chars().count().max(b.chars().count());
prop_assert!(dist <= max_len, "dist {} > max_len {}", dist, max_len);
}
// d(a, b) >= ||a| - |b||: each edit changes length by at most one.
#[test]
fn prop_levenshtein_lower_bound(a in arb_short_string(), b in arb_short_string()) {
let dist = levenshtein(&a, &b);
let len_a = a.chars().count() as i64;
let len_b = b.chars().count() as i64;
let min_dist = (len_a - len_b).unsigned_abs() as usize;
prop_assert!(dist >= min_dist, "dist {} < min_dist {}", dist, min_dist);
}
// d(a, c) <= d(a, b) + d(b, c) (triangle inequality).
#[test]
fn prop_levenshtein_triangle_inequality(
a in arb_short_string(),
b in arb_short_string(),
c in arb_short_string()
) {
let d_ac = levenshtein(&a, &c);
let d_ab = levenshtein(&a, &b);
let d_bc = levenshtein(&b, &c);
prop_assert!(
d_ac <= d_ab + d_bc,
"Triangle inequality violated: d({},{})={} > d({},{})={} + d({},{})={}",
a, c, d_ac, a, b, d_ab, b, c, d_bc
);
}
}
// Invariants of the normalized distance and its similarity complement.
proptest! {
#[test]
fn prop_normalized_bounds(a in arb_short_string(), b in arb_short_string()) {
let d = normalized_edit_distance(&a, &b);
prop_assert!(
(0.0..=1.0).contains(&d),
"Normalized distance {} out of bounds",
d
);
}
#[test]
fn prop_normalized_identity(s in arb_unicode_string()) {
let d = normalized_edit_distance(&s, &s);
prop_assert!((d - 0.0).abs() < 1e-10, "Self-distance should be 0, got {}", d);
}
#[test]
fn prop_normalized_symmetry(a in arb_short_string(), b in arb_short_string()) {
let d1 = normalized_edit_distance(&a, &b);
let d2 = normalized_edit_distance(&b, &a);
prop_assert!(
(d1 - d2).abs() < 1e-10,
"Asymmetric: d({},{})={} != d({},{})={}", a, b, d1, b, a, d2
);
}
// edit_similarity must be the exact complement of the distance.
#[test]
fn prop_similarity_distance_complement(a in arb_short_string(), b in arb_short_string()) {
let d = normalized_edit_distance(&a, &b);
let s = edit_similarity(&a, &b);
prop_assert!(
(s - (1.0 - d)).abs() < 1e-10,
"similarity {} != 1 - distance {}", s, d
);
}
}
// Damerau-Levenshtein invariants.
proptest! {
#[test]
fn prop_damerau_identity(s in arb_unicode_string()) {
prop_assert_eq!(damerau_levenshtein(&s, &s), 0);
}
#[test]
fn prop_damerau_symmetry(a in arb_short_string(), b in arb_short_string()) {
prop_assert_eq!(damerau_levenshtein(&a, &b), damerau_levenshtein(&b, &a));
}
// Allowing transpositions can only lower the cost, never raise it.
#[test]
fn prop_damerau_le_levenshtein(a in arb_short_string(), b in arb_short_string()) {
let damerau = damerau_levenshtein(&a, &b);
let lev = levenshtein(&a, &b);
prop_assert!(
damerau <= lev,
"Damerau {} > Levenshtein {} for ({}, {})", damerau, lev, a, b
);
}
}
// Wildcard-distance invariants.
proptest! {
// Wildcards can only reduce cost; without any the distances should
// coincide, so "<=" must hold (guard skips patterns containing them).
#[test]
fn prop_wildcard_le_regular(
pattern in arb_short_string(),
text in arb_short_string()
) {
if !pattern.contains('?') && !pattern.contains('*') {
let with_wildcards = edit_distance_wildcards(&pattern, &text);
let without = levenshtein(&pattern, &text);
prop_assert!(
with_wildcards <= without,
"Wildcard distance {} > regular {} for ({}, {})",
with_wildcards, without, pattern, text
);
}
}
#[test]
fn prop_question_mark_matches_one(c in "[a-zA-Z]") {
let pattern = "?";
let text = c.to_string();
prop_assert_eq!(
edit_distance_wildcards(pattern, &text), 0,
"? should match single char '{}'", c
);
}
#[test]
fn prop_star_matches_empty(_unused in Just(())) {
prop_assert_eq!(edit_distance_wildcards("*", ""), 0);
}
#[test]
fn prop_star_matches_any(s in arb_short_string()) {
prop_assert_eq!(
edit_distance_wildcards("*", &s), 0,
"* should match any string '{}'", s
);
}
}
// Weighted-distance invariants.
proptest! {
// Unit weights must agree with the unweighted integer distance.
#[test]
fn prop_weighted_default_equals_regular(a in arb_short_string(), b in arb_short_string()) {
let weighted = weighted_levenshtein(&a, &b, EditWeights::default());
let regular = levenshtein(&a, &b) as f64;
prop_assert!(
(weighted - regular).abs() < 1e-10,
"Weighted {} != regular {} for ({}, {})", weighted, regular, a, b
);
}
// Positive weights must always yield a non-negative distance.
#[test]
fn prop_weighted_non_negative(
a in arb_short_string(),
b in arb_short_string(),
insert in 0.1f64..5.0,
delete in 0.1f64..5.0,
substitute in 0.1f64..5.0
) {
let weights = EditWeights { insert, delete, substitute };
let d = weighted_levenshtein(&a, &b, weights);
prop_assert!(d >= 0.0, "Weighted distance {} < 0", d);
}
}
// Search-helper invariants.
proptest! {
// find_closest returns Some exactly when there is a candidate.
#[test]
fn prop_find_closest_returns_iff_nonempty(
query in arb_short_string(),
candidates in prop::collection::vec(arb_short_string(), 0..10)
) {
let candidate_refs: Vec<&str> = candidates.iter().map(|s| s.as_str()).collect();
let result = find_closest(&query, &candidate_refs);
if candidates.is_empty() {
prop_assert!(result.is_none());
} else {
prop_assert!(result.is_some());
}
}
#[test]
fn prop_find_closest_in_bounds(
query in arb_short_string(),
candidates in prop::collection::vec(arb_short_string(), 1..10)
) {
let candidate_refs: Vec<&str> = candidates.iter().map(|s| s.as_str()).collect();
if let Some((idx, _)) = find_closest(&query, &candidate_refs) {
prop_assert!(idx < candidates.len());
}
}
// Every reported match is in bounds and within the threshold.
#[test]
fn prop_find_within_distance_subset(
query in arb_short_string(),
candidates in prop::collection::vec(arb_short_string(), 0..10),
max_dist in 0usize..5
) {
let candidate_refs: Vec<&str> = candidates.iter().map(|s| s.as_str()).collect();
let results = find_within_distance(&query, &candidate_refs, max_dist);
for (idx, dist) in results {
prop_assert!(idx < candidates.len(), "Index {} out of bounds", idx);
prop_assert!(dist <= max_dist, "Distance {} > max {}", dist, max_dist);
}
}
}
}