agnostic_levenshtein/
lib.rs

1//! This library provides a common algorithm for calculating the Levenshtein distance
2//! between two strings, i.e., the minimum number of single-character edits (insertions,
3//! deletions, or substitutions) required to change one string into the other. There is
4//! a single public function, `edit_distance`, which takes two string references
5//! (`&str`) and a `bool` flag indicating whether the strings can be treated as
6//! ASCII-only. If the flag is set to false—the safer option—the strings will operated
7//! on as sequences of `char`s, i.e., 32-bit Unicode scalar values. This does involve
8//! more allocation and probably a longer running time than the ASCII case. The return
9//! value of `edit_distance`, in any event, is the Levenshtein distance as `u32`.
10
11#![forbid(unsafe_code)]
12#![deny(missing_docs)]
13#![warn(clippy::pedantic, clippy::nursery, clippy::cargo)]
14#![allow(clippy::cast_possible_truncation)]
15
16use std::mem::swap;
17
18/// Returns the Levenshtein distance (`u32`) between two strings (`&str`), `a` and `b`.
19/// The `ascii` flag indicates whether the strings can be treated as ASCII-only.
20#[must_use]
21pub fn edit_distance(a: &str, b: &str, ascii: bool) -> u32 {
22    // Handle edge cases as early as possible
23    if a == b {
24        return 0;
25    }
26
27    if a.is_empty() {
28        if ascii {
29            return b.len() as u32;
30        }
31        return b.chars().count() as u32;
32    }
33
34    if b.is_empty() {
35        if ascii {
36            return a.len() as u32;
37        }
38        return a.chars().count() as u32;
39    }
40
41    if ascii {
42        min_distance(a.as_bytes(), b.as_bytes())
43    } else {
44        let a_chars: Vec<char> = a.chars().collect();
45        let b_chars: Vec<char> = b.chars().collect();
46        min_distance(&a_chars, &b_chars)
47    }
48}
49
50fn min_distance<T: PartialEq>(a: &[T], b: &[T]) -> u32 {
51    // We already know: strings are not equal; neither string is empty
52    let m = a.len();
53
54    // "Previous row" is initialized with the base case:
55    // the distance from an empty string to each prefix of `a`.
56    let mut dp_prev: Vec<u32> = (0..=m as u32).collect();
57    let mut dp_curr: Vec<u32> = vec![0; m + 1];
58
59    for (i, b_char) in b.iter().enumerate() {
60        // i.e., cost of deleting all chars from `b` up to this point
61        dp_curr[0] = i as u32 + 1;
62
63        for j in 1..=m {
64            if a[j - 1] == *b_char {
65                dp_curr[j] = dp_prev[j - 1];
66                continue;
67            }
68
69            let insert = dp_curr[j - 1] + 1;
70            let delete = dp_prev[j] + 1;
71            let substitute = dp_prev[j - 1] + 1;
72
73            dp_curr[j] = insert.min(delete).min(substitute);
74        }
75
76        // `curr` becomes `prev` for next iteration
77        swap(&mut dp_prev, &mut dp_curr);
78    }
79
80    dp_prev[m]
81}
82
83#[cfg(test)]
84mod tests {
85    use super::*;
86
87    #[test]
88    fn sitting_kitten() {
89        assert_eq!(edit_distance("sitting", "kitten", true), 3);
90    }
91
92    #[test]
93    fn ascii_no_difference() {
94        let a = "If I were a wise man";
95        let b = "I would do my part";
96        let ascii_dist = edit_distance(a, b, true);
97        let unicode_dist = edit_distance(a, b, false);
98        assert_eq!(ascii_dist, unicode_dist);
99    }
100
101    #[test]
102    fn accents_difference() {
103        let a = "ʿAlī ibn Abī Ṭālib";
104        let b = "ʿUthmān ibn ʿAffān";
105        let ascii_dist = edit_distance(a, b, true);
106        let unicode_dist = edit_distance(a, b, false);
107        assert_ne!(ascii_dist, unicode_dist);
108    }
109
110    #[test]
111    fn shahnama_unicode() {
112        let a = "شاهنامه";
113        let b = "شهنامه";
114        assert_eq!(edit_distance(a, b, false), 1);
115    }
116
117    #[test]
118    fn shahnama_ascii() {
119        let a = "شاهنامه";
120        let b = "شهنامه";
121        assert_eq!(edit_distance(a, b, true), 2);
122    }
123
124    #[test]
125    fn empty_ascii() {
126        let a = "levenshtein";
127        let b = "";
128        assert_eq!(edit_distance(a, b, true), 11);
129    }
130
131    #[test]
132    fn empty_unicode() {
133        let a = "maḥmūd";
134        let b = "";
135        assert_eq!(edit_distance(a, b, false), 6);
136    }
137
138    #[test]
139    fn equal_regardless() {
140        let a = "Ghiyāth al-Dīn";
141        let b = "Ghiyāth al-Dīn";
142        assert_eq!(edit_distance(a, b, true), 0);
143    }
144}