textdistance/algorithms/
levenshtein.rs

1//! Levenshtein distance
2use crate::{Algorithm, Result};
3use alloc::vec::Vec;
4
5/// [Levenshtein distance] is an edit distance between two sequences.
6///
7/// It is the minimum number of single-character edits (insertions, deletions or substitutions)
8/// required to change one word into the other.
9///
10/// See also [`DamerauLevenshtein`](crate::DamerauLevenshtein) which is an extended
11/// version of this algorithm that also includes transpositions.
12///
13/// [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance
14pub struct Levenshtein {
15    /// The cost of removing a character.
16    pub del_cost: usize,
17
18    /// The cost of adding a new character.
19    pub ins_cost: usize,
20
21    /// The cost of replacing a character with another one.
22    pub sub_cost: usize,
23}
24
25impl Default for Levenshtein {
26    fn default() -> Self {
27        Self {
28            del_cost: 1,
29            ins_cost: 1,
30            sub_cost: 1,
31        }
32    }
33}
34
35impl Algorithm<usize> for Levenshtein {
36    fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<usize>
37    where
38        C: Iterator<Item = E>,
39        E: Eq,
40    {
41        let s1: Vec<E> = s1.collect();
42        let l1 = s1.len();
43        if l1 == 0 {
44            let l2 = s2.count();
45            return Result {
46                abs: l2,
47                is_distance: true,
48                max: l1.max(l2),
49                len1: l1,
50                len2: l2,
51            };
52        }
53
54        let mut cache: Vec<usize> = (1..).take(l1).collect();
55        let mut dist1;
56        let mut dist2;
57
58        let mut result = 0;
59        let mut l2 = 0;
60        for (i2, c2) in s2.enumerate() {
61            result = i2;
62            dist1 = i2;
63            l2 += 1;
64
65            for (i1, c1) in s1.iter().enumerate() {
66                dist2 = if c1 == &c2 {
67                    dist1
68                } else {
69                    dist1 + self.sub_cost
70                };
71                dist1 = cache[i1];
72                result = if dist1 > result {
73                    if dist2 > result {
74                        result + self.del_cost
75                    } else {
76                        dist2
77                    }
78                } else if dist2 > dist1 {
79                    dist1 + self.ins_cost
80                } else {
81                    dist2
82                };
83                cache[i1] = result;
84            }
85        }
86        if l2 == 0 {
87            return Result {
88                abs: l1,
89                is_distance: true,
90                max: l1.max(l2),
91                len1: l1,
92                len2: l2,
93            };
94        }
95        Result {
96            abs: result,
97            is_distance: true,
98            max: l1.max(l2),
99            len1: l1,
100            len2: l2,
101        }
102    }
103}
104
105#[cfg(test)]
106mod tests {
107    use crate::str::levenshtein;
108    use assert2::assert;
109    use proptest::prelude::*;
110    use rstest::rstest;
111
112    #[rstest]
113    #[case("", "", 0)]
114    #[case("", "\0", 1)]
115    #[case("", "abc", 3)]
116    #[case("abc", "", 3)]
117    #[case("sitting", "sitting", 0)]
118    #[case("sitting", "kitten", 3)]
119    #[case("example", "samples", 3)]
120    #[case("distance", "difference", 5)]
121    #[case("test", "text", 1)]
122    #[case("test", "tset", 2)]
123    #[case("test", "qwe", 4)]
124    #[case("test", "testit", 2)]
125    #[case("test", "tesst", 1)]
126    #[case("test", "tet", 1)]
127    // parity with levenshtein-rs
128    #[case("sitting", "kitten", 3)]
129    #[case("gumbo", "gambol", 2)]
130    #[case("saturday", "sunday", 3)]
131    #[case("a", "b", 1)]
132    #[case("ab", "ac", 1)]
133    #[case("ac", "bc", 1)]
134    #[case("abc", "axc", 1)]
135    #[case("xabxcdxxefxgx", "1ab2cd34ef5g6", 6)]
136    #[case("xabxcdxxefxgx", "abcdefg", 6)]
137    #[case("javawasneat", "scalaisgreat", 7)]
138    #[case("example", "samples", 3)]
139    #[case("sturgeon", "urgently", 6)]
140    #[case("levenshtein", "frankenstein", 6)]
141    #[case("distance", "difference", 5)]
142    #[case("kitten", "sitting", 3)]
143    #[case("Tier", "Tor", 2)]
144    fn function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: usize) {
145        assert!(levenshtein(s1, s2) == exp);
146    }
147
148    proptest! {
149        #[test]
150        fn prop(s1 in ".*", s2 in ".*") {
151            let res = levenshtein(&s1, &s2);
152            let res2 = levenshtein(&s2, &s1);
153            prop_assert_eq!(res, res2);
154            prop_assert!(res <= s1.len() || res <= s2.len());
155        }
156    }
157}