agnostic_levenshtein/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
//! This library provides a common algorithm for calculating the Levenshtein distance
//! between two strings, i.e., the minimum number of single-character edits (insertions,
//! deletions, or substitutions) required to change one string into the other. There is
//! a single public function, `edit_distance`, which takes two string references
//! (`&str`) and a `bool` flag indicating whether the strings can be treated as
//! ASCII-only. If the flag is set to false—the safer option—the strings will operated
//! on as sequences of `char`s, i.e., 32-bit Unicode scalar values. This does involve
//! more allocation and probably a longer running time than the ASCII case. The return
//! value of `edit_distance`, in any event, is the Levenshtein distance as `u32`.

#![forbid(unsafe_code)]
#![deny(missing_docs)]
#![warn(clippy::pedantic, clippy::nursery, clippy::cargo)]
#![allow(clippy::cast_possible_truncation)]

/// Returns the Levenshtein distance (`u32`) between two strings (`&str`), `a` and `b`.
/// The `ascii` flag indicates whether the strings can be treated as ASCII-only.
#[must_use]
pub fn edit_distance(a: &str, b: &str, ascii: bool) -> u32 {
    if a == b {
        0
    } else if ascii {
        min_distance(a.as_bytes(), b.as_bytes())
    } else {
        let a_chars: Vec<char> = a.chars().collect();
        let b_chars: Vec<char> = b.chars().collect();
        min_distance(&a_chars, &b_chars)
    }
}

fn min_distance<T: PartialEq>(a: &[T], b: &[T]) -> u32 {
    let m = a.len();
    let n = b.len();

    if m == 0 {
        return n as u32;
    }
    if n == 0 {
        return m as u32;
    }

    let mut dp: Vec<u32> = (1..).take(m).collect();

    for (row, char_b) in b.iter().enumerate() {
        let mut left = row as u32;
        let mut diag = row as u32;

        for (col, char_a) in a.iter().enumerate() {
            let insert = left + 1;
            let delete = dp[col] + 1;
            let subst = if char_a == char_b { diag } else { diag + 1 };

            let min_cost = insert.min(delete).min(subst);

            diag = dp[col]; // Save for next iteration of inner loop
            left = min_cost;
            dp[col] = min_cost;
        }
    }

    dp[m - 1]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sitting_kitten() {
        assert_eq!(edit_distance("sitting", "kitten", true), 3);
    }

    #[test]
    fn ascii_no_difference() {
        let a = "If I were a wise man";
        let b = "I would do my part";
        let ascii_dist = edit_distance(a, b, true);
        let unicode_dist = edit_distance(a, b, false);
        assert_eq!(ascii_dist, unicode_dist);
    }

    #[test]
    fn accents_difference() {
        let a = "ʿAlī ibn Abī Ṭālib";
        let b = "ʿUthmān ibn ʿAffān";
        let ascii_dist = edit_distance(a, b, true);
        let unicode_dist = edit_distance(a, b, false);
        assert_ne!(ascii_dist, unicode_dist);
    }

    #[test]
    fn shahnama_unicode() {
        let a = "شاهنامه";
        let b = "شهنامه";
        assert_eq!(edit_distance(a, b, false), 1);
    }

    #[test]
    fn shahnama_ascii() {
        let a = "شاهنامه";
        let b = "شهنامه";
        assert_eq!(edit_distance(a, b, true), 2);
    }

    #[test]
    fn empty_ascii() {
        let a = "levenshtein";
        let b = "";
        assert_eq!(edit_distance(a, b, true), 11);
    }

    #[test]
    fn empty_unicode() {
        let a = "maḥmūd";
        let b = "";
        assert_eq!(edit_distance(a, b, false), 6);
    }

    #[test]
    fn equal_regardless() {
        let a = "Ghiyāth al-Dīn";
        let b = "Ghiyāth al-Dīn";
        assert_eq!(edit_distance(a, b, true), 0);
    }
}