ragit_cli/
dist.rs

1// https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
2fn edit_distance(a: &[u8], b: &[u8]) -> usize {
3    if a.is_empty() {
4        if b.is_empty() {
5            0
6        }
7
8        else {
9            b.len()
10        }
11    }
12
13    else if b.is_empty() {
14        a.len()
15    }
16
17    else {
18        let i = a.len();
19        let j = b.len();
20        let mut cache = vec![vec![usize::MAX; j]; i];
21
22        edit_distance_impl(a, b, i - 1, j - 1, &mut cache)
23    }
24}
25
26// lowercase
27// remove `_`s
28fn preprocess(s: &[u8]) -> Vec<u8> {
29    // `cache` uses O(n * m) space and I want to prevent it from OOM.
30    let s = if s.len() > 256 {
31        &s[..256]
32    } else {
33        s
34    };
35
36    s.iter().map(
37        |c| {
38            let mut c = *c;
39            c.make_ascii_lowercase();
40
41            c
42        }
43    ).filter(
44        |c| *c != b'_'
45    ).collect()
46}
47
48pub fn get_closest_string(
49    candidates: &[String],
50    input: &str,
51) -> Option<String> {
52    let b = input.as_bytes();
53    let mut close_strings = vec![];
54
55    for c in candidates.iter() {
56        let dist = substr_edit_distance(b, c.as_bytes());
57
58        // Different strings can have 0-distance, since it does a case-insensitive comparison
59        if dist <= input.len().min(c.len()) / 3 {
60            close_strings.push((c.to_string(), dist));
61
62            if dist == 0 {
63                break;
64            }
65        }
66    }
67
68    close_strings.sort_by_key(|(_, dist)| *dist);
69    close_strings.get(0).map(|(s, _)| s.to_string())
70}
71
72// VERY EXPENSIVE
73// Please make sure that either `sub` or `s` is a short string, like a name of a command or a flag.
74pub fn substr_edit_distance(sub: &[u8], s: &[u8]) -> usize {
75    let sub = &preprocess(sub);
76    let s = &preprocess(s);
77
78    if sub == s {
79        0
80    }
81
82    // I found that `edit_distance` cannot handle this case, I should fix that later
83    else if sub.len() == 1 && s.len() == 1 {
84        return (sub != s) as usize;
85    }
86
87    else if sub.len() > s.len() || s.len() < 4 {
88        edit_distance(sub, s)
89    }
90
91    else if sub.len() * 2 > s.len() {
92        let mut result = usize::MAX;
93
94        for start in 0..s.len() {
95            for end in (start + 1)..(s.len() + 1) {
96                result = result.min(edit_distance(sub, &s[start..end]));
97            }
98        }
99
100        result
101    }
102
103    else {
104        edit_distance(sub, s)
105    }
106}
107
108fn edit_distance_impl(a: &[u8], b: &[u8], i: usize, j: usize, cache: &mut Vec<Vec<usize>>) -> usize {
109    let indicator = (a[i] != b[j]) as usize;
110
111    if i == 0 && j == 0 {
112        return indicator;
113    }
114
115    if cache[i][j] != usize::MAX {
116        return cache[i][j];
117    }
118
119    let mut result = usize::MAX;
120
121    if i > 0 {
122        result = result.min(edit_distance_impl(a, b, i - 1, j, cache) + 1);
123    }
124
125    if j > 0 {
126        result = result.min(edit_distance_impl(a, b, i, j - 1, cache) + 1);
127    }
128
129    if i > 0 && j > 0 {
130        result = result.min(edit_distance_impl(a, b, i - 1, j - 1, cache) + indicator);
131    }
132
133    if i > 1 && j > 1 && a[i] == b[j - 1] && a[i - 1] == b[j] {
134        result = result.min(edit_distance_impl(a, b, i - 2, j - 2, cache) + indicator);
135    }
136
137    cache[i][j] = result;
138    result
139}
140
141#[test]
142fn dist_test() {
143    assert_eq!(substr_edit_distance(b"x", b"X"), 0);
144    assert_eq!(substr_edit_distance(b"x", b"y"), 1);
145    assert_eq!(substr_edit_distance(b"x", b"x1"), 1);
146    assert_eq!(substr_edit_distance(b"item", b"itme"), 1);
147    assert_eq!(substr_edit_distance(b"item", b"itm"), 1);
148    assert_eq!(substr_edit_distance(b"time", b"tiem"), 1);
149    assert_eq!(substr_edit_distance(b"time", b"sime"), 1);
150    assert_eq!(substr_edit_distance(b"Internal", b"Interal"), 1);
151    assert_eq!(substr_edit_distance(b"Interal", b"Internal"), 1);
152    assert_eq!(substr_edit_distance(b"qqqqq", b"qqqqq"), 0);
153    assert_eq!(substr_edit_distance(b"qqqqq", b"cqqqq"), 1);
154    assert_eq!(substr_edit_distance(b"cqqqq", b"qqqqq"), 1);
155    assert_eq!(substr_edit_distance(b"query", b"qeury"), 1);
156    assert_eq!(substr_edit_distance(b"interactive", b"intercative"), 1);
157    assert_eq!(substr_edit_distance(b"HTML", b"Programming Language"), 18);
158
159    assert_eq!(substr_edit_distance(b"edit_distan", b"substr_edit_distance"), 0);
160    assert_eq!(substr_edit_distance(b"edit_dustan", b"substr_edit_distance"), 1);
161
162    assert!(substr_edit_distance(
163        "Very Very Long String: I want to make sure that `edit_distance` is not an O(a^n) algorithm".repeat(256).as_bytes(),
164        "Another very very long string... 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".repeat(256).as_bytes(),
165    ) > 10 /* the result doesn't matter, i just want to make sure that this code terminates in reasonable time, without causing OOM */ );
166}