Skip to main content

lance_core/
levenshtein.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4/// Calculate the Levenshtein distance between two strings.
5///
6/// The Levenshtein distance is a measure of the number of single-character edits
7/// (insertions, deletions, or substitutions) required to change one word into the other.
8///
9/// # Examples
10///
11/// ```
12/// use lance_core::levenshtein::levenshtein_distance;
13///
14/// assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
15/// assert_eq!(levenshtein_distance("hello", "hello"), 0);
16/// ```
17pub fn levenshtein_distance(s1: &str, s2: &str) -> usize {
18    let s1_chars: Vec<char> = s1.chars().collect();
19    let s2_chars: Vec<char> = s2.chars().collect();
20    let m = s1_chars.len();
21    let n = s2_chars.len();
22
23    if m == 0 {
24        return n;
25    }
26    if n == 0 {
27        return m;
28    }
29
30    // Use two rows instead of full matrix for space efficiency
31    let mut prev_row: Vec<usize> = (0..=n).collect();
32    let mut curr_row: Vec<usize> = vec![0; n + 1];
33
34    for (i, s1_char) in s1_chars.iter().enumerate() {
35        curr_row[0] = i + 1;
36        for (j, s2_char) in s2_chars.iter().enumerate() {
37            let cost = if s1_char == s2_char { 0 } else { 1 };
38            curr_row[j + 1] = (prev_row[j + 1] + 1)
39                .min(curr_row[j] + 1)
40                .min(prev_row[j] + cost);
41        }
42        std::mem::swap(&mut prev_row, &mut curr_row);
43    }
44
45    prev_row[n]
46}
47
48/// Find the best suggestion from a list of options based on Levenshtein distance.
49///
50/// Returns `Some(suggestion)` if there's an option where the Levenshtein distance
51/// is at most 1/3 of the length of the input string (integer division).
52/// Otherwise returns `None`.
53///
54/// # Examples
55///
56/// ```
57/// use lance_core::levenshtein::find_best_suggestion;
58///
59/// let options = vec!["vector", "id", "name"];
60/// assert_eq!(find_best_suggestion("vacter", &options), Some("vector"));
61/// assert_eq!(find_best_suggestion("hello", &options), None);
62/// ```
63pub fn find_best_suggestion<'a, 'b>(
64    input: &'a str,
65    options: &'b [impl AsRef<str>],
66) -> Option<&'b str> {
67    let input_len = input.chars().count();
68    if input_len == 0 {
69        return None;
70    }
71
72    let threshold = input_len / 3;
73    let mut best_option: Option<(&'b str, usize)> = None;
74    for option in options {
75        let distance = levenshtein_distance(input, option.as_ref());
76        if distance <= threshold {
77            match &best_option {
78                None => best_option = Some((option.as_ref(), distance)),
79                Some((_, best_distance)) => {
80                    if distance < *best_distance {
81                        best_option = Some((option.as_ref(), distance));
82                    }
83                }
84            }
85        }
86    }
87
88    best_option.map(|(option, _)| option)
89}
90
91#[cfg(test)]
92mod tests {
93    use super::*;
94
95    #[test]
96    fn test_levenshtein_distance() {
97        assert_eq!(levenshtein_distance("", ""), 0);
98        assert_eq!(levenshtein_distance("a", ""), 1);
99        assert_eq!(levenshtein_distance("", "a"), 1);
100        assert_eq!(levenshtein_distance("abc", "abc"), 0);
101        assert_eq!(levenshtein_distance("abc", ""), 3);
102        assert_eq!(levenshtein_distance("", "abc"), 3);
103        assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
104        assert_eq!(levenshtein_distance("saturday", "sunday"), 3);
105        assert_eq!(levenshtein_distance("vector", "vectr"), 1);
106        assert_eq!(levenshtein_distance("vector", "vextor"), 1);
107        assert_eq!(levenshtein_distance("vector", "vvector"), 1);
108        assert_eq!(levenshtein_distance("abc", "xyz"), 3);
109    }
110
111    #[test]
112    fn test_find_best_suggestion() {
113        let options = vec!["vector", "id", "name", "column", "table"];
114
115        assert_eq!(find_best_suggestion("vacter", &options), Some("vector"));
116        assert_eq!(find_best_suggestion("vectr", &options), Some("vector"));
117        assert_eq!(find_best_suggestion("tble", &options), Some("table"));
118
119        // Should return None if no good match
120        assert_eq!(find_best_suggestion("hello", &options), None);
121        assert_eq!(find_best_suggestion("xyz", &options), None);
122
123        // Should return None if input is too short
124        assert_eq!(find_best_suggestion("v", &options), None);
125        assert_eq!(find_best_suggestion("", &options), None);
126
127        // Picks closest when multiple are close
128        assert_eq!(
129            find_best_suggestion("vecor", &["vector", "vendor"]),
130            Some("vector")
131        );
132    }
133}