pub fn levenshtein(a: &str, b: &str) -> usize {
let a_bytes = a.as_bytes();
let b_bytes = b.as_bytes();
let a_len = a_bytes.len();
let b_len = b_bytes.len();
if a_len == 0 {
return b_len;
}
if b_len == 0 {
return a_len;
}
if a_len > b_len {
return levenshtein(b, a);
}
let mut prev_row: Vec<usize> = (0..=a_len).collect();
let mut curr_row: Vec<usize> = vec![0; a_len + 1];
for (j, b_byte) in b_bytes.iter().enumerate() {
curr_row[0] = j + 1;
for (i, a_byte) in a_bytes.iter().enumerate() {
let cost = if a_byte == b_byte { 0 } else { 1 };
curr_row[i + 1] = (prev_row[i + 1] + 1) .min(curr_row[i] + 1) .min(prev_row[i] + cost); }
std::mem::swap(&mut prev_row, &mut curr_row);
}
prev_row[a_len]
}
pub fn max_distance_for_length(len: usize) -> usize {
match len {
0..=3 => 0, 4..=6 => 1, _ => 2, }
}
pub fn fuzzy_match<'a>(
query_term: &str,
index_terms: impl Iterator<Item = &'a str>,
) -> Vec<(&'a str, usize)> {
let max_dist = max_distance_for_length(query_term.len());
if max_dist == 0 {
return Vec::new();
}
let mut matches: Vec<(&str, usize)> = index_terms
.filter_map(|term| {
let len_diff = query_term.len().abs_diff(term.len());
if len_diff > max_dist {
return None;
}
let dist = levenshtein(query_term, term);
if dist > 0 && dist <= max_dist {
Some((term, dist))
} else {
None
}
})
.collect();
matches.sort_by_key(|&(_, d)| d);
matches
}
pub fn fuzzy_discount(distance: usize) -> f32 {
match distance {
0 => 1.0,
1 => 0.7,
2 => 0.4,
_ => 0.2,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn levenshtein_basic() {
assert_eq!(levenshtein("", ""), 0);
assert_eq!(levenshtein("abc", ""), 3);
assert_eq!(levenshtein("", "xyz"), 3);
assert_eq!(levenshtein("kitten", "sitting"), 3);
assert_eq!(levenshtein("saturday", "sunday"), 3);
}
#[test]
fn levenshtein_same() {
assert_eq!(levenshtein("database", "database"), 0);
}
#[test]
fn levenshtein_one_edit() {
assert_eq!(levenshtein("database", "databse"), 1);
assert_eq!(levenshtein("index", "indx"), 1);
}
#[test]
fn max_distance_thresholds() {
assert_eq!(max_distance_for_length(2), 0);
assert_eq!(max_distance_for_length(3), 0);
assert_eq!(max_distance_for_length(4), 1);
assert_eq!(max_distance_for_length(6), 1);
assert_eq!(max_distance_for_length(7), 2);
assert_eq!(max_distance_for_length(10), 2);
}
#[test]
fn fuzzy_match_finds_typos() {
let index = ["database", "distributed", "document", "data", "date"];
let matches = fuzzy_match("databse", index.iter().copied());
assert!(!matches.is_empty());
assert_eq!(matches[0].0, "database");
assert_eq!(matches[0].1, 1);
}
#[test]
fn fuzzy_match_respects_length_threshold() {
let index = ["cat", "bat", "car"];
let matches = fuzzy_match("cat", index.iter().copied());
assert!(matches.is_empty());
}
}