fn edit_distance(a: &[u8], b: &[u8]) -> usize {
if a.is_empty() {
if b.is_empty() {
0
}
else {
b.len()
}
}
else if b.is_empty() {
a.len()
}
else {
let i = a.len();
let j = b.len();
let mut cache = vec![vec![usize::MAX; j]; i];
edit_distance_impl(a, b, i - 1, j - 1, &mut cache)
}
}
fn preprocess(s: &[u8]) -> Vec<u8> {
let s = if s.len() > 256 {
&s[..256]
} else {
s
};
s.iter().map(
|c| {
let mut c = *c;
c.make_ascii_lowercase();
c
}
).filter(
|c| *c != b'_'
).collect()
}
pub fn get_closest_string(
candidates: &[String],
input: &str,
) -> Option<String> {
let b = input.as_bytes();
let mut close_strings = vec![];
for c in candidates.iter() {
let dist = substr_edit_distance(b, c.as_bytes());
if dist <= input.len().min(c.len()) / 3 {
close_strings.push((c.to_string(), dist));
if dist == 0 {
break;
}
}
}
close_strings.sort_by_key(|(_, dist)| *dist);
close_strings.get(0).map(|(s, _)| s.to_string())
}
pub fn substr_edit_distance(sub: &[u8], s: &[u8]) -> usize {
let sub = &preprocess(sub);
let s = &preprocess(s);
if sub == s {
0
}
else if sub.len() == 1 && s.len() == 1 {
return (sub != s) as usize;
}
else if sub.len() > s.len() || s.len() < 4 {
edit_distance(sub, s)
}
else if sub.len() * 2 > s.len() {
let mut result = usize::MAX;
for start in 0..s.len() {
for end in (start + 1)..(s.len() + 1) {
result = result.min(edit_distance(sub, &s[start..end]));
}
}
result
}
else {
edit_distance(sub, s)
}
}
fn edit_distance_impl(a: &[u8], b: &[u8], i: usize, j: usize, cache: &mut Vec<Vec<usize>>) -> usize {
let indicator = (a[i] != b[j]) as usize;
if i == 0 && j == 0 {
return indicator;
}
if cache[i][j] != usize::MAX {
return cache[i][j];
}
let mut result = usize::MAX;
if i > 0 {
result = result.min(edit_distance_impl(a, b, i - 1, j, cache) + 1);
}
if j > 0 {
result = result.min(edit_distance_impl(a, b, i, j - 1, cache) + 1);
}
if i > 0 && j > 0 {
result = result.min(edit_distance_impl(a, b, i - 1, j - 1, cache) + indicator);
}
if i > 1 && j > 1 && a[i] == b[j - 1] && a[i - 1] == b[j] {
result = result.min(edit_distance_impl(a, b, i - 2, j - 2, cache) + indicator);
}
cache[i][j] = result;
result
}
#[test]
fn dist_test() {
assert_eq!(substr_edit_distance(b"x", b"X"), 0);
assert_eq!(substr_edit_distance(b"x", b"y"), 1);
assert_eq!(substr_edit_distance(b"x", b"x1"), 1);
assert_eq!(substr_edit_distance(b"item", b"itme"), 1);
assert_eq!(substr_edit_distance(b"item", b"itm"), 1);
assert_eq!(substr_edit_distance(b"time", b"tiem"), 1);
assert_eq!(substr_edit_distance(b"time", b"sime"), 1);
assert_eq!(substr_edit_distance(b"Internal", b"Interal"), 1);
assert_eq!(substr_edit_distance(b"Interal", b"Internal"), 1);
assert_eq!(substr_edit_distance(b"qqqqq", b"qqqqq"), 0);
assert_eq!(substr_edit_distance(b"qqqqq", b"cqqqq"), 1);
assert_eq!(substr_edit_distance(b"cqqqq", b"qqqqq"), 1);
assert_eq!(substr_edit_distance(b"query", b"qeury"), 1);
assert_eq!(substr_edit_distance(b"interactive", b"intercative"), 1);
assert_eq!(substr_edit_distance(b"HTML", b"Programming Language"), 18);
assert_eq!(substr_edit_distance(b"edit_distan", b"substr_edit_distance"), 0);
assert_eq!(substr_edit_distance(b"edit_dustan", b"substr_edit_distance"), 1);
assert!(substr_edit_distance(
"Very Very Long String: I want to make sure that `edit_distance` is not an O(a^n) algorithm".repeat(256).as_bytes(),
"Another very very long string... 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".repeat(256).as_bytes(),
) > 10 );
}