pub fn osa_distance(a: &str, b: &str) -> usize {
if a == b {
return 0;
}
if a.is_empty() {
return b.chars().count();
}
if b.is_empty() {
return a.chars().count();
}
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let a_len = a_chars.len();
let b_len = b_chars.len();
let mut matrix = vec![vec![0; b_len + 1]; a_len + 1];
#[allow(clippy::needless_range_loop)]
for i in 0..=a_len {
matrix[i][0] = i;
}
for (j, val) in matrix[0].iter_mut().enumerate().take(b_len + 1) {
*val = j;
}
for i in 1..=a_len {
for j in 1..=b_len {
let cost = if a_chars[i - 1] == b_chars[j - 1] {
0
} else {
1
};
let deletion = matrix[i - 1][j] + 1;
let insertion = matrix[i][j - 1] + 1;
let substitution = matrix[i - 1][j - 1] + cost;
let mut min_dist = deletion.min(insertion).min(substitution);
if i > 1
&& j > 1
&& a_chars[i - 1] == b_chars[j - 2]
&& a_chars[i - 2] == b_chars[j - 1]
{
let transposition = matrix[i - 2][j - 2] + 1;
min_dist = min_dist.min(transposition);
}
matrix[i][j] = min_dist;
}
}
matrix[a_len][b_len]
}
pub fn normalized_osa(a: &str, b: &str) -> f64 {
if a == b {
return 1.0;
}
let distance = osa_distance(a, b);
let max_len = a.chars().count().max(b.chars().count());
if max_len == 0 {
return 1.0;
}
1.0 - (distance as f64 / max_len as f64)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_osa_identical() {
assert_eq!(osa_distance("hello", "hello"), 0);
}
#[test]
fn test_osa_empty() {
assert_eq!(osa_distance("", "hello"), 5);
assert_eq!(osa_distance("hello", ""), 5);
assert_eq!(osa_distance("", ""), 0);
}
#[test]
fn test_osa_transposition() {
assert_eq!(osa_distance("ca", "ac"), 1);
assert_eq!(osa_distance("abcd", "acbd"), 1);
assert_eq!(osa_distance("abc", "bca"), 2);
}
#[test]
fn test_osa_vs_levenshtein() {
use crate::levenshtein;
let osa = osa_distance("ca", "ac");
let lev = levenshtein("ca", "ac");
assert!(osa <= lev);
}
#[test]
fn test_normalized_osa() {
assert_eq!(normalized_osa("hello", "hello"), 1.0);
assert!(normalized_osa("abc", "xyz") < 0.5);
}
}