// cofe 0.1.1
//
// A tiny string similarity crate.
// cofe, a tiny string similarity crate
// Copyright (c) 2023 fawn
//
// SPDX-License-Identifier: Apache-2.0

/// Computes the Jaro similarity between `s1` and `s2`.
///
/// Returns `1.0` when the strings are equal (two empty strings included) and
/// `0.0` when exactly one of them is empty or no characters match. Otherwise
/// the score is the mean of three ratios: matches over the length of `s1`,
/// matches over the length of `s2`, and non-transposed matches over matches.
///
/// Strings are compared per `char`, and every length used in the computation
/// (including the match window) is a `char` count, not a byte count.
pub fn jaro(s1: &str, s2: &str) -> f64 {
    // Equal inputs, including two empty strings, are a perfect match.
    if s1 == s2 {
        return 1.0;
    }
    if s1.is_empty() || s2.is_empty() {
        return 0.0;
    }

    let s1 = s1.chars().collect::<Vec<_>>();
    let s2 = s2.chars().collect::<Vec<_>>();

    // Characters only match when their positions differ by at most
    // `max(len) / 2 - 1`. The subtraction saturates so that a longest length
    // of 1 yields a window of 0 instead of underflowing `usize`.
    let max_dist = (s1.len().max(s2.len()) / 2).saturating_sub(1);

    let mut s1_hash = vec![false; s1.len()];
    let mut s2_hash = vec![false; s2.len()];
    let mut matches = 0_usize;

    for (i, x) in s1.iter().enumerate() {
        // Only positions `i - max_dist ..= i + max_dist` of `s2` can match;
        // `take` clamps the upper end to the length of `s2`.
        let lo = i.saturating_sub(max_dist);
        let hi = i + max_dist + 1;

        for (j, y) in s2.iter().enumerate().take(hi).skip(lo) {
            if !s2_hash[j] && x == y {
                s1_hash[i] = true;
                s2_hash[j] = true;
                matches += 1;
                // Each character of `s1` pairs with at most one of `s2`.
                break;
            }
        }
    }

    if matches == 0 {
        return 0.0;
    }

    // Walk the matched characters of both strings in order; every position
    // where they disagree is half of a transposition.
    let matched1 = s1.iter().zip(&s1_hash).filter(|&(_, &hit)| hit).map(|(c, _)| c);
    let matched2 = s2.iter().zip(&s2_hash).filter(|&(_, &hit)| hit).map(|(c, _)| c);
    let transpositions = matched1.zip(matched2).filter(|(a, b)| a != b).count() as f64 / 2.0;

    let matches = matches as f64;

    ((matches / s1.len() as f64)
        + (matches / s2.len() as f64)
        + ((matches - transpositions) / matches))
        / 3.0
}

/// Computes the Jaro-Winkler similarity between `s1` and `s2`.
///
/// Starts from the `jaro` score and moves it towards `1.0` by
/// `prefix_len * 0.1` of the remaining distance, where `prefix_len` is the
/// number of leading `char`s the two strings share, capped at four.
pub fn jaro_winkler(s1: &str, s2: &str) -> f64 {
    const MAX_PREFIX_LEN: usize = 4;
    const SCALING_FACTOR: f64 = 0.1;

    let base = jaro(s1, s2);

    // Count shared leading characters, stopping as soon as the cap is hit.
    let mut shared = 0_usize;
    for (a, b) in s1.chars().zip(s2.chars()) {
        if shared == MAX_PREFIX_LEN || a != b {
            break;
        }
        shared += 1;
    }

    let boost = shared as f64 * SCALING_FACTOR;
    boost.mul_add(1.0 - base, base)
}

#[cfg(test)]
mod tests {
    use super::{jaro, jaro_winkler};

    // Input pairs and the exact score `jaro` must return for each.
    const JARO_CASES: [(&str, &str, f64); 6] = [
        ("fly", "ant", 0.0),
        ("martha", "marhta", 0.9444444444444445),
        ("dwayne", "duane", 0.8222222222222223),
        ("dixon", "dicksonx", 0.7666666666666666),
        ("jellyfish", "smellyfish", 0.8962962962962964),
        ("FAREMVIEL", "FARMVILLE", 0.8842592592592592),
    ];

    // Input pairs and the exact score `jaro_winkler` must return for each.
    const JARO_WINKLER_CASES: [(&str, &str, f64); 3] = [
        ("fly", "ant", 0.0),
        ("DwAyNE", "DuANE", 0.8400000000000001),
        ("TRATE", "TRACE", 0.9066666666666667),
    ];

    #[test]
    fn test_jaro() {
        for &(left, right, expected) in &JARO_CASES {
            assert_eq!(jaro(left, right), expected);
        }
    }

    #[test]
    fn test_jaro_winkler() {
        for &(left, right, expected) in &JARO_WINKLER_CASES {
            assert_eq!(jaro_winkler(left, right), expected);
        }
    }
}