textdistance 1.1.1

Lots of algorithms to compare how similar two sequences are
Documentation
#![cfg_attr(not(any(feature = "std", test)), no_std)]
#![doc = include_str!("../README.md")]
#![deny(missing_docs)]
#![deny(clippy::all, clippy::pedantic)]
#![allow(
    clippy::cast_precision_loss,
    clippy::must_use_candidate,
    clippy::similar_names,
    clippy::unreadable_literal,
    clippy::doc_markdown,
    clippy::wildcard_imports
)]

extern crate alloc;

mod algorithm;
mod counter;
mod result;

pub mod nstr;
pub mod str;

mod algorithms {
    pub mod bag;
    pub mod cosine;
    pub mod damerau_levenshtein;
    pub mod entropy_ncd;
    pub mod hamming;
    pub mod jaccard;
    pub mod jaro;
    pub mod jaro_winkler;
    pub mod lcsseq;
    pub mod lcsstr;
    pub mod length;
    pub mod levenshtein;
    pub mod lig3;
    pub mod mlipns;
    pub mod overlap;
    pub mod prefix;
    pub mod ratcliff_obershelp;
    pub mod roberts;
    pub mod sift4_common;
    pub mod sift4_simple;
    pub mod smith_waterman;
    pub mod sorensen_dice;
    pub mod suffix;
    pub mod tversky;
    pub mod yujian_bo;
}

pub use self::algorithm::Algorithm;
#[cfg(feature = "std")]
pub use self::algorithms::bag::Bag;
#[cfg(feature = "std")]
pub use self::algorithms::cosine::Cosine;
#[cfg(feature = "std")]
pub use self::algorithms::damerau_levenshtein::DamerauLevenshtein;
#[cfg(feature = "std")]
pub use self::algorithms::entropy_ncd::EntropyNCD;
pub use self::algorithms::hamming::Hamming;
#[cfg(feature = "std")]
pub use self::algorithms::jaccard::Jaccard;
pub use self::algorithms::jaro::Jaro;
pub use self::algorithms::jaro_winkler::JaroWinkler;
pub use self::algorithms::lcsseq::LCSSeq;
pub use self::algorithms::lcsstr::LCSStr;
pub use self::algorithms::length::Length;
pub use self::algorithms::levenshtein::Levenshtein;
pub use self::algorithms::lig3::LIG3;
pub use self::algorithms::mlipns::MLIPNS;
#[cfg(feature = "std")]
pub use self::algorithms::overlap::Overlap;
pub use self::algorithms::prefix::Prefix;
pub use self::algorithms::ratcliff_obershelp::RatcliffObershelp;
#[cfg(feature = "std")]
pub use self::algorithms::roberts::Roberts;
pub use self::algorithms::sift4_common::Sift4Common;
pub use self::algorithms::sift4_simple::Sift4Simple;
pub use self::algorithms::smith_waterman::SmithWaterman;
#[cfg(feature = "std")]
pub use self::algorithms::sorensen_dice::SorensenDice;
pub use self::algorithms::suffix::Suffix;
#[cfg(feature = "std")]
pub use self::algorithms::tversky::Tversky;
pub use self::algorithms::yujian_bo::YujianBo;
pub use self::result::Result;

#[cfg(test)]
mod tests {
    #![allow(clippy::float_cmp)]

    use super::*;
    use assert2::assert;
    use proptest::prelude::*;
    use rstest::rstest;

    const ALGS: usize = 8;

    fn get_result(alg: usize, s1: &str, s2: &str) -> Result<usize> {
        match alg {
            1 => Hamming::default().for_str(s1, s2),
            2 => LCSSeq::default().for_str(s1, s2),
            3 => LCSStr::default().for_str(s1, s2),
            4 => RatcliffObershelp::default().for_str(s1, s2),
            5 => Levenshtein::default().for_str(s1, s2),
            #[cfg(feature = "std")]
            6 => DamerauLevenshtein::default().for_str(s1, s2),
            7 => Sift4Simple::default().for_str(s1, s2),
            8 => MLIPNS::default().for_str(s1, s2),
            9 => Prefix::default().for_str(s1, s2),
            10 => Suffix::default().for_str(s1, s2),
            11 => Length::default().for_str(s1, s2),
            12 => Bag::default().for_str(s1, s2),
            13 => SmithWaterman::default().for_str(s1, s2),
            14 => Sift4Common::default().for_str(s1, s2),
            _ => panic!("there are not so many algorithms!"),
        }
    }

    fn get_result_f64(alg: usize, s1: &str, s2: &str) -> Result<f64> {
        match alg {
            1 => Jaro::default().for_str(s1, s2),
            2 => JaroWinkler::default().for_str(s1, s2),
            3 => YujianBo::default().for_str(s1, s2),
            4 => Jaccard::default().for_str(s1, s2),
            5 => SorensenDice::default().for_str(s1, s2),
            6 => Tversky::default().for_str(s1, s2),
            7 => Overlap::default().for_str(s1, s2),
            8 => Cosine::default().for_str(s1, s2),
            9 => EntropyNCD::default().for_str(s1, s2),
            10 => LIG3::default().for_str(s1, s2),
            11 => Roberts::default().for_str(s1, s2),
            _ => panic!("there are not so many algorithms!"),
        }
    }

    #[rstest]
    #[case::hamming(1)]
    #[case::lcsseq(2)]
    #[case::lcsstr(3)]
    #[case::ratcliff_obershelp(4)]
    #[case::levenshtein(5)]
    #[case::damerau_levenshtein(6)]
    #[case::sift4_simple(7)]
    #[case::mlipns(8)]
    #[case::prefix(9)]
    #[case::suffix(10)]
    #[case::length(11)]
    #[case::bag(12)]
    #[case::smith_waterman(13)]
    #[case::sift4_common(14)]
    fn basic_usize(#[case] alg: usize) {
        let empty_res = get_result(alg, "", "");
        assert!(empty_res.dist() == 0);
        if alg != 8 {
            assert!(get_result(alg, "ab", "cde").dist() > 0);
            assert!(get_result(alg, "ab", "cde").ndist() > 0.);
        }
        if alg != 11 {
            assert!(get_result(alg, "spam", "qwer").sim() == 0);
            assert!(get_result(alg, "spam", "qwer").nsim() == 0.);
        }
        assert!(empty_res.ndist() == 0.);
        assert!(empty_res.nsim() == 1.);
    }

    #[rstest]
    #[case::jaro(1)]
    #[case::jaro_winkler(2)]
    #[case::yujian_bo(3)]
    #[case::jaccard(4)]
    #[case::sorensen_dice(5)]
    #[case::tversky(6)]
    #[case::overlap(7)]
    #[case::cosine(8)]
    #[case::entropy_ncd(9)]
    #[case::lig3(10)]
    #[case::roberts(11)]
    fn basic_f64(#[case] alg: usize) {
        let empty_res = get_result_f64(alg, "", "");
        assert!(get_result_f64(alg, "ab", "cde").ndist() > 0.);
        if alg != 3 && alg != 9 {
            assert!(get_result_f64(alg, "spam", "qwer").nsim() == 0.);
        }
        assert!(empty_res.ndist() == 0.);
        assert!(empty_res.nsim() == 1.);
        assert!(empty_res.max == 1.);
    }

    fn is_close(a: f64, b: f64) -> bool {
        (a - b).abs() < 1E-9
    }

    proptest! {
        #[test]
        fn prop(s1 in ".*", s2 in ".*") {
            for alg in 1..ALGS {
                let res = get_result(alg, &s1, &s2);
                let d = res.dist();
                let s = res.sim();

                let nd = res.ndist();
                prop_assert!(nd.is_finite());
                prop_assert!(nd >= 0.);
                prop_assert!(nd <= 1.);

                let ns = res.nsim();
                prop_assert!(ns.is_finite());
                prop_assert!(ns >= 0.);
                prop_assert!(ns <= 1.);

                prop_assert!(is_close(ns + nd, 1.), "{} + {} == 1", nd, ns);

                if d < s {
                    prop_assert!(nd < ns, "{} < {}", nd, ns);
                } else if d > s {
                    prop_assert!(nd > ns, "{} > {}", nd, ns);
                } else if !s1.is_empty() && !s2.is_empty() {
                    prop_assert!(nd == ns, "{} == {}", nd, ns);
                }
                prop_assert!(res.val() == d || res.val() == s);

                prop_assert_eq!(res.len1, s1.chars().count());
                prop_assert_eq!(res.len2, s2.chars().count());
                prop_assert!(res.max >= res.len1.min(res.len2));
            }
        }

        #[test]
        fn prop_same(s in ".*") {
            for alg in 1..ALGS {
                let res = get_result(alg, &s, &s);
                let nd = res.ndist();
                prop_assert_eq!(nd, 0., "{}: {} == 0.0", alg, nd);
                let ns = res.nsim();
                prop_assert_eq!(ns, 1., "{}: {} == 1.0", alg, ns);
            }
        }

        // strings should have lower distance if you add the same prefix to them
        fn prop_prefix(prefix in ".+", s1 in ".+", s2 in ".+") {
            for alg in 1..ALGS {
                let r1 = get_result(alg, &s1, &s2).ndist();
                let mut p1 = prefix.clone();
                let mut p2 = prefix.clone();
                p1.push_str(&s1);
                p2.push_str(&s2);
                let r2 = get_result(alg, &p1, &p2).ndist();
                prop_assert!(r1 > r2, "{}: {} > {}", alg, r1, r2);
            }
        }
    }
}