textdistance/
lib.rs

1#![cfg_attr(not(any(feature = "std", test)), no_std)]
2#![doc = include_str!("../README.md")]
3#![deny(missing_docs)]
4#![deny(clippy::all, clippy::pedantic)]
5#![allow(
6    clippy::cast_precision_loss,
7    clippy::must_use_candidate,
8    clippy::similar_names,
9    clippy::unreadable_literal,
10    clippy::doc_markdown,
11    clippy::wildcard_imports
12)]
13
14extern crate alloc;
15
16mod algorithm;
17mod counter;
18mod result;
19
20pub mod nstr;
21pub mod str;
22
23mod algorithms {
24    pub mod bag;
25    pub mod cosine;
26    pub mod damerau_levenshtein;
27    pub mod entropy_ncd;
28    pub mod hamming;
29    pub mod jaccard;
30    pub mod jaro;
31    pub mod jaro_winkler;
32    pub mod lcsseq;
33    pub mod lcsstr;
34    pub mod length;
35    pub mod levenshtein;
36    pub mod lig3;
37    pub mod mlipns;
38    pub mod overlap;
39    pub mod prefix;
40    pub mod ratcliff_obershelp;
41    pub mod roberts;
42    pub mod sift4_common;
43    pub mod sift4_simple;
44    pub mod smith_waterman;
45    pub mod sorensen_dice;
46    pub mod suffix;
47    pub mod tversky;
48    pub mod yujian_bo;
49}
50
51pub use self::algorithm::Algorithm;
52#[cfg(feature = "std")]
53pub use self::algorithms::bag::Bag;
54#[cfg(feature = "std")]
55pub use self::algorithms::cosine::Cosine;
56#[cfg(feature = "std")]
57pub use self::algorithms::damerau_levenshtein::DamerauLevenshtein;
58#[cfg(feature = "std")]
59pub use self::algorithms::entropy_ncd::EntropyNCD;
60pub use self::algorithms::hamming::Hamming;
61#[cfg(feature = "std")]
62pub use self::algorithms::jaccard::Jaccard;
63pub use self::algorithms::jaro::Jaro;
64pub use self::algorithms::jaro_winkler::JaroWinkler;
65pub use self::algorithms::lcsseq::LCSSeq;
66pub use self::algorithms::lcsstr::LCSStr;
67pub use self::algorithms::length::Length;
68pub use self::algorithms::levenshtein::Levenshtein;
69pub use self::algorithms::lig3::LIG3;
70pub use self::algorithms::mlipns::MLIPNS;
71#[cfg(feature = "std")]
72pub use self::algorithms::overlap::Overlap;
73pub use self::algorithms::prefix::Prefix;
74pub use self::algorithms::ratcliff_obershelp::RatcliffObershelp;
75#[cfg(feature = "std")]
76pub use self::algorithms::roberts::Roberts;
77pub use self::algorithms::sift4_common::Sift4Common;
78pub use self::algorithms::sift4_simple::Sift4Simple;
79pub use self::algorithms::smith_waterman::SmithWaterman;
80#[cfg(feature = "std")]
81pub use self::algorithms::sorensen_dice::SorensenDice;
82pub use self::algorithms::suffix::Suffix;
83#[cfg(feature = "std")]
84pub use self::algorithms::tversky::Tversky;
85pub use self::algorithms::yujian_bo::YujianBo;
86pub use self::result::Result;
87
88#[cfg(test)]
89mod tests {
90    #![allow(clippy::float_cmp)]
91
92    use super::*;
93    use assert2::assert;
94    use proptest::prelude::*;
95    use rstest::rstest;
96
97    const ALGS: usize = 8;
98
99    fn get_result(alg: usize, s1: &str, s2: &str) -> Result<usize> {
100        match alg {
101            1 => Hamming::default().for_str(s1, s2),
102            2 => LCSSeq::default().for_str(s1, s2),
103            3 => LCSStr::default().for_str(s1, s2),
104            4 => RatcliffObershelp::default().for_str(s1, s2),
105            5 => Levenshtein::default().for_str(s1, s2),
106            #[cfg(feature = "std")]
107            6 => DamerauLevenshtein::default().for_str(s1, s2),
108            7 => Sift4Simple::default().for_str(s1, s2),
109            8 => MLIPNS::default().for_str(s1, s2),
110            9 => Prefix::default().for_str(s1, s2),
111            10 => Suffix::default().for_str(s1, s2),
112            11 => Length::default().for_str(s1, s2),
113            12 => Bag::default().for_str(s1, s2),
114            13 => SmithWaterman::default().for_str(s1, s2),
115            14 => Sift4Common::default().for_str(s1, s2),
116            _ => panic!("there are not so many algorithms!"),
117        }
118    }
119
120    fn get_result_f64(alg: usize, s1: &str, s2: &str) -> Result<f64> {
121        match alg {
122            1 => Jaro::default().for_str(s1, s2),
123            2 => JaroWinkler::default().for_str(s1, s2),
124            3 => YujianBo::default().for_str(s1, s2),
125            4 => Jaccard::default().for_str(s1, s2),
126            5 => SorensenDice::default().for_str(s1, s2),
127            6 => Tversky::default().for_str(s1, s2),
128            7 => Overlap::default().for_str(s1, s2),
129            8 => Cosine::default().for_str(s1, s2),
130            9 => EntropyNCD::default().for_str(s1, s2),
131            10 => LIG3::default().for_str(s1, s2),
132            11 => Roberts::default().for_str(s1, s2),
133            _ => panic!("there are not so many algorithms!"),
134        }
135    }
136
137    #[rstest]
138    #[case::hamming(1)]
139    #[case::lcsseq(2)]
140    #[case::lcsstr(3)]
141    #[case::ratcliff_obershelp(4)]
142    #[case::levenshtein(5)]
143    #[case::damerau_levenshtein(6)]
144    #[case::sift4_simple(7)]
145    #[case::mlipns(8)]
146    #[case::prefix(9)]
147    #[case::suffix(10)]
148    #[case::length(11)]
149    #[case::bag(12)]
150    #[case::smith_waterman(13)]
151    #[case::sift4_common(14)]
152    fn basic_usize(#[case] alg: usize) {
153        let empty_res = get_result(alg, "", "");
154        assert!(empty_res.dist() == 0);
155        if alg != 8 {
156            assert!(get_result(alg, "ab", "cde").dist() > 0);
157            assert!(get_result(alg, "ab", "cde").ndist() > 0.);
158        }
159        if alg != 11 {
160            assert!(get_result(alg, "spam", "qwer").sim() == 0);
161            assert!(get_result(alg, "spam", "qwer").nsim() == 0.);
162        }
163        assert!(empty_res.ndist() == 0.);
164        assert!(empty_res.nsim() == 1.);
165    }
166
167    #[rstest]
168    #[case::jaro(1)]
169    #[case::jaro_winkler(2)]
170    #[case::yujian_bo(3)]
171    #[case::jaccard(4)]
172    #[case::sorensen_dice(5)]
173    #[case::tversky(6)]
174    #[case::overlap(7)]
175    #[case::cosine(8)]
176    #[case::entropy_ncd(9)]
177    #[case::lig3(10)]
178    #[case::roberts(11)]
179    fn basic_f64(#[case] alg: usize) {
180        let empty_res = get_result_f64(alg, "", "");
181        assert!(get_result_f64(alg, "ab", "cde").ndist() > 0.);
182        if alg != 3 && alg != 9 {
183            assert!(get_result_f64(alg, "spam", "qwer").nsim() == 0.);
184        }
185        assert!(empty_res.ndist() == 0.);
186        assert!(empty_res.nsim() == 1.);
187        assert!(empty_res.max == 1.);
188    }
189
190    fn is_close(a: f64, b: f64) -> bool {
191        (a - b).abs() < 1E-9
192    }
193
194    proptest! {
195        #[test]
196        fn prop(s1 in ".*", s2 in ".*") {
197            for alg in 1..ALGS {
198                let res = get_result(alg, &s1, &s2);
199                let d = res.dist();
200                let s = res.sim();
201
202                let nd = res.ndist();
203                prop_assert!(nd.is_finite());
204                prop_assert!(nd >= 0.);
205                prop_assert!(nd <= 1.);
206
207                let ns = res.nsim();
208                prop_assert!(ns.is_finite());
209                prop_assert!(ns >= 0.);
210                prop_assert!(ns <= 1.);
211
212                prop_assert!(is_close(ns + nd, 1.), "{} + {} == 1", nd, ns);
213
214                if d < s {
215                    prop_assert!(nd < ns, "{} < {}", nd, ns);
216                } else if d > s {
217                    prop_assert!(nd > ns, "{} > {}", nd, ns);
218                } else if !s1.is_empty() && !s2.is_empty() {
219                    prop_assert!(nd == ns, "{} == {}", nd, ns);
220                }
221                prop_assert!(res.val() == d || res.val() == s);
222
223                prop_assert_eq!(res.len1, s1.chars().count());
224                prop_assert_eq!(res.len2, s2.chars().count());
225                prop_assert!(res.max >= res.len1.min(res.len2));
226            }
227        }
228
229        #[test]
230        fn prop_same(s in ".*") {
231            for alg in 1..ALGS {
232                let res = get_result(alg, &s, &s);
233                let nd = res.ndist();
234                prop_assert_eq!(nd, 0., "{}: {} == 0.0", alg, nd);
235                let ns = res.nsim();
236                prop_assert_eq!(ns, 1., "{}: {} == 1.0", alg, ns);
237            }
238        }
239
240        // strings should have lower distance if you add the same prefix to them
241        fn prop_prefix(prefix in ".+", s1 in ".+", s2 in ".+") {
242            for alg in 1..ALGS {
243                let r1 = get_result(alg, &s1, &s2).ndist();
244                let mut p1 = prefix.clone();
245                let mut p2 = prefix.clone();
246                p1.push_str(&s1);
247                p2.push_str(&s2);
248                let r2 = get_result(alg, &p1, &p2).ndist();
249                prop_assert!(r1 > r2, "{}: {} > {}", alg, r1, r2);
250            }
251        }
252    }
253}