str_distance/
lib.rs

1//! Compute distances between strings types (and others)
2//!
3//! This crate provides implementations for a variety of distance or equality
4//! metrics. When using metrics that are a measure of **similarity**, the
5//! following should be noted: All implementations always return the value of
6//! the distance between two elements (e.g. str), i.e. their degree of
7//! **dissimilarity**. Which the implemented metrics that are designed to measure similarity (e.g. [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index)) will return the distance, which is complementary to the similarity score.
8//!
9//! # Usage
10//!
11//! ## The `str_distance::str_distance*` convenience functions.
12//!
13//! `str_distance` and `str_distance_normalized` take the two string inputs for
14//! which the distance is determined using the passed 'DistanceMetric`.
15//! `str_distance_normalized` evaluates the normalized distance between two
16//! strings. A value of '0.0' corresponds to the "zero distance", both strings
17//! are considered equal by means of the metric, whereas a value of '1.0'
18//! corresponds to the maximum distance that can exist between the strings.
19//!
20//! Calling the `str_distance::str_distance*` is just convenience for
21//! `DistanceMetric.str_distance*("", "")`
22//!
23//! ### Example
24//!
25//! Levenshtein metrics offer the possibility to define a maximum distance at
26//! which the further calculation of the exact distance is aborted early.
27//!
28//! **Distance**
29//!
30//! ```rust
31//! use str_distance::*;
32//!
33//! // calculate the exact distance
34//! assert_eq!(str_distance("kitten", "sitting", Levenshtein::default()), DistanceValue::Exact(3));
35//!
36//! // short circuit if distance exceeds 10
37//! let s1 = "Wisdom is easily acquired when hiding under the bed with a saucepan on your head.";
38//! let s2 = "The quick brown fox jumped over the angry dog.";
39//! assert_eq!(str_distance(s1, s2, Levenshtein::with_max_distance(10)), DistanceValue::Exceeded(10));
40//! ```
41//!
42//! **Normalized Distance**
43//!
44//! ```rust
45//! use str_distance::*;
46//! assert_eq!(str_distance_normalized("" , "", Levenshtein::default()), 0.0);
47//! assert_eq!(str_distance_normalized("nacht", "nacht", Levenshtein::default()), 0.0);
48//! assert_eq!(str_distance_normalized("abc", "def", Levenshtein::default()), 1.0);
49//! ```
50//!
51//! ## The `DistanceMetric` trait
52//!
53//! ```rust
54//! use str_distance::{DistanceMetric, SorensenDice};
55//! // QGram metrics require the length of the underlying fragment length to use for comparison.
56//! // For `SorensenDice` default is 2.
57//! assert_eq!(SorensenDice::new(2).str_distance("nacht", "night"), 0.75);
58//! ```
59//!
60//! `DistanceMetric` was designed for `str` types, but is not limited to.
61//! Calculating distance is possible for all data types which are comparable and
62//! are passed as 'IntoIterator', e.g. as `Vec` or slice
63//!
64//! ```rust
65//! use str_distance::{DistanceMetric, Levenshtein, DistanceValue};
66//!
67//! assert_eq!(*Levenshtein::default().distance(&[1,2,3], &[1,2,3,4,5,6]),3);
68//! ```
69
70#![forbid(unsafe_code)]
71
72use std::ops::Deref;
73
74pub use jaro::{Jaro, JaroWinkler};
75pub use levenshtein::{DamerauLevenshtein, Levenshtein};
76pub use modifiers::{Winkler, WinklerConfig};
77pub use qgram::{Cosine, Jaccard, Overlap, QGram, SorensenDice};
78pub use ratcliff::RatcliffObershelp;
79pub use token::{TokenSet, TokenSort};
80
81pub mod jaro;
82pub mod levenshtein;
83pub mod modifiers;
84pub mod qgram;
85pub mod ratcliff;
86pub mod token;
87mod utils;
88
89/// Evaluates the distance between two strings based on the provided
90/// [`crate::DistanceMetric`].
91///
92/// # Examples
93///
94/// ```
95/// # use str_distance::{Levenshtein, str_distance, SorensenDice, TokenSet, RatcliffObershelp, DistanceValue};
96/// assert_eq!(str_distance("kitten", "sitting", Levenshtein::default()), DistanceValue::Exact(3));
97/// assert_eq!(str_distance("kitten", "sitting", Levenshtein::with_max_distance(1)), DistanceValue::Exceeded(1));
98/// assert_eq!(str_distance("nacht", "night", SorensenDice::default()), 0.75);
99/// assert_eq!(str_distance("Real Madrid vs FC Barcelona", "Barcelona vs Real Madrid",
100/// TokenSet::new(RatcliffObershelp)), 0.0);
101/// ```
102pub fn str_distance<S, T, D>(a: S, b: T, dist: D) -> <D as DistanceMetric>::Dist
103where
104    S: AsRef<str>,
105    T: AsRef<str>,
106    D: DistanceMetric,
107{
108    dist.str_distance(a, b)
109}
110
111/// Evaluates the normalized distance between two strings based on the provided
112/// [`crate::DistanceMetric`], so that it returns always a f64 between 0 and 1.
113/// A value of '0.0' corresponds to the "zero distance", both strings are
114/// considered equal by means of the metric, whereas a value of '1.0'
115/// corresponds to the maximum distance that can exist between the strings.
116///
117/// # Remark
118///
119/// The distance between two empty strings (a: "", b: "") is determined as 0.0,
120/// `(a == b)`
121///
122/// # Examples
123///
124/// /// ```
125/// # use str_distance::{Levenshtein, SorensenDice, TokenSet, RatcliffObershelp,
126/// DistanceValue, str_distance_normalized};
127/// assert_eq!(str_distance_normalized("" , "", Levenshtein::default()), 0.0);
128/// assert_eq!(str_distance_normalized("nacht", "nacht",
129/// Levenshtein::default()), 0.0); assert_eq!(strdistance_normalized("abc",
130/// "def", Levenshtein::default()), 1.0); ```
131pub fn str_distance_normalized<S, T, D>(a: S, b: T, dist: D) -> f64
132where
133    S: AsRef<str>,
134    T: AsRef<str>,
135    D: DistanceMetric,
136{
137    dist.str_normalized(a, b)
138}
139
140pub trait DistanceMetric {
141    /// Represents the data type in which this distance is evaluated.
142    type Dist: PartialOrd;
143
144    /// Generic implementation of the metric.
145    fn distance<S, T>(&self, a: S, b: T) -> Self::Dist
146    where
147        S: IntoIterator,
148        T: IntoIterator,
149        <S as IntoIterator>::IntoIter: Clone,
150        <T as IntoIterator>::IntoIter: Clone,
151        <S as IntoIterator>::Item: PartialEq + PartialEq<<T as IntoIterator>::Item>,
152        <T as IntoIterator>::Item: PartialEq;
153
154    /// Evaluates the distance between two str.
155    fn str_distance<S, T>(&self, a: S, b: T) -> Self::Dist
156    where
157        S: AsRef<str>,
158        T: AsRef<str>,
159    {
160        self.distance(a.as_ref().chars(), b.as_ref().chars())
161    }
162
163    /// Evaluates the normalized distance between two strings
164    /// A value of '0.0' corresponds to the "zero distance", both strings are
165    /// considered equal by means of the metric, whereas a value of '1.0'
166    /// corresponds to the maximum distance that can exist between the strings.
167    fn normalized<S, T>(&self, a: S, b: T) -> f64
168    where
169        S: IntoIterator,
170        T: IntoIterator,
171        <S as IntoIterator>::IntoIter: Clone,
172        <T as IntoIterator>::IntoIter: Clone,
173        <S as IntoIterator>::Item: PartialEq + PartialEq<<T as IntoIterator>::Item>,
174        <T as IntoIterator>::Item: PartialEq;
175
176    /// Convenience normalization for str types.
177    fn str_normalized<S, T>(&self, a: S, b: T) -> f64
178    where
179        S: AsRef<str>,
180        T: AsRef<str>,
181    {
182        self.normalized(a.as_ref().chars(), b.as_ref().chars())
183    }
184}
185
186/// Convenience trait to use a distance on a type directly.
187pub trait DistanceElement {
188    fn distance<S, D>(&self, other: S, dist: &D) -> <D as DistanceMetric>::Dist
189    where
190        S: AsRef<str>,
191        D: DistanceMetric;
192}
193
194impl<T: AsRef<str>> DistanceElement for T {
195    fn distance<S, D>(&self, other: S, dist: &D) -> <D as DistanceMetric>::Dist
196    where
197        S: AsRef<str>,
198        D: DistanceMetric,
199    {
200        dist.str_distance(self, other)
201    }
202}
203
204#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)]
205pub enum DistanceValue {
206    Exact(usize),
207    Exceeded(usize),
208}
209
210impl Into<usize> for DistanceValue {
211    fn into(self) -> usize {
212        *self
213    }
214}
215
216impl Deref for DistanceValue {
217    type Target = usize;
218
219    fn deref(&self) -> &Self::Target {
220        match self {
221            DistanceValue::Exact(val) | DistanceValue::Exceeded(val) => val,
222        }
223    }
224}