str_distance/lib.rs
1//! Compute distances between strings types (and others)
2//!
3//! This crate provides implementations for a variety of distance or equality
4//! metrics. When using metrics that are a measure of **similarity**, the
5//! following should be noted: All implementations always return the value of
6//! the distance between two elements (e.g. str), i.e. their degree of
7//! **dissimilarity**. Which the implemented metrics that are designed to measure similarity (e.g. [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index)) will return the distance, which is complementary to the similarity score.
8//!
9//! # Usage
10//!
11//! ## The `str_distance::str_distance*` convenience functions.
12//!
13//! `str_distance` and `str_distance_normalized` take the two string inputs for
14//! which the distance is determined using the passed 'DistanceMetric`.
15//! `str_distance_normalized` evaluates the normalized distance between two
16//! strings. A value of '0.0' corresponds to the "zero distance", both strings
17//! are considered equal by means of the metric, whereas a value of '1.0'
18//! corresponds to the maximum distance that can exist between the strings.
19//!
20//! Calling the `str_distance::str_distance*` is just convenience for
21//! `DistanceMetric.str_distance*("", "")`
22//!
23//! ### Example
24//!
25//! Levenshtein metrics offer the possibility to define a maximum distance at
26//! which the further calculation of the exact distance is aborted early.
27//!
28//! **Distance**
29//!
30//! ```rust
31//! use str_distance::*;
32//!
33//! // calculate the exact distance
34//! assert_eq!(str_distance("kitten", "sitting", Levenshtein::default()), DistanceValue::Exact(3));
35//!
36//! // short circuit if distance exceeds 10
37//! let s1 = "Wisdom is easily acquired when hiding under the bed with a saucepan on your head.";
38//! let s2 = "The quick brown fox jumped over the angry dog.";
39//! assert_eq!(str_distance(s1, s2, Levenshtein::with_max_distance(10)), DistanceValue::Exceeded(10));
40//! ```
41//!
42//! **Normalized Distance**
43//!
44//! ```rust
45//! use str_distance::*;
46//! assert_eq!(str_distance_normalized("" , "", Levenshtein::default()), 0.0);
47//! assert_eq!(str_distance_normalized("nacht", "nacht", Levenshtein::default()), 0.0);
48//! assert_eq!(str_distance_normalized("abc", "def", Levenshtein::default()), 1.0);
49//! ```
50//!
51//! ## The `DistanceMetric` trait
52//!
53//! ```rust
54//! use str_distance::{DistanceMetric, SorensenDice};
55//! // QGram metrics require the length of the underlying fragment length to use for comparison.
56//! // For `SorensenDice` default is 2.
57//! assert_eq!(SorensenDice::new(2).str_distance("nacht", "night"), 0.75);
58//! ```
59//!
60//! `DistanceMetric` was designed for `str` types, but is not limited to.
61//! Calculating distance is possible for all data types which are comparable and
62//! are passed as 'IntoIterator', e.g. as `Vec` or slice
63//!
64//! ```rust
65//! use str_distance::{DistanceMetric, Levenshtein, DistanceValue};
66//!
67//! assert_eq!(*Levenshtein::default().distance(&[1,2,3], &[1,2,3,4,5,6]),3);
68//! ```
69
70#![forbid(unsafe_code)]
71
72use std::ops::Deref;
73
74pub use jaro::{Jaro, JaroWinkler};
75pub use levenshtein::{DamerauLevenshtein, Levenshtein};
76pub use modifiers::{Winkler, WinklerConfig};
77pub use qgram::{Cosine, Jaccard, Overlap, QGram, SorensenDice};
78pub use ratcliff::RatcliffObershelp;
79pub use token::{TokenSet, TokenSort};
80
81pub mod jaro;
82pub mod levenshtein;
83pub mod modifiers;
84pub mod qgram;
85pub mod ratcliff;
86pub mod token;
87mod utils;
88
89/// Evaluates the distance between two strings based on the provided
90/// [`crate::DistanceMetric`].
91///
92/// # Examples
93///
94/// ```
95/// # use str_distance::{Levenshtein, str_distance, SorensenDice, TokenSet, RatcliffObershelp, DistanceValue};
96/// assert_eq!(str_distance("kitten", "sitting", Levenshtein::default()), DistanceValue::Exact(3));
97/// assert_eq!(str_distance("kitten", "sitting", Levenshtein::with_max_distance(1)), DistanceValue::Exceeded(1));
98/// assert_eq!(str_distance("nacht", "night", SorensenDice::default()), 0.75);
99/// assert_eq!(str_distance("Real Madrid vs FC Barcelona", "Barcelona vs Real Madrid",
100/// TokenSet::new(RatcliffObershelp)), 0.0);
101/// ```
102pub fn str_distance<S, T, D>(a: S, b: T, dist: D) -> <D as DistanceMetric>::Dist
103where
104 S: AsRef<str>,
105 T: AsRef<str>,
106 D: DistanceMetric,
107{
108 dist.str_distance(a, b)
109}
110
111/// Evaluates the normalized distance between two strings based on the provided
112/// [`crate::DistanceMetric`], so that it returns always a f64 between 0 and 1.
113/// A value of '0.0' corresponds to the "zero distance", both strings are
114/// considered equal by means of the metric, whereas a value of '1.0'
115/// corresponds to the maximum distance that can exist between the strings.
116///
117/// # Remark
118///
119/// The distance between two empty strings (a: "", b: "") is determined as 0.0,
120/// `(a == b)`
121///
122/// # Examples
123///
124/// /// ```
125/// # use str_distance::{Levenshtein, SorensenDice, TokenSet, RatcliffObershelp,
126/// DistanceValue, str_distance_normalized};
127/// assert_eq!(str_distance_normalized("" , "", Levenshtein::default()), 0.0);
128/// assert_eq!(str_distance_normalized("nacht", "nacht",
129/// Levenshtein::default()), 0.0); assert_eq!(strdistance_normalized("abc",
130/// "def", Levenshtein::default()), 1.0); ```
131pub fn str_distance_normalized<S, T, D>(a: S, b: T, dist: D) -> f64
132where
133 S: AsRef<str>,
134 T: AsRef<str>,
135 D: DistanceMetric,
136{
137 dist.str_normalized(a, b)
138}
139
140pub trait DistanceMetric {
141 /// Represents the data type in which this distance is evaluated.
142 type Dist: PartialOrd;
143
144 /// Generic implementation of the metric.
145 fn distance<S, T>(&self, a: S, b: T) -> Self::Dist
146 where
147 S: IntoIterator,
148 T: IntoIterator,
149 <S as IntoIterator>::IntoIter: Clone,
150 <T as IntoIterator>::IntoIter: Clone,
151 <S as IntoIterator>::Item: PartialEq + PartialEq<<T as IntoIterator>::Item>,
152 <T as IntoIterator>::Item: PartialEq;
153
154 /// Evaluates the distance between two str.
155 fn str_distance<S, T>(&self, a: S, b: T) -> Self::Dist
156 where
157 S: AsRef<str>,
158 T: AsRef<str>,
159 {
160 self.distance(a.as_ref().chars(), b.as_ref().chars())
161 }
162
163 /// Evaluates the normalized distance between two strings
164 /// A value of '0.0' corresponds to the "zero distance", both strings are
165 /// considered equal by means of the metric, whereas a value of '1.0'
166 /// corresponds to the maximum distance that can exist between the strings.
167 fn normalized<S, T>(&self, a: S, b: T) -> f64
168 where
169 S: IntoIterator,
170 T: IntoIterator,
171 <S as IntoIterator>::IntoIter: Clone,
172 <T as IntoIterator>::IntoIter: Clone,
173 <S as IntoIterator>::Item: PartialEq + PartialEq<<T as IntoIterator>::Item>,
174 <T as IntoIterator>::Item: PartialEq;
175
176 /// Convenience normalization for str types.
177 fn str_normalized<S, T>(&self, a: S, b: T) -> f64
178 where
179 S: AsRef<str>,
180 T: AsRef<str>,
181 {
182 self.normalized(a.as_ref().chars(), b.as_ref().chars())
183 }
184}
185
186/// Convenience trait to use a distance on a type directly.
187pub trait DistanceElement {
188 fn distance<S, D>(&self, other: S, dist: &D) -> <D as DistanceMetric>::Dist
189 where
190 S: AsRef<str>,
191 D: DistanceMetric;
192}
193
194impl<T: AsRef<str>> DistanceElement for T {
195 fn distance<S, D>(&self, other: S, dist: &D) -> <D as DistanceMetric>::Dist
196 where
197 S: AsRef<str>,
198 D: DistanceMetric,
199 {
200 dist.str_distance(self, other)
201 }
202}
203
204#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)]
205pub enum DistanceValue {
206 Exact(usize),
207 Exceeded(usize),
208}
209
210impl Into<usize> for DistanceValue {
211 fn into(self) -> usize {
212 *self
213 }
214}
215
216impl Deref for DistanceValue {
217 type Target = usize;
218
219 fn deref(&self) -> &Self::Target {
220 match self {
221 DistanceValue::Exact(val) | DistanceValue::Exceeded(val) => val,
222 }
223 }
224}