1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
//! Compute distances between strings types (and others)
//!
//! This crate provides implementations for a variety of distance or equality
//! metrics. When using metrics that are a measure of **similarity**, the
//! following should be noted: All implementations always return the value of
//! the distance between two elements (e.g. str), i.e. their degree of
//! **dissimilarity**. Hence, the implemented metrics that are designed to
//! measure similarity (e.g. [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index))
//! return the distance, which is complementary to the similarity score.
//!
//! # Usage
//!
//! ## The `str_distance::str_distance*` convenience functions.
//!
//! `str_distance` and `str_distance_normalized` take the two string inputs for
//! which the distance is determined using the passed `DistanceMetric`.
//! `str_distance_normalized` evaluates the normalized distance between two
//! strings. A value of '0.0' corresponds to the "zero distance", both strings
//! are considered equal by means of the metric, whereas a value of '1.0'
//! corresponds to the maximum distance that can exist between the strings.
//!
//! Calling the `str_distance::str_distance*` functions is just a convenience
//! for `DistanceMetric.str_distance*("", "")`.
//!
//! ### Example
//!
//! Levenshtein metrics offer the possibility to define a maximum distance at
//! which the further calculation of the exact distance is aborted early.
//!
//! **Distance**
//!
//! ```rust
//! use str_distance::*;
//!
//! // calculate the exact distance
//! assert_eq!(str_distance("kitten", "sitting", Levenshtein::default()), DistanceValue::Exact(3));
//!
//! // short circuit if distance exceeds 10
//! let s1 = "Wisdom is easily acquired when hiding under the bed with a saucepan on your head.";
//! let s2 = "The quick brown fox jumped over the angry dog.";
//! assert_eq!(str_distance(s1, s2, Levenshtein::with_max_distance(10)), DistanceValue::Exceeded(10));
//! ```
//!
//! **Normalized Distance**
//!
//! ```rust
//! use str_distance::*;
//! assert_eq!(str_distance_normalized("" , "", Levenshtein::default()), 0.0);
//! assert_eq!(str_distance_normalized("nacht", "nacht", Levenshtein::default()), 0.0);
//! assert_eq!(str_distance_normalized("abc", "def", Levenshtein::default()), 1.0);
//! ```
//!
//! ## The `DistanceMetric` trait
//!
//! ```rust
//! use str_distance::{DistanceMetric, SorensenDice};
//! // QGram metrics require the length of the underlying fragments to use for comparison.
//! // For `SorensenDice` default is 2.
//! assert_eq!(SorensenDice::new(2).str_distance("nacht", "night"), 0.75);
//! ```
//!
//! `DistanceMetric` was designed for `str` types, but is not limited to them.
//! Calculating the distance is possible for all data types which are comparable
//! and are passed as `IntoIterator`, e.g. as a `Vec` or slice.
//!
//! ```rust
//! use str_distance::{DistanceMetric, Levenshtein, DistanceValue};
//!
//! assert_eq!(*Levenshtein::default().distance(&[1,2,3], &[1,2,3,4,5,6]),3);
//! ```

#![forbid(unsafe_code)]

use std::ops::Deref;

pub use jaro::{Jaro, JaroWinkler};
pub use levenshtein::{DamerauLevenshtein, Levenshtein};
pub use modifiers::{Winkler, WinklerConfig};
pub use qgram::{Cosine, Jaccard, Overlap, QGram, SorensenDice};
pub use ratcliff::RatcliffObershelp;
pub use token::{TokenSet, TokenSort};

pub mod jaro;
pub mod levenshtein;
pub mod modifiers;
pub mod qgram;
pub mod ratcliff;
pub mod token;
mod utils;

/// Evaluates the distance between two strings based on the provided
/// [`crate::DistanceMetric`].
///
/// # Examples
///
/// ```
/// # use str_distance::{Levenshtein, str_distance, SorensenDice, TokenSet, RatcliffObershelp, DistanceValue};
/// assert_eq!(str_distance("kitten", "sitting", Levenshtein::default()), DistanceValue::Exact(3));
/// assert_eq!(str_distance("kitten", "sitting", Levenshtein::with_max_distance(1)), DistanceValue::Exceeded(1));
/// assert_eq!(str_distance("nacht", "night", SorensenDice::default()), 0.75);
/// assert_eq!(str_distance("Real Madrid vs FC Barcelona", "Barcelona vs Real Madrid",
/// TokenSet::new(RatcliffObershelp)), 0.0);
/// ```
pub fn str_distance<S, T, D>(a: S, b: T, dist: D) -> <D as DistanceMetric>::Dist
where
    S: AsRef<str>,
    T: AsRef<str>,
    D: DistanceMetric,
{
    dist.str_distance(a, b)
}

/// Evaluates the normalized distance between two strings based on the provided
/// [`crate::DistanceMetric`], so that it always returns an `f64` between 0 and 1.
/// A value of '0.0' corresponds to the "zero distance", both strings are
/// considered equal by means of the metric, whereas a value of '1.0'
/// corresponds to the maximum distance that can exist between the strings.
///
/// This is a convenience wrapper that forwards `a` and `b` unchanged to the
/// metric's `str_normalized` method.
///
/// # Remark
///
/// The distance between two empty strings (a: "", b: "") is determined as 0.0,
/// `(a == b)`
///
/// # Examples
///
/// ```
/// # use str_distance::{Levenshtein, str_distance_normalized};
/// assert_eq!(str_distance_normalized("" , "", Levenshtein::default()), 0.0);
/// assert_eq!(str_distance_normalized("nacht", "nacht", Levenshtein::default()), 0.0);
/// assert_eq!(str_distance_normalized("abc", "def", Levenshtein::default()), 1.0);
/// ```
pub fn str_distance_normalized<S, T, D>(a: S, b: T, dist: D) -> f64
where
    S: AsRef<str>,
    T: AsRef<str>,
    D: DistanceMetric,
{
    dist.str_normalized(a, b)
}

/// A metric that measures the distance between two sequences of elements.
///
/// Implementors supply the generic [`distance`](DistanceMetric::distance) and
/// [`normalized`](DistanceMetric::normalized) methods; the `str_*` methods are
/// provided adapters that feed the `char`s of two strings into them.
pub trait DistanceMetric {
    /// The type in which this metric expresses a distance.
    type Dist: PartialOrd;

    /// Computes the distance between the element sequences `a` and `b`.
    ///
    /// Both inputs must be iterable with a cloneable iterator, and the items
    /// of `a` must be comparable with each other and with the items of `b`.
    fn distance<S, T>(&self, a: S, b: T) -> Self::Dist
    where
        S: IntoIterator,
        T: IntoIterator,
        S::IntoIter: Clone,
        T::IntoIter: Clone,
        S::Item: PartialEq + PartialEq<T::Item>,
        T::Item: PartialEq;

    /// Computes the distance between two strings by comparing their `char`s.
    fn str_distance<S, T>(&self, a: S, b: T) -> Self::Dist
    where
        S: AsRef<str>,
        T: AsRef<str>,
    {
        let left = a.as_ref().chars();
        let right = b.as_ref().chars();
        self.distance(left, right)
    }

    /// Computes the normalized distance between the element sequences `a`
    /// and `b`.
    ///
    /// A value of '0.0' corresponds to the "zero distance", i.e. both inputs
    /// are considered equal by means of the metric, whereas a value of '1.0'
    /// corresponds to the maximum distance that can exist between them.
    fn normalized<S, T>(&self, a: S, b: T) -> f64
    where
        S: IntoIterator,
        T: IntoIterator,
        S::IntoIter: Clone,
        T::IntoIter: Clone,
        S::Item: PartialEq + PartialEq<T::Item>,
        T::Item: PartialEq;

    /// Computes the normalized distance between two strings by comparing
    /// their `char`s.
    fn str_normalized<S, T>(&self, a: S, b: T) -> f64
    where
        S: AsRef<str>,
        T: AsRef<str>,
    {
        let left = a.as_ref().chars();
        let right = b.as_ref().chars();
        self.normalized(left, right)
    }
}

/// Convenience trait to use a distance on a type directly.
pub trait DistanceElement {
    fn distance<S, D>(&self, other: S, dist: &D) -> <D as DistanceMetric>::Dist
    where
        S: AsRef<str>,
        D: DistanceMetric;
}

/// Blanket implementation for every type that can be viewed as a `str`.
impl<T> DistanceElement for T
where
    T: AsRef<str>,
{
    /// Forwards to the metric's `str_distance`, with `self` as the first
    /// operand and `other` as the second.
    fn distance<S, D>(&self, other: S, dist: &D) -> D::Dist
    where
        S: AsRef<str>,
        D: DistanceMetric,
    {
        D::str_distance(dist, self, other)
    }
}

/// A distance expressed as a `usize` that records whether the evaluation ran
/// to completion or was cut short.
///
/// The derived ordering compares the variant first (`Exact` sorts before
/// `Exceeded`) and the contained value second.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum DistanceValue {
    /// The exact distance between the two inputs.
    Exact(usize),
    /// The calculation was aborted early because the distance exceeded the
    /// contained maximum distance.
    Exceeded(usize),
}

impl Into<usize> for DistanceValue {
    fn into(self) -> usize {
        *self
    }
}

/// Gives read access to the wrapped `usize`, whichever variant holds it.
impl Deref for DistanceValue {
    type Target = usize;

    /// Borrows the inner value of either `Exact` or `Exceeded`.
    fn deref(&self) -> &usize {
        match *self {
            DistanceValue::Exact(ref inner) => inner,
            DistanceValue::Exceeded(ref inner) => inner,
        }
    }
}