rust_fuzzy_search/
lib.rs

1#![warn(missing_docs)]
2//#![doc(html_playground_url = "https://playground.example.com/")] // TODO add playground if possible
3
4//! This crate implements Fuzzy Searching with trigrams
5//!
6//!
7//! Fuzzy searching allows to compare strings by similarity rather than by equality:\
8//! Similar strings will get a high score (close to `1.0f32`) while dissimilar strings will get a lower score (closer to `0.0f32`).
9//!
10//! Fuzzy searching tolerates changes in word order:\
11//! ex. `"John Dep"` and `"Dep John"` will get a high score.
12//!
13//!
14//! The crate exposes 5 main functions:
15//!  - [fuzzy_compare] will take 2 strings and return a score representing how similar those strings are.
16//!  - [fuzzy_search] applies [fuzzy_compare] to a list of strings and returns a list of tuples: (word, score).
17//!  - [fuzzy_search_sorted] is similar to [fuzzy_search] but orders the output in descending order.
18//!  - [fuzzy_search_threshold] will take an additional `f32` as input and returns only tuples with score greater than the threshold.
19//!  - [fuzzy_search_best_n] will take an additional `usize` arguments and returns the first `n` tuples.
20//!
21//! The Algorithm used is taken from : <https://dev.to/kaleman15/fuzzy-searching-with-postgresql-97o>
22//!
23//! Basic idea:
24//!
25//! 1. From both strings extracts all groups of 3 adjacent letters.\
26//! (`"House"` becomes `['  H', ' Ho', 'Hou', 'ous', 'use', 'se ']`).\
27//! Note the 2 spaces added to the head of the string and the one on the tail, used to make the algorithm work on zero length words.
28//!
29//! 1. Then counts the number of trigrams of the first words that are also present on the second word and divide by the number of trigrams of the first word.\
30//!
31//!
32//! Example: Comparing 2 strings
33//! ```rust
34//! fn test () {
35//!    use rust_fuzzy_search::fuzzy_compare;
36//!    let score : f32 = fuzzy_compare("kolbasobulko", "kolbasobulko");
37//!    println!("score = {:?}", score);
38//! }
39//! ```
40//!
41//! Example: Comparing a string with a list of strings and retrieving only the best matches
42//! ```rust
43//! fn test() {
44//!     use rust_fuzzy_search::fuzzy_search_best_n;
45//!     let s = "bulko";
46//!     let list : Vec<&str> = vec![
47//!         "kolbasobulko",
48//!         "sandviĉo",
49//!         "ŝatas",
50//!         "domo",
51//!         "emuo",
52//!         "fabo",
53//!         "fazano"
54//!     ];
55//!     let n : usize = 3;
56//!     let res : Vec<(&str, f32)> = fuzzy_search_best_n(s,&list, n);
57//!     for (_word, score) in res {
58//!         println!("{:?}",score)
59//!     }
60//! }
61//! ```
62//! Example: if you have a `Vec` of `String`s you need to convert it to a list of `&str`
63//! ```rust
64//! fn works_with_strings() {
65//!     use rust_fuzzy_search::fuzzy_search;
66//!     let s = String::from("varma");
67//!     let list: Vec<String> = vec![String::from("varma vetero"), String::from("varma ĉokolado")];
68//!     fuzzy_search(&s, &list.iter().map(String::as_ref).collect::<Vec<&str>>());
69//! }
70//! ```
71//!
72
73use std::iter;
74
75fn trigrams(s: &str) -> Vec<(char, char, char)> {
76    let it_1 = iter::once(' ').chain(iter::once(' ')).chain(s.chars());
77    let it_2 = iter::once(' ').chain(s.chars());
78    let it_3 = s.chars().chain(iter::once(' '));
79
80    let res: Vec<(char, char, char)> = it_1
81        .zip(it_2)
82        .zip(it_3)
83        .map(|((a, b), c): ((char, char), char)| (a, b, c))
84        .collect();
85    res
86}
87
88/// Use this function to compare 2 strings.
89///
90/// The output is a score (between `0.0f32 and 1.0f32`) representing how similar the 2 strings are.
91///
92/// Arguments:
93///  * `a` : the first string to compare.
94///  * `b` : the second string to compare.
95///
96///
97/// example:
98/// ```rust
99/// fn test () {
100///     use rust_fuzzy_search::fuzzy_compare;
101///     let score : f32 = fuzzy_compare("kolbasobulko", "kolbasobulko");
102///     println!("score = {:?}", score);
103/// }
104/// ```
105pub fn fuzzy_compare(a: &str, b: &str) -> f32 {
106
107    // gets length of first input string plus 1 (because of the 3 added spaces (' '))
108    let string_len = a.chars().count() + 1;
109
110    // gets the trigrams for both strings
111    let trigrams_a = trigrams(a);
112    let trigrams_b = trigrams(b);
113
114    // accumulator
115    let mut acc: f32 = 0.0f32;
116    // counts the number of trigrams of the first string that are also present in the second one
117    for t_a in &trigrams_a {
118        for t_b in &trigrams_b {
119            if t_a == t_b {
120                acc += 1.0f32;
121                break;
122            }
123        }
124    }
125    let res = acc / (string_len as f32);
126    // crops between zero and one
127    if (0.0f32..=1.0f32).contains(&res) {
128        res
129    } else {
130        0.0f32
131    }
132}
133
134/// Use this function to compare a string (`&str`) with all elements of a list.
135///
136///
137/// The result is a list whose elements are tuples of the form `(string, score)`, the first element being the word of the list and the second element the score.
138///
139/// Arguments:
140///  * `s` : the string to compare.
141///  * `list` : the list of strings to compare with `s`.
142///
143/// example:
144/// ```rust
145/// fn test() {
146///     use rust_fuzzy_search::fuzzy_search;
147///     let s = "bulko";
148///     let list : Vec<&str> = vec!["kolbasobulko", "sandviĉo"];
149///     let res : Vec<(&str, f32)> = fuzzy_search(s,&list);
150///     for (_word, score) in res {
151///         println!("{:?}",score)
152///     }
153/// }
154/// ```
155///
156///
157pub fn fuzzy_search<'a>(s: &'a str, list: &'a [&str]) -> Vec<(&'a str, f32)> {
158    list.iter()
159        .map(|&value| {
160            let res = fuzzy_compare(s, value);
161            (value, res)
162        })
163        .collect()
164}
165
166/// This function is similar to [fuzzy_search] but sorts the result in descending order (the best matches are placed at the beginning).
167///
168/// Arguments:
169///  * `s` : the string to compare.
170///  * `list` : the list of strings to compare with `s`.
171///
172/// example:
173/// ```rust
174/// fn test() {
175///     use rust_fuzzy_search::fuzzy_search_sorted;
176///     let s = "bulko";
177///     let list : Vec<&str> = vec!["kolbasobulko", "sandviĉo"];
178///     let res : Vec<(&str, f32)> = fuzzy_search_sorted(s,&list);
179///     for (_word, score) in res {
180///         println!("{:?}",score)
181///     }
182/// }
183/// ```
184///
185pub fn fuzzy_search_sorted<'a>(s: &'a str, list: &'a [&str]) -> Vec<(&'a str, f32)> {
186    let mut res = fuzzy_search(s, list);
187    res.sort_by(|(_, d1), (_, d2)| d2.partial_cmp(d1).unwrap()); // TODO to fix the unwrap call
188    res
189}
190
191/// This function is similar to [fuzzy_search] but filters out element with a score lower than the specified one.
192///
193/// Arguments:
194///  * `s` : the string to compare.
195///  * `list` : the list of strings to compare with `s`.
196///  * `threshold` : the minimum allowed score for the elements in the result: elements with lower score will be removed.
197///
198/// ```rust
199/// fn test() {
200///     use rust_fuzzy_search::fuzzy_search_threshold;
201///     let s = "bulko";
202///     let list : Vec<&str> = vec!["kolbasobulko", "sandviĉo"];
203///     let threshold : f32 = 0.4f32;
204///     let res : Vec<(&str, f32)> = fuzzy_search_threshold(s,&list, threshold);
205///     for (_word, score) in res {
206///         println!("{:?}",score)
207///     }
208/// }
209/// ```
210pub fn fuzzy_search_threshold<'a>(
211    s: &'a str,
212    list: &'a [&str],
213    threshold: f32,
214) -> Vec<(&'a str, f32)> {
215    fuzzy_search(s, list)
216        .into_iter()
217        .filter(|&(_, score)| score >= threshold)
218        .collect()
219}
220
221/// This function is similar to [fuzzy_search_sorted] but keeps only the `n` best items, those with a better match.
222///
223/// Arguments :
224///  * `s` : the string to compare.
225///  * `list` : the list of strings to compare with `s`.
226///  * `n` : the number of element to retrieve.
227///
228/// example:
229/// ```
230/// fn test() {
231///     use rust_fuzzy_search::fuzzy_search_best_n;
232///     let s = "bulko";
233///     let list : Vec<&str> = vec!["kolbasobulko", "sandviĉo"];
234///     let n : usize = 1;
235///     let res : Vec<(&str, f32)> = fuzzy_search_best_n(s,&list, n);
236///     for (_word, score) in res {
237///         println!("{:?}",score)
238///     }
239/// }
240/// ```
241///
242pub fn fuzzy_search_best_n<'a>(s: &'a str, list: &'a [&str], n: usize) -> Vec<(&'a str, f32)> {
243    fuzzy_search_sorted(s, list).into_iter().take(n).collect()
244}
245
246#[cfg(test)]
247mod tests {
248    use crate::{
249        fuzzy_compare, fuzzy_search, fuzzy_search_best_n, fuzzy_search_sorted,
250        fuzzy_search_threshold,
251    };
252
253    #[test]
254    fn perfect_match_1() {
255        assert_eq!(fuzzy_compare("kolbasobulko", "kolbasobulko"), 1.0f32)
256    }
257    #[test]
258    fn perfect_match_2() {
259        assert_eq!(fuzzy_compare("sandviĉo", "sandviĉo"), 1.0f32)
260    }
261    #[test]
262    fn perfect_match_3() {
263        assert_eq!(fuzzy_compare("domo", "domo"), 1.0f32)
264    }
265    #[test]
266    fn perfect_match_4() {
267        assert_eq!(fuzzy_compare("ŝatas", "ŝatas"), 1.0f32)
268    }
269    #[test]
270    fn perfect_match_5() {
271        assert_eq!(fuzzy_compare("mirinda estonto", "mirinda estonto"), 1.0f32)
272    }
273    #[test]
274    fn no_match() {
275        assert_eq!(fuzzy_compare("abc", "def"), 0.0f32)
276    }
277    #[test]
278    fn empty_word() {
279        assert_eq!(fuzzy_compare("", ""), 1.0f32)
280    }
281    #[test]
282    fn one_letter() {
283        assert_eq!(fuzzy_compare("a", "a"), 1.0f32)
284    }
285    #[test]
286    fn utf8_one_letter_1() {
287        assert_eq!(fuzzy_compare("ĉ", "ĉ"), 1.0f32)
288    }
289    #[test]
290    fn utf8_one_letter_2() {
291        assert_eq!(fuzzy_compare("ł", "ł"), 1.0f32)
292    }
293    #[test]
294    fn utf8_no_match() {
295        assert_eq!(fuzzy_compare("cgs", "ĉĝŝ"), 0.0f32)
296    }
297    #[test]
298    fn test_fuzzy_search_1() {
299        let s: &str = "bulko";
300        let list: Vec<&str> = vec!["kolbasobulko", "sandviĉo", "kolbasobulkejo"];
301        let res: Vec<(&str, f32)> = fuzzy_search(s, &list);
302        assert_eq!(res.into_iter().count(), 3);
303    }
304    #[test]
305    fn test_fuzzy_search_sorted() {
306        let s: &str = "bulko";
307        let list: Vec<&str> = vec!["kolbasobulko", "sandviĉo", "kolbasobulkejo"];
308        let res: Vec<(&str, f32)> = fuzzy_search_sorted(s, &list);
309        assert_eq!(res.into_iter().count(), 3);
310    }
311    #[test]
312    fn no_lowers() {
313        let threshold = 0.5f32;
314        let s: &str = "bulko";
315        let list: Vec<&str> = vec!["kolbasobulko", "sandviĉo", "kolbasobulkejo"];
316        for (_word, score) in fuzzy_search_threshold(s, &list, threshold) {
317            assert!(score > threshold)
318        }
319    }
320    #[test]
321    fn test_fuzzy_search_best_n() {
322        let s: &str = "bulko";
323        let list: Vec<&str> = vec!["kolbasobulko", "sandviĉo", "kolbasobulkejo"];
324        let res: Vec<(&str, f32)> = fuzzy_search_best_n(s, &list, 2);
325        assert_eq!(res.into_iter().count(), 2);
326    }
327    #[test]
328    fn works_with_strings() {
329        let s = String::from("varma");
330        let list: Vec<String> = vec![String::from("varma vetero"), String::from("varma ĉokolado")];
331        fuzzy_search(&s, &list.iter().map(String::as_ref).collect::<Vec<&str>>());
332    }
333}