rust_fuzzy_search/lib.rs
1#![warn(missing_docs)]
2//#![doc(html_playground_url = "https://playground.example.com/")] // TODO add playground if possible
3
4//! This crate implements Fuzzy Searching with trigrams
5//!
6//!
7//! Fuzzy searching allows to compare strings by similarity rather than by equality:\
8//! Similar strings will get a high score (close to `1.0f32`) while dissimilar strings will get a lower score (closer to `0.0f32`).
9//!
10//! Fuzzy searching tolerates changes in word order:\
11//! ex. `"John Dep"` and `"Dep John"` will get a high score.
12//!
13//!
14//! The crate exposes 5 main functions:
15//! - [fuzzy_compare] will take 2 strings and return a score representing how similar those strings are.
16//! - [fuzzy_search] applies [fuzzy_compare] to a list of strings and returns a list of tuples: (word, score).
17//! - [fuzzy_search_sorted] is similar to [fuzzy_search] but orders the output in descending order.
18//! - [fuzzy_search_threshold] will take an additional `f32` as input and returns only tuples with score greater than the threshold.
19//! - [fuzzy_search_best_n] will take an additional `usize` arguments and returns the first `n` tuples.
20//!
21//! The Algorithm used is taken from : <https://dev.to/kaleman15/fuzzy-searching-with-postgresql-97o>
22//!
23//! Basic idea:
24//!
25//! 1. From both strings extracts all groups of 3 adjacent letters.\
26//! (`"House"` becomes `[' H', ' Ho', 'Hou', 'ous', 'use', 'se ']`).\
27//! Note the 2 spaces added to the head of the string and the one on the tail, used to make the algorithm work on zero length words.
28//!
29//! 1. Then counts the number of trigrams of the first words that are also present on the second word and divide by the number of trigrams of the first word.\
30//!
31//!
32//! Example: Comparing 2 strings
33//! ```rust
34//! fn test () {
35//! use rust_fuzzy_search::fuzzy_compare;
36//! let score : f32 = fuzzy_compare("kolbasobulko", "kolbasobulko");
37//! println!("score = {:?}", score);
38//! }
39//! ```
40//!
41//! Example: Comparing a string with a list of strings and retrieving only the best matches
42//! ```rust
43//! fn test() {
44//! use rust_fuzzy_search::fuzzy_search_best_n;
45//! let s = "bulko";
46//! let list : Vec<&str> = vec![
47//! "kolbasobulko",
48//! "sandviĉo",
49//! "ŝatas",
50//! "domo",
51//! "emuo",
52//! "fabo",
53//! "fazano"
54//! ];
55//! let n : usize = 3;
56//! let res : Vec<(&str, f32)> = fuzzy_search_best_n(s,&list, n);
57//! for (_word, score) in res {
58//! println!("{:?}",score)
59//! }
60//! }
61//! ```
62//! Example: if you have a `Vec` of `String`s you need to convert it to a list of `&str`
63//! ```rust
64//! fn works_with_strings() {
65//! use rust_fuzzy_search::fuzzy_search;
66//! let s = String::from("varma");
67//! let list: Vec<String> = vec![String::from("varma vetero"), String::from("varma ĉokolado")];
68//! fuzzy_search(&s, &list.iter().map(String::as_ref).collect::<Vec<&str>>());
69//! }
70//! ```
71//!
72
73use std::iter;
74
75fn trigrams(s: &str) -> Vec<(char, char, char)> {
76 let it_1 = iter::once(' ').chain(iter::once(' ')).chain(s.chars());
77 let it_2 = iter::once(' ').chain(s.chars());
78 let it_3 = s.chars().chain(iter::once(' '));
79
80 let res: Vec<(char, char, char)> = it_1
81 .zip(it_2)
82 .zip(it_3)
83 .map(|((a, b), c): ((char, char), char)| (a, b, c))
84 .collect();
85 res
86}
87
88/// Use this function to compare 2 strings.
89///
90/// The output is a score (between `0.0f32 and 1.0f32`) representing how similar the 2 strings are.
91///
92/// Arguments:
93/// * `a` : the first string to compare.
94/// * `b` : the second string to compare.
95///
96///
97/// example:
98/// ```rust
99/// fn test () {
100/// use rust_fuzzy_search::fuzzy_compare;
101/// let score : f32 = fuzzy_compare("kolbasobulko", "kolbasobulko");
102/// println!("score = {:?}", score);
103/// }
104/// ```
105pub fn fuzzy_compare(a: &str, b: &str) -> f32 {
106
107 // gets length of first input string plus 1 (because of the 3 added spaces (' '))
108 let string_len = a.chars().count() + 1;
109
110 // gets the trigrams for both strings
111 let trigrams_a = trigrams(a);
112 let trigrams_b = trigrams(b);
113
114 // accumulator
115 let mut acc: f32 = 0.0f32;
116 // counts the number of trigrams of the first string that are also present in the second one
117 for t_a in &trigrams_a {
118 for t_b in &trigrams_b {
119 if t_a == t_b {
120 acc += 1.0f32;
121 break;
122 }
123 }
124 }
125 let res = acc / (string_len as f32);
126 // crops between zero and one
127 if (0.0f32..=1.0f32).contains(&res) {
128 res
129 } else {
130 0.0f32
131 }
132}
133
134/// Use this function to compare a string (`&str`) with all elements of a list.
135///
136///
137/// The result is a list whose elements are tuples of the form `(string, score)`, the first element being the word of the list and the second element the score.
138///
139/// Arguments:
140/// * `s` : the string to compare.
141/// * `list` : the list of strings to compare with `s`.
142///
143/// example:
144/// ```rust
145/// fn test() {
146/// use rust_fuzzy_search::fuzzy_search;
147/// let s = "bulko";
148/// let list : Vec<&str> = vec!["kolbasobulko", "sandviĉo"];
149/// let res : Vec<(&str, f32)> = fuzzy_search(s,&list);
150/// for (_word, score) in res {
151/// println!("{:?}",score)
152/// }
153/// }
154/// ```
155///
156///
157pub fn fuzzy_search<'a>(s: &'a str, list: &'a [&str]) -> Vec<(&'a str, f32)> {
158 list.iter()
159 .map(|&value| {
160 let res = fuzzy_compare(s, value);
161 (value, res)
162 })
163 .collect()
164}
165
166/// This function is similar to [fuzzy_search] but sorts the result in descending order (the best matches are placed at the beginning).
167///
168/// Arguments:
169/// * `s` : the string to compare.
170/// * `list` : the list of strings to compare with `s`.
171///
172/// example:
173/// ```rust
174/// fn test() {
175/// use rust_fuzzy_search::fuzzy_search_sorted;
176/// let s = "bulko";
177/// let list : Vec<&str> = vec!["kolbasobulko", "sandviĉo"];
178/// let res : Vec<(&str, f32)> = fuzzy_search_sorted(s,&list);
179/// for (_word, score) in res {
180/// println!("{:?}",score)
181/// }
182/// }
183/// ```
184///
185pub fn fuzzy_search_sorted<'a>(s: &'a str, list: &'a [&str]) -> Vec<(&'a str, f32)> {
186 let mut res = fuzzy_search(s, list);
187 res.sort_by(|(_, d1), (_, d2)| d2.partial_cmp(d1).unwrap()); // TODO to fix the unwrap call
188 res
189}
190
191/// This function is similar to [fuzzy_search] but filters out element with a score lower than the specified one.
192///
193/// Arguments:
194/// * `s` : the string to compare.
195/// * `list` : the list of strings to compare with `s`.
196/// * `threshold` : the minimum allowed score for the elements in the result: elements with lower score will be removed.
197///
198/// ```rust
199/// fn test() {
200/// use rust_fuzzy_search::fuzzy_search_threshold;
201/// let s = "bulko";
202/// let list : Vec<&str> = vec!["kolbasobulko", "sandviĉo"];
203/// let threshold : f32 = 0.4f32;
204/// let res : Vec<(&str, f32)> = fuzzy_search_threshold(s,&list, threshold);
205/// for (_word, score) in res {
206/// println!("{:?}",score)
207/// }
208/// }
209/// ```
210pub fn fuzzy_search_threshold<'a>(
211 s: &'a str,
212 list: &'a [&str],
213 threshold: f32,
214) -> Vec<(&'a str, f32)> {
215 fuzzy_search(s, list)
216 .into_iter()
217 .filter(|&(_, score)| score >= threshold)
218 .collect()
219}
220
221/// This function is similar to [fuzzy_search_sorted] but keeps only the `n` best items, those with a better match.
222///
223/// Arguments :
224/// * `s` : the string to compare.
225/// * `list` : the list of strings to compare with `s`.
226/// * `n` : the number of element to retrieve.
227///
228/// example:
229/// ```
230/// fn test() {
231/// use rust_fuzzy_search::fuzzy_search_best_n;
232/// let s = "bulko";
233/// let list : Vec<&str> = vec!["kolbasobulko", "sandviĉo"];
234/// let n : usize = 1;
235/// let res : Vec<(&str, f32)> = fuzzy_search_best_n(s,&list, n);
236/// for (_word, score) in res {
237/// println!("{:?}",score)
238/// }
239/// }
240/// ```
241///
242pub fn fuzzy_search_best_n<'a>(s: &'a str, list: &'a [&str], n: usize) -> Vec<(&'a str, f32)> {
243 fuzzy_search_sorted(s, list).into_iter().take(n).collect()
244}
245
246#[cfg(test)]
247mod tests {
248 use crate::{
249 fuzzy_compare, fuzzy_search, fuzzy_search_best_n, fuzzy_search_sorted,
250 fuzzy_search_threshold,
251 };
252
253 #[test]
254 fn perfect_match_1() {
255 assert_eq!(fuzzy_compare("kolbasobulko", "kolbasobulko"), 1.0f32)
256 }
257 #[test]
258 fn perfect_match_2() {
259 assert_eq!(fuzzy_compare("sandviĉo", "sandviĉo"), 1.0f32)
260 }
261 #[test]
262 fn perfect_match_3() {
263 assert_eq!(fuzzy_compare("domo", "domo"), 1.0f32)
264 }
265 #[test]
266 fn perfect_match_4() {
267 assert_eq!(fuzzy_compare("ŝatas", "ŝatas"), 1.0f32)
268 }
269 #[test]
270 fn perfect_match_5() {
271 assert_eq!(fuzzy_compare("mirinda estonto", "mirinda estonto"), 1.0f32)
272 }
273 #[test]
274 fn no_match() {
275 assert_eq!(fuzzy_compare("abc", "def"), 0.0f32)
276 }
277 #[test]
278 fn empty_word() {
279 assert_eq!(fuzzy_compare("", ""), 1.0f32)
280 }
281 #[test]
282 fn one_letter() {
283 assert_eq!(fuzzy_compare("a", "a"), 1.0f32)
284 }
285 #[test]
286 fn utf8_one_letter_1() {
287 assert_eq!(fuzzy_compare("ĉ", "ĉ"), 1.0f32)
288 }
289 #[test]
290 fn utf8_one_letter_2() {
291 assert_eq!(fuzzy_compare("ł", "ł"), 1.0f32)
292 }
293 #[test]
294 fn utf8_no_match() {
295 assert_eq!(fuzzy_compare("cgs", "ĉĝŝ"), 0.0f32)
296 }
297 #[test]
298 fn test_fuzzy_search_1() {
299 let s: &str = "bulko";
300 let list: Vec<&str> = vec!["kolbasobulko", "sandviĉo", "kolbasobulkejo"];
301 let res: Vec<(&str, f32)> = fuzzy_search(s, &list);
302 assert_eq!(res.into_iter().count(), 3);
303 }
304 #[test]
305 fn test_fuzzy_search_sorted() {
306 let s: &str = "bulko";
307 let list: Vec<&str> = vec!["kolbasobulko", "sandviĉo", "kolbasobulkejo"];
308 let res: Vec<(&str, f32)> = fuzzy_search_sorted(s, &list);
309 assert_eq!(res.into_iter().count(), 3);
310 }
311 #[test]
312 fn no_lowers() {
313 let threshold = 0.5f32;
314 let s: &str = "bulko";
315 let list: Vec<&str> = vec!["kolbasobulko", "sandviĉo", "kolbasobulkejo"];
316 for (_word, score) in fuzzy_search_threshold(s, &list, threshold) {
317 assert!(score > threshold)
318 }
319 }
320 #[test]
321 fn test_fuzzy_search_best_n() {
322 let s: &str = "bulko";
323 let list: Vec<&str> = vec!["kolbasobulko", "sandviĉo", "kolbasobulkejo"];
324 let res: Vec<(&str, f32)> = fuzzy_search_best_n(s, &list, 2);
325 assert_eq!(res.into_iter().count(), 2);
326 }
327 #[test]
328 fn works_with_strings() {
329 let s = String::from("varma");
330 let list: Vec<String> = vec![String::from("varma vetero"), String::from("varma ĉokolado")];
331 fuzzy_search(&s, &list.iter().map(String::as_ref).collect::<Vec<&str>>());
332 }
333}