fuzzy_match_flex/
lib.rs

//! `Fuzzy Match` is a fuzzy matching library based on the popular `FuzzyWuzzy` library for Python.
//! It contains four basic fuzzy matching functions.

use std::collections::HashSet;
use regex::Regex;

/// Uses the Levenshtein distance to calculate the similarity of two strings.
///
/// # Example
///
/// ```
/// let str1 = "Hello";
/// let str2 = "Hallo";
/// let similarity = fuzzy_match_flex::ratio(str1, str2, None);
///
/// assert_eq!(similarity, 0.8);
/// ```
///
/// # Third parameter
///
/// `clean_str: Option<bool>` - tells the function whether the strings still need to be cleaned. Pass `Some(false)` if they are already clean, or `Some(true)`/`None` to have them cleaned.
///
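/// A rough sketch of how the flag changes the result (the exact numbers assume
/// the default substitution cost of 2):
///
/// ```rust,ignore
/// // With cleaning (the default) both strings normalise to "hello", so they match exactly.
/// assert_eq!(fuzzy_match_flex::ratio("Hello", "hello", None), 1.0);
/// // Without cleaning the differing case counts as one substitution: (10 - 2) / 10 = 0.8.
/// assert_eq!(fuzzy_match_flex::ratio("Hello", "hello", Some(false)), 0.8);
/// ```
///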
pub fn ratio(str1: &str, str2: &str, clean_str: Option<bool>) -> f32 {
    if str1 == str2 {
        return 1.0;
    }

    let clean_str = clean_str.unwrap_or(true);

    let str1_cleaned = if clean_str { clean_string(str1) } else { str1.to_owned() };
    let str2_cleaned = if clean_str { clean_string(str2) } else { str2.to_owned() };

    levenshtein(&str1_cleaned, &str2_cleaned, 2)
}

/// Splits the longer string into tokens and uses the Levenshtein distance to calculate the similarity between each token and the shorter string.
///
/// # Example
///
/// ```
/// let str1 = "Do we buy the airplane?";
/// let str2 = "Airplane";
/// let similarity = fuzzy_match_flex::partial_ratio(str1, str2, None);
///
/// assert_eq!(similarity, 1.0);
/// ```
///
/// # Third parameter
///
/// `clean_str: Option<bool>` - tells the function whether the strings still need to be cleaned. Pass `Some(false)` if they are already clean, or `Some(true)`/`None` to have them cleaned.
///
pub fn partial_ratio(str1: &str, str2: &str, clean_str: Option<bool>) -> f32 {
    if str1 == str2 {
        return 1.0;
    }

    let str1_len = str1.len();
    let str2_len = str2.len();

    let clean_str = clean_str.unwrap_or(true);

    let str1_cleaned = if clean_str { clean_string(str1) } else { str1.to_owned() };
    let str2_cleaned = if clean_str { clean_string(str2) } else { str2.to_owned() };

    let tokens = if str1_len > str2_len { str1_cleaned.split(' ') } else { str2_cleaned.split(' ') };

    let mut best_ratio: f32 = 0.0;
    let shortest_str: &str = if str1_len < str2_len { &str1_cleaned } else { &str2_cleaned };

    for token in tokens {
        best_ratio = best_ratio.max(levenshtein(token, shortest_str, 2));
    }

    best_ratio
}

/// Splits both strings into tokens, sorts them and calculates the similarity between the sorted strings.
///
/// # Example
///
/// ```
/// let str1 = "My mom bought me ice cream";
/// let str2 = "The ice cream was bought by my mom";
/// let similarity = fuzzy_match_flex::token_sort_ratio(str1, str2, None);
///
/// assert_eq!(similarity, 0.8666667);
/// ```
///
/// # Third parameter
///
/// `clean_str: Option<bool>` - tells the function whether the strings still need to be cleaned. Pass `Some(false)` if they are already clean, or `Some(true)`/`None` to have them cleaned.
///
pub fn token_sort_ratio(str1: &str, str2: &str, clean_str: Option<bool>) -> f32 {
    if str1 == str2 {
        return 1.0;
    }

    let clean_str = clean_str.unwrap_or(true);

    let str1_cleaned = if clean_str { clean_string(str1) } else { str1.to_owned() };
    let str2_cleaned = if clean_str { clean_string(str2) } else { str2.to_owned() };

    let mut tokens_str1: Vec<&str> = str1_cleaned.split(' ').collect();
    tokens_str1.sort();

    let mut tokens_str2: Vec<&str> = str2_cleaned.split(' ').collect();
    tokens_str2.sort();

    let str1_low_sorted = tokens_str1.join(" ");
    let str2_low_sorted = tokens_str2.join(" ");

    // With a substitution cost of 0 the distance reduces to the length difference of the sorted strings.
    levenshtein(&str1_low_sorted, &str2_low_sorted, 0)
}

/// Finds the intersection of the token sets of both strings and calculates the similarity of the strings.
///
/// # Example
///
/// ```
/// let str1 = "There are a lot of differences between Rust and C++";
/// let str2 = "differences in Rust C++";
/// let similarity = fuzzy_match_flex::token_set_ratio(str1, str2, None);
///
/// assert_eq!(similarity, 0.9230769);
/// ```
///
/// # Third parameter
///
/// `clean_str: Option<bool>` - tells the function whether the strings still need to be cleaned. Pass `Some(false)` if they are already clean, or `Some(true)`/`None` to have them cleaned.
///
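/// # How the score is formed
///
/// The shared tokens are compared against "shared tokens plus the tokens unique
/// to each string", and the best of the three ratios is returned. A rough sketch
/// of the intermediate strings for the example above (`C++` is cleaned down to
/// `c`, and the tokens are sorted alphabetically before joining):
///
/// ```rust,ignore
/// // shared tokens:            "c differences rust"
/// // shared + unique to str1:  "c differences rust a and are between lot of there"
/// // shared + unique to str2:  "c differences rust in"
/// ```
///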
pub fn token_set_ratio(str1: &str, str2: &str, clean_str: Option<bool>) -> f32 {
    if str1 == str2 {
        return 1.0;
    }

    let clean_str = clean_str.unwrap_or(true);

    let str1_cleaned = if clean_str { clean_string(str1) } else { str1.to_owned() };
    let str2_cleaned = if clean_str { clean_string(str2) } else { str2.to_owned() };

    let unique_tokens_str1: HashSet<&str> = str1_cleaned.split(' ').collect();
    let unique_tokens_str2: HashSet<&str> = str2_cleaned.split(' ').collect();

    let mut intersection: Vec<&str> = unique_tokens_str1.intersection(&unique_tokens_str2).copied().collect();
    let mut difference1: Vec<&str> = unique_tokens_str1.difference(&unique_tokens_str2).copied().collect();
    let mut difference2: Vec<&str> = unique_tokens_str2.difference(&unique_tokens_str1).copied().collect();

    // Sort so the joined strings are deterministic regardless of hash order.
    intersection.sort();
    difference1.sort();
    difference2.sort();

    let sorted_intersection_str = intersection.join(" ");
    let sorted_difference1_str = difference1.join(" ");
    let sorted_difference2_str = difference2.join(" ");

    let combined_intersection_and_difference1 = format!("{} {}", sorted_intersection_str, sorted_difference1_str);
    let combined_intersection_and_difference2 = format!("{} {}", sorted_intersection_str, sorted_difference2_str);

    let ratios = vec![levenshtein(&sorted_intersection_str, &combined_intersection_and_difference1, 2),
                      levenshtein(&sorted_intersection_str, &combined_intersection_and_difference2, 2),
                      levenshtein(&combined_intersection_and_difference1, &combined_intersection_and_difference2, 2)];

    // The highest of the three ratios is the final score.
    ratios.into_iter().fold(0.0_f32, f32::max)
}

/// Uses a weighted Levenshtein distance to calculate the similarity of two strings.
///
/// # Example
///
/// ```rust,ignore
/// let str1 = "hello";
/// let str2 = "hallo";
/// let similarity = fuzzy_match_flex::levenshtein(str1, str2, 2);
///
/// assert_eq!(similarity, 0.8);
/// ```
///
/// # Third parameter
///
/// `substitution_const: usize` - sets the cost of changing one letter into another.
///
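/// With `substitution_const = 2` a substitution costs as much as a deletion plus
/// an insertion, so the score reduces to `2 * common / (len1 + len2)`, where
/// `common` is the number of characters the two strings share in order. A rough
/// worked example for the strings above:
///
/// ```rust,ignore
/// // "hello" vs "hallo": one substitution -> distance 2
/// // total length = 5 + 5 = 10
/// // similarity  = (10 - 2) / 10 = 0.8
/// ```
///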
fn levenshtein(str1: &str, str2: &str, substitution_const: usize) -> f32 {
    // Collect the characters once so the inner loop does not rescan the strings.
    let chars1: Vec<char> = str1.chars().collect();
    let chars2: Vec<char> = str2.chars().collect();

    let rows = chars1.len() + 1;
    let columns = chars2.len() + 1;
    let mut distance: Vec<Vec<usize>> = vec![vec![0; columns]; rows];

    for i in 1..rows {
        distance[i][0] = i;
    }

    for j in 1..columns {
        distance[0][j] = j;
    }

    for i in 1..rows {
        for j in 1..columns {
            // Compare the i-th character of str1 with the j-th character of str2.
            let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { substitution_const };

            distance[i][j] = (distance[i - 1][j] + 1)
                            .min(distance[i][j - 1] + 1)
                            .min(distance[i - 1][j - 1] + cost);
        }
    }

    let sum_length = (chars1.len() + chars2.len()) as f32;

    (sum_length - distance[rows - 1][columns - 1] as f32) / sum_length
}

/// Cleans the string, leaving only lowercase letters, numbers and one space between words.
///
/// # Example
///
/// ```rust,ignore
/// let str1 = "   It IS   imp^^^^0rtant";
/// let cleaned_str = fuzzy_match_flex::clean_string(str1);
///
/// assert_eq!(cleaned_str, "it is imp0rtant");
/// ```
fn clean_string(input: &str) -> String {
    // Drop everything except ASCII letters, digits and whitespace, then lowercase.
    let non_alphanumeric = Regex::new(r"[^a-zA-Z0-9\s]").unwrap();
    let cleared_str = non_alphanumeric.replace_all(input, "").to_lowercase();

    // Collapse runs of whitespace into single spaces and trim the ends.
    let whitespace = Regex::new(r"\s+").unwrap();
    whitespace.replace_all(cleared_str.trim(), " ").to_string()
}

/// Uses the Levenshtein distance to calculate the similarity of two strings.
///
/// # Example
///
/// ```rust,ignore
/// let str1 = "Hello";
/// let str2 = "Hallo";
/// let similarity = fuzzy_match_flex::ratio!(str1, str2);
///
/// assert_eq!(similarity, 0.8);
/// ```
#[macro_export]
macro_rules! ratio {
    ($str1:expr, $str2:expr) => {
        $crate::ratio($str1, $str2, None)
    };
}

/// Splits the longer string into tokens and uses the Levenshtein distance to calculate the similarity between each token and the shorter string.
///
/// # Example
///
/// ```rust,ignore
/// let str1 = "Do we buy the airplane?";
/// let str2 = "Airplane";
/// let similarity = fuzzy_match_flex::partial_ratio!(str1, str2);
///
/// assert_eq!(similarity, 1.0);
/// ```
#[macro_export]
macro_rules! partial_ratio {
    ($str1:expr, $str2:expr) => {
        $crate::partial_ratio($str1, $str2, None)
    };
}

/// Splits both strings into tokens, sorts them and calculates the similarity between the sorted strings.
///
/// # Example
///
/// ```rust,ignore
/// let str1 = "My mom bought me ice cream";
/// let str2 = "The ice cream was bought by my mom";
/// let similarity = fuzzy_match_flex::token_sort_ratio!(str1, str2);
///
/// assert_eq!(similarity, 0.8666667);
/// ```
#[macro_export]
macro_rules! token_sort_ratio {
    ($str1:expr, $str2:expr) => {
        $crate::token_sort_ratio($str1, $str2, None)
    };
}

/// Finds the intersection of the token sets of both strings and calculates the similarity of the strings.
///
/// # Example
///
/// ```rust,ignore
/// let str1 = "There are a lot of differences between Rust and C++";
/// let str2 = "differences in Rust C++";
/// let similarity = fuzzy_match_flex::token_set_ratio!(str1, str2);
///
/// assert_eq!(similarity, 0.9230769);
/// ```
#[macro_export]
macro_rules! token_set_ratio {
    ($str1:expr, $str2:expr) => {
        $crate::token_set_ratio($str1, $str2, None)
    };
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn clean_string_trims_string_and_deletes_non_alphanumerics() {
        let str = "  duck && cats";
        assert_eq!(clean_string(str), "duck cats");
    }

    #[test]
    fn clean_string_deletes_repeated_spaces_and_non_alphanumerics() {
        let str = "this1       is 4 u   :*   ";
        assert_eq!(clean_string(str), "this1 is 4 u");
    }

    #[test]
    fn clean_string_turns_letters_into_lower_case() {
        let str = "THERE IS NO LOWERCASE LETTER IN THIS SENTENCE HAHAHA";
        assert_eq!(clean_string(str), "there is no lowercase letter in this sentence hahaha");
    }

    #[test]
    fn clean_string_turns_letters_into_lower_case_deletes_non_alphanumerics_and_trims() {
        let str = "  SUper Mega    H4Rd s**************tR1N$$$G";
        assert_eq!(clean_string(str), "super mega h4rd str1ng");
    }

    #[test]
    fn levenshtein_no_spaces_sub_count_2() {
        let str1 = "haouse";
        let str2 = "home";
        let sum_length = str1.len() + str2.len();
        // "haouse" -> "home": delete 'a', delete 'u', delete 's', insert 'm' = distance 4
        let result = (sum_length - 4) as f32 / sum_length as f32;

        assert_eq!(levenshtein(str1, str2, 2), result);
    }

    #[test]
    fn levenshtein_spaces_sub_count_1() {
        let str1 = "this is a string";
        let str2 = "that is a string";
        let sum_length = str1.len() + str2.len();
        let result = (sum_length - 2) as f32 / sum_length as f32;

        assert_eq!(levenshtein(str1, str2, 1), result);
    }

    #[test]
    fn ratio_words_with_spaces() {
        let str1 = "This is a string";
        let str2 = "That is a string";
        let sum_length = str1.len() + str2.len();
        let result = (sum_length - 4) as f32 / sum_length as f32;

        assert_eq!(ratio(str1, str2, None), result);
    }

    #[test]
    fn partial_ratio_words_with_spaces() {
        let str1 = "My Friend Builds :* Robots";
        let str2 = "Roboto";
        let sum_length = "Robots".len() + str2.len();
        let result = (sum_length - 2) as f32 / sum_length as f32;

        assert_eq!(partial_ratio(str1, str2, None), result);
    }

    #[test]
    fn token_sort_ratio_words_with_spaces() {
        let str1 = "Nasa landed a man on the moon";
        let str2 = "A man landed on the moon due to Nasa";
        let sum_length = str1.len() + str2.len();
        let result = (sum_length - 7) as f32 / sum_length as f32;

        assert_eq!(token_sort_ratio(str1, str2, None), result);
    }

    #[test]
    fn token_set_ratio_words_with_spaces() {
        let str1 = "The library has my favorite book its called rain & world";
        let str2 = "rain and world";
        // "rain world" (shared tokens, 10 chars) compared against "rain world and" (14 chars)
        let sum_length = 24;
        let result = (sum_length - 4) as f32 / sum_length as f32;

        assert_eq!(token_set_ratio(str1, str2, None), result);
    }
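
    // A rough sketch of an extra test for the `clean_str` flag (not from the
    // original suite): with cleaning disabled, the case difference between the
    // strings counts as one substitution at cost 2 over a combined length of 10.
    #[test]
    fn ratio_skips_cleaning_when_flag_is_false() {
        assert_eq!(ratio("Hello", "hello", Some(false)), 0.8);
        assert_eq!(ratio("Hello", "hello", None), 1.0);
    }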
}