1use std::{collections::HashSet};
5use regex::Regex;
6
7pub fn ratio(str1 : &str, str2 : &str, clean_str: Option<bool>) -> f32 {
24 if str1 == str2 {
25 return 1.0;
26 }
27
28 let clean_str = clean_str.unwrap_or(true);
29
30 let str1_cleaned = if clean_str { clean_string(str1) } else { str1.to_owned() };
31 let str2_cleaned = if clean_str { clean_string(str2) } else { str2.to_owned() };
32
33 levenshtein(&str1_cleaned, &str2_cleaned, 2)
34}
35
36pub fn partial_ratio(str1 : &str, str2 : &str, clean_str: Option<bool>) -> f32 {
53 if str1 == str2 {
54 return 1.0;
55 }
56
57 let str1_len = str1.len();
58 let str2_len = str2.len();
59
60 let clean_str = clean_str.unwrap_or(true);
61
62 let str1_cleaned = if clean_str { clean_string(str1) } else { str1.to_owned() };
63 let str2_cleaned = if clean_str { clean_string(str2) } else { str2.to_owned() };
64
65 let tokens = if str1_len > str2_len { str1_cleaned.split(' ') } else { str2_cleaned.split(' ') };
66
67 let mut best_ratio : f32 = 0.0;
68 let shortest_str : &str = if str1_len < str2_len { &str1_cleaned } else { &str2_cleaned };
69
70 for token in tokens {
71 best_ratio = best_ratio.max(levenshtein(token, shortest_str, 2));
72 }
73
74 best_ratio
75 }
76
77pub fn token_sort_ratio(str1 : &str, str2 : &str, clean_str: Option<bool>) -> f32 {
94 if str1 == str2 {
95 return 1.0;
96 }
97
98 let clean_str = clean_str.unwrap_or(true);
99
100 let str1_cleaned = if clean_str { clean_string(str1) } else { str1.to_owned() };
101 let str2_cleaned = if clean_str { clean_string(str2) } else { str2.to_owned() };
102
103 let mut tokens_str1 : Vec<&str> = str1_cleaned.split(' ').collect();
104 tokens_str1.sort();
105
106 let mut tokens_str2 : Vec<&str> = str2_cleaned.split(' ').collect();
107 tokens_str2.sort();
108
109 let str1_low_sorted = tokens_str1.join(" ");
110 let str2_low_sorted = tokens_str2.join(" ");
111
112 levenshtein(&str1_low_sorted, &str2_low_sorted, 0)
113}
114
115pub fn token_set_ratio(str1 : &str, str2 : &str, clean_str: Option<bool>) -> f32 {
132 if str1 == str2 {
133 return 1.0;
134 }
135
136 let clean_str = clean_str.unwrap_or(true);
137
138 let str1_cleaned = if clean_str { clean_string(str1) } else { str1.to_owned() };
139 let str2_cleaned = if clean_str { clean_string(str2) } else { str2.to_owned() };
140
141 let tokens_str1 : Vec<&str> = str1_cleaned.split(' ').collect();
142 let tokens_str2 : Vec<&str> = str2_cleaned.split(' ').collect();
143
144 let unique_tokens_str1: HashSet<&str> = tokens_str1.into_iter().collect();
145 let unique_tokens_str2: HashSet<&str> = tokens_str2.into_iter().collect();
146
147 let mut intersection : Vec<&str> = unique_tokens_str1.intersection(&unique_tokens_str2).copied().collect();
148 let mut difference1 : Vec<&str> = unique_tokens_str1.difference(&unique_tokens_str2).copied().collect();
149 let mut difference2 : Vec<&str> = unique_tokens_str2.difference(&unique_tokens_str1).copied().collect();
150
151 intersection.sort();
152 difference1.sort();
153 difference2.sort();
154
155 let sorted_intersection_str = intersection.join(" ");
156 let sorted_difference1_str = difference1.join(" ");
157 let sorted_difference2_str = difference2.join(" ");
158
159 let combined_intersection_and_difference1 = format!("{} {}", sorted_intersection_str, sorted_difference1_str);
160 let combined_intersection_and_difference2 = format!("{} {}", sorted_intersection_str, sorted_difference2_str);
161
162 let ratios = vec![levenshtein(&sorted_intersection_str, &combined_intersection_and_difference1, 2),
163 levenshtein(&sorted_intersection_str, &combined_intersection_and_difference2, 2),
164 levenshtein(&combined_intersection_and_difference1, &combined_intersection_and_difference2, 2)];
165
166 let mut max_ratio = ratios[0];
167 for val in ratios.iter().skip(1){
168 max_ratio = max_ratio.max(*val);
169 };
170
171 max_ratio
172}
173
174fn levenshtein(str1 : &str, str2 : &str, substitution_const: usize) -> f32 {
191 let rows = str1.len() + 1;
192 let columns = str2.len() + 1;
193 let mut distance: Vec<Vec<usize>> = vec![vec![0; columns]; rows];
194
195 for i in 1..rows {
196 distance[i][0] = i;
197 }
198
199 for i in 1..columns {
200 distance[0][i] = i;
201 }
202
203 for i in 1..rows{
204 for j in 1..columns{
205 let cost = if str1.chars().nth(i-1) == str2.chars().nth(i-1) { 0 } else { substitution_const };
206
207 distance[i][j] = (distance[i-1][j] + 1)
208 .min(distance[i][j-1] + 1)
209 .min(distance[i-1][j-1] + cost);
210 }
211 }
212
213 let sum_length = (str1.len() + str2.len()) as f32;
214
215 (sum_length - distance[rows-1][columns-1] as f32) / sum_length
216}
217
218fn clean_string(str : &str) -> String {
229 let mut regex = Regex::new(r"[^a-zA-Z0-9\s]").unwrap();
230 let cleared_str = regex.replace_all(&str, "").to_string().to_lowercase();
231
232 regex = Regex::new(r"\s+").unwrap();
233 regex.replace_all(cleared_str.trim(), " ").to_string()
234}
235
236#[macro_export]
248macro_rules! ratio {
249 ($str1:expr, $str2:expr) => {
250 fuzzy_match_flex::ratio($str1, $str2, None)
251 };
252}
253
254#[macro_export]
266macro_rules! partial_ratio {
267 ($str1:expr, $str2:expr) => {
268 fuzzy_match_flex::partial_ratio($str1, $str2, None)
269 };
270}
271
272#[macro_export]
284macro_rules! token_sort_ratio {
285 ($str1:expr, $str2:expr) => {
286 fuzzy_match_flex::token_sort_ratio($str1, $str2, None)
287 };
288}
289
290#[macro_export]
302macro_rules! token_set_ratio {
303 ($str1:expr, $str2:expr) => {
304 fuzzy_match_flex::token_set_ratio($str1, $str2, None)
305 };
306}
307
308 #[cfg(test)]
309 mod tests {
310 use super::*;
311
312 #[test]
313 fn clean_string_trims_string_and_deletes_non_alphanumerics() {
314 let str = " duck && cats";
315 assert_eq!(clean_string(&str), "duck cats");
316 }
317
318 #[test]
319 fn clean_string_deletes_repeaded_spaces_and_non_alphanumerics() {
320 let str = "this1 is 4 u :* ";
321 assert_eq!(clean_string(&str), "this1 is 4 u");
322 }
323
324 #[test]
325 fn clean_string_turns_letters_into_lower_case() {
326 let str = "THERE IS NO LOWERCASE LETTER IN THIS SENTENCE HAHAHA";
327 assert_eq!(clean_string(&str), "there is no lowercase letter in this sentence hahaha");
328 }
329
330 #[test]
331 fn clean_string_turns_letters_into_lower_case_delets_non_alphanumerics_and_trims() {
332 let str = " SUper Mega H4Rd s**************tR1N$$$G";
333 assert_eq!(clean_string(&str), "super mega h4rd str1ng");
334 }
335
336 #[test]
337 fn levenshtein_no_spaces_sub_count_2() {
338 let str1 = "haouse";
339 let str2 = "home";
340 let sum_length = str1.len() + str2.len();
341 let result = (sum_length - 8) as f32 / sum_length as f32;
342
343 assert_eq!(levenshtein(&str1, &str2, 2), result);
344 }
345
346 #[test]
347 fn levenshtein_spaces_sub_count_1() {
348 let str1 = "this is a string";
349 let str2 = "that is a string";
350 let sum_length = str1.len() + str2.len();
351 let result = (sum_length - 2) as f32 / sum_length as f32;
352
353 assert_eq!(levenshtein(&str1, &str2, 1), result);
354 }
355
356 #[test]
357 fn ratio_words_with_spaces() {
358 let str1 = "This is a string";
359 let str2 = "That is a string";
360 let sum_length = str1.len() + str2.len();
361 let result = (sum_length - 4) as f32 / sum_length as f32;
362
363 assert_eq!(ratio(&str1, &str2, None), result);
364 }
365
366 #[test]
367 fn partial_ratio_words_with_spaces() {
368 let str1 = "My Friend Builds :* Robots";
369 let str2 = "Roboto";
370 let sum_length = "Robots".len() + str2.len();
371 let result = (sum_length - 2) as f32 / sum_length as f32;
372
373 assert_eq!(partial_ratio(&str1, &str2, None), result);
374 }
375
376 #[test]
377 fn token_sort_ratio_words_with_spaces() {
378 let str1 = "Nasa landed a man on the moon";
379 let str2 = "A man landed on the moon due to Nasa";
380 let sum_length = str1.len() + str2.len();
381 let result = (sum_length - 7) as f32 / sum_length as f32;
382
383 assert_eq!(token_sort_ratio(&str1, &str2, None), result);
384 }
385
386 #[test]
387 fn token_set_ratio_words_with_spaces() {
388 let str1 = "The library has my favorite book its called rain & world";
389 let str2 = "rain and world";
390 let sum_length = 24;
391 let result = (sum_length - 4) as f32 / sum_length as f32;
392
393 assert_eq!(token_set_ratio(&str1, &str2, None), result);
394 }
395}