fuzzt/
matcher.rs

1use crate::{
2    algorithms::{SequenceMatcher, Similarity, SimilarityMetric},
3    processors::{NullStringProcessor, StringProcessor},
4};
5use std::cmp::Reverse;
6use std::collections::BinaryHeap;
7
8/// Returns a list of the best matches to a collection of choices.
9///
10/// This is a convenience function for getting the choices with the highest scores.
11///
12/// # Arguments
13///
14/// * `query` - A string to match against.
15/// * `choices` - A list of choices to compare against the query.
16/// * `cutoff` - A score threshold. No matches with a score less than this number will be returned. Defaults to 0.7.
17/// * `n` - Optional maximum for the number of elements returned. Defaults to 3.
18/// * `processor` - Optional function for transforming choices before matching. If not provided, `NullStringProcessor` is used.
19/// * `scorer` - Optional scoring function for extract(). If not provided, `SequenceMatcher` is used.
20///
21/// # Returns
22///
23/// * A vector of the top 'n' matches from the given choices.
24///
25/// # Example
26///
27/// ```
28/// extern crate fuzzt;
29/// use fuzzt::{algorithms::NormalizedLevenshtein, get_top_n, processors::NullStringProcessor};
30///
31/// let matches = get_top_n(
32///     "apple",
33///     &["apply", "apples", "ape", "applet", "applesauce"],
34///     Some(0.8),
35///     Some(3),
36///     Some(&NullStringProcessor),
37///     Some(&NormalizedLevenshtein),
38/// );
39/// assert_eq!(matches, ["apples", "applet", "apply"]);
40/// ```
41pub fn get_top_n<'a>(
42    query: &str,
43    choices: &[&'a str],
44    cutoff: Option<f64>,
45    n: Option<usize>,
46    processor: Option<&dyn StringProcessor>,
47    scorer: Option<&dyn SimilarityMetric>,
48) -> Vec<&'a str> {
49    let mut matches = BinaryHeap::new();
50    let n = n.unwrap_or(3);
51    let cutoff = cutoff.unwrap_or(0.7);
52    let scorer = match scorer {
53        Some(scorer_trait) => scorer_trait,
54        None => &SequenceMatcher,
55    };
56    let processor = match processor {
57        Some(some_processor) => some_processor,
58        None => &NullStringProcessor,
59    };
60    let processed_query = processor.process(query);
61
62    for &choice in choices {
63        let processed_choice = processor.process(choice);
64        let raw_ratio = scorer.compute_metric(processed_query.as_str(), processed_choice.as_str());
65        let ratio = match raw_ratio {
66            Similarity::Usize(r) => r as f64,
67            Similarity::Float(r) => r,
68        };
69        if ratio >= cutoff {
70            let int_ratio = match raw_ratio {
71                Similarity::Usize(r) => r as i64,
72                Similarity::Float(r) => (r * std::u32::MAX as f64) as i64,
73            };
74            // we're putting the word itself in reverse in so that matches with
75            // the same ratio are ordered lexicographically.
76            matches.push((int_ratio, Reverse(choice)));
77        }
78    }
79    let mut rv = vec![];
80    for _ in 0..n {
81        if let Some((_, elt)) = matches.pop() {
82            rv.push(elt.0);
83        } else {
84            break;
85        }
86    }
87    rv
88}
89
90#[cfg(test)]
91mod tests {
92    use super::get_top_n;
93    use crate::algorithms::jaro::JaroWinkler;
94    use crate::algorithms::SimilarityMetric;
95    use crate::processors::{LowerAlphaNumStringProcessor, StringProcessor};
96    use rstest::rstest;
97
98    #[rstest]
99    #[case(Some(0.7), Some(3), None, None, &["brazil", "braziu", "trazil"])]
100    #[case(Some(0.9), Some(5), None, None, &["brazil"])]
101    #[case(Some(0.7), Some(2), None, Some(&JaroWinkler as &dyn SimilarityMetric), &["brazil", "braziu"])]
102    #[case(Some(0.7), Some(2), Some(&LowerAlphaNumStringProcessor as &dyn StringProcessor), None, &["brazil", "BRA ZIL"])]
103    fn test_get_top_n<'a>(
104        #[case] cutoff: Option<f64>,
105        #[case] n: Option<usize>,
106        #[case] processor: Option<&dyn StringProcessor>,
107        #[case] scorer: Option<&dyn SimilarityMetric>,
108        #[case] expected: &[&'a str],
109    ) {
110        let choices = &["trazil", "BRA ZIL", "brazil", "spain", "braziu"][..];
111        let query = "brazil";
112        let matches = get_top_n(query, choices, cutoff, n, processor, scorer);
113        assert_eq!(matches, expected);
114    }
115}