closestmatch/
lib.rs

1#![deny(missing_docs,
2        missing_debug_implementations, missing_copy_implementations,
3        trivial_casts, trivial_numeric_casts,
4        unsafe_code,
5        unstable_features,
6        unused_import_braces, unused_qualifications)]
7
8//! # closestmatch-rs
9//! Inspired by closestmatch a golang library.
10//! This library fuzzy matches an input string to a list of target strings.
11extern crate rayon;
12
13use std::collections::HashMap;
14use std::collections::HashSet;
15use rayon::prelude::*;
16
17#[derive(Debug)]
18/// The ```ClosestMatch``` struct stores informations about the dictionary of known words
19/// and the different sizes for the bags of words.
20pub struct ClosestMatch {
21    substrings: HashMap<String, HashSet<String>>,
22    substring_sizes: Vec<usize>,
23}
24
25#[derive(Debug)]
26struct SplitWord {
27    word: String,
28    substrings: HashSet<String>,
29}
30
31#[derive(Debug)]
32struct ScoreValue {
33    word: String,
34    score: f32,
35}
36
37fn split_word(word: String, sizes: &Vec<usize>) -> SplitWord {
38    let mut substrings: HashSet<String> = HashSet::new();
39    for size in sizes {
40        if *size > word.len() {
41            continue;
42        }
43        for x in 0..(word.len() - size + 1) {
44            let sub = word[x..(x + size)].to_string().to_lowercase();
45            substrings.insert(sub);
46        }
47    }
48    return SplitWord {
49               word: word,
50               substrings: substrings,
51           };
52}
53
54fn evaluate(word_subs: &HashSet<String>,
55            possible: String,
56            possible_subs: &HashSet<String>)
57            -> ScoreValue {
58    let mut count = 0;
59    let len_sum = word_subs.len() + possible_subs.len();
60    for sub in word_subs {
61        if possible_subs.contains(sub) {
62            count += 1;
63        }
64    }
65    let score = (count as f32) / (len_sum as f32);
66    return ScoreValue {
67               word: possible,
68               score: score,
69           };
70}
71
72fn max_score(a: ScoreValue, b: ScoreValue) -> ScoreValue {
73    if a.score <= b.score {
74        return b;
75    }
76    return a;
77}
78
79impl ClosestMatch {
80    /// The function ```new``` takes a dictionary of known words with type ```Vec<String>``` and the
81    /// different sizes of bag of words with type ```Vec<usize>```.
82    /// It returns a ClosestMatch object.
83    pub fn new(dictionary: Vec<String>, sizes: Vec<usize>) -> ClosestMatch {
84        let mut substrings: HashMap<String, HashSet<String>> = HashMap::new();
85        let splitwords: Vec<SplitWord> = dictionary
86            .par_iter()
87            .map(|possible| split_word(possible.to_lowercase(), &sizes))
88            .collect();
89        for splitword in splitwords {
90            substrings.insert(splitword.word, splitword.substrings);
91        }
92        return ClosestMatch {
93                   substrings: substrings,
94                   substring_sizes: sizes,
95               };
96    }
97
98    /// The function ```get_closest``` takes a word with type ```String``` and
99    /// returns the closest word in the dictionary of known words.
100    pub fn get_closest(&self, word: String) -> Option<String> {
101        let word_subs = split_word(word, &self.substring_sizes).substrings;
102        let best = self.substrings
103            .par_iter()
104            .map(|(possible, possible_subs)| {
105                     evaluate(&word_subs, possible.to_lowercase(), possible_subs)
106                 })
107            .reduce_with(|a, b| max_score(a, b));
108        match best {
109            Some(expr) => Some(expr.word),
110            None => None,
111        }
112    }
113}
114
115#[cfg(test)]
116mod tests {
117    use ClosestMatch;
118
119    #[test]
120    fn it_works() {
121        let cm = ClosestMatch::new(["hello".to_string(),
122                                    "bullo".to_string(),
123                                    "hello world".to_string()]
124                                           .to_vec(),
125                                   [1, 2, 3].to_vec());
126        let closest = cm.get_closest("hlo".to_string());
127        println!("{:?}", closest);
128    }
129}