1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
//! Closest-match lookup over a dictionary of words.
//!
//! Words are broken into substrings of configurable lengths. A query is
//! matched to the dictionary word whose substring set overlaps its own the
//! most, relative to the combined size of the two sets.
// Crate-wide lints: every lint listed below is promoted to a hard error.
#![deny(missing_docs,
missing_debug_implementations, missing_copy_implementations,
trivial_casts, trivial_numeric_casts,
unsafe_code,
unstable_features,
unused_import_braces, unused_qualifications)]
extern crate rayon;
use std::collections::HashMap;
use std::collections::HashSet;
use rayon::prelude::*;
/// A dictionary of words indexed by their substrings, ready for
/// closest-match queries.
///
/// Built with `ClosestMatch::new` and queried with
/// `ClosestMatch::get_closest`.
#[derive(Debug)]
pub struct ClosestMatch {
/// Maps each lowercased dictionary word to the set of its substrings.
substrings: HashMap<String, HashSet<String>>,
/// Substring lengths used both for the dictionary and for query words.
substring_sizes: Vec<usize>,
}
/// A word paired with the set of substrings that `split_word` extracted
/// from it.
#[derive(Debug)]
struct SplitWord {
/// The word exactly as it was handed to `split_word`.
word: String,
/// The distinct, lowercased substrings of `word`.
substrings: HashSet<String>,
}
/// A candidate word together with the similarity score that `evaluate`
/// assigned to it.
#[derive(Debug)]
struct ScoreValue {
/// The candidate word the score belongs to.
word: String,
/// Similarity to the query word; `max_score` prefers the larger value.
score: f32,
}
/// Splits `word` into the set of its lowercased substrings.
///
/// The word is lowercased once and then cut into every contiguous run of
/// `size` characters, for each `size` in `sizes`. Lengths are counted in
/// characters (Unicode scalar values), not bytes, so a word containing
/// multi-byte UTF-8 characters is split cleanly instead of panicking on a
/// slice that ends inside a character.
///
/// A `size` larger than the word contributes nothing. A `size` of `0`
/// contributes the empty string.
///
/// The returned `SplitWord` carries `word` unchanged next to the substrings.
fn split_word(word: String, sizes: &[usize]) -> SplitWord {
    // Lowercase up front so every window is cut from the same text.
    let chars: Vec<char> = word.to_lowercase().chars().collect();
    let mut substrings: HashSet<String> = HashSet::new();
    for &size in sizes {
        if size == 0 {
            // `windows(0)` panics; the only zero-length substring is "".
            substrings.insert(String::new());
            continue;
        }
        // `windows` yields nothing when `size` exceeds the character count.
        substrings.extend(chars
                              .windows(size)
                              .map(|window| window.iter().cloned().collect::<String>()));
    }
    SplitWord {
        word: word,
        substrings: substrings,
    }
}
/// Scores how similar the candidate word `possible` is to the query word.
///
/// `word_subs` holds the query's substrings and `possible_subs` holds the
/// candidate's. The score is the number of substrings present in both sets
/// divided by the sum of the two set sizes, so larger means more similar.
///
/// When both sets are empty there is nothing to compare and the score is
/// `0.0`, rather than the NaN that dividing zero by zero would produce.
///
/// Returns a `ScoreValue` that takes ownership of `possible`.
fn evaluate(word_subs: &HashSet<String>,
            possible: String,
            possible_subs: &HashSet<String>)
            -> ScoreValue {
    let len_sum = word_subs.len() + possible_subs.len();
    let score = if len_sum == 0 {
        // Avoid 0/0: a NaN score would make later comparisons meaningless.
        0.0
    } else {
        let shared = word_subs.intersection(possible_subs).count();
        (shared as f32) / (len_sum as f32)
    };
    ScoreValue {
        word: possible,
        score: score,
    }
}
/// Picks the better of two scored candidates.
///
/// `b` is returned whenever `a.score <= b.score` holds, so `b` wins ties.
/// In every other case `a` is returned, including when the comparison is
/// false because one of the scores is NaN.
fn max_score(a: ScoreValue, b: ScoreValue) -> ScoreValue {
    if a.score <= b.score { b } else { a }
}
impl ClosestMatch {
pub fn new(dictionary: Vec<String>, sizes: Vec<usize>) -> ClosestMatch {
let mut substrings: HashMap<String, HashSet<String>> = HashMap::new();
let splitwords: Vec<SplitWord> = dictionary
.par_iter()
.map(|possible| split_word(possible.to_lowercase(), &sizes))
.collect();
for splitword in splitwords {
substrings.insert(splitword.word, splitword.substrings);
}
return ClosestMatch {
substrings: substrings,
substring_sizes: sizes,
};
}
pub fn get_closest(&self, word: String) -> Option<String> {
let word_subs = split_word(word, &self.substring_sizes).substrings;
let best = self.substrings
.par_iter()
.map(|(possible, possible_subs)| {
evaluate(&word_subs, possible.to_lowercase(), possible_subs)
})
.reduce_with(|a, b| max_score(a, b));
match best {
Some(expr) => Some(expr.word),
None => None,
}
}
}
#[cfg(test)]
mod tests {
    use ClosestMatch;

    /// The closest match to "hlo" must be "hello".
    ///
    /// With sizes 1, 2 and 3, "hlo" has 6 substrings. "hello" shares 4 of
    /// them out of 6 + 11, "bullo" shares 3 out of 6 + 11, and "hello world"
    /// shares 4 out of 6 + 27, so "hello" has the highest score.
    #[test]
    fn it_works() {
        let cm = ClosestMatch::new(["hello".to_string(),
                                    "bullo".to_string(),
                                    "hello world".to_string()]
                                       .to_vec(),
                                   [1, 2, 3].to_vec());
        let closest = cm.get_closest("hlo".to_string());
        assert_eq!(closest, Some("hello".to_string()));
    }

    /// An empty dictionary has no candidates, so there is no match.
    #[test]
    fn empty_dictionary_has_no_match() {
        let cm = ClosestMatch::new(Vec::new(), [1, 2, 3].to_vec());
        assert_eq!(cm.get_closest("hlo".to_string()), None);
    }
}