1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
use std::collections::HashMap;
/// Ranks the trigrams of `text` by how often they occur.
///
/// Trigrams are counted with `count`, ordered from most to least frequent,
/// and at most the first 600 of them are kept. The returned map associates
/// each kept trigram with its zero-based rank (`0` = most frequent).
///
/// Trigrams with the same number of occurrences are ordered
/// lexicographically, so the result is the same on every run for the same
/// input.
pub fn get_trigrams_with_positions(text: &String) -> HashMap<String, u32> {
    // Only this many of the most frequent trigrams receive a position.
    const MAX_TRIGRAMS: usize = 600;

    // Take ownership of the counted trigrams so the keys can be moved into
    // the result without cloning.
    let mut ranked: Vec<(String, u32)> = count(text).into_iter().collect();

    // Higher count first. `HashMap` iteration order is arbitrary, so the
    // trigram itself is used as a tie-breaker; without it, equal counts would
    // be ranked differently from run to run.
    ranked.sort_by(|a, b| (b.1, &a.0).cmp(&(a.1, &b.0)));

    ranked
        .into_iter()
        .take(MAX_TRIGRAMS)
        .enumerate()
        // `rank` is below MAX_TRIGRAMS, so the cast cannot truncate.
        .map(|(rank, (trigram, _))| (trigram, rank as u32))
        .collect()
}
#[inline(always)]
fn count(text : &String) -> HashMap<String, u32> {
let mut s = text.to_lowercase();
s.push(' ');
let mut counter_hash : HashMap<String, u32> = HashMap::new();
let mut chars_iter = s.chars();
let mut c1 = ' ';
let mut c2 = to_trigram_char(chars_iter.next().unwrap());
for cur_char in chars_iter {
let c3 = to_trigram_char(cur_char);
if !((c1 == ' ' && c2 == ' ') || (c2 == ' ' && c3 == ' ')) {
let mut trigram = String::with_capacity(3);
trigram.push(c1);
trigram.push(c2);
trigram.push(c3);
let count = counter_hash.entry(trigram).or_insert(0);
*count += 1;
}
c1 = c2;
c2 = c3;
}
counter_hash
}
/// Normalizes a char for trigram building.
///
/// Chars in the ranges U+0000..=U+0040, U+005B..=U+0060 and U+007B..=U+007E
/// (ASCII control chars, space, digits and punctuation) are replaced by a
/// space. Every other char, including ASCII letters and all non-ASCII chars,
/// is returned unchanged.
#[inline(always)]
fn to_trigram_char(ch: char) -> char {
    match ch {
        // `..=` is the supported inclusive-range pattern syntax; the older
        // `...` form is deprecated and rejected by newer editions.
        '\u{0000}'..='\u{0040}' | '\u{005B}'..='\u{0060}' | '\u{007B}'..='\u{007E}' => ' ',
        _ => ch,
    }
}
#[cfg(test)]
mod tests {
    use super::count;
    use super::get_trigrams_with_positions;
    use super::to_trigram_char;

    /// Asserts that `to_trigram_char` returns each of `chars` unchanged.
    fn assert_valuable_trigram_chars(chars: &[char]) {
        for &ch in chars {
            assert_eq!(to_trigram_char(ch), ch);
        }
    }

    /// Asserts that `to_trigram_char` maps each of `chars` to a space.
    fn assert_not_valuable_trigram_chars(chars: &[char]) {
        for &ch in chars {
            assert_eq!(to_trigram_char(ch), ' ');
        }
    }

    #[test]
    fn test_to_trigram_char() {
        assert_valuable_trigram_chars(&['a', 'z', 'A', 'Z', 'Ж', 'ß']);
        // U+0000..=U+0040: whitespace, punctuation, digits, '@'.
        assert_not_valuable_trigram_chars(&['\t', '\n', ' ', '.', '0', '9', ',', '@']);
        // U+005B..=U+0060: the chars between 'Z' and 'a'.
        assert_not_valuable_trigram_chars(&['[', '\\', ']', '^', '_', '`']);
        // U+007B..=U+007E: the chars after 'z'.
        assert_not_valuable_trigram_chars(&['{', '|', '}', '~']);
    }

    /// Asserts that counting `text` yields exactly the trigram counts in `pairs`.
    fn assert_count(text: &str, pairs: &[(&str, u32)]) {
        let result = count(&text.to_string());
        for &(k, v) in pairs {
            let &actual_val = result.get(k).unwrap_or(&0);
            assert_eq!(actual_val, v, "trigram '{}' expected to occur {} times, got {}", k, v, actual_val);
        }
        // No trigrams other than the expected ones may be present.
        assert_eq!(result.len(), pairs.len());
    }

    #[test]
    fn test_count() {
        assert_count("", &[]);
        assert_count(",", &[]);
        assert_count("a", &[(" a ", 1)]);
        assert_count("-a-", &[(" a ", 1)]);
        assert_count("yes", &[(" ye", 1), ("yes", 1), ("es ", 1)]);
        assert_count(
            "Give - IT...",
            &[(" gi", 1), ("giv", 1), ("ive", 1), ("ve ", 1), (" it", 1), ("it ", 1)],
        );
    }

    #[test]
    fn test_get_trigrams_with_positions() {
        let res = get_trigrams_with_positions(&"xaaaaabbbbd".to_string());
        // "aaa" occurs three times and "bbb" twice; every other trigram once.
        assert_eq!(*res.get("aaa").unwrap(), 0);
        assert_eq!(*res.get("bbb").unwrap(), 1);
        // " xa", "xaa", "aaa", "aab", "abb", "bbb", "bbd", "bd ".
        assert_eq!(res.len(), 8);

        // Text without any trigram yields no positions.
        assert!(get_trigrams_with_positions(&"".to_string()).is_empty());
    }
}