1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
use tokenize::tokenize;
/// Configuration for extracting n-grams from a piece of text.
///
/// Borrows the input text (and optional pad token) for lifetime `'a`;
/// the n-grams produced by `calculate` are slices into that same text.
struct NGram<'a> {
    // The raw text to tokenize and split into n-grams.
    text: &'a str,
    // Window size: number of tokens per n-gram.
    n: usize,
    // Padding token prepended/appended to produce edge n-grams.
    // An empty string disables padding entirely (see `calculate`).
    pad: &'a str
}
impl<'a> NGram<'a> {
    /// Computes all n-grams of `self.text`.
    ///
    /// Tokenizes the text, then emits every contiguous window of `n` tokens,
    /// in order. When `self.pad` is non-empty, it additionally emits the
    /// "edge" n-grams: leading windows padded on the left
    /// (`[pad, pad, t0]`, `[pad, t0, t1]`, ...) before the full windows, and
    /// trailing windows padded on the right after them.
    ///
    /// Returns an empty set of full windows (and only the edge n-grams that
    /// the available tokens can fill) when the text has fewer than `n`
    /// tokens — the original implementation panicked on such input via
    /// unchecked `usize` subtraction and out-of-bounds slicing.
    fn calculate(&self) -> Vec<Vec<&'a str>> {
        let tokens = tokenize(self.text);
        let token_count = tokens.len();
        // Number of complete n-token windows; saturates to 0 instead of
        // underflowing when token_count < n.
        let count = (token_count + 1).saturating_sub(self.n);
        // Full windows plus up to 2*(n-1) padded edge windows.
        let padded = if self.pad.is_empty() { 0 } else { 2 * self.n.saturating_sub(1) };
        let mut ngram_result = Vec::with_capacity(count + padded);

        if !self.pad.is_empty() {
            // Leading edge: (n-1) pads + 1 token, then (n-2) pads + 2 tokens, ...
            for num_tokens in 1..self.n {
                if num_tokens > token_count {
                    // Not enough tokens to fill this window (was a panic).
                    continue;
                }
                let num_blanks = self.n - num_tokens;
                let mut this_sequence = Vec::with_capacity(self.n);
                this_sequence.resize(num_blanks, self.pad);
                this_sequence.extend_from_slice(&tokens[..num_tokens]);
                ngram_result.push(this_sequence);
            }
        }

        // All full, unpadded windows in order.
        for i in 0..count {
            ngram_result.push(tokens[i..i + self.n].to_vec());
        }

        if !self.pad.is_empty() {
            // Trailing edge: last (n-1) tokens + 1 pad, ..., last token + (n-1) pads.
            for num_blanks in 1..self.n {
                let num_tokens = self.n - num_blanks;
                if num_tokens > token_count {
                    // Not enough tokens to fill this window (was a panic).
                    continue;
                }
                let mut this_sequence = Vec::with_capacity(self.n);
                this_sequence.extend_from_slice(&tokens[token_count - num_tokens..]);
                // Append the pad tokens to reach the full window size.
                this_sequence.resize(self.n, self.pad);
                ngram_result.push(this_sequence);
            }
        }
        ngram_result
    }
}
/// Returns all n-grams of `this_text` with window size `this_n`,
/// without any edge padding.
pub fn get_ngram<'a>(this_text: &'a str, this_n: usize) -> Vec<Vec<&'a str>> {
    NGram { text: this_text, n: this_n, pad: "" }.calculate()
}
/// Returns all n-grams of `this_text` with window size `this_n`,
/// including leading/trailing edge n-grams filled with `this_padding`.
pub fn get_ngram_with_padding<'a>(this_text: &'a str, this_n: usize, this_padding: &'a str) -> Vec<Vec<&'a str>> {
    NGram {
        text: this_text,
        n: this_n,
        pad: this_padding,
    }
    .calculate()
}