1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
use std::str::pattern::Pattern;
/// Punctuation characters that, like whitespace, end the current token in
/// `tokenize_overlapping_ngrams`.
const SEPARATORS: &'static str = ",.;:!?";
/// Splits `text` on runs of Unicode whitespace and returns every non-empty
/// piece as an owned `String`, in order of appearance.
///
/// Leading and trailing whitespace produce no tokens, so an empty or
/// all-whitespace input yields an empty vector.
pub fn tokenize_whitespace(text: String) -> Vec<String> {
    text.split_whitespace().map(str::to_owned).collect()
}
pub fn tokenize_overlapping_ngrams(text: String, n: usize) -> Vec<String> {
let mut ngrams: Vec<String> = Vec::new();
let mut curr_token = String::new();
let mut curr_ngram: Vec<String> = Vec::with_capacity(n);
for ch in text.chars() {
if ch.is_alphanumeric() {
curr_token.push(ch);
} else if ch.is_whitespace() || ch.is_contained_in(SEPARATORS) {
if !curr_token.is_empty(){
curr_ngram.push(curr_token);
curr_token = String::new();
}
};
if n == curr_ngram.len() {
let ngram = curr_ngram.join(" ");
ngrams.push(ngram);
curr_ngram = curr_ngram.split_off(1);
}
}
if !curr_token.is_empty() {
curr_ngram.push(curr_token);
}
if !curr_ngram.is_empty(){
ngrams.push(curr_ngram.join(" "))
}
ngrams
}