1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
use rust_stemmers::{Algorithm, Stemmer};
/// Splits `text` into lowercase, punctuation-stripped, English-stemmed tokens.
///
/// The punctuation characters `( ) , " . ; : ' ! ?` are removed, the text is
/// lowercased and split on whitespace, and each word is reduced with the
/// Snowball English stemmer. Returns one owned token per input word.
pub fn text_tokens(text: &str) -> Vec<String> {
    let en_stemmer = Stemmer::create(Algorithm::English);
    // Strip punctuation first so "garden." and "garden" tokenize identically.
    let text_clean = text.replace(&['(', ')', ',', '\"', '.', ';', ':', '\'', '!', '?'][..], "");
    text_clean
        .to_lowercase()
        .split_whitespace()
        .map(|word| en_stemmer.stem(word).to_string())
        .collect()
}
/// Builds space-joined bigrams over the stemmed tokens of `text`.
///
/// The token stream is implicitly prefixed with the sentinel `"!NewDoc"`, so
/// the first bigram pairs the sentinel with the first stem. Punctuation is
/// stripped and the text lowercased before stemming, the same normalization
/// used by `text_tokens`. Returns one bigram per input word (empty input
/// yields an empty vector).
pub fn text_token_bigrams(text: &str) -> Vec<String> {
    let en_stemmer = Stemmer::create(Algorithm::English);
    let text_clean = text.replace(&['(', ')', ',', '\"', '.', ';', ':', '\'', '!', '?'][..], "");
    let text_lower = text_clean.to_lowercase();

    // Sentinel marks the document start; it participates only in the first bigram.
    let mut stems = vec![String::from("!NewDoc")];
    stems.extend(
        text_lower
            .split_whitespace()
            .map(|word| en_stemmer.stem(word).to_string()),
    );

    // Pair each adjacent (previous, current) stem as "previous current".
    stems
        .windows(2)
        .map(|pair| format!("{} {}", pair[0], pair[1]))
        .collect()
}
#[test]
fn demo() {
    let sentence = String::from("Today I walked slowly to the garden in San Diego.");
    let tokenized_bigrams = text_token_bigrams(&sentence);
    // One bigram per word: the sentinel pairs with the first word, and each
    // subsequent word pairs with its predecessor (10 words in the sentence).
    assert_eq!(tokenized_bigrams.len(), 10);
    // The first bigram starts at the document-boundary sentinel.
    assert!(tokenized_bigrams[0].starts_with("!NewDoc "));
    // Lowercasing + stemming: "I walked" -> "i walk".
    assert_eq!(tokenized_bigrams[2], "i walk");
    for bigram in &tokenized_bigrams {
        println!("bigram= {}", bigram);
    }
}