1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/// Iterator over the overlapping character n-grams of a string slice.
///
/// Windows are measured in `char`s (Unicode scalar values), never in bytes, so
/// multi-byte text is always split on character boundaries. Every yielded item
/// borrows from the original text, and consecutive windows start one character
/// apart.
#[derive(Debug, Clone)]
pub struct NgramsIterator<'a> {
    /// Window width, in characters.
    n: usize,
    /// The text being scanned.
    text: &'a str,
    /// Byte offset into `text` at which the next window starts. It is only ever
    /// advanced by whole characters, so it always sits on a char boundary.
    index: usize,
}

/// Returns an iterator over the character n-grams of `text`.
///
/// * Each item is a sub-slice of `text` that is `n` characters wide; successive
///   items are shifted by one character.
/// * If `text` is non-empty but holds fewer than `n` characters, the whole text
///   is yielded once as a single, shorter item.
/// * An empty `text` yields nothing.
/// * `n == 0` yields nothing, because a zero-width window contains no characters.
pub fn ngrams(text: &str, n: usize) -> impl Iterator<Item = &str> {
    NgramsIterator::new(text, n)
}

impl<'a> NgramsIterator<'a> {
    /// Creates an iterator positioned at the start of `text` that produces
    /// windows of `n` characters.
    fn new(text: &'a str, n: usize) -> Self {
        Self { n, text, index: 0 }
    }
}

impl<'a> Iterator for NgramsIterator<'a> {
    type Item = &'a str;

    /// Yields the window starting at the current position, then moves the
    /// position forward by one character (or to the end of the text once the
    /// final window has been produced).
    fn next(&mut self) -> Option<Self::Item> {
        // A zero-width window has no sensible content; bail out before any
        // slicing so that this case can never hit a panic path.
        if self.n == 0 {
            return None;
        }

        // Copy the `&'a str` out of `self` so the returned slice carries the
        // text's lifetime rather than the lifetime of the `&mut self` borrow.
        let text = self.text;
        let rest = text.get(self.index..)?;

        // `None` here means `rest` is empty, i.e. the iterator is exhausted.
        let first_char_len = rest.chars().next()?.len_utf8();

        // The character at position `n` (0-based) starts exactly where an
        // n-character window ends.
        match rest.char_indices().nth(self.n) {
            Some((window_end, _)) => {
                // More text follows this window: slide forward by one character.
                self.index += first_char_len;
                Some(&rest[..window_end])
            }
            None => {
                // `rest` holds at most `n` characters, so this is the last
                // window. Jump to the end so the next call returns `None`.
                self.index = text.len();
                Some(rest)
            }
        }
    }
}

// Once `next` has returned `None` the state no longer changes (`n` is fixed and
// `index` stays at the end of the text), so it keeps returning `None`.
impl<'a> std::iter::FusedIterator for NgramsIterator<'a> {}
#[cfg(test)]
mod tests {
    /// Behavioural checks for the `ngrams` entry point.
    mod ngrams {
        use crate::ngrams::ngrams;

        /// Collects every n-gram of `text` so expectations can be written as
        /// plain vectors.
        fn grams(text: &str, size: usize) -> Vec<&str> {
            ngrams(text, size).collect()
        }

        #[test]
        fn test_english_chars_1_gram() {
            assert_eq!(grams("foo", 1), vec!["f", "o", "o"]);
        }

        #[test]
        fn test_english_chars_2_gram() {
            assert_eq!(grams("foo", 2), vec!["fo", "oo"]);
        }

        #[test]
        fn test_chinese_chars_1_gram() {
            assert_eq!(grams("你好世界", 1), vec!["你", "好", "世", "界"]);
        }

        #[test]
        fn test_chinese_chars_2_gram() {
            assert_eq!(grams("你好世界", 2), vec!["你好", "好世", "世界"]);
        }

        // Windows must be cut on character boundaries even when one-byte and
        // multi-byte characters are interleaved.
        #[test]
        fn test_mixed_width_chars_2_gram() {
            assert_eq!(grams("a你b", 2), vec!["a你", "你b"]);
        }

        #[test]
        fn test_empty_text_yields_nothing() {
            assert!(grams("", 2).is_empty());
        }

        // Text with fewer characters than the window width comes back once,
        // as a single shorter item.
        #[test]
        fn test_text_shorter_than_n_yields_whole_text() {
            assert_eq!(grams("你", 2), vec!["你"]);
        }
    }
}