1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/// Iterator over the overlapping character n-grams of a text.
///
/// The text is decoded into `char`s once, at construction, so every window is
/// exactly `n` Unicode scalar values wide and never splits a multi-byte
/// character. Each call to `next` yields the window that starts at the current
/// position as an owned `String`, then moves the start forward by one
/// character.
#[derive(Debug, Clone)]
pub struct NgramsIterator {
    /// Window width, counted in characters.
    n: usize,
    /// The input text decoded into Unicode scalar values.
    chars: Vec<char>,
    /// Index into `chars` where the next window starts.
    char_index: usize,
}

/// Returns an iterator over every `n`-character window of `text`, in order of
/// starting position.
///
/// For a text of `len` characters the iterator yields `len - n + 1` strings
/// when `n <= len`, and nothing when `n > len`. As a consequence of that same
/// formula, `n == 0` yields `len + 1` empty strings (one per position,
/// including the position just past the last character).
pub fn ngrams(text: &str, n: usize) -> impl Iterator<Item = String> {
    NgramsIterator::new(text, n)
}

impl NgramsIterator {
    /// Builds an iterator positioned at the first character of `text`.
    fn new(text: &str, n: usize) -> NgramsIterator {
        NgramsIterator {
            n,
            chars: text.chars().collect(),
            char_index: 0,
        }
    }

    /// Number of windows that have not been yielded yet.
    fn remaining(&self) -> usize {
        // `len - char_index - n + 1` windows still fit; if either subtraction
        // would underflow, no full window fits any more.
        self.chars
            .len()
            .checked_sub(self.char_index)
            .and_then(|rest| rest.checked_sub(self.n))
            .map_or(0, |slack| slack + 1)
    }
}

impl Iterator for NgramsIterator {
    type Item = String;

    /// Yields the window starting at the current position, or `None` once a
    /// full `n`-character window no longer fits in the remaining text.
    fn next(&mut self) -> Option<Self::Item> {
        let end = self.char_index.checked_add(self.n)?;
        // Checked slicing returns `None` exactly when `end` runs past the
        // text, which is the end-of-iteration condition.
        let window = self.chars.get(self.char_index..end)?;
        let gram = window.iter().collect::<String>();
        self.char_index += 1;
        Some(gram)
    }

    /// Reports the exact number of windows left, letting consumers such as
    /// `collect` allocate once.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let left = self.remaining();
        (left, Some(left))
    }
}

// `size_hint` is exact, so `len()` is available to callers holding the
// concrete type.
impl ExactSizeIterator for NgramsIterator {}

// After the first `None` the position is never advanced again, so every later
// call keeps returning `None`.
impl std::iter::FusedIterator for NgramsIterator {}
#[cfg(test)]
mod tests {
    /// Tests for the character n-gram iterator returned by `ngrams`.
    mod ngrams {
        use crate::ngrams::ngrams;

        /// Runs `ngrams` to completion and gathers every yielded window.
        fn grams(text: &str, size: usize) -> Vec<String> {
            // `ngrams` already returns an iterator, so it is collected directly.
            ngrams(text, size).collect()
        }

        /// Width 1 yields each character on its own, in order.
        #[test]
        fn test_1_gram() {
            assert_eq!(grams("你好世界", 1), vec!["你", "好", "世", "界"]);
        }

        /// Width 2 yields overlapping pairs that advance one character at a
        /// time; multi-byte characters are never split.
        #[test]
        fn test_2_gram() {
            assert_eq!(grams("你好世界", 2), vec!["你好", "好世", "世界"]);
        }

        /// A window as wide as the text yields the text itself exactly once.
        #[test]
        fn test_size_equal_to_text_length() {
            assert_eq!(grams("你好世界", 4), vec!["你好世界"]);
        }

        /// A window wider than the text yields nothing.
        #[test]
        fn test_size_longer_than_text() {
            assert!(grams("你好", 3).is_empty());
        }

        /// Empty text has no windows of positive width.
        #[test]
        fn test_empty_text() {
            assert!(grams("", 1).is_empty());
        }
    }
}