/// Iterator over the overlapping character n-grams of a string slice.
///
/// Gram length is counted in `char`s (Unicode scalar values), not bytes, and
/// every gram is a sub-slice borrowed from the original text.
#[derive(Debug, Clone)]
pub struct NgramsIterator<'a> {
    /// Number of characters in each gram.
    n: usize,
    /// Text the grams are sliced from.
    text: &'a str,
    /// Byte offset into `text` at which the next gram starts.
    ///
    /// Starts at 0 and is only ever advanced by whole characters (or set to
    /// `text.len()`), so it always lies on a char boundary.
    index: usize,
}

/// Returns an iterator over the overlapping `n`-character grams of `text`.
///
/// Consecutive grams start one character apart, so `"foo"` with `n = 2`
/// yields `"fo"` and then `"oo"`.
///
/// Edge cases:
/// * a non-empty `text` with fewer than `n` characters is yielded once, whole;
/// * an empty `text` yields nothing;
/// * `n == 0` yields nothing.
pub fn ngrams(text: &str, n: usize) -> impl Iterator<Item = &str> {
    NgramsIterator::new(text, n)
}

impl<'a> NgramsIterator<'a> {
    /// Creates an iterator positioned at the start of `text` that produces
    /// grams of `n` characters.
    fn new(text: &'a str, n: usize) -> Self {
        NgramsIterator { n, text, index: 0 }
    }
}

impl<'a> Iterator for NgramsIterator<'a> {
    type Item = &'a str;

    /// Yields the gram starting at the current position and then advances the
    /// position by exactly one character.
    ///
    /// When the remaining text holds `n` characters or fewer, it is yielded as
    /// the final gram and the iterator is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // A gram of zero characters is not meaningful. Bail out early instead
        // of trying to slice in the middle of a multi-byte character.
        if self.n == 0 {
            return None;
        }

        // Copy the `&'a str` out of `self` so the returned slices borrow from
        // the text for `'a` rather than from this `&mut self` borrow.
        let text: &'a str = self.text;
        let rest = text.get(self.index..)?;

        let mut chars = rest.char_indices();
        // No first character means the text is exhausted.
        let (_, first) = chars.next()?;

        // Having consumed the first character, skipping `n - 1` more lands on
        // the character right after the gram; its byte offset within `rest`
        // is therefore the gram's byte length.
        match chars.nth(self.n - 1) {
            Some((gram_len, _)) => {
                // More text follows: slide the window forward by one character.
                self.index += first.len_utf8();
                Some(&rest[..gram_len])
            }
            None => {
                // At most `n` characters remain: emit them all and finish.
                self.index = text.len();
                Some(rest)
            }
        }
    }
}
58
#[cfg(test)]
mod tests {
    mod ngrams {
        use crate::ngrams::ngrams;

        /// Collects every gram of `text` for gram size `n` so a test can
        /// compare the whole sequence at once.
        fn grams(text: &str, n: usize) -> Vec<&str> {
            ngrams(text, n).collect()
        }

        #[test]
        fn test_english_chars_1_gram() {
            assert_eq!(grams("foo", 1), vec!["f", "o", "o"]);
        }

        #[test]
        fn test_english_chars_2_gram() {
            assert_eq!(grams("foo", 2), vec!["fo", "oo"]);
        }

        #[test]
        fn test_chinese_chars_1_gram() {
            assert_eq!(grams("你好世界", 1), vec!["你", "好", "世", "界"]);
        }

        #[test]
        fn test_chinese_chars_2_gram() {
            assert_eq!(grams("你好世界", 2), vec!["你好", "好世", "世界"]);
        }

        #[test]
        fn test_emoji_chars_1_gram() {
            assert_eq!(grams("🌱🌿🌲🌳", 1), vec!["🌱", "🌿", "🌲", "🌳"]);
        }

        #[test]
        fn test_emoji_chars_2_gram() {
            assert_eq!(grams("🌱🌿🌲🌳", 2), vec!["🌱🌿", "🌿🌲", "🌲🌳"]);
        }

        #[test]
        fn test_mix_chars_1_gram() {
            assert_eq!(grams("f🌱你o", 1), vec!["f", "🌱", "你", "o"]);
        }

        #[test]
        fn test_mix_chars_2_gram() {
            assert_eq!(grams("f🌱你o", 2), vec!["f🌱", "🌱你", "你o"]);
        }

        // Edge cases: nothing to slice, and a gram size larger than the text.

        #[test]
        fn test_empty_text_yields_nothing() {
            assert!(grams("", 1).is_empty());
            assert!(grams("", 2).is_empty());
        }

        #[test]
        fn test_text_shorter_than_gram_size() {
            // A text with fewer characters than the gram size comes back
            // once, as a single shortened gram.
            assert_eq!(grams("fo", 3), vec!["fo"]);
            assert_eq!(grams("你", 2), vec!["你"]);
        }
    }
}