ultra_nlp/
ngrams.rs

/// Iterator over the overlapping character n-grams of a string slice.
///
/// Items are sub-slices of the original text, so iterating never allocates.
#[derive(Debug, Clone)]
pub struct NgramsIterator<'a> {
    /// Number of chars (Unicode scalar values) in each gram.
    n: usize,
    /// The text the grams are sliced from.
    text: &'a str,
    /// Byte offset into `text` at which the next gram starts.
    index: usize,
}
6
/// Returns an iterator over the overlapping `n`-char grams of `text`.
///
/// Grams are counted in chars (Unicode scalar values), not bytes, and each
/// item is a borrowed sub-slice of `text`. Consecutive grams start one char
/// apart, so `ngrams("foo", 2)` yields `"fo"` and then `"oo"`.
///
/// An empty `text` yields nothing. A non-empty `text` with fewer than `n`
/// chars is yielded once as a single, shorter item.
pub fn ngrams(text: &str, n: usize) -> impl Iterator<Item = &str> {
    // All of the windowing logic lives in `NgramsIterator`'s `Iterator` impl.
    NgramsIterator::new(text, n)
}
10
11impl<'a> NgramsIterator<'a> {
12    fn new(text: &str, n: usize) -> NgramsIterator {
13        NgramsIterator {
14            text,
15            n,
16            index: 0,
17        }
18    }
19}
20
21impl<'a> Iterator for NgramsIterator<'a> {
22    type Item = &'a str;
23
24    fn next(&mut self) -> Option<Self::Item> {
25        let start_index: usize = self.index;
26
27        if start_index < self.text.len() {
28            let mut end_index: usize = start_index + 1;
29            let mut first_char_end_index: Option<usize> = None;
30            let mut chars: usize = 0;
31
32            while end_index < self.text.len() {
33                if self.text.is_char_boundary(end_index) {
34                    chars += 1;
35
36                    if first_char_end_index.is_none() {
37                        first_char_end_index = Some(end_index);
38                    }
39                }
40
41                if chars == self.n {
42                    self.index = first_char_end_index.unwrap();
43
44                    return Some(&self.text[start_index..end_index])
45                } else {
46                    end_index += 1;
47                }
48            }
49
50            self.index = end_index;
51
52            Some(&self.text[start_index..end_index])
53        } else {
54            None
55        }
56    }
57}
58
#[cfg(test)]
mod tests {
    mod ngrams {
        use crate::ngrams::ngrams;

        /// Collects every gram of `text` so a test can compare against a
        /// literal list in a single assertion.
        fn collect_grams(text: &str, n: usize) -> Vec<&str> {
            ngrams(text, n).collect()
        }

        #[test]
        fn test_english_chars_1_gram() {
            assert_eq!(collect_grams("foo", 1), vec!["f", "o", "o"]);
        }

        #[test]
        fn test_english_chars_2_gram() {
            assert_eq!(collect_grams("foo", 2), vec!["fo", "oo"]);
        }

        #[test]
        fn test_chinese_chars_1_gram() {
            assert_eq!(
                collect_grams("你好世界", 1),
                vec!["你", "好", "世", "界"]
            );
        }

        #[test]
        fn test_chinese_chars_2_gram() {
            assert_eq!(
                collect_grams("你好世界", 2),
                vec!["你好", "好世", "世界"]
            );
        }

        #[test]
        fn test_emoji_chars_1_gram() {
            assert_eq!(
                collect_grams("🌱🌿🌲🌳", 1),
                vec!["🌱", "🌿", "🌲", "🌳"]
            );
        }

        #[test]
        fn test_emoji_chars_2_gram() {
            assert_eq!(
                collect_grams("🌱🌿🌲🌳", 2),
                vec!["🌱🌿", "🌿🌲", "🌲🌳"]
            );
        }

        #[test]
        fn test_mix_chars_1_gram() {
            assert_eq!(
                collect_grams("f🌱你o", 1),
                vec!["f", "🌱", "你", "o"]
            );
        }

        #[test]
        fn test_mix_chars_2_gram() {
            assert_eq!(
                collect_grams("f🌱你o", 2),
                vec!["f🌱", "🌱你", "你o"]
            );
        }

        #[test]
        fn test_empty_text_yields_nothing() {
            assert!(collect_grams("", 2).is_empty());
        }

        #[test]
        fn test_text_shorter_than_n_yields_whole_text() {
            // Fewer chars than the gram size: the text comes back once, as is.
            assert_eq!(collect_grams("fo", 3), vec!["fo"]);
        }
    }
}