1#![deny(missing_docs,
40 missing_debug_implementations, missing_copy_implementations,
41 trivial_casts, trivial_numeric_casts,
42 unsafe_code,
43 unstable_features,
44 unused_import_braces, unused_qualifications)]
45#![cfg_attr(feature = "dev", allow(unstable_features))]
46#![cfg_attr(feature = "dev", feature(plugin))]
47#![cfg_attr(feature = "dev", plugin(clippy))]
48#![cfg_attr(feature = "dev", deny(clippy))]
49
50use std::fmt;
51use std::collections::VecDeque;
52
/// Padding symbol handed out by the built-in `Pad` implementations:
/// U+2060 (WORD JOINER), a single zero-width character.
///
/// `'static` is implied for references in a `const`, so it is not spelled out.
const WORD_SEP: &str = "\u{2060}";
54
55pub trait Ngram<'a, T: 'a + Pad + fmt::Debug + Clone>: Iterator<Item=T> where Self: Sized {
71 #[allow(missing_docs)]
72 fn ngrams(self, usize) -> Ngrams<'a, T>;
73}
74
75impl<'a, T: 'a + Pad + fmt::Debug + Clone, U: 'a + Iterator<Item=T>> Ngram<'a, T> for U {
76 fn ngrams(self, n: usize) -> Ngrams<'a, T> {
77 Ngrams::new(self, n)
78 }
79}
80
81pub struct Ngrams<'a, T: 'a + Pad + fmt::Debug + Clone> {
83 source: Box<Iterator<Item = T> + 'a>,
84 num: usize,
85 memsize: usize,
86 memory: VecDeque<T>,
87 pad: bool,
88}
89
90impl<'a, T: 'a + Pad + fmt::Debug + Clone> fmt::Debug for Ngrams<'a, T> {
91 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
92 write!(f, "Ngrams(tokens, N)")
93 }
94}
95
96impl<'a, T: 'a + Pad + fmt::Debug + Clone + Sized> Ngrams<'a, T> {
97 pub fn new<V: 'a + Iterator<Item = T>>(source: V, n: usize) -> Ngrams<'a, T> {
100 let memsize = n - 1;
101 Ngrams {
102 source: Box::new(source),
103 num: n,
104 memsize: memsize,
105 memory: VecDeque::with_capacity(memsize),
106 pad: false,
107 }
108 }
109
110 pub fn pad(mut self) -> Self {
114 self.pad = true;
115 self.source = Box::new(Padded::new(self.source, self.num));
116 self
117 }
118
119 fn fill_memory(&mut self) {
120 while self.memory.len() < self.memsize {
121 let a = self.source.next().unwrap();
124 self.memory.push_back(a);
125 }
126 }
127}
128
129impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for Ngrams<'a, T> {
130 type Item = Vec<T>;
131
132 fn next(&mut self) -> Option<Self::Item> {
133 self.fill_memory();
134
135 self.source.next().map(|n| {
136 let mut result = Vec::with_capacity(self.num);
137
138 for elem in &self.memory {
139 result.push(elem.clone());
140 }
141
142 result.push(n.clone());
143
144 let _ = self.memory.pop_front();
145 self.memory.push_back(n.clone());
146
147 result
148 })
149 }
150}
151
/// Types that can serve as tokens of a padded n-gram sequence.
///
/// An implementor supplies the symbol that is inserted as padding and may
/// override how many such symbols are used for a given n-gram size.
pub trait Pad {
    /// Returns the value inserted as a padding symbol.
    fn symbol() -> Self;

    /// Returns how many padding symbols are emitted at each end of the token
    /// sequence for n-grams of size `n`.
    ///
    /// The default is `n - 1`, saturating at zero so that `len(0)` returns
    /// `0` instead of underflowing.
    fn len(n: usize) -> usize {
        n.saturating_sub(1)
    }
}
191
192impl<'a> Pad for &'a str {
193 fn symbol() -> Self {
194 WORD_SEP
195 }
196}
197
198impl Pad for String {
199 fn symbol() -> Self {
200 WORD_SEP.to_owned()
201 }
202}
203
204impl Pad for Vec<u8> {
205 fn symbol() -> Self {
206 WORD_SEP.to_owned().into()
207 }
208}
209
210impl Pad for char {
211 fn symbol() -> Self {
212 WORD_SEP.chars().next().unwrap()
213 }
214}
215
216struct Padded<'a, T: 'a + Pad + fmt::Debug + Clone> {
217 source: Box<Iterator<Item = T> + 'a>,
218 len: usize,
219 symbol: T,
220 remaining: usize,
221 end: bool,
222}
223
224impl<'a, T: 'a + Pad + fmt::Debug + Clone> Padded<'a, T> {
225 fn new<U: 'a + Iterator<Item = T> + Sized>(source: U, n: usize) -> Padded<'a, T> {
226 let l = T::len(n);
227 Padded {
228 source: Box::new(source),
229 len: l,
230 symbol: T::symbol(),
231 remaining: l,
232 end: false,
233 }
234 }
235}
236
237impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for Padded<'a, T> {
238 type Item = T;
239
240 fn next(&mut self) -> Option<Self::Item> {
241 if self.remaining > 0 {
242 self.remaining -= 1;
243 return Some(self.symbol.clone());
244 }
245
246 let result = self.source.next();
247
248 if result.is_none() {
249
250 if !self.end {
251 self.end = true;
254 self.remaining = self.len;
255 }
256
257 if self.remaining > 0 {
258 self.remaining -= 1;
259 return Some(self.symbol.clone());
260 }
261
262 }
263
264 result
265 }
266}
267
#[cfg(test)]
mod tests {

    use super::{Ngram, Ngrams};
    use std::string::ToString;

    /// Padding symbol (U+2060) expected at both ends of padded output.
    const P: &str = "\u{2060}";

    /// Padded word 4-grams, built through the `ngrams` adaptor method.
    #[test]
    fn test_words_iter_adaptor_padded() {
        let words = "one two three four five".split(' ');
        let grams = words.ngrams(4).pad().collect::<Vec<_>>();
        let expected = vec![
            vec![P, P, P, "one"],
            vec![P, P, "one", "two"],
            vec![P, "one", "two", "three"],
            vec!["one", "two", "three", "four"],
            vec!["two", "three", "four", "five"],
            vec!["three", "four", "five", P],
            vec!["four", "five", P, P],
            vec!["five", P, P, P],
        ];
        assert_eq!(grams, expected);
    }

    /// Padded word bigrams, built with `Ngrams::new`.
    #[test]
    fn test_words_padded() {
        let words = "one two three four".split(' ');
        let grams = Ngrams::new(words, 2).pad().collect::<Vec<_>>();
        let expected = vec![
            vec![P, "one"],
            vec!["one", "two"],
            vec!["two", "three"],
            vec!["three", "four"],
            vec!["four", P],
        ];
        assert_eq!(grams, expected);
    }

    /// Padded 4-grams over single-character `String` tokens.
    #[test]
    fn test_chars_padded() {
        let letters = "test string".chars().map(|c| c.to_string());
        let grams = Ngrams::new(letters, 4).pad().collect::<Vec<_>>();
        let expected = vec![
            vec![P, P, P, "t"],
            vec![P, P, "t", "e"],
            vec![P, "t", "e", "s"],
            vec!["t", "e", "s", "t"],
            vec!["e", "s", "t", " "],
            vec!["s", "t", " ", "s"],
            vec!["t", " ", "s", "t"],
            vec![" ", "s", "t", "r"],
            vec!["s", "t", "r", "i"],
            vec!["t", "r", "i", "n"],
            vec!["r", "i", "n", "g"],
            vec!["i", "n", "g", P],
            vec!["n", "g", P, P],
            vec!["g", P, P, P],
        ];
        assert_eq!(grams, expected);
    }

    /// Unpadded word 4-grams, built through the `ngrams` adaptor method.
    #[test]
    fn test_words_iter_adaptor() {
        let words = "one two three four five".split(' ');
        let grams = words.ngrams(4).collect::<Vec<_>>();
        let expected = vec![
            vec!["one", "two", "three", "four"],
            vec!["two", "three", "four", "five"],
        ];
        assert_eq!(grams, expected);
    }

    /// Unpadded word bigrams, built with `Ngrams::new`.
    #[test]
    fn test_words() {
        let words = "one two three four".split(' ');
        let grams = Ngrams::new(words, 2).collect::<Vec<_>>();
        let expected = vec![
            vec!["one", "two"],
            vec!["two", "three"],
            vec!["three", "four"],
        ];
        assert_eq!(grams, expected);
    }

    /// Unpadded 4-grams over single-character `String` tokens.
    #[test]
    fn test_chars() {
        let letters = "test string".chars().map(|c| c.to_string());
        let grams = Ngrams::new(letters, 4).collect::<Vec<_>>();
        let expected = vec![
            vec!["t", "e", "s", "t"],
            vec!["e", "s", "t", " "],
            vec!["s", "t", " ", "s"],
            vec!["t", " ", "s", "t"],
            vec![" ", "s", "t", "r"],
            vec!["s", "t", "r", "i"],
            vec!["t", "r", "i", "n"],
            vec!["r", "i", "n", "g"],
        ];
        assert_eq!(grams, expected);
    }
}