ngrams/
lib.rs

1//! Ngrams
2//!
3//! Produce n-gram sequences from a sequence of tokens
4//!
5//! ## Examples
6//!
7//! ```rust
8//! use ngrams::Ngram;
9//!
10//! let grams: Vec<_> = "foo".chars().ngrams(2).pad().collect();
11//! assert_eq!(
12//!     grams,
13//!     vec![
14//!           vec!['\u{2060}', 'f'],
15//!           vec!['f', 'o'],
16//!           vec!['o', 'o'],
17//!           vec!['o', '\u{2060}']
18//!     ]
19//! );
20//! ```
21//!
22//! ```rust
23//! use ngrams::Ngrams; // notice `Ngram` vs `Ngrams`
24//!
25//! let iter = "one two three".split(' ');
26//! let grams: Vec<_> = Ngrams::new(iter, 3).pad().collect();
27//! assert_eq!(
28//!     grams,
29//!     vec![
30//!           vec!["\u{2060}", "\u{2060}", "one"],
31//!           vec!["\u{2060}", "one", "two"],
32//!           vec!["one", "two", "three"],
33//!           vec!["two", "three", "\u{2060}"],
34//!           vec!["three", "\u{2060}", "\u{2060}"],
35//!     ]
36//! );
37//! ```
38
39#![deny(missing_docs,
40       missing_debug_implementations, missing_copy_implementations,
41       trivial_casts, trivial_numeric_casts,
42       unsafe_code,
43       unstable_features,
44       unused_import_braces, unused_qualifications)]
45#![cfg_attr(feature = "dev", allow(unstable_features))]
46#![cfg_attr(feature = "dev", feature(plugin))]
47#![cfg_attr(feature = "dev", plugin(clippy))]
48#![cfg_attr(feature = "dev", deny(clippy))]
49
50use std::fmt;
51use std::collections::VecDeque;
52
/// Padding symbol shared by the built-in `Pad` implementations: a string holding the single
/// code point U+2060 (Unicode WORD JOINER).
const WORD_SEP: &'static str = "\u{2060}";
54
55/// Iterator adaptor, allows you to call the method `.ngrams(n)` on your iterator, as long as the
56/// `Item` of the `Iterator` fits the trait bound
57///
58/// ## Example
59///
60/// ```rust
61/// use ngrams::Ngram;
62/// let s: Vec<_> = "hello".chars().ngrams(2).collect();
63/// assert_eq!(s, vec![
64///     vec!['h', 'e'],
65///     vec!['e', 'l'],
66///     vec!['l', 'l'],
67///     vec!['l', 'o'],
68/// ]);
69/// ```
70pub trait Ngram<'a, T: 'a + Pad + fmt::Debug + Clone>: Iterator<Item=T>  where Self: Sized {
71    #[allow(missing_docs)]
72    fn ngrams(self, usize) -> Ngrams<'a, T>;
73}
74
75impl<'a, T: 'a + Pad + fmt::Debug + Clone, U: 'a + Iterator<Item=T>> Ngram<'a, T> for U {
76    fn ngrams(self, n: usize) -> Ngrams<'a, T> {
77        Ngrams::new(self, n)
78    }
79}
80
81/// Main data type, implements the logic on splitting and grouping n-grams
82pub struct Ngrams<'a, T: 'a + Pad + fmt::Debug + Clone> {
83    source: Box<Iterator<Item = T> + 'a>,
84    num: usize,
85    memsize: usize,
86    memory: VecDeque<T>,
87    pad: bool,
88}
89
90impl<'a, T: 'a + Pad + fmt::Debug + Clone> fmt::Debug for Ngrams<'a, T> {
91    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
92        write!(f, "Ngrams(tokens, N)")
93    }
94}
95
96impl<'a, T: 'a + Pad + fmt::Debug + Clone + Sized> Ngrams<'a, T> {
97    /// The source for the `Ngrams` is expected to be pre-tokenized, this library
98    /// does not make any decisions regarding how your input should be tokenized.
99    pub fn new<V: 'a + Iterator<Item = T>>(source: V, n: usize) -> Ngrams<'a, T> {
100        let memsize = n - 1;
101        Ngrams {
102            source: Box::new(source),
103            num: n,
104            memsize: memsize,
105            memory: VecDeque::with_capacity(memsize),
106            pad: false,
107        }
108    }
109
110    /// Include padding at the beginning and end of the input. By default, this crate includes
111    /// implementations for some common data structures, that prepends and appends the "WORD_SEP"
112    /// unicode character onto the input.
113    pub fn pad(mut self) -> Self {
114        self.pad = true;
115        self.source = Box::new(Padded::new(self.source, self.num));
116        self
117    }
118
119    fn fill_memory(&mut self) {
120        while self.memory.len() < self.memsize {
121            // Can I unwrap here? I need to make sure that
122            // .next() can't return None before .memory is full
123            let a = self.source.next().unwrap();
124            self.memory.push_back(a);
125        }
126    }
127}
128
129impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for Ngrams<'a, T> {
130    type Item = Vec<T>;
131
132    fn next(&mut self) -> Option<Self::Item> {
133        self.fill_memory();
134
135        self.source.next().map(|n| {
136            let mut result = Vec::with_capacity(self.num);
137
138            for elem in &self.memory {
139                result.push(elem.clone());
140            }
141
142            result.push(n.clone());
143
144            let _ = self.memory.pop_front();
145            self.memory.push_back(n.clone());
146
147            result
148        })
149    }
150}
151
152/*
153impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for &'a Ngrams<'a, T> {
154    type Item = Vec<&'a T>;
155
156    fn next(&mut self) -> Option<Self::Item> {
157        self.fill_memory();
158        let next_item = self.source.next();
159
160        match next_item {
161            None => None,
162            Some(n) => {
163                let mut result = Vec::with_capacity(self.num);
164
165                for elem in &self.memory {
166                }
167                result.push(&n);
168
169                let _ = self.memory.pop_front();
170                self.memory.push_back(n.clone());
171
172                Some(result)
173            }
174        }
175    }
176}
177*/
178
/// Implement this so `ngrams` knows how to pad the beginning and end of your input.
///
/// There are default implementations for `&str`, `String`, `Vec<u8>`, and `char`.
pub trait Pad {
    /// The item returned from this method will be used to pad the beginning and end of each n-gram
    fn symbol() -> Self;

    /// Specifies how many padding symbols to add on each side for n-grams of size `n`.
    /// Defaults to `n - 1`, and to `0` when `n` is `0` (no underflow).
    fn len(n: usize) -> usize {
        n.saturating_sub(1)
    }
}
191
192impl<'a> Pad for &'a str {
193    fn symbol() -> Self {
194        WORD_SEP
195    }
196}
197
198impl Pad for String {
199    fn symbol() -> Self {
200        WORD_SEP.to_owned()
201    }
202}
203
204impl Pad for Vec<u8> {
205    fn symbol() -> Self {
206        WORD_SEP.to_owned().into()
207    }
208}
209
210impl Pad for char {
211    fn symbol() -> Self {
212        WORD_SEP.chars().next().unwrap()
213    }
214}
215
216struct Padded<'a, T: 'a + Pad + fmt::Debug + Clone> {
217    source: Box<Iterator<Item = T> + 'a>,
218    len: usize,
219    symbol: T,
220    remaining: usize,
221    end: bool,
222}
223
224impl<'a, T: 'a + Pad + fmt::Debug + Clone> Padded<'a, T> {
225    fn new<U: 'a + Iterator<Item = T> + Sized>(source: U, n: usize) -> Padded<'a, T> {
226        let l = T::len(n);
227        Padded {
228            source: Box::new(source),
229            len: l,
230            symbol: T::symbol(),
231            remaining: l,
232            end: false,
233        }
234    }
235}
236
237impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for Padded<'a, T> {
238  type Item = T;
239
240    fn next(&mut self) -> Option<Self::Item> {
241        if self.remaining > 0 {
242            self.remaining -= 1;
243            return Some(self.symbol.clone());
244        }
245
246        let result = self.source.next();
247
248        if result.is_none() {
249
250            if !self.end {
251                // then this is the first time
252                // we have seen this return None.
253                self.end = true;
254                self.remaining = self.len;
255            }
256
257            if self.remaining > 0 {
258                self.remaining -= 1;
259                return Some(self.symbol.clone());
260            }
261
262        }
263
264        result
265    }
266}
267
#[cfg(test)]
mod tests {
    use super::{Ngram, Ngrams};
    use std::string::ToString;

    // Padded 4-grams of words, built through the `.ngrams()` iterator adaptor.
    #[test]
    fn test_words_iter_adaptor_padded() {
        let grams: Vec<_> = "one two three four five"
            .split(' ')
            .ngrams(4)
            .pad()
            .collect();
        let expected = vec![
            vec!["\u{2060}", "\u{2060}", "\u{2060}", "one"],
            vec!["\u{2060}", "\u{2060}", "one", "two"],
            vec!["\u{2060}", "one", "two", "three"],
            vec!["one", "two", "three", "four"],
            vec!["two", "three", "four", "five"],
            vec!["three", "four", "five", "\u{2060}"],
            vec!["four", "five", "\u{2060}", "\u{2060}"],
            vec!["five", "\u{2060}", "\u{2060}", "\u{2060}"],
        ];
        assert_eq!(grams, expected);
    }

    // Padded bigrams of words, built through `Ngrams::new`.
    #[test]
    fn test_words_padded() {
        let tokens = "one two three four".split(' ');
        let grams: Vec<_> = Ngrams::new(tokens, 2).pad().collect();
        let expected = vec![
            vec!["\u{2060}", "one"],
            vec!["one", "two"],
            vec!["two", "three"],
            vec!["three", "four"],
            vec!["four", "\u{2060}"],
        ];
        assert_eq!(grams, expected);
    }

    // Padded 4-grams over single-character `String` tokens.
    #[test]
    fn test_chars_padded() {
        let tokens = "test string".chars().map(|c| c.to_string());
        let grams: Vec<_> = Ngrams::new(tokens, 4).pad().collect();
        let expected = vec![
            vec!["\u{2060}", "\u{2060}", "\u{2060}", "t"],
            vec!["\u{2060}", "\u{2060}", "t", "e"],
            vec!["\u{2060}", "t", "e", "s"],
            vec!["t", "e", "s", "t"],
            vec!["e", "s", "t", " "],
            vec!["s", "t", " ", "s"],
            vec!["t", " ", "s", "t"],
            vec![" ", "s", "t", "r"],
            vec!["s", "t", "r", "i"],
            vec!["t", "r", "i", "n"],
            vec!["r", "i", "n", "g"],
            vec!["i", "n", "g", "\u{2060}"],
            vec!["n", "g", "\u{2060}", "\u{2060}"],
            vec!["g", "\u{2060}", "\u{2060}", "\u{2060}"],
        ];
        assert_eq!(grams, expected);
    }

    // Unpadded 4-grams of words, built through the `.ngrams()` iterator adaptor.
    #[test]
    fn test_words_iter_adaptor() {
        let grams: Vec<_> = "one two three four five"
            .split(' ')
            .ngrams(4)
            .collect();
        let expected = vec![
            vec!["one", "two", "three", "four"],
            vec!["two", "three", "four", "five"],
        ];
        assert_eq!(grams, expected);
    }

    // Unpadded bigrams of words, built through `Ngrams::new`.
    #[test]
    fn test_words() {
        let tokens = "one two three four".split(' ');
        let grams: Vec<_> = Ngrams::new(tokens, 2).collect();
        let expected = vec![
            vec!["one", "two"],
            vec!["two", "three"],
            vec!["three", "four"],
        ];
        assert_eq!(grams, expected);
    }

    // Unpadded 4-grams over single-character `String` tokens.
    #[test]
    fn test_chars() {
        let tokens = "test string".chars().map(|c| c.to_string());
        let grams: Vec<_> = Ngrams::new(tokens, 4).collect();
        let expected = vec![
            vec!["t", "e", "s", "t"],
            vec!["e", "s", "t", " "],
            vec!["s", "t", " ", "s"],
            vec!["t", " ", "s", "t"],
            vec![" ", "s", "t", "r"],
            vec!["s", "t", "r", "i"],
            vec!["t", "r", "i", "n"],
            vec!["r", "i", "n", "g"],
        ];
        assert_eq!(grams, expected);
    }
}