#![deny(missing_docs,
missing_debug_implementations, missing_copy_implementations,
trivial_casts, trivial_numeric_casts,
unsafe_code,
unstable_features,
unused_import_braces, unused_qualifications)]
#![cfg_attr(feature = "dev", allow(unstable_features))]
#![cfg_attr(feature = "dev", feature(plugin))]
#![cfg_attr(feature = "dev", plugin(clippy))]
#![cfg_attr(feature = "dev", deny(clippy))]
use std::fmt;
use std::collections::VecDeque;
const WORD_SEP: &'static str = "\u{2060}";
pub trait Ngram<'a, T: 'a + Pad + fmt::Debug + Clone>: Iterator<Item=T> where Self: Sized {
#[allow(missing_docs)]
fn ngrams(self, usize) -> Ngrams<'a, T>;
}
impl<'a, T: 'a + Pad + fmt::Debug + Clone, U: 'a + Iterator<Item=T>> Ngram<'a, T> for U {
fn ngrams(self, n: usize) -> Ngrams<'a, T> {
Ngrams::new(self, n)
}
}
pub struct Ngrams<'a, T: 'a + Pad + fmt::Debug + Clone> {
source: Box<Iterator<Item = T> + 'a>,
num: usize,
memsize: usize,
memory: VecDeque<T>,
pad: bool,
}
impl<'a, T: 'a + Pad + fmt::Debug + Clone> fmt::Debug for Ngrams<'a, T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Ngrams(tokens, N)")
}
}
impl<'a, T: 'a + Pad + fmt::Debug + Clone + Sized> Ngrams<'a, T> {
pub fn new<V: 'a + Iterator<Item = T>>(source: V, n: usize) -> Ngrams<'a, T> {
let memsize = n - 1;
Ngrams {
source: Box::new(source),
num: n,
memsize: memsize,
memory: VecDeque::with_capacity(memsize),
pad: false,
}
}
pub fn pad(mut self) -> Self {
self.pad = true;
self.source = Box::new(Padded::new(self.source, self.num));
self
}
fn fill_memory(&mut self) {
while self.memory.len() < self.memsize {
let a = self.source.next().unwrap();
self.memory.push_back(a);
}
}
}
impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for Ngrams<'a, T> {
type Item = Vec<T>;
fn next(&mut self) -> Option<Self::Item> {
self.fill_memory();
self.source.next().map(|n| {
let mut result = Vec::with_capacity(self.num);
for elem in &self.memory {
result.push(elem.clone());
}
result.push(n.clone());
let _ = self.memory.pop_front();
self.memory.push_back(n.clone());
result
})
}
}
pub trait Pad {
fn symbol() -> Self;
fn len(n: usize) -> usize {
n - 1
}
}
impl<'a> Pad for &'a str {
fn symbol() -> Self {
WORD_SEP
}
}
impl Pad for String {
fn symbol() -> Self {
WORD_SEP.to_owned()
}
}
impl Pad for Vec<u8> {
fn symbol() -> Self {
WORD_SEP.to_owned().into()
}
}
impl Pad for char {
fn symbol() -> Self {
WORD_SEP.chars().next().unwrap()
}
}
struct Padded<'a, T: 'a + Pad + fmt::Debug + Clone> {
source: Box<Iterator<Item = T> + 'a>,
len: usize,
symbol: T,
remaining: usize,
end: bool,
}
impl<'a, T: 'a + Pad + fmt::Debug + Clone> Padded<'a, T> {
fn new<U: 'a + Iterator<Item = T> + Sized>(source: U, n: usize) -> Padded<'a, T> {
let l = T::len(n);
Padded {
source: Box::new(source),
len: l,
symbol: T::symbol(),
remaining: l,
end: false,
}
}
}
impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for Padded<'a, T> {
type Item = T;
fn next(&mut self) -> Option<Self::Item> {
if self.remaining > 0 {
self.remaining -= 1;
return Some(self.symbol.clone());
}
let result = self.source.next();
if result.is_none() {
if !self.end {
self.end = true;
self.remaining = self.len;
}
if self.remaining > 0 {
self.remaining -= 1;
return Some(self.symbol.clone());
}
}
result
}
}
#[cfg(test)]
mod tests {
use super::{Ngram, Ngrams};
use std::string::ToString;
#[test]
fn test_words_iter_adaptor_padded() {
let result: Vec<_> = "one two three four five".split(' ').ngrams(4).pad().collect();
assert_eq!(
result,
vec![
vec!["\u{2060}", "\u{2060}", "\u{2060}", "one"],
vec!["\u{2060}", "\u{2060}", "one", "two"],
vec!["\u{2060}", "one", "two", "three"],
vec!["one", "two", "three", "four"],
vec!["two", "three", "four", "five"],
vec!["three", "four", "five", "\u{2060}"],
vec!["four", "five", "\u{2060}", "\u{2060}"],
vec!["five", "\u{2060}", "\u{2060}", "\u{2060}"],
]
);
}
#[test]
fn test_words_padded() {
let seq = "one two three four".split(' ');
let result: Vec<_> = Ngrams::new(seq, 2).pad().collect();
assert_eq!(result,
vec![
vec!["\u{2060}", "one"],
vec!["one", "two"],
vec!["two", "three"],
vec!["three", "four"],
vec!["four", "\u{2060}"],
]);
}
#[test]
fn test_chars_padded() {
let seq = "test string".chars().map(|c| c.to_string());
let result: Vec<_> = Ngrams::new(seq, 4).pad().collect();
assert_eq!(result,
vec![
vec!["\u{2060}", "\u{2060}", "\u{2060}", "t"],
vec!["\u{2060}", "\u{2060}", "t", "e"],
vec!["\u{2060}", "t", "e", "s"],
vec!["t", "e", "s", "t"],
vec!["e", "s", "t", " "],
vec!["s", "t", " ", "s"],
vec!["t", " ", "s", "t"],
vec![" ", "s", "t", "r"],
vec!["s", "t", "r", "i"],
vec!["t", "r", "i", "n"],
vec!["r", "i", "n", "g"],
vec!["i", "n", "g", "\u{2060}"],
vec!["n", "g", "\u{2060}", "\u{2060}"],
vec!["g", "\u{2060}", "\u{2060}", "\u{2060}"],
]);
}
#[test]
fn test_words_iter_adaptor() {
let result: Vec<_> = "one two three four five".split(' ').ngrams(4).collect();
assert_eq!(
result,
vec![
vec!["one", "two", "three", "four"],
vec!["two", "three", "four", "five"],
]
);
}
#[test]
fn test_words() {
let seq = "one two three four".split(' ');
let result: Vec<_> = Ngrams::new(seq, 2).collect();
assert_eq!(result,
vec![
vec!["one", "two"],
vec!["two", "three"],
vec!["three", "four"],
]);
}
#[test]
fn test_chars() {
let seq = "test string".chars().map(|c| c.to_string());
let result: Vec<_> = Ngrams::new(seq, 4).collect();
assert_eq!(result,
vec![
vec!["t", "e", "s", "t"],
vec!["e", "s", "t", " "],
vec!["s", "t", " ", "s"],
vec!["t", " ", "s", "t"],
vec![" ", "s", "t", "r"],
vec!["s", "t", "r", "i"],
vec!["t", "r", "i", "n"],
vec!["r", "i", "n", "g"],
]);
}
}