1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/// Given a sequence of words, the list can be iterated to obtain all the n-grams in the sequence,
/// starting from n-grams of lenght `min` up to n_grams of length `max`.
#[derive(Debug, Clone, PartialEq)]
pub struct NGramList<'a> {
    min: usize,
    max: usize,
    list: Vec<&'a str>,
}

#[derive(Debug, Clone, PartialEq)]
pub struct NGramListIntoIterator<'a> {
    list: NGramList<'a>,
    index: usize,
}

impl<'a> Iterator for NGramListIntoIterator<'a> {
    type Item = Vec<String>;
    fn next(&mut self) -> Option<Self::Item> {
        if self.index >= self.list.len() {
            return None;
        }
        let res = self.list.ngram_items(self.index);
        if res.is_some() {
            self.index += 1;
            res
        } else {
            None
        }
    }
}

impl<'a> IntoIterator for NGramList<'a> {
    type Item = Vec<String>;
    type IntoIter = NGramListIntoIterator<'a>;

    fn into_iter(self) -> Self::IntoIter {
        NGramListIntoIterator {
            list: self,
            index: 0,
        }
    }
}

impl<'a> NGramList<'a> {
    pub fn new(vec: Vec<&'a str>, range: (usize, usize)) -> Self {
        Self {
            min: range.0,
            max: range.1,
            list: vec,
        }
    }

    pub fn len(&self) -> usize {
        self.list.len()
    }

    /// Constructs all n-grams obtainable from the word sequence starting from the word at `index`
    pub fn ngram_items(&self, index: usize) -> Option<Vec<String>> {
        if self.max == 1 {
            return Some(vec![self.list[index].to_string()]);
        }
        let mut items = Vec::new();
        let len = self.list.len();
        let min_end = index + self.min;
        if min_end > len {
            return None;
        }
        let max_end = usize::min(index + self.max, len);
        let mut item = self.list[index].to_string();
        for j in (index + 1)..min_end {
            item.push(' ');
            item.push_str(self.list[j]);
        }
        items.push(item.clone());
        for j in min_end..max_end {
            item.push(' ');
            item.push_str(self.list[j]);
            items.push(item.clone())
        }
        Some(items)
    }
}

#[macro_export]
macro_rules! column_for_word {
    ($voc:expr, $transf:expr, $word: expr ) => {
        $transf.column($voc.iter().position(|s| s == $word).unwrap())
    };
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn autotraits() {
        fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
        has_autotraits::<NGramList>();
        has_autotraits::<NGramListIntoIterator>();
    }

    #[test]
    fn test_ngram_queue() {
        let words = vec![
            "oNe",
            "oNe",
            "two",
            "three",
            "four",
            "TWO",
            "three",
            "four",
            "three;four",
            "four",
        ];
        let list = NGramList::new(words.clone(), (1, 1));
        for (i, items) in list.into_iter().enumerate() {
            assert_eq!(items.len(), 1);
            assert_eq!(items[0], words[i]);
        }

        let list = NGramList::new(words.clone(), (2, 2));
        for (i, items) in list.into_iter().enumerate() {
            assert_eq!(items.len(), 1);
            assert_eq!(items[0], words[i].to_string() + " " + words[i + 1]);
        }
        let list = NGramList::new(words.clone(), (1, 2));
        for (i, items) in list.into_iter().enumerate() {
            if i < words.len() - 1 {
                assert_eq!(items.len(), 2);
                assert_eq!(items[0], words[i]);
                assert_eq!(items[1], words[i].to_string() + " " + words[i + 1]);
            } else {
                assert_eq!(items.len(), 1);
                assert_eq!(items[0], words[i]);
            }
        }
    }
}