1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/// A sequence of words from which n-grams can be generated.
///
/// Iterating over the list yields, for each starting word in turn, the n-grams that begin at
/// that word, from n-grams of length `min` up to n-grams of length `max`.
#[derive(Debug, Clone)]
pub struct NGramList<'a> {
    /// Length (in words) of the shortest n-gram to generate.
    min: usize,
    /// Length (in words) of the longest n-gram to generate.
    max: usize,
    /// The borrowed words, in sequence order.
    list: Vec<&'a str>,
}

/// Owning iterator over an [`NGramList`], created by its `IntoIterator` implementation.
///
/// Each call to `next` yields the n-grams that start at the current word position.
pub struct NGramListIntoIterator<'a> {
    /// The list being iterated; it is moved into the iterator when the iterator is created.
    list: NGramList<'a>,
    /// Position of the word at which the next group of n-grams starts.
    index: usize,
}

impl<'a> Iterator for NGramListIntoIterator<'a> {
    type Item = Vec<String>;

    /// Yields the n-grams starting at the current word, then moves on to the next word.
    ///
    /// Iteration stops once every word has been visited, or as soon as the list reports that
    /// no n-gram can start at the current position. The position is only advanced after a
    /// successful step, so the iterator keeps returning `None` once it is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let position = self.index;
        if position < self.list.len() {
            // `?` ends the iteration without advancing when no n-gram starts here.
            let ngrams = self.list.ngram_items(position)?;
            self.index = position + 1;
            Some(ngrams)
        } else {
            None
        }
    }
}

impl<'a> IntoIterator for NGramList<'a> {
    type Item = Vec<String>;
    type IntoIter = NGramListIntoIterator<'a>;

    /// Consumes the list and returns an iterator positioned on its first word.
    fn into_iter(self) -> Self::IntoIter {
        let index = 0;
        let list = self;
        NGramListIntoIterator { list, index }
    }
}

impl<'a> NGramList<'a> {
    /// Creates a list over the words in `vec`.
    ///
    /// `range` is the pair `(min, max)` of the shortest and longest n-gram length (in words)
    /// to generate.
    pub fn new(vec: Vec<&'a str>, range: (usize, usize)) -> Self {
        let (min, max) = range;
        Self {
            min,
            max,
            list: vec,
        }
    }

    /// Returns the number of words in the sequence.
    pub fn len(&self) -> usize {
        self.list.len()
    }

    /// Returns `true` if the sequence contains no words.
    pub fn is_empty(&self) -> bool {
        self.list.is_empty()
    }

    /// Constructs all n-grams obtainable from the word sequence starting from the word at `index`.
    ///
    /// The n-grams are returned in order of increasing length, from `min` words up to `max`
    /// words (or up to the end of the sequence, whichever comes first). Words are joined by a
    /// single space. When `max` is `1` the result is always just the word at `index`.
    ///
    /// Returns `None` when `index` is past the end of the sequence, or when fewer than `min`
    /// words remain from `index` onwards. A `min` of `0` is treated as `1`, since an n-gram
    /// contains at least one word.
    pub fn ngram_items(&self, index: usize) -> Option<Vec<String>> {
        // Checked lookup: an out-of-range start position is "no n-grams", not a panic.
        let first = *self.list.get(index)?;
        if self.max == 1 {
            return Some(vec![first.to_string()]);
        }

        let len = self.list.len();
        let min = self.min.max(1);
        // Exclusive end of the shortest n-gram; overflow or running past the sequence both
        // mean that no n-gram of the requested minimum length starts here.
        let min_end = index.checked_add(min).filter(|&end| end <= len)?;
        // Exclusive end of the longest n-gram, clamped to the sequence and never below
        // `min_end` so that `max < min` simply yields the shortest n-gram alone.
        let max_end = index.saturating_add(self.max).min(len).max(min_end);

        let mut item = self.list[index..min_end].join(" ");
        let mut items = Vec::with_capacity(max_end - min_end + 1);
        items.push(item.clone());
        // Grow the n-gram one word at a time, recording every intermediate length.
        for word in &self.list[min_end..max_end] {
            item.push(' ');
            item.push_str(word);
            items.push(item.clone());
        }
        Some(items)
    }
}

/// Evaluates to `$transf.column(i)`, where `i` is the position of `$word` inside `$voc`.
///
/// * `$voc` - a collection exposing `iter()`, searched for the first element equal to `$word`.
/// * `$transf` - any value with a `column(usize)` method (presumably a 2-D array whose columns
///   are indexed like the vocabulary; confirm against the callers).
/// * `$word` - the value compared against each element yielded by `$voc.iter()`.
///
/// # Panics
///
/// Panics with an explanatory message if no element of `$voc` equals `$word`.
#[macro_export]
macro_rules! column_for_word {
    ($voc:expr, $transf:expr, $word:expr) => {
        $transf.column(
            $voc.iter()
                .position(|s| s == $word)
                .expect("word not found in vocabulary"),
        )
    };
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Iterating an `NGramList` must yield the expected n-grams for every start position, and
    /// must visit the expected number of start positions for each `(min, max)` range.
    #[test]
    fn test_ngram_queue() {
        let words = vec![
            "oNe",
            "oNe",
            "two",
            "three",
            "four",
            "TWO",
            "three",
            "four",
            "three;four",
            "four",
        ];

        // Unigrams only: every word is a start position and produces exactly itself.
        let unigrams: Vec<Vec<String>> =
            NGramList::new(words.clone(), (1, 1)).into_iter().collect();
        // Counting the positions keeps the loop below from passing vacuously.
        assert_eq!(unigrams.len(), words.len());
        for (i, items) in unigrams.iter().enumerate() {
            assert_eq!(items.len(), 1);
            assert_eq!(items[0], words[i]);
        }

        // Bigrams only: the last word cannot start a bigram, so there is one position fewer.
        let bigrams: Vec<Vec<String>> =
            NGramList::new(words.clone(), (2, 2)).into_iter().collect();
        assert_eq!(bigrams.len(), words.len() - 1);
        for (i, items) in bigrams.iter().enumerate() {
            assert_eq!(items.len(), 1);
            assert_eq!(items[0], format!("{} {}", words[i], words[i + 1]));
        }

        // Unigrams and bigrams: every position yields both, except the last one, which only
        // has a unigram left.
        let mixed: Vec<Vec<String>> =
            NGramList::new(words.clone(), (1, 2)).into_iter().collect();
        assert_eq!(mixed.len(), words.len());
        for (i, items) in mixed.iter().enumerate() {
            if i < words.len() - 1 {
                assert_eq!(items.len(), 2);
                assert_eq!(items[0], words[i]);
                assert_eq!(items[1], format!("{} {}", words[i], words[i + 1]));
            } else {
                assert_eq!(items.len(), 1);
                assert_eq!(items[0], words[i]);
            }
        }
    }
}