1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
use std::io::{self, BufRead, Lines, Write};

use failure::Error;

use crate::util::EOS;

/// Sentence iterator.
///
/// This iterator consumes a reader with tokenized sentences:
///
/// - One sentence per line.
/// - Tokens separated by a space.
///
/// It produces `Vec`s with the tokens, adding an end-of-sentence marker
/// to the end of the sentence. Lines that are empty or only consist of
/// whitespace are discarded.
pub struct SentenceIterator<R> {
    lines: Lines<R>,
}

impl<R> SentenceIterator<R>
where
    R: BufRead,
{
    pub fn new(read: R) -> Self {
        SentenceIterator {
            lines: read.lines(),
        }
    }
}

impl<R> Iterator for SentenceIterator<R>
where
    R: BufRead,
{
    type Item = Result<Vec<String>, io::Error>;

    fn next(&mut self) -> Option<Self::Item> {
        for line in &mut self.lines {
            let line = match line {
                Ok(ref line) => line.trim(),
                Err(err) => return Some(Err(err)),
            };

            // Skip empty lines.
            if !line.is_empty() {
                return Some(Ok(whitespace_tokenize(line)));
            }
        }

        None
    }
}

/// Trait for writing models in binary format.
pub trait WriteModelBinary<W>
where
    W: Write,
{
    fn write_model_binary(self, write: &mut W) -> Result<(), Error>;
}

fn whitespace_tokenize(line: &str) -> Vec<String> {
    let mut tokens = line
        .split_whitespace()
        .map(ToOwned::to_owned)
        .collect::<Vec<_>>();
    tokens.push(EOS.to_string());
    tokens
}

/// Trait for writing models in text format.
pub trait WriteModelText<W>
where
    W: Write,
{
    /// Write the model in text format.
    ///
    /// This function only writes the word embeddings. The subword
    /// embeddings are discarded.
    ///
    /// The `write_dims` parameter indicates whether the first line
    /// should contain the dimensionality of the embedding matrix.
    fn write_model_text(&self, write: &mut W, write_dims: bool) -> Result<(), Error>;
}

/// Trait for writing models in binary format.
pub trait WriteModelWord2Vec<W>
where
    W: Write,
{
    fn write_model_word2vec(&self, write: &mut W) -> Result<(), Error>;
}

#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::SentenceIterator;
    use crate::util::EOS;

    #[test]
    fn sentence_iterator_test() {
        let v = b"This is a sentence .\nAnd another one .\n".to_vec();
        let c = Cursor::new(v);
        let mut iter = SentenceIterator::new(c);
        assert_eq!(
            iter.next().unwrap().unwrap(),
            vec!["This", "is", "a", "sentence", ".", EOS]
        );
        assert_eq!(
            iter.next().unwrap().unwrap(),
            vec!["And", "another", "one", ".", EOS]
        );
        assert!(iter.next().is_none());
    }

    #[test]
    fn sentence_iterator_no_newline_test() {
        let v = b"This is a sentence .\nAnd another one .".to_vec();
        let c = Cursor::new(v);
        let mut iter = SentenceIterator::new(c);
        assert_eq!(
            iter.next().unwrap().unwrap(),
            vec!["This", "is", "a", "sentence", ".", EOS]
        );
        assert_eq!(
            iter.next().unwrap().unwrap(),
            vec!["And", "another", "one", ".", EOS]
        );
        assert!(iter.next().is_none());
    }

    #[test]
    fn sentence_iterator_empty_test() {
        let v = b"".to_vec();
        let c = Cursor::new(v);
        let mut iter = SentenceIterator::new(c);
        assert!(iter.next().is_none());
    }

    #[test]
    fn sentence_iterator_empty_newline_test() {
        let v = b"\n \n   \n".to_vec();
        let c = Cursor::new(v);
        let mut iter = SentenceIterator::new(c);
        assert!(iter.next().is_none());
    }

}