use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;

/// Pre-tokenized text: the original string together with the tokens derived from it.
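///
/// # Example
///
/// A minimal construction sketch (hypothetical values; the field layout is the
/// one `Token` defines in this crate):
///
/// ```rust,ignore
/// let pre_tokenized = PreTokenizedString {
///     text: String::from("hello world"),
///     tokens: vec![
///         Token {
///             offset_from: 0,
///             offset_to: 5,
///             position: 0,
///             text: String::from("hello"),
///             position_length: 1,
///         },
///         Token {
///             offset_from: 6,
///             offset_to: 11,
///             position: 1,
///             text: String::from("world"),
///             position_length: 1,
///         },
///     ],
/// };
/// ```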
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct PreTokenizedString {
    /// Original text
    pub text: String,
    /// Tokens derived from the text
    pub tokens: Vec<Token>,
}

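// Ordering compares the original `text` only; `tokens` are ignored, so two
// values may compare as `Ordering::Equal` here while still differing under
// the derived `PartialEq`.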
impl Ord for PreTokenizedString {
    fn cmp(&self, other: &Self) -> Ordering {
        self.text.cmp(&other.text)
    }
}

impl PartialOrd for PreTokenizedString {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// A `TokenStream` implementation that wraps a `PreTokenizedString`.
pub struct PreTokenizedStream {
    tokenized_string: PreTokenizedString,
    current_token: i64,
}

impl From<PreTokenizedString> for PreTokenizedStream {
    fn from(s: PreTokenizedString) -> PreTokenizedStream {
        PreTokenizedStream {
            tokenized_string: s,
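            // Start one before the first token so the first advance() lands on index 0.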
            current_token: -1,
        }
    }
}

impl PreTokenizedStream {
    /// Creates a single `TokenStream` by chaining a slice of `PreTokenizedString`s,
    /// shifting token offsets so the chained parts read as one contiguous stream.
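    ///
    /// A hypothetical call sketch (`part_a` and `part_b` stand for
    /// `PreTokenizedString` values built elsewhere):
    ///
    /// ```rust,ignore
    /// let parts = [&part_a, &part_b];
    /// let mut stream = PreTokenizedStream::chain_tokenized_strings(&parts[..]);
    /// while stream.advance() {
    ///     // Tokens of `part_b` come out with offsets shifted past the end of `part_a`.
    ///     let token = stream.token();
    /// }
    /// ```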
    pub fn chain_tokenized_strings<'a>(
        tok_strings: &'a [&'a PreTokenizedString],
    ) -> BoxTokenStream {
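        // Fast path: a single string is wrapped directly, with no offset bookkeeping.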
        if tok_strings.len() == 1 {
            PreTokenizedStream::from((*tok_strings[0]).clone()).into()
        } else {
            let mut offsets = vec![];
            let mut total_offset = 0;
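            // Record the starting offset of each string: every stream after the
            // first is shifted by the end offset of the last token preceding it.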
            for &tok_string in tok_strings {
                offsets.push(total_offset);
                if let Some(last_token) = tok_string.tokens.last() {
                    total_offset += last_token.offset_to;
                }
            }
            // TODO remove the string cloning.
            let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
                .iter()
                .map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
                .collect();
            TokenStreamChain::new(offsets, token_streams).into()
        }
    }
}

impl TokenStream for PreTokenizedStream {
    fn advance(&mut self) -> bool {
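        // Step to the next token; report whether it is still within bounds.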
        self.current_token += 1;
        self.current_token < self.tokenized_string.tokens.len() as i64
    }

    fn token(&self) -> &Token {
        assert!(
            self.current_token >= 0,
            "TokenStream not initialized. You should call advance() at least once."
        );
        &self.tokenized_string.tokens[self.current_token as usize]
    }

    fn token_mut(&mut self) -> &mut Token {
        assert!(
            self.current_token >= 0,
            "TokenStream not initialized. You should call advance() at least once."
        );
        &mut self.tokenized_string.tokens[self.current_token as usize]
    }
}

#[cfg(test)]
mod tests {

    use super::*;
    use crate::tokenizer::Token;

    #[test]
    fn test_tokenized_stream() {
        let tok_text = PreTokenizedString {
            text: String::from("A a"),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 1,
                    position: 0,
                    text: String::from("A"),
                    position_length: 1,
                },
                Token {
                    offset_from: 2,
                    offset_to: 3,
                    position: 1,
                    text: String::from("a"),
                    position_length: 1,
                },
            ],
        };

        let mut token_stream = PreTokenizedStream::from(tok_text.clone());

        for expected_token in tok_text.tokens {
            assert!(token_stream.advance());
            assert_eq!(token_stream.token(), &expected_token);
        }
        assert!(!token_stream.advance());
    }

    #[test]
    fn test_chain_tokenized_strings() {
        let tok_text = PreTokenizedString {
            text: String::from("A a"),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 1,
                    position: 0,
                    text: String::from("A"),
                    position_length: 1,
                },
                Token {
                    offset_from: 2,
                    offset_to: 3,
                    position: 1,
                    text: String::from("a"),
                    position_length: 1,
                },
            ],
        };

        let chain_parts = vec![&tok_text, &tok_text];

        let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);

        let expected_tokens = vec![
            Token {
                offset_from: 0,
                offset_to: 1,
                position: 0,
                text: String::from("A"),
                position_length: 1,
            },
            Token {
                offset_from: 2,
                offset_to: 3,
                position: 1,
                text: String::from("a"),
                position_length: 1,
            },
            Token {
                offset_from: 3,
                offset_to: 4,
                position: 3,
                text: String::from("A"),
                position_length: 1,
            },
            Token {
                offset_from: 5,
                offset_to: 6,
                position: 4,
                text: String::from("a"),
                position_length: 1,
            },
        ];

        for expected_token in expected_tokens {
            assert!(token_stream.advance());
            assert_eq!(token_stream.token(), &expected_token);
        }
        assert!(!token_stream.advance());
    }
}