sieve/compiler/grammar/expr/
tokenizer.rs

1/*
2 * Copyright (c) 2020-2023, Stalwart Labs Ltd.
3 *
4 * This file is part of the Stalwart Sieve Interpreter.
5 *
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Affero General Public License as
8 * published by the Free Software Foundation, either version 3 of
9 * the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Affero General Public License for more details.
 * The license text can be found in the LICENSE file at the top-level directory of this distribution.
16 * You should have received a copy of the GNU Affero General Public License
17 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 *
19 * You can be released from the requirements of the AGPLv3 license by
20 * purchasing a commercial license. Please contact licensing@stalw.art
21 * for more details.
22*/
23
24use std::{
25    iter::{Enumerate, Peekable},
26    slice::Iter,
27};
28
29use crate::{compiler::Number, runtime::eval::IntoString};
30
31use super::{BinaryOperator, Token, UnaryOperator};
32
33pub(crate) struct Tokenizer<'x, F>
34where
35    F: Fn(&str, bool) -> Result<Token, String>,
36{
37    pub(crate) iter: Peekable<Enumerate<Iter<'x, u8>>>,
38    token_map: F,
39    buf: Vec<u8>,
40    depth: u32,
41    next_token: Vec<Token>,
42    has_number: bool,
43    has_dot: bool,
44    has_alpha: bool,
45    is_start: bool,
46    is_eof: bool,
47}
48
49impl<'x, F> Tokenizer<'x, F>
50where
51    F: Fn(&str, bool) -> Result<Token, String>,
52{
53    #[cfg(test)]
54    pub fn new(expr: &'x str, token_map: F) -> Self {
55        Self::from_iter(expr.as_bytes().iter().enumerate().peekable(), token_map)
56    }
57
58    #[allow(clippy::should_implement_trait)]
59    pub(crate) fn from_iter(iter: Peekable<Enumerate<Iter<'x, u8>>>, token_map: F) -> Self {
60        Self {
61            iter,
62            buf: Vec::new(),
63            depth: 0,
64            next_token: Vec::with_capacity(2),
65            has_number: false,
66            has_dot: false,
67            has_alpha: false,
68            is_start: true,
69            is_eof: false,
70            token_map,
71        }
72    }
73
74    #[allow(clippy::should_implement_trait)]
75    pub(crate) fn next(&mut self) -> Result<Option<Token>, String> {
76        if let Some(token) = self.next_token.pop() {
77            return Ok(Some(token));
78        } else if self.is_eof {
79            return Ok(None);
80        }
81
82        while let Some((_, &ch)) = self.iter.next() {
83            match ch {
84                b'A'..=b'Z' | b'a'..=b'z' | b'_' => {
85                    self.buf.push(ch);
86                    self.has_alpha = true;
87                }
88                b'0'..=b'9' => {
89                    self.buf.push(ch);
90                    self.has_number = true;
91                }
92                b'.' => {
93                    self.buf.push(ch);
94                    self.has_dot = true;
95                }
96                b'}' => {
97                    self.is_eof = true;
98                    break;
99                }
100                b'[' if matches!(self.buf.get(0..7), Some(b"header.")) => {
101                    self.buf.push(ch);
102                }
103                b'-' if self.buf.last().map_or(false, |c| *c == b'[')
104                    || matches!(self.buf.get(0..7), Some(b"header.")) =>
105                {
106                    self.buf.push(ch);
107                }
108                b':' if self.buf.contains(&b'.') => {
109                    self.buf.push(ch);
110                }
111                b']' if self.buf.contains(&b'[') => {
112                    self.buf.push(b']');
113                }
114                b'*' if self.buf.last().map_or(false, |&c| c == b'[' || c == b'.') => {
115                    self.buf.push(ch);
116                }
117                _ => {
118                    let prev_token = if !self.buf.is_empty() {
119                        self.is_start = false;
120                        self.parse_buf()?.into()
121                    } else {
122                        None
123                    };
124                    let token = match ch {
125                        b'&' => {
126                            if matches!(self.iter.peek(), Some((_, b'&'))) {
127                                self.iter.next();
128                            }
129                            Token::BinaryOperator(BinaryOperator::And)
130                        }
131                        b'|' => {
132                            if matches!(self.iter.peek(), Some((_, b'|'))) {
133                                self.iter.next();
134                            }
135                            Token::BinaryOperator(BinaryOperator::Or)
136                        }
137                        b'!' => {
138                            if matches!(self.iter.peek(), Some((_, b'='))) {
139                                self.iter.next();
140                                Token::BinaryOperator(BinaryOperator::Ne)
141                            } else {
142                                Token::UnaryOperator(UnaryOperator::Not)
143                            }
144                        }
145                        b'^' => Token::BinaryOperator(BinaryOperator::Xor),
146                        b'(' => {
147                            self.depth += 1;
148                            Token::OpenParen
149                        }
150                        b')' => {
151                            if self.depth == 0 {
152                                return Err("Unmatched close parenthesis".to_string());
153                            }
154                            self.depth -= 1;
155                            Token::CloseParen
156                        }
157                        b'+' => Token::BinaryOperator(BinaryOperator::Add),
158                        b'*' => Token::BinaryOperator(BinaryOperator::Multiply),
159                        b'/' => Token::BinaryOperator(BinaryOperator::Divide),
160                        b'-' => {
161                            if self.is_start {
162                                Token::UnaryOperator(UnaryOperator::Minus)
163                            } else {
164                                Token::BinaryOperator(BinaryOperator::Subtract)
165                            }
166                        }
167                        b'=' => match self.iter.next() {
168                            Some((_, b'=')) => Token::BinaryOperator(BinaryOperator::Eq),
169                            Some((_, b'>')) => Token::BinaryOperator(BinaryOperator::Ge),
170                            Some((_, b'<')) => Token::BinaryOperator(BinaryOperator::Le),
171                            _ => Token::BinaryOperator(BinaryOperator::Eq),
172                        },
173                        b'>' => match self.iter.peek() {
174                            Some((_, b'=')) => {
175                                self.iter.next();
176                                Token::BinaryOperator(BinaryOperator::Ge)
177                            }
178                            _ => Token::BinaryOperator(BinaryOperator::Gt),
179                        },
180                        b'<' => match self.iter.peek() {
181                            Some((_, b'=')) => {
182                                self.iter.next();
183                                Token::BinaryOperator(BinaryOperator::Le)
184                            }
185                            _ => Token::BinaryOperator(BinaryOperator::Lt),
186                        },
187                        b',' => Token::Comma,
188                        b'[' => Token::OpenBracket,
189                        b']' => Token::CloseBracket,
190                        b' ' | b'\r' | b'\n' => {
191                            if prev_token.is_some() {
192                                return Ok(prev_token);
193                            } else {
194                                continue;
195                            }
196                        }
197                        b'\"' | b'\'' => {
198                            let mut buf = Vec::with_capacity(16);
199                            let stop_ch = ch;
200                            let mut last_ch = 0;
201                            let mut found_end = false;
202
203                            for (_, &ch) in self.iter.by_ref() {
204                                if last_ch != b'\\' {
205                                    if ch != stop_ch {
206                                        buf.push(ch);
207                                    } else {
208                                        found_end = true;
209                                        break;
210                                    }
211                                } else {
212                                    match ch {
213                                        b'n' => {
214                                            buf.push(b'\n');
215                                        }
216                                        b'r' => {
217                                            buf.push(b'\r');
218                                        }
219                                        b't' => {
220                                            buf.push(b'\t');
221                                        }
222                                        _ => {
223                                            buf.push(ch);
224                                        }
225                                    }
226                                }
227
228                                last_ch = ch;
229                            }
230
231                            if found_end {
232                                Token::String(
233                                    String::from_utf8(buf)
234                                        .map_err(|_| "Invalid UTF-8".to_string())?,
235                                )
236                            } else {
237                                return Err("Unterminated string".to_string());
238                            }
239                        }
240                        _ => {
241                            return Err(format!("Invalid character {:?}", char::from(ch),));
242                        }
243                    };
244                    self.is_start = matches!(
245                        token,
246                        Token::OpenParen | Token::Comma | Token::BinaryOperator(_)
247                    );
248
249                    return if prev_token.is_some() {
250                        self.next_token.push(token);
251                        Ok(prev_token)
252                    } else {
253                        Ok(Some(token))
254                    };
255                }
256            }
257        }
258
259        if self.depth > 0 {
260            Err("Unmatched open parenthesis".to_string())
261        } else if !self.buf.is_empty() {
262            self.parse_buf().map(Some)
263        } else {
264            Ok(None)
265        }
266    }
267
268    fn parse_buf(&mut self) -> Result<Token, String> {
269        let buf = std::mem::take(&mut self.buf).into_string();
270        if self.has_number && !self.has_alpha {
271            self.has_number = false;
272            if self.has_dot {
273                self.has_dot = false;
274
275                buf.parse::<f64>()
276                    .map(|f| Token::Number(Number::Float(f)))
277                    .map_err(|_| format!("Invalid float value {}", buf,))
278            } else {
279                buf.parse::<i64>()
280                    .map(|i| Token::Number(Number::Integer(i)))
281                    .map_err(|_| format!("Invalid integer value {}", buf,))
282            }
283        } else {
284            let has_dot = self.has_dot;
285            let has_number = self.has_number;
286
287            self.has_alpha = false;
288            self.has_number = false;
289            self.has_dot = false;
290
291            if !has_number && !has_dot && [4, 5].contains(&buf.len()) {
292                if buf == "true" {
293                    return Ok(Token::Number(Number::Integer(1)));
294                } else if buf == "false" {
295                    return Ok(Token::Number(Number::Integer(0)));
296                }
297            }
298
299            (self.token_map)(&buf, has_dot)
300        }
301    }
302}