sieve/compiler/grammar/expr/
tokenizer.rs

1/*
2 * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd <hello@stalw.art>
3 *
4 * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
5 */
6
7use std::{
8    iter::{Enumerate, Peekable},
9    slice::Iter,
10};
11
12use crate::{compiler::Number, runtime::eval::IntoString};
13
14use super::{BinaryOperator, Token, UnaryOperator};
15
16pub(crate) struct Tokenizer<'x, F>
17where
18    F: Fn(&str, bool) -> Result<Token, String>,
19{
20    pub(crate) iter: Peekable<Enumerate<Iter<'x, u8>>>,
21    token_map: F,
22    buf: Vec<u8>,
23    depth: u32,
24    next_token: Vec<Token>,
25    has_number: bool,
26    has_dot: bool,
27    has_alpha: bool,
28    is_start: bool,
29    is_eof: bool,
30}
31
32impl<'x, F> Tokenizer<'x, F>
33where
34    F: Fn(&str, bool) -> Result<Token, String>,
35{
36    #[cfg(test)]
37    pub fn new(expr: &'x str, token_map: F) -> Self {
38        Self::from_iter(expr.as_bytes().iter().enumerate().peekable(), token_map)
39    }
40
41    #[allow(clippy::should_implement_trait)]
42    pub(crate) fn from_iter(iter: Peekable<Enumerate<Iter<'x, u8>>>, token_map: F) -> Self {
43        Self {
44            iter,
45            buf: Vec::new(),
46            depth: 0,
47            next_token: Vec::with_capacity(2),
48            has_number: false,
49            has_dot: false,
50            has_alpha: false,
51            is_start: true,
52            is_eof: false,
53            token_map,
54        }
55    }
56
57    #[allow(clippy::should_implement_trait)]
58    pub(crate) fn next(&mut self) -> Result<Option<Token>, String> {
59        if let Some(token) = self.next_token.pop() {
60            return Ok(Some(token));
61        } else if self.is_eof {
62            return Ok(None);
63        }
64
65        while let Some((_, &ch)) = self.iter.next() {
66            match ch {
67                b'A'..=b'Z' | b'a'..=b'z' | b'_' => {
68                    self.buf.push(ch);
69                    self.has_alpha = true;
70                }
71                b'0'..=b'9' => {
72                    self.buf.push(ch);
73                    self.has_number = true;
74                }
75                b'.' => {
76                    self.buf.push(ch);
77                    self.has_dot = true;
78                }
79                b'}' => {
80                    self.is_eof = true;
81                    break;
82                }
83                b'[' if matches!(self.buf.get(0..7), Some(b"header.")) => {
84                    self.buf.push(ch);
85                }
86                b'-' if self.buf.last().is_some_and( |c| *c == b'[')
87                    || matches!(self.buf.get(0..7), Some(b"header.")) =>
88                {
89                    self.buf.push(ch);
90                }
91                b':' if self.buf.contains(&b'.') => {
92                    self.buf.push(ch);
93                }
94                b']' if self.buf.contains(&b'[') => {
95                    self.buf.push(b']');
96                }
97                b'*' if self.buf.last().is_some_and( |&c| c == b'[' || c == b'.') => {
98                    self.buf.push(ch);
99                }
100                _ => {
101                    let prev_token = if !self.buf.is_empty() {
102                        self.is_start = false;
103                        self.parse_buf()?.into()
104                    } else {
105                        None
106                    };
107                    let token = match ch {
108                        b'&' => {
109                            if matches!(self.iter.peek(), Some((_, b'&'))) {
110                                self.iter.next();
111                            }
112                            Token::BinaryOperator(BinaryOperator::And)
113                        }
114                        b'|' => {
115                            if matches!(self.iter.peek(), Some((_, b'|'))) {
116                                self.iter.next();
117                            }
118                            Token::BinaryOperator(BinaryOperator::Or)
119                        }
120                        b'!' => {
121                            if matches!(self.iter.peek(), Some((_, b'='))) {
122                                self.iter.next();
123                                Token::BinaryOperator(BinaryOperator::Ne)
124                            } else {
125                                Token::UnaryOperator(UnaryOperator::Not)
126                            }
127                        }
128                        b'^' => Token::BinaryOperator(BinaryOperator::Xor),
129                        b'(' => {
130                            self.depth += 1;
131                            Token::OpenParen
132                        }
133                        b')' => {
134                            if self.depth == 0 {
135                                return Err("Unmatched close parenthesis".to_string());
136                            }
137                            self.depth -= 1;
138                            Token::CloseParen
139                        }
140                        b'+' => Token::BinaryOperator(BinaryOperator::Add),
141                        b'*' => Token::BinaryOperator(BinaryOperator::Multiply),
142                        b'/' => Token::BinaryOperator(BinaryOperator::Divide),
143                        b'-' => {
144                            if self.is_start {
145                                Token::UnaryOperator(UnaryOperator::Minus)
146                            } else {
147                                Token::BinaryOperator(BinaryOperator::Subtract)
148                            }
149                        }
150                        b'=' => match self.iter.next() {
151                            Some((_, b'=')) => Token::BinaryOperator(BinaryOperator::Eq),
152                            Some((_, b'>')) => Token::BinaryOperator(BinaryOperator::Ge),
153                            Some((_, b'<')) => Token::BinaryOperator(BinaryOperator::Le),
154                            _ => Token::BinaryOperator(BinaryOperator::Eq),
155                        },
156                        b'>' => match self.iter.peek() {
157                            Some((_, b'=')) => {
158                                self.iter.next();
159                                Token::BinaryOperator(BinaryOperator::Ge)
160                            }
161                            _ => Token::BinaryOperator(BinaryOperator::Gt),
162                        },
163                        b'<' => match self.iter.peek() {
164                            Some((_, b'=')) => {
165                                self.iter.next();
166                                Token::BinaryOperator(BinaryOperator::Le)
167                            }
168                            _ => Token::BinaryOperator(BinaryOperator::Lt),
169                        },
170                        b',' => Token::Comma,
171                        b'[' => Token::OpenBracket,
172                        b']' => Token::CloseBracket,
173                        b' ' | b'\r' | b'\n' => {
174                            if prev_token.is_some() {
175                                return Ok(prev_token);
176                            } else {
177                                continue;
178                            }
179                        }
180                        b'\"' | b'\'' => {
181                            let mut buf = Vec::with_capacity(16);
182                            let stop_ch = ch;
183                            let mut last_ch = 0;
184                            let mut found_end = false;
185
186                            for (_, &ch) in self.iter.by_ref() {
187                                if last_ch != b'\\' {
188                                    if ch != stop_ch {
189                                        buf.push(ch);
190                                    } else {
191                                        found_end = true;
192                                        break;
193                                    }
194                                } else {
195                                    match ch {
196                                        b'n' => {
197                                            buf.push(b'\n');
198                                        }
199                                        b'r' => {
200                                            buf.push(b'\r');
201                                        }
202                                        b't' => {
203                                            buf.push(b'\t');
204                                        }
205                                        _ => {
206                                            buf.push(ch);
207                                        }
208                                    }
209                                }
210
211                                last_ch = ch;
212                            }
213
214                            if found_end {
215                                Token::String(
216                                    String::from_utf8(buf)
217                                        .map_err(|_| "Invalid UTF-8".to_string())?,
218                                )
219                            } else {
220                                return Err("Unterminated string".to_string());
221                            }
222                        }
223                        _ => {
224                            return Err(format!("Invalid character {:?}", char::from(ch),));
225                        }
226                    };
227                    self.is_start = matches!(
228                        token,
229                        Token::OpenParen | Token::Comma | Token::BinaryOperator(_)
230                    );
231
232                    return if prev_token.is_some() {
233                        self.next_token.push(token);
234                        Ok(prev_token)
235                    } else {
236                        Ok(Some(token))
237                    };
238                }
239            }
240        }
241
242        if self.depth > 0 {
243            Err("Unmatched open parenthesis".to_string())
244        } else if !self.buf.is_empty() {
245            self.parse_buf().map(Some)
246        } else {
247            Ok(None)
248        }
249    }
250
251    fn parse_buf(&mut self) -> Result<Token, String> {
252        let buf = std::mem::take(&mut self.buf).into_string();
253        if self.has_number && !self.has_alpha {
254            self.has_number = false;
255            if self.has_dot {
256                self.has_dot = false;
257
258                buf.parse::<f64>()
259                    .map(|f| Token::Number(Number::Float(f)))
260                    .map_err(|_| format!("Invalid float value {}", buf,))
261            } else {
262                buf.parse::<i64>()
263                    .map(|i| Token::Number(Number::Integer(i)))
264                    .map_err(|_| format!("Invalid integer value {}", buf,))
265            }
266        } else {
267            let has_dot = self.has_dot;
268            let has_number = self.has_number;
269
270            self.has_alpha = false;
271            self.has_number = false;
272            self.has_dot = false;
273
274            if !has_number && !has_dot && [4, 5].contains(&buf.len()) {
275                if buf == "true" {
276                    return Ok(Token::Number(Number::Integer(1)));
277                } else if buf == "false" {
278                    return Ok(Token::Number(Number::Integer(0)));
279                }
280            }
281
282            (self.token_map)(&buf, has_dot)
283        }
284    }
285}