// sieve/compiler/lexer/tokenizer.rs

/*
 * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd <hello@stalw.art>
 *
 * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
 */
6
7use std::{iter::Peekable, slice::Iter};
8
9use crate::{
10    compiler::{CompileError, ErrorType, Number},
11    runtime::eval::IntoString,
12    Compiler,
13};
14
15use super::{word::lookup_words, StringConstant, Token};
16
17pub(crate) struct Tokenizer<'x> {
18    pub compiler: &'x Compiler,
19    pub iter: Peekable<Iter<'x, u8>>,
20    pub buf: Vec<u8>,
21    pub next_token: Vec<TokenInfo>,
22
23    pub pos: usize,
24    pub line_num: usize,
25    pub line_start: usize,
26
27    pub text_line_num: usize,
28    pub text_line_pos: usize,
29
30    pub token_line_num: usize,
31    pub token_line_pos: usize,
32
33    pub token_is_tag: bool,
34
35    pub last_ch: u8,
36    pub state: State,
37}
38
39#[derive(Debug)]
40pub(crate) struct TokenInfo {
41    pub(crate) token: Token,
42    pub(crate) line_num: usize,
43    pub(crate) line_pos: usize,
44}
45
46pub(crate) enum State {
47    None,
48    BracketComment,
49    HashComment,
50    QuotedString(StringType),
51    MultiLine(StringType),
52}
53
/// Summary of the bytes seen so far inside a string literal.
///
/// The flags let the tokenizer decide cheaply whether a finished string may
/// hold variable references or may be a numeric constant. All flags start out
/// `false`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub(crate) struct StringType {
    /// A `${` or `%{` sequence was seen, so the string may contain variables.
    maybe_variable: bool,
    /// A byte other than a digit, a first `.` or `-` was seen.
    has_other: bool,
    /// At least one ASCII digit was seen.
    has_digits: bool,
    /// At least one `.` was seen.
    has_dots: bool,
}
61
62impl<'x> Tokenizer<'x> {
63    pub fn new(compiler: &'x Compiler, bytes: &'x [u8]) -> Self {
64        Tokenizer {
65            compiler,
66            iter: bytes.iter().peekable(),
67            buf: Vec::with_capacity(bytes.len() / 2),
68            pos: usize::MAX,
69            line_num: 1,
70            line_start: 0,
71            text_line_num: 0,
72            text_line_pos: 0,
73            token_line_num: 0,
74            token_line_pos: 0,
75            token_is_tag: false,
76            next_token: Vec::with_capacity(2),
77            last_ch: 0,
78            state: State::None,
79        }
80    }
81
82    pub fn get_current_token(&mut self) -> Option<TokenInfo> {
83        if !self.buf.is_empty() {
84            let word = std::str::from_utf8(&self.buf).unwrap();
85            let token = if let Some(word) = lookup_words(word) {
86                if self.token_is_tag {
87                    self.token_line_pos -= 1;
88                    Token::Tag(word)
89                } else {
90                    Token::Identifier(word)
91                }
92            } else if self.buf.first().unwrap().is_ascii_digit() {
93                let multiplier = match self.buf.last().unwrap() {
94                    b'k' => 1024,
95                    b'm' => 1048576,
96                    b'g' => 1073741824,
97                    _ => 1,
98                };
99
100                if let Ok(number) = (if multiplier > 1 && self.buf.len() > 1 {
101                    std::str::from_utf8(&self.buf[..self.buf.len() - 1]).unwrap()
102                } else {
103                    word
104                })
105                .parse::<usize>()
106                {
107                    Token::Number(number.saturating_mul(multiplier))
108                } else if self.token_is_tag {
109                    Token::Unknown(format!(":{word}"))
110                } else {
111                    Token::Unknown(word.to_string())
112                }
113            } else if self.token_is_tag {
114                Token::Unknown(format!(":{word}"))
115            } else {
116                Token::Unknown(word.to_string())
117            };
118
119            self.reset_current_token();
120
121            Some(TokenInfo {
122                token,
123                line_num: self.token_line_num,
124                line_pos: self.token_line_pos,
125            })
126        } else {
127            None
128        }
129    }
130
131    #[inline(always)]
132    pub fn reset_current_token(&mut self) {
133        self.buf.clear();
134        self.token_is_tag = false;
135    }
136
137    #[inline(always)]
138    pub fn token_is_tag(&mut self) {
139        self.token_is_tag = true;
140    }
141
142    pub fn get_token(&mut self, token: Token) -> TokenInfo {
143        let next_token = TokenInfo {
144            token,
145            line_num: self.line_num,
146            line_pos: self.pos - self.line_start,
147        };
148        if let Some(token) = self.get_current_token() {
149            self.next_token.push(next_token);
150            token
151        } else {
152            next_token
153        }
154    }
155
156    pub fn get_string(&mut self, str_type: StringType) -> Result<TokenInfo, CompileError> {
157        if self.buf.len() < self.compiler.max_string_size {
158            let token = if str_type.maybe_variable {
159                Token::StringVariable(self.buf.to_vec())
160            } else {
161                let constant = self.buf.to_vec().into_string();
162                if !str_type.has_other && str_type.has_digits {
163                    if !str_type.has_dots {
164                        if let Some(number) = constant.parse::<i64>().ok().and_then(|n| {
165                            if n.to_string() == constant {
166                                Some(n)
167                            } else {
168                                None
169                            }
170                        }) {
171                            Token::StringConstant(StringConstant::Number(Number::Integer(number)))
172                        } else {
173                            Token::StringConstant(StringConstant::String(constant))
174                        }
175                    } else if let Some(number) = constant.parse::<f64>().ok().and_then(|n| {
176                        if n.to_string() == constant {
177                            Some(n)
178                        } else {
179                            None
180                        }
181                    }) {
182                        Token::StringConstant(StringConstant::Number(Number::Float(number)))
183                    } else {
184                        Token::StringConstant(StringConstant::String(constant))
185                    }
186                } else {
187                    Token::StringConstant(StringConstant::String(constant))
188                }
189            };
190
191            self.buf.clear();
192
193            Ok(TokenInfo {
194                token,
195                line_num: self.text_line_num,
196                line_pos: self.text_line_pos,
197            })
198        } else {
199            Err(CompileError {
200                line_num: self.text_line_num,
201                line_pos: self.text_line_pos,
202                error_type: ErrorType::StringTooLong,
203            })
204        }
205    }
206
207    #[inline(always)]
208    pub fn push_byte(&mut self, ch: u8) {
209        if self.buf.is_empty() {
210            self.token_line_num = self.line_num;
211            self.token_line_pos = self.pos - self.line_start;
212        }
213        self.buf.push(ch);
214    }
215
216    #[inline(always)]
217    pub fn new_line(&mut self) {
218        self.line_num += 1;
219        self.line_start = self.pos;
220    }
221
222    #[inline(always)]
223    pub fn text_start(&mut self) {
224        self.text_line_num = self.line_num;
225        self.text_line_pos = self.pos - self.line_start;
226    }
227
228    #[inline(always)]
229    pub fn is_token_start(&self) -> bool {
230        self.buf.is_empty()
231    }
232
233    #[inline(always)]
234    pub fn token_bytes(&self) -> &[u8] {
235        &self.buf
236    }
237
238    #[inline(always)]
239    pub fn next_byte(&mut self) -> Option<(u8, u8)> {
240        self.iter.next().map(|&ch| {
241            let last_ch = self.last_ch;
242            self.pos = self.pos.wrapping_add(1);
243            self.last_ch = ch;
244            (ch, last_ch)
245        })
246    }
247
248    #[inline(always)]
249    pub fn peek_byte(&mut self) -> Option<u8> {
250        self.iter.peek().map(|ch| **ch)
251    }
252
253    pub fn unwrap_next(&mut self) -> Result<TokenInfo, CompileError> {
254        if let Some(token) = self.next() {
255            token
256        } else {
257            Err(CompileError {
258                line_num: self.line_num,
259                line_pos: self.pos - self.line_start,
260                error_type: ErrorType::UnexpectedEOF,
261            })
262        }
263    }
264
265    pub fn expect_token(&mut self, token: Token) -> Result<(), CompileError> {
266        let next_token = self.unwrap_next()?;
267        if next_token.token == token {
268            Ok(())
269        } else {
270            Err(next_token.expected(format!("'{token}'")))
271        }
272    }
273
274    pub fn expect_static_string(&mut self) -> Result<String, CompileError> {
275        let next_token = self.unwrap_next()?;
276        match next_token.token {
277            Token::StringConstant(s) => Ok(s.into_string()),
278            Token::BracketOpen => {
279                let mut string = None;
280                loop {
281                    let token_info = self.unwrap_next()?;
282                    match token_info.token {
283                        Token::StringConstant(string_) => {
284                            string = string_.into();
285                        }
286                        Token::BracketClose if string.is_some() => break,
287                        _ => return Err(token_info.expected("constant string")),
288                    }
289                }
290                Ok(string.unwrap().into_string())
291            }
292            _ => Err(next_token.expected("constant string")),
293        }
294    }
295
296    pub fn expect_number(&mut self, max_value: usize) -> Result<usize, CompileError> {
297        let next_token = self.unwrap_next()?;
298        if let Token::Number(n) = next_token.token {
299            if n < max_value {
300                Ok(n)
301            } else {
302                Err(next_token.expected(format!("number lower than {max_value}")))
303            }
304        } else {
305            Err(next_token.expected("number"))
306        }
307    }
308
309    pub fn invalid_character(&self) -> CompileError {
310        CompileError {
311            line_num: self.line_num,
312            line_pos: self.pos - self.line_start,
313            error_type: ErrorType::InvalidCharacter(self.last_ch),
314        }
315    }
316
317    pub fn peek(&mut self) -> Option<Result<&TokenInfo, CompileError>> {
318        if self.next_token.is_empty() {
319            match self.next()? {
320                Ok(next_token) => self.next_token.push(next_token),
321                Err(err) => return Some(Err(err)),
322            }
323        }
324        self.next_token.last().map(Ok)
325    }
326}
327
328impl Iterator for Tokenizer<'_> {
329    type Item = Result<TokenInfo, CompileError>;
330
331    fn next(&mut self) -> Option<Self::Item> {
332        if let Some(prev_token) = self.next_token.pop() {
333            return Some(Ok(prev_token));
334        }
335
336        'outer: while let Some((ch, last_ch)) = self.next_byte() {
337            match self.state {
338                State::None => match ch {
339                    b'a'..=b'z' | b'0'..=b'9' | b'_' | b'.' | b'$' => {
340                        self.push_byte(ch);
341                    }
342                    b'A'..=b'Z' => {
343                        self.push_byte(ch.to_ascii_lowercase());
344                    }
345                    b':' => {
346                        if self.is_token_start()
347                            && matches!(self.peek_byte(), Some(b) if b.is_ascii_alphabetic())
348                        {
349                            self.token_is_tag();
350                        } else if self.token_bytes().eq_ignore_ascii_case(b"text") {
351                            self.state = State::MultiLine(StringType::default());
352                            self.text_start();
353                            while let Some((ch, _)) = self.next_byte() {
354                                if ch == b'\n' {
355                                    self.new_line();
356                                    self.reset_current_token();
357                                    continue 'outer;
358                                }
359                            }
360                        } else {
361                            return Some(Ok(self.get_token(Token::Colon)));
362                            //return Some(Err(self.invalid_character()));
363                        }
364                    }
365                    b'"' => {
366                        self.state = State::QuotedString(StringType::default());
367                        self.text_start();
368                        if let Some(token) = self.get_current_token() {
369                            return Some(Ok(token));
370                        }
371                    }
372                    b'{' => {
373                        return Some(Ok(self.get_token(Token::CurlyOpen)));
374                    }
375                    b'}' => {
376                        return Some(Ok(self.get_token(Token::CurlyClose)));
377                    }
378                    b';' => {
379                        return Some(Ok(self.get_token(Token::Semicolon)));
380                    }
381                    b',' => {
382                        return Some(Ok(self.get_token(Token::Comma)));
383                    }
384                    b'[' => {
385                        return Some(Ok(self.get_token(Token::BracketOpen)));
386                    }
387                    b']' => {
388                        return Some(Ok(self.get_token(Token::BracketClose)));
389                    }
390                    b'(' => {
391                        return Some(Ok(self.get_token(Token::ParenthesisOpen)));
392                    }
393                    b')' => {
394                        return Some(Ok(self.get_token(Token::ParenthesisClose)));
395                    }
396                    b'/' => {
397                        if let Some((b'*', _)) = self.next_byte() {
398                            self.last_ch = 0;
399                            self.state = State::BracketComment;
400                            self.text_start();
401                            if let Some(token) = self.get_current_token() {
402                                return Some(Ok(token));
403                            }
404                        } else {
405                            return Some(Err(self.invalid_character()));
406                        }
407                    }
408                    b'#' => {
409                        self.state = State::HashComment;
410                        if let Some(token) = self.get_current_token() {
411                            return Some(Ok(token));
412                        }
413                    }
414                    b'\n' => {
415                        self.new_line();
416                        if let Some(token) = self.get_current_token() {
417                            return Some(Ok(token));
418                        }
419                    }
420                    b' ' | b'\t' | b'\r' => {
421                        if let Some(token) = self.get_current_token() {
422                            return Some(Ok(token));
423                        }
424                    }
425                    _ => {
426                        return Some(Err(self.invalid_character()));
427                    }
428                },
429                State::BracketComment { .. } => match ch {
430                    b'/' if last_ch == b'*' => {
431                        self.state = State::None;
432                    }
433                    b'\n' => {
434                        self.new_line();
435                    }
436                    _ => (),
437                },
438                State::HashComment => {
439                    if ch == b'\n' {
440                        self.state = State::None;
441                        self.new_line();
442                    }
443                }
444                State::QuotedString(mut str_type) => match ch {
445                    b'"' if last_ch != b'\\' => {
446                        self.state = State::None;
447                        return Some(self.get_string(str_type));
448                    }
449                    b'\n' => {
450                        self.new_line();
451                        self.push_byte(b'\n');
452                        str_type.has_other = true;
453                        self.state = State::QuotedString(str_type);
454                    }
455                    b'{' if (last_ch == b'$' || last_ch == b'%') => {
456                        str_type.maybe_variable = true;
457                        self.state = State::QuotedString(str_type);
458                        self.push_byte(ch);
459                    }
460                    b'\\' => {
461                        if last_ch == b'\\' {
462                            self.push_byte(ch);
463                        }
464                    }
465                    b'0'..=b'9' => {
466                        if !str_type.has_digits {
467                            str_type.has_digits = true;
468                            self.state = State::QuotedString(str_type);
469                        }
470                        self.push_byte(ch);
471                    }
472                    b'.' => {
473                        if !str_type.has_dots {
474                            str_type.has_dots = true;
475                        } else {
476                            str_type.has_other = true;
477                        }
478                        self.state = State::QuotedString(str_type);
479                        self.push_byte(ch);
480                    }
481                    _ => {
482                        let ch = if last_ch == b'\\' {
483                            match ch {
484                                b'n' => b'\n',
485                                b'r' => b'\r',
486                                b't' => b'\t',
487                                _ => ch,
488                            }
489                        } else {
490                            ch
491                        };
492                        if !str_type.has_other && ch != b'-' {
493                            str_type.has_other = true;
494                            self.state = State::QuotedString(str_type);
495                        }
496                        self.push_byte(ch);
497                    }
498                },
499                State::MultiLine(mut str_type) => match ch {
500                    b'.' if last_ch == b'\n' => {
501                        let is_eof = match (self.next_byte(), self.peek_byte()) {
502                            (Some((b'\r', _)), Some(b'\n')) => {
503                                self.next_byte();
504                                true
505                            }
506                            (Some((b'\n', _)), _) => true,
507                            (Some((b'.', _)), _) => {
508                                self.push_byte(b'.');
509                                false
510                            }
511                            (Some((ch, _)), _) => {
512                                self.push_byte(b'.');
513                                self.push_byte(ch);
514                                false
515                            }
516                            _ => false,
517                        };
518
519                        if is_eof {
520                            self.new_line();
521                            self.state = State::None;
522                            return Some(self.get_string(str_type));
523                        }
524                    }
525                    b'\n' => {
526                        self.new_line();
527                        self.push_byte(b'\n');
528                    }
529                    b'{' if (last_ch == b'$' || last_ch == b'%') => {
530                        str_type.maybe_variable = true;
531                        self.state = State::MultiLine(str_type);
532                        self.push_byte(ch);
533                    }
534                    b'0'..=b'9' => {
535                        if !str_type.has_digits {
536                            str_type.has_digits = true;
537                            self.state = State::MultiLine(str_type);
538                        }
539                        self.push_byte(ch);
540                    }
541                    b'.' => {
542                        if !str_type.has_dots {
543                            str_type.has_dots = true;
544                        } else {
545                            str_type.has_other = true;
546                        }
547                        self.state = State::MultiLine(str_type);
548                        self.push_byte(ch);
549                    }
550                    _ => {
551                        if !str_type.has_other && ch != b'-' {
552                            str_type.has_other = true;
553                            self.state = State::MultiLine(str_type);
554                        }
555                        self.push_byte(ch);
556                    }
557                },
558            }
559        }
560
561        match self.state {
562            State::BracketComment | State::QuotedString(_) | State::MultiLine(_) => {
563                Some(Err(CompileError {
564                    line_num: self.text_line_num,
565                    line_pos: self.text_line_pos,
566                    error_type: (&self.state).into(),
567                }))
568            }
569            _ => None,
570        }
571    }
572}
573
574impl From<&State> for ErrorType {
575    fn from(state: &State) -> Self {
576        match state {
577            State::BracketComment => ErrorType::UnterminatedComment,
578            State::QuotedString(_) => ErrorType::UnterminatedString,
579            State::MultiLine(_) => ErrorType::UnterminatedMultiline,
580            _ => unreachable!(),
581        }
582    }
583}