piccolo/compiler/
lexer.rs

1use std::{
2    char, fmt, i32, i64,
3    io::{self, Read},
4    str,
5};
6
7use gc_arena::Collect;
8use thiserror::Error;
9
10use super::StringInterner;
11
/// A lexical token produced by `Lexer::read_token`.
///
/// `S` is the string type carried by the `Name` and `String` variants.
#[derive(Clone)]
pub enum Token<S> {
    // Word-like tokens. NOTE(review): presumably the reserved words, resolved from identifiers by
    // `get_reserved_word_token`, which is defined outside this view -- confirm spellings there.
    Break,
    Do,
    Else,
    ElseIf,
    End,
    Function,
    Goto,
    If,
    In,
    Local,
    Nil,
    For,
    While,
    Repeat,
    Until,
    Return,
    Then,
    True,
    False,
    Not,
    And,
    Or,
    // Operators and punctuation. A trailing comment gives the spelling wherever `read_token` lexes
    // the token directly; the uncommented ones are presumably single characters mapped by
    // `get_char_token` (not visible here) -- confirm there.
    Minus, // `-`
    Add,
    Mul,
    Div,  // `/`
    IDiv, // `//`
    Pow,
    Mod,
    Len,
    BitNotXor, // `~` (a single token for both the unary and binary use)
    BitAnd,
    BitOr,
    ShiftRight,   // `>>`
    ShiftLeft,    // `<<`
    Concat,       // `..`
    Dots,         // `...`
    Assign,       // `=`
    LessThan,     // `<`
    LessEqual,    // `<=`
    GreaterThan,  // `>`
    GreaterEqual, // `>=`
    Equal,        // `==`
    NotEqual,     // `~=`
    Dot,          // `.`
    SemiColon,
    Colon,       // `:`
    DoubleColon, // `::`
    Comma,
    LeftParen,
    RightParen,
    LeftBracket, // `[` when it does not open a long string
    RightBracket,
    LeftBrace,
    RightBrace,
    /// Numerals are only lexed as integers in the range [-(2^63-1), 2^63-1], otherwise they will be
    /// lexed as floats.
    Integer(i64),
    /// A numeral with a radix point or an exponent, or one that did not parse as an integer.
    Float(f64),
    /// An identifier that is not a reserved word.
    Name(S),
    /// The contents of a short (quoted) or long (bracketed) string literal.
    String(S),
}
76
77impl<S: AsRef<[u8]>> PartialEq for Token<S> {
78    fn eq(&self, other: &Self) -> bool {
79        match (self, other) {
80            (Token::Break, Token::Break) => true,
81            (Token::Do, Token::Do) => true,
82            (Token::Else, Token::Else) => true,
83            (Token::ElseIf, Token::ElseIf) => true,
84            (Token::End, Token::End) => true,
85            (Token::Function, Token::Function) => true,
86            (Token::Goto, Token::Goto) => true,
87            (Token::If, Token::If) => true,
88            (Token::In, Token::In) => true,
89            (Token::Local, Token::Local) => true,
90            (Token::Nil, Token::Nil) => true,
91            (Token::For, Token::For) => true,
92            (Token::While, Token::While) => true,
93            (Token::Repeat, Token::Repeat) => true,
94            (Token::Until, Token::Until) => true,
95            (Token::Return, Token::Return) => true,
96            (Token::Then, Token::Then) => true,
97            (Token::True, Token::True) => true,
98            (Token::False, Token::False) => true,
99            (Token::Not, Token::Not) => true,
100            (Token::And, Token::And) => true,
101            (Token::Or, Token::Or) => true,
102            (Token::Minus, Token::Minus) => true,
103            (Token::Add, Token::Add) => true,
104            (Token::Mul, Token::Mul) => true,
105            (Token::Div, Token::Div) => true,
106            (Token::IDiv, Token::IDiv) => true,
107            (Token::Pow, Token::Pow) => true,
108            (Token::Mod, Token::Mod) => true,
109            (Token::Len, Token::Len) => true,
110            (Token::BitNotXor, Token::BitNotXor) => true,
111            (Token::BitAnd, Token::BitAnd) => true,
112            (Token::BitOr, Token::BitOr) => true,
113            (Token::ShiftRight, Token::ShiftRight) => true,
114            (Token::ShiftLeft, Token::ShiftLeft) => true,
115            (Token::Concat, Token::Concat) => true,
116            (Token::Dots, Token::Dots) => true,
117            (Token::Assign, Token::Assign) => true,
118            (Token::LessThan, Token::LessThan) => true,
119            (Token::LessEqual, Token::LessEqual) => true,
120            (Token::GreaterThan, Token::GreaterThan) => true,
121            (Token::GreaterEqual, Token::GreaterEqual) => true,
122            (Token::Equal, Token::Equal) => true,
123            (Token::NotEqual, Token::NotEqual) => true,
124            (Token::Dot, Token::Dot) => true,
125            (Token::SemiColon, Token::SemiColon) => true,
126            (Token::Colon, Token::Colon) => true,
127            (Token::DoubleColon, Token::DoubleColon) => true,
128            (Token::Comma, Token::Comma) => true,
129            (Token::LeftParen, Token::LeftParen) => true,
130            (Token::RightParen, Token::RightParen) => true,
131            (Token::LeftBracket, Token::LeftBracket) => true,
132            (Token::RightBracket, Token::RightBracket) => true,
133            (Token::LeftBrace, Token::LeftBrace) => true,
134            (Token::RightBrace, Token::RightBrace) => true,
135            (Token::Integer(a), Token::Integer(b)) => a == b,
136            (Token::Float(a), Token::Float(b)) => a.total_cmp(b).is_eq(),
137            (Token::Name(a), Token::Name(b)) => a.as_ref() == b.as_ref(),
138            (Token::String(a), Token::String(b)) => a.as_ref() == b.as_ref(),
139            _ => false,
140        }
141    }
142}
143
144impl<S: AsRef<[u8]>> fmt::Debug for Token<S> {
145    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
146        match self {
147            Token::Break => write!(f, "Break"),
148            Token::Do => write!(f, "Do"),
149            Token::Else => write!(f, "Else"),
150            Token::ElseIf => write!(f, "ElseIf"),
151            Token::End => write!(f, "End"),
152            Token::Function => write!(f, "Function"),
153            Token::Goto => write!(f, "Goto"),
154            Token::If => write!(f, "If"),
155            Token::In => write!(f, "In"),
156            Token::Local => write!(f, "Local"),
157            Token::Nil => write!(f, "Nil"),
158            Token::For => write!(f, "For"),
159            Token::While => write!(f, "While"),
160            Token::Repeat => write!(f, "Repeat"),
161            Token::Until => write!(f, "Until"),
162            Token::Return => write!(f, "Return"),
163            Token::Then => write!(f, "Then"),
164            Token::True => write!(f, "True"),
165            Token::False => write!(f, "False"),
166            Token::Not => write!(f, "Not"),
167            Token::And => write!(f, "And"),
168            Token::Or => write!(f, "Or"),
169            Token::Minus => write!(f, "Minus"),
170            Token::Add => write!(f, "Add"),
171            Token::Mul => write!(f, "Mul"),
172            Token::Div => write!(f, "Div"),
173            Token::IDiv => write!(f, "IDiv"),
174            Token::Pow => write!(f, "Pow"),
175            Token::Mod => write!(f, "Mod"),
176            Token::Len => write!(f, "Len"),
177            Token::BitNotXor => write!(f, "BitNotXor"),
178            Token::BitAnd => write!(f, "BitAnd"),
179            Token::BitOr => write!(f, "BitOr"),
180            Token::ShiftRight => write!(f, "ShiftRight"),
181            Token::ShiftLeft => write!(f, "ShiftLeft"),
182            Token::Concat => write!(f, "Concat"),
183            Token::Dots => write!(f, "Dots"),
184            Token::Assign => write!(f, "Assign"),
185            Token::LessThan => write!(f, "LessThan"),
186            Token::LessEqual => write!(f, "LessEqual"),
187            Token::GreaterThan => write!(f, "GreaterThan"),
188            Token::GreaterEqual => write!(f, "GreaterEqual"),
189            Token::Equal => write!(f, "Equal"),
190            Token::NotEqual => write!(f, "NotEqual"),
191            Token::Dot => write!(f, "Dot"),
192            Token::SemiColon => write!(f, "SemiColon"),
193            Token::Colon => write!(f, "Colon"),
194            Token::DoubleColon => write!(f, "DoubleColon"),
195            Token::Comma => write!(f, "Comma"),
196            Token::LeftParen => write!(f, "LeftParen"),
197            Token::RightParen => write!(f, "RightParen"),
198            Token::LeftBracket => write!(f, "LeftBracket"),
199            Token::RightBracket => write!(f, "RightBracket"),
200            Token::LeftBrace => write!(f, "LeftBrace"),
201            Token::RightBrace => write!(f, "RightBrace"),
202            Token::Integer(i) => write!(f, "Integer({})", *i),
203            Token::Float(d) => write!(f, "Float({})", *d),
204            Token::Name(n) => write!(f, "Name({:?})", String::from_utf8_lossy(n.as_ref())),
205            Token::String(s) => write!(f, "String({:?})", String::from_utf8_lossy(s.as_ref())),
206        }
207    }
208}
209
/// Converts a raw source byte to the `char` with the same code point (U+0000 through U+00FF) for
/// use in error messages.
fn print_char(c: u8) -> char {
    // Every `u8` value is a valid Unicode scalar value, so this conversion cannot fail.
    char::from(c)
}
213
/// Errors produced while lexing.
#[derive(Debug, Error)]
pub enum LexError {
    /// A quoted string hit a line end or the end of input before its closing quote. The payload
    /// is the opening quote byte.
    #[error("short string not finished, expected matching {}", print_char(*.0))]
    UnfinishedShortString(u8),
    /// The payload byte cannot start any token.
    #[error("unexpected character: {}", print_char(*.0))]
    UnexpectedCharacter(u8),
    /// An escape sequence required a hexadecimal digit (for example `\x` needs two) but the
    /// input had something else.
    #[error("hexadecimal digit expected")]
    HexDigitExpected,
    /// A `\u` escape was not followed by `{`.
    #[error("missing '{{' in \\u{{xxxx}} escape")]
    EscapeUnicodeStart,
    /// A `\u{` escape met a non-hexadecimal byte or the end of input before its closing `}`.
    #[error("missing '}}' in \\u{{xxxx}} escape")]
    EscapeUnicodeEnd,
    /// The value of a `\u{...}` escape is not accepted by `char::from_u32`.
    #[error("invalid unicode value in \\u{{xxxx}} escape")]
    EscapeUnicodeInvalid,
    /// A decimal `\ddd` escape produced a value above 255.
    #[error("\\ddd escape out of 0-255 range")]
    EscapeDecimalTooLarge,
    /// A backslash in a quoted string was followed by a byte that starts no known escape.
    #[error("invalid escape sequence")]
    InvalidEscape,
    /// A `[` followed by zero or more `=` was not then followed by a second `[`.
    #[error("invalid long string delimiter")]
    InvalidLongStringDelimiter,
    /// The end of input was reached inside a long string or long comment.
    #[error("unfinished long string")]
    UnfinishedLongString,
    /// A numeral could not be converted to a number.
    #[error("malformed number")]
    BadNumber,
    /// Reading from the underlying source failed.
    #[error("IO Error: {0}")]
    IOError(#[from] io::Error),
}
241
/// A 0-indexed line number of the current source input.
///
/// The wrapped value is the raw 0-indexed count; the `Display` implementation prints it 1-indexed.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Collect)]
#[collect(require_static)]
pub struct LineNumber(pub u64);
246
247impl fmt::Display for LineNumber {
248    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
249        write!(f, "{}", u128::from(self.0) + 1)
250    }
251}
252
/// A lexer that pulls bytes from a reader `R` and produces `Token`s whose names and strings are
/// created through the interner `S`.
pub struct Lexer<R, S> {
    // The input. Set to `None` once the end of input is reached, a non-interrupt read error
    // occurs, or the lexer is reset.
    source: Option<R>,
    // Turns the contents of `string_buffer` into the string payload of a token.
    interner: S,
    // Bytes already read from `source` but not yet consumed; index 0 is the next byte.
    peek_buffer: Vec<u8>,
    // Scratch space holding the text of the name, string, or numeral currently being read.
    string_buffer: Vec<u8>,
    // 0-indexed line of the current position; incremented each time a line end is read.
    line_number: u64,
}
260
261impl<R, S> Lexer<R, S>
262where
263    R: Read,
264    S: StringInterner,
265{
266    pub fn new(source: R, interner: S) -> Lexer<R, S> {
267        Lexer {
268            source: Some(source),
269            interner,
270            peek_buffer: Vec::new(),
271            string_buffer: Vec::new(),
272            line_number: 0,
273        }
274    }
275
276    /// Current line number of the source file.
277    pub fn line_number(&self) -> LineNumber {
278        LineNumber(self.line_number)
279    }
280
281    pub fn skip_whitespace(&mut self) -> Result<(), LexError> {
282        let mut do_skip_whitespace = || {
283            while let Some(c) = self.peek(0)? {
284                match c {
285                    b' ' | b'\t' | VERTICAL_TAB | FORM_FEED => {
286                        self.advance(1);
287                    }
288
289                    b'\n' | b'\r' => {
290                        self.read_line_end(false)?;
291                    }
292
293                    b'-' => {
294                        if self.peek(1)? != Some(b'-') {
295                            break;
296                        } else {
297                            self.advance(2);
298
299                            match (self.peek(0)?, self.peek(1)?) {
300                                (Some(b'['), Some(b'=')) | (Some(b'['), Some(b'[')) => {
301                                    // long comment
302                                    self.read_long_string(false)?;
303                                }
304                                _ => {
305                                    // Short comment, read until end of line
306                                    while let Some(c) = self.peek(0)? {
307                                        if is_newline(c) {
308                                            break;
309                                        } else {
310                                            self.advance(1);
311                                        }
312                                    }
313                                }
314                            }
315                        }
316                    }
317
318                    _ => break,
319                }
320            }
321
322            Ok(())
323        };
324
325        match do_skip_whitespace() {
326            Ok(()) => Ok(()),
327            Err(err) => {
328                self.reset();
329                Err(err)
330            }
331        }
332    }
333
334    /// Reads the next token, or None if the end of the source has been reached.
335    pub fn read_token(&mut self) -> Result<Option<Token<S::String>>, LexError> {
336        self.skip_whitespace()?;
337
338        let mut do_read_token = || {
339            if let Some(c) = self.peek(0)? {
340                Ok(Some(match c {
341                    b' ' | b'\t' | VERTICAL_TAB | FORM_FEED | b'\n' | b'\r' => {
342                        unreachable!("whitespace should have been skipped");
343                    }
344
345                    b'-' => {
346                        if self.peek(1)? != Some(b'-') {
347                            self.advance(1);
348                            Token::Minus
349                        } else {
350                            unreachable!("whitespace should have been skipped");
351                        }
352                    }
353
354                    b'[' => {
355                        let next = self.peek(1)?;
356                        if next == Some(b'=') || next == Some(b'[') {
357                            self.read_long_string(true)?;
358                            Token::String(self.take_string())
359                        } else {
360                            self.advance(1);
361                            Token::LeftBracket
362                        }
363                    }
364
365                    b'=' => {
366                        self.advance(1);
367                        if self.peek(0)? == Some(b'=') {
368                            self.advance(1);
369                            Token::Equal
370                        } else {
371                            Token::Assign
372                        }
373                    }
374
375                    b'<' => {
376                        self.advance(1);
377                        let next = self.peek(0)?;
378                        if next == Some(b'=') {
379                            self.advance(1);
380                            Token::LessEqual
381                        } else if next == Some(b'<') {
382                            self.advance(1);
383                            Token::ShiftLeft
384                        } else {
385                            Token::LessThan
386                        }
387                    }
388
389                    b'>' => {
390                        self.advance(1);
391                        let next = self.peek(0)?;
392                        if next == Some(b'=') {
393                            self.advance(1);
394                            Token::GreaterEqual
395                        } else if next == Some(b'>') {
396                            self.advance(1);
397                            Token::ShiftRight
398                        } else {
399                            Token::GreaterThan
400                        }
401                    }
402
403                    b'/' => {
404                        self.advance(1);
405                        if self.peek(0)? == Some(b'/') {
406                            self.advance(1);
407                            Token::IDiv
408                        } else {
409                            Token::Div
410                        }
411                    }
412
413                    b'~' => {
414                        self.advance(1);
415                        if self.peek(0)? == Some(b'=') {
416                            self.advance(1);
417                            Token::NotEqual
418                        } else {
419                            Token::BitNotXor
420                        }
421                    }
422
423                    b':' => {
424                        self.advance(1);
425                        if self.peek(0)? == Some(b':') {
426                            self.advance(1);
427                            Token::DoubleColon
428                        } else {
429                            Token::Colon
430                        }
431                    }
432
433                    b'"' | b'\'' => {
434                        self.read_short_string()?;
435                        Token::String(self.take_string())
436                    }
437
438                    b'.' => {
439                        if self.peek(1)? == Some(b'.') {
440                            if self.peek(2)? == Some(b'.') {
441                                self.advance(3);
442                                Token::Dots
443                            } else {
444                                self.advance(2);
445                                Token::Concat
446                            }
447                        } else if self.peek(1)?.map(is_digit).unwrap_or(false) {
448                            self.read_numeral()?
449                        } else {
450                            self.advance(1);
451                            Token::Dot
452                        }
453                    }
454
455                    c => {
456                        if is_digit(c) {
457                            self.read_numeral()?
458                        } else if let Some(t) = get_char_token(c) {
459                            self.advance(1);
460                            t
461                        } else if is_alpha(c) {
462                            self.string_buffer.clear();
463                            self.string_buffer.push(c);
464                            self.advance(1);
465
466                            while let Some(c) = self.peek(0)? {
467                                if is_alpha(c) || is_digit(c) {
468                                    self.string_buffer.push(c);
469                                    self.advance(1);
470                                } else {
471                                    break;
472                                }
473                            }
474
475                            if let Some(t) = get_reserved_word_token(self.string_buffer.as_slice())
476                            {
477                                t
478                            } else {
479                                Token::Name(self.take_string())
480                            }
481                        } else {
482                            return Err(LexError::UnexpectedCharacter(c));
483                        }
484                    }
485                }))
486            } else {
487                Ok(None)
488            }
489        };
490
491        match do_read_token() {
492            Ok(Some(token)) => Ok(Some(token)),
493            res => {
494                self.reset();
495                res
496            }
497        }
498    }
499
500    // End of stream encountered, clear any input handles and temp buffers
501    fn reset(&mut self) {
502        self.source = None;
503        self.peek_buffer.clear();
504        self.string_buffer.clear();
505    }
506
507    // Read any of "\n", "\r", "\n\r", or "\r\n" as a single newline, and increment the current line
508    // number. If `append_buffer` is true, then appends the read newline to the string buffer.
509    fn read_line_end(&mut self, append_string: bool) -> Result<(), LexError> {
510        let newline = self.peek(0).unwrap().unwrap();
511        assert!(is_newline(newline));
512        self.advance(1);
513        // We always append a single plain `\n` character for any newline characters, matching the
514        // behavior of PUC-Rio Lua.
515        if append_string {
516            self.string_buffer.push(b'\n');
517        }
518
519        if let Some(next_newline) = self.peek(0)? {
520            if is_newline(next_newline) && next_newline != newline {
521                self.advance(1);
522            }
523        }
524
525        self.line_number += 1;
526        Ok(())
527    }
528
529    // Read a string on a single line delimited by ' or " that allows for \ escaping of certain
530    // characters. Always reads the contained string into the string buffer.
531    fn read_short_string(&mut self) -> Result<(), LexError> {
532        let start_quote = self.peek(0).unwrap().unwrap();
533        assert!(start_quote == b'\'' || start_quote == b'"');
534        self.advance(1);
535
536        self.string_buffer.clear();
537
538        loop {
539            let c = if let Some(c) = self.peek(0)? {
540                c
541            } else {
542                return Err(LexError::UnfinishedShortString(start_quote));
543            };
544
545            if is_newline(c) {
546                return Err(LexError::UnfinishedShortString(start_quote));
547            }
548
549            self.advance(1);
550            if c == b'\\' {
551                match self
552                    .peek(0)?
553                    .ok_or_else(|| LexError::UnfinishedShortString(start_quote))?
554                {
555                    b'a' => {
556                        self.advance(1);
557                        self.string_buffer.push(ALERT_BEEP);
558                    }
559
560                    b'b' => {
561                        self.advance(1);
562                        self.string_buffer.push(BACKSPACE);
563                    }
564
565                    b'f' => {
566                        self.advance(1);
567                        self.string_buffer.push(FORM_FEED);
568                    }
569
570                    b'n' => {
571                        self.advance(1);
572                        self.string_buffer.push(b'\n');
573                    }
574
575                    b'r' => {
576                        self.advance(1);
577                        self.string_buffer.push(b'\r');
578                    }
579
580                    b't' => {
581                        self.advance(1);
582                        self.string_buffer.push(b'\t');
583                    }
584
585                    b'v' => {
586                        self.advance(1);
587                        self.string_buffer.push(VERTICAL_TAB);
588                    }
589
590                    b'\\' => {
591                        self.advance(1);
592                        self.string_buffer.push(b'\\');
593                    }
594
595                    b'\'' => {
596                        self.advance(1);
597                        self.string_buffer.push(b'\'');
598                    }
599
600                    b'"' => {
601                        self.advance(1);
602                        self.string_buffer.push(b'"');
603                    }
604
605                    b'\n' | b'\r' => {
606                        self.read_line_end(true)?;
607                    }
608
609                    b'x' => {
610                        self.advance(1);
611                        let first = self
612                            .peek(0)?
613                            .and_then(from_hex_digit)
614                            .ok_or(LexError::HexDigitExpected)?;
615                        let second = self
616                            .peek(1)?
617                            .and_then(from_hex_digit)
618                            .ok_or(LexError::HexDigitExpected)?;
619                        self.string_buffer.push(first << 4 | second);
620                        self.advance(2);
621                    }
622
623                    b'u' => {
624                        if self.peek(1)? != Some(b'{') {
625                            return Err(LexError::EscapeUnicodeStart);
626                        }
627                        self.advance(2);
628
629                        let mut u: u32 = 0;
630                        loop {
631                            if let Some(c) = self.peek(0)? {
632                                if c == b'}' {
633                                    self.advance(1);
634                                    break;
635                                } else if let Some(h) = from_hex_digit(c) {
636                                    u = (u << 4) | h as u32;
637                                    self.advance(1);
638                                } else {
639                                    return Err(LexError::EscapeUnicodeEnd);
640                                }
641                            } else {
642                                return Err(LexError::EscapeUnicodeEnd);
643                            }
644                        }
645
646                        let c = char::from_u32(u).ok_or(LexError::EscapeUnicodeInvalid)?;
647                        let mut buf = [0; 4];
648                        for &b in c.encode_utf8(&mut buf).as_bytes() {
649                            self.string_buffer.push(b);
650                        }
651                    }
652
653                    b'z' => {
654                        self.advance(1);
655                        while let Some(c) = self.peek(0)? {
656                            if is_newline(c) {
657                                self.read_line_end(false)?;
658                            } else if is_space(c) {
659                                self.advance(1);
660                            } else {
661                                break;
662                            }
663                        }
664                    }
665
666                    c => {
667                        if is_digit(c) {
668                            let mut u: u16 = 0;
669                            for _ in 0..3 {
670                                if let Some(d) = self.peek(0)?.and_then(from_digit) {
671                                    u = 10 * u + d as u16;
672                                    self.advance(1);
673                                } else {
674                                    break;
675                                }
676                            }
677                            if u > 255 {
678                                return Err(LexError::EscapeDecimalTooLarge);
679                            }
680
681                            self.string_buffer.push(u as u8);
682                        } else {
683                            return Err(LexError::InvalidEscape);
684                        }
685                    }
686                }
687            } else if c == start_quote {
688                break;
689            } else {
690                self.string_buffer.push(c);
691            }
692        }
693
694        Ok(())
695    }
696
697    // Read a [=*[...]=*] sequence with matching numbers of '='. If `into_string` is true, writes
698    // the contained string into the string buffer.
699    fn read_long_string(&mut self, into_string: bool) -> Result<(), LexError> {
700        assert_eq!(self.peek(0).unwrap().unwrap(), b'[');
701        self.advance(1);
702
703        if into_string {
704            self.string_buffer.clear();
705        }
706
707        let mut open_sep_length = 0;
708        while self.peek(0)? == Some(b'=') {
709            self.advance(1);
710            open_sep_length += 1;
711        }
712
713        if self.peek(0)? != Some(b'[') {
714            return Err(LexError::InvalidLongStringDelimiter);
715        }
716        self.advance(1);
717
718        if matches!(self.peek(0)?, Some(b'\n' | b'\r')) {
719            // If the long string starts imediately with a newline, we read it and do *not* put it
720            // into the string buffer, matching the behavior of PUC-Rio Lua.
721            self.read_line_end(false)?;
722        }
723
724        loop {
725            let c = if let Some(c) = self.peek(0)? {
726                c
727            } else {
728                return Err(LexError::UnfinishedLongString);
729            };
730
731            match c {
732                b'\n' | b'\r' => {
733                    self.read_line_end(into_string)?;
734                }
735
736                b']' => {
737                    let mut close_sep_length = 0;
738                    self.advance(1);
739                    while self.peek(0)? == Some(b'=') {
740                        self.advance(1);
741                        close_sep_length += 1;
742                    }
743
744                    if open_sep_length == close_sep_length && self.peek(0)? == Some(b']') {
745                        self.advance(1);
746                        break;
747                    } else {
748                        // If it turns out this is not a valid long string close delimiter, we need
749                        // to add the invalid close delimiter to the string.
750                        if into_string {
751                            self.string_buffer.push(b']');
752                            for _ in 0..close_sep_length {
753                                self.string_buffer.push(b'=');
754                            }
755                        }
756                    }
757                }
758
759                c => {
760                    if into_string {
761                        self.string_buffer.push(c);
762                    }
763                    self.advance(1);
764                }
765            }
766        }
767
768        Ok(())
769    }
770
771    // Reads a hex or decimal integer or floating point identifier. Allows decimal integers (123),
772    // hex integers (0xdeadbeef), decimal floating point with optional exponent and exponent sign
773    // (3.21e+1), and hex floats with optional exponent and exponent sign (0xe.2fp-1c).
774    fn read_numeral(&mut self) -> Result<Token<S::String>, LexError> {
775        let p1 = self.peek(0).unwrap().unwrap();
776        assert!(p1 == b'.' || is_digit(p1));
777
778        self.string_buffer.clear();
779
780        let p2 = self.peek(1)?;
781        let is_hex = p1 == b'0' && (p2 == Some(b'x') || p2 == Some(b'X'));
782        if is_hex {
783            self.string_buffer.push(p1);
784            self.string_buffer.push(p2.unwrap());
785            self.advance(2);
786        }
787
788        let mut has_radix = false;
789        while let Some(c) = self.peek(0)? {
790            if c == b'.' && !has_radix {
791                self.string_buffer.push(b'.');
792                has_radix = true;
793                self.advance(1);
794            } else if (!is_hex && is_digit(c)) || (is_hex && is_hex_digit(c)) {
795                self.string_buffer.push(c);
796                self.advance(1);
797            } else {
798                break;
799            }
800        }
801
802        let mut has_exp = false;
803        if let Some(exp_begin) = self.peek(0)? {
804            if (is_hex && (exp_begin == b'p' || exp_begin == b'P'))
805                || (!is_hex && (exp_begin == b'e' || exp_begin == b'E'))
806            {
807                self.string_buffer.push(exp_begin);
808                has_exp = true;
809                self.advance(1);
810
811                if let Some(sign) = self.peek(0)? {
812                    if sign == b'+' || sign == b'-' {
813                        self.string_buffer.push(sign);
814                        self.advance(1);
815                    }
816                }
817
818                while let Some(c) = self.peek(0)? {
819                    if is_digit(c) {
820                        self.string_buffer.push(c);
821                        self.advance(1);
822                    } else {
823                        break;
824                    }
825                }
826            }
827        }
828
829        if !has_exp && !has_radix {
830            if is_hex {
831                if let Some(i) = read_hex_integer(&self.string_buffer) {
832                    return Ok(Token::Integer(i));
833                }
834            }
835            if let Some(i) = read_dec_integer(&self.string_buffer) {
836                return Ok(Token::Integer(i));
837            }
838        }
839
840        Ok(Token::Float(
841            if is_hex {
842                read_hex_float(&self.string_buffer)
843            } else {
844                read_dec_float(&self.string_buffer)
845            }
846            .ok_or(LexError::BadNumber)?,
847        ))
848    }
849
850    fn peek(&mut self, n: usize) -> Result<Option<u8>, LexError> {
851        if let Some(source) = self.source.as_mut() {
852            while self.peek_buffer.len() <= n {
853                let mut c = [0];
854                match source.read(&mut c) {
855                    Ok(0) => {
856                        self.source = None;
857                        break;
858                    }
859                    Ok(_) => {
860                        self.peek_buffer.push(c[0]);
861                    }
862                    Err(e) => {
863                        if e.kind() != io::ErrorKind::Interrupted {
864                            self.source = None;
865                            return Err(LexError::IOError(e));
866                        }
867                    }
868                }
869            }
870        }
871
872        Ok(self.peek_buffer.get(n).copied())
873    }
874
875    fn advance(&mut self, n: usize) {
876        assert!(
877            n <= self.peek_buffer.len(),
878            "cannot advance over un-peeked characters"
879        );
880        self.peek_buffer.drain(0..n);
881    }
882
883    fn take_string(&mut self) -> S::String {
884        let s = self.interner.intern(&self.string_buffer);
885        self.string_buffer.clear();
886        s
887    }
888}
889
890pub fn read_integer(s: &[u8]) -> Option<i64> {
891    read_hex_integer(s).or_else(|| read_dec_integer(s))
892}
893
894pub fn read_dec_integer(s: &[u8]) -> Option<i64> {
895    let (is_neg, s) = read_neg(s);
896
897    let mut i: i64 = 0;
898    for &c in s {
899        let d = from_digit(c)? as i64;
900        i = i.checked_mul(10)?.checked_add(d)?;
901    }
902
903    if is_neg {
904        i = i.checked_neg()?;
905    }
906
907    Some(i)
908}
909
910pub fn read_hex_integer(s: &[u8]) -> Option<i64> {
911    let (is_neg, s) = read_neg(s);
912
913    if s[0] != b'0' || (s[1] != b'x' && s[1] != b'X') {
914        return None;
915    }
916
917    let mut i: i64 = 0;
918    for &c in &s[2..] {
919        let d = from_hex_digit(c)? as i64;
920        i = i.checked_mul(16)?.checked_add(d)?;
921    }
922
923    if is_neg {
924        i = i.checked_neg()?;
925    }
926
927    Some(i)
928}
929
930pub fn read_float(s: &[u8]) -> Option<f64> {
931    read_hex_float(s).or_else(|| read_dec_float(s))
932}
933
/// Parses a decimal floating point numeral such as `12345.`, `.5` or `3.1415e-2`.
///
/// Returns `None` if `s` contains any byte other than decimal digits, `.`, `e` / `E`, `+` or
/// `-`, or if the standard `f64` parser rejects the text.
pub fn read_dec_float(s: &[u8]) -> Option<f64> {
    // The standard `f64` parser also accepts the words "inf", "infinity" and "nan" (in any
    // case), which are not numerals. Restrict the alphabet first and let the parser validate
    // the structure.
    let is_numeral_byte =
        |c: &u8| c.is_ascii_digit() || matches!(*c, b'.' | b'e' | b'E' | b'+' | b'-');
    if !s.iter().all(is_numeral_byte) {
        return None;
    }

    str::from_utf8(s).ok()?.parse().ok()
}
938
939pub fn read_hex_float(s: &[u8]) -> Option<f64> {
940    const MAX_SIGNIFICANT_DIGITS: u32 = 30;
941
942    let (is_neg, s) = read_neg(s);
943
944    if s.len() < 2 {
945        return None;
946    }
947
948    if s[0] != b'0' || (s[1] != b'x' && s[1] != b'X') {
949        return None;
950    }
951
952    let mut significant_digits: u32 = 0;
953    let mut non_significant_digits: u32 = 0;
954    let mut found_dot = false;
955    let mut base: f64 = 0.0;
956    let mut exp: i32 = 0;
957    let mut i = 2;
958
959    while i < s.len() {
960        let c = s[i];
961        if c == b'.' {
962            if found_dot {
963                return None;
964            }
965            found_dot = true;
966        } else if let Some(d) = from_hex_digit(c) {
967            if significant_digits == 0 && d == 0 {
968                non_significant_digits += 1;
969            } else if significant_digits < MAX_SIGNIFICANT_DIGITS {
970                significant_digits += 1;
971                base = (base * 16.0) + d as f64;
972            } else {
973                // ignore the digit, but count it towards the expontent
974                exp = exp.checked_add(4)?;
975            }
976            if found_dot {
977                // Correct exponent for the fractional part
978                exp = exp.checked_sub(4)?;
979            }
980        } else {
981            break;
982        }
983        i += 1;
984    }
985
986    if non_significant_digits + significant_digits == 0 {
987        return None;
988    }
989
990    if i + 1 < s.len() && (s[i] == b'p' || s[i] == b'P') {
991        let (exp_neg, exp_s) = read_neg(&s[i + 1..]);
992        let mut exp1: i32 = 0;
993        for &c in exp_s {
994            let d = from_digit(c)?;
995            exp1 = exp1.saturating_mul(10).saturating_add(d as i32);
996        }
997        if exp_neg {
998            exp1 = -exp1;
999        }
1000        exp = exp.saturating_add(exp1);
1001    } else if i != s.len() {
1002        return None;
1003    }
1004
1005    if is_neg {
1006        base = -base;
1007    }
1008
1009    Some(base * (exp as f64).exp2())
1010}
1011
/// Splits an optional leading sign off `s`.
///
/// Returns `(true, rest)` for a leading `-`, `(false, rest)` for a leading `+`, and
/// `(false, s)` unchanged when `s` is empty or starts with anything else.
fn read_neg(s: &[u8]) -> (bool, &[u8]) {
    match s {
        [b'-', rest @ ..] => (true, rest),
        [b'+', rest @ ..] => (false, rest),
        _ => (false, s),
    }
}
1025
// Byte values of ASCII control characters, named for readability.
// ASCII BEL (bell).
const ALERT_BEEP: u8 = 0x07;
// ASCII BS (backspace).
const BACKSPACE: u8 = 0x08;
// ASCII VT (vertical tab); treated as whitespace by `is_space`.
const VERTICAL_TAB: u8 = 0x0b;
// ASCII FF (form feed); treated as whitespace by `is_space`.
const FORM_FEED: u8 = 0x0c;
1030
1031fn get_char_token<S>(c: u8) -> Option<Token<S>> {
1032    match c {
1033        b'-' => Some(Token::Minus),
1034        b'+' => Some(Token::Add),
1035        b'*' => Some(Token::Mul),
1036        b'^' => Some(Token::Pow),
1037        b'%' => Some(Token::Mod),
1038        b'&' => Some(Token::BitAnd),
1039        b'|' => Some(Token::BitOr),
1040        b',' => Some(Token::Comma),
1041        b';' => Some(Token::SemiColon),
1042        b'#' => Some(Token::Len),
1043        b'(' => Some(Token::LeftParen),
1044        b')' => Some(Token::RightParen),
1045        b']' => Some(Token::RightBracket),
1046        b'{' => Some(Token::LeftBrace),
1047        b'}' => Some(Token::RightBrace),
1048        _ => None,
1049    }
1050}
1051
1052fn get_reserved_word_token<S>(word: &[u8]) -> Option<Token<S>> {
1053    match word {
1054        b"break" => Some(Token::Break),
1055        b"do" => Some(Token::Do),
1056        b"else" => Some(Token::Else),
1057        b"elseif" => Some(Token::ElseIf),
1058        b"end" => Some(Token::End),
1059        b"function" => Some(Token::Function),
1060        b"goto" => Some(Token::Goto),
1061        b"if" => Some(Token::If),
1062        b"in" => Some(Token::In),
1063        b"local" => Some(Token::Local),
1064        b"nil" => Some(Token::Nil),
1065        b"for" => Some(Token::For),
1066        b"while" => Some(Token::While),
1067        b"repeat" => Some(Token::Repeat),
1068        b"until" => Some(Token::Until),
1069        b"return" => Some(Token::Return),
1070        b"then" => Some(Token::Then),
1071        b"true" => Some(Token::True),
1072        b"false" => Some(Token::False),
1073        b"not" => Some(Token::Not),
1074        b"and" => Some(Token::And),
1075        b"or" => Some(Token::Or),
1076        _ => None,
1077    }
1078}
1079
/// Is this byte a line feed or a carriage return?
fn is_newline(c: u8) -> bool {
    matches!(c, b'\n' | b'\r')
}
1083
1084fn is_space(c: u8) -> bool {
1085    c == b' ' || c == b'\t' || c == VERTICAL_TAB || c == FORM_FEED || is_newline(c)
1086}
1087
// Is this byte a Lua alpha, i.e. one of A-Z, a-z, or the underscore?
fn is_alpha(c: u8) -> bool {
    c == b'_' || c.is_ascii_alphabetic()
}
1092
/// Converts an ASCII decimal digit to its numeric value, or returns `None` for any other byte.
fn from_digit(c: u8) -> Option<u8> {
    match c {
        b'0'..=b'9' => Some(c - b'0'),
        _ => None,
    }
}
1100
/// Is this byte an ASCII decimal digit (`0`-`9`)?
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
1104
/// Converts an ASCII hexadecimal digit (`0`-`9`, `a`-`f`, `A`-`F`) to its numeric value, or
/// returns `None` for any other byte.
fn from_hex_digit(c: u8) -> Option<u8> {
    match c {
        b'0'..=b'9' => Some(c - b'0'),
        b'a'..=b'f' => Some(c - b'a' + 10),
        b'A'..=b'F' => Some(c - b'A' + 10),
        _ => None,
    }
}
1116
/// Is this byte an ASCII hexadecimal digit (`0`-`9`, `a`-`f`, `A`-`F`)?
fn is_hex_digit(c: u8) -> bool {
    c.is_ascii_hexdigit()
}
1120
// Unit tests for the lexer: each test feeds a Lua source snippet through `Lexer` and compares
// the produced tokens (and, for some tests, the reported line numbers) with an expected list.
#[cfg(test)]
mod tests {
    use std::rc::Rc;

    use crate::compiler::interning::BasicInterner;

    use super::*;

    // Lexes `source` to the end and asserts that the tokens produced equal `tokens`, in order
    // and in number. Panics (via `unwrap`) if the lexer reports an error.
    fn test_tokens(source: &str, tokens: &[Token<Rc<[u8]>>]) {
        let mut lexer = Lexer::new(source.as_bytes(), BasicInterner::default());
        let mut i = 0;
        while let Some(token) = lexer.read_token().unwrap() {
            assert!(i < tokens.len(), "too many tokens");
            assert_eq!(token, tokens[i], "tokens not equal");
            i += 1;
        }
        assert!(i == tokens.len(), "not enough tokens");
    }

    // Like `test_tokens`, but each expected token is paired with the line number the lexer
    // must report once the whitespace before that token has been skipped. The expectations in
    // this module count lines from zero.
    fn test_tokens_lines(source: &str, tokens: &[(Token<Rc<[u8]>>, u64)]) {
        let mut lexer = Lexer::new(source.as_bytes(), BasicInterner::default());
        let mut i = 0;
        loop {
            // Skip to the start of the next token so the line number read below is the line
            // the token begins on.
            lexer.skip_whitespace().unwrap();
            let line_number = lexer.line_number().0;
            if let Some(token) = lexer.read_token().unwrap() {
                assert!(i < tokens.len(), "too many tokens");
                assert_eq!(token, tokens[i].0, "tokens not equal");
                assert_eq!(line_number, tokens[i].1, "line numbers do not match");
                i += 1;
            } else {
                break;
            }
        }
        assert!(i == tokens.len(), "not enough tokens");
    }

    // Builds the expected `Token::String` for the UTF-8 bytes of `s`.
    fn str_token(s: &str) -> Token<Rc<[u8]>> {
        Token::String(s.as_bytes().to_vec().into_boxed_slice().into())
    }

    // Builds the expected `Token::Name` for the UTF-8 bytes of `s`.
    fn name_token(s: &str) -> Token<Rc<[u8]>> {
        Token::Name(s.as_bytes().to_vec().into_boxed_slice().into())
    }

    // Short and long comments produce no tokens, long comments only end at a closing bracket
    // of the matching level, and line numbers keep advancing across multi-line comments.
    #[test]
    fn comments() {
        test_tokens_lines(
            r#"
                -- this is a comment
                -- this is also -- a comment
                --[[ long comment ]]
                --[==[ longer comment ]==]

                -- Real token
                -

                --[====[ longest comment
                    these shouldn't trigger the end of comments
                    ]=] ]==] ]===]
                ]====]

                -- Real token
                =
            "#,
            &[(Token::Minus, 7), (Token::Assign, 15)],
        );
    }

    // Long bracket strings keep their contents verbatim, including brackets of other levels.
    // The second case expects a newline directly after the opening bracket to be dropped and
    // `\r` / `\r\n` line endings to come out as `\n`.
    #[test]
    fn long_string() {
        test_tokens(
            r#"
                [====[ [==[ this is a [[]] long string ]== ]==] ]====]
                [[ [=] [==] another long string [==] [=] ]]
            "#,
            &[
                str_token(" [==[ this is a [[]] long string ]== ]==] "),
                str_token(" [=] [==] another long string [==] [=] "),
            ],
        );

        test_tokens(
            "[==[\nfoo\nbar\rbaz\r\nbaf\rquux]==]",
            &[str_token("foo\nbar\nbaz\nbaf\nquux")],
        );
    }

    // Quoted strings with escapes: backslash and quote escapes, `\n` / `\t`, `\z` (skips the
    // following whitespace, including the line break), `\u{...}`, `\x..` and decimal `\ddd`.
    #[test]
    fn short_string() {
        test_tokens_lines(
            r#"
                "\\ \" '"
                '\n \t "'
                "begin \z
                end"
                'state\u{2e}'
                "question\x3f"
                "exclaim\33"
            "#,
            &[
                (str_token("\\ \" '"), 1),
                (str_token("\n \t \""), 2),
                (str_token("begin end"), 3),
                (str_token("state."), 5),
                (str_token("question?"), 6),
                (str_token("exclaim!"), 7),
            ],
        );
    }

    // Integer and float numerals in decimal and hexadecimal notation. A numeral with a radix
    // point or exponent is a float, and a decimal integer literal that does not fit in `i64`
    // is lexed as a float.
    #[test]
    fn numerals() {
        test_tokens(
            r#"
                0xdeadbeef
                12345
                12345.
                3.1415e-2
                0x22.4p+1
                0Xaa.8P-2
                0x8.4P0
                .123E-10
                0x99999999999999999999999999999999p999999999999999999999999999999
                9223372036854775807
                9223372036854775808
            "#,
            &[
                Token::Integer(0xdeadbeef),
                Token::Integer(12345),
                Token::Float(12345.0),
                Token::Float(3.1415e-2),
                Token::Float(68.5),
                Token::Float(42.625),
                Token::Float(8.25),
                Token::Float(0.123e-10),
                Token::Float(f64::INFINITY),
                Token::Integer(9223372036854775807),
                Token::Float(9223372036854775808.0),
            ],
        );
    }

    // Every reserved word lexes to its keyword token; other words lex as names.
    #[test]
    fn words() {
        test_tokens(
            r#"
                break do else elseif end function goto if in local nil for while repeat until return
                then true false not and or
                custom names
            "#,
            &[
                Token::Break,
                Token::Do,
                Token::Else,
                Token::ElseIf,
                Token::End,
                Token::Function,
                Token::Goto,
                Token::If,
                Token::In,
                Token::Local,
                Token::Nil,
                Token::For,
                Token::While,
                Token::Repeat,
                Token::Until,
                Token::Return,
                Token::Then,
                Token::True,
                Token::False,
                Token::Not,
                Token::And,
                Token::Or,
                name_token("custom"),
                name_token("names"),
            ],
        );
    }

    // Every operator and punctuation token, including the multi-character ones.
    #[test]
    fn ops() {
        test_tokens(
            r#"- + * / // ^ % & ~ | , ; >> << . .. ... = < <= > >= == ~= : :: # ( ) [ ] { }"#,
            &[
                Token::Minus,
                Token::Add,
                Token::Mul,
                Token::Div,
                Token::IDiv,
                Token::Pow,
                Token::Mod,
                Token::BitAnd,
                Token::BitNotXor,
                Token::BitOr,
                Token::Comma,
                Token::SemiColon,
                Token::ShiftRight,
                Token::ShiftLeft,
                Token::Dot,
                Token::Concat,
                Token::Dots,
                Token::Assign,
                Token::LessThan,
                Token::LessEqual,
                Token::GreaterThan,
                Token::GreaterEqual,
                Token::Equal,
                Token::NotEqual,
                Token::Colon,
                Token::DoubleColon,
                Token::Len,
                Token::LeftParen,
                Token::RightParen,
                Token::LeftBracket,
                Token::RightBracket,
                Token::LeftBrace,
                Token::RightBrace,
            ],
        );
    }
}