1use std::{
4    fmt,
5    num::{ParseFloatError, ParseIntError},
6    str::Chars,
7};
8
9use thiserror::Error;
10use unicode_xid;
11
12use crate::utils::Location;
13
14use super::token::{
15    LiteralKind::*,
16    Token,
17    TokenKind::{self, *},
18};
19
/// Character-level reader over the source text that also tracks the current
/// line, column and byte offset so tokens can be given locations.
struct Cursor<'a> {
    /// Byte length of the whole input; `location` subtracts the length of the
    /// unread remainder from it to obtain the current byte offset.
    initial_len: usize,
    /// Iterator over the characters that have not been consumed yet.
    chars: Chars<'a>,
    /// Current line number; starts at 1 and is incremented by `bump` on `'\n'`.
    lineno: u32,
    /// Current column; starts at 1, is advanced by `bump`, and is back at 1
    /// right after a `'\n'` has been consumed.
    column: u32,
    /// Most recently consumed character. Only stored in debug builds, where it
    /// backs the `debug_assert!` checks in the scanning methods.
    #[cfg(debug_assertions)]
    prev: char,
}
33
/// Sentinel that `Cursor::first` (and callers of `bump`) substitute for a real
/// character once the input is exhausted. A literal `'\0'` may also occur in
/// the input, so `Cursor::is_eof` is the reliable end-of-input test.
const EOF_CHAR: char = '\0';
35
36impl<'a> Cursor<'a> {
37    fn new(input: &'a str) -> Cursor<'a> {
38        Cursor {
39            initial_len: input.len(),
40            chars: input.chars(),
41            lineno: 1,
42            column: 1,
43            #[cfg(debug_assertions)]
44            prev: EOF_CHAR,
45        }
46    }
47
48    fn prev(&self) -> char {
51        #[cfg(debug_assertions)]
52        {
53            self.prev
54        }
55
56        #[cfg(not(debug_assertions))]
57        {
58            EOF_CHAR
59        }
60    }
61
62    fn first(&self) -> char {
67        self.chars.clone().next().unwrap_or(EOF_CHAR)
69    }
70
71    fn is_eof(&self) -> bool {
73        self.chars.as_str().is_empty()
74    }
75
76    fn bump(&mut self) -> Option<char> {
78        let c = self.chars.next()?;
79
80        if c == '\n' {
81            self.lineno += 1;
82            self.column = 0;
83        }
84        self.column += 1;
85
86        #[cfg(debug_assertions)]
87        {
88            self.prev = c;
89        }
90
91        Some(c)
92    }
93
94    fn location(&self) -> Location {
96        Location {
97            lineno: self.lineno,
98            column: self.column,
99            offset: (self.initial_len - self.chars.as_str().len()) as u32,
100        }
101    }
102
103    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
105        while predicate(self.first()) && !self.is_eof() {
106            self.bump();
107        }
108    }
109}
110
/// Radix of a numeric literal, selected by `Cursor::number` from the
/// literal's prefix.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum Base {
    /// Literal starts with `0b`; parsed with radix 2.
    Binary,
    /// Literal starts with `0o`; parsed with radix 8.
    Octal,
    /// Literal starts with `0x`; parsed with radix 16.
    Hexadecimal,
    /// Literal without a radix prefix; parsed with radix 10 (or as a float).
    Decimal,
}
123
124pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
126    let mut cursor = Cursor::new(input);
127    std::iter::from_fn(move || loop {
128        if cursor.is_eof() {
129            break None;
130        } else {
131            let t = cursor.advance_token();
132            match t.kind {
133                LineComment | BlockComment | Whitespace => (),
134                _ => break Some(t),
135            }
136        }
137    })
138}
139
/// Returns `true` if `c` is one of the characters the lexer skips as
/// whitespace.
///
/// Line feed (`'\n'`) is not in the set; `advance_token` turns it into an
/// end-of-line token instead.
pub fn is_whitespace(c: char) -> bool {
    const WHITESPACE: [char; 10] = [
        '\u{0009}', // horizontal tab
        '\u{000B}', // vertical tab
        '\u{000C}', // form feed
        '\u{000D}', // carriage return
        '\u{0020}', // space
        '\u{0085}', // next line
        '\u{200E}', // left-to-right mark
        '\u{200F}', // right-to-left mark
        '\u{2028}', // line separator
        '\u{2029}', // paragraph separator
    ];
    WHITESPACE.contains(&c)
}
168
169pub fn is_id_start(c: char) -> bool {
171    c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
173}
174
175pub fn is_id_continue(c: char) -> bool {
177    unicode_xid::UnicodeXID::is_xid_continue(c)
178}
179
180pub fn is_ident(string: &str) -> bool {
182    let mut chars = string.chars();
183    if let Some(start) = chars.next() {
184        is_id_start(start) && chars.all(is_id_continue)
185    } else {
186        false
187    }
188}
189
190impl Cursor<'_> {
191    pub fn advance_token(&mut self) -> Token {
192        let start = self.location();
193        let first_char = self.bump().unwrap_or(EOF_CHAR);
194        let token_kind = match first_char {
195            '/' => match self.first() {
197                '/' => self.line_comment(),
198                '*' => self.block_comment(),
199                '=' => {
200                    self.bump();
201                    DivAssign
202                }
203                _ => Div,
204            },
205
206            c if is_whitespace(c) => {
208                self.eat_while(is_whitespace);
209                Whitespace
210            }
211
212            'r' => match self.first() {
214                c @ ('"' | '\'') => self.string(c, true),
215                _ => self.ident_or_reserved_word('r'),
216            },
217
218            c if is_id_start(c) => self.ident_or_reserved_word(c),
221
222            c @ '0'..='9' => self.number(c),
224
225            c @ ('"' | '\'') => self.string(c, false),
227
228            ':' if self.first() == ':' => {
230                self.bump();
231                DoubleColon
232            }
233            '=' if self.first() == '=' => {
234                self.bump();
235                Eq
236            }
237            '!' if self.first() == '=' => {
238                self.bump();
239                NotEq
240            }
241            '<' if self.first() == '=' => {
242                self.bump();
243                LtEq
244            }
245            '>' if self.first() == '=' => {
246                self.bump();
247                GtEq
248            }
249            '+' if self.first() == '=' => {
250                self.bump();
251                AddAssign
252            }
253            '-' if self.first() == '=' => {
254                self.bump();
255                SubAssign
256            }
257            '*' if self.first() == '=' => {
258                self.bump();
259                MulAssign
260            }
261            '%' if self.first() == '=' => {
262                self.bump();
263                ModAssign
264            }
265
266            '\n' => self.eol(),
268            '\\' if self.first() == '\n' => {
269                self.bump();
270                Whitespace
271            }
272            ',' => Comma,
273            '.' => Dot,
274            '(' => OpenParen,
275            ')' => CloseParen,
276            '{' => OpenBrace,
277            '}' => CloseBrace,
278            '[' => OpenBracket,
279            ']' => CloseBracket,
280            '#' => Pound,
281            '?' => Question,
282            ':' => Colon,
283            '=' => Assign,
284            '<' => Lt,
285            '>' => Gt,
286            '|' => VBar,
287            '+' => Add,
288            '-' => Sub,
289            '*' => Mul,
290            '%' => Mod,
291            c => Unknown(c),
292        };
293        Token::new(token_kind, start, self.location())
294    }
295
296    fn eol(&mut self) -> TokenKind {
297        debug_assert!(self.prev() == '\n');
298        self.eat_while(|c| c == '\n');
299        EOL
300    }
301
302    fn line_comment(&mut self) -> TokenKind {
303        debug_assert!(self.prev() == '/' && self.first() == '/');
304        self.bump();
305        self.eat_while(|c| c != '\n');
306        LineComment
307    }
308
309    fn block_comment(&mut self) -> TokenKind {
310        debug_assert!(self.prev() == '/' && self.first() == '*');
311        self.bump();
312        let mut depth = 1usize;
313        while let Some(c) = self.bump() {
314            match c {
315                '/' if self.first() == '*' => {
316                    self.bump();
317                    depth += 1;
318                }
319                '*' if self.first() == '/' => {
320                    self.bump();
321                    depth -= 1;
322                    if depth == 0 {
323                        break;
327                    }
328                }
329                _ => (),
330            }
331        }
332        BlockComment
333    }
334
335    fn ident_or_reserved_word(&mut self, first_char: char) -> TokenKind {
336        debug_assert!(is_id_start(self.prev()));
337        let mut value = String::from(first_char);
338        loop {
339            let c = self.first();
340            if is_id_continue(c) {
341                value.push(c);
342            } else {
343                break;
344            }
345            self.bump();
346        }
347
348        match value.as_str() {
349            "if" => If,
350            "else" => Else,
351            "loop" => Loop,
352            "while" => While,
353            "for" => For,
354            "in" => In,
355            "break" => Break,
356            "continue" => Continue,
357            "throw" => Throw,
358            "return" => Return,
359            "global" => Global,
360            "import" => Import,
361            "as" => As,
362            "is" => Is,
363            "not" => Not,
364            "and" => And,
365            "or" => Or,
366            "try" => Try,
367            "fn" => Fn,
368            "do" => Do,
369            "null" => Null,
370            "true" => True,
371            "false" => False,
372            _ => Ident(value),
373        }
374    }
375
376    fn number(&mut self, first_digit: char) -> TokenKind {
377        debug_assert!('0' <= self.prev() && self.prev() <= '9');
378        let mut base = Base::Decimal;
379        let mut value = String::new();
380        let mut has_point = false;
381        let mut has_exponent = false;
382        if first_digit == '0' {
383            match self.first() {
385                'b' => {
386                    base = Base::Binary;
387                    self.bump();
388                }
389                'o' => {
390                    base = Base::Octal;
391                    self.bump();
392                }
393                'x' => {
394                    base = Base::Hexadecimal;
395                    self.bump();
396                }
397                '0'..='9' | '_' | '.' | 'e' | 'E' => {
399                    base = Base::Decimal;
400                    value.push('0');
401                }
402                _ => return Literal(Int(Ok(0))),
404            };
405        } else {
406            value.push(first_digit);
407        }
408        loop {
409            let t = self.first();
410            match t {
411                '_' => {
412                    self.bump();
413                    continue;
414                }
415                '.' if base == Base::Decimal => {
416                    if has_point {
417                        return Literal(Float(Err(LexerError::NumberFormatError)));
418                    }
419                    has_point = true;
420                }
421                'e' | 'E' if base == Base::Decimal => {
422                    if has_exponent {
423                        return Literal(Float(Err(LexerError::NumberFormatError)));
424                    }
425                    has_exponent = true;
426                }
427                '0'..='1' if base == Base::Binary => {}
428                '0'..='7' if base == Base::Octal => {}
429                '0'..='9' if base == Base::Decimal => {}
430                '0'..='9' | 'a'..='f' | 'A'..='F' if base == Base::Hexadecimal => {}
431                _ => break,
432            }
433            value.push(t);
434            self.bump();
435        }
436
437        if has_point || has_exponent {
438            if base != Base::Decimal {
440                Literal(Float(Err(LexerError::NumberFormatError)))
441            } else {
442                match value.parse::<f64>() {
443                    Ok(v) => Literal(Float(Ok(v))),
444                    Err(e) => Literal(Float(Err(LexerError::ParseFloatError(e)))),
445                }
446            }
447        } else {
448            match i64::from_str_radix(
449                &value,
450                match base {
451                    Base::Binary => 2,
452                    Base::Octal => 8,
453                    Base::Hexadecimal => 16,
454                    Base::Decimal => 10,
455                },
456            ) {
457                Ok(v) => Literal(Int(Ok(v))),
458                Err(e) => Literal(Int(Err(LexerError::ParseIntError(e)))),
459            }
460        }
461    }
462
463    fn string(&mut self, quoted: char, is_raw: bool) -> TokenKind {
464        if is_raw {
465            debug_assert!(self.prev() == 'r');
466            self.bump();
467        }
468        debug_assert!(self.prev() == '"' || self.prev() == '\'');
469        let mut value = String::new();
470        loop {
471            if let Some(c) = self.bump() {
472                let t = match c {
473                    _ if c == quoted => break,
474                    '\\' if !is_raw => match self.first() {
475                        '\n' => {
476                            self.bump();
477                            continue;
478                        }
479                        _ => self.scan_escape(),
480                    },
481                    '\r' => Err(EscapeError::BareCarriageReturn),
482                    _ => Ok(c),
483                };
484                match t {
485                    Ok(c) => value.push(c),
486                    Err(e) => return Literal(Str(Err(LexerError::EscapeError(e)))),
487                }
488            } else {
489                return Literal(Str(Err(LexerError::UnterminatedStringError)));
490            }
491        }
492        Literal(Str(Ok(value)))
493    }
494
495    fn scan_escape(&mut self) -> std::result::Result<char, EscapeError> {
496        debug_assert!(self.prev() == '\\');
497        let res = match self.bump().unwrap_or(EOF_CHAR) {
499            '"' => '"',
500            'n' => '\n',
501            'r' => '\r',
502            't' => '\t',
503            '\\' => '\\',
504            '\'' => '\'',
505            '0' => '\0',
506
507            'x' => {
508                let hi = self.bump().ok_or(EscapeError::TooShortHexEscape)?;
511                let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
512
513                let lo = self.bump().ok_or(EscapeError::TooShortHexEscape)?;
514                let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
515
516                let value = hi * 16 + lo;
517
518                if value > 0x7F {
520                    return Err(EscapeError::OutOfRangeHexEscape);
521                }
522                let value = value as u8;
523
524                value as char
525            }
526
527            'u' => {
528                if self.bump() != Some('{') {
531                    return Err(EscapeError::NoBraceInUnicodeEscape);
532                }
533
534                let mut n_digits = 1;
536                let mut value: u32 = match self.bump().ok_or(EscapeError::UnclosedUnicodeEscape)? {
537                    '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
538                    '}' => return Err(EscapeError::EmptyUnicodeEscape),
539                    c => c
540                        .to_digit(16)
541                        .ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
542                };
543
544                loop {
547                    match self.bump() {
548                        None => return Err(EscapeError::UnclosedUnicodeEscape),
549                        Some('_') => continue,
550                        Some('}') => {
551                            if n_digits > 6 {
552                                return Err(EscapeError::OverlongUnicodeEscape);
553                            }
554
555                            break std::char::from_u32(value).ok_or({
556                                if value > 0x10FFFF {
557                                    EscapeError::OutOfRangeUnicodeEscape
558                                } else {
559                                    EscapeError::LoneSurrogateUnicodeEscape
560                                }
561                            })?;
562                        }
563                        Some(c) => {
564                            let digit: u32 = c
565                                .to_digit(16)
566                                .ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
567                            n_digits += 1;
568                            if n_digits > 6 {
569                                continue;
571                            }
572                            value = value * 16 + digit;
573                        }
574                    };
575                }
576            }
577            _ => return Err(EscapeError::InvalidEscape),
578        };
579        Ok(res)
580    }
581}
582
/// Errors that can be attached to a literal token while lexing.
#[derive(Error, Debug, Clone, PartialEq)]
pub enum LexerError {
    /// An integer literal could not be parsed as an `i64`.
    #[error("parse int error ({0})")]
    ParseIntError(#[from] ParseIntError),
    /// A floating-point literal could not be parsed as an `f64`.
    #[error("parse float error ({0})")]
    ParseFloatError(#[from] ParseFloatError),
    /// A numeric literal is malformed, e.g. it contains a second decimal
    /// point or a second exponent marker.
    #[error("number format error")]
    NumberFormatError,
    /// The input ended before a string literal's closing quote.
    #[error("unterminated string error")]
    UnterminatedStringError,
    /// A string literal contains an invalid escape sequence or character.
    #[error("escape error ({0})")]
    EscapeError(#[from] EscapeError),
}
597
/// Reasons a character or escape sequence inside a string literal is rejected.
#[derive(Error, Debug, Clone, PartialEq, Eq)]
pub enum EscapeError {
    /// The character after the backslash does not start a known escape.
    InvalidEscape,
    /// An unescaped carriage return (`'\r'`) appears inside the string.
    BareCarriageReturn,

    /// The input ended before both hex digits of a `\x` escape were read.
    TooShortHexEscape,
    /// A `\x` escape contains a character that is not a hex digit.
    InvalidCharInHexEscape,
    /// A `\x` escape encodes a value greater than `0x7F`.
    OutOfRangeHexEscape,

    /// `\u` is not immediately followed by `{`.
    NoBraceInUnicodeEscape,
    /// A `\u{...}` escape contains a character that is not a hex digit.
    InvalidCharInUnicodeEscape,
    /// A `\u{}` escape has nothing between the braces.
    EmptyUnicodeEscape,
    /// The input ended before the closing `}` of a `\u{...}` escape.
    UnclosedUnicodeEscape,
    /// A `\u{...}` escape starts with `_` right after the opening brace.
    LeadingUnderscoreUnicodeEscape,
    /// A `\u{...}` escape has more than six hex digits.
    OverlongUnicodeEscape,
    /// A `\u{...}` escape encodes a surrogate code point, which is not a
    /// valid `char`.
    LoneSurrogateUnicodeEscape,
    /// A `\u{...}` escape encodes a value greater than `0x10FFFF`.
    OutOfRangeUnicodeEscape,
}
630
631impl fmt::Display for EscapeError {
632    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
633        fmt::Debug::fmt(self, f)
634    }
635}