Skip to main content

lex_just_parse/
lexer.rs

//! Provides lexical analysis features.
//!
//! This module contains the `Lexer` and the tokens it generates.
4
5use std::fmt;
6
7/// A stream-based lexical analyzer capable of interpreting string sources.
8///
9/// `Lexer` sequentially reads the underlying string slice and produces
10/// tokens on demand via the [`next()`](Self::next) and [`peek()`](Self::peek) methods.
11pub struct Lexer<'src> {
12    source: &'src str,
13    data: Vec<char>,
14    pos: usize,
15    byte_pos: usize,
16    loc: Loc,
17    peeked: Option<Token>,
18    keywords: Vec<&'src str>,
19}
20
21impl<'src> Lexer<'src> {
22    /// Creates a new `Lexer` given a source string slice.
23    pub fn new(source: &'src str) -> Self {
24        Self {
25            source,
26            data: source.chars().collect(),
27            loc: Loc::new(1, 1),
28            pos: 0,
29            byte_pos: 0,
30            peeked: None,
31            keywords: Vec::new(),
32        }
33    }
34
35    /// Configures the lexer with a set of predefined keywords to recognize.
36    pub fn with_keywords(mut self, keywords: &[&'src str]) -> Self {
37        self.keywords = keywords.to_vec();
38        self
39    }
40
41    /// Returns the next token in the stream, consuming it in the process.
42    /// If an EOF is reached, it will continue to return `EOF` tokens.
43    pub fn next(&mut self) -> Token {
44        if let Some(peek) = self.peeked.take() {
45            peek
46        } else {
47            self.next_token()
48        }
49    }
50
51    /// Returns a reference to the next token without consuming it.
52    /// Subsequent calls to `peek()` or `next()` will return this same token.
53    pub fn peek(&mut self) -> &Token {
54        if self.peeked.is_none() {
55            self.peeked = Some(self.next_token());
56        }
57        self.peeked.as_ref().unwrap()
58    }
59
60    fn advance(&mut self) -> char {
61        let ch = self.read_char();
62        self.byte_pos += ch.len_utf8();
63        self.pos += 1;
64        self.loc.next(ch);
65        ch
66    }
67
68    fn read_char(&mut self) -> char {
69        let pos = self.pos;
70        if pos >= self.data.len() {
71            '\0'
72        } else {
73            self.data[pos]
74        }
75    }
76
77    fn next_token(&mut self) -> Token {
78        while self.pos <= self.data.len() {
79            let begin_byte = self.byte_pos;
80            let ch = self.advance();
81            let loc = self.loc;
82
83            let tok = match ch {
84                '/' if self.read_char() == '/' => {
85                    while self.advance() != '\n' {}
86                    continue;
87                }
88                '#' => {
89                    let ch = self.read_char();
90                    if self.byte_pos == 1 && ch == '!' {
91                        while self.advance() != '\n' {}
92                        continue;
93                    }
94                    loop {
95                        let ch = self.read_char();
96                        if ch.is_alphanumeric() || ch == '_' {
97                            self.advance();
98                        } else {
99                            break;
100                        }
101                    }
102                    Token::new(
103                        TokenKind::Directive,
104                        loc,
105                        self.source[begin_byte..self.byte_pos].into(),
106                    )
107                }
108                '-' if self.read_char() == '>' => {
109                    self.advance();
110                    Token::new(
111                        TokenKind::Arrow,
112                        loc,
113                        self.source[begin_byte..self.byte_pos].into(),
114                    )
115                }
116                '=' if self.read_char() == '=' => {
117                    self.advance();
118                    Token::new(
119                        TokenKind::EqEq,
120                        loc,
121                        self.source[begin_byte..self.byte_pos].into(),
122                    )
123                }
124                ':' if self.read_char() == '=' => {
125                    self.advance();
126                    Token::new(
127                        TokenKind::Assign,
128                        loc,
129                        self.source[begin_byte..self.byte_pos].into(),
130                    )
131                }
132                '<' if self.read_char() == '=' => {
133                    self.advance();
134                    Token::new(
135                        TokenKind::LtEq,
136                        loc,
137                        self.source[begin_byte..self.byte_pos].into(),
138                    )
139                }
140                '>' if self.read_char() == '=' => {
141                    self.advance();
142                    Token::new(
143                        TokenKind::GtEq,
144                        loc,
145                        self.source[begin_byte..self.byte_pos].into(),
146                    )
147                }
148                '!' if self.read_char() == '=' => {
149                    self.advance();
150                    Token::new(
151                        TokenKind::NotEq,
152                        loc,
153                        self.source[begin_byte..self.byte_pos].into(),
154                    )
155                }
156                '&' if self.read_char() == '&' => {
157                    self.advance();
158                    Token::new(
159                        TokenKind::DoubleAmpersand,
160                        loc,
161                        self.source[begin_byte..self.byte_pos].into(),
162                    )
163                }
164                '|' if self.read_char() == '|' => {
165                    self.advance();
166                    Token::new(
167                        TokenKind::DoublePipe,
168                        loc,
169                        self.source[begin_byte..self.byte_pos].into(),
170                    )
171                }
172                ':' if self.read_char() == ':' => {
173                    self.advance();
174                    Token::new(
175                        TokenKind::DoubleColon,
176                        loc,
177                        self.source[begin_byte..self.byte_pos].into(),
178                    )
179                }
180                '.' if self.read_char() == '.' && self.read_char() == '.' => {
181                    self.advance();
182                    self.advance();
183                    Token::new(
184                        TokenKind::Ellipsis,
185                        loc,
186                        self.source[begin_byte..self.byte_pos].into(),
187                    )
188                }
189                ch if ch.is_alphabetic() || ch == '_' => return self.lex_identifier(begin_byte),
190                '0'..='9' => return self.lex_number(begin_byte),
191                '"' => return self.lex_string(begin_byte),
192
193                ',' => Token::new(
194                    TokenKind::Comma,
195                    loc,
196                    self.source[begin_byte..self.byte_pos].into(),
197                ),
198                ';' => Token::new(
199                    TokenKind::SemiColon,
200                    loc,
201                    self.source[begin_byte..self.byte_pos].into(),
202                ),
203                ':' => Token::new(
204                    TokenKind::Colon,
205                    loc,
206                    self.source[begin_byte..self.byte_pos].into(),
207                ),
208                '\\' => Token::new(
209                    TokenKind::BackSlash,
210                    loc,
211                    self.source[begin_byte..self.byte_pos].into(),
212                ),
213                '=' => Token::new(
214                    TokenKind::Eq,
215                    loc,
216                    self.source[begin_byte..self.byte_pos].into(),
217                ),
218                '<' => Token::new(
219                    TokenKind::Lt,
220                    loc,
221                    self.source[begin_byte..self.byte_pos].into(),
222                ),
223                '>' => Token::new(
224                    TokenKind::Gt,
225                    loc,
226                    self.source[begin_byte..self.byte_pos].into(),
227                ),
228                '!' => Token::new(
229                    TokenKind::Bang,
230                    loc,
231                    self.source[begin_byte..self.byte_pos].into(),
232                ),
233                '+' => {
234                    let next = self.read_char();
235                    if next == '+' {
236                        self.advance();
237                        Token::new(
238                            TokenKind::Concat,
239                            loc,
240                            self.source[begin_byte..self.byte_pos].into(),
241                        )
242                    } else if next == '=' {
243                        self.advance();
244                        Token::new(
245                            TokenKind::PlusEq,
246                            loc,
247                            self.source[begin_byte..self.byte_pos].into(),
248                        )
249                    } else {
250                        Token::new(
251                            TokenKind::Plus,
252                            loc,
253                            self.source[begin_byte..self.byte_pos].into(),
254                        )
255                    }
256                }
257                '-' => {
258                    let next = self.read_char();
259                    if next == '>' {
260                        self.advance();
261                        Token::new(
262                            TokenKind::Arrow,
263                            loc,
264                            self.source[begin_byte..self.byte_pos].into(),
265                        )
266                    } else if next == '=' {
267                        self.advance();
268                        Token::new(
269                            TokenKind::MinusEq,
270                            loc,
271                            self.source[begin_byte..self.byte_pos].into(),
272                        )
273                    } else {
274                        Token::new(
275                            TokenKind::Minus,
276                            loc,
277                            self.source[begin_byte..self.byte_pos].into(),
278                        )
279                    }
280                }
281                '.' => Token::new(
282                    TokenKind::Dot,
283                    loc,
284                    self.source[begin_byte..self.byte_pos].into(),
285                ),
286                '*' => {
287                    let next = self.read_char();
288                    if next == '=' {
289                        self.advance();
290                        Token::new(
291                            TokenKind::AsteriskEq,
292                            loc,
293                            self.source[begin_byte..self.byte_pos].into(),
294                        )
295                    } else {
296                        Token::new(
297                            TokenKind::Asterisk,
298                            loc,
299                            self.source[begin_byte..self.byte_pos].into(),
300                        )
301                    }
302                }
303                '/' => {
304                    let next = self.read_char();
305                    if next == '=' {
306                        self.advance();
307                        Token::new(
308                            TokenKind::SlashEq,
309                            loc,
310                            self.source[begin_byte..self.byte_pos].into(),
311                        )
312                    } else {
313                        Token::new(
314                            TokenKind::Slash,
315                            loc,
316                            self.source[begin_byte..self.byte_pos].into(),
317                        )
318                    }
319                }
320                '%' => {
321                    let next = self.read_char();
322                    if next == '=' {
323                        self.advance();
324                        Token::new(
325                            TokenKind::ModEq,
326                            loc,
327                            self.source[begin_byte..self.byte_pos].into(),
328                        )
329                    } else {
330                        Token::new(
331                            TokenKind::Mod,
332                            loc,
333                            self.source[begin_byte..self.byte_pos].into(),
334                        )
335                    }
336                }
337                '$' => Token::new(
338                    TokenKind::Dollar,
339                    loc,
340                    self.source[begin_byte..self.byte_pos].into(),
341                ),
342                '&' => Token::new(
343                    TokenKind::Ampersand,
344                    loc,
345                    self.source[begin_byte..self.byte_pos].into(),
346                ),
347                '^' => Token::new(
348                    TokenKind::Caret,
349                    loc,
350                    self.source[begin_byte..self.byte_pos].into(),
351                ),
352                '|' => Token::new(
353                    TokenKind::Pipe,
354                    loc,
355                    self.source[begin_byte..self.byte_pos].into(),
356                ),
357                '(' => Token::new(
358                    TokenKind::OpenParen,
359                    loc,
360                    self.source[begin_byte..self.byte_pos].into(),
361                ),
362                ')' => Token::new(
363                    TokenKind::CloseParen,
364                    loc,
365                    self.source[begin_byte..self.byte_pos].into(),
366                ),
367                '[' => Token::new(
368                    TokenKind::OpenBracket,
369                    loc,
370                    self.source[begin_byte..self.byte_pos].into(),
371                ),
372                ']' => Token::new(
373                    TokenKind::CloseBracket,
374                    loc,
375                    self.source[begin_byte..self.byte_pos].into(),
376                ),
377                '{' => Token::new(
378                    TokenKind::OpenCurly,
379                    loc,
380                    self.source[begin_byte..self.byte_pos].into(),
381                ),
382                '}' => Token::new(
383                    TokenKind::CloseCurly,
384                    loc,
385                    self.source[begin_byte..self.byte_pos].into(),
386                ),
387
388                ch if ch.is_whitespace() => continue,
389                '\0' => return Token::new(TokenKind::EOF, self.loc, "\0".into()),
390                _ => {
391                    return Token::new(
392                        TokenKind::UnexpectedCharacter,
393                        self.loc,
394                        self.source[begin_byte..self.byte_pos].into(),
395                    );
396                }
397            };
398            return tok;
399        }
400
401        Token::new(TokenKind::EOF, self.loc, "".into())
402    }
403
404    fn lex_identifier(&mut self, begin_byte: usize) -> Token {
405        let loc = self.loc;
406        #[allow(unused_mut)]
407        let mut kind = TokenKind::Identifier;
408        loop {
409            let ch = self.read_char();
410            if ch.is_alphanumeric() || ch == '_' {
411                self.advance();
412            } else {
413                break;
414            }
415        }
416        let ident = &self.source[begin_byte..self.byte_pos];
417
418        if self.keywords.contains(&ident) {
419            kind = TokenKind::Keyword;
420        }
421
422        Token::new(kind, loc, ident.into())
423    }
424
425    fn lex_number(&mut self, begin_byte: usize) -> Token {
426        let loc = self.loc;
427        let end;
428        let mut base = 10;
429
430        // Check for base prefix (0x, 0b, 0o)
431        let next = self.read_char();
432        match next {
433            'x' | 'X' => {
434                base = 16;
435                self.advance(); // 0
436                self.advance(); // x
437            }
438            'b' | 'B' => {
439                base = 2;
440                self.advance(); // 0
441                self.advance(); // b
442            }
443            'o' | 'O' => {
444                base = 8;
445                self.advance(); // 0
446                self.advance(); // o
447            }
448            _ => {}
449        }
450
451        // Read digits according to base
452        loop {
453            let c = self.read_char();
454            let valid = match base {
455                2 => matches!(c, '0' | '1'),
456                8 => matches!(c, '0'..='7'),
457                10 if c == '.' => {
458                    self.advance();
459                    loop {
460                        let c = self.read_char();
461                        if !c.is_ascii_digit() {
462                            break;
463                        }
464                        self.advance();
465                    }
466                    end = self.byte_pos;
467                    let num_str = &self.source[begin_byte..end];
468                    return Token::new(TokenKind::RealNumber, loc, (*num_str).into());
469                }
470                10 => c.is_ascii_digit(),
471                16 => c.is_ascii_hexdigit(),
472                _ => false,
473            };
474            if !valid {
475                break;
476            }
477            self.advance();
478        }
479
480        end = self.byte_pos;
481
482        let num_str = &self.source[begin_byte..end]
483            .trim_start_matches("0x")
484            .trim_start_matches("0X")
485            .trim_start_matches("0b")
486            .trim_start_matches("0B")
487            .trim_start_matches("0o")
488            .trim_start_matches("0O");
489        let kind = TokenKind::Number(NumberBase::from(base));
490
491        Token::new(kind, loc, (*num_str).into())
492    }
493
494    fn lex_string(&mut self, begin_byte: usize) -> Token {
495        // let mut buffer = String::new();
496        let loc = self.loc;
497        loop {
498            let ch = self.read_char();
499            match ch {
500                '"' => {
501                    self.advance();
502                    break;
503                }
504                '\0' => {
505                    return Token::new(
506                        TokenKind::UnterminatedStringLiteral,
507                        loc,
508                        self.source[begin_byte..self.byte_pos].into(),
509                    );
510                }
511                '\\' => {
512                    self.advance();
513                    let esc = self.read_char();
514                    match esc {
515                        'r' => {}  // buffer.push('\r'),
516                        'n' => {}  // buffer.push('\n'),
517                        '"' => {}  // buffer.push('"'),
518                        '\'' => {} // buffer.push('\''),
519                        '\\' => {} // buffer.push('\\'),
520                        '0' => {}  // buffer.push('\0'),
521                        _ => {
522                            return Token::new(
523                                TokenKind::InvalidEscapeSequence,
524                                loc,
525                                self.source[begin_byte..self.byte_pos].into(),
526                            );
527                        }
528                    }
529                }
530                _ => {} // buffer.push(ch as char),
531            }
532            self.advance();
533        }
534
535        Token::new(
536            TokenKind::StringLiteral,
537            loc,
538            self.source[begin_byte..self.byte_pos].into(),
539        )
540    }
541}
542
/// The text of a token.
///
/// Holds an owned `String` by default, or a leaked, deduplicated `&'static str`
/// when the `interning` feature is enabled. Dereferences to `str` either way.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenSource(
    #[cfg(feature = "interning")] pub &'static str,
    #[cfg(not(feature = "interning"))] pub String,
);

impl std::ops::Deref for TokenSource {
    type Target = str;

    /// Borrows the token text as a string slice.
    #[inline]
    fn deref(&self) -> &str {
        #[cfg(feature = "interning")]
        let text: &str = self.0;
        #[cfg(not(feature = "interning"))]
        let text: &str = self.0.as_str();
        text
    }
}

impl fmt::Display for TokenSource {
    /// Formats the text exactly like the underlying `str` (padding flags included).
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text: &str = self;
        fmt::Display::fmt(text, f)
    }
}

impl From<&str> for TokenSource {
    /// Copies (or interns) the given slice.
    #[inline]
    fn from(s: &str) -> Self {
        #[cfg(feature = "interning")]
        let inner = intern(s);
        #[cfg(not(feature = "interning"))]
        let inner = s.to_owned();
        Self(inner)
    }
}

impl From<String> for TokenSource {
    /// Takes ownership of the string, or interns its contents.
    #[inline]
    fn from(s: String) -> Self {
        #[cfg(feature = "interning")]
        let inner = intern(s.as_str());
        #[cfg(not(feature = "interning"))]
        let inner = s;
        Self(inner)
    }
}

impl From<&String> for TokenSource {
    /// Clones (or interns) the referenced string.
    #[inline]
    fn from(s: &String) -> Self {
        #[cfg(feature = "interning")]
        let inner = intern(s.as_str());
        #[cfg(not(feature = "interning"))]
        let inner = String::clone(s);
        Self(inner)
    }
}

/// Process-wide pool of interned strings, created on first use.
#[cfg(feature = "interning")]
static INTERNER: std::sync::OnceLock<std::sync::Mutex<std::collections::HashSet<&'static str>>> =
    std::sync::OnceLock::new();

/// Returns the pooled `&'static str` equal to `s`, leaking a new copy on first sight.
///
/// Panics if the pool's mutex is poisoned.
#[cfg(feature = "interning")]
fn intern(s: &str) -> &'static str {
    let mut pool = INTERNER
        .get_or_init(|| std::sync::Mutex::new(std::collections::HashSet::new()))
        .lock()
        .unwrap();

    if let Some(existing) = pool.get(s).copied() {
        return existing;
    }

    let leaked: &'static str = Box::leak(s.to_string().into_boxed_str());
    pool.insert(leaked);
    leaked
}
634
635/// Represents a single analyzed token with its kind, source location, and original string segment.
636#[derive(Debug, Clone, PartialEq, Eq, Hash)]
637pub struct Token {
638    pub kind: TokenKind,
639    pub loc: Loc,
640    // source: &'static str,
641    pub source: TokenSource,
642}
643
644impl fmt::Display for Token {
645    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
646        match self.kind {
647            TokenKind::EOF => write!(f, "EOF"),
648            TokenKind::UnexpectedCharacter => {
649                write!(f, "Unexpected Character `{}`", self.source.escape_default())
650            }
651            TokenKind::InvalidEscapeSequence => {
652                write!(
653                    f,
654                    "Invalid Escape Sequence `{}`",
655                    self.source.escape_default()
656                )
657            }
658            TokenKind::UnterminatedStringLiteral => {
659                write!(
660                    f,
661                    "Unterminated String Literal `{}`",
662                    self.source.escape_default()
663                )
664            }
665            TokenKind::StringLiteral => write!(f, "{}", self.source.escape_default()),
666            TokenKind::CharacterLiteral => write!(f, "{}", self.source.escape_default()),
667            _ => write!(f, "{}", self.source),
668        }
669    }
670}
671
672impl Token {
673    /// Returns a string slice of the original text this token represents.
674    pub fn source(&self) -> &str {
675        // unsafe { transmute::<&'static str, &str>(self.source) }
676        &self.source
677    }
678
679    /// Creates a new `Token` from a given kind, location, and source string.
680    pub fn new(kind: TokenKind, loc: Loc, source: TokenSource) -> Self {
681        Self {
682            kind,
683            loc,
684            // source: unsafe { transmute::<&str, &'static str>(source) },
685            source,
686        }
687    }
688
689    /// Returns whether this token represents the End of File (`EOF`).
690    pub fn is_eof(&self) -> bool {
691        matches!(self.kind, TokenKind::EOF)
692    }
693
694    /// Attempts to unescape this token as a string literal.
695    pub fn unescape(&self) -> String {
696        match self.kind {
697            TokenKind::StringLiteral => token_string_unescape(self.source()),
698            _ => todo!(),
699        }
700    }
701}
/// Decodes the body of a quoted string literal.
///
/// The first character of `source` (the opening quote) is skipped. Decoding then
/// runs until an unescaped `"` or the end of the input. The escapes `\r`, `\n`,
/// `\"`, `\'`, `\\` and `\0` are translated; any other escape (or a trailing lone
/// backslash) stops decoding and returns what has been collected so far.
pub fn token_string_unescape(source: &str) -> String {
    let mut out = String::new();
    let mut chars = source.chars().skip(1);

    while let Some(ch) = chars.next() {
        match ch {
            '"' => break,
            '\\' => {
                let decoded = match chars.next() {
                    Some('r') => '\r',
                    Some('n') => '\n',
                    Some('"') => '"',
                    Some('\'') => '\'',
                    Some('\\') => '\\',
                    Some('0') => '\0',
                    _ => break,
                };
                out.push(decoded);
            }
            plain => out.push(plain),
        }
    }

    out
}
731
/// The specific type or category of a parsed token.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// End of input; also the default kind.
    #[default]
    EOF,
    /// A character that starts no known token.
    UnexpectedCharacter,
    /// A backslash in a string literal followed by an unsupported character.
    InvalidEscapeSequence,
    /// A string literal that was not closed.
    UnterminatedStringLiteral,

    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `{`
    OpenCurly,
    /// `}`
    CloseCurly,

    /// A name that is not in the lexer's keyword list.
    Identifier,
    /// A name that is in the lexer's keyword list.
    Keyword,

    /// `#` followed by word characters.
    Directive,

    /// A decimal literal containing a `.`.
    RealNumber,
    /// A double-quoted string literal.
    StringLiteral,
    /// A character literal.
    CharacterLiteral,

    /// `.`
    Dot,
    /// `...`
    Ellipsis,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `;`
    SemiColon,
    /// `->`
    Arrow,
    /// `\`
    BackSlash,

    /// `:=`
    Assign,
    /// `+=`
    PlusEq,
    /// `-=`
    MinusEq,
    /// `*=`
    AsteriskEq,
    /// `/=`
    SlashEq,
    /// `%=`
    ModEq,
    /// `!`
    Bang,
    /// `+`
    Plus,
    /// `++`
    Concat,
    /// `-`
    Minus,
    /// `*`
    Asterisk,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `==`
    EqEq,
    /// `!=`
    NotEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `%`
    Mod,
    /// `&`
    Ampersand,
    /// `|`
    Pipe,
    /// `^`
    Caret,
    /// `&&`
    DoubleAmpersand,
    /// `||`
    DoublePipe,

    /// `$`
    Dollar,
    /// A malformed numeric literal.
    InvalidNumber,

    /// An integer literal written in the given base.
    Number(NumberBase),
}

/// The numerical base of a parsed number token (e.g., Binary, Octal, Decimal, Hexadecimal).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberBase {
    /// Binary (radix 2).
    B,
    /// Octal (radix 8).
    O,
    /// Decimal (radix 10).
    D,
    /// Hexadecimal (radix 16).
    X,
}

impl NumberBase {
    /// Returns the radix of this base: 2, 8, 10 or 16.
    pub fn radix(&self) -> u32 {
        match self {
            NumberBase::B => 2,
            NumberBase::O => 8,
            NumberBase::D => 10,
            NumberBase::X => 16,
        }
    }
}

impl From<u32> for NumberBase {
    /// Converts a radix into its `NumberBase`.
    ///
    /// # Panics
    ///
    /// Panics when `value` is not 2, 8, 10 or 16.
    fn from(value: u32) -> Self {
        match value {
            2 => Self::B,
            8 => Self::O,
            10 => Self::D,
            16 => Self::X,
            other => panic!("Unknown base: {other}"),
        }
    }
}

impl From<NumberBase> for u32 {
    /// Converts a `NumberBase` into its radix (same table as `NumberBase::radix`).
    fn from(val: NumberBase) -> Self {
        val.radix()
    }
}

impl TokenKind {
    /// Returns `true` for the assignment operators `:=`, `=`, `+=`, `-=`, `*=`, `/=` and `%=`.
    pub fn is_assign_kind(&self) -> bool {
        matches!(
            self,
            Self::Assign
                | Self::Eq
                | Self::PlusEq
                | Self::MinusEq
                | Self::AsteriskEq
                | Self::SlashEq
                | Self::ModEq
        )
    }
}
854
/// A line/column position inside the lexed source.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Loc {
    /// Line number.
    pub line: usize,
    /// Column number.
    pub col: usize,
}

impl fmt::Display for Loc {
    /// Formats the location as `line:col`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Loc { line, col } = self;
        write!(f, "{}:{}", line, col)
    }
}

impl Loc {
    /// Creates a location from a line and a column.
    pub fn new(line: usize, col: usize) -> Self {
        Loc { line, col }
    }

    /// Moves one column to the right.
    pub fn next_column(&mut self) {
        self.col += 1;
    }

    /// Moves to column 1 of the following line.
    pub fn next_line(&mut self) {
        self.col = 1;
        self.line += 1;
    }

    /// Updates the location for one consumed character.
    ///
    /// A newline starts a new line, a tab jumps the column to the next multiple
    /// of 8, other control characters leave the location unchanged and every
    /// remaining character counts as one column (display width is not considered).
    pub fn next(&mut self, c: char) {
        const TAB_STOP: usize = 8;

        if c == '\n' {
            self.next_line();
        } else if c == '\t' {
            self.col = (self.col / TAB_STOP + 1) * TAB_STOP;
        } else if !c.is_control() {
            self.next_column();
        }
    }
}
899
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty source produces `EOF` on the first call and again on the next one.
    #[test]
    fn test_lexer_init_and_eof() {
        let mut lex = Lexer::new("");

        let first = lex.next();
        assert_eq!(first.kind, TokenKind::EOF);
        assert!(first.is_eof());

        assert_eq!(lex.next().kind, TokenKind::EOF);
    }

    /// `peek()` exposes the upcoming token and the following `next()` returns that same token.
    #[test]
    fn test_lexer_peek() {
        let mut lex = Lexer::new("abc");

        let lookahead = lex.peek().clone();
        assert_eq!(lookahead.kind, TokenKind::Identifier);
        assert_eq!(lookahead.source(), "abc");

        let consumed = lex.next();
        assert_eq!(consumed, lookahead);
        assert_eq!(lex.next().kind, TokenKind::EOF);
    }

    /// Leading whitespace and a `//` line comment are skipped before the first token.
    #[test]
    fn test_comment_skipping() {
        let mut lex = Lexer::new("   // this is a line comment\n  identifier");

        let ident = lex.next();
        assert_eq!(ident.kind, TokenKind::Identifier);
        assert_eq!(ident.source(), "identifier");
        // The comment occupied line 1, so the identifier is reported on line 2.
        assert_eq!(ident.loc.line, 2);

        assert_eq!(lex.next().kind, TokenKind::EOF);
    }

    /// A `#!` line at the very start of the source is skipped.
    #[test]
    fn test_shebang_skipping() {
        let mut lex = Lexer::new("#!/usr/bin/env rust\nidentifier");

        let ident = lex.next();
        assert_eq!(ident.kind, TokenKind::Identifier);
        assert_eq!(ident.source(), "identifier");
        assert_eq!(ident.loc.line, 2);
    }

    /// Words registered through `with_keywords` lex as `Keyword`; other words stay `Identifier`.
    #[test]
    fn test_keywords() {
        let mut lex = Lexer::new("var let my_ident").with_keywords(&["var", "let"]);

        let expected = [
            (TokenKind::Keyword, "var"),
            (TokenKind::Keyword, "let"),
            (TokenKind::Identifier, "my_ident"),
        ];
        for (kind, text) in expected {
            let tok = lex.next();
            assert_eq!(tok.kind, kind);
            assert_eq!(tok.source(), text);
        }
    }

    /// Identifiers may start with a letter or underscore and may contain digits afterwards.
    #[test]
    fn test_identifiers() {
        let mut lex = Lexer::new("a _a a123 _123_abc");

        for &name in &["a", "_a", "a123", "_123_abc"] {
            let tok = lex.next();
            assert_eq!(tok.kind, TokenKind::Identifier);
            assert_eq!(tok.source(), name);
        }
    }

    /// Token locations follow newlines and tab stops.
    #[test]
    fn test_location_tracking() {
        let mut lex = Lexer::new("a\n\tb");

        let first = lex.next();
        assert_eq!(first.source(), "a");
        assert_eq!(first.loc, Loc::new(1, 2));

        let second = lex.next();
        assert_eq!(second.source(), "b");
        // The '\n' after 'a' moves to line 2 and resets the column to 1.
        // The '\t' then snaps the column to the next tab stop:
        //     self.col = (self.col / ts) * ts + ts;
        // With ts = 8 this gives (1 / 8) * 8 + 8 = 8.
        // Reading 'b' (self.loc.next('b')) finally advances the column to 9.
        assert_eq!(second.loc, Loc::new(2, 9));
    }

    /// Every multi-character operator is recognised as a single token.
    #[test]
    fn test_multi_char_operators() {
        let mut lex = Lexer::new("-> == := <= >= != && || :: ...");

        let expected = [
            TokenKind::Arrow,
            TokenKind::EqEq,
            TokenKind::Assign,
            TokenKind::LtEq,
            TokenKind::GtEq,
            TokenKind::NotEq,
            TokenKind::DoubleAmpersand,
            TokenKind::DoublePipe,
            TokenKind::DoubleColon,
            TokenKind::Ellipsis,
        ];
        for kind in expected {
            assert_eq!(lex.next().kind, kind);
        }
    }

    /// Single-character punctuation and its compound (`++`, `+=`, ...) forms, in source order.
    #[test]
    fn test_single_and_compound_operators() {
        let source = ", ; : \\ = < > ! + ++ += - -= . * *= / /= % %= $ & ^ | ( ) [ ] { }";
        let mut lex = Lexer::new(source);

        let expected = [
            TokenKind::Comma,
            TokenKind::SemiColon,
            TokenKind::Colon,
            TokenKind::BackSlash,
            TokenKind::Eq,
            TokenKind::Lt,
            TokenKind::Gt,
            TokenKind::Bang,
            TokenKind::Plus,
            TokenKind::Concat, // ++
            TokenKind::PlusEq, // +=
            TokenKind::Minus,
            TokenKind::MinusEq, // -=
            TokenKind::Dot,
            TokenKind::Asterisk,
            TokenKind::AsteriskEq, // *=
            TokenKind::Slash,
            TokenKind::SlashEq, // /=
            TokenKind::Mod,
            TokenKind::ModEq, // %=
            TokenKind::Dollar,
            TokenKind::Ampersand,
            TokenKind::Caret,
            TokenKind::Pipe,
            TokenKind::OpenParen,
            TokenKind::CloseParen,
            TokenKind::OpenBracket,
            TokenKind::CloseBracket,
            TokenKind::OpenCurly,
            TokenKind::CloseCurly,
        ];
        for kind in expected {
            assert_eq!(lex.next().kind, kind);
        }
    }

    /// The first token of each source is a `Directive` with the expected text.
    #[test]
    fn test_directives() {
        let cases = [
            // Only the directive word is part of the token, not what follows it.
            ("#define ABC", "#define"),
            // A shebang on the first line is skipped; the directive after it is returned.
            ("#!/bin/bash\n#include", "#include"),
            // `#!` preceded by a space yields a bare `#` directive.
            (" #!", "#"),
        ];
        for (source, directive) in cases {
            let tok = Lexer::new(source).next();
            assert_eq!(tok.kind, TokenKind::Directive);
            assert_eq!(tok.source(), directive);
        }
    }

    /// Integer literals carry their base, and the token text excludes the `0b`/`0o`/`0x` prefix.
    #[test]
    fn test_numeric_bases() {
        let mut lex = Lexer::new("123 0b101 0o755 0xFF 1.23");

        let expected = [
            (TokenKind::Number(NumberBase::D), "123"),
            (TokenKind::Number(NumberBase::B), "101"),
            (TokenKind::Number(NumberBase::O), "755"),
            (TokenKind::Number(NumberBase::X), "FF"),
            (TokenKind::RealNumber, "1.23"),
        ];
        for (kind, digits) in expected {
            let tok = lex.next();
            assert_eq!(tok.kind, kind);
            assert_eq!(tok.source(), digits);
        }
    }

    /// `NumberBase` converts to and from its numeric radix in both directions.
    #[test]
    fn test_number_base_conversions() {
        // Binary.
        assert_eq!(NumberBase::B.radix(), 2);
        assert_eq!(NumberBase::from(2), NumberBase::B);
        assert_eq!(u32::from(NumberBase::B), 2);

        // Octal.
        assert_eq!(NumberBase::O.radix(), 8);
        assert_eq!(NumberBase::from(8), NumberBase::O);
        assert_eq!(u32::from(NumberBase::O), 8);

        // Decimal.
        assert_eq!(NumberBase::D.radix(), 10);
        assert_eq!(NumberBase::from(10), NumberBase::D);
        assert_eq!(u32::from(NumberBase::D), 10);

        // Hexadecimal.
        assert_eq!(NumberBase::X.radix(), 16);
        assert_eq!(NumberBase::from(16), NumberBase::X);
        assert_eq!(u32::from(NumberBase::X), 16);
    }

    /// Converting an unsupported radix panics.
    // NOTE(review): the expected text presumably mirrors the panic message verbatim,
    // spelling included; confirm against `NumberBase::from` before correcting either side.
    #[test]
    #[should_panic(expected = "Unkwon base")]
    fn test_number_base_panic() {
        let _ = NumberBase::from(3);
    }

    /// String literals: plain, with a valid escape, with an invalid escape, and unterminated.
    #[test]
    fn test_string_literals() {
        let plain = Lexer::new("\"hello\"").next();
        assert_eq!(plain.kind, TokenKind::StringLiteral);
        assert_eq!(plain.source(), "\"hello\"");
        assert_eq!(plain.unescape(), "hello");

        let escaped = Lexer::new("\"hello\\nworld\"").next();
        assert_eq!(escaped.kind, TokenKind::StringLiteral);
        assert_eq!(escaped.unescape(), "hello\nworld");

        // The error token's text stops at the backslash that starts the bad escape.
        let bad_escape = Lexer::new("\"hello\\x\"").next();
        assert_eq!(bad_escape.kind, TokenKind::InvalidEscapeSequence);
        assert_eq!(bad_escape.source(), "\"hello\\");

        let unterminated = Lexer::new("\"hello").next();
        assert_eq!(unterminated.kind, TokenKind::UnterminatedStringLiteral);
        assert_eq!(unterminated.source(), "\"hello");
    }

    /// `is_eof()` and the `Display` output of `Loc` and of several token kinds.
    #[test]
    fn test_token_helpers_and_display() {
        let at = Loc::new(5, 10);
        assert_eq!(format!("{}", at), "5:10");

        let ident = Token::new(TokenKind::Identifier, at, "foo".into());
        assert!(!ident.is_eof());

        let eof = Token::new(TokenKind::EOF, at, "".into());
        assert!(eof.is_eof());

        // Each token is paired with the text its `Display` impl must produce.
        let rendered = [
            (ident, "foo"),
            (eof, "EOF"),
            (
                Token::new(TokenKind::UnexpectedCharacter, at, "@".into()),
                "Unexpected Character `@`",
            ),
            (
                Token::new(TokenKind::InvalidEscapeSequence, at, "\\x".into()),
                "Invalid Escape Sequence `\\\\x`",
            ),
            (
                Token::new(TokenKind::UnterminatedStringLiteral, at, "\"abc".into()),
                "Unterminated String Literal `\\\"abc`",
            ),
            (
                Token::new(TokenKind::StringLiteral, at, "\"abc\"".into()),
                "\\\"abc\\\"",
            ),
            (
                Token::new(TokenKind::CharacterLiteral, at, "'a'".into()),
                "\\'a\\'",
            ),
        ];
        for (token, text) in rendered {
            assert_eq!(format!("{}", token), text);
        }
    }

    /// `is_assign_kind()` is true for the assignment operators and false for the other kinds.
    #[test]
    fn test_is_assign_kind() {
        let cases = [
            (TokenKind::Assign, true),
            (TokenKind::Eq, true),
            (TokenKind::PlusEq, true),
            (TokenKind::MinusEq, true),
            (TokenKind::AsteriskEq, true),
            (TokenKind::SlashEq, true),
            (TokenKind::ModEq, true),
            (TokenKind::Plus, false),
            (TokenKind::Identifier, false),
        ];
        // The index identifies the failing row, since the compared values are only booleans.
        for (index, (kind, expected)) in (0..).zip(cases) {
            assert_eq!(kind.is_assign_kind(), expected, "case #{}", index);
        }
    }

    /// A character the lexer does not recognise becomes an `UnexpectedCharacter` token.
    #[test]
    fn test_unexpected_character() {
        let stray = Lexer::new("@").next();
        assert_eq!(stray.kind, TokenKind::UnexpectedCharacter);
        assert_eq!(stray.source(), "@");
    }

    /// Two equal identifiers share storage with the `interning` feature and do not without it.
    #[test]
    fn test_string_interning_pointer_equality() {
        let mut lex = Lexer::new("my_var my_var");
        let first = lex.next();
        let second = lex.next();
        assert_eq!(first.source(), "my_var");
        assert_eq!(second.source(), "my_var");

        #[cfg(feature = "interning")]
        {
            assert!(std::ptr::eq(first.source.0, second.source.0));
        }

        #[cfg(not(feature = "interning"))]
        {
            assert!(!std::ptr::eq(
                first.source.0.as_ptr(),
                second.source.0.as_ptr()
            ));
        }
    }
}