mon_core/
lexer.rs

//! # MON Lexer (Tokenizer)
//!
//! This module provides the `Lexer` for the MON language. The lexer, also known as a
//! tokenizer or scanner, is the first stage in the compilation process. It is responsible
//! for converting a raw source code string into a sequence of discrete `Token`s.
//!
//! ## Architectural Overview
//!
//! The `Lexer` is a hand-written, stateful iterator that scans the input character by character
//! to produce tokens. It recognizes all the fundamental building blocks of the language, such as:
//!
//! - **Literals:** Identifiers, strings, and numbers.
//! - **Keywords:** `true`, `false`, `null`, `import`, etc.
//! - **Punctuation:** Braces `{}`, brackets `[]`, commas `,`, colons `:`, etc.
//! - **Operators:** `::`, `...`, `&`, `*`, etc.
//! - **Whitespace and Comments:** These are also produced as tokens, which allows subsequent
//!   tools (like formatters or IDEs) to preserve them. The [`Parser`](crate::parser::Parser)
//!   typically filters them out.
//!
//! Each `Token` produced contains a [`TokenType`] and its start and end byte positions
//! in the original source, which is crucial for error reporting.
//!
//! ## Use Cases
//!
//! Direct use of the `Lexer` is less common than running the full [`analyze`](crate::api::analyze)
//! pipeline, but it is essential for tools that operate at the token level.
//!
//! - **Syntax Highlighting:** A syntax highlighter can use the lexer to assign colors to
//!   different token types (see the classification sketch after the example below).
//! - **Code Formatting:** A formatter (like `rustfmt`) uses the token stream, including
//!   whitespace and comments, to re-format the code according to a set of rules.
//! - **Debugging and Educational Tools:** It can be used to show how a source file is broken
//!   down into its most basic components.
//!
//! ## Example: Direct Lexer Usage
//!
//! ```rust
//! use mon_core::lexer::{Lexer, TokenType, Token};
//!
//! let source = "key: 123 // A number";
//!
//! // Create a new lexer for the source string.
//! let mut lexer = Lexer::new(source);
//!
//! // You can retrieve all tokens at once.
//! let tokens: Vec<Token> = lexer.lex();
//!
//! // Or process them one by one.
//! let mut lexer = Lexer::new(source);
//! assert_eq!(lexer.next_token().ttype, TokenType::Identifier("key".to_string()));
//! assert_eq!(lexer.next_token().ttype, TokenType::Colon);
//! assert_eq!(lexer.next_token().ttype, TokenType::Whitespace);
//! assert_eq!(lexer.next_token().ttype, TokenType::Number(123.0));
//! assert_eq!(lexer.next_token().ttype, TokenType::Whitespace);
//! assert!(matches!(lexer.next_token().ttype, TokenType::Comment(_)));
//! assert_eq!(lexer.next_token().ttype, TokenType::Eof);
//! ```
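//!
//! ## Example: Token Classification for Highlighting
//!
//! As referenced in the use cases above, a highlighter can be built by mapping each
//! token type to a style. This is a minimal sketch; the `Style` enum and the mapping
//! are illustrative assumptions, not part of this crate:
//!
//! ```rust
//! use mon_core::lexer::{Lexer, TokenType};
//!
//! // Hypothetical style categories for a highlighter.
//! #[derive(Debug, PartialEq)]
//! enum Style { Keyword, Literal, Comment, Punctuation, Plain }
//!
//! fn style_of(ttype: &TokenType) -> Style {
//!     match ttype {
//!         TokenType::True | TokenType::False | TokenType::Null
//!         | TokenType::Import | TokenType::From | TokenType::As => Style::Keyword,
//!         TokenType::String(_) | TokenType::Number(_) => Style::Literal,
//!         TokenType::Comment(_) => Style::Comment,
//!         TokenType::Identifier(_) | TokenType::Whitespace
//!         | TokenType::Eof | TokenType::Unknown => Style::Plain,
//!         _ => Style::Punctuation,
//!     }
//! }
//!
//! let tokens = Lexer::new("port: 8080 // tcp").lex();
//! assert_eq!(style_of(&tokens[3].ttype), Style::Literal); // 8080
//! ```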
/// Represents the different kinds of tokens that the lexer can produce.
/// Each token is a meaningful unit of the MON language syntax.
#[derive(Debug, PartialEq, Clone)]
pub enum TokenType {
    // == Special Tokens ==
    /// Represents the end of the input file.
    Eof,
    /// Represents a sequence of one or more whitespace characters (spaces, tabs, newlines).
    Whitespace,
    /// Represents a comment, starting with `//` and continuing to the end of the line.
    /// The associated `String` contains the content of the comment.
    Comment(String),
    /// Represents a token that could not be recognized by the lexer.
    Unknown,

    // == Literals ==
    /// An identifier, used for keys, type names, and anchor/alias names.
    /// Examples: `name`, `User`, `default_user`.
    Identifier(String),
    /// A string literal, enclosed in double quotes.
    /// The associated `String` holds the content of the string.
    String(String),
    /// A number literal, which can be an integer or a floating-point value.
    Number(f64),

    // == Keywords ==
    /// The boolean `true` value, which can be written as `true` or `on`.
    True,
    /// The boolean `false` value, which can be written as `false` or `off`.
    False,
    /// The `null` keyword, representing an empty or absent value.
    Null,
    /// The `import` keyword, used for the module system.
    Import,
    /// The `from` keyword, used for named imports.
    From,
    /// The `as` keyword, used for namespacing imports.
    As,

    // == Punctuation & Operators ==
    /// Left Brace: `{`
    LBrace,
    /// Right Brace: `}`
    RBrace,
    /// Left Bracket: `[`
    LBracket,
    /// Right Bracket: `]`
    RBracket,
    /// Left Parenthesis: `(`
    LParen,
    /// Right Parenthesis: `)`
    RParen,
    /// Comma: `,`
    Comma,
    /// Colon: `:`
    Colon,
    /// Double Colon: `::` (used for type annotations)
    DoubleColon,
    /// Dot: `.` (used for namespace access)
    Dot,
    /// Equals: `=` (used when instantiating structs)
    Equals,
    /// Hash: `#` (used as a prefix for type definitions, e.g., `#struct`)
    Hash,
    /// Dollar Sign: `$` (used for accessing enum variants)
    Dollar,
    /// Ampersand: `&` (used to define an anchor)
    Ampersand,
    /// Asterisk: `*` (used to create an alias of an anchor)
    Asterisk,
    /// Spread: `...` (used to spread an anchor into an object or array)
    Spread,
}

/// Represents a single lexical token, containing its type and position in the source text.
///
/// A `Token` is an atomic unit of the language syntax, like an identifier, a keyword, or a symbol.
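///
/// # Example
///
/// A token's byte span can be used to slice the matching text out of the original source:
///
/// ```rust
/// use mon_core::lexer::Lexer;
///
/// let source = "name: \"Ada\"";
/// let token = Lexer::new(source).next_token();
/// assert_eq!(&source[token.pos_start..token.pos_end], "name");
/// ```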
#[derive(Debug, Clone)]
pub struct Token {
    /// The type of the token, e.g., `TokenType::Identifier`.
    pub ttype: TokenType,
    /// The 0-based starting byte position of the token in the source string.
    pub pos_start: usize,
    /// The 0-based ending byte position of the token in the source string.
    pub pos_end: usize,
}

impl Token {
    /// Creates a new `Token`.
    #[must_use]
    pub fn new(ttype: TokenType, pos_start: usize, pos_end: usize) -> Token {
        Token {
            ttype,
            pos_start,
            pos_end,
        }
    }
}

/// A lexer for the MON language, also known as a tokenizer or scanner.
///
/// The `Lexer`'s primary role is to read MON source code as a stream of characters
/// and break it down into a sequence of [`Token`]s. Each token represents a
/// meaningful unit of the language, like an identifier, a number, or a punctuation mark.
///
/// The `Lexer` is the first step in the compilation pipeline, providing the input
/// for the [`Parser`](crate::parser::Parser).
///
/// # Example: How to use the Lexer
///
/// You can use the `Lexer` to tokenize a MON source string and inspect the tokens.
///
/// ```rust
/// use mon_core::lexer::{Lexer, TokenType, Token};
///
/// let source = "{ key: 123 }";
///
/// // 1. Create a new lexer for the source code.
/// let mut lexer = Lexer::new(source);
///
/// // 2. Use `lex()` to get all tokens.
/// let tokens: Vec<Token> = lexer.lex();
///
/// assert_eq!(tokens[0].ttype, TokenType::LBrace);
/// assert_eq!(tokens[1].ttype, TokenType::Whitespace);
/// assert_eq!(tokens[2].ttype, TokenType::Identifier("key".to_string()));
/// assert_eq!(tokens[3].ttype, TokenType::Colon);
/// assert_eq!(tokens[4].ttype, TokenType::Whitespace);
/// assert_eq!(tokens[5].ttype, TokenType::Number(123.0));
/// assert_eq!(tokens[6].ttype, TokenType::Whitespace);
/// assert_eq!(tokens[7].ttype, TokenType::RBrace);
/// assert_eq!(tokens[8].ttype, TokenType::Eof);
///
/// // Alternatively, you can process tokens one by one using `next_token`.
/// let mut lexer = Lexer::new(source);
/// assert_eq!(lexer.next_token().ttype, TokenType::LBrace);
/// assert_eq!(lexer.next_token().ttype, TokenType::Whitespace);
/// ```
pub struct Lexer<'a> {
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    position: usize,
}

impl<'a> Lexer<'a> {
    /// Creates a new `Lexer` for the given input string.
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self {
            chars: input.chars().peekable(),
            position: 0,
        }
    }

    /// Tokenizes the remaining input and returns a `Vec<Token>` containing all tokens from the source.
    ///
    /// This method will tokenize the entire input string up to and including the final [`TokenType::Eof`] token.
    pub fn lex(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token();
            let is_eof = token.ttype == TokenType::Eof;
            tokens.push(token);
            if is_eof {
                break;
            }
        }
        tokens
    }

    /// Scans and returns the next [`Token`] from the input stream.
    ///
    /// This is the core tokenizing function. When the end of the input is reached,
    /// it will repeatedly return a token of type [`TokenType::Eof`].
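    ///
    /// This makes it safe to keep pulling tokens past the end of input:
    ///
    /// ```rust
    /// use mon_core::lexer::{Lexer, TokenType};
    ///
    /// let mut lexer = Lexer::new("");
    /// assert_eq!(lexer.next_token().ttype, TokenType::Eof);
    /// // Past the end of the input, `next_token` keeps returning `Eof`.
    /// assert_eq!(lexer.next_token().ttype, TokenType::Eof);
    /// ```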
    pub fn next_token(&mut self) -> Token {
        let start_pos = self.position;

        let ttype = if let Some(char) = self.advance() {
            match char {
                '{' => TokenType::LBrace,
                '}' => TokenType::RBrace,
                '[' => TokenType::LBracket,
                ']' => TokenType::RBracket,
                '(' => TokenType::LParen,
                ')' => TokenType::RParen,
                ',' => TokenType::Comma,
                '#' => TokenType::Hash,
                '$' => TokenType::Dollar,
                '&' => TokenType::Ampersand,
                '*' => TokenType::Asterisk,
                '=' => TokenType::Equals,

                ':' => {
                    if self.peek() == Some(&':') {
                        self.advance();
                        TokenType::DoubleColon
                    } else {
                        TokenType::Colon
                    }
                }
                '.' => {
                    if self.peek() == Some(&'.') {
                        self.advance();
                        if self.peek() == Some(&'.') {
                            self.advance();
                            TokenType::Spread
                        } else {
                            TokenType::Unknown
                        }
                    } else {
                        TokenType::Dot
                    }
                }
                '/' => {
                    if self.peek() == Some(&'/') {
                        self.read_comment()
                    } else {
                        TokenType::Unknown
                    }
                }
                '"' => self.read_string(),
                c if c.is_whitespace() => self.read_whitespace(),
                c if c.is_ascii_alphabetic() || c == '_' => self.read_identifier(c),
                c if c.is_ascii_digit()
                    || (c == '-' && self.peek().is_some_and(char::is_ascii_digit)) =>
                {
                    self.read_number(c)
                }

                _ => TokenType::Unknown,
            }
        } else {
            TokenType::Eof
        };

        Token::new(ttype, start_pos, self.position)
    }

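    /// Consumes the next character, advancing `position` by its UTF-8 byte length.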
    fn advance(&mut self) -> Option<char> {
        let char = self.chars.next();
        if let Some(c) = char {
            self.position += c.len_utf8();
        }
        char
    }

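    /// Returns the next character without consuming it.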
    fn peek(&mut self) -> Option<&char> {
        self.chars.peek()
    }

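    /// Consumes a run of consecutive whitespace characters.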
    fn read_whitespace(&mut self) -> TokenType {
        while let Some(c) = self.peek() {
            if c.is_whitespace() {
                self.advance();
            } else {
                break;
            }
        }
        TokenType::Whitespace
    }

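    /// Reads a `//` comment up to the end of the line. The token carries the
    /// comment text with the leading slashes stripped and whitespace trimmed.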
    fn read_comment(&mut self) -> TokenType {
        self.advance(); // Consume the second '/'
        let mut comment_text = String::new();
        while let Some(c) = self.peek() {
            if *c == '\n' {
                break;
            }
            comment_text.push(self.advance().unwrap());
        }
        TokenType::Comment(comment_text.trim().to_string())
    }

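    /// Reads a double-quoted string literal, resolving the `\"`, `\\`, `\n`, `\r`,
    /// and `\t` escapes. Unrecognized escapes are preserved verbatim; an
    /// unterminated string or escape yields [`TokenType::Unknown`].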
    fn read_string(&mut self) -> TokenType {
        let mut value = String::new();
        loop {
            match self.peek() {
                Some('"') => {
                    self.advance(); // Consume the closing quote
                    return TokenType::String(value);
                }
                Some('\\') => {
                    self.advance(); // Consume the backslash
                    match self.advance() {
                        Some('"') => value.push('"'),
                        Some('\\') => value.push('\\'),
                        Some('n') => value.push('\n'),
                        Some('r') => value.push('\r'),
                        Some('t') => value.push('\t'),
                        Some(other) => {
                            value.push('\\');
                            value.push(other);
                        }
                        None => return TokenType::Unknown, // Unclosed escape sequence
                    }
                }
                Some(c) => {
                    value.push(*c);
                    self.advance();
                }
                None => return TokenType::Unknown, // Unclosed string
            }
        }
    }

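    /// Reads an identifier starting with `first_char`, mapping the keywords
    /// `true`/`on`, `false`/`off`, `null`, `import`, `from`, and `as` to their
    /// dedicated token types.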
    fn read_identifier(&mut self, first_char: char) -> TokenType {
        let mut ident = String::new();
        ident.push(first_char);

        while let Some(c) = self.peek() {
            if c.is_ascii_alphanumeric() || *c == '_' {
                ident.push(self.advance().unwrap());
            } else {
                break;
            }
        }

        match ident.as_str() {
            "true" | "on" => TokenType::True,
            "false" | "off" => TokenType::False,
            "null" => TokenType::Null,
            "import" => TokenType::Import,
            "from" => TokenType::From,
            "as" => TokenType::As,
            _ => TokenType::Identifier(ident),
        }
    }

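    /// Reads a number literal, accepting at most one decimal point and one
    /// exponent (`e`/`E` with an optional sign). Yields [`TokenType::Unknown`]
    /// if the collected text does not parse as an `f64`.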
    fn read_number(&mut self, first_char: char) -> TokenType {
        let mut number_str = String::new();
        number_str.push(first_char);
        let mut has_dot = first_char == '.';
        let mut has_exponent = false;

        while let Some(c) = self.peek() {
            if c.is_ascii_digit() {
                number_str.push(self.advance().unwrap());
            } else if *c == '.' && !has_dot {
                has_dot = true;
                number_str.push(self.advance().unwrap());
            } else if (*c == 'e' || *c == 'E') && !has_exponent {
                has_exponent = true;
                number_str.push(self.advance().unwrap());
                // Check for optional sign after 'e' or 'E'
                if let Some(sign_char) = self.peek() {
                    if *sign_char == '+' || *sign_char == '-' {
                        number_str.push(self.advance().unwrap());
                    }
                }
            } else {
                break;
            }
        }

        if let Ok(num) = number_str.parse::<f64>() {
            TokenType::Number(num)
        } else {
            TokenType::Unknown
        }
    }
}

/// Debugging helper that renders a slice of tokens as one
/// `TokenType, pos_start, pos_end` line per token.
#[allow(dead_code)]
pub(crate) fn tokens_to_pretty_string(tokens: &[Token]) -> String {
    let mut buff: Vec<String> = Vec::with_capacity(tokens.len());

    for token in tokens {
        buff.push(format!(
            "{:?}, {}, {}",
            token.ttype, token.pos_start, token.pos_end,
        ));
    }

    buff.join("\n")
}

#[cfg(test)]
#[allow(clippy::needless_pass_by_value)]
#[allow(clippy::explicit_auto_deref)]
mod tests {
    use super::*;

    fn assert_tokens(input: &str, expected: &[TokenType]) {
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();
        let token_types: Vec<TokenType> = tokens.into_iter().map(|t| t.ttype).collect();

        // Filter out whitespace and comments for most tests
        let filtered_tokens: Vec<TokenType> = token_types
            .into_iter()
            .filter(|t| !matches!(t, TokenType::Whitespace | TokenType::Comment(_)))
            .collect();

        assert_eq!(filtered_tokens, expected);
    }

    #[test]
    fn test_eof() {
        assert_tokens("", &[TokenType::Eof]);
    }

    #[test]
    fn test_single_char_tokens() {
        let input = "{}[](),:#{new_string}*";
        let expected = vec![
            TokenType::LBrace,
            TokenType::RBrace,
            TokenType::LBracket,
            TokenType::RBracket,
            TokenType::LParen,
            TokenType::RParen,
            TokenType::Comma,
            TokenType::Colon,
            TokenType::Hash,
            TokenType::LBrace,
            TokenType::Identifier("new_string".to_string()),
            TokenType::RBrace,
            TokenType::Asterisk,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_multi_char_operators() {
        let input = ":: ...";
        let expected = vec![TokenType::DoubleColon, TokenType::Spread, TokenType::Eof];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_keywords() {
        let input = "true on false off null import from as";
        let expected = vec![
            TokenType::True,
            TokenType::True,
            TokenType::False,
            TokenType::False,
            TokenType::Null,
            TokenType::Import,
            TokenType::From,
            TokenType::As,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_identifiers() {
        let input = "foo bar_123 _baz";
        let expected = vec![
            TokenType::Identifier("foo".to_string()),
            TokenType::Identifier("bar_123".to_string()),
            TokenType::Identifier("_baz".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_numbers() {
        let input = "123 45.67 -10 0.5";
        let expected = vec![
            TokenType::Number(123.0),
            TokenType::Number(45.67),
            TokenType::Number(-10.0),
            TokenType::Number(0.5),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_comments_and_whitespace() {
        let input = " // this is a comment\n key: value // another one";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();
        let token_types: Vec<TokenType> = tokens.into_iter().map(|t| t.ttype).collect();

        let expected = vec![
            TokenType::Whitespace,
            TokenType::Comment("this is a comment".to_string()),
            TokenType::Whitespace,
            TokenType::Identifier("key".to_string()),
            TokenType::Colon,
            TokenType::Whitespace,
            TokenType::Identifier("value".to_string()),
            TokenType::Whitespace,
            TokenType::Comment("another one".to_string()),
            TokenType::Eof,
        ];

        assert_eq!(token_types, expected);
    }

    #[test]
    fn test_complex_mon_structure() {
        let input = r#"
        {
        // Config settings
        service_name: "My App",
        port: 8080,
        is_enabled: on,

        &default_user: {
            permissions: ["READ", "WRITE"],
        },

        admin :: User = {
            ...*default_user,
            name: "Admin",
            }
        }
                    "#;
        let expected = vec![
            TokenType::LBrace,
            TokenType::Identifier("service_name".to_string()),
            TokenType::Colon,
            TokenType::String("My App".to_string()),
            TokenType::Comma,
            TokenType::Identifier("port".to_string()),
            TokenType::Colon,
            TokenType::Number(8080.0),
            TokenType::Comma,
            TokenType::Identifier("is_enabled".to_string()),
            TokenType::Colon,
            TokenType::True,
            TokenType::Comma,
            TokenType::Ampersand,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Colon,
            TokenType::LBrace,
            TokenType::Identifier("permissions".to_string()),
            TokenType::Colon,
            TokenType::LBracket,
            TokenType::String("READ".to_string()),
            TokenType::Comma,
            TokenType::String("WRITE".to_string()),
            TokenType::RBracket,
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::Comma,
            TokenType::Identifier("admin".to_string()),
            TokenType::DoubleColon,
            TokenType::Identifier("User".to_string()),
            TokenType::Equals,
            TokenType::LBrace,
            TokenType::Spread,
            TokenType::Asterisk,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Comma,
            TokenType::Identifier("name".to_string()),
            TokenType::Colon,
            TokenType::String("Admin".to_string()),
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::RBrace,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_unclosed_string() {
        let input = r#"{ key: "unclosed }"#;
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();

        // Unclosed string should return Unknown token
        let has_unknown = tokens.iter().any(|t| matches!(t.ttype, TokenType::Unknown));
        assert!(has_unknown, "Should have Unknown token for unclosed string");
    }

    #[test]
    fn test_string_with_escapes() {
        let input = r#""hello\nworld\t\"test\"""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();

        match token.ttype {
            TokenType::String(s) => {
                // The lexer actually processes the escapes
                assert!(s.contains('\n'));
                assert!(s.contains('\t'));
                assert!(s.contains('"'));
                assert_eq!(s, "hello\nworld\t\"test\"");
            }
            _ => panic!("Expected string token, got {:?}", token.ttype),
        }
    }

    #[test]
    fn test_invalid_escape_at_eof() {
        let input = r#""test\"#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::Unknown));
    }

    #[test]
    fn test_number_with_exponent() {
        let input = "1.23e10 4.5E-3";
        let mut lexer = Lexer::new(input);

        let tok1 = lexer.next_token();
        assert!(matches!(tok1.ttype, TokenType::Number(n) if (n - 1.23e10).abs() < 1e-6));

        lexer.next_token(); // whitespace
        let tok2 = lexer.next_token();
        assert!(matches!(tok2.ttype, TokenType::Number(n) if (n - 4.5e-3).abs() < 1e-9));
    }

    #[test]
    fn test_negative_numbers() {
        let input = "-42 -3.2";
        let expected = vec![
            TokenType::Number(-42.0),
            TokenType::Number(-3.2),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_dotdot_not_spread() {
        // ".." is not a valid spread operator (that requires three dots).
        let input = "..";
        let mut lexer = Lexer::new(input);
        let tok1 = lexer.next_token();

        // The lexer consumes ".." as a single Unknown token; a lone Dot is
        // also accepted here in case that behavior changes.
        assert!(matches!(tok1.ttype, TokenType::Dot | TokenType::Unknown));
    }

    #[test]
    fn test_unknown_character() {
        let input = "{ @invalid }";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer.lex().into_iter().map(|t| t.ttype).collect();

        // Should have Unknown token for @
        assert!(tokens.iter().any(|t| matches!(t, TokenType::Unknown)));
    }

    #[test]
    fn test_single_slash_not_comment() {
        let input = "test / value";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer.lex().into_iter().map(|t| t.ttype).collect();

        // Single slash should produce Unknown
        assert!(tokens.iter().any(|t| matches!(t, TokenType::Unknown)));
    }

    #[test]
    fn test_escape_r() {
        let input = r#""test\rvalue""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::String(s) if !s.is_empty()));
    }

    #[test]
    fn test_escape_backslash() {
        let input = r#""test\\value""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::String(s) if !s.is_empty()));
    }

    #[test]
    fn test_unknown_escape_preserved() {
        let input = r#""test\xvalue""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        // String should parse successfully
        assert!(matches!(token.ttype, TokenType::String(_)));
    }

    #[test]
    fn test_zero_number() {
        assert_tokens("0", &[TokenType::Number(0.0), TokenType::Eof]);
    }

    #[test]
    fn test_decimal_point_only() {
        assert_tokens("3.69", &[TokenType::Number(3.69), TokenType::Eof]);
    }
    #[test]
    fn test_leading_decimal() {
        // ".5" should be parsed as a dot followed by a number
        let input = ".5";
        let mut lexer = Lexer::new(input);
        let tok1 = lexer.next_token();
        let tok2 = lexer.next_token();
        assert!(matches!(tok1.ttype, TokenType::Dot));
        assert!(matches!(tok2.ttype, TokenType::Number(n) if n == 5.0));
    }

    #[test]
    fn test_multiline_comment() {
        let input = "// line 1\n// line 2\nvalue";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer
            .lex()
            .into_iter()
            .filter(|t| !matches!(t.ttype, TokenType::Whitespace | TokenType::Comment(_)))
            .map(|t| t.ttype)
            .collect();
        assert_eq!(
            tokens,
            vec![TokenType::Identifier("value".to_string()), TokenType::Eof]
        );
    }

    #[test]
    fn test_comment_at_eof() {
        let input = "value // comment at end";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer.lex().into_iter().map(|t| t.ttype).collect();
        assert!(tokens.iter().any(|t| matches!(t, TokenType::Comment(_))));
    }

    #[test]
    fn test_all_keywords() {
        let input = "true false null on off import from as";
        let expected = vec![
            TokenType::True,
            TokenType::False,
            TokenType::Null,
            TokenType::True,  // 'on' maps to true
            TokenType::False, // 'off' maps to false
            TokenType::Import,
            TokenType::From,
            TokenType::As,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_identifiers_with_underscores() {
        let input = "my_var _private __dunder";
        let expected = vec![
            TokenType::Identifier("my_var".to_string()),
            TokenType::Identifier("_private".to_string()),
            TokenType::Identifier("__dunder".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_mixed_operators() {
        let input = ":: = ...";
        let expected = vec![
            TokenType::DoubleColon,
            TokenType::Equals,
            TokenType::Spread,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_adjacent_tokens_no_whitespace() {
        let input = "[1,2,3]";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer
            .lex()
            .into_iter()
            .filter(|t| !matches!(t.ttype, TokenType::Whitespace))
            .map(|t| t.ttype)
            .collect();
        assert_eq!(tokens.len(), 8); // [, 1, ,, 2, ,, 3, ], EOF
    }

    #[test]
    fn test_hash_token() {
        let input = "#struct";
        let expected = vec![
            TokenType::Hash,
            TokenType::Identifier("struct".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_dollar_token() {
        let input = "$Status.Active";
        let expected = vec![
            TokenType::Dollar,
            TokenType::Identifier("Status".to_string()),
            TokenType::Dot,
            TokenType::Identifier("Active".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_empty_string() {
        let input = r#""""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert_eq!(token.ttype, TokenType::String("".to_string()));
    }
}