//! mon_core/lexer.rs — lexer (tokenizer) for the MON language.

/// All token kinds the lexer can emit.
/// A token is the smallest meaningful unit of MON syntax.
#[derive(Debug, PartialEq, Clone)]
pub enum TokenType {
    // == Special Tokens ==
    /// End of input.
    Eof,
    /// One or more consecutive whitespace characters (spaces, tabs, newlines).
    Whitespace,
    /// A `//` line comment running to the end of the line.
    /// The associated `String` holds the (trimmed) comment text.
    Comment(String),
    /// Input the lexer could not recognize.
    Unknown,

    // == Literals ==
    /// An identifier: keys, type names, anchor/alias names.
    /// Examples: `name`, `User`, `&default_user`.
    Identifier(String),
    /// A double-quoted string literal; the `String` holds the unescaped content.
    String(String),
    /// A numeric literal — integer or floating point — stored as `f64`.
    Number(f64),

    // == Keywords ==
    /// Boolean true, spelled `true` or `on`.
    True,
    /// Boolean false, spelled `false` or `off`.
    False,
    /// The `null` keyword for an empty or absent value.
    Null,
    /// The `import` keyword (module system).
    Import,
    /// The `from` keyword (named imports).
    From,
    /// The `as` keyword (namespacing imports).
    As,

    // == Punctuation & Operators ==
    /// Left Brace: `{`
    LBrace,
    /// Right Brace: `}`
    RBrace,
    /// Left Bracket: `[`
    LBracket,
    /// Right Bracket: `]`
    RBracket,
    /// Left Parenthesis: `(`
    LParen,
    /// Right Parenthesis: `)`
    RParen,
    /// Comma: `,`
    Comma,
    /// Colon: `:`
    Colon,
    /// Double Colon: `::` — used for type annotations.
    DoubleColon,
    /// Dot: `.` — used for namespace access.
    Dot,
    /// Equals: `=` — used when using structs.
    Equals,
    /// Hash: `#` — prefix for type definitions, e.g. `#struct`.
    Hash,
    /// Dollar Sign: `$` — used to access enum variants.
    Dollar,
    /// Ampersand: `&` — defines an anchor.
    Ampersand,
    /// Asterisk: `*` — creates an alias of an anchor.
    Asterisk,
    /// Spread: `...` — spreads an anchor into an object or array.
    Spread,
}
74
75/// A token with its type and position
76#[derive(Debug, Clone)]
77pub struct Token {
78    pub ttype: TokenType,
79    pub pos_start: usize,
80    pub pos_end: usize,
81}
82
83impl Token {
84    #[must_use] 
85    pub fn new(ttype: TokenType, pos_start: usize, pos_end: usize) -> Token {
86        Token {
87            ttype,
88            pos_start,
89            pos_end,
90        }
91    }
92}
93
/// Streaming tokenizer over an input string.
///
/// Walks the input one `char` at a time while tracking the current byte
/// offset, which supplies the `pos_start`/`pos_end` spans of each token.
pub struct Lexer<'a> {
    // Peekable character stream over the borrowed input.
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    // Current byte offset into the input (advanced by each char's UTF-8 width).
    position: usize,
}
98
99impl<'a> Lexer<'a> {
100    #[must_use] 
101    pub fn new(input: &'a str) -> Self {
102        Self {
103            chars: input.chars().peekable(),
104            position: 0,
105        }
106    }
107
108    pub fn lex(&mut self) -> Vec<Token> {
109        let mut tokens = Vec::new();
110        loop {
111            let token = self.next_token();
112            if token.ttype == TokenType::Eof {
113                tokens.push(token);
114                break;
115            }
116            tokens.push(token);
117        }
118        tokens
119    }
120
121    pub fn next_token(&mut self) -> Token {
122        let start_pos = self.position;
123
124        let ttype = if let Some(char) = self.advance() {
125            match char {
126                '{' => TokenType::LBrace,
127                '}' => TokenType::RBrace,
128                '[' => TokenType::LBracket,
129                ']' => TokenType::RBracket,
130                '(' => TokenType::LParen,
131                ')' => TokenType::RParen,
132                ',' => TokenType::Comma,
133                '#' => TokenType::Hash,
134                '$' => TokenType::Dollar,
135                '&' => TokenType::Ampersand,
136                '*' => TokenType::Asterisk,
137                '=' => TokenType::Equals,
138
139                ':' => {
140                    if self.peek() == Some(&':') {
141                        self.advance();
142                        TokenType::DoubleColon
143                    } else {
144                        TokenType::Colon
145                    }
146                }
147                '.' => {
148                    if self.peek() == Some(&'.') {
149                        self.advance();
150                        if self.peek() == Some(&'.') {
151                            self.advance();
152                            TokenType::Spread
153                        } else {
154                            TokenType::Unknown
155                        }
156                    } else {
157                        TokenType::Dot
158                    }
159                }
160                '/' => {
161                    if self.peek() == Some(&'/') {
162                        self.read_comment()
163                    } else {
164                        TokenType::Unknown
165                    }
166                }
167                '"' => self.read_string(),
168                c if c.is_whitespace() => self.read_whitespace(),
169                c if c.is_ascii_alphabetic() || c == '_' => self.read_identifier(c),
170                c if c.is_ascii_digit()
171                    || (c == '-' && self.peek().is_some_and(char::is_ascii_digit)) =>
172                {
173                    self.read_number(c)
174                }
175
176                _ => TokenType::Unknown,
177            }
178        } else {
179            TokenType::Eof
180        };
181
182        Token::new(ttype, start_pos, self.position)
183    }
184
185    fn advance(&mut self) -> Option<char> {
186        let char = self.chars.next();
187        if let Some(c) = char {
188            self.position += c.len_utf8();
189        }
190        char
191    }
192
193    fn peek(&mut self) -> Option<&char> {
194        self.chars.peek()
195    }
196
197    fn read_whitespace(&mut self) -> TokenType {
198        while let Some(c) = self.peek() {
199            if c.is_whitespace() {
200                self.advance();
201            } else {
202                break;
203            }
204        }
205        TokenType::Whitespace
206    }
207
208    fn read_comment(&mut self) -> TokenType {
209        self.advance(); // Consume the second '/'
210        let mut comment_text = String::new();
211        while let Some(c) = self.peek() {
212            if *c == '\n' {
213                break;
214            }
215            comment_text.push(self.advance().unwrap());
216        }
217        TokenType::Comment(comment_text.trim().to_string())
218    }
219
220    fn read_string(&mut self) -> TokenType {
221        let mut value = String::new();
222        loop {
223            match self.peek() {
224                Some('\"') => {
225                    self.advance(); // Consume the closing quote
226                    return TokenType::String(value);
227                }
228                Some('\\') => {
229                    self.advance(); // Consume the backslash
230                    match self.advance() {
231                        Some('\"') => value.push('\"'),
232                        Some('\\') => value.push('\\'),
233                        Some('n') => value.push('\n'),
234                        Some('r') => value.push('\r'),
235                        Some('t') => value.push('\t'),
236                        Some(other) => {
237                            value.push('\\');
238                            value.push(other);
239                        }
240                        None => return TokenType::Unknown, // Unclosed escape sequence
241                    }
242                }
243                Some(c) => {
244                    value.push(*c);
245                    self.advance();
246                }
247                None => return TokenType::Unknown, // Unclosed string
248            }
249        }
250    }
251
252    fn read_identifier(&mut self, first_char: char) -> TokenType {
253        let mut ident = String::new();
254        ident.push(first_char);
255
256        while let Some(c) = self.peek() {
257            if c.is_ascii_alphanumeric() || *c == '_' {
258                ident.push(self.advance().unwrap());
259            } else {
260                break;
261            }
262        }
263
264        match ident.as_str() {
265            "true" | "on" => TokenType::True,
266            "false" | "off" => TokenType::False,
267            "null" => TokenType::Null,
268            "import" => TokenType::Import,
269            "from" => TokenType::From,
270            "as" => TokenType::As,
271            _ => TokenType::Identifier(ident),
272        }
273    }
274
275    fn read_number(&mut self, first_char: char) -> TokenType {
276        let mut number_str = String::new();
277        number_str.push(first_char);
278        let mut has_dot = first_char == '.';
279        let mut has_exponent = false;
280
281        while let Some(c) = self.peek() {
282            if c.is_ascii_digit() {
283                number_str.push(self.advance().unwrap());
284            } else if *c == '.' && !has_dot {
285                has_dot = true;
286                number_str.push(self.advance().unwrap());
287            } else if (*c == 'e' || *c == 'E') && !has_exponent {
288                has_exponent = true;
289                number_str.push(self.advance().unwrap());
290                // Check for optional sign after 'e' or 'E'
291                if let Some(sign_char) = self.peek() {
292                    if *sign_char == '+' || *sign_char == '-' {
293                        number_str.push(self.advance().unwrap());
294                    }
295                }
296            } else {
297                break;
298            }
299        }
300
301        if let Ok(num) = number_str.parse::<f64>() {
302            TokenType::Number(num)
303        } else {
304            TokenType::Unknown
305        }
306    }
307}
308
309/// QOL function
310#[allow(dead_code)]
311pub(crate) fn tokens_to_pretty_string(tokens: &[Token]) -> String {
312    let mut buff: Vec<String> = Vec::with_capacity(tokens.len());
313
314    for token in tokens {
315        buff.push(format!(
316            "{:?}, {}, {}",
317            token.ttype, token.pos_start, token.pos_end
318        ));
319    }
320
321    buff.join("\n")
322}
323
#[cfg(test)]
// NOTE: test helpers take expected token vectors by value and deref through
// `Peekable::peek` references; both lints are noise in test code.
#[allow(clippy::needless_pass_by_value)]
#[allow(clippy::explicit_auto_deref)]
mod tests {
    use super::*;

    /// Lexes `input` and asserts the resulting token kinds — with
    /// whitespace and comment tokens filtered out — equal `expected`.
    fn assert_tokens(input: &str, expected: &[TokenType]) {
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();
        let token_types: Vec<TokenType> = tokens.into_iter().map(|t| t.ttype).collect();

        // Filter out whitespace and comments for most tests
        let filtered_tokens: Vec<TokenType> = token_types
            .into_iter()
            .filter(|t| !matches!(t, TokenType::Whitespace | TokenType::Comment(_)))
            .collect();

        assert_eq!(filtered_tokens, expected);
    }

    #[test]
    fn test_eof() {
        assert_tokens("", &[TokenType::Eof]);
    }

    #[test]
    fn test_single_char_tokens() {
        let input = "{}[](),:#{new_string}*";
        let expected = vec![
            TokenType::LBrace,
            TokenType::RBrace,
            TokenType::LBracket,
            TokenType::RBracket,
            TokenType::LParen,
            TokenType::RParen,
            TokenType::Comma,
            TokenType::Colon,
            TokenType::Hash,
            TokenType::LBrace,
            TokenType::Identifier("new_string".to_string()),
            TokenType::RBrace,
            TokenType::Asterisk,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_multi_char_operators() {
        let input = ":: ...";
        let expected = vec![TokenType::DoubleColon, TokenType::Spread, TokenType::Eof];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_keywords() {
        let input = "true on false off null import from as";
        let expected = vec![
            TokenType::True,
            TokenType::True,
            TokenType::False,
            TokenType::False,
            TokenType::Null,
            TokenType::Import,
            TokenType::From,
            TokenType::As,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_identifiers() {
        let input = "foo bar_123 _baz";
        let expected = vec![
            TokenType::Identifier("foo".to_string()),
            TokenType::Identifier("bar_123".to_string()),
            TokenType::Identifier("_baz".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_numbers() {
        let input = "123 45.67 -10 0.5";
        let expected = vec![
            TokenType::Number(123.0),
            TokenType::Number(45.67),
            TokenType::Number(-10.0),
            TokenType::Number(0.5),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_strings() {
        let input = r#""hello world" "" "another""#;
        let expected = vec![
            TokenType::String("hello world".to_string()),
            TokenType::String(String::new()),
            TokenType::String("another".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_strings_with_escapes() {
        let input = r#""hello \"world\"\t\n\r""#;
        let expected = vec![
            TokenType::String("hello \"world\"\t\n\r".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_comments_and_whitespace() {
        let input = " // this is a comment\n key: value // another one";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();
        let token_types: Vec<TokenType> = tokens.into_iter().map(|t| t.ttype).collect();

        let expected = vec![
            TokenType::Whitespace,
            TokenType::Comment("this is a comment".to_string()),
            TokenType::Whitespace,
            TokenType::Identifier("key".to_string()),
            TokenType::Colon,
            TokenType::Whitespace,
            TokenType::Identifier("value".to_string()),
            TokenType::Whitespace,
            TokenType::Comment("another one".to_string()),
            TokenType::Eof,
        ];

        assert_eq!(token_types, expected);
    }

    #[test]
    fn test_complex_mon_structure() {
        let input = r#"
{
    // Config settings
    service_name: "My App",
    port: 8080,
    is_enabled: on,

    &default_user: {
        permissions: ["READ", "WRITE"],
    },

    admin :: User = {
        ...*default_user,
        name: "Admin",
        }
}
            "#;
        let expected = vec![
            TokenType::LBrace,
            TokenType::Identifier("service_name".to_string()),
            TokenType::Colon,
            TokenType::String("My App".to_string()),
            TokenType::Comma,
            TokenType::Identifier("port".to_string()),
            TokenType::Colon,
            TokenType::Number(8080.0),
            TokenType::Comma,
            TokenType::Identifier("is_enabled".to_string()),
            TokenType::Colon,
            TokenType::True,
            TokenType::Comma,
            TokenType::Ampersand,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Colon,
            TokenType::LBrace,
            TokenType::Identifier("permissions".to_string()),
            TokenType::Colon,
            TokenType::LBracket,
            TokenType::String("READ".to_string()),
            TokenType::Comma,
            TokenType::String("WRITE".to_string()),
            TokenType::RBracket,
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::Comma,
            TokenType::Identifier("admin".to_string()),
            TokenType::DoubleColon,
            TokenType::Identifier("User".to_string()),
            TokenType::Equals,
            TokenType::LBrace,
            TokenType::Spread,
            TokenType::Asterisk,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Comma,
            TokenType::Identifier("name".to_string()),
            TokenType::Colon,
            TokenType::String("Admin".to_string()),
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::RBrace,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }
}