#[derive(Debug, PartialEq, Clone)]
pub enum TokenType {
    /// End of input.
    Eof,
    /// A run of whitespace characters.
    Whitespace,
    /// A `//` line comment, holding its trimmed text.
    Comment(String),
    /// Input the lexer does not recognize.
    Unknown,

    Identifier(String),
    String(String),
    Number(f64),

    // Keywords.
    True,
    False,
    Null,
    Import,
    From,
    As,

    // Punctuation and operators.
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    LParen,
    RParen,
    Comma,
    Colon,
    DoubleColon,
    Dot,
    Equals,
    Hash,
    Dollar,
    Ampersand,
    Asterisk,
    Spread,
}

/// A lexed token: the token type plus the start and end byte offsets of
/// its span in the source text.
#[derive(Debug, Clone)]
pub struct Token {
    pub ttype: TokenType,
    pub pos_start: usize,
    pub pos_end: usize,
}

impl Token {
    #[must_use]
    pub fn new(ttype: TokenType, pos_start: usize, pos_end: usize) -> Token {
        Token {
            ttype,
            pos_start,
            pos_end,
        }
    }
}

/// Turns MON source text into [`Token`]s, tracking the current byte
/// offset so each token carries its span.
pub struct Lexer<'a> {
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    position: usize,
}

impl<'a> Lexer<'a> {
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self {
            chars: input.chars().peekable(),
            position: 0,
        }
    }

    /// Lexes the whole input, returning every token including the
    /// terminating `Eof`.
    pub fn lex(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token();
            if token.ttype == TokenType::Eof {
                tokens.push(token);
                break;
            }
            tokens.push(token);
        }
        tokens
    }

    /// Scans and returns the next token, consuming its characters.
    pub fn next_token(&mut self) -> Token {
        let start_pos = self.position;

        let ttype = if let Some(char) = self.advance() {
            match char {
                '{' => TokenType::LBrace,
                '}' => TokenType::RBrace,
                '[' => TokenType::LBracket,
                ']' => TokenType::RBracket,
                '(' => TokenType::LParen,
                ')' => TokenType::RParen,
                ',' => TokenType::Comma,
                '#' => TokenType::Hash,
                '$' => TokenType::Dollar,
                '&' => TokenType::Ampersand,
                '*' => TokenType::Asterisk,
                '=' => TokenType::Equals,

                // `::` or `:`.
                ':' => {
                    if self.peek() == Some(&':') {
                        self.advance();
                        TokenType::DoubleColon
                    } else {
                        TokenType::Colon
                    }
                }
                // `...` or `.`; a bare `..` is not a valid operator.
                '.' => {
                    if self.peek() == Some(&'.') {
                        self.advance();
                        if self.peek() == Some(&'.') {
                            self.advance();
                            TokenType::Spread
                        } else {
                            TokenType::Unknown
                        }
                    } else {
                        TokenType::Dot
                    }
                }
                // Only `//` line comments are recognized.
                '/' => {
                    if self.peek() == Some(&'/') {
                        self.read_comment()
                    } else {
                        TokenType::Unknown
                    }
                }
                '"' => self.read_string(),
                c if c.is_whitespace() => self.read_whitespace(),
                c if c.is_ascii_alphabetic() || c == '_' => self.read_identifier(c),
                // A digit, or a `-` immediately followed by a digit,
                // starts a number.
                c if c.is_ascii_digit()
                    || (c == '-' && self.peek().is_some_and(char::is_ascii_digit)) =>
                {
                    self.read_number(c)
                }

                _ => TokenType::Unknown,
            }
        } else {
            TokenType::Eof
        };

        Token::new(ttype, start_pos, self.position)
    }

    /// Consumes one character, advancing `position` by its UTF-8 length.
    fn advance(&mut self) -> Option<char> {
        let char = self.chars.next();
        if let Some(c) = char {
            self.position += c.len_utf8();
        }
        char
    }

    fn peek(&mut self) -> Option<&char> {
        self.chars.peek()
    }

    fn read_whitespace(&mut self) -> TokenType {
        while let Some(c) = self.peek() {
            if c.is_whitespace() {
                self.advance();
            } else {
                break;
            }
        }
        TokenType::Whitespace
    }

    /// Reads a `//` comment through the end of the line. Called with the
    /// first `/` already consumed.
    fn read_comment(&mut self) -> TokenType {
        // Consume the second `/` of the comment marker.
        self.advance();
        let mut comment_text = String::new();
        while let Some(c) = self.peek() {
            if *c == '\n' {
                break;
            }
            comment_text.push(self.advance().unwrap());
        }
        TokenType::Comment(comment_text.trim().to_string())
    }

    /// Reads a string literal, with the opening `"` already consumed.
    /// Returns `Unknown` if the input ends before the closing `"`.
    fn read_string(&mut self) -> TokenType {
        let mut value = String::new();
        loop {
            match self.peek() {
                Some('"') => {
                    // Consume the closing quote and finish.
                    self.advance();
                    return TokenType::String(value);
                }
                Some('\\') => {
                    // Consume the backslash, then map the escape.
                    self.advance();
                    match self.advance() {
                        Some('"') => value.push('"'),
                        Some('\\') => value.push('\\'),
                        Some('n') => value.push('\n'),
                        Some('r') => value.push('\r'),
                        Some('t') => value.push('\t'),
                        // Unrecognized escapes are kept verbatim.
                        Some(other) => {
                            value.push('\\');
                            value.push(other);
                        }
                        None => return TokenType::Unknown,
                    }
                }
                Some(c) => {
                    value.push(*c);
                    self.advance();
                }
                None => return TokenType::Unknown,
            }
        }
    }

    /// Reads an identifier or keyword starting with `first_char`.
    fn read_identifier(&mut self, first_char: char) -> TokenType {
        let mut ident = String::new();
        ident.push(first_char);

        while let Some(c) = self.peek() {
            if c.is_ascii_alphanumeric() || *c == '_' {
                ident.push(self.advance().unwrap());
            } else {
                break;
            }
        }

        // `on`/`off` are aliases for the boolean keywords.
        match ident.as_str() {
            "true" | "on" => TokenType::True,
            "false" | "off" => TokenType::False,
            "null" => TokenType::Null,
            "import" => TokenType::Import,
            "from" => TokenType::From,
            "as" => TokenType::As,
            _ => TokenType::Identifier(ident),
        }
    }

    /// Reads a number literal (optional leading `-`, at most one decimal
    /// point, at most one exponent), then defers to `f64` parsing.
    fn read_number(&mut self, first_char: char) -> TokenType {
        let mut number_str = String::new();
        number_str.push(first_char);
        let mut has_dot = first_char == '.';
        let mut has_exponent = false;

        while let Some(c) = self.peek() {
            if c.is_ascii_digit() {
                number_str.push(self.advance().unwrap());
            } else if *c == '.' && !has_dot {
                has_dot = true;
                number_str.push(self.advance().unwrap());
            } else if (*c == 'e' || *c == 'E') && !has_exponent {
                has_exponent = true;
                number_str.push(self.advance().unwrap());
                // An exponent may carry an explicit sign.
                if let Some(sign_char) = self.peek() {
                    if *sign_char == '+' || *sign_char == '-' {
                        number_str.push(self.advance().unwrap());
                    }
                }
            } else {
                break;
            }
        }

        if let Ok(num) = number_str.parse::<f64>() {
            TokenType::Number(num)
        } else {
            TokenType::Unknown
        }
    }
}

/// Renders tokens one per line as `type, start, end`, for debugging.
#[allow(dead_code)]
pub(crate) fn tokens_to_pretty_string(tokens: &[Token]) -> String {
    let mut buff: Vec<String> = Vec::with_capacity(tokens.len());

    for token in tokens {
        buff.push(format!(
            "{:?}, {}, {}",
            token.ttype, token.pos_start, token.pos_end,
        ));
    }

    buff.join("\n")
}

#[cfg(test)]
#[allow(clippy::needless_pass_by_value)]
#[allow(clippy::explicit_auto_deref)]
mod tests {
    use super::*;

    /// Lexes `input` and compares the token types against `expected`,
    /// ignoring whitespace and comment tokens.
    fn assert_tokens(input: &str, expected: &[TokenType]) {
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();
        let token_types: Vec<TokenType> = tokens.into_iter().map(|t| t.ttype).collect();

        let filtered_tokens: Vec<TokenType> = token_types
            .into_iter()
            .filter(|t| !matches!(t, TokenType::Whitespace | TokenType::Comment(_)))
            .collect();

        assert_eq!(filtered_tokens, expected);
    }

    #[test]
    fn test_eof() {
        assert_tokens("", &[TokenType::Eof]);
    }
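
    // Illustrative addition, not in the original suite: token spans are
    // byte offsets into the input, with `pos_end` exclusive.
    #[test]
    fn test_token_spans() {
        let mut lexer = Lexer::new("ab cd");

        let t1 = lexer.next_token();
        assert_eq!(t1.ttype, TokenType::Identifier("ab".to_string()));
        assert_eq!((t1.pos_start, t1.pos_end), (0, 2));

        let t2 = lexer.next_token(); // the single space
        assert_eq!((t2.pos_start, t2.pos_end), (2, 3));

        let t3 = lexer.next_token();
        assert_eq!(t3.ttype, TokenType::Identifier("cd".to_string()));
        assert_eq!((t3.pos_start, t3.pos_end), (3, 5));
    }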

    #[test]
    fn test_single_char_tokens() {
        let input = "{}[](),:#{new_string}*";
        let expected = vec![
            TokenType::LBrace,
            TokenType::RBrace,
            TokenType::LBracket,
            TokenType::RBracket,
            TokenType::LParen,
            TokenType::RParen,
            TokenType::Comma,
            TokenType::Colon,
            TokenType::Hash,
            TokenType::LBrace,
            TokenType::Identifier("new_string".to_string()),
            TokenType::RBrace,
            TokenType::Asterisk,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_multi_char_operators() {
        let input = ":: ...";
        let expected = vec![TokenType::DoubleColon, TokenType::Spread, TokenType::Eof];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_keywords() {
        let input = "true on false off null import from as";
        let expected = vec![
            TokenType::True,
            TokenType::True,
            TokenType::False,
            TokenType::False,
            TokenType::Null,
            TokenType::Import,
            TokenType::From,
            TokenType::As,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_identifiers() {
        let input = "foo bar_123 _baz";
        let expected = vec![
            TokenType::Identifier("foo".to_string()),
            TokenType::Identifier("bar_123".to_string()),
            TokenType::Identifier("_baz".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_numbers() {
        let input = "123 45.67 -10 0.5";
        let expected = vec![
            TokenType::Number(123.0),
            TokenType::Number(45.67),
            TokenType::Number(-10.0),
            TokenType::Number(0.5),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }
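
    // Illustrative addition, not in the original suite: a second `.` ends
    // the number, so "1.2.3" lexes as Number, Dot, Number.
    #[test]
    fn test_second_dot_ends_number() {
        let expected = vec![
            TokenType::Number(1.2),
            TokenType::Dot,
            TokenType::Number(3.0),
            TokenType::Eof,
        ];
        assert_tokens("1.2.3", &expected);
    }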

    #[test]
    fn test_comments_and_whitespace() {
        let input = " // this is a comment\n key: value // another one";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();
        let token_types: Vec<TokenType> = tokens.into_iter().map(|t| t.ttype).collect();

        let expected = vec![
            TokenType::Whitespace,
            TokenType::Comment("this is a comment".to_string()),
            TokenType::Whitespace,
            TokenType::Identifier("key".to_string()),
            TokenType::Colon,
            TokenType::Whitespace,
            TokenType::Identifier("value".to_string()),
            TokenType::Whitespace,
            TokenType::Comment("another one".to_string()),
            TokenType::Eof,
        ];

        assert_eq!(token_types, expected);
    }

    #[test]
    fn test_complex_mon_structure() {
        let input = r#"
        {
            // Config settings
            service_name: "My App",
            port: 8080,
            is_enabled: on,

            &default_user: {
                permissions: ["READ", "WRITE"],
            },

            admin :: User = {
                ...*default_user,
                name: "Admin",
            }
        }
        "#;
        let expected = vec![
            TokenType::LBrace,
            TokenType::Identifier("service_name".to_string()),
            TokenType::Colon,
            TokenType::String("My App".to_string()),
            TokenType::Comma,
            TokenType::Identifier("port".to_string()),
            TokenType::Colon,
            TokenType::Number(8080.0),
            TokenType::Comma,
            TokenType::Identifier("is_enabled".to_string()),
            TokenType::Colon,
            TokenType::True,
            TokenType::Comma,
            TokenType::Ampersand,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Colon,
            TokenType::LBrace,
            TokenType::Identifier("permissions".to_string()),
            TokenType::Colon,
            TokenType::LBracket,
            TokenType::String("READ".to_string()),
            TokenType::Comma,
            TokenType::String("WRITE".to_string()),
            TokenType::RBracket,
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::Comma,
            TokenType::Identifier("admin".to_string()),
            TokenType::DoubleColon,
            TokenType::Identifier("User".to_string()),
            TokenType::Equals,
            TokenType::LBrace,
            TokenType::Spread,
            TokenType::Asterisk,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Comma,
            TokenType::Identifier("name".to_string()),
            TokenType::Colon,
            TokenType::String("Admin".to_string()),
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::RBrace,
            TokenType::Eof,
        ];
        print!("{input}");
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_unclosed_string() {
        let input = r#"{ key: "unclosed }"#;
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();

        let has_unknown = tokens.iter().any(|t| matches!(t.ttype, TokenType::Unknown));
        assert!(has_unknown, "Should have Unknown token for unclosed string");
    }

    #[test]
    fn test_string_with_escapes() {
        let input = r#""hello\nworld\t\"test\"""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();

        match token.ttype {
            TokenType::String(s) => {
                assert!(s.contains('\n'));
                assert!(s.contains('\t'));
                assert!(s.contains('"'));
                assert_eq!(s, "hello\nworld\t\"test\"");
            }
            _ => panic!("Expected string token, got {:?}", token.ttype),
        }
    }

    #[test]
    fn test_invalid_escape_at_eof() {
        let input = r#""test\"#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::Unknown));
    }

    #[test]
    fn test_number_with_exponent() {
        let input = "1.23e10 4.5E-3";
        let mut lexer = Lexer::new(input);

        let tok1 = lexer.next_token();
        assert!(matches!(tok1.ttype, TokenType::Number(n) if (n - 1.23e10).abs() < 1e-6));

        // Skip the whitespace token between the two numbers.
        lexer.next_token();
        let tok2 = lexer.next_token();
        assert!(matches!(tok2.ttype, TokenType::Number(n) if (n - 4.5e-3).abs() < 1e-9));
    }
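
    // Illustrative addition, not in the original suite: a dangling
    // exponent marker fails the final f64 parse and yields Unknown.
    #[test]
    fn test_dangling_exponent_is_unknown() {
        let mut lexer = Lexer::new("1e");
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::Unknown));
    }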

    #[test]
    fn test_negative_numbers() {
        let input = "-42 -3.2";
        let expected = vec![
            TokenType::Number(-42.0),
            TokenType::Number(-3.2),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_dotdot_not_spread() {
        let input = "..";
        let mut lexer = Lexer::new(input);
        let tok1 = lexer.next_token();

        assert!(matches!(tok1.ttype, TokenType::Dot | TokenType::Unknown));
    }

    #[test]
    fn test_unknown_character() {
        let input = "{ @invalid }";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer.lex().into_iter().map(|t| t.ttype).collect();

        assert!(tokens.iter().any(|t| matches!(t, TokenType::Unknown)));
    }

    #[test]
    fn test_single_slash_not_comment() {
        let input = "test / value";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer.lex().into_iter().map(|t| t.ttype).collect();

        assert!(tokens.iter().any(|t| matches!(t, TokenType::Unknown)));
    }

    #[test]
    fn test_escape_r() {
        let input = r#""test\rvalue""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::String(s) if !s.is_empty()));
    }

    #[test]
    fn test_escape_backslash() {
        let input = r#""test\\value""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::String(s) if !s.is_empty()));
    }

    #[test]
    fn test_unknown_escape_preserved() {
        let input = r#""test\xvalue""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::String(_)));
    }

    #[test]
    fn test_zero_number() {
        assert_tokens("0", &[TokenType::Number(0.0), TokenType::Eof]);
    }

    #[test]
    fn test_decimal_point_only() {
        assert_tokens("3.69", &[TokenType::Number(3.69), TokenType::Eof]);
    }

    #[test]
    fn test_leading_decimal() {
        let input = ".5";
        let mut lexer = Lexer::new(input);
        let tok1 = lexer.next_token();
        let tok2 = lexer.next_token();
        assert!(matches!(tok1.ttype, TokenType::Dot));
        assert!(matches!(tok2.ttype, TokenType::Number(n) if (n - 5.0).abs() < f64::EPSILON));
    }

    #[test]
    fn test_multiline_comment() {
        let input = "// line 1\n// line 2\nvalue";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer
            .lex()
            .into_iter()
            .filter(|t| !matches!(t.ttype, TokenType::Whitespace | TokenType::Comment(_)))
            .map(|t| t.ttype)
            .collect();
        assert_eq!(
            tokens,
            vec![TokenType::Identifier("value".to_string()), TokenType::Eof]
        );
    }

    #[test]
    fn test_comment_at_eof() {
        let input = "value // comment at end";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer.lex().into_iter().map(|t| t.ttype).collect();
        assert!(tokens.iter().any(|t| matches!(t, TokenType::Comment(_))));
    }

    #[test]
    fn test_all_keywords() {
        let input = "true false null on off import from as";
        let expected = vec![
            TokenType::True,
            TokenType::False,
            TokenType::Null,
            TokenType::True,
            TokenType::False,
            TokenType::Import,
            TokenType::From,
            TokenType::As,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_identifiers_with_underscores() {
        let input = "my_var _private __dunder";
        let expected = vec![
            TokenType::Identifier("my_var".to_string()),
            TokenType::Identifier("_private".to_string()),
            TokenType::Identifier("__dunder".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_mixed_operators() {
        let input = ":: = ...";
        let expected = vec![
            TokenType::DoubleColon,
            TokenType::Equals,
            TokenType::Spread,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_adjacent_tokens_no_whitespace() {
        let input = "[1,2,3]";
        let mut lexer = Lexer::new(input);
        let tokens: Vec<TokenType> = lexer
            .lex()
            .into_iter()
            .filter(|t| !matches!(t.ttype, TokenType::Whitespace))
            .map(|t| t.ttype)
            .collect();
        // Seven tokens from the literal plus the trailing Eof.
        assert_eq!(tokens.len(), 8);
    }

    #[test]
    fn test_hash_token() {
        let input = "#struct";
        let expected = vec![
            TokenType::Hash,
            TokenType::Identifier("struct".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_dollar_token() {
        let input = "$Status.Active";
        let expected = vec![
            TokenType::Dollar,
            TokenType::Identifier("Status".to_string()),
            TokenType::Dot,
            TokenType::Identifier("Active".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_empty_string() {
        let input = r#""""#;
        let mut lexer = Lexer::new(input);
        let token = lexer.next_token();
        assert_eq!(token.ttype, TokenType::String("".to_string()));
    }
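
    // Illustrative addition, not in the original suite: a `-` that is not
    // immediately followed by a digit does not start a number and lexes
    // as Unknown.
    #[test]
    fn test_lone_minus_is_unknown() {
        let mut lexer = Lexer::new("-x");
        let token = lexer.next_token();
        assert!(matches!(token.ttype, TokenType::Unknown));
    }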
}