1pub use crate::span::{Position, Span};
29use std::fmt;
30
/// A lexical token produced by the `Lexer`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // --- Literals ---
    /// Integer literal, e.g. `42`.
    Int(i64),
    /// Floating-point literal, e.g. `3.14`.
    Float(f64),
    /// Boolean literal, lexed from the keywords `true` / `false`.
    Bool(bool),
    /// String literal with escape sequences already resolved.
    String(String),

    /// Identifier; also covers `name!` forms that are not bang keywords.
    Ident(String),

    // --- Keywords ---
    Let,
    Rec,
    /// The `and` keyword (named to avoid clashing with the `&&` token `And`).
    AndKeyword,
    In,
    If,
    Then,
    Else,
    Fun,
    Match,
    Type,
    With,
    Of,
    Open,
    Module,
    Do,
    While,
    Break,
    Continue,
    Async,
    Return,
    Yield,
    /// `let!` — computation-expression form of `let`.
    LetBang,
    /// `do!` — computation-expression form of `do`.
    DoBang,
    /// `return!` — computation-expression form of `return`.
    ReturnBang,
    /// `yield!` — computation-expression form of `yield`.
    YieldBang,

    // --- Operators ---
    Plus,
    Minus,
    Star,
    Slash,
    /// `=`
    Eq,
    /// `==`
    EqEq,
    /// `<>`
    Neq,
    Lt,
    Lte,
    Gt,
    Gte,
    /// `&&`
    And,
    /// `||`
    Or,
    /// `::`
    ColonColon,
    /// `<-`
    LArrow,
    /// `|>`
    PipeRight,
    /// `++`
    PlusPlus,

    // --- Delimiters and punctuation ---
    LParen,
    RParen,
    LBracket,
    RBracket,
    /// `[|`
    LBracketPipe,
    /// `|]`
    PipeRBracket,
    /// `->`
    Arrow,
    Comma,
    Semicolon,
    Dot,
    LBrace,
    RBrace,
    /// `{|` — opens an anonymous record.
    LBracePipe,
    /// `|}` — closes an anonymous record.
    PipeRBrace,
    Colon,
    Pipe,
    Underscore,

    /// `#load "path"` directive, carrying the path argument.
    LoadDirective(String),

    /// End of input; always the final token in a token stream.
    Eof,
}
180
181impl fmt::Display for Token {
182 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
183 match self {
184 Token::Int(n) => write!(f, "Int({})", n),
185 Token::Float(n) => write!(f, "Float({})", n),
186 Token::Bool(b) => write!(f, "Bool({})", b),
187 Token::String(s) => write!(f, "String(\"{}\")", s),
188 Token::Ident(s) => write!(f, "Ident({})", s),
189 Token::Let => write!(f, "let"),
190 Token::In => write!(f, "in"),
191 Token::Rec => write!(f, "rec"),
192 Token::AndKeyword => write!(f, "and"),
193 Token::If => write!(f, "if"),
194 Token::Then => write!(f, "then"),
195 Token::Else => write!(f, "else"),
196 Token::Fun => write!(f, "fun"),
197 Token::Match => write!(f, "match"),
198 Token::Type => write!(f, "type"),
199 Token::With => write!(f, "with"),
200 Token::Plus => write!(f, "+"),
201 Token::Minus => write!(f, "-"),
202 Token::Star => write!(f, "*"),
203 Token::Slash => write!(f, "/"),
204 Token::Eq => write!(f, "="),
205 Token::EqEq => write!(f, "=="),
206 Token::Neq => write!(f, "<>"),
207 Token::Lt => write!(f, "<"),
208 Token::Lte => write!(f, "<="),
209 Token::Gt => write!(f, ">"),
210 Token::Gte => write!(f, ">="),
211 Token::And => write!(f, "&&"),
212 Token::Or => write!(f, "||"),
213 Token::ColonColon => write!(f, "::"),
214 Token::LArrow => write!(f, "<-"),
215 Token::PipeRight => write!(f, "|>"),
216 Token::PlusPlus => write!(f, "++"),
217 Token::LParen => write!(f, "("),
218 Token::RParen => write!(f, ")"),
219 Token::LBracket => write!(f, "["),
220 Token::RBracket => write!(f, "]"),
221 Token::LBracketPipe => write!(f, "[|"),
222 Token::PipeRBracket => write!(f, "|]"),
223 Token::Arrow => write!(f, "->"),
224 Token::Comma => write!(f, ","),
225 Token::Semicolon => write!(f, ";"),
226 Token::Dot => write!(f, "."),
227 Token::LBrace => write!(f, "{{"),
228 Token::RBrace => write!(f, "}}"),
229 Token::LBracePipe => write!(f, "{{|"),
230 Token::PipeRBrace => write!(f, "|}}"),
231 Token::Colon => write!(f, ":"),
232 Token::Pipe => write!(f, "|"),
233 Token::Underscore => write!(f, "_"),
234 Token::Of => write!(f, "of"),
235 Token::Open => write!(f, "open"),
236 Token::Module => write!(f, "module"),
237 Token::Do => write!(f, "do"),
238 Token::While => write!(f, "while"),
239 Token::Break => write!(f, "break"),
240 Token::Continue => write!(f, "continue"),
241 Token::Async => write!(f, "async"),
242 Token::Return => write!(f, "return"),
243 Token::Yield => write!(f, "yield"),
244 Token::LetBang => write!(f, "let!"),
245 Token::DoBang => write!(f, "do!"),
246 Token::ReturnBang => write!(f, "return!"),
247 Token::YieldBang => write!(f, "yield!"),
248 Token::LoadDirective(path) => write!(f, "#load \"{}\"", path),
249 Token::Eof => write!(f, "EOF"),
250 }
251 }
252}
253
/// A token paired with the source [`Position`] at which it starts.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenWithPos {
    /// The lexed token.
    pub token: Token,
    /// Start position of the token in the source text.
    pub pos: Position,
}
262
263impl TokenWithPos {
264 pub fn new(token: Token, pos: Position) -> Self {
266 TokenWithPos { token, pos }
267 }
268}
269
/// A token paired with the full start..end [`Span`] it occupies.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenWithSpan {
    /// The lexed token.
    pub token: Token,
    /// Source range covered by the token.
    pub span: Span,
}
278
279impl TokenWithSpan {
280 pub fn new(token: Token, span: Span) -> Self {
282 TokenWithSpan { token, span }
283 }
284}
285
/// Errors the lexer can report; each variant carries the [`Position`] where
/// the problem was detected.
#[derive(Debug, Clone, PartialEq)]
pub enum LexError {
    /// A character that cannot start any token.
    UnexpectedChar(char, Position),
    /// A string literal with no closing `"` before end of input.
    UnterminatedString(Position),
    /// A numeric literal that failed to parse as `i64` / `f64`.
    InvalidNumber(String, Position),
    /// A `(* ... *)` comment still open at end of input.
    UnterminatedComment(Position),
    /// An unrecognized `#` directive (also used for a malformed `#load`).
    UnknownDirective(String, Position),
}
300
301impl fmt::Display for LexError {
302 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
303 match self {
304 LexError::UnexpectedChar(ch, pos) => {
305 write!(f, "Unexpected character '{}' at {}", ch, pos)
306 }
307 LexError::UnterminatedString(pos) => {
308 write!(f, "Unterminated string literal at {}", pos)
309 }
310 LexError::InvalidNumber(s, pos) => {
311 write!(f, "Invalid number '{}' at {}", s, pos)
312 }
313 LexError::UnterminatedComment(pos) => {
314 write!(f, "Unterminated multi-line comment at {}", pos)
315 }
316 LexError::UnknownDirective(name, pos) => {
317 write!(f, "Unknown directive '{}' at {}", name, pos)
318 }
319 }
320 }
321}
322
// Marker impl so LexError can travel as a `Box<dyn std::error::Error>`.
impl std::error::Error for LexError {}
324
/// Hand-written lexer that turns source text into [`Token`]s.
///
/// The input is pre-collected into a `Vec<char>` so indexing is
/// per-character (multi-byte UTF-8 safe); line/column are tracked for
/// diagnostics.
pub struct Lexer {
    /// Source text as individual characters.
    input: Vec<char>,
    /// Index of the next unread character in `input`.
    pos: usize,
    /// Current line, 1-based; maintained by the whitespace/comment skippers.
    line: usize,
    /// Current column, 1-based; bumped by `advance`.
    column: usize,
}
338
339impl Lexer {
340 pub fn new(input: &str) -> Self {
342 Lexer {
343 input: input.chars().collect(),
344 pos: 0,
345 line: 1,
346 column: 1,
347 }
348 }
349
350 pub fn tokenize(&mut self) -> Result<Vec<TokenWithPos>, LexError> {
352 let mut tokens = Vec::new();
353
354 while !self.is_at_end() {
355 self.skip_whitespace_and_comments()?;
356 if self.is_at_end() {
357 break;
358 }
359
360 let start_pos = self.current_position();
361 let token = self.next_token()?;
362 tokens.push(TokenWithPos::new(token, start_pos));
363 }
364
365 tokens.push(TokenWithPos::new(Token::Eof, self.current_position()));
366
367 Ok(tokens)
368 }
369
370 pub fn tokenize_with_spans(&mut self) -> Result<Vec<TokenWithSpan>, LexError> {
372 let mut tokens = Vec::new();
373
374 while !self.is_at_end() {
375 self.skip_whitespace_and_comments()?;
376 if self.is_at_end() {
377 break;
378 }
379
380 let start_pos = self.current_position();
381 let token = self.next_token()?;
382 let end_pos = self.current_position();
383 let span = Span::new(start_pos, end_pos);
384 tokens.push(TokenWithSpan::new(token, span));
385 }
386
387 let eof_pos = self.current_position();
388 tokens.push(TokenWithSpan::new(Token::Eof, Span::point(eof_pos)));
389
390 Ok(tokens)
391 }
392
393 fn next_token(&mut self) -> Result<Token, LexError> {
395 let ch = self.current_char();
396
397 match ch {
398 '0'..='9' => self.lex_number(),
399 'a'..='z' | 'A'..='Z' => self.lex_identifier_or_keyword(),
400 '_' => {
401 if !self.is_at_end_or(1)
403 && (self.peek_char().is_alphanumeric() || self.peek_char() == '_')
404 {
405 self.lex_identifier_or_keyword()
406 } else {
407 self.advance();
408 Ok(Token::Underscore)
409 }
410 }
411 '"' => self.lex_string(),
412 '+' => self.lex_plus_or_plusplus(),
413 '-' => self.lex_minus_or_arrow(),
414 '*' => {
415 self.advance();
416 Ok(Token::Star)
417 }
418 '/' => {
419 self.advance();
420 Ok(Token::Slash)
421 }
422 '=' => self.lex_eq_or_eqeq(),
423 '<' => self.lex_lt_or_lte_or_neq_or_larrow(),
424 '>' => self.lex_gt_or_gte(),
425 '&' => self.lex_and(),
426 '|' => {
427 self.advance();
428 if !self.is_at_end() && self.current_char() == '|' {
429 self.advance();
430 Ok(Token::Or)
431 } else if !self.is_at_end() && self.current_char() == '>' {
432 self.advance();
433 Ok(Token::PipeRight)
434 } else if !self.is_at_end() && self.current_char() == ']' {
435 self.advance();
436 Ok(Token::PipeRBracket)
437 } else if !self.is_at_end() && self.current_char() == '}' {
438 self.advance();
439 Ok(Token::PipeRBrace)
440 } else {
441 Ok(Token::Pipe)
442 }
443 }
444 ':' => self.lex_colon_or_coloncolon(),
445 '(' => {
446 self.advance();
447 Ok(Token::LParen)
448 }
449 ')' => {
450 self.advance();
451 Ok(Token::RParen)
452 }
453 '[' => self.lex_lbracket_or_lbracket_pipe(),
454 ']' => {
455 self.advance();
456 Ok(Token::RBracket)
457 }
458 ',' => {
459 self.advance();
460 Ok(Token::Comma)
461 }
462 ';' => {
463 self.advance();
464 Ok(Token::Semicolon)
465 }
466 '{' => self.lex_lbrace_or_lbrace_pipe(),
467 '}' => {
468 self.advance();
469 Ok(Token::RBrace)
470 }
471 '.' => {
472 self.advance();
473 Ok(Token::Dot)
474 }
475 '#' => self.lex_directive(),
476 _ => Err(LexError::UnexpectedChar(ch, self.current_position())),
477 }
478 }
479
480 fn lex_number(&mut self) -> Result<Token, LexError> {
482 let start = self.pos;
483 let start_pos = self.current_position();
484
485 while !self.is_at_end() && self.current_char().is_ascii_digit() {
486 self.advance();
487 }
488
489 if !self.is_at_end()
491 && self.current_char() == '.'
492 && !self.is_at_end_or(1)
493 && self.peek_char().is_ascii_digit()
494 {
495 self.advance(); while !self.is_at_end() && self.current_char().is_ascii_digit() {
497 self.advance();
498 }
499 let s: String = self.input[start..self.pos].iter().collect();
500 s.parse::<f64>()
501 .map(Token::Float)
502 .map_err(|_| LexError::InvalidNumber(s, start_pos))
503 } else {
504 let s: String = self.input[start..self.pos].iter().collect();
505 s.parse::<i64>()
506 .map(Token::Int)
507 .map_err(|_| LexError::InvalidNumber(s, start_pos))
508 }
509 }
510
511 fn lex_identifier_or_keyword(&mut self) -> Result<Token, LexError> {
513 let start = self.pos;
514
515 while !self.is_at_end()
516 && (self.current_char().is_alphanumeric() || self.current_char() == '_')
517 {
518 self.advance();
519 }
520
521 let s: String = self.input[start..self.pos].iter().collect();
522
523 if !self.is_at_end() && self.current_char() == '!' {
525 self.advance(); let token = match s.as_str() {
527 "let" => Token::LetBang,
528 "do" => Token::DoBang,
529 "return" => Token::ReturnBang,
530 "yield" => Token::YieldBang,
531 _ => {
532 Token::Ident(format!("{}!", s))
534 }
535 };
536 return Ok(token);
537 }
538
539 let token = match s.as_str() {
540 "let" => Token::Let,
541 "in" => Token::In,
542 "rec" => Token::Rec,
543 "match" => Token::Match,
544 "and" => Token::AndKeyword,
545 "if" => Token::If,
546 "then" => Token::Then,
547 "else" => Token::Else,
548 "fun" => Token::Fun,
549 "type" => Token::Type,
550 "with" => Token::With,
551 "of" => Token::Of,
552 "open" => Token::Open,
553 "module" => Token::Module,
554 "do" => Token::Do,
555 "while" => Token::While,
556 "break" => Token::Break,
557 "continue" => Token::Continue,
558 "async" => Token::Async,
559 "return" => Token::Return,
560 "yield" => Token::Yield,
561 "true" => Token::Bool(true),
562 "false" => Token::Bool(false),
563 _ => Token::Ident(s),
564 };
565
566 Ok(token)
567 }
568
569 fn lex_string(&mut self) -> Result<Token, LexError> {
571 let start_pos = self.current_position();
572 self.advance(); let mut s = String::new();
575
576 while !self.is_at_end() && self.current_char() != '"' {
577 let ch = self.current_char();
578 if ch == '\\' {
579 self.advance();
580 if self.is_at_end() {
581 return Err(LexError::UnterminatedString(start_pos));
582 }
583 let escaped = match self.current_char() {
584 'n' => '\n',
585 't' => '\t',
586 'r' => '\r',
587 '\\' => '\\',
588 '"' => '"',
589 c => c, };
591 s.push(escaped);
592 self.advance();
593 } else {
594 s.push(ch);
595 self.advance();
596 }
597 }
598
599 if self.is_at_end() {
600 return Err(LexError::UnterminatedString(start_pos));
601 }
602
603 self.advance(); Ok(Token::String(s))
605 }
606
607 fn lex_plus_or_plusplus(&mut self) -> Result<Token, LexError> {
609 self.advance();
610 if !self.is_at_end() && self.current_char() == '+' {
611 self.advance();
612 Ok(Token::PlusPlus)
613 } else {
614 Ok(Token::Plus)
615 }
616 }
617
618 fn lex_minus_or_arrow(&mut self) -> Result<Token, LexError> {
620 self.advance();
621 if !self.is_at_end() && self.current_char() == '>' {
622 self.advance();
623 Ok(Token::Arrow)
624 } else {
625 Ok(Token::Minus)
626 }
627 }
628
629 fn lex_eq_or_eqeq(&mut self) -> Result<Token, LexError> {
631 self.advance();
632 if !self.is_at_end() && self.current_char() == '=' {
633 self.advance();
634 Ok(Token::EqEq)
635 } else {
636 Ok(Token::Eq)
637 }
638 }
639
640 fn lex_lt_or_lte_or_neq_or_larrow(&mut self) -> Result<Token, LexError> {
642 self.advance();
643 if !self.is_at_end() {
644 match self.current_char() {
645 '=' => {
646 self.advance();
647 Ok(Token::Lte)
648 }
649 '>' => {
650 self.advance();
651 Ok(Token::Neq)
652 }
653 '-' => {
654 self.advance();
655 Ok(Token::LArrow)
656 }
657 _ => Ok(Token::Lt),
658 }
659 } else {
660 Ok(Token::Lt)
661 }
662 }
663
664 fn lex_gt_or_gte(&mut self) -> Result<Token, LexError> {
666 self.advance();
667 if !self.is_at_end() && self.current_char() == '=' {
668 self.advance();
669 Ok(Token::Gte)
670 } else {
671 Ok(Token::Gt)
672 }
673 }
674
675 fn lex_and(&mut self) -> Result<Token, LexError> {
677 let pos = self.current_position();
678 self.advance();
679 if !self.is_at_end() && self.current_char() == '&' {
680 self.advance();
681 Ok(Token::And)
682 } else {
683 Err(LexError::UnexpectedChar('&', pos))
684 }
685 }
686
687 fn lex_colon_or_coloncolon(&mut self) -> Result<Token, LexError> {
689 let _pos = self.current_position();
690 self.advance();
691 if !self.is_at_end() && self.current_char() == ':' {
692 self.advance();
693 Ok(Token::ColonColon)
694 } else {
695 Ok(Token::Colon)
696 }
697 }
698
699 fn lex_lbracket_or_lbracket_pipe(&mut self) -> Result<Token, LexError> {
701 self.advance();
702 if !self.is_at_end() && self.current_char() == '|' {
703 self.advance();
704 Ok(Token::LBracketPipe)
705 } else {
706 Ok(Token::LBracket)
707 }
708 }
709
710 fn lex_lbrace_or_lbrace_pipe(&mut self) -> Result<Token, LexError> {
712 self.advance();
713 if !self.is_at_end() && self.current_char() == '|' {
714 self.advance();
715 Ok(Token::LBracePipe)
716 } else {
717 Ok(Token::LBrace)
718 }
719 }
720
721 fn lex_directive(&mut self) -> Result<Token, LexError> {
723 let start_pos = self.current_position();
724 self.advance(); let directive_start = self.pos;
728 while !self.is_at_end()
729 && (self.current_char().is_alphanumeric() || self.current_char() == '_')
730 {
731 self.advance();
732 }
733
734 let directive: String = self.input[directive_start..self.pos].iter().collect();
735
736 while !self.is_at_end() && matches!(self.current_char(), ' ' | '\t') {
738 self.advance();
739 }
740
741 match directive.as_str() {
742 "load" => {
743 if self.is_at_end() || self.current_char() != '"' {
745 return Err(LexError::UnknownDirective(
746 format!("#load requires a string path"),
747 start_pos,
748 ));
749 }
750 let path_token = self.lex_string()?;
751 if let Token::String(path) = path_token {
752 Ok(Token::LoadDirective(path))
753 } else {
754 Err(LexError::UnknownDirective(
755 format!("#load requires a string path"),
756 start_pos,
757 ))
758 }
759 }
760 _ => Err(LexError::UnknownDirective(directive, start_pos)),
761 }
762 }
763
764 fn skip_whitespace_and_comments(&mut self) -> Result<(), LexError> {
766 loop {
767 if self.is_at_end() {
768 break;
769 }
770
771 match self.current_char() {
772 ' ' | '\t' | '\r' => {
773 self.advance();
774 }
775 '\n' => {
776 self.line += 1;
777 self.column = 0; self.advance();
779 }
780 '/' if !self.is_at_end_or(1) && self.peek_char() == '/' => {
781 self.skip_single_line_comment();
783 }
784 '(' if !self.is_at_end_or(1) && self.peek_char() == '*' => {
785 self.skip_multiline_comment()?;
787 }
788 _ => break,
789 }
790 }
791 Ok(())
792 }
793
794 fn skip_single_line_comment(&mut self) {
796 while !self.is_at_end() && self.current_char() != '\n' {
797 self.advance();
798 }
799 }
800
801 fn skip_multiline_comment(&mut self) -> Result<(), LexError> {
804 let start_pos = self.current_position();
805 let mut depth = 1;
806
807 self.advance(); self.advance(); while depth > 0 && !self.is_at_end() {
811 let ch = self.current_char();
812
813 if ch == '(' && !self.is_at_end_or(1) && self.peek_char() == '*' {
814 depth += 1;
816 self.advance();
817 self.advance();
818 } else if ch == '*' && !self.is_at_end_or(1) && self.peek_char() == ')' {
819 depth -= 1;
821 self.advance();
822 self.advance();
823 } else if ch == '\n' {
824 self.line += 1;
826 self.column = 0; self.advance();
828 } else {
829 self.advance();
830 }
831 }
832
833 if depth > 0 {
834 return Err(LexError::UnterminatedComment(start_pos));
835 }
836
837 Ok(())
838 }
839
840 fn current_char(&self) -> char {
842 if self.is_at_end() {
843 '\0'
844 } else {
845 self.input[self.pos]
846 }
847 }
848
849 fn peek_char(&self) -> char {
851 if self.pos + 1 >= self.input.len() {
852 '\0'
853 } else {
854 self.input[self.pos + 1]
855 }
856 }
857
858 fn advance(&mut self) {
860 if !self.is_at_end() {
861 self.pos += 1;
862 self.column += 1;
863 }
864 }
865
866 fn is_at_end(&self) -> bool {
868 self.pos >= self.input.len()
869 }
870
871 fn is_at_end_or(&self, offset: usize) -> bool {
873 self.pos + offset >= self.input.len()
874 }
875
876 fn current_position(&self) -> Position {
878 Position::new(self.line, self.column, self.pos)
879 }
880}
881
// Unit tests: literals, spans, comment handling (including nested `(* *)`
// comments), `++`, and the computation-expression bang tokens.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lex_integer() {
        let mut lexer = Lexer::new("42");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::Int(42));
        assert_eq!(tokens[1].token, Token::Eof);
    }

    #[test]
    fn test_tokenize_with_spans() {
        let mut lexer = Lexer::new("let x = 42");
        let tokens = lexer.tokenize_with_spans().unwrap();
        assert_eq!(tokens.len(), 5); // let, x, =, 42, EOF
        assert_eq!(tokens[0].token, Token::Let);
        assert!(tokens[0].span.is_single_line());
    }

    #[test]
    fn test_lex_anonymous_record_tokens() {
        let mut lexer = Lexer::new("{| x = 1 |}");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::LBracePipe);
        assert_eq!(tokens[4].token, Token::PipeRBrace);
    }

    #[test]
    fn test_lex_pipe_disambiguation() {
        let mut lexer = Lexer::new("|}");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2); // |}, EOF
        assert_eq!(tokens[0].token, Token::PipeRBrace);
    }

    #[test]
    fn test_simple_multiline_comment() {
        let mut lexer = Lexer::new("(* comment *) let x = 42");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 5); // let, x, =, 42, EOF
        assert_eq!(tokens[0].token, Token::Let);
    }

    #[test]
    fn test_multiline_comment_multiline() {
        let source = r#"(* This is a
    multi-line comment
    spanning three lines *)
let x = 10"#;
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
        assert_eq!(tokens[1].token, Token::Ident("x".to_string()));
    }

    #[test]
    fn test_nested_multiline_comments() {
        let source = "(* outer (* inner *) still outer *) let x = 5";
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
    }

    #[test]
    fn test_deeply_nested_comments() {
        let source = "(* level 1 (* level 2 (* level 3 *) level 2 *) level 1 *) let x = 1";
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
    }

    #[test]
    fn test_inline_multiline_comment() {
        let source = "let x = (* inline comment *) 42";
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
        assert_eq!(tokens[1].token, Token::Ident("x".to_string()));
        assert_eq!(tokens[2].token, Token::Eq);
        assert_eq!(tokens[3].token, Token::Int(42));
    }

    #[test]
    fn test_multiline_comment_with_special_chars() {
        let source = r#"(* Comment with special chars: !@#$%^&*()[]{}:;"'<>,.?/\|`~ *) let x = 1"#;
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
    }

    #[test]
    fn test_multiple_multiline_comments() {
        let source = "(* first *) let (* second *) x (* third *) = 42";
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
        assert_eq!(tokens[1].token, Token::Ident("x".to_string()));
        assert_eq!(tokens[2].token, Token::Eq);
        assert_eq!(tokens[3].token, Token::Int(42));
    }

    #[test]
    fn test_unterminated_multiline_comment() {
        let source = "(* This comment is not terminated let x = 42";
        let mut lexer = Lexer::new(source);
        let result = lexer.tokenize();
        assert!(result.is_err());
        match result.unwrap_err() {
            LexError::UnterminatedComment(_) => (),
            _ => panic!("Expected UnterminatedComment error"),
        }
    }

    #[test]
    fn test_unterminated_nested_comment() {
        // Inner comment closes, but the outer one never does.
        let source = "(* outer (* inner *) still outer";
        let mut lexer = Lexer::new(source);
        let result = lexer.tokenize();
        assert!(result.is_err());
        match result.unwrap_err() {
            LexError::UnterminatedComment(_) => (),
            _ => panic!("Expected UnterminatedComment error"),
        }
    }

    #[test]
    fn test_mixed_single_and_multiline_comments() {
        let source = r#"
// Single-line comment
let x = 42 (* inline multi-line *)
// Another single-line
(* Multi-line
   spanning lines *)
"#;
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
        assert_eq!(tokens[1].token, Token::Ident("x".to_string()));
        assert_eq!(tokens[2].token, Token::Eq);
        assert_eq!(tokens[3].token, Token::Int(42));
    }

    #[test]
    fn test_comment_does_not_affect_string() {
        // `(* ... *)` inside a string literal must not be treated as a comment.
        let source = r#"let s = "(* not a comment *)""#;
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
        assert_eq!(tokens[1].token, Token::Ident("s".to_string()));
        assert_eq!(tokens[2].token, Token::Eq);
        assert_eq!(
            tokens[3].token,
            Token::String("(* not a comment *)".to_string())
        );
    }

    #[test]
    fn test_empty_multiline_comment() {
        let source = "(**) let x = 1";
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
    }

    #[test]
    fn test_multiline_comment_with_asterisks() {
        let source = "(* ** *** **** *) let x = 1";
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
    }

    #[test]
    fn test_lex_plusplus() {
        let mut lexer = Lexer::new("\"hello\" ++ \"world\"");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 4); // "hello", ++, "world", EOF
        assert_eq!(tokens[0].token, Token::String("hello".to_string()));
        assert_eq!(tokens[1].token, Token::PlusPlus);
        assert_eq!(tokens[2].token, Token::String("world".to_string()));
        assert_eq!(tokens[3].token, Token::Eof);
    }

    #[test]
    fn test_lex_plusplus_vs_plus() {
        let mut lexer = Lexer::new("1 + 2 ++ 3");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Int(1));
        assert_eq!(tokens[1].token, Token::Plus);
        assert_eq!(tokens[2].token, Token::Int(2));
        assert_eq!(tokens[3].token, Token::PlusPlus);
        assert_eq!(tokens[4].token, Token::Int(3));
    }

    #[test]
    fn test_lex_plusplus_no_space() {
        let mut lexer = Lexer::new("a++b");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Ident("a".to_string()));
        assert_eq!(tokens[1].token, Token::PlusPlus);
        assert_eq!(tokens[2].token, Token::Ident("b".to_string()));
    }

    #[test]
    fn test_lex_async_keyword() {
        let mut lexer = Lexer::new("async");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::Async);
    }

    #[test]
    fn test_lex_return_keyword() {
        let mut lexer = Lexer::new("return");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::Return);
    }

    #[test]
    fn test_lex_yield_keyword() {
        let mut lexer = Lexer::new("yield");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::Yield);
    }

    #[test]
    fn test_lex_let_bang() {
        let mut lexer = Lexer::new("let!");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::LetBang);
    }

    #[test]
    fn test_lex_do_bang() {
        let mut lexer = Lexer::new("do!");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::DoBang);
    }

    #[test]
    fn test_lex_return_bang() {
        let mut lexer = Lexer::new("return!");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::ReturnBang);
    }

    #[test]
    fn test_lex_yield_bang() {
        let mut lexer = Lexer::new("yield!");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::YieldBang);
    }

    #[test]
    fn test_lex_let_vs_let_bang() {
        let mut lexer = Lexer::new("let x = let! y = 42");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Let);
        assert_eq!(tokens[1].token, Token::Ident("x".to_string()));
        assert_eq!(tokens[2].token, Token::Eq);
        assert_eq!(tokens[3].token, Token::LetBang);
        assert_eq!(tokens[4].token, Token::Ident("y".to_string()));
        assert_eq!(tokens[5].token, Token::Eq);
        assert_eq!(tokens[6].token, Token::Int(42));
    }

    #[test]
    fn test_lex_do_vs_do_bang() {
        let mut lexer = Lexer::new("do x do! y");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Do);
        assert_eq!(tokens[1].token, Token::Ident("x".to_string()));
        assert_eq!(tokens[2].token, Token::DoBang);
        assert_eq!(tokens[3].token, Token::Ident("y".to_string()));
    }

    #[test]
    fn test_lex_computation_expression() {
        // Checks relative ordering of the bang tokens in a realistic snippet.
        let source = "async { let! x = getAsync() do! printAsync x return! computeAsync x }";
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        let async_pos = tokens.iter().position(|t| t.token == Token::Async).unwrap();
        let letbang_pos = tokens
            .iter()
            .position(|t| t.token == Token::LetBang)
            .unwrap();
        let dobang_pos = tokens
            .iter()
            .position(|t| t.token == Token::DoBang)
            .unwrap();
        let returnbang_pos = tokens
            .iter()
            .position(|t| t.token == Token::ReturnBang)
            .unwrap();
        assert!(async_pos < letbang_pos);
        assert!(letbang_pos < dobang_pos);
        assert!(dobang_pos < returnbang_pos);
    }

    #[test]
    fn test_lex_bang_on_non_keyword() {
        // `!` after a non-keyword folds into the identifier.
        let mut lexer = Lexer::new("foo!");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::Ident("foo!".to_string()));
    }

    #[test]
    fn test_lex_yield_in_expression() {
        let mut lexer = Lexer::new("yield x + yield! y");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token, Token::Yield);
        assert_eq!(tokens[1].token, Token::Ident("x".to_string()));
        assert_eq!(tokens[2].token, Token::Plus);
        assert_eq!(tokens[3].token, Token::YieldBang);
        assert_eq!(tokens[4].token, Token::Ident("y".to_string()));
    }
}