1use cjc_diag::{Diagnostic, DiagnosticBag, Span};
21
/// The kind of a lexed token.
///
/// Variants carry no payload; the associated text lives in [`Token::text`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // ---- Literals ----
    /// Integer literal (decimal, or with a `0x` / `0b` / `0o` prefix).
    IntLit,
    /// Floating-point literal.
    FloatLit,
    /// String literal, `"..."`.
    StringLit,
    /// Byte string literal, `b"..."`.
    ByteStringLit,
    /// Byte char literal, `b'x'`.
    ByteCharLit,
    /// Raw string literal, `r"..."` or `r#"..."#`.
    RawStringLit,
    /// Raw byte string literal, `br"..."` or `br#"..."#`.
    RawByteStringLit,
    /// Format string literal, `f"..."`.
    FStringLit,
    /// Regex literal, `/pattern/flags`.
    RegexLit,
    /// The word `true`.
    True,
    /// The word `false`.
    False,

    // ---- Identifiers and reserved words ----
    /// Any identifier that is not a reserved word.
    Ident,
    /// `struct`
    Struct,
    /// `class`
    Class,
    /// `record`
    Record,
    /// `fn`
    Fn,
    /// `trait`
    Trait,
    /// `impl`
    Impl,
    /// `let`
    Let,
    /// `mut`
    Mut,
    /// `return`
    Return,
    /// `break`
    Break,
    /// `continue`
    Continue,
    /// `if`
    If,
    /// `else`
    Else,
    /// `while`
    While,
    /// `for`
    For,
    /// `in`
    In,
    /// `..`
    DotDot,
    /// `...`
    DotDotDot,
    /// `nogc`
    NoGc,
    /// `col`
    Col,
    /// `import`
    Import,
    /// `mod`
    Mod,
    /// `as`
    As,
    /// `sealed`
    Sealed,
    /// `match`
    Match,
    /// `enum`
    Enum,
    /// `const`
    Const,
    /// `pub`
    Pub,
    /// `null`
    Null,
    /// `NA`
    Na,
    /// A lone `_`.
    Underscore,

    // ---- Arithmetic operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `**`
    StarStar,

    // ---- Comparison operators ----
    /// `==`
    EqEq,
    /// `!=`
    BangEq,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,

    // ---- Logical operators ----
    /// `&&`
    AmpAmp,
    /// `||`
    PipePipe,
    /// `!`
    Bang,

    // ---- Assignment, pipe and `?` ----
    /// `=`
    Eq,
    /// `|`
    Pipe,
    /// `|>`
    PipeGt,
    /// `?`
    Question,

    // ---- Tilde operators ----
    /// `~`
    Tilde,
    /// `~=`
    TildeEq,
    /// `!~`
    BangTilde,

    // ---- Compound arithmetic assignment ----
    /// `+=`
    PlusEq,
    /// `-=`
    MinusEq,
    /// `*=`
    StarEq,
    /// `/=`
    SlashEq,
    /// `%=`
    PercentEq,
    /// `**=`
    StarStarEq,

    // ---- Bitwise operators ----
    /// `&`
    Amp,
    /// `^`
    Caret,
    /// `<<`
    LtLt,
    /// `>>`
    GtGt,

    // ---- Compound bitwise assignment ----
    /// `&=`
    AmpEq,
    /// `|=`
    PipeEq,
    /// `^=`
    CaretEq,
    /// `<<=`
    LtLtEq,
    /// `>>=`
    GtGtEq,

    // ---- Delimiters ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `[|`
    LBracketPipe,
    /// `|]`
    PipeRBracket,

    // ---- Punctuation ----
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `->`
    Arrow,
    /// `=>`
    FatArrow,
    /// `@`
    At,

    // ---- Special ----
    /// End of input; always the last token returned by `Lexer::tokenize`.
    Eof,
    /// A span the lexer could not turn into a valid token.
    Error,
}
263
264impl TokenKind {
265 pub fn is_keyword(&self) -> bool {
270 matches!(
271 self,
272 TokenKind::Struct
273 | TokenKind::Class
274 | TokenKind::Fn
275 | TokenKind::Trait
276 | TokenKind::Impl
277 | TokenKind::Let
278 | TokenKind::Mut
279 | TokenKind::Return
280 | TokenKind::Break
281 | TokenKind::Continue
282 | TokenKind::If
283 | TokenKind::Else
284 | TokenKind::While
285 | TokenKind::For
286 | TokenKind::In
287 | TokenKind::NoGc
288 | TokenKind::Col
289 | TokenKind::Import
290 | TokenKind::Mod
291 | TokenKind::As
292 | TokenKind::Sealed
293 | TokenKind::Match
294 | TokenKind::Enum
295 | TokenKind::Pub
296 | TokenKind::True
297 | TokenKind::False
298 | TokenKind::Na
299 )
300 }
301
302 pub fn describe(&self) -> &'static str {
306 match self {
307 TokenKind::IntLit => "integer literal",
308 TokenKind::FloatLit => "float literal",
309 TokenKind::StringLit => "string literal",
310 TokenKind::ByteStringLit => "byte string literal",
311 TokenKind::ByteCharLit => "byte char literal",
312 TokenKind::RawStringLit => "raw string literal",
313 TokenKind::RawByteStringLit => "raw byte string literal",
314 TokenKind::FStringLit => "format string literal",
315 TokenKind::RegexLit => "regex literal",
316 TokenKind::True => "`true`",
317 TokenKind::False => "`false`",
318 TokenKind::Ident => "identifier",
319 TokenKind::Struct => "`struct`",
320 TokenKind::Class => "`class`",
321 TokenKind::Record => "`record`",
322 TokenKind::Fn => "`fn`",
323 TokenKind::Trait => "`trait`",
324 TokenKind::Impl => "`impl`",
325 TokenKind::Let => "`let`",
326 TokenKind::Mut => "`mut`",
327 TokenKind::Return => "`return`",
328 TokenKind::Break => "`break`",
329 TokenKind::Continue => "`continue`",
330 TokenKind::If => "`if`",
331 TokenKind::Else => "`else`",
332 TokenKind::While => "`while`",
333 TokenKind::For => "`for`",
334 TokenKind::In => "`in`",
335 TokenKind::DotDot => "`..`",
336 TokenKind::DotDotDot => "`...`",
337 TokenKind::NoGc => "`nogc`",
338 TokenKind::Col => "`col`",
339 TokenKind::Import => "`import`",
340 TokenKind::Mod => "`mod`",
341 TokenKind::As => "`as`",
342 TokenKind::Sealed => "`sealed`",
343 TokenKind::Match => "`match`",
344 TokenKind::Enum => "`enum`",
345 TokenKind::Const => "`const`",
346 TokenKind::Pub => "`pub`",
347 TokenKind::Null => "`null`",
348 TokenKind::Na => "`NA`",
349 TokenKind::Underscore => "`_`",
350 TokenKind::Plus => "`+`",
351 TokenKind::Minus => "`-`",
352 TokenKind::Star => "`*`",
353 TokenKind::Slash => "`/`",
354 TokenKind::Percent => "`%`",
355 TokenKind::StarStar => "`**`",
356 TokenKind::EqEq => "`==`",
357 TokenKind::BangEq => "`!=`",
358 TokenKind::Lt => "`<`",
359 TokenKind::Gt => "`>`",
360 TokenKind::LtEq => "`<=`",
361 TokenKind::GtEq => "`>=`",
362 TokenKind::AmpAmp => "`&&`",
363 TokenKind::PipePipe => "`||`",
364 TokenKind::Bang => "`!`",
365 TokenKind::Eq => "`=`",
366 TokenKind::Pipe => "`|`",
367 TokenKind::PipeGt => "`|>`",
368 TokenKind::Question => "`?`",
369 TokenKind::Tilde => "`~`",
370 TokenKind::TildeEq => "`~=`",
371 TokenKind::BangTilde => "`!~`",
372 TokenKind::PlusEq => "`+=`",
374 TokenKind::MinusEq => "`-=`",
375 TokenKind::StarEq => "`*=`",
376 TokenKind::SlashEq => "`/=`",
377 TokenKind::PercentEq => "`%=`",
378 TokenKind::StarStarEq => "`**=`",
379 TokenKind::Amp => "`&`",
381 TokenKind::Caret => "`^`",
382 TokenKind::LtLt => "`<<`",
383 TokenKind::GtGt => "`>>`",
384 TokenKind::AmpEq => "`&=`",
385 TokenKind::PipeEq => "`|=`",
386 TokenKind::CaretEq => "`^=`",
387 TokenKind::LtLtEq => "`<<=`",
388 TokenKind::GtGtEq => "`>>=`",
389 TokenKind::LParen => "`(`",
390 TokenKind::RParen => "`)`",
391 TokenKind::LBrace => "`{`",
392 TokenKind::RBrace => "`}`",
393 TokenKind::LBracket => "`[`",
394 TokenKind::RBracket => "`]`",
395 TokenKind::LBracketPipe => "`[|`",
396 TokenKind::PipeRBracket => "`|]`",
397 TokenKind::Comma => "`,`",
398 TokenKind::Dot => "`.`",
399 TokenKind::Colon => "`:`",
400 TokenKind::Semicolon => "`;`",
401 TokenKind::Arrow => "`->`",
402 TokenKind::FatArrow => "`=>`",
403 TokenKind::At => "`@`",
404 TokenKind::Eof => "end of file",
405 TokenKind::Error => "error",
406 }
407 }
408}
409
/// A single lexed token: its kind, where it came from, and its text.
#[derive(Debug, Clone)]
pub struct Token {
    /// What kind of token this is.
    pub kind: TokenKind,
    /// Source location; the lexer builds it from the start and end byte
    /// offsets of the token.
    pub span: Span,
    /// Token text. For operators, identifiers and numbers this is the source
    /// slice; for string-like literals the lexer stores the processed
    /// contents (delimiters removed, escapes resolved where applicable).
    pub text: String,
}
425
426impl Token {
427 pub fn new(kind: TokenKind, span: Span, text: impl Into<String>) -> Self {
435 Self {
436 kind,
437 span,
438 text: text.into(),
439 }
440 }
441
442 pub fn int_value(&self) -> i64 {
452 let clean = self.text.replace('_', "");
453 if clean.starts_with("0x") || clean.starts_with("0X") {
454 i64::from_str_radix(&clean[2..], 16).unwrap_or(0)
455 } else if clean.starts_with("0b") || clean.starts_with("0B") {
456 i64::from_str_radix(&clean[2..], 2).unwrap_or(0)
457 } else if clean.starts_with("0o") || clean.starts_with("0O") {
458 i64::from_str_radix(&clean[2..], 8).unwrap_or(0)
459 } else {
460 clean.parse().unwrap_or(0)
461 }
462 }
463
464 pub fn float_value(&self) -> f64 {
469 self.text.replace('_', "").parse().unwrap_or(0.0)
470 }
471}
472
/// Byte-oriented lexer over a borrowed source string.
///
/// Construct with `Lexer::new` and consume with `Lexer::tokenize`.
pub struct Lexer<'a> {
    /// The full source text; used for slicing token text.
    source: &'a str,
    /// The same source viewed as bytes; all scanning is done on this.
    bytes: &'a [u8],
    /// Byte offset of the next unread byte.
    pos: usize,
    /// Kind of the most recently produced token (starts as `Eof`); consulted
    /// to decide whether a `/` may begin a regex literal.
    prev_kind: TokenKind,
    /// Diagnostics collected while lexing.
    pub diagnostics: DiagnosticBag,
}
500
501impl<'a> Lexer<'a> {
502 pub fn new(source: &'a str) -> Self {
508 Self {
509 source,
510 bytes: source.as_bytes(),
511 pos: 0,
512 prev_kind: TokenKind::Eof,
513 diagnostics: DiagnosticBag::new(),
514 }
515 }
516
517 pub fn tokenize(mut self) -> (Vec<Token>, DiagnosticBag) {
526 let mut tokens = Vec::new();
527 loop {
528 let tok = self.next_token();
529 let is_eof = tok.kind == TokenKind::Eof;
530 self.prev_kind = tok.kind;
531 tokens.push(tok);
532 if is_eof {
533 break;
534 }
535 }
536 (tokens, self.diagnostics)
537 }
538
539 fn prev_is_value(&self) -> bool {
542 matches!(
543 self.prev_kind,
544 TokenKind::IntLit
545 | TokenKind::FloatLit
546 | TokenKind::StringLit
547 | TokenKind::ByteStringLit
548 | TokenKind::ByteCharLit
549 | TokenKind::RawStringLit
550 | TokenKind::RawByteStringLit
551 | TokenKind::FStringLit
552 | TokenKind::RegexLit
553 | TokenKind::True
554 | TokenKind::False
555 | TokenKind::Ident
556 | TokenKind::RParen
557 | TokenKind::RBracket
558 | TokenKind::PipeRBracket
559 | TokenKind::RBrace
560 )
561 }
562
563 fn peek(&self) -> u8 {
564 if self.pos < self.bytes.len() {
565 self.bytes[self.pos]
566 } else {
567 0
568 }
569 }
570
571 fn peek_at(&self, offset: usize) -> u8 {
572 let idx = self.pos + offset;
573 if idx < self.bytes.len() {
574 self.bytes[idx]
575 } else {
576 0
577 }
578 }
579
580 fn advance(&mut self) -> u8 {
581 let ch = self.peek();
582 self.pos += 1;
583 ch
584 }
585
586 fn skip_whitespace_and_comments(&mut self) {
587 loop {
588 while self.pos < self.bytes.len() && self.peek().is_ascii_whitespace() {
590 self.advance();
591 }
592
593 if self.pos + 1 < self.bytes.len()
595 && self.bytes[self.pos] == b'/'
596 && self.bytes[self.pos + 1] == b'/'
597 {
598 while self.pos < self.bytes.len() && self.peek() != b'\n' {
599 self.advance();
600 }
601 continue;
602 }
603
604 if self.pos + 1 < self.bytes.len()
606 && self.bytes[self.pos] == b'/'
607 && self.bytes[self.pos + 1] == b'*'
608 {
609 self.advance(); self.advance(); let mut depth = 1;
612 while self.pos < self.bytes.len() && depth > 0 {
613 if self.peek() == b'/' && self.peek_at(1) == b'*' {
614 depth += 1;
615 self.advance();
616 self.advance();
617 } else if self.peek() == b'*' && self.peek_at(1) == b'/' {
618 depth -= 1;
619 self.advance();
620 self.advance();
621 } else {
622 self.advance();
623 }
624 }
625 continue;
626 }
627
628 break;
629 }
630 }
631
632 fn next_token(&mut self) -> Token {
633 self.skip_whitespace_and_comments();
634
635 let start = self.pos;
636
637 if self.pos >= self.bytes.len() {
638 return Token::new(TokenKind::Eof, Span::new(start, start), "");
639 }
640
641 let ch = self.advance();
642
643 match ch {
644 b'(' => Token::new(TokenKind::LParen, Span::new(start, self.pos), "("),
646 b')' => Token::new(TokenKind::RParen, Span::new(start, self.pos), ")"),
647 b'{' => Token::new(TokenKind::LBrace, Span::new(start, self.pos), "{"),
648 b'}' => Token::new(TokenKind::RBrace, Span::new(start, self.pos), "}"),
649 b'[' => {
650 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'|' {
651 self.pos += 1;
652 Token::new(TokenKind::LBracketPipe, Span::new(start, self.pos), "[|")
653 } else {
654 Token::new(TokenKind::LBracket, Span::new(start, self.pos), "[")
655 }
656 }
657 b']' => Token::new(TokenKind::RBracket, Span::new(start, self.pos), "]"),
658 b',' => Token::new(TokenKind::Comma, Span::new(start, self.pos), ","),
659 b'.' => {
660 if self.peek() == b'.' {
661 self.advance();
662 if self.peek() == b'.' {
663 self.advance();
664 Token::new(TokenKind::DotDotDot, Span::new(start, self.pos), "...")
665 } else {
666 Token::new(TokenKind::DotDot, Span::new(start, self.pos), "..")
667 }
668 } else {
669 Token::new(TokenKind::Dot, Span::new(start, self.pos), ".")
670 }
671 }
672 b':' => Token::new(TokenKind::Colon, Span::new(start, self.pos), ":"),
673 b';' => Token::new(TokenKind::Semicolon, Span::new(start, self.pos), ";"),
674 b'+' => {
675 if self.peek() == b'=' {
676 self.advance();
677 Token::new(TokenKind::PlusEq, Span::new(start, self.pos), "+=")
678 } else {
679 Token::new(TokenKind::Plus, Span::new(start, self.pos), "+")
680 }
681 }
682 b'*' => {
683 if self.peek() == b'*' {
684 self.advance();
685 if self.peek() == b'=' {
686 self.advance();
687 Token::new(TokenKind::StarStarEq, Span::new(start, self.pos), "**=")
688 } else {
689 Token::new(TokenKind::StarStar, Span::new(start, self.pos), "**")
690 }
691 } else if self.peek() == b'=' {
692 self.advance();
693 Token::new(TokenKind::StarEq, Span::new(start, self.pos), "*=")
694 } else {
695 Token::new(TokenKind::Star, Span::new(start, self.pos), "*")
696 }
697 }
698 b'%' => {
699 if self.peek() == b'=' {
700 self.advance();
701 Token::new(TokenKind::PercentEq, Span::new(start, self.pos), "%=")
702 } else {
703 Token::new(TokenKind::Percent, Span::new(start, self.pos), "%")
704 }
705 }
706 b'^' => {
707 if self.peek() == b'=' {
708 self.advance();
709 Token::new(TokenKind::CaretEq, Span::new(start, self.pos), "^=")
710 } else {
711 Token::new(TokenKind::Caret, Span::new(start, self.pos), "^")
712 }
713 }
714 b'/' => {
715 if !self.prev_is_value() && self.pos < self.bytes.len() && self.peek() != b'/' && self.peek() != b'*' && self.peek() != b' ' && self.peek() != b'\t' && self.peek() != b'\n' && self.peek() != b'\r' {
716 let save_pos = self.pos;
718 let save_diag_count = self.diagnostics.count();
719 let tok = self.lex_regex(start);
720 if tok.kind == TokenKind::Error {
721 self.pos = save_pos;
723 self.diagnostics.truncate(save_diag_count);
724 Token::new(TokenKind::Slash, Span::new(start, save_pos), "/")
725 } else {
726 tok
727 }
728 } else if self.peek() == b'=' {
729 self.advance();
730 Token::new(TokenKind::SlashEq, Span::new(start, self.pos), "/=")
731 } else {
732 Token::new(TokenKind::Slash, Span::new(start, self.pos), "/")
733 }
734 }
735 b'?' => Token::new(TokenKind::Question, Span::new(start, self.pos), "?"),
736 b'@' => Token::new(TokenKind::At, Span::new(start, self.pos), "@"),
737
738 b'-' => {
740 if self.peek() == b'>' {
741 self.advance();
742 Token::new(TokenKind::Arrow, Span::new(start, self.pos), "->")
743 } else if self.peek() == b'=' {
744 self.advance();
745 Token::new(TokenKind::MinusEq, Span::new(start, self.pos), "-=")
746 } else {
747 Token::new(TokenKind::Minus, Span::new(start, self.pos), "-")
748 }
749 }
750 b'=' => {
751 if self.peek() == b'=' {
752 self.advance();
753 Token::new(TokenKind::EqEq, Span::new(start, self.pos), "==")
754 } else if self.peek() == b'>' {
755 self.advance();
756 Token::new(TokenKind::FatArrow, Span::new(start, self.pos), "=>")
757 } else {
758 Token::new(TokenKind::Eq, Span::new(start, self.pos), "=")
759 }
760 }
761 b'!' => {
762 if self.peek() == b'=' {
763 self.advance();
764 Token::new(TokenKind::BangEq, Span::new(start, self.pos), "!=")
765 } else if self.peek() == b'~' {
766 self.advance();
767 Token::new(TokenKind::BangTilde, Span::new(start, self.pos), "!~")
768 } else {
769 Token::new(TokenKind::Bang, Span::new(start, self.pos), "!")
770 }
771 }
772 b'<' => {
773 if self.peek() == b'<' {
774 self.advance();
775 if self.peek() == b'=' {
776 self.advance();
777 Token::new(TokenKind::LtLtEq, Span::new(start, self.pos), "<<=")
778 } else {
779 Token::new(TokenKind::LtLt, Span::new(start, self.pos), "<<")
780 }
781 } else if self.peek() == b'=' {
782 self.advance();
783 Token::new(TokenKind::LtEq, Span::new(start, self.pos), "<=")
784 } else {
785 Token::new(TokenKind::Lt, Span::new(start, self.pos), "<")
786 }
787 }
788 b'>' => {
789 if self.peek() == b'>' {
790 self.advance();
791 if self.peek() == b'=' {
792 self.advance();
793 Token::new(TokenKind::GtGtEq, Span::new(start, self.pos), ">>=")
794 } else {
795 Token::new(TokenKind::GtGt, Span::new(start, self.pos), ">>")
796 }
797 } else if self.peek() == b'=' {
798 self.advance();
799 Token::new(TokenKind::GtEq, Span::new(start, self.pos), ">=")
800 } else {
801 Token::new(TokenKind::Gt, Span::new(start, self.pos), ">")
802 }
803 }
804 b'&' => {
805 if self.peek() == b'&' {
806 self.advance();
807 Token::new(TokenKind::AmpAmp, Span::new(start, self.pos), "&&")
808 } else if self.peek() == b'=' {
809 self.advance();
810 Token::new(TokenKind::AmpEq, Span::new(start, self.pos), "&=")
811 } else {
812 Token::new(TokenKind::Amp, Span::new(start, self.pos), "&")
813 }
814 }
815 b'|' => {
816 if self.peek() == b'|' {
817 self.advance();
818 Token::new(TokenKind::PipePipe, Span::new(start, self.pos), "||")
819 } else if self.peek() == b'>' {
820 self.advance();
821 Token::new(TokenKind::PipeGt, Span::new(start, self.pos), "|>")
822 } else if self.peek() == b']' {
823 self.advance();
824 Token::new(TokenKind::PipeRBracket, Span::new(start, self.pos), "|]")
825 } else if self.peek() == b'=' {
826 self.advance();
827 Token::new(TokenKind::PipeEq, Span::new(start, self.pos), "|=")
828 } else {
829 Token::new(TokenKind::Pipe, Span::new(start, self.pos), "|")
831 }
832 }
833
834 b'"' => self.lex_string(start),
836
837 b'b' if self.peek() == b'"' => {
839 self.advance(); self.lex_byte_string(start)
841 }
842 b'b' if self.peek() == b'\'' => {
843 self.advance(); self.lex_byte_char(start)
845 }
846 b'b' if self.peek() == b'r' && self.peek_at(1) == b'"' => {
847 self.advance(); self.advance(); self.lex_raw_string(start, true, 0)
850 }
851 b'b' if self.peek() == b'r' && self.peek_at(1) == b'#' => {
852 self.advance(); let mut hashes = 0;
855 while self.peek() == b'#' {
856 self.advance();
857 hashes += 1;
858 }
859 if self.peek() == b'"' {
860 self.advance(); self.lex_raw_string(start, true, hashes)
862 } else {
863 self.diagnostics.emit(Diagnostic::error(
864 "E0010",
865 "expected `\"` after `br` and `#` delimiters",
866 Span::new(start, self.pos),
867 ));
868 Token::new(TokenKind::Error, Span::new(start, self.pos), &self.source[start..self.pos])
869 }
870 }
871
872 b'r' if self.peek() == b'"' => {
874 self.advance(); self.lex_raw_string(start, false, 0)
876 }
877 b'r' if self.peek() == b'#' => {
878 let save_pos = self.pos;
879 let mut hashes = 0;
880 while self.peek() == b'#' {
881 self.advance();
882 hashes += 1;
883 }
884 if self.peek() == b'"' {
885 self.advance(); self.lex_raw_string(start, false, hashes)
887 } else {
888 self.pos = save_pos;
890 self.lex_ident(start)
891 }
892 }
893
894 b'f' if self.peek() == b'"' => {
896 self.advance(); self.lex_fstring(start)
898 }
899
900 b'~' => {
902 if self.peek() == b'=' {
903 self.advance();
904 Token::new(TokenKind::TildeEq, Span::new(start, self.pos), "~=")
905 } else {
906 Token::new(TokenKind::Tilde, Span::new(start, self.pos), "~")
907 }
908 }
909
910 b'0'..=b'9' => self.lex_number(start),
912
913 b'a'..=b'z' | b'A'..=b'Z' | b'_' => self.lex_ident(start),
915
916 _ => {
917 self.diagnostics.emit(Diagnostic::error(
918 "E0002",
919 format!("unexpected character `{}`", ch as char),
920 Span::new(start, self.pos),
921 ));
922 Token::new(
923 TokenKind::Error,
924 Span::new(start, self.pos),
925 &self.source[start..self.pos],
926 )
927 }
928 }
929 }
930
931 fn lex_string(&mut self, start: usize) -> Token {
932 let mut value = String::new();
933 loop {
934 if self.pos >= self.bytes.len() {
935 self.diagnostics.emit(
936 Diagnostic::error(
937 "E0003",
938 "unterminated string literal",
939 Span::new(start, self.pos),
940 )
941 .with_hint("add a closing `\"` to terminate the string"),
942 );
943 break;
944 }
945 let ch = self.advance();
946 match ch {
947 b'"' => break,
948 b'\\' => {
949 if self.pos >= self.bytes.len() {
950 self.diagnostics.emit(Diagnostic::error(
951 "E0003",
952 "unterminated escape sequence",
953 Span::new(self.pos - 1, self.pos),
954 ));
955 break;
956 }
957 let esc = self.advance();
958 match esc {
959 b'n' => value.push('\n'),
960 b't' => value.push('\t'),
961 b'r' => value.push('\r'),
962 b'\\' => value.push('\\'),
963 b'"' => value.push('"'),
964 b'0' => value.push('\0'),
965 _ => {
966 self.diagnostics.emit(
967 Diagnostic::error(
968 "E0004",
969 format!("unknown escape sequence `\\{}`", esc as char),
970 Span::new(self.pos - 2, self.pos),
971 )
972 .with_hint("valid escapes: \\n, \\t, \\r, \\\\, \\\", \\0"),
973 );
974 value.push(esc as char);
975 }
976 }
977 }
978 _ => value.push(ch as char),
979 }
980 }
981 Token::new(TokenKind::StringLit, Span::new(start, self.pos), value)
982 }
983
984 fn lex_fstring(&mut self, start: usize) -> Token {
988 let mut raw = String::new();
989 loop {
990 if self.pos >= self.bytes.len() {
991 self.diagnostics.emit(
992 Diagnostic::error(
993 "E0003",
994 "unterminated format string literal",
995 Span::new(start, self.pos),
996 )
997 .with_hint("add a closing `\"` to terminate the format string"),
998 );
999 break;
1000 }
1001 let ch = self.advance();
1002 match ch {
1003 b'"' => break,
1004 b'\\' => {
1005 if self.pos >= self.bytes.len() {
1006 break;
1007 }
1008 let esc = self.advance();
1009 raw.push('\\');
1010 raw.push(esc as char);
1011 }
1012 b'{' => {
1013 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'{' {
1015 self.advance();
1016 raw.push('{');
1017 } else {
1018 raw.push('{');
1021 let mut depth = 1usize;
1022 while self.pos < self.bytes.len() && depth > 0 {
1023 let inner = self.advance();
1024 raw.push(inner as char);
1025 if inner == b'{' {
1026 depth += 1;
1027 } else if inner == b'}' {
1028 depth -= 1;
1029 }
1030 }
1031 }
1032 }
1033 b'}' => {
1034 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'}' {
1036 self.advance();
1037 raw.push('}');
1038 } else {
1039 self.diagnostics.emit(Diagnostic::error(
1040 "E0450",
1041 "unexpected `}` in format string (use `}}` for a literal `}`)",
1042 Span::new(self.pos - 1, self.pos),
1043 ));
1044 raw.push('}');
1045 }
1046 }
1047 _ => raw.push(ch as char),
1048 }
1049 }
1050 Token::new(TokenKind::FStringLit, Span::new(start, self.pos), raw)
1051 }
1052
1053 fn lex_byte_string(&mut self, start: usize) -> Token {
1055 let mut value = Vec::new();
1056 loop {
1057 if self.pos >= self.bytes.len() {
1058 self.diagnostics.emit(
1059 Diagnostic::error(
1060 "E0003",
1061 "unterminated byte string literal",
1062 Span::new(start, self.pos),
1063 )
1064 .with_hint("add a closing `\"` to terminate the byte string"),
1065 );
1066 break;
1067 }
1068 let ch = self.advance();
1069 match ch {
1070 b'"' => break,
1071 b'\\' => {
1072 if self.pos >= self.bytes.len() {
1073 self.diagnostics.emit(Diagnostic::error(
1074 "E0003",
1075 "unterminated escape sequence in byte string",
1076 Span::new(self.pos - 1, self.pos),
1077 ));
1078 break;
1079 }
1080 let esc = self.advance();
1081 match esc {
1082 b'n' => value.push(b'\n'),
1083 b't' => value.push(b'\t'),
1084 b'r' => value.push(b'\r'),
1085 b'\\' => value.push(b'\\'),
1086 b'"' => value.push(b'"'),
1087 b'0' => value.push(0),
1088 b'x' => {
1089 if self.pos + 1 < self.bytes.len() {
1091 let hi = self.advance();
1092 let lo = self.advance();
1093 match (hex_digit(hi), hex_digit(lo)) {
1094 (Some(h), Some(l)) => value.push(h * 16 + l),
1095 _ => {
1096 self.diagnostics.emit(
1097 Diagnostic::error(
1098 "E0004",
1099 format!("invalid hex escape `\\x{}{}`", hi as char, lo as char),
1100 Span::new(self.pos - 4, self.pos),
1101 )
1102 .with_hint("hex escapes must be \\xNN where N is 0-9 or a-f"),
1103 );
1104 }
1105 }
1106 } else {
1107 self.diagnostics.emit(Diagnostic::error(
1108 "E0004",
1109 "incomplete hex escape in byte string",
1110 Span::new(self.pos - 2, self.pos),
1111 ));
1112 }
1113 }
1114 _ => {
1115 self.diagnostics.emit(
1116 Diagnostic::error(
1117 "E0004",
1118 format!("unknown escape sequence `\\{}`", esc as char),
1119 Span::new(self.pos - 2, self.pos),
1120 )
1121 .with_hint("valid escapes: \\n, \\t, \\r, \\\\, \\\", \\0, \\xNN"),
1122 );
1123 value.push(esc);
1124 }
1125 }
1126 }
1127 _ => value.push(ch),
1128 }
1129 }
1130 let text: String = value.iter().map(|&b| b as char).collect();
1132 Token::new(TokenKind::ByteStringLit, Span::new(start, self.pos), text)
1133 }
1134
1135 fn lex_byte_char(&mut self, start: usize) -> Token {
1137 if self.pos >= self.bytes.len() {
1138 self.diagnostics.emit(Diagnostic::error(
1139 "E0003",
1140 "unterminated byte char literal",
1141 Span::new(start, self.pos),
1142 ));
1143 return Token::new(TokenKind::Error, Span::new(start, self.pos), "");
1144 }
1145
1146 let byte_val = if self.peek() == b'\\' {
1147 self.advance(); if self.pos >= self.bytes.len() {
1149 self.diagnostics.emit(Diagnostic::error(
1150 "E0003",
1151 "unterminated escape in byte char literal",
1152 Span::new(start, self.pos),
1153 ));
1154 return Token::new(TokenKind::Error, Span::new(start, self.pos), "");
1155 }
1156 let esc = self.advance();
1157 match esc {
1158 b'n' => b'\n',
1159 b't' => b'\t',
1160 b'r' => b'\r',
1161 b'\\' => b'\\',
1162 b'\'' => b'\'',
1163 b'0' => 0,
1164 b'x' => {
1165 if self.pos + 1 < self.bytes.len() {
1166 let hi = self.advance();
1167 let lo = self.advance();
1168 match (hex_digit(hi), hex_digit(lo)) {
1169 (Some(h), Some(l)) => h * 16 + l,
1170 _ => {
1171 self.diagnostics.emit(Diagnostic::error(
1172 "E0004",
1173 format!("invalid hex escape in byte char `\\x{}{}`", hi as char, lo as char),
1174 Span::new(self.pos - 4, self.pos),
1175 ));
1176 0
1177 }
1178 }
1179 } else {
1180 self.diagnostics.emit(Diagnostic::error(
1181 "E0004",
1182 "incomplete hex escape in byte char",
1183 Span::new(self.pos - 2, self.pos),
1184 ));
1185 0
1186 }
1187 }
1188 _ => {
1189 self.diagnostics.emit(
1190 Diagnostic::error(
1191 "E0004",
1192 format!("unknown escape `\\{}` in byte char literal", esc as char),
1193 Span::new(self.pos - 2, self.pos),
1194 )
1195 .with_hint("valid escapes: \\n, \\t, \\r, \\\\, \\', \\0, \\xNN"),
1196 );
1197 esc
1198 }
1199 }
1200 } else {
1201 self.advance()
1202 };
1203
1204 if self.pos < self.bytes.len() && self.peek() == b'\'' {
1206 self.advance();
1207 } else {
1208 self.diagnostics.emit(Diagnostic::error(
1209 "E0003",
1210 "unterminated byte char literal, expected closing `'`",
1211 Span::new(start, self.pos),
1212 ));
1213 return Token::new(TokenKind::Error, Span::new(start, self.pos), "");
1214 }
1215
1216 let text = byte_val.to_string();
1218 Token::new(TokenKind::ByteCharLit, Span::new(start, self.pos), text)
1219 }
1220
1221 fn lex_raw_string(&mut self, start: usize, is_byte: bool, hashes: usize) -> Token {
1226 let mut value = String::new();
1227 loop {
1228 if self.pos >= self.bytes.len() {
1229 self.diagnostics.emit(
1230 Diagnostic::error(
1231 "E0003",
1232 "unterminated raw string literal",
1233 Span::new(start, self.pos),
1234 )
1235 .with_hint(if hashes > 0 {
1236 format!("add a closing `\"{}`", "#".repeat(hashes))
1237 } else {
1238 "add a closing `\"` to terminate the raw string".to_string()
1239 }),
1240 );
1241 break;
1242 }
1243 let ch = self.advance();
1244 if ch == b'"' {
1245 let mut found_hashes = 0;
1247 while found_hashes < hashes && self.pos < self.bytes.len() && self.peek() == b'#' {
1248 self.advance();
1249 found_hashes += 1;
1250 }
1251 if found_hashes == hashes {
1252 break; }
1254 value.push('"');
1256 for _ in 0..found_hashes {
1257 value.push('#');
1258 }
1259 } else {
1260 value.push(ch as char);
1261 }
1262 }
1263 let kind = if is_byte {
1264 TokenKind::RawByteStringLit
1265 } else {
1266 TokenKind::RawStringLit
1267 };
1268 Token::new(kind, Span::new(start, self.pos), value)
1269 }
1270
1271 fn lex_regex(&mut self, start: usize) -> Token {
1275 let mut pattern = String::new();
1276 loop {
1277 if self.pos >= self.bytes.len() {
1278 self.diagnostics.emit(
1279 Diagnostic::error(
1280 "E0011",
1281 "unterminated regex literal",
1282 Span::new(start, self.pos),
1283 )
1284 .with_hint("add a closing `/` to terminate the regex"),
1285 );
1286 let text = format!("/{}", pattern);
1287 return Token::new(TokenKind::Error, Span::new(start, self.pos), text);
1288 }
1289 let ch = self.advance();
1290 match ch {
1291 b'/' => break,
1292 b'\\' => {
1293 pattern.push('\\');
1295 if self.pos < self.bytes.len() {
1296 let esc = self.advance();
1297 pattern.push(esc as char);
1298 } else {
1299 self.diagnostics.emit(Diagnostic::error(
1300 "E0011",
1301 "unterminated escape in regex literal",
1302 Span::new(self.pos - 1, self.pos),
1303 ));
1304 let text = format!("/{}", pattern);
1305 return Token::new(TokenKind::Error, Span::new(start, self.pos), text);
1306 }
1307 }
1308 b'\n' => {
1309 self.diagnostics.emit(
1311 Diagnostic::error(
1312 "E0011",
1313 "newline in regex literal",
1314 Span::new(self.pos - 1, self.pos),
1315 )
1316 .with_hint("use the `x` flag for extended mode with whitespace"),
1317 );
1318 let text = format!("/{}", pattern);
1319 return Token::new(TokenKind::Error, Span::new(start, self.pos), text);
1320 }
1321 _ => pattern.push(ch as char),
1322 }
1323 }
1324 let mut flags = String::new();
1326 while self.pos < self.bytes.len() && matches!(self.peek(), b'i' | b'g' | b'm' | b's' | b'x') {
1327 flags.push(self.advance() as char);
1328 }
1329 let text = if flags.is_empty() {
1331 pattern
1332 } else {
1333 format!("{}\0{}", pattern, flags)
1334 };
1335 Token::new(TokenKind::RegexLit, Span::new(start, self.pos), text)
1336 }
1337
    /// Lexes a numeric literal whose first digit has already been consumed;
    /// `start` is the offset of that digit.
    ///
    /// Handles `0x` / `0b` / `0o` prefixed integers, decimal integers, an
    /// optional fractional part, an optional exponent and an optional
    /// `f32` / `f64` / `i32` / `i64` suffix. Underscores are accepted between
    /// digits. The token text is the raw source slice, suffix included.
    ///
    /// Returns an `Error` token (with `E0006`) when a radix prefix has nothing
    /// after it. An unrecognised suffix emits `E0005` but a literal token is
    /// still returned.
    fn lex_number(&mut self, start: usize) -> Token {
        let mut is_float = false;

        // Radix prefixes are only possible when the literal begins with `0`.
        if self.source[start..self.pos].starts_with('0') && self.pos < self.bytes.len() {
            match self.peek() {
                b'x' | b'X' => {
                    // Skip the `x`.
                    self.advance();
                    let digit_start = self.pos;
                    while self.pos < self.bytes.len()
                        && (self.peek().is_ascii_hexdigit() || self.peek() == b'_')
                    {
                        self.advance();
                    }
                    // Nothing at all followed the prefix.
                    if self.pos == digit_start {
                        self.diagnostics.emit(Diagnostic::error(
                            "E0006",
                            "expected hex digits after `0x`",
                            Span::new(start, self.pos),
                        ));
                        return Token::new(TokenKind::Error, Span::new(start, self.pos), &self.source[start..self.pos]);
                    }
                    let text = &self.source[start..self.pos];
                    return Token::new(TokenKind::IntLit, Span::new(start, self.pos), text);
                }
                // The guard keeps `0` followed by a `b"..."`, `b'..'` or
                // `br...` literal from being read as a binary prefix.
                b'b' | b'B' if self.peek_at(1) != b'"' && self.peek_at(1) != b'\'' && self.peek_at(1) != b'r' => {
                    // Skip the `b`.
                    self.advance();
                    let digit_start = self.pos;
                    while self.pos < self.bytes.len()
                        && (self.peek() == b'0' || self.peek() == b'1' || self.peek() == b'_')
                    {
                        self.advance();
                    }
                    if self.pos == digit_start {
                        self.diagnostics.emit(Diagnostic::error(
                            "E0006",
                            "expected binary digits after `0b`",
                            Span::new(start, self.pos),
                        ));
                        return Token::new(TokenKind::Error, Span::new(start, self.pos), &self.source[start..self.pos]);
                    }
                    let text = &self.source[start..self.pos];
                    return Token::new(TokenKind::IntLit, Span::new(start, self.pos), text);
                }
                b'o' | b'O' => {
                    // Skip the `o`.
                    self.advance();
                    let digit_start = self.pos;
                    while self.pos < self.bytes.len()
                        && ((self.peek() >= b'0' && self.peek() <= b'7') || self.peek() == b'_')
                    {
                        self.advance();
                    }
                    if self.pos == digit_start {
                        self.diagnostics.emit(Diagnostic::error(
                            "E0006",
                            "expected octal digits after `0o`",
                            Span::new(start, self.pos),
                        ));
                        return Token::new(TokenKind::Error, Span::new(start, self.pos), &self.source[start..self.pos]);
                    }
                    let text = &self.source[start..self.pos];
                    return Token::new(TokenKind::IntLit, Span::new(start, self.pos), text);
                }
                _ => {}
            }
        }

        // Remaining digits of the integer part.
        while self.pos < self.bytes.len()
            && (self.peek().is_ascii_digit() || self.peek() == b'_')
        {
            self.advance();
        }

        // Fractional part: only when the `.` is directly followed by a digit,
        // so `1..2` and `1.foo` leave the dot for the next token.
        if self.peek() == b'.' && self.peek_at(1).is_ascii_digit() {
            is_float = true;
            // Skip the `.`.
            self.advance();
            while self.pos < self.bytes.len()
                && (self.peek().is_ascii_digit() || self.peek() == b'_')
            {
                self.advance();
            }
        }

        // Exponent: `e` / `E`, optional sign, then digits. The digits are not
        // required here, so `1e` is still classified as a float.
        if self.peek() == b'e' || self.peek() == b'E' {
            is_float = true;
            self.advance();
            if self.peek() == b'+' || self.peek() == b'-' {
                self.advance();
            }
            while self.pos < self.bytes.len() && self.peek().is_ascii_digit() {
                self.advance();
            }
        }

        // Type suffix: `f` or `i` followed by digits.
        if self.peek() == b'f' || self.peek() == b'i' {
            let suffix_start = self.pos;
            self.advance();
            while self.pos < self.bytes.len() && self.peek().is_ascii_digit() {
                self.advance();
            }
            let suffix = &self.source[suffix_start..self.pos];
            match suffix {
                // A float suffix makes the literal a float even without `.`.
                "f32" | "f64" => is_float = true,
                "i32" | "i64" => {}
                _ => {
                    self.diagnostics.emit(Diagnostic::error(
                        "E0005",
                        format!("invalid numeric suffix `{}`", suffix),
                        Span::new(suffix_start, self.pos),
                    ));
                }
            }
        }

        let text = &self.source[start..self.pos];
        let kind = if is_float {
            TokenKind::FloatLit
        } else {
            TokenKind::IntLit
        };
        Token::new(kind, Span::new(start, self.pos), text)
    }
1466
1467 fn lex_ident(&mut self, start: usize) -> Token {
1468 while self.pos < self.bytes.len()
1469 && (self.peek().is_ascii_alphanumeric() || self.peek() == b'_')
1470 {
1471 self.advance();
1472 }
1473
1474 let text = &self.source[start..self.pos];
1475 let kind = match text {
1476 "struct" => TokenKind::Struct,
1477 "class" => TokenKind::Class,
1478 "record" => TokenKind::Record,
1479 "fn" => TokenKind::Fn,
1480 "trait" => TokenKind::Trait,
1481 "impl" => TokenKind::Impl,
1482 "let" => TokenKind::Let,
1483 "mut" => TokenKind::Mut,
1484 "return" => TokenKind::Return,
1485 "break" => TokenKind::Break,
1486 "continue" => TokenKind::Continue,
1487 "if" => TokenKind::If,
1488 "else" => TokenKind::Else,
1489 "while" => TokenKind::While,
1490 "for" => TokenKind::For,
1491 "in" => TokenKind::In,
1492 "nogc" => TokenKind::NoGc,
1493 "col" => TokenKind::Col,
1494 "import" => TokenKind::Import,
1495 "mod" => TokenKind::Mod,
1496 "as" => TokenKind::As,
1497 "sealed" => TokenKind::Sealed,
1498 "match" => TokenKind::Match,
1499 "enum" => TokenKind::Enum,
1500 "const" => TokenKind::Const,
1501 "pub" => TokenKind::Pub,
1502 "null" => TokenKind::Null,
1503 "NA" => TokenKind::Na,
1504 "true" => TokenKind::True,
1505 "false" => TokenKind::False,
1506 "_" => TokenKind::Underscore,
1507 _ => TokenKind::Ident,
1508 };
1509
1510 Token::new(kind, Span::new(start, self.pos), text)
1511 }
1512}
1513
/// Converts one ASCII hexadecimal digit to its numeric value.
///
/// Accepts `0`-`9`, `a`-`f` and `A`-`F` and returns the value `0..=15`.
/// Any other byte yields `None`.
fn hex_digit(ch: u8) -> Option<u8> {
    // Pick the byte that must be subtracted so the first character of each
    // accepted range lands on its numeric value (0 for digits, 10 for letters).
    let offset = if ch.is_ascii_digit() {
        b'0'
    } else if (b'a'..=b'f').contains(&ch) {
        b'a' - 10
    } else if (b'A'..=b'F').contains(&ch) {
        b'A' - 10
    } else {
        return None;
    };

    Some(ch - offset)
}
1523
#[cfg(test)]
mod tests {
    //! Unit tests for the lexer: keywords, identifiers, numeric and string
    //! literals (plain, byte, raw, raw-byte), operators, delimiters,
    //! comments, spans, and error reporting for unterminated literals.

    use super::*;

    /// Tokenizes `source` and returns the tokens, discarding diagnostics.
    fn lex(source: &str) -> Vec<Token> {
        let lexer = Lexer::new(source);
        let (tokens, _) = lexer.tokenize();
        tokens
    }

    /// Tokenizes `source` and returns only the kind of each token.
    fn kinds(source: &str) -> Vec<TokenKind> {
        lex(source).into_iter().map(|t| t.kind).collect()
    }

    // --- Basics -----------------------------------------------------------

    /// Empty input still produces a single `Eof` token.
    #[test]
    fn test_empty() {
        assert_eq!(kinds(""), vec![TokenKind::Eof]);
    }

    /// A sample of reserved words maps to the dedicated keyword kinds.
    #[test]
    fn test_keywords() {
        assert_eq!(
            kinds("struct class fn trait impl let mut return if else while nogc"),
            vec![
                TokenKind::Struct,
                TokenKind::Class,
                TokenKind::Fn,
                TokenKind::Trait,
                TokenKind::Impl,
                TokenKind::Let,
                TokenKind::Mut,
                TokenKind::Return,
                TokenKind::If,
                TokenKind::Else,
                TokenKind::While,
                TokenKind::NoGc,
                TokenKind::Eof,
            ]
        );
    }

    /// Identifiers may contain underscores and digits and keep their text.
    #[test]
    fn test_identifiers() {
        let tokens = lex("foo bar_baz _x T123");
        assert_eq!(tokens[0].kind, TokenKind::Ident);
        assert_eq!(tokens[0].text, "foo");
        assert_eq!(tokens[1].text, "bar_baz");
        assert_eq!(tokens[2].text, "_x");
        assert_eq!(tokens[3].text, "T123");
    }

    /// Integers, floats, digit separators, exponents and typed suffixes.
    #[test]
    fn test_numbers() {
        let tokens = lex("42 3.14 1_000 2.5e10 1f32 100i64");
        assert_eq!(tokens[0].kind, TokenKind::IntLit);
        assert_eq!(tokens[0].text, "42");
        assert_eq!(tokens[1].kind, TokenKind::FloatLit);
        assert_eq!(tokens[1].text, "3.14");
        assert_eq!(tokens[2].kind, TokenKind::IntLit);
        assert_eq!(tokens[2].text, "1_000");
        // An exponent makes the literal a float.
        assert_eq!(tokens[3].kind, TokenKind::FloatLit);
        // An `f32` suffix makes the literal a float even without a dot.
        assert_eq!(tokens[4].kind, TokenKind::FloatLit);
        // An `i64` suffix keeps the literal an integer.
        assert_eq!(tokens[5].kind, TokenKind::IntLit);
    }

    /// String literal text excludes the quotes and has escapes decoded.
    #[test]
    fn test_strings() {
        let tokens = lex(r#""hello" "world\n" "tab\there""#);
        assert_eq!(tokens[0].kind, TokenKind::StringLit);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world\n");
        assert_eq!(tokens[2].text, "tab\there");
    }

    /// Single- and multi-character operators are recognized.
    #[test]
    fn test_operators() {
        assert_eq!(
            kinds("+ - * / % == != < > <= >= && || ! = |> -> =>"),
            vec![
                TokenKind::Plus,
                TokenKind::Minus,
                TokenKind::Star,
                TokenKind::Slash,
                TokenKind::Percent,
                TokenKind::EqEq,
                TokenKind::BangEq,
                TokenKind::Lt,
                TokenKind::Gt,
                TokenKind::LtEq,
                TokenKind::GtEq,
                TokenKind::AmpAmp,
                TokenKind::PipePipe,
                TokenKind::Bang,
                TokenKind::Eq,
                TokenKind::PipeGt,
                TokenKind::Arrow,
                TokenKind::FatArrow,
                TokenKind::Eof,
            ]
        );
    }

    /// Brackets and punctuation are recognized.
    #[test]
    fn test_delimiters() {
        assert_eq!(
            kinds("( ) { } [ ] , . : ;"),
            vec![
                TokenKind::LParen,
                TokenKind::RParen,
                TokenKind::LBrace,
                TokenKind::RBrace,
                TokenKind::LBracket,
                TokenKind::RBracket,
                TokenKind::Comma,
                TokenKind::Dot,
                TokenKind::Colon,
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    // --- Comments ---------------------------------------------------------

    /// Line comments produce no tokens.
    #[test]
    fn test_comments() {
        let tokens = lex("foo // this is a comment\nbar");
        assert_eq!(tokens[0].text, "foo");
        assert_eq!(tokens[1].text, "bar");
    }

    /// Block comments produce no tokens.
    #[test]
    fn test_block_comment() {
        let tokens = lex("foo /* block */ bar");
        assert_eq!(tokens[0].text, "foo");
        assert_eq!(tokens[1].text, "bar");
    }

    /// Block comments nest: the inner `*/` does not end the outer comment.
    #[test]
    fn test_nested_block_comment() {
        let tokens = lex("foo /* outer /* inner */ still comment */ bar");
        assert_eq!(tokens[0].text, "foo");
        assert_eq!(tokens[1].text, "bar");
    }

    // --- Spans and diagnostics --------------------------------------------

    /// Spans are half-open byte ranges into the source.
    #[test]
    fn test_spans() {
        let tokens = lex("let x = 42;");
        assert_eq!(tokens[0].span, Span::new(0, 3)); // `let`
        assert_eq!(tokens[1].span, Span::new(4, 5)); // `x`
        assert_eq!(tokens[2].span, Span::new(6, 7)); // `=`
        assert_eq!(tokens[3].span, Span::new(8, 10)); // `42`
        assert_eq!(tokens[4].span, Span::new(10, 11)); // `;`
    }

    /// A string without a closing quote reports an error.
    #[test]
    fn test_unterminated_string() {
        let lexer = Lexer::new("\"hello");
        let (_, diags) = lexer.tokenize();
        assert!(diags.has_errors());
    }

    // --- Larger snippets --------------------------------------------------

    /// A generic function signature lexes into the expected kind sequence.
    #[test]
    fn test_function_signature() {
        assert_eq!(
            kinds("fn matmul<T: Float>(a: Tensor<T>, b: Tensor<T>) -> Tensor<T>"),
            vec![
                TokenKind::Fn,
                TokenKind::Ident,  // matmul
                TokenKind::Lt,     // <
                TokenKind::Ident,  // T
                TokenKind::Colon,  // :
                TokenKind::Ident,  // Float
                TokenKind::Gt,     // >
                TokenKind::LParen, // (
                TokenKind::Ident,  // a
                TokenKind::Colon,  // :
                TokenKind::Ident,  // Tensor
                TokenKind::Lt,     // <
                TokenKind::Ident,  // T
                TokenKind::Gt,     // >
                TokenKind::Comma,  // ,
                TokenKind::Ident,  // b
                TokenKind::Colon,  // :
                TokenKind::Ident,  // Tensor
                TokenKind::Lt,     // <
                TokenKind::Ident,  // T
                TokenKind::Gt,     // >
                TokenKind::RParen, // )
                TokenKind::Arrow,  // ->
                TokenKind::Ident,  // Tensor
                TokenKind::Lt,     // <
                TokenKind::Ident,  // T
                TokenKind::Gt,     // >
                TokenKind::Eof,
            ]
        );
    }

    /// `|>` lexes as a single pipe-forward token.
    #[test]
    fn test_pipe_operator() {
        assert_eq!(
            kinds("df |> filter(x) |> group_by(y)"),
            vec![
                TokenKind::Ident,  // df
                TokenKind::PipeGt, // |>
                TokenKind::Ident,  // filter
                TokenKind::LParen,
                TokenKind::Ident, // x
                TokenKind::RParen,
                TokenKind::PipeGt, // |>
                TokenKind::Ident,  // group_by
                TokenKind::LParen,
                TokenKind::Ident, // y
                TokenKind::RParen,
                TokenKind::Eof,
            ]
        );
    }

    /// A bare `|` lexes as `Pipe`, as used to delimit lambda parameters.
    #[test]
    fn test_pipe_token_for_lambda() {
        assert_eq!(
            kinds("|x: f64| x * 2.0"),
            vec![
                TokenKind::Pipe,
                TokenKind::Ident, // x
                TokenKind::Colon,
                TokenKind::Ident, // f64
                TokenKind::Pipe,
                TokenKind::Ident, // x
                TokenKind::Star,
                TokenKind::FloatLit, // 2.0
                TokenKind::Eof,
            ]
        );
    }

    /// A bare `|` is a valid token and must not raise a diagnostic.
    #[test]
    fn test_pipe_no_error() {
        let lexer = Lexer::new("|x| x");
        let (tokens, diags) = lexer.tokenize();
        assert!(!diags.has_errors());
        assert_eq!(tokens[0].kind, TokenKind::Pipe);
        assert_eq!(tokens[2].kind, TokenKind::Pipe);
    }

    // --- Byte string literals: b"..." -------------------------------------

    /// Byte string text excludes the `b` prefix and the quotes.
    #[test]
    fn test_byte_string_basic() {
        let tokens = lex(r#"b"hello""#);
        assert_eq!(tokens[0].kind, TokenKind::ByteStringLit);
        assert_eq!(tokens[0].text, "hello");
    }

    /// Simple escapes are decoded inside byte strings.
    #[test]
    fn test_byte_string_escapes() {
        let tokens = lex(r#"b"a\nb\t""#);
        assert_eq!(tokens[0].kind, TokenKind::ByteStringLit);
        assert_eq!(tokens[0].text, "a\nb\t");
    }

    /// `\xNN` escapes are decoded inside byte strings.
    #[test]
    fn test_byte_string_hex_escape() {
        let tokens = lex(r#"b"\xff\x00\x41""#);
        assert_eq!(tokens[0].kind, TokenKind::ByteStringLit);
        // NOTE(review): this relies on each decoded byte being stored in
        // `text` as the char with the same code point, so `c as u8` recovers
        // it -- confirm against the byte-string lexer.
        let bytes: Vec<u8> = tokens[0].text.chars().map(|c| c as u8).collect();
        assert_eq!(bytes, vec![0xff, 0x00, 0x41]);
    }

    /// The span covers the whole literal, including the prefix and quotes.
    #[test]
    fn test_byte_string_span() {
        let tokens = lex(r#"b"abc""#);
        assert_eq!(tokens[0].span, Span::new(0, 6)); // b"abc" is 6 bytes
    }

    /// A byte string without a closing quote reports an error.
    #[test]
    fn test_byte_string_unterminated() {
        let lexer = Lexer::new(r#"b"hello"#);
        let (_, diags) = lexer.tokenize();
        assert!(diags.has_errors());
    }

    // --- Byte char literals: b'x' -----------------------------------------

    /// Byte char text is the decimal value of the byte.
    #[test]
    fn test_byte_char_basic() {
        let tokens = lex("b'A'");
        assert_eq!(tokens[0].kind, TokenKind::ByteCharLit);
        assert_eq!(tokens[0].text, "65"); // b'A'
    }

    /// `\n` decodes to byte value 10.
    #[test]
    fn test_byte_char_newline() {
        let tokens = lex(r"b'\n'");
        assert_eq!(tokens[0].kind, TokenKind::ByteCharLit);
        assert_eq!(tokens[0].text, "10");
    }

    /// `\0` decodes to byte value 0.
    #[test]
    fn test_byte_char_null() {
        let tokens = lex(r"b'\0'");
        assert_eq!(tokens[0].kind, TokenKind::ByteCharLit);
        assert_eq!(tokens[0].text, "0");
    }

    /// `\xff` decodes to byte value 255.
    #[test]
    fn test_byte_char_hex() {
        let tokens = lex(r"b'\xff'");
        assert_eq!(tokens[0].kind, TokenKind::ByteCharLit);
        assert_eq!(tokens[0].text, "255");
    }

    /// An escaped backslash decodes to byte value 92.
    #[test]
    fn test_byte_char_backslash() {
        let tokens = lex(r"b'\\'");
        assert_eq!(tokens[0].kind, TokenKind::ByteCharLit);
        assert_eq!(tokens[0].text, "92");
    }

    /// A byte char without a closing quote reports an error.
    #[test]
    fn test_byte_char_unterminated() {
        let lexer = Lexer::new("b'A");
        let (_, diags) = lexer.tokenize();
        assert!(diags.has_errors());
    }

    // --- Raw string literals: r"..." and r#"..."# -------------------------

    /// Raw strings keep backslash sequences verbatim.
    #[test]
    fn test_raw_string_basic() {
        let tokens = lex(r#"r"hello\nworld""#);
        assert_eq!(tokens[0].kind, TokenKind::RawStringLit);
        assert_eq!(tokens[0].text, r"hello\nworld");
    }

    /// Hash-delimited raw strings may contain unescaped double quotes.
    #[test]
    fn test_raw_string_with_hashes() {
        let source = r###"r#"contains "quotes""#"###;
        let tokens = lex(source);
        assert_eq!(tokens[0].kind, TokenKind::RawStringLit);
        assert_eq!(tokens[0].text, r#"contains "quotes""#);
    }

    /// Regex-style content survives untouched in a raw string.
    #[test]
    fn test_raw_string_regex() {
        let source = r#"r"(\d+)\s+(\w+)""#;
        let tokens = lex(source);
        assert_eq!(tokens[0].kind, TokenKind::RawStringLit);
        assert_eq!(tokens[0].text, r"(\d+)\s+(\w+)");
    }

    /// A raw string without a closing quote reports an error.
    #[test]
    fn test_raw_string_unterminated() {
        let lexer = Lexer::new(r#"r"hello"#);
        let (_, diags) = lexer.tokenize();
        assert!(diags.has_errors());
    }

    // --- Raw byte string literals: br"..." and br#"..."# ------------------

    /// Raw byte strings keep backslash sequences verbatim.
    #[test]
    fn test_raw_byte_string_basic() {
        let source = r#"br"hello\nworld""#;
        let tokens = lex(source);
        assert_eq!(tokens[0].kind, TokenKind::RawByteStringLit);
        assert_eq!(tokens[0].text, r"hello\nworld");
    }

    /// Hash-delimited raw byte strings may contain unescaped double quotes.
    #[test]
    fn test_raw_byte_string_with_hashes() {
        let source = r###"br#"raw "bytes""#"###;
        let tokens = lex(source);
        assert_eq!(tokens[0].kind, TokenKind::RawByteStringLit);
        assert_eq!(tokens[0].text, r#"raw "bytes""#);
    }

    // --- Literal prefixes used as plain identifiers -----------------------

    /// `b` not followed by a quote is an ordinary identifier.
    #[test]
    fn test_b_as_identifier() {
        let tokens = lex("b + 1");
        assert_eq!(tokens[0].kind, TokenKind::Ident);
        assert_eq!(tokens[0].text, "b");
    }

    /// `br` not followed by a quote is an ordinary identifier.
    #[test]
    fn test_br_as_identifier() {
        let tokens = lex("br + 1");
        assert_eq!(tokens[0].kind, TokenKind::Ident);
        assert_eq!(tokens[0].text, "br");
    }

    /// `r` not followed by a quote is an ordinary identifier.
    #[test]
    fn test_r_as_identifier() {
        let tokens = lex("r + 1");
        assert_eq!(tokens[0].kind, TokenKind::Ident);
        assert_eq!(tokens[0].text, "r");
    }

    // --- Mixed input ------------------------------------------------------

    /// Lexing continues normally after a byte string literal.
    #[test]
    fn test_byte_string_then_ident() {
        let tokens = lex(r#"b"data" foo"#);
        assert_eq!(tokens[0].kind, TokenKind::ByteStringLit);
        assert_eq!(tokens[1].kind, TokenKind::Ident);
        assert_eq!(tokens[1].text, "foo");
    }

    /// Different literal flavours in one input each get their own kind.
    #[test]
    fn test_multiple_literal_kinds() {
        let source = r#"b"bytes" r"raw" "normal" 42"#;
        let tokens = lex(source);
        assert_eq!(tokens[0].kind, TokenKind::ByteStringLit);
        assert_eq!(tokens[1].kind, TokenKind::RawStringLit);
        assert_eq!(tokens[2].kind, TokenKind::StringLit);
        assert_eq!(tokens[3].kind, TokenKind::IntLit);
    }
}