1use std::collections::VecDeque;
4
5use bock_errors::{DiagnosticBag, DiagnosticCode, Span};
6use bock_source::SourceFile;
7
8use crate::token::{keyword_lookup, Token, TokenKind};
9
// Diagnostic codes emitted by the lexer (E1001-E1006).

// A character that starts no token (e.g. `§`).
const E_UNEXPECTED_CHAR: DiagnosticCode = DiagnosticCode {
    prefix: 'E',
    number: 1001,
};
// String literal not closed before EOF / newline / dangling backslash.
const E_UNTERMINATED_STRING: DiagnosticCode = DiagnosticCode {
    prefix: 'E',
    number: 1002,
};
// Unknown `\x` escape or malformed `\u{...}` escape.
const E_INVALID_ESCAPE: DiagnosticCode = DiagnosticCode {
    prefix: 'E',
    number: 1003,
};
// Empty, unterminated, or unclosed character literal.
const E_INVALID_CHAR_LITERAL: DiagnosticCode = DiagnosticCode {
    prefix: 'E',
    number: 1004,
};
// Missing or out-of-range digit in a radix-prefixed integer literal.
const E_INVALID_DIGIT: DiagnosticCode = DiagnosticCode {
    prefix: 'E',
    number: 1005,
};
// `/* ... ` block comment still open at EOF.
const E_UNTERMINATED_BLOCK_COMMENT: DiagnosticCode = DiagnosticCode {
    prefix: 'E',
    number: 1006,
};
40
/// Saved state for a string literal interrupted by a `${...}` interpolation;
/// popped and used to resume lexing the rest of the string when the
/// matching `}` is seen.
struct StringResumeCtx {
    // byte offset where the whole string literal began (anchors spans)
    string_start: usize,
    // `r"..."` string: escapes and interpolation are disabled
    is_raw: bool,
    // `"""..."""` string: may span newlines, closed only by `"""`
    is_multiline: bool,
}
48
/// Hand-written lexer producing a flat `Token` stream from one source file.
///
/// Errors never abort lexing: they are recorded in the diagnostic bag and
/// scanning continues to EOF (emitting `Error` tokens where needed).
pub struct Lexer<'src> {
    // file being lexed; all spans refer to `source.id`
    source: &'src SourceFile,
    // current byte offset into `source.content` (always a char boundary)
    pos: usize,
    // lexical errors accumulated while scanning
    diagnostics: DiagnosticBag,
    // tokens produced ahead of demand (doc comments, interpolation pieces),
    // drained before any new scanning happens
    pending: VecDeque<Token>,
    // one counter per open `${` interpolation: nested `{`/`}` depth, so we
    // can tell which `}` terminates the interpolation
    interp_brace_depth: Vec<u32>,
    // saved string state per open interpolation; mirrors interp_brace_depth
    string_resume: Vec<StringResumeCtx>,
}
65
66impl<'src> Lexer<'src> {
67 #[must_use]
69 pub fn new(source: &'src SourceFile) -> Self {
70 Self {
71 source,
72 pos: 0,
73 diagnostics: DiagnosticBag::new(),
74 pending: VecDeque::new(),
75 interp_brace_depth: Vec::new(),
76 string_resume: Vec::new(),
77 }
78 }
79
80 #[must_use]
82 pub fn tokenize(&mut self) -> Vec<Token> {
83 let mut tokens = Vec::new();
84 loop {
85 let tok = self.next_token();
86 let is_eof = tok.kind == TokenKind::Eof;
87 tokens.push(tok);
88 if is_eof {
89 break;
90 }
91 }
92 tokens
93 }
94
    /// Diagnostics collected so far (inspect after `tokenize`).
    #[must_use]
    pub fn diagnostics(&self) -> &DiagnosticBag {
        &self.diagnostics
    }
100
101 fn peek(&self) -> Option<char> {
105 self.source.content[self.pos..].chars().next()
106 }
107
108 fn peek_next(&self) -> Option<char> {
110 let mut chars = self.source.content[self.pos..].chars();
111 chars.next(); chars.next()
113 }
114
115 fn advance(&mut self) -> Option<char> {
117 let ch = self.source.content[self.pos..].chars().next()?;
118 self.pos += ch.len_utf8();
119 Some(ch)
120 }
121
122 fn skip_whitespace(&mut self) {
124 while let Some(ch) = self.peek() {
125 if ch == '\n' || !ch.is_whitespace() {
126 break;
127 }
128 self.advance();
129 }
130 }
131
132 fn span_from(&self, start: usize) -> Span {
134 Span {
135 file: self.source.id,
136 start,
137 end: self.pos,
138 }
139 }
140
141 fn make_token(&self, kind: TokenKind, start: usize) -> Token {
143 Token::new(kind, self.span_from(start), None)
144 }
145
    /// Scan and return the next token.
    ///
    /// Dispatch order is behavior-critical: pending queue first, then
    /// whitespace, newlines, comments, strings, chars, numbers,
    /// identifiers/keywords, line continuations, and finally
    /// operator/punctuation as the fallback.
    fn next_token(&mut self) -> Token {
        // tokens queued by comments / string interpolation take priority
        if let Some(tok) = self.pending.pop_front() {
            return tok;
        }

        self.skip_whitespace();

        let start = self.pos;

        let ch = match self.peek() {
            None => return self.make_token(TokenKind::Eof, start),
            Some(c) => c,
        };

        // newlines are significant tokens
        if ch == '\n' {
            self.advance();
            return self.make_token(TokenKind::Newline, start);
        }

        // CR or CRLF collapses into a single Newline token
        // (reachable only if skip_whitespace leaves '\r' alone)
        if ch == '\r' {
            self.advance();
            if self.peek() == Some('\n') {
                self.advance();
            }
            return self.make_token(TokenKind::Newline, start);
        }

        // `//...` or `/*...*/`; doc comments are queued into `pending`
        if ch == '/' && (self.peek_next() == Some('/') || self.peek_next() == Some('*')) {
            self.lex_comment();
            return self.next_token();
        }

        if ch == '"' {
            return self.lex_string();
        }
        // raw string `r"..."`; a lone `r` followed by anything else
        // falls through to the identifier path below
        if ch == 'r' && self.peek_next() == Some('"') {
            return self.lex_string();
        }

        if ch == '\'' {
            return self.lex_char();
        }

        if ch.is_ascii_digit() {
            return self.lex_number();
        }
        if ch.is_alphabetic() || ch == '_' {
            return self.lex_ident_or_keyword();
        }

        // backslash-newline is an explicit line continuation: emits nothing
        if ch == '\\' {
            if self.peek_next() == Some('\n') {
                self.advance();
                self.advance();
                return self.next_token();
            }
            if self.peek_next() == Some('\r') {
                self.advance();
                self.advance();
                if self.peek() == Some('\n') {
                    self.advance();
                }
                return self.next_token();
            }
        }

        self.lex_operator()
    }
235
236 fn lex_string(&mut self) -> Token {
240 let start = self.pos;
241
242 let is_raw = self.peek() == Some('r');
243 if is_raw {
244 self.advance(); }
246
247 let is_multiline = self.source.content[self.pos..].starts_with("\"\"\"");
249 if is_multiline {
250 self.pos += 3; } else {
252 self.advance(); }
254
255 self.process_string_body(start, is_raw, is_multiline, false)
256 }
257
    /// Lex the body of a string literal, starting just after the opening
    /// delimiter (or just after `}` when resuming from an interpolation).
    ///
    /// Returns a token whose kind comes from `closing_kind`. On a `${`
    /// interpolation it returns a `StringLiteralPart`, queues an
    /// `InterpolationStart` in `pending`, and pushes resume state so the
    /// `}` handler in `lex_operator` can continue the string later.
    fn process_string_body(
        &mut self,
        string_start: usize,
        is_raw: bool,
        is_multiline: bool,
        is_continuation: bool,
    ) -> Token {
        // start of this segment (differs from string_start after `${...}`)
        let segment_start = self.pos;
        let mut content = String::new();

        loop {
            match self.peek() {
                // EOF before the closing quote: report, return what we have
                None => {
                    let span = self.span_from(string_start);
                    self.diagnostics.error(
                        E_UNTERMINATED_STRING,
                        "unterminated string literal",
                        span,
                    );
                    let kind = closing_kind(is_raw, is_multiline, is_continuation);
                    return Token::new(kind, span, Some(content));
                }

                Some('"') => {
                    if is_multiline {
                        // multiline strings close only on `"""`;
                        // a single `"` is ordinary content
                        if self.source.content[self.pos..].starts_with("\"\"\"") {
                            self.pos += 3;
                            let span = self.span_from(string_start);
                            // cooked multiline strings get their common
                            // leading indentation stripped
                            let processed = if is_multiline && !is_raw {
                                strip_common_indent(&content)
                            } else {
                                content
                            };
                            let kind = closing_kind(is_raw, is_multiline, is_continuation);
                            return Token::new(kind, span, Some(processed));
                        } else {
                            content.push('"');
                            self.advance();
                        }
                    } else {
                        // single-line string: `"` closes it
                        self.advance();
                        let span = self.span_from(string_start);
                        let kind = closing_kind(is_raw, is_multiline, is_continuation);
                        return Token::new(kind, span, Some(content));
                    }
                }

                // a bare newline ends (and invalidates) a single-line string
                Some('\n') if !is_multiline => {
                    let span = self.span_from(string_start);
                    self.diagnostics.error(
                        E_UNTERMINATED_STRING,
                        "unterminated string literal (newline)",
                        span,
                    );
                    let kind = closing_kind(is_raw, is_multiline, is_continuation);
                    return Token::new(kind, span, Some(content));
                }

                // escape sequences (disabled in raw strings)
                Some('\\') if !is_raw => {
                    self.advance();
                    match self.advance() {
                        Some('n') => content.push('\n'),
                        Some('t') => content.push('\t'),
                        Some('r') => content.push('\r'),
                        Some('\\') => content.push('\\'),
                        Some('"') => content.push('"'),
                        Some('\'') => content.push('\''),
                        Some('0') => content.push('\0'),
                        Some('$') => content.push('$'),
                        Some('u') => {
                            self.lex_unicode_escape(&mut content, string_start);
                        }
                        // unknown escape: diagnose but keep the raw char
                        Some(other) => {
                            let span = self.span_from(string_start);
                            self.diagnostics.error(
                                E_INVALID_ESCAPE,
                                format!("unknown escape sequence: \\{other}"),
                                span,
                            );
                            content.push(other);
                        }
                        None => {
                            let span = self.span_from(string_start);
                            self.diagnostics.error(
                                E_UNTERMINATED_STRING,
                                "unterminated string literal after backslash",
                                span,
                            );
                            let kind = closing_kind(is_raw, is_multiline, is_continuation);
                            return Token::new(kind, span, Some(content));
                        }
                    }
                }

                // `${expr}` interpolation (cooked strings only)
                Some('$') if !is_raw => {
                    if self.source.content[self.pos..].starts_with("${") {
                        // emit the text seen so far as a part token ...
                        let part_span = Span {
                            file: self.source.id,
                            start: segment_start,
                            end: self.pos,
                        };
                        let part_tok =
                            Token::new(TokenKind::StringLiteralPart, part_span, Some(content));

                        // ... then queue the `${` marker itself
                        let interp_start = self.pos;
                        self.pos += 2;
                        let interp_span = Span {
                            file: self.source.id,
                            start: interp_start,
                            end: self.pos,
                        };
                        let interp_tok =
                            Token::new(TokenKind::InterpolationStart, interp_span, None);

                        self.pending.push_back(interp_tok);
                        // lex_operator's '}' handling pops both stacks to
                        // resume this string after the interpolation closes
                        self.interp_brace_depth.push(0);
                        self.string_resume.push(StringResumeCtx {
                            string_start,
                            is_raw,
                            is_multiline,
                        });

                        return part_tok;
                    } else if self.source.content[self.pos..].starts_with("$$") {
                        // `$$` is an escaped literal dollar sign
                        content.push('$');
                        self.pos += 2;
                    } else {
                        // lone `$` is plain content
                        content.push('$');
                        self.advance();
                    }
                }

                // any other character is literal content
                Some(ch) => {
                    content.push(ch);
                    self.advance();
                }
            }
        }
    }
414
    /// Continue lexing a string after its `${...}` interpolation closed.
    ///
    /// The continuation token is pushed to the *front* of `pending` so it is
    /// emitted immediately after the `InterpolationEnd` token the caller
    /// (`lex_operator`) is about to return, ahead of anything queued earlier.
    fn resume_string_lex(&mut self, ctx: StringResumeCtx) {
        let tok = self.process_string_body(ctx.string_start, ctx.is_raw, ctx.is_multiline, true);
        self.pending.push_front(tok);
    }
423
    /// Lex the `{XXXX}` part of a `\u{...}` escape (the `\u` is already
    /// consumed) and append the decoded character to `out`.
    ///
    /// On malformed input a diagnostic anchored at `string_start` is
    /// reported and nothing is pushed.
    fn lex_unicode_escape(&mut self, out: &mut String, string_start: usize) {
        if self.peek() != Some('{') {
            let span = self.span_from(string_start);
            self.diagnostics.error(
                E_INVALID_ESCAPE,
                "expected '{' after \\u in Unicode escape",
                span,
            );
            return;
        }
        self.advance();
        // collect the hex digits between the braces
        let hex_start = self.pos;
        while self.peek().map(|c| c.is_ascii_hexdigit()).unwrap_or(false) {
            self.advance();
        }
        let hex_str = &self.source.content[hex_start..self.pos];

        if self.peek() != Some('}') {
            let span = self.span_from(string_start);
            self.diagnostics.error(
                E_INVALID_ESCAPE,
                "expected '}' to close Unicode escape \\u{...}",
                span,
            );
            return;
        }
        self.advance();
        // empty braces, overflow, or a surrogate/out-of-range value all
        // fail one of these two steps and produce the diagnostic below
        match u32::from_str_radix(hex_str, 16)
            .ok()
            .and_then(char::from_u32)
        {
            Some(c) => out.push(c),
            None => {
                let span = self.span_from(string_start);
                self.diagnostics.error(
                    E_INVALID_ESCAPE,
                    format!("invalid Unicode codepoint: \\u{{{hex_str}}}"),
                    span,
                );
            }
        }
    }
469
    /// Lex a character literal such as `'a'`, `'\n'`, or `'\u{1F600}'`.
    ///
    /// The caller guarantees the current char is `'`. Empty, unterminated,
    /// and unclosed literals produce a diagnostic and an `Error` token.
    fn lex_char(&mut self) -> Token {
        let start = self.pos;
        self.advance();
        let ch = match self.peek() {
            // EOF right after the opening quote
            None => {
                let span = self.span_from(start);
                self.diagnostics.error(
                    E_INVALID_CHAR_LITERAL,
                    "unterminated character literal",
                    span,
                );
                return Token::new(TokenKind::Error, span, None);
            }
            // `''` — nothing between the quotes
            Some('\'') => {
                self.advance();
                let span = self.span_from(start);
                self.diagnostics
                    .error(E_INVALID_CHAR_LITERAL, "empty character literal", span);
                return Token::new(TokenKind::Error, span, None);
            }
            // escape sequence; mirrors the string-escape table
            Some('\\') => {
                self.advance();
                match self.advance() {
                    Some('n') => '\n',
                    Some('t') => '\t',
                    Some('r') => '\r',
                    Some('\\') => '\\',
                    Some('\'') => '\'',
                    Some('"') => '"',
                    Some('0') => '\0',
                    Some('u') => {
                        // decode into a scratch buffer; falls back to '\0'
                        // when the escape was invalid (already diagnosed)
                        let mut buf = String::new();
                        self.lex_unicode_escape(&mut buf, start);
                        buf.chars().next().unwrap_or('\0')
                    }
                    // unknown escape: diagnose, keep the raw char
                    Some(other) => {
                        let span = self.span_from(start);
                        self.diagnostics.error(
                            E_INVALID_ESCAPE,
                            format!("unknown escape sequence: \\{other}"),
                            span,
                        );
                        other
                    }
                    None => {
                        let span = self.span_from(start);
                        self.diagnostics.error(
                            E_INVALID_CHAR_LITERAL,
                            "unterminated character literal",
                            span,
                        );
                        return Token::new(TokenKind::Error, span, None);
                    }
                }
            }
            // ordinary single character
            Some(c) => {
                self.advance();
                c
            }
        };

        // require the closing quote
        if self.peek() == Some('\'') {
            self.advance();
            let span = self.span_from(start);
            Token::new(TokenKind::CharLiteral, span, Some(ch.to_string()))
        } else {
            let span = self.span_from(start);
            self.diagnostics.error(
                E_INVALID_CHAR_LITERAL,
                "expected closing ' in character literal",
                span,
            );
            Token::new(TokenKind::Error, span, Some(ch.to_string()))
        }
    }
551
552 fn lex_number(&mut self) -> Token {
565 let start = self.pos;
566 let mut literal = String::new();
567 let mut is_float = false;
568
569 let first = self.advance().expect("caller guarantees a digit");
571 literal.push(first);
572
573 if first == '0' {
575 match self.peek() {
576 Some('x') | Some('X') => {
577 let prefix = self.advance().expect("peek confirmed 'x'/'X'");
578 literal.push(prefix);
579 let digit_start = self.pos;
580 self.consume_digits(&mut literal, |c| c.is_ascii_hexdigit() || c == '_');
582 if self.pos == digit_start {
583 self.diagnostics.error(
584 E_INVALID_DIGIT,
585 "expected hexadecimal digit after '0x'",
586 self.span_from(start),
587 );
588 }
589 let suffix = self.try_consume_suffix();
590 let full = format!("{}{}", literal, suffix.as_deref().unwrap_or(""));
591 return Token::new(TokenKind::IntLiteral, self.span_from(start), Some(full));
592 }
593 Some('o') | Some('O') => {
594 let prefix = self.advance().expect("peek confirmed 'o'/'O'");
595 literal.push(prefix);
596 let digit_start = self.pos;
597 self.consume_digits(&mut literal, |c| c.is_ascii_digit() || c == '_');
599 if self.pos == digit_start {
600 self.diagnostics.error(
601 E_INVALID_DIGIT,
602 "expected octal digit after '0o'",
603 self.span_from(start),
604 );
605 } else {
606 let body = &self.source.content[digit_start..self.pos];
607 for ch in body.chars() {
608 if ch != '_' && !matches!(ch, '0'..='7') {
609 self.diagnostics.error(
610 E_INVALID_DIGIT,
611 format!("invalid octal digit '{ch}'"),
612 self.span_from(start),
613 );
614 break;
615 }
616 }
617 }
618 let suffix = self.try_consume_suffix();
619 let full = format!("{}{}", literal, suffix.as_deref().unwrap_or(""));
620 return Token::new(TokenKind::IntLiteral, self.span_from(start), Some(full));
621 }
622 Some('b') | Some('B') => {
623 let prefix = self.advance().expect("peek confirmed 'b'/'B'");
624 literal.push(prefix);
625 let digit_start = self.pos;
626 self.consume_digits(&mut literal, |c| c.is_ascii_digit() || c == '_');
628 if self.pos == digit_start {
629 self.diagnostics.error(
630 E_INVALID_DIGIT,
631 "expected binary digit after '0b'",
632 self.span_from(start),
633 );
634 } else {
635 let body = &self.source.content[digit_start..self.pos];
636 for ch in body.chars() {
637 if ch != '_' && !matches!(ch, '0' | '1') {
638 self.diagnostics.error(
639 E_INVALID_DIGIT,
640 format!("invalid binary digit '{ch}'"),
641 self.span_from(start),
642 );
643 break;
644 }
645 }
646 }
647 let suffix = self.try_consume_suffix();
648 let full = format!("{}{}", literal, suffix.as_deref().unwrap_or(""));
649 return Token::new(TokenKind::IntLiteral, self.span_from(start), Some(full));
650 }
651 _ => {}
652 }
653 }
654
655 self.consume_decimal_digits(&mut literal);
658
659 if self.peek() == Some('.') && self.peek_next().is_some_and(|c| c.is_ascii_digit()) {
661 is_float = true;
662 literal.push(self.advance().expect("peek confirmed '.'")); self.consume_decimal_digits(&mut literal);
664 }
665
666 if matches!(self.peek(), Some('e') | Some('E')) {
668 is_float = true;
669 literal.push(self.advance().expect("peek confirmed 'e'/'E'")); if matches!(self.peek(), Some('+') | Some('-')) {
671 literal.push(self.advance().expect("peek confirmed '+'/'-'"));
672 }
673 self.consume_decimal_digits(&mut literal);
674 }
675
676 let suffix = self.try_consume_suffix();
678 let full = format!("{}{}", literal, suffix.as_deref().unwrap_or(""));
679
680 let kind = if is_float {
681 TokenKind::FloatLiteral
682 } else {
683 TokenKind::IntLiteral
684 };
685 Token::new(kind, self.span_from(start), Some(full))
686 }
687
688 fn consume_digits(&mut self, buf: &mut String, predicate: impl Fn(char) -> bool) {
690 while let Some(ch) = self.peek() {
691 if predicate(ch) {
692 buf.push(ch);
693 self.advance();
694 } else {
695 break;
696 }
697 }
698 }
699
700 fn consume_decimal_digits(&mut self, buf: &mut String) {
703 loop {
704 match self.peek() {
705 Some(c) if c.is_ascii_digit() => {
706 buf.push(c);
707 self.advance();
708 }
709 Some('_') => {
710 if self.peek_next().is_some_and(|c| c.is_alphabetic()) {
713 break;
714 }
715 buf.push('_');
716 self.advance();
717 }
718 _ => break,
719 }
720 }
721 }
722
723 fn try_consume_suffix(&mut self) -> Option<String> {
726 if self.peek() == Some('_') && self.peek_next().is_some_and(|c| c.is_alphabetic()) {
728 let mut suffix = String::new();
729 suffix.push(self.advance().expect("peek confirmed '_'")); while let Some(ch) = self.peek() {
731 if ch.is_alphanumeric() || ch == '_' {
732 suffix.push(ch);
733 self.advance();
734 } else {
735 break;
736 }
737 }
738 Some(suffix)
739 } else {
740 None
741 }
742 }
743
    /// Lex a comment; the caller guarantees the input starts `//` or `/*`.
    ///
    /// Emits no token directly: doc comments (`///`) and module doc
    /// comments (`//!`) are queued in `pending` with their trimmed text;
    /// plain line comments and block comments (which nest) are discarded.
    /// An unterminated block comment produces a diagnostic.
    fn lex_comment(&mut self) {
        let start = self.pos;
        self.advance();

        match self.peek() {
            Some('/') => {
                self.advance();
                if self.peek() == Some('/') {
                    // `///` doc comment: capture text up to end of line
                    self.advance();
                    let content_start = self.pos;
                    while let Some(ch) = self.peek() {
                        if ch == '\n' {
                            break;
                        }
                        self.advance();
                    }
                    let content = self.source.content[content_start..self.pos]
                        .trim()
                        .to_owned();
                    let span = self.span_from(start);
                    self.pending
                        .push_back(Token::new(TokenKind::DocComment, span, Some(content)));
                } else if self.peek() == Some('!') {
                    // `//!` module-level doc comment
                    self.advance();
                    let content_start = self.pos;
                    while let Some(ch) = self.peek() {
                        if ch == '\n' {
                            break;
                        }
                        self.advance();
                    }
                    let content = self.source.content[content_start..self.pos]
                        .trim()
                        .to_owned();
                    let span = self.span_from(start);
                    self.pending.push_back(Token::new(
                        TokenKind::ModuleDocComment,
                        span,
                        Some(content),
                    ));
                } else {
                    // plain `//`: skip to end of line (newline not consumed,
                    // so the caller still emits a Newline token for it)
                    while let Some(ch) = self.peek() {
                        if ch == '\n' {
                            break;
                        }
                        self.advance();
                    }
                }
            }
            Some('*') => {
                // `/* ... */` block comment; nests via a depth counter
                self.advance();
                let mut depth: u32 = 1;
                loop {
                    match self.peek() {
                        None => {
                            let span = self.span_from(start);
                            self.diagnostics.error(
                                E_UNTERMINATED_BLOCK_COMMENT,
                                "unterminated block comment",
                                span,
                            );
                            break;
                        }
                        Some('/') => {
                            self.advance();
                            if self.peek() == Some('*') {
                                self.advance();
                                depth += 1;
                            }
                        }
                        Some('*') => {
                            self.advance();
                            if self.peek() == Some('/') {
                                self.advance();
                                depth -= 1;
                                if depth == 0 {
                                    break;
                                }
                            }
                        }
                        Some(_) => {
                            self.advance();
                        }
                    }
                }
            }
            // unreachable given the caller's `//` / `/*` guard, but harmless
            _ => {
            }
        }
    }
847
848 fn lex_ident_or_keyword(&mut self) -> Token {
852 let start = self.pos;
853
854 while let Some(ch) = self.peek() {
855 if ch.is_alphanumeric() || ch == '_' {
856 self.advance();
857 } else {
858 break;
859 }
860 }
861
862 let text = &self.source.content[start..self.pos];
863 let span = self.span_from(start);
864
865 if let Some(kw) = keyword_lookup(text) {
866 Token::new(kw, span, None)
867 } else if text.starts_with(|c: char| c.is_uppercase()) {
868 Token::new(TokenKind::TypeIdent, span, Some(text.to_owned()))
869 } else if text == "_" {
870 Token::new(TokenKind::Underscore, span, None)
871 } else {
872 Token::new(TokenKind::Ident, span, Some(text.to_owned()))
873 }
874 }
875
876 #[allow(clippy::too_many_lines)]
880 fn lex_operator(&mut self) -> Token {
881 let start = self.pos;
882 let ch = self.advance().expect("called with a character available");
883
884 let kind = match ch {
885 '(' => TokenKind::LParen,
887 ')' => TokenKind::RParen,
888 '[' => TokenKind::LBracket,
889 ']' => TokenKind::RBracket,
890
891 '{' => {
893 if !self.interp_brace_depth.is_empty() {
894 *self.interp_brace_depth.last_mut().expect("non-empty") += 1;
895 }
896 TokenKind::LBrace
897 }
898
899 '}' => {
901 if !self.interp_brace_depth.is_empty() {
902 let top = *self.interp_brace_depth.last().expect("non-empty");
903 if top == 0 {
904 self.interp_brace_depth.pop();
906 let ctx = self
907 .string_resume
908 .pop()
909 .expect("resume stack mirrors brace stack");
910 self.resume_string_lex(ctx);
911 TokenKind::InterpolationEnd
912 } else {
913 *self.interp_brace_depth.last_mut().expect("non-empty") -= 1;
914 TokenKind::RBrace
915 }
916 } else {
917 TokenKind::RBrace
918 }
919 }
920
921 ',' => TokenKind::Comma,
922 ':' => TokenKind::Colon,
923 ';' => TokenKind::Semicolon,
924 '@' => TokenKind::At,
925 '#' => TokenKind::Hash,
926 '~' => TokenKind::BitNot,
927 '^' => TokenKind::BitXor,
928 '?' => TokenKind::Question,
929
930 '+' => {
932 if self.peek() == Some('=') {
933 self.advance();
934 TokenKind::PlusEq
935 } else {
936 TokenKind::Plus
937 }
938 }
939
940 '-' => match self.peek() {
942 Some('=') => {
943 self.advance();
944 TokenKind::MinusEq
945 }
946 Some('>') => {
947 self.advance();
948 TokenKind::ThinArrow
949 }
950 _ => TokenKind::Minus,
951 },
952
953 '*' => match self.peek() {
955 Some('=') => {
956 self.advance();
957 TokenKind::StarEq
958 }
959 Some('*') => {
960 self.advance();
961 TokenKind::Power
962 }
963 _ => TokenKind::Star,
964 },
965
966 '/' => {
968 if self.peek() == Some('=') {
969 self.advance();
970 TokenKind::SlashEq
971 } else {
972 TokenKind::Slash
973 }
974 }
975
976 '%' => {
978 if self.peek() == Some('=') {
979 self.advance();
980 TokenKind::PercentEq
981 } else {
982 TokenKind::Percent
983 }
984 }
985
986 '=' => match self.peek() {
988 Some('=') => {
989 self.advance();
990 TokenKind::Eq
991 }
992 Some('>') => {
993 self.advance();
994 TokenKind::FatArrow
995 }
996 _ => TokenKind::Assign,
997 },
998
999 '!' => {
1001 if self.peek() == Some('=') {
1002 self.advance();
1003 TokenKind::Neq
1004 } else {
1005 TokenKind::Not
1006 }
1007 }
1008
1009 '<' => match self.peek() {
1011 Some('=') => {
1012 self.advance();
1013 TokenKind::Lte
1014 }
1015 Some('<') => {
1016 self.advance();
1017 TokenKind::Shl
1018 }
1019 _ => TokenKind::Lt,
1020 },
1021
1022 '>' => match self.peek() {
1024 Some('=') => {
1025 self.advance();
1026 TokenKind::Gte
1027 }
1028 Some('>') => {
1029 self.advance();
1030 TokenKind::Shr
1031 }
1032 _ => TokenKind::Gt,
1033 },
1034
1035 '&' => {
1037 if self.peek() == Some('&') {
1038 self.advance();
1039 TokenKind::And
1040 } else {
1041 TokenKind::BitAnd
1042 }
1043 }
1044
1045 '|' => match self.peek() {
1047 Some('|') => {
1048 self.advance();
1049 TokenKind::Or
1050 }
1051 Some('>') => {
1052 self.advance();
1053 TokenKind::Pipe
1054 }
1055 _ => TokenKind::BitOr,
1056 },
1057
1058 '.' => {
1060 if self.peek() == Some('.') {
1061 self.advance(); if self.peek() == Some('=') {
1063 self.advance(); TokenKind::DotDotEq
1065 } else {
1066 TokenKind::DotDot
1067 }
1068 } else {
1069 TokenKind::Dot
1070 }
1071 }
1072
1073 other => {
1075 let span = self.span_from(start);
1076 self.diagnostics.error(
1077 E_UNEXPECTED_CHAR,
1078 format!("unexpected character {:?}", other),
1079 span,
1080 );
1081 return Token::new(TokenKind::Error, span, Some(other.to_string()));
1082 }
1083 };
1084
1085 self.make_token(kind, start)
1086 }
1087}
1088
1089fn closing_kind(is_raw: bool, is_multiline: bool, is_continuation: bool) -> TokenKind {
1093 if is_continuation {
1094 TokenKind::StringLiteralPart
1095 } else if is_raw && is_multiline {
1096 TokenKind::RawMultiLineStringLiteral
1097 } else if is_multiline {
1098 TokenKind::MultiLineStringLiteral
1099 } else if is_raw {
1100 TokenKind::RawStringLiteral
1101 } else {
1102 TokenKind::StringLiteral
1103 }
1104}
1105
/// Remove the common leading indentation from a cooked multiline string.
///
/// The first line is dropped when blank (the text directly after the
/// opening `"""`), the minimum indent (in bytes) of the remaining
/// non-blank lines is stripped from every line, and trailing newlines are
/// trimmed from the result.
fn strip_common_indent(s: &str) -> String {
    let raw_lines: Vec<&str> = s.split('\n').collect();

    // drop a leading blank line: `"""` followed directly by a newline
    let lines: &[&str] = if raw_lines
        .first()
        .map(|l| l.trim().is_empty())
        .unwrap_or(false)
    {
        &raw_lines[1..]
    } else {
        &raw_lines
    };

    // smallest indent (byte length of leading whitespace) over non-blank lines
    let common = lines
        .iter()
        .filter(|l| !l.trim().is_empty())
        .map(|l| l.len() - l.trim_start().len())
        .min()
        .unwrap_or(0);

    let stripped: Vec<&str> = lines
        .iter()
        .map(|l| {
            // `is_char_boundary` guards against slicing through a multi-byte
            // whitespace char (e.g. U+00A0) on a whitespace-only line, which
            // would panic; such lines are kept as-is instead
            if l.len() >= common && l.is_char_boundary(common) {
                &l[common..]
            } else {
                *l
            }
        })
        .collect();

    let joined = stripped.join("\n");
    joined.trim_end_matches('\n').to_string()
}
1142
1143#[cfg(test)]
1146mod tests {
1147 use super::*;
1148 use bock_source::SourceFile;
1149 use std::path::PathBuf;
1150
1151 fn lex(src: &str) -> Vec<Token> {
1152 let file = SourceFile::new(
1153 bock_errors::FileId(0),
1154 PathBuf::from("test.bock"),
1155 src.to_string(),
1156 );
1157 let mut lexer = Lexer::new(&file);
1158 lexer.tokenize()
1159 }
1160
1161 fn kinds(src: &str) -> Vec<TokenKind> {
1162 lex(src).into_iter().map(|t| t.kind).collect()
1163 }
1164
1165 fn literals(src: &str) -> Vec<Option<String>> {
1166 lex(src).into_iter().map(|t| t.literal).collect()
1167 }
1168
1169 #[test]
1172 fn lex_simple_identifier() {
1173 let toks = kinds("foo");
1174 assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
1175 }
1176
1177 #[test]
1178 fn lex_type_identifier() {
1179 let toks = kinds("Foo");
1180 assert_eq!(toks, vec![TokenKind::TypeIdent, TokenKind::Eof]);
1181 }
1182
1183 #[test]
1184 fn lex_underscore() {
1185 let toks = kinds("_");
1186 assert_eq!(toks, vec![TokenKind::Underscore, TokenKind::Eof]);
1187 }
1188
1189 #[test]
1190 fn lex_underscore_ident() {
1191 let toks = kinds("_foo");
1193 assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
1194 }
1195
1196 #[test]
1197 fn lex_keywords() {
1198 let toks = kinds("fn let mut const if else match for in while loop break continue return");
1199 assert_eq!(
1200 toks,
1201 vec![
1202 TokenKind::Fn,
1203 TokenKind::Let,
1204 TokenKind::Mut,
1205 TokenKind::Const,
1206 TokenKind::If,
1207 TokenKind::Else,
1208 TokenKind::Match,
1209 TokenKind::For,
1210 TokenKind::In,
1211 TokenKind::While,
1212 TokenKind::Loop,
1213 TokenKind::Break,
1214 TokenKind::Continue,
1215 TokenKind::Return,
1216 TokenKind::Eof,
1217 ]
1218 );
1219 }
1220
1221 #[test]
1222 fn lex_true_false_as_bool_literal() {
1223 let toks = kinds("true false");
1224 assert_eq!(
1225 toks,
1226 vec![
1227 TokenKind::BoolLiteral,
1228 TokenKind::BoolLiteral,
1229 TokenKind::Eof
1230 ]
1231 );
1232 }
1233
1234 #[test]
1235 fn bool_literal_round_trip() {
1236 let src = "true false";
1237 let tokens = lex(src);
1238 assert_eq!(tokens[0].kind, TokenKind::BoolLiteral);
1240 assert_eq!(tokens[1].kind, TokenKind::BoolLiteral);
1241 assert_eq!(&src[tokens[0].span.start..tokens[0].span.end], "true");
1243 assert_eq!(&src[tokens[1].span.start..tokens[1].span.end], "false");
1244 }
1245
1246 #[test]
1247 fn lex_self_keywords() {
1248 let toks = kinds("self Self");
1249 assert_eq!(
1250 toks,
1251 vec![TokenKind::SelfLower, TokenKind::SelfUpper, TokenKind::Eof]
1252 );
1253 }
1254
1255 #[test]
1256 fn lex_ok_err_some_none() {
1257 let toks = kinds("Ok Err Some None");
1258 assert_eq!(
1259 toks,
1260 vec![
1261 TokenKind::Ok_,
1262 TokenKind::Err_,
1263 TokenKind::Some_,
1264 TokenKind::None_,
1265 TokenKind::Eof,
1266 ]
1267 );
1268 }
1269
1270 #[test]
1273 fn lex_single_char_ops() {
1274 let toks = kinds("+ - * / % ! & | ^ ~ ? # @");
1275 assert_eq!(
1276 toks,
1277 vec![
1278 TokenKind::Plus,
1279 TokenKind::Minus,
1280 TokenKind::Star,
1281 TokenKind::Slash,
1282 TokenKind::Percent,
1283 TokenKind::Not,
1284 TokenKind::BitAnd,
1285 TokenKind::BitOr,
1286 TokenKind::BitXor,
1287 TokenKind::BitNot,
1288 TokenKind::Question,
1289 TokenKind::Hash,
1290 TokenKind::At,
1291 TokenKind::Eof,
1292 ]
1293 );
1294 }
1295
1296 #[test]
1297 fn lex_pipe_vs_bitor() {
1298 let toks = kinds("|> |");
1299 assert_eq!(
1300 toks,
1301 vec![TokenKind::Pipe, TokenKind::BitOr, TokenKind::Eof]
1302 );
1303 }
1304
1305 #[test]
1306 fn lex_compose() {
1307 let toks = kinds(">>");
1309 assert_eq!(toks, vec![TokenKind::Shr, TokenKind::Eof]);
1310 }
1311
1312 #[test]
1313 fn lex_dotdot_dotdoteq_dot() {
1314 let toks = kinds(". .. ..=");
1315 assert_eq!(
1316 toks,
1317 vec![
1318 TokenKind::Dot,
1319 TokenKind::DotDot,
1320 TokenKind::DotDotEq,
1321 TokenKind::Eof
1322 ]
1323 );
1324 }
1325
1326 #[test]
1327 fn lex_fat_arrow_vs_eq() {
1328 let toks = kinds("=> = ==");
1329 assert_eq!(
1330 toks,
1331 vec![
1332 TokenKind::FatArrow,
1333 TokenKind::Assign,
1334 TokenKind::Eq,
1335 TokenKind::Eof
1336 ]
1337 );
1338 }
1339
1340 #[test]
1341 fn lex_thin_arrow_vs_minus() {
1342 let toks = kinds("-> - -=");
1343 assert_eq!(
1344 toks,
1345 vec![
1346 TokenKind::ThinArrow,
1347 TokenKind::Minus,
1348 TokenKind::MinusEq,
1349 TokenKind::Eof
1350 ]
1351 );
1352 }
1353
1354 #[test]
1355 fn lex_power_vs_star() {
1356 let toks = kinds("** * *=");
1357 assert_eq!(
1358 toks,
1359 vec![
1360 TokenKind::Power,
1361 TokenKind::Star,
1362 TokenKind::StarEq,
1363 TokenKind::Eof
1364 ]
1365 );
1366 }
1367
1368 #[test]
1369 fn lex_shift_ops() {
1370 let toks = kinds("<< >>");
1371 assert_eq!(toks, vec![TokenKind::Shl, TokenKind::Shr, TokenKind::Eof]);
1372 }
1373
1374 #[test]
1375 fn lex_assignment_ops() {
1376 let toks = kinds("+= -= *= /= %=");
1377 assert_eq!(
1378 toks,
1379 vec![
1380 TokenKind::PlusEq,
1381 TokenKind::MinusEq,
1382 TokenKind::StarEq,
1383 TokenKind::SlashEq,
1384 TokenKind::PercentEq,
1385 TokenKind::Eof,
1386 ]
1387 );
1388 }
1389
1390 #[test]
1391 fn lex_comparison_ops() {
1392 let toks = kinds("== != < > <= >=");
1393 assert_eq!(
1394 toks,
1395 vec![
1396 TokenKind::Eq,
1397 TokenKind::Neq,
1398 TokenKind::Lt,
1399 TokenKind::Gt,
1400 TokenKind::Lte,
1401 TokenKind::Gte,
1402 TokenKind::Eof,
1403 ]
1404 );
1405 }
1406
1407 #[test]
1408 fn lex_logical_ops() {
1409 let toks = kinds("&& || !");
1410 assert_eq!(
1411 toks,
1412 vec![
1413 TokenKind::And,
1414 TokenKind::Or,
1415 TokenKind::Not,
1416 TokenKind::Eof
1417 ]
1418 );
1419 }
1420
1421 #[test]
1424 fn lex_delimiters() {
1425 let toks = kinds("( ) [ ] { }");
1426 assert_eq!(
1427 toks,
1428 vec![
1429 TokenKind::LParen,
1430 TokenKind::RParen,
1431 TokenKind::LBracket,
1432 TokenKind::RBracket,
1433 TokenKind::LBrace,
1434 TokenKind::RBrace,
1435 TokenKind::Eof,
1436 ]
1437 );
1438 }
1439
1440 #[test]
1441 fn lex_misc_punct() {
1442 let toks = kinds(", : ;");
1443 assert_eq!(
1444 toks,
1445 vec![
1446 TokenKind::Comma,
1447 TokenKind::Colon,
1448 TokenKind::Semicolon,
1449 TokenKind::Eof
1450 ]
1451 );
1452 }
1453
1454 #[test]
1457 fn lex_newlines() {
1458 let toks = kinds("foo\nbar");
1459 assert_eq!(
1460 toks,
1461 vec![
1462 TokenKind::Ident,
1463 TokenKind::Newline,
1464 TokenKind::Ident,
1465 TokenKind::Eof,
1466 ]
1467 );
1468 }
1469
1470 #[test]
1471 fn lex_crlf_newline() {
1472 let toks = kinds("foo\r\nbar");
1473 assert_eq!(
1474 toks,
1475 vec![
1476 TokenKind::Ident,
1477 TokenKind::Newline,
1478 TokenKind::Ident,
1479 TokenKind::Eof,
1480 ]
1481 );
1482 }
1483
1484 #[test]
1485 fn lex_multiple_newlines() {
1486 let toks = kinds("a\n\nb");
1487 assert_eq!(
1488 toks,
1489 vec![
1490 TokenKind::Ident,
1491 TokenKind::Newline,
1492 TokenKind::Newline,
1493 TokenKind::Ident,
1494 TokenKind::Eof,
1495 ]
1496 );
1497 }
1498
1499 #[test]
1502 fn lex_unknown_char_produces_error() {
1503 let file = SourceFile::new(
1504 bock_errors::FileId(0),
1505 PathBuf::from("test.bock"),
1506 "§".to_string(),
1507 );
1508 let mut lexer = Lexer::new(&file);
1509 let toks = lexer.tokenize();
1510 assert_eq!(toks[0].kind, TokenKind::Error);
1511 assert!(lexer.diagnostics().has_errors());
1512 }
1513
1514 #[test]
1517 fn integration_basic_function_signature() {
1518 let toks = kinds("fn add(x: Int) -> Int");
1520 assert_eq!(
1521 toks,
1522 vec![
1523 TokenKind::Fn,
1524 TokenKind::Ident, TokenKind::LParen,
1526 TokenKind::Ident, TokenKind::Colon,
1528 TokenKind::TypeIdent, TokenKind::RParen,
1530 TokenKind::ThinArrow,
1531 TokenKind::TypeIdent, TokenKind::Eof,
1533 ]
1534 );
1535 }
1536
1537 #[test]
1538 fn integration_let_binding() {
1539 let toks = kinds("let mut x =");
1542 assert_eq!(
1543 toks,
1544 vec![
1545 TokenKind::Let,
1546 TokenKind::Mut,
1547 TokenKind::Ident,
1548 TokenKind::Assign,
1549 TokenKind::Eof,
1550 ]
1551 );
1552 }
1553
1554 #[test]
1555 fn integration_match_arm() {
1556 let toks = kinds("Ok(x) => x");
1558 assert_eq!(
1559 toks,
1560 vec![
1561 TokenKind::Ok_,
1562 TokenKind::LParen,
1563 TokenKind::Ident,
1564 TokenKind::RParen,
1565 TokenKind::FatArrow,
1566 TokenKind::Ident,
1567 TokenKind::Eof,
1568 ]
1569 );
1570 }
1571
1572 #[test]
1573 fn integration_pipe_expression() {
1574 let toks = kinds("xs |> map |> filter");
1576 assert_eq!(
1577 toks,
1578 vec![
1579 TokenKind::Ident,
1580 TokenKind::Pipe,
1581 TokenKind::Ident,
1582 TokenKind::Pipe,
1583 TokenKind::Ident,
1584 TokenKind::Eof,
1585 ]
1586 );
1587 }
1588
1589 #[test]
1590 fn integration_multiline() {
1591 let src = "fn foo()\n let x = y\n x";
1592 let toks = kinds(src);
1593 assert_eq!(
1596 toks,
1597 vec![
1598 TokenKind::Fn,
1599 TokenKind::Ident,
1600 TokenKind::LParen,
1601 TokenKind::RParen,
1602 TokenKind::Newline,
1603 TokenKind::Let,
1604 TokenKind::Ident,
1605 TokenKind::Assign,
1606 TokenKind::Ident,
1607 TokenKind::Newline,
1608 TokenKind::Ident,
1609 TokenKind::Eof,
1610 ]
1611 );
1612 }
1613
1614 #[test]
1617 fn lex_plain_string() {
1618 let toks = lex(r#""hello""#);
1619 assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1620 assert_eq!(toks[0].literal.as_deref(), Some("hello"));
1621 assert_eq!(toks[1].kind, TokenKind::Eof);
1622 }
1623
1624 #[test]
1625 fn lex_string_escape_sequences() {
1626 let toks = lex("\"a\\nb\\tc\\\\\"");
1628 assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1629 assert_eq!(toks[0].literal.as_deref(), Some("a\nb\tc\\"));
1630 }
1631
1632 #[test]
1633 fn lex_string_escape_dollar() {
1634 let toks = lex(r#""\$""#);
1635 assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1636 assert_eq!(toks[0].literal.as_deref(), Some("$"));
1637 }
1638
1639 #[test]
1640 fn lex_string_double_dollar_escape() {
1641 let toks = lex(r#""$$""#);
1643 assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1644 assert_eq!(toks[0].literal.as_deref(), Some("$"));
1645 }
1646
1647 #[test]
1648 fn lex_string_unicode_escape() {
1649 let toks = lex("\"\\u{41}\"");
1651 assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1652 assert_eq!(toks[0].literal.as_deref(), Some("A"));
1653 }
1654
1655 #[test]
1656 fn lex_string_unicode_escape_multibyte() {
1657 let toks = lex("\"\\u{1F600}\"");
1659 assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1660 assert_eq!(toks[0].literal.as_deref(), Some("😀"));
1661 }
1662
1663 #[test]
1664 fn lex_raw_string() {
1665 let toks = lex(r#"r"hello\nworld""#);
1666 assert_eq!(toks[0].kind, TokenKind::RawStringLiteral);
1667 assert_eq!(toks[0].literal.as_deref(), Some("hello\\nworld"));
1669 }
1670
1671 #[test]
1672 fn lex_raw_string_dollar_literal() {
1673 let toks = lex(r#"r"${not interp}""#);
1675 assert_eq!(toks[0].kind, TokenKind::RawStringLiteral);
1676 assert_eq!(toks[0].literal.as_deref(), Some("${not interp}"));
1677 }
1678
1679 #[test]
1680 fn lex_multiline_string() {
1681 let src = "\"\"\"hello world\"\"\"";
1682 let toks = lex(src);
1683 assert_eq!(toks[0].kind, TokenKind::MultiLineStringLiteral);
1684 assert_eq!(toks[0].literal.as_deref(), Some("hello world"));
1685 }
1686
1687 #[test]
1688 fn lex_multiline_string_indent_stripping() {
1689 let src = "\"\"\"\n hello\n world\n\"\"\"";
1694 let toks = lex(src);
1695 assert_eq!(toks[0].kind, TokenKind::MultiLineStringLiteral);
1696 assert_eq!(toks[0].literal.as_deref(), Some("hello\nworld"));
1697 }
1698
1699 #[test]
1700 fn lex_raw_multiline_string() {
1701 let src = "r\"\"\"\nhello\\nworld\n\"\"\"";
1702 let toks = lex(src);
1703 assert_eq!(toks[0].kind, TokenKind::RawMultiLineStringLiteral);
1704 assert!(toks[0]
1706 .literal
1707 .as_deref()
1708 .unwrap()
1709 .contains("hello\\nworld"));
1710 }
1711
1712 #[test]
1715 fn lex_interpolated_string_simple() {
1716 let toks = lex("\"hello ${name}!\"");
1718 assert_eq!(toks[0].kind, TokenKind::StringLiteralPart);
1721 assert_eq!(toks[0].literal.as_deref(), Some("hello "));
1722 assert_eq!(toks[1].kind, TokenKind::InterpolationStart);
1723 assert_eq!(toks[2].kind, TokenKind::Ident);
1724 assert_eq!(toks[3].kind, TokenKind::InterpolationEnd);
1725 assert_eq!(toks[4].kind, TokenKind::StringLiteralPart);
1726 assert_eq!(toks[4].literal.as_deref(), Some("!"));
1727 assert_eq!(toks[5].kind, TokenKind::Eof);
1728 }
1729
1730 #[test]
1731 fn lex_interpolated_string_nested_braces() {
1732 let toks = lex("\"${f({key: val})}\"");
1734 let ks: Vec<_> = toks.iter().map(|t| t.kind.clone()).collect();
1738 assert_eq!(
1739 ks,
1740 vec![
1741 TokenKind::StringLiteralPart, TokenKind::InterpolationStart,
1743 TokenKind::Ident, TokenKind::LParen,
1745 TokenKind::LBrace,
1746 TokenKind::Ident, TokenKind::Colon,
1748 TokenKind::Ident, TokenKind::RBrace,
1750 TokenKind::RParen,
1751 TokenKind::InterpolationEnd,
1752 TokenKind::StringLiteralPart, TokenKind::Eof,
1754 ]
1755 );
1756 }
1757
1758 #[test]
1759 fn lex_interpolated_string_multiple_interps() {
1760 let toks = lex("\"${a} + ${b}\"");
1762 let ks: Vec<_> = toks.iter().map(|t| t.kind.clone()).collect();
1763 assert_eq!(
1764 ks,
1765 vec![
1766 TokenKind::StringLiteralPart, TokenKind::InterpolationStart,
1768 TokenKind::Ident, TokenKind::InterpolationEnd,
1770 TokenKind::StringLiteralPart, TokenKind::InterpolationStart,
1772 TokenKind::Ident, TokenKind::InterpolationEnd,
1774 TokenKind::StringLiteralPart, TokenKind::Eof,
1776 ]
1777 );
1778 assert_eq!(toks[4].literal.as_deref(), Some(" + "));
1779 }
1780
1781 #[test]
1784 fn lex_char_simple() {
1785 let toks = lex("'a'");
1786 assert_eq!(toks[0].kind, TokenKind::CharLiteral);
1787 assert_eq!(toks[0].literal.as_deref(), Some("a"));
1788 }
1789
1790 #[test]
1791 fn lex_char_newline_escape() {
1792 let toks = lex("'\\n'");
1793 assert_eq!(toks[0].kind, TokenKind::CharLiteral);
1794 assert_eq!(toks[0].literal.as_deref(), Some("\n"));
1795 }
1796
1797 #[test]
1798 fn lex_char_unicode_escape() {
1799 let toks = lex("'\\u{1F600}'");
1801 assert_eq!(toks[0].kind, TokenKind::CharLiteral);
1802 assert_eq!(toks[0].literal.as_deref(), Some("😀"));
1803 }
1804
1805 #[test]
1806 fn lex_char_multibyte_unicode() {
1807 let toks = lex("'😀'");
1809 assert_eq!(toks[0].kind, TokenKind::CharLiteral);
1810 assert_eq!(toks[0].literal.as_deref(), Some("😀"));
1811 }
1812
1813 #[test]
1816 fn lex_unterminated_string_produces_diagnostic() {
1817 let file = SourceFile::new(
1818 bock_errors::FileId(0),
1819 PathBuf::from("test.bock"),
1820 "\"unterminated".to_string(),
1821 );
1822 let mut lexer = Lexer::new(&file);
1823 let _ = lexer.tokenize();
1824 assert!(lexer.diagnostics().has_errors());
1825 }
1826
1827 #[test]
1828 fn lex_empty_char_literal_produces_diagnostic() {
1829 let file = SourceFile::new(
1830 bock_errors::FileId(0),
1831 PathBuf::from("test.bock"),
1832 "''".to_string(),
1833 );
1834 let mut lexer = Lexer::new(&file);
1835 let toks = lexer.tokenize();
1836 assert_eq!(toks[0].kind, TokenKind::Error);
1837 assert!(lexer.diagnostics().has_errors());
1838 }
1839
1840 #[test]
1841 fn lex_literals_helper() {
1842 let lits = literals(r#""hi""#);
1844 assert_eq!(lits[0].as_deref(), Some("hi"));
1845 }
1846
1847 fn lex_num(src: &str) -> Vec<Token> {
1850 lex(src)
1851 }
1852
1853 #[test]
1854 fn lex_decimal_integer() {
1855 let toks = lex_num("42");
1856 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1857 assert_eq!(toks[0].literal.as_deref(), Some("42"));
1858 }
1859
1860 #[test]
1861 fn lex_decimal_with_underscores() {
1862 let toks = lex_num("1_000_000");
1863 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1864 assert_eq!(toks[0].literal.as_deref(), Some("1_000_000"));
1865 }
1866
1867 #[test]
1868 fn lex_hex_literal() {
1869 let toks = lex_num("0xFF");
1870 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1871 assert_eq!(toks[0].literal.as_deref(), Some("0xFF"));
1872 }
1873
1874 #[test]
1875 fn lex_hex_literal_uppercase_prefix() {
1876 let toks = lex_num("0XFF");
1877 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1878 assert_eq!(toks[0].literal.as_deref(), Some("0XFF"));
1879 }
1880
1881 #[test]
1882 fn lex_octal_literal() {
1883 let toks = lex_num("0o77");
1884 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1885 assert_eq!(toks[0].literal.as_deref(), Some("0o77"));
1886 }
1887
1888 #[test]
1889 fn lex_octal_literal_uppercase_prefix() {
1890 let toks = lex_num("0O77");
1891 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1892 assert_eq!(toks[0].literal.as_deref(), Some("0O77"));
1893 }
1894
1895 #[test]
1896 fn lex_binary_literal() {
1897 let toks = lex_num("0b1010");
1898 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1899 assert_eq!(toks[0].literal.as_deref(), Some("0b1010"));
1900 }
1901
1902 #[test]
1903 fn lex_binary_literal_uppercase_prefix() {
1904 let toks = lex_num("0B1010");
1905 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1906 assert_eq!(toks[0].literal.as_deref(), Some("0B1010"));
1907 }
1908
1909 #[test]
1910 fn lex_float_simple() {
1911 let toks = lex_num("3.14");
1912 assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1913 assert_eq!(toks[0].literal.as_deref(), Some("3.14"));
1914 }
1915
1916 #[test]
1917 fn lex_float_exponent_lower() {
1918 let toks = lex_num("1.0e10");
1919 assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1920 assert_eq!(toks[0].literal.as_deref(), Some("1.0e10"));
1921 }
1922
1923 #[test]
1924 fn lex_float_exponent_upper() {
1925 let toks = lex_num("2.5E-3");
1926 assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1927 assert_eq!(toks[0].literal.as_deref(), Some("2.5E-3"));
1928 }
1929
1930 #[test]
1931 fn lex_float_exponent_no_dot() {
1932 let toks = lex_num("1e6");
1934 assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1935 assert_eq!(toks[0].literal.as_deref(), Some("1e6"));
1936 }
1937
1938 #[test]
1939 fn lex_float_exponent_plus() {
1940 let toks = lex_num("1.5E+3");
1941 assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1942 assert_eq!(toks[0].literal.as_deref(), Some("1.5E+3"));
1943 }
1944
1945 #[test]
1946 fn lex_int_with_type_suffix() {
1947 let toks = lex_num("42_u8");
1948 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1949 assert_eq!(toks[0].literal.as_deref(), Some("42_u8"));
1950 }
1951
1952 #[test]
1953 fn lex_float_with_type_suffix() {
1954 let toks = lex_num("3.14_f64");
1955 assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1956 assert_eq!(toks[0].literal.as_deref(), Some("3.14_f64"));
1957 }
1958
1959 #[test]
1960 fn lex_range_does_not_consume_dotdot() {
1961 let toks = lex_num("1..2");
1963 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1964 assert_eq!(toks[0].literal.as_deref(), Some("1"));
1965 assert_eq!(toks[1].kind, TokenKind::DotDot);
1966 assert_eq!(toks[2].kind, TokenKind::IntLiteral);
1967 assert_eq!(toks[2].literal.as_deref(), Some("2"));
1968 }
1969
1970 #[test]
1971 fn lex_invalid_binary_digit_produces_diagnostic() {
1972 let file = bock_source::SourceFile::new(
1973 bock_errors::FileId(0),
1974 std::path::PathBuf::from("test.bock"),
1975 "0b123".to_string(),
1976 );
1977 let mut lexer = Lexer::new(&file);
1978 let _ = lexer.tokenize();
1979 assert!(
1980 !lexer.diagnostics().is_empty(),
1981 "expected diagnostic for invalid binary digit"
1982 );
1983 }
1984
1985 #[test]
1986 fn lex_invalid_octal_digit_produces_diagnostic() {
1987 let file = bock_source::SourceFile::new(
1988 bock_errors::FileId(0),
1989 std::path::PathBuf::from("test.bock"),
1990 "0o89".to_string(),
1991 );
1992 let mut lexer = Lexer::new(&file);
1993 let _ = lexer.tokenize();
1994 assert!(
1995 !lexer.diagnostics().is_empty(),
1996 "expected diagnostic for invalid octal digit"
1997 );
1998 }
1999
2000 #[test]
2001 fn lex_hex_with_underscores() {
2002 let toks = lex_num("0xFF_FF");
2003 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
2004 assert_eq!(toks[0].literal.as_deref(), Some("0xFF_FF"));
2005 }
2006
2007 #[test]
2008 fn lex_zero_alone() {
2009 let toks = lex_num("0");
2010 assert_eq!(toks[0].kind, TokenKind::IntLiteral);
2011 assert_eq!(toks[0].literal.as_deref(), Some("0"));
2012 }
2013
2014 fn has_errors(src: &str) -> bool {
2017 let file = SourceFile::new(
2018 bock_errors::FileId(0),
2019 std::path::PathBuf::from("test.bock"),
2020 src.to_string(),
2021 );
2022 let mut lexer = Lexer::new(&file);
2023 let _ = lexer.tokenize();
2024 !lexer.diagnostics().is_empty()
2025 }
2026
2027 #[test]
2028 fn lex_line_comment_produces_no_token() {
2029 let toks = kinds("// this is a comment\nfoo");
2031 assert_eq!(
2032 toks,
2033 vec![TokenKind::Newline, TokenKind::Ident, TokenKind::Eof]
2034 );
2035 }
2036
2037 #[test]
2038 fn lex_line_comment_at_eof() {
2039 let toks = kinds("// comment at eof");
2041 assert_eq!(toks, vec![TokenKind::Eof]);
2042 }
2043
2044 #[test]
2045 fn lex_doc_comment_produces_token() {
2046 let toks = lex("/// doc comment");
2047 assert_eq!(toks[0].kind, TokenKind::DocComment);
2048 assert_eq!(toks[0].literal.as_deref(), Some("doc comment"));
2049 }
2050
2051 #[test]
2052 fn lex_doc_comment_content_trimmed() {
2053 let toks = lex("/// spaces around ");
2054 assert_eq!(toks[0].kind, TokenKind::DocComment);
2055 assert_eq!(toks[0].literal.as_deref(), Some("spaces around"));
2056 }
2057
2058 #[test]
2059 fn lex_module_doc_comment_produces_token() {
2060 let toks = lex("//! module doc");
2061 assert_eq!(toks[0].kind, TokenKind::ModuleDocComment);
2062 assert_eq!(toks[0].literal.as_deref(), Some("module doc"));
2063 }
2064
2065 #[test]
2066 fn lex_doc_comment_then_ident() {
2067 let toks = kinds("/// docs\nfoo");
2068 assert_eq!(
2069 toks,
2070 vec![
2071 TokenKind::DocComment,
2072 TokenKind::Newline,
2073 TokenKind::Ident,
2074 TokenKind::Eof,
2075 ]
2076 );
2077 }
2078
2079 #[test]
2080 fn lex_block_comment_produces_no_token() {
2081 let toks = kinds("/* block comment */ foo");
2082 assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
2083 }
2084
2085 #[test]
2086 fn lex_nested_block_comment() {
2087 let toks = kinds("/* outer /* inner */ still outer */ foo");
2089 assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
2090 }
2091
2092 #[test]
2093 fn lex_deeply_nested_block_comment() {
2094 let toks = kinds("/* a /* b /* c */ b */ a */ x");
2095 assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
2096 }
2097
2098 #[test]
2099 fn lex_unterminated_block_comment_produces_diagnostic() {
2100 assert!(
2101 has_errors("/* not closed"),
2102 "expected diagnostic for unterminated block comment"
2103 );
2104 }
2105
2106 #[test]
2107 fn lex_block_comment_inline() {
2108 let toks = kinds("foo /* ignore */ bar");
2110 assert_eq!(
2111 toks,
2112 vec![TokenKind::Ident, TokenKind::Ident, TokenKind::Eof]
2113 );
2114 }
2115
2116 #[test]
2119 fn raw_multiline_string_has_distinct_kind() {
2120 let src = "r\"\"\"\nhello\n\"\"\"";
2121 let toks = lex(src);
2122 assert_eq!(toks[0].kind, TokenKind::RawMultiLineStringLiteral);
2123 let toks2 = lex("\"\"\"\nhello\n\"\"\"");
2125 assert_eq!(toks2[0].kind, TokenKind::MultiLineStringLiteral);
2126 }
2127
2128 #[test]
2131 fn backslash_newline_joins_lines() {
2132 let toks = kinds("let \\\nx = 1");
2134 assert_eq!(
2135 toks,
2136 vec![
2137 TokenKind::Let,
2138 TokenKind::Ident,
2139 TokenKind::Assign,
2140 TokenKind::IntLiteral,
2141 TokenKind::Eof,
2142 ]
2143 );
2144 }
2145
2146 #[test]
2147 fn backslash_without_newline_is_error() {
2148 let toks = lex("\\x");
2149 assert_eq!(toks[0].kind, TokenKind::Error);
2150 }
2151
2152 #[test]
2153 fn backslash_continuation_multiline_expr() {
2154 let toks = kinds("1 + \\\n 2 + \\\n 3");
2156 assert_eq!(
2157 toks,
2158 vec![
2159 TokenKind::IntLiteral,
2160 TokenKind::Plus,
2161 TokenKind::IntLiteral,
2162 TokenKind::Plus,
2163 TokenKind::IntLiteral,
2164 TokenKind::Eof,
2165 ]
2166 );
2167 }
2168
2169 #[test]
2170 fn backslash_at_eof_is_error() {
2171 let toks = lex("\\");
2172 assert_eq!(toks[0].kind, TokenKind::Error);
2173 }
2174}