1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
10pub struct Lexer<'source> {
11 input: &'source str,
12 input_bytes: &'source [u8],
13 current_offset: usize,
14 file_id: u32,
15 errors: Vec<ParseError>,
16 pending_tokens: Vec<Token<'source>>,
17 trivia: Trivia,
18 last_newline_offset: Option<usize>,
19}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
35 pub fn lex(mut self) -> LexResult<'source> {
36 let mut tokens = Vec::new();
37
38 loop {
39 if let Some(token) = self.pending_tokens.pop() {
40 tokens.push(token);
41 continue;
42 }
43
44 self.skip_whitespace();
45
46 if self.at_eof() {
47 tokens.push(self.eof_token());
48 break;
49 }
50
51 if self.try_consume_unsupported_raw_variant(self.input.len()) {
52 continue;
53 }
54
55 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
56 let mut fstring_tokens = self.lex_format_string_tokens();
57 fstring_tokens.reverse();
58 self.pending_tokens = fstring_tokens;
59 continue;
60 }
61
62 let token = self.create_token();
63 tokens.push(token);
64 }
65
66 let tokens = self.insert_semicolons(tokens);
67
68 LexResult {
69 tokens,
70 errors: self.errors,
71 trivia: self.trivia,
72 }
73 }
74
75 fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
76 let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);
77
78 for i in 0..tokens.len() {
79 let token = tokens[i];
80 result.push(token);
81
82 if !Self::triggers_asi(token.kind) {
83 continue;
84 }
85
86 if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
87 if Self::continues_expression(next_token.kind) {
88 continue;
89 }
90
91 let token_end = (token.byte_offset + token.byte_length) as usize;
92 if self.has_newline_between(token_end, next_token.byte_offset as usize) {
93 result.push(self.make_synthetic_semicolon(token_end));
94 }
95 }
96 }
97
98 result
99 }
100
101 fn triggers_asi(kind: TokenKind) -> bool {
102 matches!(
103 kind,
104 TokenKind::Identifier
105 | TokenKind::Integer
106 | TokenKind::Imaginary
107 | TokenKind::Float
108 | TokenKind::String
109 | TokenKind::RawString
110 | TokenKind::Char
111 | TokenKind::Boolean
112 | TokenKind::RightParen
113 | TokenKind::RightSquareBracket
114 | TokenKind::RightCurlyBrace
115 | TokenKind::Break
116 | TokenKind::Continue
117 | TokenKind::Return
118 | TokenKind::DotDot
119 | TokenKind::DotDotEqual
120 | TokenKind::QuestionMark
121 )
122 }
123
124 fn continues_expression(kind: TokenKind) -> bool {
125 matches!(
126 kind,
127 TokenKind::Plus
128 | TokenKind::Star
129 | TokenKind::Slash
130 | TokenKind::Percent
131 | TokenKind::Pipeline
132 | TokenKind::AmpersandDouble
133 | TokenKind::PipeDouble
134 | TokenKind::EqualDouble
135 | TokenKind::NotEqual
136 | TokenKind::LeftAngleBracket
137 | TokenKind::RightAngleBracket
138 | TokenKind::LessThanOrEqual
139 | TokenKind::GreaterThanOrEqual
140 | TokenKind::Dot
141 | TokenKind::Equal
142 | TokenKind::PlusEqual
143 | TokenKind::MinusEqual
144 | TokenKind::StarEqual
145 | TokenKind::SlashEqual
146 | TokenKind::AmpersandEqual
147 | TokenKind::PipeEqual
148 | TokenKind::CaretEqual
149 | TokenKind::AndNotEqual
150 | TokenKind::ShiftLeftEqual
151 | TokenKind::ShiftRightEqual
152 | TokenKind::Else
153 | TokenKind::LeftCurlyBrace
154 | TokenKind::RightCurlyBrace
155 | TokenKind::RightParen
156 | TokenKind::RightSquareBracket
157 | TokenKind::As
158 )
159 }
160
161 fn find_next_non_comment_token<'a>(
162 &self,
163 tokens: &'a [Token<'source>],
164 start_index: usize,
165 ) -> Option<&'a Token<'source>> {
166 tokens
167 .iter()
168 .skip(start_index)
169 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
170 }
171
172 fn has_newline_between(&self, start: usize, end: usize) -> bool {
173 self.input[start..end].contains('\n')
174 }
175
176 fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
177 Token {
178 kind: TokenKind::Semicolon,
179 text: "",
180 byte_offset: position as u32,
181 byte_length: 0,
182 }
183 }
184
185 fn create_token(&mut self) -> Token<'source> {
186 if let Some(token) = self.lex_lookahead_symbol() {
187 return token;
188 }
189
190 let c = self.current_char();
191 match c {
192 '0'..='9' => self.lex_number(),
193 'r' if self.peek_byte() == b'"' => self.lex_raw_string_literal(),
194 _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
195 '"' => self.lex_string_literal(),
196 '`' => self.lex_backtick_literal(),
197 '\'' => self.lex_char(),
198 '/' => self.lex_slash(),
199 ';' => self.semicolon_token(),
200 '@' => self.lex_directive(),
201 _ => self.handle_unexpected_char(),
202 }
203 }
204
205 #[inline]
206 fn current_byte(&self) -> u8 {
207 if self.current_offset < self.input_bytes.len() {
208 self.input_bytes[self.current_offset]
209 } else {
210 0
211 }
212 }
213
214 #[inline]
215 fn current_char(&self) -> char {
216 self.input[self.current_offset..]
217 .chars()
218 .next()
219 .unwrap_or('\0')
220 }
221
222 #[inline]
223 fn peek_byte(&self) -> u8 {
224 if self.current_offset + 1 < self.input_bytes.len() {
225 self.input_bytes[self.current_offset + 1]
226 } else {
227 0
228 }
229 }
230
231 #[inline]
232 fn peek_byte_at(&self, n: usize) -> u8 {
233 let offset = self.current_offset + n;
234 if offset < self.input_bytes.len() {
235 self.input_bytes[offset]
236 } else {
237 0
238 }
239 }
240
241 #[inline]
242 fn peek_char(&self) -> char {
243 let next_offset = if self.current_byte() < 128 {
244 self.current_offset + 1
245 } else {
246 self.current_offset + self.current_char().len_utf8()
247 };
248 self.input[next_offset..].chars().next().unwrap_or('\0')
249 }
250
251 fn peek_char_n(&self, n: usize) -> char {
252 let mut offset = self.current_offset;
253 for _ in 0..n {
254 if offset >= self.input.len() {
255 return '\0';
256 }
257 let c = self.input[offset..].chars().next().unwrap_or('\0');
258 offset += c.len_utf8();
259 }
260 self.input[offset..].chars().next().unwrap_or('\0')
261 }
262
263 fn next(&mut self) {
264 if self.at_eof() {
265 return;
266 }
267 if self.current_byte() < 128 {
268 self.current_offset += 1;
269 } else {
270 self.current_offset += self.current_char().len_utf8();
271 }
272 }
273
274 fn skip(&mut self, count: usize) {
275 for _ in 0..count {
276 self.next();
277 }
278 }
279
280 fn skip_whitespace(&mut self) {
281 while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
282 if self.current_byte() == b'\n' {
283 self.record_newline();
284 }
285 self.next();
286 }
287 }
288
289 fn skip_horizontal_whitespace(&mut self) {
290 while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
291 self.next();
292 }
293 }
294
295 fn record_newline(&mut self) {
296 let offset = self.current_offset;
297
298 if let Some(last) = self.last_newline_offset {
299 let between = &self.input[last + 1..offset];
300 let is_blank = between.is_empty()
301 || between
302 .chars()
303 .all(|c| c.is_ascii_whitespace() && c != '\n');
304 if is_blank {
305 self.trivia.blank_lines.push(offset as u32);
306 }
307 }
308
309 self.last_newline_offset = Some(offset);
310 }
311
312 fn at_eof(&self) -> bool {
313 self.current_offset >= self.input.len()
314 }
315
316 fn previous_char(&self) -> char {
317 if self.current_offset == 0 {
318 return '\0';
319 }
320 self.input[..self.current_offset]
321 .chars()
322 .next_back()
323 .unwrap_or('\0')
324 }
325
326 fn resync_on_error(&mut self) {
327 while !self.at_eof() {
328 let byte = self.current_byte();
329
330 if byte == b';' || byte == b'}' {
331 break;
332 }
333
334 self.next();
335 }
336 }
337
338 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
340 let start_offset = self.current_offset;
341 let current_char = self.current_char();
342 let next_char = self.peek_char();
343 let third_char = self.peek_char_n(2);
344
345 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
346 self.skip(3);
347 let end_offset = self.current_offset;
348 return Some(Token {
349 kind,
350 text: &self.input[start_offset..end_offset],
351 byte_offset: start_offset as u32,
352 byte_length: (end_offset - start_offset) as u32,
353 });
354 }
355
356 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
357 self.skip(2);
358 let end_offset = self.current_offset;
359 return Some(Token {
360 kind,
361 text: &self.input[start_offset..end_offset],
362 byte_offset: start_offset as u32,
363 byte_length: (end_offset - start_offset) as u32,
364 });
365 }
366
367 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
368 self.next();
369 let end_offset = self.current_offset;
370 return Some(Token {
371 kind,
372 text: &self.input[start_offset..end_offset],
373 byte_offset: start_offset as u32,
374 byte_length: (end_offset - start_offset) as u32,
375 });
376 }
377
378 None
379 }
380
381 fn lex_number(&mut self) -> Token<'source> {
382 let start_offset = self.current_offset;
383
384 if self.current_byte() == b'0' {
385 let next = self.peek_byte();
386 match next {
387 b'x' | b'X' => {
388 self.next(); self.next(); return self.lex_hex_number(start_offset);
391 }
392 b'o' | b'O' => {
393 self.next(); self.next(); return self.lex_octal_number(start_offset);
396 }
397 b'b' | b'B' => {
398 self.next(); self.next(); return self.lex_binary_number(start_offset);
401 }
402 b'0'..=b'7' => {
403 return self.lex_legacy_octal_number(start_offset);
404 }
405 _ => {} }
407 }
408
409 let mut kind = TokenKind::Integer;
410
411 while !self.at_eof() {
412 let byte = self.current_byte();
413 if byte.is_ascii_digit() || byte == b'_' {
414 if byte == b'_' && self.previous_char() == '_' {
415 let underscore_start = self.current_offset - 1;
416 self.error_consecutive_underscores(underscore_start);
417 }
418 self.next();
419 } else {
420 break;
421 }
422 }
423
424 if self.previous_char() == '_' {
425 self.error_number_trailing_underscore(
426 self.current_offset - self.previous_char().len_utf8(),
427 );
428 }
429
430 let preceded_by_dot = start_offset > 0
433 && self.input_bytes[start_offset - 1] == b'.'
434 && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');
435
436 if !preceded_by_dot
437 && self.current_byte() == b'.'
438 && self.peek_byte() != b'.'
439 && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
440 {
441 kind = TokenKind::Float;
442 self.next();
443
444 if self.current_byte() == b'_' {
445 self.error_decimal_leading_underscore(self.current_offset);
446 }
447
448 while !self.at_eof() {
449 let byte = self.current_byte();
450 if byte.is_ascii_digit() || byte == b'_' {
451 if byte == b'_' && self.previous_char() == '_' {
452 let underscore_start = self.current_offset - 1;
453 self.error_consecutive_underscores(underscore_start);
454 }
455 self.next();
456 } else {
457 break;
458 }
459 }
460
461 if self.previous_char() == '_' {
462 self.error_number_trailing_underscore(
463 self.current_offset - self.previous_char().len_utf8(),
464 );
465 }
466 }
467
468 if self.current_byte() == b'e' || self.current_byte() == b'E' {
469 kind = TokenKind::Float;
470 let exponent_start = self.current_offset;
471 self.next(); if self.current_byte() == b'+' || self.current_byte() == b'-' {
474 self.next();
475 }
476
477 if !self.current_byte().is_ascii_digit() {
478 self.error_missing_exponent_digits(
479 exponent_start,
480 self.current_offset - exponent_start,
481 );
482 }
483
484 while !self.at_eof() {
485 let byte = self.current_byte();
486 if byte.is_ascii_digit() || byte == b'_' {
487 if byte == b'_' && self.previous_char() == '_' {
488 let underscore_start = self.current_offset - 1;
489 self.error_consecutive_underscores(underscore_start);
490 }
491 self.next();
492 } else {
493 break;
494 }
495 }
496
497 if self.previous_char() == '_' {
498 self.error_number_trailing_underscore(
499 self.current_offset - self.previous_char().len_utf8(),
500 );
501 }
502 }
503
504 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
505 self.next(); let end_offset = self.current_offset;
507 return Token {
508 kind: TokenKind::Imaginary,
509 text: &self.input[start_offset..end_offset],
510 byte_offset: start_offset as u32,
511 byte_length: (end_offset - start_offset) as u32,
512 };
513 }
514
515 let end_offset = self.current_offset;
516 Token {
517 kind,
518 text: &self.input[start_offset..end_offset],
519 byte_offset: start_offset as u32,
520 byte_length: (end_offset - start_offset) as u32,
521 }
522 }
523
524 fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
525 let digits_start = self.current_offset;
526
527 while !self.at_eof() {
528 let byte = self.current_byte();
529 if byte.is_ascii_hexdigit() || byte == b'_' {
530 if byte == b'_' && self.previous_char() == '_' {
531 let underscore_start = self.current_offset - 1;
532 self.error_consecutive_underscores(underscore_start);
533 }
534 self.next();
535 } else {
536 break;
537 }
538 }
539
540 if self.current_offset == digits_start {
541 self.error_missing_hex_digits(start_offset, 2);
542 }
543
544 if self.previous_char() == '_' {
545 self.error_number_trailing_underscore(
546 self.current_offset - self.previous_char().len_utf8(),
547 );
548 }
549
550 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
551 self.next(); let end_offset = self.current_offset;
553 self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
554 return Token {
555 kind: TokenKind::Imaginary,
556 text: &self.input[start_offset..end_offset],
557 byte_offset: start_offset as u32,
558 byte_length: (end_offset - start_offset) as u32,
559 };
560 }
561
562 let end_offset = self.current_offset;
563 Token {
564 kind: TokenKind::Integer,
565 text: &self.input[start_offset..end_offset],
566 byte_offset: start_offset as u32,
567 byte_length: (end_offset - start_offset) as u32,
568 }
569 }
570
571 fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
572 let digits_start = self.current_offset;
573
574 while !self.at_eof() {
575 let byte = self.current_byte();
576 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
577 if byte == b'_' && self.previous_char() == '_' {
578 let underscore_start = self.current_offset - 1;
579 self.error_consecutive_underscores(underscore_start);
580 }
581 self.next();
582 } else if byte == b'8' || byte == b'9' {
583 self.error_invalid_octal_digit(self.current_offset);
584 self.next();
585 } else {
586 break;
587 }
588 }
589
590 if self.current_offset == digits_start {
591 self.error_missing_octal_digits(start_offset, 2);
592 }
593
594 if self.previous_char() == '_' {
595 self.error_number_trailing_underscore(
596 self.current_offset - self.previous_char().len_utf8(),
597 );
598 }
599
600 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
601 self.next(); let end_offset = self.current_offset;
603 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
604 return Token {
605 kind: TokenKind::Imaginary,
606 text: &self.input[start_offset..end_offset],
607 byte_offset: start_offset as u32,
608 byte_length: (end_offset - start_offset) as u32,
609 };
610 }
611
612 let end_offset = self.current_offset;
613 Token {
614 kind: TokenKind::Integer,
615 text: &self.input[start_offset..end_offset],
616 byte_offset: start_offset as u32,
617 byte_length: (end_offset - start_offset) as u32,
618 }
619 }
620
621 fn lex_legacy_octal_number(&mut self, start_offset: usize) -> Token<'source> {
622 self.next();
623
624 while !self.at_eof() {
625 let byte = self.current_byte();
626 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
627 if byte == b'_' && self.previous_char() == '_' {
628 let underscore_start = self.current_offset - 1;
629 self.error_consecutive_underscores(underscore_start);
630 }
631 self.next();
632 } else if byte == b'8' || byte == b'9' {
633 self.error_invalid_octal_digit(self.current_offset);
634 self.next();
635 } else {
636 break;
637 }
638 }
639
640 if self.previous_char() == '_' {
641 self.error_number_trailing_underscore(
642 self.current_offset - self.previous_char().len_utf8(),
643 );
644 }
645
646 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
647 self.next();
648 let end_offset = self.current_offset;
649 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
650 return Token {
651 kind: TokenKind::Imaginary,
652 text: &self.input[start_offset..end_offset],
653 byte_offset: start_offset as u32,
654 byte_length: (end_offset - start_offset) as u32,
655 };
656 }
657
658 let end_offset = self.current_offset;
659 Token {
660 kind: TokenKind::Integer,
661 text: &self.input[start_offset..end_offset],
662 byte_offset: start_offset as u32,
663 byte_length: (end_offset - start_offset) as u32,
664 }
665 }
666
667 fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
668 let digits_start = self.current_offset;
669
670 while !self.at_eof() {
671 let byte = self.current_byte();
672 if byte == b'0' || byte == b'1' || byte == b'_' {
673 if byte == b'_' && self.previous_char() == '_' {
674 let underscore_start = self.current_offset - 1;
675 self.error_consecutive_underscores(underscore_start);
676 }
677 self.next();
678 } else if (b'2'..=b'9').contains(&byte) {
679 self.error_invalid_binary_digit(self.current_offset);
680 self.next();
681 } else {
682 break;
683 }
684 }
685
686 if self.current_offset == digits_start {
687 self.error_missing_binary_digits(start_offset, 2);
688 }
689
690 if self.previous_char() == '_' {
691 self.error_number_trailing_underscore(
692 self.current_offset - self.previous_char().len_utf8(),
693 );
694 }
695
696 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
697 self.next();
698 let end_offset = self.current_offset;
699 self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
700 return Token {
701 kind: TokenKind::Imaginary,
702 text: &self.input[start_offset..end_offset],
703 byte_offset: start_offset as u32,
704 byte_length: (end_offset - start_offset) as u32,
705 };
706 }
707
708 let end_offset = self.current_offset;
709 Token {
710 kind: TokenKind::Integer,
711 text: &self.input[start_offset..end_offset],
712 byte_offset: start_offset as u32,
713 byte_length: (end_offset - start_offset) as u32,
714 }
715 }
716
717 fn lex_identifier(&mut self) -> Token<'source> {
718 let start_offset = self.current_offset;
719
720 while !self.at_eof() {
721 let c = self.current_char();
722 if c.is_alphanumeric() || c == '_' {
723 self.next();
724 } else {
725 break;
726 }
727 }
728
729 let end_offset = self.current_offset;
730 let text = &self.input[start_offset..end_offset];
731
732 let kind = match text {
733 "true" | "false" => TokenKind::Boolean,
734 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
735 };
736
737 Token {
738 kind,
739 text,
740 byte_offset: start_offset as u32,
741 byte_length: (end_offset - start_offset) as u32,
742 }
743 }
744
745 fn lex_backtick_literal(&mut self) -> Token<'source> {
746 let start_offset = self.current_offset;
747
748 self.next();
749
750 let mut terminated = false;
751
752 while !self.at_eof() {
753 let byte = self.current_byte();
754 if byte == b'`' {
755 terminated = true;
756 self.next();
757 break;
758 }
759 self.next();
760 }
761
762 let end_offset = self.current_offset;
763 let length = end_offset - start_offset;
764
765 if !terminated {
766 self.error_unterminated_backtick(start_offset, length);
767 }
768
769 Token {
770 kind: TokenKind::Backtick,
771 text: &self.input[start_offset..end_offset],
772 byte_offset: start_offset as u32,
773 byte_length: length as u32,
774 }
775 }
776
777 fn consume_unicode_escape(&mut self, escape_start: usize) {
778 if self.at_eof() || self.current_byte() != b'{' {
779 self.error_invalid_unicode_escape(escape_start, self.current_offset - escape_start);
780 return;
781 }
782 self.next();
783
784 let hex_start = self.current_offset;
785 let mut all_hex = true;
786 while !self.at_eof() {
787 let byte = self.current_byte();
788 if byte == b'}' || byte == b'"' || byte == b'\n' {
789 break;
790 }
791 if !byte.is_ascii_hexdigit() {
792 all_hex = false;
793 }
794 self.next();
795 }
796 let hex_end = self.current_offset;
797
798 let closed = !self.at_eof() && self.current_byte() == b'}';
799 if closed {
800 self.next();
801 }
802
803 let hex_len = hex_end - hex_start;
804 let total_len = self.current_offset - escape_start;
805
806 if !closed || !all_hex || hex_len == 0 || hex_len > 6 {
807 self.error_invalid_unicode_escape(escape_start, total_len);
808 return;
809 }
810
811 let codepoint = u32::from_str_radix(&self.input[hex_start..hex_end], 16)
812 .expect("hex digits validated above");
813 if char::from_u32(codepoint).is_none() {
814 self.error_unicode_escape_out_of_range(escape_start, total_len);
815 }
816 }
817
818 fn consume_octal_escape(&mut self, first_digit: u8) -> u16 {
820 let mut value: u16 = (first_digit - b'0') as u16;
821 for _ in 0..2 {
822 if self.at_eof() {
823 break;
824 }
825 match self.current_byte() {
826 d @ b'0'..=b'7' => {
827 value = value * 8 + (d - b'0') as u16;
828 self.next();
829 }
830 _ => break,
831 }
832 }
833 value
834 }
835
836 fn lex_string_literal(&mut self) -> Token<'source> {
837 let start_offset = self.current_offset;
838
839 self.next();
840
841 let mut escaped = false;
842 let mut terminated = false;
843
844 while !self.at_eof() && !terminated {
845 let byte = self.current_byte();
846 if escaped {
847 match byte {
848 b'0'..=b'7' => {
849 let escape_start = self.current_offset - 1;
850 self.next();
851 let value = self.consume_octal_escape(byte);
852 if value > 255 {
853 let escape_len = self.current_offset - escape_start;
854 self.error_octal_escape_out_of_range(escape_start, escape_len);
855 }
856 escaped = false;
857 continue;
858 }
859 b'u' => {
860 let escape_start = self.current_offset - 1;
861 self.next();
862 self.consume_unicode_escape(escape_start);
863 escaped = false;
864 continue;
865 }
866 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'"' | b'x' | b'U' => {
867 }
868 b'\'' => {}
869 _ => {
870 self.error_invalid_escape(self.current_char());
871 }
872 }
873 escaped = false;
874 } else if byte == b'\\' {
875 escaped = true;
876 } else if byte == b'"' {
877 terminated = true;
878 self.next();
879 break;
880 }
881
882 self.next();
883 }
884
885 let end_offset = self.current_offset;
886 let length = end_offset - start_offset;
887
888 if escaped {
889 self.error_unterminated_escape(start_offset);
890 }
891
892 if !terminated {
893 self.error_unterminated_string(start_offset, 1);
894 }
895
896 Token {
897 kind: TokenKind::String,
898 text: &self.input[start_offset..end_offset],
899 byte_offset: start_offset as u32,
900 byte_length: length as u32,
901 }
902 }
903
904 fn lex_raw_string_literal(&mut self) -> Token<'source> {
905 let start_offset = self.current_offset;
906 self.next(); self.next(); let mut terminated = false;
910 while !self.at_eof() {
911 let byte = self.current_byte();
912 if byte == b'"' {
913 terminated = true;
914 self.next();
915 break;
916 } else if byte == 0 {
917 self.error_disallowed_byte_in_raw_string(self.current_offset, byte);
918 self.next();
919 continue;
920 }
921 self.next();
922 }
923
924 let end_offset = self.current_offset;
925 let length = end_offset - start_offset;
926
927 if !terminated {
928 self.error_unterminated_raw_string(start_offset, 2);
929 }
930
931 Token {
932 kind: TokenKind::RawString,
933 text: &self.input[start_offset..end_offset],
934 byte_offset: start_offset as u32,
935 byte_length: length as u32,
936 }
937 }
938
939 fn try_consume_unsupported_raw_variant(&mut self, end: usize) -> bool {
940 let raw_format_prefix = if self.current_byte() == b'r'
941 && self.peek_byte() == b'f'
942 && self.peek_byte_at(2) == b'"'
943 {
944 Some("rf")
945 } else if self.current_byte() == b'f'
946 && self.peek_byte() == b'r'
947 && self.peek_byte_at(2) == b'"'
948 {
949 Some("fr")
950 } else {
951 None
952 };
953 if let Some(prefix) = raw_format_prefix {
954 let start = self.current_offset;
955 self.skip(3);
956 while self.current_offset < end
957 && self.current_byte() != b'"'
958 && self.current_byte() != b'\n'
959 {
960 self.next();
961 }
962 if self.current_offset < end && self.current_byte() == b'"' {
963 self.next();
964 }
965 let length = self.current_offset - start;
966 self.error_unsupported_raw_format_string(start, length, prefix);
967 return true;
968 }
969
970 if self.current_byte() == b'r' && self.peek_byte() == b'#' {
971 let mut hash_count = 0usize;
972 let mut probe = self.current_offset + 1;
973 while probe < self.input_bytes.len() && self.input_bytes[probe] == b'#' {
974 hash_count += 1;
975 probe += 1;
976 }
977 if hash_count > 0 && probe < self.input_bytes.len() && self.input_bytes[probe] == b'"' {
978 let start = self.current_offset;
979 self.skip(1 + hash_count + 1);
980 loop {
981 if self.current_offset >= end || self.current_byte() == b'\n' {
982 break;
983 }
984 if self.current_byte() == b'"' {
985 let mut closer_matches = true;
986 for i in 1..=hash_count {
987 if self.peek_byte_at(i) != b'#' {
988 closer_matches = false;
989 break;
990 }
991 }
992 if closer_matches {
993 self.skip(1 + hash_count);
994 break;
995 }
996 }
997 self.next();
998 }
999 let length = self.current_offset - start;
1000 self.error_unsupported_hash_delimited_raw_string(start, length);
1001 return true;
1002 }
1003 }
1004
1005 false
1006 }
1007
1008 fn push_format_string_text_if_needed(
1009 &self,
1010 tokens: &mut Vec<Token<'source>>,
1011 text_segment_start: usize,
1012 ) {
1013 if text_segment_start < self.current_offset {
1014 tokens.push(Token {
1015 kind: TokenKind::FormatStringText,
1016 text: &self.input[text_segment_start..self.current_offset],
1017 byte_offset: text_segment_start as u32,
1018 byte_length: (self.current_offset - text_segment_start) as u32,
1019 });
1020 }
1021 }
1022
1023 fn lex_format_string_interpolation(
1024 &mut self,
1025 tokens: &mut Vec<Token<'source>>,
1026 ) -> Result<(), ()> {
1027 let interp_start = self.current_offset;
1028 self.next();
1029
1030 tokens.push(Token {
1031 kind: TokenKind::FormatStringInterpolationStart,
1032 text: &self.input[interp_start..self.current_offset],
1033 byte_offset: interp_start as u32,
1034 byte_length: (self.current_offset - interp_start) as u32,
1035 });
1036
1037 let Some(interpolation_end) = self.find_interpolation_boundary() else {
1038 if self.has_newline_between(interp_start, self.input.len()) {
1039 self.error_multiline_format_string_interpolation(interp_start);
1040 } else {
1041 self.error_unclosed_brace_in_format_string(interp_start);
1042 }
1043 self.skip_to_format_string_end();
1044 return Err(());
1045 };
1046
1047 if self.has_newline_between(interp_start, interpolation_end) {
1048 self.error_multiline_format_string_interpolation(interp_start);
1049 }
1050
1051 while self.current_offset < interpolation_end {
1052 self.skip_horizontal_whitespace();
1053 if self.current_offset >= interpolation_end {
1054 break;
1055 }
1056
1057 if self.try_consume_unsupported_raw_variant(interpolation_end) {
1058 continue;
1059 }
1060
1061 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
1062 let mut fstring_tokens = self.lex_format_string_tokens();
1063 tokens.append(&mut fstring_tokens);
1064 } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
1065 self.error_escaped_quote_in_interpolation(self.current_offset);
1066 self.skip(2);
1067 } else if self.current_byte() == b'r' && self.peek_byte() == b'"' {
1068 self.error_raw_string_in_interpolation(self.current_offset);
1069 self.skip(2);
1070 while self.current_offset < interpolation_end
1071 && self.current_byte() != b'"'
1072 && self.current_byte() != b'\n'
1073 {
1074 self.next();
1075 }
1076 if self.current_offset < interpolation_end && self.current_byte() == b'"' {
1077 self.next();
1078 }
1079 } else {
1080 let token = self.create_token();
1081 tokens.push(token);
1082 }
1083 }
1084
1085 let close_offset = self.current_offset;
1086 self.next();
1087 tokens.push(Token {
1088 kind: TokenKind::FormatStringInterpolationEnd,
1089 text: &self.input[close_offset..self.current_offset],
1090 byte_offset: close_offset as u32,
1091 byte_length: (self.current_offset - close_offset) as u32,
1092 });
1093
1094 Ok(())
1095 }
1096
1097 fn scan_interpolation(&self, start: usize) -> Option<usize> {
1098 let bytes = self.input.as_bytes();
1099 let mut p = start;
1100 let mut depth = 1;
1101
1102 while p < bytes.len() && depth > 0 {
1103 match bytes[p] {
1104 b'{' => {
1105 depth += 1;
1106 p += 1;
1107 }
1108 b'}' => {
1109 depth -= 1;
1110 if depth > 0 {
1111 p += 1;
1112 }
1113 }
1114 b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
1115 b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
1116 p = self.scan_past_fstring(p)?;
1117 }
1118 b'\\' => p += 2,
1119 b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
1120 b'\n' => return None,
1121 _ => p += 1,
1122 }
1123 }
1124
1125 (depth == 0).then_some(p)
1126 }
1127
1128 fn find_interpolation_boundary(&self) -> Option<usize> {
1129 self.scan_interpolation(self.current_offset)
1130 }
1131
1132 fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
1133 let bytes = self.input.as_bytes();
1134 let mut p = start + 1;
1135 while p < bytes.len() {
1136 match bytes[p] {
1137 b'\\' if delimiter != b'`' => p += 2,
1138 b'\n' => return None,
1139 b if b == delimiter => return Some(p + 1),
1140 _ => p += 1,
1141 }
1142 }
1143 None
1144 }
1145
1146 fn scan_past_fstring(&self, position: usize) -> Option<usize> {
1147 let bytes = self.input.as_bytes();
1148 let mut p = position + 2; while p < bytes.len() {
1150 match bytes[p] {
1151 b'\\' => p += 2,
1152 b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
1153 b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
1154 b'{' => {
1155 p = self.scan_interpolation(p + 1)?;
1156 p += 1;
1157 }
1158 b'"' => return Some(p + 1),
1159 b'\n' => return None,
1160 _ => p += 1,
1161 }
1162 }
1163 None
1164 }
1165
1166 fn skip_to_format_string_end(&mut self) {
1171 let mut depth = 1;
1172 while !self.at_eof() {
1173 match self.current_byte() {
1174 b'\\' => {
1175 self.next();
1176 if !self.at_eof() {
1177 self.next();
1178 }
1179 }
1180 b'"' if depth == 0 => {
1181 self.next();
1182 return;
1183 }
1184 b'"' => {
1185 self.next();
1186 while !self.at_eof() && self.current_byte() != b'"' {
1187 if self.current_byte() == b'\\' {
1188 self.next();
1189 if self.at_eof() {
1190 break;
1191 }
1192 }
1193 self.next();
1194 }
1195 if !self.at_eof() {
1196 self.next();
1197 }
1198 }
1199 b'{' => {
1200 depth += 1;
1201 self.next();
1202 }
1203 b'}' => {
1204 if depth > 0 {
1205 depth -= 1;
1206 }
1207 self.next();
1208 }
1209 _ => self.next(),
1210 }
1211 }
1212 }
1213
1214 fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
1215 let start_offset = self.current_offset;
1216 let mut tokens = Vec::new();
1217
1218 self.skip(2);
1219
1220 let fstring_start_end = self.current_offset;
1221 tokens.push(Token {
1222 kind: TokenKind::FormatStringStart,
1223 text: &self.input[start_offset..fstring_start_end],
1224 byte_offset: start_offset as u32,
1225 byte_length: (fstring_start_end - start_offset) as u32,
1226 });
1227
1228 let mut text_segment_start = self.current_offset;
1229
1230 while !self.at_eof() {
1231 let byte = self.current_byte();
1232
1233 match byte {
1234 b'\\' if !self.at_eof() => {
1235 let escape_start = self.current_offset;
1236 self.next();
1237 if !self.at_eof() {
1238 let b = self.current_byte();
1239 self.next();
1240 if matches!(b, b'0'..=b'7') {
1241 let value = self.consume_octal_escape(b);
1242 if value > 255 {
1243 let escape_len = self.current_offset - escape_start;
1244 self.error_octal_escape_out_of_range(escape_start, escape_len);
1245 }
1246 } else if b == b'u' {
1247 self.consume_unicode_escape(escape_start);
1248 }
1249 }
1250 }
1251 b'{' if self.peek_byte() == b'{' => {
1252 self.skip(2);
1253 }
1254 b'}' if self.peek_byte() == b'}' => {
1255 self.skip(2);
1256 }
1257 b'"' => {
1258 self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
1259
1260 let end_offset = self.current_offset;
1261 self.next();
1262
1263 tokens.push(Token {
1264 kind: TokenKind::FormatStringEnd,
1265 text: &self.input[end_offset..self.current_offset],
1266 byte_offset: end_offset as u32,
1267 byte_length: (self.current_offset - end_offset) as u32,
1268 });
1269 return tokens;
1270 }
1271
1272 b'{' => {
1273 self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
1274
1275 if self.lex_format_string_interpolation(&mut tokens).is_err() {
1276 return tokens;
1277 }
1278 text_segment_start = self.current_offset;
1279 }
1280 b'}' => {
1281 self.error_unmatched_brace_in_format_string(self.current_offset);
1282 self.next();
1283 }
1284 _ => {
1285 self.next();
1286 }
1287 }
1288 }
1289
1290 self.error_unterminated_format_string(start_offset, 2);
1291 tokens
1292 }
1293
1294 fn lex_char(&mut self) -> Token<'source> {
1295 let start_offset = self.current_offset;
1296
1297 self.next();
1298
1299 if self.at_eof() || self.current_byte() == b'\'' {
1300 self.error_empty_rune_literal(start_offset);
1301 let end_offset = self.current_offset;
1302 return Token {
1303 kind: TokenKind::Char,
1304 text: &self.input[start_offset..end_offset],
1305 byte_offset: start_offset as u32,
1306 byte_length: (end_offset - start_offset) as u32,
1307 };
1308 }
1309
1310 if self.current_byte() != b'\\' {
1311 self.next();
1312 } else {
1313 self.next();
1314
1315 if self.at_eof() {
1316 self.error_unterminated_escape(start_offset);
1317 let end_offset = self.current_offset;
1318 return Token {
1319 kind: TokenKind::Char,
1320 text: &self.input[start_offset..end_offset],
1321 byte_offset: start_offset as u32,
1322 byte_length: (end_offset - start_offset) as u32,
1323 };
1324 }
1325
1326 match self.current_byte() {
1327 b'0'..=b'7' => {
1328 let escape_start = self.current_offset - 1;
1329 let first = self.current_byte();
1330 self.next();
1331 let value = self.consume_octal_escape(first);
1332 if value > 255 {
1333 let escape_len = self.current_offset - escape_start;
1334 self.error_octal_escape_out_of_range(escape_start, escape_len);
1335 }
1336 }
1337 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'\'' | b'x' => {
1338 self.next();
1339 }
1340 _ => {
1341 self.error_invalid_escape(self.current_char());
1342
1343 while !self.at_eof() && self.current_byte() != b'\'' {
1344 self.next();
1345 }
1346
1347 if !self.at_eof() && self.current_byte() == b'\'' {
1348 self.next();
1349 }
1350
1351 let end_offset = self.current_offset;
1352 return Token {
1353 kind: TokenKind::Char,
1354 text: &self.input[start_offset..end_offset],
1355 byte_offset: start_offset as u32,
1356 byte_length: (end_offset - start_offset) as u32,
1357 };
1358 }
1359 }
1360 }
1361
1362 if self.at_eof() || self.current_byte() != b'\'' {
1363 let length = self.current_offset - start_offset;
1364 self.error_unterminated_rune(start_offset, length);
1365 }
1366
1367 if !self.at_eof() && self.current_byte() == b'\'' {
1368 self.next();
1369 }
1370
1371 let end_offset = self.current_offset;
1372 Token {
1373 kind: TokenKind::Char,
1374 text: &self.input[start_offset..end_offset],
1375 byte_offset: start_offset as u32,
1376 byte_length: (end_offset - start_offset) as u32,
1377 }
1378 }
1379
1380 fn lex_slash(&mut self) -> Token<'source> {
1381 let start_offset = self.current_offset;
1382
1383 if self.peek_byte() != b'/' {
1384 self.next();
1385 return Token {
1386 kind: TokenKind::Slash,
1387 text: &self.input[start_offset..self.current_offset],
1388 byte_offset: start_offset as u32,
1389 byte_length: 1,
1390 };
1391 }
1392
1393 let slash_count = self.count_consecutive(b'/');
1394
1395 if slash_count >= 4 {
1396 self.error_excess_slashes_in_comment(start_offset, slash_count);
1397 }
1398
1399 self.skip(slash_count);
1400
1401 if slash_count == 3 {
1402 if self.current_byte() == b' ' {
1403 self.next();
1404 }
1405 let text_start = self.current_offset;
1406 self.skip_to_eol();
1407 let end_offset = self.current_offset;
1408
1409 self.trivia
1410 .doc_comments
1411 .push((start_offset as u32, end_offset as u32));
1412
1413 return Token {
1414 kind: TokenKind::DocComment,
1415 text: &self.input[text_start..end_offset],
1416 byte_offset: start_offset as u32,
1417 byte_length: (end_offset - start_offset) as u32,
1418 };
1419 }
1420
1421 self.skip_to_eol();
1422 let end_offset = self.current_offset;
1423
1424 self.trivia
1425 .comments
1426 .push((start_offset as u32, end_offset as u32));
1427
1428 Token {
1429 kind: TokenKind::Comment,
1430 text: &self.input[start_offset..end_offset],
1431 byte_offset: start_offset as u32,
1432 byte_length: (end_offset - start_offset) as u32,
1433 }
1434 }
1435
1436 fn count_consecutive(&self, byte: u8) -> usize {
1437 let mut count = 0;
1438 let mut offset = self.current_offset;
1439 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1440 count += 1;
1441 offset += 1;
1442 }
1443 count
1444 }
1445
1446 fn skip_to_eol(&mut self) {
1447 while !self.at_eof() && self.current_byte() != b'\n' {
1448 self.next();
1449 }
1450 }
1451
1452 fn lex_directive(&mut self) -> Token<'source> {
1453 let start_offset = self.current_offset;
1454
1455 self.next();
1456
1457 while !self.at_eof() {
1458 let byte = self.current_byte();
1459 if byte.is_ascii_alphanumeric() || byte == b'_' {
1460 self.next();
1461 } else {
1462 break;
1463 }
1464 }
1465
1466 let end_offset = self.current_offset;
1467 Token {
1468 kind: TokenKind::Directive,
1469 text: &self.input[start_offset..end_offset],
1470 byte_offset: start_offset as u32,
1471 byte_length: (end_offset - start_offset) as u32,
1472 }
1473 }
1474
1475 fn handle_unexpected_char(&mut self) -> Token<'source> {
1476 let start_offset = self.current_offset;
1477
1478 self.error_unexpected_char(self.current_offset, self.current_char());
1479
1480 self.resync_on_error();
1481
1482 let end_offset = self.current_offset;
1483
1484 Token {
1485 kind: TokenKind::Error,
1486 text: &self.input[start_offset..end_offset],
1487 byte_offset: start_offset as u32,
1488 byte_length: (end_offset - start_offset) as u32,
1489 }
1490 }
1491
1492 fn eof_token(&self) -> Token<'source> {
1493 Token {
1494 kind: TokenKind::EOF,
1495 text: &self.input[self.current_offset..self.current_offset],
1496 byte_offset: self.current_offset as u32,
1497 byte_length: 0,
1498 }
1499 }
1500
1501 fn semicolon_token(&mut self) -> Token<'source> {
1502 let start_offset = self.current_offset;
1503
1504 self.next();
1505
1506 Token {
1507 kind: TokenKind::Semicolon,
1508 text: &self.input[start_offset..self.current_offset],
1509 byte_offset: start_offset as u32,
1510 byte_length: (self.current_offset - start_offset) as u32,
1511 }
1512 }
1513}