1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
10pub struct Lexer<'source> {
11 input: &'source str,
12 input_bytes: &'source [u8],
13 current_offset: usize,
14 file_id: u32,
15 errors: Vec<ParseError>,
16 pending_tokens: Vec<Token<'source>>,
17 trivia: Trivia,
18 last_newline_offset: Option<usize>,
19}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
35 pub fn lex(mut self) -> LexResult<'source> {
36 let mut tokens = Vec::new();
37
38 loop {
39 if let Some(token) = self.pending_tokens.pop() {
40 tokens.push(token);
41 continue;
42 }
43
44 self.skip_whitespace();
45
46 if self.at_eof() {
47 tokens.push(self.eof_token());
48 break;
49 }
50
51 if self.try_consume_unsupported_raw_variant(self.input.len()) {
52 continue;
53 }
54
55 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
56 let mut fstring_tokens = self.lex_format_string_tokens();
57 fstring_tokens.reverse();
58 self.pending_tokens = fstring_tokens;
59 continue;
60 }
61
62 let token = self.create_token();
63 tokens.push(token);
64 }
65
66 let tokens = self.insert_semicolons(tokens);
67
68 LexResult {
69 tokens,
70 errors: self.errors,
71 trivia: self.trivia,
72 }
73 }
74
75 fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
76 let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);
77
78 for i in 0..tokens.len() {
79 let token = tokens[i];
80 result.push(token);
81
82 if !Self::triggers_asi(token.kind) {
83 continue;
84 }
85
86 if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
87 if Self::continues_expression(next_token.kind) {
88 continue;
89 }
90
91 let token_end = (token.byte_offset + token.byte_length) as usize;
92 if self.has_newline_between(token_end, next_token.byte_offset as usize) {
93 result.push(self.make_synthetic_semicolon(token_end));
94 }
95 }
96 }
97
98 result
99 }
100
101 fn triggers_asi(kind: TokenKind) -> bool {
102 matches!(
103 kind,
104 TokenKind::Identifier
105 | TokenKind::Integer
106 | TokenKind::Imaginary
107 | TokenKind::Float
108 | TokenKind::String
109 | TokenKind::RawString
110 | TokenKind::Char
111 | TokenKind::Boolean
112 | TokenKind::RightParen
113 | TokenKind::RightSquareBracket
114 | TokenKind::RightCurlyBrace
115 | TokenKind::Break
116 | TokenKind::Continue
117 | TokenKind::Return
118 | TokenKind::DotDot
119 | TokenKind::DotDotEqual
120 | TokenKind::QuestionMark
121 )
122 }
123
124 fn continues_expression(kind: TokenKind) -> bool {
125 matches!(
126 kind,
127 TokenKind::Plus
128 | TokenKind::Star
129 | TokenKind::Slash
130 | TokenKind::Percent
131 | TokenKind::Pipeline
132 | TokenKind::AmpersandDouble
133 | TokenKind::PipeDouble
134 | TokenKind::EqualDouble
135 | TokenKind::NotEqual
136 | TokenKind::LeftAngleBracket
137 | TokenKind::RightAngleBracket
138 | TokenKind::LessThanOrEqual
139 | TokenKind::GreaterThanOrEqual
140 | TokenKind::Dot
141 | TokenKind::Equal
142 | TokenKind::PlusEqual
143 | TokenKind::MinusEqual
144 | TokenKind::StarEqual
145 | TokenKind::SlashEqual
146 | TokenKind::AmpersandEqual
147 | TokenKind::PipeEqual
148 | TokenKind::CaretEqual
149 | TokenKind::AndNotEqual
150 | TokenKind::ShiftLeftEqual
151 | TokenKind::ShiftRightEqual
152 | TokenKind::Else
153 | TokenKind::LeftCurlyBrace
154 | TokenKind::RightCurlyBrace
155 | TokenKind::RightParen
156 | TokenKind::RightSquareBracket
157 | TokenKind::As
158 )
159 }
160
161 fn find_next_non_comment_token<'a>(
162 &self,
163 tokens: &'a [Token<'source>],
164 start_index: usize,
165 ) -> Option<&'a Token<'source>> {
166 tokens
167 .iter()
168 .skip(start_index)
169 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
170 }
171
172 fn has_newline_between(&self, start: usize, end: usize) -> bool {
173 self.input[start..end].contains('\n')
174 }
175
176 fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
177 Token {
178 kind: TokenKind::Semicolon,
179 text: "",
180 byte_offset: position as u32,
181 byte_length: 0,
182 }
183 }
184
185 fn create_token(&mut self) -> Token<'source> {
186 if let Some(token) = self.lex_lookahead_symbol() {
187 return token;
188 }
189
190 let c = self.current_char();
191 match c {
192 '0'..='9' => self.lex_number(),
193 'r' if self.peek_byte() == b'"' => self.lex_raw_string_literal(),
194 _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
195 '"' => self.lex_string_literal(),
196 '`' => self.lex_backtick_literal(),
197 '\'' => self.lex_char(),
198 '/' => self.lex_slash(),
199 ';' => self.semicolon_token(),
200 '@' => self.lex_directive(),
201 _ => self.handle_unexpected_char(),
202 }
203 }
204
205 #[inline]
206 fn current_byte(&self) -> u8 {
207 if self.current_offset < self.input_bytes.len() {
208 self.input_bytes[self.current_offset]
209 } else {
210 0
211 }
212 }
213
214 #[inline]
215 fn current_char(&self) -> char {
216 self.input[self.current_offset..]
217 .chars()
218 .next()
219 .unwrap_or('\0')
220 }
221
222 #[inline]
223 fn peek_byte(&self) -> u8 {
224 if self.current_offset + 1 < self.input_bytes.len() {
225 self.input_bytes[self.current_offset + 1]
226 } else {
227 0
228 }
229 }
230
231 #[inline]
232 fn peek_byte_at(&self, n: usize) -> u8 {
233 let offset = self.current_offset + n;
234 if offset < self.input_bytes.len() {
235 self.input_bytes[offset]
236 } else {
237 0
238 }
239 }
240
241 #[inline]
242 fn peek_char(&self) -> char {
243 let next_offset = if self.current_byte() < 128 {
244 self.current_offset + 1
245 } else {
246 self.current_offset + self.current_char().len_utf8()
247 };
248 self.input[next_offset..].chars().next().unwrap_or('\0')
249 }
250
251 fn peek_char_n(&self, n: usize) -> char {
252 let mut offset = self.current_offset;
253 for _ in 0..n {
254 if offset >= self.input.len() {
255 return '\0';
256 }
257 let c = self.input[offset..].chars().next().unwrap_or('\0');
258 offset += c.len_utf8();
259 }
260 self.input[offset..].chars().next().unwrap_or('\0')
261 }
262
263 fn next(&mut self) {
264 if self.at_eof() {
265 return;
266 }
267 if self.current_byte() < 128 {
268 self.current_offset += 1;
269 } else {
270 self.current_offset += self.current_char().len_utf8();
271 }
272 }
273
274 fn skip(&mut self, count: usize) {
275 for _ in 0..count {
276 self.next();
277 }
278 }
279
280 fn skip_whitespace(&mut self) {
281 while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
282 if self.current_byte() == b'\n' {
283 self.record_newline();
284 }
285 self.next();
286 }
287 }
288
289 fn skip_horizontal_whitespace(&mut self) {
290 while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
291 self.next();
292 }
293 }
294
295 fn record_newline(&mut self) {
296 let offset = self.current_offset;
297
298 if let Some(last) = self.last_newline_offset {
299 let between = &self.input[last + 1..offset];
300 let is_blank = between.is_empty()
301 || between
302 .chars()
303 .all(|c| c.is_ascii_whitespace() && c != '\n');
304 if is_blank {
305 self.trivia.blank_lines.push(offset as u32);
306 }
307 }
308
309 self.last_newline_offset = Some(offset);
310 }
311
312 fn at_eof(&self) -> bool {
313 self.current_offset >= self.input.len()
314 }
315
316 fn previous_char(&self) -> char {
317 if self.current_offset == 0 {
318 return '\0';
319 }
320 self.input[..self.current_offset]
321 .chars()
322 .next_back()
323 .unwrap_or('\0')
324 }
325
326 fn resync_on_error(&mut self) {
327 while !self.at_eof() {
328 let byte = self.current_byte();
329
330 if byte == b';' || byte == b'}' {
331 break;
332 }
333
334 self.next();
335 }
336 }
337
338 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
340 let start_offset = self.current_offset;
341 let current_char = self.current_char();
342 let next_char = self.peek_char();
343 let third_char = self.peek_char_n(2);
344
345 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
346 self.skip(3);
347 let end_offset = self.current_offset;
348 return Some(Token {
349 kind,
350 text: &self.input[start_offset..end_offset],
351 byte_offset: start_offset as u32,
352 byte_length: (end_offset - start_offset) as u32,
353 });
354 }
355
356 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
357 self.skip(2);
358 let end_offset = self.current_offset;
359 return Some(Token {
360 kind,
361 text: &self.input[start_offset..end_offset],
362 byte_offset: start_offset as u32,
363 byte_length: (end_offset - start_offset) as u32,
364 });
365 }
366
367 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
368 self.next();
369 let end_offset = self.current_offset;
370 return Some(Token {
371 kind,
372 text: &self.input[start_offset..end_offset],
373 byte_offset: start_offset as u32,
374 byte_length: (end_offset - start_offset) as u32,
375 });
376 }
377
378 None
379 }
380
381 fn lex_number(&mut self) -> Token<'source> {
382 let start_offset = self.current_offset;
383
384 if self.current_byte() == b'0' {
385 let next = self.peek_byte();
386 match next {
387 b'x' | b'X' => {
388 self.next(); self.next(); return self.lex_hex_number(start_offset);
391 }
392 b'o' | b'O' => {
393 self.next(); self.next(); return self.lex_octal_number(start_offset);
396 }
397 b'b' | b'B' => {
398 self.next(); self.next(); return self.lex_binary_number(start_offset);
401 }
402 _ => {} }
404 }
405
406 let mut kind = TokenKind::Integer;
407
408 while !self.at_eof() {
409 let byte = self.current_byte();
410 if byte.is_ascii_digit() || byte == b'_' {
411 if byte == b'_' && self.previous_char() == '_' {
412 let underscore_start = self.current_offset - 1;
413 self.error_consecutive_underscores(underscore_start);
414 }
415 self.next();
416 } else {
417 break;
418 }
419 }
420
421 if self.previous_char() == '_' {
422 self.error_number_trailing_underscore(
423 self.current_offset - self.previous_char().len_utf8(),
424 );
425 }
426
427 let preceded_by_dot = start_offset > 0
430 && self.input_bytes[start_offset - 1] == b'.'
431 && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');
432
433 if !preceded_by_dot
434 && self.current_byte() == b'.'
435 && self.peek_byte() != b'.'
436 && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
437 {
438 kind = TokenKind::Float;
439 self.next();
440
441 if self.current_byte() == b'_' {
442 self.error_decimal_leading_underscore(self.current_offset);
443 }
444
445 while !self.at_eof() {
446 let byte = self.current_byte();
447 if byte.is_ascii_digit() || byte == b'_' {
448 if byte == b'_' && self.previous_char() == '_' {
449 let underscore_start = self.current_offset - 1;
450 self.error_consecutive_underscores(underscore_start);
451 }
452 self.next();
453 } else {
454 break;
455 }
456 }
457
458 if self.previous_char() == '_' {
459 self.error_number_trailing_underscore(
460 self.current_offset - self.previous_char().len_utf8(),
461 );
462 }
463 }
464
465 if self.current_byte() == b'e' || self.current_byte() == b'E' {
466 kind = TokenKind::Float;
467 let exponent_start = self.current_offset;
468 self.next(); if self.current_byte() == b'+' || self.current_byte() == b'-' {
471 self.next();
472 }
473
474 if !self.current_byte().is_ascii_digit() {
475 self.error_missing_exponent_digits(
476 exponent_start,
477 self.current_offset - exponent_start,
478 );
479 }
480
481 while !self.at_eof() {
482 let byte = self.current_byte();
483 if byte.is_ascii_digit() || byte == b'_' {
484 if byte == b'_' && self.previous_char() == '_' {
485 let underscore_start = self.current_offset - 1;
486 self.error_consecutive_underscores(underscore_start);
487 }
488 self.next();
489 } else {
490 break;
491 }
492 }
493
494 if self.previous_char() == '_' {
495 self.error_number_trailing_underscore(
496 self.current_offset - self.previous_char().len_utf8(),
497 );
498 }
499 }
500
501 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
502 self.next(); let end_offset = self.current_offset;
504 return Token {
505 kind: TokenKind::Imaginary,
506 text: &self.input[start_offset..end_offset],
507 byte_offset: start_offset as u32,
508 byte_length: (end_offset - start_offset) as u32,
509 };
510 }
511
512 let end_offset = self.current_offset;
513 Token {
514 kind,
515 text: &self.input[start_offset..end_offset],
516 byte_offset: start_offset as u32,
517 byte_length: (end_offset - start_offset) as u32,
518 }
519 }
520
521 fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
522 let digits_start = self.current_offset;
523
524 while !self.at_eof() {
525 let byte = self.current_byte();
526 if byte.is_ascii_hexdigit() || byte == b'_' {
527 if byte == b'_' && self.previous_char() == '_' {
528 let underscore_start = self.current_offset - 1;
529 self.error_consecutive_underscores(underscore_start);
530 }
531 self.next();
532 } else {
533 break;
534 }
535 }
536
537 if self.current_offset == digits_start {
538 self.error_missing_hex_digits(start_offset, 2);
539 }
540
541 if self.previous_char() == '_' {
542 self.error_number_trailing_underscore(
543 self.current_offset - self.previous_char().len_utf8(),
544 );
545 }
546
547 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
548 self.next(); let end_offset = self.current_offset;
550 self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
551 return Token {
552 kind: TokenKind::Imaginary,
553 text: &self.input[start_offset..end_offset],
554 byte_offset: start_offset as u32,
555 byte_length: (end_offset - start_offset) as u32,
556 };
557 }
558
559 let end_offset = self.current_offset;
560 Token {
561 kind: TokenKind::Integer,
562 text: &self.input[start_offset..end_offset],
563 byte_offset: start_offset as u32,
564 byte_length: (end_offset - start_offset) as u32,
565 }
566 }
567
568 fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
569 let digits_start = self.current_offset;
570
571 while !self.at_eof() {
572 let byte = self.current_byte();
573 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
574 if byte == b'_' && self.previous_char() == '_' {
575 let underscore_start = self.current_offset - 1;
576 self.error_consecutive_underscores(underscore_start);
577 }
578 self.next();
579 } else if byte == b'8' || byte == b'9' {
580 self.error_invalid_octal_digit(self.current_offset);
581 self.next();
582 } else {
583 break;
584 }
585 }
586
587 if self.current_offset == digits_start {
588 self.error_missing_octal_digits(start_offset, 2);
589 }
590
591 if self.previous_char() == '_' {
592 self.error_number_trailing_underscore(
593 self.current_offset - self.previous_char().len_utf8(),
594 );
595 }
596
597 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
598 self.next(); let end_offset = self.current_offset;
600 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
601 return Token {
602 kind: TokenKind::Imaginary,
603 text: &self.input[start_offset..end_offset],
604 byte_offset: start_offset as u32,
605 byte_length: (end_offset - start_offset) as u32,
606 };
607 }
608
609 let end_offset = self.current_offset;
610 Token {
611 kind: TokenKind::Integer,
612 text: &self.input[start_offset..end_offset],
613 byte_offset: start_offset as u32,
614 byte_length: (end_offset - start_offset) as u32,
615 }
616 }
617
618 fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
619 let digits_start = self.current_offset;
620
621 while !self.at_eof() {
622 let byte = self.current_byte();
623 if byte == b'0' || byte == b'1' || byte == b'_' {
624 if byte == b'_' && self.previous_char() == '_' {
625 let underscore_start = self.current_offset - 1;
626 self.error_consecutive_underscores(underscore_start);
627 }
628 self.next();
629 } else if (b'2'..=b'9').contains(&byte) {
630 self.error_invalid_binary_digit(self.current_offset);
631 self.next();
632 } else {
633 break;
634 }
635 }
636
637 if self.current_offset == digits_start {
638 self.error_missing_binary_digits(start_offset, 2);
639 }
640
641 if self.previous_char() == '_' {
642 self.error_number_trailing_underscore(
643 self.current_offset - self.previous_char().len_utf8(),
644 );
645 }
646
647 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
648 self.next();
649 let end_offset = self.current_offset;
650 self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
651 return Token {
652 kind: TokenKind::Imaginary,
653 text: &self.input[start_offset..end_offset],
654 byte_offset: start_offset as u32,
655 byte_length: (end_offset - start_offset) as u32,
656 };
657 }
658
659 let end_offset = self.current_offset;
660 Token {
661 kind: TokenKind::Integer,
662 text: &self.input[start_offset..end_offset],
663 byte_offset: start_offset as u32,
664 byte_length: (end_offset - start_offset) as u32,
665 }
666 }
667
668 fn lex_identifier(&mut self) -> Token<'source> {
669 let start_offset = self.current_offset;
670
671 while !self.at_eof() {
672 let c = self.current_char();
673 if c.is_alphanumeric() || c == '_' {
674 self.next();
675 } else {
676 break;
677 }
678 }
679
680 let end_offset = self.current_offset;
681 let text = &self.input[start_offset..end_offset];
682
683 let kind = match text {
684 "true" | "false" => TokenKind::Boolean,
685 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
686 };
687
688 Token {
689 kind,
690 text,
691 byte_offset: start_offset as u32,
692 byte_length: (end_offset - start_offset) as u32,
693 }
694 }
695
696 fn lex_backtick_literal(&mut self) -> Token<'source> {
697 let start_offset = self.current_offset;
698
699 self.next();
700
701 let mut terminated = false;
702
703 while !self.at_eof() {
704 let byte = self.current_byte();
705 if byte == b'`' {
706 terminated = true;
707 self.next();
708 break;
709 }
710 self.next();
711 }
712
713 let end_offset = self.current_offset;
714 let length = end_offset - start_offset;
715
716 if !terminated {
717 self.error_unterminated_backtick(start_offset, length);
718 }
719
720 Token {
721 kind: TokenKind::Backtick,
722 text: &self.input[start_offset..end_offset],
723 byte_offset: start_offset as u32,
724 byte_length: length as u32,
725 }
726 }
727
728 fn consume_unicode_escape(&mut self, escape_start: usize) {
729 if self.at_eof() || self.current_byte() != b'{' {
730 self.error_invalid_unicode_escape(escape_start, self.current_offset - escape_start);
731 return;
732 }
733 self.next();
734
735 let hex_start = self.current_offset;
736 let mut all_hex = true;
737 while !self.at_eof() {
738 let byte = self.current_byte();
739 if byte == b'}' || byte == b'"' || byte == b'\n' {
740 break;
741 }
742 if !byte.is_ascii_hexdigit() {
743 all_hex = false;
744 }
745 self.next();
746 }
747 let hex_end = self.current_offset;
748
749 let closed = !self.at_eof() && self.current_byte() == b'}';
750 if closed {
751 self.next();
752 }
753
754 let hex_len = hex_end - hex_start;
755 let total_len = self.current_offset - escape_start;
756
757 if !closed || !all_hex || hex_len == 0 || hex_len > 6 {
758 self.error_invalid_unicode_escape(escape_start, total_len);
759 return;
760 }
761
762 let codepoint = u32::from_str_radix(&self.input[hex_start..hex_end], 16)
763 .expect("hex digits validated above");
764 if char::from_u32(codepoint).is_none() {
765 self.error_unicode_escape_out_of_range(escape_start, total_len);
766 }
767 }
768
769 fn consume_octal_escape(&mut self, first_digit: u8) -> u16 {
771 let mut value: u16 = (first_digit - b'0') as u16;
772 for _ in 0..2 {
773 if self.at_eof() {
774 break;
775 }
776 match self.current_byte() {
777 d @ b'0'..=b'7' => {
778 value = value * 8 + (d - b'0') as u16;
779 self.next();
780 }
781 _ => break,
782 }
783 }
784 value
785 }
786
787 fn lex_string_literal(&mut self) -> Token<'source> {
788 let start_offset = self.current_offset;
789
790 self.next();
791
792 let mut escaped = false;
793 let mut terminated = false;
794
795 while !self.at_eof() && !terminated {
796 let byte = self.current_byte();
797 if escaped {
798 match byte {
799 b'0'..=b'7' => {
800 let escape_start = self.current_offset - 1;
801 self.next();
802 let value = self.consume_octal_escape(byte);
803 if value > 255 {
804 let escape_len = self.current_offset - escape_start;
805 self.error_octal_escape_out_of_range(escape_start, escape_len);
806 }
807 escaped = false;
808 continue;
809 }
810 b'u' => {
811 let escape_start = self.current_offset - 1;
812 self.next();
813 self.consume_unicode_escape(escape_start);
814 escaped = false;
815 continue;
816 }
817 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'"' | b'x' | b'U' => {
818 }
819 b'\'' => {}
820 _ => {
821 self.error_invalid_escape(self.current_char());
822 }
823 }
824 escaped = false;
825 } else if byte == b'\\' {
826 escaped = true;
827 } else if byte == b'"' {
828 terminated = true;
829 self.next();
830 break;
831 }
832
833 self.next();
834 }
835
836 let end_offset = self.current_offset;
837 let length = end_offset - start_offset;
838
839 if escaped {
840 self.error_unterminated_escape(start_offset);
841 }
842
843 if !terminated {
844 self.error_unterminated_string(start_offset, 1);
845 }
846
847 Token {
848 kind: TokenKind::String,
849 text: &self.input[start_offset..end_offset],
850 byte_offset: start_offset as u32,
851 byte_length: length as u32,
852 }
853 }
854
855 fn lex_raw_string_literal(&mut self) -> Token<'source> {
856 let start_offset = self.current_offset;
857 self.next(); self.next(); let mut terminated = false;
861 while !self.at_eof() {
862 let byte = self.current_byte();
863 if byte == b'"' {
864 terminated = true;
865 self.next();
866 break;
867 } else if byte == 0 {
868 self.error_disallowed_byte_in_raw_string(self.current_offset, byte);
869 self.next();
870 continue;
871 }
872 self.next();
873 }
874
875 let end_offset = self.current_offset;
876 let length = end_offset - start_offset;
877
878 if !terminated {
879 self.error_unterminated_raw_string(start_offset, 2);
880 }
881
882 Token {
883 kind: TokenKind::RawString,
884 text: &self.input[start_offset..end_offset],
885 byte_offset: start_offset as u32,
886 byte_length: length as u32,
887 }
888 }
889
890 fn try_consume_unsupported_raw_variant(&mut self, end: usize) -> bool {
891 let raw_format_prefix = if self.current_byte() == b'r'
892 && self.peek_byte() == b'f'
893 && self.peek_byte_at(2) == b'"'
894 {
895 Some("rf")
896 } else if self.current_byte() == b'f'
897 && self.peek_byte() == b'r'
898 && self.peek_byte_at(2) == b'"'
899 {
900 Some("fr")
901 } else {
902 None
903 };
904 if let Some(prefix) = raw_format_prefix {
905 let start = self.current_offset;
906 self.skip(3);
907 while self.current_offset < end
908 && self.current_byte() != b'"'
909 && self.current_byte() != b'\n'
910 {
911 self.next();
912 }
913 if self.current_offset < end && self.current_byte() == b'"' {
914 self.next();
915 }
916 let length = self.current_offset - start;
917 self.error_unsupported_raw_format_string(start, length, prefix);
918 return true;
919 }
920
921 if self.current_byte() == b'r' && self.peek_byte() == b'#' {
922 let mut hash_count = 0usize;
923 let mut probe = self.current_offset + 1;
924 while probe < self.input_bytes.len() && self.input_bytes[probe] == b'#' {
925 hash_count += 1;
926 probe += 1;
927 }
928 if hash_count > 0 && probe < self.input_bytes.len() && self.input_bytes[probe] == b'"' {
929 let start = self.current_offset;
930 self.skip(1 + hash_count + 1);
931 loop {
932 if self.current_offset >= end || self.current_byte() == b'\n' {
933 break;
934 }
935 if self.current_byte() == b'"' {
936 let mut closer_matches = true;
937 for i in 1..=hash_count {
938 if self.peek_byte_at(i) != b'#' {
939 closer_matches = false;
940 break;
941 }
942 }
943 if closer_matches {
944 self.skip(1 + hash_count);
945 break;
946 }
947 }
948 self.next();
949 }
950 let length = self.current_offset - start;
951 self.error_unsupported_hash_delimited_raw_string(start, length);
952 return true;
953 }
954 }
955
956 false
957 }
958
959 fn push_format_string_text_if_needed(
960 &self,
961 tokens: &mut Vec<Token<'source>>,
962 text_segment_start: usize,
963 ) {
964 if text_segment_start < self.current_offset {
965 tokens.push(Token {
966 kind: TokenKind::FormatStringText,
967 text: &self.input[text_segment_start..self.current_offset],
968 byte_offset: text_segment_start as u32,
969 byte_length: (self.current_offset - text_segment_start) as u32,
970 });
971 }
972 }
973
974 fn lex_format_string_interpolation(
975 &mut self,
976 tokens: &mut Vec<Token<'source>>,
977 ) -> Result<(), ()> {
978 let interp_start = self.current_offset;
979 self.next();
980
981 tokens.push(Token {
982 kind: TokenKind::FormatStringInterpolationStart,
983 text: &self.input[interp_start..self.current_offset],
984 byte_offset: interp_start as u32,
985 byte_length: (self.current_offset - interp_start) as u32,
986 });
987
988 let Some(interpolation_end) = self.find_interpolation_boundary() else {
989 if self.has_newline_between(interp_start, self.input.len()) {
990 self.error_multiline_format_string_interpolation(interp_start);
991 } else {
992 self.error_unclosed_brace_in_format_string(interp_start);
993 }
994 self.skip_to_format_string_end();
995 return Err(());
996 };
997
998 if self.has_newline_between(interp_start, interpolation_end) {
999 self.error_multiline_format_string_interpolation(interp_start);
1000 }
1001
1002 while self.current_offset < interpolation_end {
1003 self.skip_horizontal_whitespace();
1004 if self.current_offset >= interpolation_end {
1005 break;
1006 }
1007
1008 if self.try_consume_unsupported_raw_variant(interpolation_end) {
1009 continue;
1010 }
1011
1012 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
1013 let mut fstring_tokens = self.lex_format_string_tokens();
1014 tokens.append(&mut fstring_tokens);
1015 } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
1016 self.error_escaped_quote_in_interpolation(self.current_offset);
1017 self.skip(2);
1018 } else if self.current_byte() == b'r' && self.peek_byte() == b'"' {
1019 self.error_raw_string_in_interpolation(self.current_offset);
1020 self.skip(2);
1021 while self.current_offset < interpolation_end
1022 && self.current_byte() != b'"'
1023 && self.current_byte() != b'\n'
1024 {
1025 self.next();
1026 }
1027 if self.current_offset < interpolation_end && self.current_byte() == b'"' {
1028 self.next();
1029 }
1030 } else {
1031 let token = self.create_token();
1032 tokens.push(token);
1033 }
1034 }
1035
1036 let close_offset = self.current_offset;
1037 self.next();
1038 tokens.push(Token {
1039 kind: TokenKind::FormatStringInterpolationEnd,
1040 text: &self.input[close_offset..self.current_offset],
1041 byte_offset: close_offset as u32,
1042 byte_length: (self.current_offset - close_offset) as u32,
1043 });
1044
1045 Ok(())
1046 }
1047
1048 fn scan_interpolation(&self, start: usize) -> Option<usize> {
1049 let bytes = self.input.as_bytes();
1050 let mut p = start;
1051 let mut depth = 1;
1052
1053 while p < bytes.len() && depth > 0 {
1054 match bytes[p] {
1055 b'{' => {
1056 depth += 1;
1057 p += 1;
1058 }
1059 b'}' => {
1060 depth -= 1;
1061 if depth > 0 {
1062 p += 1;
1063 }
1064 }
1065 b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
1066 b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
1067 p = self.scan_past_fstring(p)?;
1068 }
1069 b'\\' => p += 2,
1070 b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
1071 b'\n' => return None,
1072 _ => p += 1,
1073 }
1074 }
1075
1076 (depth == 0).then_some(p)
1077 }
1078
1079 fn find_interpolation_boundary(&self) -> Option<usize> {
1080 self.scan_interpolation(self.current_offset)
1081 }
1082
1083 fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
1084 let bytes = self.input.as_bytes();
1085 let mut p = start + 1;
1086 while p < bytes.len() {
1087 match bytes[p] {
1088 b'\\' if delimiter != b'`' => p += 2,
1089 b'\n' => return None,
1090 b if b == delimiter => return Some(p + 1),
1091 _ => p += 1,
1092 }
1093 }
1094 None
1095 }
1096
1097 fn scan_past_fstring(&self, position: usize) -> Option<usize> {
1098 let bytes = self.input.as_bytes();
1099 let mut p = position + 2; while p < bytes.len() {
1101 match bytes[p] {
1102 b'\\' => p += 2,
1103 b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
1104 b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
1105 b'{' => {
1106 p = self.scan_interpolation(p + 1)?;
1107 p += 1;
1108 }
1109 b'"' => return Some(p + 1),
1110 b'\n' => return None,
1111 _ => p += 1,
1112 }
1113 }
1114 None
1115 }
1116
1117 fn skip_to_format_string_end(&mut self) {
1122 let mut depth = 1;
1123 while !self.at_eof() {
1124 match self.current_byte() {
1125 b'\\' => {
1126 self.next();
1127 if !self.at_eof() {
1128 self.next();
1129 }
1130 }
1131 b'"' if depth == 0 => {
1132 self.next();
1133 return;
1134 }
1135 b'"' => {
1136 self.next();
1137 while !self.at_eof() && self.current_byte() != b'"' {
1138 if self.current_byte() == b'\\' {
1139 self.next();
1140 if self.at_eof() {
1141 break;
1142 }
1143 }
1144 self.next();
1145 }
1146 if !self.at_eof() {
1147 self.next();
1148 }
1149 }
1150 b'{' => {
1151 depth += 1;
1152 self.next();
1153 }
1154 b'}' => {
1155 if depth > 0 {
1156 depth -= 1;
1157 }
1158 self.next();
1159 }
1160 _ => self.next(),
1161 }
1162 }
1163 }
1164
1165 fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
1166 let start_offset = self.current_offset;
1167 let mut tokens = Vec::new();
1168
1169 self.skip(2);
1170
1171 let fstring_start_end = self.current_offset;
1172 tokens.push(Token {
1173 kind: TokenKind::FormatStringStart,
1174 text: &self.input[start_offset..fstring_start_end],
1175 byte_offset: start_offset as u32,
1176 byte_length: (fstring_start_end - start_offset) as u32,
1177 });
1178
1179 let mut text_segment_start = self.current_offset;
1180
1181 while !self.at_eof() {
1182 let byte = self.current_byte();
1183
1184 match byte {
1185 b'\\' if !self.at_eof() => {
1186 let escape_start = self.current_offset;
1187 self.next();
1188 if !self.at_eof() {
1189 let b = self.current_byte();
1190 self.next();
1191 if matches!(b, b'0'..=b'7') {
1192 let value = self.consume_octal_escape(b);
1193 if value > 255 {
1194 let escape_len = self.current_offset - escape_start;
1195 self.error_octal_escape_out_of_range(escape_start, escape_len);
1196 }
1197 } else if b == b'u' {
1198 self.consume_unicode_escape(escape_start);
1199 }
1200 }
1201 }
1202 b'{' if self.peek_byte() == b'{' => {
1203 self.skip(2);
1204 }
1205 b'}' if self.peek_byte() == b'}' => {
1206 self.skip(2);
1207 }
1208 b'"' => {
1209 self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
1210
1211 let end_offset = self.current_offset;
1212 self.next();
1213
1214 tokens.push(Token {
1215 kind: TokenKind::FormatStringEnd,
1216 text: &self.input[end_offset..self.current_offset],
1217 byte_offset: end_offset as u32,
1218 byte_length: (self.current_offset - end_offset) as u32,
1219 });
1220 return tokens;
1221 }
1222
1223 b'{' => {
1224 self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
1225
1226 if self.lex_format_string_interpolation(&mut tokens).is_err() {
1227 return tokens;
1228 }
1229 text_segment_start = self.current_offset;
1230 }
1231 b'}' => {
1232 self.error_unmatched_brace_in_format_string(self.current_offset);
1233 self.next();
1234 }
1235 _ => {
1236 self.next();
1237 }
1238 }
1239 }
1240
1241 self.error_unterminated_format_string(start_offset, 2);
1242 tokens
1243 }
1244
1245 fn lex_char(&mut self) -> Token<'source> {
1246 let start_offset = self.current_offset;
1247
1248 self.next();
1249
1250 if self.at_eof() || self.current_byte() == b'\'' {
1251 self.error_empty_rune_literal(start_offset);
1252 let end_offset = self.current_offset;
1253 return Token {
1254 kind: TokenKind::Char,
1255 text: &self.input[start_offset..end_offset],
1256 byte_offset: start_offset as u32,
1257 byte_length: (end_offset - start_offset) as u32,
1258 };
1259 }
1260
1261 if self.current_byte() != b'\\' {
1262 self.next();
1263 } else {
1264 self.next();
1265
1266 if self.at_eof() {
1267 self.error_unterminated_escape(start_offset);
1268 let end_offset = self.current_offset;
1269 return Token {
1270 kind: TokenKind::Char,
1271 text: &self.input[start_offset..end_offset],
1272 byte_offset: start_offset as u32,
1273 byte_length: (end_offset - start_offset) as u32,
1274 };
1275 }
1276
1277 match self.current_byte() {
1278 b'0'..=b'7' => {
1279 let escape_start = self.current_offset - 1;
1280 let first = self.current_byte();
1281 self.next();
1282 let value = self.consume_octal_escape(first);
1283 if value > 255 {
1284 let escape_len = self.current_offset - escape_start;
1285 self.error_octal_escape_out_of_range(escape_start, escape_len);
1286 }
1287 }
1288 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'\'' | b'x' => {
1289 self.next();
1290 }
1291 _ => {
1292 self.error_invalid_escape(self.current_char());
1293
1294 while !self.at_eof() && self.current_byte() != b'\'' {
1295 self.next();
1296 }
1297
1298 if !self.at_eof() && self.current_byte() == b'\'' {
1299 self.next();
1300 }
1301
1302 let end_offset = self.current_offset;
1303 return Token {
1304 kind: TokenKind::Char,
1305 text: &self.input[start_offset..end_offset],
1306 byte_offset: start_offset as u32,
1307 byte_length: (end_offset - start_offset) as u32,
1308 };
1309 }
1310 }
1311 }
1312
1313 if self.at_eof() || self.current_byte() != b'\'' {
1314 let length = self.current_offset - start_offset;
1315 self.error_unterminated_rune(start_offset, length);
1316 }
1317
1318 if !self.at_eof() && self.current_byte() == b'\'' {
1319 self.next();
1320 }
1321
1322 let end_offset = self.current_offset;
1323 Token {
1324 kind: TokenKind::Char,
1325 text: &self.input[start_offset..end_offset],
1326 byte_offset: start_offset as u32,
1327 byte_length: (end_offset - start_offset) as u32,
1328 }
1329 }
1330
1331 fn lex_slash(&mut self) -> Token<'source> {
1332 let start_offset = self.current_offset;
1333
1334 if self.peek_byte() != b'/' {
1335 self.next();
1336 return Token {
1337 kind: TokenKind::Slash,
1338 text: &self.input[start_offset..self.current_offset],
1339 byte_offset: start_offset as u32,
1340 byte_length: 1,
1341 };
1342 }
1343
1344 let slash_count = self.count_consecutive(b'/');
1345
1346 if slash_count >= 4 {
1347 self.error_excess_slashes_in_comment(start_offset, slash_count);
1348 }
1349
1350 self.skip(slash_count);
1351
1352 if slash_count == 3 {
1353 if self.current_byte() == b' ' {
1354 self.next();
1355 }
1356 let text_start = self.current_offset;
1357 self.skip_to_eol();
1358 let end_offset = self.current_offset;
1359
1360 self.trivia
1361 .doc_comments
1362 .push((start_offset as u32, end_offset as u32));
1363
1364 return Token {
1365 kind: TokenKind::DocComment,
1366 text: &self.input[text_start..end_offset],
1367 byte_offset: start_offset as u32,
1368 byte_length: (end_offset - start_offset) as u32,
1369 };
1370 }
1371
1372 self.skip_to_eol();
1373 let end_offset = self.current_offset;
1374
1375 self.trivia
1376 .comments
1377 .push((start_offset as u32, end_offset as u32));
1378
1379 Token {
1380 kind: TokenKind::Comment,
1381 text: &self.input[start_offset..end_offset],
1382 byte_offset: start_offset as u32,
1383 byte_length: (end_offset - start_offset) as u32,
1384 }
1385 }
1386
1387 fn count_consecutive(&self, byte: u8) -> usize {
1388 let mut count = 0;
1389 let mut offset = self.current_offset;
1390 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1391 count += 1;
1392 offset += 1;
1393 }
1394 count
1395 }
1396
1397 fn skip_to_eol(&mut self) {
1398 while !self.at_eof() && self.current_byte() != b'\n' {
1399 self.next();
1400 }
1401 }
1402
1403 fn lex_directive(&mut self) -> Token<'source> {
1404 let start_offset = self.current_offset;
1405
1406 self.next();
1407
1408 while !self.at_eof() {
1409 let byte = self.current_byte();
1410 if byte.is_ascii_alphanumeric() || byte == b'_' {
1411 self.next();
1412 } else {
1413 break;
1414 }
1415 }
1416
1417 let end_offset = self.current_offset;
1418 Token {
1419 kind: TokenKind::Directive,
1420 text: &self.input[start_offset..end_offset],
1421 byte_offset: start_offset as u32,
1422 byte_length: (end_offset - start_offset) as u32,
1423 }
1424 }
1425
1426 fn handle_unexpected_char(&mut self) -> Token<'source> {
1427 let start_offset = self.current_offset;
1428
1429 self.error_unexpected_char(self.current_offset, self.current_char());
1430
1431 self.resync_on_error();
1432
1433 let end_offset = self.current_offset;
1434
1435 Token {
1436 kind: TokenKind::Error,
1437 text: &self.input[start_offset..end_offset],
1438 byte_offset: start_offset as u32,
1439 byte_length: (end_offset - start_offset) as u32,
1440 }
1441 }
1442
1443 fn eof_token(&self) -> Token<'source> {
1444 Token {
1445 kind: TokenKind::EOF,
1446 text: &self.input[self.current_offset..self.current_offset],
1447 byte_offset: self.current_offset as u32,
1448 byte_length: 0,
1449 }
1450 }
1451
1452 fn semicolon_token(&mut self) -> Token<'source> {
1453 let start_offset = self.current_offset;
1454
1455 self.next();
1456
1457 Token {
1458 kind: TokenKind::Semicolon,
1459 text: &self.input[start_offset..self.current_offset],
1460 byte_offset: start_offset as u32,
1461 byte_length: (self.current_offset - start_offset) as u32,
1462 }
1463 }
1464}