1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
10pub struct Lexer<'source> {
11 input: &'source str,
12 input_bytes: &'source [u8],
13 current_offset: usize,
14 file_id: u32,
15 errors: Vec<ParseError>,
16 pending_tokens: Vec<Token<'source>>,
17 trivia: Trivia,
18 last_newline_offset: Option<usize>,
19}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
35 pub fn lex(mut self) -> LexResult<'source> {
36 let mut tokens = Vec::new();
37
38 loop {
39 if let Some(token) = self.pending_tokens.pop() {
40 tokens.push(token);
41 continue;
42 }
43
44 self.skip_whitespace();
45
46 if self.at_eof() {
47 tokens.push(self.eof_token());
48 break;
49 }
50
51 if self.try_consume_unsupported_raw_variant(self.input.len()) {
52 continue;
53 }
54
55 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
56 let mut fstring_tokens = self.lex_format_string_tokens();
57 fstring_tokens.reverse();
58 self.pending_tokens = fstring_tokens;
59 continue;
60 }
61
62 let token = self.create_token();
63 tokens.push(token);
64 }
65
66 let tokens = self.insert_semicolons(tokens);
67
68 LexResult {
69 tokens,
70 errors: self.errors,
71 trivia: self.trivia,
72 }
73 }
74
75 fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
76 let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);
77
78 for i in 0..tokens.len() {
79 let token = tokens[i];
80 result.push(token);
81
82 if !Self::triggers_asi(token.kind) {
83 continue;
84 }
85
86 if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
87 if Self::continues_expression(next_token.kind) {
88 continue;
89 }
90
91 let token_end = (token.byte_offset + token.byte_length) as usize;
92 if self.has_newline_between(token_end, next_token.byte_offset as usize) {
93 result.push(self.make_synthetic_semicolon(token_end));
94 }
95 }
96 }
97
98 result
99 }
100
101 fn triggers_asi(kind: TokenKind) -> bool {
102 matches!(
103 kind,
104 TokenKind::Identifier
105 | TokenKind::Integer
106 | TokenKind::Imaginary
107 | TokenKind::Float
108 | TokenKind::String
109 | TokenKind::RawString
110 | TokenKind::Char
111 | TokenKind::Boolean
112 | TokenKind::RightParen
113 | TokenKind::RightSquareBracket
114 | TokenKind::RightCurlyBrace
115 | TokenKind::Break
116 | TokenKind::Continue
117 | TokenKind::Return
118 | TokenKind::DotDot
119 | TokenKind::DotDotEqual
120 | TokenKind::QuestionMark
121 )
122 }
123
124 fn continues_expression(kind: TokenKind) -> bool {
125 matches!(
126 kind,
127 TokenKind::Plus
128 | TokenKind::Star
129 | TokenKind::Slash
130 | TokenKind::Percent
131 | TokenKind::Pipeline
132 | TokenKind::AmpersandDouble
133 | TokenKind::PipeDouble
134 | TokenKind::EqualDouble
135 | TokenKind::NotEqual
136 | TokenKind::LeftAngleBracket
137 | TokenKind::RightAngleBracket
138 | TokenKind::LessThanOrEqual
139 | TokenKind::GreaterThanOrEqual
140 | TokenKind::Dot
141 | TokenKind::Equal
142 | TokenKind::PlusEqual
143 | TokenKind::MinusEqual
144 | TokenKind::StarEqual
145 | TokenKind::SlashEqual
146 | TokenKind::Else
147 | TokenKind::LeftCurlyBrace
148 | TokenKind::RightCurlyBrace
149 | TokenKind::RightParen
150 | TokenKind::RightSquareBracket
151 | TokenKind::As
152 )
153 }
154
155 fn find_next_non_comment_token<'a>(
156 &self,
157 tokens: &'a [Token<'source>],
158 start_index: usize,
159 ) -> Option<&'a Token<'source>> {
160 tokens
161 .iter()
162 .skip(start_index)
163 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
164 }
165
166 fn has_newline_between(&self, start: usize, end: usize) -> bool {
167 self.input[start..end].contains('\n')
168 }
169
170 fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
171 Token {
172 kind: TokenKind::Semicolon,
173 text: "",
174 byte_offset: position as u32,
175 byte_length: 0,
176 }
177 }
178
179 fn create_token(&mut self) -> Token<'source> {
180 if let Some(token) = self.lex_lookahead_symbol() {
181 return token;
182 }
183
184 let c = self.current_char();
185 match c {
186 '0'..='9' => self.lex_number(),
187 'r' if self.peek_byte() == b'"' => self.lex_raw_string_literal(),
188 _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
189 '"' => self.lex_string_literal(),
190 '`' => self.lex_backtick_literal(),
191 '\'' => self.lex_char(),
192 '/' => self.lex_slash(),
193 ';' => self.semicolon_token(),
194 '@' => self.lex_directive(),
195 _ => self.handle_unexpected_char(),
196 }
197 }
198
199 #[inline]
200 fn current_byte(&self) -> u8 {
201 if self.current_offset < self.input_bytes.len() {
202 self.input_bytes[self.current_offset]
203 } else {
204 0
205 }
206 }
207
208 #[inline]
209 fn current_char(&self) -> char {
210 self.input[self.current_offset..]
211 .chars()
212 .next()
213 .unwrap_or('\0')
214 }
215
216 #[inline]
217 fn peek_byte(&self) -> u8 {
218 if self.current_offset + 1 < self.input_bytes.len() {
219 self.input_bytes[self.current_offset + 1]
220 } else {
221 0
222 }
223 }
224
225 #[inline]
226 fn peek_byte_at(&self, n: usize) -> u8 {
227 let offset = self.current_offset + n;
228 if offset < self.input_bytes.len() {
229 self.input_bytes[offset]
230 } else {
231 0
232 }
233 }
234
235 #[inline]
236 fn peek_char(&self) -> char {
237 let next_offset = if self.current_byte() < 128 {
238 self.current_offset + 1
239 } else {
240 self.current_offset + self.current_char().len_utf8()
241 };
242 self.input[next_offset..].chars().next().unwrap_or('\0')
243 }
244
245 fn peek_char_n(&self, n: usize) -> char {
246 let mut offset = self.current_offset;
247 for _ in 0..n {
248 if offset >= self.input.len() {
249 return '\0';
250 }
251 let c = self.input[offset..].chars().next().unwrap_or('\0');
252 offset += c.len_utf8();
253 }
254 self.input[offset..].chars().next().unwrap_or('\0')
255 }
256
257 fn next(&mut self) {
258 if self.at_eof() {
259 return;
260 }
261 if self.current_byte() < 128 {
262 self.current_offset += 1;
263 } else {
264 self.current_offset += self.current_char().len_utf8();
265 }
266 }
267
268 fn skip(&mut self, count: usize) {
269 for _ in 0..count {
270 self.next();
271 }
272 }
273
274 fn skip_whitespace(&mut self) {
275 while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
276 if self.current_byte() == b'\n' {
277 self.record_newline();
278 }
279 self.next();
280 }
281 }
282
283 fn skip_horizontal_whitespace(&mut self) {
284 while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
285 self.next();
286 }
287 }
288
289 fn record_newline(&mut self) {
290 let offset = self.current_offset;
291
292 if let Some(last) = self.last_newline_offset {
293 let between = &self.input[last + 1..offset];
294 let is_blank = between.is_empty()
295 || between
296 .chars()
297 .all(|c| c.is_ascii_whitespace() && c != '\n');
298 if is_blank {
299 self.trivia.blank_lines.push(offset as u32);
300 }
301 }
302
303 self.last_newline_offset = Some(offset);
304 }
305
306 fn at_eof(&self) -> bool {
307 self.current_offset >= self.input.len()
308 }
309
310 fn previous_char(&self) -> char {
311 if self.current_offset == 0 {
312 return '\0';
313 }
314 self.input[..self.current_offset]
315 .chars()
316 .next_back()
317 .unwrap_or('\0')
318 }
319
320 fn resync_on_error(&mut self) {
321 while !self.at_eof() {
322 let byte = self.current_byte();
323
324 if byte == b';' || byte == b'}' {
325 break;
326 }
327
328 self.next();
329 }
330 }
331
332 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
334 let start_offset = self.current_offset;
335 let current_char = self.current_char();
336 let next_char = self.peek_char();
337 let third_char = self.peek_char_n(2);
338
339 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
340 self.skip(3);
341 let end_offset = self.current_offset;
342 return Some(Token {
343 kind,
344 text: &self.input[start_offset..end_offset],
345 byte_offset: start_offset as u32,
346 byte_length: (end_offset - start_offset) as u32,
347 });
348 }
349
350 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
351 self.skip(2);
352 let end_offset = self.current_offset;
353 return Some(Token {
354 kind,
355 text: &self.input[start_offset..end_offset],
356 byte_offset: start_offset as u32,
357 byte_length: (end_offset - start_offset) as u32,
358 });
359 }
360
361 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
362 self.next();
363 let end_offset = self.current_offset;
364 return Some(Token {
365 kind,
366 text: &self.input[start_offset..end_offset],
367 byte_offset: start_offset as u32,
368 byte_length: (end_offset - start_offset) as u32,
369 });
370 }
371
372 None
373 }
374
375 fn lex_number(&mut self) -> Token<'source> {
376 let start_offset = self.current_offset;
377
378 if self.current_byte() == b'0' {
379 let next = self.peek_byte();
380 match next {
381 b'x' | b'X' => {
382 self.next(); self.next(); return self.lex_hex_number(start_offset);
385 }
386 b'o' | b'O' => {
387 self.next(); self.next(); return self.lex_octal_number(start_offset);
390 }
391 b'b' | b'B' => {
392 self.next(); self.next(); return self.lex_binary_number(start_offset);
395 }
396 b'0'..=b'7' => {
397 return self.lex_legacy_octal_number(start_offset);
398 }
399 _ => {} }
401 }
402
403 let mut kind = TokenKind::Integer;
404
405 while !self.at_eof() {
406 let byte = self.current_byte();
407 if byte.is_ascii_digit() || byte == b'_' {
408 if byte == b'_' && self.previous_char() == '_' {
409 let underscore_start = self.current_offset - 1;
410 self.error_consecutive_underscores(underscore_start);
411 }
412 self.next();
413 } else {
414 break;
415 }
416 }
417
418 if self.previous_char() == '_' {
419 self.error_number_trailing_underscore(
420 self.current_offset - self.previous_char().len_utf8(),
421 );
422 }
423
424 let preceded_by_dot = start_offset > 0
427 && self.input_bytes[start_offset - 1] == b'.'
428 && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');
429
430 if !preceded_by_dot
431 && self.current_byte() == b'.'
432 && self.peek_byte() != b'.'
433 && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
434 {
435 kind = TokenKind::Float;
436 self.next();
437
438 if self.current_byte() == b'_' {
439 self.error_decimal_leading_underscore(self.current_offset);
440 }
441
442 while !self.at_eof() {
443 let byte = self.current_byte();
444 if byte.is_ascii_digit() || byte == b'_' {
445 if byte == b'_' && self.previous_char() == '_' {
446 let underscore_start = self.current_offset - 1;
447 self.error_consecutive_underscores(underscore_start);
448 }
449 self.next();
450 } else {
451 break;
452 }
453 }
454
455 if self.previous_char() == '_' {
456 self.error_number_trailing_underscore(
457 self.current_offset - self.previous_char().len_utf8(),
458 );
459 }
460 }
461
462 if self.current_byte() == b'e' || self.current_byte() == b'E' {
463 kind = TokenKind::Float;
464 let exponent_start = self.current_offset;
465 self.next(); if self.current_byte() == b'+' || self.current_byte() == b'-' {
468 self.next();
469 }
470
471 if !self.current_byte().is_ascii_digit() {
472 self.error_missing_exponent_digits(
473 exponent_start,
474 self.current_offset - exponent_start,
475 );
476 }
477
478 while !self.at_eof() {
479 let byte = self.current_byte();
480 if byte.is_ascii_digit() || byte == b'_' {
481 if byte == b'_' && self.previous_char() == '_' {
482 let underscore_start = self.current_offset - 1;
483 self.error_consecutive_underscores(underscore_start);
484 }
485 self.next();
486 } else {
487 break;
488 }
489 }
490
491 if self.previous_char() == '_' {
492 self.error_number_trailing_underscore(
493 self.current_offset - self.previous_char().len_utf8(),
494 );
495 }
496 }
497
498 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
499 self.next(); let end_offset = self.current_offset;
501 return Token {
502 kind: TokenKind::Imaginary,
503 text: &self.input[start_offset..end_offset],
504 byte_offset: start_offset as u32,
505 byte_length: (end_offset - start_offset) as u32,
506 };
507 }
508
509 let end_offset = self.current_offset;
510 Token {
511 kind,
512 text: &self.input[start_offset..end_offset],
513 byte_offset: start_offset as u32,
514 byte_length: (end_offset - start_offset) as u32,
515 }
516 }
517
518 fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
519 let digits_start = self.current_offset;
520
521 while !self.at_eof() {
522 let byte = self.current_byte();
523 if byte.is_ascii_hexdigit() || byte == b'_' {
524 if byte == b'_' && self.previous_char() == '_' {
525 let underscore_start = self.current_offset - 1;
526 self.error_consecutive_underscores(underscore_start);
527 }
528 self.next();
529 } else {
530 break;
531 }
532 }
533
534 if self.current_offset == digits_start {
535 self.error_missing_hex_digits(start_offset, 2);
536 }
537
538 if self.previous_char() == '_' {
539 self.error_number_trailing_underscore(
540 self.current_offset - self.previous_char().len_utf8(),
541 );
542 }
543
544 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
545 self.next(); let end_offset = self.current_offset;
547 self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
548 return Token {
549 kind: TokenKind::Imaginary,
550 text: &self.input[start_offset..end_offset],
551 byte_offset: start_offset as u32,
552 byte_length: (end_offset - start_offset) as u32,
553 };
554 }
555
556 let end_offset = self.current_offset;
557 Token {
558 kind: TokenKind::Integer,
559 text: &self.input[start_offset..end_offset],
560 byte_offset: start_offset as u32,
561 byte_length: (end_offset - start_offset) as u32,
562 }
563 }
564
565 fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
566 let digits_start = self.current_offset;
567
568 while !self.at_eof() {
569 let byte = self.current_byte();
570 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
571 if byte == b'_' && self.previous_char() == '_' {
572 let underscore_start = self.current_offset - 1;
573 self.error_consecutive_underscores(underscore_start);
574 }
575 self.next();
576 } else if byte == b'8' || byte == b'9' {
577 self.error_invalid_octal_digit(self.current_offset);
578 self.next();
579 } else {
580 break;
581 }
582 }
583
584 if self.current_offset == digits_start {
585 self.error_missing_octal_digits(start_offset, 2);
586 }
587
588 if self.previous_char() == '_' {
589 self.error_number_trailing_underscore(
590 self.current_offset - self.previous_char().len_utf8(),
591 );
592 }
593
594 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
595 self.next(); let end_offset = self.current_offset;
597 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
598 return Token {
599 kind: TokenKind::Imaginary,
600 text: &self.input[start_offset..end_offset],
601 byte_offset: start_offset as u32,
602 byte_length: (end_offset - start_offset) as u32,
603 };
604 }
605
606 let end_offset = self.current_offset;
607 Token {
608 kind: TokenKind::Integer,
609 text: &self.input[start_offset..end_offset],
610 byte_offset: start_offset as u32,
611 byte_length: (end_offset - start_offset) as u32,
612 }
613 }
614
615 fn lex_legacy_octal_number(&mut self, start_offset: usize) -> Token<'source> {
616 self.next();
617
618 while !self.at_eof() {
619 let byte = self.current_byte();
620 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
621 if byte == b'_' && self.previous_char() == '_' {
622 let underscore_start = self.current_offset - 1;
623 self.error_consecutive_underscores(underscore_start);
624 }
625 self.next();
626 } else if byte == b'8' || byte == b'9' {
627 self.error_invalid_octal_digit(self.current_offset);
628 self.next();
629 } else {
630 break;
631 }
632 }
633
634 if self.previous_char() == '_' {
635 self.error_number_trailing_underscore(
636 self.current_offset - self.previous_char().len_utf8(),
637 );
638 }
639
640 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
641 self.next();
642 let end_offset = self.current_offset;
643 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
644 return Token {
645 kind: TokenKind::Imaginary,
646 text: &self.input[start_offset..end_offset],
647 byte_offset: start_offset as u32,
648 byte_length: (end_offset - start_offset) as u32,
649 };
650 }
651
652 let end_offset = self.current_offset;
653 Token {
654 kind: TokenKind::Integer,
655 text: &self.input[start_offset..end_offset],
656 byte_offset: start_offset as u32,
657 byte_length: (end_offset - start_offset) as u32,
658 }
659 }
660
661 fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
662 let digits_start = self.current_offset;
663
664 while !self.at_eof() {
665 let byte = self.current_byte();
666 if byte == b'0' || byte == b'1' || byte == b'_' {
667 if byte == b'_' && self.previous_char() == '_' {
668 let underscore_start = self.current_offset - 1;
669 self.error_consecutive_underscores(underscore_start);
670 }
671 self.next();
672 } else if (b'2'..=b'9').contains(&byte) {
673 self.error_invalid_binary_digit(self.current_offset);
674 self.next();
675 } else {
676 break;
677 }
678 }
679
680 if self.current_offset == digits_start {
681 self.error_missing_binary_digits(start_offset, 2);
682 }
683
684 if self.previous_char() == '_' {
685 self.error_number_trailing_underscore(
686 self.current_offset - self.previous_char().len_utf8(),
687 );
688 }
689
690 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
691 self.next();
692 let end_offset = self.current_offset;
693 self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
694 return Token {
695 kind: TokenKind::Imaginary,
696 text: &self.input[start_offset..end_offset],
697 byte_offset: start_offset as u32,
698 byte_length: (end_offset - start_offset) as u32,
699 };
700 }
701
702 let end_offset = self.current_offset;
703 Token {
704 kind: TokenKind::Integer,
705 text: &self.input[start_offset..end_offset],
706 byte_offset: start_offset as u32,
707 byte_length: (end_offset - start_offset) as u32,
708 }
709 }
710
711 fn lex_identifier(&mut self) -> Token<'source> {
712 let start_offset = self.current_offset;
713
714 while !self.at_eof() {
715 let c = self.current_char();
716 if c.is_alphanumeric() || c == '_' {
717 self.next();
718 } else {
719 break;
720 }
721 }
722
723 let end_offset = self.current_offset;
724 let text = &self.input[start_offset..end_offset];
725
726 let kind = match text {
727 "true" | "false" => TokenKind::Boolean,
728 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
729 };
730
731 Token {
732 kind,
733 text,
734 byte_offset: start_offset as u32,
735 byte_length: (end_offset - start_offset) as u32,
736 }
737 }
738
739 fn lex_backtick_literal(&mut self) -> Token<'source> {
740 let start_offset = self.current_offset;
741
742 self.next();
743
744 let mut terminated = false;
745
746 while !self.at_eof() {
747 let byte = self.current_byte();
748 if byte == b'`' {
749 terminated = true;
750 self.next();
751 break;
752 } else if byte == b'\n' {
753 break;
754 }
755 self.next();
756 }
757
758 let end_offset = self.current_offset;
759 let length = end_offset - start_offset;
760
761 if !terminated {
762 self.error_unterminated_backtick(start_offset, length);
763 }
764
765 Token {
766 kind: TokenKind::Backtick,
767 text: &self.input[start_offset..end_offset],
768 byte_offset: start_offset as u32,
769 byte_length: length as u32,
770 }
771 }
772
773 fn consume_unicode_escape(&mut self, escape_start: usize) {
774 if self.at_eof() || self.current_byte() != b'{' {
775 self.error_invalid_unicode_escape(escape_start, self.current_offset - escape_start);
776 return;
777 }
778 self.next();
779
780 let hex_start = self.current_offset;
781 let mut all_hex = true;
782 while !self.at_eof() {
783 let byte = self.current_byte();
784 if byte == b'}' || byte == b'"' || byte == b'\n' {
785 break;
786 }
787 if !byte.is_ascii_hexdigit() {
788 all_hex = false;
789 }
790 self.next();
791 }
792 let hex_end = self.current_offset;
793
794 let closed = !self.at_eof() && self.current_byte() == b'}';
795 if closed {
796 self.next();
797 }
798
799 let hex_len = hex_end - hex_start;
800 let total_len = self.current_offset - escape_start;
801
802 if !closed || !all_hex || hex_len == 0 || hex_len > 6 {
803 self.error_invalid_unicode_escape(escape_start, total_len);
804 return;
805 }
806
807 let codepoint = u32::from_str_radix(&self.input[hex_start..hex_end], 16)
808 .expect("hex digits validated above");
809 if char::from_u32(codepoint).is_none() {
810 self.error_unicode_escape_out_of_range(escape_start, total_len);
811 }
812 }
813
814 fn consume_octal_escape(&mut self, first_digit: u8) -> u16 {
816 let mut value: u16 = (first_digit - b'0') as u16;
817 for _ in 0..2 {
818 if self.at_eof() {
819 break;
820 }
821 match self.current_byte() {
822 d @ b'0'..=b'7' => {
823 value = value * 8 + (d - b'0') as u16;
824 self.next();
825 }
826 _ => break,
827 }
828 }
829 value
830 }
831
832 fn lex_string_literal(&mut self) -> Token<'source> {
833 let start_offset = self.current_offset;
834
835 self.next();
836
837 let mut escaped = false;
838 let mut terminated = false;
839
840 while !self.at_eof() && !terminated {
841 let byte = self.current_byte();
842 if escaped {
843 match byte {
844 b'0'..=b'7' => {
845 let escape_start = self.current_offset - 1;
846 self.next();
847 let value = self.consume_octal_escape(byte);
848 if value > 255 {
849 let escape_len = self.current_offset - escape_start;
850 self.error_octal_escape_out_of_range(escape_start, escape_len);
851 }
852 escaped = false;
853 continue;
854 }
855 b'u' => {
856 let escape_start = self.current_offset - 1;
857 self.next();
858 self.consume_unicode_escape(escape_start);
859 escaped = false;
860 continue;
861 }
862 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'"' | b'x' | b'U' => {
863 }
864 b'\'' => {}
865 _ => {
866 self.error_invalid_escape(self.current_char());
867 }
868 }
869 escaped = false;
870 } else if byte == b'\\' {
871 escaped = true;
872 } else if byte == b'"' {
873 terminated = true;
874 self.next();
875 break;
876 } else if byte == b'\n' {
877 break; }
879
880 self.next();
881 }
882
883 let end_offset = self.current_offset;
884 let length = end_offset - start_offset;
885
886 if escaped {
887 self.error_unterminated_escape(start_offset);
888 }
889
890 if !terminated {
891 self.error_unterminated_string(start_offset, length);
892 }
893
894 Token {
895 kind: TokenKind::String,
896 text: &self.input[start_offset..end_offset],
897 byte_offset: start_offset as u32,
898 byte_length: length as u32,
899 }
900 }
901
902 fn lex_raw_string_literal(&mut self) -> Token<'source> {
903 let start_offset = self.current_offset;
904 self.next(); self.next(); let mut terminated = false;
908 while !self.at_eof() {
909 let byte = self.current_byte();
910 if byte == b'"' {
911 terminated = true;
912 self.next();
913 break;
914 } else if byte == b'\n' {
915 break;
916 } else if byte == 0 {
917 self.error_disallowed_byte_in_raw_string(self.current_offset, byte);
918 self.next();
919 continue;
920 }
921 self.next();
922 }
923
924 let end_offset = self.current_offset;
925 let length = end_offset - start_offset;
926
927 if !terminated {
928 self.error_unterminated_raw_string(start_offset, length);
929 }
930
931 Token {
932 kind: TokenKind::RawString,
933 text: &self.input[start_offset..end_offset],
934 byte_offset: start_offset as u32,
935 byte_length: length as u32,
936 }
937 }
938
939 fn try_consume_unsupported_raw_variant(&mut self, end: usize) -> bool {
940 let raw_format_prefix = if self.current_byte() == b'r'
941 && self.peek_byte() == b'f'
942 && self.peek_byte_at(2) == b'"'
943 {
944 Some("rf")
945 } else if self.current_byte() == b'f'
946 && self.peek_byte() == b'r'
947 && self.peek_byte_at(2) == b'"'
948 {
949 Some("fr")
950 } else {
951 None
952 };
953 if let Some(prefix) = raw_format_prefix {
954 let start = self.current_offset;
955 self.skip(3);
956 while self.current_offset < end
957 && self.current_byte() != b'"'
958 && self.current_byte() != b'\n'
959 {
960 self.next();
961 }
962 if self.current_offset < end && self.current_byte() == b'"' {
963 self.next();
964 }
965 let length = self.current_offset - start;
966 self.error_unsupported_raw_format_string(start, length, prefix);
967 return true;
968 }
969
970 if self.current_byte() == b'r' && self.peek_byte() == b'#' {
971 let mut hash_count = 0usize;
972 let mut probe = self.current_offset + 1;
973 while probe < self.input_bytes.len() && self.input_bytes[probe] == b'#' {
974 hash_count += 1;
975 probe += 1;
976 }
977 if hash_count > 0 && probe < self.input_bytes.len() && self.input_bytes[probe] == b'"' {
978 let start = self.current_offset;
979 self.skip(1 + hash_count + 1);
980 loop {
981 if self.current_offset >= end || self.current_byte() == b'\n' {
982 break;
983 }
984 if self.current_byte() == b'"' {
985 let mut closer_matches = true;
986 for i in 1..=hash_count {
987 if self.peek_byte_at(i) != b'#' {
988 closer_matches = false;
989 break;
990 }
991 }
992 if closer_matches {
993 self.skip(1 + hash_count);
994 break;
995 }
996 }
997 self.next();
998 }
999 let length = self.current_offset - start;
1000 self.error_unsupported_hash_delimited_raw_string(start, length);
1001 return true;
1002 }
1003 }
1004
1005 false
1006 }
1007
1008 fn push_format_string_text_if_needed(
1009 &self,
1010 tokens: &mut Vec<Token<'source>>,
1011 text_segment_start: usize,
1012 ) {
1013 if text_segment_start < self.current_offset {
1014 tokens.push(Token {
1015 kind: TokenKind::FormatStringText,
1016 text: &self.input[text_segment_start..self.current_offset],
1017 byte_offset: text_segment_start as u32,
1018 byte_length: (self.current_offset - text_segment_start) as u32,
1019 });
1020 }
1021 }
1022
1023 fn lex_format_string_interpolation(
1024 &mut self,
1025 tokens: &mut Vec<Token<'source>>,
1026 ) -> Result<(), ()> {
1027 let interp_start = self.current_offset;
1028 self.next();
1029
1030 tokens.push(Token {
1031 kind: TokenKind::FormatStringInterpolationStart,
1032 text: &self.input[interp_start..self.current_offset],
1033 byte_offset: interp_start as u32,
1034 byte_length: (self.current_offset - interp_start) as u32,
1035 });
1036
1037 let Some(interpolation_end) = self.find_interpolation_boundary() else {
1038 if self.has_newline_between(interp_start, self.input.len()) {
1039 self.error_multiline_format_string_interpolation(interp_start);
1040 } else {
1041 self.error_unclosed_brace_in_format_string(interp_start);
1042 }
1043 self.skip_to_format_string_end();
1044 return Err(());
1045 };
1046
1047 if self.has_newline_between(interp_start, interpolation_end) {
1048 self.error_multiline_format_string_interpolation(interp_start);
1049 }
1050
1051 while self.current_offset < interpolation_end {
1052 self.skip_horizontal_whitespace();
1053 if self.current_offset >= interpolation_end {
1054 break;
1055 }
1056
1057 if self.try_consume_unsupported_raw_variant(interpolation_end) {
1058 continue;
1059 }
1060
1061 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
1062 let mut fstring_tokens = self.lex_format_string_tokens();
1063 tokens.append(&mut fstring_tokens);
1064 } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
1065 self.error_escaped_quote_in_interpolation(self.current_offset);
1066 self.skip(2);
1067 } else if self.current_byte() == b'r' && self.peek_byte() == b'"' {
1068 self.error_raw_string_in_interpolation(self.current_offset);
1069 self.skip(2);
1070 while self.current_offset < interpolation_end
1071 && self.current_byte() != b'"'
1072 && self.current_byte() != b'\n'
1073 {
1074 self.next();
1075 }
1076 if self.current_offset < interpolation_end && self.current_byte() == b'"' {
1077 self.next();
1078 }
1079 } else {
1080 let token = self.create_token();
1081 tokens.push(token);
1082 }
1083 }
1084
1085 let close_offset = self.current_offset;
1086 self.next();
1087 tokens.push(Token {
1088 kind: TokenKind::FormatStringInterpolationEnd,
1089 text: &self.input[close_offset..self.current_offset],
1090 byte_offset: close_offset as u32,
1091 byte_length: (self.current_offset - close_offset) as u32,
1092 });
1093
1094 Ok(())
1095 }
1096
1097 fn scan_interpolation(&self, start: usize) -> Option<usize> {
1098 let bytes = self.input.as_bytes();
1099 let mut p = start;
1100 let mut depth = 1;
1101
1102 while p < bytes.len() && depth > 0 {
1103 match bytes[p] {
1104 b'{' => {
1105 depth += 1;
1106 p += 1;
1107 }
1108 b'}' => {
1109 depth -= 1;
1110 if depth > 0 {
1111 p += 1;
1112 }
1113 }
1114 b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
1115 b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
1116 p = self.scan_past_fstring(p)?;
1117 }
1118 b'\\' => p += 2,
1119 b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
1120 b'\n' => return None,
1121 _ => p += 1,
1122 }
1123 }
1124
1125 (depth == 0).then_some(p)
1126 }
1127
1128 fn find_interpolation_boundary(&self) -> Option<usize> {
1129 self.scan_interpolation(self.current_offset)
1130 }
1131
1132 fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
1133 let bytes = self.input.as_bytes();
1134 let mut p = start + 1;
1135 while p < bytes.len() {
1136 match bytes[p] {
1137 b'\\' if delimiter != b'`' => p += 2,
1138 b'\n' => return None,
1139 b if b == delimiter => return Some(p + 1),
1140 _ => p += 1,
1141 }
1142 }
1143 None
1144 }
1145
1146 fn scan_past_fstring(&self, position: usize) -> Option<usize> {
1147 let bytes = self.input.as_bytes();
1148 let mut p = position + 2; while p < bytes.len() {
1150 match bytes[p] {
1151 b'\\' => p += 2,
1152 b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
1153 b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
1154 b'{' => {
1155 p = self.scan_interpolation(p + 1)?;
1156 p += 1;
1157 }
1158 b'"' => return Some(p + 1),
1159 b'\n' => return None,
1160 _ => p += 1,
1161 }
1162 }
1163 None
1164 }
1165
1166 fn skip_to_format_string_end(&mut self) {
1167 while !self.at_eof() {
1168 match self.current_byte() {
1169 b'"' => {
1170 self.next();
1171 return;
1172 }
1173 b'\n' => return,
1174 _ => self.next(),
1175 }
1176 }
1177 }
1178
1179 fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
1180 let start_offset = self.current_offset;
1181 let mut tokens = Vec::new();
1182
1183 self.skip(2);
1184
1185 let fstring_start_end = self.current_offset;
1186 tokens.push(Token {
1187 kind: TokenKind::FormatStringStart,
1188 text: &self.input[start_offset..fstring_start_end],
1189 byte_offset: start_offset as u32,
1190 byte_length: (fstring_start_end - start_offset) as u32,
1191 });
1192
1193 let mut text_segment_start = self.current_offset;
1194
1195 while !self.at_eof() {
1196 let byte = self.current_byte();
1197
1198 match byte {
1199 b'\\' if !self.at_eof() => {
1200 let escape_start = self.current_offset;
1201 self.next();
1202 if !self.at_eof() {
1203 let b = self.current_byte();
1204 self.next();
1205 if matches!(b, b'0'..=b'7') {
1206 let value = self.consume_octal_escape(b);
1207 if value > 255 {
1208 let escape_len = self.current_offset - escape_start;
1209 self.error_octal_escape_out_of_range(escape_start, escape_len);
1210 }
1211 } else if b == b'u' {
1212 self.consume_unicode_escape(escape_start);
1213 }
1214 }
1215 }
1216 b'{' if self.peek_byte() == b'{' => {
1217 self.skip(2);
1218 }
1219 b'}' if self.peek_byte() == b'}' => {
1220 self.skip(2);
1221 }
1222 b'"' => {
1223 self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
1224
1225 let end_offset = self.current_offset;
1226 self.next();
1227
1228 tokens.push(Token {
1229 kind: TokenKind::FormatStringEnd,
1230 text: &self.input[end_offset..self.current_offset],
1231 byte_offset: end_offset as u32,
1232 byte_length: (self.current_offset - end_offset) as u32,
1233 });
1234 return tokens;
1235 }
1236
1237 b'\n' => {
1238 let length = self.current_offset.saturating_sub(start_offset);
1239 self.error_unterminated_format_string(start_offset, length);
1240 return tokens;
1241 }
1242
1243 b'{' => {
1244 self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
1245
1246 if self.lex_format_string_interpolation(&mut tokens).is_err() {
1247 return tokens;
1248 }
1249 text_segment_start = self.current_offset;
1250 }
1251 b'}' => {
1252 self.error_unmatched_brace_in_format_string(self.current_offset);
1253 self.next();
1254 }
1255 _ => {
1256 self.next();
1257 }
1258 }
1259 }
1260
1261 let length = self.current_offset.saturating_sub(start_offset);
1262 self.error_unterminated_format_string(start_offset, length);
1263 tokens
1264 }
1265
1266 fn lex_char(&mut self) -> Token<'source> {
1267 let start_offset = self.current_offset;
1268
1269 self.next();
1270
1271 if self.at_eof() || self.current_byte() == b'\'' {
1272 self.error_empty_rune_literal(start_offset);
1273 let end_offset = self.current_offset;
1274 return Token {
1275 kind: TokenKind::Char,
1276 text: &self.input[start_offset..end_offset],
1277 byte_offset: start_offset as u32,
1278 byte_length: (end_offset - start_offset) as u32,
1279 };
1280 }
1281
1282 if self.current_byte() != b'\\' {
1283 self.next();
1284 } else {
1285 self.next();
1286
1287 if self.at_eof() {
1288 self.error_unterminated_escape(start_offset);
1289 let end_offset = self.current_offset;
1290 return Token {
1291 kind: TokenKind::Char,
1292 text: &self.input[start_offset..end_offset],
1293 byte_offset: start_offset as u32,
1294 byte_length: (end_offset - start_offset) as u32,
1295 };
1296 }
1297
1298 match self.current_byte() {
1299 b'0'..=b'7' => {
1300 let escape_start = self.current_offset - 1;
1301 let first = self.current_byte();
1302 self.next();
1303 let value = self.consume_octal_escape(first);
1304 if value > 255 {
1305 let escape_len = self.current_offset - escape_start;
1306 self.error_octal_escape_out_of_range(escape_start, escape_len);
1307 }
1308 }
1309 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'\'' | b'x' => {
1310 self.next();
1311 }
1312 _ => {
1313 self.error_invalid_escape(self.current_char());
1314
1315 while !self.at_eof() && self.current_byte() != b'\'' {
1316 self.next();
1317 }
1318
1319 if !self.at_eof() && self.current_byte() == b'\'' {
1320 self.next();
1321 }
1322
1323 let end_offset = self.current_offset;
1324 return Token {
1325 kind: TokenKind::Char,
1326 text: &self.input[start_offset..end_offset],
1327 byte_offset: start_offset as u32,
1328 byte_length: (end_offset - start_offset) as u32,
1329 };
1330 }
1331 }
1332 }
1333
1334 if self.at_eof() || self.current_byte() != b'\'' {
1335 let length = self.current_offset - start_offset;
1336 self.error_unterminated_rune(start_offset, length);
1337 }
1338
1339 if !self.at_eof() && self.current_byte() == b'\'' {
1340 self.next();
1341 }
1342
1343 let end_offset = self.current_offset;
1344 Token {
1345 kind: TokenKind::Char,
1346 text: &self.input[start_offset..end_offset],
1347 byte_offset: start_offset as u32,
1348 byte_length: (end_offset - start_offset) as u32,
1349 }
1350 }
1351
1352 fn lex_slash(&mut self) -> Token<'source> {
1353 let start_offset = self.current_offset;
1354
1355 if self.peek_byte() != b'/' {
1356 self.next();
1357 return Token {
1358 kind: TokenKind::Slash,
1359 text: &self.input[start_offset..self.current_offset],
1360 byte_offset: start_offset as u32,
1361 byte_length: 1,
1362 };
1363 }
1364
1365 let slash_count = self.count_consecutive(b'/');
1366
1367 if slash_count >= 4 {
1368 self.error_excess_slashes_in_comment(start_offset, slash_count);
1369 }
1370
1371 self.skip(slash_count);
1372
1373 if slash_count == 3 {
1374 if self.current_byte() == b' ' {
1375 self.next();
1376 }
1377 let text_start = self.current_offset;
1378 self.skip_to_eol();
1379 let end_offset = self.current_offset;
1380
1381 self.trivia
1382 .doc_comments
1383 .push((start_offset as u32, end_offset as u32));
1384
1385 return Token {
1386 kind: TokenKind::DocComment,
1387 text: &self.input[text_start..end_offset],
1388 byte_offset: start_offset as u32,
1389 byte_length: (end_offset - start_offset) as u32,
1390 };
1391 }
1392
1393 self.skip_to_eol();
1394 let end_offset = self.current_offset;
1395
1396 self.trivia
1397 .comments
1398 .push((start_offset as u32, end_offset as u32));
1399
1400 Token {
1401 kind: TokenKind::Comment,
1402 text: &self.input[start_offset..end_offset],
1403 byte_offset: start_offset as u32,
1404 byte_length: (end_offset - start_offset) as u32,
1405 }
1406 }
1407
1408 fn count_consecutive(&self, byte: u8) -> usize {
1409 let mut count = 0;
1410 let mut offset = self.current_offset;
1411 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1412 count += 1;
1413 offset += 1;
1414 }
1415 count
1416 }
1417
1418 fn skip_to_eol(&mut self) {
1419 while !self.at_eof() && self.current_byte() != b'\n' {
1420 self.next();
1421 }
1422 }
1423
1424 fn lex_directive(&mut self) -> Token<'source> {
1425 let start_offset = self.current_offset;
1426
1427 self.next();
1428
1429 while !self.at_eof() {
1430 let byte = self.current_byte();
1431 if byte.is_ascii_alphanumeric() || byte == b'_' {
1432 self.next();
1433 } else {
1434 break;
1435 }
1436 }
1437
1438 let end_offset = self.current_offset;
1439 Token {
1440 kind: TokenKind::Directive,
1441 text: &self.input[start_offset..end_offset],
1442 byte_offset: start_offset as u32,
1443 byte_length: (end_offset - start_offset) as u32,
1444 }
1445 }
1446
1447 fn handle_unexpected_char(&mut self) -> Token<'source> {
1448 let start_offset = self.current_offset;
1449
1450 self.error_unexpected_char(self.current_offset, self.current_char());
1451
1452 self.resync_on_error();
1453
1454 let end_offset = self.current_offset;
1455
1456 Token {
1457 kind: TokenKind::Error,
1458 text: &self.input[start_offset..end_offset],
1459 byte_offset: start_offset as u32,
1460 byte_length: (end_offset - start_offset) as u32,
1461 }
1462 }
1463
1464 fn eof_token(&self) -> Token<'source> {
1465 Token {
1466 kind: TokenKind::EOF,
1467 text: &self.input[self.current_offset..self.current_offset],
1468 byte_offset: self.current_offset as u32,
1469 byte_length: 0,
1470 }
1471 }
1472
1473 fn semicolon_token(&mut self) -> Token<'source> {
1474 let start_offset = self.current_offset;
1475
1476 self.next();
1477
1478 Token {
1479 kind: TokenKind::Semicolon,
1480 text: &self.input[start_offset..self.current_offset],
1481 byte_offset: start_offset as u32,
1482 byte_length: (self.current_offset - start_offset) as u32,
1483 }
1484 }
1485}