1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
/// Lexer state over a single borrowed source string.
///
/// Built with [`Lexer::new`] and consumed by [`Lexer::lex`], which returns the
/// token stream together with the collected errors and trivia.
pub struct Lexer<'source> {
    /// Source text being lexed; token `text` slices borrow from it.
    input: &'source str,
    /// Byte view of `input` (`input.as_bytes()`), used for single-byte lookups.
    input_bytes: &'source [u8],
    /// Byte offset of the cursor into `input`.
    current_offset: usize,
    /// File identifier supplied by the caller of `new`; not read by the code visible here.
    file_id: u32,
    /// Diagnostics returned in `LexResult::errors`; presumably appended to by the
    /// `error_*` helpers (defined outside this view — confirm).
    errors: Vec<ParseError>,
    /// Format-string tokens queued in reverse order; `lex` pops them back into source order.
    pending_tokens: Vec<Token<'source>>,
    /// Trivia gathered while lexing; `record_newline` pushes blank-line offsets into it.
    trivia: Trivia,
    /// Offset of the most recent newline seen by `record_newline`, if any.
    last_newline_offset: Option<usize>,
}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
35 pub fn lex(mut self) -> LexResult<'source> {
36 let mut tokens = Vec::new();
37
38 loop {
39 if let Some(token) = self.pending_tokens.pop() {
40 tokens.push(token);
41 continue;
42 }
43
44 self.skip_whitespace();
45
46 if self.at_eof() {
47 tokens.push(self.eof_token());
48 break;
49 }
50
51 if self.try_consume_unsupported_raw_variant(self.input.len()) {
52 continue;
53 }
54
55 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
56 let mut fstring_tokens = self.lex_format_string_tokens();
57 fstring_tokens.reverse();
58 self.pending_tokens = fstring_tokens;
59 continue;
60 }
61
62 let token = self.create_token();
63 tokens.push(token);
64 }
65
66 let tokens = self.insert_semicolons(tokens);
67
68 LexResult {
69 tokens,
70 errors: self.errors,
71 trivia: self.trivia,
72 }
73 }
74
75 fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
76 let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);
77
78 for i in 0..tokens.len() {
79 let token = tokens[i];
80 result.push(token);
81
82 if !Self::triggers_asi(token.kind) {
83 continue;
84 }
85
86 if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
87 if Self::continues_expression(next_token.kind) {
88 continue;
89 }
90
91 let token_end = (token.byte_offset + token.byte_length) as usize;
92 if self.has_newline_between(token_end, next_token.byte_offset as usize) {
93 result.push(self.make_synthetic_semicolon(token_end));
94 }
95 }
96 }
97
98 result
99 }
100
101 fn triggers_asi(kind: TokenKind) -> bool {
102 matches!(
103 kind,
104 TokenKind::Identifier
105 | TokenKind::Integer
106 | TokenKind::Imaginary
107 | TokenKind::Float
108 | TokenKind::String
109 | TokenKind::RawString
110 | TokenKind::Char
111 | TokenKind::Boolean
112 | TokenKind::RightParen
113 | TokenKind::RightSquareBracket
114 | TokenKind::RightCurlyBrace
115 | TokenKind::Break
116 | TokenKind::Continue
117 | TokenKind::Return
118 | TokenKind::DotDot
119 | TokenKind::DotDotEqual
120 | TokenKind::QuestionMark
121 )
122 }
123
124 fn continues_expression(kind: TokenKind) -> bool {
125 matches!(
126 kind,
127 TokenKind::Plus
128 | TokenKind::Star
129 | TokenKind::Slash
130 | TokenKind::Percent
131 | TokenKind::Pipeline
132 | TokenKind::AmpersandDouble
133 | TokenKind::PipeDouble
134 | TokenKind::EqualDouble
135 | TokenKind::NotEqual
136 | TokenKind::LeftAngleBracket
137 | TokenKind::RightAngleBracket
138 | TokenKind::LessThanOrEqual
139 | TokenKind::GreaterThanOrEqual
140 | TokenKind::Dot
141 | TokenKind::Equal
142 | TokenKind::PlusEqual
143 | TokenKind::MinusEqual
144 | TokenKind::StarEqual
145 | TokenKind::SlashEqual
146 | TokenKind::Else
147 | TokenKind::LeftCurlyBrace
148 | TokenKind::RightCurlyBrace
149 | TokenKind::RightParen
150 | TokenKind::RightSquareBracket
151 | TokenKind::As
152 )
153 }
154
155 fn find_next_non_comment_token<'a>(
156 &self,
157 tokens: &'a [Token<'source>],
158 start_index: usize,
159 ) -> Option<&'a Token<'source>> {
160 tokens
161 .iter()
162 .skip(start_index)
163 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
164 }
165
166 fn has_newline_between(&self, start: usize, end: usize) -> bool {
167 self.input[start..end].contains('\n')
168 }
169
170 fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
171 Token {
172 kind: TokenKind::Semicolon,
173 text: "",
174 byte_offset: position as u32,
175 byte_length: 0,
176 }
177 }
178
179 fn create_token(&mut self) -> Token<'source> {
180 if let Some(token) = self.lex_lookahead_symbol() {
181 return token;
182 }
183
184 let c = self.current_char();
185 match c {
186 '0'..='9' => self.lex_number(),
187 'r' if self.peek_byte() == b'"' => self.lex_raw_string_literal(),
188 _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
189 '"' => self.lex_string_literal(),
190 '`' => self.lex_backtick_literal(),
191 '\'' => self.lex_char(),
192 '/' => self.lex_slash(),
193 ';' => self.semicolon_token(),
194 '@' => self.lex_directive(),
195 _ => self.handle_unexpected_char(),
196 }
197 }
198
199 #[inline]
200 fn current_byte(&self) -> u8 {
201 if self.current_offset < self.input_bytes.len() {
202 self.input_bytes[self.current_offset]
203 } else {
204 0
205 }
206 }
207
208 #[inline]
209 fn current_char(&self) -> char {
210 self.input[self.current_offset..]
211 .chars()
212 .next()
213 .unwrap_or('\0')
214 }
215
216 #[inline]
217 fn peek_byte(&self) -> u8 {
218 if self.current_offset + 1 < self.input_bytes.len() {
219 self.input_bytes[self.current_offset + 1]
220 } else {
221 0
222 }
223 }
224
225 #[inline]
226 fn peek_byte_at(&self, n: usize) -> u8 {
227 let offset = self.current_offset + n;
228 if offset < self.input_bytes.len() {
229 self.input_bytes[offset]
230 } else {
231 0
232 }
233 }
234
235 #[inline]
236 fn peek_char(&self) -> char {
237 let next_offset = if self.current_byte() < 128 {
238 self.current_offset + 1
239 } else {
240 self.current_offset + self.current_char().len_utf8()
241 };
242 self.input[next_offset..].chars().next().unwrap_or('\0')
243 }
244
245 fn peek_char_n(&self, n: usize) -> char {
246 let mut offset = self.current_offset;
247 for _ in 0..n {
248 if offset >= self.input.len() {
249 return '\0';
250 }
251 let c = self.input[offset..].chars().next().unwrap_or('\0');
252 offset += c.len_utf8();
253 }
254 self.input[offset..].chars().next().unwrap_or('\0')
255 }
256
257 fn next(&mut self) {
258 if self.at_eof() {
259 return;
260 }
261 if self.current_byte() < 128 {
262 self.current_offset += 1;
263 } else {
264 self.current_offset += self.current_char().len_utf8();
265 }
266 }
267
268 fn skip(&mut self, count: usize) {
269 for _ in 0..count {
270 self.next();
271 }
272 }
273
274 fn skip_whitespace(&mut self) {
275 while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
276 if self.current_byte() == b'\n' {
277 self.record_newline();
278 }
279 self.next();
280 }
281 }
282
283 fn skip_horizontal_whitespace(&mut self) {
284 while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
285 self.next();
286 }
287 }
288
289 fn record_newline(&mut self) {
290 let offset = self.current_offset;
291
292 if let Some(last) = self.last_newline_offset {
293 let between = &self.input[last + 1..offset];
294 let is_blank = between.is_empty()
295 || between
296 .chars()
297 .all(|c| c.is_ascii_whitespace() && c != '\n');
298 if is_blank {
299 self.trivia.blank_lines.push(offset as u32);
300 }
301 }
302
303 self.last_newline_offset = Some(offset);
304 }
305
306 fn at_eof(&self) -> bool {
307 self.current_offset >= self.input.len()
308 }
309
310 fn previous_char(&self) -> char {
311 if self.current_offset == 0 {
312 return '\0';
313 }
314 self.input[..self.current_offset]
315 .chars()
316 .next_back()
317 .unwrap_or('\0')
318 }
319
320 fn resync_on_error(&mut self) {
321 while !self.at_eof() {
322 let byte = self.current_byte();
323
324 if byte == b';' || byte == b'}' {
325 break;
326 }
327
328 self.next();
329 }
330 }
331
332 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
334 let start_offset = self.current_offset;
335 let current_char = self.current_char();
336 let next_char = self.peek_char();
337 let third_char = self.peek_char_n(2);
338
339 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
340 self.skip(3);
341 let end_offset = self.current_offset;
342 return Some(Token {
343 kind,
344 text: &self.input[start_offset..end_offset],
345 byte_offset: start_offset as u32,
346 byte_length: (end_offset - start_offset) as u32,
347 });
348 }
349
350 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
351 self.skip(2);
352 let end_offset = self.current_offset;
353 return Some(Token {
354 kind,
355 text: &self.input[start_offset..end_offset],
356 byte_offset: start_offset as u32,
357 byte_length: (end_offset - start_offset) as u32,
358 });
359 }
360
361 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
362 self.next();
363 let end_offset = self.current_offset;
364 return Some(Token {
365 kind,
366 text: &self.input[start_offset..end_offset],
367 byte_offset: start_offset as u32,
368 byte_length: (end_offset - start_offset) as u32,
369 });
370 }
371
372 None
373 }
374
375 fn lex_number(&mut self) -> Token<'source> {
376 let start_offset = self.current_offset;
377
378 if self.current_byte() == b'0' {
379 let next = self.peek_byte();
380 match next {
381 b'x' | b'X' => {
382 self.next(); self.next(); return self.lex_hex_number(start_offset);
385 }
386 b'o' | b'O' => {
387 self.next(); self.next(); return self.lex_octal_number(start_offset);
390 }
391 b'b' | b'B' => {
392 self.next(); self.next(); return self.lex_binary_number(start_offset);
395 }
396 b'0'..=b'7' => {
397 return self.lex_legacy_octal_number(start_offset);
398 }
399 _ => {} }
401 }
402
403 let mut kind = TokenKind::Integer;
404
405 while !self.at_eof() {
406 let byte = self.current_byte();
407 if byte.is_ascii_digit() || byte == b'_' {
408 if byte == b'_' && self.previous_char() == '_' {
409 let underscore_start = self.current_offset - 1;
410 self.error_consecutive_underscores(underscore_start);
411 }
412 self.next();
413 } else {
414 break;
415 }
416 }
417
418 if self.previous_char() == '_' {
419 self.error_number_trailing_underscore(
420 self.current_offset - self.previous_char().len_utf8(),
421 );
422 }
423
424 let preceded_by_dot = start_offset > 0
427 && self.input_bytes[start_offset - 1] == b'.'
428 && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');
429
430 if !preceded_by_dot
431 && self.current_byte() == b'.'
432 && self.peek_byte() != b'.'
433 && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
434 {
435 kind = TokenKind::Float;
436 self.next();
437
438 if self.current_byte() == b'_' {
439 self.error_decimal_leading_underscore(self.current_offset);
440 }
441
442 while !self.at_eof() {
443 let byte = self.current_byte();
444 if byte.is_ascii_digit() || byte == b'_' {
445 if byte == b'_' && self.previous_char() == '_' {
446 let underscore_start = self.current_offset - 1;
447 self.error_consecutive_underscores(underscore_start);
448 }
449 self.next();
450 } else {
451 break;
452 }
453 }
454
455 if self.previous_char() == '_' {
456 self.error_number_trailing_underscore(
457 self.current_offset - self.previous_char().len_utf8(),
458 );
459 }
460 }
461
462 if self.current_byte() == b'e' || self.current_byte() == b'E' {
463 kind = TokenKind::Float;
464 let exponent_start = self.current_offset;
465 self.next(); if self.current_byte() == b'+' || self.current_byte() == b'-' {
468 self.next();
469 }
470
471 if !self.current_byte().is_ascii_digit() {
472 self.error_missing_exponent_digits(
473 exponent_start,
474 self.current_offset - exponent_start,
475 );
476 }
477
478 while !self.at_eof() {
479 let byte = self.current_byte();
480 if byte.is_ascii_digit() || byte == b'_' {
481 if byte == b'_' && self.previous_char() == '_' {
482 let underscore_start = self.current_offset - 1;
483 self.error_consecutive_underscores(underscore_start);
484 }
485 self.next();
486 } else {
487 break;
488 }
489 }
490
491 if self.previous_char() == '_' {
492 self.error_number_trailing_underscore(
493 self.current_offset - self.previous_char().len_utf8(),
494 );
495 }
496 }
497
498 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
499 self.next(); let end_offset = self.current_offset;
501 return Token {
502 kind: TokenKind::Imaginary,
503 text: &self.input[start_offset..end_offset],
504 byte_offset: start_offset as u32,
505 byte_length: (end_offset - start_offset) as u32,
506 };
507 }
508
509 let end_offset = self.current_offset;
510 Token {
511 kind,
512 text: &self.input[start_offset..end_offset],
513 byte_offset: start_offset as u32,
514 byte_length: (end_offset - start_offset) as u32,
515 }
516 }
517
518 fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
519 let digits_start = self.current_offset;
520
521 while !self.at_eof() {
522 let byte = self.current_byte();
523 if byte.is_ascii_hexdigit() || byte == b'_' {
524 if byte == b'_' && self.previous_char() == '_' {
525 let underscore_start = self.current_offset - 1;
526 self.error_consecutive_underscores(underscore_start);
527 }
528 self.next();
529 } else {
530 break;
531 }
532 }
533
534 if self.current_offset == digits_start {
535 self.error_missing_hex_digits(start_offset, 2);
536 }
537
538 if self.previous_char() == '_' {
539 self.error_number_trailing_underscore(
540 self.current_offset - self.previous_char().len_utf8(),
541 );
542 }
543
544 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
545 self.next(); let end_offset = self.current_offset;
547 self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
548 return Token {
549 kind: TokenKind::Imaginary,
550 text: &self.input[start_offset..end_offset],
551 byte_offset: start_offset as u32,
552 byte_length: (end_offset - start_offset) as u32,
553 };
554 }
555
556 let end_offset = self.current_offset;
557 Token {
558 kind: TokenKind::Integer,
559 text: &self.input[start_offset..end_offset],
560 byte_offset: start_offset as u32,
561 byte_length: (end_offset - start_offset) as u32,
562 }
563 }
564
565 fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
566 let digits_start = self.current_offset;
567
568 while !self.at_eof() {
569 let byte = self.current_byte();
570 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
571 if byte == b'_' && self.previous_char() == '_' {
572 let underscore_start = self.current_offset - 1;
573 self.error_consecutive_underscores(underscore_start);
574 }
575 self.next();
576 } else if byte == b'8' || byte == b'9' {
577 self.error_invalid_octal_digit(self.current_offset);
578 self.next();
579 } else {
580 break;
581 }
582 }
583
584 if self.current_offset == digits_start {
585 self.error_missing_octal_digits(start_offset, 2);
586 }
587
588 if self.previous_char() == '_' {
589 self.error_number_trailing_underscore(
590 self.current_offset - self.previous_char().len_utf8(),
591 );
592 }
593
594 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
595 self.next(); let end_offset = self.current_offset;
597 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
598 return Token {
599 kind: TokenKind::Imaginary,
600 text: &self.input[start_offset..end_offset],
601 byte_offset: start_offset as u32,
602 byte_length: (end_offset - start_offset) as u32,
603 };
604 }
605
606 let end_offset = self.current_offset;
607 Token {
608 kind: TokenKind::Integer,
609 text: &self.input[start_offset..end_offset],
610 byte_offset: start_offset as u32,
611 byte_length: (end_offset - start_offset) as u32,
612 }
613 }
614
615 fn lex_legacy_octal_number(&mut self, start_offset: usize) -> Token<'source> {
616 self.next();
617
618 while !self.at_eof() {
619 let byte = self.current_byte();
620 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
621 if byte == b'_' && self.previous_char() == '_' {
622 let underscore_start = self.current_offset - 1;
623 self.error_consecutive_underscores(underscore_start);
624 }
625 self.next();
626 } else if byte == b'8' || byte == b'9' {
627 self.error_invalid_octal_digit(self.current_offset);
628 self.next();
629 } else {
630 break;
631 }
632 }
633
634 if self.previous_char() == '_' {
635 self.error_number_trailing_underscore(
636 self.current_offset - self.previous_char().len_utf8(),
637 );
638 }
639
640 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
641 self.next();
642 let end_offset = self.current_offset;
643 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
644 return Token {
645 kind: TokenKind::Imaginary,
646 text: &self.input[start_offset..end_offset],
647 byte_offset: start_offset as u32,
648 byte_length: (end_offset - start_offset) as u32,
649 };
650 }
651
652 let end_offset = self.current_offset;
653 Token {
654 kind: TokenKind::Integer,
655 text: &self.input[start_offset..end_offset],
656 byte_offset: start_offset as u32,
657 byte_length: (end_offset - start_offset) as u32,
658 }
659 }
660
661 fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
662 let digits_start = self.current_offset;
663
664 while !self.at_eof() {
665 let byte = self.current_byte();
666 if byte == b'0' || byte == b'1' || byte == b'_' {
667 if byte == b'_' && self.previous_char() == '_' {
668 let underscore_start = self.current_offset - 1;
669 self.error_consecutive_underscores(underscore_start);
670 }
671 self.next();
672 } else if (b'2'..=b'9').contains(&byte) {
673 self.error_invalid_binary_digit(self.current_offset);
674 self.next();
675 } else {
676 break;
677 }
678 }
679
680 if self.current_offset == digits_start {
681 self.error_missing_binary_digits(start_offset, 2);
682 }
683
684 if self.previous_char() == '_' {
685 self.error_number_trailing_underscore(
686 self.current_offset - self.previous_char().len_utf8(),
687 );
688 }
689
690 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
691 self.next();
692 let end_offset = self.current_offset;
693 self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
694 return Token {
695 kind: TokenKind::Imaginary,
696 text: &self.input[start_offset..end_offset],
697 byte_offset: start_offset as u32,
698 byte_length: (end_offset - start_offset) as u32,
699 };
700 }
701
702 let end_offset = self.current_offset;
703 Token {
704 kind: TokenKind::Integer,
705 text: &self.input[start_offset..end_offset],
706 byte_offset: start_offset as u32,
707 byte_length: (end_offset - start_offset) as u32,
708 }
709 }
710
711 fn lex_identifier(&mut self) -> Token<'source> {
712 let start_offset = self.current_offset;
713
714 while !self.at_eof() {
715 let c = self.current_char();
716 if c.is_alphanumeric() || c == '_' {
717 self.next();
718 } else {
719 break;
720 }
721 }
722
723 let end_offset = self.current_offset;
724 let text = &self.input[start_offset..end_offset];
725
726 let kind = match text {
727 "true" | "false" => TokenKind::Boolean,
728 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
729 };
730
731 Token {
732 kind,
733 text,
734 byte_offset: start_offset as u32,
735 byte_length: (end_offset - start_offset) as u32,
736 }
737 }
738
739 fn lex_backtick_literal(&mut self) -> Token<'source> {
740 let start_offset = self.current_offset;
741
742 self.next();
743
744 let mut terminated = false;
745
746 while !self.at_eof() {
747 let byte = self.current_byte();
748 if byte == b'`' {
749 terminated = true;
750 self.next();
751 break;
752 } else if byte == b'\n' {
753 break;
754 }
755 self.next();
756 }
757
758 let end_offset = self.current_offset;
759 let length = end_offset - start_offset;
760
761 if !terminated {
762 self.error_unterminated_backtick(start_offset, length);
763 }
764
765 Token {
766 kind: TokenKind::Backtick,
767 text: &self.input[start_offset..end_offset],
768 byte_offset: start_offset as u32,
769 byte_length: length as u32,
770 }
771 }
772
773 fn consume_unicode_escape(&mut self, escape_start: usize) {
774 if self.at_eof() || self.current_byte() != b'{' {
775 self.error_invalid_unicode_escape(escape_start, self.current_offset - escape_start);
776 return;
777 }
778 self.next();
779
780 let hex_start = self.current_offset;
781 let mut all_hex = true;
782 while !self.at_eof() {
783 let byte = self.current_byte();
784 if byte == b'}' || byte == b'"' || byte == b'\n' {
785 break;
786 }
787 if !byte.is_ascii_hexdigit() {
788 all_hex = false;
789 }
790 self.next();
791 }
792 let hex_end = self.current_offset;
793
794 let closed = !self.at_eof() && self.current_byte() == b'}';
795 if closed {
796 self.next();
797 }
798
799 let hex_len = hex_end - hex_start;
800 let total_len = self.current_offset - escape_start;
801
802 if !closed || !all_hex || hex_len == 0 || hex_len > 6 {
803 self.error_invalid_unicode_escape(escape_start, total_len);
804 return;
805 }
806
807 let codepoint = u32::from_str_radix(&self.input[hex_start..hex_end], 16)
808 .expect("hex digits validated above");
809 if char::from_u32(codepoint).is_none() {
810 self.error_unicode_escape_out_of_range(escape_start, total_len);
811 }
812 }
813
814 fn consume_octal_escape(&mut self, first_digit: u8) -> u16 {
816 let mut value: u16 = (first_digit - b'0') as u16;
817 for _ in 0..2 {
818 if self.at_eof() {
819 break;
820 }
821 match self.current_byte() {
822 d @ b'0'..=b'7' => {
823 value = value * 8 + (d - b'0') as u16;
824 self.next();
825 }
826 _ => break,
827 }
828 }
829 value
830 }
831
832 fn lex_string_literal(&mut self) -> Token<'source> {
833 let start_offset = self.current_offset;
834
835 self.next();
836
837 let mut escaped = false;
838 let mut terminated = false;
839
840 while !self.at_eof() && !terminated {
841 let byte = self.current_byte();
842 if escaped {
843 match byte {
844 b'0'..=b'7' => {
845 let escape_start = self.current_offset - 1;
846 self.next();
847 let value = self.consume_octal_escape(byte);
848 if value > 255 {
849 let escape_len = self.current_offset - escape_start;
850 self.error_octal_escape_out_of_range(escape_start, escape_len);
851 }
852 escaped = false;
853 continue;
854 }
855 b'u' => {
856 let escape_start = self.current_offset - 1;
857 self.next();
858 self.consume_unicode_escape(escape_start);
859 escaped = false;
860 continue;
861 }
862 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'"' | b'x' | b'U' => {
863 }
864 b'\'' => {}
865 _ => {
866 self.error_invalid_escape(self.current_char());
867 }
868 }
869 escaped = false;
870 } else if byte == b'\\' {
871 escaped = true;
872 } else if byte == b'"' {
873 terminated = true;
874 self.next();
875 break;
876 }
877
878 self.next();
879 }
880
881 let end_offset = self.current_offset;
882 let length = end_offset - start_offset;
883
884 if escaped {
885 self.error_unterminated_escape(start_offset);
886 }
887
888 if !terminated {
889 self.error_unterminated_string(start_offset, 1);
890 }
891
892 Token {
893 kind: TokenKind::String,
894 text: &self.input[start_offset..end_offset],
895 byte_offset: start_offset as u32,
896 byte_length: length as u32,
897 }
898 }
899
900 fn lex_raw_string_literal(&mut self) -> Token<'source> {
901 let start_offset = self.current_offset;
902 self.next(); self.next(); let mut terminated = false;
906 while !self.at_eof() {
907 let byte = self.current_byte();
908 if byte == b'"' {
909 terminated = true;
910 self.next();
911 break;
912 } else if byte == 0 {
913 self.error_disallowed_byte_in_raw_string(self.current_offset, byte);
914 self.next();
915 continue;
916 }
917 self.next();
918 }
919
920 let end_offset = self.current_offset;
921 let length = end_offset - start_offset;
922
923 if !terminated {
924 self.error_unterminated_raw_string(start_offset, 2);
925 }
926
927 Token {
928 kind: TokenKind::RawString,
929 text: &self.input[start_offset..end_offset],
930 byte_offset: start_offset as u32,
931 byte_length: length as u32,
932 }
933 }
934
/// Detects and consumes string forms that are recognised but unsupported:
/// `rf"..."` / `fr"..."` raw format strings and `r#"..."#` hash-delimited
/// raw strings.
///
/// `end` is the byte offset the scan must not move past. Returns `true`
/// after reporting an error and consuming the literal (no token is
/// produced), or `false` with the cursor untouched when neither form starts
/// at the cursor.
fn try_consume_unsupported_raw_variant(&mut self, end: usize) -> bool {
    // `rf"` or `fr"` at the cursor selects the raw-format-string path.
    let raw_format_prefix = if self.current_byte() == b'r'
        && self.peek_byte() == b'f'
        && self.peek_byte_at(2) == b'"'
    {
        Some("rf")
    } else if self.current_byte() == b'f'
        && self.peek_byte() == b'r'
        && self.peek_byte_at(2) == b'"'
    {
        Some("fr")
    } else {
        None
    };
    if let Some(prefix) = raw_format_prefix {
        let start = self.current_offset;
        // Skip the two prefix letters and the opening quote.
        self.skip(3);
        // Consume up to the closing quote, a newline, or `end`.
        while self.current_offset < end
            && self.current_byte() != b'"'
            && self.current_byte() != b'\n'
        {
            self.next();
        }
        // Include the closing quote in the reported span when present.
        if self.current_offset < end && self.current_byte() == b'"' {
            self.next();
        }
        let length = self.current_offset - start;
        self.error_unsupported_raw_format_string(start, length, prefix);
        return true;
    }

    if self.current_byte() == b'r' && self.peek_byte() == b'#' {
        // Count the `#`s after `r` without moving the cursor.
        let mut hash_count = 0usize;
        let mut probe = self.current_offset + 1;
        while probe < self.input_bytes.len() && self.input_bytes[probe] == b'#' {
            hash_count += 1;
            probe += 1;
        }
        // Only `r#..#"` counts; `r#` followed by anything else is left alone.
        if hash_count > 0 && probe < self.input_bytes.len() && self.input_bytes[probe] == b'"' {
            let start = self.current_offset;
            // Skip `r`, the hashes, and the opening quote.
            self.skip(1 + hash_count + 1);
            loop {
                if self.current_offset >= end || self.current_byte() == b'\n' {
                    break;
                }
                if self.current_byte() == b'"' {
                    // A closer is `"` followed by `hash_count` hashes.
                    let mut closer_matches = true;
                    for i in 1..=hash_count {
                        if self.peek_byte_at(i) != b'#' {
                            closer_matches = false;
                            break;
                        }
                    }
                    if closer_matches {
                        self.skip(1 + hash_count);
                        break;
                    }
                }
                self.next();
            }
            let length = self.current_offset - start;
            self.error_unsupported_hash_delimited_raw_string(start, length);
            return true;
        }
    }

    false
}
1003
1004 fn push_format_string_text_if_needed(
1005 &self,
1006 tokens: &mut Vec<Token<'source>>,
1007 text_segment_start: usize,
1008 ) {
1009 if text_segment_start < self.current_offset {
1010 tokens.push(Token {
1011 kind: TokenKind::FormatStringText,
1012 text: &self.input[text_segment_start..self.current_offset],
1013 byte_offset: text_segment_start as u32,
1014 byte_length: (self.current_offset - text_segment_start) as u32,
1015 });
1016 }
1017 }
1018
/// Lexes one `{ ... }` interpolation inside a format string; the cursor is
/// on the opening `{`.
///
/// Appends `FormatStringInterpolationStart`, the tokens of the embedded
/// expression, and `FormatStringInterpolationEnd` to `tokens`.
///
/// Returns `Err(())` when no closing brace is found. In that case an error
/// is reported and the cursor is moved by `skip_to_format_string_end`.
fn lex_format_string_interpolation(
    &mut self,
    tokens: &mut Vec<Token<'source>>,
) -> Result<(), ()> {
    let interp_start = self.current_offset;
    // Consume the opening `{`.
    self.next();

    tokens.push(Token {
        kind: TokenKind::FormatStringInterpolationStart,
        text: &self.input[interp_start..self.current_offset],
        byte_offset: interp_start as u32,
        byte_length: (self.current_offset - interp_start) as u32,
    });

    // Find the matching `}` before lexing anything inside.
    let Some(interpolation_end) = self.find_interpolation_boundary() else {
        // The newline check spans the rest of the input, not just this line.
        if self.has_newline_between(interp_start, self.input.len()) {
            self.error_multiline_format_string_interpolation(interp_start);
        } else {
            self.error_unclosed_brace_in_format_string(interp_start);
        }
        self.skip_to_format_string_end();
        return Err(());
    };

    if self.has_newline_between(interp_start, interpolation_end) {
        self.error_multiline_format_string_interpolation(interp_start);
    }

    // Lex the embedded expression up to the closing brace.
    while self.current_offset < interpolation_end {
        self.skip_horizontal_whitespace();
        if self.current_offset >= interpolation_end {
            break;
        }

        if self.try_consume_unsupported_raw_variant(interpolation_end) {
            continue;
        }

        if self.current_byte() == b'f' && self.peek_byte() == b'"' {
            // Nested format string: its tokens are appended inline.
            let mut fstring_tokens = self.lex_format_string_tokens();
            tokens.append(&mut fstring_tokens);
        } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
            // `\"` inside an interpolation is reported and skipped.
            self.error_escaped_quote_in_interpolation(self.current_offset);
            self.skip(2);
        } else if self.current_byte() == b'r' && self.peek_byte() == b'"' {
            // Raw strings inside an interpolation are reported and skipped
            // through the closing quote, a newline, or the boundary.
            self.error_raw_string_in_interpolation(self.current_offset);
            self.skip(2);
            while self.current_offset < interpolation_end
                && self.current_byte() != b'"'
                && self.current_byte() != b'\n'
            {
                self.next();
            }
            if self.current_offset < interpolation_end && self.current_byte() == b'"' {
                self.next();
            }
        } else {
            let token = self.create_token();
            tokens.push(token);
        }
    }

    // The cursor is expected to be on the closing `}` here.
    let close_offset = self.current_offset;
    self.next();
    tokens.push(Token {
        kind: TokenKind::FormatStringInterpolationEnd,
        text: &self.input[close_offset..self.current_offset],
        byte_offset: close_offset as u32,
        byte_length: (self.current_offset - close_offset) as u32,
    });

    Ok(())
}
1092
/// Scans from `start` (just inside an opening `{`) for the matching `}`
/// without moving the cursor.
///
/// Returns the byte offset of the matching closing brace. Returns `None` if
/// the input ends first, if a newline or `//` is met outside a quoted
/// section, or if a nested quoted section or format string fails to scan.
fn scan_interpolation(&self, start: usize) -> Option<usize> {
    let bytes = self.input.as_bytes();
    let mut p = start;
    // Depth 1 accounts for the already-consumed opening brace.
    let mut depth = 1;

    while p < bytes.len() && depth > 0 {
        match bytes[p] {
            b'{' => {
                depth += 1;
                p += 1;
            }
            b'}' => {
                depth -= 1;
                // Stay on the final `}` so `p` is its offset on exit.
                if depth > 0 {
                    p += 1;
                }
            }
            // Quoted sections are skipped whole so braces inside them are ignored.
            b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
            b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
                p = self.scan_past_fstring(p)?;
            }
            // A backslash skips the byte that follows it.
            b'\\' => p += 2,
            b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
            b'\n' => return None,
            _ => p += 1,
        }
    }

    (depth == 0).then_some(p)
}
1123
1124 fn find_interpolation_boundary(&self) -> Option<usize> {
1125 self.scan_interpolation(self.current_offset)
1126 }
1127
1128 fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
1129 let bytes = self.input.as_bytes();
1130 let mut p = start + 1;
1131 while p < bytes.len() {
1132 match bytes[p] {
1133 b'\\' if delimiter != b'`' => p += 2,
1134 b'\n' => return None,
1135 b if b == delimiter => return Some(p + 1),
1136 _ => p += 1,
1137 }
1138 }
1139 None
1140 }
1141
1142 fn scan_past_fstring(&self, position: usize) -> Option<usize> {
1143 let bytes = self.input.as_bytes();
1144 let mut p = position + 2; while p < bytes.len() {
1146 match bytes[p] {
1147 b'\\' => p += 2,
1148 b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
1149 b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
1150 b'{' => {
1151 p = self.scan_interpolation(p + 1)?;
1152 p += 1;
1153 }
1154 b'"' => return Some(p + 1),
1155 b'\n' => return None,
1156 _ => p += 1,
1157 }
1158 }
1159 None
1160 }
1161
/// Moves the cursor past the closing quote of the current format string
/// after a failed interpolation, or to end of input if no closer is found.
///
/// Brace depth starts at 1 because the cursor is inside an interpolation.
/// A `"` seen at depth 0 ends the format string; a `"` at any other depth
/// starts a quoted section that is skipped whole.
fn skip_to_format_string_end(&mut self) {
    let mut depth = 1;
    while !self.at_eof() {
        match self.current_byte() {
            // Skip the backslash and the byte it escapes.
            b'\\' => {
                self.next();
                if !self.at_eof() {
                    self.next();
                }
            }
            // Outside all interpolations: this quote closes the format string.
            b'"' if depth == 0 => {
                self.next();
                return;
            }
            // Inside an interpolation: skip the nested quoted section,
            // honouring backslash escapes.
            b'"' => {
                self.next();
                while !self.at_eof() && self.current_byte() != b'"' {
                    if self.current_byte() == b'\\' {
                        self.next();
                        if self.at_eof() {
                            break;
                        }
                    }
                    self.next();
                }
                if !self.at_eof() {
                    self.next();
                }
            }
            b'{' => {
                depth += 1;
                self.next();
            }
            b'}' => {
                // Never go below zero on stray closing braces.
                if depth > 0 {
                    depth -= 1;
                }
                self.next();
            }
            _ => self.next(),
        }
    }
}
1209
/// Lexes an `f"..."` format string starting at the cursor and returns its
/// tokens in source order.
///
/// The result begins with `FormatStringStart`, followed by
/// `FormatStringText` segments and interpolation tokens, and ends with
/// `FormatStringEnd` when the closing quote is found. If an interpolation
/// fails, the tokens gathered so far are returned without an end token. If
/// the input ends first, an unterminated-format-string error is reported.
fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
    let start_offset = self.current_offset;
    let mut tokens = Vec::new();

    // Skip the `f"` opener.
    self.skip(2);

    let fstring_start_end = self.current_offset;
    tokens.push(Token {
        kind: TokenKind::FormatStringStart,
        text: &self.input[start_offset..fstring_start_end],
        byte_offset: start_offset as u32,
        byte_length: (fstring_start_end - start_offset) as u32,
    });

    // Start of the literal text segment currently being accumulated.
    let mut text_segment_start = self.current_offset;

    while !self.at_eof() {
        let byte = self.current_byte();

        match byte {
            // Escape sequence: stays part of the current text segment. Only
            // octal and `\u` escapes are validated here.
            b'\\' if !self.at_eof() => {
                let escape_start = self.current_offset;
                self.next();
                if !self.at_eof() {
                    let b = self.current_byte();
                    self.next();
                    if matches!(b, b'0'..=b'7') {
                        let value = self.consume_octal_escape(b);
                        if value > 255 {
                            let escape_len = self.current_offset - escape_start;
                            self.error_octal_escape_out_of_range(escape_start, escape_len);
                        }
                    } else if b == b'u' {
                        self.consume_unicode_escape(escape_start);
                    }
                }
            }
            // `{{` and `}}` are kept in the text segment as-is.
            b'{' if self.peek_byte() == b'{' => {
                self.skip(2);
            }
            b'}' if self.peek_byte() == b'}' => {
                self.skip(2);
            }
            // Closing quote: flush pending text and emit the end token.
            b'"' => {
                self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                let end_offset = self.current_offset;
                self.next();

                tokens.push(Token {
                    kind: TokenKind::FormatStringEnd,
                    text: &self.input[end_offset..self.current_offset],
                    byte_offset: end_offset as u32,
                    byte_length: (self.current_offset - end_offset) as u32,
                });
                return tokens;
            }

            // Single `{`: flush pending text, then lex the interpolation.
            b'{' => {
                self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                if self.lex_format_string_interpolation(&mut tokens).is_err() {
                    return tokens;
                }
                // The next text segment begins after the interpolation.
                text_segment_start = self.current_offset;
            }
            // Lone `}` with no open interpolation: reported, then kept as text.
            b'}' => {
                self.error_unmatched_brace_in_format_string(self.current_offset);
                self.next();
            }
            _ => {
                self.next();
            }
        }
    }

    // End of input was reached without a closing quote.
    self.error_unterminated_format_string(start_offset, 2);
    tokens
}
1289
1290 fn lex_char(&mut self) -> Token<'source> {
1291 let start_offset = self.current_offset;
1292
1293 self.next();
1294
1295 if self.at_eof() || self.current_byte() == b'\'' {
1296 self.error_empty_rune_literal(start_offset);
1297 let end_offset = self.current_offset;
1298 return Token {
1299 kind: TokenKind::Char,
1300 text: &self.input[start_offset..end_offset],
1301 byte_offset: start_offset as u32,
1302 byte_length: (end_offset - start_offset) as u32,
1303 };
1304 }
1305
1306 if self.current_byte() != b'\\' {
1307 self.next();
1308 } else {
1309 self.next();
1310
1311 if self.at_eof() {
1312 self.error_unterminated_escape(start_offset);
1313 let end_offset = self.current_offset;
1314 return Token {
1315 kind: TokenKind::Char,
1316 text: &self.input[start_offset..end_offset],
1317 byte_offset: start_offset as u32,
1318 byte_length: (end_offset - start_offset) as u32,
1319 };
1320 }
1321
1322 match self.current_byte() {
1323 b'0'..=b'7' => {
1324 let escape_start = self.current_offset - 1;
1325 let first = self.current_byte();
1326 self.next();
1327 let value = self.consume_octal_escape(first);
1328 if value > 255 {
1329 let escape_len = self.current_offset - escape_start;
1330 self.error_octal_escape_out_of_range(escape_start, escape_len);
1331 }
1332 }
1333 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'\'' | b'x' => {
1334 self.next();
1335 }
1336 _ => {
1337 self.error_invalid_escape(self.current_char());
1338
1339 while !self.at_eof() && self.current_byte() != b'\'' {
1340 self.next();
1341 }
1342
1343 if !self.at_eof() && self.current_byte() == b'\'' {
1344 self.next();
1345 }
1346
1347 let end_offset = self.current_offset;
1348 return Token {
1349 kind: TokenKind::Char,
1350 text: &self.input[start_offset..end_offset],
1351 byte_offset: start_offset as u32,
1352 byte_length: (end_offset - start_offset) as u32,
1353 };
1354 }
1355 }
1356 }
1357
1358 if self.at_eof() || self.current_byte() != b'\'' {
1359 let length = self.current_offset - start_offset;
1360 self.error_unterminated_rune(start_offset, length);
1361 }
1362
1363 if !self.at_eof() && self.current_byte() == b'\'' {
1364 self.next();
1365 }
1366
1367 let end_offset = self.current_offset;
1368 Token {
1369 kind: TokenKind::Char,
1370 text: &self.input[start_offset..end_offset],
1371 byte_offset: start_offset as u32,
1372 byte_length: (end_offset - start_offset) as u32,
1373 }
1374 }
1375
1376 fn lex_slash(&mut self) -> Token<'source> {
1377 let start_offset = self.current_offset;
1378
1379 if self.peek_byte() != b'/' {
1380 self.next();
1381 return Token {
1382 kind: TokenKind::Slash,
1383 text: &self.input[start_offset..self.current_offset],
1384 byte_offset: start_offset as u32,
1385 byte_length: 1,
1386 };
1387 }
1388
1389 let slash_count = self.count_consecutive(b'/');
1390
1391 if slash_count >= 4 {
1392 self.error_excess_slashes_in_comment(start_offset, slash_count);
1393 }
1394
1395 self.skip(slash_count);
1396
1397 if slash_count == 3 {
1398 if self.current_byte() == b' ' {
1399 self.next();
1400 }
1401 let text_start = self.current_offset;
1402 self.skip_to_eol();
1403 let end_offset = self.current_offset;
1404
1405 self.trivia
1406 .doc_comments
1407 .push((start_offset as u32, end_offset as u32));
1408
1409 return Token {
1410 kind: TokenKind::DocComment,
1411 text: &self.input[text_start..end_offset],
1412 byte_offset: start_offset as u32,
1413 byte_length: (end_offset - start_offset) as u32,
1414 };
1415 }
1416
1417 self.skip_to_eol();
1418 let end_offset = self.current_offset;
1419
1420 self.trivia
1421 .comments
1422 .push((start_offset as u32, end_offset as u32));
1423
1424 Token {
1425 kind: TokenKind::Comment,
1426 text: &self.input[start_offset..end_offset],
1427 byte_offset: start_offset as u32,
1428 byte_length: (end_offset - start_offset) as u32,
1429 }
1430 }
1431
1432 fn count_consecutive(&self, byte: u8) -> usize {
1433 let mut count = 0;
1434 let mut offset = self.current_offset;
1435 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1436 count += 1;
1437 offset += 1;
1438 }
1439 count
1440 }
1441
1442 fn skip_to_eol(&mut self) {
1443 while !self.at_eof() && self.current_byte() != b'\n' {
1444 self.next();
1445 }
1446 }
1447
1448 fn lex_directive(&mut self) -> Token<'source> {
1449 let start_offset = self.current_offset;
1450
1451 self.next();
1452
1453 while !self.at_eof() {
1454 let byte = self.current_byte();
1455 if byte.is_ascii_alphanumeric() || byte == b'_' {
1456 self.next();
1457 } else {
1458 break;
1459 }
1460 }
1461
1462 let end_offset = self.current_offset;
1463 Token {
1464 kind: TokenKind::Directive,
1465 text: &self.input[start_offset..end_offset],
1466 byte_offset: start_offset as u32,
1467 byte_length: (end_offset - start_offset) as u32,
1468 }
1469 }
1470
1471 fn handle_unexpected_char(&mut self) -> Token<'source> {
1472 let start_offset = self.current_offset;
1473
1474 self.error_unexpected_char(self.current_offset, self.current_char());
1475
1476 self.resync_on_error();
1477
1478 let end_offset = self.current_offset;
1479
1480 Token {
1481 kind: TokenKind::Error,
1482 text: &self.input[start_offset..end_offset],
1483 byte_offset: start_offset as u32,
1484 byte_length: (end_offset - start_offset) as u32,
1485 }
1486 }
1487
1488 fn eof_token(&self) -> Token<'source> {
1489 Token {
1490 kind: TokenKind::EOF,
1491 text: &self.input[self.current_offset..self.current_offset],
1492 byte_offset: self.current_offset as u32,
1493 byte_length: 0,
1494 }
1495 }
1496
1497 fn semicolon_token(&mut self) -> Token<'source> {
1498 let start_offset = self.current_offset;
1499
1500 self.next();
1501
1502 Token {
1503 kind: TokenKind::Semicolon,
1504 text: &self.input[start_offset..self.current_offset],
1505 byte_offset: start_offset as u32,
1506 byte_length: (self.current_offset - start_offset) as u32,
1507 }
1508 }
1509}