1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
/// Lexer over a single source string.
///
/// [`Lexer::lex`] consumes the lexer and returns the produced tokens together
/// with the errors and trivia gathered along the way.
pub struct Lexer<'source> {
    /// Source text being tokenized; token `text` slices borrow from it.
    input: &'source str,
    /// Byte view of `input` (set from `input.as_bytes()` in `new`), used for byte-level lookahead.
    input_bytes: &'source [u8],
    /// Byte offset of the cursor into `input`.
    current_offset: usize,
    /// Identifier of the file being lexed, stored exactly as passed to `new`.
    file_id: u32,
    /// Errors recorded so far; handed back in `LexResult::errors`.
    errors: Vec<ParseError>,
    /// Tokens queued for emission; `lex` drains it with `pop`, so it is filled in reverse order.
    pending_tokens: Vec<Token<'source>>,
    /// Trivia collected while lexing (blank-line offsets are pushed by `record_newline`);
    /// handed back in `LexResult::trivia`.
    trivia: Trivia,
    /// Offset of the most recent newline seen by `record_newline`, if any.
    last_newline_offset: Option<usize>,
}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
35 pub fn lex(mut self) -> LexResult<'source> {
36 let mut tokens = Vec::new();
37
38 loop {
39 if let Some(token) = self.pending_tokens.pop() {
40 tokens.push(token);
41 continue;
42 }
43
44 self.skip_whitespace();
45
46 if self.at_eof() {
47 tokens.push(self.eof_token());
48 break;
49 }
50
51 if self.try_consume_unsupported_raw_variant(self.input.len()) {
52 continue;
53 }
54
55 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
56 let mut fstring_tokens = self.lex_format_string_tokens();
57 fstring_tokens.reverse();
58 self.pending_tokens = fstring_tokens;
59 continue;
60 }
61
62 let token = self.create_token();
63 tokens.push(token);
64 }
65
66 let tokens = self.insert_semicolons(tokens);
67
68 LexResult {
69 tokens,
70 errors: self.errors,
71 trivia: self.trivia,
72 }
73 }
74
75 fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
76 let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);
77
78 for i in 0..tokens.len() {
79 let token = tokens[i];
80 result.push(token);
81
82 if !Self::triggers_asi(token.kind) {
83 continue;
84 }
85
86 if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
87 if Self::continues_expression(next_token.kind) {
88 continue;
89 }
90
91 let token_end = (token.byte_offset + token.byte_length) as usize;
92 if self.has_newline_between(token_end, next_token.byte_offset as usize) {
93 result.push(self.make_synthetic_semicolon(token_end));
94 }
95 }
96 }
97
98 result
99 }
100
101 fn triggers_asi(kind: TokenKind) -> bool {
102 matches!(
103 kind,
104 TokenKind::Identifier
105 | TokenKind::Integer
106 | TokenKind::Imaginary
107 | TokenKind::Float
108 | TokenKind::String
109 | TokenKind::RawString
110 | TokenKind::Char
111 | TokenKind::Boolean
112 | TokenKind::RightParen
113 | TokenKind::RightSquareBracket
114 | TokenKind::RightCurlyBrace
115 | TokenKind::Break
116 | TokenKind::Continue
117 | TokenKind::Return
118 | TokenKind::DotDot
119 | TokenKind::DotDotEqual
120 | TokenKind::QuestionMark
121 )
122 }
123
124 fn continues_expression(kind: TokenKind) -> bool {
125 matches!(
126 kind,
127 TokenKind::Plus
128 | TokenKind::Star
129 | TokenKind::Slash
130 | TokenKind::Percent
131 | TokenKind::Pipeline
132 | TokenKind::AmpersandDouble
133 | TokenKind::PipeDouble
134 | TokenKind::EqualDouble
135 | TokenKind::NotEqual
136 | TokenKind::LeftAngleBracket
137 | TokenKind::RightAngleBracket
138 | TokenKind::LessThanOrEqual
139 | TokenKind::GreaterThanOrEqual
140 | TokenKind::Dot
141 | TokenKind::Equal
142 | TokenKind::PlusEqual
143 | TokenKind::MinusEqual
144 | TokenKind::StarEqual
145 | TokenKind::SlashEqual
146 | TokenKind::Else
147 | TokenKind::LeftCurlyBrace
148 | TokenKind::RightCurlyBrace
149 | TokenKind::RightParen
150 | TokenKind::RightSquareBracket
151 | TokenKind::As
152 )
153 }
154
155 fn find_next_non_comment_token<'a>(
156 &self,
157 tokens: &'a [Token<'source>],
158 start_index: usize,
159 ) -> Option<&'a Token<'source>> {
160 tokens
161 .iter()
162 .skip(start_index)
163 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
164 }
165
166 fn has_newline_between(&self, start: usize, end: usize) -> bool {
167 self.input[start..end].contains('\n')
168 }
169
170 fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
171 Token {
172 kind: TokenKind::Semicolon,
173 text: "",
174 byte_offset: position as u32,
175 byte_length: 0,
176 }
177 }
178
179 fn create_token(&mut self) -> Token<'source> {
180 if let Some(token) = self.lex_lookahead_symbol() {
181 return token;
182 }
183
184 let c = self.current_char();
185 match c {
186 '0'..='9' => self.lex_number(),
187 'r' if self.peek_byte() == b'"' => self.lex_raw_string_literal(),
188 _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
189 '"' => self.lex_string_literal(),
190 '`' => self.lex_backtick_literal(),
191 '\'' => self.lex_char(),
192 '/' => self.lex_slash(),
193 ';' => self.semicolon_token(),
194 '@' => self.lex_directive(),
195 _ => self.handle_unexpected_char(),
196 }
197 }
198
199 #[inline]
200 fn current_byte(&self) -> u8 {
201 if self.current_offset < self.input_bytes.len() {
202 self.input_bytes[self.current_offset]
203 } else {
204 0
205 }
206 }
207
208 #[inline]
209 fn current_char(&self) -> char {
210 self.input[self.current_offset..]
211 .chars()
212 .next()
213 .unwrap_or('\0')
214 }
215
216 #[inline]
217 fn peek_byte(&self) -> u8 {
218 if self.current_offset + 1 < self.input_bytes.len() {
219 self.input_bytes[self.current_offset + 1]
220 } else {
221 0
222 }
223 }
224
225 #[inline]
226 fn peek_byte_at(&self, n: usize) -> u8 {
227 let offset = self.current_offset + n;
228 if offset < self.input_bytes.len() {
229 self.input_bytes[offset]
230 } else {
231 0
232 }
233 }
234
235 #[inline]
236 fn peek_char(&self) -> char {
237 let next_offset = if self.current_byte() < 128 {
238 self.current_offset + 1
239 } else {
240 self.current_offset + self.current_char().len_utf8()
241 };
242 self.input[next_offset..].chars().next().unwrap_or('\0')
243 }
244
245 fn peek_char_n(&self, n: usize) -> char {
246 let mut offset = self.current_offset;
247 for _ in 0..n {
248 if offset >= self.input.len() {
249 return '\0';
250 }
251 let c = self.input[offset..].chars().next().unwrap_or('\0');
252 offset += c.len_utf8();
253 }
254 self.input[offset..].chars().next().unwrap_or('\0')
255 }
256
257 fn next(&mut self) {
258 if self.at_eof() {
259 return;
260 }
261 if self.current_byte() < 128 {
262 self.current_offset += 1;
263 } else {
264 self.current_offset += self.current_char().len_utf8();
265 }
266 }
267
268 fn skip(&mut self, count: usize) {
269 for _ in 0..count {
270 self.next();
271 }
272 }
273
274 fn skip_whitespace(&mut self) {
275 while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
276 if self.current_byte() == b'\n' {
277 self.record_newline();
278 }
279 self.next();
280 }
281 }
282
283 fn skip_horizontal_whitespace(&mut self) {
284 while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
285 self.next();
286 }
287 }
288
289 fn record_newline(&mut self) {
290 let offset = self.current_offset;
291
292 if let Some(last) = self.last_newline_offset {
293 let between = &self.input[last + 1..offset];
294 let is_blank = between.is_empty()
295 || between
296 .chars()
297 .all(|c| c.is_ascii_whitespace() && c != '\n');
298 if is_blank {
299 self.trivia.blank_lines.push(offset as u32);
300 }
301 }
302
303 self.last_newline_offset = Some(offset);
304 }
305
306 fn at_eof(&self) -> bool {
307 self.current_offset >= self.input.len()
308 }
309
310 fn previous_char(&self) -> char {
311 if self.current_offset == 0 {
312 return '\0';
313 }
314 self.input[..self.current_offset]
315 .chars()
316 .next_back()
317 .unwrap_or('\0')
318 }
319
320 fn resync_on_error(&mut self) {
321 while !self.at_eof() {
322 let byte = self.current_byte();
323
324 if byte == b';' || byte == b'}' {
325 break;
326 }
327
328 self.next();
329 }
330 }
331
332 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
334 let start_offset = self.current_offset;
335 let current_char = self.current_char();
336 let next_char = self.peek_char();
337 let third_char = self.peek_char_n(2);
338
339 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
340 self.skip(3);
341 let end_offset = self.current_offset;
342 return Some(Token {
343 kind,
344 text: &self.input[start_offset..end_offset],
345 byte_offset: start_offset as u32,
346 byte_length: (end_offset - start_offset) as u32,
347 });
348 }
349
350 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
351 self.skip(2);
352 let end_offset = self.current_offset;
353 return Some(Token {
354 kind,
355 text: &self.input[start_offset..end_offset],
356 byte_offset: start_offset as u32,
357 byte_length: (end_offset - start_offset) as u32,
358 });
359 }
360
361 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
362 self.next();
363 let end_offset = self.current_offset;
364 return Some(Token {
365 kind,
366 text: &self.input[start_offset..end_offset],
367 byte_offset: start_offset as u32,
368 byte_length: (end_offset - start_offset) as u32,
369 });
370 }
371
372 None
373 }
374
375 fn lex_number(&mut self) -> Token<'source> {
376 let start_offset = self.current_offset;
377
378 if self.current_byte() == b'0' {
379 let next = self.peek_byte();
380 match next {
381 b'x' | b'X' => {
382 self.next(); self.next(); return self.lex_hex_number(start_offset);
385 }
386 b'o' | b'O' => {
387 self.next(); self.next(); return self.lex_octal_number(start_offset);
390 }
391 b'b' | b'B' => {
392 self.next(); self.next(); return self.lex_binary_number(start_offset);
395 }
396 b'0'..=b'7' => {
397 return self.lex_legacy_octal_number(start_offset);
398 }
399 _ => {} }
401 }
402
403 let mut kind = TokenKind::Integer;
404
405 while !self.at_eof() {
406 let byte = self.current_byte();
407 if byte.is_ascii_digit() || byte == b'_' {
408 if byte == b'_' && self.previous_char() == '_' {
409 let underscore_start = self.current_offset - 1;
410 self.error_consecutive_underscores(underscore_start);
411 }
412 self.next();
413 } else {
414 break;
415 }
416 }
417
418 if self.previous_char() == '_' {
419 self.error_number_trailing_underscore(
420 self.current_offset - self.previous_char().len_utf8(),
421 );
422 }
423
424 let preceded_by_dot = start_offset > 0
427 && self.input_bytes[start_offset - 1] == b'.'
428 && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');
429
430 if !preceded_by_dot
431 && self.current_byte() == b'.'
432 && self.peek_byte() != b'.'
433 && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
434 {
435 kind = TokenKind::Float;
436 self.next();
437
438 if self.current_byte() == b'_' {
439 self.error_decimal_leading_underscore(self.current_offset);
440 }
441
442 while !self.at_eof() {
443 let byte = self.current_byte();
444 if byte.is_ascii_digit() || byte == b'_' {
445 if byte == b'_' && self.previous_char() == '_' {
446 let underscore_start = self.current_offset - 1;
447 self.error_consecutive_underscores(underscore_start);
448 }
449 self.next();
450 } else {
451 break;
452 }
453 }
454
455 if self.previous_char() == '_' {
456 self.error_number_trailing_underscore(
457 self.current_offset - self.previous_char().len_utf8(),
458 );
459 }
460 }
461
462 if self.current_byte() == b'e' || self.current_byte() == b'E' {
463 kind = TokenKind::Float;
464 let exponent_start = self.current_offset;
465 self.next(); if self.current_byte() == b'+' || self.current_byte() == b'-' {
468 self.next();
469 }
470
471 if !self.current_byte().is_ascii_digit() {
472 self.error_missing_exponent_digits(
473 exponent_start,
474 self.current_offset - exponent_start,
475 );
476 }
477
478 while !self.at_eof() {
479 let byte = self.current_byte();
480 if byte.is_ascii_digit() || byte == b'_' {
481 if byte == b'_' && self.previous_char() == '_' {
482 let underscore_start = self.current_offset - 1;
483 self.error_consecutive_underscores(underscore_start);
484 }
485 self.next();
486 } else {
487 break;
488 }
489 }
490
491 if self.previous_char() == '_' {
492 self.error_number_trailing_underscore(
493 self.current_offset - self.previous_char().len_utf8(),
494 );
495 }
496 }
497
498 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
499 self.next(); let end_offset = self.current_offset;
501 return Token {
502 kind: TokenKind::Imaginary,
503 text: &self.input[start_offset..end_offset],
504 byte_offset: start_offset as u32,
505 byte_length: (end_offset - start_offset) as u32,
506 };
507 }
508
509 let end_offset = self.current_offset;
510 Token {
511 kind,
512 text: &self.input[start_offset..end_offset],
513 byte_offset: start_offset as u32,
514 byte_length: (end_offset - start_offset) as u32,
515 }
516 }
517
518 fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
519 let digits_start = self.current_offset;
520
521 while !self.at_eof() {
522 let byte = self.current_byte();
523 if byte.is_ascii_hexdigit() || byte == b'_' {
524 if byte == b'_' && self.previous_char() == '_' {
525 let underscore_start = self.current_offset - 1;
526 self.error_consecutive_underscores(underscore_start);
527 }
528 self.next();
529 } else {
530 break;
531 }
532 }
533
534 if self.current_offset == digits_start {
535 self.error_missing_hex_digits(start_offset, 2);
536 }
537
538 if self.previous_char() == '_' {
539 self.error_number_trailing_underscore(
540 self.current_offset - self.previous_char().len_utf8(),
541 );
542 }
543
544 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
545 self.next(); let end_offset = self.current_offset;
547 self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
548 return Token {
549 kind: TokenKind::Imaginary,
550 text: &self.input[start_offset..end_offset],
551 byte_offset: start_offset as u32,
552 byte_length: (end_offset - start_offset) as u32,
553 };
554 }
555
556 let end_offset = self.current_offset;
557 Token {
558 kind: TokenKind::Integer,
559 text: &self.input[start_offset..end_offset],
560 byte_offset: start_offset as u32,
561 byte_length: (end_offset - start_offset) as u32,
562 }
563 }
564
565 fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
566 let digits_start = self.current_offset;
567
568 while !self.at_eof() {
569 let byte = self.current_byte();
570 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
571 if byte == b'_' && self.previous_char() == '_' {
572 let underscore_start = self.current_offset - 1;
573 self.error_consecutive_underscores(underscore_start);
574 }
575 self.next();
576 } else if byte == b'8' || byte == b'9' {
577 self.error_invalid_octal_digit(self.current_offset);
578 self.next();
579 } else {
580 break;
581 }
582 }
583
584 if self.current_offset == digits_start {
585 self.error_missing_octal_digits(start_offset, 2);
586 }
587
588 if self.previous_char() == '_' {
589 self.error_number_trailing_underscore(
590 self.current_offset - self.previous_char().len_utf8(),
591 );
592 }
593
594 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
595 self.next(); let end_offset = self.current_offset;
597 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
598 return Token {
599 kind: TokenKind::Imaginary,
600 text: &self.input[start_offset..end_offset],
601 byte_offset: start_offset as u32,
602 byte_length: (end_offset - start_offset) as u32,
603 };
604 }
605
606 let end_offset = self.current_offset;
607 Token {
608 kind: TokenKind::Integer,
609 text: &self.input[start_offset..end_offset],
610 byte_offset: start_offset as u32,
611 byte_length: (end_offset - start_offset) as u32,
612 }
613 }
614
615 fn lex_legacy_octal_number(&mut self, start_offset: usize) -> Token<'source> {
616 self.next();
617
618 while !self.at_eof() {
619 let byte = self.current_byte();
620 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
621 if byte == b'_' && self.previous_char() == '_' {
622 let underscore_start = self.current_offset - 1;
623 self.error_consecutive_underscores(underscore_start);
624 }
625 self.next();
626 } else if byte == b'8' || byte == b'9' {
627 self.error_invalid_octal_digit(self.current_offset);
628 self.next();
629 } else {
630 break;
631 }
632 }
633
634 if self.previous_char() == '_' {
635 self.error_number_trailing_underscore(
636 self.current_offset - self.previous_char().len_utf8(),
637 );
638 }
639
640 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
641 self.next();
642 let end_offset = self.current_offset;
643 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
644 return Token {
645 kind: TokenKind::Imaginary,
646 text: &self.input[start_offset..end_offset],
647 byte_offset: start_offset as u32,
648 byte_length: (end_offset - start_offset) as u32,
649 };
650 }
651
652 let end_offset = self.current_offset;
653 Token {
654 kind: TokenKind::Integer,
655 text: &self.input[start_offset..end_offset],
656 byte_offset: start_offset as u32,
657 byte_length: (end_offset - start_offset) as u32,
658 }
659 }
660
661 fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
662 let digits_start = self.current_offset;
663
664 while !self.at_eof() {
665 let byte = self.current_byte();
666 if byte == b'0' || byte == b'1' || byte == b'_' {
667 if byte == b'_' && self.previous_char() == '_' {
668 let underscore_start = self.current_offset - 1;
669 self.error_consecutive_underscores(underscore_start);
670 }
671 self.next();
672 } else if (b'2'..=b'9').contains(&byte) {
673 self.error_invalid_binary_digit(self.current_offset);
674 self.next();
675 } else {
676 break;
677 }
678 }
679
680 if self.current_offset == digits_start {
681 self.error_missing_binary_digits(start_offset, 2);
682 }
683
684 if self.previous_char() == '_' {
685 self.error_number_trailing_underscore(
686 self.current_offset - self.previous_char().len_utf8(),
687 );
688 }
689
690 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
691 self.next();
692 let end_offset = self.current_offset;
693 self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
694 return Token {
695 kind: TokenKind::Imaginary,
696 text: &self.input[start_offset..end_offset],
697 byte_offset: start_offset as u32,
698 byte_length: (end_offset - start_offset) as u32,
699 };
700 }
701
702 let end_offset = self.current_offset;
703 Token {
704 kind: TokenKind::Integer,
705 text: &self.input[start_offset..end_offset],
706 byte_offset: start_offset as u32,
707 byte_length: (end_offset - start_offset) as u32,
708 }
709 }
710
711 fn lex_identifier(&mut self) -> Token<'source> {
712 let start_offset = self.current_offset;
713
714 while !self.at_eof() {
715 let c = self.current_char();
716 if c.is_alphanumeric() || c == '_' {
717 self.next();
718 } else {
719 break;
720 }
721 }
722
723 let end_offset = self.current_offset;
724 let text = &self.input[start_offset..end_offset];
725
726 let kind = match text {
727 "true" | "false" => TokenKind::Boolean,
728 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
729 };
730
731 Token {
732 kind,
733 text,
734 byte_offset: start_offset as u32,
735 byte_length: (end_offset - start_offset) as u32,
736 }
737 }
738
739 fn lex_backtick_literal(&mut self) -> Token<'source> {
740 let start_offset = self.current_offset;
741
742 self.next();
743
744 let mut terminated = false;
745
746 while !self.at_eof() {
747 let byte = self.current_byte();
748 if byte == b'`' {
749 terminated = true;
750 self.next();
751 break;
752 }
753 self.next();
754 }
755
756 let end_offset = self.current_offset;
757 let length = end_offset - start_offset;
758
759 if !terminated {
760 self.error_unterminated_backtick(start_offset, length);
761 }
762
763 Token {
764 kind: TokenKind::Backtick,
765 text: &self.input[start_offset..end_offset],
766 byte_offset: start_offset as u32,
767 byte_length: length as u32,
768 }
769 }
770
771 fn consume_unicode_escape(&mut self, escape_start: usize) {
772 if self.at_eof() || self.current_byte() != b'{' {
773 self.error_invalid_unicode_escape(escape_start, self.current_offset - escape_start);
774 return;
775 }
776 self.next();
777
778 let hex_start = self.current_offset;
779 let mut all_hex = true;
780 while !self.at_eof() {
781 let byte = self.current_byte();
782 if byte == b'}' || byte == b'"' || byte == b'\n' {
783 break;
784 }
785 if !byte.is_ascii_hexdigit() {
786 all_hex = false;
787 }
788 self.next();
789 }
790 let hex_end = self.current_offset;
791
792 let closed = !self.at_eof() && self.current_byte() == b'}';
793 if closed {
794 self.next();
795 }
796
797 let hex_len = hex_end - hex_start;
798 let total_len = self.current_offset - escape_start;
799
800 if !closed || !all_hex || hex_len == 0 || hex_len > 6 {
801 self.error_invalid_unicode_escape(escape_start, total_len);
802 return;
803 }
804
805 let codepoint = u32::from_str_radix(&self.input[hex_start..hex_end], 16)
806 .expect("hex digits validated above");
807 if char::from_u32(codepoint).is_none() {
808 self.error_unicode_escape_out_of_range(escape_start, total_len);
809 }
810 }
811
812 fn consume_octal_escape(&mut self, first_digit: u8) -> u16 {
814 let mut value: u16 = (first_digit - b'0') as u16;
815 for _ in 0..2 {
816 if self.at_eof() {
817 break;
818 }
819 match self.current_byte() {
820 d @ b'0'..=b'7' => {
821 value = value * 8 + (d - b'0') as u16;
822 self.next();
823 }
824 _ => break,
825 }
826 }
827 value
828 }
829
    /// Lexes a double-quoted string literal starting at the opening `"`.
    ///
    /// Escapes are validated while scanning: octal escapes are range-checked,
    /// `\u` escapes are handed to `consume_unicode_escape`, and unknown escape
    /// characters are reported. An unterminated string (and a dangling backslash
    /// at end of input) is reported too. A `String` token covering the consumed
    /// text is returned in every case.
    fn lex_string_literal(&mut self) -> Token<'source> {
        let start_offset = self.current_offset;

        // Step over the opening quote.
        self.next();

        // `escaped` is true while the previous byte was an unprocessed backslash.
        let mut escaped = false;
        let mut terminated = false;

        while !self.at_eof() && !terminated {
            let byte = self.current_byte();
            if escaped {
                match byte {
                    b'0'..=b'7' => {
                        // The escape span starts at the backslash, one byte back.
                        let escape_start = self.current_offset - 1;
                        self.next();
                        let value = self.consume_octal_escape(byte);
                        if value > 255 {
                            let escape_len = self.current_offset - escape_start;
                            self.error_octal_escape_out_of_range(escape_start, escape_len);
                        }
                        escaped = false;
                        // The cursor is already past the escape; skip the shared `next()` below.
                        continue;
                    }
                    b'u' => {
                        let escape_start = self.current_offset - 1;
                        self.next();
                        self.consume_unicode_escape(escape_start);
                        escaped = false;
                        // The cursor is already past the escape; skip the shared `next()` below.
                        continue;
                    }
                    b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'"' | b'x' | b'U' => {
                        // Accepted here without further validation of what follows.
                    }
                    b'\'' => {}
                    _ => {
                        self.error_invalid_escape(self.current_char());
                    }
                }
                escaped = false;
            } else if byte == b'\\' {
                escaped = true;
            } else if byte == b'"' {
                terminated = true;
                self.next();
                break;
            }

            self.next();
        }

        let end_offset = self.current_offset;
        let length = end_offset - start_offset;

        // Still `escaped` here means the input ended right after a backslash.
        if escaped {
            self.error_unterminated_escape(start_offset);
        }

        if !terminated {
            self.error_unterminated_string(start_offset, 1);
        }

        Token {
            kind: TokenKind::String,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: length as u32,
        }
    }
897
898 fn lex_raw_string_literal(&mut self) -> Token<'source> {
899 let start_offset = self.current_offset;
900 self.next(); self.next(); let mut terminated = false;
904 while !self.at_eof() {
905 let byte = self.current_byte();
906 if byte == b'"' {
907 terminated = true;
908 self.next();
909 break;
910 } else if byte == 0 {
911 self.error_disallowed_byte_in_raw_string(self.current_offset, byte);
912 self.next();
913 continue;
914 }
915 self.next();
916 }
917
918 let end_offset = self.current_offset;
919 let length = end_offset - start_offset;
920
921 if !terminated {
922 self.error_unterminated_raw_string(start_offset, 2);
923 }
924
925 Token {
926 kind: TokenKind::RawString,
927 text: &self.input[start_offset..end_offset],
928 byte_offset: start_offset as u32,
929 byte_length: length as u32,
930 }
931 }
932
    /// Detects and consumes string forms this lexer does not support.
    ///
    /// Two forms are recognised at the cursor: raw format strings (`rf"…"` /
    /// `fr"…"`) and hash-delimited raw strings (`r#"…"#`, with any number of `#`).
    /// Each is skipped up to its closing delimiter, a newline or `end`, an error
    /// covering the skipped span is reported, and `true` is returned. No token is
    /// produced. Returns `false` without moving the cursor when neither form
    /// starts here.
    ///
    /// `end` is the byte offset scanning must not run past.
    fn try_consume_unsupported_raw_variant(&mut self, end: usize) -> bool {
        let raw_format_prefix = if self.current_byte() == b'r'
            && self.peek_byte() == b'f'
            && self.peek_byte_at(2) == b'"'
        {
            Some("rf")
        } else if self.current_byte() == b'f'
            && self.peek_byte() == b'r'
            && self.peek_byte_at(2) == b'"'
        {
            Some("fr")
        } else {
            None
        };
        if let Some(prefix) = raw_format_prefix {
            let start = self.current_offset;
            // Step over the two prefix letters and the opening quote.
            self.skip(3);
            while self.current_offset < end
                && self.current_byte() != b'"'
                && self.current_byte() != b'\n'
            {
                self.next();
            }
            // Include the closing quote in the error span when there is one.
            if self.current_offset < end && self.current_byte() == b'"' {
                self.next();
            }
            let length = self.current_offset - start;
            self.error_unsupported_raw_format_string(start, length, prefix);
            return true;
        }

        if self.current_byte() == b'r' && self.peek_byte() == b'#' {
            // Count the `#` run without moving the cursor; it only counts as a raw
            // string opener when a `"` follows the run.
            let mut hash_count = 0usize;
            let mut probe = self.current_offset + 1;
            while probe < self.input_bytes.len() && self.input_bytes[probe] == b'#' {
                hash_count += 1;
                probe += 1;
            }
            if hash_count > 0 && probe < self.input_bytes.len() && self.input_bytes[probe] == b'"' {
                let start = self.current_offset;
                // Step over `r`, the hashes and the opening quote.
                self.skip(1 + hash_count + 1);
                loop {
                    if self.current_offset >= end || self.current_byte() == b'\n' {
                        break;
                    }
                    if self.current_byte() == b'"' {
                        // A closer is a quote followed by exactly as many `#` as the opener had.
                        let mut closer_matches = true;
                        for i in 1..=hash_count {
                            if self.peek_byte_at(i) != b'#' {
                                closer_matches = false;
                                break;
                            }
                        }
                        if closer_matches {
                            self.skip(1 + hash_count);
                            break;
                        }
                    }
                    self.next();
                }
                let length = self.current_offset - start;
                self.error_unsupported_hash_delimited_raw_string(start, length);
                return true;
            }
        }

        false
    }
1001
1002 fn push_format_string_text_if_needed(
1003 &self,
1004 tokens: &mut Vec<Token<'source>>,
1005 text_segment_start: usize,
1006 ) {
1007 if text_segment_start < self.current_offset {
1008 tokens.push(Token {
1009 kind: TokenKind::FormatStringText,
1010 text: &self.input[text_segment_start..self.current_offset],
1011 byte_offset: text_segment_start as u32,
1012 byte_length: (self.current_offset - text_segment_start) as u32,
1013 });
1014 }
1015 }
1016
    /// Lexes one `{ … }` interpolation inside a format string, appending to `tokens`.
    ///
    /// The cursor must be on the opening `{`. This pushes an interpolation-start
    /// token, the tokens of the embedded expression, and an interpolation-end token.
    ///
    /// Returns `Err(())` when no closing brace can be located. In that case an
    /// error has been reported and the cursor has been moved by
    /// `skip_to_format_string_end`.
    fn lex_format_string_interpolation(
        &mut self,
        tokens: &mut Vec<Token<'source>>,
    ) -> Result<(), ()> {
        let interp_start = self.current_offset;
        // Consume the opening `{`.
        self.next();

        tokens.push(Token {
            kind: TokenKind::FormatStringInterpolationStart,
            text: &self.input[interp_start..self.current_offset],
            byte_offset: interp_start as u32,
            byte_length: (self.current_offset - interp_start) as u32,
        });

        let Some(interpolation_end) = self.find_interpolation_boundary() else {
            // No boundary found. Pick the diagnostic by whether any newline occurs
            // between the `{` and the end of the input.
            if self.has_newline_between(interp_start, self.input.len()) {
                self.error_multiline_format_string_interpolation(interp_start);
            } else {
                self.error_unclosed_brace_in_format_string(interp_start);
            }
            self.skip_to_format_string_end();
            return Err(());
        };

        if self.has_newline_between(interp_start, interpolation_end) {
            self.error_multiline_format_string_interpolation(interp_start);
        }

        // Lex the embedded expression up to (not including) the closing brace.
        while self.current_offset < interpolation_end {
            self.skip_horizontal_whitespace();
            if self.current_offset >= interpolation_end {
                break;
            }

            if self.try_consume_unsupported_raw_variant(interpolation_end) {
                continue;
            }

            if self.current_byte() == b'f' && self.peek_byte() == b'"' {
                // Nested format string: splice its tokens in directly.
                let mut fstring_tokens = self.lex_format_string_tokens();
                tokens.append(&mut fstring_tokens);
            } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
                // `\"` is reported and skipped; no token is produced for it.
                self.error_escaped_quote_in_interpolation(self.current_offset);
                self.skip(2);
            } else if self.current_byte() == b'r' && self.peek_byte() == b'"' {
                // Raw strings are reported and skipped through their closing quote
                // (or up to a newline / the interpolation boundary).
                self.error_raw_string_in_interpolation(self.current_offset);
                self.skip(2);
                while self.current_offset < interpolation_end
                    && self.current_byte() != b'"'
                    && self.current_byte() != b'\n'
                {
                    self.next();
                }
                if self.current_offset < interpolation_end && self.current_byte() == b'"' {
                    self.next();
                }
            } else {
                let token = self.create_token();
                tokens.push(token);
            }
        }

        // NOTE(review): this assumes the cursor now sits exactly on the closing `}`
        // at `interpolation_end`; confirm no token lexed above can overshoot it.
        let close_offset = self.current_offset;
        self.next();
        tokens.push(Token {
            kind: TokenKind::FormatStringInterpolationEnd,
            text: &self.input[close_offset..self.current_offset],
            byte_offset: close_offset as u32,
            byte_length: (self.current_offset - close_offset) as u32,
        });

        Ok(())
    }
1090
1091 fn scan_interpolation(&self, start: usize) -> Option<usize> {
1092 let bytes = self.input.as_bytes();
1093 let mut p = start;
1094 let mut depth = 1;
1095
1096 while p < bytes.len() && depth > 0 {
1097 match bytes[p] {
1098 b'{' => {
1099 depth += 1;
1100 p += 1;
1101 }
1102 b'}' => {
1103 depth -= 1;
1104 if depth > 0 {
1105 p += 1;
1106 }
1107 }
1108 b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
1109 b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
1110 p = self.scan_past_fstring(p)?;
1111 }
1112 b'\\' => p += 2,
1113 b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
1114 b'\n' => return None,
1115 _ => p += 1,
1116 }
1117 }
1118
1119 (depth == 0).then_some(p)
1120 }
1121
1122 fn find_interpolation_boundary(&self) -> Option<usize> {
1123 self.scan_interpolation(self.current_offset)
1124 }
1125
1126 fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
1127 let bytes = self.input.as_bytes();
1128 let mut p = start + 1;
1129 while p < bytes.len() {
1130 match bytes[p] {
1131 b'\\' if delimiter != b'`' => p += 2,
1132 b'\n' => return None,
1133 b if b == delimiter => return Some(p + 1),
1134 _ => p += 1,
1135 }
1136 }
1137 None
1138 }
1139
1140 fn scan_past_fstring(&self, position: usize) -> Option<usize> {
1141 let bytes = self.input.as_bytes();
1142 let mut p = position + 2; while p < bytes.len() {
1144 match bytes[p] {
1145 b'\\' => p += 2,
1146 b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
1147 b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
1148 b'{' => {
1149 p = self.scan_interpolation(p + 1)?;
1150 p += 1;
1151 }
1152 b'"' => return Some(p + 1),
1153 b'\n' => return None,
1154 _ => p += 1,
1155 }
1156 }
1157 None
1158 }
1159
    /// Error recovery: advances the cursor past the closing quote of the format
    /// string being lexed, or to end of input when none is found.
    ///
    /// Called after a failed interpolation, so the scan starts one brace deep.
    /// Quotes seen while inside braces are treated as nested string literals and
    /// skipped whole. The first quote seen at brace depth zero ends the format string.
    fn skip_to_format_string_end(&mut self) {
        // Depth 1: the caller has already consumed the interpolation's opening `{`.
        let mut depth = 1;
        while !self.at_eof() {
            match self.current_byte() {
                b'\\' => {
                    // Skip the backslash and the character it escapes.
                    self.next();
                    if !self.at_eof() {
                        self.next();
                    }
                }
                b'"' if depth == 0 => {
                    // Closing quote of the format string itself.
                    self.next();
                    return;
                }
                b'"' => {
                    // Nested string inside an interpolation: skip to its closing quote.
                    self.next();
                    while !self.at_eof() && self.current_byte() != b'"' {
                        if self.current_byte() == b'\\' {
                            self.next();
                            if self.at_eof() {
                                break;
                            }
                        }
                        self.next();
                    }
                    if !self.at_eof() {
                        self.next();
                    }
                }
                b'{' => {
                    depth += 1;
                    self.next();
                }
                b'}' => {
                    // Never go below zero on stray closing braces.
                    if depth > 0 {
                        depth -= 1;
                    }
                    self.next();
                }
                _ => self.next(),
            }
        }
    }
1207
    /// Lexes a format string (`f"…"`) starting at the `f` and returns its tokens in source order.
    ///
    /// The result starts with a `FormatStringStart` token covering `f"`. It is
    /// followed by text segments and interpolation token groups, and ends with a
    /// `FormatStringEnd` token when the closing quote is found.
    ///
    /// If an interpolation fails to lex, the tokens collected so far are returned
    /// without an end token. If the input ends first, an unterminated-format-string
    /// error is reported.
    fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
        let start_offset = self.current_offset;
        let mut tokens = Vec::new();

        // Step over the `f"` opener.
        self.skip(2);

        let fstring_start_end = self.current_offset;
        tokens.push(Token {
            kind: TokenKind::FormatStringStart,
            text: &self.input[start_offset..fstring_start_end],
            byte_offset: start_offset as u32,
            byte_length: (fstring_start_end - start_offset) as u32,
        });

        // Start of the literal text run that has not been emitted yet.
        let mut text_segment_start = self.current_offset;

        while !self.at_eof() {
            let byte = self.current_byte();

            match byte {
                b'\\' if !self.at_eof() => {
                    // Escapes stay part of the surrounding text segment; only octal
                    // and `\u` escapes are validated here.
                    let escape_start = self.current_offset;
                    self.next();
                    if !self.at_eof() {
                        let b = self.current_byte();
                        self.next();
                        if matches!(b, b'0'..=b'7') {
                            let value = self.consume_octal_escape(b);
                            if value > 255 {
                                let escape_len = self.current_offset - escape_start;
                                self.error_octal_escape_out_of_range(escape_start, escape_len);
                            }
                        } else if b == b'u' {
                            self.consume_unicode_escape(escape_start);
                        }
                    }
                }
                // Doubled braces are literal text, not interpolation delimiters.
                b'{' if self.peek_byte() == b'{' => {
                    self.skip(2);
                }
                b'}' if self.peek_byte() == b'}' => {
                    self.skip(2);
                }
                b'"' => {
                    // Closing quote: flush pending text, emit the end token and finish.
                    self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                    let end_offset = self.current_offset;
                    self.next();

                    tokens.push(Token {
                        kind: TokenKind::FormatStringEnd,
                        text: &self.input[end_offset..self.current_offset],
                        byte_offset: end_offset as u32,
                        byte_length: (self.current_offset - end_offset) as u32,
                    });
                    return tokens;
                }

                b'{' => {
                    // Flush the text before the interpolation, then lex the interpolation.
                    self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                    if self.lex_format_string_interpolation(&mut tokens).is_err() {
                        // The error was already reported and the cursor moved by the callee.
                        return tokens;
                    }
                    text_segment_start = self.current_offset;
                }
                b'}' => {
                    // A lone `}` outside any interpolation: report it and keep it as text.
                    self.error_unmatched_brace_in_format_string(self.current_offset);
                    self.next();
                }
                _ => {
                    self.next();
                }
            }
        }

        // Reached end of input without a closing quote.
        self.error_unterminated_format_string(start_offset, 2);
        tokens
    }
1287
1288 fn lex_char(&mut self) -> Token<'source> {
1289 let start_offset = self.current_offset;
1290
1291 self.next();
1292
1293 if self.at_eof() || self.current_byte() == b'\'' {
1294 self.error_empty_rune_literal(start_offset);
1295 let end_offset = self.current_offset;
1296 return Token {
1297 kind: TokenKind::Char,
1298 text: &self.input[start_offset..end_offset],
1299 byte_offset: start_offset as u32,
1300 byte_length: (end_offset - start_offset) as u32,
1301 };
1302 }
1303
1304 if self.current_byte() != b'\\' {
1305 self.next();
1306 } else {
1307 self.next();
1308
1309 if self.at_eof() {
1310 self.error_unterminated_escape(start_offset);
1311 let end_offset = self.current_offset;
1312 return Token {
1313 kind: TokenKind::Char,
1314 text: &self.input[start_offset..end_offset],
1315 byte_offset: start_offset as u32,
1316 byte_length: (end_offset - start_offset) as u32,
1317 };
1318 }
1319
1320 match self.current_byte() {
1321 b'0'..=b'7' => {
1322 let escape_start = self.current_offset - 1;
1323 let first = self.current_byte();
1324 self.next();
1325 let value = self.consume_octal_escape(first);
1326 if value > 255 {
1327 let escape_len = self.current_offset - escape_start;
1328 self.error_octal_escape_out_of_range(escape_start, escape_len);
1329 }
1330 }
1331 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'\'' | b'x' => {
1332 self.next();
1333 }
1334 _ => {
1335 self.error_invalid_escape(self.current_char());
1336
1337 while !self.at_eof() && self.current_byte() != b'\'' {
1338 self.next();
1339 }
1340
1341 if !self.at_eof() && self.current_byte() == b'\'' {
1342 self.next();
1343 }
1344
1345 let end_offset = self.current_offset;
1346 return Token {
1347 kind: TokenKind::Char,
1348 text: &self.input[start_offset..end_offset],
1349 byte_offset: start_offset as u32,
1350 byte_length: (end_offset - start_offset) as u32,
1351 };
1352 }
1353 }
1354 }
1355
1356 if self.at_eof() || self.current_byte() != b'\'' {
1357 let length = self.current_offset - start_offset;
1358 self.error_unterminated_rune(start_offset, length);
1359 }
1360
1361 if !self.at_eof() && self.current_byte() == b'\'' {
1362 self.next();
1363 }
1364
1365 let end_offset = self.current_offset;
1366 Token {
1367 kind: TokenKind::Char,
1368 text: &self.input[start_offset..end_offset],
1369 byte_offset: start_offset as u32,
1370 byte_length: (end_offset - start_offset) as u32,
1371 }
1372 }
1373
1374 fn lex_slash(&mut self) -> Token<'source> {
1375 let start_offset = self.current_offset;
1376
1377 if self.peek_byte() != b'/' {
1378 self.next();
1379 return Token {
1380 kind: TokenKind::Slash,
1381 text: &self.input[start_offset..self.current_offset],
1382 byte_offset: start_offset as u32,
1383 byte_length: 1,
1384 };
1385 }
1386
1387 let slash_count = self.count_consecutive(b'/');
1388
1389 if slash_count >= 4 {
1390 self.error_excess_slashes_in_comment(start_offset, slash_count);
1391 }
1392
1393 self.skip(slash_count);
1394
1395 if slash_count == 3 {
1396 if self.current_byte() == b' ' {
1397 self.next();
1398 }
1399 let text_start = self.current_offset;
1400 self.skip_to_eol();
1401 let end_offset = self.current_offset;
1402
1403 self.trivia
1404 .doc_comments
1405 .push((start_offset as u32, end_offset as u32));
1406
1407 return Token {
1408 kind: TokenKind::DocComment,
1409 text: &self.input[text_start..end_offset],
1410 byte_offset: start_offset as u32,
1411 byte_length: (end_offset - start_offset) as u32,
1412 };
1413 }
1414
1415 self.skip_to_eol();
1416 let end_offset = self.current_offset;
1417
1418 self.trivia
1419 .comments
1420 .push((start_offset as u32, end_offset as u32));
1421
1422 Token {
1423 kind: TokenKind::Comment,
1424 text: &self.input[start_offset..end_offset],
1425 byte_offset: start_offset as u32,
1426 byte_length: (end_offset - start_offset) as u32,
1427 }
1428 }
1429
1430 fn count_consecutive(&self, byte: u8) -> usize {
1431 let mut count = 0;
1432 let mut offset = self.current_offset;
1433 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1434 count += 1;
1435 offset += 1;
1436 }
1437 count
1438 }
1439
1440 fn skip_to_eol(&mut self) {
1441 while !self.at_eof() && self.current_byte() != b'\n' {
1442 self.next();
1443 }
1444 }
1445
1446 fn lex_directive(&mut self) -> Token<'source> {
1447 let start_offset = self.current_offset;
1448
1449 self.next();
1450
1451 while !self.at_eof() {
1452 let byte = self.current_byte();
1453 if byte.is_ascii_alphanumeric() || byte == b'_' {
1454 self.next();
1455 } else {
1456 break;
1457 }
1458 }
1459
1460 let end_offset = self.current_offset;
1461 Token {
1462 kind: TokenKind::Directive,
1463 text: &self.input[start_offset..end_offset],
1464 byte_offset: start_offset as u32,
1465 byte_length: (end_offset - start_offset) as u32,
1466 }
1467 }
1468
1469 fn handle_unexpected_char(&mut self) -> Token<'source> {
1470 let start_offset = self.current_offset;
1471
1472 self.error_unexpected_char(self.current_offset, self.current_char());
1473
1474 self.resync_on_error();
1475
1476 let end_offset = self.current_offset;
1477
1478 Token {
1479 kind: TokenKind::Error,
1480 text: &self.input[start_offset..end_offset],
1481 byte_offset: start_offset as u32,
1482 byte_length: (end_offset - start_offset) as u32,
1483 }
1484 }
1485
1486 fn eof_token(&self) -> Token<'source> {
1487 Token {
1488 kind: TokenKind::EOF,
1489 text: &self.input[self.current_offset..self.current_offset],
1490 byte_offset: self.current_offset as u32,
1491 byte_length: 0,
1492 }
1493 }
1494
1495 fn semicolon_token(&mut self) -> Token<'source> {
1496 let start_offset = self.current_offset;
1497
1498 self.next();
1499
1500 Token {
1501 kind: TokenKind::Semicolon,
1502 text: &self.input[start_offset..self.current_offset],
1503 byte_offset: start_offset as u32,
1504 byte_length: (self.current_offset - start_offset) as u32,
1505 }
1506 }
1507}