1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
/// Hand-written lexer producing a flat token stream plus errors and trivia.
///
/// All offsets are byte offsets into `input` and always lie on UTF-8 char
/// boundaries.
pub struct Lexer<'source> {
    // Full source text being lexed.
    input: &'source str,
    // `input.as_bytes()`, cached so ASCII peeks avoid UTF-8 decoding.
    input_bytes: &'source [u8],
    // Current byte offset into `input`.
    current_offset: usize,
    // Identifier of the file being lexed; carried for diagnostics
    // (not read in this chunk — presumably used by the error helpers
    // in `errors`; confirm).
    file_id: u32,
    // Lex errors accumulated along the way; returned in the LexResult.
    errors: Vec<ParseError>,
    // Tokens queued ahead of time (format strings expand to several
    // tokens); drained before the input is advanced further.
    pending_tokens: Vec<Token<'source>>,
    // Comments, doc comments, and blank-line positions.
    trivia: Trivia,
    // Byte offset of the most recent newline, used to detect blank lines.
    last_newline_offset: Option<usize>,
}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
    /// Runs the lexer to completion, consuming `self`.
    ///
    /// Produces the full token stream (terminated by an EOF token), then
    /// applies automatic semicolon insertion over it. Errors and trivia
    /// collected along the way are returned in the `LexResult`.
    pub fn lex(mut self) -> LexResult<'source> {
        let mut tokens = Vec::new();

        loop {
            // Drain tokens queued by a format string before touching the
            // input again. They were pushed reversed, so `pop` yields
            // source order.
            if let Some(token) = self.pending_tokens.pop() {
                tokens.push(token);
                continue;
            }

            self.skip_whitespace();

            if self.at_eof() {
                tokens.push(self.eof_token());
                break;
            }

            // `f"` starts a format string, which expands into several
            // tokens; queue them and emit via the pending path above.
            if self.current_byte() == b'f' && self.peek_byte() == b'"' {
                let mut fstring_tokens = self.lex_format_string_tokens();
                fstring_tokens.reverse();
                self.pending_tokens = fstring_tokens;
                continue;
            }

            let token = self.create_token();
            tokens.push(token);
        }

        let tokens = self.insert_semicolons(tokens);

        LexResult {
            tokens,
            errors: self.errors,
            trivia: self.trivia,
        }
    }
70
    /// Automatic semicolon insertion: after any token that can end a
    /// statement, inserts a zero-width synthetic `;` when the next
    /// significant token starts on a later line and cannot be a
    /// continuation of the current expression.
    fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
        // Rough capacity guess: most tokens trigger no insertion.
        let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);

        for i in 0..tokens.len() {
            let token = tokens[i];
            result.push(token);

            if !Self::triggers_asi(token.kind) {
                continue;
            }

            // Comments are transparent for ASI purposes.
            if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
                if Self::continues_expression(next_token.kind) {
                    continue;
                }

                let token_end = (token.byte_offset + token.byte_length) as usize;
                if self.has_newline_between(token_end, next_token.byte_offset as usize) {
                    result.push(self.make_synthetic_semicolon(token_end));
                }
            }
        }

        result
    }
96
    /// Whether a token of this kind may end a statement, making it a
    /// candidate for automatic semicolon insertion after it.
    fn triggers_asi(kind: TokenKind) -> bool {
        matches!(
            kind,
            TokenKind::Identifier
                | TokenKind::Integer
                | TokenKind::Imaginary
                | TokenKind::Float
                | TokenKind::String
                | TokenKind::Char
                | TokenKind::Boolean
                | TokenKind::RightParen
                | TokenKind::RightSquareBracket
                | TokenKind::RightCurlyBrace
                | TokenKind::Break
                | TokenKind::Continue
                | TokenKind::Return
                | TokenKind::DotDot
                | TokenKind::DotDotEqual
                | TokenKind::QuestionMark
        )
    }
118
    /// Whether a token of this kind, when it starts a new line, continues
    /// the previous expression — in which case no semicolon is inserted
    /// before it.
    fn continues_expression(kind: TokenKind) -> bool {
        matches!(
            kind,
            TokenKind::Plus
                | TokenKind::Star
                | TokenKind::Slash
                | TokenKind::Percent
                | TokenKind::Pipeline
                | TokenKind::AmpersandDouble
                | TokenKind::PipeDouble
                | TokenKind::EqualDouble
                | TokenKind::NotEqual
                | TokenKind::LeftAngleBracket
                | TokenKind::RightAngleBracket
                | TokenKind::LessThanOrEqual
                | TokenKind::GreaterThanOrEqual
                | TokenKind::Dot
                | TokenKind::Equal
                | TokenKind::PlusEqual
                | TokenKind::MinusEqual
                | TokenKind::StarEqual
                | TokenKind::SlashEqual
                | TokenKind::Else
                | TokenKind::LeftCurlyBrace
                | TokenKind::RightCurlyBrace
                | TokenKind::RightParen
                | TokenKind::RightSquareBracket
                | TokenKind::As
        )
    }
149
150 fn find_next_non_comment_token<'a>(
151 &self,
152 tokens: &'a [Token<'source>],
153 start_index: usize,
154 ) -> Option<&'a Token<'source>> {
155 tokens
156 .iter()
157 .skip(start_index)
158 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
159 }
160
161 fn has_newline_between(&self, start: usize, end: usize) -> bool {
162 self.input[start..end].contains('\n')
163 }
164
    /// Builds the zero-width synthetic `;` token that ASI inserts at
    /// `position`.
    fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
        Token {
            kind: TokenKind::Semicolon,
            text: "",
            byte_offset: position as u32,
            byte_length: 0,
        }
    }
173
    /// Lexes exactly one token at the current offset. Callers ensure the
    /// lexer is not at EOF first.
    fn create_token(&mut self) -> Token<'source> {
        // Operators/punctuation first, longest match wins (3, 2, 1 chars).
        if let Some(token) = self.lex_lookahead_symbol() {
            return token;
        }

        let c = self.current_char();
        match c {
            '0'..='9' => self.lex_number(),
            // Identifiers start with any alphabetic char (Unicode) or '_'.
            _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
            '"' => self.lex_string_literal(),
            '`' => self.lex_backtick_literal(),
            '\'' => self.lex_char(),
            // NOTE(review): reachable only if the symbol tables above do
            // not claim '/' — confirm against TokenKind's symbol tables.
            '/' => self.lex_slash(),
            ';' => self.semicolon_token(),
            '@' => self.lex_directive(),
            _ => self.handle_unexpected_char(),
        }
    }
192
193 #[inline]
194 fn current_byte(&self) -> u8 {
195 if self.current_offset < self.input_bytes.len() {
196 self.input_bytes[self.current_offset]
197 } else {
198 0
199 }
200 }
201
    /// Character at the current offset, or '\0' at end of input.
    #[inline]
    fn current_char(&self) -> char {
        self.input[self.current_offset..]
            .chars()
            .next()
            .unwrap_or('\0')
    }
209
210 #[inline]
211 fn peek_byte(&self) -> u8 {
212 if self.current_offset + 1 < self.input_bytes.len() {
213 self.input_bytes[self.current_offset + 1]
214 } else {
215 0
216 }
217 }
218
219 #[inline]
220 fn peek_char(&self) -> char {
221 let next_offset = if self.current_byte() < 128 {
222 self.current_offset + 1
223 } else {
224 self.current_offset + self.current_char().len_utf8()
225 };
226 self.input[next_offset..].chars().next().unwrap_or('\0')
227 }
228
    /// Character `n` positions ahead of the current offset, counted in
    /// chars (not bytes); `peek_char_n(0)` is the current character.
    /// Returns '\0' if the input runs out first.
    fn peek_char_n(&self, n: usize) -> char {
        let mut offset = self.current_offset;
        for _ in 0..n {
            if offset >= self.input.len() {
                return '\0';
            }
            let c = self.input[offset..].chars().next().unwrap_or('\0');
            offset += c.len_utf8();
        }
        self.input[offset..].chars().next().unwrap_or('\0')
    }
240
    /// Advances past the current character; no-op at EOF.
    fn next(&mut self) {
        if self.at_eof() {
            return;
        }
        // ASCII advances one byte; otherwise step the char's UTF-8 width.
        if self.current_byte() < 128 {
            self.current_offset += 1;
        } else {
            self.current_offset += self.current_char().len_utf8();
        }
    }
251
252 fn skip(&mut self, count: usize) {
253 for _ in 0..count {
254 self.next();
255 }
256 }
257
    /// Skips ASCII whitespace, recording every newline for blank-line
    /// trivia tracking.
    fn skip_whitespace(&mut self) {
        while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
            if self.current_byte() == b'\n' {
                self.record_newline();
            }
            self.next();
        }
    }
266
    /// Skips spaces and tabs only; never consumes newlines.
    fn skip_horizontal_whitespace(&mut self) {
        while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
            self.next();
        }
    }
272
    /// Notes a newline at the current offset, recording a blank line in the
    /// trivia when only horizontal whitespace separates it from the
    /// previous newline.
    fn record_newline(&mut self) {
        let offset = self.current_offset;

        if let Some(last) = self.last_newline_offset {
            // Text strictly between the previous newline and this one.
            let between = &self.input[last + 1..offset];
            let is_blank = between.is_empty()
                || between
                    .chars()
                    .all(|c| c.is_ascii_whitespace() && c != '\n');
            if is_blank {
                self.trivia.blank_lines.push(offset as u32);
            }
        }

        self.last_newline_offset = Some(offset);
    }
289
    /// True once the current offset has reached the end of the input.
    fn at_eof(&self) -> bool {
        self.current_offset >= self.input.len()
    }
293
    /// Character immediately before the current offset, or '\0' at the
    /// start of input.
    fn previous_char(&self) -> char {
        if self.current_offset == 0 {
            return '\0';
        }
        self.input[..self.current_offset]
            .chars()
            .next_back()
            .unwrap_or('\0')
    }
303
304 fn resync_on_error(&mut self) {
305 while !self.at_eof() {
306 let byte = self.current_byte();
307
308 if byte == b';' || byte == b'}' {
309 break;
310 }
311
312 self.next();
313 }
314 }
315
316 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
318 let start_offset = self.current_offset;
319 let current_char = self.current_char();
320 let next_char = self.peek_char();
321 let third_char = self.peek_char_n(2);
322
323 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
324 self.skip(3);
325 let end_offset = self.current_offset;
326 return Some(Token {
327 kind,
328 text: &self.input[start_offset..end_offset],
329 byte_offset: start_offset as u32,
330 byte_length: (end_offset - start_offset) as u32,
331 });
332 }
333
334 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
335 self.skip(2);
336 let end_offset = self.current_offset;
337 return Some(Token {
338 kind,
339 text: &self.input[start_offset..end_offset],
340 byte_offset: start_offset as u32,
341 byte_length: (end_offset - start_offset) as u32,
342 });
343 }
344
345 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
346 self.next();
347 let end_offset = self.current_offset;
348 return Some(Token {
349 kind,
350 text: &self.input[start_offset..end_offset],
351 byte_offset: start_offset as u32,
352 byte_length: (end_offset - start_offset) as u32,
353 });
354 }
355
356 None
357 }
358
    /// Lexes a numeric literal: a decimal integer/float (optional fraction,
    /// exponent, and `i` imaginary suffix), or — via a `0` prefix — a
    /// hex/octal/binary/legacy-octal form handled by the dedicated helpers.
    fn lex_number(&mut self) -> Token<'source> {
        let start_offset = self.current_offset;

        // A leading '0' may introduce a radix prefix.
        if self.current_byte() == b'0' {
            let next = self.peek_byte();
            match next {
                b'x' | b'X' => {
                    self.next(); self.next(); return self.lex_hex_number(start_offset);
                }
                b'o' | b'O' => {
                    self.next(); self.next(); return self.lex_octal_number(start_offset);
                }
                b'b' | b'B' => {
                    self.next(); self.next(); return self.lex_binary_number(start_offset);
                }
                // `0` directly followed by an octal digit: legacy octal.
                b'0'..=b'7' => {
                    return self.lex_legacy_octal_number(start_offset);
                }
                _ => {}
            }
        }

        let mut kind = TokenKind::Integer;

        // Integer part: digits with '_' separators; doubled '_' is an
        // error but still consumed so lexing can continue.
        while !self.at_eof() {
            let byte = self.current_byte();
            if byte.is_ascii_digit() || byte == b'_' {
                if byte == b'_' && self.previous_char() == '_' {
                    let underscore_start = self.current_offset - 1;
                    self.error_consecutive_underscores(underscore_start);
                }
                self.next();
            } else {
                break;
            }
        }

        if self.previous_char() == '_' {
            self.error_number_trailing_underscore(
                self.current_offset - self.previous_char().len_utf8(),
            );
        }

        // NOTE(review): when this digit run itself directly follows a
        // single '.' (but not a '..' range), a subsequent '.' is not
        // treated as a decimal point — presumably to keep forms like
        // `x.1.2` from lexing `1.2` as a float; confirm against the parser.
        let preceded_by_dot = start_offset > 0
            && self.input_bytes[start_offset - 1] == b'.'
            && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');

        // Fractional part: '.' followed by a digit or '_' (but never '..',
        // which is a range operator).
        if !preceded_by_dot
            && self.current_byte() == b'.'
            && self.peek_byte() != b'.'
            && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
        {
            kind = TokenKind::Float;
            self.next();

            // `1._5` — separator may not open the fraction.
            if self.current_byte() == b'_' {
                self.error_decimal_leading_underscore(self.current_offset);
            }

            while !self.at_eof() {
                let byte = self.current_byte();
                if byte.is_ascii_digit() || byte == b'_' {
                    if byte == b'_' && self.previous_char() == '_' {
                        let underscore_start = self.current_offset - 1;
                        self.error_consecutive_underscores(underscore_start);
                    }
                    self.next();
                } else {
                    break;
                }
            }

            if self.previous_char() == '_' {
                self.error_number_trailing_underscore(
                    self.current_offset - self.previous_char().len_utf8(),
                );
            }
        }

        // Exponent: `e`/`E`, optional sign, then digits (required).
        if self.current_byte() == b'e' || self.current_byte() == b'E' {
            kind = TokenKind::Float;
            let exponent_start = self.current_offset;
            self.next();
            if self.current_byte() == b'+' || self.current_byte() == b'-' {
                self.next();
            }

            if !self.current_byte().is_ascii_digit() {
                self.error_missing_exponent_digits(
                    exponent_start,
                    self.current_offset - exponent_start,
                );
            }

            while !self.at_eof() {
                let byte = self.current_byte();
                if byte.is_ascii_digit() || byte == b'_' {
                    if byte == b'_' && self.previous_char() == '_' {
                        let underscore_start = self.current_offset - 1;
                        self.error_consecutive_underscores(underscore_start);
                    }
                    self.next();
                } else {
                    break;
                }
            }

            if self.previous_char() == '_' {
                self.error_number_trailing_underscore(
                    self.current_offset - self.previous_char().len_utf8(),
                );
            }
        }

        // `i` suffix (not followed by more alphanumerics) → imaginary.
        if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
            self.next();
            let end_offset = self.current_offset;
            return Token {
                kind: TokenKind::Imaginary,
                text: &self.input[start_offset..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        let end_offset = self.current_offset;
        Token {
            kind,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
501
    /// Lexes the digit run of a hex literal; the `0x` prefix has already
    /// been consumed and `start_offset` points at the `0`.
    fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
        let digits_start = self.current_offset;

        while !self.at_eof() {
            let byte = self.current_byte();
            if byte.is_ascii_hexdigit() || byte == b'_' {
                // `__` is reported but still consumed.
                if byte == b'_' && self.previous_char() == '_' {
                    let underscore_start = self.current_offset - 1;
                    self.error_consecutive_underscores(underscore_start);
                }
                self.next();
            } else {
                break;
            }
        }

        // `0x` with no digits at all.
        if self.current_offset == digits_start {
            self.error_missing_hex_digits(start_offset, 2);
        }

        if self.previous_char() == '_' {
            self.error_number_trailing_underscore(
                self.current_offset - self.previous_char().len_utf8(),
            );
        }

        // An `i` suffix is consumed, but imaginary literals must be
        // decimal — report and still produce an Imaginary token.
        if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
            self.next();
            let end_offset = self.current_offset;
            self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
            return Token {
                kind: TokenKind::Imaginary,
                text: &self.input[start_offset..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        let end_offset = self.current_offset;
        Token {
            kind: TokenKind::Integer,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
548
    /// Lexes the digit run of an `0o` octal literal; the prefix has already
    /// been consumed and `start_offset` points at the `0`.
    fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
        let digits_start = self.current_offset;

        while !self.at_eof() {
            let byte = self.current_byte();
            if (b'0'..=b'7').contains(&byte) || byte == b'_' {
                if byte == b'_' && self.previous_char() == '_' {
                    let underscore_start = self.current_offset - 1;
                    self.error_consecutive_underscores(underscore_start);
                }
                self.next();
            } else if byte == b'8' || byte == b'9' {
                // Out-of-range digit: report, but keep consuming so the
                // whole run stays one token.
                self.error_invalid_octal_digit(self.current_offset);
                self.next();
            } else {
                break;
            }
        }

        // `0o` with no digits at all.
        if self.current_offset == digits_start {
            self.error_missing_octal_digits(start_offset, 2);
        }

        if self.previous_char() == '_' {
            self.error_number_trailing_underscore(
                self.current_offset - self.previous_char().len_utf8(),
            );
        }

        // `i` suffix consumed but rejected: imaginary must be decimal.
        if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
            self.next();
            let end_offset = self.current_offset;
            self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
            return Token {
                kind: TokenKind::Imaginary,
                text: &self.input[start_offset..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        let end_offset = self.current_offset;
        Token {
            kind: TokenKind::Integer,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
598
    /// Lexes a legacy octal literal (`0` directly followed by octal
    /// digits). On entry the offset is still at the leading `0`, which the
    /// initial `next()` consumes.
    fn lex_legacy_octal_number(&mut self, start_offset: usize) -> Token<'source> {
        self.next();

        while !self.at_eof() {
            let byte = self.current_byte();
            if (b'0'..=b'7').contains(&byte) || byte == b'_' {
                if byte == b'_' && self.previous_char() == '_' {
                    let underscore_start = self.current_offset - 1;
                    self.error_consecutive_underscores(underscore_start);
                }
                self.next();
            } else if byte == b'8' || byte == b'9' {
                // Out-of-range digit: report but keep it in the token.
                self.error_invalid_octal_digit(self.current_offset);
                self.next();
            } else {
                break;
            }
        }

        if self.previous_char() == '_' {
            self.error_number_trailing_underscore(
                self.current_offset - self.previous_char().len_utf8(),
            );
        }

        // `i` suffix consumed but rejected: imaginary must be decimal.
        if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
            self.next();
            let end_offset = self.current_offset;
            self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
            return Token {
                kind: TokenKind::Imaginary,
                text: &self.input[start_offset..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        let end_offset = self.current_offset;
        Token {
            kind: TokenKind::Integer,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
644
    /// Lexes the digit run of an `0b` binary literal; the prefix has
    /// already been consumed and `start_offset` points at the `0`.
    fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
        let digits_start = self.current_offset;

        while !self.at_eof() {
            let byte = self.current_byte();
            if byte == b'0' || byte == b'1' || byte == b'_' {
                if byte == b'_' && self.previous_char() == '_' {
                    let underscore_start = self.current_offset - 1;
                    self.error_consecutive_underscores(underscore_start);
                }
                self.next();
            } else if (b'2'..=b'9').contains(&byte) {
                // Non-binary digit: report but keep it in the token.
                self.error_invalid_binary_digit(self.current_offset);
                self.next();
            } else {
                break;
            }
        }

        // `0b` with no digits at all.
        if self.current_offset == digits_start {
            self.error_missing_binary_digits(start_offset, 2);
        }

        if self.previous_char() == '_' {
            self.error_number_trailing_underscore(
                self.current_offset - self.previous_char().len_utf8(),
            );
        }

        // `i` suffix consumed but rejected: imaginary must be decimal.
        if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
            self.next();
            let end_offset = self.current_offset;
            self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
            return Token {
                kind: TokenKind::Imaginary,
                text: &self.input[start_offset..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        let end_offset = self.current_offset;
        Token {
            kind: TokenKind::Integer,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
694
695 fn lex_identifier(&mut self) -> Token<'source> {
696 let start_offset = self.current_offset;
697
698 while !self.at_eof() {
699 let c = self.current_char();
700 if c.is_alphanumeric() || c == '_' {
701 self.next();
702 } else {
703 break;
704 }
705 }
706
707 let end_offset = self.current_offset;
708 let text = &self.input[start_offset..end_offset];
709
710 let kind = match text {
711 "true" | "false" => TokenKind::Boolean,
712 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
713 };
714
715 Token {
716 kind,
717 text,
718 byte_offset: start_offset as u32,
719 byte_length: (end_offset - start_offset) as u32,
720 }
721 }
722
723 fn lex_backtick_literal(&mut self) -> Token<'source> {
724 let start_offset = self.current_offset;
725
726 self.next();
727
728 let mut terminated = false;
729
730 while !self.at_eof() {
731 let byte = self.current_byte();
732 if byte == b'`' {
733 terminated = true;
734 self.next();
735 break;
736 } else if byte == b'\n' {
737 break;
738 }
739 self.next();
740 }
741
742 let end_offset = self.current_offset;
743 let length = end_offset - start_offset;
744
745 if !terminated {
746 self.error_unterminated_backtick(start_offset, length);
747 }
748
749 Token {
750 kind: TokenKind::Backtick,
751 text: &self.input[start_offset..end_offset],
752 byte_offset: start_offset as u32,
753 byte_length: length as u32,
754 }
755 }
756
757 fn consume_octal_escape(&mut self, first_digit: u8) -> u16 {
759 let mut value: u16 = (first_digit - b'0') as u16;
760 for _ in 0..2 {
761 if self.at_eof() {
762 break;
763 }
764 match self.current_byte() {
765 d @ b'0'..=b'7' => {
766 value = value * 8 + (d - b'0') as u16;
767 self.next();
768 }
769 _ => break,
770 }
771 }
772 value
773 }
774
    /// Lexes a double-quoted string literal, validating escape sequences as
    /// it goes. The token text includes both quotes. Strings are
    /// single-line: a newline ends the (unterminated) literal.
    fn lex_string_literal(&mut self) -> Token<'source> {
        let start_offset = self.current_offset;

        // Consume the opening quote.
        self.next();

        let mut escaped = false;
        let mut terminated = false;

        while !self.at_eof() && !terminated {
            let byte = self.current_byte();
            if escaped {
                match byte {
                    // Octal escape: up to three digits, must fit in a byte.
                    b'0'..=b'7' => {
                        let escape_start = self.current_offset - 1;
                        self.next();
                        let value = self.consume_octal_escape(byte);
                        if value > 255 {
                            let escape_len = self.current_offset - escape_start;
                            self.error_octal_escape_out_of_range(escape_start, escape_len);
                        }
                        escaped = false;
                        // Digits already consumed; skip the shared `next`.
                        continue;
                    }
                    b'n' | b't' | b'r' | b'\\' | b'"' | b'x' | b'u' | b'U' => {}
                    b'\'' => {}
                    _ => {
                        self.error_invalid_escape(start_offset, self.current_char());
                    }
                }
                escaped = false;
            } else if byte == b'\\' {
                escaped = true;
            } else if byte == b'"' {
                terminated = true;
                self.next();
                break;
            } else if byte == b'\n' {
                // Stop before the newline; reported as unterminated below.
                break;
            }

            self.next();
        }

        let end_offset = self.current_offset;
        let length = end_offset - start_offset;

        // A lone trailing backslash at end of input.
        if escaped {
            self.error_unterminated_escape(start_offset);
        }

        if !terminated {
            self.error_unterminated_string(start_offset, length);
        }

        Token {
            kind: TokenKind::String,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: length as u32,
        }
    }
836
837 fn push_format_string_text_if_needed(
838 &self,
839 tokens: &mut Vec<Token<'source>>,
840 text_segment_start: usize,
841 ) {
842 if text_segment_start < self.current_offset {
843 tokens.push(Token {
844 kind: TokenKind::FormatStringText,
845 text: &self.input[text_segment_start..self.current_offset],
846 byte_offset: text_segment_start as u32,
847 byte_length: (self.current_offset - text_segment_start) as u32,
848 });
849 }
850 }
851
    /// Lexes one `{…}` interpolation inside a format string: pushes the
    /// start marker, the interior tokens (including nested format strings),
    /// and the end marker. Returns `Err(())` when the interpolation is
    /// unterminated and recovery has skipped to the end of the format
    /// string.
    fn lex_format_string_interpolation(
        &mut self,
        tokens: &mut Vec<Token<'source>>,
    ) -> Result<(), ()> {
        let interp_start = self.current_offset;
        self.next();

        tokens.push(Token {
            kind: TokenKind::FormatStringInterpolationStart,
            text: &self.input[interp_start..self.current_offset],
            byte_offset: interp_start as u32,
            byte_length: (self.current_offset - interp_start) as u32,
        });

        // Pre-scan for the matching `}` so we know where to stop lexing.
        let Some(interpolation_end) = self.find_interpolation_boundary() else {
            if self.has_newline_between(interp_start, self.input.len()) {
                self.error_multiline_format_string_interpolation(interp_start);
            } else {
                self.error_unclosed_brace_in_format_string(interp_start);
            }
            self.skip_to_format_string_end();
            return Err(());
        };

        if self.has_newline_between(interp_start, interpolation_end) {
            self.error_multiline_format_string_interpolation(interp_start);
        }

        // Lex ordinary tokens up to (but not past) the closing brace.
        while self.current_offset < interpolation_end {
            self.skip_horizontal_whitespace();
            if self.current_offset >= interpolation_end {
                break;
            }

            if self.current_byte() == b'f' && self.peek_byte() == b'"' {
                // Nested format string: splice its tokens in directly.
                let mut fstring_tokens = self.lex_format_string_tokens();
                tokens.append(&mut fstring_tokens);
            } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
                self.error_escaped_quote_in_interpolation(self.current_offset);
                self.skip(2);
            } else {
                let token = self.create_token();
                tokens.push(token);
            }
        }

        // Consume the closing `}` and emit the end marker.
        let close_offset = self.current_offset;
        self.next();
        tokens.push(Token {
            kind: TokenKind::FormatStringInterpolationEnd,
            text: &self.input[close_offset..self.current_offset],
            byte_offset: close_offset as u32,
            byte_length: (self.current_offset - close_offset) as u32,
        });

        Ok(())
    }
909
    /// Scans forward from `start` (just past an opening `{`) to find the
    /// matching `}`, honoring nested braces, quoted literals, and nested
    /// format strings. Returns the offset of the closing `}`, or `None`
    /// when the scan hits a `//` comment, a newline, or end of input first.
    fn scan_interpolation(&self, start: usize) -> Option<usize> {
        let bytes = self.input.as_bytes();
        let mut p = start;
        let mut depth = 1;

        while p < bytes.len() && depth > 0 {
            match bytes[p] {
                b'{' => {
                    depth += 1;
                    p += 1;
                }
                b'}' => {
                    depth -= 1;
                    // Leave `p` pointing at the final closing brace.
                    if depth > 0 {
                        p += 1;
                    }
                }
                // Skip entire string/char/backtick literals.
                b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
                b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
                    p = self.scan_past_fstring(p)?;
                }
                b'\\' => p += 2,
                // A line comment means the `}` cannot be on this line.
                b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
                b'\n' => return None,
                _ => p += 1,
            }
        }

        (depth == 0).then_some(p)
    }
940
    /// Byte offset of the `}` closing the interpolation starting at the
    /// current offset, or `None` when it is unterminated on this line.
    fn find_interpolation_boundary(&self) -> Option<usize> {
        self.scan_interpolation(self.current_offset)
    }
944
    /// Returns the offset just past a quoted literal starting at `start`
    /// with `delimiter`. Backslash escapes are honored except in backtick
    /// literals (which have none). `None` on newline or EOF before the
    /// closing delimiter.
    fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
        let bytes = self.input.as_bytes();
        let mut p = start + 1;
        while p < bytes.len() {
            match bytes[p] {
                b'\\' if delimiter != b'`' => p += 2,
                b'\n' => return None,
                b if b == delimiter => return Some(p + 1),
                _ => p += 1,
            }
        }
        None
    }
958
    /// Returns the offset just past an `f"…"` literal starting at
    /// `position`, skipping `{{`/`}}` brace escapes and recursing into
    /// interpolations. `None` when the literal is unterminated on this
    /// line.
    fn scan_past_fstring(&self, position: usize) -> Option<usize> {
        let bytes = self.input.as_bytes();
        // Skip the `f"` opener.
        let mut p = position + 2;
        while p < bytes.len() {
            match bytes[p] {
                b'\\' => p += 2,
                // `{{` / `}}` are literal braces, not interpolations.
                b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
                b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
                b'{' => {
                    p = self.scan_interpolation(p + 1)?;
                    p += 1;
                }
                b'"' => return Some(p + 1),
                b'\n' => return None,
                _ => p += 1,
            }
        }
        None
    }
978
979 fn skip_to_format_string_end(&mut self) {
980 while !self.at_eof() {
981 match self.current_byte() {
982 b'"' => {
983 self.next();
984 return;
985 }
986 b'\n' => return,
987 _ => self.next(),
988 }
989 }
990 }
991
    /// Lexes an entire `f"…"` format string into a token sequence:
    /// `FormatStringStart`, then alternating text segments and
    /// `{…}` interpolations (lexed recursively), then `FormatStringEnd`.
    /// The caller must be positioned on the `f`.
    fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
        let start_offset = self.current_offset;
        let mut tokens = Vec::new();

        // Consume the `f"` opener.
        self.skip(2);

        let fstring_start_end = self.current_offset;
        tokens.push(Token {
            kind: TokenKind::FormatStringStart,
            text: &self.input[start_offset..fstring_start_end],
            byte_offset: start_offset as u32,
            byte_length: (fstring_start_end - start_offset) as u32,
        });

        let mut text_segment_start = self.current_offset;

        while !self.at_eof() {
            let byte = self.current_byte();

            match byte {
                // Escape: consume the backslash and the escaped char;
                // octal escapes may run up to three digits and are
                // range-checked against a byte.
                b'\\' if !self.at_eof() => {
                    let escape_start = self.current_offset;
                    self.next();
                    if !self.at_eof() {
                        let b = self.current_byte();
                        self.next();
                        if matches!(b, b'0'..=b'7') {
                            let value = self.consume_octal_escape(b);
                            if value > 255 {
                                let escape_len = self.current_offset - escape_start;
                                self.error_octal_escape_out_of_range(escape_start, escape_len);
                            }
                        }
                    }
                }
                // `{{` and `}}` are literal braces inside the text.
                b'{' if self.peek_byte() == b'{' => {
                    self.skip(2);
                }
                b'}' if self.peek_byte() == b'}' => {
                    self.skip(2);
                }
                // Closing quote: flush pending text and finish.
                b'"' => {
                    self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                    let end_offset = self.current_offset;
                    self.next();

                    tokens.push(Token {
                        kind: TokenKind::FormatStringEnd,
                        text: &self.input[end_offset..self.current_offset],
                        byte_offset: end_offset as u32,
                        byte_length: (self.current_offset - end_offset) as u32,
                    });
                    return tokens;
                }

                // Format strings are single-line.
                b'\n' => {
                    let length = self.current_offset.saturating_sub(start_offset);
                    self.error_unterminated_format_string(start_offset, length);
                    return tokens;
                }

                // `{` opens an interpolation.
                b'{' => {
                    self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                    if self.lex_format_string_interpolation(&mut tokens).is_err() {
                        return tokens;
                    }
                    text_segment_start = self.current_offset;
                }
                // A lone `}` has no matching `{`.
                b'}' => {
                    self.error_unmatched_brace_in_format_string(self.current_offset);
                    self.next();
                }
                _ => {
                    self.next();
                }
            }
        }

        // Ran out of input before the closing quote.
        let length = self.current_offset.saturating_sub(start_offset);
        self.error_unterminated_format_string(start_offset, length);
        tokens
    }
1076
    /// Lexes a single-quoted character (rune) literal, validating escapes
    /// and recovering from malformed literals. The token text includes the
    /// quotes (when present).
    fn lex_char(&mut self) -> Token<'source> {
        let start_offset = self.current_offset;

        // Consume the opening quote.
        self.next();

        // Empty literal: `''` or an opening quote at end of input.
        if self.at_eof() || self.current_byte() == b'\'' {
            self.error_empty_rune_literal(start_offset);
            let end_offset = self.current_offset;
            return Token {
                kind: TokenKind::Char,
                text: &self.input[start_offset..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        if self.current_byte() != b'\\' {
            // Plain character payload.
            self.next();
        } else {
            // Escape sequence: consume the backslash first.
            self.next();

            if self.at_eof() {
                self.error_unterminated_escape(start_offset);
                let end_offset = self.current_offset;
                return Token {
                    kind: TokenKind::Char,
                    text: &self.input[start_offset..end_offset],
                    byte_offset: start_offset as u32,
                    byte_length: (end_offset - start_offset) as u32,
                };
            }

            match self.current_byte() {
                // Octal escape, range-checked to a byte.
                b'0'..=b'7' => {
                    let escape_start = self.current_offset - 1;
                    let first = self.current_byte();
                    self.next();
                    let value = self.consume_octal_escape(first);
                    if value > 255 {
                        let escape_len = self.current_offset - escape_start;
                        self.error_octal_escape_out_of_range(escape_start, escape_len);
                    }
                }
                b'n' | b't' | b'r' | b'\\' | b'\'' | b'x' => {
                    self.next();
                }
                _ => {
                    self.error_invalid_escape(start_offset, self.current_char());

                    // Recover by skipping to the closing quote, if any.
                    while !self.at_eof() && self.current_byte() != b'\'' {
                        self.next();
                    }

                    if !self.at_eof() && self.current_byte() == b'\'' {
                        self.next();
                    }

                    let end_offset = self.current_offset;
                    return Token {
                        kind: TokenKind::Char,
                        text: &self.input[start_offset..end_offset],
                        byte_offset: start_offset as u32,
                        byte_length: (end_offset - start_offset) as u32,
                    };
                }
            }
        }

        // Expect the closing quote.
        if self.at_eof() || self.current_byte() != b'\'' {
            let length = self.current_offset - start_offset;
            self.error_unterminated_rune(start_offset, length);
        }

        if !self.at_eof() && self.current_byte() == b'\'' {
            self.next();
        }

        let end_offset = self.current_offset;
        Token {
            kind: TokenKind::Char,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
1162
    /// Lexes a `/` that was not consumed as an operator: a lone slash
    /// yields `Slash`; `//` starts a comment and `///` a doc comment, both
    /// running to end of line. Comment spans are also recorded as trivia.
    ///
    /// NOTE(review): the lone-`/` arm assumes the one-char symbol table
    /// does not claim '/' — confirm against `TokenKind::from_one_char_symbol`.
    fn lex_slash(&mut self) -> Token<'source> {
        let start_offset = self.current_offset;

        if self.peek_byte() != b'/' {
            self.next();
            return Token {
                kind: TokenKind::Slash,
                text: &self.input[start_offset..self.current_offset],
                byte_offset: start_offset as u32,
                byte_length: 1,
            };
        }

        let slash_count = self.count_consecutive(b'/');

        // `////…` is reported but still lexed as a plain comment below.
        if slash_count >= 4 {
            self.error_excess_slashes_in_comment(start_offset, slash_count);
        }

        self.skip(slash_count);

        if slash_count == 3 {
            // Doc comment: drop one leading space so the text starts clean.
            if self.current_byte() == b' ' {
                self.next();
            }
            let text_start = self.current_offset;
            self.skip_to_eol();
            let end_offset = self.current_offset;

            self.trivia
                .doc_comments
                .push((start_offset as u32, end_offset as u32));

            return Token {
                kind: TokenKind::DocComment,
                // Doc-comment text excludes the `///` marker itself.
                text: &self.input[text_start..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        self.skip_to_eol();
        let end_offset = self.current_offset;

        self.trivia
            .comments
            .push((start_offset as u32, end_offset as u32));

        Token {
            kind: TokenKind::Comment,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
1218
1219 fn count_consecutive(&self, byte: u8) -> usize {
1220 let mut count = 0;
1221 let mut offset = self.current_offset;
1222 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1223 count += 1;
1224 offset += 1;
1225 }
1226 count
1227 }
1228
    /// Advances to (but not past) the next newline, or to EOF.
    fn skip_to_eol(&mut self) {
        while !self.at_eof() && self.current_byte() != b'\n' {
            self.next();
        }
    }
1234
1235 fn lex_directive(&mut self) -> Token<'source> {
1236 let start_offset = self.current_offset;
1237
1238 self.next();
1239
1240 while !self.at_eof() {
1241 let byte = self.current_byte();
1242 if byte.is_ascii_alphanumeric() || byte == b'_' {
1243 self.next();
1244 } else {
1245 break;
1246 }
1247 }
1248
1249 let end_offset = self.current_offset;
1250 Token {
1251 kind: TokenKind::Directive,
1252 text: &self.input[start_offset..end_offset],
1253 byte_offset: start_offset as u32,
1254 byte_length: (end_offset - start_offset) as u32,
1255 }
1256 }
1257
    /// Reports an unexpected character, skips ahead to a statement boundary
    /// (`;`, `}`, or EOF), and returns an `Error` token covering the
    /// skipped span.
    fn handle_unexpected_char(&mut self) -> Token<'source> {
        let start_offset = self.current_offset;

        self.error_unexpected_char(self.current_offset, self.current_char());

        self.resync_on_error();

        let end_offset = self.current_offset;

        Token {
            kind: TokenKind::Error,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
1274
    /// Builds the zero-width EOF token at the current offset.
    fn eof_token(&self) -> Token<'source> {
        Token {
            kind: TokenKind::EOF,
            text: &self.input[self.current_offset..self.current_offset],
            byte_offset: self.current_offset as u32,
            byte_length: 0,
        }
    }
1283
1284 fn semicolon_token(&mut self) -> Token<'source> {
1285 let start_offset = self.current_offset;
1286
1287 self.next();
1288
1289 Token {
1290 kind: TokenKind::Semicolon,
1291 text: &self.input[start_offset..self.current_offset],
1292 byte_offset: start_offset as u32,
1293 byte_length: (self.current_offset - start_offset) as u32,
1294 }
1295 }
1296}