1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
/// Hand-written single-pass lexer: turns source text into a token stream,
/// plus trivia (comments, blank lines) and recoverable lexing errors.
pub struct Lexer<'source> {
    // Full source text being lexed.
    input: &'source str,
    // Cached `input.as_bytes()` for cheap byte-level peeking.
    input_bytes: &'source [u8],
    // Byte offset of the cursor; always kept on a UTF-8 character boundary.
    current_offset: usize,
    // Identifier of the file being lexed; presumably attached to reported
    // diagnostics by the error helpers (defined in the `errors` module) —
    // TODO confirm.
    file_id: u32,
    // Recoverable errors collected while lexing.
    errors: Vec<ParseError>,
    // Tokens produced ahead of the main loop (format strings expand into
    // several tokens); stored reversed so `pop` yields source order.
    pending_tokens: Vec<Token<'source>>,
    // Comment/blank-line side data kept out of the token stream.
    trivia: Trivia,
    // Offset of the most recent `\n`, used for blank-line detection.
    last_newline_offset: Option<usize>,
}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
35 pub fn lex(mut self) -> LexResult<'source> {
36 let mut tokens = Vec::new();
37
38 loop {
39 if let Some(token) = self.pending_tokens.pop() {
40 tokens.push(token);
41 continue;
42 }
43
44 self.skip_whitespace();
45
46 if self.at_eof() {
47 tokens.push(self.eof_token());
48 break;
49 }
50
51 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
52 let mut fstring_tokens = self.lex_format_string_tokens();
53 fstring_tokens.reverse();
54 self.pending_tokens = fstring_tokens;
55 continue;
56 }
57
58 let token = self.create_token();
59 tokens.push(token);
60 }
61
62 let tokens = self.insert_semicolons(tokens);
63
64 LexResult {
65 tokens,
66 errors: self.errors,
67 trivia: self.trivia,
68 }
69 }
70
71 fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
72 let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);
73
74 for i in 0..tokens.len() {
75 let token = tokens[i];
76 result.push(token);
77
78 if !Self::triggers_asi(token.kind) {
79 continue;
80 }
81
82 if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
83 if Self::continues_expression(next_token.kind) {
84 continue;
85 }
86
87 let token_end = (token.byte_offset + token.byte_length) as usize;
88 if self.has_newline_between(token_end, next_token.byte_offset as usize) {
89 result.push(self.make_synthetic_semicolon(token_end));
90 }
91 }
92 }
93
94 result
95 }
96
97 fn triggers_asi(kind: TokenKind) -> bool {
98 matches!(
99 kind,
100 TokenKind::Identifier
101 | TokenKind::Integer
102 | TokenKind::Imaginary
103 | TokenKind::Float
104 | TokenKind::String
105 | TokenKind::Char
106 | TokenKind::Boolean
107 | TokenKind::RightParen
108 | TokenKind::RightSquareBracket
109 | TokenKind::RightCurlyBrace
110 | TokenKind::Break
111 | TokenKind::Continue
112 | TokenKind::Return
113 | TokenKind::DotDot
114 | TokenKind::DotDotEqual
115 | TokenKind::QuestionMark
116 )
117 }
118
119 fn continues_expression(kind: TokenKind) -> bool {
120 matches!(
121 kind,
122 TokenKind::Plus
123 | TokenKind::Star
124 | TokenKind::Slash
125 | TokenKind::Percent
126 | TokenKind::Pipeline
127 | TokenKind::AmpersandDouble
128 | TokenKind::PipeDouble
129 | TokenKind::EqualDouble
130 | TokenKind::NotEqual
131 | TokenKind::LeftAngleBracket
132 | TokenKind::RightAngleBracket
133 | TokenKind::LessThanOrEqual
134 | TokenKind::GreaterThanOrEqual
135 | TokenKind::Dot
136 | TokenKind::Equal
137 | TokenKind::PlusEqual
138 | TokenKind::MinusEqual
139 | TokenKind::StarEqual
140 | TokenKind::SlashEqual
141 | TokenKind::Else
142 | TokenKind::LeftCurlyBrace
143 | TokenKind::RightCurlyBrace
144 | TokenKind::RightParen
145 | TokenKind::RightSquareBracket
146 | TokenKind::As
147 )
148 }
149
150 fn find_next_non_comment_token<'a>(
151 &self,
152 tokens: &'a [Token<'source>],
153 start_index: usize,
154 ) -> Option<&'a Token<'source>> {
155 tokens
156 .iter()
157 .skip(start_index)
158 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
159 }
160
161 fn has_newline_between(&self, start: usize, end: usize) -> bool {
162 self.input[start..end].contains('\n')
163 }
164
165 fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
166 Token {
167 kind: TokenKind::Semicolon,
168 text: "",
169 byte_offset: position as u32,
170 byte_length: 0,
171 }
172 }
173
174 fn create_token(&mut self) -> Token<'source> {
175 if let Some(token) = self.lex_lookahead_symbol() {
176 return token;
177 }
178
179 let c = self.current_char();
180 match c {
181 '0'..='9' => self.lex_number(),
182 _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
183 '"' => self.lex_string_literal(),
184 '`' => self.lex_backtick_literal(),
185 '\'' => self.lex_char(),
186 '/' => self.lex_slash(),
187 ';' => self.semicolon_token(),
188 '@' => self.lex_directive(),
189 _ => self.handle_unexpected_char(),
190 }
191 }
192
193 #[inline]
194 fn current_byte(&self) -> u8 {
195 if self.current_offset < self.input_bytes.len() {
196 self.input_bytes[self.current_offset]
197 } else {
198 0
199 }
200 }
201
202 #[inline]
203 fn current_char(&self) -> char {
204 self.input[self.current_offset..]
205 .chars()
206 .next()
207 .unwrap_or('\0')
208 }
209
210 #[inline]
211 fn peek_byte(&self) -> u8 {
212 if self.current_offset + 1 < self.input_bytes.len() {
213 self.input_bytes[self.current_offset + 1]
214 } else {
215 0
216 }
217 }
218
219 #[inline]
220 fn peek_char(&self) -> char {
221 let next_offset = if self.current_byte() < 128 {
222 self.current_offset + 1
223 } else {
224 self.current_offset + self.current_char().len_utf8()
225 };
226 self.input[next_offset..].chars().next().unwrap_or('\0')
227 }
228
229 fn peek_char_n(&self, n: usize) -> char {
230 let mut offset = self.current_offset;
231 for _ in 0..n {
232 if offset >= self.input.len() {
233 return '\0';
234 }
235 let c = self.input[offset..].chars().next().unwrap_or('\0');
236 offset += c.len_utf8();
237 }
238 self.input[offset..].chars().next().unwrap_or('\0')
239 }
240
241 fn next(&mut self) {
242 if self.at_eof() {
243 return;
244 }
245 if self.current_byte() < 128 {
246 self.current_offset += 1;
247 } else {
248 self.current_offset += self.current_char().len_utf8();
249 }
250 }
251
252 fn skip(&mut self, count: usize) {
253 for _ in 0..count {
254 self.next();
255 }
256 }
257
258 fn skip_whitespace(&mut self) {
259 while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
260 if self.current_byte() == b'\n' {
261 self.record_newline();
262 }
263 self.next();
264 }
265 }
266
267 fn skip_horizontal_whitespace(&mut self) {
268 while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
269 self.next();
270 }
271 }
272
273 fn record_newline(&mut self) {
274 let offset = self.current_offset;
275
276 if let Some(last) = self.last_newline_offset {
277 let between = &self.input[last + 1..offset];
278 let is_blank = between.is_empty()
279 || between
280 .chars()
281 .all(|c| c.is_ascii_whitespace() && c != '\n');
282 if is_blank {
283 self.trivia.blank_lines.push(offset as u32);
284 }
285 }
286
287 self.last_newline_offset = Some(offset);
288 }
289
290 fn at_eof(&self) -> bool {
291 self.current_offset >= self.input.len()
292 }
293
294 fn previous_char(&self) -> char {
295 if self.current_offset == 0 {
296 return '\0';
297 }
298 self.input[..self.current_offset]
299 .chars()
300 .next_back()
301 .unwrap_or('\0')
302 }
303
304 fn resync_on_error(&mut self) {
305 while !self.at_eof() {
306 let byte = self.current_byte();
307
308 if byte == b';' || byte == b'}' {
309 break;
310 }
311
312 self.next();
313 }
314 }
315
316 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
318 let start_offset = self.current_offset;
319 let current_char = self.current_char();
320 let next_char = self.peek_char();
321 let third_char = self.peek_char_n(2);
322
323 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
324 self.skip(3);
325 let end_offset = self.current_offset;
326 return Some(Token {
327 kind,
328 text: &self.input[start_offset..end_offset],
329 byte_offset: start_offset as u32,
330 byte_length: (end_offset - start_offset) as u32,
331 });
332 }
333
334 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
335 self.skip(2);
336 let end_offset = self.current_offset;
337 return Some(Token {
338 kind,
339 text: &self.input[start_offset..end_offset],
340 byte_offset: start_offset as u32,
341 byte_length: (end_offset - start_offset) as u32,
342 });
343 }
344
345 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
346 self.next();
347 let end_offset = self.current_offset;
348 return Some(Token {
349 kind,
350 text: &self.input[start_offset..end_offset],
351 byte_offset: start_offset as u32,
352 byte_length: (end_offset - start_offset) as u32,
353 });
354 }
355
356 None
357 }
358
359 fn lex_number(&mut self) -> Token<'source> {
360 let start_offset = self.current_offset;
361
362 if self.current_byte() == b'0' {
363 let next = self.peek_byte();
364 match next {
365 b'x' | b'X' => {
366 self.next(); self.next(); return self.lex_hex_number(start_offset);
369 }
370 b'o' | b'O' => {
371 self.next(); self.next(); return self.lex_octal_number(start_offset);
374 }
375 b'b' | b'B' => {
376 self.next(); self.next(); return self.lex_binary_number(start_offset);
379 }
380 b'0'..=b'7' => {
381 return self.lex_legacy_octal_number(start_offset);
382 }
383 _ => {} }
385 }
386
387 let mut kind = TokenKind::Integer;
388
389 while !self.at_eof() {
390 let byte = self.current_byte();
391 if byte.is_ascii_digit() || byte == b'_' {
392 if byte == b'_' && self.previous_char() == '_' {
393 let underscore_start = self.current_offset - 1;
394 self.error_consecutive_underscores(underscore_start);
395 }
396 self.next();
397 } else {
398 break;
399 }
400 }
401
402 if self.previous_char() == '_' {
403 self.error_number_trailing_underscore(
404 self.current_offset - self.previous_char().len_utf8(),
405 );
406 }
407
408 let preceded_by_dot = start_offset > 0
411 && self.input_bytes[start_offset - 1] == b'.'
412 && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');
413
414 if !preceded_by_dot
415 && self.current_byte() == b'.'
416 && self.peek_byte() != b'.'
417 && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
418 {
419 kind = TokenKind::Float;
420 self.next();
421
422 if self.current_byte() == b'_' {
423 self.error_decimal_leading_underscore(self.current_offset);
424 }
425
426 while !self.at_eof() {
427 let byte = self.current_byte();
428 if byte.is_ascii_digit() || byte == b'_' {
429 if byte == b'_' && self.previous_char() == '_' {
430 let underscore_start = self.current_offset - 1;
431 self.error_consecutive_underscores(underscore_start);
432 }
433 self.next();
434 } else {
435 break;
436 }
437 }
438
439 if self.previous_char() == '_' {
440 self.error_number_trailing_underscore(
441 self.current_offset - self.previous_char().len_utf8(),
442 );
443 }
444 }
445
446 if self.current_byte() == b'e' || self.current_byte() == b'E' {
447 kind = TokenKind::Float;
448 let exponent_start = self.current_offset;
449 self.next(); if self.current_byte() == b'+' || self.current_byte() == b'-' {
452 self.next();
453 }
454
455 if !self.current_byte().is_ascii_digit() {
456 self.error_missing_exponent_digits(
457 exponent_start,
458 self.current_offset - exponent_start,
459 );
460 }
461
462 while !self.at_eof() {
463 let byte = self.current_byte();
464 if byte.is_ascii_digit() || byte == b'_' {
465 if byte == b'_' && self.previous_char() == '_' {
466 let underscore_start = self.current_offset - 1;
467 self.error_consecutive_underscores(underscore_start);
468 }
469 self.next();
470 } else {
471 break;
472 }
473 }
474
475 if self.previous_char() == '_' {
476 self.error_number_trailing_underscore(
477 self.current_offset - self.previous_char().len_utf8(),
478 );
479 }
480 }
481
482 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
483 self.next(); let end_offset = self.current_offset;
485 return Token {
486 kind: TokenKind::Imaginary,
487 text: &self.input[start_offset..end_offset],
488 byte_offset: start_offset as u32,
489 byte_length: (end_offset - start_offset) as u32,
490 };
491 }
492
493 let end_offset = self.current_offset;
494 Token {
495 kind,
496 text: &self.input[start_offset..end_offset],
497 byte_offset: start_offset as u32,
498 byte_length: (end_offset - start_offset) as u32,
499 }
500 }
501
502 fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
503 let digits_start = self.current_offset;
504
505 while !self.at_eof() {
506 let byte = self.current_byte();
507 if byte.is_ascii_hexdigit() || byte == b'_' {
508 if byte == b'_' && self.previous_char() == '_' {
509 let underscore_start = self.current_offset - 1;
510 self.error_consecutive_underscores(underscore_start);
511 }
512 self.next();
513 } else {
514 break;
515 }
516 }
517
518 if self.current_offset == digits_start {
519 self.error_missing_hex_digits(start_offset, 2);
520 }
521
522 if self.previous_char() == '_' {
523 self.error_number_trailing_underscore(
524 self.current_offset - self.previous_char().len_utf8(),
525 );
526 }
527
528 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
529 self.next(); let end_offset = self.current_offset;
531 self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
532 return Token {
533 kind: TokenKind::Imaginary,
534 text: &self.input[start_offset..end_offset],
535 byte_offset: start_offset as u32,
536 byte_length: (end_offset - start_offset) as u32,
537 };
538 }
539
540 let end_offset = self.current_offset;
541 Token {
542 kind: TokenKind::Integer,
543 text: &self.input[start_offset..end_offset],
544 byte_offset: start_offset as u32,
545 byte_length: (end_offset - start_offset) as u32,
546 }
547 }
548
549 fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
550 let digits_start = self.current_offset;
551
552 while !self.at_eof() {
553 let byte = self.current_byte();
554 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
555 if byte == b'_' && self.previous_char() == '_' {
556 let underscore_start = self.current_offset - 1;
557 self.error_consecutive_underscores(underscore_start);
558 }
559 self.next();
560 } else if byte == b'8' || byte == b'9' {
561 self.error_invalid_octal_digit(self.current_offset);
562 self.next();
563 } else {
564 break;
565 }
566 }
567
568 if self.current_offset == digits_start {
569 self.error_missing_octal_digits(start_offset, 2);
570 }
571
572 if self.previous_char() == '_' {
573 self.error_number_trailing_underscore(
574 self.current_offset - self.previous_char().len_utf8(),
575 );
576 }
577
578 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
579 self.next(); let end_offset = self.current_offset;
581 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
582 return Token {
583 kind: TokenKind::Imaginary,
584 text: &self.input[start_offset..end_offset],
585 byte_offset: start_offset as u32,
586 byte_length: (end_offset - start_offset) as u32,
587 };
588 }
589
590 let end_offset = self.current_offset;
591 Token {
592 kind: TokenKind::Integer,
593 text: &self.input[start_offset..end_offset],
594 byte_offset: start_offset as u32,
595 byte_length: (end_offset - start_offset) as u32,
596 }
597 }
598
599 fn lex_legacy_octal_number(&mut self, start_offset: usize) -> Token<'source> {
600 self.next();
601
602 while !self.at_eof() {
603 let byte = self.current_byte();
604 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
605 if byte == b'_' && self.previous_char() == '_' {
606 let underscore_start = self.current_offset - 1;
607 self.error_consecutive_underscores(underscore_start);
608 }
609 self.next();
610 } else if byte == b'8' || byte == b'9' {
611 self.error_invalid_octal_digit(self.current_offset);
612 self.next();
613 } else {
614 break;
615 }
616 }
617
618 if self.previous_char() == '_' {
619 self.error_number_trailing_underscore(
620 self.current_offset - self.previous_char().len_utf8(),
621 );
622 }
623
624 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
625 self.next();
626 let end_offset = self.current_offset;
627 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
628 return Token {
629 kind: TokenKind::Imaginary,
630 text: &self.input[start_offset..end_offset],
631 byte_offset: start_offset as u32,
632 byte_length: (end_offset - start_offset) as u32,
633 };
634 }
635
636 let end_offset = self.current_offset;
637 Token {
638 kind: TokenKind::Integer,
639 text: &self.input[start_offset..end_offset],
640 byte_offset: start_offset as u32,
641 byte_length: (end_offset - start_offset) as u32,
642 }
643 }
644
645 fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
646 let digits_start = self.current_offset;
647
648 while !self.at_eof() {
649 let byte = self.current_byte();
650 if byte == b'0' || byte == b'1' || byte == b'_' {
651 if byte == b'_' && self.previous_char() == '_' {
652 let underscore_start = self.current_offset - 1;
653 self.error_consecutive_underscores(underscore_start);
654 }
655 self.next();
656 } else if (b'2'..=b'9').contains(&byte) {
657 self.error_invalid_binary_digit(self.current_offset);
658 self.next();
659 } else {
660 break;
661 }
662 }
663
664 if self.current_offset == digits_start {
665 self.error_missing_binary_digits(start_offset, 2);
666 }
667
668 if self.previous_char() == '_' {
669 self.error_number_trailing_underscore(
670 self.current_offset - self.previous_char().len_utf8(),
671 );
672 }
673
674 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
675 self.next();
676 let end_offset = self.current_offset;
677 self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
678 return Token {
679 kind: TokenKind::Imaginary,
680 text: &self.input[start_offset..end_offset],
681 byte_offset: start_offset as u32,
682 byte_length: (end_offset - start_offset) as u32,
683 };
684 }
685
686 let end_offset = self.current_offset;
687 Token {
688 kind: TokenKind::Integer,
689 text: &self.input[start_offset..end_offset],
690 byte_offset: start_offset as u32,
691 byte_length: (end_offset - start_offset) as u32,
692 }
693 }
694
695 fn lex_identifier(&mut self) -> Token<'source> {
696 let start_offset = self.current_offset;
697
698 while !self.at_eof() {
699 let c = self.current_char();
700 if c.is_alphanumeric() || c == '_' {
701 self.next();
702 } else {
703 break;
704 }
705 }
706
707 let end_offset = self.current_offset;
708 let text = &self.input[start_offset..end_offset];
709
710 let kind = match text {
711 "true" | "false" => TokenKind::Boolean,
712 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
713 };
714
715 Token {
716 kind,
717 text,
718 byte_offset: start_offset as u32,
719 byte_length: (end_offset - start_offset) as u32,
720 }
721 }
722
723 fn lex_backtick_literal(&mut self) -> Token<'source> {
724 let start_offset = self.current_offset;
725
726 self.next();
727
728 let mut terminated = false;
729
730 while !self.at_eof() {
731 let byte = self.current_byte();
732 if byte == b'`' {
733 terminated = true;
734 self.next();
735 break;
736 } else if byte == b'\n' {
737 break;
738 }
739 self.next();
740 }
741
742 let end_offset = self.current_offset;
743 let length = end_offset - start_offset;
744
745 if !terminated {
746 self.error_unterminated_backtick(start_offset, length);
747 }
748
749 Token {
750 kind: TokenKind::Backtick,
751 text: &self.input[start_offset..end_offset],
752 byte_offset: start_offset as u32,
753 byte_length: length as u32,
754 }
755 }
756
757 fn lex_string_literal(&mut self) -> Token<'source> {
758 let start_offset = self.current_offset;
759
760 self.next();
761
762 let mut escaped = false;
763 let mut terminated = false;
764
765 while !self.at_eof() && !terminated {
766 let byte = self.current_byte();
767 if escaped {
768 match byte {
769 b'n' | b't' | b'r' | b'0' | b'\\' | b'"' | b'x' | b'u' | b'U' => {}
770 b'\'' => {}
771 _ => {
772 self.error_invalid_escape(start_offset, self.current_char());
773 }
774 }
775 escaped = false;
776 } else if byte == b'\\' {
777 escaped = true;
778 } else if byte == b'"' {
779 terminated = true;
780 self.next();
781 break;
782 } else if byte == b'\n' {
783 break; }
785
786 self.next();
787 }
788
789 let end_offset = self.current_offset;
790 let length = end_offset - start_offset;
791
792 if escaped {
793 self.error_unterminated_escape(start_offset);
794 }
795
796 if !terminated {
797 self.error_unterminated_string(start_offset, length);
798 }
799
800 Token {
801 kind: TokenKind::String,
802 text: &self.input[start_offset..end_offset],
803 byte_offset: start_offset as u32,
804 byte_length: length as u32,
805 }
806 }
807
808 fn push_format_string_text_if_needed(
809 &self,
810 tokens: &mut Vec<Token<'source>>,
811 text_segment_start: usize,
812 ) {
813 if text_segment_start < self.current_offset {
814 tokens.push(Token {
815 kind: TokenKind::FormatStringText,
816 text: &self.input[text_segment_start..self.current_offset],
817 byte_offset: text_segment_start as u32,
818 byte_length: (self.current_offset - text_segment_start) as u32,
819 });
820 }
821 }
822
    /// Lexes one `{ ... }` interpolation inside a format string: pushes a
    /// `FormatStringInterpolationStart` token, the expression tokens between
    /// the braces (including nested f-strings), and a
    /// `FormatStringInterpolationEnd` token.
    ///
    /// Returns `Err(())` when no matching `}` exists on this line; in that
    /// case the cursor has been advanced past the end of the format string
    /// and the caller should abandon it.
    fn lex_format_string_interpolation(
        &mut self,
        tokens: &mut Vec<Token<'source>>,
    ) -> Result<(), ()> {
        // Cursor sits on the opening `{`.
        let interp_start = self.current_offset;
        self.next();

        tokens.push(Token {
            kind: TokenKind::FormatStringInterpolationStart,
            text: &self.input[interp_start..self.current_offset],
            byte_offset: interp_start as u32,
            byte_length: (self.current_offset - interp_start) as u32,
        });

        // Pre-scan for the matching `}` so the expression loop below knows
        // exactly where to stop.
        let Some(interpolation_end) = self.find_interpolation_boundary() else {
            // Distinguish "brace would close on a later line" from "never
            // closed at all".
            if self.has_newline_between(interp_start, self.input.len()) {
                self.error_multiline_format_string_interpolation(interp_start);
            } else {
                self.error_unclosed_brace_in_format_string(interp_start);
            }
            self.skip_to_format_string_end();
            return Err(());
        };

        if self.has_newline_between(interp_start, interpolation_end) {
            self.error_multiline_format_string_interpolation(interp_start);
        }

        // Lex ordinary tokens up to (not including) the closing `}`.
        while self.current_offset < interpolation_end {
            self.skip_horizontal_whitespace();
            if self.current_offset >= interpolation_end {
                break;
            }

            if self.current_byte() == b'f' && self.peek_byte() == b'"' {
                // Nested f-string: recurse and splice its tokens inline.
                let mut fstring_tokens = self.lex_format_string_tokens();
                tokens.append(&mut fstring_tokens);
            } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
                // An escaped quote cannot appear inside an interpolation.
                self.error_escaped_quote_in_interpolation(self.current_offset);
                self.skip(2);
            } else {
                let token = self.create_token();
                tokens.push(token);
            }
        }

        // Cursor now sits on the closing `}`.
        let close_offset = self.current_offset;
        self.next();
        tokens.push(Token {
            kind: TokenKind::FormatStringInterpolationEnd,
            text: &self.input[close_offset..self.current_offset],
            byte_offset: close_offset as u32,
            byte_length: (self.current_offset - close_offset) as u32,
        });

        Ok(())
    }
880
    /// Scans forward from `start` (just past an opening `{`) for the
    /// matching `}` of a format-string interpolation and returns the byte
    /// offset of that `}`.
    ///
    /// Nested braces, quoted literals, and nested f-strings are skipped so
    /// their contents cannot close the interpolation early. Returns `None`
    /// when the interpolation cannot terminate on this line: a newline or a
    /// `//` comment is reached first, or the input ends.
    fn scan_interpolation(&self, start: usize) -> Option<usize> {
        let bytes = self.input.as_bytes();
        let mut p = start;
        let mut depth = 1;

        while p < bytes.len() && depth > 0 {
            match bytes[p] {
                b'{' => {
                    depth += 1;
                    p += 1;
                }
                b'}' => {
                    depth -= 1;
                    // When this closes the outermost brace, leave `p` on the
                    // `}` itself so the caller receives its offset.
                    if depth > 0 {
                        p += 1;
                    }
                }
                // Jump past a string/char/backtick literal in one step.
                b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
                b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
                    p = self.scan_past_fstring(p)?;
                }
                // Escape: skip the backslash and the escaped byte.
                b'\\' => p += 2,
                // A line comment or newline means the closing `}` can never
                // appear on this line.
                b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
                b'\n' => return None,
                _ => p += 1,
            }
        }

        (depth == 0).then_some(p)
    }
911
    /// Byte offset of the `}` closing the interpolation that begins at the
    /// cursor (which sits just past the opening `{`), or `None` when it is
    /// unmatched on this line.
    fn find_interpolation_boundary(&self) -> Option<usize> {
        self.scan_interpolation(self.current_offset)
    }
915
916 fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
917 let bytes = self.input.as_bytes();
918 let mut p = start + 1;
919 while p < bytes.len() {
920 match bytes[p] {
921 b'\\' if delimiter != b'`' => p += 2,
922 b'\n' => return None,
923 b if b == delimiter => return Some(p + 1),
924 _ => p += 1,
925 }
926 }
927 None
928 }
929
    /// Scans past a nested f-string starting at `position` (the `f` of
    /// `f"`), returning the offset just after its closing quote.
    ///
    /// `{{`/`}}` are literal braces; a single `{` recurses into
    /// `scan_interpolation` (the mutual recursion mirrors the real lexer).
    /// Returns `None` when the f-string is unterminated on this line.
    fn scan_past_fstring(&self, position: usize) -> Option<usize> {
        let bytes = self.input.as_bytes();
        // Skip the `f"` prefix.
        let mut p = position + 2;
        while p < bytes.len() {
            match bytes[p] {
                b'\\' => p += 2,
                b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
                b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
                b'{' => {
                    // `scan_interpolation` returns the offset of the closing
                    // `}`; step past it.
                    p = self.scan_interpolation(p + 1)?;
                    p += 1;
                }
                b'"' => return Some(p + 1),
                b'\n' => return None,
                _ => p += 1,
            }
        }
        None
    }
949
950 fn skip_to_format_string_end(&mut self) {
951 while !self.at_eof() {
952 match self.current_byte() {
953 b'"' => {
954 self.next();
955 return;
956 }
957 b'\n' => return,
958 _ => self.next(),
959 }
960 }
961 }
962
    /// Lexes an entire `f"..."` format string into a flat token sequence:
    /// `FormatStringStart`, then alternating `FormatStringText` runs and
    /// interpolation token groups, then `FormatStringEnd`.
    ///
    /// Format strings are single-line; on an unterminated string (newline or
    /// EOF before the closing quote) an error is reported and the tokens
    /// lexed so far are returned.
    fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
        let start_offset = self.current_offset;
        let mut tokens = Vec::new();

        // Consume the `f"` prefix.
        self.skip(2);

        let fstring_start_end = self.current_offset;
        tokens.push(Token {
            kind: TokenKind::FormatStringStart,
            text: &self.input[start_offset..fstring_start_end],
            byte_offset: start_offset as u32,
            byte_length: (fstring_start_end - start_offset) as u32,
        });

        // Start of the literal-text run currently being accumulated.
        let mut text_segment_start = self.current_offset;

        while !self.at_eof() {
            let byte = self.current_byte();

            match byte {
                // Escape: consume backslash + escaped char so an escaped
                // quote does not end the string. (The guard is always true
                // here — the loop condition guarantees it.)
                b'\\' if !self.at_eof() => {
                    self.next();
                    if !self.at_eof() {
                        self.next();
                    }
                }
                // `{{` and `}}` are literal braces within the text run.
                b'{' if self.peek_byte() == b'{' => {
                    self.skip(2);
                }
                b'}' if self.peek_byte() == b'}' => {
                    self.skip(2);
                }
                // Closing quote: flush pending text, emit the end token.
                b'"' => {
                    self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                    let end_offset = self.current_offset;
                    self.next();

                    tokens.push(Token {
                        kind: TokenKind::FormatStringEnd,
                        text: &self.input[end_offset..self.current_offset],
                        byte_offset: end_offset as u32,
                        byte_length: (self.current_offset - end_offset) as u32,
                    });
                    return tokens;
                }

                // Newline before the closing quote: unterminated.
                b'\n' => {
                    let length = self.current_offset.saturating_sub(start_offset);
                    self.error_unterminated_format_string(start_offset, length);
                    return tokens;
                }

                // Interpolation: flush pending text, then lex `{ ... }`.
                b'{' => {
                    self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                    if self.lex_format_string_interpolation(&mut tokens).is_err() {
                        return tokens;
                    }
                    text_segment_start = self.current_offset;
                }
                // A lone `}` has no matching `{`.
                b'}' => {
                    self.error_unmatched_brace_in_format_string(self.current_offset);
                    self.next();
                }
                _ => {
                    self.next();
                }
            }
        }

        // EOF before the closing quote.
        let length = self.current_offset.saturating_sub(start_offset);
        self.error_unterminated_format_string(start_offset, length);
        tokens
    }
1038
    /// Lexes a character ("rune") literal: `'x'` or an escape like `'\n'`.
    ///
    /// Empty literals, invalid or dangling escapes, and missing closing
    /// quotes are reported, but a `Char` token covering the consumed span is
    /// always returned; invalid escapes skip ahead to the closing quote to
    /// resynchronize.
    fn lex_char(&mut self) -> Token<'source> {
        let start_offset = self.current_offset;

        // Opening quote.
        self.next();

        if self.at_eof() || self.current_byte() == b'\'' {
            // `''` or a quote at EOF: no character at all. Note the second
            // quote (if any) is left unconsumed here.
            self.error_empty_rune_literal(start_offset);
            let end_offset = self.current_offset;
            return Token {
                kind: TokenKind::Char,
                text: &self.input[start_offset..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        if self.current_byte() != b'\\' {
            // Plain single character.
            self.next();
        } else {
            // Escape sequence: consume the backslash, then validate.
            self.next();

            if self.at_eof() {
                // Input ended right after the backslash.
                self.error_unterminated_escape(start_offset);
                let end_offset = self.current_offset;
                return Token {
                    kind: TokenKind::Char,
                    text: &self.input[start_offset..end_offset],
                    byte_offset: start_offset as u32,
                    byte_length: (end_offset - start_offset) as u32,
                };
            }

            match self.current_byte() {
                b'n' | b't' | b'r' | b'0' | b'\\' | b'\'' | b'x' => {
                    self.next();
                }
                _ => {
                    self.error_invalid_escape(start_offset, self.current_char());

                    // Resynchronize: skip to (and past) the closing quote.
                    while !self.at_eof() && self.current_byte() != b'\'' {
                        self.next();
                    }

                    if !self.at_eof() && self.current_byte() == b'\'' {
                        self.next();
                    }

                    let end_offset = self.current_offset;
                    return Token {
                        kind: TokenKind::Char,
                        text: &self.input[start_offset..end_offset],
                        byte_offset: start_offset as u32,
                        byte_length: (end_offset - start_offset) as u32,
                    };
                }
            }
        }

        // The character has been consumed; a closing quote must follow.
        if self.at_eof() || self.current_byte() != b'\'' {
            let length = self.current_offset - start_offset;
            self.error_unterminated_rune(start_offset, length);
        }

        if !self.at_eof() && self.current_byte() == b'\'' {
            self.next();
        }

        let end_offset = self.current_offset;
        Token {
            kind: TokenKind::Char,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
1114
1115 fn lex_slash(&mut self) -> Token<'source> {
1116 let start_offset = self.current_offset;
1117
1118 if self.peek_byte() != b'/' {
1119 self.next();
1120 return Token {
1121 kind: TokenKind::Slash,
1122 text: &self.input[start_offset..self.current_offset],
1123 byte_offset: start_offset as u32,
1124 byte_length: 1,
1125 };
1126 }
1127
1128 let slash_count = self.count_consecutive(b'/');
1129
1130 if slash_count >= 4 {
1131 self.error_excess_slashes_in_comment(start_offset, slash_count);
1132 }
1133
1134 self.skip(slash_count);
1135
1136 if slash_count == 3 {
1137 if self.current_byte() == b' ' {
1138 self.next();
1139 }
1140 let text_start = self.current_offset;
1141 self.skip_to_eol();
1142 let end_offset = self.current_offset;
1143
1144 self.trivia
1145 .doc_comments
1146 .push((start_offset as u32, end_offset as u32));
1147
1148 return Token {
1149 kind: TokenKind::DocComment,
1150 text: &self.input[text_start..end_offset],
1151 byte_offset: start_offset as u32,
1152 byte_length: (end_offset - start_offset) as u32,
1153 };
1154 }
1155
1156 self.skip_to_eol();
1157 let end_offset = self.current_offset;
1158
1159 self.trivia
1160 .comments
1161 .push((start_offset as u32, end_offset as u32));
1162
1163 Token {
1164 kind: TokenKind::Comment,
1165 text: &self.input[start_offset..end_offset],
1166 byte_offset: start_offset as u32,
1167 byte_length: (end_offset - start_offset) as u32,
1168 }
1169 }
1170
1171 fn count_consecutive(&self, byte: u8) -> usize {
1172 let mut count = 0;
1173 let mut offset = self.current_offset;
1174 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1175 count += 1;
1176 offset += 1;
1177 }
1178 count
1179 }
1180
1181 fn skip_to_eol(&mut self) {
1182 while !self.at_eof() && self.current_byte() != b'\n' {
1183 self.next();
1184 }
1185 }
1186
1187 fn lex_directive(&mut self) -> Token<'source> {
1188 let start_offset = self.current_offset;
1189
1190 self.next();
1191
1192 while !self.at_eof() {
1193 let byte = self.current_byte();
1194 if byte.is_ascii_alphanumeric() || byte == b'_' {
1195 self.next();
1196 } else {
1197 break;
1198 }
1199 }
1200
1201 let end_offset = self.current_offset;
1202 Token {
1203 kind: TokenKind::Directive,
1204 text: &self.input[start_offset..end_offset],
1205 byte_offset: start_offset as u32,
1206 byte_length: (end_offset - start_offset) as u32,
1207 }
1208 }
1209
1210 fn handle_unexpected_char(&mut self) -> Token<'source> {
1211 let start_offset = self.current_offset;
1212
1213 self.error_unexpected_char(self.current_offset, self.current_char());
1214
1215 self.resync_on_error();
1216
1217 let end_offset = self.current_offset;
1218
1219 Token {
1220 kind: TokenKind::Error,
1221 text: &self.input[start_offset..end_offset],
1222 byte_offset: start_offset as u32,
1223 byte_length: (end_offset - start_offset) as u32,
1224 }
1225 }
1226
1227 fn eof_token(&self) -> Token<'source> {
1228 Token {
1229 kind: TokenKind::EOF,
1230 text: &self.input[self.current_offset..self.current_offset],
1231 byte_offset: self.current_offset as u32,
1232 byte_length: 0,
1233 }
1234 }
1235
1236 fn semicolon_token(&mut self) -> Token<'source> {
1237 let start_offset = self.current_offset;
1238
1239 self.next();
1240
1241 Token {
1242 kind: TokenKind::Semicolon,
1243 text: &self.input[start_offset..self.current_offset],
1244 byte_offset: start_offset as u32,
1245 byte_length: (self.current_offset - start_offset) as u32,
1246 }
1247 }
1248}