1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
/// Hand-written lexer that turns a source string into a token list plus the
/// errors and trivia gathered along the way (see [`Lexer::lex`]).
pub struct Lexer<'source> {
    /// The complete source text being lexed; token text borrows from it.
    input: &'source str,
    /// Byte view of `input`, used for cheap single-byte lookups.
    input_bytes: &'source [u8],
    /// Byte offset into `input` of the next unread character.
    current_offset: usize,
    /// File identifier exactly as passed to [`Lexer::new`].
    file_id: u32,
    /// Diagnostics handed back to the caller in `LexResult::errors`.
    errors: Vec<ParseError>,
    /// Format-string tokens queued for `lex`, stored in reverse so that
    /// `pop` yields them in source order.
    pending_tokens: Vec<Token<'source>>,
    /// Comment, doc-comment and blank-line positions recorded while lexing.
    trivia: Trivia,
    /// Byte offset of the most recently recorded `\n`, if any.
    last_newline_offset: Option<usize>,
}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
35 pub fn lex(mut self) -> LexResult<'source> {
36 let mut tokens = Vec::new();
37
38 loop {
39 if let Some(token) = self.pending_tokens.pop() {
40 tokens.push(token);
41 continue;
42 }
43
44 self.skip_whitespace();
45
46 if self.at_eof() {
47 tokens.push(self.eof_token());
48 break;
49 }
50
51 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
52 let mut fstring_tokens = self.lex_format_string_tokens();
53 fstring_tokens.reverse();
54 self.pending_tokens = fstring_tokens;
55 continue;
56 }
57
58 let token = self.create_token();
59 tokens.push(token);
60 }
61
62 let tokens = self.insert_semicolons(tokens);
63
64 LexResult {
65 tokens,
66 errors: self.errors,
67 trivia: self.trivia,
68 }
69 }
70
71 fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
72 let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);
73
74 for i in 0..tokens.len() {
75 let token = tokens[i];
76 result.push(token);
77
78 if !Self::triggers_asi(token.kind) {
79 continue;
80 }
81
82 if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
83 if Self::continues_expression(next_token.kind) {
84 continue;
85 }
86
87 let token_end = (token.byte_offset + token.byte_length) as usize;
88 if self.has_newline_between(token_end, next_token.byte_offset as usize) {
89 result.push(self.make_synthetic_semicolon(token_end));
90 }
91 }
92 }
93
94 result
95 }
96
97 fn triggers_asi(kind: TokenKind) -> bool {
98 matches!(
99 kind,
100 TokenKind::Identifier
101 | TokenKind::Integer
102 | TokenKind::Imaginary
103 | TokenKind::Float
104 | TokenKind::String
105 | TokenKind::Char
106 | TokenKind::Boolean
107 | TokenKind::RightParen
108 | TokenKind::RightSquareBracket
109 | TokenKind::RightCurlyBrace
110 | TokenKind::Break
111 | TokenKind::Continue
112 | TokenKind::Return
113 | TokenKind::DotDot
114 | TokenKind::DotDotEqual
115 | TokenKind::QuestionMark
116 )
117 }
118
119 fn continues_expression(kind: TokenKind) -> bool {
120 matches!(
121 kind,
122 TokenKind::Plus
123 | TokenKind::Star
124 | TokenKind::Slash
125 | TokenKind::Percent
126 | TokenKind::Pipeline
127 | TokenKind::AmpersandDouble
128 | TokenKind::PipeDouble
129 | TokenKind::EqualDouble
130 | TokenKind::NotEqual
131 | TokenKind::LeftAngleBracket
132 | TokenKind::RightAngleBracket
133 | TokenKind::LessThanOrEqual
134 | TokenKind::GreaterThanOrEqual
135 | TokenKind::Dot
136 | TokenKind::Equal
137 | TokenKind::PlusEqual
138 | TokenKind::MinusEqual
139 | TokenKind::StarEqual
140 | TokenKind::SlashEqual
141 | TokenKind::Else
142 | TokenKind::LeftCurlyBrace
143 | TokenKind::RightCurlyBrace
144 | TokenKind::RightParen
145 | TokenKind::RightSquareBracket
146 | TokenKind::As
147 )
148 }
149
150 fn find_next_non_comment_token<'a>(
151 &self,
152 tokens: &'a [Token<'source>],
153 start_index: usize,
154 ) -> Option<&'a Token<'source>> {
155 tokens
156 .iter()
157 .skip(start_index)
158 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
159 }
160
161 fn has_newline_between(&self, start: usize, end: usize) -> bool {
162 self.input[start..end].contains('\n')
163 }
164
165 fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
166 Token {
167 kind: TokenKind::Semicolon,
168 text: "",
169 byte_offset: position as u32,
170 byte_length: 0,
171 }
172 }
173
174 fn create_token(&mut self) -> Token<'source> {
175 if let Some(token) = self.lex_lookahead_symbol() {
176 return token;
177 }
178
179 let c = self.current_char();
180 match c {
181 '0'..='9' => self.lex_number(),
182 _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
183 '"' => self.lex_string_literal(),
184 '`' => self.lex_backtick_literal(),
185 '\'' => self.lex_char(),
186 '/' => self.lex_slash(),
187 ';' => self.semicolon_token(),
188 '@' => self.lex_directive(),
189 _ => self.handle_unexpected_char(),
190 }
191 }
192
193 #[inline]
194 fn current_byte(&self) -> u8 {
195 if self.current_offset < self.input_bytes.len() {
196 self.input_bytes[self.current_offset]
197 } else {
198 0
199 }
200 }
201
202 #[inline]
203 fn current_char(&self) -> char {
204 self.input[self.current_offset..]
205 .chars()
206 .next()
207 .unwrap_or('\0')
208 }
209
210 #[inline]
211 fn peek_byte(&self) -> u8 {
212 if self.current_offset + 1 < self.input_bytes.len() {
213 self.input_bytes[self.current_offset + 1]
214 } else {
215 0
216 }
217 }
218
219 #[inline]
220 fn peek_char(&self) -> char {
221 let next_offset = if self.current_byte() < 128 {
222 self.current_offset + 1
223 } else {
224 self.current_offset + self.current_char().len_utf8()
225 };
226 self.input[next_offset..].chars().next().unwrap_or('\0')
227 }
228
229 fn peek_char_n(&self, n: usize) -> char {
230 let mut offset = self.current_offset;
231 for _ in 0..n {
232 if offset >= self.input.len() {
233 return '\0';
234 }
235 let c = self.input[offset..].chars().next().unwrap_or('\0');
236 offset += c.len_utf8();
237 }
238 self.input[offset..].chars().next().unwrap_or('\0')
239 }
240
241 fn next(&mut self) {
242 if self.at_eof() {
243 return;
244 }
245 if self.current_byte() < 128 {
246 self.current_offset += 1;
247 } else {
248 self.current_offset += self.current_char().len_utf8();
249 }
250 }
251
252 fn skip(&mut self, count: usize) {
253 for _ in 0..count {
254 self.next();
255 }
256 }
257
258 fn skip_whitespace(&mut self) {
259 while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
260 if self.current_byte() == b'\n' {
261 self.record_newline();
262 }
263 self.next();
264 }
265 }
266
267 fn skip_horizontal_whitespace(&mut self) {
268 while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
269 self.next();
270 }
271 }
272
273 fn record_newline(&mut self) {
274 let offset = self.current_offset;
275
276 if let Some(last) = self.last_newline_offset {
277 let between = &self.input[last + 1..offset];
278 let is_blank = between.is_empty()
279 || between
280 .chars()
281 .all(|c| c.is_ascii_whitespace() && c != '\n');
282 if is_blank {
283 self.trivia.blank_lines.push(offset as u32);
284 }
285 }
286
287 self.last_newline_offset = Some(offset);
288 }
289
290 fn at_eof(&self) -> bool {
291 self.current_offset >= self.input.len()
292 }
293
294 fn previous_char(&self) -> char {
295 if self.current_offset == 0 {
296 return '\0';
297 }
298 self.input[..self.current_offset]
299 .chars()
300 .next_back()
301 .unwrap_or('\0')
302 }
303
304 fn resync_on_error(&mut self) {
305 while !self.at_eof() {
306 let byte = self.current_byte();
307
308 if byte == b';' || byte == b'}' {
309 break;
310 }
311
312 self.next();
313 }
314 }
315
316 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
318 let start_offset = self.current_offset;
319 let current_char = self.current_char();
320 let next_char = self.peek_char();
321 let third_char = self.peek_char_n(2);
322
323 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
324 self.skip(3);
325 let end_offset = self.current_offset;
326 return Some(Token {
327 kind,
328 text: &self.input[start_offset..end_offset],
329 byte_offset: start_offset as u32,
330 byte_length: (end_offset - start_offset) as u32,
331 });
332 }
333
334 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
335 self.skip(2);
336 let end_offset = self.current_offset;
337 return Some(Token {
338 kind,
339 text: &self.input[start_offset..end_offset],
340 byte_offset: start_offset as u32,
341 byte_length: (end_offset - start_offset) as u32,
342 });
343 }
344
345 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
346 self.next();
347 let end_offset = self.current_offset;
348 return Some(Token {
349 kind,
350 text: &self.input[start_offset..end_offset],
351 byte_offset: start_offset as u32,
352 byte_length: (end_offset - start_offset) as u32,
353 });
354 }
355
356 None
357 }
358
359 fn lex_number(&mut self) -> Token<'source> {
360 let start_offset = self.current_offset;
361
362 if self.current_byte() == b'0' {
363 let next = self.peek_byte();
364 match next {
365 b'x' | b'X' => {
366 self.next(); self.next(); return self.lex_hex_number(start_offset);
369 }
370 b'o' | b'O' => {
371 self.next(); self.next(); return self.lex_octal_number(start_offset);
374 }
375 b'b' | b'B' => {
376 self.next(); self.next(); return self.lex_binary_number(start_offset);
379 }
380 b'0'..=b'7' => {
381 return self.lex_legacy_octal_number(start_offset);
382 }
383 _ => {} }
385 }
386
387 let mut kind = TokenKind::Integer;
388
389 while !self.at_eof() {
390 let byte = self.current_byte();
391 if byte.is_ascii_digit() || byte == b'_' {
392 if byte == b'_' && self.previous_char() == '_' {
393 let underscore_start = self.current_offset - 1;
394 self.error_consecutive_underscores(underscore_start);
395 }
396 self.next();
397 } else {
398 break;
399 }
400 }
401
402 if self.previous_char() == '_' {
403 self.error_number_trailing_underscore(
404 self.current_offset - self.previous_char().len_utf8(),
405 );
406 }
407
408 let preceded_by_dot = start_offset > 0
411 && self.input_bytes[start_offset - 1] == b'.'
412 && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');
413
414 if !preceded_by_dot
415 && self.current_byte() == b'.'
416 && self.peek_byte() != b'.'
417 && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
418 {
419 kind = TokenKind::Float;
420 self.next();
421
422 if self.current_byte() == b'_' {
423 self.error_decimal_leading_underscore(self.current_offset);
424 }
425
426 while !self.at_eof() {
427 let byte = self.current_byte();
428 if byte.is_ascii_digit() || byte == b'_' {
429 if byte == b'_' && self.previous_char() == '_' {
430 let underscore_start = self.current_offset - 1;
431 self.error_consecutive_underscores(underscore_start);
432 }
433 self.next();
434 } else {
435 break;
436 }
437 }
438
439 if self.previous_char() == '_' {
440 self.error_number_trailing_underscore(
441 self.current_offset - self.previous_char().len_utf8(),
442 );
443 }
444 }
445
446 if self.current_byte() == b'e' || self.current_byte() == b'E' {
447 kind = TokenKind::Float;
448 let exponent_start = self.current_offset;
449 self.next(); if self.current_byte() == b'+' || self.current_byte() == b'-' {
452 self.next();
453 }
454
455 if !self.current_byte().is_ascii_digit() {
456 self.error_missing_exponent_digits(
457 exponent_start,
458 self.current_offset - exponent_start,
459 );
460 }
461
462 while !self.at_eof() {
463 let byte = self.current_byte();
464 if byte.is_ascii_digit() || byte == b'_' {
465 if byte == b'_' && self.previous_char() == '_' {
466 let underscore_start = self.current_offset - 1;
467 self.error_consecutive_underscores(underscore_start);
468 }
469 self.next();
470 } else {
471 break;
472 }
473 }
474
475 if self.previous_char() == '_' {
476 self.error_number_trailing_underscore(
477 self.current_offset - self.previous_char().len_utf8(),
478 );
479 }
480 }
481
482 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
483 self.next(); let end_offset = self.current_offset;
485 return Token {
486 kind: TokenKind::Imaginary,
487 text: &self.input[start_offset..end_offset],
488 byte_offset: start_offset as u32,
489 byte_length: (end_offset - start_offset) as u32,
490 };
491 }
492
493 let end_offset = self.current_offset;
494 Token {
495 kind,
496 text: &self.input[start_offset..end_offset],
497 byte_offset: start_offset as u32,
498 byte_length: (end_offset - start_offset) as u32,
499 }
500 }
501
502 fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
503 let digits_start = self.current_offset;
504
505 while !self.at_eof() {
506 let byte = self.current_byte();
507 if byte.is_ascii_hexdigit() || byte == b'_' {
508 if byte == b'_' && self.previous_char() == '_' {
509 let underscore_start = self.current_offset - 1;
510 self.error_consecutive_underscores(underscore_start);
511 }
512 self.next();
513 } else {
514 break;
515 }
516 }
517
518 if self.current_offset == digits_start {
519 self.error_missing_hex_digits(start_offset, 2);
520 }
521
522 if self.previous_char() == '_' {
523 self.error_number_trailing_underscore(
524 self.current_offset - self.previous_char().len_utf8(),
525 );
526 }
527
528 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
529 self.next(); let end_offset = self.current_offset;
531 self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
532 return Token {
533 kind: TokenKind::Imaginary,
534 text: &self.input[start_offset..end_offset],
535 byte_offset: start_offset as u32,
536 byte_length: (end_offset - start_offset) as u32,
537 };
538 }
539
540 let end_offset = self.current_offset;
541 Token {
542 kind: TokenKind::Integer,
543 text: &self.input[start_offset..end_offset],
544 byte_offset: start_offset as u32,
545 byte_length: (end_offset - start_offset) as u32,
546 }
547 }
548
549 fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
550 let digits_start = self.current_offset;
551
552 while !self.at_eof() {
553 let byte = self.current_byte();
554 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
555 if byte == b'_' && self.previous_char() == '_' {
556 let underscore_start = self.current_offset - 1;
557 self.error_consecutive_underscores(underscore_start);
558 }
559 self.next();
560 } else if byte == b'8' || byte == b'9' {
561 self.error_invalid_octal_digit(self.current_offset);
562 self.next();
563 } else {
564 break;
565 }
566 }
567
568 if self.current_offset == digits_start {
569 self.error_missing_octal_digits(start_offset, 2);
570 }
571
572 if self.previous_char() == '_' {
573 self.error_number_trailing_underscore(
574 self.current_offset - self.previous_char().len_utf8(),
575 );
576 }
577
578 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
579 self.next(); let end_offset = self.current_offset;
581 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
582 return Token {
583 kind: TokenKind::Imaginary,
584 text: &self.input[start_offset..end_offset],
585 byte_offset: start_offset as u32,
586 byte_length: (end_offset - start_offset) as u32,
587 };
588 }
589
590 let end_offset = self.current_offset;
591 Token {
592 kind: TokenKind::Integer,
593 text: &self.input[start_offset..end_offset],
594 byte_offset: start_offset as u32,
595 byte_length: (end_offset - start_offset) as u32,
596 }
597 }
598
599 fn lex_legacy_octal_number(&mut self, start_offset: usize) -> Token<'source> {
600 self.next();
601
602 while !self.at_eof() {
603 let byte = self.current_byte();
604 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
605 if byte == b'_' && self.previous_char() == '_' {
606 let underscore_start = self.current_offset - 1;
607 self.error_consecutive_underscores(underscore_start);
608 }
609 self.next();
610 } else if byte == b'8' || byte == b'9' {
611 self.error_invalid_octal_digit(self.current_offset);
612 self.next();
613 } else {
614 break;
615 }
616 }
617
618 if self.previous_char() == '_' {
619 self.error_number_trailing_underscore(
620 self.current_offset - self.previous_char().len_utf8(),
621 );
622 }
623
624 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
625 self.next();
626 let end_offset = self.current_offset;
627 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
628 return Token {
629 kind: TokenKind::Imaginary,
630 text: &self.input[start_offset..end_offset],
631 byte_offset: start_offset as u32,
632 byte_length: (end_offset - start_offset) as u32,
633 };
634 }
635
636 let end_offset = self.current_offset;
637 Token {
638 kind: TokenKind::Integer,
639 text: &self.input[start_offset..end_offset],
640 byte_offset: start_offset as u32,
641 byte_length: (end_offset - start_offset) as u32,
642 }
643 }
644
645 fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
646 let digits_start = self.current_offset;
647
648 while !self.at_eof() {
649 let byte = self.current_byte();
650 if byte == b'0' || byte == b'1' || byte == b'_' {
651 if byte == b'_' && self.previous_char() == '_' {
652 let underscore_start = self.current_offset - 1;
653 self.error_consecutive_underscores(underscore_start);
654 }
655 self.next();
656 } else if (b'2'..=b'9').contains(&byte) {
657 self.error_invalid_binary_digit(self.current_offset);
658 self.next();
659 } else {
660 break;
661 }
662 }
663
664 if self.current_offset == digits_start {
665 self.error_missing_binary_digits(start_offset, 2);
666 }
667
668 if self.previous_char() == '_' {
669 self.error_number_trailing_underscore(
670 self.current_offset - self.previous_char().len_utf8(),
671 );
672 }
673
674 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
675 self.next();
676 let end_offset = self.current_offset;
677 self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
678 return Token {
679 kind: TokenKind::Imaginary,
680 text: &self.input[start_offset..end_offset],
681 byte_offset: start_offset as u32,
682 byte_length: (end_offset - start_offset) as u32,
683 };
684 }
685
686 let end_offset = self.current_offset;
687 Token {
688 kind: TokenKind::Integer,
689 text: &self.input[start_offset..end_offset],
690 byte_offset: start_offset as u32,
691 byte_length: (end_offset - start_offset) as u32,
692 }
693 }
694
695 fn lex_identifier(&mut self) -> Token<'source> {
696 let start_offset = self.current_offset;
697
698 while !self.at_eof() {
699 let c = self.current_char();
700 if c.is_alphanumeric() || c == '_' {
701 self.next();
702 } else {
703 break;
704 }
705 }
706
707 let end_offset = self.current_offset;
708 let text = &self.input[start_offset..end_offset];
709
710 let kind = match text {
711 "true" | "false" => TokenKind::Boolean,
712 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
713 };
714
715 Token {
716 kind,
717 text,
718 byte_offset: start_offset as u32,
719 byte_length: (end_offset - start_offset) as u32,
720 }
721 }
722
723 fn lex_backtick_literal(&mut self) -> Token<'source> {
724 let start_offset = self.current_offset;
725
726 self.next();
727
728 let mut terminated = false;
729
730 while !self.at_eof() {
731 let byte = self.current_byte();
732 if byte == b'`' {
733 terminated = true;
734 self.next();
735 break;
736 } else if byte == b'\n' {
737 break;
738 }
739 self.next();
740 }
741
742 let end_offset = self.current_offset;
743 let length = end_offset - start_offset;
744
745 if !terminated {
746 self.error_unterminated_backtick(start_offset, length);
747 }
748
749 Token {
750 kind: TokenKind::Backtick,
751 text: &self.input[start_offset..end_offset],
752 byte_offset: start_offset as u32,
753 byte_length: length as u32,
754 }
755 }
756
757 fn consume_unicode_escape(&mut self, escape_start: usize) {
758 if self.at_eof() || self.current_byte() != b'{' {
759 self.error_invalid_unicode_escape(escape_start, self.current_offset - escape_start);
760 return;
761 }
762 self.next();
763
764 let hex_start = self.current_offset;
765 let mut all_hex = true;
766 while !self.at_eof() {
767 let byte = self.current_byte();
768 if byte == b'}' || byte == b'"' || byte == b'\n' {
769 break;
770 }
771 if !byte.is_ascii_hexdigit() {
772 all_hex = false;
773 }
774 self.next();
775 }
776 let hex_end = self.current_offset;
777
778 let closed = !self.at_eof() && self.current_byte() == b'}';
779 if closed {
780 self.next();
781 }
782
783 let hex_len = hex_end - hex_start;
784 let total_len = self.current_offset - escape_start;
785
786 if !closed || !all_hex || hex_len == 0 || hex_len > 6 {
787 self.error_invalid_unicode_escape(escape_start, total_len);
788 return;
789 }
790
791 let codepoint = u32::from_str_radix(&self.input[hex_start..hex_end], 16)
792 .expect("hex digits validated above");
793 if char::from_u32(codepoint).is_none() {
794 self.error_unicode_escape_out_of_range(escape_start, total_len);
795 }
796 }
797
798 fn consume_octal_escape(&mut self, first_digit: u8) -> u16 {
800 let mut value: u16 = (first_digit - b'0') as u16;
801 for _ in 0..2 {
802 if self.at_eof() {
803 break;
804 }
805 match self.current_byte() {
806 d @ b'0'..=b'7' => {
807 value = value * 8 + (d - b'0') as u16;
808 self.next();
809 }
810 _ => break,
811 }
812 }
813 value
814 }
815
816 fn lex_string_literal(&mut self) -> Token<'source> {
817 let start_offset = self.current_offset;
818
819 self.next();
820
821 let mut escaped = false;
822 let mut terminated = false;
823
824 while !self.at_eof() && !terminated {
825 let byte = self.current_byte();
826 if escaped {
827 match byte {
828 b'0'..=b'7' => {
829 let escape_start = self.current_offset - 1;
830 self.next();
831 let value = self.consume_octal_escape(byte);
832 if value > 255 {
833 let escape_len = self.current_offset - escape_start;
834 self.error_octal_escape_out_of_range(escape_start, escape_len);
835 }
836 escaped = false;
837 continue;
838 }
839 b'u' => {
840 let escape_start = self.current_offset - 1;
841 self.next();
842 self.consume_unicode_escape(escape_start);
843 escaped = false;
844 continue;
845 }
846 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'"' | b'x' | b'U' => {
847 }
848 b'\'' => {}
849 _ => {
850 self.error_invalid_escape(self.current_char());
851 }
852 }
853 escaped = false;
854 } else if byte == b'\\' {
855 escaped = true;
856 } else if byte == b'"' {
857 terminated = true;
858 self.next();
859 break;
860 } else if byte == b'\n' {
861 break; }
863
864 self.next();
865 }
866
867 let end_offset = self.current_offset;
868 let length = end_offset - start_offset;
869
870 if escaped {
871 self.error_unterminated_escape(start_offset);
872 }
873
874 if !terminated {
875 self.error_unterminated_string(start_offset, length);
876 }
877
878 Token {
879 kind: TokenKind::String,
880 text: &self.input[start_offset..end_offset],
881 byte_offset: start_offset as u32,
882 byte_length: length as u32,
883 }
884 }
885
886 fn push_format_string_text_if_needed(
887 &self,
888 tokens: &mut Vec<Token<'source>>,
889 text_segment_start: usize,
890 ) {
891 if text_segment_start < self.current_offset {
892 tokens.push(Token {
893 kind: TokenKind::FormatStringText,
894 text: &self.input[text_segment_start..self.current_offset],
895 byte_offset: text_segment_start as u32,
896 byte_length: (self.current_offset - text_segment_start) as u32,
897 });
898 }
899 }
900
901 fn lex_format_string_interpolation(
902 &mut self,
903 tokens: &mut Vec<Token<'source>>,
904 ) -> Result<(), ()> {
905 let interp_start = self.current_offset;
906 self.next();
907
908 tokens.push(Token {
909 kind: TokenKind::FormatStringInterpolationStart,
910 text: &self.input[interp_start..self.current_offset],
911 byte_offset: interp_start as u32,
912 byte_length: (self.current_offset - interp_start) as u32,
913 });
914
915 let Some(interpolation_end) = self.find_interpolation_boundary() else {
916 if self.has_newline_between(interp_start, self.input.len()) {
917 self.error_multiline_format_string_interpolation(interp_start);
918 } else {
919 self.error_unclosed_brace_in_format_string(interp_start);
920 }
921 self.skip_to_format_string_end();
922 return Err(());
923 };
924
925 if self.has_newline_between(interp_start, interpolation_end) {
926 self.error_multiline_format_string_interpolation(interp_start);
927 }
928
929 while self.current_offset < interpolation_end {
930 self.skip_horizontal_whitespace();
931 if self.current_offset >= interpolation_end {
932 break;
933 }
934
935 if self.current_byte() == b'f' && self.peek_byte() == b'"' {
936 let mut fstring_tokens = self.lex_format_string_tokens();
937 tokens.append(&mut fstring_tokens);
938 } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
939 self.error_escaped_quote_in_interpolation(self.current_offset);
940 self.skip(2);
941 } else {
942 let token = self.create_token();
943 tokens.push(token);
944 }
945 }
946
947 let close_offset = self.current_offset;
948 self.next();
949 tokens.push(Token {
950 kind: TokenKind::FormatStringInterpolationEnd,
951 text: &self.input[close_offset..self.current_offset],
952 byte_offset: close_offset as u32,
953 byte_length: (self.current_offset - close_offset) as u32,
954 });
955
956 Ok(())
957 }
958
959 fn scan_interpolation(&self, start: usize) -> Option<usize> {
960 let bytes = self.input.as_bytes();
961 let mut p = start;
962 let mut depth = 1;
963
964 while p < bytes.len() && depth > 0 {
965 match bytes[p] {
966 b'{' => {
967 depth += 1;
968 p += 1;
969 }
970 b'}' => {
971 depth -= 1;
972 if depth > 0 {
973 p += 1;
974 }
975 }
976 b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
977 b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
978 p = self.scan_past_fstring(p)?;
979 }
980 b'\\' => p += 2,
981 b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
982 b'\n' => return None,
983 _ => p += 1,
984 }
985 }
986
987 (depth == 0).then_some(p)
988 }
989
990 fn find_interpolation_boundary(&self) -> Option<usize> {
991 self.scan_interpolation(self.current_offset)
992 }
993
994 fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
995 let bytes = self.input.as_bytes();
996 let mut p = start + 1;
997 while p < bytes.len() {
998 match bytes[p] {
999 b'\\' if delimiter != b'`' => p += 2,
1000 b'\n' => return None,
1001 b if b == delimiter => return Some(p + 1),
1002 _ => p += 1,
1003 }
1004 }
1005 None
1006 }
1007
1008 fn scan_past_fstring(&self, position: usize) -> Option<usize> {
1009 let bytes = self.input.as_bytes();
1010 let mut p = position + 2; while p < bytes.len() {
1012 match bytes[p] {
1013 b'\\' => p += 2,
1014 b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
1015 b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
1016 b'{' => {
1017 p = self.scan_interpolation(p + 1)?;
1018 p += 1;
1019 }
1020 b'"' => return Some(p + 1),
1021 b'\n' => return None,
1022 _ => p += 1,
1023 }
1024 }
1025 None
1026 }
1027
1028 fn skip_to_format_string_end(&mut self) {
1029 while !self.at_eof() {
1030 match self.current_byte() {
1031 b'"' => {
1032 self.next();
1033 return;
1034 }
1035 b'\n' => return,
1036 _ => self.next(),
1037 }
1038 }
1039 }
1040
1041 fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
1042 let start_offset = self.current_offset;
1043 let mut tokens = Vec::new();
1044
1045 self.skip(2);
1046
1047 let fstring_start_end = self.current_offset;
1048 tokens.push(Token {
1049 kind: TokenKind::FormatStringStart,
1050 text: &self.input[start_offset..fstring_start_end],
1051 byte_offset: start_offset as u32,
1052 byte_length: (fstring_start_end - start_offset) as u32,
1053 });
1054
1055 let mut text_segment_start = self.current_offset;
1056
1057 while !self.at_eof() {
1058 let byte = self.current_byte();
1059
1060 match byte {
1061 b'\\' if !self.at_eof() => {
1062 let escape_start = self.current_offset;
1063 self.next();
1064 if !self.at_eof() {
1065 let b = self.current_byte();
1066 self.next();
1067 if matches!(b, b'0'..=b'7') {
1068 let value = self.consume_octal_escape(b);
1069 if value > 255 {
1070 let escape_len = self.current_offset - escape_start;
1071 self.error_octal_escape_out_of_range(escape_start, escape_len);
1072 }
1073 } else if b == b'u' {
1074 self.consume_unicode_escape(escape_start);
1075 }
1076 }
1077 }
1078 b'{' if self.peek_byte() == b'{' => {
1079 self.skip(2);
1080 }
1081 b'}' if self.peek_byte() == b'}' => {
1082 self.skip(2);
1083 }
1084 b'"' => {
1085 self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
1086
1087 let end_offset = self.current_offset;
1088 self.next();
1089
1090 tokens.push(Token {
1091 kind: TokenKind::FormatStringEnd,
1092 text: &self.input[end_offset..self.current_offset],
1093 byte_offset: end_offset as u32,
1094 byte_length: (self.current_offset - end_offset) as u32,
1095 });
1096 return tokens;
1097 }
1098
1099 b'\n' => {
1100 let length = self.current_offset.saturating_sub(start_offset);
1101 self.error_unterminated_format_string(start_offset, length);
1102 return tokens;
1103 }
1104
1105 b'{' => {
1106 self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
1107
1108 if self.lex_format_string_interpolation(&mut tokens).is_err() {
1109 return tokens;
1110 }
1111 text_segment_start = self.current_offset;
1112 }
1113 b'}' => {
1114 self.error_unmatched_brace_in_format_string(self.current_offset);
1115 self.next();
1116 }
1117 _ => {
1118 self.next();
1119 }
1120 }
1121 }
1122
1123 let length = self.current_offset.saturating_sub(start_offset);
1124 self.error_unterminated_format_string(start_offset, length);
1125 tokens
1126 }
1127
1128 fn lex_char(&mut self) -> Token<'source> {
1129 let start_offset = self.current_offset;
1130
1131 self.next();
1132
1133 if self.at_eof() || self.current_byte() == b'\'' {
1134 self.error_empty_rune_literal(start_offset);
1135 let end_offset = self.current_offset;
1136 return Token {
1137 kind: TokenKind::Char,
1138 text: &self.input[start_offset..end_offset],
1139 byte_offset: start_offset as u32,
1140 byte_length: (end_offset - start_offset) as u32,
1141 };
1142 }
1143
1144 if self.current_byte() != b'\\' {
1145 self.next();
1146 } else {
1147 self.next();
1148
1149 if self.at_eof() {
1150 self.error_unterminated_escape(start_offset);
1151 let end_offset = self.current_offset;
1152 return Token {
1153 kind: TokenKind::Char,
1154 text: &self.input[start_offset..end_offset],
1155 byte_offset: start_offset as u32,
1156 byte_length: (end_offset - start_offset) as u32,
1157 };
1158 }
1159
1160 match self.current_byte() {
1161 b'0'..=b'7' => {
1162 let escape_start = self.current_offset - 1;
1163 let first = self.current_byte();
1164 self.next();
1165 let value = self.consume_octal_escape(first);
1166 if value > 255 {
1167 let escape_len = self.current_offset - escape_start;
1168 self.error_octal_escape_out_of_range(escape_start, escape_len);
1169 }
1170 }
1171 b'a' | b'b' | b'f' | b'n' | b'r' | b't' | b'v' | b'\\' | b'\'' | b'x' => {
1172 self.next();
1173 }
1174 _ => {
1175 self.error_invalid_escape(self.current_char());
1176
1177 while !self.at_eof() && self.current_byte() != b'\'' {
1178 self.next();
1179 }
1180
1181 if !self.at_eof() && self.current_byte() == b'\'' {
1182 self.next();
1183 }
1184
1185 let end_offset = self.current_offset;
1186 return Token {
1187 kind: TokenKind::Char,
1188 text: &self.input[start_offset..end_offset],
1189 byte_offset: start_offset as u32,
1190 byte_length: (end_offset - start_offset) as u32,
1191 };
1192 }
1193 }
1194 }
1195
1196 if self.at_eof() || self.current_byte() != b'\'' {
1197 let length = self.current_offset - start_offset;
1198 self.error_unterminated_rune(start_offset, length);
1199 }
1200
1201 if !self.at_eof() && self.current_byte() == b'\'' {
1202 self.next();
1203 }
1204
1205 let end_offset = self.current_offset;
1206 Token {
1207 kind: TokenKind::Char,
1208 text: &self.input[start_offset..end_offset],
1209 byte_offset: start_offset as u32,
1210 byte_length: (end_offset - start_offset) as u32,
1211 }
1212 }
1213
1214 fn lex_slash(&mut self) -> Token<'source> {
1215 let start_offset = self.current_offset;
1216
1217 if self.peek_byte() != b'/' {
1218 self.next();
1219 return Token {
1220 kind: TokenKind::Slash,
1221 text: &self.input[start_offset..self.current_offset],
1222 byte_offset: start_offset as u32,
1223 byte_length: 1,
1224 };
1225 }
1226
1227 let slash_count = self.count_consecutive(b'/');
1228
1229 if slash_count >= 4 {
1230 self.error_excess_slashes_in_comment(start_offset, slash_count);
1231 }
1232
1233 self.skip(slash_count);
1234
1235 if slash_count == 3 {
1236 if self.current_byte() == b' ' {
1237 self.next();
1238 }
1239 let text_start = self.current_offset;
1240 self.skip_to_eol();
1241 let end_offset = self.current_offset;
1242
1243 self.trivia
1244 .doc_comments
1245 .push((start_offset as u32, end_offset as u32));
1246
1247 return Token {
1248 kind: TokenKind::DocComment,
1249 text: &self.input[text_start..end_offset],
1250 byte_offset: start_offset as u32,
1251 byte_length: (end_offset - start_offset) as u32,
1252 };
1253 }
1254
1255 self.skip_to_eol();
1256 let end_offset = self.current_offset;
1257
1258 self.trivia
1259 .comments
1260 .push((start_offset as u32, end_offset as u32));
1261
1262 Token {
1263 kind: TokenKind::Comment,
1264 text: &self.input[start_offset..end_offset],
1265 byte_offset: start_offset as u32,
1266 byte_length: (end_offset - start_offset) as u32,
1267 }
1268 }
1269
1270 fn count_consecutive(&self, byte: u8) -> usize {
1271 let mut count = 0;
1272 let mut offset = self.current_offset;
1273 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1274 count += 1;
1275 offset += 1;
1276 }
1277 count
1278 }
1279
1280 fn skip_to_eol(&mut self) {
1281 while !self.at_eof() && self.current_byte() != b'\n' {
1282 self.next();
1283 }
1284 }
1285
1286 fn lex_directive(&mut self) -> Token<'source> {
1287 let start_offset = self.current_offset;
1288
1289 self.next();
1290
1291 while !self.at_eof() {
1292 let byte = self.current_byte();
1293 if byte.is_ascii_alphanumeric() || byte == b'_' {
1294 self.next();
1295 } else {
1296 break;
1297 }
1298 }
1299
1300 let end_offset = self.current_offset;
1301 Token {
1302 kind: TokenKind::Directive,
1303 text: &self.input[start_offset..end_offset],
1304 byte_offset: start_offset as u32,
1305 byte_length: (end_offset - start_offset) as u32,
1306 }
1307 }
1308
1309 fn handle_unexpected_char(&mut self) -> Token<'source> {
1310 let start_offset = self.current_offset;
1311
1312 self.error_unexpected_char(self.current_offset, self.current_char());
1313
1314 self.resync_on_error();
1315
1316 let end_offset = self.current_offset;
1317
1318 Token {
1319 kind: TokenKind::Error,
1320 text: &self.input[start_offset..end_offset],
1321 byte_offset: start_offset as u32,
1322 byte_length: (end_offset - start_offset) as u32,
1323 }
1324 }
1325
1326 fn eof_token(&self) -> Token<'source> {
1327 Token {
1328 kind: TokenKind::EOF,
1329 text: &self.input[self.current_offset..self.current_offset],
1330 byte_offset: self.current_offset as u32,
1331 byte_length: 0,
1332 }
1333 }
1334
1335 fn semicolon_token(&mut self) -> Token<'source> {
1336 let start_offset = self.current_offset;
1337
1338 self.next();
1339
1340 Token {
1341 kind: TokenKind::Semicolon,
1342 text: &self.input[start_offset..self.current_offset],
1343 byte_offset: start_offset as u32,
1344 byte_length: (self.current_offset - start_offset) as u32,
1345 }
1346 }
1347}