1pub use token::{Token, TokenKind};
2pub use types::{LexResult, Trivia};
3
4use crate::parse::ParseError;
5
6mod errors;
7mod token;
8mod types;
9
/// Hand-written lexer over a single source file.
///
/// Tokens borrow their text from `input`; diagnostics accumulate in `errors`
/// instead of aborting, so lexing always runs to completion.
pub struct Lexer<'source> {
    // Full source text; every token `text` slice borrows from it.
    input: &'source str,
    // Byte view of `input`, cached to avoid repeated `as_bytes()` calls.
    input_bytes: &'source [u8],
    // Current byte position in `input`; always kept on a char boundary.
    current_offset: usize,
    // Identifier of the file being lexed; presumably threaded into
    // diagnostics by the `errors` module — not referenced in this chunk.
    file_id: u32,
    // Diagnostics collected while lexing; returned in `LexResult`.
    errors: Vec<ParseError>,
    // Tokens queued by format-string lexing (stored reversed so `pop`
    // yields source order); drained before any further scanning.
    pending_tokens: Vec<Token<'source>>,
    // Comment, doc-comment and blank-line spans kept out of the main
    // token stream.
    trivia: Trivia,
    // Byte offset of the most recent newline, used to detect blank lines.
    last_newline_offset: Option<usize>,
}
20
21impl<'source> Lexer<'source> {
22 pub fn new(input: &'source str, file_id: u32) -> Lexer<'source> {
23 Lexer {
24 input,
25 input_bytes: input.as_bytes(),
26 current_offset: 0,
27 file_id,
28 errors: vec![],
29 pending_tokens: vec![],
30 trivia: Trivia::default(),
31 last_newline_offset: None,
32 }
33 }
34
    /// Runs the lexer to completion, consuming `self`.
    ///
    /// Returns every token in source order (terminated by an EOF token),
    /// plus accumulated errors and trivia. Automatic semicolon insertion is
    /// applied as a post-pass over the raw token stream.
    pub fn lex(mut self) -> LexResult<'source> {
        let mut tokens = Vec::new();

        loop {
            // Drain tokens queued by format-string lexing first; they are
            // stored reversed, so `pop` yields them in source order.
            if let Some(token) = self.pending_tokens.pop() {
                tokens.push(token);
                continue;
            }

            self.skip_whitespace();

            if self.at_eof() {
                tokens.push(self.eof_token());
                break;
            }

            // `f"` opens a format string, which expands to several tokens;
            // queue them and drain via the branch above.
            if self.current_byte() == b'f' && self.peek_byte() == b'"' {
                let mut fstring_tokens = self.lex_format_string_tokens();
                fstring_tokens.reverse();
                self.pending_tokens = fstring_tokens;
                continue;
            }

            let token = self.create_token();
            tokens.push(token);
        }

        let tokens = self.insert_semicolons(tokens);

        LexResult {
            tokens,
            errors: self.errors,
            trivia: self.trivia,
        }
    }
70
71 fn insert_semicolons(&self, tokens: Vec<Token<'source>>) -> Vec<Token<'source>> {
72 let mut result = Vec::with_capacity(tokens.len() + tokens.len() / 4);
73
74 for i in 0..tokens.len() {
75 let token = tokens[i];
76 result.push(token);
77
78 if !Self::triggers_asi(token.kind) {
79 continue;
80 }
81
82 if let Some(next_token) = self.find_next_non_comment_token(&tokens, i + 1) {
83 if Self::continues_expression(next_token.kind) {
84 continue;
85 }
86
87 let token_end = (token.byte_offset + token.byte_length) as usize;
88 if self.has_newline_between(token_end, next_token.byte_offset as usize) {
89 result.push(self.make_synthetic_semicolon(token_end));
90 }
91 }
92 }
93
94 result
95 }
96
97 fn triggers_asi(kind: TokenKind) -> bool {
98 matches!(
99 kind,
100 TokenKind::Identifier
101 | TokenKind::Integer
102 | TokenKind::Imaginary
103 | TokenKind::Float
104 | TokenKind::String
105 | TokenKind::Char
106 | TokenKind::Boolean
107 | TokenKind::RightParen
108 | TokenKind::RightSquareBracket
109 | TokenKind::RightCurlyBrace
110 | TokenKind::Break
111 | TokenKind::Continue
112 | TokenKind::Return
113 | TokenKind::DotDot
114 | TokenKind::DotDotEqual
115 | TokenKind::QuestionMark
116 )
117 }
118
119 fn continues_expression(kind: TokenKind) -> bool {
120 matches!(
121 kind,
122 TokenKind::Plus
123 | TokenKind::Star
124 | TokenKind::Slash
125 | TokenKind::Percent
126 | TokenKind::Pipeline
127 | TokenKind::AmpersandDouble
128 | TokenKind::PipeDouble
129 | TokenKind::EqualDouble
130 | TokenKind::NotEqual
131 | TokenKind::LeftAngleBracket
132 | TokenKind::RightAngleBracket
133 | TokenKind::LessThanOrEqual
134 | TokenKind::GreaterThanOrEqual
135 | TokenKind::Dot
136 | TokenKind::Equal
137 | TokenKind::PlusEqual
138 | TokenKind::MinusEqual
139 | TokenKind::StarEqual
140 | TokenKind::SlashEqual
141 | TokenKind::Else
142 | TokenKind::LeftCurlyBrace
143 | TokenKind::RightCurlyBrace
144 | TokenKind::RightParen
145 | TokenKind::RightSquareBracket
146 | TokenKind::As
147 )
148 }
149
150 fn find_next_non_comment_token<'a>(
151 &self,
152 tokens: &'a [Token<'source>],
153 start_index: usize,
154 ) -> Option<&'a Token<'source>> {
155 tokens
156 .iter()
157 .skip(start_index)
158 .find(|&token| token.kind != TokenKind::Comment && token.kind != TokenKind::DocComment)
159 }
160
161 fn has_newline_between(&self, start: usize, end: usize) -> bool {
162 self.input[start..end].contains('\n')
163 }
164
    /// Builds a zero-width semicolon token at `position`, used by automatic
    /// semicolon insertion; `text` is empty because nothing in the source
    /// corresponds to it.
    fn make_synthetic_semicolon(&self, position: usize) -> Token<'source> {
        Token {
            kind: TokenKind::Semicolon,
            text: "",
            byte_offset: position as u32,
            byte_length: 0,
        }
    }
173
    /// Lexes a single token at the current position.
    ///
    /// Fixed symbols (longest match first) are tried before the per-class
    /// scanners; anything unrecognized becomes an error token after
    /// resynchronization.
    ///
    /// NOTE(review): `lex_slash` handles both division and comments, which
    /// implies the `TokenKind::from_*_char_symbol` tables do not claim `/` —
    /// confirm in token.rs.
    fn create_token(&mut self) -> Token<'source> {
        if let Some(token) = self.lex_lookahead_symbol() {
            return token;
        }

        let c = self.current_char();
        match c {
            '0'..='9' => self.lex_number(),
            // Identifiers may start with any alphabetic char or `_`.
            _ if c.is_alphabetic() || c == '_' => self.lex_identifier(),
            '"' => self.lex_string_literal(),
            '`' => self.lex_backtick_literal(),
            '\'' => self.lex_char(),
            '/' => self.lex_slash(),
            ';' => self.semicolon_token(),
            '@' => self.lex_directive(),
            _ => self.handle_unexpected_char(),
        }
    }
192
193 #[inline]
194 fn current_byte(&self) -> u8 {
195 if self.current_offset < self.input_bytes.len() {
196 self.input_bytes[self.current_offset]
197 } else {
198 0
199 }
200 }
201
    /// Character starting at the current offset, or `'\0'` at end of input.
    /// `current_offset` is always kept on a char boundary, so the slice
    /// cannot split a character.
    #[inline]
    fn current_char(&self) -> char {
        self.input[self.current_offset..]
            .chars()
            .next()
            .unwrap_or('\0')
    }
209
210 #[inline]
211 fn peek_byte(&self) -> u8 {
212 if self.current_offset + 1 < self.input_bytes.len() {
213 self.input_bytes[self.current_offset + 1]
214 } else {
215 0
216 }
217 }
218
219 #[inline]
220 fn peek_char(&self) -> char {
221 let next_offset = if self.current_byte() < 128 {
222 self.current_offset + 1
223 } else {
224 self.current_offset + self.current_char().len_utf8()
225 };
226 self.input[next_offset..].chars().next().unwrap_or('\0')
227 }
228
229 fn peek_char_n(&self, n: usize) -> char {
230 let mut offset = self.current_offset;
231 for _ in 0..n {
232 if offset >= self.input.len() {
233 return '\0';
234 }
235 let c = self.input[offset..].chars().next().unwrap_or('\0');
236 offset += c.len_utf8();
237 }
238 self.input[offset..].chars().next().unwrap_or('\0')
239 }
240
241 fn next(&mut self) {
242 if self.at_eof() {
243 return;
244 }
245 if self.current_byte() < 128 {
246 self.current_offset += 1;
247 } else {
248 self.current_offset += self.current_char().len_utf8();
249 }
250 }
251
252 fn skip(&mut self, count: usize) {
253 for _ in 0..count {
254 self.next();
255 }
256 }
257
258 fn skip_whitespace(&mut self) {
259 while !self.at_eof() && self.current_byte().is_ascii_whitespace() {
260 if self.current_byte() == b'\n' {
261 self.record_newline();
262 }
263 self.next();
264 }
265 }
266
267 fn skip_horizontal_whitespace(&mut self) {
268 while !self.at_eof() && matches!(self.current_byte(), b' ' | b'\t') {
269 self.next();
270 }
271 }
272
    /// Records the newline at the current offset for trivia purposes.
    ///
    /// If everything between the previous newline and this one is empty or
    /// horizontal whitespace, the line is blank and the offset of its
    /// terminating newline is pushed to `trivia.blank_lines`.
    ///
    /// Only called from `skip_whitespace`, so newlines reached inside
    /// string/comment error recovery are not recorded here.
    fn record_newline(&mut self) {
        let offset = self.current_offset;

        if let Some(last) = self.last_newline_offset {
            // `last` points at the previous `\n` (one byte), so `last + 1`
            // is the start of the line ending at `offset`.
            let between = &self.input[last + 1..offset];
            let is_blank = between.is_empty()
                || between
                    .chars()
                    .all(|c| c.is_ascii_whitespace() && c != '\n');
            if is_blank {
                self.trivia.blank_lines.push(offset as u32);
            }
        }

        self.last_newline_offset = Some(offset);
    }
289
290 fn at_eof(&self) -> bool {
291 self.current_offset >= self.input.len()
292 }
293
294 fn previous_char(&self) -> char {
295 if self.current_offset == 0 {
296 return '\0';
297 }
298 self.input[..self.current_offset]
299 .chars()
300 .next_back()
301 .unwrap_or('\0')
302 }
303
304 fn resync_on_error(&mut self) {
305 while !self.at_eof() {
306 let byte = self.current_byte();
307
308 if byte == b';' || byte == b'}' {
309 break;
310 }
311
312 self.next();
313 }
314 }
315
316 fn lex_lookahead_symbol(&mut self) -> Option<Token<'source>> {
318 let start_offset = self.current_offset;
319 let current_char = self.current_char();
320 let next_char = self.peek_char();
321 let third_char = self.peek_char_n(2);
322
323 if let Some(kind) = TokenKind::from_three_char_symbol(current_char, next_char, third_char) {
324 self.skip(3);
325 let end_offset = self.current_offset;
326 return Some(Token {
327 kind,
328 text: &self.input[start_offset..end_offset],
329 byte_offset: start_offset as u32,
330 byte_length: (end_offset - start_offset) as u32,
331 });
332 }
333
334 if let Some(kind) = TokenKind::from_two_char_symbol(current_char, next_char) {
335 self.skip(2);
336 let end_offset = self.current_offset;
337 return Some(Token {
338 kind,
339 text: &self.input[start_offset..end_offset],
340 byte_offset: start_offset as u32,
341 byte_length: (end_offset - start_offset) as u32,
342 });
343 }
344
345 if let Some(kind) = TokenKind::from_one_char_symbol(current_char) {
346 self.next();
347 let end_offset = self.current_offset;
348 return Some(Token {
349 kind,
350 text: &self.input[start_offset..end_offset],
351 byte_offset: start_offset as u32,
352 byte_length: (end_offset - start_offset) as u32,
353 });
354 }
355
356 None
357 }
358
359 fn lex_number(&mut self) -> Token<'source> {
360 let start_offset = self.current_offset;
361
362 if self.current_byte() == b'0' {
363 let next = self.peek_byte();
364 match next {
365 b'x' | b'X' => {
366 self.next(); self.next(); return self.lex_hex_number(start_offset);
369 }
370 b'o' | b'O' => {
371 self.next(); self.next(); return self.lex_octal_number(start_offset);
374 }
375 b'b' | b'B' => {
376 self.next(); self.next(); return self.lex_binary_number(start_offset);
379 }
380 b'0'..=b'7' => {
381 return self.lex_legacy_octal_number(start_offset);
382 }
383 _ => {} }
385 }
386
387 let mut kind = TokenKind::Integer;
388
389 while !self.at_eof() {
390 let byte = self.current_byte();
391 if byte.is_ascii_digit() || byte == b'_' {
392 if byte == b'_' && self.previous_char() == '_' {
393 let underscore_start = self.current_offset - 1;
394 self.error_consecutive_underscores(underscore_start);
395 }
396 self.next();
397 } else {
398 break;
399 }
400 }
401
402 if self.previous_char() == '_' {
403 self.error_number_trailing_underscore(
404 self.current_offset - self.previous_char().len_utf8(),
405 );
406 }
407
408 let preceded_by_dot = start_offset > 0
411 && self.input_bytes[start_offset - 1] == b'.'
412 && !(start_offset > 1 && self.input_bytes[start_offset - 2] == b'.');
413
414 if !preceded_by_dot
415 && self.current_byte() == b'.'
416 && self.peek_byte() != b'.'
417 && (self.peek_byte().is_ascii_digit() || self.peek_byte() == b'_')
418 {
419 kind = TokenKind::Float;
420 self.next();
421
422 if self.current_byte() == b'_' {
423 self.error_decimal_leading_underscore(self.current_offset);
424 }
425
426 while !self.at_eof() {
427 let byte = self.current_byte();
428 if byte.is_ascii_digit() || byte == b'_' {
429 if byte == b'_' && self.previous_char() == '_' {
430 let underscore_start = self.current_offset - 1;
431 self.error_consecutive_underscores(underscore_start);
432 }
433 self.next();
434 } else {
435 break;
436 }
437 }
438
439 if self.previous_char() == '_' {
440 self.error_number_trailing_underscore(
441 self.current_offset - self.previous_char().len_utf8(),
442 );
443 }
444 }
445
446 if self.current_byte() == b'e' || self.current_byte() == b'E' {
447 kind = TokenKind::Float;
448 let exponent_start = self.current_offset;
449 self.next(); if self.current_byte() == b'+' || self.current_byte() == b'-' {
452 self.next();
453 }
454
455 if !self.current_byte().is_ascii_digit() {
456 self.error_missing_exponent_digits(
457 exponent_start,
458 self.current_offset - exponent_start,
459 );
460 }
461
462 while !self.at_eof() {
463 let byte = self.current_byte();
464 if byte.is_ascii_digit() || byte == b'_' {
465 if byte == b'_' && self.previous_char() == '_' {
466 let underscore_start = self.current_offset - 1;
467 self.error_consecutive_underscores(underscore_start);
468 }
469 self.next();
470 } else {
471 break;
472 }
473 }
474
475 if self.previous_char() == '_' {
476 self.error_number_trailing_underscore(
477 self.current_offset - self.previous_char().len_utf8(),
478 );
479 }
480 }
481
482 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
483 self.next(); let end_offset = self.current_offset;
485 return Token {
486 kind: TokenKind::Imaginary,
487 text: &self.input[start_offset..end_offset],
488 byte_offset: start_offset as u32,
489 byte_length: (end_offset - start_offset) as u32,
490 };
491 }
492
493 let end_offset = self.current_offset;
494 Token {
495 kind,
496 text: &self.input[start_offset..end_offset],
497 byte_offset: start_offset as u32,
498 byte_length: (end_offset - start_offset) as u32,
499 }
500 }
501
502 fn lex_hex_number(&mut self, start_offset: usize) -> Token<'source> {
503 let digits_start = self.current_offset;
504
505 while !self.at_eof() {
506 let byte = self.current_byte();
507 if byte.is_ascii_hexdigit() || byte == b'_' {
508 if byte == b'_' && self.previous_char() == '_' {
509 let underscore_start = self.current_offset - 1;
510 self.error_consecutive_underscores(underscore_start);
511 }
512 self.next();
513 } else {
514 break;
515 }
516 }
517
518 if self.current_offset == digits_start {
519 self.error_missing_hex_digits(start_offset, 2);
520 }
521
522 if self.previous_char() == '_' {
523 self.error_number_trailing_underscore(
524 self.current_offset - self.previous_char().len_utf8(),
525 );
526 }
527
528 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
529 self.next(); let end_offset = self.current_offset;
531 self.error_non_decimal_imaginary("hex", start_offset, end_offset - start_offset);
532 return Token {
533 kind: TokenKind::Imaginary,
534 text: &self.input[start_offset..end_offset],
535 byte_offset: start_offset as u32,
536 byte_length: (end_offset - start_offset) as u32,
537 };
538 }
539
540 let end_offset = self.current_offset;
541 Token {
542 kind: TokenKind::Integer,
543 text: &self.input[start_offset..end_offset],
544 byte_offset: start_offset as u32,
545 byte_length: (end_offset - start_offset) as u32,
546 }
547 }
548
549 fn lex_octal_number(&mut self, start_offset: usize) -> Token<'source> {
550 let digits_start = self.current_offset;
551
552 while !self.at_eof() {
553 let byte = self.current_byte();
554 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
555 if byte == b'_' && self.previous_char() == '_' {
556 let underscore_start = self.current_offset - 1;
557 self.error_consecutive_underscores(underscore_start);
558 }
559 self.next();
560 } else if byte == b'8' || byte == b'9' {
561 self.error_invalid_octal_digit(self.current_offset);
562 self.next();
563 } else {
564 break;
565 }
566 }
567
568 if self.current_offset == digits_start {
569 self.error_missing_octal_digits(start_offset, 2);
570 }
571
572 if self.previous_char() == '_' {
573 self.error_number_trailing_underscore(
574 self.current_offset - self.previous_char().len_utf8(),
575 );
576 }
577
578 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
579 self.next(); let end_offset = self.current_offset;
581 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
582 return Token {
583 kind: TokenKind::Imaginary,
584 text: &self.input[start_offset..end_offset],
585 byte_offset: start_offset as u32,
586 byte_length: (end_offset - start_offset) as u32,
587 };
588 }
589
590 let end_offset = self.current_offset;
591 Token {
592 kind: TokenKind::Integer,
593 text: &self.input[start_offset..end_offset],
594 byte_offset: start_offset as u32,
595 byte_length: (end_offset - start_offset) as u32,
596 }
597 }
598
599 fn lex_legacy_octal_number(&mut self, start_offset: usize) -> Token<'source> {
600 self.next();
601
602 while !self.at_eof() {
603 let byte = self.current_byte();
604 if (b'0'..=b'7').contains(&byte) || byte == b'_' {
605 if byte == b'_' && self.previous_char() == '_' {
606 let underscore_start = self.current_offset - 1;
607 self.error_consecutive_underscores(underscore_start);
608 }
609 self.next();
610 } else if byte == b'8' || byte == b'9' {
611 self.error_invalid_octal_digit(self.current_offset);
612 self.next();
613 } else {
614 break;
615 }
616 }
617
618 if self.previous_char() == '_' {
619 self.error_number_trailing_underscore(
620 self.current_offset - self.previous_char().len_utf8(),
621 );
622 }
623
624 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
625 self.next();
626 let end_offset = self.current_offset;
627 self.error_non_decimal_imaginary("octal", start_offset, end_offset - start_offset);
628 return Token {
629 kind: TokenKind::Imaginary,
630 text: &self.input[start_offset..end_offset],
631 byte_offset: start_offset as u32,
632 byte_length: (end_offset - start_offset) as u32,
633 };
634 }
635
636 let end_offset = self.current_offset;
637 Token {
638 kind: TokenKind::Integer,
639 text: &self.input[start_offset..end_offset],
640 byte_offset: start_offset as u32,
641 byte_length: (end_offset - start_offset) as u32,
642 }
643 }
644
645 fn lex_binary_number(&mut self, start_offset: usize) -> Token<'source> {
646 let digits_start = self.current_offset;
647
648 while !self.at_eof() {
649 let byte = self.current_byte();
650 if byte == b'0' || byte == b'1' || byte == b'_' {
651 if byte == b'_' && self.previous_char() == '_' {
652 let underscore_start = self.current_offset - 1;
653 self.error_consecutive_underscores(underscore_start);
654 }
655 self.next();
656 } else if (b'2'..=b'9').contains(&byte) {
657 self.error_invalid_binary_digit(self.current_offset);
658 self.next();
659 } else {
660 break;
661 }
662 }
663
664 if self.current_offset == digits_start {
665 self.error_missing_binary_digits(start_offset, 2);
666 }
667
668 if self.previous_char() == '_' {
669 self.error_number_trailing_underscore(
670 self.current_offset - self.previous_char().len_utf8(),
671 );
672 }
673
674 if self.current_byte() == b'i' && !self.peek_byte().is_ascii_alphanumeric() {
675 self.next();
676 let end_offset = self.current_offset;
677 self.error_non_decimal_imaginary("binary", start_offset, end_offset - start_offset);
678 return Token {
679 kind: TokenKind::Imaginary,
680 text: &self.input[start_offset..end_offset],
681 byte_offset: start_offset as u32,
682 byte_length: (end_offset - start_offset) as u32,
683 };
684 }
685
686 let end_offset = self.current_offset;
687 Token {
688 kind: TokenKind::Integer,
689 text: &self.input[start_offset..end_offset],
690 byte_offset: start_offset as u32,
691 byte_length: (end_offset - start_offset) as u32,
692 }
693 }
694
695 fn lex_identifier(&mut self) -> Token<'source> {
696 let start_offset = self.current_offset;
697
698 while !self.at_eof() {
699 let c = self.current_char();
700 if c.is_alphanumeric() || c == '_' {
701 self.next();
702 } else {
703 break;
704 }
705 }
706
707 let end_offset = self.current_offset;
708 let text = &self.input[start_offset..end_offset];
709
710 let kind = match text {
711 "true" | "false" => TokenKind::Boolean,
712 _ => TokenKind::from_keyword(text).unwrap_or(TokenKind::Identifier),
713 };
714
715 Token {
716 kind,
717 text,
718 byte_offset: start_offset as u32,
719 byte_length: (end_offset - start_offset) as u32,
720 }
721 }
722
723 fn lex_backtick_literal(&mut self) -> Token<'source> {
724 let start_offset = self.current_offset;
725
726 self.next();
727
728 let mut terminated = false;
729
730 while !self.at_eof() {
731 let byte = self.current_byte();
732 if byte == b'`' {
733 terminated = true;
734 self.next();
735 break;
736 } else if byte == b'\n' {
737 break;
738 }
739 self.next();
740 }
741
742 let end_offset = self.current_offset;
743 let length = end_offset - start_offset;
744
745 if !terminated {
746 self.error_unterminated_backtick(start_offset, length);
747 }
748
749 Token {
750 kind: TokenKind::Backtick,
751 text: &self.input[start_offset..end_offset],
752 byte_offset: start_offset as u32,
753 byte_length: length as u32,
754 }
755 }
756
    /// Validates a `\u{XXXXXX}` escape. `escape_start` is the offset of the
    /// backslash; the current offset is just past the `u`.
    ///
    /// Consumes up to and including the closing `}` when present. Scanning
    /// stops (without consuming) at `"` or a newline so a malformed escape
    /// cannot swallow the rest of the string. Reports: a missing `{`, an
    /// unclosed/non-hex/empty/over-long payload, and codepoints that are
    /// not valid `char`s.
    fn consume_unicode_escape(&mut self, escape_start: usize) {
        if self.at_eof() || self.current_byte() != b'{' {
            self.error_invalid_unicode_escape(escape_start, self.current_offset - escape_start);
            return;
        }
        self.next();

        let hex_start = self.current_offset;
        let mut all_hex = true;
        while !self.at_eof() {
            let byte = self.current_byte();
            if byte == b'}' || byte == b'"' || byte == b'\n' {
                break;
            }
            if !byte.is_ascii_hexdigit() {
                all_hex = false;
            }
            self.next();
        }
        let hex_end = self.current_offset;

        let closed = !self.at_eof() && self.current_byte() == b'}';
        if closed {
            self.next();
        }

        let hex_len = hex_end - hex_start;
        let total_len = self.current_offset - escape_start;

        // Valid payload: closed, all hex, one to six digits.
        if !closed || !all_hex || hex_len == 0 || hex_len > 6 {
            self.error_invalid_unicode_escape(escape_start, total_len);
            return;
        }

        // Six hex digits fit comfortably in u32, so parsing cannot fail.
        let codepoint = u32::from_str_radix(&self.input[hex_start..hex_end], 16)
            .expect("hex digits validated above");
        if char::from_u32(codepoint).is_none() {
            self.error_unicode_escape_out_of_range(escape_start, total_len);
        }
    }
797
798 fn consume_octal_escape(&mut self, first_digit: u8) -> u16 {
800 let mut value: u16 = (first_digit - b'0') as u16;
801 for _ in 0..2 {
802 if self.at_eof() {
803 break;
804 }
805 match self.current_byte() {
806 d @ b'0'..=b'7' => {
807 value = value * 8 + (d - b'0') as u16;
808 self.next();
809 }
810 _ => break,
811 }
812 }
813 value
814 }
815
816 fn lex_string_literal(&mut self) -> Token<'source> {
817 let start_offset = self.current_offset;
818
819 self.next();
820
821 let mut escaped = false;
822 let mut terminated = false;
823
824 while !self.at_eof() && !terminated {
825 let byte = self.current_byte();
826 if escaped {
827 match byte {
828 b'0'..=b'7' => {
829 let escape_start = self.current_offset - 1;
830 self.next();
831 let value = self.consume_octal_escape(byte);
832 if value > 255 {
833 let escape_len = self.current_offset - escape_start;
834 self.error_octal_escape_out_of_range(escape_start, escape_len);
835 }
836 escaped = false;
837 continue;
838 }
839 b'u' => {
840 let escape_start = self.current_offset - 1;
841 self.next();
842 self.consume_unicode_escape(escape_start);
843 escaped = false;
844 continue;
845 }
846 b'n' | b't' | b'r' | b'\\' | b'"' | b'x' | b'U' => {}
847 b'\'' => {}
848 _ => {
849 self.error_invalid_escape(self.current_char());
850 }
851 }
852 escaped = false;
853 } else if byte == b'\\' {
854 escaped = true;
855 } else if byte == b'"' {
856 terminated = true;
857 self.next();
858 break;
859 } else if byte == b'\n' {
860 break; }
862
863 self.next();
864 }
865
866 let end_offset = self.current_offset;
867 let length = end_offset - start_offset;
868
869 if escaped {
870 self.error_unterminated_escape(start_offset);
871 }
872
873 if !terminated {
874 self.error_unterminated_string(start_offset, length);
875 }
876
877 Token {
878 kind: TokenKind::String,
879 text: &self.input[start_offset..end_offset],
880 byte_offset: start_offset as u32,
881 byte_length: length as u32,
882 }
883 }
884
885 fn push_format_string_text_if_needed(
886 &self,
887 tokens: &mut Vec<Token<'source>>,
888 text_segment_start: usize,
889 ) {
890 if text_segment_start < self.current_offset {
891 tokens.push(Token {
892 kind: TokenKind::FormatStringText,
893 text: &self.input[text_segment_start..self.current_offset],
894 byte_offset: text_segment_start as u32,
895 byte_length: (self.current_offset - text_segment_start) as u32,
896 });
897 }
898 }
899
    /// Lexes one `{expr}` interpolation inside a format string. The current
    /// offset is on the opening `{`; on success the closing `}` has been
    /// consumed and Start/expr/End tokens appended to `tokens`.
    ///
    /// Returns `Err(())` when the interpolation never closes; the caller
    /// abandons the rest of the format string in that case.
    fn lex_format_string_interpolation(
        &mut self,
        tokens: &mut Vec<Token<'source>>,
    ) -> Result<(), ()> {
        let interp_start = self.current_offset;
        self.next();

        tokens.push(Token {
            kind: TokenKind::FormatStringInterpolationStart,
            text: &self.input[interp_start..self.current_offset],
            byte_offset: interp_start as u32,
            byte_length: (self.current_offset - interp_start) as u32,
        });

        // Pre-scan for the matching `}` so nested braces/strings inside the
        // expression cannot confuse token-by-token lexing below.
        let Some(interpolation_end) = self.find_interpolation_boundary() else {
            if self.has_newline_between(interp_start, self.input.len()) {
                self.error_multiline_format_string_interpolation(interp_start);
            } else {
                self.error_unclosed_brace_in_format_string(interp_start);
            }
            self.skip_to_format_string_end();
            return Err(());
        };

        if self.has_newline_between(interp_start, interpolation_end) {
            self.error_multiline_format_string_interpolation(interp_start);
        }

        // Lex ordinary tokens up to the pre-computed boundary.
        while self.current_offset < interpolation_end {
            self.skip_horizontal_whitespace();
            if self.current_offset >= interpolation_end {
                break;
            }

            if self.current_byte() == b'f' && self.peek_byte() == b'"' {
                // Nested format string inside the interpolation.
                let mut fstring_tokens = self.lex_format_string_tokens();
                tokens.append(&mut fstring_tokens);
            } else if self.current_byte() == b'\\' && self.peek_byte() == b'"' {
                // `\"` inside an interpolation is rejected; skip both bytes.
                self.error_escaped_quote_in_interpolation(self.current_offset);
                self.skip(2);
            } else {
                let token = self.create_token();
                tokens.push(token);
            }
        }

        // `interpolation_end` sits on the closing `}`.
        let close_offset = self.current_offset;
        self.next();
        tokens.push(Token {
            kind: TokenKind::FormatStringInterpolationEnd,
            text: &self.input[close_offset..self.current_offset],
            byte_offset: close_offset as u32,
            byte_length: (self.current_offset - close_offset) as u32,
        });

        Ok(())
    }
957
    /// Scans forward from `start` (just past an opening `{`) to the
    /// matching `}`, without moving the lexer.
    ///
    /// Tracks brace depth, jumps over quoted/backtick literals and nested
    /// format strings, and skips the byte after a backslash. Returns `None`
    /// when the brace never closes before a newline, a `//` comment, or end
    /// of input. The returned offset points *at* the closing `}`.
    fn scan_interpolation(&self, start: usize) -> Option<usize> {
        let bytes = self.input.as_bytes();
        let mut p = start;
        let mut depth = 1;

        while p < bytes.len() && depth > 0 {
            match bytes[p] {
                b'{' => {
                    depth += 1;
                    p += 1;
                }
                b'}' => {
                    depth -= 1;
                    // Leave `p` on the final `}` so the caller can see it.
                    if depth > 0 {
                        p += 1;
                    }
                }
                b'"' | b'\'' | b'`' => p = self.scan_past_quoted(p, bytes[p])?,
                b'f' if matches!(bytes.get(p + 1), Some(b'"')) => {
                    p = self.scan_past_fstring(p)?;
                }
                b'\\' => p += 2,
                b'/' if matches!(bytes.get(p + 1), Some(b'/')) => return None,
                b'\n' => return None,
                _ => p += 1,
            }
        }

        (depth == 0).then_some(p)
    }
988
    /// Offset of the `}` closing the interpolation whose body starts at the
    /// current offset, or `None` if it never closes on this line.
    fn find_interpolation_boundary(&self) -> Option<usize> {
        self.scan_interpolation(self.current_offset)
    }
992
993 fn scan_past_quoted(&self, start: usize, delimiter: u8) -> Option<usize> {
994 let bytes = self.input.as_bytes();
995 let mut p = start + 1;
996 while p < bytes.len() {
997 match bytes[p] {
998 b'\\' if delimiter != b'`' => p += 2,
999 b'\n' => return None,
1000 b if b == delimiter => return Some(p + 1),
1001 _ => p += 1,
1002 }
1003 }
1004 None
1005 }
1006
    /// Scans past a nested `f"…"` starting at `position` (the `f`) and
    /// returns the offset just after its closing quote, or `None` when it
    /// does not close before a newline or end of input.
    fn scan_past_fstring(&self, position: usize) -> Option<usize> {
        let bytes = self.input.as_bytes();
        let mut p = position + 2; // skip the `f"` opener
        while p < bytes.len() {
            match bytes[p] {
                b'\\' => p += 2,
                // `{{` / `}}` are literal braces, not interpolations.
                b'{' if matches!(bytes.get(p + 1), Some(b'{')) => p += 2,
                b'}' if matches!(bytes.get(p + 1), Some(b'}')) => p += 2,
                b'{' => {
                    // Recurse over the nested interpolation; the returned
                    // offset sits on its closing `}`.
                    p = self.scan_interpolation(p + 1)?;
                    p += 1;
                }
                b'"' => return Some(p + 1),
                b'\n' => return None,
                _ => p += 1,
            }
        }
        None
    }
1026
1027 fn skip_to_format_string_end(&mut self) {
1028 while !self.at_eof() {
1029 match self.current_byte() {
1030 b'"' => {
1031 self.next();
1032 return;
1033 }
1034 b'\n' => return,
1035 _ => self.next(),
1036 }
1037 }
1038 }
1039
    /// Lexes an `f"…"` format string into a token sequence:
    /// `FormatStringStart`, interleaved `FormatStringText` and
    /// interpolation tokens, then `FormatStringEnd`.
    ///
    /// On an unterminated string (newline or EOF before the closing quote)
    /// the tokens collected so far are returned without a
    /// `FormatStringEnd`.
    fn lex_format_string_tokens(&mut self) -> Vec<Token<'source>> {
        let start_offset = self.current_offset;
        let mut tokens = Vec::new();

        self.skip(2); // the `f"` opener

        let fstring_start_end = self.current_offset;
        tokens.push(Token {
            kind: TokenKind::FormatStringStart,
            text: &self.input[start_offset..fstring_start_end],
            byte_offset: start_offset as u32,
            byte_length: (fstring_start_end - start_offset) as u32,
        });

        // Start of the literal-text run currently being accumulated.
        let mut text_segment_start = self.current_offset;

        while !self.at_eof() {
            let byte = self.current_byte();

            match byte {
                b'\\' if !self.at_eof() => {
                    // Escapes stay inside the text segment, but octal and
                    // unicode escapes are still range-checked here.
                    let escape_start = self.current_offset;
                    self.next();
                    if !self.at_eof() {
                        let b = self.current_byte();
                        self.next();
                        if matches!(b, b'0'..=b'7') {
                            let value = self.consume_octal_escape(b);
                            if value > 255 {
                                let escape_len = self.current_offset - escape_start;
                                self.error_octal_escape_out_of_range(escape_start, escape_len);
                            }
                        } else if b == b'u' {
                            self.consume_unicode_escape(escape_start);
                        }
                    }
                }
                // `{{` and `}}` are escaped literal braces.
                b'{' if self.peek_byte() == b'{' => {
                    self.skip(2);
                }
                b'}' if self.peek_byte() == b'}' => {
                    self.skip(2);
                }
                b'"' => {
                    // Closing quote: flush pending text, emit the end token.
                    self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                    let end_offset = self.current_offset;
                    self.next();

                    tokens.push(Token {
                        kind: TokenKind::FormatStringEnd,
                        text: &self.input[end_offset..self.current_offset],
                        byte_offset: end_offset as u32,
                        byte_length: (self.current_offset - end_offset) as u32,
                    });
                    return tokens;
                }

                b'\n' => {
                    // Format strings are single-line.
                    let length = self.current_offset.saturating_sub(start_offset);
                    self.error_unterminated_format_string(start_offset, length);
                    return tokens;
                }

                b'{' => {
                    // Interpolation: flush pending text, then lex `{expr}`.
                    self.push_format_string_text_if_needed(&mut tokens, text_segment_start);

                    if self.lex_format_string_interpolation(&mut tokens).is_err() {
                        return tokens;
                    }
                    text_segment_start = self.current_offset;
                }
                b'}' => {
                    // A lone `}` has no matching `{`.
                    self.error_unmatched_brace_in_format_string(self.current_offset);
                    self.next();
                }
                _ => {
                    self.next();
                }
            }
        }

        // Ran off the end of input without a closing quote.
        let length = self.current_offset.saturating_sub(start_offset);
        self.error_unterminated_format_string(start_offset, length);
        tokens
    }
1126
1127 fn lex_char(&mut self) -> Token<'source> {
1128 let start_offset = self.current_offset;
1129
1130 self.next();
1131
1132 if self.at_eof() || self.current_byte() == b'\'' {
1133 self.error_empty_rune_literal(start_offset);
1134 let end_offset = self.current_offset;
1135 return Token {
1136 kind: TokenKind::Char,
1137 text: &self.input[start_offset..end_offset],
1138 byte_offset: start_offset as u32,
1139 byte_length: (end_offset - start_offset) as u32,
1140 };
1141 }
1142
1143 if self.current_byte() != b'\\' {
1144 self.next();
1145 } else {
1146 self.next();
1147
1148 if self.at_eof() {
1149 self.error_unterminated_escape(start_offset);
1150 let end_offset = self.current_offset;
1151 return Token {
1152 kind: TokenKind::Char,
1153 text: &self.input[start_offset..end_offset],
1154 byte_offset: start_offset as u32,
1155 byte_length: (end_offset - start_offset) as u32,
1156 };
1157 }
1158
1159 match self.current_byte() {
1160 b'0'..=b'7' => {
1161 let escape_start = self.current_offset - 1;
1162 let first = self.current_byte();
1163 self.next();
1164 let value = self.consume_octal_escape(first);
1165 if value > 255 {
1166 let escape_len = self.current_offset - escape_start;
1167 self.error_octal_escape_out_of_range(escape_start, escape_len);
1168 }
1169 }
1170 b'n' | b't' | b'r' | b'\\' | b'\'' | b'x' => {
1171 self.next();
1172 }
1173 _ => {
1174 self.error_invalid_escape(self.current_char());
1175
1176 while !self.at_eof() && self.current_byte() != b'\'' {
1177 self.next();
1178 }
1179
1180 if !self.at_eof() && self.current_byte() == b'\'' {
1181 self.next();
1182 }
1183
1184 let end_offset = self.current_offset;
1185 return Token {
1186 kind: TokenKind::Char,
1187 text: &self.input[start_offset..end_offset],
1188 byte_offset: start_offset as u32,
1189 byte_length: (end_offset - start_offset) as u32,
1190 };
1191 }
1192 }
1193 }
1194
1195 if self.at_eof() || self.current_byte() != b'\'' {
1196 let length = self.current_offset - start_offset;
1197 self.error_unterminated_rune(start_offset, length);
1198 }
1199
1200 if !self.at_eof() && self.current_byte() == b'\'' {
1201 self.next();
1202 }
1203
1204 let end_offset = self.current_offset;
1205 Token {
1206 kind: TokenKind::Char,
1207 text: &self.input[start_offset..end_offset],
1208 byte_offset: start_offset as u32,
1209 byte_length: (end_offset - start_offset) as u32,
1210 }
1211 }
1212
    /// Lexes `/`: a division operator when not followed by another slash,
    /// otherwise a line comment (`//`) or doc comment (`///`). Comment
    /// spans are also recorded in `trivia`.
    fn lex_slash(&mut self) -> Token<'source> {
        let start_offset = self.current_offset;

        if self.peek_byte() != b'/' {
            // Plain division operator (`/` is one byte).
            self.next();
            return Token {
                kind: TokenKind::Slash,
                text: &self.input[start_offset..self.current_offset],
                byte_offset: start_offset as u32,
                byte_length: 1,
            };
        }

        let slash_count = self.count_consecutive(b'/');

        // Four or more slashes is reported, then treated as a comment.
        if slash_count >= 4 {
            self.error_excess_slashes_in_comment(start_offset, slash_count);
        }

        self.skip(slash_count);

        if slash_count == 3 {
            // Doc comment: one leading space is dropped from the text.
            if self.current_byte() == b' ' {
                self.next();
            }
            let text_start = self.current_offset;
            self.skip_to_eol();
            let end_offset = self.current_offset;

            self.trivia
                .doc_comments
                .push((start_offset as u32, end_offset as u32));

            // NOTE(review): `text` excludes the `/// ` prefix while
            // `byte_offset`/`byte_length` cover the whole comment, so
            // `text.len() != byte_length` here — confirm consumers expect
            // that.
            return Token {
                kind: TokenKind::DocComment,
                text: &self.input[text_start..end_offset],
                byte_offset: start_offset as u32,
                byte_length: (end_offset - start_offset) as u32,
            };
        }

        self.skip_to_eol();
        let end_offset = self.current_offset;

        self.trivia
            .comments
            .push((start_offset as u32, end_offset as u32));

        Token {
            kind: TokenKind::Comment,
            text: &self.input[start_offset..end_offset],
            byte_offset: start_offset as u32,
            byte_length: (end_offset - start_offset) as u32,
        }
    }
1268
1269 fn count_consecutive(&self, byte: u8) -> usize {
1270 let mut count = 0;
1271 let mut offset = self.current_offset;
1272 while offset < self.input_bytes.len() && self.input_bytes[offset] == byte {
1273 count += 1;
1274 offset += 1;
1275 }
1276 count
1277 }
1278
1279 fn skip_to_eol(&mut self) {
1280 while !self.at_eof() && self.current_byte() != b'\n' {
1281 self.next();
1282 }
1283 }
1284
1285 fn lex_directive(&mut self) -> Token<'source> {
1286 let start_offset = self.current_offset;
1287
1288 self.next();
1289
1290 while !self.at_eof() {
1291 let byte = self.current_byte();
1292 if byte.is_ascii_alphanumeric() || byte == b'_' {
1293 self.next();
1294 } else {
1295 break;
1296 }
1297 }
1298
1299 let end_offset = self.current_offset;
1300 Token {
1301 kind: TokenKind::Directive,
1302 text: &self.input[start_offset..end_offset],
1303 byte_offset: start_offset as u32,
1304 byte_length: (end_offset - start_offset) as u32,
1305 }
1306 }
1307
1308 fn handle_unexpected_char(&mut self) -> Token<'source> {
1309 let start_offset = self.current_offset;
1310
1311 self.error_unexpected_char(self.current_offset, self.current_char());
1312
1313 self.resync_on_error();
1314
1315 let end_offset = self.current_offset;
1316
1317 Token {
1318 kind: TokenKind::Error,
1319 text: &self.input[start_offset..end_offset],
1320 byte_offset: start_offset as u32,
1321 byte_length: (end_offset - start_offset) as u32,
1322 }
1323 }
1324
1325 fn eof_token(&self) -> Token<'source> {
1326 Token {
1327 kind: TokenKind::EOF,
1328 text: &self.input[self.current_offset..self.current_offset],
1329 byte_offset: self.current_offset as u32,
1330 byte_length: 0,
1331 }
1332 }
1333
1334 fn semicolon_token(&mut self) -> Token<'source> {
1335 let start_offset = self.current_offset;
1336
1337 self.next();
1338
1339 Token {
1340 kind: TokenKind::Semicolon,
1341 text: &self.input[start_offset..self.current_offset],
1342 byte_offset: start_offset as u32,
1343 byte_length: (self.current_offset - start_offset) as u32,
1344 }
1345 }
1346}