1use std::cmp::Ordering;
10use std::str::FromStr;
11
12use unicode_ident::{is_xid_continue, is_xid_start};
13use unicode_normalization::UnicodeNormalization;
14
15use ruff_python_ast::name::Name;
16use ruff_python_ast::str_prefix::{AnyStringPrefix, StringLiteralPrefix};
17use ruff_python_ast::token::{TokenFlags, TokenKind};
18use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
19use ruff_python_trivia::is_python_whitespace;
20use ruff_text_size::{TextLen, TextRange, TextSize};
21
22use crate::Mode;
23use crate::error::{InterpolatedStringErrorType, LexicalError, LexicalErrorType};
24use crate::lexer::cursor::{Cursor, EOF_CHAR};
25use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
26use crate::lexer::interpolated_string::{
27 InterpolatedStringContext, InterpolatedStrings, InterpolatedStringsCheckpoint,
28};
29use crate::string::InterpolatedStringKind;
30use crate::token::TokenValue;
31
32mod cursor;
33mod indentation;
34mod interpolated_string;
35
/// The Unicode byte-order mark that may appear at the very start of a source file.
const BOM: char = '\u{feff}';
37
#[derive(Debug)]
pub struct Lexer<'src> {
    /// The source code being lexed.
    source: &'src str,

    /// Cursor over `source` tracking the lexer's current position.
    cursor: Cursor<'src>,

    /// Kind of the most recently lexed token.
    current_kind: TokenKind,

    /// Range of the most recently lexed token within `source`.
    current_range: TextRange,

    /// Value (name, number, string contents, ...) of the current token,
    /// `TokenValue::None` when the token carries no payload.
    current_value: TokenValue,

    /// Flags (quoting style, string prefixes, ...) of the current token.
    current_flags: TokenFlags,

    /// Lexer state used to decide, e.g., whether a newline is logical.
    state: State,

    /// Current nesting depth of parentheses / brackets / braces.
    nesting: u32,

    /// Stack of indentation levels seen so far.
    indentations: Indentations,
    /// A dedent that still has to be emitted before lexing continues.
    pending_indentation: Option<Indentation>,

    /// The mode the lexer was constructed with.
    mode: Mode,

    /// Stack of f-/t-strings the lexer is currently inside of.
    interpolated_strings: InterpolatedStrings,

    /// Errors encountered while lexing; lexing continues with `Unknown` tokens.
    errors: Vec<LexicalError>,
}
79
80impl<'src> Lexer<'src> {
    /// Creates a new lexer over `source` in the given `mode`, starting at
    /// `start_offset`.
    ///
    /// # Panics
    ///
    /// Panics if the source is larger than 4GB, since token offsets are
    /// stored as `u32`.
    pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
        assert!(
            u32::try_from(source.len()).is_ok(),
            "Lexer only supports files with a size up to 4GB"
        );

        // A parenthesized expression behaves as if already inside one level
        // of parentheses and not at the start of a logical line.
        let (state, nesting) = if mode == Mode::ParenthesizedExpression {
            (State::Other, 1)
        } else {
            (State::AfterNewline, 0)
        };

        let mut lexer = Lexer {
            source,
            cursor: Cursor::new(source),
            state,
            current_kind: TokenKind::EndOfFile,
            current_range: TextRange::empty(start_offset),
            current_value: TokenValue::None,
            current_flags: TokenFlags::empty(),
            nesting,
            indentations: Indentations::default(),
            pending_indentation: None,
            mode,
            interpolated_strings: InterpolatedStrings::default(),
            errors: Vec::new(),
        };

        if start_offset == TextSize::new(0) {
            // Skip a leading byte-order mark, if present.
            lexer.cursor.eat_char(BOM);
        } else {
            lexer.cursor.skip_bytes(start_offset.to_usize());
        }

        lexer
    }
123
    /// Returns the kind of the current token.
    pub(crate) fn current_kind(&self) -> TokenKind {
        self.current_kind
    }
128
    /// Returns the range of the current token.
    pub(crate) fn current_range(&self) -> TextRange {
        self.current_range
    }
133
    /// Returns the flags of the current token.
    pub(crate) fn current_flags(&self) -> TokenFlags {
        self.current_flags
    }
138
139 pub(crate) fn take_value(&mut self) -> TokenValue {
145 std::mem::take(&mut self.current_value)
146 }
147
148 fn push_error(&mut self, error: LexicalError) -> TokenKind {
151 self.current_range = error.location();
152 self.errors.push(error);
153 TokenKind::Unknown
154 }
155
    /// Lexes the next token and makes it the lexer's current token.
    ///
    /// Resets the per-token value and flags before lexing. For `Unknown`
    /// tokens, `current_range` has already been set by `push_error` to the
    /// error's location, so it is left untouched.
    pub fn next_token(&mut self) -> TokenKind {
        self.cursor.start_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();
        self.current_kind = self.lex_token();
        if !matches!(self.current_kind, TokenKind::Unknown) {
            self.current_range = self.token_range();
        }
        self.current_kind
    }
168
    /// Lexes the next token, dispatching on the lexer's current context.
    fn lex_token(&mut self) -> TokenKind {
        // When directly inside an f-/t-string (not inside one of its
        // `{...}` interpolations), literal middle/end parts take priority.
        if let Some(interpolated_string) = self.interpolated_strings.current() {
            if !interpolated_string.is_in_interpolation(self.nesting) {
                if let Some(token) = self.lex_interpolated_string_middle_or_end() {
                    if token.is_interpolated_string_end() {
                        self.interpolated_strings.pop();
                    }
                    return token;
                }
            }
        }
        // Emit any dedents still owed from a previous drop in
        // indentation, one `Dedent` token per call.
        else if let Some(indentation) = self.pending_indentation.take() {
            match self.indentations.current().try_compare(indentation) {
                Ok(Ordering::Greater) => {
                    // Still above the target level: pop one indentation
                    // level and keep the remainder pending.
                    self.pending_indentation = Some(indentation);
                    if self.indentations.dedent_one(indentation).is_err() {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::IndentationError,
                            self.token_range(),
                        ));
                    }
                    return TokenKind::Dedent;
                }
                Ok(_) => {}
                Err(_) => {
                    // Levels are incomparable (inconsistent tabs/spaces).
                    return self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    ));
                }
            }
        }

        if self.state.is_after_newline() {
            if let Some(indentation) = self.eat_indentation() {
                return indentation;
            }
        } else {
            if let Err(error) = self.skip_whitespace() {
                return self.push_error(error);
            }
        }

        // The token proper starts after any consumed whitespace.
        self.cursor.start_token();

        if let Some(c) = self.cursor.bump() {
            if c.is_ascii() {
                self.consume_ascii_character(c)
            } else if is_unicode_identifier_start(c) {
                let identifier = self.lex_identifier(c);
                self.state = State::Other;

                identifier
            } else {
                self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ))
            }
        } else {
            self.consume_end()
        }
    }
236
    /// Consumes leading whitespace at the start of a logical line and
    /// computes the line's [`Indentation`].
    ///
    /// Returns `Some` with an indent/dedent (or error) token when the
    /// indentation level changed on a non-blank line; `None` when lexing
    /// should continue with the line's first real token.
    fn eat_indentation(&mut self) -> Option<TokenKind> {
        let mut indentation = Indentation::root();

        loop {
            match self.cursor.first() {
                ' ' => {
                    self.cursor.bump();
                    indentation = indentation.add_space();
                }
                '\t' => {
                    self.cursor.bump();
                    indentation = indentation.add_tab();
                }
                '\\' => {
                    self.cursor.bump();
                    // A line continuation must be followed by `\r`, `\r\n`
                    // or `\n`...
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else if !self.cursor.eat_char('\n') {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::LineContinuationError,
                            TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
                        )));
                    }
                    if self.cursor.is_eof() {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::Eof,
                            self.token_range(),
                        )));
                    }
                    // ...and the indentation count restarts on the new line.
                    indentation = Indentation::root();
                }
                // A form feed also resets the indentation.
                '\x0C' => {
                    self.cursor.bump();
                    indentation = Indentation::root();
                }
                _ => break,
            }
        }

        // Indentation is only significant for lines that contain actual
        // code — not blank lines or comment-only lines.
        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
            self.state = State::NonEmptyLogicalLine;

            return self.handle_indentation(indentation);
        }

        None
    }
287
    /// Compares `indentation` against the indentation stack and produces
    /// an `Indent`/`Dedent` token if the level changed, or `None` if it
    /// stayed the same.
    fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
        match self.indentations.current().try_compare(indentation) {
            // Shallower than the current level: emit one `Dedent` now and
            // keep the target level pending for subsequent `lex_token` calls.
            Ok(Ordering::Greater) => {
                self.pending_indentation = Some(indentation);

                if self.indentations.dedent_one(indentation).is_err() {
                    return Some(self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    )));
                }

                self.cursor.start_token();

                Some(TokenKind::Dedent)
            }

            Ok(Ordering::Equal) => None,

            // Deeper than the current level: push it and emit `Indent`.
            Ok(Ordering::Less) => {
                self.indentations.indent(indentation);
                Some(TokenKind::Indent)
            }
            // Incomparable levels (inconsistent use of tabs and spaces).
            Err(_) => Some(self.push_error(LexicalError::new(
                LexicalErrorType::IndentationError,
                self.token_range(),
            ))),
        }
    }
332
333 fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
334 loop {
335 match self.cursor.first() {
336 ' ' => {
337 self.cursor.bump();
338 }
339 '\t' => {
340 self.cursor.bump();
341 }
342 '\\' => {
343 self.cursor.bump();
344 if self.cursor.eat_char('\r') {
345 self.cursor.eat_char('\n');
346 } else if !self.cursor.eat_char('\n') {
347 return Err(LexicalError::new(
348 LexicalErrorType::LineContinuationError,
349 TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
350 ));
351 }
352 if self.cursor.is_eof() {
353 return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
354 }
355 }
356 '\x0C' => {
358 self.cursor.bump();
359 }
360 _ => break,
361 }
362 }
363
364 Ok(())
365 }
366
    /// Lexes a token starting with the already-consumed ASCII character `c`.
    ///
    /// Leaves the lexer in [`State::Other`] unless an arm returns early
    /// with its own state transition (comments, `=`, newlines, errors).
    fn consume_ascii_character(&mut self, c: char) -> TokenKind {
        let token = match c {
            c if is_ascii_identifier_start(c) => self.lex_identifier(c),
            '0'..='9' => self.lex_number(c),
            // Comments don't change `state`, hence the early return.
            '#' => return self.lex_comment(),
            '\'' | '"' => self.lex_string(c),
            '=' => {
                if self.cursor.eat_char('=') {
                    TokenKind::EqEqual
                } else {
                    // Tracked so IPython escape commands can be recognized
                    // right after an assignment (see the `%`/`!` arm below).
                    self.state = State::AfterEqual;
                    return TokenKind::Equal;
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PlusEqual
                } else {
                    TokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    TokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleStarEqual
                    } else {
                        TokenKind::DoubleStar
                    }
                } else {
                    TokenKind::Star
                }
            }

            // IPython escape command directly after an `=` at top level.
            c @ ('%' | '!')
                if self.mode == Mode::Ipython
                    && self.state.is_after_equal()
                    && self.nesting == 0 =>
            {
                self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap())
            }

            // IPython escape command at the start of a logical line; a
            // two-character kind (e.g. `%%`) is preferred when it matches.
            c @ ('%' | '!' | '?' | '/' | ';' | ',')
                if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
            {
                let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
                    self.cursor.bump();
                    kind
                } else {
                    IpyEscapeKind::try_from(c).unwrap()
                };

                self.lex_ipython_escape_command(kind)
            }

            '?' if self.mode == Mode::Ipython => TokenKind::Question,

            '/' => {
                if self.cursor.eat_char('=') {
                    TokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleSlashEqual
                    } else {
                        TokenKind::DoubleSlash
                    }
                } else {
                    TokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PercentEqual
                } else {
                    TokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    TokenKind::VbarEqual
                } else {
                    TokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    TokenKind::CircumflexEqual
                } else {
                    TokenKind::CircumFlex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AmperEqual
                } else {
                    TokenKind::Amper
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    TokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    TokenKind::Rarrow
                } else {
                    TokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AtEqual
                } else {
                    TokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    TokenKind::NotEqual
                } else {
                    TokenKind::Exclamation
                }
            }
            '~' => TokenKind::Tilde,
            // Brackets adjust the nesting depth; closing brackets saturate
            // at 0 so stray closers don't underflow.
            '(' => {
                self.nesting += 1;
                TokenKind::Lpar
            }
            ')' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rpar
            }
            '[' => {
                self.nesting += 1;
                TokenKind::Lsqb
            }
            ']' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rsqb
            }
            '{' => {
                self.nesting += 1;
                TokenKind::Lbrace
            }
            '}' => {
                if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                    // A lone `}` at the f-/t-string's own nesting level is
                    // invalid (it must be escaped as `}}`).
                    if interpolated_string.nesting() == self.nesting {
                        let error_type = LexicalErrorType::from_interpolated_string_error(
                            InterpolatedStringErrorType::SingleRbrace,
                            interpolated_string.kind(),
                        );
                        return self.push_error(LexicalError::new(error_type, self.token_range()));
                    }
                    interpolated_string.try_end_format_spec(self.nesting);
                }
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rbrace
            }
            ':' => {
                // Inside an f-/t-string interpolation, `:` may start a
                // format spec; in that case `:=` must not be combined.
                if self
                    .interpolated_strings
                    .current_mut()
                    .is_some_and(|interpolated_string| {
                        interpolated_string.try_start_format_spec(self.nesting)
                    })
                {
                    TokenKind::Colon
                } else if self.cursor.eat_char('=') {
                    TokenKind::ColonEqual
                } else {
                    TokenKind::Colon
                }
            }
            ';' => TokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        TokenKind::LeftShiftEqual
                    } else {
                        TokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::LessEqual
                } else {
                    TokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        TokenKind::RightShiftEqual
                    } else {
                        TokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::Greater
                }
            }
            ',' => TokenKind::Comma,
            '.' => {
                if self.cursor.first().is_ascii_digit() {
                    // A float like `.5`.
                    self.lex_decimal_number('.')
                } else if self.cursor.eat_char2('.', '.') {
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }
            '\n' => {
                // A newline is only logical outside brackets and on a
                // non-blank line.
                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }
            '\r' => {
                // `\r\n` counts as a single newline.
                self.cursor.eat_char('\n');

                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }

            _ => {
                self.state = State::Other;

                return self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ));
            }
        };

        self.state = State::Other;

        token
    }
618
    /// Lexes an identifier, a keyword, or a prefixed string literal
    /// (`f"..."`, `rb'...'`, ...) whose first character is `first`.
    fn lex_identifier(&mut self, first: char) -> TokenKind {
        // Detect string prefixes: a one- or two-character prefix directly
        // followed by a quote turns this "identifier" into a string start.
        let quote = match (first, self.cursor.first()) {
            (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
                self.cursor.bump();
                quote
            }),
            (_, second) if is_quote(self.cursor.second()) => {
                self.try_double_char_prefix([first, second]).then(|| {
                    self.cursor.bump();
                    // The `bump` above consumed the second prefix char;
                    // this one consumes the quote itself.
                    self.cursor.bump().unwrap()
                })
            }
            _ => None,
        };

        if let Some(quote) = quote {
            // f-/t-strings are lexed part-by-part; everything else goes
            // through the plain string lexer.
            if self.current_flags.is_interpolated_string() {
                if let Some(kind) = self.lex_interpolated_string_start(quote) {
                    return kind;
                }
            }

            return self.lex_string(quote);
        }

        // Consume the rest of the identifier, tracking whether any
        // non-ASCII character was seen.
        let mut is_ascii = first.is_ascii();
        self.cursor
            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

        let text = self.token_text();

        // Non-ASCII identifiers are NFKC-normalized and can never be
        // keywords.
        if !is_ascii {
            self.current_value = TokenValue::Name(text.nfkc().collect::<Name>());
            return TokenKind::Name;
        }

        // Fast path: every keyword matched below is at most 8 bytes long.
        if text.len() > 8 {
            self.current_value = TokenValue::Name(Name::new(text));
            return TokenKind::Name;
        }

        match text {
            "False" => TokenKind::False,
            "None" => TokenKind::None,
            "True" => TokenKind::True,
            "and" => TokenKind::And,
            "as" => TokenKind::As,
            "assert" => TokenKind::Assert,
            "async" => TokenKind::Async,
            "await" => TokenKind::Await,
            "break" => TokenKind::Break,
            "case" => TokenKind::Case,
            "class" => TokenKind::Class,
            "continue" => TokenKind::Continue,
            "def" => TokenKind::Def,
            "del" => TokenKind::Del,
            "elif" => TokenKind::Elif,
            "else" => TokenKind::Else,
            "except" => TokenKind::Except,
            "finally" => TokenKind::Finally,
            "for" => TokenKind::For,
            "from" => TokenKind::From,
            "global" => TokenKind::Global,
            "if" => TokenKind::If,
            "import" => TokenKind::Import,
            "in" => TokenKind::In,
            "is" => TokenKind::Is,
            "lambda" => TokenKind::Lambda,
            "match" => TokenKind::Match,
            "nonlocal" => TokenKind::Nonlocal,
            "not" => TokenKind::Not,
            "or" => TokenKind::Or,
            "pass" => TokenKind::Pass,
            "raise" => TokenKind::Raise,
            "return" => TokenKind::Return,
            "try" => TokenKind::Try,
            "type" => TokenKind::Type,
            "while" => TokenKind::While,
            "with" => TokenKind::With,
            "yield" => TokenKind::Yield,
            _ => {
                self.current_value = TokenValue::Name(Name::new(text));
                TokenKind::Name
            }
        }
    }
718
719 fn try_single_char_prefix(&mut self, first: char) -> bool {
722 match first {
723 'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
724 't' | 'T' => self.current_flags |= TokenFlags::T_STRING,
725 'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
726 'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
727 'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
728 'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
729 _ => return false,
730 }
731 true
732 }
733
734 fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
737 match value {
738 ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
739 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
740 }
741 ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
742 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
743 }
744 ['r', 't' | 'T'] | ['t' | 'T', 'r'] => {
745 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_LOWERCASE;
746 }
747 ['R', 't' | 'T'] | ['t' | 'T', 'R'] => {
748 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_UPPERCASE;
749 }
750 ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
751 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
752 }
753 ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
754 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
755 }
756 _ => return false,
757 }
758 true
759 }
760
    /// Lexes the start token of an f-/t-string whose opening quote has
    /// already been consumed, or returns `None` if the collected flags
    /// don't actually describe an interpolated string.
    fn lex_interpolated_string_start(&mut self, quote: char) -> Option<TokenKind> {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        if quote == '"' {
            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
        }

        // Two more identical quotes mean a triple-quoted string.
        if self.cursor.eat_char2(quote, quote) {
            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
        }

        let ftcontext = InterpolatedStringContext::new(self.current_flags, self.nesting)?;

        let kind = ftcontext.kind();

        // Push the context so `lex_token` routes subsequent calls through
        // `lex_interpolated_string_middle_or_end`.
        self.interpolated_strings.push(ftcontext);

        Some(kind.start_token())
    }
782
    /// Lexes the literal middle part or the end token of the innermost
    /// f-/t-string.
    ///
    /// Returns `None` when the cursor sits directly on an interpolation
    /// boundary with no literal text in between, in which case normal
    /// token lexing proceeds.
    fn lex_interpolated_string_middle_or_end(&mut self) -> Option<TokenKind> {
        let interpolated_string = self.interpolated_strings.current().unwrap();
        let string_kind = interpolated_string.kind();
        let interpolated_flags = interpolated_string.flags();

        // Check for the closing quote(s) first.
        if interpolated_string.is_triple_quoted() {
            let quote_char = interpolated_string.quote_char();
            if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
                self.current_flags = interpolated_string.flags();
                return Some(string_kind.end_token());
            }
        } else if self.cursor.eat_char(interpolated_string.quote_char()) {
            self.current_flags = interpolated_string.flags();
            return Some(string_kind.end_token());
        }

        // Accumulates the token value with `{{`/`}}` collapsed to a single
        // brace; stays empty as long as no escaped brace has been seen.
        let mut normalized = String::new();

        // Start of the source segment not yet copied into `normalized`.
        let mut last_offset = self.offset();

        let in_format_spec = interpolated_string.is_in_format_spec(self.nesting);

        // Tracks `\N{...}` named-unicode escapes, whose `}` must not be
        // treated as an interpolation end.
        let mut in_named_unicode = false;

        loop {
            match self.cursor.first() {
                // `EOF_CHAR` is also the sentinel for "no char"; confirm
                // the cursor really is at the end of the file.
                EOF_CHAR if self.cursor.is_eof() => {
                    let error = if interpolated_string.is_triple_quoted() {
                        InterpolatedStringErrorType::UnterminatedTripleQuotedString
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                // A line break terminates single-quoted strings.
                '\n' | '\r' if !interpolated_string.is_triple_quoted() => {
                    let error_type = if in_format_spec {
                        InterpolatedStringErrorType::NewlineInFormatSpec
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;

                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error_type, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                '\\' => {
                    self.cursor.bump();
                    // Don't consume a brace after the backslash; it is
                    // re-examined by the next loop iteration.
                    if matches!(self.cursor.first(), '{' | '}') {
                        continue;
                    } else if !interpolated_string.is_raw_string() {
                        // `\N{...}` named-unicode escape (non-raw only).
                        if self.cursor.eat_char2('N', '{') {
                            in_named_unicode = true;
                            continue;
                        }
                    }
                    // Skip the escaped character (or a full `\r\n` pair).
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else {
                        self.cursor.bump();
                    }
                }
                quote @ ('\'' | '"') if quote == interpolated_string.quote_char() => {
                    if let Some(triple_quotes) = interpolated_string.triple_quotes() {
                        // For triple-quoted strings, only a full triple
                        // quote ends the literal part.
                        if self.cursor.rest().starts_with(triple_quotes) {
                            break;
                        }
                        self.cursor.bump();
                    } else {
                        break;
                    }
                }
                '{' => {
                    if self.cursor.second() == '{' && !in_format_spec {
                        // `{{` collapses to `{`: copy the source up to and
                        // including the first brace, then skip the second.
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        // Start of an interpolation: the middle token ends here.
                        break;
                    }
                }
                '}' => {
                    if in_named_unicode {
                        // Closes a `\N{...}` escape, not an interpolation.
                        in_named_unicode = false;
                        self.cursor.bump();
                    } else if self.cursor.second() == '}' && !in_format_spec {
                        // `}}` collapses to `}` (same scheme as `{{` above).
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                _ => {
                    self.cursor.bump();
                }
            }
        }
        let range = self.token_range();
        if range.is_empty() {
            return None;
        }

        // Fast path: no escaped braces were seen, so the raw source slice
        // is the value; otherwise flush the tail into `normalized`.
        let value = if normalized.is_empty() {
            self.source[range].to_string()
        } else {
            normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
            normalized
        };

        self.current_value = TokenValue::InterpolatedStringMiddle(value.into_boxed_str());

        self.current_flags = interpolated_flags;
        Some(string_kind.middle_token())
    }
933
934 fn lex_string(&mut self, quote: char) -> TokenKind {
936 #[cfg(debug_assertions)]
937 debug_assert_eq!(self.cursor.previous(), quote);
938
939 if quote == '"' {
940 self.current_flags |= TokenFlags::DOUBLE_QUOTES;
941 }
942
943 if self.cursor.eat_char2(quote, quote) {
946 self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
947 }
948
949 let value_start = self.offset();
950
951 let quote_byte = u8::try_from(quote).expect("char that fits in u8");
952 let value_end = if self.current_flags.is_triple_quoted() {
953 loop {
956 let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
957 self.cursor.skip_to_end();
958
959 self.current_flags |= TokenFlags::UNCLOSED_STRING;
960 self.push_error(LexicalError::new(
961 LexicalErrorType::UnclosedStringError,
962 self.token_range(),
963 ));
964 break self.offset();
965 };
966
967 let num_backslashes = self.cursor.rest().as_bytes()[..index]
970 .iter()
971 .rev()
972 .take_while(|&&c| c == b'\\')
973 .count();
974
975 self.cursor.skip_bytes(index + 1);
977
978 if num_backslashes % 2 == 1 {
980 continue;
981 }
982
983 if self.cursor.eat_char2(quote, quote) {
985 break self.offset() - TextSize::new(3);
986 }
987 }
988 } else {
989 loop {
992 let Some(index) =
993 memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
994 else {
995 self.cursor.skip_to_end();
996 self.current_flags |= TokenFlags::UNCLOSED_STRING;
997
998 self.push_error(LexicalError::new(
999 LexicalErrorType::UnclosedStringError,
1000 self.token_range(),
1001 ));
1002
1003 break self.offset();
1004 };
1005
1006 let num_backslashes = self.cursor.rest().as_bytes()[..index]
1009 .iter()
1010 .rev()
1011 .take_while(|&&c| c == b'\\')
1012 .count();
1013
1014 self.cursor.skip_bytes(index);
1016
1017 let quote_or_newline = self.cursor.first();
1019
1020 if num_backslashes % 2 == 1 {
1022 self.cursor.bump();
1023 if quote_or_newline == '\r' {
1024 self.cursor.eat_char('\n');
1025 }
1026 continue;
1027 }
1028
1029 match quote_or_newline {
1030 '\r' | '\n' => {
1031 self.current_flags |= TokenFlags::UNCLOSED_STRING;
1032 self.push_error(LexicalError::new(
1033 LexicalErrorType::UnclosedStringError,
1034 self.token_range(),
1035 ));
1036 break self.offset();
1037 }
1038 ch if ch == quote => {
1039 let value_end = self.offset();
1040 self.cursor.bump();
1041 break value_end;
1042 }
1043 _ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
1044 }
1045 }
1046 };
1047
1048 self.current_value = TokenValue::String(
1049 self.source[TextRange::new(value_start, value_end)]
1050 .to_string()
1051 .into_boxed_str(),
1052 );
1053
1054 TokenKind::String
1055 }
1056
1057 fn lex_number(&mut self, first: char) -> TokenKind {
1059 if first == '0' {
1060 if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
1061 self.lex_number_radix(Radix::Hex)
1062 } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
1063 self.lex_number_radix(Radix::Octal)
1064 } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
1065 self.lex_number_radix(Radix::Binary)
1066 } else {
1067 self.lex_decimal_number(first)
1068 }
1069 } else {
1070 self.lex_decimal_number(first)
1071 }
1072 }
1073
1074 fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
1076 #[cfg(debug_assertions)]
1077 debug_assert!(matches!(
1078 self.cursor.previous().to_ascii_lowercase(),
1079 'x' | 'o' | 'b'
1080 ));
1081
1082 let mut number = LexedText::new(self.offset(), self.source);
1084 self.radix_run(&mut number, radix);
1085
1086 let token = &self.source[self.token_range()];
1088
1089 let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
1090 Ok(int) => int,
1091 Err(err) => {
1092 return self.push_error(LexicalError::new(
1093 LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
1094 self.token_range(),
1095 ));
1096 }
1097 };
1098 self.current_value = TokenValue::Int(value);
1099 TokenKind::Int
1100 }
1101
    /// Lexes a decimal integer, float, or complex literal starting with
    /// `first_digit_or_dot` (a digit, or `.` for literals like `.5`).
    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
        // Remembered for the leading-zero check on plain integers below.
        let start_is_zero = first_digit_or_dot == '0';

        let mut number = LexedText::new(self.token_start(), self.source);
        if first_digit_or_dot != '.' {
            number.push(first_digit_or_dot);
            self.radix_run(&mut number, Radix::Decimal);
        }

        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
            number.push('.');

            // An underscore may not directly follow the decimal point.
            if self.cursor.eat_char('_') {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()),
                    TextRange::new(self.offset() - TextSize::new(1), self.offset()),
                ));
            }

            self.radix_run(&mut number, Radix::Decimal);
            true
        } else {
            false
        };

        // An exponent only counts when `e`/`E` is followed by a (possibly
        // signed) digit; otherwise the `e` belongs to a following token.
        let is_float = match self.cursor.rest().as_bytes() {
            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
                // Consume the `e`/`E` itself.
                number.push(self.cursor.bump().unwrap());

                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
                    number.push(sign);
                }

                self.radix_run(&mut number, Radix::Decimal);

                true
            }
            _ => is_float,
        };

        if is_float {
            let Ok(value) = f64::from_str(number.as_str()) else {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(
                        "Invalid decimal literal".to_string().into_boxed_str(),
                    ),
                    self.token_range(),
                ));
            };

            // A trailing `j`/`J` makes this an imaginary (complex) literal.
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                self.current_value = TokenValue::Complex {
                    real: 0.0,
                    imag: value,
                };
                TokenKind::Complex
            } else {
                self.current_value = TokenValue::Float(value);
                TokenKind::Float
            }
        } else {
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                let imag = f64::from_str(number.as_str()).unwrap();
                self.current_value = TokenValue::Complex { real: 0.0, imag };
                TokenKind::Complex
            } else {
                let value = match Int::from_str(number.as_str()) {
                    Ok(value) => {
                        // Reject leading zeros on non-zero integers (e.g. `012`).
                        if start_is_zero && value.as_u8() != Some(0) {
                            return self.push_error(LexicalError::new(
                                LexicalErrorType::OtherError(
                                    "Invalid decimal integer literal"
                                        .to_string()
                                        .into_boxed_str(),
                                ),
                                self.token_range(),
                            ));
                        }
                        value
                    }
                    Err(err) => {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                            self.token_range(),
                        ));
                    }
                };
                self.current_value = TokenValue::Int(value);
                TokenKind::Int
            }
        }
    }
1203
1204 fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
1208 loop {
1209 if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
1210 number.push(c);
1211 }
1212 else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
1214 self.cursor.bump();
1216 number.skip_char();
1217 } else {
1218 break;
1219 }
1220 }
1221 }
1222
1223 fn lex_comment(&mut self) -> TokenKind {
1225 #[cfg(debug_assertions)]
1226 debug_assert_eq!(self.cursor.previous(), '#');
1227
1228 let bytes = self.cursor.rest().as_bytes();
1229 let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
1230 self.cursor.skip_bytes(offset);
1231
1232 TokenKind::Comment
1233 }
1234
    /// Lexes an IPython escape command (e.g. `%magic`, `!shell`, `obj?`),
    /// collecting everything up to the end of the line into the token value.
    fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind {
        let mut value = String::new();

        loop {
            match self.cursor.first() {
                '\\' => {
                    // A backslash immediately followed by a line break
                    // continues the command on the next line and is
                    // dropped from the value.
                    if self.cursor.second() == '\r' {
                        self.cursor.bump();
                        self.cursor.bump();
                        self.cursor.eat_char('\n');
                        continue;
                    } else if self.cursor.second() == '\n' {
                        self.cursor.bump();
                        self.cursor.bump();
                        continue;
                    }

                    self.cursor.bump();
                    value.push('\\');
                }
                '?' => {
                    self.cursor.bump();
                    let mut question_count = 1u32;
                    while self.cursor.eat_char('?') {
                        question_count += 1;
                    }

                    // Only one or two `?` at the very end of the line,
                    // directly after a non-whitespace character, turn the
                    // command into a help query; otherwise the question
                    // marks are just part of the value.
                    if question_count > 2
                        || value.chars().last().is_none_or(is_python_whitespace)
                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
                    {
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    if escape_kind.is_help() {
                        // Leading `?`/spaces already encode "help"; strip them.
                        value = value.trim_start_matches([' ', '?']).to_string();
                    } else if escape_kind.is_magic() {
                        // Keep the magic prefix as part of the value.
                        value.insert_str(0, escape_kind.as_str());
                    }

                    let kind = match question_count {
                        1 => IpyEscapeKind::Help,
                        2 => IpyEscapeKind::Help2,
                        _ => unreachable!("`question_count` is always 1 or 2"),
                    };

                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                '\n' | '\r' | EOF_CHAR => {
                    // The command runs to the end of the line.
                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind: escape_kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                c => {
                    self.cursor.bump();
                    value.push(c);
                }
            }
        }
    }
1352
    /// Produces the tokens that finish off the file: errors for any
    /// unterminated f-/t-strings and unclosed parentheses, then a final
    /// `Newline` (if missing), remaining `Dedent`s, and `EndOfFile`.
    fn consume_end(&mut self) -> TokenKind {
        // Any interpolated string still on the stack is unterminated.
        while let Some(interpolated_string) = self.interpolated_strings.pop() {
            self.nesting = interpolated_string.nesting();
            self.push_error(LexicalError::new(
                LexicalErrorType::from_interpolated_string_error(
                    InterpolatedStringErrorType::UnterminatedString,
                    interpolated_string.kind(),
                ),
                self.token_range(),
            ));
        }

        // `ParenthesizedExpression` mode starts with one implicit paren.
        let init_nesting = u32::from(self.mode == Mode::ParenthesizedExpression);

        if self.nesting > init_nesting {
            // Reset so the error is only reported once.
            self.nesting = 0;
            return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
        }

        // First insert a trailing newline, if one is missing...
        if !self.state.is_new_logical_line() {
            self.state = State::AfterNewline;
            TokenKind::Newline
        }
        // ...then unwind the indentation stack one `Dedent` per call...
        else if self.indentations.dedent().is_some() {
            TokenKind::Dedent
        } else {
            TokenKind::EndOfFile
        }
    }
1391
    /// Re-lexes the current token starting from `non_logical_newline_start`
    /// with one parenthesis level removed, so that a previously
    /// non-logical newline can be re-interpreted as a logical one
    /// (error recovery for unclosed parentheses).
    ///
    /// Returns `true` if a token was actually re-lexed.
    pub(crate) fn re_lex_logical_token(
        &mut self,
        non_logical_newline_start: Option<TextSize>,
    ) -> bool {
        if self.nesting == 0 {
            return false;
        }

        // Drop the parenthesis level the caller decided is unclosed.
        self.nesting -= 1;

        // NOTE(review): triple-quoted f-/t-strings may legitimately span
        // newlines, so re-lexing from the newline is skipped for them —
        // confirm against the callers of this method.
        if self.current_flags.is_triple_quoted_interpolated_string() {
            return false;
        }

        let Some(new_position) = non_logical_newline_start else {
            return false;
        };

        // A closing bracket as the current token already decremented
        // `nesting` when it was lexed; this appears to compensate for
        // that before re-lexing (TODO confirm).
        if matches!(
            self.current_kind,
            TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
        ) {
            self.nesting += 1;
        }

        // Rewind the cursor to the newline position and lex again.
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(new_position.to_usize());
        self.state = State::Other;
        self.next_token();
        true
    }
1495
    /// Re-interprets an unclosed, prefix-less string token inside an
    /// interpolation element as the *closing* quote(s) of the enclosing
    /// f/t-string, when its quotes match the enclosing string exactly.
    pub(crate) fn re_lex_string_token_in_interpolation_element(
        &mut self,
        kind: InterpolatedStringKind,
    ) {
        // Only applicable while inside an interpolated string.
        let Some(interpolated_string) = self.interpolated_strings.current() else {
            return;
        };

        let current_string_flags = self.current_flags().as_any_string_flags();

        // The token must be an unclosed string with no prefix whose quote
        // character and triple-quotedness match the enclosing interpolated
        // string — otherwise it cannot be that string's terminator.
        if !matches!(self.current_kind, TokenKind::String)
            || !self.current_flags.is_unclosed()
            || current_string_flags.prefix() != AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
            || current_string_flags.quote_style().as_char() != interpolated_string.quote_char()
            || current_string_flags.is_triple_quoted() != interpolated_string.is_triple_quoted()
        {
            return;
        }

        // Everything from just after the quote(s) up to the end of the line
        // (or a `#` comment) must be whitespace; any other character means
        // the quotes plausibly start a genuine nested string literal.
        let first_line = &self.source
            [(self.current_range.start() + current_string_flags.quote_len()).to_usize()..];

        for c in first_line.chars() {
            if matches!(c, '\n' | '\r' | '#') {
                break;
            }

            if !is_python_whitespace(c) {
                return;
            }
        }

        // Drop the "unclosed string" error that was reported for this token,
        // since the token is being reinterpreted as a terminator.
        if self.errors.last().is_some_and(|error| {
            error.location() == self.current_range
                && matches!(error.error(), LexicalErrorType::UnclosedStringError)
        }) {
            self.errors.pop();
        }

        // Shrink the token to just the quote characters and turn it into the
        // end token of the enclosing f/t-string.
        self.current_range =
            TextRange::at(self.current_range.start(), self.current_flags.quote_len());
        self.current_kind = kind.end_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();

        // The interpolated string is now closed: restore the nesting level it
        // recorded and pop it off the stack.
        self.nesting = interpolated_string.nesting();
        self.interpolated_strings.pop();

        // Resume lexing immediately after the closing quote(s).
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(self.current_range.end().to_usize());
    }
1559
    /// Re-interprets an unclosed raw-string token (lowercase `r` prefix only)
    /// as the plain identifier `r` followed by the quote character.
    ///
    /// NOTE(review): per the name this is used while lexing inside a format
    /// spec — confirm the exact trigger with the parser-side caller.
    pub(crate) fn re_lex_raw_string_in_format_spec(&mut self) {
        if matches!(self.current_kind, TokenKind::String)
            && self.current_flags.is_unclosed()
            && self.current_flags.prefix()
                == AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
        {
            // Drop the "unclosed string" error reported for this token; the
            // token is no longer lexed as a string.
            if self.errors.last().is_some_and(|error| {
                error.location() == self.current_range
                    && matches!(error.error(), LexicalErrorType::UnclosedStringError)
            }) {
                self.errors.pop();
            }

            // Shrink the token to just the `r`, emit it as a name, and resume
            // lexing right after it (i.e. at the quote character).
            self.current_range = TextRange::at(self.current_range.start(), 'r'.text_len());
            self.current_kind = TokenKind::Name;
            self.current_value = TokenValue::Name(Name::new_static("r"));
            self.current_flags = TokenFlags::empty();
            self.cursor = Cursor::new(self.source);
            self.cursor.skip_bytes(self.current_range.end().to_usize());
        }
    }
1596
1597 #[inline]
1598 fn token_range(&self) -> TextRange {
1599 let end = self.offset();
1600 let len = self.cursor.token_len();
1601
1602 TextRange::at(end - len, len)
1603 }
1604
1605 #[inline]
1606 fn token_text(&self) -> &'src str {
1607 &self.source[self.token_range()]
1608 }
1609
1610 #[expect(clippy::cast_possible_truncation)]
1613 #[inline]
1614 fn offset(&self) -> TextSize {
1615 TextSize::new(self.source.len() as u32) - self.cursor.text_len()
1616 }
1617
1618 #[inline]
1619 fn token_start(&self) -> TextSize {
1620 self.token_range().start()
1621 }
1622
1623 pub(crate) fn checkpoint(&self) -> LexerCheckpoint {
1625 LexerCheckpoint {
1626 value: self.current_value.clone(),
1627 current_kind: self.current_kind,
1628 current_range: self.current_range,
1629 current_flags: self.current_flags,
1630 cursor_offset: self.offset(),
1631 state: self.state,
1632 nesting: self.nesting,
1633 indentations_checkpoint: self.indentations.checkpoint(),
1634 pending_indentation: self.pending_indentation,
1635 interpolated_strings_checkpoint: self.interpolated_strings.checkpoint(),
1636 errors_position: self.errors.len(),
1637 }
1638 }
1639
1640 pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
1642 let LexerCheckpoint {
1643 value,
1644 current_kind,
1645 current_range,
1646 current_flags,
1647 cursor_offset,
1648 state,
1649 nesting,
1650 indentations_checkpoint,
1651 pending_indentation,
1652 interpolated_strings_checkpoint,
1653 errors_position,
1654 } = checkpoint;
1655
1656 let mut cursor = Cursor::new(self.source);
1657 cursor.skip_bytes(cursor_offset.to_usize());
1659
1660 self.current_value = value;
1661 self.current_kind = current_kind;
1662 self.current_range = current_range;
1663 self.current_flags = current_flags;
1664 self.cursor = cursor;
1665 self.state = state;
1666 self.nesting = nesting;
1667 self.indentations.rewind(indentations_checkpoint);
1668 self.pending_indentation = pending_indentation;
1669 self.interpolated_strings
1670 .rewind(interpolated_strings_checkpoint);
1671 self.errors.truncate(errors_position);
1672 }
1673
1674 pub fn finish(self) -> Vec<LexicalError> {
1675 self.errors
1676 }
1677}
1678
/// A snapshot of the lexer's mutable state, created by [`Lexer::checkpoint`]
/// and consumed by [`Lexer::rewind`].
pub(crate) struct LexerCheckpoint {
    /// The current token's value.
    value: TokenValue,
    /// The current token's kind.
    current_kind: TokenKind,
    /// The current token's source range.
    current_range: TextRange,
    /// The current token's flags.
    current_flags: TokenFlags,
    /// Absolute byte offset of the cursor; the cursor itself is rebuilt on
    /// rewind rather than stored.
    cursor_offset: TextSize,
    state: State,
    nesting: u32,
    indentations_checkpoint: IndentationsCheckpoint,
    pending_indentation: Option<Indentation>,
    interpolated_strings_checkpoint: InterpolatedStringsCheckpoint,
    /// Length of the error vector at checkpoint time; later errors are
    /// truncated on rewind.
    errors_position: usize,
}
1692
/// The lexer's position relative to the structure of the current line.
#[derive(Copy, Clone, Debug)]
enum State {
    /// At the beginning of the file or right after a `Newline` token.
    AfterNewline,

    /// At the start of a new logical line (counts as such in
    /// `is_new_logical_line`). NOTE(review): presumably after the line's
    /// indentation has been handled — confirm with the indent lexing code.
    NonEmptyLogicalLine,

    /// Right after an `=` token. NOTE(review): consumer of this state is
    /// outside this view — confirm its purpose there.
    AfterEqual,

    /// Anywhere else inside a logical line.
    Other,
}
1707
1708impl State {
1709 const fn is_after_newline(self) -> bool {
1710 matches!(self, State::AfterNewline)
1711 }
1712
1713 const fn is_new_logical_line(self) -> bool {
1714 matches!(self, State::AfterNewline | State::NonEmptyLogicalLine)
1715 }
1716
1717 const fn is_after_equal(self) -> bool {
1718 matches!(self, State::AfterEqual)
1719 }
1720}
1721
/// The numeric base of an integer literal being lexed.
#[derive(Copy, Clone, Debug)]
enum Radix {
    /// Base 2 (`0b` prefix).
    Binary,
    /// Base 8 (`0o` prefix).
    Octal,
    /// Base 10 (no prefix).
    Decimal,
    /// Base 16 (`0x` prefix).
    Hex,
}
1729
1730impl Radix {
1731 const fn as_u32(self) -> u32 {
1732 match self {
1733 Radix::Binary => 2,
1734 Radix::Octal => 8,
1735 Radix::Decimal => 10,
1736 Radix::Hex => 16,
1737 }
1738 }
1739
1740 const fn is_digit(self, c: char) -> bool {
1741 match self {
1742 Radix::Binary => matches!(c, '0'..='1'),
1743 Radix::Octal => matches!(c, '0'..='7'),
1744 Radix::Decimal => c.is_ascii_digit(),
1745 Radix::Hex => c.is_ascii_hexdigit(),
1746 }
1747 }
1748}
1749
/// Returns `true` if `c` can open or close a Python string literal.
const fn is_quote(c: char) -> bool {
    c == '\'' || c == '"'
}
1753
/// Returns `true` if `c` can start an identifier, considering ASCII
/// characters only (digits are excluded from the start position).
const fn is_ascii_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
1757
/// Returns `true` if `c` may start an identifier per the Unicode
/// `XID_Start` property (used for the non-ASCII fallback path).
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
1763
1764fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
1771 if c.is_ascii() {
1774 matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
1775 } else {
1776 *identifier_is_ascii_only = false;
1777 is_xid_continue(c)
1778 }
1779}
1780
/// Text assembled while lexing that stays borrowed from the source for as
/// long as possible, falling back to an owned buffer once the lexed text
/// diverges from the source (see `skip_char`).
enum LexedText<'a> {
    /// Borrowed: a sub-range of the original source.
    Source { source: &'a str, range: TextRange },
    /// Owned: accumulated separately from the source.
    Owned(String),
}
1785
1786impl<'a> LexedText<'a> {
1787 fn new(start: TextSize, source: &'a str) -> Self {
1788 Self::Source {
1789 range: TextRange::empty(start),
1790 source,
1791 }
1792 }
1793
1794 fn push(&mut self, c: char) {
1795 match self {
1796 LexedText::Source { range, source } => {
1797 *range = range.add_end(c.text_len());
1798 debug_assert!(source[*range].ends_with(c));
1799 }
1800 LexedText::Owned(owned) => owned.push(c),
1801 }
1802 }
1803
1804 fn as_str<'b>(&'b self) -> &'b str
1805 where
1806 'b: 'a,
1807 {
1808 match self {
1809 LexedText::Source { range, source } => &source[*range],
1810 LexedText::Owned(owned) => owned,
1811 }
1812 }
1813
1814 fn skip_char(&mut self) {
1815 match self {
1816 LexedText::Source { range, source } => {
1817 *self = LexedText::Owned(source[*range].to_string());
1818 }
1819 LexedText::Owned(_) => {}
1820 }
1821 }
1822}
1823
/// Creates a [`Lexer`] that lexes `source` from its start (offset 0) in the
/// given `mode`.
pub fn lex(source: &str, mode: Mode) -> Lexer<'_> {
    Lexer::new(source, mode, TextSize::default())
}
1828