1use std::cmp::Ordering;
10use std::str::FromStr;
11
12use unicode_ident::{is_xid_continue, is_xid_start};
13use unicode_normalization::UnicodeNormalization;
14
15use ruff_python_ast::name::Name;
16use ruff_python_ast::str_prefix::{AnyStringPrefix, StringLiteralPrefix};
17use ruff_python_ast::token::{TokenFlags, TokenKind};
18use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
19use ruff_python_trivia::is_python_whitespace;
20use ruff_text_size::{TextLen, TextRange, TextSize};
21
22use crate::Mode;
23use crate::error::{InterpolatedStringErrorType, LexicalError, LexicalErrorType};
24use crate::lexer::cursor::{Cursor, EOF_CHAR};
25use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
26use crate::lexer::interpolated_string::{
27 InterpolatedStringContext, InterpolatedStrings, InterpolatedStringsCheckpoint,
28};
29use crate::string::InterpolatedStringKind;
30use crate::token::TokenValue;
31
32mod cursor;
33mod indentation;
34mod interpolated_string;
35
/// The Unicode byte-order mark that may appear at the very start of a source file.
const BOM: char = '\u{feff}';
37
#[derive(Debug)]
pub struct Lexer<'src> {
    /// The source code being lexed.
    source: &'src str,

    /// Cursor over `source` tracking the lexer's current position.
    cursor: Cursor<'src>,

    /// Kind of the most recently lexed token.
    current_kind: TokenKind,

    /// Range of the most recently lexed token within `source`.
    current_range: TextRange,

    /// Value (name, number, string contents, ...) of the current token,
    /// `TokenValue::None` when the token carries no payload.
    current_value: TokenValue,

    /// Flags (quoting style, string prefixes, ...) of the current token.
    current_flags: TokenFlags,

    /// Lexer state used to decide, e.g., whether a newline is logical.
    state: State,

    /// Current nesting depth of parentheses / brackets / braces.
    nesting: u32,

    /// Stack of indentation levels seen so far.
    indentations: Indentations,
    /// A dedent that still has to be emitted before lexing continues.
    pending_indentation: Option<Indentation>,

    /// The mode the lexer was constructed with.
    mode: Mode,

    /// Stack of f-/t-strings the lexer is currently inside of.
    interpolated_strings: InterpolatedStrings,

    /// Errors encountered while lexing; lexing continues with `Unknown` tokens.
    errors: Vec<LexicalError>,
}
79
80impl<'src> Lexer<'src> {
    /// Creates a new lexer over `source` in the given `mode`, starting at
    /// `start_offset`.
    ///
    /// # Panics
    ///
    /// Panics if the source is larger than 4GB, since token offsets are
    /// stored as `u32`.
    pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
        assert!(
            u32::try_from(source.len()).is_ok(),
            "Lexer only supports files with a size up to 4GB"
        );

        // A parenthesized expression behaves as if already inside one level
        // of parentheses and not at the start of a logical line.
        let (state, nesting) = if mode == Mode::ParenthesizedExpression {
            (State::Other, 1)
        } else {
            (State::AfterNewline, 0)
        };

        let mut lexer = Lexer {
            source,
            cursor: Cursor::new(source),
            state,
            current_kind: TokenKind::EndOfFile,
            current_range: TextRange::empty(start_offset),
            current_value: TokenValue::None,
            current_flags: TokenFlags::empty(),
            nesting,
            indentations: Indentations::default(),
            pending_indentation: None,
            mode,
            interpolated_strings: InterpolatedStrings::default(),
            errors: Vec::new(),
        };

        if start_offset == TextSize::new(0) {
            // Skip a leading byte-order mark, if present.
            lexer.cursor.eat_char(BOM);
        } else {
            lexer.cursor.skip_bytes(start_offset.to_usize());
        }

        lexer
    }
123
    /// Returns the kind of the current token.
    pub(crate) fn current_kind(&self) -> TokenKind {
        self.current_kind
    }
128
    /// Returns the range of the current token.
    pub(crate) fn current_range(&self) -> TextRange {
        self.current_range
    }
133
    /// Returns the flags of the current token.
    pub(crate) fn current_flags(&self) -> TokenFlags {
        self.current_flags
    }
138
139 pub(crate) fn take_value(&mut self) -> TokenValue {
145 std::mem::take(&mut self.current_value)
146 }
147
148 fn push_error(&mut self, error: LexicalError) -> TokenKind {
151 self.current_range = error.location();
152 self.errors.push(error);
153 TokenKind::Unknown
154 }
155
    /// Lexes the next token and makes it the lexer's current token.
    ///
    /// Resets the per-token value and flags before lexing. For `Unknown`
    /// tokens, `current_range` has already been set by `push_error` to the
    /// error's location, so it is left untouched.
    pub fn next_token(&mut self) -> TokenKind {
        self.cursor.start_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();
        self.current_kind = self.lex_token();
        if !matches!(self.current_kind, TokenKind::Unknown) {
            self.current_range = self.token_range();
        }
        self.current_kind
    }
168
    /// Lexes the next token, dispatching on the lexer's current context.
    fn lex_token(&mut self) -> TokenKind {
        // When directly inside an f-/t-string (not inside one of its
        // `{...}` interpolations), literal middle/end parts take priority.
        if let Some(interpolated_string) = self.interpolated_strings.current() {
            if !interpolated_string.is_in_interpolation(self.nesting) {
                if let Some(token) = self.lex_interpolated_string_middle_or_end() {
                    if token.is_interpolated_string_end() {
                        self.interpolated_strings.pop();
                    }
                    return token;
                }
            }
        }
        // Emit any dedents still owed from a previous drop in
        // indentation, one `Dedent` token per call.
        else if let Some(indentation) = self.pending_indentation.take() {
            match self.indentations.current().try_compare(indentation) {
                Ok(Ordering::Greater) => {
                    // Still above the target level: pop one indentation
                    // level and keep the remainder pending.
                    self.pending_indentation = Some(indentation);
                    if self.indentations.dedent_one(indentation).is_err() {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::IndentationError,
                            self.token_range(),
                        ));
                    }
                    return TokenKind::Dedent;
                }
                Ok(_) => {}
                Err(_) => {
                    // Levels are incomparable (inconsistent tabs/spaces).
                    return self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    ));
                }
            }
        }

        if self.state.is_after_newline() {
            if let Some(indentation) = self.eat_indentation() {
                return indentation;
            }
        } else {
            if let Err(error) = self.skip_whitespace() {
                return self.push_error(error);
            }
        }

        // The token proper starts after any consumed whitespace.
        self.cursor.start_token();

        if let Some(c) = self.cursor.bump() {
            if c.is_ascii() {
                self.consume_ascii_character(c)
            } else if is_unicode_identifier_start(c) {
                let identifier = self.lex_identifier(c);
                self.state = State::Other;

                identifier
            } else {
                self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ))
            }
        } else {
            self.consume_end()
        }
    }
236
    /// Consumes leading whitespace at the start of a logical line and
    /// computes the line's [`Indentation`].
    ///
    /// Returns `Some` with an indent/dedent (or error) token when the
    /// indentation level changed on a non-blank line; `None` when lexing
    /// should continue with the line's first real token.
    fn eat_indentation(&mut self) -> Option<TokenKind> {
        let mut indentation = Indentation::root();

        loop {
            match self.cursor.first() {
                ' ' => {
                    self.cursor.bump();
                    indentation = indentation.add_space();
                }
                '\t' => {
                    self.cursor.bump();
                    indentation = indentation.add_tab();
                }
                '\\' => {
                    self.cursor.bump();
                    // A line continuation must be followed by `\r`, `\r\n`
                    // or `\n`...
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else if !self.cursor.eat_char('\n') {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::LineContinuationError,
                            TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
                        )));
                    }
                    if self.cursor.is_eof() {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::Eof,
                            self.token_range(),
                        )));
                    }
                    // ...and the indentation count restarts on the new line.
                    indentation = Indentation::root();
                }
                // A form feed also resets the indentation.
                '\x0C' => {
                    self.cursor.bump();
                    indentation = Indentation::root();
                }
                _ => break,
            }
        }

        // Indentation is only significant for lines that contain actual
        // code — not blank lines or comment-only lines.
        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
            self.state = State::NonEmptyLogicalLine;

            return self.handle_indentation(indentation);
        }

        None
    }
287
    /// Compares `indentation` against the indentation stack and produces
    /// an `Indent`/`Dedent` token if the level changed, or `None` if it
    /// stayed the same.
    fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
        match self.indentations.current().try_compare(indentation) {
            // Shallower than the current level: emit one `Dedent` now and
            // keep the target level pending for subsequent `lex_token` calls.
            Ok(Ordering::Greater) => {
                self.pending_indentation = Some(indentation);

                if self.indentations.dedent_one(indentation).is_err() {
                    return Some(self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    )));
                }

                self.cursor.start_token();

                Some(TokenKind::Dedent)
            }

            Ok(Ordering::Equal) => None,

            // Deeper than the current level: push it and emit `Indent`.
            Ok(Ordering::Less) => {
                self.indentations.indent(indentation);
                Some(TokenKind::Indent)
            }
            // Incomparable levels (inconsistent use of tabs and spaces).
            Err(_) => Some(self.push_error(LexicalError::new(
                LexicalErrorType::IndentationError,
                self.token_range(),
            ))),
        }
    }
332
333 fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
334 loop {
335 match self.cursor.first() {
336 ' ' => {
337 self.cursor.bump();
338 }
339 '\t' => {
340 self.cursor.bump();
341 }
342 '\\' => {
343 self.cursor.bump();
344 if self.cursor.eat_char('\r') {
345 self.cursor.eat_char('\n');
346 } else if !self.cursor.eat_char('\n') {
347 return Err(LexicalError::new(
348 LexicalErrorType::LineContinuationError,
349 TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
350 ));
351 }
352 if self.cursor.is_eof() {
353 return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
354 }
355 }
356 '\x0C' => {
358 self.cursor.bump();
359 }
360 _ => break,
361 }
362 }
363
364 Ok(())
365 }
366
    /// Lexes a token starting with the already-consumed ASCII character `c`.
    ///
    /// Leaves the lexer in [`State::Other`] unless an arm returns early
    /// with its own state transition (comments, `=`, newlines, errors).
    fn consume_ascii_character(&mut self, c: char) -> TokenKind {
        let token = match c {
            c if is_ascii_identifier_start(c) => self.lex_identifier(c),
            '0'..='9' => self.lex_number(c),
            // Comments don't change `state`, hence the early return.
            '#' => return self.lex_comment(),
            '\'' | '"' => self.lex_string(c),
            '=' => {
                if self.cursor.eat_char('=') {
                    TokenKind::EqEqual
                } else {
                    // Tracked so IPython escape commands can be recognized
                    // right after an assignment (see the `%`/`!` arm below).
                    self.state = State::AfterEqual;
                    return TokenKind::Equal;
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PlusEqual
                } else {
                    TokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    TokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleStarEqual
                    } else {
                        TokenKind::DoubleStar
                    }
                } else {
                    TokenKind::Star
                }
            }

            // IPython escape command directly after an `=` at top level.
            c @ ('%' | '!')
                if self.mode == Mode::Ipython
                    && self.state.is_after_equal()
                    && self.nesting == 0 =>
            {
                self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap())
            }

            // IPython escape command at the start of a logical line; a
            // two-character kind (e.g. `%%`) is preferred when it matches.
            c @ ('%' | '!' | '?' | '/' | ';' | ',')
                if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
            {
                let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
                    self.cursor.bump();
                    kind
                } else {
                    IpyEscapeKind::try_from(c).unwrap()
                };

                self.lex_ipython_escape_command(kind)
            }

            '?' if self.mode == Mode::Ipython => TokenKind::Question,

            '/' => {
                if self.cursor.eat_char('=') {
                    TokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleSlashEqual
                    } else {
                        TokenKind::DoubleSlash
                    }
                } else {
                    TokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PercentEqual
                } else {
                    TokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    TokenKind::VbarEqual
                } else {
                    TokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    TokenKind::CircumflexEqual
                } else {
                    TokenKind::CircumFlex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AmperEqual
                } else {
                    TokenKind::Amper
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    TokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    TokenKind::Rarrow
                } else {
                    TokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AtEqual
                } else {
                    TokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    TokenKind::NotEqual
                } else {
                    TokenKind::Exclamation
                }
            }
            '~' => TokenKind::Tilde,
            // Brackets adjust the nesting depth; closing brackets saturate
            // at 0 so stray closers don't underflow.
            '(' => {
                self.nesting += 1;
                TokenKind::Lpar
            }
            ')' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rpar
            }
            '[' => {
                self.nesting += 1;
                TokenKind::Lsqb
            }
            ']' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rsqb
            }
            '{' => {
                self.nesting += 1;
                TokenKind::Lbrace
            }
            '}' => {
                if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                    // A lone `}` at the f-/t-string's own nesting level is
                    // invalid (it must be escaped as `}}`).
                    if interpolated_string.nesting() == self.nesting {
                        let error_type = LexicalErrorType::from_interpolated_string_error(
                            InterpolatedStringErrorType::SingleRbrace,
                            interpolated_string.kind(),
                        );
                        return self.push_error(LexicalError::new(error_type, self.token_range()));
                    }
                    interpolated_string.try_end_format_spec(self.nesting);
                }
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rbrace
            }
            ':' => {
                // Inside an f-/t-string interpolation, `:` may start a
                // format spec; in that case `:=` must not be combined.
                if self
                    .interpolated_strings
                    .current_mut()
                    .is_some_and(|interpolated_string| {
                        interpolated_string.try_start_format_spec(self.nesting)
                    })
                {
                    TokenKind::Colon
                } else if self.cursor.eat_char('=') {
                    TokenKind::ColonEqual
                } else {
                    TokenKind::Colon
                }
            }
            ';' => TokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        TokenKind::LeftShiftEqual
                    } else {
                        TokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::LessEqual
                } else {
                    TokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        TokenKind::RightShiftEqual
                    } else {
                        TokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::Greater
                }
            }
            ',' => TokenKind::Comma,
            '.' => {
                if self.cursor.first().is_ascii_digit() {
                    // A float like `.5`.
                    self.lex_decimal_number('.')
                } else if self.cursor.eat_char2('.', '.') {
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }
            '\n' => {
                // A newline is only logical outside brackets and on a
                // non-blank line.
                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }
            '\r' => {
                // `\r\n` counts as a single newline.
                self.cursor.eat_char('\n');

                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }

            _ => {
                self.state = State::Other;

                return self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ));
            }
        };

        self.state = State::Other;

        token
    }
618
    /// Lexes an identifier, a keyword, or a prefixed string literal
    /// (`f"..."`, `rb'...'`, ...) whose first character is `first`.
    fn lex_identifier(&mut self, first: char) -> TokenKind {
        // Detect string prefixes: a one- or two-character prefix directly
        // followed by a quote turns this "identifier" into a string start.
        let quote = match (first, self.cursor.first()) {
            (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
                self.cursor.bump();
                quote
            }),
            (_, second) if is_quote(self.cursor.second()) => {
                self.try_double_char_prefix([first, second]).then(|| {
                    self.cursor.bump();
                    // The `bump` above consumed the second prefix char;
                    // this one consumes the quote itself.
                    self.cursor.bump().unwrap()
                })
            }
            _ => None,
        };

        if let Some(quote) = quote {
            // f-/t-strings are lexed part-by-part; everything else goes
            // through the plain string lexer.
            if self.current_flags.is_interpolated_string() {
                if let Some(kind) = self.lex_interpolated_string_start(quote) {
                    return kind;
                }
            }

            return self.lex_string(quote);
        }

        // Consume the rest of the identifier, tracking whether any
        // non-ASCII character was seen.
        let mut is_ascii = first.is_ascii();
        self.cursor
            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

        let text = self.token_text();

        // Non-ASCII identifiers are NFKC-normalized and can never be
        // keywords.
        if !is_ascii {
            self.current_value = TokenValue::Name(text.nfkc().collect::<Name>());
            return TokenKind::Name;
        }

        // Fast path: every keyword matched below is at most 8 bytes long.
        if text.len() > 8 {
            self.current_value = TokenValue::Name(Name::new(text));
            return TokenKind::Name;
        }

        match text {
            "False" => TokenKind::False,
            "None" => TokenKind::None,
            "True" => TokenKind::True,
            "and" => TokenKind::And,
            "as" => TokenKind::As,
            "assert" => TokenKind::Assert,
            "async" => TokenKind::Async,
            "await" => TokenKind::Await,
            "break" => TokenKind::Break,
            "case" => TokenKind::Case,
            "class" => TokenKind::Class,
            "continue" => TokenKind::Continue,
            "def" => TokenKind::Def,
            "del" => TokenKind::Del,
            "elif" => TokenKind::Elif,
            "else" => TokenKind::Else,
            "except" => TokenKind::Except,
            "finally" => TokenKind::Finally,
            "for" => TokenKind::For,
            "from" => TokenKind::From,
            "global" => TokenKind::Global,
            "if" => TokenKind::If,
            "import" => TokenKind::Import,
            "in" => TokenKind::In,
            "is" => TokenKind::Is,
            "lambda" => TokenKind::Lambda,
            "match" => TokenKind::Match,
            "nonlocal" => TokenKind::Nonlocal,
            "not" => TokenKind::Not,
            "or" => TokenKind::Or,
            "pass" => TokenKind::Pass,
            "raise" => TokenKind::Raise,
            "return" => TokenKind::Return,
            "try" => TokenKind::Try,
            "type" => TokenKind::Type,
            "while" => TokenKind::While,
            "with" => TokenKind::With,
            "yield" => TokenKind::Yield,
            _ => {
                self.current_value = TokenValue::Name(Name::new(text));
                TokenKind::Name
            }
        }
    }
718
719 fn try_single_char_prefix(&mut self, first: char) -> bool {
722 match first {
723 'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
724 't' | 'T' => self.current_flags |= TokenFlags::T_STRING,
725 'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
726 'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
727 'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
728 'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
729 _ => return false,
730 }
731 true
732 }
733
734 fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
737 match value {
738 ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
739 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
740 }
741 ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
742 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
743 }
744 ['r', 't' | 'T'] | ['t' | 'T', 'r'] => {
745 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_LOWERCASE;
746 }
747 ['R', 't' | 'T'] | ['t' | 'T', 'R'] => {
748 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_UPPERCASE;
749 }
750 ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
751 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
752 }
753 ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
754 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
755 }
756 _ => return false,
757 }
758 true
759 }
760
    /// Lexes the start token of an f-/t-string whose opening quote has
    /// already been consumed, or returns `None` if the collected flags
    /// don't actually describe an interpolated string.
    fn lex_interpolated_string_start(&mut self, quote: char) -> Option<TokenKind> {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        if quote == '"' {
            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
        }

        // Two more identical quotes mean a triple-quoted string.
        if self.cursor.eat_char2(quote, quote) {
            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
        }

        let ftcontext = InterpolatedStringContext::new(self.current_flags, self.nesting)?;

        let kind = ftcontext.kind();

        // Push the context so `lex_token` routes subsequent calls through
        // `lex_interpolated_string_middle_or_end`.
        self.interpolated_strings.push(ftcontext);

        Some(kind.start_token())
    }
782
    /// Lexes the literal middle part or the end token of the innermost
    /// f-/t-string.
    ///
    /// Returns `None` when the cursor sits directly on an interpolation
    /// boundary with no literal text in between, in which case normal
    /// token lexing proceeds.
    fn lex_interpolated_string_middle_or_end(&mut self) -> Option<TokenKind> {
        let interpolated_string = self.interpolated_strings.current().unwrap();
        let string_kind = interpolated_string.kind();
        let interpolated_flags = interpolated_string.flags();

        // Check for the closing quote(s) first.
        if interpolated_string.is_triple_quoted() {
            let quote_char = interpolated_string.quote_char();
            if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
                self.current_flags = interpolated_string.flags();
                return Some(string_kind.end_token());
            }
        } else if self.cursor.eat_char(interpolated_string.quote_char()) {
            self.current_flags = interpolated_string.flags();
            return Some(string_kind.end_token());
        }

        // Accumulates the token value with `{{`/`}}` collapsed to a single
        // brace; stays empty as long as no escaped brace has been seen.
        let mut normalized = String::new();

        // Start of the source segment not yet copied into `normalized`.
        let mut last_offset = self.offset();

        let in_format_spec = interpolated_string.is_in_format_spec(self.nesting);

        // Tracks `\N{...}` named-unicode escapes, whose `}` must not be
        // treated as an interpolation end.
        let mut in_named_unicode = false;

        loop {
            match self.cursor.first() {
                // `EOF_CHAR` is also the sentinel for "no char"; confirm
                // the cursor really is at the end of the file.
                EOF_CHAR if self.cursor.is_eof() => {
                    let error = if interpolated_string.is_triple_quoted() {
                        InterpolatedStringErrorType::UnterminatedTripleQuotedString
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                // A line break terminates single-quoted strings.
                '\n' | '\r' if !interpolated_string.is_triple_quoted() => {
                    let error_type = if in_format_spec {
                        InterpolatedStringErrorType::NewlineInFormatSpec
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;

                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error_type, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                '\\' => {
                    self.cursor.bump();
                    // Don't consume a brace after the backslash; it is
                    // re-examined by the next loop iteration.
                    if matches!(self.cursor.first(), '{' | '}') {
                        continue;
                    } else if !interpolated_string.is_raw_string() {
                        // `\N{...}` named-unicode escape (non-raw only).
                        if self.cursor.eat_char2('N', '{') {
                            in_named_unicode = true;
                            continue;
                        }
                    }
                    // Skip the escaped character (or a full `\r\n` pair).
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else {
                        self.cursor.bump();
                    }
                }
                quote @ ('\'' | '"') if quote == interpolated_string.quote_char() => {
                    if let Some(triple_quotes) = interpolated_string.triple_quotes() {
                        // For triple-quoted strings, only a full triple
                        // quote ends the literal part.
                        if self.cursor.rest().starts_with(triple_quotes) {
                            break;
                        }
                        self.cursor.bump();
                    } else {
                        break;
                    }
                }
                '{' => {
                    if self.cursor.second() == '{' && !in_format_spec {
                        // `{{` collapses to `{`: copy the source up to and
                        // including the first brace, then skip the second.
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        // Start of an interpolation: the middle token ends here.
                        break;
                    }
                }
                '}' => {
                    if in_named_unicode {
                        // Closes a `\N{...}` escape, not an interpolation.
                        in_named_unicode = false;
                        self.cursor.bump();
                    } else if self.cursor.second() == '}' && !in_format_spec {
                        // `}}` collapses to `}` (same scheme as `{{` above).
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                _ => {
                    self.cursor.bump();
                }
            }
        }
        let range = self.token_range();
        if range.is_empty() {
            return None;
        }

        // Fast path: no escaped braces were seen, so the raw source slice
        // is the value; otherwise flush the tail into `normalized`.
        let value = if normalized.is_empty() {
            self.source[range].to_string()
        } else {
            normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
            normalized
        };

        self.current_value = TokenValue::InterpolatedStringMiddle(value.into_boxed_str());

        self.current_flags = interpolated_flags;
        Some(string_kind.middle_token())
    }
933
934 fn lex_string(&mut self, quote: char) -> TokenKind {
936 #[cfg(debug_assertions)]
937 debug_assert_eq!(self.cursor.previous(), quote);
938
939 if quote == '"' {
940 self.current_flags |= TokenFlags::DOUBLE_QUOTES;
941 }
942
943 if self.cursor.eat_char2(quote, quote) {
946 self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
947 }
948
949 let value_start = self.offset();
950
951 let quote_byte = u8::try_from(quote).expect("char that fits in u8");
952 let value_end = if self.current_flags.is_triple_quoted() {
953 loop {
956 let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
957 self.cursor.skip_to_end();
958
959 self.current_flags |= TokenFlags::UNCLOSED_STRING;
960 self.push_error(LexicalError::new(
961 LexicalErrorType::UnclosedStringError,
962 self.token_range(),
963 ));
964 break self.offset();
965 };
966
967 let num_backslashes = self.cursor.rest().as_bytes()[..index]
970 .iter()
971 .rev()
972 .take_while(|&&c| c == b'\\')
973 .count();
974
975 self.cursor.skip_bytes(index + 1);
977
978 if num_backslashes % 2 == 1 {
980 continue;
981 }
982
983 if self.cursor.eat_char2(quote, quote) {
985 break self.offset() - TextSize::new(3);
986 }
987 }
988 } else {
989 loop {
992 let Some(index) =
993 memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
994 else {
995 self.cursor.skip_to_end();
996 self.current_flags |= TokenFlags::UNCLOSED_STRING;
997
998 self.push_error(LexicalError::new(
999 LexicalErrorType::UnclosedStringError,
1000 self.token_range(),
1001 ));
1002
1003 break self.offset();
1004 };
1005
1006 let num_backslashes = self.cursor.rest().as_bytes()[..index]
1009 .iter()
1010 .rev()
1011 .take_while(|&&c| c == b'\\')
1012 .count();
1013
1014 self.cursor.skip_bytes(index);
1016
1017 let quote_or_newline = self.cursor.first();
1019
1020 if num_backslashes % 2 == 1 {
1022 self.cursor.bump();
1023 if quote_or_newline == '\r' {
1024 self.cursor.eat_char('\n');
1025 }
1026 continue;
1027 }
1028
1029 match quote_or_newline {
1030 '\r' | '\n' => {
1031 self.current_flags |= TokenFlags::UNCLOSED_STRING;
1032 self.push_error(LexicalError::new(
1033 LexicalErrorType::UnclosedStringError,
1034 self.token_range(),
1035 ));
1036 break self.offset();
1037 }
1038 ch if ch == quote => {
1039 let value_end = self.offset();
1040 self.cursor.bump();
1041 break value_end;
1042 }
1043 _ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
1044 }
1045 }
1046 };
1047
1048 self.current_value = TokenValue::String(
1049 self.source[TextRange::new(value_start, value_end)]
1050 .to_string()
1051 .into_boxed_str(),
1052 );
1053
1054 TokenKind::String
1055 }
1056
1057 fn lex_number(&mut self, first: char) -> TokenKind {
1059 if first == '0' {
1060 if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
1061 self.lex_number_radix(Radix::Hex)
1062 } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
1063 self.lex_number_radix(Radix::Octal)
1064 } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
1065 self.lex_number_radix(Radix::Binary)
1066 } else {
1067 self.lex_decimal_number(first)
1068 }
1069 } else {
1070 self.lex_decimal_number(first)
1071 }
1072 }
1073
1074 fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
1076 #[cfg(debug_assertions)]
1077 debug_assert!(matches!(
1078 self.cursor.previous().to_ascii_lowercase(),
1079 'x' | 'o' | 'b'
1080 ));
1081
1082 let mut number = LexedText::new(self.offset(), self.source);
1084 self.radix_run(&mut number, radix);
1085
1086 let token = &self.source[self.token_range()];
1088
1089 let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
1090 Ok(int) => int,
1091 Err(err) => {
1092 return self.push_error(LexicalError::new(
1093 LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
1094 self.token_range(),
1095 ));
1096 }
1097 };
1098 self.current_value = TokenValue::Int(value);
1099 TokenKind::Int
1100 }
1101
    /// Lexes a decimal integer, float, or complex literal starting with
    /// `first_digit_or_dot` (a digit, or `.` for literals like `.5`).
    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
        // Remembered for the leading-zero check on plain integers below.
        let start_is_zero = first_digit_or_dot == '0';

        let mut number = LexedText::new(self.token_start(), self.source);
        if first_digit_or_dot != '.' {
            number.push(first_digit_or_dot);
            self.radix_run(&mut number, Radix::Decimal);
        }

        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
            number.push('.');

            // An underscore may not directly follow the decimal point.
            if self.cursor.eat_char('_') {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()),
                    TextRange::new(self.offset() - TextSize::new(1), self.offset()),
                ));
            }

            self.radix_run(&mut number, Radix::Decimal);
            true
        } else {
            false
        };

        // An exponent only counts when `e`/`E` is followed by a (possibly
        // signed) digit; otherwise the `e` belongs to a following token.
        let is_float = match self.cursor.rest().as_bytes() {
            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
                // Consume the `e`/`E` itself.
                number.push(self.cursor.bump().unwrap());

                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
                    number.push(sign);
                }

                self.radix_run(&mut number, Radix::Decimal);

                true
            }
            _ => is_float,
        };

        if is_float {
            let Ok(value) = f64::from_str(number.as_str()) else {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(
                        "Invalid decimal literal".to_string().into_boxed_str(),
                    ),
                    self.token_range(),
                ));
            };

            // A trailing `j`/`J` makes this an imaginary (complex) literal.
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                self.current_value = TokenValue::Complex {
                    real: 0.0,
                    imag: value,
                };
                TokenKind::Complex
            } else {
                self.current_value = TokenValue::Float(value);
                TokenKind::Float
            }
        } else {
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                let imag = f64::from_str(number.as_str()).unwrap();
                self.current_value = TokenValue::Complex { real: 0.0, imag };
                TokenKind::Complex
            } else {
                let value = match Int::from_str(number.as_str()) {
                    Ok(value) => {
                        // Reject leading zeros on non-zero integers (e.g. `012`).
                        if start_is_zero && value.as_u8() != Some(0) {
                            return self.push_error(LexicalError::new(
                                LexicalErrorType::OtherError(
                                    "Invalid decimal integer literal"
                                        .to_string()
                                        .into_boxed_str(),
                                ),
                                self.token_range(),
                            ));
                        }
                        value
                    }
                    Err(err) => {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                            self.token_range(),
                        ));
                    }
                };
                self.current_value = TokenValue::Int(value);
                TokenKind::Int
            }
        }
    }
1203
1204 fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
1208 loop {
1209 if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
1210 number.push(c);
1211 }
1212 else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
1214 self.cursor.bump();
1216 number.skip_char();
1217 } else {
1218 break;
1219 }
1220 }
1221 }
1222
1223 fn lex_comment(&mut self) -> TokenKind {
1225 #[cfg(debug_assertions)]
1226 debug_assert_eq!(self.cursor.previous(), '#');
1227
1228 let bytes = self.cursor.rest().as_bytes();
1229 let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
1230 self.cursor.skip_bytes(offset);
1231
1232 TokenKind::Comment
1233 }
1234
    /// Lexes an IPython escape command (e.g. `%magic`, `!shell`, `obj?`),
    /// collecting everything up to the end of the line into the token value.
    fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind {
        let mut value = String::new();

        loop {
            match self.cursor.first() {
                '\\' => {
                    // A backslash immediately followed by a line break
                    // continues the command on the next line and is
                    // dropped from the value.
                    if self.cursor.second() == '\r' {
                        self.cursor.bump();
                        self.cursor.bump();
                        self.cursor.eat_char('\n');
                        continue;
                    } else if self.cursor.second() == '\n' {
                        self.cursor.bump();
                        self.cursor.bump();
                        continue;
                    }

                    self.cursor.bump();
                    value.push('\\');
                }
                '?' => {
                    self.cursor.bump();
                    let mut question_count = 1u32;
                    while self.cursor.eat_char('?') {
                        question_count += 1;
                    }

                    // Only one or two `?` at the very end of the line,
                    // directly after a non-whitespace character, turn the
                    // command into a help query; otherwise the question
                    // marks are just part of the value.
                    if question_count > 2
                        || value.chars().last().is_none_or(is_python_whitespace)
                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
                    {
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    if escape_kind.is_help() {
                        // Leading `?`/spaces already encode "help"; strip them.
                        value = value.trim_start_matches([' ', '?']).to_string();
                    } else if escape_kind.is_magic() {
                        // Keep the magic prefix as part of the value.
                        value.insert_str(0, escape_kind.as_str());
                    }

                    let kind = match question_count {
                        1 => IpyEscapeKind::Help,
                        2 => IpyEscapeKind::Help2,
                        _ => unreachable!("`question_count` is always 1 or 2"),
                    };

                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                '\n' | '\r' | EOF_CHAR => {
                    // The command runs to the end of the line.
                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind: escape_kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                c => {
                    self.cursor.bump();
                    value.push(c);
                }
            }
        }
    }
1352
    /// Produces the tokens that finish off the file: errors for any
    /// unterminated f-/t-strings and unclosed parentheses, then a final
    /// `Newline` (if missing), remaining `Dedent`s, and `EndOfFile`.
    fn consume_end(&mut self) -> TokenKind {
        // Any interpolated string still on the stack is unterminated.
        while let Some(interpolated_string) = self.interpolated_strings.pop() {
            self.nesting = interpolated_string.nesting();
            self.push_error(LexicalError::new(
                LexicalErrorType::from_interpolated_string_error(
                    InterpolatedStringErrorType::UnterminatedString,
                    interpolated_string.kind(),
                ),
                self.token_range(),
            ));
        }

        // `ParenthesizedExpression` mode starts with one implicit paren.
        let init_nesting = u32::from(self.mode == Mode::ParenthesizedExpression);

        if self.nesting > init_nesting {
            // Reset so the error is only reported once.
            self.nesting = 0;
            return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
        }

        // First insert a trailing newline, if one is missing...
        if !self.state.is_new_logical_line() {
            self.state = State::AfterNewline;
            TokenKind::Newline
        }
        // ...then unwind the indentation stack one `Dedent` per call...
        else if self.indentations.dedent().is_some() {
            TokenKind::Dedent
        } else {
            TokenKind::EndOfFile
        }
    }
1391
    /// Re-lexes the current token starting from `non_logical_newline_start`
    /// with one parenthesis level removed, so that a previously
    /// non-logical newline can be re-interpreted as a logical one
    /// (error recovery for unclosed parentheses).
    ///
    /// Returns `true` if a token was actually re-lexed.
    pub(crate) fn re_lex_logical_token(
        &mut self,
        non_logical_newline_start: Option<TextSize>,
    ) -> bool {
        if self.nesting == 0 {
            return false;
        }

        // Drop the parenthesis level the caller decided is unclosed.
        self.nesting -= 1;

        // NOTE(review): triple-quoted f-/t-strings may legitimately span
        // newlines, so re-lexing from the newline is skipped for them —
        // confirm against the callers of this method.
        if self.current_flags.is_triple_quoted_interpolated_string() {
            return false;
        }

        let Some(new_position) = non_logical_newline_start else {
            return false;
        };

        // A closing bracket as the current token already decremented
        // `nesting` when it was lexed; this appears to compensate for
        // that before re-lexing (TODO confirm).
        if matches!(
            self.current_kind,
            TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
        ) {
            self.nesting += 1;
        }

        // Rewind the cursor to the newline position and lex again.
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(new_position.to_usize());
        self.state = State::Other;
        self.next_token();
        true
    }
1495
    /// Re-interprets an unclosed, prefix-less string token inside an
    /// interpolation element as the *closing* quote(s) of the enclosing
    /// f/t-string, when its quotes match the enclosing string exactly.
    pub(crate) fn re_lex_string_token_in_interpolation_element(
        &mut self,
        kind: InterpolatedStringKind,
    ) {
        // Only applicable while inside an interpolated string.
        let Some(interpolated_string) = self.interpolated_strings.current() else {
            return;
        };

        let current_string_flags = self.current_flags().as_any_string_flags();

        // The token must be an unclosed string with no prefix whose quote
        // character and triple-quotedness match the enclosing interpolated
        // string — otherwise it cannot be that string's terminator.
        if !matches!(self.current_kind, TokenKind::String)
            || !self.current_flags.is_unclosed()
            || current_string_flags.prefix() != AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
            || current_string_flags.quote_style().as_char() != interpolated_string.quote_char()
            || current_string_flags.is_triple_quoted() != interpolated_string.is_triple_quoted()
        {
            return;
        }

        // Everything from just after the quote(s) up to the end of the line
        // (or a `#` comment) must be whitespace; any other character means
        // the quotes plausibly start a genuine nested string literal.
        let first_line = &self.source
            [(self.current_range.start() + current_string_flags.quote_len()).to_usize()..];

        for c in first_line.chars() {
            if matches!(c, '\n' | '\r' | '#') {
                break;
            }

            if !is_python_whitespace(c) {
                return;
            }
        }

        // Drop the "unclosed string" error that was reported for this token,
        // since the token is being reinterpreted as a terminator.
        if self.errors.last().is_some_and(|error| {
            error.location() == self.current_range
                && matches!(error.error(), LexicalErrorType::UnclosedStringError)
        }) {
            self.errors.pop();
        }

        // Shrink the token to just the quote characters and turn it into the
        // end token of the enclosing f/t-string.
        self.current_range =
            TextRange::at(self.current_range.start(), self.current_flags.quote_len());
        self.current_kind = kind.end_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();

        // The interpolated string is now closed: restore the nesting level it
        // recorded and pop it off the stack.
        self.nesting = interpolated_string.nesting();
        self.interpolated_strings.pop();

        // Resume lexing immediately after the closing quote(s).
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(self.current_range.end().to_usize());
    }
1559
    /// Re-interprets an unclosed raw-string token (lowercase `r` prefix only)
    /// as the plain identifier `r` followed by the quote character.
    ///
    /// NOTE(review): per the name this is used while lexing inside a format
    /// spec — confirm the exact trigger with the parser-side caller.
    pub(crate) fn re_lex_raw_string_in_format_spec(&mut self) {
        if matches!(self.current_kind, TokenKind::String)
            && self.current_flags.is_unclosed()
            && self.current_flags.prefix()
                == AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
        {
            // Drop the "unclosed string" error reported for this token; the
            // token is no longer lexed as a string.
            if self.errors.last().is_some_and(|error| {
                error.location() == self.current_range
                    && matches!(error.error(), LexicalErrorType::UnclosedStringError)
            }) {
                self.errors.pop();
            }

            // Shrink the token to just the `r`, emit it as a name, and resume
            // lexing right after it (i.e. at the quote character).
            self.current_range = TextRange::at(self.current_range.start(), 'r'.text_len());
            self.current_kind = TokenKind::Name;
            self.current_value = TokenValue::Name(Name::new_static("r"));
            self.current_flags = TokenFlags::empty();
            self.cursor = Cursor::new(self.source);
            self.cursor.skip_bytes(self.current_range.end().to_usize());
        }
    }
1596
1597 #[inline]
1598 fn token_range(&self) -> TextRange {
1599 let end = self.offset();
1600 let len = self.cursor.token_len();
1601
1602 TextRange::at(end - len, len)
1603 }
1604
1605 #[inline]
1606 fn token_text(&self) -> &'src str {
1607 &self.source[self.token_range()]
1608 }
1609
1610 #[expect(clippy::cast_possible_truncation)]
1613 #[inline]
1614 fn offset(&self) -> TextSize {
1615 TextSize::new(self.source.len() as u32) - self.cursor.text_len()
1616 }
1617
1618 #[inline]
1619 fn token_start(&self) -> TextSize {
1620 self.token_range().start()
1621 }
1622
1623 pub(crate) fn checkpoint(&self) -> LexerCheckpoint {
1625 LexerCheckpoint {
1626 value: self.current_value.clone(),
1627 current_kind: self.current_kind,
1628 current_range: self.current_range,
1629 current_flags: self.current_flags,
1630 cursor_offset: self.offset(),
1631 state: self.state,
1632 nesting: self.nesting,
1633 indentations_checkpoint: self.indentations.checkpoint(),
1634 pending_indentation: self.pending_indentation,
1635 interpolated_strings_checkpoint: self.interpolated_strings.checkpoint(),
1636 errors_position: self.errors.len(),
1637 }
1638 }
1639
1640 pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
1642 let LexerCheckpoint {
1643 value,
1644 current_kind,
1645 current_range,
1646 current_flags,
1647 cursor_offset,
1648 state,
1649 nesting,
1650 indentations_checkpoint,
1651 pending_indentation,
1652 interpolated_strings_checkpoint,
1653 errors_position,
1654 } = checkpoint;
1655
1656 let mut cursor = Cursor::new(self.source);
1657 cursor.skip_bytes(cursor_offset.to_usize());
1659
1660 self.current_value = value;
1661 self.current_kind = current_kind;
1662 self.current_range = current_range;
1663 self.current_flags = current_flags;
1664 self.cursor = cursor;
1665 self.state = state;
1666 self.nesting = nesting;
1667 self.indentations.rewind(indentations_checkpoint);
1668 self.pending_indentation = pending_indentation;
1669 self.interpolated_strings
1670 .rewind(interpolated_strings_checkpoint);
1671 self.errors.truncate(errors_position);
1672 }
1673
1674 pub fn finish(self) -> Vec<LexicalError> {
1675 self.errors
1676 }
1677}
1678
/// A snapshot of the lexer's mutable state, created by [`Lexer::checkpoint`]
/// and consumed by [`Lexer::rewind`].
pub(crate) struct LexerCheckpoint {
    /// The current token's value.
    value: TokenValue,
    /// The current token's kind.
    current_kind: TokenKind,
    /// The current token's source range.
    current_range: TextRange,
    /// The current token's flags.
    current_flags: TokenFlags,
    /// Absolute byte offset of the cursor; the cursor itself is rebuilt on
    /// rewind rather than stored.
    cursor_offset: TextSize,
    state: State,
    nesting: u32,
    indentations_checkpoint: IndentationsCheckpoint,
    pending_indentation: Option<Indentation>,
    interpolated_strings_checkpoint: InterpolatedStringsCheckpoint,
    /// Length of the error vector at checkpoint time; later errors are
    /// truncated on rewind.
    errors_position: usize,
}
1692
/// The lexer's position relative to the structure of the current line.
#[derive(Copy, Clone, Debug)]
enum State {
    /// At the beginning of the file or right after a `Newline` token.
    AfterNewline,

    /// At the start of a new logical line (counts as such in
    /// `is_new_logical_line`). NOTE(review): presumably after the line's
    /// indentation has been handled — confirm with the indent lexing code.
    NonEmptyLogicalLine,

    /// Right after an `=` token. NOTE(review): consumer of this state is
    /// outside this view — confirm its purpose there.
    AfterEqual,

    /// Anywhere else inside a logical line.
    Other,
}
1707
1708impl State {
1709 const fn is_after_newline(self) -> bool {
1710 matches!(self, State::AfterNewline)
1711 }
1712
1713 const fn is_new_logical_line(self) -> bool {
1714 matches!(self, State::AfterNewline | State::NonEmptyLogicalLine)
1715 }
1716
1717 const fn is_after_equal(self) -> bool {
1718 matches!(self, State::AfterEqual)
1719 }
1720}
1721
/// The numeric base of an integer literal being lexed.
#[derive(Copy, Clone, Debug)]
enum Radix {
    /// Base 2 (`0b` prefix).
    Binary,
    /// Base 8 (`0o` prefix).
    Octal,
    /// Base 10 (no prefix).
    Decimal,
    /// Base 16 (`0x` prefix).
    Hex,
}
1729
1730impl Radix {
1731 const fn as_u32(self) -> u32 {
1732 match self {
1733 Radix::Binary => 2,
1734 Radix::Octal => 8,
1735 Radix::Decimal => 10,
1736 Radix::Hex => 16,
1737 }
1738 }
1739
1740 const fn is_digit(self, c: char) -> bool {
1741 match self {
1742 Radix::Binary => matches!(c, '0'..='1'),
1743 Radix::Octal => matches!(c, '0'..='7'),
1744 Radix::Decimal => c.is_ascii_digit(),
1745 Radix::Hex => c.is_ascii_hexdigit(),
1746 }
1747 }
1748}
1749
/// Returns `true` if `c` can open or close a Python string literal.
const fn is_quote(c: char) -> bool {
    c == '\'' || c == '"'
}
1753
/// Returns `true` if `c` can start an identifier, considering ASCII
/// characters only (digits are excluded from the start position).
const fn is_ascii_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
1757
/// Returns `true` if `c` may start an identifier per the Unicode
/// `XID_Start` property (used for the non-ASCII fallback path).
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
1763
1764fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
1771 if c.is_ascii() {
1774 matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
1775 } else {
1776 *identifier_is_ascii_only = false;
1777 is_xid_continue(c)
1778 }
1779}
1780
/// Text assembled while lexing that stays borrowed from the source for as
/// long as possible, falling back to an owned buffer once the lexed text
/// diverges from the source (see `skip_char`).
enum LexedText<'a> {
    /// Borrowed: a sub-range of the original source.
    Source { source: &'a str, range: TextRange },
    /// Owned: accumulated separately from the source.
    Owned(String),
}
1785
1786impl<'a> LexedText<'a> {
1787 fn new(start: TextSize, source: &'a str) -> Self {
1788 Self::Source {
1789 range: TextRange::empty(start),
1790 source,
1791 }
1792 }
1793
1794 fn push(&mut self, c: char) {
1795 match self {
1796 LexedText::Source { range, source } => {
1797 *range = range.add_end(c.text_len());
1798 debug_assert!(source[*range].ends_with(c));
1799 }
1800 LexedText::Owned(owned) => owned.push(c),
1801 }
1802 }
1803
1804 fn as_str<'b>(&'b self) -> &'b str
1805 where
1806 'b: 'a,
1807 {
1808 match self {
1809 LexedText::Source { range, source } => &source[*range],
1810 LexedText::Owned(owned) => owned,
1811 }
1812 }
1813
1814 fn skip_char(&mut self) {
1815 match self {
1816 LexedText::Source { range, source } => {
1817 *self = LexedText::Owned(source[*range].to_string());
1818 }
1819 LexedText::Owned(_) => {}
1820 }
1821 }
1822}
1823
/// Creates a [`Lexer`] that lexes `source` from its start (offset 0) in the
/// given `mode`.
pub fn lex(source: &str, mode: Mode) -> Lexer<'_> {
    Lexer::new(source, mode, TextSize::default())
}
1828