1use std::cmp::Ordering;
10use std::str::FromStr;
11
12use unicode_ident::{is_xid_continue, is_xid_start};
13use unicode_normalization::UnicodeNormalization;
14
15use ruff_python_ast::name::Name;
16use ruff_python_ast::str_prefix::{AnyStringPrefix, StringLiteralPrefix};
17use ruff_python_ast::token::{TokenFlags, TokenKind};
18use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
19use ruff_python_trivia::is_python_whitespace;
20use ruff_text_size::{TextLen, TextRange, TextSize};
21
22use crate::Mode;
23use crate::error::{InterpolatedStringErrorType, LexicalError, LexicalErrorType};
24use crate::lexer::cursor::{Cursor, EOF_CHAR};
25use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
26use crate::lexer::interpolated_string::{
27 InterpolatedStringContext, InterpolatedStrings, InterpolatedStringsCheckpoint,
28};
29use crate::string::InterpolatedStringKind;
30use crate::token::TokenValue;
31
// Internal lexer submodules: low-level character cursor, indentation
// bookkeeping, and f/t-string (interpolated string) context tracking.
mod cursor;
mod indentation;
mod interpolated_string;

/// Unicode byte-order mark; eaten at offset 0 by `Lexer::new` if present.
const BOM: char = '\u{feff}';
37
#[derive(Debug)]
pub struct Lexer<'src> {
    /// Source code to be lexed.
    source: &'src str,

    /// A pointer to the current character of the source code being lexed.
    cursor: Cursor<'src>,

    /// The kind of the current (most recently lexed) token.
    current_kind: TokenKind,

    /// The range of the current token.
    current_range: TextRange,

    /// The value of the current token, taken by `take_value`.
    current_value: TokenValue,

    /// Flags (prefix/quote/unclosed information) for the current token.
    current_flags: TokenFlags,

    /// Lexer state (start of line, after `=`, etc.).
    state: State,

    /// Current depth of open parentheses/brackets/braces.
    nesting: u32,

    /// Stack of indentation levels for the enclosing blocks.
    indentations: Indentations,
    // A dedent target still being unwound; `lex_token` emits one `Dedent`
    // per call until the stack matches this indentation again.
    pending_indentation: Option<Indentation>,

    /// Lexer mode (module, expression, IPython, ...).
    mode: Mode,

    /// Stack of active f/t-string contexts.
    interpolated_strings: InterpolatedStrings,

    /// Errors encountered while lexing; reported alongside the tokens.
    errors: Vec<LexicalError>,
}
79
80impl<'src> Lexer<'src> {
    /// Creates a lexer for `source` that starts lexing at `start_offset`.
    ///
    /// In [`Mode::ParenthesizedExpression`] the lexer behaves as if it were
    /// already inside one pair of parentheses (nesting 1, no logical-line
    /// handling); otherwise it starts at the beginning of a logical line.
    ///
    /// # Panics
    ///
    /// Panics if the source is larger than 4GB, because offsets are stored
    /// as `u32`.
    pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
        assert!(
            u32::try_from(source.len()).is_ok(),
            "Lexer only supports files with a size up to 4GB"
        );

        let (state, nesting) = if mode == Mode::ParenthesizedExpression {
            (State::Other, 1)
        } else {
            (State::AfterNewline, 0)
        };

        let mut lexer = Lexer {
            source,
            cursor: Cursor::new(source),
            state,
            current_kind: TokenKind::EndOfFile,
            current_range: TextRange::empty(start_offset),
            current_value: TokenValue::None,
            current_flags: TokenFlags::empty(),
            nesting,
            indentations: Indentations::default(),
            pending_indentation: None,
            mode,
            interpolated_strings: InterpolatedStrings::default(),
            errors: Vec::new(),
        };

        // A BOM is only skipped when lexing from the very start of the file;
        // otherwise jump straight to the requested offset.
        if start_offset == TextSize::new(0) {
            lexer.cursor.eat_char(BOM);
        } else {
            lexer.cursor.skip_bytes(start_offset.to_usize());
        }

        lexer
    }
123
    /// Returns the kind of the most recently lexed token.
    pub(crate) fn current_kind(&self) -> TokenKind {
        self.current_kind
    }
128
    /// Returns the range of the most recently lexed token.
    pub(crate) fn current_range(&self) -> TextRange {
        self.current_range
    }
133
    /// Returns the flags of the most recently lexed token.
    pub(crate) fn current_flags(&self) -> TokenFlags {
        self.current_flags
    }
138
    /// Takes ownership of the current token's value, leaving
    /// `TokenValue::None` in its place.
    pub(crate) fn take_value(&mut self) -> TokenValue {
        std::mem::take(&mut self.current_value)
    }
147
    /// Records `error`, points `current_range` at its location, and returns
    /// `TokenKind::Unknown` so the caller can surface an error token.
    fn push_error(&mut self, error: LexicalError) -> TokenKind {
        self.current_range = error.location();
        self.errors.push(error);
        TokenKind::Unknown
    }
155
    /// Lexes and returns the next token, resetting the per-token value and
    /// flags first.
    pub fn next_token(&mut self) -> TokenKind {
        self.cursor.start_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();
        self.current_kind = self.lex_token();
        // For `Unknown` tokens, `push_error` already set `current_range` to
        // the error location; don't overwrite it.
        if !matches!(self.current_kind, TokenKind::Unknown) {
            self.current_range = self.token_range();
        }
        self.current_kind
    }
168
169 fn lex_token(&mut self) -> TokenKind {
170 if let Some(interpolated_string) = self.interpolated_strings.current()
171 && !interpolated_string.is_in_interpolation(self.nesting)
172 {
173 if let Some(token) = self.lex_interpolated_string_middle_or_end() {
174 if token.is_interpolated_string_end() {
175 self.interpolated_strings.pop();
176 }
177 return token;
178 }
179 }
180 else if let Some(indentation) = self.pending_indentation.take() {
182 match self.indentations.current().try_compare(indentation) {
183 Ok(Ordering::Greater) => {
184 self.pending_indentation = Some(indentation);
185 if self.indentations.dedent_one(indentation).is_err() {
186 return self.push_error(LexicalError::new(
187 LexicalErrorType::IndentationError,
188 self.token_range(),
189 ));
190 }
191 return TokenKind::Dedent;
192 }
193 Ok(_) => {}
194 Err(_) => {
195 return self.push_error(LexicalError::new(
196 LexicalErrorType::IndentationError,
197 self.token_range(),
198 ));
199 }
200 }
201 }
202
203 if self.state.is_after_newline() {
204 if let Some(indentation) = self.eat_indentation() {
205 return indentation;
206 }
207 } else if let Err(error) = self.skip_whitespace() {
208 return self.push_error(error);
209 }
210
211 self.cursor.start_token();
213
214 if let Some(c) = self.cursor.bump() {
215 if c.is_ascii() {
216 self.consume_ascii_character(c)
217 } else if is_unicode_identifier_start(c) {
218 let identifier = self.lex_identifier(c);
219 self.state = State::Other;
220
221 identifier
222 } else {
223 self.push_error(LexicalError::new(
224 LexicalErrorType::UnrecognizedToken { tok: c },
225 self.token_range(),
226 ))
227 }
228 } else {
229 self.consume_end()
232 }
233 }
234
    /// Consumes the leading whitespace of a (potential) logical line and,
    /// when the line turns out to be non-empty, delegates to
    /// [`Self::handle_indentation`] to emit an `Indent`/`Dedent` token.
    /// Returns `None` for blank lines, comment-only lines, EOF, or when the
    /// indentation level is unchanged.
    fn eat_indentation(&mut self) -> Option<TokenKind> {
        let mut indentation = Indentation::root();

        loop {
            match self.cursor.first() {
                ' ' => {
                    self.cursor.bump();
                    indentation = indentation.add_space();
                }
                '\t' => {
                    self.cursor.bump();
                    indentation = indentation.add_tab();
                }
                '\\' => {
                    // A backslash here is only valid as a line continuation.
                    self.cursor.bump();
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else if !self.cursor.eat_char('\n') {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::LineContinuationError,
                            TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
                        )));
                    }
                    if self.cursor.is_eof() {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::Eof,
                            self.token_range(),
                        )));
                    }
                    // Indentation is measured afresh on the continued line.
                    indentation = Indentation::root();
                }
                '\x0C' => {
                    // A form feed resets the indentation measurement.
                    self.cursor.bump();
                    indentation = Indentation::root();
                }
                _ => break,
            }
        }

        // Only a line that actually contains a token affects indentation;
        // lines holding just a comment or a line break are skipped.
        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
            self.state = State::NonEmptyLogicalLine;

            return self.handle_indentation(indentation);
        }

        None
    }
285
    /// Compares `indentation` against the top of the indentation stack and
    /// returns the resulting `Indent`/`Dedent` token, if any.
    fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
        match self.indentations.current().try_compare(indentation) {
            // Dedent: pop one level now and keep the target pending so that
            // `lex_token` can emit further `Dedent` tokens on later calls.
            Ok(Ordering::Greater) => {
                self.pending_indentation = Some(indentation);

                if self.indentations.dedent_one(indentation).is_err() {
                    return Some(self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    )));
                }

                // Restart the token range before emitting the `Dedent`.
                self.cursor.start_token();

                Some(TokenKind::Dedent)
            }

            Ok(Ordering::Equal) => None,

            // Indent: push the new, deeper level.
            Ok(Ordering::Less) => {
                self.indentations.indent(indentation);
                Some(TokenKind::Indent)
            }
            // `try_compare` failed — presumably an ambiguous tab/space mix;
            // see `Indentation::try_compare` for the exact condition.
            Err(_) => Some(self.push_error(LexicalError::new(
                LexicalErrorType::IndentationError,
                self.token_range(),
            ))),
        }
    }
330
331 fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
332 loop {
333 match self.cursor.first() {
334 ' ' => {
335 self.cursor.bump();
336 }
337 '\t' => {
338 self.cursor.bump();
339 }
340 '\\' => {
341 self.cursor.bump();
342 if self.cursor.eat_char('\r') {
343 self.cursor.eat_char('\n');
344 } else if !self.cursor.eat_char('\n') {
345 return Err(LexicalError::new(
346 LexicalErrorType::LineContinuationError,
347 TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
348 ));
349 }
350 if self.cursor.is_eof() {
351 return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
352 }
353 }
354 '\x0C' => {
356 self.cursor.bump();
357 }
358 _ => break,
359 }
360 }
361
362 Ok(())
363 }
364
    /// Lexes a token whose first character is the already-consumed ASCII
    /// character `c`.
    ///
    /// Sets `self.state` to `Other` for every token produced through the
    /// `match`; arms that `return` early (comments, `=`, newlines, errors)
    /// manage the state themselves.
    fn consume_ascii_character(&mut self, c: char) -> TokenKind {
        let token = match c {
            c if is_ascii_identifier_start(c) => self.lex_identifier(c),
            '0'..='9' => self.lex_number(c),
            // Comments return early so `self.state` is left untouched.
            '#' => return self.lex_comment(),
            '\'' | '"' => self.lex_string(c),
            '=' => {
                if self.cursor.eat_char('=') {
                    TokenKind::EqEqual
                } else {
                    // `AfterEqual` enables IPython escape commands directly
                    // after an assignment (see the `%`/`!` arm below).
                    self.state = State::AfterEqual;
                    return TokenKind::Equal;
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PlusEqual
                } else {
                    TokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    TokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleStarEqual
                    } else {
                        TokenKind::DoubleStar
                    }
                } else {
                    TokenKind::Star
                }
            }

            // IPython escape command right after `=` (e.g. `foo = !ls`).
            c @ ('%' | '!')
                if self.mode == Mode::Ipython
                    && self.state.is_after_equal()
                    && self.nesting == 0 =>
            {
                self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap())
            }

            // IPython escape command at the start of a logical line; a
            // two-character escape (e.g. `%%`) is preferred when it parses.
            c @ ('%' | '!' | '?' | '/' | ';' | ',')
                if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
            {
                let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
                    self.cursor.bump();
                    kind
                } else {
                    IpyEscapeKind::try_from(c).unwrap()
                };

                self.lex_ipython_escape_command(kind)
            }

            '?' if self.mode == Mode::Ipython => TokenKind::Question,

            '/' => {
                if self.cursor.eat_char('=') {
                    TokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleSlashEqual
                    } else {
                        TokenKind::DoubleSlash
                    }
                } else {
                    TokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PercentEqual
                } else {
                    TokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    TokenKind::VbarEqual
                } else {
                    TokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    TokenKind::CircumflexEqual
                } else {
                    TokenKind::CircumFlex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AmperEqual
                } else {
                    TokenKind::Amper
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    TokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    TokenKind::Rarrow
                } else {
                    TokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AtEqual
                } else {
                    TokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    TokenKind::NotEqual
                } else {
                    TokenKind::Exclamation
                }
            }
            '~' => TokenKind::Tilde,
            '(' => {
                self.nesting += 1;
                TokenKind::Lpar
            }
            ')' => {
                // `saturating_sub` guards against unbalanced closers.
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rpar
            }
            '[' => {
                self.nesting += 1;
                TokenKind::Lsqb
            }
            ']' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rsqb
            }
            '{' => {
                self.nesting += 1;
                TokenKind::Lbrace
            }
            '}' => {
                if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                    // A lone `}` at the f/t-string's own nesting level is a
                    // syntax error (it must be escaped as `}}`).
                    if interpolated_string.nesting() == self.nesting {
                        let error_type = LexicalErrorType::from_interpolated_string_error(
                            InterpolatedStringErrorType::SingleRbrace,
                            interpolated_string.kind(),
                        );
                        return self.push_error(LexicalError::new(error_type, self.token_range()));
                    }
                    interpolated_string.try_end_format_spec(self.nesting);
                }
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rbrace
            }
            ':' => {
                // A `:` that starts an f/t-string format spec is always a
                // plain colon, even if followed by `=`.
                if self
                    .interpolated_strings
                    .current_mut()
                    .is_some_and(|interpolated_string| {
                        interpolated_string.try_start_format_spec(self.nesting)
                    })
                {
                    TokenKind::Colon
                } else if self.cursor.eat_char('=') {
                    TokenKind::ColonEqual
                } else {
                    TokenKind::Colon
                }
            }
            ';' => TokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        TokenKind::LeftShiftEqual
                    } else {
                        TokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::LessEqual
                } else {
                    TokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        TokenKind::RightShiftEqual
                    } else {
                        TokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::Greater
                }
            }
            ',' => TokenKind::Comma,
            '.' => {
                if self.cursor.first().is_ascii_digit() {
                    // A float literal like `.5`.
                    self.lex_decimal_number('.')
                } else if self.cursor.eat_char2('.', '.') {
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }
            '\n' => {
                // Only a logical `Newline` outside parentheses and on a
                // non-empty line; otherwise it's non-logical.
                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }
            '\r' => {
                // Normalize `\r\n` to a single line break.
                self.cursor.eat_char('\n');

                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }

            _ => {
                self.state = State::Other;

                return self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ));
            }
        };

        self.state = State::Other;

        token
    }
616
    /// Lexes an identifier or keyword starting at `first` (already consumed),
    /// or — when `first` turns out to be a one- or two-character string
    /// prefix followed by a quote — a string/f-string/t-string literal.
    fn lex_identifier(&mut self, first: char) -> TokenKind {
        // Detect a string literal disguised as an identifier: a valid prefix
        // (set into `current_flags` by the `try_*_prefix` helpers) directly
        // followed by a quote character.
        let quote = match (first, self.cursor.first()) {
            (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
                self.cursor.bump();
                quote
            }),
            (_, second) if is_quote(self.cursor.second()) => {
                self.try_double_char_prefix([first, second]).then(|| {
                    self.cursor.bump();
                    self.cursor.bump().unwrap()
                })
            }
            _ => None,
        };

        if let Some(quote) = quote {
            if self.current_flags.is_interpolated_string()
                && let Some(kind) = self.lex_interpolated_string_start(quote)
            {
                return kind;
            }

            return self.lex_string(quote);
        }

        // Track whether all characters stay ASCII so the common case can
        // skip NFKC normalization below.
        let mut is_ascii = first.is_ascii();
        self.cursor
            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

        let text = self.token_text();

        if !is_ascii {
            // Non-ASCII identifiers are NFKC-normalized per the Python spec.
            self.current_value = TokenValue::Name(text.nfkc().collect::<Name>());
            return TokenKind::Name;
        }

        // No keyword is longer than 8 characters ("continue", "nonlocal"),
        // so longer identifiers can't be keywords.
        if text.len() > 8 {
            self.current_value = TokenValue::Name(Name::new(text));
            return TokenKind::Name;
        }

        match text {
            "False" => TokenKind::False,
            "None" => TokenKind::None,
            "True" => TokenKind::True,
            "and" => TokenKind::And,
            "as" => TokenKind::As,
            "assert" => TokenKind::Assert,
            "async" => TokenKind::Async,
            "await" => TokenKind::Await,
            "break" => TokenKind::Break,
            "case" => TokenKind::Case,
            "class" => TokenKind::Class,
            "continue" => TokenKind::Continue,
            "def" => TokenKind::Def,
            "del" => TokenKind::Del,
            "elif" => TokenKind::Elif,
            "else" => TokenKind::Else,
            "except" => TokenKind::Except,
            "finally" => TokenKind::Finally,
            "for" => TokenKind::For,
            "from" => TokenKind::From,
            "global" => TokenKind::Global,
            "if" => TokenKind::If,
            "import" => TokenKind::Import,
            "in" => TokenKind::In,
            "is" => TokenKind::Is,
            "lambda" => TokenKind::Lambda,
            "match" => TokenKind::Match,
            "nonlocal" => TokenKind::Nonlocal,
            "not" => TokenKind::Not,
            "or" => TokenKind::Or,
            "pass" => TokenKind::Pass,
            "raise" => TokenKind::Raise,
            "return" => TokenKind::Return,
            "try" => TokenKind::Try,
            "type" => TokenKind::Type,
            "while" => TokenKind::While,
            "with" => TokenKind::With,
            "yield" => TokenKind::Yield,
            _ => {
                self.current_value = TokenValue::Name(Name::new(text));
                TokenKind::Name
            }
        }
    }
716
717 fn try_single_char_prefix(&mut self, first: char) -> bool {
720 match first {
721 'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
722 't' | 'T' => self.current_flags |= TokenFlags::T_STRING,
723 'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
724 'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
725 'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
726 'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
727 _ => return false,
728 }
729 true
730 }
731
732 fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
735 match value {
736 ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
737 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
738 }
739 ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
740 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
741 }
742 ['r', 't' | 'T'] | ['t' | 'T', 'r'] => {
743 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_LOWERCASE;
744 }
745 ['R', 't' | 'T'] | ['t' | 'T', 'R'] => {
746 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_UPPERCASE;
747 }
748 ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
749 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
750 }
751 ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
752 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
753 }
754 _ => return false,
755 }
756 true
757 }
758
    /// Called after an f/t-string prefix and its opening `quote` have been
    /// consumed. Pushes a new interpolated-string context and returns the
    /// matching start token, or `None` when the current flags don't form an
    /// interpolated string (see `InterpolatedStringContext::new`).
    fn lex_interpolated_string_start(&mut self, quote: char) -> Option<TokenKind> {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        if quote == '"' {
            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
        }

        // Two more identical quotes mean a triple-quoted string.
        if self.cursor.eat_char2(quote, quote) {
            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
        }

        let ftcontext = InterpolatedStringContext::new(self.current_flags, self.nesting)?;

        let kind = ftcontext.kind();

        self.interpolated_strings.push(ftcontext);

        Some(kind.start_token())
    }
780
    /// Lexes the literal part of an f/t-string between interpolations, or the
    /// string's closing quote(s).
    ///
    /// Returns the end token when the closing quote is found immediately, a
    /// middle token for a non-empty literal part, or `None` when the literal
    /// part is empty (the caller then lexes the `{`/`}` token directly).
    /// Escaped braces (`{{`, `}}`) are collapsed into `normalized`.
    fn lex_interpolated_string_middle_or_end(&mut self) -> Option<TokenKind> {
        let interpolated_string = self.interpolated_strings.current().unwrap();
        let string_kind = interpolated_string.kind();
        let interpolated_flags = interpolated_string.flags();

        // Check whether the string ends right here.
        if interpolated_string.is_triple_quoted() {
            let quote_char = interpolated_string.quote_char();
            if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
                self.current_flags = interpolated_string.flags();
                return Some(string_kind.end_token());
            }
        } else if self.cursor.eat_char(interpolated_string.quote_char()) {
            self.current_flags = interpolated_string.flags();
            return Some(string_kind.end_token());
        }

        // Accumulates the token text with `{{`/`}}` collapsed to `{`/`}`;
        // stays empty when no escaped brace occurs.
        let mut normalized = String::new();

        // Start of the source slice not yet copied into `normalized`.
        let mut last_offset = self.offset();

        let in_format_spec = interpolated_string.is_in_format_spec(self.nesting);

        // Inside `\N{...}` the braces are part of the escape, not tokens.
        let mut in_named_unicode = false;

        loop {
            match self.cursor.first() {
                EOF_CHAR if self.cursor.is_eof() => {
                    let error = if interpolated_string.is_triple_quoted() {
                        InterpolatedStringErrorType::UnterminatedTripleQuotedString
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    // Unwind to the nesting level where the string started.
                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                '\n' | '\r' if !interpolated_string.is_triple_quoted() => {
                    let error_type = if in_format_spec {
                        InterpolatedStringErrorType::NewlineInFormatSpec
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;

                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error_type, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                '\\' => {
                    self.cursor.bump();
                    // Don't consume `{` or `}` after the backslash — they
                    // must still be emitted as tokens.
                    if matches!(self.cursor.first(), '{' | '}') {
                        continue;
                    } else if !interpolated_string.is_raw_string()
                        && self.cursor.eat_char2('N', '{')
                    {
                        in_named_unicode = true;
                        continue;
                    }
                    // Consume the escaped character (`\r\n` as one unit).
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else {
                        self.cursor.bump();
                    }
                }
                quote @ ('\'' | '"') if quote == interpolated_string.quote_char() => {
                    if let Some(triple_quotes) = interpolated_string.triple_quotes() {
                        // Only the full triple-quote sequence terminates a
                        // triple-quoted string.
                        if self.cursor.rest().starts_with(triple_quotes) {
                            break;
                        }
                        self.cursor.bump();
                    } else {
                        break;
                    }
                }
                '{' => {
                    if self.cursor.second() == '{' && !in_format_spec {
                        // `{{` escape: copy up to the first `{`, then skip
                        // the second one.
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                '}' => {
                    if in_named_unicode {
                        // Closing brace of `\N{...}`.
                        in_named_unicode = false;
                        self.cursor.bump();
                    } else if self.cursor.second() == '}' && !in_format_spec {
                        // `}}` escape, handled like `{{` above.
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                _ => {
                    self.cursor.bump();
                }
            }
        }
        let range = self.token_range();
        if range.is_empty() {
            return None;
        }

        let value = if normalized.is_empty() {
            self.source[range].to_string()
        } else {
            normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
            normalized
        };

        self.current_value = TokenValue::InterpolatedStringMiddle(value.into_boxed_str());

        self.current_flags = interpolated_flags;
        Some(string_kind.middle_token())
    }
931
    /// Lexes a regular (non-interpolated) string literal; the opening `quote`
    /// has already been consumed.
    ///
    /// Uses `memchr` to scan for the closing quote. For single-quoted strings
    /// an unescaped line break terminates the literal with an
    /// `UnclosedStringError`; triple-quoted strings may span lines.
    fn lex_string(&mut self, quote: char) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        if quote == '"' {
            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
        }

        // Two more identical quotes mean a triple-quoted string.
        if self.cursor.eat_char2(quote, quote) {
            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
        }

        let value_start = self.offset();

        let quote_byte = u8::try_from(quote).expect("char that fits in u8");
        let value_end = if self.current_flags.is_triple_quoted() {
            loop {
                let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
                    // No quote before EOF: unterminated string.
                    self.cursor.skip_to_end();

                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::UnclosedStringError,
                        self.token_range(),
                    ));
                    break self.offset();
                };

                // Count trailing backslashes to tell whether the quote is
                // escaped: an odd count means it is.
                let num_backslashes = self.cursor.rest().as_bytes()[..index]
                    .iter()
                    .rev()
                    .take_while(|&&c| c == b'\\')
                    .count();

                // Move past the quote we just found.
                self.cursor.skip_bytes(index + 1);

                if num_backslashes % 2 == 1 {
                    continue;
                }

                // Two more quotes complete the closing triple quote.
                if self.cursor.eat_char2(quote, quote) {
                    break self.offset() - TextSize::new(3);
                }
            }
        } else {
            loop {
                let Some(index) =
                    memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
                else {
                    self.cursor.skip_to_end();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;

                    self.push_error(LexicalError::new(
                        LexicalErrorType::UnclosedStringError,
                        self.token_range(),
                    ));

                    break self.offset();
                };

                // As above: odd trailing-backslash count means escaped.
                let num_backslashes = self.cursor.rest().as_bytes()[..index]
                    .iter()
                    .rev()
                    .take_while(|&&c| c == b'\\')
                    .count();

                // Stop just before the found byte so we can inspect it.
                self.cursor.skip_bytes(index);

                let quote_or_newline = self.cursor.first();

                if num_backslashes % 2 == 1 {
                    // Escaped quote or escaped line break (continuation).
                    self.cursor.bump();
                    if quote_or_newline == '\r' {
                        self.cursor.eat_char('\n');
                    }
                    continue;
                }

                match quote_or_newline {
                    '\r' | '\n' => {
                        // Unescaped line break inside a single-quoted string.
                        self.current_flags |= TokenFlags::UNCLOSED_STRING;
                        self.push_error(LexicalError::new(
                            LexicalErrorType::UnclosedStringError,
                            self.token_range(),
                        ));
                        break self.offset();
                    }
                    ch if ch == quote => {
                        let value_end = self.offset();
                        self.cursor.bump();
                        break value_end;
                    }
                    _ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
                }
            }
        };

        self.current_value = TokenValue::String(
            self.source[TextRange::new(value_start, value_end)]
                .to_string()
                .into_boxed_str(),
        );

        TokenKind::String
    }
1054
1055 fn lex_number(&mut self, first: char) -> TokenKind {
1057 if first == '0' {
1058 if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
1059 self.lex_number_radix(Radix::Hex)
1060 } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
1061 self.lex_number_radix(Radix::Octal)
1062 } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
1063 self.lex_number_radix(Radix::Binary)
1064 } else {
1065 self.lex_decimal_number(first)
1066 }
1067 } else {
1068 self.lex_decimal_number(first)
1069 }
1070 }
1071
    /// Lexes a hex/octal/binary integer literal; the radix prefix (`0x`,
    /// `0o`, `0b`) has already been consumed.
    fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(matches!(
            self.cursor.previous().to_ascii_lowercase(),
            'x' | 'o' | 'b'
        ));

        // Collect the digits, skipping digit-separator underscores.
        let mut number = LexedText::new(self.offset(), self.source);
        self.radix_run(&mut number, radix);

        // Full token text, used for error reporting by `Int::from_str_radix`.
        let token = &self.source[self.token_range()];

        let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
            Ok(int) => int,
            Err(err) => {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                    self.token_range(),
                ));
            }
        };
        self.current_value = TokenValue::Int(value);
        TokenKind::Int
    }
1099
    /// Lexes a decimal number — integer, float, or complex (`j`/`J` suffix).
    /// `first_digit_or_dot` has already been consumed and is either a digit
    /// or `.` (for literals like `.5`).
    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
        // Remembered to reject integers with leading zeros (e.g. `077`).
        let start_is_zero = first_digit_or_dot == '0';

        let mut number = LexedText::new(self.token_start(), self.source);
        if first_digit_or_dot != '.' {
            number.push(first_digit_or_dot);
            self.radix_run(&mut number, Radix::Decimal);
        }

        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
            number.push('.');

            // An underscore directly after the decimal point is invalid.
            if self.cursor.eat_char('_') {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()),
                    TextRange::new(self.offset() - TextSize::new(1), self.offset()),
                ));
            }

            self.radix_run(&mut number, Radix::Decimal);
            true
        } else {
            false
        };

        // An `e`/`E` only starts an exponent when followed by an optionally
        // signed digit; otherwise it belongs to a following identifier.
        let is_float = match self.cursor.rest().as_bytes() {
            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
                number.push(self.cursor.bump().unwrap());

                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
                    number.push(sign);
                }

                self.radix_run(&mut number, Radix::Decimal);

                true
            }
            _ => is_float,
        };

        if is_float {
            let Ok(value) = f64::from_str(number.as_str()) else {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(
                        "Invalid decimal literal".to_string().into_boxed_str(),
                    ),
                    self.token_range(),
                ));
            };

            // A `j`/`J` suffix makes it an imaginary (complex) literal.
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                self.current_value = TokenValue::Complex {
                    real: 0.0,
                    imag: value,
                };
                TokenKind::Complex
            } else {
                self.current_value = TokenValue::Float(value);
                TokenKind::Float
            }
        } else {
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                let imag = f64::from_str(number.as_str()).unwrap();
                self.current_value = TokenValue::Complex { real: 0.0, imag };
                TokenKind::Complex
            } else {
                let value = match Int::from_str(number.as_str()) {
                    Ok(value) => {
                        // Leading zeros are only allowed for zero itself
                        // (`0`, `00`, ...); `077`-style literals are errors.
                        if start_is_zero && value.as_u8() != Some(0) {
                            return self.push_error(LexicalError::new(
                                LexicalErrorType::OtherError(
                                    "Invalid decimal integer literal"
                                        .to_string()
                                        .into_boxed_str(),
                                ),
                                self.token_range(),
                            ));
                        }
                        value
                    }
                    Err(err) => {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                            self.token_range(),
                        ));
                    }
                };
                self.current_value = TokenValue::Int(value);
                TokenKind::Int
            }
        }
    }
1201
1202 fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
1206 loop {
1207 if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
1208 number.push(c);
1209 }
1210 else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
1212 self.cursor.bump();
1214 number.skip_char();
1215 } else {
1216 break;
1217 }
1218 }
1219 }
1220
1221 fn lex_comment(&mut self) -> TokenKind {
1223 #[cfg(debug_assertions)]
1224 debug_assert_eq!(self.cursor.previous(), '#');
1225
1226 let bytes = self.cursor.rest().as_bytes();
1227 let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
1228 self.cursor.skip_bytes(offset);
1229
1230 TokenKind::Comment
1231 }
1232
    /// Lexes an IPython escape command (e.g. `%magic`, `!shell`, `obj?`),
    /// collecting the command text up to the end of the line.
    ///
    /// Trailing `?`/`??` turn the command into a help request
    /// (`IpyEscapeKind::Help`/`Help2`); question marks elsewhere are part of
    /// the command text.
    fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind {
        let mut value = String::new();

        loop {
            match self.cursor.first() {
                '\\' => {
                    // A backslash before a line break is a continuation and
                    // is dropped together with the break; any other backslash
                    // is kept literally.
                    if self.cursor.second() == '\r' {
                        self.cursor.bump();
                        self.cursor.bump();
                        self.cursor.eat_char('\n');
                        continue;
                    } else if self.cursor.second() == '\n' {
                        self.cursor.bump();
                        self.cursor.bump();
                        continue;
                    }

                    self.cursor.bump();
                    value.push('\\');
                }
                '?' => {
                    self.cursor.bump();
                    let mut question_count = 1u32;
                    while self.cursor.eat_char('?') {
                        question_count += 1;
                    }

                    // Question marks only act as a help suffix when there are
                    // at most two, they follow a non-whitespace character,
                    // and they sit at the very end of the line. Otherwise
                    // they are literal command text.
                    if question_count > 2
                        || value.chars().last().is_none_or(is_python_whitespace)
                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
                    {
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    if escape_kind.is_help() {
                        // Strip the leading `?`s/spaces of an already-help
                        // command (e.g. `?foo?`).
                        value = value.trim_start_matches([' ', '?']).to_string();
                    } else if escape_kind.is_magic() {
                        // Keep the magic prefix as part of the help target.
                        value.insert_str(0, escape_kind.as_str());
                    }

                    let kind = match question_count {
                        1 => IpyEscapeKind::Help,
                        2 => IpyEscapeKind::Help2,
                        _ => unreachable!("`question_count` is always 1 or 2"),
                    };

                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                '\n' | '\r' | EOF_CHAR => {
                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind: escape_kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                c => {
                    self.cursor.bump();
                    value.push(c);
                }
            }
        }
    }
1350
    /// Handles the end of the source: reports unterminated f/t-strings and
    /// unbalanced parentheses, then emits a final `Newline` (if the last line
    /// wasn't terminated), any remaining `Dedent` tokens, and finally
    /// `EndOfFile`.
    fn consume_end(&mut self) -> TokenKind {
        // Any f/t-string still open at EOF is unterminated.
        while let Some(interpolated_string) = self.interpolated_strings.pop() {
            self.nesting = interpolated_string.nesting();
            self.push_error(LexicalError::new(
                LexicalErrorType::from_interpolated_string_error(
                    InterpolatedStringErrorType::UnterminatedString,
                    interpolated_string.kind(),
                ),
                self.token_range(),
            ));
        }

        // `ParenthesizedExpression` mode starts at nesting 1, so that level
        // is expected to remain open.
        let init_nesting = u32::from(self.mode == Mode::ParenthesizedExpression);

        if self.nesting > init_nesting {
            // Reset nesting so repeated calls don't re-report the error.
            self.nesting = 0;
            return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
        }

        if !self.state.is_new_logical_line() {
            self.state = State::AfterNewline;
            TokenKind::Newline
        }
        // One `Dedent` per remaining indentation level.
        else if self.indentations.dedent().is_some() {
            TokenKind::Dedent
        } else {
            TokenKind::EndOfFile
        }
    }
1389
    /// Re-lexes from `non_logical_newline_start` with one less nesting level,
    /// so that a newline previously emitted as `NonLogicalNewline` can become
    /// a logical `Newline`. Presumably used by the parser for error recovery
    /// when it decides an opening bracket was never closed — confirm against
    /// the caller.
    ///
    /// Returns `true` when the lexer position was moved back and a token was
    /// re-lexed.
    pub(crate) fn re_lex_logical_token(
        &mut self,
        non_logical_newline_start: Option<TextSize>,
    ) -> bool {
        if self.nesting == 0 {
            return false;
        }

        // Unconditionally drop one nesting level for the unclosed bracket.
        self.nesting -= 1;

        // Inside a triple-quoted f/t-string, newlines are part of the string;
        // re-lexing them as logical newlines would be wrong.
        if self.current_flags.is_triple_quoted_interpolated_string() {
            return false;
        }

        let Some(new_position) = non_logical_newline_start else {
            return false;
        };

        // Since we're moving back before the current token, a closing
        // bracket token must put the lexer back into its parenthesized
        // context (undoing the decrement above).
        if matches!(
            self.current_kind,
            TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
        ) {
            self.nesting += 1;
        }

        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(new_position.to_usize());
        self.state = State::Other;
        self.next_token();
        true
    }
1493
    /// Re-interprets the current unterminated string token as the closing
    /// quotes of the innermost interpolated (f/t-)string.
    ///
    /// In source like `f"{x"` the quote after `x` lexes as a new, unclosed
    /// string literal. If that "string" has no prefix, matches the enclosing
    /// interpolated string's quote character and triple-quoting, and is
    /// followed only by whitespace up to a newline or comment, it is treated
    /// as the interpolated string's terminator instead: the token is shrunk
    /// to just the quotes, retyped to `kind`'s end token, and the
    /// `UnclosedStringError` reported for it is dropped.
    pub(crate) fn re_lex_string_token_in_interpolation_element(
        &mut self,
        kind: InterpolatedStringKind,
    ) {
        // Only meaningful while lexing inside an interpolated string.
        let Some(interpolated_string) = self.interpolated_strings.current() else {
            return;
        };

        let current_string_flags = self.current_flags().as_any_string_flags();

        // The token must look exactly like the enclosing string's closing
        // quotes: an unclosed, prefix-less string with the same quote style.
        if !matches!(self.current_kind, TokenKind::String)
            || !self.current_flags.is_unclosed()
            || current_string_flags.prefix() != AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
            || current_string_flags.quote_style().as_char() != interpolated_string.quote_char()
            || current_string_flags.is_triple_quoted() != interpolated_string.is_triple_quoted()
        {
            return;
        }

        // Everything after the opening quotes up to the end of the line (or a
        // `#` comment) must be whitespace; any other character means this
        // really is the start of a string literal.
        let first_line = &self.source
            [(self.current_range.start() + current_string_flags.quote_len()).to_usize()..];

        for c in first_line.chars() {
            if matches!(c, '\n' | '\r' | '#') {
                break;
            }

            if !is_python_whitespace(c) {
                return;
            }
        }

        // Drop the `UnclosedStringError` that was pushed for this token.
        if self.errors.last().is_some_and(|error| {
            error.location() == self.current_range
                && matches!(error.error(), LexicalErrorType::UnclosedStringError)
        }) {
            self.errors.pop();
        }

        // Shrink the token to just the quote characters and retype it as the
        // end of the interpolated string.
        self.current_range =
            TextRange::at(self.current_range.start(), self.current_flags.quote_len());
        self.current_kind = kind.end_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();

        // Restore the nesting level from before the interpolated string and
        // close it.
        self.nesting = interpolated_string.nesting();
        self.interpolated_strings.pop();

        // Reposition the cursor just past the (re-interpreted) closing quotes.
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(self.current_range.end().to_usize());
    }
1557
1558 pub(crate) fn re_lex_raw_string_in_format_spec(&mut self) {
1572 if matches!(self.current_kind, TokenKind::String)
1575 && self.current_flags.is_unclosed()
1576 && self.current_flags.prefix()
1577 == AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
1578 {
1579 if self.errors.last().is_some_and(|error| {
1580 error.location() == self.current_range
1581 && matches!(error.error(), LexicalErrorType::UnclosedStringError)
1582 }) {
1583 self.errors.pop();
1584 }
1585
1586 self.current_range = TextRange::at(self.current_range.start(), 'r'.text_len());
1587 self.current_kind = TokenKind::Name;
1588 self.current_value = TokenValue::Name(Name::new_static("r"));
1589 self.current_flags = TokenFlags::empty();
1590 self.cursor = Cursor::new(self.source);
1591 self.cursor.skip_bytes(self.current_range.end().to_usize());
1592 }
1593 }
1594
1595 #[inline]
1596 fn token_range(&self) -> TextRange {
1597 let end = self.offset();
1598 let len = self.cursor.token_len();
1599
1600 TextRange::at(end - len, len)
1601 }
1602
    /// The slice of the source text covered by the current token.
    #[inline]
    fn token_text(&self) -> &'src str {
        &self.source[self.token_range()]
    }
1607
    /// The cursor's absolute byte offset from the start of the source,
    /// computed as total length minus the text remaining in the cursor.
    #[expect(clippy::cast_possible_truncation)]
    #[inline]
    fn offset(&self) -> TextSize {
        // `Lexer::new` asserts the source fits in a `u32` (max 4GB), so this
        // cast cannot truncate.
        TextSize::new(self.source.len() as u32) - self.cursor.text_len()
    }
1615
    /// The start offset of the token currently being lexed.
    #[inline]
    fn token_start(&self) -> TextSize {
        self.token_range().start()
    }
1620
    /// Captures the lexer's full state into a [`LexerCheckpoint`] that
    /// [`Lexer::rewind`] can later restore.
    pub(crate) fn checkpoint(&self) -> LexerCheckpoint {
        LexerCheckpoint {
            value: self.current_value.clone(),
            current_kind: self.current_kind,
            current_range: self.current_range,
            current_flags: self.current_flags,
            // Only the offset is stored; `rewind` rebuilds the cursor from it.
            cursor_offset: self.offset(),
            state: self.state,
            nesting: self.nesting,
            indentations_checkpoint: self.indentations.checkpoint(),
            pending_indentation: self.pending_indentation,
            interpolated_strings_checkpoint: self.interpolated_strings.checkpoint(),
            // Errors are not cloned; `rewind` truncates back to this length.
            errors_position: self.errors.len(),
        }
    }
1637
    /// Restores the lexer's state to a previously taken [`LexerCheckpoint`].
    pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
        // Destructure exhaustively so a field added to `LexerCheckpoint` but
        // not restored here becomes a compile error.
        let LexerCheckpoint {
            value,
            current_kind,
            current_range,
            current_flags,
            cursor_offset,
            state,
            nesting,
            indentations_checkpoint,
            pending_indentation,
            interpolated_strings_checkpoint,
            errors_position,
        } = checkpoint;

        // The checkpoint holds only an offset: rebuild the cursor from the
        // start of the source and advance it to that position.
        let mut cursor = Cursor::new(self.source);
        cursor.skip_bytes(cursor_offset.to_usize());

        self.current_value = value;
        self.current_kind = current_kind;
        self.current_range = current_range;
        self.current_flags = current_flags;
        self.cursor = cursor;
        self.state = state;
        self.nesting = nesting;
        self.indentations.rewind(indentations_checkpoint);
        self.pending_indentation = pending_indentation;
        self.interpolated_strings
            .rewind(interpolated_strings_checkpoint);
        // Drop any errors reported after the checkpoint was taken.
        self.errors.truncate(errors_position);
    }
1671
    /// Consumes the lexer, returning all lexical errors it accumulated.
    pub fn finish(self) -> Vec<LexicalError> {
        self.errors
    }
1675}
1676
/// A snapshot of the [`Lexer`]'s state, created by [`Lexer::checkpoint`] and
/// restored by [`Lexer::rewind`].
pub(crate) struct LexerCheckpoint {
    value: TokenValue,
    current_kind: TokenKind,
    current_range: TextRange,
    current_flags: TokenFlags,
    /// Absolute cursor offset; the cursor itself is rebuilt on rewind.
    cursor_offset: TextSize,
    state: State,
    nesting: u32,
    indentations_checkpoint: IndentationsCheckpoint,
    pending_indentation: Option<Indentation>,
    interpolated_strings_checkpoint: InterpolatedStringsCheckpoint,
    /// Length of the error vector at checkpoint time; rewind truncates to it.
    errors_position: usize,
}
1690
/// Tracks where the lexer is within the structure of a logical line, which
/// decides whether indentation and `Newline` tokens should be emitted next.
#[derive(Copy, Clone, Debug)]
enum State {
    /// Immediately after a logical newline (or at the start of the file).
    AfterNewline,

    /// Inside a logical line that already produced at least one token.
    NonEmptyLogicalLine,

    /// Immediately after an `=` token.
    AfterEqual,

    /// Any other position.
    Other,
}

impl State {
    /// `true` only directly after a logical newline.
    const fn is_after_newline(self) -> bool {
        match self {
            Self::AfterNewline => true,
            Self::NonEmptyLogicalLine | Self::AfterEqual | Self::Other => false,
        }
    }

    /// `true` when the lexer sits at a point where a new logical line may
    /// begin (after a newline, or on a line that is already complete).
    const fn is_new_logical_line(self) -> bool {
        match self {
            Self::AfterNewline | Self::NonEmptyLogicalLine => true,
            Self::AfterEqual | Self::Other => false,
        }
    }

    /// `true` only directly after an `=` token.
    const fn is_after_equal(self) -> bool {
        match self {
            Self::AfterEqual => true,
            Self::AfterNewline | Self::NonEmptyLogicalLine | Self::Other => false,
        }
    }
}
1719
/// The numeric base of an integer literal being lexed.
#[derive(Copy, Clone, Debug)]
enum Radix {
    Binary,
    Octal,
    Decimal,
    Hex,
}

impl Radix {
    /// The base as a number, e.g. for `u32::from_str_radix`-style parsing.
    const fn as_u32(self) -> u32 {
        match self {
            Self::Binary => 2,
            Self::Octal => 8,
            Self::Decimal => 10,
            Self::Hex => 16,
        }
    }

    /// Returns `true` if `c` is a valid digit in this base.
    const fn is_digit(self, c: char) -> bool {
        match self {
            Self::Binary => matches!(c, '0' | '1'),
            Self::Octal => matches!(c, '0'..='7'),
            Self::Decimal => matches!(c, '0'..='9'),
            Self::Hex => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
        }
    }
}
1747
/// Returns `true` if `c` is one of Python's string quote characters.
const fn is_quote(c: char) -> bool {
    c == '\'' || c == '"'
}
1751
/// Returns `true` if `c` is an ASCII character that can start an identifier
/// (an ASCII letter or underscore). Non-ASCII starts are handled separately
/// by `is_unicode_identifier_start`.
const fn is_ascii_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
1755
/// Returns `true` if `c` can start an identifier according to Unicode's
/// `XID_Start` property (delegates to the `unicode-ident` crate).
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
1761
1762fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
1769 if c.is_ascii() {
1772 matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
1773 } else {
1774 *identifier_is_ascii_only = false;
1775 is_xid_continue(c)
1776 }
1777}
1778
/// Text accumulated while lexing, borrowed from the source for as long as it
/// matches the source byte-for-byte, and copied into an owned buffer once it
/// diverges (see `skip_char`).
enum LexedText<'a> {
    /// The text is identical to a range of the source; only the range grows.
    Source { source: &'a str, range: TextRange },
    /// The text differs from the source and lives in its own buffer.
    Owned(String),
}
1783
1784impl<'a> LexedText<'a> {
1785 fn new(start: TextSize, source: &'a str) -> Self {
1786 Self::Source {
1787 range: TextRange::empty(start),
1788 source,
1789 }
1790 }
1791
1792 fn push(&mut self, c: char) {
1793 match self {
1794 LexedText::Source { range, source } => {
1795 *range = range.add_end(c.text_len());
1796 debug_assert!(source[*range].ends_with(c));
1797 }
1798 LexedText::Owned(owned) => owned.push(c),
1799 }
1800 }
1801
1802 fn as_str<'b>(&'b self) -> &'b str
1803 where
1804 'b: 'a,
1805 {
1806 match self {
1807 LexedText::Source { range, source } => &source[*range],
1808 LexedText::Owned(owned) => owned,
1809 }
1810 }
1811
1812 fn skip_char(&mut self) {
1813 match self {
1814 LexedText::Source { range, source } => {
1815 *self = LexedText::Owned(source[*range].to_string());
1816 }
1817 LexedText::Owned(_) => {}
1818 }
1819 }
1820}
1821
/// Creates a [`Lexer`] for the given source code in the given mode, starting
/// at offset zero.
pub fn lex(source: &str, mode: Mode) -> Lexer<'_> {
    Lexer::new(source, mode, TextSize::default())
}