Skip to main content

rustpython_ruff_python_parser/
lexer.rs

1//! This module takes care of lexing Python source text.
2//!
3//! This means source code is scanned and translated into separate tokens. The rules
4//! governing what is and is not a valid token are defined in the Python reference
5//! guide section on [Lexical analysis].
6//!
7//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
8
9use std::cmp::Ordering;
10use std::str::FromStr;
11
12use unicode_ident::{is_xid_continue, is_xid_start};
13use unicode_normalization::UnicodeNormalization;
14
15use ruff_python_ast::name::Name;
16use ruff_python_ast::str_prefix::{AnyStringPrefix, StringLiteralPrefix};
17use ruff_python_ast::token::{TokenFlags, TokenKind};
18use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
19use ruff_python_trivia::is_python_whitespace;
20use ruff_text_size::{TextLen, TextRange, TextSize};
21
22use crate::Mode;
23use crate::error::{InterpolatedStringErrorType, LexicalError, LexicalErrorType};
24use crate::lexer::cursor::{Cursor, EOF_CHAR};
25use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
26use crate::lexer::interpolated_string::{
27    InterpolatedStringContext, InterpolatedStrings, InterpolatedStringsCheckpoint,
28};
29use crate::string::InterpolatedStringKind;
30use crate::token::TokenValue;
31
32mod cursor;
33mod indentation;
34mod interpolated_string;
35
/// The Unicode byte-order mark. When lexing from offset 0, [`Lexer::new`]
/// skips a leading BOM so it is never reported as a token.
const BOM: char = '\u{feff}';
37
/// A lexer for Python source code.
#[derive(Debug)]
pub struct Lexer<'src> {
    /// Source code to be lexed.
    source: &'src str,

    /// A pointer to the current character of the source code which is being lexed.
    cursor: Cursor<'src>,

    /// The kind of the current token.
    current_kind: TokenKind,

    /// The range of the current token.
    current_range: TextRange,

    /// The value of the current token.
    current_value: TokenValue,

    /// Flags for the current token.
    current_flags: TokenFlags,

    /// Lexer state.
    state: State,

    /// Represents the current level of nesting in the lexer, indicating the depth of parentheses.
    /// The lexer is within a parenthesized context if the value is greater than 0.
    nesting: u32,

    /// A stack of indentation representing the current indentation level.
    indentations: Indentations,
    /// A target indentation that still requires one or more `Dedent` tokens to
    /// be emitted; `lex_token` drains it one `Dedent` per call.
    pending_indentation: Option<Indentation>,

    /// Lexer mode.
    mode: Mode,

    /// F-string and t-string contexts.
    interpolated_strings: InterpolatedStrings,

    /// Errors encountered while lexing.
    errors: Vec<LexicalError>,
}
79
80impl<'src> Lexer<'src> {
    /// Create a new lexer for the given input source which starts at the given offset.
    ///
    /// If the start offset is greater than 0, the cursor is moved ahead that many bytes.
    /// This means that the input source should be the complete source code and not the
    /// sliced version.
    ///
    /// # Panics
    ///
    /// Panics if the source is larger than 4GB, since token offsets are stored as `u32`.
    pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
        assert!(
            u32::try_from(source.len()).is_ok(),
            "Lexer only supports files with a size up to 4GB"
        );

        // A parenthesized expression starts inside one level of parentheses and is
        // never at the start of a logical line; all other modes begin a new line.
        let (state, nesting) = if mode == Mode::ParenthesizedExpression {
            (State::Other, 1)
        } else {
            (State::AfterNewline, 0)
        };

        let mut lexer = Lexer {
            source,
            cursor: Cursor::new(source),
            state,
            current_kind: TokenKind::EndOfFile,
            current_range: TextRange::empty(start_offset),
            current_value: TokenValue::None,
            current_flags: TokenFlags::empty(),
            nesting,
            indentations: Indentations::default(),
            pending_indentation: None,
            mode,
            interpolated_strings: InterpolatedStrings::default(),
            errors: Vec::new(),
        };

        if start_offset == TextSize::new(0) {
            // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
            lexer.cursor.eat_char(BOM);
        } else {
            lexer.cursor.skip_bytes(start_offset.to_usize());
        }

        lexer
    }
123
    /// Returns the kind of the current token, i.e. the token produced by the
    /// most recent call to [`Lexer::next_token`].
    pub(crate) fn current_kind(&self) -> TokenKind {
        self.current_kind
    }
128
    /// Returns the source range of the current token, i.e. the token produced
    /// by the most recent call to [`Lexer::next_token`].
    pub(crate) fn current_range(&self) -> TextRange {
        self.current_range
    }
133
    /// Returns the flags for the current token, i.e. the token produced by the
    /// most recent call to [`Lexer::next_token`].
    pub(crate) fn current_flags(&self) -> TokenFlags {
        self.current_flags
    }
138
139    /// Takes the token value corresponding to the current token out of the lexer, replacing it
140    /// with the default value.
141    ///
142    /// All the subsequent call to this method without moving the lexer would always return the
143    /// default value which is [`TokenValue::None`].
144    pub(crate) fn take_value(&mut self) -> TokenValue {
145        std::mem::take(&mut self.current_value)
146    }
147
148    /// Helper function to push the given error, updating the current range with the error location
149    /// and return the [`TokenKind::Unknown`] token.
150    fn push_error(&mut self, error: LexicalError) -> TokenKind {
151        self.current_range = error.location();
152        self.errors.push(error);
153        TokenKind::Unknown
154    }
155
    /// Lex the next token, updating the lexer's current kind, range, value,
    /// and flags to describe it.
    pub fn next_token(&mut self) -> TokenKind {
        self.cursor.start_token();
        // Reset the per-token state before lexing.
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();
        self.current_kind = self.lex_token();
        // For `Unknown` token, the `push_error` method updates the current range.
        if !matches!(self.current_kind, TokenKind::Unknown) {
            self.current_range = self.token_range();
        }
        self.current_kind
    }
168
    /// Dispatches lexing of the next token based on the lexer's context:
    /// an in-progress f/t-string, pending dedents, indentation at the start of
    /// a logical line, or an ordinary token.
    fn lex_token(&mut self) -> TokenKind {
        // If we're inside an f/t-string but not inside one of its `{...}`
        // interpolations, lex the string's middle or end portion first.
        if let Some(interpolated_string) = self.interpolated_strings.current() {
            if !interpolated_string.is_in_interpolation(self.nesting) {
                if let Some(token) = self.lex_interpolated_string_middle_or_end() {
                    if token.is_interpolated_string_end() {
                        self.interpolated_strings.pop();
                    }
                    return token;
                }
            }
        }
        // Return dedent tokens until the current indentation level matches the indentation of the next token.
        else if let Some(indentation) = self.pending_indentation.take() {
            match self.indentations.current().try_compare(indentation) {
                Ok(Ordering::Greater) => {
                    // Still deeper than the target indentation: emit one `Dedent`
                    // and keep the target pending for the next call.
                    self.pending_indentation = Some(indentation);
                    if self.indentations.dedent_one(indentation).is_err() {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::IndentationError,
                            self.token_range(),
                        ));
                    }
                    return TokenKind::Dedent;
                }
                Ok(_) => {}
                Err(_) => {
                    // The two indentations could not be compared (see
                    // `Indentation::try_compare` for the exact condition).
                    return self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    ));
                }
            }
        }

        if self.state.is_after_newline() {
            if let Some(indentation) = self.eat_indentation() {
                return indentation;
            }
        } else {
            if let Err(error) = self.skip_whitespace() {
                return self.push_error(error);
            }
        }

        // The lexer might've skipped whitespaces, so update the start offset
        self.cursor.start_token();

        if let Some(c) = self.cursor.bump() {
            if c.is_ascii() {
                self.consume_ascii_character(c)
            } else if is_unicode_identifier_start(c) {
                let identifier = self.lex_identifier(c);
                self.state = State::Other;

                identifier
            } else {
                self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ))
            }
        } else {
            // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
            // empty the dedent stack, and finally, return the EndOfFile token.
            self.consume_end()
        }
    }
236
    /// Consumes the whitespace at the start of a logical line and, if the line
    /// turns out to be non-empty, compares its indentation against the current
    /// level, returning an `Indent`/`Dedent` (or error) token when it changed.
    ///
    /// Returns `None` for blank lines, comment-only lines, end of file, or when
    /// the indentation level is unchanged.
    fn eat_indentation(&mut self) -> Option<TokenKind> {
        let mut indentation = Indentation::root();

        loop {
            match self.cursor.first() {
                ' ' => {
                    self.cursor.bump();
                    indentation = indentation.add_space();
                }
                '\t' => {
                    self.cursor.bump();
                    indentation = indentation.add_tab();
                }
                '\\' => {
                    self.cursor.bump();
                    // A backslash must be followed by a newline (`\r`, `\n`, or
                    // `\r\n`) to be a valid line continuation.
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else if !self.cursor.eat_char('\n') {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::LineContinuationError,
                            TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
                        )));
                    }
                    if self.cursor.is_eof() {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::Eof,
                            self.token_range(),
                        )));
                    }
                    // test_ok backslash_continuation_indentation
                    // if True:
                    //     \
                    //         1
                    //     \
                    // 2
                    // else:\
                    //     3

                    // test_err backslash_continuation_indentation_error
                    // if True:
                    //     1
                    //       \
                    //     2

                    // > Indentation cannot be split over multiple physical lines using backslashes;
                    // > the whitespace up to the first backslash determines the indentation.
                    // >
                    // > https://docs.python.org/3/reference/lexical_analysis.html#indentation
                    //
                    // Skip whitespace after the continuation-line without accumulating it into
                    // `indentation`. However, if the backslash is at column 0 (no prior
                    // indentation), let the loop continue so the next line's whitespace is
                    // accumulated normally.
                    //
                    // See also: https://github.com/python/cpython/issues/90249
                    if indentation != Indentation::root() {
                        self.cursor.eat_while(is_python_whitespace);
                    }
                }
                // Form feed resets the accumulated indentation.
                '\x0C' => {
                    self.cursor.bump();
                    indentation = Indentation::root();
                }
                _ => break,
            }
        }

        // Handle indentation if this is a new, not all empty, logical line
        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
            self.state = State::NonEmptyLogicalLine;

            // Compare against the indentation stack to emit `Indent`/`Dedent` if needed.
            return self.handle_indentation(indentation);
        }

        None
    }
315
    /// Compares `indentation` against the top of the indentation stack and
    /// returns an `Indent`/`Dedent` token when the level changed, `None` when
    /// it is unchanged, or an error token when the levels can't be compared.
    ///
    /// On a dedent, the remaining levels are unwound one `Dedent` at a time via
    /// `pending_indentation`.
    fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
        match self.indentations.current().try_compare(indentation) {
            // Dedent
            Ok(Ordering::Greater) => {
                self.pending_indentation = Some(indentation);

                if self.indentations.dedent_one(indentation).is_err() {
                    return Some(self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    )));
                }

                // The lexer might've eaten some whitespaces to calculate the `indentation`. For
                // example:
                //
                // ```py
                // if first:
                //     if second:
                //         pass
                //     foo
                // #   ^
                // ```
                //
                // Here, the cursor is at `^` and the `indentation` contains the whitespaces before
                // the `pass` token.
                self.cursor.start_token();

                Some(TokenKind::Dedent)
            }

            Ok(Ordering::Equal) => None,

            // Indent
            Ok(Ordering::Less) => {
                self.indentations.indent(indentation);
                Some(TokenKind::Indent)
            }
            Err(_) => Some(self.push_error(LexicalError::new(
                LexicalErrorType::IndentationError,
                self.token_range(),
            ))),
        }
    }
360
361    fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
362        loop {
363            match self.cursor.first() {
364                ' ' => {
365                    self.cursor.bump();
366                }
367                '\t' => {
368                    self.cursor.bump();
369                }
370                '\\' => {
371                    self.cursor.bump();
372                    if self.cursor.eat_char('\r') {
373                        self.cursor.eat_char('\n');
374                    } else if !self.cursor.eat_char('\n') {
375                        return Err(LexicalError::new(
376                            LexicalErrorType::LineContinuationError,
377                            TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
378                        ));
379                    }
380                    if self.cursor.is_eof() {
381                        return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
382                    }
383                }
384                // Form feed
385                '\x0C' => {
386                    self.cursor.bump();
387                }
388                _ => break,
389            }
390        }
391
392        Ok(())
393    }
394
    // Dispatch based on the given character.
    //
    // `c` is the token's first character, already consumed by the caller. Arms
    // that `return` directly manage `self.state` themselves; every other arm
    // falls through to the `State::Other` assignment at the end.
    fn consume_ascii_character(&mut self, c: char) -> TokenKind {
        let token = match c {
            c if is_ascii_identifier_start(c) => self.lex_identifier(c),
            '0'..='9' => self.lex_number(c),
            // Comments don't change the lexer state, hence the early return.
            '#' => return self.lex_comment(),
            '\'' | '"' => self.lex_string(c),
            '=' => {
                if self.cursor.eat_char('=') {
                    TokenKind::EqEqual
                } else {
                    // Remember the `=` so that IPython escape commands can be
                    // recognized on the right-hand side of an assignment.
                    self.state = State::AfterEqual;
                    return TokenKind::Equal;
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PlusEqual
                } else {
                    TokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    TokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleStarEqual
                    } else {
                        TokenKind::DoubleStar
                    }
                } else {
                    TokenKind::Star
                }
            }

            // IPython escape command on the right-hand side of an assignment,
            // e.g. `foo = %timeit bar`.
            c @ ('%' | '!')
                if self.mode == Mode::Ipython
                    && self.state.is_after_equal()
                    && self.nesting == 0 =>
            {
                // SAFETY: Safe because `c` has been matched against one of the possible escape command token
                self.lex_ipython_escape_command(
                    IpyEscapeKind::try_from(c).unwrap(),
                    IpyEscapeLexContext::Assignment,
                )
            }

            // IPython escape command at the start of a logical line.
            c @ ('%' | '!' | '?' | '/' | ';' | ',')
                if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
            {
                // Try the two-character escape kind first (e.g. `%%`, `!!`).
                let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
                    self.cursor.bump();
                    kind
                } else {
                    // SAFETY: Safe because `c` has been matched against one of the possible escape command token
                    IpyEscapeKind::try_from(c).unwrap()
                };

                self.lex_ipython_escape_command(kind, IpyEscapeLexContext::LogicalLineStart)
            }

            '?' if self.mode == Mode::Ipython => TokenKind::Question,

            '/' => {
                if self.cursor.eat_char('=') {
                    TokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleSlashEqual
                    } else {
                        TokenKind::DoubleSlash
                    }
                } else {
                    TokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PercentEqual
                } else {
                    TokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    TokenKind::VbarEqual
                } else {
                    TokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    TokenKind::CircumflexEqual
                } else {
                    TokenKind::CircumFlex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AmperEqual
                } else {
                    TokenKind::Amper
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    TokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    TokenKind::Rarrow
                } else {
                    TokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AtEqual
                } else {
                    TokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    TokenKind::NotEqual
                } else {
                    TokenKind::Exclamation
                }
            }
            '~' => TokenKind::Tilde,
            '(' => {
                self.nesting += 1;
                TokenKind::Lpar
            }
            ')' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rpar
            }
            '[' => {
                self.nesting += 1;
                TokenKind::Lsqb
            }
            ']' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rsqb
            }
            '{' => {
                self.nesting += 1;
                TokenKind::Lbrace
            }
            '}' => {
                if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                    // A lone `}` at the f/t-string's own nesting level (i.e. not
                    // closing a bracket opened inside the interpolation) is an error.
                    if interpolated_string.nesting() == self.nesting {
                        let error_type = LexicalErrorType::from_interpolated_string_error(
                            InterpolatedStringErrorType::SingleRbrace,
                            interpolated_string.kind(),
                        );
                        return self.push_error(LexicalError::new(error_type, self.token_range()));
                    }
                    interpolated_string.try_end_format_spec(self.nesting);
                }
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rbrace
            }
            ':' => {
                // A `:` at the top level of an f/t-string interpolation starts a
                // format spec, in which case it must not combine with `=` into `:=`.
                if self
                    .interpolated_strings
                    .current_mut()
                    .is_some_and(|interpolated_string| {
                        interpolated_string.try_start_format_spec(self.nesting)
                    })
                {
                    TokenKind::Colon
                } else if self.cursor.eat_char('=') {
                    TokenKind::ColonEqual
                } else {
                    TokenKind::Colon
                }
            }
            ';' => TokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        TokenKind::LeftShiftEqual
                    } else {
                        TokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::LessEqual
                } else {
                    TokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        TokenKind::RightShiftEqual
                    } else {
                        TokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::Greater
                }
            }
            ',' => TokenKind::Comma,
            '.' => {
                if self.cursor.first().is_ascii_digit() {
                    // A float literal like `.5`.
                    self.lex_decimal_number('.')
                } else if self.cursor.eat_char2('.', '.') {
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }
            '\n' => {
                // A newline only terminates a logical line outside parentheses
                // and when the line isn't empty.
                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }
            '\r' => {
                // Treat `\r\n` as a single newline.
                self.cursor.eat_char('\n');

                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }

            _ => {
                self.state = State::Other;

                return self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ));
            }
        };

        self.state = State::Other;

        token
    }
649
650    /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
651    fn lex_identifier(&mut self, first: char) -> TokenKind {
652        // Detect potential string like rb'' b'' f'' t'' u'' r''
653        let quote = match (first, self.cursor.first()) {
654            (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
655                self.cursor.bump();
656                quote
657            }),
658            (_, second) if is_quote(self.cursor.second()) => {
659                self.try_double_char_prefix([first, second]).then(|| {
660                    self.cursor.bump();
661                    // SAFETY: Safe because of the `is_quote` check in this match arm's guard
662                    self.cursor.bump().unwrap()
663                })
664            }
665            _ => None,
666        };
667
668        if let Some(quote) = quote {
669            if self.current_flags.is_interpolated_string() {
670                if let Some(kind) = self.lex_interpolated_string_start(quote) {
671                    return kind;
672                }
673            }
674
675            return self.lex_string(quote);
676        }
677
678        // Keep track of whether the identifier is ASCII-only or not.
679        //
680        // This is important because Python applies NFKC normalization to
681        // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
682        // We need to therefore do the same in our lexer, but applying NFKC normalization
683        // unconditionally is extremely expensive. If we know an identifier is ASCII-only,
684        // (by far the most common case), we can skip NFKC normalization of the identifier.
685        let mut is_ascii = first.is_ascii();
686        self.cursor
687            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
688
689        let text = self.token_text();
690
691        if !is_ascii {
692            self.current_value = TokenValue::Name(text.nfkc().collect::<Name>());
693            return TokenKind::Name;
694        }
695
696        // Short circuit for names that are longer than any known keyword.
697        // It helps Rust to predict that the Name::new call in the keyword match's default branch
698        // is guaranteed to fit into a stack allocated (inline) Name.
699        if text.len() > 8 {
700            self.current_value = TokenValue::Name(Name::new(text));
701            return TokenKind::Name;
702        }
703
704        match text {
705            "False" => TokenKind::False,
706            "None" => TokenKind::None,
707            "True" => TokenKind::True,
708            "and" => TokenKind::And,
709            "as" => TokenKind::As,
710            "assert" => TokenKind::Assert,
711            "async" => TokenKind::Async,
712            "await" => TokenKind::Await,
713            "break" => TokenKind::Break,
714            "case" => TokenKind::Case,
715            "class" => TokenKind::Class,
716            "continue" => TokenKind::Continue,
717            "def" => TokenKind::Def,
718            "del" => TokenKind::Del,
719            "elif" => TokenKind::Elif,
720            "else" => TokenKind::Else,
721            "except" => TokenKind::Except,
722            "finally" => TokenKind::Finally,
723            "for" => TokenKind::For,
724            "from" => TokenKind::From,
725            "global" => TokenKind::Global,
726            "if" => TokenKind::If,
727            "import" => TokenKind::Import,
728            "in" => TokenKind::In,
729            "is" => TokenKind::Is,
730            "lazy" => TokenKind::Lazy,
731            "lambda" => TokenKind::Lambda,
732            "match" => TokenKind::Match,
733            "nonlocal" => TokenKind::Nonlocal,
734            "not" => TokenKind::Not,
735            "or" => TokenKind::Or,
736            "pass" => TokenKind::Pass,
737            "raise" => TokenKind::Raise,
738            "return" => TokenKind::Return,
739            "try" => TokenKind::Try,
740            "type" => TokenKind::Type,
741            "while" => TokenKind::While,
742            "with" => TokenKind::With,
743            "yield" => TokenKind::Yield,
744            _ => {
745                self.current_value = TokenValue::Name(Name::new(text));
746                TokenKind::Name
747            }
748        }
749    }
750
751    /// Try lexing the single character string prefix, updating the token flags accordingly.
752    /// Returns `true` if it matches.
753    fn try_single_char_prefix(&mut self, first: char) -> bool {
754        match first {
755            'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
756            't' | 'T' => self.current_flags |= TokenFlags::T_STRING,
757            'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
758            'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
759            'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
760            'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
761            _ => return false,
762        }
763        true
764    }
765
766    /// Try lexing the double character string prefix, updating the token flags accordingly.
767    /// Returns `true` if it matches.
768    fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
769        match value {
770            ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
771                self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
772            }
773            ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
774                self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
775            }
776            ['r', 't' | 'T'] | ['t' | 'T', 'r'] => {
777                self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_LOWERCASE;
778            }
779            ['R', 't' | 'T'] | ['t' | 'T', 'R'] => {
780                self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_UPPERCASE;
781            }
782            ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
783                self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
784            }
785            ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
786                self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
787            }
788            _ => return false,
789        }
790        true
791    }
792
793    /// Lex a f-string or t-string start token if positioned at the start of an f-string or t-string.
794    fn lex_interpolated_string_start(&mut self, quote: char) -> Option<TokenKind> {
795        #[cfg(debug_assertions)]
796        debug_assert_eq!(self.cursor.previous(), quote);
797
798        if quote == '"' {
799            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
800        }
801
802        if self.cursor.eat_char2(quote, quote) {
803            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
804        }
805
806        let ftcontext = InterpolatedStringContext::new(self.current_flags, self.nesting)?;
807
808        let kind = ftcontext.kind();
809
810        self.interpolated_strings.push(ftcontext);
811
812        Some(kind.start_token())
813    }
814
    /// Lex an f-string or t-string middle or end token.
    ///
    /// Returns `None` when the middle token would be empty, i.e. no characters
    /// were consumed before hitting an interpolation boundary (`{`/`}`) or the
    /// closing quote of a triple-quoted string.
    fn lex_interpolated_string_middle_or_end(&mut self) -> Option<TokenKind> {
        // SAFETY: Safe because the function is only called when `self.fstrings` is not empty.
        let interpolated_string = self.interpolated_strings.current().unwrap();
        let string_kind = interpolated_string.kind();
        let interpolated_flags = interpolated_string.flags();

        // Check if we're at the end of the f-string.
        if interpolated_string.is_triple_quoted() {
            let quote_char = interpolated_string.quote_char();
            if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
                self.current_flags = interpolated_string.flags();
                return Some(string_kind.end_token());
            }
        } else if self.cursor.eat_char(interpolated_string.quote_char()) {
            self.current_flags = interpolated_string.flags();
            return Some(string_kind.end_token());
        }

        // We have to decode `{{` and `}}` into `{` and `}` respectively. As an
        // optimization, we only allocate a new string if we find any escaped curly braces,
        // otherwise this string will remain empty and we'll use a source slice instead.
        let mut normalized = String::new();

        // Tracks the last offset of token value that has been written to `normalized`.
        let mut last_offset = self.offset();

        // This isn't going to change for the duration of the loop.
        let in_format_spec = interpolated_string.is_in_format_spec(self.nesting);

        // `true` while inside a `\N{...}` escape, where the closing `}` belongs
        // to the escape sequence rather than ending the middle token.
        let mut in_named_unicode = false;

        loop {
            match self.cursor.first() {
                // The condition is to differentiate between the `NUL` (`\0`) character
                // in the source code and the one returned by `self.cursor.first()` when
                // we reach the end of the source code.
                EOF_CHAR if self.cursor.is_eof() => {
                    let error = if interpolated_string.is_triple_quoted() {
                        InterpolatedStringErrorType::UnterminatedTripleQuotedString
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    // Restore the nesting level recorded when this f/t-string started.
                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                // A bare newline terminates a single-quoted f/t-string.
                '\n' | '\r' if !interpolated_string.is_triple_quoted() => {
                    // https://github.com/astral-sh/ruff/issues/18632

                    let error_type = if in_format_spec {
                        InterpolatedStringErrorType::NewlineInFormatSpec
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;

                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error_type, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                '\\' => {
                    self.cursor.bump(); // '\'
                    if matches!(self.cursor.first(), '{' | '}') {
                        // Don't consume `{` or `}` as we want them to be emitted as tokens.
                        // They will be handled in the next iteration.
                        continue;
                    } else if !interpolated_string.is_raw_string() {
                        if self.cursor.eat_char2('N', '{') {
                            in_named_unicode = true;
                            continue;
                        }
                    }
                    // Consume the escaped character. An escaped `\r\n` pair is
                    // consumed as a unit.
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else {
                        self.cursor.bump();
                    }
                }
                // A quote character matching the opening quote. For a
                // triple-quoted string a lone quote is just literal text unless
                // the full closing triple-quote follows.
                quote @ ('\'' | '"') if quote == interpolated_string.quote_char() => {
                    if let Some(triple_quotes) = interpolated_string.triple_quotes() {
                        if self.cursor.rest().starts_with(triple_quotes) {
                            break;
                        }
                        self.cursor.bump();
                    } else {
                        break;
                    }
                }
                // Decode `{{` to `{` (only outside a format spec); a lone `{`
                // ends the middle token so the interpolation can be lexed.
                '{' => {
                    if self.cursor.second() == '{' && !in_format_spec {
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump(); // Skip the second `{`
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                // Same for `}}` -> `}`, except inside `\N{...}` where the `}`
                // just closes the named-unicode escape.
                '}' => {
                    if in_named_unicode {
                        in_named_unicode = false;
                        self.cursor.bump();
                    } else if self.cursor.second() == '}' && !in_format_spec {
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump(); // Skip the second `}`
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                _ => {
                    self.cursor.bump();
                }
            }
        }
        let range = self.token_range();
        if range.is_empty() {
            return None;
        }

        // Use the source slice directly unless brace decoding forced an allocation.
        let value = if normalized.is_empty() {
            self.source[range].to_string()
        } else {
            normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
            normalized
        };

        self.current_value = TokenValue::InterpolatedStringMiddle(value.into_boxed_str());

        self.current_flags = interpolated_flags;
        Some(string_kind.middle_token())
    }
965
966    /// Lex a string literal.
967    fn lex_string(&mut self, quote: char) -> TokenKind {
968        #[cfg(debug_assertions)]
969        debug_assert_eq!(self.cursor.previous(), quote);
970
971        if quote == '"' {
972            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
973        }
974
975        // If the next two characters are also the quote character, then we have a triple-quoted
976        // string; consume those two characters and ensure that we require a triple-quote to close
977        if self.cursor.eat_char2(quote, quote) {
978            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
979        }
980
981        let value_start = self.offset();
982
983        let quote_byte = u8::try_from(quote).expect("char that fits in u8");
984        let value_end = if self.current_flags.is_triple_quoted() {
985            // For triple-quoted strings, scan until we find the closing quote (ignoring escaped
986            // quotes) or the end of the file.
987            loop {
988                let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
989                    self.cursor.skip_to_end();
990
991                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
992                    self.push_error(LexicalError::new(
993                        LexicalErrorType::UnclosedStringError,
994                        self.token_range(),
995                    ));
996                    break self.offset();
997                };
998
999                // Rare case: if there are an odd number of backslashes before the quote, then
1000                // the quote is escaped and we should continue scanning.
1001                let num_backslashes = self.cursor.rest().as_bytes()[..index]
1002                    .iter()
1003                    .rev()
1004                    .take_while(|&&c| c == b'\\')
1005                    .count();
1006
1007                // Advance the cursor past the quote and continue scanning.
1008                self.cursor.skip_bytes(index + 1);
1009
1010                // If the character is escaped, continue scanning.
1011                if num_backslashes % 2 == 1 {
1012                    continue;
1013                }
1014
1015                // Otherwise, if it's followed by two more quotes, then we're done.
1016                if self.cursor.eat_char2(quote, quote) {
1017                    break self.offset() - TextSize::new(3);
1018                }
1019            }
1020        } else {
1021            // For non-triple-quoted strings, scan until we find the closing quote, but end early
1022            // if we encounter a newline or the end of the file.
1023            loop {
1024                let Some(index) =
1025                    memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
1026                else {
1027                    self.cursor.skip_to_end();
1028                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
1029
1030                    self.push_error(LexicalError::new(
1031                        LexicalErrorType::UnclosedStringError,
1032                        self.token_range(),
1033                    ));
1034
1035                    break self.offset();
1036                };
1037
1038                // Rare case: if there are an odd number of backslashes before the quote, then
1039                // the quote is escaped and we should continue scanning.
1040                let num_backslashes = self.cursor.rest().as_bytes()[..index]
1041                    .iter()
1042                    .rev()
1043                    .take_while(|&&c| c == b'\\')
1044                    .count();
1045
1046                // Skip up to the current character.
1047                self.cursor.skip_bytes(index);
1048
1049                // Lookahead because we want to bump only if it's a quote or being escaped.
1050                let quote_or_newline = self.cursor.first();
1051
1052                // If the character is escaped, continue scanning.
1053                if num_backslashes % 2 == 1 {
1054                    self.cursor.bump();
1055                    if quote_or_newline == '\r' {
1056                        self.cursor.eat_char('\n');
1057                    }
1058                    continue;
1059                }
1060
1061                match quote_or_newline {
1062                    '\r' | '\n' => {
1063                        self.current_flags |= TokenFlags::UNCLOSED_STRING;
1064                        self.push_error(LexicalError::new(
1065                            LexicalErrorType::UnclosedStringError,
1066                            self.token_range(),
1067                        ));
1068                        break self.offset();
1069                    }
1070                    ch if ch == quote => {
1071                        let value_end = self.offset();
1072                        self.cursor.bump();
1073                        break value_end;
1074                    }
1075                    _ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
1076                }
1077            }
1078        };
1079
1080        self.current_value = TokenValue::String(
1081            self.source[TextRange::new(value_start, value_end)]
1082                .to_string()
1083                .into_boxed_str(),
1084        );
1085
1086        TokenKind::String
1087    }
1088
1089    /// Numeric lexing. The feast can start!
1090    fn lex_number(&mut self, first: char) -> TokenKind {
1091        if first == '0' {
1092            if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
1093                self.lex_number_radix(Radix::Hex)
1094            } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
1095                self.lex_number_radix(Radix::Octal)
1096            } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
1097                self.lex_number_radix(Radix::Binary)
1098            } else {
1099                self.lex_decimal_number(first)
1100            }
1101        } else {
1102            self.lex_decimal_number(first)
1103        }
1104    }
1105
1106    /// Lex a hex/octal/decimal/binary number without a decimal point.
1107    fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
1108        #[cfg(debug_assertions)]
1109        debug_assert!(matches!(
1110            self.cursor.previous().to_ascii_lowercase(),
1111            'x' | 'o' | 'b'
1112        ));
1113
1114        // Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`).
1115        let mut number = LexedText::new(self.offset(), self.source);
1116        self.radix_run(&mut number, radix);
1117
1118        // Extract the entire number, including the base prefix (e.g., `0x9D5`).
1119        let token = &self.source[self.token_range()];
1120
1121        let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
1122            Ok(int) => int,
1123            Err(err) => {
1124                return self.push_error(LexicalError::new(
1125                    LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
1126                    self.token_range(),
1127                ));
1128            }
1129        };
1130        self.current_value = TokenValue::Int(value);
1131        TokenKind::Int
1132    }
1133
    /// Lex a normal number, that is, no octal, hex or binary number.
    ///
    /// `first_digit_or_dot` is the character that introduced the literal; the
    /// cursor has already consumed it. Produces an `Int`, `Float`, or (with a
    /// trailing `j`/`J`) `Complex` token.
    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
        // Remember a leading `0` so leading-zero integer literals (e.g. `012`)
        // can be rejected below.
        let start_is_zero = first_digit_or_dot == '0';

        let mut number = LexedText::new(self.token_start(), self.source);
        if first_digit_or_dot != '.' {
            number.push(first_digit_or_dot);
            self.radix_run(&mut number, Radix::Decimal);
        }

        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
            number.push('.');

            // An underscore may not directly follow the decimal point (`1._2`).
            if self.cursor.eat_char('_') {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()),
                    TextRange::new(self.offset() - TextSize::new(1), self.offset()),
                ));
            }

            self.radix_run(&mut number, Radix::Decimal);
            true
        } else {
            // Normal number:
            false
        };

        // An exponent makes the literal a float, but `e`/`E` is only treated as
        // an exponent when followed by an (optionally signed) digit; otherwise
        // it is left unconsumed for subsequent tokens.
        let is_float = match self.cursor.rest().as_bytes() {
            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
                // 'e' | 'E'
                number.push(self.cursor.bump().unwrap());

                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
                    number.push(sign);
                }

                self.radix_run(&mut number, Radix::Decimal);

                true
            }
            _ => is_float,
        };

        if is_float {
            // Improvement: Use `Cow` instead of pushing to value text
            let Ok(value) = f64::from_str(number.as_str()) else {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(
                        "Invalid decimal literal".to_string().into_boxed_str(),
                    ),
                    self.token_range(),
                ));
            };

            // Parse trailing 'j':
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                self.current_value = TokenValue::Complex {
                    real: 0.0,
                    imag: value,
                };
                TokenKind::Complex
            } else {
                self.current_value = TokenValue::Float(value);
                TokenKind::Float
            }
        } else {
            // Parse trailing 'j':
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                let imag = f64::from_str(number.as_str()).unwrap();
                self.current_value = TokenValue::Complex { real: 0.0, imag };
                TokenKind::Complex
            } else {
                let value = match Int::from_str(number.as_str()) {
                    Ok(value) => {
                        if start_is_zero && value.as_u8() != Some(0) {
                            // Leading zeros in decimal integer literals are not permitted.
                            return self.push_error(LexicalError::new(
                                LexicalErrorType::OtherError(
                                    "Invalid decimal integer literal"
                                        .to_string()
                                        .into_boxed_str(),
                                ),
                                self.token_range(),
                            ));
                        }
                        value
                    }
                    Err(err) => {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                            self.token_range(),
                        ));
                    }
                };
                self.current_value = TokenValue::Int(value);
                TokenKind::Int
            }
        }
    }
1235
1236    /// Consume a sequence of numbers with the given radix,
1237    /// the digits can be decorated with underscores
1238    /// like this: '`1_2_3_4`' == '1234'
1239    fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
1240        loop {
1241            if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
1242                number.push(c);
1243            }
1244            // Number that contains `_` separators. Remove them from the parsed text.
1245            else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
1246                // Skip over `_`
1247                self.cursor.bump();
1248                number.skip_char();
1249            } else {
1250                break;
1251            }
1252        }
1253    }
1254
1255    /// Lex a single comment.
1256    fn lex_comment(&mut self) -> TokenKind {
1257        #[cfg(debug_assertions)]
1258        debug_assert_eq!(self.cursor.previous(), '#');
1259
1260        let bytes = self.cursor.rest().as_bytes();
1261        let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
1262        self.cursor.skip_bytes(offset);
1263
1264        TokenKind::Comment
1265    }
1266
    /// Lex a single IPython escape command.
    ///
    /// `escape_kind` is the kind implied by the escape token(s) already
    /// consumed (e.g. `%`, `%%`, `!`, `?`); `context` determines (via
    /// `allows_help_end`) whether a trailing `?`/`??` may turn the command
    /// into a help-end escape. The command text is accumulated in `value` and
    /// stored as `TokenValue::IpyEscapeCommand`.
    fn lex_ipython_escape_command(
        &mut self,
        escape_kind: IpyEscapeKind,
        context: IpyEscapeLexContext,
    ) -> TokenKind {
        let mut value = String::new();

        loop {
            match self.cursor.first() {
                '\\' => {
                    // Only skip the line continuation if it is followed by a newline
                    // otherwise it is a normal backslash which is part of the magic command:
                    //
                    //        Skip this backslash
                    //        v
                    //   !pwd \
                    //      && ls -a | sed 's/^/\\    /'
                    //                          ^^
                    //                          Don't skip these backslashes
                    if self.cursor.second() == '\r' {
                        self.cursor.bump();
                        self.cursor.bump();
                        self.cursor.eat_char('\n');
                        continue;
                    } else if self.cursor.second() == '\n' {
                        self.cursor.bump();
                        self.cursor.bump();
                        continue;
                    }

                    self.cursor.bump();
                    value.push('\\');
                }
                // Help end escape commands are those that end with 1 or 2 question marks.
                // Here, we're only looking for a subset of help end escape commands which
                // are the ones that has the escape token at the start of the line as well.
                // On the other hand, we're not looking for help end escape commands that
                // are strict in the sense that the escape token is only at the end. For example,
                //
                //   * `%foo?` is recognized as a help end escape command but not as a strict one.
                //   * `foo?` is recognized as a strict help end escape command which is not
                //     lexed here but is identified at the parser level.
                //
                // Help end escape commands implemented in the IPython codebase using regex:
                // https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462
                '?' => {
                    self.cursor.bump();
                    // Count the full run of consecutive question marks.
                    let mut question_count = 1u32;
                    while self.cursor.eat_char('?') {
                        question_count += 1;
                    }

                    // Help end tokens (`?` / `??`) are only valid in certain contexts
                    // (e.g., not within f-strings or parenthesized expressions), and only
                    // for escape kinds that IPython recognizes as supporting a trailing `?`
                    // (i.e., `%`, `%%`, `?`, and `??`). For other escape kinds like `!` or
                    // `/`, the `?` is just part of the command value.
                    if !context.allows_help_end()
                        || !matches!(
                            escape_kind,
                            IpyEscapeKind::Magic
                                | IpyEscapeKind::Magic2
                                | IpyEscapeKind::Help
                                | IpyEscapeKind::Help2
                        )
                    {
                        // Keep the question marks as literal command text.
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    // The original implementation in the IPython codebase is based on regex which
                    // means that it's strict in the sense that it won't recognize a help end escape:
                    //   * If there's any whitespace before the escape token (e.g. `%foo ?`)
                    //   * If there are more than 2 question mark tokens (e.g. `%foo???`)
                    // which is what we're doing here as well. In that case, we'll continue with
                    // the prefixed escape token.
                    //
                    // Now, the whitespace and empty value check also makes sure that an empty
                    // command (e.g. `%?` or `? ??`, no value after/between the escape tokens)
                    // is not recognized as a help end escape command. So, `%?` and `? ??` are
                    // `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??`
                    // tokens.
                    if question_count > 2
                        || value.chars().last().is_none_or(is_python_whitespace)
                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
                    {
                        // Not a help end escape command, so continue with the lexing.
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    if escape_kind.is_help() {
                        // If we've recognize this as a help end escape command, then
                        // any question mark token / whitespaces at the start are not
                        // considered as part of the value.
                        //
                        // For example, `??foo?` is recognized as `IpyEscapeKind::Help` and
                        // `value` is `foo` instead of `??foo`.
                        value = value.trim_start_matches([' ', '?']).to_string();
                    } else if escape_kind.is_magic() {
                        // Between `%` and `?` (at the end), the `?` takes priority
                        // over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help`
                        // and `value` is `%foo` instead of `foo`. So, we need to
                        // insert the magic escape token at the start.
                        value.insert_str(0, escape_kind.as_str());
                    }

                    // The trailing question marks decide the final kind.
                    let kind = match question_count {
                        1 => IpyEscapeKind::Help,
                        2 => IpyEscapeKind::Help2,
                        _ => unreachable!("`question_count` is always 1 or 2"),
                    };

                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                // A line break (or EOF) ends the command.
                '\n' | '\r' | EOF_CHAR => {
                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind: escape_kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                c => {
                    self.cursor.bump();
                    value.push(c);
                }
            }
        }
    }
1409
1410    fn consume_end(&mut self) -> TokenKind {
1411        // We reached end of file.
1412
1413        // First, finish any unterminated interpolated-strings.
1414        while let Some(interpolated_string) = self.interpolated_strings.pop() {
1415            self.nesting = interpolated_string.nesting();
1416            self.push_error(LexicalError::new(
1417                LexicalErrorType::from_interpolated_string_error(
1418                    InterpolatedStringErrorType::UnterminatedString,
1419                    interpolated_string.kind(),
1420                ),
1421                self.token_range(),
1422            ));
1423        }
1424
1425        // Second, finish all nestings.
1426        // For Mode::ParenthesizedExpression we start with nesting level 1.
1427        // So we check if we end with that level.
1428        let init_nesting = u32::from(self.mode == Mode::ParenthesizedExpression);
1429
1430        if self.nesting > init_nesting {
1431            // Reset the nesting to avoid going into infinite loop.
1432            self.nesting = 0;
1433            return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
1434        }
1435
1436        // Next, insert a trailing newline, if required.
1437        if !self.state.is_new_logical_line() {
1438            self.state = State::AfterNewline;
1439            TokenKind::Newline
1440        }
1441        // Next, flush the indentation stack to zero.
1442        else if self.indentations.dedent().is_some() {
1443            TokenKind::Dedent
1444        } else {
1445            TokenKind::EndOfFile
1446        }
1447    }
1448
    /// Re-lex the [`NonLogicalNewline`] token at the given position in the context of a logical
    /// line.
    ///
    /// Returns a boolean indicating whether the lexer's position has changed. This could result
    /// into the new current token being different than the previous current token but is not
    /// necessarily true. If the return value is `true` then the caller is responsible for updating
    /// its state accordingly.
    ///
    /// This method is a no-op if the lexer isn't in a parenthesized context.
    ///
    /// ## Explanation
    ///
    /// The lexer emits two different kinds of newline token based on the context. If it's in a
    /// parenthesized context, it'll emit a [`NonLogicalNewline`] token otherwise it'll emit a
    /// regular [`Newline`] token. Based on the type of newline token, the lexer will consume and
    /// emit the indentation tokens appropriately which affects the structure of the code.
    ///
    /// For example:
    /// ```py
    /// if call(foo
    ///     def bar():
    ///         pass
    /// ```
    ///
    /// Here, the lexer emits a [`NonLogicalNewline`] token after `foo` which means that the lexer
    /// doesn't emit an `Indent` token before the `def` keyword. This leads to an AST which
    /// considers the function `bar` as part of the module block and the `if` block remains empty.
    ///
    /// This method is to facilitate the parser if it recovers from these kind of scenarios so that
    /// the lexer can then re-lex a [`NonLogicalNewline`] token to a [`Newline`] token which in
    /// turn helps the parser to build the correct AST.
    ///
    /// In the above snippet, it would mean that this method would move the lexer back to the
    /// newline character after the `foo` token and emit it as a [`Newline`] token instead of
    /// [`NonLogicalNewline`]. This means that the next token emitted by the lexer would be an
    /// `Indent` token.
    ///
    /// There are cases where the lexer's position will change but the re-lexed token will remain
    /// the same. This is to help the parser to add the error message at an appropriate location.
    /// Consider the following example:
    ///
    /// ```py
    /// if call(foo, [a, b
    ///     def bar():
    ///         pass
    /// ```
    ///
    /// Here, the parser recovers from two unclosed parenthesis. The inner unclosed `[` will call
    /// into the re-lexing logic and reduce the nesting level from 2 to 1. And, the re-lexing logic
    /// will move the lexer at the newline after `b` but still emit a [`NonLogicalNewline`] token.
    /// Only after the parser recovers from the outer unclosed `(` does the re-lexing logic emit
    /// the [`Newline`] token.
    ///
    /// [`Newline`]: TokenKind::Newline
    /// [`NonLogicalNewline`]: TokenKind::NonLogicalNewline
    pub(crate) fn re_lex_logical_token(
        &mut self,
        non_logical_newline_start: Option<TextSize>,
    ) -> bool {
        // Not in a parenthesized context, so there's no `NonLogicalNewline` to re-lex.
        if self.nesting == 0 {
            return false;
        }

        // Reduce the nesting level because the parser recovered from an error inside list parsing
        // i.e., it recovered from an unclosed parenthesis (`(`, `[`, or `{`).
        self.nesting -= 1;

        // The lexer can't be moved back for a triple-quoted f/t-string because the newlines are
        // part of the f/t-string itself, so there is no newline token to be emitted.
        if self.current_flags.is_triple_quoted_interpolated_string() {
            return false;
        }

        // Without a recorded newline position there's nothing to move back to; the nesting
        // reduction above still stands.
        let Some(new_position) = non_logical_newline_start else {
            return false;
        };

        // Earlier we reduced the nesting level unconditionally. Now that we know the lexer's
        // position is going to be moved back, the lexer needs to be put back into a
        // parenthesized context if the current token is a closing parenthesis.
        //
        // ```py
        // (a, [b,
        //     c
        // )
        // ```
        //
        // Here, the parser would request to re-lex the token when it's at `)` and can recover
        // from an unclosed `[`. This method will move the lexer back to the newline character
        // after `c` which means it goes back into parenthesized context.
        if matches!(
            self.current_kind,
            TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
        ) {
            self.nesting += 1;
        }

        // Rebuild the cursor at the recorded newline position and lex the next token from
        // there with a fresh `Other` state (see the doc comment for which token that yields).
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(new_position.to_usize());
        self.state = State::Other;
        self.next_token();
        true
    }
1552
    /// Re-lexes an unclosed string token in the context of an interpolated string element.
    ///
    /// ```py
    /// f'{a'
    /// ```
    ///
    /// This method re-lexes the trailing `'` as the end of the f-string rather than the
    /// start of a new string token for better error recovery.
    pub(crate) fn re_lex_string_token_in_interpolation_element(
        &mut self,
        kind: InterpolatedStringKind,
    ) {
        // Only applicable while the lexer is inside an f/t-string.
        let Some(interpolated_string) = self.interpolated_strings.current() else {
            return;
        };

        let current_string_flags = self.current_flags().as_any_string_flags();

        // Only unclosed strings, that have the same quote character
        if !matches!(self.current_kind, TokenKind::String)
            || !self.current_flags.is_unclosed()
            || current_string_flags.prefix() != AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
            || current_string_flags.quote_style().as_char() != interpolated_string.quote_char()
            || current_string_flags.is_triple_quoted() != interpolated_string.is_triple_quoted()
        {
            return;
        }

        // Only if the string's first line only contains whitespace,
        // or ends in a comment (not `f"{"abc`)
        let first_line = &self.source
            [(self.current_range.start() + current_string_flags.quote_len()).to_usize()..];

        for c in first_line.chars() {
            if matches!(c, '\n' | '\r' | '#') {
                break;
            }

            // `f'{'ab`, we want to parse `ab` as a normal string and not the closing element of the f-string
            if !is_python_whitespace(c) {
                return;
            }
        }

        // Drop the `UnclosedStringError` reported for this token; the quote is being
        // reinterpreted as the f/t-string terminator, so the error no longer applies.
        if self.errors.last().is_some_and(|error| {
            error.location() == self.current_range
                && matches!(error.error(), LexicalErrorType::UnclosedStringError)
        }) {
            self.errors.pop();
        }

        // Shrink the current token down to just the quote(s) and re-type it as the
        // f/t-string end token.
        self.current_range =
            TextRange::at(self.current_range.start(), self.current_flags.quote_len());
        self.current_kind = kind.end_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();

        // Leave the interpolated-string context, restoring the nesting level from before
        // the f/t-string started.
        self.nesting = interpolated_string.nesting();
        self.interpolated_strings.pop();

        // Reposition the cursor right after the re-lexed end quote.
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(self.current_range.end().to_usize());
    }
1616
1617    /// Re-lex `r"` in a format specifier position.
1618    ///
1619    /// `r"` in a format specifier position is unlikely to be the start of a raw string.
1620    /// Instead, it's the format specifier `!r` followed by the closing quote of the f-string,
1621    /// when the `}` is missing.
1622    ///
1623    /// ```py
1624    /// f"{test!r"
1625    /// ```
1626    ///
1627    /// This function re-lexes the `r"` as `r` (a name token). The next `next_token` call will
1628    /// return a unclosed string token for `"`, which [`Self::re_lex_string_token_in_interpolation_element`]
1629    /// can then re-lex as the end of the f-string.
1630    pub(crate) fn re_lex_raw_string_in_format_spec(&mut self) {
1631        // Re-lex `r"` as `NAME r` followed by an unclosed string
1632        // `f"{test!r"` -> `f"{test!`, `r`, `"`
1633        if matches!(self.current_kind, TokenKind::String)
1634            && self.current_flags.is_unclosed()
1635            && self.current_flags.prefix()
1636                == AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
1637        {
1638            if self.errors.last().is_some_and(|error| {
1639                error.location() == self.current_range
1640                    && matches!(error.error(), LexicalErrorType::UnclosedStringError)
1641            }) {
1642                self.errors.pop();
1643            }
1644
1645            self.current_range = TextRange::at(self.current_range.start(), 'r'.text_len());
1646            self.current_kind = TokenKind::Name;
1647            self.current_value = TokenValue::Name(Name::new_static("r"));
1648            self.current_flags = TokenFlags::empty();
1649            self.cursor = Cursor::new(self.source);
1650            self.cursor.skip_bytes(self.current_range.end().to_usize());
1651        }
1652    }
1653
1654    #[inline]
1655    fn token_range(&self) -> TextRange {
1656        let end = self.offset();
1657        let len = self.cursor.token_len();
1658
1659        TextRange::at(end - len, len)
1660    }
1661
1662    #[inline]
1663    fn token_text(&self) -> &'src str {
1664        &self.source[self.token_range()]
1665    }
1666
    /// Retrieves the current offset of the cursor within the source code.
    // SAFETY: Lexer doesn't allow files larger than 4GB
    #[expect(clippy::cast_possible_truncation)]
    #[inline]
    fn offset(&self) -> TextSize {
        // The cursor tracks the *remaining* text, so the consumed offset is the total
        // source length minus what's left.
        TextSize::new(self.source.len() as u32) - self.cursor.text_len()
    }
1674
1675    #[inline]
1676    fn token_start(&self) -> TextSize {
1677        self.token_range().start()
1678    }
1679
    /// Creates a checkpoint to which the lexer can later return to using [`Self::rewind`].
    pub(crate) fn checkpoint(&self) -> LexerCheckpoint {
        LexerCheckpoint {
            // The value must be cloned because the checkpoint needs to own it.
            value: self.current_value.clone(),
            current_kind: self.current_kind,
            current_range: self.current_range,
            current_flags: self.current_flags,
            // Only the byte offset is stored; `rewind` rebuilds the cursor from it.
            cursor_offset: self.offset(),
            state: self.state,
            nesting: self.nesting,
            indentations_checkpoint: self.indentations.checkpoint(),
            pending_indentation: self.pending_indentation,
            interpolated_strings_checkpoint: self.interpolated_strings.checkpoint(),
            // Error count, so errors recorded after this point can be dropped on rewind.
            errors_position: self.errors.len(),
        }
    }
1696
1697    /// Restore the lexer to the given checkpoint.
1698    pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
1699        let LexerCheckpoint {
1700            value,
1701            current_kind,
1702            current_range,
1703            current_flags,
1704            cursor_offset,
1705            state,
1706            nesting,
1707            indentations_checkpoint,
1708            pending_indentation,
1709            interpolated_strings_checkpoint,
1710            errors_position,
1711        } = checkpoint;
1712
1713        let mut cursor = Cursor::new(self.source);
1714        // We preserve the previous char using this method.
1715        cursor.skip_bytes(cursor_offset.to_usize());
1716
1717        self.current_value = value;
1718        self.current_kind = current_kind;
1719        self.current_range = current_range;
1720        self.current_flags = current_flags;
1721        self.cursor = cursor;
1722        self.state = state;
1723        self.nesting = nesting;
1724        self.indentations.rewind(indentations_checkpoint);
1725        self.pending_indentation = pending_indentation;
1726        self.interpolated_strings
1727            .rewind(interpolated_strings_checkpoint);
1728        self.errors.truncate(errors_position);
1729    }
1730
    /// Consumes the lexer and returns all [`LexicalError`]s accumulated during lexing.
    pub fn finish(self) -> Vec<LexicalError> {
        self.errors
    }
1734}
1735
/// A snapshot of the lexer's mutable state, created by [`Lexer::checkpoint`] and
/// restored by [`Lexer::rewind`].
pub(crate) struct LexerCheckpoint {
    // The current token's value, kind, range, and flags at the time of the checkpoint.
    value: TokenValue,
    current_kind: TokenKind,
    current_range: TextRange,
    current_flags: TokenFlags,
    // Absolute byte offset of the cursor into the source; `rewind` rebuilds the cursor from it.
    cursor_offset: TextSize,
    state: State,
    nesting: u32,
    indentations_checkpoint: IndentationsCheckpoint,
    pending_indentation: Option<Indentation>,
    interpolated_strings_checkpoint: InterpolatedStringsCheckpoint,
    // Number of lexical errors recorded so far; `rewind` truncates the error list to this.
    errors_position: usize,
}
1749
/// Where the lexer currently is relative to the logical-line structure of the source.
#[derive(Copy, Clone, Debug)]
enum State {
    /// Lexer is right at the beginning of the file or after a `Newline` token.
    AfterNewline,

    /// The lexer is at the start of a new logical line but **after** the indentation
    NonEmptyLogicalLine,

    /// Lexer is right after an equal token
    AfterEqual,

    /// Inside of a logical line
    Other,
}

impl State {
    /// `true` only for [`State::AfterNewline`].
    const fn is_after_newline(self) -> bool {
        // Exhaustive match so that adding a variant forces this decision to be revisited.
        match self {
            State::AfterNewline => true,
            State::NonEmptyLogicalLine | State::AfterEqual | State::Other => false,
        }
    }

    /// `true` while the lexer is at the start of a logical line, whether or not the
    /// indentation has been consumed yet.
    const fn is_new_logical_line(self) -> bool {
        match self {
            State::AfterNewline | State::NonEmptyLogicalLine => true,
            State::AfterEqual | State::Other => false,
        }
    }

    /// `true` only for [`State::AfterEqual`].
    const fn is_after_equal(self) -> bool {
        match self {
            State::AfterEqual => true,
            State::AfterNewline | State::NonEmptyLogicalLine | State::Other => false,
        }
    }
}
1778
/// The position at which an IPython escape command is being lexed.
#[derive(Copy, Clone, Debug)]
enum IpyEscapeLexContext {
    /// On the right-hand side of an assignment.
    Assignment,
    /// At the start of a logical line.
    LogicalLineStart,
}

impl IpyEscapeLexContext {
    /// `true` only at the start of a logical line.
    const fn allows_help_end(self) -> bool {
        match self {
            Self::LogicalLineStart => true,
            Self::Assignment => false,
        }
    }
}
1790
/// The radix of a numeric literal.
#[derive(Copy, Clone, Debug)]
enum Radix {
    Binary,
    Octal,
    Decimal,
    Hex,
}

impl Radix {
    /// The numeric base this radix represents.
    const fn as_u32(self) -> u32 {
        match self {
            Radix::Binary => 2,
            Radix::Octal => 8,
            Radix::Decimal => 10,
            Radix::Hex => 16,
        }
    }

    /// Whether `c` is a valid digit in this radix; hex accepts both letter cases.
    const fn is_digit(self, c: char) -> bool {
        match self {
            Radix::Binary => matches!(c, '0' | '1'),
            Radix::Octal => matches!(c, '0'..='7'),
            Radix::Decimal => matches!(c, '0'..='9'),
            Radix::Hex => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
        }
    }
}
1818
/// Returns `true` if `c` is a Python string quote character (`'` or `"`).
const fn is_quote(c: char) -> bool {
    c == '\'' || c == '"'
}
1822
/// Returns `true` if `c` can start an identifier, considering ASCII characters only.
const fn is_ascii_identifier_start(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic()
}
1826
/// Checks if the character c is a valid starting character as described
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
fn is_unicode_identifier_start(c: char) -> bool {
    // Delegates to the Unicode `XID_Start` property (via the `unicode-ident` crate).
    is_xid_start(c)
}
1832
1833/// Checks if the character c is a valid continuation character as described
1834/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
1835///
1836/// Additionally, this function also keeps track of whether or not the total
1837/// identifier is ASCII-only or not by mutably altering a reference to a
1838/// boolean value passed in.
1839fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
1840    // Arrange things such that ASCII codepoints never
1841    // result in the slower `is_xid_continue` getting called.
1842    if c.is_ascii() {
1843        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
1844    } else {
1845        *identifier_is_ascii_only = false;
1846        is_xid_continue(c)
1847    }
1848}
1849
/// Text collected while lexing: borrowed from the source when it matches the source
/// byte-for-byte, and switched to an owned buffer once it diverges (see `skip_char`).
enum LexedText<'a> {
    // Borrowed form: the text is exactly `source[range]`.
    Source { source: &'a str, range: TextRange },
    // Owned form: the text no longer lines up with the source.
    Owned(String),
}
1854
1855impl<'a> LexedText<'a> {
1856    fn new(start: TextSize, source: &'a str) -> Self {
1857        Self::Source {
1858            range: TextRange::empty(start),
1859            source,
1860        }
1861    }
1862
1863    fn push(&mut self, c: char) {
1864        match self {
1865            LexedText::Source { range, source } => {
1866                *range = range.add_end(c.text_len());
1867                debug_assert!(source[*range].ends_with(c));
1868            }
1869            LexedText::Owned(owned) => owned.push(c),
1870        }
1871    }
1872
1873    fn as_str<'b>(&'b self) -> &'b str
1874    where
1875        'b: 'a,
1876    {
1877        match self {
1878            LexedText::Source { range, source } => &source[*range],
1879            LexedText::Owned(owned) => owned,
1880        }
1881    }
1882
1883    fn skip_char(&mut self) {
1884        match self {
1885            LexedText::Source { range, source } => {
1886                *self = LexedText::Owned(source[*range].to_string());
1887            }
1888            LexedText::Owned(_) => {}
1889        }
1890    }
1891}
1892
/// Create a new [`Lexer`] for the given source code and [`Mode`].
pub fn lex(source: &str, mode: Mode) -> Lexer<'_> {
    // Lexing always starts at offset 0 here; use `Lexer::new` directly for a
    // non-zero start offset.
    Lexer::new(source, mode, TextSize::default())
}
1897
1898#[cfg(test)]
1899mod tests {
1900    use std::fmt::Write;
1901
1902    use insta::assert_snapshot;
1903
1904    use super::*;
1905
    // End-of-line sequences used to exercise the lexer across platforms.
    const WINDOWS_EOL: &str = "\r\n";
    const MAC_EOL: &str = "\r";
    const UNIX_EOL: &str = "\n";
1909
    /// Same as [`Token`] except that this includes the [`TokenValue`] as well.
    struct TestToken {
        kind: TokenKind,
        // The token's parsed value (name, number, string contents, ...), if any.
        value: TokenValue,
        range: TextRange,
        flags: TokenFlags,
    }
1917
1918    impl std::fmt::Debug for TestToken {
1919        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1920            let mut tuple = f.debug_tuple("");
1921            let mut tuple = if matches!(self.value, TokenValue::None) {
1922                tuple.field(&self.kind)
1923            } else {
1924                tuple.field(&self.value)
1925            };
1926            tuple = tuple.field(&self.range);
1927            if self.flags.is_empty() {
1928                tuple.finish()
1929            } else {
1930                tuple.field(&self.flags).finish()
1931            }
1932        }
1933    }
1934
    /// Everything produced by one full lexer run: all tokens and any lexical errors.
    struct LexerOutput {
        tokens: Vec<TestToken>,
        errors: Vec<LexicalError>,
    }
1939
1940    impl std::fmt::Display for LexerOutput {
1941        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1942            writeln!(f, "## Tokens")?;
1943            writeln!(f, "```\n{:#?}\n```", self.tokens)?;
1944            if !self.errors.is_empty() {
1945                writeln!(f, "## Errors")?;
1946                writeln!(f, "```\n{:#?}\n```", self.errors)?;
1947            }
1948            Ok(())
1949        }
1950    }
1951
1952    fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
1953        let mut lexer = Lexer::new(source, mode, start_offset);
1954        let mut tokens = Vec::new();
1955        loop {
1956            let kind = lexer.next_token();
1957            if kind.is_eof() {
1958                break;
1959            }
1960            tokens.push(TestToken {
1961                kind,
1962                value: lexer.take_value(),
1963                range: lexer.current_range(),
1964                flags: lexer.current_flags(),
1965            });
1966        }
1967        LexerOutput {
1968            tokens,
1969            errors: lexer.finish(),
1970        }
1971    }
1972
1973    #[track_caller]
1974    fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
1975        let output = lex(source, mode, start_offset);
1976
1977        if !output.errors.is_empty() {
1978            let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
1979            for error in &output.errors {
1980                writeln!(&mut message, "{error:?}").unwrap();
1981            }
1982            writeln!(&mut message, "Source:\n{source}").unwrap();
1983            panic!("{message}");
1984        }
1985
1986        output
1987    }
1988
1989    #[track_caller]
1990    fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
1991        let output = lex(source, mode, TextSize::default());
1992
1993        assert!(
1994            !output.errors.is_empty(),
1995            "Expected lexer to generate at least one error for the following source:\n{source}"
1996        );
1997
1998        output
1999    }
2000
    /// Lexes `source` as a module from offset 0, asserting that no errors occur.
    #[track_caller]
    fn lex_source(source: &str) -> LexerOutput {
        lex_valid(source, Mode::Module, TextSize::default())
    }

    /// Lexes `source` as a module starting at `start_offset`, asserting that no errors occur.
    #[track_caller]
    fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
        lex_valid(source, Mode::Module, start_offset)
    }

    /// Lexes `source` in IPython mode, asserting that no errors occur.
    #[track_caller]
    fn lex_jupyter_source(source: &str) -> LexerOutput {
        lex_valid(source, Mode::Ipython, TextSize::default())
    }
2015
    // A UTF-8 BOM at the start of the file must be skipped without producing a token.
    #[test]
    fn bom() {
        let source = "\u{feff}x = 1";
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn bom_with_offset() {
        let source = "\u{feff}x + y + z";
        assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
    }

    #[test]
    fn bom_with_offset_edge() {
        // BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
        // doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
        let source = "\u{feff}x + y + z";
        assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
    }

    /// Lexes an IPython escape command whose line is continued with `\` and `eol`.
    fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
        let source = format!("%matplotlib \\{eol}  --inline");
        lex_jupyter_source(&source)
    }

    #[test]
    fn test_ipython_escape_command_line_continuation_unix_eol() {
        assert_snapshot!(ipython_escape_command_line_continuation_eol(UNIX_EOL));
    }

    #[test]
    fn test_ipython_escape_command_line_continuation_mac_eol() {
        assert_snapshot!(ipython_escape_command_line_continuation_eol(MAC_EOL));
    }

    #[test]
    fn test_ipython_escape_command_line_continuation_windows_eol() {
        assert_snapshot!(ipython_escape_command_line_continuation_eol(WINDOWS_EOL));
    }

    /// Same as above, but the line continuation is immediately followed by end of file.
    fn ipython_escape_command_line_continuation_with_eol_and_eof(eol: &str) -> LexerOutput {
        let source = format!("%matplotlib \\{eol}");
        lex_jupyter_source(&source)
    }

    #[test]
    fn test_ipython_escape_command_line_continuation_with_unix_eol_and_eof() {
        assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
            UNIX_EOL
        ));
    }

    #[test]
    fn test_ipython_escape_command_line_continuation_with_mac_eol_and_eof() {
        assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
            MAC_EOL
        ));
    }

    #[test]
    fn test_ipython_escape_command_line_continuation_with_windows_eol_and_eof() {
        assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
            WINDOWS_EOL
        ));
    }

    // Escape markers with no command text after them.
    #[test]
    fn test_empty_ipython_escape_command() {
        let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
        assert_snapshot!(lex_jupyter_source(source));
    }

    #[test]
    fn test_ipython_escape_command() {
        let source = r"
?foo
??foo
%timeit a = b
%timeit a % 3
%matplotlib \
    --inline
!pwd \
  && ls -a | sed 's/^/\\    /'
!!cd /Users/foo/Library/Application\ Support/
/foo 1 2
,foo 1 2
;foo 1 2
!ls
"
        .trim();
        assert_snapshot!(lex_jupyter_source(source));
    }
2108
    // Help-end escape commands: a trailing `?` (or `??`) in various positions.
    #[test]
    fn test_ipython_help_end_escape_command() {
        let source = r"
?foo?
??   foo?
??   foo  ?
?foo??
??foo??
???foo?
???foo??
??foo???
???foo???
?? \
    foo?
?? \
?
????
%foo?
%foo??
%%foo???
!pwd?"
            .trim();
        assert_snapshot!(lex_jupyter_source(source));
    }

    // Escape commands inside an indented block.
    #[test]
    fn test_ipython_escape_command_indentation() {
        let source = r"
if True:
    %matplotlib \
        --inline"
            .trim();
        assert_snapshot!(lex_jupyter_source(source));
    }

    // Escape commands on the right-hand side of an assignment.
    #[test]
    fn test_ipython_escape_command_assignment() {
        let source = r"
pwd = !pwd
foo = %timeit a = b
bar = %timeit a % 3
baz = %matplotlib \
        inline
qux = %foo?
quux = !pwd?"
            .trim();
        assert_snapshot!(lex_jupyter_source(source));
    }
2157
2158    fn assert_no_ipython_escape_command(tokens: &[TestToken]) {
2159        for token in tokens {
2160            if matches!(token.kind, TokenKind::IpyEscapeCommand) {
2161                panic!("Unexpected escape command token at {:?}", token.range)
2162            }
2163        }
2164    }
2165
    // None of these positions may be lexed as an escape command (verified by the
    // `assert_no_ipython_escape_command` check below).
    #[test]
    fn test_ipython_escape_command_not_an_assignment() {
        let source = r"
# Other escape kinds are not valid here (can't test `foo = ?str` because '?' is not a valid token)
foo = /func
foo = ;func
foo = ,func

(foo == %timeit a = b)
(foo := %timeit a = b)
def f(arg=%timeit a = b):
    pass"
            .trim();
        let output = lex(source, Mode::Ipython, TextSize::default());
        assert!(output.errors.is_empty());
        assert_no_ipython_escape_command(&output.tokens);
    }
2183
    // Number literals across all radixes, underscores, floats, exponents, and complex values.
    #[test]
    fn test_numbers() {
        let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000 0x995DC9BBDF1939FA 0x995DC9BBDF1939FA995DC9BBDF1939FA";
        assert_snapshot!(lex_source(source));
    }

    // Decimal integers with a leading zero are invalid in Python 3.
    #[test]
    fn test_invalid_leading_zero_small() {
        let source = "025";
        assert_snapshot!(lex_invalid(source, Mode::Module));
    }

    #[test]
    fn test_invalid_leading_zero_big() {
        let source =
            "0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
        assert_snapshot!(lex_invalid(source, Mode::Module));
    }

    #[test]
    fn test_line_comment_long() {
        let source = "99232  # foo".to_string();
        assert_snapshot!(lex_source(&source));
    }

    #[test]
    fn test_line_comment_whitespace() {
        let source = "99232  #  ".to_string();
        assert_snapshot!(lex_source(&source));
    }

    #[test]
    fn test_line_comment_single_whitespace() {
        let source = "99232  # ".to_string();
        assert_snapshot!(lex_source(&source));
    }

    #[test]
    fn test_line_comment_empty() {
        let source = "99232  #".to_string();
        assert_snapshot!(lex_source(&source));
    }

    /// Lexes a comment terminated by `eol`, followed by more code.
    fn comment_until_eol(eol: &str) -> LexerOutput {
        let source = format!("123  # Foo{eol}456");
        lex_source(&source)
    }

    #[test]
    fn test_comment_until_unix_eol() {
        assert_snapshot!(comment_until_eol(UNIX_EOL));
    }

    #[test]
    fn test_comment_until_mac_eol() {
        assert_snapshot!(comment_until_eol(MAC_EOL));
    }

    #[test]
    fn test_comment_until_windows_eol() {
        assert_snapshot!(comment_until_eol(WINDOWS_EOL));
    }

    #[test]
    fn test_assignment() {
        let source = r"a_variable = 99 + 2-0";
        assert_snapshot!(lex_source(source));
    }
2252
    /// Lexes a simple indented function body using `eol` as the line terminator.
    fn indentation_with_eol(eol: &str) -> LexerOutput {
        let source = format!("def foo():{eol}    return 99{eol}{eol}");
        lex_source(&source)
    }

    #[test]
    fn test_indentation_with_unix_eol() {
        assert_snapshot!(indentation_with_eol(UNIX_EOL));
    }

    #[test]
    fn test_indentation_with_mac_eol() {
        assert_snapshot!(indentation_with_eol(MAC_EOL));
    }

    #[test]
    fn test_indentation_with_windows_eol() {
        assert_snapshot!(indentation_with_eol(WINDOWS_EOL));
    }

    /// Two nested blocks so the trailing newline produces two `Dedent` tokens.
    fn double_dedent_with_eol(eol: &str) -> LexerOutput {
        let source = format!("def foo():{eol} if x:{eol}{eol}  return 99{eol}{eol}");
        lex_source(&source)
    }

    #[test]
    fn test_double_dedent_with_unix_eol() {
        assert_snapshot!(double_dedent_with_eol(UNIX_EOL));
    }

    #[test]
    fn test_double_dedent_with_mac_eol() {
        assert_snapshot!(double_dedent_with_eol(MAC_EOL));
    }

    #[test]
    fn test_double_dedent_with_windows_eol() {
        assert_snapshot!(double_dedent_with_eol(WINDOWS_EOL));
    }

    /// Same as `double_dedent_with_eol` but mixing tabs and spaces for indentation.
    fn double_dedent_with_tabs_eol(eol: &str) -> LexerOutput {
        let source = format!("def foo():{eol}\tif x:{eol}{eol}\t\t return 99{eol}{eol}");
        lex_source(&source)
    }

    #[test]
    fn test_double_dedent_with_tabs_unix_eol() {
        assert_snapshot!(double_dedent_with_tabs_eol(UNIX_EOL));
    }

    #[test]
    fn test_double_dedent_with_tabs_mac_eol() {
        assert_snapshot!(double_dedent_with_tabs_eol(MAC_EOL));
    }

    #[test]
    fn test_double_dedent_with_tabs_windows_eol() {
        assert_snapshot!(double_dedent_with_tabs_eol(WINDOWS_EOL));
    }
2312
    // A dedent that lands back on an outer indentation level after deeper nesting.
    #[test]
    fn dedent_after_whitespace() {
        let source = "\
if first:
    if second:
        pass
    foo
";
        assert_snapshot!(lex_source(source));
    }

    /// Newlines inside `[`, `(` and `{` are non-logical and must not produce indentation.
    fn newline_in_brackets_eol(eol: &str) -> LexerOutput {
        let source = r"x = [

    1,2
,(3,
4,
), {
5,
6,\
7}]
"
        .replace('\n', eol);
        lex_source(&source)
    }

    #[test]
    fn test_newline_in_brackets_unix_eol() {
        assert_snapshot!(newline_in_brackets_eol(UNIX_EOL));
    }

    #[test]
    fn test_newline_in_brackets_mac_eol() {
        assert_snapshot!(newline_in_brackets_eol(MAC_EOL));
    }

    #[test]
    fn test_newline_in_brackets_windows_eol() {
        assert_snapshot!(newline_in_brackets_eol(WINDOWS_EOL));
    }
2353
    // Implicitly concatenated string literals spread over a parenthesized group.
    #[test]
    fn test_non_logical_newline_in_string_continuation() {
        let source = r"(
    'a'
    'b'

    'c' \
    'd'
)";
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_logical_newline_line_comment() {
        let source = "#Hello\n#World\n";
        assert_snapshot!(lex_source(source));
    }

    // Maximal-munch lexing of `/`-based operators.
    #[test]
    fn test_operators() {
        let source = "//////=/ /";
        assert_snapshot!(lex_source(source));
    }

    // Quote styles, escape sequences (valid and unrecognized), and raw strings.
    #[test]
    fn test_string() {
        let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
        assert_snapshot!(lex_source(source));
    }

    /// A backslash line continuation inside a single-quoted string.
    fn string_continuation_with_eol(eol: &str) -> LexerOutput {
        let source = format!("\"abc\\{eol}def\"");
        lex_source(&source)
    }

    #[test]
    fn test_string_continuation_with_unix_eol() {
        assert_snapshot!(string_continuation_with_eol(UNIX_EOL));
    }

    #[test]
    fn test_string_continuation_with_mac_eol() {
        assert_snapshot!(string_continuation_with_eol(MAC_EOL));
    }

    #[test]
    fn test_string_continuation_with_windows_eol() {
        assert_snapshot!(string_continuation_with_eol(WINDOWS_EOL));
    }

    // `\N{...}` named escape sequences must not be confused with f-string braces.
    #[test]
    fn test_escape_unicode_name() {
        let source = r#""\N{EN SPACE}""#;
        assert_snapshot!(lex_source(source));
    }
2409
    /// Lexes `source` and returns only the token kinds, asserting that no errors occur.
    fn get_tokens_only(source: &str) -> Vec<TokenKind> {
        let output = lex(source, Mode::Module, TextSize::default());
        assert!(output.errors.is_empty());
        output.tokens.into_iter().map(|token| token.kind).collect()
    }

    // Identifiers are NFKC-normalized, so `𝒞` and `C` must lex identically.
    #[test]
    fn test_nfkc_normalization() {
        let source1 = "𝒞 = 500";
        let source2 = "C = 500";
        assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
    }
2422
2423    fn triple_quoted_eol(eol: &str) -> LexerOutput {
2424        let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
2425        lex_source(&source)
2426    }
2427
2428    #[test]
2429    fn test_triple_quoted_unix_eol() {
2430        assert_snapshot!(triple_quoted_eol(UNIX_EOL));
2431    }
2432
2433    #[test]
2434    fn test_triple_quoted_mac_eol() {
2435        assert_snapshot!(triple_quoted_eol(MAC_EOL));
2436    }
2437
2438    #[test]
2439    fn test_triple_quoted_windows_eol() {
2440        assert_snapshot!(triple_quoted_eol(WINDOWS_EOL));
2441    }
2442
2443    fn line_continuation_at_eof_after_newline(eol: &str) -> LexerOutput {
2444        let source = format!(r"\{eol}");
2445        lex_invalid(&source, Mode::Module)
2446    }
2447
    // Dangling line continuations at EOF must error for every EOL style.
    #[test]
    fn test_line_continuation_at_eof_after_newline_unix_eol() {
        assert_snapshot!(line_continuation_at_eof_after_newline(UNIX_EOL));
    }

    #[test]
    fn test_line_continuation_at_eof_after_newline_mac_eol() {
        assert_snapshot!(line_continuation_at_eof_after_newline(MAC_EOL));
    }

    #[test]
    fn test_line_continuation_at_eof_after_newline_windows_eol() {
        assert_snapshot!(line_continuation_at_eof_after_newline(WINDOWS_EOL));
    }
2462
2463    fn line_continuation_at_eof(eol: &str) -> LexerOutput {
2464        let source = format!(r"1, \{eol}");
2465        lex_invalid(&source, Mode::Module)
2466    }
2467
    #[test]
    fn test_line_continuation_at_eof_unix_eol() {
        assert_snapshot!(line_continuation_at_eof(UNIX_EOL));
    }

    #[test]
    fn test_line_continuation_at_eof_mac_eol() {
        assert_snapshot!(line_continuation_at_eof(MAC_EOL));
    }

    #[test]
    fn test_line_continuation_at_eof_windows_eol() {
        assert_snapshot!(line_continuation_at_eof(WINDOWS_EOL));
    }
2482
    // This test case is to just make sure that the lexer doesn't go into
    // infinite loop on invalid input.
    #[test]
    fn test_infinite_loop() {
        // `[1` is unterminated; only termination matters here, not the output.
        let source = "[1";
        lex_invalid(source, Mode::Module);
    }

    /// Emoji identifiers are a non-standard python feature and are not supported by our lexer.
    #[test]
    fn test_emoji_identifier() {
        let source = "🐦";
        assert_snapshot!(lex_invalid(source, Mode::Module));
    }
2497
2498    #[test]
2499    fn tet_too_low_dedent() {
2500        let source = "if True:
2501    pass
2502  pass";
2503        assert_snapshot!(lex_invalid(source, Mode::Module));
2504    }
2505
    #[test]
    fn test_empty_fstrings() {
        // Empty f-strings of every quote style, interleaved with plain strings.
        let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_prefix() {
        // Every legal case/order combination of the `f` and `r` prefixes.
        let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring() {
        // Mixed literal text, interpolations, and escaped (doubled) braces.
        let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_parentheses() {
        // Empty interpolations and every nesting of doubled (escaped) braces.
        let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}}  ""#;
        assert_snapshot!(lex_source(source));
    }
2529
2530    fn fstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
2531        let source = format!(r"f'text \{eol} more text'");
2532        lex_source(&source)
2533    }
2534
    #[test]
    fn test_fstring_single_quote_escape_unix_eol() {
        assert_snapshot!(fstring_single_quote_escape_eol(UNIX_EOL));
    }

    #[test]
    fn test_fstring_single_quote_escape_mac_eol() {
        assert_snapshot!(fstring_single_quote_escape_eol(MAC_EOL));
    }

    #[test]
    fn test_fstring_single_quote_escape_windows_eol() {
        assert_snapshot!(fstring_single_quote_escape_eol(WINDOWS_EOL));
    }
2549
    #[test]
    fn test_fstring_escape() {
        // Backslash escapes inside f-string text, format specs, and before a
        // line continuation.
        let source = r#"f"\{x:\"\{x}} \"\"\
 end""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_escape_braces() {
        // A backslash before a brace does not escape the brace itself.
        let source = r"f'\{foo}' f'\\{foo}' f'\{{foo}}' f'\\{{foo}}'";
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_escape_raw() {
        // Same escapes as `test_fstring_escape`, but with a raw prefix.
        let source = r#"rf"\{x:\"\{x}} \"\"\
 end""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_named_unicode() {
        // `\N{...}` braces belong to the escape; `\Nope` and a bare `\N` don't.
        let source = r#"f"\N{BULLET} normal \Nope \N""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_named_unicode_raw() {
        // In a raw f-string, `\N{...}` is not a unicode-name escape.
        let source = r#"rf"\N{BULLET} normal""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_named_expression() {
        // `:=` after a bare name in `{x:=10}` is a format spec, not a walrus;
        // parenthesized/nested forms are genuine named expressions.
        let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_format_spec() {
        // Empty, `=`-debug, nested, and doubly nested format specs.
        let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
        assert_snapshot!(lex_source(source));
    }
2593
    #[test]
    fn test_fstring_with_multiline_format_spec() {
        // The last f-string is invalid syntactically but we should still lex it.
        // Note that the `b` is a `Name` token and not a `FStringMiddle` token.
        let source = r"f'''__{
    x:d
}__'''
f'''__{
    x:a
        b
          c
}__'''
";
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_newline_format_spec() {
        // In a single-quoted f-string, a newline inside the format spec is a
        // lexical error; lex in invalid mode.
        let source = r"
f'__{
    x:d
}__'
f'__{
    x:a
        b
}__'
";
        assert_snapshot!(lex_invalid(source, Mode::Module));
    }

    #[test]
    fn test_fstring_conversion() {
        // `!s`/`!r` conversions; `!r` inside a format spec or doubled braces
        // is plain text, not a conversion.
        let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
        assert_snapshot!(lex_source(source));
    }
2629
    #[test]
    fn test_fstring_nested() {
        // f-strings nested inside other f-strings' interpolations.
        let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_expression_multiline() {
        // An interpolation expression may span multiple lines.
        let source = r#"f"first {
    x
        *
            y
} second""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_multiline() {
        // Triple-quoted f-strings with literal newlines, including a
        // triple-quoted f-string nested inside a single-quoted one.
        let source = r#"f"""
hello
    world
""" f'''
    world
hello
''' f"some {f"""multiline
allowed {x}"""} string""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_comments() {
        // `#` is only a comment inside an interpolation expression, never in
        // the literal text of the f-string.
        let source = r#"f"""
# not a comment { # comment {
    x
} # not a comment
""""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_ipy_escape_command() {
        // `!pwd` inside an interpolation must not be treated as an IPython
        // escape command.
        let source = r#"f"foo {!pwd} bar""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_lambda_expression() {
        // A bare lambda's `:` is ambiguous with a format spec; the
        // parenthesized form is unambiguous.
        let source = r#"
f"{lambda x:{x}}"
f"{(lambda x:{x})}"
"#
        .trim();
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_nul_char() {
        // A `\0` escape inside an f-string must lex cleanly.
        let source = r"f'\0'";
        assert_snapshot!(lex_source(source));
    }
2690
    #[test]
    fn test_empty_tstrings() {
        // NOTE(review): likely copy-paste from `test_empty_fstrings` — the
        // third item is `t""` where the f-string counterpart uses `F""`, so the
        // uppercase `T` prefix appears untested here. Confirm and update the
        // source (and snapshot) if uppercase coverage was intended.
        let source = r#"t"" "" t"" t'' '' t"""""" t''''''"#;
        assert_snapshot!(lex_source(source));
    }
2696
    #[test]
    fn test_tstring_prefix() {
        // NOTE(review): likely copy-paste from `test_fstring_prefix` — items
        // 2, 4, and 6 duplicate the lowercase forms (`t""`, `rt""`, `Rt""`)
        // where the f-string counterpart tests `F""`, `rF""`, `RF""`, so the
        // uppercase `T` prefix combinations appear untested. Confirm intent.
        let source = r#"t"" t"" rt"" rt"" Rt"" Rt"" tr"" Tr"" tR"" TR"""#;
        assert_snapshot!(lex_source(source));
    }
2702
    #[test]
    fn test_tstring() {
        // Mixed literal text, interpolations, and escaped (doubled) braces.
        let source = r#"t"normal {foo} {{another}} {bar} {{{three}}}""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_parentheses() {
        // Empty interpolations and every nesting of doubled (escaped) braces.
        let source = r#"t"{}" t"{{}}" t" {}" t"{{{}}}" t"{{{{}}}}" t" {} {{}} {{{}}} {{{{}}}}  ""#;
        assert_snapshot!(lex_source(source));
    }
2714
2715    fn tstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
2716        let source = format!(r"t'text \{eol} more text'");
2717        lex_source(&source)
2718    }
2719
    #[test]
    fn test_tstring_single_quote_escape_unix_eol() {
        assert_snapshot!(tstring_single_quote_escape_eol(UNIX_EOL));
    }

    #[test]
    fn test_tstring_single_quote_escape_mac_eol() {
        assert_snapshot!(tstring_single_quote_escape_eol(MAC_EOL));
    }

    #[test]
    fn test_tstring_single_quote_escape_windows_eol() {
        assert_snapshot!(tstring_single_quote_escape_eol(WINDOWS_EOL));
    }
2734
    #[test]
    fn test_tstring_escape() {
        // Backslash escapes inside t-string text, format specs, and before a
        // line continuation (t-string twin of `test_fstring_escape`).
        let source = r#"t"\{x:\"\{x}} \"\"\
 end""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_escape_braces() {
        // A backslash before a brace does not escape the brace itself.
        let source = r"t'\{foo}' t'\\{foo}' t'\{{foo}}' t'\\{{foo}}'";
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_escape_raw() {
        // Same escapes as `test_tstring_escape`, but with a raw prefix.
        let source = r#"rt"\{x:\"\{x}} \"\"\
 end""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_named_unicode() {
        // `\N{...}` braces belong to the escape; `\Nope` and a bare `\N` don't.
        let source = r#"t"\N{BULLET} normal \Nope \N""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_named_unicode_raw() {
        // In a raw t-string, `\N{...}` is not a unicode-name escape.
        let source = r#"rt"\N{BULLET} normal""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_with_named_expression() {
        // `:=` after a bare name is a format spec, not a walrus; the
        // parenthesized/nested forms are genuine named expressions.
        let source = r#"t"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_with_format_spec() {
        // Empty, `=`-debug, nested, and doubly nested format specs.
        let source = r#"t"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
        assert_snapshot!(lex_source(source));
    }
2778
    #[test]
    fn test_tstring_with_multiline_format_spec() {
        // The last t-string is invalid syntactically but we should still lex it.
        // Note that the `b` is a `Name` token and not a `TStringMiddle` token.
        let source = r"t'''__{
    x:d
}__'''
t'''__{
    x:a
        b
          c
}__'''
";
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_newline_format_spec() {
        // In a single-quoted t-string, a newline inside the format spec is a
        // lexical error; lex in invalid mode.
        let source = r"
t'__{
    x:d
}__'
t'__{
    x:a
        b
}__'
";
        assert_snapshot!(lex_invalid(source, Mode::Module));
    }

    #[test]
    fn test_tstring_conversion() {
        // `!s`/`!r` conversions; `!r` inside a format spec or doubled braces
        // is plain text, not a conversion.
        let source = r#"t"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
        assert_snapshot!(lex_source(source));
    }
2814
    #[test]
    fn test_tstring_nested() {
        // t-strings nested inside other t-strings' interpolations.
        let source = r#"t"foo {t"bar {x + t"{wow}"}"} baz" t'foo {t'bar'} some {t"another"}'"#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_expression_multiline() {
        // An interpolation expression may span multiple lines.
        let source = r#"t"first {
    x
        *
            y
} second""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_multiline() {
        // Triple-quoted t-strings with literal newlines, including a
        // triple-quoted t-string nested inside a single-quoted one.
        let source = r#"t"""
hello
    world
""" t'''
    world
hello
''' t"some {t"""multiline
allowed {x}"""} string""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_comments() {
        // `#` is only a comment inside an interpolation expression, never in
        // the literal text of the t-string.
        let source = r#"t"""
# not a comment { # comment {
    x
} # not a comment
""""#;
        assert_snapshot!(lex_source(source));
    }
2853
    #[test]
    fn test_tstring_with_ipy_escape_command() {
        // `!pwd` inside an interpolation must not be treated as an IPython
        // escape command.
        let source = r#"t"foo {!pwd} bar""#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_with_lambda_expression() {
        // A bare lambda's `:` is ambiguous with a format spec; the
        // parenthesized form is unambiguous.
        let source = r#"
t"{lambda x:{x}}"
t"{(lambda x:{x})}"
"#
        .trim();
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_tstring_with_nul_char() {
        // A `\0` escape inside a t-string must lex cleanly.
        let source = r"t'\0'";
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_nested_t_and_fstring() {
        // f-strings and t-strings may nest inside each other's interpolations.
        let source = r#"t"foo {f"bar {x + t"{wow}"}"} baz" f'foo {t'bar'!r} some {f"another"}'"#;
        assert_snapshot!(lex_source(source));
    }

    #[test]
    fn test_match_softkeyword_in_notebook() {
        // `match`/`case` are soft keywords and must lex correctly in
        // Jupyter/IPython mode too.
        let source = r"match foo:
    case bar:
        pass";
        assert_snapshot!(lex_jupyter_source(source));
    }
2889
2890    fn lex_fstring_error(source: &str) -> InterpolatedStringErrorType {
2891        let output = lex(source, Mode::Module, TextSize::default());
2892        match output
2893            .errors
2894            .into_iter()
2895            .next()
2896            .expect("lexer should give at least one error")
2897            .into_error()
2898        {
2899            LexicalErrorType::FStringError(error) => error,
2900            err => panic!("Expected FStringError: {err:?}"),
2901        }
2902    }
2903
    #[test]
    fn test_fstring_error() {
        use InterpolatedStringErrorType::{
            SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString,
        };

        // A `}` that closes no interpolation is always `SingleRbrace`.
        assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace);
        assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace);
        assert_eq!(lex_fstring_error(r"f'\{foo}\}'"), SingleRbrace);

        // Missing closing quote on single-quoted f-strings.
        assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString);
        assert_eq!(lex_fstring_error(r"f'"), UnterminatedString);

        // Missing closing triple quote, including partially closed cases.
        assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString);
        assert_eq!(lex_fstring_error(r"f'''"), UnterminatedTripleQuotedString);
        assert_eq!(
            lex_fstring_error(r#"f"""""#),
            UnterminatedTripleQuotedString
        );
        assert_eq!(
            lex_fstring_error(r#"f""""""#),
            UnterminatedTripleQuotedString
        );
    }
2933
2934    fn lex_tstring_error(source: &str) -> InterpolatedStringErrorType {
2935        let output = lex(source, Mode::Module, TextSize::default());
2936        match output
2937            .errors
2938            .into_iter()
2939            .next()
2940            .expect("lexer should give at least one error")
2941            .into_error()
2942        {
2943            LexicalErrorType::TStringError(error) => error,
2944            err => panic!("Expected TStringError: {err:?}"),
2945        }
2946    }
2947
    // An unclosed f-string still emits `FStringStart` and the middle text
    // before reporting `UnterminatedString` (inline snapshot below).
    #[test]
    fn lex_fstring_unclosed() {
        let source = r#"f"hello"#;

        assert_snapshot!(lex_invalid(source, Mode::Module), @r#"
        ## Tokens
        ```
        [
            (
                FStringStart,
                0..2,
                TokenFlags(
                    DOUBLE_QUOTES | F_STRING,
                ),
            ),
            (
                InterpolatedStringMiddle(
                    "hello",
                ),
                2..7,
                TokenFlags(
                    DOUBLE_QUOTES | F_STRING,
                ),
            ),
            (
                Newline,
                7..7,
            ),
        ]
        ```
        ## Errors
        ```
        [
            LexicalError {
                error: FStringError(
                    UnterminatedString,
                ),
                location: 2..7,
            },
        ]
        ```
        "#);
    }
2991
    // `f'{'` — the `{` opens an interpolation that never closes; the trailing
    // `'` is lexed as an unclosed plain string, then the f-string itself is
    // reported unterminated (inline snapshot below).
    #[test]
    fn lex_fstring_missing_brace() {
        let source = "f'{'";

        assert_snapshot!(lex_invalid(source, Mode::Module), @r#"
        ## Tokens
        ```
        [
            (
                FStringStart,
                0..2,
                TokenFlags(
                    F_STRING,
                ),
            ),
            (
                Lbrace,
                2..3,
            ),
            (
                String(
                    "",
                ),
                3..4,
                TokenFlags(
                    UNCLOSED_STRING,
                ),
            ),
            (
                Newline,
                4..4,
            ),
        ]
        ```
        ## Errors
        ```
        [
            LexicalError {
                error: UnclosedStringError,
                location: 3..4,
            },
            LexicalError {
                error: FStringError(
                    UnterminatedString,
                ),
                location: 4..4,
            },
        ]
        ```
        "#);
    }
3043
    // Inside the unclosed interpolation, `r"` after `!` starts a raw string
    // that runs to EOF unclosed; both that and the unterminated f-string are
    // reported (inline snapshot below).
    #[test]
    fn lex_fstring_missing_brace_after_format_spec() {
        let source = r#"f"{foo!r""#;

        assert_snapshot!(lex_invalid(source, Mode::Module), @r#"
        ## Tokens
        ```
        [
            (
                FStringStart,
                0..2,
                TokenFlags(
                    DOUBLE_QUOTES | F_STRING,
                ),
            ),
            (
                Lbrace,
                2..3,
            ),
            (
                Name(
                    Name("foo"),
                ),
                3..6,
            ),
            (
                Exclamation,
                6..7,
            ),
            (
                String(
                    "",
                ),
                7..9,
                TokenFlags(
                    DOUBLE_QUOTES | RAW_STRING_LOWERCASE | UNCLOSED_STRING,
                ),
            ),
            (
                Newline,
                9..9,
            ),
        ]
        ```
        ## Errors
        ```
        [
            LexicalError {
                error: UnclosedStringError,
                location: 7..9,
            },
            LexicalError {
                error: FStringError(
                    UnterminatedString,
                ),
                location: 9..9,
            },
        ]
        ```
        "#);
    }
3105
    // Mirrors `test_fstring_error` for the `t` prefix.
    #[test]
    fn test_tstring_error() {
        use InterpolatedStringErrorType::{
            SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString,
        };

        // A `}` that closes no interpolation is always `SingleRbrace`.
        assert_eq!(lex_tstring_error("t'}'"), SingleRbrace);
        assert_eq!(lex_tstring_error("t'{{}'"), SingleRbrace);
        assert_eq!(lex_tstring_error("t'{{}}}'"), SingleRbrace);
        assert_eq!(lex_tstring_error("t'foo}'"), SingleRbrace);
        assert_eq!(lex_tstring_error(r"t'\u007b}'"), SingleRbrace);
        assert_eq!(lex_tstring_error("t'{a:b}}'"), SingleRbrace);
        assert_eq!(lex_tstring_error("t'{3:}}>10}'"), SingleRbrace);
        assert_eq!(lex_tstring_error(r"t'\{foo}\}'"), SingleRbrace);

        // Missing closing quote on single-quoted t-strings.
        assert_eq!(lex_tstring_error(r#"t""#), UnterminatedString);
        assert_eq!(lex_tstring_error(r"t'"), UnterminatedString);

        // Missing closing triple quote, including partially closed cases.
        assert_eq!(lex_tstring_error(r#"t""""#), UnterminatedTripleQuotedString);
        assert_eq!(lex_tstring_error(r"t'''"), UnterminatedTripleQuotedString);
        assert_eq!(
            lex_tstring_error(r#"t"""""#),
            UnterminatedTripleQuotedString
        );
        assert_eq!(
            lex_tstring_error(r#"t""""""#),
            UnterminatedTripleQuotedString
        );
    }
3135
3136    #[test]
3137    fn backslash_continuation_indentation() {
3138        // The first `\` has 4 spaces before it which matches the indentation level at that point,
3139        // so the whitespace before `2` is irrelevant and shouldn't produce an indentation error.
3140        // Similarly, the second `\` is also at the same indentation level, so the `3` line is also
3141        // valid.
3142        let source = r"if True:
3143    1
3144    \
3145        2
3146    \
31473
3148else:
3149    pass
3150"
3151        .to_string();
3152        assert_snapshot!(lex_source(&source));
3153    }
3154
3155    #[test]
3156    fn backslash_continuation_at_root() {
3157        // But, it's a different when the backslash character itself is at the root indentation
3158        // level. Then, the whitespaces following it determines the indentation level of the next
3159        // line, so `1` is indented with 4 spaces and `2` is indented with 8 spaces, and `3` is
3160        // indented with 4 spaces, all of which are valid.
3161        let source = r"if True:
3162\
3163    1
3164    if True:
3165\
3166        2
3167else:\
3168    3
3169"
3170        .to_string();
3171        assert_snapshot!(lex_source(&source));
3172    }
3173
3174    #[test]
3175    fn multiple_backslash_continuation() {
3176        // It's only the first backslash character that determines the indentation level of the next
3177        // line, so all the lines after the first `\` are indented with 4 spaces, and the remaining
3178        // backslashes are just ignored and don't affect the indentation level.
3179        let source = r"if True:
3180    1
3181    \
3182            \
3183        \
3184    \
3185    2
3186"
3187        .to_string();
3188        assert_snapshot!(lex_source(&source));
3189    }
3190
3191    #[test]
3192    fn backslash_continuation_mismatch_indentation() {
3193        // Indentation doesn't match any previous indentation level
3194        let source = r"if True:
3195    1
3196  \
3197    2
3198"
3199        .to_string();
3200        assert_snapshot!(lex_invalid(&source, Mode::Module));
3201    }
3202}