//! `tokenizer.rs` (crate `ruff_python_trivia`): simple forward and backwards
//! tokenizers for Python trivia handling (whitespace, comments, simple tokens).

1use unicode_ident::{is_xid_continue, is_xid_start};
2
3use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
4
5use crate::{Cursor, is_python_whitespace};
6
/// Searches for the first non-trivia character after `offset`.
///
/// The search skips over any whitespace and comments.
///
/// Returns `Some` if the source code after `offset` contains any non-trivia character.
///
/// Returns `None` if the text after `offset` is empty or only contains trivia (whitespace or comments).
pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
    SimpleTokenizer::starts_at(offset, code)
        .skip_trivia()
        .next()
}
18
19/// Returns the only non-trivia, non-closing parenthesis token in `range`.
20///
21/// Includes debug assertions that the range only contains that single token.
22pub fn find_only_token_in_range(
23    range: TextRange,
24    token_kind: SimpleTokenKind,
25    code: &str,
26) -> SimpleToken {
27    let mut tokens = SimpleTokenizer::new(code, range)
28        .skip_trivia()
29        .skip_while(|token| token.kind == SimpleTokenKind::RParen);
30    let token = tokens.next().expect("Expected a token");
31    debug_assert_eq!(token.kind(), token_kind);
32    let mut tokens = tokens.skip_while(|token| token.kind == SimpleTokenKind::LParen);
33    #[expect(clippy::debug_assert_with_mut_call)]
34    {
35        debug_assert_eq!(tokens.next(), None);
36    }
37    token
38}
39
40/// Returns the number of newlines between `offset` and the first non whitespace character in the source code.
41pub fn lines_before(offset: TextSize, code: &str) -> u32 {
42    let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]);
43
44    let mut newlines = 0u32;
45    while let Some(c) = cursor.bump_back() {
46        match c {
47            '\n' => {
48                cursor.eat_char_back('\r');
49                newlines += 1;
50            }
51            '\r' => {
52                newlines += 1;
53            }
54            c if is_python_whitespace(c) => {
55                continue;
56            }
57            _ => {
58                break;
59            }
60        }
61    }
62
63    newlines
64}
65
66/// Counts the empty lines between `offset` and the first non-whitespace character.
67pub fn lines_after(offset: TextSize, code: &str) -> u32 {
68    let mut cursor = Cursor::new(&code[offset.to_usize()..]);
69
70    let mut newlines = 0u32;
71    while let Some(c) = cursor.bump() {
72        match c {
73            '\n' => {
74                newlines += 1;
75            }
76            '\r' => {
77                cursor.eat_char('\n');
78                newlines += 1;
79            }
80            c if is_python_whitespace(c) => {
81                continue;
82            }
83            _ => {
84                break;
85            }
86        }
87    }
88
89    newlines
90}
91
92/// Counts the empty lines after `offset`, ignoring any trailing trivia: end-of-line comments,
93/// own-line comments, and any intermediary newlines.
94pub fn lines_after_ignoring_trivia(offset: TextSize, code: &str) -> u32 {
95    let mut newlines = 0u32;
96    for token in SimpleTokenizer::starts_at(offset, code) {
97        match token.kind() {
98            SimpleTokenKind::Newline => {
99                newlines += 1;
100            }
101            SimpleTokenKind::Whitespace => {}
102            // If we see a comment, reset the newlines counter.
103            SimpleTokenKind::Comment => {
104                newlines = 0;
105            }
106            // As soon as we see a non-trivia token, we're done.
107            _ => {
108                break;
109            }
110        }
111    }
112    newlines
113}
114
115/// Counts the empty lines after `offset`, ignoring any trailing trivia on the same line as
116/// `offset`.
117#[expect(clippy::cast_possible_truncation)]
118pub fn lines_after_ignoring_end_of_line_trivia(offset: TextSize, code: &str) -> u32 {
119    // SAFETY: We don't support files greater than 4GB, so casting to u32 is safe.
120    SimpleTokenizer::starts_at(offset, code)
121        .skip_while(|token| token.kind != SimpleTokenKind::Newline && token.kind.is_trivia())
122        .take_while(|token| {
123            token.kind == SimpleTokenKind::Newline || token.kind == SimpleTokenKind::Whitespace
124        })
125        .filter(|token| token.kind == SimpleTokenKind::Newline)
126        .count() as u32
127}
128
129fn is_identifier_start(c: char) -> bool {
130    if c.is_ascii() {
131        c.is_ascii_alphabetic() || c == '_'
132    } else {
133        is_xid_start(c)
134    }
135}
136
137// Checks if the character c is a valid continuation character as described
138// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
139fn is_identifier_continuation(c: char) -> bool {
140    // Arrange things such that ASCII codepoints never
141    // result in the slower `is_xid_continue` getting called.
142    if c.is_ascii() {
143        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
144    } else {
145        is_xid_continue(c)
146    }
147}
148
149fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
150    match source {
151        "and" => SimpleTokenKind::And,
152        "as" => SimpleTokenKind::As,
153        "assert" => SimpleTokenKind::Assert,
154        "async" => SimpleTokenKind::Async,
155        "await" => SimpleTokenKind::Await,
156        "break" => SimpleTokenKind::Break,
157        "class" => SimpleTokenKind::Class,
158        "continue" => SimpleTokenKind::Continue,
159        "def" => SimpleTokenKind::Def,
160        "del" => SimpleTokenKind::Del,
161        "elif" => SimpleTokenKind::Elif,
162        "else" => SimpleTokenKind::Else,
163        "except" => SimpleTokenKind::Except,
164        "finally" => SimpleTokenKind::Finally,
165        "for" => SimpleTokenKind::For,
166        "from" => SimpleTokenKind::From,
167        "global" => SimpleTokenKind::Global,
168        "if" => SimpleTokenKind::If,
169        "import" => SimpleTokenKind::Import,
170        "in" => SimpleTokenKind::In,
171        "is" => SimpleTokenKind::Is,
172        "lambda" => SimpleTokenKind::Lambda,
173        "nonlocal" => SimpleTokenKind::Nonlocal,
174        "not" => SimpleTokenKind::Not,
175        "or" => SimpleTokenKind::Or,
176        "pass" => SimpleTokenKind::Pass,
177        "raise" => SimpleTokenKind::Raise,
178        "return" => SimpleTokenKind::Return,
179        "try" => SimpleTokenKind::Try,
180        "while" => SimpleTokenKind::While,
181        "match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
182        "type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
183        "case" => SimpleTokenKind::Case,
184        "with" => SimpleTokenKind::With,
185        "yield" => SimpleTokenKind::Yield,
186        _ => SimpleTokenKind::Name, // Potentially an identifier, but only if it isn't a string prefix. The caller (SimpleTokenizer) is responsible for enforcing that constraint.
187    }
188}
189
/// A single token, with its source range, produced by [`SimpleTokenizer`] or
/// [`BackwardsTokenizer`].
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    /// The kind of this token.
    pub kind: SimpleTokenKind,
    /// The token's range in the source document.
    pub range: TextRange,
}
195
impl SimpleToken {
    /// Returns the [`SimpleTokenKind`] of this token.
    pub const fn kind(&self) -> SimpleTokenKind {
        self.kind
    }
}
201
impl Ranged for SimpleToken {
    // Delegates to the stored `range` field.
    fn range(&self) -> TextRange {
        self.range
    }
}
207
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
    /// A comment, not including the trailing new line.
    Comment,

    /// Sequence of ' ' or '\t'
    Whitespace,

    /// Start or end of the file
    EndOfFile,

    /// A line continuation: a single `\` character.
    Continuation,

    /// `\n` or `\r` or `\r\n`
    Newline,

    /// `(`
    LParen,

    /// `)`
    RParen,

    /// `{`
    LBrace,

    /// `}`
    RBrace,

    /// `[`
    LBracket,

    /// `]`
    RBracket,

    /// `,`
    Comma,

    /// `:`
    Colon,

    /// `;`
    Semi,

    /// `/`
    Slash,

    /// `*`
    Star,

    /// `.`
    Dot,

    /// `+`
    Plus,

    /// `-`
    Minus,

    /// `=`
    Equals,

    /// `>`
    Greater,

    /// `<`
    Less,

    /// `%`
    Percent,

    /// `&`
    Ampersand,

    /// `^`
    Circumflex,

    /// `|`
    Vbar,

    /// `@`
    At,

    /// `~`
    Tilde,

    /// `==`
    EqEqual,

    /// `!=`
    NotEqual,

    /// `<=`
    LessEqual,

    /// `>=`
    GreaterEqual,

    /// `<<`
    LeftShift,

    /// `>>`
    RightShift,

    /// `**`
    DoubleStar,

    /// `**=`
    DoubleStarEqual,

    /// `+=`
    PlusEqual,

    /// `-=`
    MinusEqual,

    /// `*=`
    StarEqual,

    /// `/=`
    SlashEqual,

    /// `%=`
    PercentEqual,

    /// `&=`
    AmperEqual,

    /// `|=`
    VbarEqual,

    /// `^=`
    CircumflexEqual,

    /// `<<=`
    LeftShiftEqual,

    /// `>>=`
    RightShiftEqual,

    /// `//`
    DoubleSlash,

    /// `//=`
    DoubleSlashEqual,

    /// `:=`
    ColonEqual,

    /// `...`
    Ellipsis,

    /// `@=`
    AtEqual,

    /// `->`
    RArrow,

    /// `and`
    And,

    /// `as`
    As,

    /// `assert`
    Assert,

    /// `async`
    Async,

    /// `await`
    Await,

    /// `break`
    Break,

    /// `class`
    Class,

    /// `continue`
    Continue,

    /// `def`
    Def,

    /// `del`
    Del,

    /// `elif`
    Elif,

    /// `else`
    Else,

    /// `except`
    Except,

    /// `finally`
    Finally,

    /// `for`
    For,

    /// `from`
    From,

    /// `global`
    Global,

    /// `if`
    If,

    /// `import`
    Import,

    /// `in`
    In,

    /// `is`
    Is,

    /// `lambda`
    Lambda,

    /// `nonlocal`
    Nonlocal,

    /// `not`
    Not,

    /// `or`
    Or,

    /// `pass`
    Pass,

    /// `raise`
    Raise,

    /// `return`
    Return,

    /// `try`
    Try,

    /// `while`
    While,

    /// `match` (soft keyword)
    Match,

    /// `type` (soft keyword)
    Type,

    /// `case` (soft keyword)
    Case,

    /// `with`
    With,

    /// `yield`
    Yield,

    /// An identifier or keyword.
    Name,

    /// Any other non trivia token.
    Other,

    /// Returned for each character after [`SimpleTokenKind::Other`] has been returned once.
    Bogus,
}
480
impl SimpleTokenKind {
    /// Returns `true` for trivia tokens: whitespace, newlines, comments, and
    /// line continuations.
    pub const fn is_trivia(self) -> bool {
        matches!(
            self,
            SimpleTokenKind::Whitespace
                | SimpleTokenKind::Newline
                | SimpleTokenKind::Comment
                | SimpleTokenKind::Continuation
        )
    }

    /// Returns `true` if this is a [`SimpleTokenKind::Comment`] token.
    pub const fn is_comment(self) -> bool {
        matches!(self, SimpleTokenKind::Comment)
    }
}
496
/// Simple zero allocation tokenizer handling most tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct SimpleTokenizer<'a> {
    /// The absolute offset of the cursor position within `source`.
    offset: TextSize,
    /// Set to `true` once the tokenizer hits something it can't lex
    /// ([`SimpleTokenKind::Other`]); from then on only a single final
    /// [`SimpleTokenKind::Bogus`] token is emitted.
    bogus: bool,
    /// The full source document; used to slice out the token text.
    source: &'a str,
    /// Character cursor over the tokenized range of `source`.
    cursor: Cursor<'a>,
}
510
impl<'a> SimpleTokenizer<'a> {
    /// Creates a tokenizer for the given `range` of `source`.
    pub fn new(source: &'a str, range: TextRange) -> Self {
        Self {
            offset: range.start(),
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    /// Creates a tokenizer that starts at `offset` and runs to the end of `source`.
    pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
        let range = TextRange::new(offset, source.text_len());
        Self::new(source, range)
    }

    /// Lexes the next token. Once the input is exhausted, returns an empty
    /// [`SimpleTokenKind::EndOfFile`] token.
    fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();

        let Some(first) = self.cursor.bump() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.offset),
            };
        };

        if self.bogus {
            // Emit a single final bogus token spanning the rest of the source.
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::new(self.offset, self.source.text_len()),
            };

            // Set the cursor to EOF
            self.cursor = Cursor::new("");
            self.offset = self.source.text_len();
            return token;
        }

        let kind = self.next_token_inner(first);

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }

    /// Lexes a single token whose first character, `first`, has already been
    /// consumed from the cursor.
    fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
        match first {
            // Keywords and identifiers
            c if is_identifier_start(c) => {
                self.cursor.eat_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();

                let range = TextRange::at(self.offset, token_len);
                let kind = to_keyword_or_other(&self.source[range]);

                // If the next character is a quote, we may be in a string prefix. For example:
                // `f"foo`.
                if kind == SimpleTokenKind::Name
                    && matches!(self.cursor.first(), '"' | '\'')
                    && matches!(
                        &self.source[range],
                        "B" | "BR"
                            | "Br"
                            | "F"
                            | "FR"
                            | "Fr"
                            | "R"
                            | "RB"
                            | "RF"
                            | "Rb"
                            | "Rf"
                            | "U"
                            | "b"
                            | "bR"
                            | "br"
                            | "f"
                            | "fR"
                            | "fr"
                            | "r"
                            | "rB"
                            | "rF"
                            | "rb"
                            | "rf"
                            | "u"
                            | "T"
                            | "TR"
                            | "Tr"
                            | "RT"
                            | "Rt"
                            | "t"
                            | "tR"
                            | "tr"
                            | "rT"
                            | "rt"
                    )
                {
                    // This tokenizer can't lex strings: give up and go bogus.
                    self.bogus = true;
                    SimpleTokenKind::Other
                } else {
                    kind
                }
            }

            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
            // whitespace.
            ' ' | '\t' | '\x0C' => {
                self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }

            '\n' => SimpleTokenKind::Newline,

            '\r' => {
                self.cursor.eat_char('\n');
                SimpleTokenKind::Newline
            }

            '#' => {
                // A comment runs to (but does not include) the line ending.
                self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
                SimpleTokenKind::Comment
            }

            '\\' => SimpleTokenKind::Continuation,

            // Non-trivia, non-keyword tokens
            '=' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::EqEqual
                } else {
                    SimpleTokenKind::Equals
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PlusEqual
                } else {
                    SimpleTokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleStarEqual
                    } else {
                        SimpleTokenKind::DoubleStar
                    }
                } else {
                    SimpleTokenKind::Star
                }
            }
            '/' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleSlashEqual
                    } else {
                        SimpleTokenKind::DoubleSlash
                    }
                } else {
                    SimpleTokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PercentEqual
                } else {
                    SimpleTokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::VbarEqual
                } else {
                    SimpleTokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::CircumflexEqual
                } else {
                    SimpleTokenKind::Circumflex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AmperEqual
                } else {
                    SimpleTokenKind::Ampersand
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    SimpleTokenKind::RArrow
                } else {
                    SimpleTokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AtEqual
                } else {
                    SimpleTokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::NotEqual
                } else {
                    // A bare `!` isn't lexed as a token here; give up and go bogus.
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            '~' => SimpleTokenKind::Tilde,
            ':' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::ColonEqual
                } else {
                    SimpleTokenKind::Colon
                }
            }
            ';' => SimpleTokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::LeftShiftEqual
                    } else {
                        SimpleTokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::LessEqual
                } else {
                    SimpleTokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::RightShiftEqual
                    } else {
                        SimpleTokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::GreaterEqual
                } else {
                    SimpleTokenKind::Greater
                }
            }
            ',' => SimpleTokenKind::Comma,
            '.' => {
                if self.cursor.first() == '.' && self.cursor.second() == '.' {
                    self.cursor.bump();
                    self.cursor.bump();
                    SimpleTokenKind::Ellipsis
                } else {
                    SimpleTokenKind::Dot
                }
            }

            // Bracket tokens
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,

            _ => {
                // Anything else is unsupported; give up and go bogus.
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }

    /// Adapts the tokenizer into an iterator that yields only non-trivia tokens.
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
}
800
801impl Iterator for SimpleTokenizer<'_> {
802    type Item = SimpleToken;
803
804    fn next(&mut self) -> Option<Self::Item> {
805        let token = self.next_token();
806
807        if token.kind == SimpleTokenKind::EndOfFile {
808            None
809        } else {
810            Some(token)
811        }
812    }
813}
814
/// Simple zero allocation backwards tokenizer for finding preceding tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
/// It will fail when reaching a string.
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct BackwardsTokenizer<'a> {
    /// The absolute offset of the start of the tokenized range within `source`.
    offset: TextSize,
    /// The absolute offset of the (backwards-moving) cursor position.
    back_offset: TextSize,
    /// Not `&CommentRanges` to avoid a circular dependency.
    comment_ranges: &'a [TextRange],
    /// Set to `true` once the tokenizer hits something it can't lex; from then
    /// on only a single final [`SimpleTokenKind::Bogus`] token is emitted.
    bogus: bool,
    /// The full source document; used to slice out the token text.
    source: &'a str,
    /// Character cursor over the tokenized range of `source`.
    cursor: Cursor<'a>,
}
831
impl<'a> BackwardsTokenizer<'a> {
    /// Creates a backwards tokenizer over `range` of `source`.
    ///
    /// `comment_range` holds the ranges of the comments in `source`; it is used
    /// to skip over comments without re-lexing them. NOTE(review): assumed to be
    /// sorted by start offset, as required by `partition_point` — confirm with callers.
    pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            // Throw out any comments that follow the range.
            comment_ranges: &comment_range
                [..comment_range.partition_point(|comment| comment.start() <= range.end())],
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    /// Creates a tokenizer lexing backwards from `offset` towards the start of `source`.
    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
        Self::new(source, TextRange::up_to(offset), comment_range)
    }

    /// Adapts the tokenizer into an iterator that yields only non-trivia tokens.
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    /// Lexes the preceding token. Once the start of the range is reached,
    /// returns an empty [`SimpleTokenKind::EndOfFile`] token.
    pub fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();
        self.back_offset = self.cursor.text_len() + self.offset;

        let Some(last) = self.cursor.bump_back() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.back_offset),
            };
        };

        if self.bogus {
            // Emit a single final bogus token spanning everything up to here.
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::up_to(self.back_offset),
            };

            // Set the cursor to EOF
            self.cursor = Cursor::new("");
            self.back_offset = TextSize::new(0);
            return token;
        }

        if let Some(comment) = self
            .comment_ranges
            .last()
            .filter(|comment| comment.contains_inclusive(self.back_offset))
        {
            self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];

            // Skip the comment without iterating over the chars manually.
            self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
            debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
            return SimpleToken {
                kind: SimpleTokenKind::Comment,
                range: comment.range(),
            };
        }

        let kind = match last {
            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
            // whitespace. Note that this will lex-out trailing whitespace from a comment as
            // whitespace rather than as part of the comment token, but this shouldn't matter for
            // our use case.
            ' ' | '\t' | '\x0C' => {
                self.cursor
                    .eat_back_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }

            '\r' => SimpleTokenKind::Newline,
            '\n' => {
                // A `\r\n` pair is a single newline token.
                self.cursor.eat_char_back('\r');
                SimpleTokenKind::Newline
            }
            _ => self.next_token_inner(last),
        };

        let token_len = self.cursor.token_len();
        let start = self.back_offset - token_len;
        SimpleToken {
            kind,
            range: TextRange::at(start, token_len),
        }
    }

    /// Helper to parse the previous token once we skipped all whitespace
    fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
        match last {
            // Keywords and identifiers
            c if is_identifier_continuation(c) => {
                // if we only have identifier continuations but no start (e.g. 555) we
                // don't want to consume the chars, so in that case, we want to rewind the
                // cursor to here
                let savepoint = self.cursor.clone();
                self.cursor.eat_back_while(is_identifier_continuation);

                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                if self.source[range]
                    .chars()
                    .next()
                    .is_some_and(is_identifier_start)
                {
                    to_keyword_or_other(&self.source[range])
                } else {
                    self.cursor = savepoint;
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }

            // Non-trivia tokens that are unambiguous when lexing backwards.
            // In other words: these are characters that _don't_ appear at the
            // end of a multi-character token (like `!=`).
            '\\' => SimpleTokenKind::Continuation,
            ':' => SimpleTokenKind::Colon,
            '~' => SimpleTokenKind::Tilde,
            '%' => SimpleTokenKind::Percent,
            '|' => SimpleTokenKind::Vbar,
            ',' => SimpleTokenKind::Comma,
            ';' => SimpleTokenKind::Semi,
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            '&' => SimpleTokenKind::Ampersand,
            '^' => SimpleTokenKind::Circumflex,
            '+' => SimpleTokenKind::Plus,
            '-' => SimpleTokenKind::Minus,

            // Non-trivia tokens that _are_ ambiguous when lexing backwards.
            // In other words: these are characters that _might_ mark the end
            // of a multi-character token (like `!=` or `->` or `//` or `**`).
            '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                // This could be a single-token token, like `+` in `x + y`, or a
                // multi-character token, like `+=` in `x += y`. It could also be a sequence
                // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
                // important that we produce the same token stream when lexing backwards as
                // we do when lexing forwards. So, identify the range of the sequence, lex
                // forwards, and return the last token.
                let mut cursor = self.cursor.clone();
                cursor.eat_back_while(|c| {
                    matches!(
                        c,
                        ':' | '~'
                            | '%'
                            | '|'
                            | '&'
                            | '^'
                            | '+'
                            | '-'
                            | '='
                            | '*'
                            | '/'
                            | '@'
                            | '!'
                            | '<'
                            | '>'
                            | '.'
                    )
                });

                let token_len = cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                let forward_lexer = SimpleTokenizer::new(self.source, range);
                if let Some(token) = forward_lexer.last() {
                    // If the token spans multiple characters, bump the cursor. Note,
                    // though, that we already bumped the cursor to past the last character
                    // in the token at the very start of `next_token`.
                    for _ in self.source[token.range].chars().rev().skip(1) {
                        self.cursor.bump_back().unwrap();
                    }
                    token.kind()
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            _ => {
                // Anything else is unsupported; give up and go bogus.
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }
}
1024
1025impl Iterator for BackwardsTokenizer<'_> {
1026    type Item = SimpleToken;
1027
1028    fn next(&mut self) -> Option<Self::Item> {
1029        let token = self.next_token();
1030
1031        if token.kind == SimpleTokenKind::EndOfFile {
1032            None
1033        } else {
1034            Some(token)
1035        }
1036    }
1037}