// rustpython_ruff_python_trivia/tokenizer.rs

1use unicode_ident::{is_xid_continue, is_xid_start};
2
3use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
4
5use crate::{Cursor, is_python_whitespace};
6
7/// Searches for the first non-trivia character after `offset`.
8///
9/// The search skips over any whitespace and comments.
10///
11/// Returns `Some` if the source code after `offset` contains any non-trivia character.///
12/// Returns `None` if the text after `offset` is empty or only contains trivia (whitespace or comments).
13pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
14    SimpleTokenizer::starts_at(offset, code)
15        .skip_trivia()
16        .next()
17}
18
19/// Returns the only non-trivia, non-closing parenthesis token in `range`.
20///
21/// Includes debug assertions that the range only contains that single token.
22pub fn find_only_token_in_range(
23    range: TextRange,
24    token_kind: SimpleTokenKind,
25    code: &str,
26) -> SimpleToken {
27    let mut tokens = SimpleTokenizer::new(code, range)
28        .skip_trivia()
29        .skip_while(|token| token.kind == SimpleTokenKind::RParen);
30    let token = tokens.next().expect("Expected a token");
31    debug_assert_eq!(token.kind(), token_kind);
32    let mut tokens = tokens.skip_while(|token| token.kind == SimpleTokenKind::LParen);
33    #[expect(clippy::debug_assert_with_mut_call)]
34    {
35        debug_assert_eq!(tokens.next(), None);
36    }
37    token
38}
39
40/// Returns the number of newlines between `offset` and the first non whitespace character in the source code.
41pub fn lines_before(offset: TextSize, code: &str) -> u32 {
42    let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]);
43
44    let mut newlines = 0u32;
45    while let Some(c) = cursor.bump_back() {
46        match c {
47            '\n' => {
48                cursor.eat_char_back('\r');
49                newlines += 1;
50            }
51            '\r' => {
52                newlines += 1;
53            }
54            c if is_python_whitespace(c) => {
55                continue;
56            }
57            _ => {
58                break;
59            }
60        }
61    }
62
63    newlines
64}
65
66/// Counts the empty lines between `offset` and the first non-whitespace character.
67pub fn lines_after(offset: TextSize, code: &str) -> u32 {
68    let mut cursor = Cursor::new(&code[offset.to_usize()..]);
69
70    let mut newlines = 0u32;
71    while let Some(c) = cursor.bump() {
72        match c {
73            '\n' => {
74                newlines += 1;
75            }
76            '\r' => {
77                cursor.eat_char('\n');
78                newlines += 1;
79            }
80            c if is_python_whitespace(c) => {
81                continue;
82            }
83            _ => {
84                break;
85            }
86        }
87    }
88
89    newlines
90}
91
92/// Counts the empty lines after `offset`, ignoring any trailing trivia: end-of-line comments,
93/// own-line comments, and any intermediary newlines.
94pub fn lines_after_ignoring_trivia(offset: TextSize, code: &str) -> u32 {
95    let mut newlines = 0u32;
96    for token in SimpleTokenizer::starts_at(offset, code) {
97        match token.kind() {
98            SimpleTokenKind::Newline => {
99                newlines += 1;
100            }
101            SimpleTokenKind::Whitespace => {}
102            // If we see a comment, reset the newlines counter.
103            SimpleTokenKind::Comment => {
104                newlines = 0;
105            }
106            // As soon as we see a non-trivia token, we're done.
107            _ => {
108                break;
109            }
110        }
111    }
112    newlines
113}
114
115/// Counts the empty lines after `offset`, ignoring any trailing trivia on the same line as
116/// `offset`.
117#[expect(clippy::cast_possible_truncation)]
118pub fn lines_after_ignoring_end_of_line_trivia(offset: TextSize, code: &str) -> u32 {
119    // SAFETY: We don't support files greater than 4GB, so casting to u32 is safe.
120    SimpleTokenizer::starts_at(offset, code)
121        .skip_while(|token| token.kind != SimpleTokenKind::Newline && token.kind.is_trivia())
122        .take_while(|token| {
123            token.kind == SimpleTokenKind::Newline || token.kind == SimpleTokenKind::Whitespace
124        })
125        .filter(|token| token.kind == SimpleTokenKind::Newline)
126        .count() as u32
127}
128
129fn is_identifier_start(c: char) -> bool {
130    if c.is_ascii() {
131        c.is_ascii_alphabetic() || c == '_'
132    } else {
133        is_xid_start(c)
134    }
135}
136
137// Checks if the character c is a valid continuation character as described
138// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
139fn is_identifier_continuation(c: char) -> bool {
140    // Arrange things such that ASCII codepoints never
141    // result in the slower `is_xid_continue` getting called.
142    if c.is_ascii() {
143        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
144    } else {
145        is_xid_continue(c)
146    }
147}
148
/// Maps an identifier-like lexeme to its keyword token kind, or
/// [`SimpleTokenKind::Name`] if it is not a (soft) keyword.
///
/// Soft keywords (`lazy`, `match`, `type`) are always lexed as keywords here;
/// the caller decides, based on context, whether to treat them as identifiers.
fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
    match source {
        "and" => SimpleTokenKind::And,
        "as" => SimpleTokenKind::As,
        "assert" => SimpleTokenKind::Assert,
        "async" => SimpleTokenKind::Async,
        "await" => SimpleTokenKind::Await,
        "break" => SimpleTokenKind::Break,
        "class" => SimpleTokenKind::Class,
        "continue" => SimpleTokenKind::Continue,
        "def" => SimpleTokenKind::Def,
        "del" => SimpleTokenKind::Del,
        "elif" => SimpleTokenKind::Elif,
        "else" => SimpleTokenKind::Else,
        "except" => SimpleTokenKind::Except,
        "finally" => SimpleTokenKind::Finally,
        "for" => SimpleTokenKind::For,
        "from" => SimpleTokenKind::From,
        "global" => SimpleTokenKind::Global,
        "if" => SimpleTokenKind::If,
        "import" => SimpleTokenKind::Import,
        "in" => SimpleTokenKind::In,
        "is" => SimpleTokenKind::Is,
        "lazy" => SimpleTokenKind::Lazy, // soft keyword
        "lambda" => SimpleTokenKind::Lambda,
        "nonlocal" => SimpleTokenKind::Nonlocal,
        "not" => SimpleTokenKind::Not,
        "or" => SimpleTokenKind::Or,
        "pass" => SimpleTokenKind::Pass,
        "raise" => SimpleTokenKind::Raise,
        "return" => SimpleTokenKind::Return,
        "try" => SimpleTokenKind::Try,
        "while" => SimpleTokenKind::While,
        "match" => SimpleTokenKind::Match, // soft keyword
        "type" => SimpleTokenKind::Type, // soft keyword
        "case" => SimpleTokenKind::Case,
        "with" => SimpleTokenKind::With,
        "yield" => SimpleTokenKind::Yield,
        // Potentially an identifier, but only if it isn't a string prefix.
        // The caller (`SimpleTokenizer`) is responsible for enforcing that constraint.
        _ => SimpleTokenKind::Name,
    }
}
190
/// A token produced by [`SimpleTokenizer`] or [`BackwardsTokenizer`]: its kind
/// and its byte range in the source document.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    // The kind of the token.
    pub kind: SimpleTokenKind,
    // The byte range of the token in the source document.
    pub range: TextRange,
}
196
impl SimpleToken {
    /// Returns the [`SimpleTokenKind`] of this token.
    pub const fn kind(&self) -> SimpleTokenKind {
        self.kind
    }
}
202
impl Ranged for SimpleToken {
    // The token's byte range in the source document.
    fn range(&self) -> TextRange {
        self.range
    }
}
208
/// The kind of a [`SimpleToken`].
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
    /// A comment, not including the trailing new line.
    Comment,

    /// Sequence of ' ', '\t', or form feed ('\x0C')
    Whitespace,

    /// Start or end of the file
    EndOfFile,

    /// `\\`
    Continuation,

    /// `\n` or `\r` or `\r\n`
    Newline,

    /// `(`
    LParen,

    /// `)`
    RParen,

    /// `{`
    LBrace,

    /// `}`
    RBrace,

    /// `[`
    LBracket,

    /// `]`
    RBracket,

    /// `,`
    Comma,

    /// `:`
    Colon,

    /// `;`
    Semi,

    /// `/`
    Slash,

    /// `*`
    Star,

    /// `.`
    Dot,

    /// `+`
    Plus,

    /// `-`
    Minus,

    /// `=`
    Equals,

    /// `>`
    Greater,

    /// `<`
    Less,

    /// `%`
    Percent,

    /// `&`
    Ampersand,

    /// `^`
    Circumflex,

    /// `|`
    Vbar,

    /// `@`
    At,

    /// `~`
    Tilde,

    /// `==`
    EqEqual,

    /// `!=`
    NotEqual,

    /// `<=`
    LessEqual,

    /// `>=`
    GreaterEqual,

    /// `<<`
    LeftShift,

    /// `>>`
    RightShift,

    /// `**`
    DoubleStar,

    /// `**=`
    DoubleStarEqual,

    /// `+=`
    PlusEqual,

    /// `-=`
    MinusEqual,

    /// `*=`
    StarEqual,

    /// `/=`
    SlashEqual,

    /// `%=`
    PercentEqual,

    /// `&=`
    AmperEqual,

    /// `|=`
    VbarEqual,

    /// `^=`
    CircumflexEqual,

    /// `<<=`
    LeftShiftEqual,

    /// `>>=`
    RightShiftEqual,

    /// `//`
    DoubleSlash,

    /// `//=`
    DoubleSlashEqual,

    /// `:=`
    ColonEqual,

    /// `...`
    Ellipsis,

    /// `@=`
    AtEqual,

    /// `->`
    RArrow,

    /// `and`
    And,

    /// `as`
    As,

    /// `assert`
    Assert,

    /// `async`
    Async,

    /// `await`
    Await,

    /// `break`
    Break,

    /// `class`
    Class,

    /// `continue`
    Continue,

    /// `def`
    Def,

    /// `del`
    Del,

    /// `elif`
    Elif,

    /// `else`
    Else,

    /// `except`
    Except,

    /// `finally`
    Finally,

    /// `for`
    For,

    /// `from`
    From,

    /// `global`
    Global,

    /// `if`
    If,

    /// `import`
    Import,

    /// `in`
    In,

    /// `is`
    Is,

    /// `lambda`
    Lambda,

    /// `nonlocal`
    Nonlocal,

    /// `not`
    Not,

    /// `or`
    Or,

    /// `pass`
    Pass,

    /// `raise`
    Raise,

    /// `return`
    Return,

    /// `try`
    Try,

    /// `while`
    While,

    /// `lazy`
    Lazy,

    /// `match`
    Match,

    /// `type`
    Type,

    /// `case`
    Case,

    /// `with`
    With,

    /// `yield`
    Yield,

    /// An identifier or keyword.
    Name,

    /// Any other non trivia token.
    Other,

    /// Returned after [`SimpleTokenKind::Other`] has been returned once; a
    /// single token covering all remaining text up to the end of the range.
    Bogus,
}
484
485impl SimpleTokenKind {
486    pub const fn is_trivia(self) -> bool {
487        matches!(
488            self,
489            SimpleTokenKind::Whitespace
490                | SimpleTokenKind::Newline
491                | SimpleTokenKind::Comment
492                | SimpleTokenKind::Continuation
493        )
494    }
495
496    pub const fn is_comment(self) -> bool {
497        matches!(self, SimpleTokenKind::Comment)
498    }
499}
500
/// Simple zero allocation tokenizer handling most tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct SimpleTokenizer<'a> {
    // Absolute offset of the cursor position within `source`.
    offset: TextSize,
    /// `true` once the tokenizer has encountered something it can't parse;
    /// the next call emits a single [`SimpleTokenKind::Bogus`] token covering
    /// the rest of the source.
    bogus: bool,
    // The entire source document; `cursor` iterates only over the requested range.
    source: &'a str,
    cursor: Cursor<'a>,
}
514
impl<'a> SimpleTokenizer<'a> {
    /// Creates a tokenizer that lexes `source` within `range`.
    pub fn new(source: &'a str, range: TextRange) -> Self {
        Self {
            offset: range.start(),
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    /// Creates a tokenizer that lexes from `offset` to the end of `source`.
    pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
        let range = TextRange::new(offset, source.text_len());
        Self::new(source, range)
    }

    /// Lexes the next token.
    ///
    /// Returns an empty [`SimpleTokenKind::EndOfFile`] token when the cursor
    /// is exhausted, and a single [`SimpleTokenKind::Bogus`] token covering
    /// the rest of the source once `bogus` has been set.
    fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();

        let Some(first) = self.cursor.bump() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.offset),
            };
        };

        if self.bogus {
            // Emit a single final bogus token spanning the rest of the source.
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::new(self.offset, self.source.text_len()),
            };

            // Set the cursor to EOF so the next call returns `EndOfFile`.
            self.cursor = Cursor::new("");
            self.offset = self.source.text_len();
            return token;
        }

        let kind = self.next_token_inner(first);

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }

    /// Determines the token kind given its first character, consuming any
    /// further characters that belong to the token.
    fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
        match first {
            // Keywords and identifiers
            c if is_identifier_start(c) => {
                self.cursor.eat_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();

                let range = TextRange::at(self.offset, token_len);
                let kind = to_keyword_or_other(&self.source[range]);

                // If the next character is a quote, we may be in a string prefix. For example:
                // `f"foo`. Strings aren't supported by this tokenizer, so give up.
                if kind == SimpleTokenKind::Name
                    && matches!(self.cursor.first(), '"' | '\'')
                    && matches!(
                        &self.source[range],
                        "B" | "BR"
                            | "Br"
                            | "F"
                            | "FR"
                            | "Fr"
                            | "R"
                            | "RB"
                            | "RF"
                            | "Rb"
                            | "Rf"
                            | "U"
                            | "b"
                            | "bR"
                            | "br"
                            | "f"
                            | "fR"
                            | "fr"
                            | "r"
                            | "rB"
                            | "rF"
                            | "rb"
                            | "rf"
                            | "u"
                            | "T"
                            | "TR"
                            | "Tr"
                            | "RT"
                            | "Rt"
                            | "t"
                            | "tR"
                            | "tr"
                            | "rT"
                            | "rt"
                    )
                {
                    self.bogus = true;
                    SimpleTokenKind::Other
                } else {
                    kind
                }
            }

            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
            // whitespace.
            ' ' | '\t' | '\x0C' => {
                self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }

            '\n' => SimpleTokenKind::Newline,

            '\r' => {
                // `\r\n` is a single newline token.
                self.cursor.eat_char('\n');
                SimpleTokenKind::Newline
            }

            '#' => {
                // A comment extends to (but does not include) the line ending.
                self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
                SimpleTokenKind::Comment
            }

            '\\' => SimpleTokenKind::Continuation,

            // Non-trivia, non-keyword tokens
            '=' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::EqEqual
                } else {
                    SimpleTokenKind::Equals
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PlusEqual
                } else {
                    SimpleTokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleStarEqual
                    } else {
                        SimpleTokenKind::DoubleStar
                    }
                } else {
                    SimpleTokenKind::Star
                }
            }
            '/' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleSlashEqual
                    } else {
                        SimpleTokenKind::DoubleSlash
                    }
                } else {
                    SimpleTokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PercentEqual
                } else {
                    SimpleTokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::VbarEqual
                } else {
                    SimpleTokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::CircumflexEqual
                } else {
                    SimpleTokenKind::Circumflex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AmperEqual
                } else {
                    SimpleTokenKind::Ampersand
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    SimpleTokenKind::RArrow
                } else {
                    SimpleTokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AtEqual
                } else {
                    SimpleTokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::NotEqual
                } else {
                    // A bare `!` is not valid Python; give up.
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            '~' => SimpleTokenKind::Tilde,
            ':' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::ColonEqual
                } else {
                    SimpleTokenKind::Colon
                }
            }
            ';' => SimpleTokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::LeftShiftEqual
                    } else {
                        SimpleTokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::LessEqual
                } else {
                    SimpleTokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::RightShiftEqual
                    } else {
                        SimpleTokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::GreaterEqual
                } else {
                    SimpleTokenKind::Greater
                }
            }
            ',' => SimpleTokenKind::Comma,
            '.' => {
                // `...` is the ellipsis token; a lone `.` is a dot.
                if self.cursor.first() == '.' && self.cursor.second() == '.' {
                    self.cursor.bump();
                    self.cursor.bump();
                    SimpleTokenKind::Ellipsis
                } else {
                    SimpleTokenKind::Dot
                }
            }

            // Bracket tokens
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,

            // Anything else (string quotes, digits, ...) is unsupported; give up.
            _ => {
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }

    /// Returns an iterator over the remaining tokens, skipping all trivia
    /// (whitespace, newlines, comments, and continuations).
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
}
804
805impl Iterator for SimpleTokenizer<'_> {
806    type Item = SimpleToken;
807
808    fn next(&mut self) -> Option<Self::Item> {
809        let token = self.next_token();
810
811        if token.kind == SimpleTokenKind::EndOfFile {
812            None
813        } else {
814            Some(token)
815        }
816    }
817}
818
/// Simple zero allocation backwards tokenizer for finding preceding tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
/// It will fail when reaching a string.
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct BackwardsTokenizer<'a> {
    // Absolute start of the tokenized range within `source`.
    offset: TextSize,
    // Absolute offset of the current (backwards-moving) position within `source`.
    back_offset: TextSize,
    /// Not `&CommentRanges` to avoid a circular dependency.
    comment_ranges: &'a [TextRange],
    // `true` once an unparseable token was found; the next call emits a single
    // `Bogus` token covering the remaining text.
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}
835
impl<'a> BackwardsTokenizer<'a> {
    /// Creates a backwards tokenizer over `range`, using `comment_range` to
    /// recognize comments (which can't be detected when lexing backwards).
    pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            // Throw out any comments that follow the range.
            comment_ranges: &comment_range
                [..comment_range.partition_point(|comment| comment.start() <= range.end())],
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    /// Creates a backwards tokenizer over everything up to `offset`.
    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
        Self::new(source, TextRange::up_to(offset), comment_range)
    }

    /// Returns an iterator over the remaining tokens, skipping all trivia
    /// (whitespace, newlines, comments, and continuations).
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    /// Lexes the preceding token.
    ///
    /// Returns an empty [`SimpleTokenKind::EndOfFile`] token when the cursor
    /// is exhausted, and a single [`SimpleTokenKind::Bogus`] token once
    /// `bogus` has been set.
    pub fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();
        self.back_offset = self.cursor.text_len() + self.offset;

        let Some(last) = self.cursor.bump_back() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.back_offset),
            };
        };

        if self.bogus {
            // Emit a single final bogus token covering everything before the
            // current position.
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::up_to(self.back_offset),
            };

            // Set the cursor to EOF
            self.cursor = Cursor::new("");
            self.back_offset = TextSize::new(0);
            return token;
        }

        // Comments can't be lexed backwards (the `#` is at the far end), so
        // consult the precomputed comment ranges instead.
        if let Some(comment) = self
            .comment_ranges
            .last()
            .filter(|comment| comment.contains_inclusive(self.back_offset))
        {
            self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];

            // Skip the comment without iterating over the chars manually.
            self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
            debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
            return SimpleToken {
                kind: SimpleTokenKind::Comment,
                range: comment.range(),
            };
        }

        let kind = match last {
            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
            // whitespace. Note that this will lex-out trailing whitespace from a comment as
            // whitespace rather than as part of the comment token, but this shouldn't matter for
            // our use case.
            ' ' | '\t' | '\x0C' => {
                self.cursor
                    .eat_back_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }

            '\r' => SimpleTokenKind::Newline,
            '\n' => {
                // `\r\n` is a single newline token.
                self.cursor.eat_char_back('\r');
                SimpleTokenKind::Newline
            }
            _ => self.next_token_inner(last),
        };

        let token_len = self.cursor.token_len();
        let start = self.back_offset - token_len;
        SimpleToken {
            kind,
            range: TextRange::at(start, token_len),
        }
    }

    /// Helper to parse the previous token once we skipped all whitespace
    fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
        match last {
            // Keywords and identifiers
            c if is_identifier_continuation(c) => {
                // if we only have identifier continuations but no start (e.g. 555) we
                // don't want to consume the chars, so in that case, we want to rewind the
                // cursor to here
                let savepoint = self.cursor.clone();
                self.cursor.eat_back_while(is_identifier_continuation);

                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                if self.source[range]
                    .chars()
                    .next()
                    .is_some_and(is_identifier_start)
                {
                    to_keyword_or_other(&self.source[range])
                } else {
                    self.cursor = savepoint;
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }

            // Non-trivia tokens that are unambiguous when lexing backwards.
            // In other words: these are characters that _don't_ appear at the
            // end of a multi-character token (like `!=`).
            '\\' => SimpleTokenKind::Continuation,
            ':' => SimpleTokenKind::Colon,
            '~' => SimpleTokenKind::Tilde,
            '%' => SimpleTokenKind::Percent,
            '|' => SimpleTokenKind::Vbar,
            ',' => SimpleTokenKind::Comma,
            ';' => SimpleTokenKind::Semi,
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            '&' => SimpleTokenKind::Ampersand,
            '^' => SimpleTokenKind::Circumflex,
            '+' => SimpleTokenKind::Plus,
            '-' => SimpleTokenKind::Minus,

            // Non-trivia tokens that _are_ ambiguous when lexing backwards.
            // In other words: these are characters that _might_ mark the end
            // of a multi-character token (like `!=` or `->` or `//` or `**`).
            '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                // This could be a single-token token, like `+` in `x + y`, or a
                // multi-character token, like `+=` in `x += y`. It could also be a sequence
                // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
                // important that we produce the same token stream when lexing backwards as
                // we do when lexing forwards. So, identify the range of the sequence, lex
                // forwards, and return the last token.
                let mut cursor = self.cursor.clone();
                cursor.eat_back_while(|c| {
                    matches!(
                        c,
                        ':' | '~'
                            | '%'
                            | '|'
                            | '&'
                            | '^'
                            | '+'
                            | '-'
                            | '='
                            | '*'
                            | '/'
                            | '@'
                            | '!'
                            | '<'
                            | '>'
                            | '.'
                    )
                });

                let token_len = cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                let forward_lexer = SimpleTokenizer::new(self.source, range);
                if let Some(token) = forward_lexer.last() {
                    // If the token spans multiple characters, bump the cursor. Note,
                    // though, that we already bumped the cursor to past the last character
                    // in the token at the very start of `next_token`.
                    for _ in self.source[token.range].chars().rev().skip(1) {
                        self.cursor.bump_back().unwrap();
                    }
                    token.kind()
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            // Anything else (string quotes, ...) is unsupported; give up.
            _ => {
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }
}
1028
1029impl Iterator for BackwardsTokenizer<'_> {
1030    type Item = SimpleToken;
1031
1032    fn next(&mut self) -> Option<Self::Item> {
1033        let token = self.next_token();
1034
1035        if token.kind == SimpleTokenKind::EndOfFile {
1036            None
1037        } else {
1038            Some(token)
1039        }
1040    }
1041}