1use unicode_ident::{is_xid_continue, is_xid_start};
2
3use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
4
5use crate::{Cursor, is_python_whitespace};
6
7pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
14 SimpleTokenizer::starts_at(offset, code)
15 .skip_trivia()
16 .next()
17}
18
19pub fn find_only_token_in_range(
23 range: TextRange,
24 token_kind: SimpleTokenKind,
25 code: &str,
26) -> SimpleToken {
27 let mut tokens = SimpleTokenizer::new(code, range)
28 .skip_trivia()
29 .skip_while(|token| token.kind == SimpleTokenKind::RParen);
30 let token = tokens.next().expect("Expected a token");
31 debug_assert_eq!(token.kind(), token_kind);
32 let mut tokens = tokens.skip_while(|token| token.kind == SimpleTokenKind::LParen);
33 #[expect(clippy::debug_assert_with_mut_call)]
34 {
35 debug_assert_eq!(tokens.next(), None);
36 }
37 token
38}
39
40pub fn lines_before(offset: TextSize, code: &str) -> u32 {
42 let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]);
43
44 let mut newlines = 0u32;
45 while let Some(c) = cursor.bump_back() {
46 match c {
47 '\n' => {
48 cursor.eat_char_back('\r');
49 newlines += 1;
50 }
51 '\r' => {
52 newlines += 1;
53 }
54 c if is_python_whitespace(c) => {
55 continue;
56 }
57 _ => {
58 break;
59 }
60 }
61 }
62
63 newlines
64}
65
66pub fn lines_after(offset: TextSize, code: &str) -> u32 {
68 let mut cursor = Cursor::new(&code[offset.to_usize()..]);
69
70 let mut newlines = 0u32;
71 while let Some(c) = cursor.bump() {
72 match c {
73 '\n' => {
74 newlines += 1;
75 }
76 '\r' => {
77 cursor.eat_char('\n');
78 newlines += 1;
79 }
80 c if is_python_whitespace(c) => {
81 continue;
82 }
83 _ => {
84 break;
85 }
86 }
87 }
88
89 newlines
90}
91
92pub fn lines_after_ignoring_trivia(offset: TextSize, code: &str) -> u32 {
95 let mut newlines = 0u32;
96 for token in SimpleTokenizer::starts_at(offset, code) {
97 match token.kind() {
98 SimpleTokenKind::Newline => {
99 newlines += 1;
100 }
101 SimpleTokenKind::Whitespace => {}
102 SimpleTokenKind::Comment => {
104 newlines = 0;
105 }
106 _ => {
108 break;
109 }
110 }
111 }
112 newlines
113}
114
115#[expect(clippy::cast_possible_truncation)]
118pub fn lines_after_ignoring_end_of_line_trivia(offset: TextSize, code: &str) -> u32 {
119 SimpleTokenizer::starts_at(offset, code)
121 .skip_while(|token| token.kind != SimpleTokenKind::Newline && token.kind.is_trivia())
122 .take_while(|token| {
123 token.kind == SimpleTokenKind::Newline || token.kind == SimpleTokenKind::Whitespace
124 })
125 .filter(|token| token.kind == SimpleTokenKind::Newline)
126 .count() as u32
127}
128
129fn is_identifier_start(c: char) -> bool {
130 if c.is_ascii() {
131 c.is_ascii_alphabetic() || c == '_'
132 } else {
133 is_xid_start(c)
134 }
135}
136
137fn is_identifier_continuation(c: char) -> bool {
140 if c.is_ascii() {
143 matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
144 } else {
145 is_xid_continue(c)
146 }
147}
148
149fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
150 match source {
151 "and" => SimpleTokenKind::And,
152 "as" => SimpleTokenKind::As,
153 "assert" => SimpleTokenKind::Assert,
154 "async" => SimpleTokenKind::Async,
155 "await" => SimpleTokenKind::Await,
156 "break" => SimpleTokenKind::Break,
157 "class" => SimpleTokenKind::Class,
158 "continue" => SimpleTokenKind::Continue,
159 "def" => SimpleTokenKind::Def,
160 "del" => SimpleTokenKind::Del,
161 "elif" => SimpleTokenKind::Elif,
162 "else" => SimpleTokenKind::Else,
163 "except" => SimpleTokenKind::Except,
164 "finally" => SimpleTokenKind::Finally,
165 "for" => SimpleTokenKind::For,
166 "from" => SimpleTokenKind::From,
167 "global" => SimpleTokenKind::Global,
168 "if" => SimpleTokenKind::If,
169 "import" => SimpleTokenKind::Import,
170 "in" => SimpleTokenKind::In,
171 "is" => SimpleTokenKind::Is,
172 "lambda" => SimpleTokenKind::Lambda,
173 "nonlocal" => SimpleTokenKind::Nonlocal,
174 "not" => SimpleTokenKind::Not,
175 "or" => SimpleTokenKind::Or,
176 "pass" => SimpleTokenKind::Pass,
177 "raise" => SimpleTokenKind::Raise,
178 "return" => SimpleTokenKind::Return,
179 "try" => SimpleTokenKind::Try,
180 "while" => SimpleTokenKind::While,
181 "match" => SimpleTokenKind::Match, "type" => SimpleTokenKind::Type, "case" => SimpleTokenKind::Case,
184 "with" => SimpleTokenKind::With,
185 "yield" => SimpleTokenKind::Yield,
186 _ => SimpleTokenKind::Name, }
188}
189
190#[derive(Clone, Debug, Eq, PartialEq, Hash)]
191pub struct SimpleToken {
192 pub kind: SimpleTokenKind,
193 pub range: TextRange,
194}
195
196impl SimpleToken {
197 pub const fn kind(&self) -> SimpleTokenKind {
198 self.kind
199 }
200}
201
202impl Ranged for SimpleToken {
203 fn range(&self) -> TextRange {
204 self.range
205 }
206}
207
/// The kind of a [`SimpleToken`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum SimpleTokenKind {
    /// A `#` comment, up to but excluding the line terminator.
    Comment,

    /// A run of spaces, tabs, and form feeds.
    Whitespace,

    /// Marks the end of the input; the tokenizer iterators turn it into `None`.
    EndOfFile,

    /// `\`
    Continuation,

    /// `\n`, `\r`, or `\r\n`
    Newline,

    /// `(`
    LParen,

    /// `)`
    RParen,

    /// `{`
    LBrace,

    /// `}`
    RBrace,

    /// `[`
    LBracket,

    /// `]`
    RBracket,

    /// `,`
    Comma,

    /// `:`
    Colon,

    /// `;`
    Semi,

    /// `/`
    Slash,

    /// `*`
    Star,

    /// `.`
    Dot,

    /// `+`
    Plus,

    /// `-`
    Minus,

    /// `=`
    Equals,

    /// `>`
    Greater,

    /// `<`
    Less,

    /// `%`
    Percent,

    /// `&`
    Ampersand,

    /// `^`
    Circumflex,

    /// `|`
    Vbar,

    /// `@`
    At,

    /// `~`
    Tilde,

    /// `==`
    EqEqual,

    /// `!=`
    NotEqual,

    /// `<=`
    LessEqual,

    /// `>=`
    GreaterEqual,

    /// `<<`
    LeftShift,

    /// `>>`
    RightShift,

    /// `**`
    DoubleStar,

    /// `**=`
    DoubleStarEqual,

    /// `+=`
    PlusEqual,

    /// `-=`
    MinusEqual,

    /// `*=`
    StarEqual,

    /// `/=`
    SlashEqual,

    /// `%=`
    PercentEqual,

    /// `&=`
    AmperEqual,

    /// `|=`
    VbarEqual,

    /// `^=`
    CircumflexEqual,

    /// `<<=`
    LeftShiftEqual,

    /// `>>=`
    RightShiftEqual,

    /// `//`
    DoubleSlash,

    /// `//=`
    DoubleSlashEqual,

    /// `:=`
    ColonEqual,

    /// `...`
    Ellipsis,

    /// `@=`
    AtEqual,

    /// `->`
    RArrow,

    /// `and`
    And,

    /// `as`
    As,

    /// `assert`
    Assert,

    /// `async`
    Async,

    /// `await`
    Await,

    /// `break`
    Break,

    /// `class`
    Class,

    /// `continue`
    Continue,

    /// `def`
    Def,

    /// `del`
    Del,

    /// `elif`
    Elif,

    /// `else`
    Else,

    /// `except`
    Except,

    /// `finally`
    Finally,

    /// `for`
    For,

    /// `from`
    From,

    /// `global`
    Global,

    /// `if`
    If,

    /// `import`
    Import,

    /// `in`
    In,

    /// `is`
    Is,

    /// `lambda`
    Lambda,

    /// `nonlocal`
    Nonlocal,

    /// `not`
    Not,

    /// `or`
    Or,

    /// `pass`
    Pass,

    /// `raise`
    Raise,

    /// `return`
    Return,

    /// `try`
    Try,

    /// `while`
    While,

    /// `match`
    Match,

    /// `type`
    Type,

    /// `case`
    Case,

    /// `with`
    With,

    /// `yield`
    Yield,

    /// An identifier-shaped lexeme that is not one of the keywords above.
    Name,

    /// A token the tokenizer cannot classify; once emitted, the tokenizer turns bogus.
    Other,

    /// The single final token emitted after [`SimpleTokenKind::Other`], covering the
    /// remainder of the input that was not lexed.
    Bogus,
}
480
481impl SimpleTokenKind {
482 pub const fn is_trivia(self) -> bool {
483 matches!(
484 self,
485 SimpleTokenKind::Whitespace
486 | SimpleTokenKind::Newline
487 | SimpleTokenKind::Comment
488 | SimpleTokenKind::Continuation
489 )
490 }
491
492 pub const fn is_comment(self) -> bool {
493 matches!(self, SimpleTokenKind::Comment)
494 }
495}
496
497pub struct SimpleTokenizer<'a> {
504 offset: TextSize,
505 bogus: bool,
507 source: &'a str,
508 cursor: Cursor<'a>,
509}
510
511impl<'a> SimpleTokenizer<'a> {
512 pub fn new(source: &'a str, range: TextRange) -> Self {
513 Self {
514 offset: range.start(),
515 bogus: false,
516 source,
517 cursor: Cursor::new(&source[range]),
518 }
519 }
520
521 pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
522 let range = TextRange::new(offset, source.text_len());
523 Self::new(source, range)
524 }
525
526 fn next_token(&mut self) -> SimpleToken {
527 self.cursor.start_token();
528
529 let Some(first) = self.cursor.bump() else {
530 return SimpleToken {
531 kind: SimpleTokenKind::EndOfFile,
532 range: TextRange::empty(self.offset),
533 };
534 };
535
536 if self.bogus {
537 let token = SimpleToken {
539 kind: SimpleTokenKind::Bogus,
540 range: TextRange::new(self.offset, self.source.text_len()),
541 };
542
543 self.cursor = Cursor::new("");
545 self.offset = self.source.text_len();
546 return token;
547 }
548
549 let kind = self.next_token_inner(first);
550
551 let token_len = self.cursor.token_len();
552
553 let token = SimpleToken {
554 kind,
555 range: TextRange::at(self.offset, token_len),
556 };
557
558 self.offset += token_len;
559
560 token
561 }
562
563 fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
564 match first {
565 c if is_identifier_start(c) => {
567 self.cursor.eat_while(is_identifier_continuation);
568 let token_len = self.cursor.token_len();
569
570 let range = TextRange::at(self.offset, token_len);
571 let kind = to_keyword_or_other(&self.source[range]);
572
573 if kind == SimpleTokenKind::Name
576 && matches!(self.cursor.first(), '"' | '\'')
577 && matches!(
578 &self.source[range],
579 "B" | "BR"
580 | "Br"
581 | "F"
582 | "FR"
583 | "Fr"
584 | "R"
585 | "RB"
586 | "RF"
587 | "Rb"
588 | "Rf"
589 | "U"
590 | "b"
591 | "bR"
592 | "br"
593 | "f"
594 | "fR"
595 | "fr"
596 | "r"
597 | "rB"
598 | "rF"
599 | "rb"
600 | "rf"
601 | "u"
602 | "T"
603 | "TR"
604 | "Tr"
605 | "RT"
606 | "Rt"
607 | "t"
608 | "tR"
609 | "tr"
610 | "rT"
611 | "rt"
612 )
613 {
614 self.bogus = true;
615 SimpleTokenKind::Other
616 } else {
617 kind
618 }
619 }
620
621 ' ' | '\t' | '\x0C' => {
624 self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
625 SimpleTokenKind::Whitespace
626 }
627
628 '\n' => SimpleTokenKind::Newline,
629
630 '\r' => {
631 self.cursor.eat_char('\n');
632 SimpleTokenKind::Newline
633 }
634
635 '#' => {
636 self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
637 SimpleTokenKind::Comment
638 }
639
640 '\\' => SimpleTokenKind::Continuation,
641
642 '=' => {
644 if self.cursor.eat_char('=') {
645 SimpleTokenKind::EqEqual
646 } else {
647 SimpleTokenKind::Equals
648 }
649 }
650 '+' => {
651 if self.cursor.eat_char('=') {
652 SimpleTokenKind::PlusEqual
653 } else {
654 SimpleTokenKind::Plus
655 }
656 }
657 '*' => {
658 if self.cursor.eat_char('=') {
659 SimpleTokenKind::StarEqual
660 } else if self.cursor.eat_char('*') {
661 if self.cursor.eat_char('=') {
662 SimpleTokenKind::DoubleStarEqual
663 } else {
664 SimpleTokenKind::DoubleStar
665 }
666 } else {
667 SimpleTokenKind::Star
668 }
669 }
670 '/' => {
671 if self.cursor.eat_char('=') {
672 SimpleTokenKind::SlashEqual
673 } else if self.cursor.eat_char('/') {
674 if self.cursor.eat_char('=') {
675 SimpleTokenKind::DoubleSlashEqual
676 } else {
677 SimpleTokenKind::DoubleSlash
678 }
679 } else {
680 SimpleTokenKind::Slash
681 }
682 }
683 '%' => {
684 if self.cursor.eat_char('=') {
685 SimpleTokenKind::PercentEqual
686 } else {
687 SimpleTokenKind::Percent
688 }
689 }
690 '|' => {
691 if self.cursor.eat_char('=') {
692 SimpleTokenKind::VbarEqual
693 } else {
694 SimpleTokenKind::Vbar
695 }
696 }
697 '^' => {
698 if self.cursor.eat_char('=') {
699 SimpleTokenKind::CircumflexEqual
700 } else {
701 SimpleTokenKind::Circumflex
702 }
703 }
704 '&' => {
705 if self.cursor.eat_char('=') {
706 SimpleTokenKind::AmperEqual
707 } else {
708 SimpleTokenKind::Ampersand
709 }
710 }
711 '-' => {
712 if self.cursor.eat_char('=') {
713 SimpleTokenKind::MinusEqual
714 } else if self.cursor.eat_char('>') {
715 SimpleTokenKind::RArrow
716 } else {
717 SimpleTokenKind::Minus
718 }
719 }
720 '@' => {
721 if self.cursor.eat_char('=') {
722 SimpleTokenKind::AtEqual
723 } else {
724 SimpleTokenKind::At
725 }
726 }
727 '!' => {
728 if self.cursor.eat_char('=') {
729 SimpleTokenKind::NotEqual
730 } else {
731 self.bogus = true;
732 SimpleTokenKind::Other
733 }
734 }
735 '~' => SimpleTokenKind::Tilde,
736 ':' => {
737 if self.cursor.eat_char('=') {
738 SimpleTokenKind::ColonEqual
739 } else {
740 SimpleTokenKind::Colon
741 }
742 }
743 ';' => SimpleTokenKind::Semi,
744 '<' => {
745 if self.cursor.eat_char('<') {
746 if self.cursor.eat_char('=') {
747 SimpleTokenKind::LeftShiftEqual
748 } else {
749 SimpleTokenKind::LeftShift
750 }
751 } else if self.cursor.eat_char('=') {
752 SimpleTokenKind::LessEqual
753 } else {
754 SimpleTokenKind::Less
755 }
756 }
757 '>' => {
758 if self.cursor.eat_char('>') {
759 if self.cursor.eat_char('=') {
760 SimpleTokenKind::RightShiftEqual
761 } else {
762 SimpleTokenKind::RightShift
763 }
764 } else if self.cursor.eat_char('=') {
765 SimpleTokenKind::GreaterEqual
766 } else {
767 SimpleTokenKind::Greater
768 }
769 }
770 ',' => SimpleTokenKind::Comma,
771 '.' => {
772 if self.cursor.first() == '.' && self.cursor.second() == '.' {
773 self.cursor.bump();
774 self.cursor.bump();
775 SimpleTokenKind::Ellipsis
776 } else {
777 SimpleTokenKind::Dot
778 }
779 }
780
781 '(' => SimpleTokenKind::LParen,
783 ')' => SimpleTokenKind::RParen,
784 '[' => SimpleTokenKind::LBracket,
785 ']' => SimpleTokenKind::RBracket,
786 '{' => SimpleTokenKind::LBrace,
787 '}' => SimpleTokenKind::RBrace,
788
789 _ => {
790 self.bogus = true;
791 SimpleTokenKind::Other
792 }
793 }
794 }
795
796 pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
797 self.filter(|t| !t.kind().is_trivia())
798 }
799}
800
801impl Iterator for SimpleTokenizer<'_> {
802 type Item = SimpleToken;
803
804 fn next(&mut self) -> Option<Self::Item> {
805 let token = self.next_token();
806
807 if token.kind == SimpleTokenKind::EndOfFile {
808 None
809 } else {
810 Some(token)
811 }
812 }
813}
814
815pub struct BackwardsTokenizer<'a> {
823 offset: TextSize,
824 back_offset: TextSize,
825 comment_ranges: &'a [TextRange],
827 bogus: bool,
828 source: &'a str,
829 cursor: Cursor<'a>,
830}
831
832impl<'a> BackwardsTokenizer<'a> {
833 pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
834 Self {
835 offset: range.start(),
836 back_offset: range.end(),
837 comment_ranges: &comment_range
839 [..comment_range.partition_point(|comment| comment.start() <= range.end())],
840 bogus: false,
841 source,
842 cursor: Cursor::new(&source[range]),
843 }
844 }
845
846 pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
847 Self::new(source, TextRange::up_to(offset), comment_range)
848 }
849
850 pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
851 self.filter(|t| !t.kind().is_trivia())
852 }
853
854 pub fn next_token(&mut self) -> SimpleToken {
855 self.cursor.start_token();
856 self.back_offset = self.cursor.text_len() + self.offset;
857
858 let Some(last) = self.cursor.bump_back() else {
859 return SimpleToken {
860 kind: SimpleTokenKind::EndOfFile,
861 range: TextRange::empty(self.back_offset),
862 };
863 };
864
865 if self.bogus {
866 let token = SimpleToken {
867 kind: SimpleTokenKind::Bogus,
868 range: TextRange::up_to(self.back_offset),
869 };
870
871 self.cursor = Cursor::new("");
873 self.back_offset = TextSize::new(0);
874 return token;
875 }
876
877 if let Some(comment) = self
878 .comment_ranges
879 .last()
880 .filter(|comment| comment.contains_inclusive(self.back_offset))
881 {
882 self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];
883
884 self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
886 debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
887 return SimpleToken {
888 kind: SimpleTokenKind::Comment,
889 range: comment.range(),
890 };
891 }
892
893 let kind = match last {
894 ' ' | '\t' | '\x0C' => {
899 self.cursor
900 .eat_back_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
901 SimpleTokenKind::Whitespace
902 }
903
904 '\r' => SimpleTokenKind::Newline,
905 '\n' => {
906 self.cursor.eat_char_back('\r');
907 SimpleTokenKind::Newline
908 }
909 _ => self.next_token_inner(last),
910 };
911
912 let token_len = self.cursor.token_len();
913 let start = self.back_offset - token_len;
914 SimpleToken {
915 kind,
916 range: TextRange::at(start, token_len),
917 }
918 }
919
920 fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
922 match last {
923 c if is_identifier_continuation(c) => {
925 let savepoint = self.cursor.clone();
929 self.cursor.eat_back_while(is_identifier_continuation);
930
931 let token_len = self.cursor.token_len();
932 let range = TextRange::at(self.back_offset - token_len, token_len);
933
934 if self.source[range]
935 .chars()
936 .next()
937 .is_some_and(is_identifier_start)
938 {
939 to_keyword_or_other(&self.source[range])
940 } else {
941 self.cursor = savepoint;
942 self.bogus = true;
943 SimpleTokenKind::Other
944 }
945 }
946
947 '\\' => SimpleTokenKind::Continuation,
951 ':' => SimpleTokenKind::Colon,
952 '~' => SimpleTokenKind::Tilde,
953 '%' => SimpleTokenKind::Percent,
954 '|' => SimpleTokenKind::Vbar,
955 ',' => SimpleTokenKind::Comma,
956 ';' => SimpleTokenKind::Semi,
957 '(' => SimpleTokenKind::LParen,
958 ')' => SimpleTokenKind::RParen,
959 '[' => SimpleTokenKind::LBracket,
960 ']' => SimpleTokenKind::RBracket,
961 '{' => SimpleTokenKind::LBrace,
962 '}' => SimpleTokenKind::RBrace,
963 '&' => SimpleTokenKind::Ampersand,
964 '^' => SimpleTokenKind::Circumflex,
965 '+' => SimpleTokenKind::Plus,
966 '-' => SimpleTokenKind::Minus,
967
968 '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
972 let mut cursor = self.cursor.clone();
979 cursor.eat_back_while(|c| {
980 matches!(
981 c,
982 ':' | '~'
983 | '%'
984 | '|'
985 | '&'
986 | '^'
987 | '+'
988 | '-'
989 | '='
990 | '*'
991 | '/'
992 | '@'
993 | '!'
994 | '<'
995 | '>'
996 | '.'
997 )
998 });
999
1000 let token_len = cursor.token_len();
1001 let range = TextRange::at(self.back_offset - token_len, token_len);
1002
1003 let forward_lexer = SimpleTokenizer::new(self.source, range);
1004 if let Some(token) = forward_lexer.last() {
1005 for _ in self.source[token.range].chars().rev().skip(1) {
1009 self.cursor.bump_back().unwrap();
1010 }
1011 token.kind()
1012 } else {
1013 self.bogus = true;
1014 SimpleTokenKind::Other
1015 }
1016 }
1017 _ => {
1018 self.bogus = true;
1019 SimpleTokenKind::Other
1020 }
1021 }
1022 }
1023}
1024
1025impl Iterator for BackwardsTokenizer<'_> {
1026 type Item = SimpleToken;
1027
1028 fn next(&mut self) -> Option<Self::Item> {
1029 let token = self.next_token();
1030
1031 if token.kind == SimpleTokenKind::EndOfFile {
1032 None
1033 } else {
1034 Some(token)
1035 }
1036 }
1037}