1use unicode_ident::{is_xid_continue, is_xid_start};
2
3use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
4
5use crate::{Cursor, is_python_whitespace};
6
7pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
14 SimpleTokenizer::starts_at(offset, code)
15 .skip_trivia()
16 .next()
17}
18
19pub fn find_only_token_in_range(
23 range: TextRange,
24 token_kind: SimpleTokenKind,
25 code: &str,
26) -> SimpleToken {
27 let mut tokens = SimpleTokenizer::new(code, range)
28 .skip_trivia()
29 .skip_while(|token| token.kind == SimpleTokenKind::RParen);
30 let token = tokens.next().expect("Expected a token");
31 debug_assert_eq!(token.kind(), token_kind);
32 let mut tokens = tokens.skip_while(|token| token.kind == SimpleTokenKind::LParen);
33 #[expect(clippy::debug_assert_with_mut_call)]
34 {
35 debug_assert_eq!(tokens.next(), None);
36 }
37 token
38}
39
40pub fn lines_before(offset: TextSize, code: &str) -> u32 {
42 let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]);
43
44 let mut newlines = 0u32;
45 while let Some(c) = cursor.bump_back() {
46 match c {
47 '\n' => {
48 cursor.eat_char_back('\r');
49 newlines += 1;
50 }
51 '\r' => {
52 newlines += 1;
53 }
54 c if is_python_whitespace(c) => {
55 continue;
56 }
57 _ => {
58 break;
59 }
60 }
61 }
62
63 newlines
64}
65
66pub fn lines_after(offset: TextSize, code: &str) -> u32 {
68 let mut cursor = Cursor::new(&code[offset.to_usize()..]);
69
70 let mut newlines = 0u32;
71 while let Some(c) = cursor.bump() {
72 match c {
73 '\n' => {
74 newlines += 1;
75 }
76 '\r' => {
77 cursor.eat_char('\n');
78 newlines += 1;
79 }
80 c if is_python_whitespace(c) => {
81 continue;
82 }
83 _ => {
84 break;
85 }
86 }
87 }
88
89 newlines
90}
91
92pub fn lines_after_ignoring_trivia(offset: TextSize, code: &str) -> u32 {
95 let mut newlines = 0u32;
96 for token in SimpleTokenizer::starts_at(offset, code) {
97 match token.kind() {
98 SimpleTokenKind::Newline => {
99 newlines += 1;
100 }
101 SimpleTokenKind::Whitespace => {}
102 SimpleTokenKind::Comment => {
104 newlines = 0;
105 }
106 _ => {
108 break;
109 }
110 }
111 }
112 newlines
113}
114
115#[expect(clippy::cast_possible_truncation)]
118pub fn lines_after_ignoring_end_of_line_trivia(offset: TextSize, code: &str) -> u32 {
119 SimpleTokenizer::starts_at(offset, code)
121 .skip_while(|token| token.kind != SimpleTokenKind::Newline && token.kind.is_trivia())
122 .take_while(|token| {
123 token.kind == SimpleTokenKind::Newline || token.kind == SimpleTokenKind::Whitespace
124 })
125 .filter(|token| token.kind == SimpleTokenKind::Newline)
126 .count() as u32
127}
128
129fn is_identifier_start(c: char) -> bool {
130 if c.is_ascii() {
131 c.is_ascii_alphabetic() || c == '_'
132 } else {
133 is_xid_start(c)
134 }
135}
136
137fn is_identifier_continuation(c: char) -> bool {
140 if c.is_ascii() {
143 matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
144 } else {
145 is_xid_continue(c)
146 }
147}
148
149fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
150 match source {
151 "and" => SimpleTokenKind::And,
152 "as" => SimpleTokenKind::As,
153 "assert" => SimpleTokenKind::Assert,
154 "async" => SimpleTokenKind::Async,
155 "await" => SimpleTokenKind::Await,
156 "break" => SimpleTokenKind::Break,
157 "class" => SimpleTokenKind::Class,
158 "continue" => SimpleTokenKind::Continue,
159 "def" => SimpleTokenKind::Def,
160 "del" => SimpleTokenKind::Del,
161 "elif" => SimpleTokenKind::Elif,
162 "else" => SimpleTokenKind::Else,
163 "except" => SimpleTokenKind::Except,
164 "finally" => SimpleTokenKind::Finally,
165 "for" => SimpleTokenKind::For,
166 "from" => SimpleTokenKind::From,
167 "global" => SimpleTokenKind::Global,
168 "if" => SimpleTokenKind::If,
169 "import" => SimpleTokenKind::Import,
170 "in" => SimpleTokenKind::In,
171 "is" => SimpleTokenKind::Is,
172 "lazy" => SimpleTokenKind::Lazy, "lambda" => SimpleTokenKind::Lambda,
174 "nonlocal" => SimpleTokenKind::Nonlocal,
175 "not" => SimpleTokenKind::Not,
176 "or" => SimpleTokenKind::Or,
177 "pass" => SimpleTokenKind::Pass,
178 "raise" => SimpleTokenKind::Raise,
179 "return" => SimpleTokenKind::Return,
180 "try" => SimpleTokenKind::Try,
181 "while" => SimpleTokenKind::While,
182 "match" => SimpleTokenKind::Match, "type" => SimpleTokenKind::Type, "case" => SimpleTokenKind::Case,
185 "with" => SimpleTokenKind::With,
186 "yield" => SimpleTokenKind::Yield,
187 _ => SimpleTokenKind::Name, }
189}
190
/// A token produced by [`SimpleTokenizer`] or [`BackwardsTokenizer`]: its kind together with
/// the range of source text it covers.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    /// The classification of this token.
    pub kind: SimpleTokenKind,
    /// The range of this token, expressed as offsets into the tokenized source text.
    pub range: TextRange,
}
196
impl SimpleToken {
    /// Returns the kind of this token.
    pub const fn kind(&self) -> SimpleTokenKind {
        self.kind
    }
}
202
impl Ranged for SimpleToken {
    /// Returns the source range covered by this token.
    fn range(&self) -> TextRange {
        self.range
    }
}
208
/// The kinds of token that [`SimpleTokenizer`] and [`BackwardsTokenizer`] can produce.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
    /// A `#` comment, up to but not including the line terminator.
    Comment,

    /// A run of spaces, tabs, and form feeds.
    Whitespace,

    /// Marks the end of the tokenized range. The `Iterator` implementations of the tokenizers
    /// translate this into `None` instead of yielding it.
    EndOfFile,

    /// `\`
    Continuation,

    /// `\n`, `\r`, or `\r\n`
    Newline,

    /// `(`
    LParen,

    /// `)`
    RParen,

    /// `{`
    LBrace,

    /// `}`
    RBrace,

    /// `[`
    LBracket,

    /// `]`
    RBracket,

    /// `,`
    Comma,

    /// `:`
    Colon,

    /// `;`
    Semi,

    /// `/`
    Slash,

    /// `*`
    Star,

    /// `.`
    Dot,

    /// `+`
    Plus,

    /// `-`
    Minus,

    /// `=`
    Equals,

    /// `>`
    Greater,

    /// `<`
    Less,

    /// `%`
    Percent,

    /// `&`
    Ampersand,

    /// `^`
    Circumflex,

    /// `|`
    Vbar,

    /// `@`
    At,

    /// `~`
    Tilde,

    /// `==`
    EqEqual,

    /// `!=`
    NotEqual,

    /// `<=`
    LessEqual,

    /// `>=`
    GreaterEqual,

    /// `<<`
    LeftShift,

    /// `>>`
    RightShift,

    /// `**`
    DoubleStar,

    /// `**=`
    DoubleStarEqual,

    /// `+=`
    PlusEqual,

    /// `-=`
    MinusEqual,

    /// `*=`
    StarEqual,

    /// `/=`
    SlashEqual,

    /// `%=`
    PercentEqual,

    /// `&=`
    AmperEqual,

    /// `|=`
    VbarEqual,

    /// `^=`
    CircumflexEqual,

    /// `<<=`
    LeftShiftEqual,

    /// `>>=`
    RightShiftEqual,

    /// `//`
    DoubleSlash,

    /// `//=`
    DoubleSlashEqual,

    /// `:=`
    ColonEqual,

    /// `...`
    Ellipsis,

    /// `@=`
    AtEqual,

    /// `->`
    RArrow,

    /// `and`
    And,

    /// `as`
    As,

    /// `assert`
    Assert,

    /// `async`
    Async,

    /// `await`
    Await,

    /// `break`
    Break,

    /// `class`
    Class,

    /// `continue`
    Continue,

    /// `def`
    Def,

    /// `del`
    Del,

    /// `elif`
    Elif,

    /// `else`
    Else,

    /// `except`
    Except,

    /// `finally`
    Finally,

    /// `for`
    For,

    /// `from`
    From,

    /// `global`
    Global,

    /// `if`
    If,

    /// `import`
    Import,

    /// `in`
    In,

    /// `is`
    Is,

    /// `lambda`
    Lambda,

    /// `nonlocal`
    Nonlocal,

    /// `not`
    Not,

    /// `or`
    Or,

    /// `pass`
    Pass,

    /// `raise`
    Raise,

    /// `return`
    Return,

    /// `try`
    Try,

    /// `while`
    While,

    /// `lazy`
    Lazy,

    /// `match`
    Match,

    /// `type`
    Type,

    /// `case`
    Case,

    /// `with`
    With,

    /// `yield`
    Yield,

    /// An identifier that is not one of the keywords above.
    Name,

    /// Input the tokenizer could not classify (for example the start of a string literal or
    /// an unknown character).
    Other,

    /// A single token covering all remaining text, emitted once after the tokenizer has given
    /// up classifying the input.
    Bogus,
}
484
485impl SimpleTokenKind {
486 pub const fn is_trivia(self) -> bool {
487 matches!(
488 self,
489 SimpleTokenKind::Whitespace
490 | SimpleTokenKind::Newline
491 | SimpleTokenKind::Comment
492 | SimpleTokenKind::Continuation
493 )
494 }
495
496 pub const fn is_comment(self) -> bool {
497 matches!(self, SimpleTokenKind::Comment)
498 }
499}
500
/// Forward tokenizer over a range of `source` that yields [`SimpleToken`]s.
///
/// When it reaches input it cannot classify, it emits one [`SimpleTokenKind::Other`] token.
/// If any text remains after that, it emits a single [`SimpleTokenKind::Bogus`] token that
/// extends to the end of `source`, and then ends.
pub struct SimpleTokenizer<'a> {
    /// Offset into `source` at which the next token starts.
    offset: TextSize,
    /// Set once a token could not be classified; the next call emits the final bogus token.
    bogus: bool,
    /// The complete source text; token ranges are offsets into it.
    source: &'a str,
    /// Cursor over the part of the range that has not been tokenized yet.
    cursor: Cursor<'a>,
}
514
515impl<'a> SimpleTokenizer<'a> {
516 pub fn new(source: &'a str, range: TextRange) -> Self {
517 Self {
518 offset: range.start(),
519 bogus: false,
520 source,
521 cursor: Cursor::new(&source[range]),
522 }
523 }
524
525 pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
526 let range = TextRange::new(offset, source.text_len());
527 Self::new(source, range)
528 }
529
530 fn next_token(&mut self) -> SimpleToken {
531 self.cursor.start_token();
532
533 let Some(first) = self.cursor.bump() else {
534 return SimpleToken {
535 kind: SimpleTokenKind::EndOfFile,
536 range: TextRange::empty(self.offset),
537 };
538 };
539
540 if self.bogus {
541 let token = SimpleToken {
543 kind: SimpleTokenKind::Bogus,
544 range: TextRange::new(self.offset, self.source.text_len()),
545 };
546
547 self.cursor = Cursor::new("");
549 self.offset = self.source.text_len();
550 return token;
551 }
552
553 let kind = self.next_token_inner(first);
554
555 let token_len = self.cursor.token_len();
556
557 let token = SimpleToken {
558 kind,
559 range: TextRange::at(self.offset, token_len),
560 };
561
562 self.offset += token_len;
563
564 token
565 }
566
567 fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
568 match first {
569 c if is_identifier_start(c) => {
571 self.cursor.eat_while(is_identifier_continuation);
572 let token_len = self.cursor.token_len();
573
574 let range = TextRange::at(self.offset, token_len);
575 let kind = to_keyword_or_other(&self.source[range]);
576
577 if kind == SimpleTokenKind::Name
580 && matches!(self.cursor.first(), '"' | '\'')
581 && matches!(
582 &self.source[range],
583 "B" | "BR"
584 | "Br"
585 | "F"
586 | "FR"
587 | "Fr"
588 | "R"
589 | "RB"
590 | "RF"
591 | "Rb"
592 | "Rf"
593 | "U"
594 | "b"
595 | "bR"
596 | "br"
597 | "f"
598 | "fR"
599 | "fr"
600 | "r"
601 | "rB"
602 | "rF"
603 | "rb"
604 | "rf"
605 | "u"
606 | "T"
607 | "TR"
608 | "Tr"
609 | "RT"
610 | "Rt"
611 | "t"
612 | "tR"
613 | "tr"
614 | "rT"
615 | "rt"
616 )
617 {
618 self.bogus = true;
619 SimpleTokenKind::Other
620 } else {
621 kind
622 }
623 }
624
625 ' ' | '\t' | '\x0C' => {
628 self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
629 SimpleTokenKind::Whitespace
630 }
631
632 '\n' => SimpleTokenKind::Newline,
633
634 '\r' => {
635 self.cursor.eat_char('\n');
636 SimpleTokenKind::Newline
637 }
638
639 '#' => {
640 self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
641 SimpleTokenKind::Comment
642 }
643
644 '\\' => SimpleTokenKind::Continuation,
645
646 '=' => {
648 if self.cursor.eat_char('=') {
649 SimpleTokenKind::EqEqual
650 } else {
651 SimpleTokenKind::Equals
652 }
653 }
654 '+' => {
655 if self.cursor.eat_char('=') {
656 SimpleTokenKind::PlusEqual
657 } else {
658 SimpleTokenKind::Plus
659 }
660 }
661 '*' => {
662 if self.cursor.eat_char('=') {
663 SimpleTokenKind::StarEqual
664 } else if self.cursor.eat_char('*') {
665 if self.cursor.eat_char('=') {
666 SimpleTokenKind::DoubleStarEqual
667 } else {
668 SimpleTokenKind::DoubleStar
669 }
670 } else {
671 SimpleTokenKind::Star
672 }
673 }
674 '/' => {
675 if self.cursor.eat_char('=') {
676 SimpleTokenKind::SlashEqual
677 } else if self.cursor.eat_char('/') {
678 if self.cursor.eat_char('=') {
679 SimpleTokenKind::DoubleSlashEqual
680 } else {
681 SimpleTokenKind::DoubleSlash
682 }
683 } else {
684 SimpleTokenKind::Slash
685 }
686 }
687 '%' => {
688 if self.cursor.eat_char('=') {
689 SimpleTokenKind::PercentEqual
690 } else {
691 SimpleTokenKind::Percent
692 }
693 }
694 '|' => {
695 if self.cursor.eat_char('=') {
696 SimpleTokenKind::VbarEqual
697 } else {
698 SimpleTokenKind::Vbar
699 }
700 }
701 '^' => {
702 if self.cursor.eat_char('=') {
703 SimpleTokenKind::CircumflexEqual
704 } else {
705 SimpleTokenKind::Circumflex
706 }
707 }
708 '&' => {
709 if self.cursor.eat_char('=') {
710 SimpleTokenKind::AmperEqual
711 } else {
712 SimpleTokenKind::Ampersand
713 }
714 }
715 '-' => {
716 if self.cursor.eat_char('=') {
717 SimpleTokenKind::MinusEqual
718 } else if self.cursor.eat_char('>') {
719 SimpleTokenKind::RArrow
720 } else {
721 SimpleTokenKind::Minus
722 }
723 }
724 '@' => {
725 if self.cursor.eat_char('=') {
726 SimpleTokenKind::AtEqual
727 } else {
728 SimpleTokenKind::At
729 }
730 }
731 '!' => {
732 if self.cursor.eat_char('=') {
733 SimpleTokenKind::NotEqual
734 } else {
735 self.bogus = true;
736 SimpleTokenKind::Other
737 }
738 }
739 '~' => SimpleTokenKind::Tilde,
740 ':' => {
741 if self.cursor.eat_char('=') {
742 SimpleTokenKind::ColonEqual
743 } else {
744 SimpleTokenKind::Colon
745 }
746 }
747 ';' => SimpleTokenKind::Semi,
748 '<' => {
749 if self.cursor.eat_char('<') {
750 if self.cursor.eat_char('=') {
751 SimpleTokenKind::LeftShiftEqual
752 } else {
753 SimpleTokenKind::LeftShift
754 }
755 } else if self.cursor.eat_char('=') {
756 SimpleTokenKind::LessEqual
757 } else {
758 SimpleTokenKind::Less
759 }
760 }
761 '>' => {
762 if self.cursor.eat_char('>') {
763 if self.cursor.eat_char('=') {
764 SimpleTokenKind::RightShiftEqual
765 } else {
766 SimpleTokenKind::RightShift
767 }
768 } else if self.cursor.eat_char('=') {
769 SimpleTokenKind::GreaterEqual
770 } else {
771 SimpleTokenKind::Greater
772 }
773 }
774 ',' => SimpleTokenKind::Comma,
775 '.' => {
776 if self.cursor.first() == '.' && self.cursor.second() == '.' {
777 self.cursor.bump();
778 self.cursor.bump();
779 SimpleTokenKind::Ellipsis
780 } else {
781 SimpleTokenKind::Dot
782 }
783 }
784
785 '(' => SimpleTokenKind::LParen,
787 ')' => SimpleTokenKind::RParen,
788 '[' => SimpleTokenKind::LBracket,
789 ']' => SimpleTokenKind::RBracket,
790 '{' => SimpleTokenKind::LBrace,
791 '}' => SimpleTokenKind::RBrace,
792
793 _ => {
794 self.bogus = true;
795 SimpleTokenKind::Other
796 }
797 }
798 }
799
    /// Consumes the tokenizer and returns an iterator over its tokens that leaves out every
    /// token whose kind reports `is_trivia()` (whitespace, newlines, comments, continuations).
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
803}
804
805impl Iterator for SimpleTokenizer<'_> {
806 type Item = SimpleToken;
807
808 fn next(&mut self) -> Option<Self::Item> {
809 let token = self.next_token();
810
811 if token.kind == SimpleTokenKind::EndOfFile {
812 None
813 } else {
814 Some(token)
815 }
816 }
817}
818
/// Tokenizer that lexes a range of `source` from its end towards its start, yielding
/// [`SimpleToken`]s.
///
/// Comments are not detected by scanning characters. They are taken from the comment ranges
/// passed to the constructor.
pub struct BackwardsTokenizer<'a> {
    /// Start offset of the tokenized range within `source`.
    offset: TextSize,
    /// End offset of the text that has not been tokenized yet; updated at the start of every
    /// `next_token` call.
    back_offset: TextSize,
    /// Comment ranges that have not been passed yet.
    /// NOTE(review): `new` uses `partition_point` on the caller's slice, so it is presumably
    /// expected to be sorted by start offset — confirm against callers.
    comment_ranges: &'a [TextRange],
    /// Set once a token could not be classified; the next call emits the final bogus token.
    bogus: bool,
    /// The complete source text; token ranges are offsets into it.
    source: &'a str,
    /// Cursor over the part of the range that has not been tokenized yet.
    cursor: Cursor<'a>,
}
835
836impl<'a> BackwardsTokenizer<'a> {
837 pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
838 Self {
839 offset: range.start(),
840 back_offset: range.end(),
841 comment_ranges: &comment_range
843 [..comment_range.partition_point(|comment| comment.start() <= range.end())],
844 bogus: false,
845 source,
846 cursor: Cursor::new(&source[range]),
847 }
848 }
849
    /// Creates a tokenizer that lexes `source` backwards from `offset` to the start of the
    /// text. See [`BackwardsTokenizer::new`] for the meaning of `comment_range`.
    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
        Self::new(source, TextRange::up_to(offset), comment_range)
    }
853
    /// Consumes the tokenizer and returns an iterator over its tokens that leaves out every
    /// token whose kind reports `is_trivia()` (whitespace, newlines, comments, continuations).
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
857
858 pub fn next_token(&mut self) -> SimpleToken {
859 self.cursor.start_token();
860 self.back_offset = self.cursor.text_len() + self.offset;
861
862 let Some(last) = self.cursor.bump_back() else {
863 return SimpleToken {
864 kind: SimpleTokenKind::EndOfFile,
865 range: TextRange::empty(self.back_offset),
866 };
867 };
868
869 if self.bogus {
870 let token = SimpleToken {
871 kind: SimpleTokenKind::Bogus,
872 range: TextRange::up_to(self.back_offset),
873 };
874
875 self.cursor = Cursor::new("");
877 self.back_offset = TextSize::new(0);
878 return token;
879 }
880
881 if let Some(comment) = self
882 .comment_ranges
883 .last()
884 .filter(|comment| comment.contains_inclusive(self.back_offset))
885 {
886 self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];
887
888 self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
890 debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
891 return SimpleToken {
892 kind: SimpleTokenKind::Comment,
893 range: comment.range(),
894 };
895 }
896
897 let kind = match last {
898 ' ' | '\t' | '\x0C' => {
903 self.cursor
904 .eat_back_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
905 SimpleTokenKind::Whitespace
906 }
907
908 '\r' => SimpleTokenKind::Newline,
909 '\n' => {
910 self.cursor.eat_char_back('\r');
911 SimpleTokenKind::Newline
912 }
913 _ => self.next_token_inner(last),
914 };
915
916 let token_len = self.cursor.token_len();
917 let start = self.back_offset - token_len;
918 SimpleToken {
919 kind,
920 range: TextRange::at(start, token_len),
921 }
922 }
923
924 fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
926 match last {
927 c if is_identifier_continuation(c) => {
929 let savepoint = self.cursor.clone();
933 self.cursor.eat_back_while(is_identifier_continuation);
934
935 let token_len = self.cursor.token_len();
936 let range = TextRange::at(self.back_offset - token_len, token_len);
937
938 if self.source[range]
939 .chars()
940 .next()
941 .is_some_and(is_identifier_start)
942 {
943 to_keyword_or_other(&self.source[range])
944 } else {
945 self.cursor = savepoint;
946 self.bogus = true;
947 SimpleTokenKind::Other
948 }
949 }
950
951 '\\' => SimpleTokenKind::Continuation,
955 ':' => SimpleTokenKind::Colon,
956 '~' => SimpleTokenKind::Tilde,
957 '%' => SimpleTokenKind::Percent,
958 '|' => SimpleTokenKind::Vbar,
959 ',' => SimpleTokenKind::Comma,
960 ';' => SimpleTokenKind::Semi,
961 '(' => SimpleTokenKind::LParen,
962 ')' => SimpleTokenKind::RParen,
963 '[' => SimpleTokenKind::LBracket,
964 ']' => SimpleTokenKind::RBracket,
965 '{' => SimpleTokenKind::LBrace,
966 '}' => SimpleTokenKind::RBrace,
967 '&' => SimpleTokenKind::Ampersand,
968 '^' => SimpleTokenKind::Circumflex,
969 '+' => SimpleTokenKind::Plus,
970 '-' => SimpleTokenKind::Minus,
971
972 '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
976 let mut cursor = self.cursor.clone();
983 cursor.eat_back_while(|c| {
984 matches!(
985 c,
986 ':' | '~'
987 | '%'
988 | '|'
989 | '&'
990 | '^'
991 | '+'
992 | '-'
993 | '='
994 | '*'
995 | '/'
996 | '@'
997 | '!'
998 | '<'
999 | '>'
1000 | '.'
1001 )
1002 });
1003
1004 let token_len = cursor.token_len();
1005 let range = TextRange::at(self.back_offset - token_len, token_len);
1006
1007 let forward_lexer = SimpleTokenizer::new(self.source, range);
1008 if let Some(token) = forward_lexer.last() {
1009 for _ in self.source[token.range].chars().rev().skip(1) {
1013 self.cursor.bump_back().unwrap();
1014 }
1015 token.kind()
1016 } else {
1017 self.bogus = true;
1018 SimpleTokenKind::Other
1019 }
1020 }
1021 _ => {
1022 self.bogus = true;
1023 SimpleTokenKind::Other
1024 }
1025 }
1026 }
1027}
1028
1029impl Iterator for BackwardsTokenizer<'_> {
1030 type Item = SimpleToken;
1031
1032 fn next(&mut self) -> Option<Self::Item> {
1033 let token = self.next_token();
1034
1035 if token.kind == SimpleTokenKind::EndOfFile {
1036 None
1037 } else {
1038 Some(token)
1039 }
1040 }
1041}