//! `tokens.rs` — the [`Tokens`] collection and its offset-based lookup helpers.
use std::{iter::FusedIterator, ops::Deref};

use super::{Token, TokenKind};
use ruff_python_trivia::CommentRanges;
use ruff_text_size::{Ranged as _, TextRange, TextSize};
6
/// Tokens represents a vector of lexed [`Token`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "get-size", derive(get_size2::GetSize))]
pub struct Tokens {
    // The underlying tokens in source order. Lookup methods rely on this
    // ordering (they use `partition_point` / binary search over start offsets).
    raw: Vec<Token>,
}
13
14impl Tokens {
15    pub fn new(tokens: Vec<Token>) -> Tokens {
16        Tokens { raw: tokens }
17    }
18
19    /// Returns an iterator over all the tokens that provides context.
20    pub fn iter_with_context(&self) -> TokenIterWithContext<'_> {
21        TokenIterWithContext::new(&self.raw)
22    }
23
24    /// Performs a binary search to find the index of the **first** token that starts at the given `offset`.
25    ///
26    /// Unlike `binary_search_by_key`, this method ensures that if multiple tokens start at the same offset,
27    /// it returns the index of the first one. Multiple tokens can start at the same offset in cases where
28    /// zero-length tokens are involved (like `Dedent` or `Newline` at the end of the file).
29    pub fn binary_search_by_start(&self, offset: TextSize) -> Result<usize, usize> {
30        let partition_point = self.partition_point(|token| token.start() < offset);
31
32        let after = &self[partition_point..];
33
34        if after.first().is_some_and(|first| first.start() == offset) {
35            Ok(partition_point)
36        } else {
37            Err(partition_point)
38        }
39    }
40
41    /// Returns a slice of [`Token`] that are within the given `range`.
42    ///
43    /// The start and end offset of the given range should be either:
44    /// 1. Token boundary
45    /// 2. Gap between the tokens
46    ///
47    /// For example, considering the following tokens and their corresponding range:
48    ///
49    /// | Token               | Range     |
50    /// |---------------------|-----------|
51    /// | `Def`               | `0..3`    |
52    /// | `Name`              | `4..7`    |
53    /// | `Lpar`              | `7..8`    |
54    /// | `Rpar`              | `8..9`    |
55    /// | `Colon`             | `9..10`   |
56    /// | `Newline`           | `10..11`  |
57    /// | `Comment`           | `15..24`  |
58    /// | `NonLogicalNewline` | `24..25`  |
59    /// | `Indent`            | `25..29`  |
60    /// | `Pass`              | `29..33`  |
61    ///
62    /// Here, for (1) a token boundary is considered either the start or end offset of any of the
63    /// above tokens. For (2), the gap would be any offset between the `Newline` and `Comment`
64    /// token which are 12, 13, and 14.
65    ///
66    /// Examples:
67    /// 1) `4..10` would give `Name`, `Lpar`, `Rpar`, `Colon`
68    /// 2) `11..25` would give `Comment`, `NonLogicalNewline`
69    /// 3) `12..25` would give same as (2) and offset 12 is in the "gap"
70    /// 4) `9..12` would give `Colon`, `Newline` and offset 12 is in the "gap"
71    /// 5) `18..27` would panic because both the start and end offset is within a token
72    ///
73    /// ## Note
74    ///
75    /// The returned slice can contain the [`TokenKind::Unknown`] token if there was a lexical
76    /// error encountered within the given range.
77    ///
78    /// # Panics
79    ///
80    /// If either the start or end offset of the given range is within a token range.
81    pub fn in_range(&self, range: TextRange) -> &[Token] {
82        let tokens_after_start = self.after(range.start());
83
84        Self::before_impl(tokens_after_start, range.end())
85    }
86
87    /// Searches the token(s) at `offset`.
88    ///
89    /// Returns [`TokenAt::Between`] if `offset` points directly inbetween two tokens
90    /// (the left token ends at `offset` and the right token starts at `offset`).
91    pub fn at_offset(&self, offset: TextSize) -> TokenAt {
92        match self.binary_search_by_start(offset) {
93            // The token at `index` starts exactly at `offset.
94            // ```python
95            // object.attribute
96            //        ^ OFFSET
97            // ```
98            Ok(index) => {
99                let token = self[index];
100                // `token` starts exactly at `offset`. Test if the offset is right between
101                // `token` and the previous token (if there's any)
102                if let Some(previous) = index.checked_sub(1).map(|idx| self[idx])
103                    && previous.end() == offset
104                {
105                    return TokenAt::Between(previous, token);
106                }
107
108                TokenAt::Single(token)
109            }
110
111            // No token found that starts exactly at the given offset. But it's possible that
112            // the token starting before `offset` fully encloses `offset` (it's end range ends after `offset`).
113            // ```python
114            // object.attribute
115            //   ^ OFFSET
116            // # or
117            // if True:
118            //     print("test")
119            //  ^ OFFSET
120            // ```
121            Err(index) => {
122                if let Some(previous) = index.checked_sub(1).map(|idx| self[idx])
123                    && previous.range().contains_inclusive(offset)
124                {
125                    return TokenAt::Single(previous);
126                }
127
128                TokenAt::None
129            }
130        }
131    }
132
133    /// Returns a slice of tokens before the given [`TextSize`] offset.
134    ///
135    /// If the given offset is between two tokens, the returned slice will end just before the
136    /// following token. In other words, if the offset is between the end of previous token and
137    /// start of next token, the returned slice will end just before the next token.
138    ///
139    /// # Panics
140    ///
141    /// If the given offset is inside a token range at any point
142    /// other than the start of the range.
143    pub fn before(&self, offset: TextSize) -> &[Token] {
144        Self::before_impl(&self.raw, offset)
145    }
146
147    fn before_impl(tokens: &[Token], offset: TextSize) -> &[Token] {
148        let partition_point = tokens.partition_point(|token| token.start() < offset);
149        let before = &tokens[..partition_point];
150
151        if let Some(last) = before.last() {
152            // If it's equal to the end offset, then it's at a token boundary which is
153            // valid. If it's greater than the end offset, then it's in the gap between
154            // the tokens which is valid as well.
155            assert!(
156                offset >= last.end(),
157                "Offset {offset:?} is inside token `{last:?}`",
158            );
159        }
160        before
161    }
162
163    /// Returns a slice of tokens after the given [`TextSize`] offset.
164    ///
165    /// If the given offset is between two tokens, the returned slice will start from the following
166    /// token. In other words, if the offset is between the end of previous token and start of next
167    /// token, the returned slice will start from the next token.
168    ///
169    /// # Panics
170    ///
171    /// If the given offset is inside a token range at any point
172    /// other than the start of the range.
173    pub fn after(&self, offset: TextSize) -> &[Token] {
174        let partition_point = self.partition_point(|token| token.end() <= offset);
175        let after = &self[partition_point..];
176
177        if let Some(first) = after.first() {
178            // valid. If it's greater than the end offset, then it's in the gap between
179            // the tokens which is valid as well.
180            assert!(
181                offset <= first.start(),
182                "Offset {offset:?} is inside token `{first:?}`",
183            );
184        }
185
186        after
187    }
188
189    /// Returns a pair of token slices from both before and after the given [`TextSize`] offset.
190    ///
191    /// If the given offset is between two tokens, the "before" slice will end just before the
192    /// following token. In other words, if the offset is between the end of previous token and
193    /// start of next token, the "before" slice will end just before the next token. The "after"
194    /// slice will contain the rest of the tokens.
195    ///
196    /// Note that the contents of the "after" slice may differ from the results of calling `after()`
197    /// directly, particularly when the given offset occurs on zero-width tokens like `Dedent`.
198    ///
199    /// # Panics
200    ///
201    /// If the given offset is inside a token range at any point
202    /// other than the start of the range.
203    pub fn split_at(&self, offset: TextSize) -> (&[Token], &[Token]) {
204        let partition_point = self.partition_point(|token| token.start() < offset);
205        let (before, after) = &self.raw.split_at(partition_point);
206
207        if let Some(last) = before.last() {
208            assert!(
209                offset >= last.end(),
210                "Offset {offset:?} is inside token `{last:?}`"
211            );
212        }
213        (before, after)
214    }
215}
216
217impl<'a> IntoIterator for &'a Tokens {
218    type Item = &'a Token;
219    type IntoIter = std::slice::Iter<'a, Token>;
220
221    fn into_iter(self) -> Self::IntoIter {
222        self.iter()
223    }
224}
225
226impl Deref for Tokens {
227    type Target = [Token];
228
229    fn deref(&self) -> &Self::Target {
230        &self.raw
231    }
232}
233
/// A token that encloses a given offset or ends exactly at it.
#[derive(Debug, Clone)]
pub enum TokenAt {
    /// There's no token at the given offset
    None,

    /// There's a single token at the given offset.
    Single(Token),

    /// The offset falls exactly between two tokens. E.g. `CURSOR` in `call<CURSOR>(arguments)` is
    /// positioned exactly between the `call` and `(` tokens.
    Between(Token, Token),
}
247
248impl Iterator for TokenAt {
249    type Item = Token;
250
251    fn next(&mut self) -> Option<Self::Item> {
252        match *self {
253            TokenAt::None => None,
254            TokenAt::Single(token) => {
255                *self = TokenAt::None;
256                Some(token)
257            }
258            TokenAt::Between(first, second) => {
259                *self = TokenAt::Single(second);
260                Some(first)
261            }
262        }
263    }
264}
265
266impl FusedIterator for TokenAt {}
267
268impl From<&Tokens> for CommentRanges {
269    fn from(tokens: &Tokens) -> Self {
270        let mut ranges = vec![];
271        for token in tokens {
272            if token.kind() == TokenKind::Comment {
273                ranges.push(token.range());
274            }
275        }
276        CommentRanges::new(ranges)
277    }
278}
279
/// An iterator over the [`Token`]s with context.
///
/// This struct is created by the [`iter_with_context`] method on [`Tokens`]. Refer to its
/// documentation for more details.
///
/// [`iter_with_context`]: Tokens::iter_with_context
#[derive(Debug, Clone)]
pub struct TokenIterWithContext<'a> {
    // Underlying token iterator being wrapped.
    inner: std::slice::Iter<'a, Token>,
    // Current parenthesis/bracket/brace nesting depth; updated as tokens are yielded.
    nesting: u32,
}
291
292impl<'a> TokenIterWithContext<'a> {
293    fn new(tokens: &'a [Token]) -> TokenIterWithContext<'a> {
294        TokenIterWithContext {
295            inner: tokens.iter(),
296            nesting: 0,
297        }
298    }
299
300    /// Return the nesting level the iterator is currently in.
301    pub const fn nesting(&self) -> u32 {
302        self.nesting
303    }
304
305    /// Returns `true` if the iterator is within a parenthesized context.
306    pub const fn in_parenthesized_context(&self) -> bool {
307        self.nesting > 0
308    }
309
310    /// Returns the next [`Token`] in the iterator without consuming it.
311    pub fn peek(&self) -> Option<&'a Token> {
312        self.clone().next()
313    }
314}
315
316impl<'a> Iterator for TokenIterWithContext<'a> {
317    type Item = &'a Token;
318
319    fn next(&mut self) -> Option<Self::Item> {
320        let token = self.inner.next()?;
321
322        match token.kind() {
323            TokenKind::Lpar | TokenKind::Lbrace | TokenKind::Lsqb => self.nesting += 1,
324            TokenKind::Rpar | TokenKind::Rbrace | TokenKind::Rsqb => {
325                self.nesting = self.nesting.saturating_sub(1);
326            }
327            // This mimics the behavior of re-lexing which reduces the nesting level on the lexer.
328            // We don't need to reduce it by 1 because unlike the lexer we see the final token
329            // after recovering from every unclosed parenthesis.
330            TokenKind::Newline if self.nesting > 0 => {
331                self.nesting = 0;
332            }
333            _ => {}
334        }
335
336        Some(token)
337    }
338}
339
340impl FusedIterator for TokenIterWithContext<'_> {}
341
#[cfg(test)]
mod tests {
    use std::ops::Range;

    use ruff_text_size::TextSize;

    use crate::token::{Token, TokenFlags, TokenKind};

    use super::*;

    /// Test case containing a "gap" between two tokens.
    ///
    /// Code: <https://play.ruff.rs/a3658340-6df8-42c5-be80-178744bf1193>
    const TEST_CASE_WITH_GAP: [(TokenKind, Range<u32>); 10] = [
        (TokenKind::Def, 0..3),
        (TokenKind::Name, 4..7),
        (TokenKind::Lpar, 7..8),
        (TokenKind::Rpar, 8..9),
        (TokenKind::Colon, 9..10),
        (TokenKind::Newline, 10..11),
        // Gap               ||..||
        (TokenKind::Comment, 15..24),
        (TokenKind::NonLogicalNewline, 24..25),
        (TokenKind::Indent, 25..29),
        (TokenKind::Pass, 29..33),
        // No newline at the end to keep the token set full of unique tokens
    ];

    /// Helper function to create [`Tokens`] from an iterator of (kind, range).
    fn new_tokens(tokens: impl Iterator<Item = (TokenKind, Range<u32>)>) -> Tokens {
        Tokens::new(
            tokens
                .map(|(kind, range)| {
                    Token::new(
                        kind,
                        TextRange::new(TextSize::new(range.start), TextSize::new(range.end)),
                        TokenFlags::empty(),
                    )
                })
                .collect(),
        )
    }

    // Offset 8 is the start of `Rpar`, so `after` includes `Rpar` itself.
    #[test]
    fn tokens_after_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(8));
        assert_eq!(after.len(), 7);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Rpar);
    }

    // Offset 11 is the end of `Newline` (before the gap); the slice starts at `Comment`.
    #[test]
    fn tokens_after_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(11));
        assert_eq!(after.len(), 4);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
    }

    // Offset 13 is inside the 11..15 gap; the slice starts at the following `Comment`.
    #[test]
    fn tokens_after_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(13));
        assert_eq!(after.len(), 4);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
    }

    #[test]
    fn tokens_after_offset_at_last_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(33));
        assert_eq!(after.len(), 0);
    }

    // Offset 5 falls strictly inside `Name` (4..7), which must panic.
    #[test]
    #[should_panic(expected = "Offset 5 is inside token `Name 4..7`")]
    fn tokens_after_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.after(TextSize::new(5));
    }

    #[test]
    fn tokens_before_offset_at_first_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(0));
        assert_eq!(before.len(), 0);
    }

    // Offset 3 is the end of `Def` and the start of the 3..4 gap.
    #[test]
    fn tokens_before_offset_after_first_token_gap() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(3));
        assert_eq!(before.len(), 1);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Def);
    }

    #[test]
    fn tokens_before_offset_at_second_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(4));
        assert_eq!(before.len(), 1);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Def);
    }

    #[test]
    fn tokens_before_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(8));
        assert_eq!(before.len(), 3);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Lpar);
    }

    #[test]
    fn tokens_before_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(11));
        assert_eq!(before.len(), 6);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Newline);
    }

    // Offset 13 is inside the 11..15 gap; the slice still ends at `Newline`.
    #[test]
    fn tokens_before_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(13));
        assert_eq!(before.len(), 6);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    fn tokens_before_offset_at_last_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(33));
        assert_eq!(before.len(), 10);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Pass);
    }

    #[test]
    #[should_panic(expected = "Offset 5 is inside token `Name 4..7`")]
    fn tokens_before_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.before(TextSize::new(5));
    }

    #[test]
    fn tokens_in_range_at_token_offset() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(4.into(), 10.into()));
        assert_eq!(in_range.len(), 4);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Name);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Colon);
    }

    #[test]
    fn tokens_in_range_start_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(11.into(), 29.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
    }

    #[test]
    fn tokens_in_range_end_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(8.into(), 15.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Rpar);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    fn tokens_in_range_start_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(13.into(), 29.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
    }

    #[test]
    fn tokens_in_range_end_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(9.into(), 13.into()));
        assert_eq!(in_range.len(), 2);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Colon);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    #[should_panic(expected = "Offset 5 is inside token `Name 4..7`")]
    fn tokens_in_range_start_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.in_range(TextRange::new(5.into(), 10.into()));
    }

    #[test]
    #[should_panic(expected = "Offset 6 is inside token `Name 4..7`")]
    fn tokens_in_range_end_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.in_range(TextRange::new(0.into(), 6.into()));
    }

    #[test]
    fn tokens_split_at_first_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let (before, after) = tokens.split_at(TextSize::new(0));
        assert_eq!(before.len(), 0);
        assert_eq!(after.len(), 10);
    }

    #[test]
    fn tokens_split_at_last_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let (before, after) = tokens.split_at(TextSize::new(33));
        assert_eq!(before.len(), 10);
        assert_eq!(after.len(), 0);
    }

    #[test]
    fn tokens_split_at_inside_gap() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let (before, after) = tokens.split_at(TextSize::new(13));
        assert_eq!(before.len(), 6);
        assert_eq!(after.len(), 4);
    }

    #[test]
    #[should_panic(expected = "Offset 18 is inside token `Comment 15..24`")]
    fn tokens_split_at_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.split_at(TextSize::new(18));
    }

    // Away from zero-width tokens, `split_at` agrees with `before` + `after`.
    #[test]
    fn tokens_split_at_matches_before_and_after() {
        let offset = TextSize::new(15);
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let (before, after) = tokens.split_at(offset);
        assert_eq!(before, tokens.before(offset));
        assert_eq!(after, tokens.after(offset));
    }

    // At a zero-width `Dedent`, `split_at`'s "after" slice differs from `after()`:
    // the divergence is expected, hence `should_panic`.
    #[test]
    #[should_panic(expected = "Contents of after slice different when offset at dedent")]
    fn tokens_split_at_matches_before_and_after_zero_length() {
        let offset = TextSize::new(13);
        let tokens = new_tokens(
            [
                (TokenKind::If, 0..2),
                (TokenKind::Name, 3..4),
                (TokenKind::Colon, 4..5),
                (TokenKind::Newline, 5..6),
                (TokenKind::Indent, 6..7),
                (TokenKind::Pass, 7..11),
                (TokenKind::Newline, 11..12),
                (TokenKind::NonLogicalNewline, 12..13),
                (TokenKind::Dedent, 13..13),
                (TokenKind::Name, 13..14),
                (TokenKind::Newline, 14..14),
            ]
            .into_iter(),
        );
        let (before, after) = tokens.split_at(offset);
        assert_eq!(before, tokens.before(offset));
        assert!(
            after == tokens.after(offset),
            "Contents of after slice different when offset at dedent"
        );
    }
}