1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
use ruff_python_ast::token::{Token, TokenFlags, TokenKind};
use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::Mode;
use crate::error::LexicalError;
use crate::lexer::{Lexer, LexerCheckpoint};
use crate::string::InterpolatedStringKind;
use crate::token::TokenValue;
/// Token source for the parser that skips over any trivia tokens.
///
/// It wraps a [`Lexer`] and records the tokens it is bumped past (trivia included) in a vector
/// that is handed back by [`TokenSource::finish`].
#[derive(Debug)]
pub(crate) struct TokenSource<'src> {
/// The underlying source for the tokens.
lexer: Lexer<'src>,
/// A vector containing all the tokens emitted by the lexer. This is returned when the parser
/// is finished consuming all the tokens. Note that unlike the emitted tokens, this vector
/// holds both the trivia and non-trivia tokens.
tokens: Vec<Token>,
}
impl<'src> TokenSource<'src> {
/// Create a new token source for the given lexer.
pub(crate) fn new(lexer: Lexer<'src>) -> Self {
// TODO(dhruvmanila): Use `allocate_tokens_vec`
TokenSource {
lexer,
tokens: vec![],
}
}
/// Create a new token source from the given source code which starts at the given offset.
pub(crate) fn from_source(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
let lexer = Lexer::new(source, mode, start_offset);
let mut source = TokenSource::new(lexer);
// Initialize the token source so that the current token is set correctly.
source.do_bump();
source
}
/// Returns the kind of the current token.
///
/// This forwards to the lexer's `current_kind`.
pub(crate) fn current_kind(&self) -> TokenKind {
self.lexer.current_kind()
}
/// Returns the range of the current token.
///
/// This forwards to the lexer's `current_range`.
pub(crate) fn current_range(&self) -> TextRange {
self.lexer.current_range()
}
/// Returns the flags for the current token.
///
/// This forwards to the lexer's `current_flags`.
pub(crate) fn current_flags(&self) -> TokenFlags {
self.lexer.current_flags()
}
/// Calls the underlying [`take_value`] method on the lexer. Refer to its documentation
/// for more info.
///
/// Returns the [`TokenValue`] handed back by the lexer; the token vector is not touched.
///
/// [`take_value`]: Lexer::take_value
pub(crate) fn take_value(&mut self) -> TokenValue {
self.lexer.take_value()
}
/// Calls the underlying [`re_lex_logical_token`] method on the lexer with the new lexer
/// position and updates the token vector accordingly.
///
/// [`re_lex_logical_token`]: Lexer::re_lex_logical_token
pub(crate) fn re_lex_logical_token(&mut self) {
let mut non_logical_newline = None;
#[cfg(debug_assertions)]
let last_non_trivia_end_before = {
self.tokens
.iter()
.rev()
.find(|tok| !tok.kind().is_trivia())
.map(ruff_text_size::Ranged::end)
};
for (index, token) in self.tokens.iter().enumerate().rev() {
match token.kind() {
TokenKind::NonLogicalNewline => {
non_logical_newline = Some((index, token.start()));
}
TokenKind::Comment => continue,
_ => break,
}
}
if !self
.lexer
.re_lex_logical_token(non_logical_newline.map(|(_, start)| start))
{
return;
}
let non_logical_line_index = non_logical_newline
.expect(
"`re_lex_logical_token` should only return `true` if `non_logical_line` is `Some`",
)
.0;
// Trim the already bumped logical line token (and comments coming after it) as it might now have become a logical line token
self.tokens.truncate(non_logical_line_index);
#[cfg(debug_assertions)]
{
let last_non_trivia_end_now = {
self.tokens
.iter()
.rev()
.find(|tok| !tok.kind().is_trivia())
.map(ruff_text_size::Ranged::end)
};
assert_eq!(last_non_trivia_end_before, last_non_trivia_end_now);
}
// Ensure `current` is positioned at a non-trivia token.
if self.current_kind().is_trivia() {
self.bump(self.current_kind());
}
}
/// Calls the underlying [`re_lex_string_token_in_interpolation_element`] method on the lexer
/// with the given interpolated string `kind`. Refer to its documentation for more info.
///
/// Unlike [`TokenSource::re_lex_logical_token`], this does not modify the token vector.
///
/// [`re_lex_string_token_in_interpolation_element`]: Lexer::re_lex_string_token_in_interpolation_element
pub(crate) fn re_lex_string_token_in_interpolation_element(
&mut self,
kind: InterpolatedStringKind,
) {
self.lexer
.re_lex_string_token_in_interpolation_element(kind);
}
/// Calls the underlying [`re_lex_raw_string_in_format_spec`] method on the lexer. Refer to its
/// documentation for more info.
///
/// Unlike [`TokenSource::re_lex_logical_token`], this does not modify the token vector.
///
/// [`re_lex_raw_string_in_format_spec`]: Lexer::re_lex_raw_string_in_format_spec
pub(crate) fn re_lex_raw_string_in_format_spec(&mut self) {
self.lexer.re_lex_raw_string_in_format_spec();
}
/// Looks ahead at the kind of the next non-trivia token without consuming it.
///
/// The lexer is advanced past any trivia to find that token and is then rewound to a
/// checkpoint taken beforehand; the token vector is not touched.
///
/// Use [`peek2`] to get the next two tokens.
///
/// [`peek2`]: TokenSource::peek2
pub(crate) fn peek(&mut self) -> TokenKind {
    let saved = self.lexer.checkpoint();
    let upcoming = loop {
        let kind = self.lexer.next_token();
        if !kind.is_trivia() {
            break kind;
        }
    };
    self.lexer.rewind(saved);
    upcoming
}
/// Looks ahead at the kinds of the next two non-trivia tokens without consuming them.
///
/// The lexer is rewound to a checkpoint taken before the lookahead; the token vector is not
/// touched.
///
/// Use [`peek`] to only get the next token.
///
/// [`peek`]: TokenSource::peek
pub(crate) fn peek2(&mut self) -> (TokenKind, TokenKind) {
    let saved = self.lexer.checkpoint();
    // Tuple fields are evaluated left to right, so the pair is in source order.
    let lookahead = (self.next_non_trivia_token(), self.next_non_trivia_token());
    self.lexer.rewind(saved);
    lookahead
}
/// Records the current token and advances to the next non-trivia token.
///
/// The recorded token uses the given `kind` together with the range and flags of the current
/// token.
pub(crate) fn bump(&mut self, kind: TokenKind) {
    let current = Token::new(kind, self.current_range(), self.current_flags());
    self.tokens.push(current);
    self.do_bump();
}
/// Advances the lexer until it yields a non-trivia token.
///
/// The token that was current before the call is not added to the token vector, but every
/// trivia token skipped along the way is.
fn do_bump(&mut self) {
    loop {
        let kind = self.lexer.next_token();
        if !kind.is_trivia() {
            return;
        }
        let trivia = Token::new(kind, self.current_range(), self.current_flags());
        self.tokens.push(trivia);
    }
}
/// Advances the lexer to the next non-trivia token and returns its kind.
///
/// Nothing is added to the token vector, neither the returned token nor the skipped trivia.
fn next_non_trivia_token(&mut self) -> TokenKind {
    let mut kind = self.lexer.next_token();
    while kind.is_trivia() {
        kind = self.lexer.next_token();
    }
    kind
}
/// Captures the current state as a checkpoint that [`Self::rewind`] can later restore.
///
/// The checkpoint holds a lexer checkpoint and the current length of the token vector.
pub(crate) fn checkpoint(&self) -> TokenSourceCheckpoint {
    let tokens_position = self.tokens.len();
    let lexer_checkpoint = self.lexer.checkpoint();
    TokenSourceCheckpoint {
        lexer_checkpoint,
        tokens_position,
    }
}
/// Restore the token source to the given checkpoint.
pub(crate) fn rewind(&mut self, checkpoint: TokenSourceCheckpoint) {
let TokenSourceCheckpoint {
lexer_checkpoint,
tokens_position,
} = checkpoint;
self.lexer.rewind(lexer_checkpoint);
self.tokens.truncate(tokens_position);
}
/// Returns the slice of recorded [`Token`]s covered by `range`.
///
/// The slice runs from the last token that starts at `range.start()` through the last token
/// that ends at `range.end()`. If either boundary token cannot be found, all recorded tokens
/// are returned instead.
pub(crate) fn in_range(&self, range: TextRange) -> &[Token] {
    let tokens = self.tokens.as_slice();
    let first = tokens
        .iter()
        .rposition(|token| token.start() == range.start());
    let last = tokens.iter().rposition(|token| token.end() == range.end());

    match (first, last) {
        (Some(first), Some(last)) => &tokens[first..=last],
        _ => tokens,
    }
}
/// Consumes the token source, returning the collected tokens and any errors encountered
/// during lexing. The token collection includes both the trivia and non-trivia tokens, but
/// not the trailing [`TokenKind::EndOfFile`] token.
///
/// # Panics
///
/// Panics if the current token is not [`TokenKind::EndOfFile`], i.e. the token source was not
/// fully consumed, or if the last recorded token is not the `EndOfFile` token.
pub(crate) fn finish(mut self) -> (Vec<Token>, Vec<LexicalError>) {
    assert_eq!(
        self.current_kind(),
        TokenKind::EndOfFile,
        "TokenSource was not fully consumed"
    );

    // The `EndOfFile` token shouldn't be included in the token stream, it's mainly to signal
    // the parser to stop. This isn't in `do_bump` because it only needs to be done once.
    if let Some(last) = self.tokens.pop() {
        assert_eq!(
            last.kind(),
            TokenKind::EndOfFile,
            "the last recorded token should be `EndOfFile`"
        );
    }

    (self.tokens, self.lexer.finish())
}
}
/// A snapshot of a [`TokenSource`], created by [`TokenSource::checkpoint`] and restored with
/// [`TokenSource::rewind`].
pub(crate) struct TokenSourceCheckpoint {
/// Checkpoint of the underlying lexer, taken when this snapshot was created.
lexer_checkpoint: LexerCheckpoint,
/// Length of the token vector when this snapshot was created; `rewind` truncates the vector
/// back to this length.
tokens_position: usize,
}
/// Creates an empty [`Vec`] whose capacity is an estimate of the number of tokens
/// in `contents`.
///
/// The estimate is 15% of the length of `contents` in bytes, rounded down.
///
/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
#[expect(dead_code)]
fn allocate_tokens_vec(contents: &str) -> Vec<Token> {
    // `saturating_mul` keeps the intermediate product from overflowing for huge inputs.
    Vec::with_capacity(contents.len().saturating_mul(15) / 100)
}