//! Helper and utility methods for the CST parser.
//!
//! This module contains various helper methods used throughout the parsing process,
//! including token peeking, consumption, and logical line detection.
use super::Parser;
use crate::language::Token;
use crate::parsers::SyntaxKind;
use std::num::NonZeroUsize;
impl Parser<'_> {
/// Check if we've reached the end of the token stream.
pub(crate) fn is_at_end(&self) -> bool {
self.pos >= self.tokens.len()
}
/// Get the current token without advancing the position.
pub(crate) fn current_token(&self) -> Option<&Token> {
self.tokens.get(self.pos).map(|(_, token)| token)
}
/// Check if the current token matches the given token type.
pub(crate) fn at_token(&self, token: Token) -> bool {
self.current_token() == Some(&token)
}
/// Peek ahead to the next non-whitespace token (not necessarily a keyword, despite the name).
pub(crate) fn peek_next_keyword(&self) -> Option<Token> {
self.peek_next_count_keywords(NonZeroUsize::new(1).unwrap())
.next()
}
/// Check if the current token is an identifier.
pub(crate) fn is_identifier(&self) -> bool {
matches!(self.current_token(), Some(Token::Identifier))
}
/// Check if the current token is a keyword.
pub(crate) fn at_keyword(&self) -> bool {
self.current_token().is_some_and(Token::is_keyword)
}
/// Check if the current token is a number (any numeric literal type).
pub(crate) fn is_number(&self) -> bool {
matches!(
self.current_token(),
Some(
Token::IntegerLiteral
| Token::LongLiteral
| Token::SingleLiteral
| Token::DoubleLiteral
| Token::DecimalLiteral
)
)
}
/// Peek ahead and get up to `count` non-whitespace tokens following the current position.
///
/// Only `Token::Whitespace` is filtered out; the yielded tokens are not
/// necessarily keywords, despite the name.
///
/// # Arguments
/// * `count` - Maximum number of tokens to peek ahead (must be non-zero)
///
/// # Returns
/// An iterator over at most `count` non-whitespace tokens
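///
/// # Examples
///
/// Illustrative sketch (marked `ignore`: `Parser` construction is crate-internal
/// and elided here):
///
/// ```ignore
/// // Token stream after the current position: [Whitespace, MidKeyword, DollarSign]
/// let next: Vec<Token> = parser
///     .peek_next_count_keywords(NonZeroUsize::new(2).unwrap())
///     .collect();
/// // Whitespace is filtered out: next == [Token::MidKeyword, Token::DollarSign]
/// ```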
pub(crate) fn peek_next_count_keywords(
&self,
count: NonZeroUsize,
) -> impl Iterator<Item = Token> + '_ {
self.tokens
.iter()
.skip(self.pos + 1)
.filter(|(_, token)| *token != Token::Whitespace)
.take(count.get())
.map(|(_, token)| *token)
}
/// Peek ahead and get up to `count` tokens (including whitespace) following the current position.
pub(crate) fn peek_next_count_tokens(
&self,
count: NonZeroUsize,
) -> impl Iterator<Item = Token> + '_ {
self.tokens
.iter()
.skip(self.pos + 1)
.take(count.get())
.map(|(_, token)| *token)
}
/// Peek ahead to get the next token (including whitespace).
pub(crate) fn peek_next_token(&self) -> Option<Token> {
self.peek_next_count_tokens(NonZeroUsize::new(1).unwrap())
.next()
}
/// Consume the current token, emitting it to the tree builder with its mapped `SyntaxKind`, and advance past it.
pub(crate) fn consume_token(&mut self) {
if let Some((text, token)) = self.tokens.get(self.pos) {
let kind = SyntaxKind::from(*token);
self.builder.token(kind.to_raw(), text);
self.pos += 1;
}
}
/// Check if the current token is a keyword or identifier followed by `DollarSign`.
/// This pattern represents functions like `Error$`, `Mid$`, `Len$`, `UCase$`, `LCase$`.
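///
/// # Examples
///
/// Sketch only (marked `ignore`: parser construction is crate-internal and
/// elided). For the VB6 source `UCase$(s)`, the lexer produces an `Identifier`
/// token with text `"UCase"` followed by `DollarSign`:
///
/// ```ignore
/// // Current token: Identifier ("UCase"); next token: DollarSign.
/// assert!(parser.at_keyword_dollar());
/// ```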
pub(crate) fn at_keyword_dollar(&self) -> bool {
// Check for specific keywords that have $ variants
let is_dollar_keyword = matches!(
self.current_token(),
Some(
Token::ErrorKeyword
| Token::LenKeyword
| Token::MidKeyword
| Token::MidBKeyword
| Token::DateKeyword
| Token::StringKeyword
)
);
if is_dollar_keyword {
if let Some(Token::DollarSign) = self.peek_next_token() {
return true;
}
}
// Check for Identifier (like "UCase", "LCase", "Left", etc.) + DollarSign
if self.at_token(Token::Identifier) {
if let Some(Token::DollarSign) = self.peek_next_token() {
// Only report a match if the name is one of the known dollar functions
if let Some((text, _)) = self.tokens.get(self.pos) {
let text_upper = text.to_uppercase();
if matches!(
text_upper.as_str(),
"CHR"
| "CHRB"
| "CHRW"
| "COMMAND"
| "CURDIR"
| "DATE"
| "ENVIRON"
| "ERROR"
| "FORMAT"
| "HEX"
| "LCASE"
| "LEFT"
| "LEFTB"
| "LTRIM"
| "MID"
| "MIDB"
| "OCT"
| "RIGHT"
| "RIGHTB"
| "RTRIM"
| "SPACE"
| "STR"
| "TIME"
| "TRIM"
| "UCASE"
) {
return true;
}
}
}
}
false
}
/// Consume keyword/identifier + `DollarSign` as a merged Identifier token.
/// This merges tokens like `Error` + `$`, `Len` + `$`, `Mid` + `$`, etc. into single identifiers.
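///
/// # Examples
///
/// Sketch only (marked `ignore`: parser construction elided). With the current
/// token `MidKeyword` (text `"Mid"`) followed by `DollarSign` (text `"$"`):
///
/// ```ignore
/// parser.consume_keyword_dollar_as_identifier();
/// // The builder receives a single Identifier token with text "Mid$",
/// // and `pos` advances past both source tokens.
/// ```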
pub(crate) fn consume_keyword_dollar_as_identifier(&mut self) {
if self.at_keyword_dollar() {
// Get the text of both tokens
let first_text = self.tokens.get(self.pos).map_or("", |(text, _)| *text);
let dollar_text = self.tokens.get(self.pos + 1).map_or("", |(text, _)| *text);
// Create a combined text for the identifier
let combined_text = format!("{first_text}{dollar_text}");
// Add as a single Identifier token
self.builder
.token(SyntaxKind::Identifier.to_raw(), &combined_text);
// Skip both tokens
self.pos += 2;
}
}
/// Consume the current token as an Identifier, regardless of whether it's actually a keyword.
/// This is used when keywords appear in identifier positions (e.g., variable names, property names).
///
/// Special cases:
/// - If the current token is one of the `$`-capable keywords (`Error`, `Len`, `Mid`, `MidB`,
/// `Date`, `String`) followed by `DollarSign`, the pair is merged (e.g., "Error$")
/// - If the current token is an Identifier on the known dollar-function list (like `UCase`,
/// `LCase`, `Left`, `Trim`) followed by `DollarSign`, the pair is merged into a single
/// identifier (e.g., `UCase$`, `LCase$`)
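///
/// # Examples
///
/// Sketch only (marked `ignore`: parser construction elided). With the current
/// token `StringKeyword` (text `"String"`) appearing in an identifier position,
/// say as a member name:
///
/// ```ignore
/// parser.consume_token_as_identifier();
/// // The builder receives an Identifier token with text "String",
/// // even though the lexer classified it as a keyword.
/// ```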
pub(crate) fn consume_token_as_identifier(&mut self) {
// Check for keyword/identifier + $ special cases
if self.at_keyword_dollar() {
self.consume_keyword_dollar_as_identifier();
return;
}
if let Some((text, _)) = self.tokens.get(self.pos) {
self.builder.token(SyntaxKind::Identifier.to_raw(), text);
self.pos += 1;
}
}
/// Consume the run of whitespace tokens starting at the current position.
/// Also consumes line continuations (an underscore, optionally followed by
/// whitespace, then a newline), so whitespace spanning a continued line is
/// treated as a single run.
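///
/// # Examples
///
/// Sketch only (marked `ignore`: parser construction elided). For the VB6 fragment
///
/// ```text
/// a = b + _
///     c
/// ```
///
/// with the parser positioned just after `+`, the pending tokens are
/// `[Whitespace, Underscore, Newline, Whitespace, Identifier]`:
///
/// ```ignore
/// parser.consume_whitespace();
/// // The whitespace, the continuation underscore, the newline, and the
/// // leading whitespace on the next line are all consumed, leaving the
/// // parser at the `c` identifier.
/// ```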
pub(crate) fn consume_whitespace(&mut self) {
loop {
if self.at_token(Token::Whitespace) {
self.consume_token();
} else if self.at_token(Token::Underscore) {
// Check for line continuation: Underscore [Whitespace] Newline
let mut lookahead = 1;
let mut is_continuation = false;
// Skip whitespace after underscore
while let Some((_, token)) = self.tokens.get(self.pos + lookahead) {
if *token == Token::Whitespace {
lookahead += 1;
} else if *token == Token::Newline {
is_continuation = true;
break;
} else {
break;
}
}
if is_continuation {
// Consume Underscore
self.consume_token();
// Consume any whitespace between the underscore and the Newline
while !self.at_token(Token::Newline) {
self.consume_token();
}
self.consume_token(); // Consume Newline
} else {
break;
}
} else {
break;
}
}
}
/// Consume the current token as an Unknown token.
pub(crate) fn consume_token_as_unknown(&mut self) {
if let Some((text, _)) = self.tokens.get(self.pos) {
self.builder.token(SyntaxKind::Unknown.to_raw(), text);
self.pos += 1;
}
}
/// Consume tokens until reaching the specified token or the end of input.
/// The specified token is NOT consumed.
///
/// Handles line continuations when consuming until a newline.
/// Special handling: keyword/identifier followed by `DollarSign` is merged into a single Identifier.
///
/// # Arguments
/// * `target` - The token to stop at (will not be consumed)
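///
/// # Examples
///
/// Sketch only (marked `ignore`: parser construction elided). Skipping to the
/// end of a logical line that uses a continuation:
///
/// ```ignore
/// // Remaining tokens: ..., Underscore, Newline, Whitespace, Identifier, Newline
/// parser.consume_until(Token::Newline);
/// // The first Newline follows an Underscore, so it is treated as a line
/// // continuation and consumed; the parser stops at the final Newline,
/// // which is left unconsumed.
/// ```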
pub(crate) fn consume_until(&mut self, target: Token) {
while !self.is_at_end() && !self.at_token(target) {
// Check for keyword/identifier + $ pattern and merge it
if self.at_keyword_dollar() {
self.consume_keyword_dollar_as_identifier();
} else {
self.consume_token();
}
}
// If we're looking for a newline and found one, check for a line continuation:
// in VB6, a trailing underscore (optionally followed by whitespace) before a
// newline means "continue on the next line".
if target == Token::Newline && self.at_token(Token::Newline) {
// Look back to see if there was an underscore before this newline
// We need to check if the last non-whitespace token was an underscore
let mut check_pos = self.pos;
while check_pos > 0 {
check_pos -= 1;
if let Some((_, token)) = self.tokens.get(check_pos) {
match token {
Token::Whitespace => continue, // Skip whitespace
Token::Underscore => {
// Found line continuation! Consume the newline and keep going
self.consume_token(); // Consume the newline
// Continue consuming until we find a newline without continuation
self.consume_until(target);
return;
}
_ => break, // Not a continuation
}
}
break;
}
}
}
/// Consume tokens until reaching the specified token, then consume that token as well.
///
/// This is a convenience method that combines `consume_until` with consuming the target token.
/// Handles line continuations when consuming until a newline.
///
/// # Arguments
/// * `target` - The token to stop at and consume
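///
/// # Examples
///
/// Sketch only (marked `ignore`: parser construction elided):
///
/// ```ignore
/// // Skip the rest of the current statement, including its terminating newline.
/// parser.consume_until_after(Token::Newline);
/// ```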
pub(crate) fn consume_until_after(&mut self, target: Token) {
self.consume_until(target);
if self.at_token(target) {
self.consume_token();
}
}
}