perl_parser_core/tokens/token_stream.rs
//! Token stream adapter between `perl-lexer` output and the parser.
//!
//! Provides buffered lookahead, skips trivia tokens, and resets lexer mode at
//! statement boundaries. This stream is optimized for parser consumption rather
//! than full-fidelity token preservation.
//!
//! # Basic usage
//!
//! ```
//! use perl_parser_core::tokens::token_stream::{TokenKind, TokenStream};
//!
//! let mut stream = TokenStream::new("my $x = 42;");
//! assert!(matches!(stream.peek(), Ok(token) if token.kind == TokenKind::My));
//!
//! while let Ok(token) = stream.next() {
//!     if token.kind == TokenKind::Eof {
//!         break;
//!     }
//! }
//! ```
//!
//! # Pre-lexed token stream
//!
//! For incremental parsing, use [`TokenStream::from_vec`] to create a stream
//! from pre-lexed tokens without re-lexing from source:
//!
//! ```
//! use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
//!
//! let tokens = vec![
//!     Token::new(TokenKind::My, "my", 0, 2),
//!     Token::new(TokenKind::ScalarSigil, "$", 3, 4),
//!     Token::new(TokenKind::Identifier, "x", 4, 5),
//!     Token::new(TokenKind::Assign, "=", 6, 7),
//!     Token::new(TokenKind::Number, "1", 8, 9),
//!     Token::new(TokenKind::Semicolon, ";", 9, 10),
//!     Token::new(TokenKind::Eof, "", 10, 10),
//! ];
//! let mut stream = TokenStream::from_vec(tokens);
//! assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
//! ```

use crate::syntax::error::{ParseError, ParseResult};
use perl_lexer::{LexerMode, PerlLexer, Token as LexerToken, TokenType as LexerTokenType};
pub use perl_token::{Token, TokenKind};
use std::collections::VecDeque;

/// Backing source for the token stream — either a live lexer or pre-lexed tokens.
enum TokenStreamInner<'a> {
    /// Live lexer producing tokens on demand from source text.
    Lexer(PerlLexer<'a>),
    /// Pre-lexed token buffer; used by [`TokenStream::from_vec`].
    Buffered(VecDeque<Token>),
}

/// Token stream that wraps perl-lexer or a pre-lexed token buffer.
///
/// Provides three-token lookahead, transparent trivia skipping (in lexer mode),
/// and statement-boundary state management used by the recursive-descent parser.
pub struct TokenStream<'a> {
    inner: TokenStreamInner<'a>,
    buffered_eof_pos: usize,
    peeked: Option<Token>,
    peeked_second: Option<Token>,
    peeked_third: Option<Token>,
}

impl<'a> TokenStream<'a> {
    /// Create a new token stream from source code.
    pub fn new(input: &'a str) -> Self {
        TokenStream {
            inner: TokenStreamInner::Lexer(PerlLexer::new(input)),
            buffered_eof_pos: input.len(),
            peeked: None,
            peeked_second: None,
            peeked_third: None,
        }
    }

    /// Create a token stream from a pre-lexed token list.
    ///
    /// This constructor skips lexing entirely and feeds tokens directly from the
    /// provided `Vec`. It is intended for the incremental parsing pipeline, where
    /// tokens from a prior parse run can be reused for unchanged regions.
    ///
    /// # Behaviour differences from [`TokenStream::new`]
    ///
    /// - [`on_stmt_boundary`](Self::on_stmt_boundary): clears lookahead cache only;
    ///   no lexer mode reset (tokens are already classified).
    /// - [`relex_as_term`](Self::relex_as_term): clears lookahead cache only;
    ///   no re-lexing (token kinds are fixed from the original lex pass).
    /// - [`enter_format_mode`](Self::enter_format_mode): no-op.
    ///
    /// # Arguments
    ///
    /// * `tokens` — Pre-lexed tokens. An `Eof` token does **not** need to be
    ///   included; the stream synthesises one when the buffer is exhausted.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let tokens = vec![
    ///     Token::new(TokenKind::My, "my", 0, 2),
    ///     Token::new(TokenKind::Eof, "", 2, 2),
    /// ];
    /// let mut stream = TokenStream::from_vec(tokens);
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// ```
    pub fn from_vec(tokens: Vec<Token>) -> Self {
        let buffered_eof_pos = tokens
            .last()
            .map(|token| if token.kind == TokenKind::Eof { token.start } else { token.end })
            .unwrap_or(0);

        TokenStream {
            inner: TokenStreamInner::Buffered(VecDeque::from(tokens)),
            buffered_eof_pos,
            peeked: None,
            peeked_second: None,
            peeked_third: None,
        }
    }

    /// Convert raw [`LexerToken`]s to parser [`Token`]s, filtering out trivia.
    ///
    /// This is a convenience method for the incremental parsing pipeline, where the
    /// token cache stores raw lexer tokens (including whitespace and comments) and
    /// needs to convert them to parser tokens before feeding them to [`Self::from_vec`].
    ///
    /// Trivia token types (whitespace, newlines, comments, EOF) are discarded.
    /// All other token types are converted using the same mapping as the live
    /// [`TokenStream`] would apply.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{TokenKind, TokenStream};
    /// use perl_lexer::{PerlLexer, TokenType};
    ///
    /// // Collect raw lexer tokens
    /// let mut lexer = PerlLexer::new("my $x = 1;");
    /// let mut raw = Vec::new();
    /// while let Some(t) = lexer.next_token() {
    ///     if matches!(t.token_type, TokenType::EOF) { break; }
    ///     raw.push(t);
    /// }
    ///
    /// // Convert to parser tokens and build a stream
    /// let parser_tokens = TokenStream::lexer_tokens_to_parser_tokens(raw);
    /// let mut stream = TokenStream::from_vec(parser_tokens);
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// ```
    pub fn lexer_tokens_to_parser_tokens(tokens: Vec<LexerToken>) -> Vec<Token> {
        tokens
            .into_iter()
            .filter(|t| {
                !matches!(
                    t.token_type,
                    LexerTokenType::Whitespace
                        | LexerTokenType::Newline
                        | LexerTokenType::EOF
                        | LexerTokenType::Comment(_)
                )
            })
            .map(Self::convert_lexer_token)
            .collect()
    }

    /// Peek at the next token without consuming it.
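    ///
    /// # Examples
    ///
    /// A minimal sketch using a hand-built buffer via [`TokenStream::from_vec`],
    /// so no live-lexer behaviour is assumed; repeated peeks return the same
    /// token until [`next`](Self::next) consumes it:
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::from_vec(vec![Token::new(TokenKind::My, "my", 0, 2)]);
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// // Peeking again does not advance the stream.
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// ```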
    pub fn peek(&mut self) -> ParseResult<&Token> {
        if self.peeked.is_none() {
            self.peeked = Some(self.next_token()?);
        }
        // Safe: we just ensured peeked is Some
        self.peeked.as_ref().ok_or(ParseError::UnexpectedEof)
    }

    /// Consume and return the next token.
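    ///
    /// `Eof` is sticky: once returned, subsequent calls keep returning `Eof`
    /// rather than erroring.
    ///
    /// # Examples
    ///
    /// A minimal sketch using a hand-built buffer via [`TokenStream::from_vec`]
    /// (so no live-lexer behaviour is assumed); the stream synthesises a sticky
    /// `Eof` once the buffer is exhausted:
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::from_vec(vec![Token::new(TokenKind::My, "my", 0, 2)]);
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::My));
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::Eof));
    /// // Eof repeats instead of producing an error.
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::Eof));
    /// ```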
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> ParseResult<Token> {
        // If we have a peeked token, return it and shift the peek chain down
        if let Some(token) = self.peeked.take() {
            // Make EOF sticky - if we're returning EOF, put it back in the peek buffer
            // so future peeks still see EOF instead of getting an error
            if token.kind == TokenKind::Eof {
                self.peeked = Some(token.clone());
            } else {
                self.peeked = self.peeked_second.take();
                self.peeked_second = self.peeked_third.take();
            }
            Ok(token)
        } else {
            let token = self.next_token()?;
            // Make EOF sticky for fresh tokens too
            if token.kind == TokenKind::Eof {
                self.peeked = Some(token.clone());
            }
            Ok(token)
        }
    }

    /// Check if we're at the end of input.
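    ///
    /// # Examples
    ///
    /// A minimal sketch: an empty pre-lexed buffer is immediately at `Eof`,
    /// because the stream synthesises the `Eof` token itself:
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::TokenStream;
    ///
    /// let mut stream = TokenStream::from_vec(vec![]);
    /// assert!(stream.is_eof());
    /// ```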
    pub fn is_eof(&mut self) -> bool {
        matches!(self.peek(), Ok(token) if token.kind == TokenKind::Eof)
    }

    /// Peek at the second token (two tokens ahead).
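    ///
    /// # Examples
    ///
    /// A minimal sketch of the three-token lookahead using a hand-built buffer
    /// (so no live-lexer behaviour is assumed); none of the peeks consume input:
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let tokens = vec![
    ///     Token::new(TokenKind::My, "my", 0, 2),
    ///     Token::new(TokenKind::ScalarSigil, "$", 3, 4),
    ///     Token::new(TokenKind::Identifier, "x", 4, 5),
    /// ];
    /// let mut stream = TokenStream::from_vec(tokens);
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// assert!(matches!(stream.peek_second(), Ok(t) if t.kind == TokenKind::ScalarSigil));
    /// assert!(matches!(stream.peek_third(), Ok(t) if t.kind == TokenKind::Identifier));
    /// // The first token is still unconsumed.
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::My));
    /// ```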
    pub fn peek_second(&mut self) -> ParseResult<&Token> {
        // First ensure we have a peeked token
        self.peek()?;

        // If we don't have a second peeked token, get it
        if self.peeked_second.is_none() {
            self.peeked_second = Some(self.next_token()?);
        }

        // Safe: we just ensured peeked_second is Some
        self.peeked_second.as_ref().ok_or(ParseError::UnexpectedEof)
    }

    /// Peek at the third token (three tokens ahead).
    pub fn peek_third(&mut self) -> ParseResult<&Token> {
        // First ensure we have peeked and second peeked tokens
        self.peek_second()?;

        // If we don't have a third peeked token, get it
        if self.peeked_third.is_none() {
            self.peeked_third = Some(self.next_token()?);
        }

        // Safe: we just ensured peeked_third is Some
        self.peeked_third.as_ref().ok_or(ParseError::UnexpectedEof)
    }

    /// Enter format body parsing mode in the lexer.
    ///
    /// No-op when operating in buffered (pre-lexed) mode — the tokens are
    /// already fully classified.
    pub fn enter_format_mode(&mut self) {
        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
            lexer.enter_format_mode();
        }
        // Buffered mode: no-op — tokens are pre-classified.
    }

    /// Called at statement boundaries to reset lexer state and clear cached lookahead.
    ///
    /// In buffered mode only the lookahead cache is cleared; no lexer mode reset
    /// is performed because the tokens are already fully classified.
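    ///
    /// # Examples
    ///
    /// A minimal sketch of the buffered-mode behaviour: cached lookahead is
    /// discarded, not replayed, so the next peek sees the token after the
    /// dropped one:
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let tokens = vec![
    ///     Token::new(TokenKind::My, "my", 0, 2),
    ///     Token::new(TokenKind::ScalarSigil, "$", 3, 4),
    /// ];
    /// let mut stream = TokenStream::from_vec(tokens);
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My)); // caches `my`
    /// stream.on_stmt_boundary(); // clears the cache; `my` is not replayed
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::ScalarSigil));
    /// ```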
    pub fn on_stmt_boundary(&mut self) {
        // Clear any cached lookahead tokens
        self.peeked = None;
        self.peeked_second = None;
        self.peeked_third = None;

        // Reset lexer to expect a term (start of new statement)
        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
            lexer.set_mode(LexerMode::ExpectTerm);
        }
        // Buffered mode: no lexer mode reset needed — tokens are pre-classified.
    }

    /// Re-lex the current peeked token in `ExpectTerm` mode.
    ///
    /// This is needed for context-sensitive constructs like `split /regex/`,
    /// where the `/` was lexed as division (`Slash`) but should be a regex
    /// delimiter. Rolls the lexer back to the peeked token's start position,
    /// switches to `ExpectTerm` mode, and clears the peek cache so the next
    /// `peek()` or `next()` re-lexes it as a regex.
    ///
    /// In buffered mode the peek cache is cleared but no re-lexing occurs —
    /// token kinds are fixed from the original lex pass.
    pub fn relex_as_term(&mut self) {
        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
            if let Some(ref token) = self.peeked {
                use perl_lexer::Checkpointable;
                let pos = token.start;
                // Build a checkpoint at the peeked token's position with ExpectTerm mode
                let cp = perl_lexer::LexerCheckpoint::at_position(pos);
                lexer.restore(&cp);
            }
        }
        // Both modes: clear the peek cache.
        self.peeked = None;
        self.peeked_second = None;
        self.peeked_third = None;
    }

    /// Pure peek cache invalidation: no mode changes.
    pub fn invalidate_peek(&mut self) {
        self.peeked = None;
        self.peeked_second = None;
        self.peeked_third = None;
    }

    /// Convenience method for a one-shot fresh peek.
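    ///
    /// Invalidates the lookahead cache, then peeks once, returning just the
    /// [`TokenKind`] (or `None` on a lex error).
    ///
    /// # Examples
    ///
    /// A minimal sketch over a hand-built buffer (no live-lexer behaviour
    /// assumed):
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::from_vec(vec![Token::new(TokenKind::My, "my", 0, 2)]);
    /// assert!(matches!(stream.peek_fresh_kind(), Some(TokenKind::My)));
    /// ```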
    pub fn peek_fresh_kind(&mut self) -> Option<TokenKind> {
        self.invalidate_peek();
        match self.peek() {
            Ok(token) => Some(token.kind),
            Err(_) => None,
        }
    }

    /// Get the next token from the backing source.
    fn next_token(&mut self) -> ParseResult<Token> {
        match &mut self.inner {
            TokenStreamInner::Lexer(lexer) => Self::next_token_from_lexer(lexer),
            TokenStreamInner::Buffered(buf) => {
                Self::next_token_from_buf(buf, &mut self.buffered_eof_pos)
            }
        }
    }

    /// Drain the next non-trivia token from the live lexer.
    fn next_token_from_lexer(lexer: &mut PerlLexer<'_>) -> ParseResult<Token> {
        // Skip whitespace and comments
        loop {
            let lexer_token = lexer.next_token().ok_or(ParseError::UnexpectedEof)?;

            match &lexer_token.token_type {
                LexerTokenType::Whitespace | LexerTokenType::Newline => continue,
                LexerTokenType::Comment(_) => continue,
                LexerTokenType::EOF => {
                    return Ok(Token {
                        kind: TokenKind::Eof,
                        text: String::new().into(),
                        start: lexer_token.start,
                        end: lexer_token.end,
                    });
                }
                _ => {
                    return Ok(Self::convert_lexer_token(lexer_token));
                }
            }
        }
    }

    /// Return the next token from the pre-lexed buffer.
    fn next_token_from_buf(
        buf: &mut VecDeque<Token>,
        buffered_eof_pos: &mut usize,
    ) -> ParseResult<Token> {
        match buf.pop_front() {
            Some(token) => {
                // Track where a synthesised Eof should land: an explicit Eof
                // keeps its own start, otherwise the end of the last real token.
                *buffered_eof_pos =
                    if token.kind == TokenKind::Eof { token.start } else { token.end };
                Ok(token)
            }
            // Synthesise EOF at the most recently known source position.
            None => Ok(Token::eof_at(*buffered_eof_pos)),
        }
    }

    /// Convert a raw lexer token to the parser `Token` type.
    ///
    /// Extracted from `next_token_from_lexer` to keep the match arm readable.
    fn convert_lexer_token(token: LexerToken) -> Token {
        let kind = match &token.token_type {
            // Keywords
            LexerTokenType::Keyword(kw) => match kw.as_ref() {
                "qw" => TokenKind::Identifier, // Keep as identifier but handle specially
                keyword => TokenKind::from_keyword(keyword).unwrap_or(TokenKind::Identifier),
            },

            // Operators
            LexerTokenType::Operator(op) => TokenKind::from_operator(op)
                // Sigils may be surfaced as operator tokens in some contexts.
                .or_else(|| TokenKind::from_sigil(op))
                .unwrap_or(TokenKind::Unknown),

            // Arrow tokens
            LexerTokenType::Arrow => TokenKind::Arrow,
            LexerTokenType::FatComma => TokenKind::FatArrow,

            // Delimiters
            LexerTokenType::LeftParen => TokenKind::LeftParen,
            LexerTokenType::RightParen => TokenKind::RightParen,
            LexerTokenType::LeftBrace => TokenKind::LeftBrace,
            LexerTokenType::RightBrace => TokenKind::RightBrace,
            LexerTokenType::LeftBracket => TokenKind::LeftBracket,
            LexerTokenType::RightBracket => TokenKind::RightBracket,
            LexerTokenType::Semicolon => TokenKind::Semicolon,
            LexerTokenType::Comma => TokenKind::Comma,

            // Division operator (important to handle before other tokens)
            LexerTokenType::Division => TokenKind::Slash,

            // Literals
            LexerTokenType::Number(_) => TokenKind::Number,
            LexerTokenType::StringLiteral | LexerTokenType::InterpolatedString(_) => {
                TokenKind::String
            }
            LexerTokenType::RegexMatch | LexerTokenType::QuoteRegex => TokenKind::Regex,
            LexerTokenType::Substitution => TokenKind::Substitution,
            LexerTokenType::Transliteration => TokenKind::Transliteration,
            LexerTokenType::QuoteSingle => TokenKind::QuoteSingle,
            LexerTokenType::QuoteDouble => TokenKind::QuoteDouble,
            LexerTokenType::QuoteWords => TokenKind::QuoteWords,
            LexerTokenType::QuoteCommand => TokenKind::QuoteCommand,
            LexerTokenType::HeredocStart => TokenKind::HeredocStart,
            LexerTokenType::HeredocBody(_) => TokenKind::HeredocBody,
            LexerTokenType::FormatBody(_) => TokenKind::FormatBody,
            LexerTokenType::Version(_) => TokenKind::VString,
            LexerTokenType::DataMarker(_) => TokenKind::DataMarker,
            LexerTokenType::DataBody(_) => TokenKind::DataBody,
            LexerTokenType::UnknownRest => TokenKind::UnknownRest,

            // Identifiers
            LexerTokenType::Identifier(text) => {
                // The lexer emits bare sigil characters ('%', '&') as Identifier
                // tokens in postfix-dereference contexts (e.g. `->%{key}`,
                // `%{$ref}`). Those must map to sigil kinds, NOT operator kinds,
                // so we check sigil priority first for the ambiguous cases.
                // '*' is the exception: as a bare identifier it is multiplication.
                match text.as_ref() {
                    "%" => TokenKind::HashSigil,
                    "&" => TokenKind::SubSigil,
                    _ => TokenKind::from_keyword(text)
                        .or_else(|| TokenKind::from_operator(text))
                        .or_else(|| TokenKind::from_sigil(text))
                        .unwrap_or(TokenKind::Identifier),
                }
            }

            // Handle error tokens that might be valid syntax
            LexerTokenType::Error(msg) => {
                // Check if it's a specific error we want to handle specially
                if msg.as_ref() == "Heredoc nesting too deep" {
                    TokenKind::HeredocDepthLimit
                } else {
                    // Check if it's a brace that the lexer couldn't recognize
                    TokenKind::from_delimiter(token.text.as_ref()).unwrap_or(TokenKind::Unknown)
                }
            }

            _ => TokenKind::Unknown,
        };

        Token { kind, text: token.text, start: token.start, end: token.end }
    }
}