perl_parser_core/tokens/token_stream.rs

//! Token stream adapter between `perl-lexer` output and the parser.
//!
//! Provides buffered lookahead, skips trivia tokens, and resets lexer mode at
//! statement boundaries. This stream is optimized for parser consumption rather
//! than full-fidelity token preservation.
//!
//! # Basic usage
//!
//! ```
//! use perl_parser_core::tokens::token_stream::{TokenKind, TokenStream};
//!
//! let mut stream = TokenStream::new("my $x = 42;");
//! assert!(matches!(stream.peek(), Ok(token) if token.kind == TokenKind::My));
//!
//! while let Ok(token) = stream.next() {
//!     if token.kind == TokenKind::Eof {
//!         break;
//!     }
//! }
//! ```
//!
//! # Pre-lexed token stream
//!
//! For incremental parsing, use [`TokenStream::from_vec`] to create a stream
//! from pre-lexed tokens without re-lexing from source:
//!
//! ```
//! use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
//!
//! let tokens = vec![
//!     Token::new(TokenKind::My, "my", 0, 2),
//!     Token::new(TokenKind::ScalarSigil, "$", 3, 4),
//!     Token::new(TokenKind::Identifier, "x", 4, 5),
//!     Token::new(TokenKind::Assign, "=", 6, 7),
//!     Token::new(TokenKind::Number, "1", 8, 9),
//!     Token::new(TokenKind::Semicolon, ";", 9, 10),
//!     Token::new(TokenKind::Eof, "", 10, 10),
//! ];
//! let mut stream = TokenStream::from_vec(tokens);
//! assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
//! ```

use crate::syntax::error::{ParseError, ParseResult};
use perl_lexer::{LexerMode, PerlLexer, Token as LexerToken, TokenType as LexerTokenType};
pub use perl_token::{Token, TokenKind};
use std::collections::VecDeque;

/// Backing source for the token stream — either a live lexer or pre-lexed tokens.
enum TokenStreamInner<'a> {
    /// Live lexer producing tokens on demand from source text.
    Lexer(PerlLexer<'a>),
    /// Pre-lexed token buffer; used by [`TokenStream::from_vec`].
    Buffered(VecDeque<Token>),
}

/// Token stream that wraps perl-lexer or a pre-lexed token buffer.
///
/// Provides three-token lookahead, transparent trivia skipping (in lexer mode),
/// and statement-boundary state management used by the recursive-descent parser.
pub struct TokenStream<'a> {
    inner: TokenStreamInner<'a>,
    buffered_eof_pos: usize,
    peeked: Option<Token>,
    peeked_second: Option<Token>,
    peeked_third: Option<Token>,
}

impl<'a> TokenStream<'a> {
    /// Create a new token stream from source code.
    pub fn new(input: &'a str) -> Self {
        TokenStream {
            inner: TokenStreamInner::Lexer(PerlLexer::new(input)),
            buffered_eof_pos: input.len(),
            peeked: None,
            peeked_second: None,
            peeked_third: None,
        }
    }

    /// Create a token stream from a pre-lexed token list.
    ///
    /// This constructor skips lexing entirely and feeds tokens directly from the
    /// provided `Vec`. It is intended for the incremental parsing pipeline where
    /// tokens from a prior parse run can be reused for unchanged regions.
    ///
    /// # Behaviour differences from [`TokenStream::new`]
    ///
    /// - [`on_stmt_boundary`](Self::on_stmt_boundary): clears lookahead cache only;
    ///   no lexer mode reset (tokens are already classified).
    /// - [`relex_as_term`](Self::relex_as_term): clears lookahead cache only;
    ///   no re-lexing (token kinds are fixed from the original lex pass).
    /// - [`enter_format_mode`](Self::enter_format_mode): no-op.
    ///
    /// # Arguments
    ///
    /// * `tokens` — Pre-lexed tokens. An `Eof` token does **not** need to be
    ///   included; the stream synthesises one when the buffer is exhausted.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let tokens = vec![
    ///     Token::new(TokenKind::My, "my", 0, 2),
    ///     Token::new(TokenKind::Eof, "", 2, 2),
    /// ];
    /// let mut stream = TokenStream::from_vec(tokens);
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// ```
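    ///
    /// When no `Eof` token is supplied, the stream synthesises one at the end
    /// of the last token once the buffer is exhausted:
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::from_vec(vec![Token::new(TokenKind::Semicolon, ";", 0, 1)]);
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::Semicolon));
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::Eof));
    /// ```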
    pub fn from_vec(tokens: Vec<Token>) -> Self {
        let buffered_eof_pos = tokens
            .last()
            .map(|token| if token.kind == TokenKind::Eof { token.start } else { token.end })
            .unwrap_or(0);

        TokenStream {
            inner: TokenStreamInner::Buffered(VecDeque::from(tokens)),
            buffered_eof_pos,
            peeked: None,
            peeked_second: None,
            peeked_third: None,
        }
    }

    /// Convert raw [`LexerToken`]s to parser [`Token`]s, filtering out trivia.
    ///
    /// This is a convenience method for the incremental parsing pipeline where the
    /// token cache stores raw lexer tokens (including whitespace and comments) and
    /// needs to convert them to parser tokens before feeding to [`Self::from_vec`].
    ///
    /// Trivia token types (whitespace, newlines, comments, EOF) are discarded.
    /// All other token types are converted using the same mapping as the live
    /// [`TokenStream`] would apply.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use perl_parser_core::tokens::token_stream::{TokenKind, TokenStream};
    /// use perl_lexer::{PerlLexer, TokenType};
    ///
    /// // Collect raw lexer tokens
    /// let mut lexer = PerlLexer::new("my $x = 1;");
    /// let mut raw = Vec::new();
    /// while let Some(t) = lexer.next_token() {
    ///     if matches!(t.token_type, TokenType::EOF) { break; }
    ///     raw.push(t);
    /// }
    ///
    /// // Convert to parser tokens and build a stream
    /// let parser_tokens = TokenStream::lexer_tokens_to_parser_tokens(raw);
    /// let mut stream = TokenStream::from_vec(parser_tokens);
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// ```
    pub fn lexer_tokens_to_parser_tokens(tokens: Vec<LexerToken>) -> Vec<Token> {
        tokens
            .into_iter()
            .filter(|t| {
                !matches!(
                    t.token_type,
                    LexerTokenType::Whitespace
                        | LexerTokenType::Newline
                        | LexerTokenType::EOF
                        | LexerTokenType::Comment(_)
                )
            })
            .map(Self::convert_lexer_token)
            .collect()
    }

    /// Peek at the next token without consuming it.
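    ///
    /// # Examples
    ///
    /// A minimal sketch: peeking is idempotent and does not advance the stream.
    ///
    /// ```
    /// use perl_parser_core::tokens::token_stream::{TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::new("my $x = 42;");
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// // Still `my`: peek does not consume the token.
    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
    /// ```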
    pub fn peek(&mut self) -> ParseResult<&Token> {
        if self.peeked.is_none() {
            self.peeked = Some(self.next_token()?);
        }
        // Safe: we just ensured peeked is Some
        self.peeked.as_ref().ok_or(ParseError::UnexpectedEof)
    }

    /// Consume and return the next token.
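    ///
    /// # Examples
    ///
    /// A small sketch of the sticky-EOF behaviour: once `Eof` is reached,
    /// further calls keep returning `Eof` instead of erroring.
    ///
    /// ```
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::from_vec(vec![Token::new(TokenKind::Semicolon, ";", 0, 1)]);
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::Semicolon));
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::Eof));
    /// // EOF is sticky.
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::Eof));
    /// ```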
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> ParseResult<Token> {
        // If we have a peeked token, return it and shift the peek chain down
        if let Some(token) = self.peeked.take() {
            // Make EOF sticky - if we're returning EOF, put it back in the peek buffer
            // so future peeks still see EOF instead of getting an error
            if token.kind == TokenKind::Eof {
                self.peeked = Some(token.clone());
            } else {
                self.peeked = self.peeked_second.take();
                self.peeked_second = self.peeked_third.take();
            }
            Ok(token)
        } else {
            let token = self.next_token()?;
            // Make EOF sticky for fresh tokens too
            if token.kind == TokenKind::Eof {
                self.peeked = Some(token.clone());
            }
            Ok(token)
        }
    }

    /// Check if we're at the end of input.
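    ///
    /// ```
    /// use perl_parser_core::tokens::token_stream::TokenStream;
    ///
    /// // An empty pre-lexed buffer immediately synthesises `Eof`.
    /// let mut stream = TokenStream::from_vec(vec![]);
    /// assert!(stream.is_eof());
    /// ```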
    pub fn is_eof(&mut self) -> bool {
        matches!(self.peek(), Ok(token) if token.kind == TokenKind::Eof)
    }

    /// Peek at the second token (two tokens ahead).
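    ///
    /// # Examples
    ///
    /// A minimal sketch using a pre-lexed buffer; lookahead does not consume.
    ///
    /// ```
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::from_vec(vec![
    ///     Token::new(TokenKind::My, "my", 0, 2),
    ///     Token::new(TokenKind::ScalarSigil, "$", 3, 4),
    /// ]);
    /// assert!(matches!(stream.peek_second(), Ok(t) if t.kind == TokenKind::ScalarSigil));
    /// // The first token is still available.
    /// assert!(matches!(stream.next(), Ok(t) if t.kind == TokenKind::My));
    /// ```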
    pub fn peek_second(&mut self) -> ParseResult<&Token> {
        // First ensure we have a peeked token
        self.peek()?;

        // If we don't have a second peeked token, get it
        if self.peeked_second.is_none() {
            self.peeked_second = Some(self.next_token()?);
        }

        // Safe: we just ensured peeked_second is Some
        self.peeked_second.as_ref().ok_or(ParseError::UnexpectedEof)
    }

    /// Peek at the third token (three tokens ahead).
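    ///
    /// ```
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::from_vec(vec![
    ///     Token::new(TokenKind::My, "my", 0, 2),
    ///     Token::new(TokenKind::ScalarSigil, "$", 3, 4),
    ///     Token::new(TokenKind::Identifier, "x", 4, 5),
    /// ]);
    /// assert!(matches!(stream.peek_third(), Ok(t) if t.kind == TokenKind::Identifier));
    /// ```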
    pub fn peek_third(&mut self) -> ParseResult<&Token> {
        // First ensure we have peeked and second peeked tokens
        self.peek_second()?;

        // If we don't have a third peeked token, get it
        if self.peeked_third.is_none() {
            self.peeked_third = Some(self.next_token()?);
        }

        // Safe: we just ensured peeked_third is Some
        self.peeked_third.as_ref().ok_or(ParseError::UnexpectedEof)
    }

    /// Enter format body parsing mode in the lexer.
    ///
    /// No-op when operating in buffered (pre-lexed) mode — the tokens are
    /// already fully classified.
    pub fn enter_format_mode(&mut self) {
        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
            lexer.enter_format_mode();
        }
        // Buffered mode: no-op — tokens are pre-classified.
    }

    /// Called at statement boundaries to reset lexer state and clear cached lookahead.
    ///
    /// In buffered mode only the lookahead cache is cleared; no lexer mode reset
    /// is performed because the tokens are already fully classified.
    pub fn on_stmt_boundary(&mut self) {
        // Clear any cached lookahead tokens
        self.peeked = None;
        self.peeked_second = None;
        self.peeked_third = None;

        // Reset lexer to expect a term (start of new statement)
        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
            lexer.set_mode(LexerMode::ExpectTerm);
        }
        // Buffered mode: no lexer mode reset needed — tokens are pre-classified.
    }

    /// Re-lex the current peeked token in `ExpectTerm` mode.
    ///
    /// This is needed for context-sensitive constructs like `split /regex/`
    /// where the `/` was lexed as division (`Slash`) but should be a regex
    /// delimiter. Rolls the lexer back to the peeked token's start position,
    /// switches to `ExpectTerm` mode, and clears the peek cache so the next
    /// `peek()` or `next()` re-lexes it as a regex.
    ///
    /// In buffered mode the peek cache is cleared but no re-lexing occurs —
    /// token kinds are fixed from the original lex pass.
    pub fn relex_as_term(&mut self) {
        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
            if let Some(ref token) = self.peeked {
                use perl_lexer::Checkpointable;
                // Roll the lexer back to the peeked token's start position.
                let pos = token.start;
                let cp = perl_lexer::LexerCheckpoint::at_position(pos);
                lexer.restore(&cp);
                // Make the documented mode switch explicit (the restored
                // checkpoint may already carry it): the re-lex must happen in
                // ExpectTerm mode so `/` is read as a regex delimiter rather
                // than division.
                lexer.set_mode(LexerMode::ExpectTerm);
            }
        }
        // Both modes: clear the peek cache.
        self.peeked = None;
        self.peeked_second = None;
        self.peeked_third = None;
    }

    /// Pure peek cache invalidation; performs no mode changes.
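    ///
    /// Cached lookahead tokens are discarded outright; in buffered mode they
    /// cannot be re-produced, as this minimal sketch shows:
    ///
    /// ```
    /// use perl_parser_core::tokens::token_stream::{Token, TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::from_vec(vec![Token::new(TokenKind::My, "my", 0, 2)]);
    /// let _ = stream.peek(); // pops `my` into the lookahead cache
    /// stream.invalidate_peek(); // drops the cached token
    /// assert!(stream.is_eof());
    /// ```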
    pub fn invalidate_peek(&mut self) {
        self.peeked = None;
        self.peeked_second = None;
        self.peeked_third = None;
    }

    /// Convenience method for a one-shot fresh peek.
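    ///
    /// ```
    /// use perl_parser_core::tokens::token_stream::{TokenKind, TokenStream};
    ///
    /// let mut stream = TokenStream::new("my $x;");
    /// assert!(matches!(stream.peek_fresh_kind(), Some(TokenKind::My)));
    /// ```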
    pub fn peek_fresh_kind(&mut self) -> Option<TokenKind> {
        self.invalidate_peek();
        match self.peek() {
            Ok(token) => Some(token.kind),
            Err(_) => None,
        }
    }

    /// Get the next token from the backing source.
    fn next_token(&mut self) -> ParseResult<Token> {
        match &mut self.inner {
            TokenStreamInner::Lexer(lexer) => Self::next_token_from_lexer(lexer),
            TokenStreamInner::Buffered(buf) => {
                Self::next_token_from_buf(buf, &mut self.buffered_eof_pos)
            }
        }
    }

    /// Drain the next non-trivia token from the live lexer.
    fn next_token_from_lexer(lexer: &mut PerlLexer<'_>) -> ParseResult<Token> {
        // Skip whitespace and comments
        loop {
            let lexer_token = lexer.next_token().ok_or(ParseError::UnexpectedEof)?;

            match &lexer_token.token_type {
                LexerTokenType::Whitespace | LexerTokenType::Newline => continue,
                LexerTokenType::Comment(_) => continue,
                LexerTokenType::EOF => {
                    return Ok(Token {
                        kind: TokenKind::Eof,
                        text: String::new().into(),
                        start: lexer_token.start,
                        end: lexer_token.end,
                    });
                }
                _ => {
                    return Ok(Self::convert_lexer_token(lexer_token));
                }
            }
        }
    }

    /// Return the next token from the pre-lexed buffer.
    fn next_token_from_buf(
        buf: &mut VecDeque<Token>,
        buffered_eof_pos: &mut usize,
    ) -> ParseResult<Token> {
        match buf.pop_front() {
            Some(token) => {
                *buffered_eof_pos =
                    if token.kind == TokenKind::Eof { token.start } else { token.end };
                Ok(token)
            }
            // Synthesise EOF at the most recently known source position.
            None => Ok(Token::eof_at(*buffered_eof_pos)),
        }
    }

    /// Convert a raw lexer token to the parser `Token` type.
    ///
    /// Extracted from `next_token_from_lexer` to keep the match arm readable.
    fn convert_lexer_token(token: LexerToken) -> Token {
        let kind = match &token.token_type {
            // Keywords
            LexerTokenType::Keyword(kw) => match kw.as_ref() {
                "qw" => TokenKind::Identifier, // Keep as identifier but handle specially
                keyword => TokenKind::from_keyword(keyword).unwrap_or(TokenKind::Identifier),
            },

            // Operators
            LexerTokenType::Operator(op) => TokenKind::from_operator(op)
                // Sigils may be surfaced as operator tokens in some contexts.
                .or_else(|| TokenKind::from_sigil(op))
                .unwrap_or(TokenKind::Unknown),

            // Arrow tokens
            LexerTokenType::Arrow => TokenKind::Arrow,
            LexerTokenType::FatComma => TokenKind::FatArrow,

            // Delimiters
            LexerTokenType::LeftParen => TokenKind::LeftParen,
            LexerTokenType::RightParen => TokenKind::RightParen,
            LexerTokenType::LeftBrace => TokenKind::LeftBrace,
            LexerTokenType::RightBrace => TokenKind::RightBrace,
            LexerTokenType::LeftBracket => TokenKind::LeftBracket,
            LexerTokenType::RightBracket => TokenKind::RightBracket,
            LexerTokenType::Semicolon => TokenKind::Semicolon,
            LexerTokenType::Comma => TokenKind::Comma,

            // Division operator (important to handle before other tokens)
            LexerTokenType::Division => TokenKind::Slash,

            // Literals
            LexerTokenType::Number(_) => TokenKind::Number,
            LexerTokenType::StringLiteral | LexerTokenType::InterpolatedString(_) => {
                TokenKind::String
            }
            LexerTokenType::RegexMatch | LexerTokenType::QuoteRegex => TokenKind::Regex,
            LexerTokenType::Substitution => TokenKind::Substitution,
            LexerTokenType::Transliteration => TokenKind::Transliteration,
            LexerTokenType::QuoteSingle => TokenKind::QuoteSingle,
            LexerTokenType::QuoteDouble => TokenKind::QuoteDouble,
            LexerTokenType::QuoteWords => TokenKind::QuoteWords,
            LexerTokenType::QuoteCommand => TokenKind::QuoteCommand,
            LexerTokenType::HeredocStart => TokenKind::HeredocStart,
            LexerTokenType::HeredocBody(_) => TokenKind::HeredocBody,
            LexerTokenType::FormatBody(_) => TokenKind::FormatBody,
            LexerTokenType::Version(_) => TokenKind::VString,
            LexerTokenType::DataMarker(_) => TokenKind::DataMarker,
            LexerTokenType::DataBody(_) => TokenKind::DataBody,
            LexerTokenType::UnknownRest => TokenKind::UnknownRest,

            // Identifiers
            LexerTokenType::Identifier(text) => {
                // The lexer emits bare sigil characters ('%', '&') as Identifier
                // tokens in postfix-dereference contexts (e.g. `->%{key}`,
                // `%{$ref}`). Those must map to sigil kinds, NOT operator kinds,
                // so we check sigil priority first for the ambiguous cases.
                // '*' is the exception: as a bare identifier it is multiplication.
                match text.as_ref() {
                    "%" => TokenKind::HashSigil,
                    "&" => TokenKind::SubSigil,
                    _ => TokenKind::from_keyword(text)
                        .or_else(|| TokenKind::from_operator(text))
                        .or_else(|| TokenKind::from_sigil(text))
                        .unwrap_or(TokenKind::Identifier),
                }
            }

            // Handle error tokens that might be valid syntax
            LexerTokenType::Error(msg) => {
                // Check if it's a specific error we want to handle specially
                if msg.as_ref() == "Heredoc nesting too deep" {
                    TokenKind::HeredocDepthLimit
                } else {
                    // Check if it's a brace that the lexer couldn't recognize
                    TokenKind::from_delimiter(token.text.as_ref()).unwrap_or(TokenKind::Unknown)
                }
            }

            _ => TokenKind::Unknown,
        };

        Token { kind, text: token.text, start: token.start, end: token.end }
    }
}