// perl_tokenizer/token_stream.rs
//! Token stream adapter between `perl-lexer` output and the parser.
//!
//! Provides buffered lookahead, skips trivia tokens, and resets lexer mode at
//! statement boundaries. This stream is optimized for parser consumption rather
//! than full-fidelity token preservation.
//!
//! # Basic usage
//!
//! ```
//! use perl_tokenizer::{TokenKind, TokenStream};
//!
//! let mut stream = TokenStream::new("my $x = 42;");
//! assert!(matches!(stream.peek(), Ok(token) if token.kind == TokenKind::My));
//!
//! while let Ok(token) = stream.next() {
//!     if token.kind == TokenKind::Eof {
//!         break;
//!     }
//! }
//! ```
//!
//! # Pre-lexed token stream
//!
//! For incremental parsing, use [`TokenStream::from_vec`] to create a stream
//! from pre-lexed tokens without re-lexing from source:
//!
//! ```
//! use perl_tokenizer::{Token, TokenKind, TokenStream};
//!
//! let tokens = vec![
//!     Token::new(TokenKind::My, "my", 0, 2),
//!     Token::new(TokenKind::ScalarSigil, "$", 3, 4),
//!     Token::new(TokenKind::Identifier, "x", 4, 5),
//!     Token::new(TokenKind::Assign, "=", 6, 7),
//!     Token::new(TokenKind::Number, "1", 8, 9),
//!     Token::new(TokenKind::Semicolon, ";", 9, 10),
//!     Token::new(TokenKind::Eof, "", 10, 10),
//! ];
//! let mut stream = TokenStream::from_vec(tokens);
//! assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
//! ```

use perl_error::{ParseError, ParseResult};
use perl_lexer::{LexerMode, PerlLexer, Token as LexerToken, TokenType as LexerTokenType};
pub use perl_token::{Token, TokenKind};
use std::collections::VecDeque;

/// Backing source for the token stream — either a live lexer or pre-lexed tokens.
///
/// Only the `Lexer` variant can skip trivia, reset modes, or re-lex; the
/// `Buffered` variant replays fixed tokens and treats those operations as no-ops.
enum TokenStreamInner<'a> {
    /// Live lexer producing tokens on demand from source text.
    Lexer(PerlLexer<'a>),
    /// Pre-lexed token buffer; used by [`TokenStream::from_vec`].
    Buffered(VecDeque<Token>),
}

/// Token stream that wraps perl-lexer or a pre-lexed token buffer.
///
/// Provides three-token lookahead, transparent trivia skipping (in lexer mode),
/// and statement-boundary state management used by the recursive-descent parser.
pub struct TokenStream<'a> {
    /// Backing source: live lexer or pre-lexed buffer.
    inner: TokenStreamInner<'a>,
    /// One-token lookahead cache; also holds the sticky EOF token.
    peeked: Option<Token>,
    /// Two-token lookahead cache; filled on demand by `peek_second`.
    peeked_second: Option<Token>,
    /// Three-token lookahead cache; filled on demand by `peek_third`.
    peeked_third: Option<Token>,
}

impl<'a> TokenStream<'a> {
68    /// Create a new token stream from source code.
69    pub fn new(input: &'a str) -> Self {
70        TokenStream {
71            inner: TokenStreamInner::Lexer(PerlLexer::new(input)),
72            peeked: None,
73            peeked_second: None,
74            peeked_third: None,
75        }
76    }
77
78    /// Create a token stream from a pre-lexed token list.
79    ///
80    /// This constructor skips lexing entirely and feeds tokens directly from the
81    /// provided `Vec`. It is intended for the incremental parsing pipeline where
82    /// tokens from a prior parse run can be reused for unchanged regions.
83    ///
84    /// # Behaviour differences from [`TokenStream::new`]
85    ///
86    /// - [`on_stmt_boundary`](Self::on_stmt_boundary): clears lookahead cache only;
87    ///   no lexer mode reset (tokens are already classified).
88    /// - [`relex_as_term`](Self::relex_as_term): clears lookahead cache only;
89    ///   no re-lexing (token kinds are fixed from the original lex pass).
90    /// - [`enter_format_mode`](Self::enter_format_mode): no-op.
91    ///
92    /// # Arguments
93    ///
94    /// * `tokens` — Pre-lexed tokens. An `Eof` token does **not** need to be
95    ///   included; the stream synthesises one when the buffer is exhausted.
96    ///
97    /// # Examples
98    ///
99    /// ```rust
100    /// use perl_tokenizer::{Token, TokenKind, TokenStream};
101    ///
102    /// let tokens = vec![
103    ///     Token::new(TokenKind::My, "my", 0, 2),
104    ///     Token::new(TokenKind::Eof, "", 2, 2),
105    /// ];
106    /// let mut stream = TokenStream::from_vec(tokens);
107    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
108    /// ```
109    pub fn from_vec(tokens: Vec<Token>) -> Self {
110        TokenStream {
111            inner: TokenStreamInner::Buffered(VecDeque::from(tokens)),
112            peeked: None,
113            peeked_second: None,
114            peeked_third: None,
115        }
116    }
117
118    /// Convert a slice of raw [`LexerToken`]s to parser [`Token`]s, filtering out trivia.
119    ///
120    /// This is a convenience method for the incremental parsing pipeline where the
121    /// token cache stores raw lexer tokens (including whitespace and comments) and
122    /// needs to convert them to parser tokens before feeding to [`Self::from_vec`].
123    ///
124    /// Trivia token types (whitespace, newlines, comments, EOF) are discarded.
125    /// All other token types are converted using the same mapping as the live
126    /// [`TokenStream`] would apply.
127    ///
128    /// # Examples
129    ///
130    /// ```rust
131    /// use perl_tokenizer::{TokenKind, TokenStream};
132    /// use perl_lexer::{PerlLexer, TokenType};
133    ///
134    /// // Collect raw lexer tokens
135    /// let mut lexer = PerlLexer::new("my $x = 1;");
136    /// let mut raw = Vec::new();
137    /// while let Some(t) = lexer.next_token() {
138    ///     if matches!(t.token_type, TokenType::EOF) { break; }
139    ///     raw.push(t);
140    /// }
141    ///
142    /// // Convert to parser tokens and build a stream
143    /// let parser_tokens = TokenStream::lexer_tokens_to_parser_tokens(raw);
144    /// let mut stream = TokenStream::from_vec(parser_tokens);
145    /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
146    /// ```
147    pub fn lexer_tokens_to_parser_tokens(tokens: Vec<LexerToken>) -> Vec<Token> {
148        tokens
149            .into_iter()
150            .filter(|t| {
151                !matches!(
152                    t.token_type,
153                    LexerTokenType::Whitespace | LexerTokenType::Newline | LexerTokenType::EOF
154                ) && !matches!(t.token_type, LexerTokenType::Comment(_))
155            })
156            .map(Self::convert_lexer_token)
157            .collect()
158    }
159
160    /// Peek at the next token without consuming it
161    pub fn peek(&mut self) -> ParseResult<&Token> {
162        if self.peeked.is_none() {
163            self.peeked = Some(self.next_token()?);
164        }
165        // Safe: we just ensured peeked is Some
166        self.peeked.as_ref().ok_or(ParseError::UnexpectedEof)
167    }
168
169    /// Consume and return the next token
170    #[allow(clippy::should_implement_trait)]
171    pub fn next(&mut self) -> ParseResult<Token> {
172        // If we have a peeked token, return it and shift the peek chain down
173
174        if let Some(token) = self.peeked.take() {
175            // Make EOF sticky - if we're returning EOF, put it back in the peek buffer
176            // so future peeks still see EOF instead of getting an error
177            if token.kind == TokenKind::Eof {
178                self.peeked = Some(token.clone());
179            } else {
180                self.peeked = self.peeked_second.take();
181                self.peeked_second = self.peeked_third.take();
182            }
183            Ok(token)
184        } else {
185            let token = self.next_token()?;
186            // Make EOF sticky for fresh tokens too
187            if token.kind == TokenKind::Eof {
188                self.peeked = Some(token.clone());
189            }
190            Ok(token)
191        }
192    }
193
194    /// Check if we're at the end of input
195    pub fn is_eof(&mut self) -> bool {
196        matches!(self.peek(), Ok(token) if token.kind == TokenKind::Eof)
197    }
198
199    /// Peek at the second token (two tokens ahead)
200    pub fn peek_second(&mut self) -> ParseResult<&Token> {
201        // First ensure we have a peeked token
202        self.peek()?;
203
204        // If we don't have a second peeked token, get it
205        if self.peeked_second.is_none() {
206            self.peeked_second = Some(self.next_token()?);
207        }
208
209        // Safe: we just ensured peeked_second is Some
210        self.peeked_second.as_ref().ok_or(ParseError::UnexpectedEof)
211    }
212
213    /// Peek at the third token (three tokens ahead)
214    pub fn peek_third(&mut self) -> ParseResult<&Token> {
215        // First ensure we have peeked and second peeked tokens
216        self.peek_second()?;
217
218        // If we don't have a third peeked token, get it
219        if self.peeked_third.is_none() {
220            self.peeked_third = Some(self.next_token()?);
221        }
222
223        // Safe: we just ensured peeked_third is Some
224        self.peeked_third.as_ref().ok_or(ParseError::UnexpectedEof)
225    }
226
227    /// Enter format body parsing mode in the lexer.
228    ///
229    /// No-op when operating in buffered (pre-lexed) mode — the tokens are
230    /// already fully classified.
231    pub fn enter_format_mode(&mut self) {
232        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
233            lexer.enter_format_mode();
234        }
235        // Buffered mode: no-op — tokens are pre-classified.
236    }
237
238    /// Called at statement boundaries to reset lexer state and clear cached lookahead.
239    ///
240    /// In buffered mode only the lookahead cache is cleared; no lexer mode reset
241    /// is performed because the tokens are already fully classified.
242    pub fn on_stmt_boundary(&mut self) {
243        // Clear any cached lookahead tokens
244        self.peeked = None;
245        self.peeked_second = None;
246        self.peeked_third = None;
247
248        // Reset lexer to expect a term (start of new statement)
249        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
250            lexer.set_mode(LexerMode::ExpectTerm);
251        }
252        // Buffered mode: no lexer mode reset needed — tokens are pre-classified.
253    }
254
    /// Re-lex the current peeked token in `ExpectTerm` mode.
    ///
    /// This is needed for context-sensitive constructs like `split /regex/`
    /// where the `/` was lexed as division (`Slash`) but should be a regex
    /// delimiter. Rolls the lexer back to the peeked token's start position,
    /// switches to `ExpectTerm` mode, and clears the peek cache so the next
    /// `peek()` or `next()` re-lexes it as a regex.
    ///
    /// In buffered mode the peek cache is cleared but no re-lexing occurs —
    /// token kinds are fixed from the original lex pass.
    pub fn relex_as_term(&mut self) {
        if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
            if let Some(ref token) = self.peeked {
                use perl_lexer::Checkpointable;
                let pos = token.start;
                // Roll the lexer back to the peeked token's start.
                // NOTE(review): only a position is passed here — the doc above
                // claims the restored state is `ExpectTerm` mode, so presumably
                // `LexerCheckpoint::at_position` defaults to that mode; confirm
                // against perl-lexer, since nothing in this method sets it.
                let cp = perl_lexer::LexerCheckpoint::at_position(pos);
                lexer.restore(&cp);
            }
        }
        // Both modes: clear the peek cache.
        self.peeked = None;
        self.peeked_second = None;
        self.peeked_third = None;
    }

281    /// Pure peek cache invalidation - no mode changes
282    pub fn invalidate_peek(&mut self) {
283        self.peeked = None;
284        self.peeked_third = None;
285        self.peeked_second = None;
286    }
287
288    /// Convenience method for a one-shot fresh peek
289    pub fn peek_fresh_kind(&mut self) -> Option<TokenKind> {
290        self.invalidate_peek();
291        match self.peek() {
292            Ok(token) => Some(token.kind),
293            Err(_) => None,
294        }
295    }
296
297    /// Get the next token from the backing source.
298    fn next_token(&mut self) -> ParseResult<Token> {
299        match &mut self.inner {
300            TokenStreamInner::Lexer(lexer) => Self::next_token_from_lexer(lexer),
301            TokenStreamInner::Buffered(buf) => Self::next_token_from_buf(buf),
302        }
303    }
304
305    /// Drain the next non-trivia token from the live lexer.
306    fn next_token_from_lexer(lexer: &mut PerlLexer<'_>) -> ParseResult<Token> {
307        // Skip whitespace and comments
308        loop {
309            let lexer_token = lexer.next_token().ok_or(ParseError::UnexpectedEof)?;
310
311            match &lexer_token.token_type {
312                LexerTokenType::Whitespace | LexerTokenType::Newline => continue,
313                LexerTokenType::Comment(_) => continue,
314                LexerTokenType::EOF => {
315                    return Ok(Token {
316                        kind: TokenKind::Eof,
317                        text: String::new().into(),
318                        start: lexer_token.start,
319                        end: lexer_token.end,
320                    });
321                }
322                _ => {
323                    return Ok(Self::convert_lexer_token(lexer_token));
324                }
325            }
326        }
327    }
328
329    /// Return the next token from the pre-lexed buffer.
330    fn next_token_from_buf(buf: &mut VecDeque<Token>) -> ParseResult<Token> {
331        match buf.pop_front() {
332            Some(token) => Ok(token),
333            // Synthesise an EOF at position 0 when the buffer is exhausted.
334            // The caller (parser) makes EOF sticky so position doesn't matter
335            // for correctness; using 0 is safe.
336            None => Ok(Token { kind: TokenKind::Eof, text: "".into(), start: 0, end: 0 }),
337        }
338    }
339
    /// Convert a raw lexer token to the parser `Token` type.
    ///
    /// Maps `LexerTokenType` variants onto parser `TokenKind`s: keyword and
    /// operator payloads are matched by spelling, delimiters and literals map
    /// one-to-one, and anything unrecognised falls back to `TokenKind::Unknown`.
    /// The token's text and span are carried through unchanged.
    ///
    /// Extracted from `next_token_from_lexer` to keep the match arm readable.
    fn convert_lexer_token(token: LexerToken) -> Token {
        let kind = match &token.token_type {
            // Keywords
            LexerTokenType::Keyword(kw) => match kw.as_ref() {
                "my" => TokenKind::My,
                "our" => TokenKind::Our,
                "local" => TokenKind::Local,
                "state" => TokenKind::State,
                "sub" => TokenKind::Sub,
                "if" => TokenKind::If,
                "elsif" => TokenKind::Elsif,
                "else" => TokenKind::Else,
                "unless" => TokenKind::Unless,
                "while" => TokenKind::While,
                "until" => TokenKind::Until,
                "for" => TokenKind::For,
                "foreach" => TokenKind::Foreach,
                "return" => TokenKind::Return,
                "package" => TokenKind::Package,
                "use" => TokenKind::Use,
                "no" => TokenKind::No,
                "BEGIN" => TokenKind::Begin,
                "END" => TokenKind::End,
                "CHECK" => TokenKind::Check,
                "INIT" => TokenKind::Init,
                "UNITCHECK" => TokenKind::Unitcheck,
                "eval" => TokenKind::Eval,
                "do" => TokenKind::Do,
                "given" => TokenKind::Given,
                "when" => TokenKind::When,
                "default" => TokenKind::Default,
                "try" => TokenKind::Try,
                "catch" => TokenKind::Catch,
                "field" => TokenKind::Field,
                "finally" => TokenKind::Finally,
                "continue" => TokenKind::Continue,
                "next" => TokenKind::Next,
                "last" => TokenKind::Last,
                "redo" => TokenKind::Redo,
                "goto" => TokenKind::Goto,
                "class" => TokenKind::Class,
                "method" => TokenKind::Method,
                "format" => TokenKind::Format,
                "undef" => TokenKind::Undef,
                "and" => TokenKind::WordAnd,
                "or" => TokenKind::WordOr,
                "not" => TokenKind::WordNot,
                "xor" => TokenKind::WordXor,
                "cmp" => TokenKind::StringCompare,
                "qw" => TokenKind::Identifier, // Keep as identifier but handle specially
                _ => TokenKind::Identifier,
            },

            // Operators
            LexerTokenType::Operator(op) => match op.as_ref() {
                "=" => TokenKind::Assign,
                "+" => TokenKind::Plus,
                "-" => TokenKind::Minus,
                "*" => TokenKind::Star,
                "/" => TokenKind::Slash,
                "%" => TokenKind::Percent,
                "**" => TokenKind::Power,
                "<<" => TokenKind::LeftShift,
                ">>" => TokenKind::RightShift,
                "&" => TokenKind::BitwiseAnd,
                "|" => TokenKind::BitwiseOr,
                "^" => TokenKind::BitwiseXor,
                "~" => TokenKind::BitwiseNot,
                // Compound assignments
                "+=" => TokenKind::PlusAssign,
                "-=" => TokenKind::MinusAssign,
                "*=" => TokenKind::StarAssign,
                "/=" => TokenKind::SlashAssign,
                "%=" => TokenKind::PercentAssign,
                ".=" => TokenKind::DotAssign,
                "&=" => TokenKind::AndAssign,
                "|=" => TokenKind::OrAssign,
                "^=" => TokenKind::XorAssign,
                "**=" => TokenKind::PowerAssign,
                "<<=" => TokenKind::LeftShiftAssign,
                ">>=" => TokenKind::RightShiftAssign,
                "&&=" => TokenKind::LogicalAndAssign,
                "||=" => TokenKind::LogicalOrAssign,
                "//=" => TokenKind::DefinedOrAssign,
                "==" => TokenKind::Equal,
                "!=" => TokenKind::NotEqual,
                "=~" => TokenKind::Match,
                "!~" => TokenKind::NotMatch,
                "~~" => TokenKind::SmartMatch,
                "<" => TokenKind::Less,
                ">" => TokenKind::Greater,
                "<=" => TokenKind::LessEqual,
                ">=" => TokenKind::GreaterEqual,
                "<=>" => TokenKind::Spaceship,
                "&&" => TokenKind::And,
                "||" => TokenKind::Or,
                "!" => TokenKind::Not,
                "//" => TokenKind::DefinedOr,
                "->" => TokenKind::Arrow,
                "=>" => TokenKind::FatArrow,
                "." => TokenKind::Dot,
                ".." => TokenKind::Range,
                "..." => TokenKind::Ellipsis,
                "++" => TokenKind::Increment,
                "--" => TokenKind::Decrement,
                "::" => TokenKind::DoubleColon,
                "?" => TokenKind::Question,
                ":" => TokenKind::Colon,
                "\\" => TokenKind::Backslash,
                // Sigils (when used as operators in certain contexts)
                "$" => TokenKind::ScalarSigil,
                "@" => TokenKind::ArraySigil,
                // % is already handled as Percent above
                // & is already handled as BitwiseAnd above
                // * is already handled as Star above
                _ => TokenKind::Unknown,
            },

            // Arrow tokens
            LexerTokenType::Arrow => TokenKind::Arrow,
            LexerTokenType::FatComma => TokenKind::FatArrow,

            // Delimiters
            LexerTokenType::LeftParen => TokenKind::LeftParen,
            LexerTokenType::RightParen => TokenKind::RightParen,
            LexerTokenType::LeftBrace => TokenKind::LeftBrace,
            LexerTokenType::RightBrace => TokenKind::RightBrace,
            LexerTokenType::LeftBracket => TokenKind::LeftBracket,
            LexerTokenType::RightBracket => TokenKind::RightBracket,
            LexerTokenType::Semicolon => TokenKind::Semicolon,
            LexerTokenType::Comma => TokenKind::Comma,

            // Division operator (important to handle before other tokens)
            LexerTokenType::Division => TokenKind::Slash,

            // Literals
            LexerTokenType::Number(_) => TokenKind::Number,
            LexerTokenType::StringLiteral | LexerTokenType::InterpolatedString(_) => {
                TokenKind::String
            }
            LexerTokenType::RegexMatch | LexerTokenType::QuoteRegex => TokenKind::Regex,
            LexerTokenType::Substitution => TokenKind::Substitution,
            LexerTokenType::Transliteration => TokenKind::Transliteration,
            LexerTokenType::QuoteSingle => TokenKind::QuoteSingle,
            LexerTokenType::QuoteDouble => TokenKind::QuoteDouble,
            LexerTokenType::QuoteWords => TokenKind::QuoteWords,
            LexerTokenType::QuoteCommand => TokenKind::QuoteCommand,
            LexerTokenType::HeredocStart => TokenKind::HeredocStart,
            LexerTokenType::HeredocBody(_) => TokenKind::HeredocBody,
            LexerTokenType::FormatBody(_) => TokenKind::FormatBody,
            LexerTokenType::Version(_) => TokenKind::VString,
            LexerTokenType::DataMarker(_) => TokenKind::DataMarker,
            LexerTokenType::DataBody(_) => TokenKind::DataBody,
            LexerTokenType::UnknownRest => TokenKind::UnknownRest,

            // Identifiers
            LexerTokenType::Identifier(text) => {
                // Check if it's actually a keyword that the lexer didn't recognize
                // NOTE(review): sigil/star spellings arriving as `Identifier`
                // presumably reflect known lexer edge cases — confirm in perl-lexer.
                match text.as_ref() {
                    "no" => TokenKind::No,
                    "*" => TokenKind::Star, // Special case: * by itself is multiplication
                    "$" => TokenKind::ScalarSigil,
                    "@" => TokenKind::ArraySigil,
                    "%" => TokenKind::HashSigil,
                    "&" => TokenKind::SubSigil,
                    _ => TokenKind::Identifier,
                }
            }

            // Handle error tokens that might be valid syntax
            LexerTokenType::Error(msg) => {
                // Check if it's a specific error we want to handle specially
                if msg.as_ref() == "Heredoc nesting too deep" {
                    TokenKind::HeredocDepthLimit
                } else {
                    // Check if it's a brace that the lexer couldn't recognize
                    match token.text.as_ref() {
                        "{" => TokenKind::LeftBrace,
                        "}" => TokenKind::RightBrace,
                        _ => TokenKind::Unknown,
                    }
                }
            }

            _ => TokenKind::Unknown,
        };

        // Text and span are preserved verbatim; only the kind is re-mapped.
        Token { kind, text: token.text, start: token.start, end: token.end }
    }
}