// perl_tokenizer/token_stream.rs
1//! Token stream adapter between `perl-lexer` output and the parser.
2//!
3//! Provides buffered lookahead, skips trivia tokens, and resets lexer mode at
4//! statement boundaries. This stream is optimized for parser consumption rather
5//! than full-fidelity token preservation.
6//!
7//! # Basic usage
8//!
9//! ```
10//! use perl_tokenizer::{TokenKind, TokenStream};
11//!
12//! let mut stream = TokenStream::new("my $x = 42;");
13//! assert!(matches!(stream.peek(), Ok(token) if token.kind == TokenKind::My));
14//!
15//! while let Ok(token) = stream.next() {
16//! if token.kind == TokenKind::Eof {
17//! break;
18//! }
19//! }
20//! ```
21//!
22//! # Pre-lexed token stream
23//!
24//! For incremental parsing, use [`TokenStream::from_vec`] to create a stream
25//! from pre-lexed tokens without re-lexing from source:
26//!
27//! ```
28//! use perl_tokenizer::{Token, TokenKind, TokenStream};
29//!
30//! let tokens = vec![
31//! Token::new(TokenKind::My, "my", 0, 2),
32//! Token::new(TokenKind::ScalarSigil, "$", 3, 4),
33//! Token::new(TokenKind::Identifier, "x", 4, 5),
34//! Token::new(TokenKind::Assign, "=", 6, 7),
35//! Token::new(TokenKind::Number, "1", 8, 9),
36//! Token::new(TokenKind::Semicolon, ";", 9, 10),
37//! Token::new(TokenKind::Eof, "", 10, 10),
38//! ];
39//! let mut stream = TokenStream::from_vec(tokens);
40//! assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
41//! ```
42
43use perl_error::{ParseError, ParseResult};
44use perl_lexer::{LexerMode, PerlLexer, Token as LexerToken, TokenType as LexerTokenType};
45pub use perl_token::{Token, TokenKind};
46use std::collections::VecDeque;
47
/// Backing source for the token stream — either a live lexer or pre-lexed tokens.
enum TokenStreamInner<'a> {
    /// Live lexer producing tokens on demand from source text.
    Lexer(PerlLexer<'a>),
    /// Pre-lexed token buffer; used by [`TokenStream::from_vec`].
    /// Tokens are popped from the front; an `Eof` token is synthesised once
    /// the buffer is exhausted (see `next_token_from_buf`).
    Buffered(VecDeque<Token>),
}
55
/// Token stream that wraps perl-lexer or a pre-lexed token buffer.
///
/// Provides three-token lookahead, transparent trivia skipping (in lexer mode),
/// and statement-boundary state management used by the recursive-descent parser.
pub struct TokenStream<'a> {
    /// Token source: live lexer or pre-lexed buffer.
    inner: TokenStreamInner<'a>,
    /// First lookahead slot; also holds the "sticky" `Eof` token (see `next`).
    peeked: Option<Token>,
    /// Second lookahead slot; only filled by `peek_second`/`peek_third`.
    peeked_second: Option<Token>,
    /// Third lookahead slot; only filled by `peek_third`.
    peeked_third: Option<Token>,
}
66
67impl<'a> TokenStream<'a> {
68 /// Create a new token stream from source code.
69 pub fn new(input: &'a str) -> Self {
70 TokenStream {
71 inner: TokenStreamInner::Lexer(PerlLexer::new(input)),
72 peeked: None,
73 peeked_second: None,
74 peeked_third: None,
75 }
76 }
77
78 /// Create a token stream from a pre-lexed token list.
79 ///
80 /// This constructor skips lexing entirely and feeds tokens directly from the
81 /// provided `Vec`. It is intended for the incremental parsing pipeline where
82 /// tokens from a prior parse run can be reused for unchanged regions.
83 ///
84 /// # Behaviour differences from [`TokenStream::new`]
85 ///
86 /// - [`on_stmt_boundary`](Self::on_stmt_boundary): clears lookahead cache only;
87 /// no lexer mode reset (tokens are already classified).
88 /// - [`relex_as_term`](Self::relex_as_term): clears lookahead cache only;
89 /// no re-lexing (token kinds are fixed from the original lex pass).
90 /// - [`enter_format_mode`](Self::enter_format_mode): no-op.
91 ///
92 /// # Arguments
93 ///
94 /// * `tokens` — Pre-lexed tokens. An `Eof` token does **not** need to be
95 /// included; the stream synthesises one when the buffer is exhausted.
96 ///
97 /// # Examples
98 ///
99 /// ```rust
100 /// use perl_tokenizer::{Token, TokenKind, TokenStream};
101 ///
102 /// let tokens = vec![
103 /// Token::new(TokenKind::My, "my", 0, 2),
104 /// Token::new(TokenKind::Eof, "", 2, 2),
105 /// ];
106 /// let mut stream = TokenStream::from_vec(tokens);
107 /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
108 /// ```
109 pub fn from_vec(tokens: Vec<Token>) -> Self {
110 TokenStream {
111 inner: TokenStreamInner::Buffered(VecDeque::from(tokens)),
112 peeked: None,
113 peeked_second: None,
114 peeked_third: None,
115 }
116 }
117
118 /// Convert a slice of raw [`LexerToken`]s to parser [`Token`]s, filtering out trivia.
119 ///
120 /// This is a convenience method for the incremental parsing pipeline where the
121 /// token cache stores raw lexer tokens (including whitespace and comments) and
122 /// needs to convert them to parser tokens before feeding to [`Self::from_vec`].
123 ///
124 /// Trivia token types (whitespace, newlines, comments, EOF) are discarded.
125 /// All other token types are converted using the same mapping as the live
126 /// [`TokenStream`] would apply.
127 ///
128 /// # Examples
129 ///
130 /// ```rust
131 /// use perl_tokenizer::{TokenKind, TokenStream};
132 /// use perl_lexer::{PerlLexer, TokenType};
133 ///
134 /// // Collect raw lexer tokens
135 /// let mut lexer = PerlLexer::new("my $x = 1;");
136 /// let mut raw = Vec::new();
137 /// while let Some(t) = lexer.next_token() {
138 /// if matches!(t.token_type, TokenType::EOF) { break; }
139 /// raw.push(t);
140 /// }
141 ///
142 /// // Convert to parser tokens and build a stream
143 /// let parser_tokens = TokenStream::lexer_tokens_to_parser_tokens(raw);
144 /// let mut stream = TokenStream::from_vec(parser_tokens);
145 /// assert!(matches!(stream.peek(), Ok(t) if t.kind == TokenKind::My));
146 /// ```
147 pub fn lexer_tokens_to_parser_tokens(tokens: Vec<LexerToken>) -> Vec<Token> {
148 tokens
149 .into_iter()
150 .filter(|t| {
151 !matches!(
152 t.token_type,
153 LexerTokenType::Whitespace | LexerTokenType::Newline | LexerTokenType::EOF
154 ) && !matches!(t.token_type, LexerTokenType::Comment(_))
155 })
156 .map(Self::convert_lexer_token)
157 .collect()
158 }
159
160 /// Peek at the next token without consuming it
161 pub fn peek(&mut self) -> ParseResult<&Token> {
162 if self.peeked.is_none() {
163 self.peeked = Some(self.next_token()?);
164 }
165 // Safe: we just ensured peeked is Some
166 self.peeked.as_ref().ok_or(ParseError::UnexpectedEof)
167 }
168
169 /// Consume and return the next token
170 #[allow(clippy::should_implement_trait)]
171 pub fn next(&mut self) -> ParseResult<Token> {
172 // If we have a peeked token, return it and shift the peek chain down
173
174 if let Some(token) = self.peeked.take() {
175 // Make EOF sticky - if we're returning EOF, put it back in the peek buffer
176 // so future peeks still see EOF instead of getting an error
177 if token.kind == TokenKind::Eof {
178 self.peeked = Some(token.clone());
179 } else {
180 self.peeked = self.peeked_second.take();
181 self.peeked_second = self.peeked_third.take();
182 }
183 Ok(token)
184 } else {
185 let token = self.next_token()?;
186 // Make EOF sticky for fresh tokens too
187 if token.kind == TokenKind::Eof {
188 self.peeked = Some(token.clone());
189 }
190 Ok(token)
191 }
192 }
193
194 /// Check if we're at the end of input
195 pub fn is_eof(&mut self) -> bool {
196 matches!(self.peek(), Ok(token) if token.kind == TokenKind::Eof)
197 }
198
199 /// Peek at the second token (two tokens ahead)
200 pub fn peek_second(&mut self) -> ParseResult<&Token> {
201 // First ensure we have a peeked token
202 self.peek()?;
203
204 // If we don't have a second peeked token, get it
205 if self.peeked_second.is_none() {
206 self.peeked_second = Some(self.next_token()?);
207 }
208
209 // Safe: we just ensured peeked_second is Some
210 self.peeked_second.as_ref().ok_or(ParseError::UnexpectedEof)
211 }
212
213 /// Peek at the third token (three tokens ahead)
214 pub fn peek_third(&mut self) -> ParseResult<&Token> {
215 // First ensure we have peeked and second peeked tokens
216 self.peek_second()?;
217
218 // If we don't have a third peeked token, get it
219 if self.peeked_third.is_none() {
220 self.peeked_third = Some(self.next_token()?);
221 }
222
223 // Safe: we just ensured peeked_third is Some
224 self.peeked_third.as_ref().ok_or(ParseError::UnexpectedEof)
225 }
226
227 /// Enter format body parsing mode in the lexer.
228 ///
229 /// No-op when operating in buffered (pre-lexed) mode — the tokens are
230 /// already fully classified.
231 pub fn enter_format_mode(&mut self) {
232 if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
233 lexer.enter_format_mode();
234 }
235 // Buffered mode: no-op — tokens are pre-classified.
236 }
237
238 /// Called at statement boundaries to reset lexer state and clear cached lookahead.
239 ///
240 /// In buffered mode only the lookahead cache is cleared; no lexer mode reset
241 /// is performed because the tokens are already fully classified.
242 pub fn on_stmt_boundary(&mut self) {
243 // Clear any cached lookahead tokens
244 self.peeked = None;
245 self.peeked_second = None;
246 self.peeked_third = None;
247
248 // Reset lexer to expect a term (start of new statement)
249 if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
250 lexer.set_mode(LexerMode::ExpectTerm);
251 }
252 // Buffered mode: no lexer mode reset needed — tokens are pre-classified.
253 }
254
255 /// Re-lex the current peeked token in `ExpectTerm` mode.
256 ///
257 /// This is needed for context-sensitive constructs like `split /regex/`
258 /// where the `/` was lexed as division (`Slash`) but should be a regex
259 /// delimiter. Rolls the lexer back to the peeked token's start position,
260 /// switches to `ExpectTerm` mode, and clears the peek cache so the next
261 /// `peek()` or `next()` re-lexes it as a regex.
262 ///
263 /// In buffered mode the peek cache is cleared but no re-lexing occurs —
264 /// token kinds are fixed from the original lex pass.
265 pub fn relex_as_term(&mut self) {
266 if let TokenStreamInner::Lexer(ref mut lexer) = self.inner {
267 if let Some(ref token) = self.peeked {
268 use perl_lexer::Checkpointable;
269 let pos = token.start;
270 // Build a checkpoint at the peeked token's position with ExpectTerm mode
271 let cp = perl_lexer::LexerCheckpoint::at_position(pos);
272 lexer.restore(&cp);
273 }
274 }
275 // Both modes: clear the peek cache.
276 self.peeked = None;
277 self.peeked_second = None;
278 self.peeked_third = None;
279 }
280
281 /// Pure peek cache invalidation - no mode changes
282 pub fn invalidate_peek(&mut self) {
283 self.peeked = None;
284 self.peeked_third = None;
285 self.peeked_second = None;
286 }
287
288 /// Convenience method for a one-shot fresh peek
289 pub fn peek_fresh_kind(&mut self) -> Option<TokenKind> {
290 self.invalidate_peek();
291 match self.peek() {
292 Ok(token) => Some(token.kind),
293 Err(_) => None,
294 }
295 }
296
    /// Get the next token from the backing source.
    ///
    /// Dispatches to the live lexer or the pre-lexed buffer depending on how
    /// the stream was constructed. Lookahead caching happens in the callers
    /// (`peek`/`next`), not here.
    fn next_token(&mut self) -> ParseResult<Token> {
        match &mut self.inner {
            TokenStreamInner::Lexer(lexer) => Self::next_token_from_lexer(lexer),
            TokenStreamInner::Buffered(buf) => Self::next_token_from_buf(buf),
        }
    }
304
305 /// Drain the next non-trivia token from the live lexer.
306 fn next_token_from_lexer(lexer: &mut PerlLexer<'_>) -> ParseResult<Token> {
307 // Skip whitespace and comments
308 loop {
309 let lexer_token = lexer.next_token().ok_or(ParseError::UnexpectedEof)?;
310
311 match &lexer_token.token_type {
312 LexerTokenType::Whitespace | LexerTokenType::Newline => continue,
313 LexerTokenType::Comment(_) => continue,
314 LexerTokenType::EOF => {
315 return Ok(Token {
316 kind: TokenKind::Eof,
317 text: String::new().into(),
318 start: lexer_token.start,
319 end: lexer_token.end,
320 });
321 }
322 _ => {
323 return Ok(Self::convert_lexer_token(lexer_token));
324 }
325 }
326 }
327 }
328
329 /// Return the next token from the pre-lexed buffer.
330 fn next_token_from_buf(buf: &mut VecDeque<Token>) -> ParseResult<Token> {
331 match buf.pop_front() {
332 Some(token) => Ok(token),
333 // Synthesise an EOF at position 0 when the buffer is exhausted.
334 // The caller (parser) makes EOF sticky so position doesn't matter
335 // for correctness; using 0 is safe.
336 None => Ok(Token { kind: TokenKind::Eof, text: "".into(), start: 0, end: 0 }),
337 }
338 }
339
    /// Convert a raw lexer token to the parser `Token` type.
    ///
    /// Extracted from `next_token_from_lexer` to keep the match arm readable.
    ///
    /// Mapping overview:
    /// - `Keyword(_)` strings map to dedicated keyword kinds; anything not in
    ///   the list degrades to `Identifier`.
    /// - `Operator(_)` strings map to operator kinds; unrecognised operator
    ///   text becomes `Unknown`.
    /// - Delimiters, literals, and quote-like constructs map one-to-one.
    /// - `Error(_)` tokens are partially recovered (heredoc depth limit,
    ///   stray braces); everything else falls through to `Unknown`.
    ///
    /// The token's text and span are carried over unchanged.
    fn convert_lexer_token(token: LexerToken) -> Token {
        let kind = match &token.token_type {
            // Keywords
            LexerTokenType::Keyword(kw) => match kw.as_ref() {
                "my" => TokenKind::My,
                "our" => TokenKind::Our,
                "local" => TokenKind::Local,
                "state" => TokenKind::State,
                "sub" => TokenKind::Sub,
                "if" => TokenKind::If,
                "elsif" => TokenKind::Elsif,
                "else" => TokenKind::Else,
                "unless" => TokenKind::Unless,
                "while" => TokenKind::While,
                "until" => TokenKind::Until,
                "for" => TokenKind::For,
                "foreach" => TokenKind::Foreach,
                "return" => TokenKind::Return,
                "package" => TokenKind::Package,
                "use" => TokenKind::Use,
                "no" => TokenKind::No,
                "BEGIN" => TokenKind::Begin,
                "END" => TokenKind::End,
                "CHECK" => TokenKind::Check,
                "INIT" => TokenKind::Init,
                "UNITCHECK" => TokenKind::Unitcheck,
                "eval" => TokenKind::Eval,
                "do" => TokenKind::Do,
                "given" => TokenKind::Given,
                "when" => TokenKind::When,
                "default" => TokenKind::Default,
                "try" => TokenKind::Try,
                "catch" => TokenKind::Catch,
                "field" => TokenKind::Field,
                "finally" => TokenKind::Finally,
                "continue" => TokenKind::Continue,
                "next" => TokenKind::Next,
                "last" => TokenKind::Last,
                "redo" => TokenKind::Redo,
                "goto" => TokenKind::Goto,
                "class" => TokenKind::Class,
                "method" => TokenKind::Method,
                "format" => TokenKind::Format,
                "undef" => TokenKind::Undef,
                // Word-form logical operators keep distinct kinds so the
                // parser can give them low precedence.
                "and" => TokenKind::WordAnd,
                "or" => TokenKind::WordOr,
                "not" => TokenKind::WordNot,
                "xor" => TokenKind::WordXor,
                "cmp" => TokenKind::StringCompare,
                "qw" => TokenKind::Identifier, // Keep as identifier but handle specially
                _ => TokenKind::Identifier,
            },

            // Operators
            LexerTokenType::Operator(op) => match op.as_ref() {
                "=" => TokenKind::Assign,
                "+" => TokenKind::Plus,
                "-" => TokenKind::Minus,
                "*" => TokenKind::Star,
                "/" => TokenKind::Slash,
                "%" => TokenKind::Percent,
                "**" => TokenKind::Power,
                "<<" => TokenKind::LeftShift,
                ">>" => TokenKind::RightShift,
                "&" => TokenKind::BitwiseAnd,
                "|" => TokenKind::BitwiseOr,
                "^" => TokenKind::BitwiseXor,
                "~" => TokenKind::BitwiseNot,
                // Compound assignments
                "+=" => TokenKind::PlusAssign,
                "-=" => TokenKind::MinusAssign,
                "*=" => TokenKind::StarAssign,
                "/=" => TokenKind::SlashAssign,
                "%=" => TokenKind::PercentAssign,
                ".=" => TokenKind::DotAssign,
                "&=" => TokenKind::AndAssign,
                "|=" => TokenKind::OrAssign,
                "^=" => TokenKind::XorAssign,
                "**=" => TokenKind::PowerAssign,
                "<<=" => TokenKind::LeftShiftAssign,
                ">>=" => TokenKind::RightShiftAssign,
                "&&=" => TokenKind::LogicalAndAssign,
                "||=" => TokenKind::LogicalOrAssign,
                "//=" => TokenKind::DefinedOrAssign,
                "==" => TokenKind::Equal,
                "!=" => TokenKind::NotEqual,
                "=~" => TokenKind::Match,
                "!~" => TokenKind::NotMatch,
                "~~" => TokenKind::SmartMatch,
                "<" => TokenKind::Less,
                ">" => TokenKind::Greater,
                "<=" => TokenKind::LessEqual,
                ">=" => TokenKind::GreaterEqual,
                "<=>" => TokenKind::Spaceship,
                "&&" => TokenKind::And,
                "||" => TokenKind::Or,
                "!" => TokenKind::Not,
                "//" => TokenKind::DefinedOr,
                "->" => TokenKind::Arrow,
                "=>" => TokenKind::FatArrow,
                "." => TokenKind::Dot,
                ".." => TokenKind::Range,
                "..." => TokenKind::Ellipsis,
                "++" => TokenKind::Increment,
                "--" => TokenKind::Decrement,
                "::" => TokenKind::DoubleColon,
                "?" => TokenKind::Question,
                ":" => TokenKind::Colon,
                "\\" => TokenKind::Backslash,
                // Sigils (when used as operators in certain contexts)
                "$" => TokenKind::ScalarSigil,
                "@" => TokenKind::ArraySigil,
                // % is already handled as Percent above
                // & is already handled as BitwiseAnd above
                // * is already handled as Star above
                _ => TokenKind::Unknown,
            },

            // Arrow tokens
            LexerTokenType::Arrow => TokenKind::Arrow,
            LexerTokenType::FatComma => TokenKind::FatArrow,

            // Delimiters
            LexerTokenType::LeftParen => TokenKind::LeftParen,
            LexerTokenType::RightParen => TokenKind::RightParen,
            LexerTokenType::LeftBrace => TokenKind::LeftBrace,
            LexerTokenType::RightBrace => TokenKind::RightBrace,
            LexerTokenType::LeftBracket => TokenKind::LeftBracket,
            LexerTokenType::RightBracket => TokenKind::RightBracket,
            LexerTokenType::Semicolon => TokenKind::Semicolon,
            LexerTokenType::Comma => TokenKind::Comma,

            // Division operator (important to handle before other tokens)
            LexerTokenType::Division => TokenKind::Slash,

            // Literals
            LexerTokenType::Number(_) => TokenKind::Number,
            LexerTokenType::StringLiteral | LexerTokenType::InterpolatedString(_) => {
                TokenKind::String
            }
            LexerTokenType::RegexMatch | LexerTokenType::QuoteRegex => TokenKind::Regex,
            LexerTokenType::Substitution => TokenKind::Substitution,
            LexerTokenType::Transliteration => TokenKind::Transliteration,
            LexerTokenType::QuoteSingle => TokenKind::QuoteSingle,
            LexerTokenType::QuoteDouble => TokenKind::QuoteDouble,
            LexerTokenType::QuoteWords => TokenKind::QuoteWords,
            LexerTokenType::QuoteCommand => TokenKind::QuoteCommand,
            LexerTokenType::HeredocStart => TokenKind::HeredocStart,
            LexerTokenType::HeredocBody(_) => TokenKind::HeredocBody,
            LexerTokenType::FormatBody(_) => TokenKind::FormatBody,
            LexerTokenType::Version(_) => TokenKind::VString,
            LexerTokenType::DataMarker(_) => TokenKind::DataMarker,
            LexerTokenType::DataBody(_) => TokenKind::DataBody,
            LexerTokenType::UnknownRest => TokenKind::UnknownRest,

            // Identifiers
            LexerTokenType::Identifier(text) => {
                // Check if it's actually a keyword that the lexer didn't recognize
                // NOTE(review): "%" maps to HashSigil here but Operator("%")
                // maps to Percent above — presumably the lexer's classification
                // encodes the context; confirm this asymmetry is intended.
                match text.as_ref() {
                    "no" => TokenKind::No,
                    "*" => TokenKind::Star, // Special case: * by itself is multiplication
                    "$" => TokenKind::ScalarSigil,
                    "@" => TokenKind::ArraySigil,
                    "%" => TokenKind::HashSigil,
                    "&" => TokenKind::SubSigil,
                    _ => TokenKind::Identifier,
                }
            }

            // Handle error tokens that might be valid syntax
            LexerTokenType::Error(msg) => {
                // Check if it's a specific error we want to handle specially
                if msg.as_ref() == "Heredoc nesting too deep" {
                    TokenKind::HeredocDepthLimit
                } else {
                    // Check if it's a brace that the lexer couldn't recognize
                    match token.text.as_ref() {
                        "{" => TokenKind::LeftBrace,
                        "}" => TokenKind::RightBrace,
                        _ => TokenKind::Unknown,
                    }
                }
            }

            _ => TokenKind::Unknown,
        };

        // Carry over text and span verbatim; only the kind is re-derived.
        Token { kind, text: token.text, start: token.start, end: token.end }
    }
532}