mago-syntax 1.28.0

A correct, fast, and memory-efficient PHP syntax implementation, including Lexer, Parser, AST, and utilities for Mago.
Documentation
use std::fmt::Debug;

use bumpalo::Bump;
use bumpalo::collections::CollectIn;
use bumpalo::collections::Vec;

use mago_database::file::FileId;
use mago_database::file::HasFileId;
use mago_span::Position;
use mago_span::Span;
use mago_syntax_core::parser::LookaheadBuf;

use crate::ast::sequence::Sequence;
use crate::ast::trivia::Trivia;
use crate::ast::trivia::TriviaKind;
use crate::error::ParseError;
use crate::error::SyntaxError;
use crate::lexer::Lexer;
use crate::token::Token;
use crate::token::TokenKind;

/// A buffered view over a [`Lexer`] that hands out non-trivia tokens and
/// sets trivia tokens aside until they are requested via `get_trivia`.
#[derive(Debug)]
pub struct TokenStream<'input, 'arena> {
    /// Arena used to allocate the trivia vectors and the sequence returned by `get_trivia`.
    arena: &'arena Bump,
    /// Source of raw tokens; pulled from on demand while filling `buffer`.
    lexer: Lexer<'input>,
    /// Non-trivia tokens that have been lexed but not yet consumed.
    // NOTE(review): the `16` const parameter is presumably the lookahead capacity — confirm against `LookaheadBuf`.
    buffer: LookaheadBuf<Token<'input>, 16>,
    /// Trivia tokens encountered while filling `buffer`, held until `get_trivia` takes them.
    trivia: Vec<'arena, Token<'input>>,
    /// Initially the lexer's position at construction; afterwards the end of the last consumed token.
    position: Position,
    /// File identifier read from the lexer once at construction time.
    file_id: FileId,
}

impl<'input, 'arena> TokenStream<'input, 'arena> {
    /// Creates a token stream that reads from `lexer` and allocates trivia in `arena`.
    ///
    /// The stream starts with an empty lookahead buffer and no collected trivia;
    /// its position and file id are taken from the lexer's current state.
    pub fn new(arena: &'arena Bump, lexer: Lexer<'input>) -> TokenStream<'input, 'arena> {
        // Both values must be read before `lexer` is moved into the struct below.
        let file_id = lexer.file_id();
        let position = lexer.current_position();

        let buffer = LookaheadBuf::new();
        let trivia = Vec::new_in(arena);

        TokenStream { arena, lexer, buffer, trivia, position, file_id }
    }

    /// Returns the current position of the stream within the source file.
    ///
    /// The value starts out as the lexer's position at construction time and
    /// is then moved to the end (start offset plus value length) of each
    /// significant token as it is consumed via `advance()`, `consume()`, or
    /// `eat()`. Peeking methods such as `lookahead()`, `peek_kind()`, and
    /// `is_at()` do not change it.
    #[inline]
    #[must_use]
    pub const fn current_position(&self) -> Position {
        self.position
    }

    /// Reports whether no significant token remains in the stream.
    ///
    /// This tries to buffer one token; the stream is at EOF exactly when the
    /// buffer is empty and the lexer has nothing more to produce.
    ///
    /// # Errors
    ///
    /// Returns a [`SyntaxError`] if the lexer fails to produce the next token.
    #[inline]
    pub fn has_reached_eof(&mut self) -> Result<bool, SyntaxError> {
        self.fill_buffer(1).map(|available| available.is_none())
    }

    /// Takes the next significant token off the stream and returns it.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] if the stream is already at EOF (reported as an
    /// unexpected end of file with no expected kinds) or if the lexer fails.
    #[inline]
    pub fn consume(&mut self) -> Result<Token<'input>, ParseError> {
        match self.advance() {
            // Nothing left to consume: report EOF at the current position.
            None => Err(self.unexpected(None, &[])),
            // Lexer errors are converted into the parser's error type.
            Some(outcome) => outcome.map_err(Into::into),
        }
    }

    /// Consumes the next token, but only when its kind equals `kind`.
    ///
    /// On a match the token is removed from the stream and returned; on a
    /// mismatch the stream is left untouched and an error is produced.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] if the next token's kind does not match `kind`, or if EOF is reached.
    #[inline]
    pub fn eat(&mut self, kind: TokenKind) -> Result<Token<'input>, ParseError> {
        // Fast path: the head token is already buffered, so the decision can be
        // made without going through `peek_kind` / `lookahead`.
        if let Some(head) = self.buffer.get(0) {
            if head.kind != kind {
                return Err(self.unexpected(Some(head), &[kind]));
            }

            let _ = self.buffer.pop_front();
            // The new position is the end of the token just taken.
            self.position = Position::new(head.start.offset + head.value.len() as u32);

            return Ok(head);
        }

        // Slow path: nothing buffered yet, so the lexer has to be driven first.
        match self.peek_kind(0)? {
            Some(found) if found == kind => self.consume(),
            Some(_) => {
                // A kind was just peeked, so a token should be buffered. Should the
                // lookup come back empty anyway, `unexpected` receives `None` and
                // reports EOF instead of panicking.
                let head = self.lookahead(0)?;
                Err(self.unexpected(head, &[kind]))
            }
            None => Err(self.unexpected(None, &[kind])),
        }
    }

    /// Consumes the next significant token and returns only its span.
    ///
    /// Shorthand for `consume()?.span_for(file_id())`.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] if EOF is reached or a lexer error occurs.
    #[inline]
    pub fn consume_span(&mut self) -> Result<Span, ParseError> {
        let token = self.consume()?;

        Ok(token.span_for(self.file_id()))
    }

    /// Consumes the next token if it is of the given kind and returns only its span.
    ///
    /// Shorthand for `eat(kind)?.span_for(file_id())`.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] if the next token's kind does not match `kind`, or if EOF is reached.
    #[inline]
    pub fn eat_span(&mut self, kind: TokenKind) -> Result<Span, ParseError> {
        let token = self.eat(kind)?;

        Ok(token.span_for(self.file_id()))
    }

    /// Moves the stream forward by one significant token and returns that token.
    ///
    /// The stream position is updated to the end of the returned token.
    ///
    /// # Returns
    ///
    /// - `Some(Ok(token))` with the next token,
    /// - `Some(Err(error))` if the lexer failed while producing it,
    /// - `None` once the whole input has been read.
    #[inline]
    pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
        match self.fill_buffer(1) {
            Err(error) => Some(Err(error)),
            Ok(None) => None,
            Ok(Some(_)) => {
                // An empty buffer at this point is treated the same as EOF.
                let token = self.buffer.pop_front()?;

                // End position = start offset + length of the token's text.
                self.position = Position::new(token.start.offset + token.value.len() as u32);

                Some(Ok(token))
            }
        }
    }

    /// Tests whether the upcoming token is of the given kind, leaving it in the stream.
    ///
    /// Yields `false` when the stream is at EOF.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] if the lexer fails to produce the next token.
    #[inline]
    pub fn is_at(&mut self, kind: TokenKind) -> Result<bool, ParseError> {
        match self.buffer.get(0) {
            // Head already buffered: answer directly.
            Some(head) => Ok(head.kind == kind),
            // Otherwise let `peek_kind` fill the buffer; `None` (EOF) never equals `Some(kind)`.
            None => self.peek_kind(0).map(|next| next == Some(kind)),
        }
    }

    /// Returns the significant token `n` positions ahead (0-indexed) without consuming anything.
    ///
    /// Yields `Ok(None)` when EOF is reached before that token.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] if the lexer fails to produce a token while filling the lookahead buffer.
    #[inline]
    pub fn lookahead(&mut self, n: usize) -> Result<Option<Token<'input>>, ParseError> {
        // Only drive the lexer when the requested slot is not buffered yet.
        if self.buffer.len() <= n {
            match self.fill_buffer(n + 1) {
                Ok(Some(_)) => {}
                Ok(None) => return Ok(None),
                Err(error) => return Err(error.into()),
            }
        }

        Ok(self.buffer.get(n))
    }

    /// Returns the kind of the significant token `n` positions ahead (0-indexed).
    ///
    /// Only the kind is extracted from the buffered token, so callers that do
    /// not need the token itself should prefer this over `lookahead(n)`.
    /// Yields `Ok(None)` when EOF is reached before that token.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] if the lexer fails to produce a token while filling the lookahead buffer.
    #[inline]
    pub fn peek_kind(&mut self, n: usize) -> Result<Option<TokenKind>, ParseError> {
        // Only drive the lexer when the requested slot is not buffered yet.
        if self.buffer.len() <= n {
            match self.fill_buffer(n + 1) {
                Ok(Some(_)) => {}
                Ok(None) => return Ok(None),
                Err(error) => return Err(error.into()),
            }
        }

        Ok(self.buffer.get(n).map(|token| token.kind))
    }

    /// Builds the [`ParseError`] describing an unexpected token or an unexpected EOF.
    ///
    /// With `found` set, the error carries that token's kind and span; with
    /// `None`, it carries the file id and the stream's current position.
    /// `expected` lists the kinds that would have been accepted.
    #[inline]
    #[must_use]
    pub fn unexpected(&self, found: Option<Token<'_>>, expected: &[TokenKind]) -> ParseError {
        let expected: Box<[TokenKind]> = Box::from(expected);

        match found {
            Some(token) => ParseError::UnexpectedToken(expected, token.kind, token.span_for(self.file_id())),
            None => ParseError::UnexpectedEndOfFile(expected, self.file_id(), self.current_position()),
        }
    }

    /// Takes the trivia (whitespace and comments) collected so far and returns it.
    ///
    /// The internal trivia list is left empty, so each trivia token is handed
    /// out at most once.
    #[inline]
    pub fn get_trivia(&mut self) -> Sequence<'arena, Trivia<'arena>> {
        // Move the pending tokens out, leaving a fresh empty vector behind.
        let pending = std::mem::replace(&mut self.trivia, Vec::new_in(self.arena));
        let file_id = self.file_id();

        let converted = pending
            .into_iter()
            .filter_map(|token| {
                let kind = match token.kind {
                    TokenKind::Whitespace => TriviaKind::WhiteSpace,
                    TokenKind::HashComment => TriviaKind::HashComment,
                    TokenKind::SingleLineComment => TriviaKind::SingleLineComment,
                    TokenKind::MultiLineComment => TriviaKind::MultiLineComment,
                    TokenKind::DocBlockComment => TriviaKind::DocBlockComment,
                    // `fill_buffer_slow` only stores tokens whose kind `is_trivia()`, so any
                    // other kind showing up here points at a parser bug; dropping the token
                    // is preferred over panicking.
                    _ => return None,
                };

                Some(Trivia { kind, span: token.span_for(file_id), value: token.value })
            })
            .collect_in(self.arena);

        Sequence::new(converted)
    }

    /// Ensures at least `n` tokens are buffered, unless the lexer hits EOF first.
    ///
    /// Returns `Ok(Some(n))` when the buffer holds at least `n` tokens and
    /// `Ok(None)` when the lexer ran out before that. Trivia tokens are kept
    /// separately and never enter the main token buffer.
    #[inline]
    fn fill_buffer(&mut self, n: usize) -> Result<Option<usize>, SyntaxError> {
        // Common case stays inline; the lexing loop lives in the non-inlined slow path.
        if self.buffer.len() < n { self.fill_buffer_slow(n) } else { Ok(Some(n)) }
    }

    /// Pulls tokens from the lexer until the buffer holds `n` tokens or the lexer is exhausted.
    ///
    /// Trivia tokens go to `self.trivia`; every other token is appended to the
    /// buffer. Returns `Ok(None)` on EOF and propagates the first lexer error.
    #[inline(never)]
    fn fill_buffer_slow(&mut self, n: usize) -> Result<Option<usize>, SyntaxError> {
        while self.buffer.len() < n {
            let token = match self.lexer.advance() {
                Some(result) => result?,
                None => return Ok(None),
            };

            if token.kind.is_trivia() {
                self.trivia.push(token);
            } else {
                self.buffer.push_back(token);
            }
        }

        Ok(Some(n))
    }
}

/// Exposes the identifier of the file this stream is reading tokens from.
impl HasFileId for TokenStream<'_, '_> {
    /// Returns the file id that was read from the lexer when the stream was constructed.
    #[inline]
    fn file_id(&self) -> FileId {
        self.file_id
    }
}