//! Token stream: buffered, trivia-aware lookahead over the lexer.
//!
//! Source: `mago_syntax/parser/stream.rs`.
1use std::collections::VecDeque;
2use std::fmt::Debug;
3
4use bumpalo::Bump;
5use bumpalo::collections::CollectIn;
6use bumpalo::collections::Vec;
7
8use mago_database::file::HasFileId;
9use mago_span::Position;
10use mago_span::Span;
11
12use crate::ast::sequence::Sequence;
13use crate::ast::trivia::Trivia;
14use crate::ast::trivia::TriviaKind;
15use crate::error::ParseError;
16use crate::error::SyntaxError;
17use crate::lexer::Lexer;
18use crate::token::Token;
19use crate::token::TokenKind;
20
/// A buffered stream of significant tokens produced by a [`Lexer`].
///
/// Trivia tokens (whitespace and comments) are diverted into a side buffer
/// while the lookahead buffer is filled, and are exposed separately via
/// `get_trivia`.
#[derive(Debug)]
pub struct TokenStream<'input, 'arena> {
    /// Arena into which trivia tokens and trivia sequences are allocated.
    arena: &'arena Bump,
    /// Underlying lexer producing raw (significant + trivia) tokens.
    lexer: Lexer<'input>,
    /// Lookahead buffer holding only significant (non-trivia) tokens.
    buffer: VecDeque<Token<'input>>,
    /// Trivia collected while filling `buffer`, drained by `get_trivia`.
    trivia: Vec<'arena, Token<'input>>,
    /// End position of the most recently consumed significant token.
    position: Position,
}
29
30impl<'input, 'arena> TokenStream<'input, 'arena> {
31    /// Initial capacity for the token lookahead buffer.
32    const BUFFER_INITIAL_CAPACITY: usize = 8;
33
34    pub fn new(arena: &'arena Bump, lexer: Lexer<'input>) -> TokenStream<'input, 'arena> {
35        let position = lexer.current_position();
36
37        TokenStream {
38            arena,
39            lexer,
40            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
41            trivia: Vec::new_in(arena),
42            position,
43        }
44    }
45
46    /// Returns the current position of the stream within the source file.
47    ///
48    /// This position represents the end location of the most recently
49    /// consumed significant token via `advance()` or `consume()`.
50    #[inline]
51    pub const fn current_position(&self) -> Position {
52        self.position
53    }
54
55    #[inline]
56    pub fn has_reached_eof(&mut self) -> Result<bool, SyntaxError> {
57        Ok(self.fill_buffer(1)?.is_none())
58    }
59
60    /// Consumes and returns the next significant token.
61    ///
62    /// Returns an error if EOF is reached or a lexer error occurs.
63    #[inline]
64    pub fn consume(&mut self) -> Result<Token<'input>, ParseError> {
65        match self.advance() {
66            Some(Ok(token)) => Ok(token),
67            Some(Err(error)) => Err(error.into()),
68            None => Err(self.unexpected(None, &[])),
69        }
70    }
71
72    /// Consumes the next token only if it matches the expected kind.
73    ///
74    /// Returns the token if it matches, otherwise returns an error.
75    #[inline]
76    pub fn eat(&mut self, kind: TokenKind) -> Result<Token<'input>, ParseError> {
77        // Check kind first without copying full token
78        let current_kind = self.peek_kind(0)?;
79        match current_kind {
80            Some(k) if k == kind => self.consume(),
81            Some(_) => {
82                let token = self.lookahead(0)?.unwrap();
83
84                Err(self.unexpected(Some(token), &[kind]))
85            }
86            None => Err(self.unexpected(None, &[kind])),
87        }
88    }
89
90    /// Consumes and returns the span of the next significant token.
91    ///
92    /// This is a convenience method equivalent to `consume()?.span_for(file_id())`.
93    #[inline]
94    pub fn consume_span(&mut self) -> Result<Span, ParseError> {
95        let file_id = self.file_id();
96        self.consume().map(|t| t.span_for(file_id))
97    }
98
99    /// Consumes the next token only if it matches the expected kind, returning its span.
100    ///
101    /// This is a convenience method equivalent to `eat(kind)?.span_for(file_id())`.
102    #[inline]
103    pub fn eat_span(&mut self, kind: TokenKind) -> Result<Span, ParseError> {
104        let file_id = self.file_id();
105        self.eat(kind).map(|t| t.span_for(file_id))
106    }
107
108    /// Advances the stream to the next token in the input source code and returns it.
109    ///
110    /// If the stream has already read the entire input source code, this method will return `None`.
111    ///
112    /// # Returns
113    ///
114    /// The next token in the input source code, or `None` if the lexer has reached the end of the input.
115    #[inline]
116    pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
117        match self.fill_buffer(1) {
118            Ok(Some(_)) => {
119                if let Some(token) = self.buffer.pop_front() {
120                    // Compute end position from start + value length
121                    self.position = Position::new(token.start.offset + token.value.len() as u32);
122                    Some(Ok(token))
123                } else {
124                    None
125                }
126            }
127            Ok(None) => None,
128            Err(error) => Some(Err(error)),
129        }
130    }
131
132    /// Checks if the next token matches the given kind without consuming it.
133    ///
134    /// Returns `false` if at EOF.
135    #[inline]
136    pub fn is_at(&mut self, kind: TokenKind) -> Result<bool, ParseError> {
137        Ok(self.peek_kind(0)? == Some(kind))
138    }
139
140    /// Peeks at the nth (0-indexed) significant token ahead without consuming it.
141    ///
142    /// Returns `Ok(None)` if EOF is reached before the nth token.
143    #[inline]
144    pub fn lookahead(&mut self, n: usize) -> Result<Option<Token<'input>>, ParseError> {
145        match self.fill_buffer(n + 1) {
146            Ok(Some(_)) => Ok(self.buffer.get(n).copied()),
147            Ok(None) => Ok(None),
148            Err(error) => Err(error.into()),
149        }
150    }
151
152    /// Peeks at the kind of the nth (0-indexed) significant token ahead.
153    ///
154    /// More efficient than `lookahead(n)?.map(|t| t.kind)` as it avoids
155    /// copying the full token when only the kind is needed.
156    #[inline]
157    pub fn peek_kind(&mut self, n: usize) -> Result<Option<TokenKind>, ParseError> {
158        match self.fill_buffer(n + 1) {
159            Ok(Some(_)) => Ok(self.buffer.get(n).map(|t| t.kind)),
160            Ok(None) => Ok(None),
161            Err(error) => Err(error.into()),
162        }
163    }
164
165    /// Creates a `ParseError` for an unexpected token or EOF.
166    #[inline]
167    pub fn unexpected(&self, found: Option<Token<'_>>, expected: &[TokenKind]) -> ParseError {
168        let expected_kinds: Box<[TokenKind]> = expected.into();
169        if let Some(token) = found {
170            ParseError::UnexpectedToken(expected_kinds, token.kind, token.span_for(self.file_id()))
171        } else {
172            ParseError::UnexpectedEndOfFile(expected_kinds, self.file_id(), self.current_position())
173        }
174    }
175
176    /// Consumes the comments collected by the lexer and returns them.
177    #[inline]
178    pub fn get_trivia(&mut self) -> Sequence<'arena, Trivia<'arena>> {
179        let mut tokens = Vec::new_in(self.arena);
180        std::mem::swap(&mut self.trivia, &mut tokens);
181
182        let file_id = self.file_id();
183        Sequence::new(
184            tokens
185                .into_iter()
186                .map(|token| {
187                    let span = token.span_for(file_id);
188                    match token.kind {
189                        TokenKind::Whitespace => Trivia { kind: TriviaKind::WhiteSpace, span, value: token.value },
190                        TokenKind::HashComment => Trivia { kind: TriviaKind::HashComment, span, value: token.value },
191                        TokenKind::SingleLineComment => {
192                            Trivia { kind: TriviaKind::SingleLineComment, span, value: token.value }
193                        }
194                        TokenKind::MultiLineComment => {
195                            Trivia { kind: TriviaKind::MultiLineComment, span, value: token.value }
196                        }
197                        TokenKind::DocBlockComment => {
198                            Trivia { kind: TriviaKind::DocBlockComment, span, value: token.value }
199                        }
200                        _ => unreachable!(),
201                    }
202                })
203                .collect_in(self.arena),
204        )
205    }
206
207    /// Fills the token buffer until at least `n` tokens are available, unless the lexer returns EOF.
208    ///
209    /// Trivia tokens are collected separately and are not stored in the main token buffer.
210    #[inline]
211    fn fill_buffer(&mut self, n: usize) -> Result<Option<usize>, SyntaxError> {
212        if self.buffer.len() >= n {
213            return Ok(Some(n));
214        }
215
216        self.fill_buffer_slow(n)
217    }
218
219    #[inline(never)]
220    fn fill_buffer_slow(&mut self, n: usize) -> Result<Option<usize>, SyntaxError> {
221        while self.buffer.len() < n {
222            match self.lexer.advance() {
223                Some(result) => match result {
224                    Ok(token) => {
225                        if token.kind.is_trivia() {
226                            self.trivia.push(token);
227                            continue;
228                        }
229                        self.buffer.push_back(token);
230                    }
231                    Err(error) => return Err(error),
232                },
233                None => return Ok(None),
234            }
235        }
236
237        Ok(Some(n))
238    }
239}
240
impl HasFileId for TokenStream<'_, '_> {
    /// Delegates to the underlying lexer's file identifier.
    fn file_id(&self) -> mago_database::file::FileId {
        self.lexer.file_id()
    }
}