Skip to main content

mago_syntax/parser/
stream.rs

1use std::fmt::Debug;
2
3use bumpalo::Bump;
4use bumpalo::collections::CollectIn;
5use bumpalo::collections::Vec;
6
7use mago_database::file::FileId;
8use mago_database::file::HasFileId;
9use mago_span::Position;
10use mago_span::Span;
11use mago_syntax_core::parser::LookaheadBuf;
12
13use crate::ast::sequence::Sequence;
14use crate::ast::trivia::Trivia;
15use crate::ast::trivia::TriviaKind;
16use crate::error::ParseError;
17use crate::error::SyntaxError;
18use crate::lexer::Lexer;
19use crate::token::Token;
20use crate::token::TokenKind;
21
22#[derive(Debug)]
23pub struct TokenStream<'input, 'arena> {
24    arena: &'arena Bump,
25    lexer: Lexer<'input>,
26    buffer: LookaheadBuf<Token<'input>, 16>,
27    trivia: Vec<'arena, Token<'input>>,
28    position: Position,
29    file_id: FileId,
30}
31
32impl<'input, 'arena> TokenStream<'input, 'arena> {
33    pub fn new(arena: &'arena Bump, lexer: Lexer<'input>) -> TokenStream<'input, 'arena> {
34        let position = lexer.current_position();
35        let file_id_cached = lexer.file_id();
36
37        TokenStream {
38            arena,
39            lexer,
40            buffer: LookaheadBuf::new(),
41            trivia: Vec::new_in(arena),
42            position,
43            file_id: file_id_cached,
44        }
45    }
46
47    /// Returns the current position of the stream within the source file.
48    ///
49    /// This position represents the end location of the most recently
50    /// consumed significant token via `advance()` or `consume()`.
51    #[inline]
52    #[must_use]
53    pub const fn current_position(&self) -> Position {
54        self.position
55    }
56
57    /// Returns whether the stream has consumed all tokens up to EOF.
58    ///
59    /// # Errors
60    ///
61    /// Returns a [`SyntaxError`] if the lexer fails to produce the next token.
62    #[inline]
63    pub fn has_reached_eof(&mut self) -> Result<bool, SyntaxError> {
64        Ok(self.fill_buffer(1)?.is_none())
65    }
66
67    /// Consumes and returns the next significant token.
68    ///
69    /// # Errors
70    ///
71    /// Returns a [`ParseError`] if EOF is reached or a lexer error occurs.
72    #[inline]
73    pub fn consume(&mut self) -> Result<Token<'input>, ParseError> {
74        match self.advance() {
75            Some(Ok(token)) => Ok(token),
76            Some(Err(error)) => Err(error.into()),
77            None => Err(self.unexpected(None, &[])),
78        }
79    }
80
81    /// Consumes the next token only if it matches the expected kind.
82    ///
83    /// Returns the token if it matches, otherwise returns an error.
84    ///
85    /// # Errors
86    ///
87    /// Returns a [`ParseError`] if the next token's kind does not match `kind`, or if EOF is reached.
88    #[inline]
89    pub fn eat(&mut self, kind: TokenKind) -> Result<Token<'input>, ParseError> {
90        // Fast path: head already buffered. Avoids the Result<Option<...>>
91        // round trip from `peek_kind` plus a follow-up `lookahead` on the
92        // happy path.
93        if let Some(token) = self.buffer.get(0) {
94            if token.kind == kind {
95                let _ = self.buffer.pop_front();
96
97                self.position = Position::new(token.start.offset + token.value.len() as u32);
98                return Ok(token);
99            }
100
101            return Err(self.unexpected(Some(token), &[kind]));
102        }
103
104        // Slow path: buffer empty, fill it.
105        let current_kind = self.peek_kind(0)?;
106        match current_kind {
107            Some(k) if k == kind => self.consume(),
108            Some(_) => {
109                // The kind we just peeked guarantees a token is buffered, so `lookahead(0)`
110                // must yield `Some`; if the lexer somehow disagrees we surface an EOF error
111                // rather than panicking.
112                match self.lookahead(0)? {
113                    Some(token) => Err(self.unexpected(Some(token), &[kind])),
114                    None => Err(self.unexpected(None, &[kind])),
115                }
116            }
117            None => Err(self.unexpected(None, &[kind])),
118        }
119    }
120
121    /// Consumes and returns the span of the next significant token.
122    ///
123    /// This is a convenience method equivalent to `consume()?.span_for(file_id())`.
124    ///
125    /// # Errors
126    ///
127    /// Returns a [`ParseError`] if EOF is reached or a lexer error occurs.
128    #[inline]
129    pub fn consume_span(&mut self) -> Result<Span, ParseError> {
130        let file_id = self.file_id();
131        self.consume().map(|t| t.span_for(file_id))
132    }
133
134    /// Consumes the next token only if it matches the expected kind, returning its span.
135    ///
136    /// This is a convenience method equivalent to `eat(kind)?.span_for(file_id())`.
137    ///
138    /// # Errors
139    ///
140    /// Returns a [`ParseError`] if the next token's kind does not match `kind`, or if EOF is reached.
141    #[inline]
142    pub fn eat_span(&mut self, kind: TokenKind) -> Result<Span, ParseError> {
143        let file_id = self.file_id();
144        self.eat(kind).map(|t| t.span_for(file_id))
145    }
146
147    /// Advances the stream to the next token in the input source code and returns it.
148    ///
149    /// If the stream has already read the entire input source code, this method will return `None`.
150    ///
151    /// # Returns
152    ///
153    /// The next token in the input source code, or `None` if the lexer has reached the end of the input.
154    #[inline]
155    pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
156        match self.fill_buffer(1) {
157            Ok(Some(_)) => {
158                if let Some(token) = self.buffer.pop_front() {
159                    // Compute end position from start + value length
160                    self.position = Position::new(token.start.offset + token.value.len() as u32);
161                    Some(Ok(token))
162                } else {
163                    None
164                }
165            }
166            Ok(None) => None,
167            Err(error) => Some(Err(error)),
168        }
169    }
170
171    /// Checks if the next token matches the given kind without consuming it.
172    ///
173    /// Returns `false` if at EOF.
174    ///
175    /// # Errors
176    ///
177    /// Returns a [`ParseError`] if the lexer fails to produce the next token.
178    #[inline]
179    pub fn is_at(&mut self, kind: TokenKind) -> Result<bool, ParseError> {
180        if let Some(token) = self.buffer.get(0) {
181            return Ok(token.kind == kind);
182        }
183
184        Ok(self.peek_kind(0)? == Some(kind))
185    }
186
187    /// Peeks at the nth (0-indexed) significant token ahead without consuming it.
188    ///
189    /// Returns `Ok(None)` if EOF is reached before the nth token.
190    ///
191    /// # Errors
192    ///
193    /// Returns a [`ParseError`] if the lexer fails to produce a token while filling the lookahead buffer.
194    #[inline]
195    pub fn lookahead(&mut self, n: usize) -> Result<Option<Token<'input>>, ParseError> {
196        if n < self.buffer.len() {
197            return Ok(self.buffer.get(n));
198        }
199
200        match self.fill_buffer(n + 1) {
201            Ok(Some(_)) => Ok(self.buffer.get(n)),
202            Ok(None) => Ok(None),
203            Err(error) => Err(error.into()),
204        }
205    }
206
207    /// Peeks at the kind of the nth (0-indexed) significant token ahead.
208    ///
209    /// More efficient than `lookahead(n)?.map(|t| t.kind)` as it avoids
210    /// copying the full token when only the kind is needed.
211    ///
212    /// # Errors
213    ///
214    /// Returns a [`ParseError`] if the lexer fails to produce a token while filling the lookahead buffer.
215    #[inline]
216    pub fn peek_kind(&mut self, n: usize) -> Result<Option<TokenKind>, ParseError> {
217        if n < self.buffer.len() {
218            return Ok(self.buffer.get(n).map(|t| t.kind));
219        }
220
221        match self.fill_buffer(n + 1) {
222            Ok(Some(_)) => Ok(self.buffer.get(n).map(|t| t.kind)),
223            Ok(None) => Ok(None),
224            Err(error) => Err(error.into()),
225        }
226    }
227
228    /// Creates a `ParseError` for an unexpected token or EOF.
229    #[inline]
230    #[must_use]
231    pub fn unexpected(&self, found: Option<Token<'_>>, expected: &[TokenKind]) -> ParseError {
232        let expected_kinds: Box<[TokenKind]> = expected.into();
233        if let Some(token) = found {
234            ParseError::UnexpectedToken(expected_kinds, token.kind, token.span_for(self.file_id()))
235        } else {
236            ParseError::UnexpectedEndOfFile(expected_kinds, self.file_id(), self.current_position())
237        }
238    }
239
240    /// Consumes the comments collected by the lexer and returns them.
241    #[inline]
242    pub fn get_trivia(&mut self) -> Sequence<'arena, Trivia<'arena>> {
243        let mut tokens = Vec::new_in(self.arena);
244        std::mem::swap(&mut self.trivia, &mut tokens);
245
246        let file_id = self.file_id();
247        Sequence::new(
248            tokens
249                .into_iter()
250                .filter_map(|token| {
251                    let span = token.span_for(file_id);
252                    let kind = match token.kind {
253                        TokenKind::Whitespace => TriviaKind::WhiteSpace,
254                        TokenKind::HashComment => TriviaKind::HashComment,
255                        TokenKind::SingleLineComment => TriviaKind::SingleLineComment,
256                        TokenKind::MultiLineComment => TriviaKind::MultiLineComment,
257                        TokenKind::DocBlockComment => TriviaKind::DocBlockComment,
258                        // Tokens collected into `self.trivia` are guaranteed by `fill_buffer_slow`
259                        // to satisfy `kind.is_trivia()`; any non-trivia kind here is a parser bug
260                        // and the safe response is to drop it rather than panic.
261                        _ => return None,
262                    };
263                    Some(Trivia { kind, span, value: token.value })
264                })
265                .collect_in(self.arena),
266        )
267    }
268
269    /// Fills the token buffer until at least `n` tokens are available, unless the lexer returns EOF.
270    ///
271    /// Trivia tokens are collected separately and are not stored in the main token buffer.
272    #[inline]
273    fn fill_buffer(&mut self, n: usize) -> Result<Option<usize>, SyntaxError> {
274        if self.buffer.len() >= n {
275            return Ok(Some(n));
276        }
277
278        self.fill_buffer_slow(n)
279    }
280
281    #[inline(never)]
282    fn fill_buffer_slow(&mut self, n: usize) -> Result<Option<usize>, SyntaxError> {
283        while self.buffer.len() < n {
284            match self.lexer.advance() {
285                Some(result) => {
286                    let token = result?;
287                    if token.kind.is_trivia() {
288                        self.trivia.push(token);
289                        continue;
290                    }
291                    self.buffer.push_back(token);
292                }
293                None => return Ok(None),
294            }
295        }
296
297        Ok(Some(n))
298    }
299}
300
301impl HasFileId for TokenStream<'_, '_> {
302    #[inline]
303    fn file_id(&self) -> FileId {
304        self.file_id
305    }
306}