//! oak_core/parser/state.rs — parser state: token supply, tree construction,
//! error collection, and incremental (reuse-based) parsing support.
1use crate::{
2    Language,
3    errors::OakError,
4    language::TokenType,
5    lexer::{LexOutput, Token, Tokens},
6    memory::arena::SyntaxArena,
7    source::Source,
8    tree::{Cursor, GreenLeaf, GreenNode, GreenTree, TokenProvenance},
9};
10use core::range::Range;
11use triomphe::Arc;
12
/// Recursively deep-clones a node (and all of its descendants) into `arena`.
///
/// This is used to "promote" nodes from a previous generation's arena to the current one,
/// ensuring that the new tree is self-contained and has good memory locality.
/// The returned node borrows only from `arena`.
pub fn deep_clone_node<'a, L: Language>(node: &GreenNode<'_, L>, arena: &'a SyntaxArena) -> &'a GreenNode<'a, L> {
    // Lazily clone each child: nested nodes recurse, leaves are copied by value.
    let children_iter = node.children.iter().map(|child| match child {
        GreenTree::Node(n) => GreenTree::Node(deep_clone_node(n, arena)),
        GreenTree::Leaf(l) => GreenTree::Leaf(*l),
    });

    // Materialize the children into an arena-owned slice, then allocate the new node.
    let slice = arena.alloc_slice_fill_iter(node.children.len(), children_iter);
    let new_node = GreenNode::new(node.kind, slice);
    arena.alloc(new_node)
}
27
/// Provides tokens to the parser.
///
/// Wraps a shared token buffer together with a cursor (`index`) pointing at the
/// "current" token; all token navigation in the parser goes through this type.
pub struct TokenSource<L: Language> {
    // Shared, immutable token buffer produced by the lexer.
    tokens: Tokens<L>,
    // Index of the current token within `tokens`.
    index: usize,
}
33
34impl<L: Language> Clone for TokenSource<L> {
35    fn clone(&self) -> Self {
36        Self { tokens: self.tokens.clone(), index: self.index }
37    }
38}
39
40impl<L: Language> TokenSource<L> {
41    /// Creates a new token source.
42    pub fn new(tokens: Tokens<L>) -> Self {
43        Self { tokens, index: 0 }
44    }
45
46    /// Gets the current token.
47    #[inline]
48    pub fn current(&self) -> Option<&Token<L::TokenType>> {
49        self.tokens.get(self.index)
50    }
51
52    /// Peeks at a token at the specified offset from the current position.
53    #[inline]
54    pub fn peek_at(&self, offset: usize) -> Option<&Token<L::TokenType>> {
55        self.tokens.get(self.index + offset)
56    }
57
58    /// Advances the current position to the next token.
59    #[inline]
60    pub fn advance(&mut self) {
61        self.index += 1;
62    }
63
64    /// Checks if the token source has reached the end.
65    #[inline]
66    pub fn is_end(&self) -> bool {
67        self.index >= self.tokens.len()
68    }
69
70    /// Gets the current index.
71    #[inline]
72    pub fn index(&self) -> usize {
73        self.index
74    }
75
76    /// Sets the current index.
77    #[inline]
78    pub fn set_index(&mut self, index: usize) {
79        self.index = index;
80    }
81}
82
/// Collects the results of the parsing process.
///
/// Children are buffered in a flat `Vec`; `finish_node` slices a suffix of that
/// buffer (from a checkpoint) into an arena-allocated node.
pub struct TreeSink<'a, L: Language> {
    // Arena that owns every node and child slice produced by this sink.
    arena: &'a SyntaxArena,
    // Work-in-progress children of the node(s) currently being built.
    children: Vec<GreenTree<'a, L>>,
}
88
89impl<'a, L: Language> TreeSink<'a, L> {
90    /// Creates a new tree sink.
91    pub fn new(arena: &'a SyntaxArena, capacity_hint: usize) -> Self {
92        Self { arena, children: Vec::with_capacity(capacity_hint) }
93    }
94
95    /// Pushes a leaf node (token) to the current list of children.
96    pub fn push_leaf(&mut self, kind: L::TokenType, len: usize) {
97        self.children.push(GreenTree::Leaf(GreenLeaf::new(kind, len as u32)));
98    }
99
100    /// Pushes a leaf node (token) with provenance metadata to the current list of children.
101    pub fn push_leaf_with_metadata(&mut self, kind: L::TokenType, len: usize, provenance: TokenProvenance) {
102        let index = self.arena.add_metadata(provenance);
103        self.children.push(GreenTree::Leaf(GreenLeaf::with_metadata(kind, len as u32, Some(index))));
104    }
105
106    /// Pushes an existing node to the current list of children.
107    pub fn push_node(&mut self, node: &'a GreenNode<'a, L>) {
108        self.children.push(GreenTree::Node(node));
109    }
110
111    /// Returns the current number of children, serving as a checkpoint for finishing a node later.
112    pub fn checkpoint(&self) -> usize {
113        self.children.len()
114    }
115
116    /// Returns the syntax arena used by this sink.
117    pub fn arena(&self) -> &'a SyntaxArena {
118        self.arena
119    }
120
121    /// Restores the sink to a previous checkpoint by truncating the children list.
122    pub fn restore(&mut self, checkpoint: usize) {
123        self.children.truncate(checkpoint)
124    }
125
126    /// Finishes a node starting from the given checkpoint and adds it as a child.
127    pub fn finish_node(&mut self, checkpoint: usize, kind: L::ElementType) -> &'a GreenNode<'a, L> {
128        let children_slice = self.arena.alloc_slice_copy(&self.children[checkpoint..]);
129        self.children.truncate(checkpoint);
130        let node = GreenNode::new(kind, children_slice);
131        let node_ref = self.arena.alloc(node);
132        self.children.push(GreenTree::Node(node_ref));
133        node_ref
134    }
135}
136
/// Encapsulates incremental parsing logic.
///
/// Holds a cursor over the previous tree plus a preprocessed list of text edits,
/// which together let the parser map new-text positions back to old-tree positions
/// and decide which old nodes are safe to reuse.
pub struct IncrementalContext<'a, L: Language> {
    // Walks the previous generation's tree as the new parse advances.
    cursor: Cursor<'a, L>,
    /// Sorted list of edits with their accumulated deltas.
    /// Each entry contains the old range and the cumulative delta *after* this edit.
    edits: Vec<(Range<usize>, isize)>,
}
144
impl<'a, L: Language> IncrementalContext<'a, L> {
    /// Creates a new incremental parsing context.
    ///
    /// Edits are sorted by their start in the *old* text, and each is stored with
    /// the cumulative length delta (`new_len - old_len`) accumulated up to and
    /// including that edit.
    ///
    /// NOTE(review): edits are assumed non-overlapping; overlapping spans are not
    /// detected or merged here — confirm against the producer of `TextEdit`s.
    pub fn new(old_root: &'a GreenNode<'a, L>, edits: &[crate::source::TextEdit]) -> Self {
        let mut sorted_edits: Vec<_> = edits.iter().collect();
        sorted_edits.sort_by_key(|e| e.span.start);

        let mut cumulative_delta = 0isize;
        let mut processed_edits = Vec::with_capacity(edits.len());

        for edit in sorted_edits {
            // Delta contributed by this edit: bytes inserted minus bytes removed.
            let old_len = edit.span.end - edit.span.start;
            let new_len = edit.text.len();
            cumulative_delta += new_len as isize - old_len as isize;
            processed_edits.push((edit.span, cumulative_delta));
        }

        Self { cursor: Cursor::new(old_root), edits: processed_edits }
    }

    /// Maps a position in the *new* text back to the corresponding position in the
    /// *old* text, or `None` when the position falls inside an edited ("dirty")
    /// region and therefore has no old-text equivalent.
    fn map_new_to_old(&self, new_pos: usize) -> Option<usize> {
        // Delta accumulated from all edits strictly before the region examined.
        let mut current_delta = 0isize;

        for (old_range, cumulative_delta) in &self.edits {
            // Where this edit's replacement text starts/ends in the new text.
            let new_start = (old_range.start as isize + current_delta) as usize;
            let edit_delta = *cumulative_delta - current_delta;
            let new_end = (old_range.end as isize + current_delta + edit_delta) as usize;

            if new_pos < new_start {
                // Before this edit: only earlier edits shift the position.
                return Some((new_pos as isize - current_delta) as usize);
            }
            else if new_pos < new_end {
                // Inside a dirty range
                return None;
            }

            current_delta = *cumulative_delta;
        }

        // Past all edits: shift by the total accumulated delta.
        Some((new_pos as isize - current_delta) as usize)
    }

    /// Returns `true` if the old-text range `[old_start, old_end)` intersects any
    /// edited range, i.e. the corresponding old node cannot be reused verbatim.
    fn is_dirty(&self, old_start: usize, old_end: usize) -> bool {
        for (old_range, _) in &self.edits {
            // Check for overlap between [old_start, old_end) and [old_range.start, old_range.end)
            if old_start < old_range.end && old_end > old_range.start {
                return true;
            }
        }
        false
    }
}
196
/// High-level API for parsers, coordinating token supply and tree construction.
///
/// `S` defaults to [`crate::source::SourceText`], but any [`Source`]
/// implementation (sized or not) can back the parser.
pub struct ParserState<'a, L: Language, S: Source + ?Sized = crate::source::SourceText> {
    /// The token stream being parsed.
    pub tokens: TokenSource<L>,

    /// The sink where the syntax tree is constructed.
    pub sink: TreeSink<'a, L>,

    /// Optional context for incremental parsing.
    pub incremental: Option<IncrementalContext<'a, L>>,

    /// Collection of errors encountered during parsing.
    pub errors: Vec<OakError>,
    /// We keep a reference to help with error reporting and offset calculation.
    pub source: &'a S,
}
213
214impl<'a, L: Language, S: Source + ?Sized> ParserState<'a, L, S> {
    /// Returns the source file's ID, if available.
    pub fn source_id(&self) -> Option<crate::source::SourceId> {
        self.source.source_id()
    }

    /// Returns the syntax arena used by this parser (shared with the tree sink).
    pub fn arena(&self) -> &'a SyntaxArena {
        self.sink.arena()
    }
224
225    /// Creates a new parser state.
226    ///
227    /// # Arguments
228    /// * `arena` - The syntax arena to allocate nodes in.
229    /// * `lex_output` - The output from the lexer, including tokens and any lexical errors.
230    /// * `source` - The source text being parsed.
231    /// * `capacity_hint` - Initial capacity hint for the tree sink's child vector.
232    pub fn new(arena: &'a SyntaxArena, lex_output: LexOutput<L>, source: &'a S, capacity_hint: usize) -> Self {
233        let (tokens, mut errors) = match lex_output.result {
234            Ok(tokens) => (tokens, Vec::new()),
235            Err(e) => (Arc::from(Vec::new()), vec![e]),
236        };
237        errors.extend(lex_output.diagnostics);
238
239        let mut st = Self { tokens: TokenSource::new(tokens), sink: TreeSink::new(arena, capacity_hint), incremental: None, errors, source };
240        st.skip_trivia();
241        st
242    }
243
    /// Creates a nested parser state that shares the same arena and source.
    /// This is useful for parsing sub-structures that should be independent but part of the same overall tree.
    ///
    /// The nested state starts at the parent's current token position but collects
    /// its own children and errors. The child-buffer capacity of 1024 is a fixed
    /// heuristic, not derived from the parent's original `capacity_hint`.
    pub fn nested(&self) -> ParserState<'a, L, S> {
        ParserState { tokens: self.tokens.clone(), sink: TreeSink::new(self.sink.arena, 1024), incremental: None, errors: Vec::new(), source: self.source }
    }
249
    /// Returns the text content of the current token, or `None` at end of input.
    ///
    /// The `Cow` lets the source return either a borrowed slice or an owned string.
    pub fn peek_text(&self) -> Option<std::borrow::Cow<'_, str>> {
        self.current().map(|t| self.source.get_text_in(t.span))
    }

    /// Manually promotes a node from a previous generation to the current one.
    /// This is useful for improving memory locality.
    ///
    /// Delegates to [`deep_clone_node`] using this parser's arena.
    pub fn promote(&mut self, node: &'a GreenNode<'a, L>) -> &'a GreenNode<'a, L> {
        deep_clone_node(node, self.sink.arena)
    }
260
    /// Sets the incremental parsing context.
    ///
    /// Replaces any previously installed context.
    ///
    /// # Arguments
    /// * `old` - The root of the previous version of the syntax tree.
    /// * `edits` - The edits that were applied to the source text.
    pub fn set_incremental(&mut self, old: &'a GreenNode<'a, L>, edits: &[crate::source::TextEdit]) {
        self.incremental = Some(IncrementalContext::new(old, edits));
    }
269
270    // --- Error Reporting ---
271
272    /// Returns the current byte offset.
273    pub fn current_offset(&self) -> usize {
274        self.tokens.current().map(|t| t.span.start).unwrap_or_else(|| self.source.length())
275    }
276
277    /// Records a syntax error with the given message.
278    pub fn syntax_error(&mut self, message: impl Into<String>) -> OakError {
279        let err = OakError::syntax_error(message, self.current_offset(), self.source.source_id());
280        self.errors.push(err.clone());
281        err
282    }
283
284    /// Records an unexpected token error.
285    pub fn record_unexpected_token(&mut self, token: impl Into<String>) {
286        let err = OakError::unexpected_token(token, self.current_offset(), self.source.source_id());
287        self.errors.push(err)
288    }
289
290    /// Records an expected token error.
291    pub fn record_expected(&mut self, expected: impl Into<String>) {
292        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
293        let err = OakError::expected_token(expected, offset, self.source.source_id());
294        self.errors.push(err)
295    }
296
297    /// Records an expected name error.
298    pub fn record_expected_name(&mut self, name_kind: impl Into<String>) {
299        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
300        let err = OakError::expected_name(name_kind, offset, self.source.source_id());
301        self.errors.push(err)
302    }
303
304    /// Records a trailing comma not allowed error.
305    pub fn record_trailing_comma_not_allowed(&mut self) {
306        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
307        let err = OakError::trailing_comma_not_allowed(offset, self.source.source_id());
308        self.errors.push(err)
309    }
310
311    /// Records an unexpected end of file error.
312    pub fn record_unexpected_eof(&mut self) {
313        let offset = self.source.length();
314        let err = OakError::unexpected_eof(offset, self.source.source_id());
315        self.errors.push(err)
316    }
317
318    /// Records an unexpected end of file error and returns it.
319    pub fn unexpected_eof(&mut self) -> OakError {
320        let offset = self.source.length();
321        let err = OakError::unexpected_eof(offset, self.source.source_id());
322        self.errors.push(err.clone());
323        err
324    }
325
326    // --- Token Navigation ---
327
328    /// Returns the current token.
329    #[inline]
330    pub fn current(&self) -> Option<&Token<L::TokenType>> {
331        self.tokens.current()
332    }
333
334    /// Returns the kind of the current token.
335    #[inline]
336    pub fn peek_kind(&self) -> Option<L::TokenType> {
337        self.current().map(|t| t.kind)
338    }
339
340    /// Peeks at a token at the specified offset from the current position.
341    #[inline]
342    pub fn peek_at(&self, offset: usize) -> Option<&Token<L::TokenType>> {
343        self.tokens.peek_at(offset)
344    }
345
346    /// Peeks at the Nth non-trivia token from the current position.
347    pub fn peek_non_trivia_at(&self, mut n: usize) -> Option<&Token<L::TokenType>> {
348        let mut offset = 0;
349        while let Some(token) = self.tokens.peek_at(offset) {
350            if !L::TokenType::is_ignored(&token.kind) {
351                if n == 0 {
352                    return Some(token);
353                }
354                n -= 1
355            }
356            offset += 1
357        }
358        None
359    }
360
361    /// Returns the kind of the token at the specified offset from the current position.
362    #[inline]
363    pub fn peek_kind_at(&self, offset: usize) -> Option<L::TokenType> {
364        self.tokens.peek_at(offset).map(|t| t.kind)
365    }
366
367    /// Returns the kind of the Nth non-trivia token from the current position.
368    pub fn peek_non_trivia_kind_at(&self, n: usize) -> Option<L::TokenType> {
369        self.peek_non_trivia_at(n).map(|t| t.kind)
370    }
371
    /// Checks if the parser has not yet reached the end of the token stream.
    #[inline]
    pub fn not_at_end(&self) -> bool {
        !self.tokens.is_end()
    }

    /// Checks if the current token is of the specified kind.
    /// Returns `false` at end of input.
    #[inline]
    pub fn at(&self, kind: L::TokenType) -> bool {
        self.peek_kind() == Some(kind)
    }

    /// Checks if the current token is NOT of the specified kind.
    /// Note: also returns `true` at end of input.
    #[inline]
    pub fn not_at(&self, kind: L::TokenType) -> bool {
        !self.at(kind)
    }
389
390    /// Advances the current position to the next token.
391    pub fn advance(&mut self) {
392        self.tokens.advance();
393        self.skip_trivia()
394    }
395
396    /// Skips trivia tokens and adds them to the syntax tree.
397    pub fn skip_trivia(&mut self) {
398        while let Some(token) = self.tokens.current() {
399            if L::TokenType::is_ignored(&token.kind) {
400                self.sink.push_leaf(token.kind, token.length());
401                self.tokens.advance()
402            }
403            else {
404                break;
405            }
406        }
407    }
408
409    /// Advances until a token of the specified kind is found, or the end of the token stream is reached.
410    pub fn advance_until(&mut self, kind: L::TokenType) {
411        while self.not_at_end() && !self.at(kind) {
412            self.advance()
413        }
414    }
415
416    /// Advances until any token of the specified kinds is found, or the end of the token stream is reached.
417    pub fn advance_until_any(&mut self, kinds: &[L::TokenType]) {
418        while self.not_at_end() && !kinds.iter().any(|&k| self.at(k)) {
419            self.advance()
420        }
421    }
422
423    /// Consumes the current token and adds it to the syntax tree as a leaf node.
424    pub fn bump(&mut self) {
425        if let Some(token) = self.current() {
426            self.sink.push_leaf(token.kind, token.length());
427            self.advance()
428        }
429    }
430
431    /// Consumes the current token if it matches the specified kind.
432    /// Returns `true` if the token was consumed, `false` otherwise.
433    pub fn eat(&mut self, kind: L::TokenType) -> bool {
434        if self.at(kind) {
435            self.bump();
436            true
437        }
438        else {
439            false
440        }
441    }
442
443    /// Expects the current token to be of the specified kind, consuming it if it matches.
444    /// If the token does not match, an error is recorded and returned.
445    pub fn expect(&mut self, kind: L::TokenType) -> Result<(), OakError> {
446        if self.eat(kind) {
447            Ok(())
448        }
449        else {
450            let err = OakError::expected_token(format!("{:?}", kind), self.current_offset(), self.source.source_id());
451            self.errors.push(err.clone());
452            Err(err)
453        }
454    }
455
456    /// Tries to parse a construct with a backtracking point.
457    pub fn try_parse<T, F>(&mut self, parser: F) -> Result<T, OakError>
458    where
459        F: FnOnce(&mut Self) -> Result<T, OakError>,
460    {
461        let checkpoint = self.checkpoint();
462        match parser(self) {
463            Ok(value) => Ok(value),
464            Err(err) => {
465                self.restore(checkpoint);
466                Err(err)
467            }
468        }
469    }
470
471    /// Tries to parse a construct with a backtracking point, preserving the error if it fails.
472    ///
473    /// This is similar to `try_parse`, but it keeps the error generated by the parser
474    /// instead of potentially discarding it or generating a generic one.
475    pub fn try_parse_with_error<T, F>(&mut self, parser: F) -> Result<T, OakError>
476    where
477        F: FnOnce(&mut Self) -> Result<T, OakError>,
478    {
479        let checkpoint = self.checkpoint();
480        match parser(self) {
481            Ok(value) => Ok(value),
482            Err(err) => {
483                self.restore(checkpoint);
484                Err(err)
485            }
486        }
487    }
488
489    // --- Tree Construction ---
490
    /// Creates a checkpoint in the tree construction process, which can be used to finish a node later.
    /// This checkpoint includes both the token index and the tree sink's current position for backtracking.
    pub fn checkpoint(&self) -> (usize, usize) {
        (self.tokens.index(), self.sink.checkpoint())
    }

    /// Restores the parser state to a previous checkpoint.
    ///
    /// Rewinds both the token cursor and the sink's child buffer; anything already
    /// allocated in the arena since the checkpoint is simply abandoned (arena
    /// memory is not reclaimed).
    pub fn restore(&mut self, (token_index, tree_checkpoint): (usize, usize)) {
        self.tokens.set_index(token_index);
        self.sink.restore(tree_checkpoint)
    }

    /// Finishes a node starting from the given checkpoint and adds it as a child.
    ///
    /// Only the sink half of the checkpoint is used here; the token index matters
    /// only for `restore`.
    pub fn finish_at(&mut self, checkpoint: (usize, usize), kind: L::ElementType) -> &'a GreenNode<'a, L> {
        self.sink.finish_node(checkpoint.1, kind)
    }
507
    /// Creates a checkpoint before an existing node.
    ///
    /// Scans the sink's current children (newest first) for `node` by pointer
    /// identity; when found, the returned checkpoint starts just before it, which
    /// is what infix/postfix wrapping needs.
    ///
    /// NOTE(review): the token-index half of the returned checkpoint is always `0`,
    /// so this checkpoint is only safe to pass to `finish_at` (which ignores it) —
    /// passing it to `restore` would rewind the token stream to the start.
    pub fn checkpoint_before(&mut self, node: &'a GreenNode<'a, L>) -> (usize, usize) {
        // Find the index of the node in the sink's children
        let sink_checkpoint = self.sink.checkpoint();
        for i in (0..sink_checkpoint).rev() {
            if let Some(child) = self.sink.children.get(i) {
                if let GreenTree::Node(n) = child {
                    if std::ptr::eq(*n, node) {
                        return (0, i); // token_index is hard to determine, but usually not needed for infix wrapping
                    }
                }
            }
        }
        // Node not among the current children: fall back to the current position.
        (0, sink_checkpoint)
    }

    /// Adds an existing node as a child to the current node.
    pub fn push_child(&mut self, node: &'a GreenNode<'a, L>) {
        self.sink.push_node(node)
    }
528
    /// Attempts to reuse a previous syntax node at the current position.
    /// This is the core of incremental parsing.
    ///
    /// Returns `true` — pushing a promoted copy of the old node into the sink and
    /// advancing the token stream past it — when all of the following hold:
    /// an incremental context is installed; the current new-text position maps
    /// cleanly to an old-text position; the old tree contains a node of `kind`
    /// starting exactly there; that node does not overlap any edited range; and
    /// the new token stream tiles exactly onto the node's byte length.
    pub fn try_reuse(&mut self, kind: L::ElementType) -> bool {
        let Some(inc) = &mut self.incremental
        else {
            return false;
        };
        let current_index = self.tokens.index();
        // Byte offset in the *new* text where the candidate node would start.
        let new_pos = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());

        #[cfg(test)]
        println!("try_reuse: kind={:?}, new_pos={}", kind, new_pos);

        // Translate to the old text; positions inside an edited region cannot be reused.
        let Some(target_old_pos) = inc.map_new_to_old(new_pos)
        else {
            return false;
        };

        #[cfg(test)]
        println!("Trying to reuse node at pos {};: {:?}", new_pos, kind);

        // Bound the cursor walk so a degenerate tree cannot stall the parser.
        let mut steps = 0;
        const MAX_STEPS: usize = 100;

        loop {
            let start = inc.cursor.offset();
            let end = inc.cursor.end_offset();

            if start == target_old_pos {
                // Candidate position reached: check kind, cleanliness, and token alignment.
                if let Some(node) = inc.cursor.as_node() {
                    if node.kind == kind {
                        if !inc.is_dirty(start, end) {
                            // Verify tokens
                            let node_len = node.text_len() as usize;

                            // Check that the new token stream lines up contiguously
                            // and ends exactly at the node's byte length.
                            let mut lookahead = 0;
                            let mut current_pos = new_pos;
                            let target_new_end = new_pos + node_len;

                            while let Some(t) = self.tokens.peek_at(lookahead) {
                                if t.span.start != current_pos {
                                    // Token stream doesn't match expected positions
                                    break;
                                }
                                current_pos = t.span.end;

                                if t.span.end == target_new_end {
                                    // Found the end! Promote the old node into the
                                    // current arena and skip its tokens.
                                    let tokens_consumed = lookahead + 1;
                                    let new_node = deep_clone_node(node, self.sink.arena);
                                    self.sink.push_node(new_node);
                                    self.tokens.set_index(current_index + tokens_consumed);
                                    inc.cursor.step_over();
                                    return true;
                                }
                                else if t.span.end > target_new_end {
                                    // A token straddles the node boundary: cannot reuse.
                                    break;
                                }
                                lookahead += 1;
                            }
                        }
                    }
                }
                // Right position but unusable node: try its children, then siblings.
                if !inc.cursor.step_into() && !inc.cursor.step_next() {
                    return false;
                }
            }
            else if start < target_old_pos && end > target_old_pos {
                // Target lies inside this node: descend (or skip ahead if it's a leaf).
                if !inc.cursor.step_into() && !inc.cursor.step_next() {
                    return false;
                }
            }
            else if end <= target_old_pos {
                // Entirely before the target: move to the next sibling.
                if !inc.cursor.step_next() {
                    return false;
                }
            }
            else {
                // Cursor has moved past the target; no node can start there.
                return false;
            }

            steps += 1;
            if steps > MAX_STEPS {
                return false;
            }
        }
    }
617
    /// Finishes the parsing process and returns the final output.
    ///
    /// Consumes the state, pairing the parse result with every diagnostic
    /// (lexical and syntactic) collected along the way.
    pub fn finish(self, result: Result<&'a GreenNode<'a, L>, OakError>) -> crate::parser::ParseOutput<'a, L> {
        crate::errors::OakDiagnostics { result, diagnostics: self.errors }
    }
622
623    /// Wraps the parsing of a node with incremental reuse support.
624    ///
625    /// This method first attempts to reuse an existing node of the given kind from the previous tree.
626    /// If successful, it skips the provided closure and returns `Ok(())`.
627    /// Otherwise, it creates a checkpoint, runs the closure to parse the node's content,
628    /// and then finishes the node with the specified kind.
629    pub fn incremental_node<F>(&mut self, kind: L::ElementType, f: F) -> Result<(), OakError>
630    where
631        F: FnOnce(&mut Self) -> Result<(), OakError>,
632    {
633        if self.try_reuse(kind) {
634            return Ok(());
635        };
636
637        let checkpoint = self.checkpoint();
638        let res = f(self);
639        self.finish_at(checkpoint, kind);
640        res
641    }
642
643    /// Wraps the parsing of an optional node with incremental reuse support.
644    ///
645    /// Similar to `incremental_node`, but the closure returns a boolean indicating
646    /// if the node was actually present.
647    pub fn incremental_opt<F>(&mut self, kind: L::ElementType, f: F) -> Result<bool, OakError>
648    where
649        F: FnOnce(&mut Self) -> Result<bool, OakError>,
650    {
651        if self.try_reuse(kind) {
652            return Ok(true);
653        };
654
655        let checkpoint = self.checkpoint();
656        match f(self) {
657            Ok(true) => {
658                self.finish_at(checkpoint, kind);
659                Ok(true)
660            }
661            Ok(false) => Ok(false),
662            Err(e) => {
663                self.finish_at(checkpoint, kind);
664                Err(e)
665            }
666        }
667    }
668}