// oak_core/parser/state.rs

1use crate::{
2    Language,
3    errors::OakError,
4    language::TokenType,
5    lexer::{LexOutput, Token, Tokens},
6    memory::arena::SyntaxArena,
7    source::Source,
8    tree::{Cursor, GreenLeaf, GreenNode, GreenTree, TokenProvenance},
9};
10use core::range::Range;
11
12/// Helper function to raise deep clone of a node into an arena.
13///
14/// This is used to "promote" nodes from a previous generation's arena to the current one,
15/// ensuring that the new tree is self-contained and has good memory locality.
16pub fn deep_clone_node<'a, L: Language>(node: &GreenNode<'_, L>, arena: &'a SyntaxArena) -> &'a GreenNode<'a, L> {
17    let children_iter = node.children.iter().map(|child| match child {
18        GreenTree::Node(n) => GreenTree::Node(deep_clone_node(n, arena)),
19        GreenTree::Leaf(l) => GreenTree::Leaf(*l),
20    });
21
22    let slice = arena.alloc_slice_fill_iter(node.children.len(), children_iter);
23    let new_node = GreenNode::new(node.kind, slice);
24    arena.alloc(new_node)
25}
26
27/// Provides tokens to the parser.
/// Provides tokens to the parser.
///
/// Wraps a lexed token stream with a movable read cursor. The cursor position
/// can be saved (`index`) and restored (`set_index`) to support backtracking,
/// and the whole source is `Clone` so speculative parses can fork it.
pub struct TokenSource<L: Language> {
    /// The tokens being consumed.
    tokens: Tokens<L>,
    /// The current index into the tokens.
    index: usize,
}
34
35impl<L: Language> Clone for TokenSource<L> {
36    fn clone(&self) -> Self {
37        Self { tokens: self.tokens.clone(), index: self.index }
38    }
39}
40
41impl<L: Language> TokenSource<L> {
42    /// Creates a new token source.
43    pub fn new(tokens: Tokens<L>) -> Self {
44        Self { tokens, index: 0 }
45    }
46
47    /// Gets the current token.
48    #[inline]
49    pub fn current(&self) -> Option<&Token<L::TokenType>> {
50        self.tokens.get(self.index)
51    }
52
53    /// Peeks at a token at the specified offset from the current position.
54    #[inline]
55    pub fn peek_at(&self, offset: usize) -> Option<&Token<L::TokenType>> {
56        self.tokens.get(self.index + offset)
57    }
58
59    /// Advances the current position to the next token.
60    #[inline]
61    pub fn advance(&mut self) {
62        self.index += 1;
63    }
64
65    /// Checks if the token source has reached the end.
66    #[inline]
67    pub fn is_end(&self) -> bool {
68        self.index >= self.tokens.len()
69    }
70
71    /// Gets the current index.
72    #[inline]
73    pub fn index(&self) -> usize {
74        self.index
75    }
76
77    /// Sets the current index.
78    #[inline]
79    pub fn set_index(&mut self, index: usize) {
80        self.index = index;
81    }
82}
83
84/// Collects the results of the parsing process.
/// Collects the results of the parsing process.
///
/// Children of nodes under construction accumulate in `children`;
/// `finish_node` later moves a suffix of them into an arena-allocated
/// `GreenNode`, so the vector acts as a stack of pending siblings.
pub struct TreeSink<'a, L: Language> {
    /// The arena to allocate syntax nodes in.
    arena: &'a SyntaxArena,
    /// The list of children for the current node being constructed.
    children: Vec<GreenTree<'a, L>>,
}
91
92impl<'a, L: Language> TreeSink<'a, L> {
93    /// Creates a new tree sink.
94    pub fn new(arena: &'a SyntaxArena, capacity_hint: usize) -> Self {
95        Self { arena, children: Vec::with_capacity(capacity_hint) }
96    }
97
98    /// Pushes a leaf node (token) to the current list of children.
99    pub fn push_leaf(&mut self, kind: L::TokenType, len: usize) {
100        self.children.push(GreenTree::Leaf(GreenLeaf::new(kind, len as u32)));
101    }
102
103    /// Pushes a leaf node (token) with provenance metadata to the current list of children.
104    pub fn push_leaf_with_metadata(&mut self, kind: L::TokenType, len: usize, provenance: TokenProvenance) {
105        let index = self.arena.add_metadata(provenance);
106        self.children.push(GreenTree::Leaf(GreenLeaf::with_metadata(kind, len as u32, Some(index))));
107    }
108
109    /// Pushes an existing node to the current list of children.
110    pub fn push_node(&mut self, node: &'a GreenNode<'a, L>) {
111        self.children.push(GreenTree::Node(node));
112    }
113
114    /// Returns the current number of children, serving as a checkpoint for finishing a node later.
115    pub fn checkpoint(&self) -> usize {
116        self.children.len()
117    }
118
119    /// Returns the syntax arena used by this sink.
120    pub fn arena(&self) -> &'a SyntaxArena {
121        self.arena
122    }
123
124    /// Restores the sink to a previous checkpoint by truncating the children list.
125    pub fn restore(&mut self, checkpoint: usize) {
126        self.children.truncate(checkpoint)
127    }
128
129    /// Finishes a node starting from the given checkpoint and adds it as a child.
130    pub fn finish_node(&mut self, checkpoint: usize, kind: L::ElementType) -> &'a GreenNode<'a, L> {
131        let children_slice = self.arena.alloc_slice_copy(&self.children[checkpoint..]);
132        self.children.truncate(checkpoint);
133        let node = GreenNode::new(kind, children_slice);
134        let node_ref = self.arena.alloc(node);
135        self.children.push(GreenTree::Node(node_ref));
136        node_ref
137    }
138}
139
140/// Encapsulates incremental parsing logic.
/// Encapsulates incremental parsing logic.
///
/// Holds a cursor into the previous tree plus the processed edit list; together
/// they let the parser decide whether an old node can be reused at a new position.
pub struct IncrementalContext<'a, L: Language> {
    /// A cursor over the previous syntax tree.
    cursor: Cursor<'a, L>,
    /// Sorted list of edits with their accumulated deltas.
    /// Each entry contains the old range and the cumulative delta *after* this edit.
    edits: Vec<(Range<usize>, isize)>,
}
148
impl<'a, L: Language> IncrementalContext<'a, L> {
    /// Creates a new incremental parsing context.
    ///
    /// Edits are sorted by their start offset in the *old* text, and each is
    /// stored with the running (cumulative) length delta of all edits up to
    /// and including it, which `map_new_to_old` later inverts.
    pub fn new(old_root: &'a GreenNode<'a, L>, edits: &[crate::source::TextEdit]) -> Self {
        let mut sorted_edits: Vec<_> = edits.iter().collect();
        sorted_edits.sort_by_key(|e| e.span.start);

        let mut cumulative_delta = 0isize;
        let mut processed_edits = Vec::with_capacity(edits.len());

        for edit in sorted_edits {
            // Delta contributed by this edit: inserted length minus removed length.
            let old_len = edit.span.end - edit.span.start;
            let new_len = edit.text.len();
            cumulative_delta += new_len as isize - old_len as isize;
            processed_edits.push((edit.span, cumulative_delta));
        }

        Self { cursor: Cursor::new(old_root), edits: processed_edits }
    }

    /// Maps an offset in the *new* text back to its offset in the *old* text.
    /// Returns `None` when the position falls inside an edited ("dirty")
    /// region and therefore has no old counterpart.
    fn map_new_to_old(&self, new_pos: usize) -> Option<usize> {
        // Cumulative delta of all edits strictly before the one being examined.
        let mut current_delta = 0isize;

        for (old_range, cumulative_delta) in &self.edits {
            // Start of this edit's replacement text, in new-text coordinates.
            let new_start = (old_range.start as isize + current_delta) as usize;
            // This edit's own delta (cumulative minus everything before it).
            let edit_delta = *cumulative_delta - current_delta;
            // End of the replacement text in new-text coordinates: the old end
            // shifted by the prior deltas plus this edit's own delta.
            let new_end = (old_range.end as isize + current_delta + edit_delta) as usize;

            if new_pos < new_start {
                // Before this edit: only earlier edits shift the position.
                return Some((new_pos as isize - current_delta) as usize);
            }
            else if new_pos < new_end {
                // Inside a dirty range
                return None;
            }

            current_delta = *cumulative_delta;
        }

        // Past every edit: undo the total accumulated delta.
        Some((new_pos as isize - current_delta) as usize)
    }

    /// Returns `true` when `[old_start, old_end)` overlaps any edited span of
    /// the old text, i.e. when a node covering that range cannot be reused.
    fn is_dirty(&self, old_start: usize, old_end: usize) -> bool {
        for (old_range, _) in &self.edits {
            // Check for overlap between [old_start, old_end) and [old_range.start, old_range.end)
            if old_start < old_range.end && old_end > old_range.start {
                return true;
            }
        }
        false
    }
}
200
201/// High-level API for parsers, coordinating token supply and tree construction.
/// High-level API for parsers, coordinating token supply and tree construction.
pub struct ParserState<'a, L: Language, S: Source + ?Sized = crate::source::SourceText> {
    /// The token source providing tokens to the parser.
    pub tokens: TokenSource<L>,

    /// The sink where the syntax tree is constructed.
    pub sink: TreeSink<'a, L>,

    /// Optional context for incremental parsing; `None` for a full parse.
    pub incremental: Option<IncrementalContext<'a, L>>,

    /// Collection of errors encountered during parsing.
    pub errors: Vec<OakError>,
    /// The source text being parsed. We keep a reference to help with error
    /// reporting and offset calculation.
    pub source: &'a S,
}
217
218impl<'a, L: Language, S: Source + ?Sized> ParserState<'a, L, S> {
219    /// Returns the source file's ID, if available.
220    pub fn source_id(&self) -> Option<crate::source::SourceId> {
221        self.source.source_id()
222    }
223
224    /// Returns the syntax arena used by this parser.
225    pub fn arena(&self) -> &'a SyntaxArena {
226        self.sink.arena()
227    }
228
229    /// Creates a new parser state.
230    ///
231    /// # Arguments
232    /// * `arena` - The syntax arena to allocate nodes in.
233    /// * `lex_output` - The output from the lexer, including tokens and any lexical errors.
234    /// * `source` - The source text being parsed.
235    /// * `capacity_hint` - Initial capacity hint for the tree sink's child vector.
236    pub fn new(arena: &'a SyntaxArena, lex_output: LexOutput<L>, source: &'a S, capacity_hint: usize) -> Self {
237        let (tokens, mut errors) = match lex_output.result {
238            Ok(tokens) => (tokens, Vec::new()),
239            Err(e) => (Tokens::default(), vec![e]),
240        };
241        errors.extend(lex_output.diagnostics);
242
243        let mut st = Self { tokens: TokenSource::new(tokens), sink: TreeSink::new(arena, capacity_hint), incremental: None, errors, source };
244        st.skip_trivia();
245        st
246    }
247
248    /// Creates a nested parser state that shares the same arena and source.
249    /// This is useful for parsing sub-structures that should be independent but part of the same overall tree.
250    pub fn nested(&self) -> ParserState<'a, L, S> {
251        ParserState { tokens: self.tokens.clone(), sink: TreeSink::new(self.sink.arena, 1024), incremental: None, errors: Vec::new(), source: self.source }
252    }
253
254    /// Returns the text content of the current token.
255    pub fn peek_text(&self) -> Option<std::borrow::Cow<'_, str>> {
256        self.current().map(|t| self.source.get_text_in(t.span))
257    }
258
259    /// Manually promotes a node from a previous generation to the current one.
260    /// This is useful for improving memory locality.
261    pub fn promote(&mut self, node: &'a GreenNode<'a, L>) -> &'a GreenNode<'a, L> {
262        deep_clone_node(node, self.sink.arena)
263    }
264
265    /// Sets the incremental parsing context.
266    ///
267    /// # Arguments
268    /// * `old` - The root of the previous version of the syntax tree.
269    /// * `edits` - The edits that were applied to the source text.
270    pub fn set_incremental(&mut self, old: &'a GreenNode<'a, L>, edits: &[crate::source::TextEdit]) {
271        self.incremental = Some(IncrementalContext::new(old, edits));
272    }
273
274    // --- Error Reporting ---
275
276    /// Returns the current byte offset.
277    pub fn current_offset(&self) -> usize {
278        self.tokens.current().map(|t| t.span.start).unwrap_or_else(|| self.source.length())
279    }
280
281    /// Records a syntax error with the given message.
282    pub fn syntax_error(&mut self, message: impl Into<String>) -> OakError {
283        let err = OakError::syntax_error(message, self.current_offset(), self.source.source_id());
284        self.errors.push(err.clone());
285        err
286    }
287
288    /// Records an unexpected token error.
289    pub fn record_unexpected_token(&mut self, token: impl Into<String>) {
290        let err = OakError::unexpected_token(token, self.current_offset(), self.source.source_id());
291        self.errors.push(err)
292    }
293
294    /// Records an expected token error.
295    pub fn record_expected(&mut self, expected: impl Into<String>) {
296        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
297        let err = OakError::expected_token(expected, offset, self.source.source_id());
298        self.errors.push(err)
299    }
300
301    /// Records an expected name error.
302    pub fn record_expected_name(&mut self, name_kind: impl Into<String>) {
303        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
304        let err = OakError::expected_name(name_kind, offset, self.source.source_id());
305        self.errors.push(err)
306    }
307
308    /// Records a trailing comma not allowed error.
309    pub fn record_trailing_comma_not_allowed(&mut self) {
310        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
311        let err = OakError::trailing_comma_not_allowed(offset, self.source.source_id());
312        self.errors.push(err)
313    }
314
315    /// Records an unexpected end of file error.
316    pub fn record_unexpected_eof(&mut self) {
317        let offset = self.source.length();
318        let err = OakError::unexpected_eof(offset, self.source.source_id());
319        self.errors.push(err)
320    }
321
322    /// Records an unexpected end of file error and returns it.
323    pub fn unexpected_eof(&mut self) -> OakError {
324        let offset = self.source.length();
325        let err = OakError::unexpected_eof(offset, self.source.source_id());
326        self.errors.push(err.clone());
327        err
328    }
329
330    // --- Token Navigation ---
331
332    /// Returns the current token.
333    #[inline]
334    pub fn current(&self) -> Option<&Token<L::TokenType>> {
335        self.tokens.current()
336    }
337
338    /// Returns the kind of the current token.
339    #[inline]
340    pub fn peek_kind(&self) -> Option<L::TokenType> {
341        self.current().map(|t| t.kind)
342    }
343
344    /// Peeks at a token at the specified offset from the current position.
345    #[inline]
346    pub fn peek_at(&self, offset: usize) -> Option<&Token<L::TokenType>> {
347        self.tokens.peek_at(offset)
348    }
349
350    /// Peeks at the Nth non-trivia token from the current position.
351    pub fn peek_non_trivia_at(&self, mut n: usize) -> Option<&Token<L::TokenType>> {
352        let mut offset = 0;
353        while let Some(token) = self.tokens.peek_at(offset) {
354            if !L::TokenType::is_ignored(&token.kind) {
355                if n == 0 {
356                    return Some(token);
357                }
358                n -= 1
359            }
360            offset += 1
361        }
362        None
363    }
364
365    /// Returns the kind of the token at the specified offset from the current position.
366    #[inline]
367    pub fn peek_kind_at(&self, offset: usize) -> Option<L::TokenType> {
368        self.tokens.peek_at(offset).map(|t| t.kind)
369    }
370
371    /// Returns the kind of the Nth non-trivia token from the current position.
372    pub fn peek_non_trivia_kind_at(&self, n: usize) -> Option<L::TokenType> {
373        self.peek_non_trivia_at(n).map(|t| t.kind)
374    }
375
376    /// Checks if the parser has not yet reached the end of the token stream.
377    #[inline]
378    pub fn not_at_end(&self) -> bool {
379        !self.tokens.is_end()
380    }
381
382    /// Checks if the current token is of the specified kind.
383    #[inline]
384    pub fn at(&self, kind: L::TokenType) -> bool {
385        self.peek_kind() == Some(kind)
386    }
387
388    /// Checks if the current token is NOT of the specified kind.
389    #[inline]
390    pub fn not_at(&self, kind: L::TokenType) -> bool {
391        !self.at(kind)
392    }
393
394    /// Advances the current position to the next token.
395    pub fn advance(&mut self) {
396        self.tokens.advance();
397        self.skip_trivia()
398    }
399
400    /// Skips trivia tokens and adds them to the syntax tree.
401    pub fn skip_trivia(&mut self) {
402        while let Some(token) = self.tokens.current() {
403            if L::TokenType::is_ignored(&token.kind) {
404                self.sink.push_leaf(token.kind, token.length());
405                self.tokens.advance()
406            }
407            else {
408                break;
409            }
410        }
411    }
412
413    /// Advances until a token of the specified kind is found, or the end of the token stream is reached.
414    pub fn advance_until(&mut self, kind: L::TokenType) {
415        while self.not_at_end() && !self.at(kind) {
416            self.advance()
417        }
418    }
419
420    /// Advances until any token of the specified kinds is found, or the end of the token stream is reached.
421    pub fn advance_until_any(&mut self, kinds: &[L::TokenType]) {
422        while self.not_at_end() && !kinds.iter().any(|&k| self.at(k)) {
423            self.advance()
424        }
425    }
426
427    /// Consumes the current token and adds it to the syntax tree as a leaf node.
428    pub fn bump(&mut self) {
429        if let Some(token) = self.current() {
430            self.sink.push_leaf(token.kind, token.length());
431            self.advance()
432        }
433    }
434
435    /// Consumes the current token if it matches the specified kind.
436    /// Returns `true` if the token was consumed, `false` otherwise.
437    pub fn eat(&mut self, kind: L::TokenType) -> bool {
438        if self.at(kind) {
439            self.bump();
440            true
441        }
442        else {
443            false
444        }
445    }
446
447    /// Expects the current token to be of the specified kind, consuming it if it matches.
448    /// If the token does not match, an error is recorded and returned.
449    pub fn expect(&mut self, kind: L::TokenType) -> Result<(), OakError> {
450        if self.eat(kind) {
451            Ok(())
452        }
453        else {
454            let err = OakError::expected_token(format!("{:?}", kind), self.current_offset(), self.source.source_id());
455            self.errors.push(err.clone());
456            Err(err)
457        }
458    }
459
460    /// Tries to parse a construct with a backtracking point.
461    pub fn try_parse<T, F>(&mut self, parser: F) -> Result<T, OakError>
462    where
463        F: FnOnce(&mut Self) -> Result<T, OakError>,
464    {
465        let checkpoint = self.checkpoint();
466        match parser(self) {
467            Ok(value) => Ok(value),
468            Err(err) => {
469                self.restore(checkpoint);
470                Err(err)
471            }
472        }
473    }
474
475    /// Tries to parse a construct with a backtracking point, preserving the error if it fails.
476    ///
477    /// This is similar to `try_parse`, but it keeps the error generated by the parser
478    /// instead of potentially discarding it or generating a generic one.
479    pub fn try_parse_with_error<T, F>(&mut self, parser: F) -> Result<T, OakError>
480    where
481        F: FnOnce(&mut Self) -> Result<T, OakError>,
482    {
483        let checkpoint = self.checkpoint();
484        match parser(self) {
485            Ok(value) => Ok(value),
486            Err(err) => {
487                self.restore(checkpoint);
488                Err(err)
489            }
490        }
491    }
492
493    // --- Tree Construction ---
494
495    /// Creates a checkpoint in the tree construction process, which can be used to finish a node later.
496    /// This checkpoint includes both the token index and the tree sink's current position for backtracking.
497    pub fn checkpoint(&self) -> (usize, usize) {
498        (self.tokens.index(), self.sink.checkpoint())
499    }
500
501    /// Restores the parser state to a previous checkpoint.
502    pub fn restore(&mut self, (token_index, tree_checkpoint): (usize, usize)) {
503        self.tokens.set_index(token_index);
504        self.sink.restore(tree_checkpoint)
505    }
506
507    /// Finishes a node starting from the given checkpoint and adds it as a child.
508    pub fn finish_at(&mut self, checkpoint: (usize, usize), kind: L::ElementType) -> &'a GreenNode<'a, L> {
509        self.sink.finish_node(checkpoint.1, kind)
510    }
511
    /// Creates a checkpoint before an existing node.
    ///
    /// Scans the sink's pending children from newest to oldest for `node`
    /// (by pointer identity) and returns a checkpoint positioned just before
    /// it, so the node can be wrapped into a new parent via `finish_at`
    /// (typical for infix expressions).
    ///
    /// NOTE(review): the token index in the returned checkpoint is always 0,
    /// so this checkpoint is only safe to pass to `finish_at` — passing it to
    /// `restore` would rewind the token stream to the start. Confirm callers
    /// never do that.
    pub fn checkpoint_before(&mut self, node: &'a GreenNode<'a, L>) -> (usize, usize) {
        // Find the index of the node in the sink's children
        let sink_checkpoint = self.sink.checkpoint();
        for i in (0..sink_checkpoint).rev() {
            if let Some(child) = self.sink.children.get(i) {
                if let GreenTree::Node(n) = child {
                    // Pointer equality: the caller must pass the exact node
                    // reference that was pushed, not a structural copy.
                    if std::ptr::eq(*n, node) {
                        return (0, i); // token_index is hard to determine, but usually not needed for infix wrapping
                    }
                }
            }
        }
        // Node not found among pending children: fall back to the current end.
        (0, sink_checkpoint)
    }

    /// Adds an existing node as a child to the current node.
    pub fn push_child(&mut self, node: &'a GreenNode<'a, L>) {
        self.sink.push_node(node)
    }
532
    /// Attempts to reuse a previous syntax node at the current position.
    /// This is the core of incremental parsing.
    ///
    /// Returns `true` when a node of `kind` from the previous tree was cloned
    /// into the sink and the token cursor advanced past its tokens; returns
    /// `false` whenever reuse is not possible, leaving the state untouched.
    pub fn try_reuse(&mut self, kind: L::ElementType) -> bool {
        // Reuse requires an installed incremental context.
        let Some(inc) = &mut self.incremental
        else {
            return false;
        };
        let current_index = self.tokens.index();
        // Byte offset of the current token in the NEW text (end of input at EOF).
        let new_pos = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());

        #[cfg(test)]
        println!("try_reuse: kind={:?}, new_pos={}", kind, new_pos);

        // Translate to OLD-text coordinates; positions inside an edit have no
        // old counterpart and can never be reused.
        let Some(target_old_pos) = inc.map_new_to_old(new_pos)
        else {
            return false;
        };

        #[cfg(test)]
        println!("Trying to reuse node at pos {};: {:?}", new_pos, kind);

        // Walk the old tree toward `target_old_pos`, with a step bound so a
        // pathological tree cannot stall the parser.
        let mut steps = 0;
        const MAX_STEPS: usize = 100;

        loop {
            let start = inc.cursor.offset();
            let end = inc.cursor.end_offset();

            if start == target_old_pos {
                // Candidate: a node at the right offset, of the right kind,
                // untouched by any edit.
                if let Some(node) = inc.cursor.as_node() {
                    if node.kind == kind {
                        if !inc.is_dirty(start, end) {
                            // Verify tokens
                            let node_len = node.text_len() as usize;

                            // Check token stream
                            // The new tokens must tile the node's span exactly:
                            // contiguous from `new_pos`, ending precisely at
                            // `new_pos + node_len`.
                            let mut lookahead = 0;
                            let mut current_pos = new_pos;
                            let target_new_end = new_pos + node_len;

                            while let Some(t) = self.tokens.peek_at(lookahead) {
                                if t.span.start != current_pos {
                                    // Token stream doesn't match expected positions
                                    break;
                                }
                                current_pos = t.span.end;

                                if t.span.end == target_new_end {
                                    // Found the end!
                                    let tokens_consumed = lookahead + 1;
                                    // Promote the old node into the current arena.
                                    let new_node = deep_clone_node(node, self.sink.arena);
                                    self.sink.push_node(new_node);
                                    self.tokens.set_index(current_index + tokens_consumed);
                                    inc.cursor.step_over();
                                    return true;
                                }
                                else if t.span.end > target_new_end {
                                    // A token straddles the node boundary: cannot reuse.
                                    break;
                                }
                                lookahead += 1;
                            }
                        }
                    }
                }
                // This node did not qualify; try a smaller node at the same offset.
                if !inc.cursor.step_into() && !inc.cursor.step_next() {
                    return false;
                }
            }
            else if start < target_old_pos && end > target_old_pos {
                // Target lies inside this node: descend, or move on at a leaf.
                if !inc.cursor.step_into() && !inc.cursor.step_next() {
                    return false;
                }
            }
            else if end <= target_old_pos {
                // Target is past this node: advance to the next sibling.
                if !inc.cursor.step_next() {
                    return false;
                }
            }
            else {
                // Cursor has moved beyond the target; no node can match.
                return false;
            }

            steps += 1;
            if steps > MAX_STEPS {
                return false;
            }
        }
    }
621
622    /// Finishes the parsing process and returns the final output.
623    pub fn finish(self, result: Result<&'a GreenNode<'a, L>, OakError>) -> crate::parser::ParseOutput<'a, L> {
624        crate::errors::OakDiagnostics { result, diagnostics: self.errors }
625    }
626
627    /// Wraps the parsing of a node with incremental reuse support.
628    ///
629    /// This method first attempts to reuse an existing node of the given kind from the previous tree.
630    /// If successful, it skips the provided closure and returns `Ok(())`.
631    /// Otherwise, it creates a checkpoint, runs the closure to parse the node's content,
632    /// and then finishes the node with the specified kind.
633    pub fn incremental_node<F>(&mut self, kind: L::ElementType, f: F) -> Result<(), OakError>
634    where
635        F: FnOnce(&mut Self) -> Result<(), OakError>,
636    {
637        if self.try_reuse(kind) {
638            return Ok(());
639        };
640
641        let checkpoint = self.checkpoint();
642        let res = f(self);
643        self.finish_at(checkpoint, kind);
644        res
645    }
646
647    /// Wraps the parsing of an optional node with incremental reuse support.
648    ///
649    /// Similar to `incremental_node`, but the closure returns a boolean indicating
650    /// if the node was actually present.
651    pub fn incremental_opt<F>(&mut self, kind: L::ElementType, f: F) -> Result<bool, OakError>
652    where
653        F: FnOnce(&mut Self) -> Result<bool, OakError>,
654    {
655        if self.try_reuse(kind) {
656            return Ok(true);
657        };
658
659        let checkpoint = self.checkpoint();
660        match f(self) {
661            Ok(true) => {
662                self.finish_at(checkpoint, kind);
663                Ok(true)
664            }
665            Ok(false) => Ok(false),
666            Err(e) => {
667                self.finish_at(checkpoint, kind);
668                Err(e)
669            }
670        }
671    }
672}