//! `oak_core/parser/state.rs`
//!
//! Parser state machinery: token supply ([`TokenSource`]), green-tree
//! construction ([`TreeSink`]), and incremental-reuse support
//! ([`IncrementalContext`], [`ParserState`]).
use crate::{
    Language,
    errors::OakError,
    lexer::{LexOutput, Token, Tokens},
    memory::arena::SyntaxArena,
    source::Source,
    tree::{Cursor, GreenLeaf, GreenNode, GreenTree},
};
use core::range::Range;
use triomphe::Arc;
11
12/// Helper function to perform deep clone of a node into an arena.
13///
14/// This is used to "promote" nodes from a previous generation's arena to the current one,
15/// ensuring that the new tree is self-contained and has good memory locality.
16pub fn deep_clone_node<'a, L: Language>(node: &GreenNode<'_, L>, arena: &'a SyntaxArena) -> &'a GreenNode<'a, L> {
17    let children_iter = node.children.iter().map(|child| match child {
18        GreenTree::Node(n) => GreenTree::Node(deep_clone_node(n, arena)),
19        GreenTree::Leaf(l) => GreenTree::Leaf(*l),
20    });
21
22    let slice = arena.alloc_slice_fill_iter(node.children.len(), children_iter);
23    let new_node = GreenNode::new(node.kind, slice);
24    arena.alloc(new_node)
25}
26
/// Provides tokens to the parser.
pub struct TokenSource<L: Language> {
    /// The token stream produced by the lexer.
    tokens: Tokens<L>,
    /// Index of the current token within `tokens`.
    index: usize,
}
32
33impl<L: Language> Clone for TokenSource<L> {
34    fn clone(&self) -> Self {
35        Self { tokens: self.tokens.clone(), index: self.index }
36    }
37}
38
39impl<L: Language> TokenSource<L> {
40    /// Creates a new token source.
41    pub fn new(tokens: Tokens<L>) -> Self {
42        Self { tokens, index: 0 }
43    }
44
45    /// Gets the current token.
46    #[inline]
47    pub fn current(&self) -> Option<&Token<L::TokenType>> {
48        self.tokens.get(self.index)
49    }
50
51    /// Peeks at a token at the specified offset from the current position.
52    #[inline]
53    pub fn peek_at(&self, offset: usize) -> Option<&Token<L::TokenType>> {
54        self.tokens.get(self.index + offset)
55    }
56
57    /// Advances the current position to the next token.
58    #[inline]
59    pub fn advance(&mut self) {
60        self.index += 1;
61    }
62
63    /// Checks if the token source has reached the end.
64    #[inline]
65    pub fn is_end(&self) -> bool {
66        self.index >= self.tokens.len()
67    }
68
69    /// Gets the current index.
70    #[inline]
71    pub fn index(&self) -> usize {
72        self.index
73    }
74
75    /// Sets the current index.
76    #[inline]
77    pub fn set_index(&mut self, index: usize) {
78        self.index = index;
79    }
80}
81
/// Collects the results of the parsing process.
pub struct TreeSink<'a, L: Language> {
    /// Arena in which all green nodes and child slices are allocated.
    arena: &'a SyntaxArena,
    /// Flat stack of pending children; `finish_node` wraps a suffix of
    /// this stack into a new node.
    children: Vec<GreenTree<'a, L>>,
}
87
88impl<'a, L: Language> TreeSink<'a, L> {
89    /// Creates a new tree sink.
90    pub fn new(arena: &'a SyntaxArena, capacity_hint: usize) -> Self {
91        Self { arena, children: Vec::with_capacity(capacity_hint) }
92    }
93
94    /// Pushes a leaf node (token) to the current list of children.
95    pub fn push_leaf(&mut self, kind: L::TokenType, len: usize) {
96        self.children.push(GreenTree::Leaf(GreenLeaf::new(kind, len as u32)));
97    }
98
99    /// Pushes an existing node to the current list of children.
100    pub fn push_node(&mut self, node: &'a GreenNode<'a, L>) {
101        self.children.push(GreenTree::Node(node));
102    }
103
104    /// Returns the current number of children, serving as a checkpoint for finishing a node later.
105    pub fn checkpoint(&self) -> usize {
106        self.children.len()
107    }
108
109    /// Returns the syntax arena used by this sink.
110    pub fn arena(&self) -> &'a SyntaxArena {
111        self.arena
112    }
113
114    /// Restores the sink to a previous checkpoint by truncating the children list.
115    pub fn restore(&mut self, checkpoint: usize) {
116        self.children.truncate(checkpoint);
117    }
118
119    /// Finishes a node starting from the given checkpoint and adds it as a child.
120    pub fn finish_node(&mut self, checkpoint: usize, kind: L::ElementType) -> &'a GreenNode<'a, L> {
121        let children_slice = self.arena.alloc_slice_copy(&self.children[checkpoint..]);
122        self.children.truncate(checkpoint);
123        let node = GreenNode::new(kind, children_slice);
124        let node_ref = self.arena.alloc(node);
125        self.children.push(GreenTree::Node(node_ref));
126        node_ref
127    }
128}
129
/// Encapsulates incremental parsing logic.
pub struct IncrementalContext<'a, L: Language> {
    /// Cursor walking the previous generation's tree in search of reusable nodes.
    cursor: Cursor<'a, L>,
    /// Sorted list of edits with their accumulated deltas.
    /// Each entry contains the old range and the cumulative delta *after* this edit.
    edits: Vec<(Range<usize>, isize)>,
}
137
impl<'a, L: Language> IncrementalContext<'a, L> {
    /// Creates a new incremental parsing context.
    ///
    /// Edits are sorted by their start offset in the *old* text, and each is
    /// stored with the cumulative byte delta (`new_len - old_len` summed over
    /// this and all earlier edits). `map_new_to_old` relies on this ordering.
    pub fn new(old_root: &'a GreenNode<'a, L>, edits: &[crate::source::TextEdit]) -> Self {
        let mut sorted_edits: Vec<_> = edits.iter().collect();
        sorted_edits.sort_by_key(|e| e.span.start);

        let mut cumulative_delta = 0isize;
        let mut processed_edits = Vec::with_capacity(edits.len());

        for edit in sorted_edits {
            let old_len = edit.span.end - edit.span.start;
            let new_len = edit.text.len();
            cumulative_delta += new_len as isize - old_len as isize;
            processed_edits.push((edit.span, cumulative_delta));
        }

        Self { cursor: Cursor::new(old_root), edits: processed_edits }
    }

    /// Maps a byte offset in the *new* text back to the corresponding offset
    /// in the *old* text, or `None` if the position falls inside an edited
    /// ("dirty") region and therefore has no old-text equivalent.
    fn map_new_to_old(&self, new_pos: usize) -> Option<usize> {
        // Delta accumulated from all edits strictly before the one being inspected.
        let mut current_delta = 0isize;

        for (old_range, cumulative_delta) in &self.edits {
            // Where this edit's region starts in new-text coordinates.
            let new_start = (old_range.start as isize + current_delta) as usize;
            let edit_delta = *cumulative_delta - current_delta;
            // End of the replacement text in new-text coordinates;
            // algebraically this equals `new_start + replacement_len`.
            let new_end = (old_range.end as isize + current_delta + edit_delta) as usize;

            if new_pos < new_start {
                // Before this edit: only earlier edits shift the position.
                return Some((new_pos as isize - current_delta) as usize);
            }
            else if new_pos < new_end {
                // Inside a dirty range
                return None;
            }

            current_delta = *cumulative_delta;
        }

        // Past every edit: undo the total accumulated shift.
        Some((new_pos as isize - current_delta) as usize)
    }

    /// Returns `true` if the old-text range `[old_start, old_end)` overlaps
    /// any edited span.
    ///
    /// NOTE(review): a zero-length edit (pure insertion) whose position sits
    /// exactly at `old_start` or `old_end` does not register as overlapping
    /// under this strict check — confirm boundary insertions are meant to
    /// allow node reuse.
    fn is_dirty(&self, old_start: usize, old_end: usize) -> bool {
        for (old_range, _) in &self.edits {
            // Check for overlap between [old_start, old_end) and [old_range.start, old_range.end)
            if old_start < old_range.end && old_end > old_range.start {
                return true;
            }
        }
        false
    }
}
189
/// High-level API for parsers, coordinating token supply and tree construction.
pub struct ParserState<'a, L: Language, S: Source + ?Sized = crate::source::SourceText> {
    /// The token stream being parsed.
    pub tokens: TokenSource<L>,

    /// The sink where the syntax tree is constructed.
    pub sink: TreeSink<'a, L>,

    /// Optional context for incremental parsing; `None` for a full parse.
    pub incremental: Option<IncrementalContext<'a, L>>,

    /// Collection of errors encountered during parsing.
    pub errors: Vec<OakError>,
    /// We keep a reference to help with error reporting and offset calculation.
    pub source: &'a S,
}
206
207impl<'a, L: Language, S: Source + ?Sized> ParserState<'a, L, S> {
208    /// Returns the source file's ID, if available.
209    pub fn source_id(&self) -> Option<crate::source::SourceId> {
210        self.source.source_id()
211    }
212
213    /// Returns the syntax arena used by this parser.
214    pub fn arena(&self) -> &'a SyntaxArena {
215        self.sink.arena()
216    }
217
218    /// Creates a new parser state.
219    ///
220    /// # Arguments
221    /// * `arena` - The syntax arena to allocate nodes in.
222    /// * `lex_output` - The output from the lexer, including tokens and any lexical errors.
223    /// * `source` - The source text being parsed.
224    /// * `capacity_hint` - Initial capacity hint for the tree sink's child vector.
225    pub fn new(arena: &'a SyntaxArena, lex_output: LexOutput<L>, source: &'a S, capacity_hint: usize) -> Self {
226        let (tokens, mut errors) = match lex_output.result {
227            Ok(tokens) => (tokens, Vec::new()),
228            Err(e) => (Arc::from(Vec::new()), vec![e]),
229        };
230        errors.extend(lex_output.diagnostics);
231
232        Self { tokens: TokenSource::new(tokens), sink: TreeSink::new(arena, capacity_hint), incremental: None, errors, source }
233    }
234
235    /// Creates a nested parser state that shares the same arena and source.
236    /// This is useful for parsing sub-structures that should be independent but part of the same overall tree.
237    pub fn nested(&self) -> ParserState<'a, L, S> {
238        ParserState { tokens: self.tokens.clone(), sink: TreeSink::new(self.sink.arena, 1024), incremental: None, errors: Vec::new(), source: self.source }
239    }
240
241    /// Returns the text content of the current token.
242    pub fn peek_text(&self) -> Option<std::borrow::Cow<'_, str>> {
243        self.current().map(|t| self.source.get_text_in(t.span))
244    }
245
246    /// Manually promotes a node from a previous generation to the current one.
247    /// This is useful for improving memory locality.
248    pub fn promote(&mut self, node: &'a GreenNode<'a, L>) -> &'a GreenNode<'a, L> {
249        deep_clone_node(node, self.sink.arena)
250    }
251
252    /// Sets the incremental parsing context.
253    ///
254    /// # Arguments
255    /// * `old` - The root of the previous version of the syntax tree.
256    /// * `edits` - The edits that were applied to the source text.
257    pub fn set_incremental(&mut self, old: &'a GreenNode<'a, L>, edits: &[crate::source::TextEdit]) {
258        self.incremental = Some(IncrementalContext::new(old, edits));
259    }
260
261    // --- Error Reporting ---
262
263    /// Returns the current byte offset.
264    pub fn current_offset(&self) -> usize {
265        self.tokens.current().map(|t| t.span.start).unwrap_or_else(|| self.source.length())
266    }
267
268    /// Records a syntax error with the given message.
269    pub fn syntax_error(&mut self, message: impl Into<String>) -> OakError {
270        let err = OakError::syntax_error(message, self.current_offset(), self.source.source_id());
271        self.errors.push(err.clone());
272        err
273    }
274
275    /// Records an unexpected token error.
276    pub fn record_unexpected_token(&mut self, token: impl Into<String>) {
277        let err = OakError::unexpected_token(token, self.current_offset(), self.source.source_id());
278        self.errors.push(err);
279    }
280
281    /// Records an expected token error.
282    pub fn record_expected(&mut self, expected: impl Into<String>) {
283        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
284        let err = OakError::expected_token(expected, offset, self.source.source_id());
285        self.errors.push(err);
286    }
287
288    /// Records an expected name error.
289    pub fn record_expected_name(&mut self, name_kind: impl Into<String>) {
290        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
291        let err = OakError::expected_name(name_kind, offset, self.source.source_id());
292        self.errors.push(err);
293    }
294
295    /// Records a trailing comma not allowed error.
296    pub fn record_trailing_comma_not_allowed(&mut self) {
297        let offset = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
298        let err = OakError::trailing_comma_not_allowed(offset, self.source.source_id());
299        self.errors.push(err);
300    }
301
302    /// Records an unexpected end of file error.
303    pub fn record_unexpected_eof(&mut self) {
304        let offset = self.source.length();
305        let err = OakError::unexpected_eof(offset, self.source.source_id());
306        self.errors.push(err);
307    }
308
309    /// Records an unexpected end of file error and returns it.
310    pub fn unexpected_eof(&mut self) -> OakError {
311        let offset = self.source.length();
312        let err = OakError::unexpected_eof(offset, self.source.source_id());
313        self.errors.push(err.clone());
314        err
315    }
316
317    // --- Token Navigation ---
318
319    /// Returns the current token.
320    #[inline]
321    pub fn current(&self) -> Option<&Token<L::TokenType>> {
322        self.tokens.current()
323    }
324
325    /// Returns the kind of the current token.
326    #[inline]
327    pub fn peek_kind(&self) -> Option<L::TokenType> {
328        self.current().map(|t| t.kind)
329    }
330
331    /// Peeks at a token at the specified offset from the current position.
332    #[inline]
333    pub fn peek_at(&self, offset: usize) -> Option<&Token<L::TokenType>> {
334        self.tokens.peek_at(offset)
335    }
336
337    /// Returns the kind of the token at the specified offset from the current position.
338    #[inline]
339    pub fn peek_kind_at(&self, offset: usize) -> Option<L::TokenType> {
340        self.tokens.peek_at(offset).map(|t| t.kind)
341    }
342
343    /// Checks if the parser has not yet reached the end of the token stream.
344    #[inline]
345    pub fn not_at_end(&self) -> bool {
346        !self.tokens.is_end()
347    }
348
349    /// Checks if the current token is of the specified kind.
350    #[inline]
351    pub fn at(&self, kind: L::TokenType) -> bool {
352        self.peek_kind() == Some(kind)
353    }
354
355    /// Checks if the current token is NOT of the specified kind.
356    #[inline]
357    pub fn not_at(&self, kind: L::TokenType) -> bool {
358        !self.at(kind)
359    }
360
361    /// Advances the current position to the next token.
362    pub fn advance(&mut self) {
363        self.tokens.advance();
364    }
365
366    /// Advances until a token of the specified kind is found, or the end of the token stream is reached.
367    pub fn advance_until(&mut self, kind: L::TokenType) {
368        while self.not_at_end() && !self.at(kind) {
369            self.advance();
370        }
371    }
372
373    /// Advances until any token of the specified kinds is found, or the end of the token stream is reached.
374    pub fn advance_until_any(&mut self, kinds: &[L::TokenType]) {
375        while self.not_at_end() && !kinds.iter().any(|&k| self.at(k)) {
376            self.advance();
377        }
378    }
379
380    /// Consumes the current token and adds it to the syntax tree as a leaf node.
381    pub fn bump(&mut self)
382    where
383        L::ElementType: From<L::TokenType>,
384    {
385        if let Some(token) = self.current() {
386            self.sink.push_leaf(token.kind, token.length());
387            self.tokens.advance();
388        }
389    }
390
391    /// Consumes the current token if it matches the specified kind.
392    /// Returns `true` if the token was consumed, `false` otherwise.
393    pub fn eat(&mut self, kind: L::TokenType) -> bool
394    where
395        L::ElementType: From<L::TokenType>,
396    {
397        if self.at(kind) {
398            self.bump();
399            true
400        }
401        else {
402            false
403        }
404    }
405
406    /// Expects the current token to be of the specified kind, consuming it if it matches.
407    /// If the token does not match, an error is recorded and returned.
408    pub fn expect(&mut self, kind: L::TokenType) -> Result<(), OakError>
409    where
410        L::ElementType: From<L::TokenType>,
411    {
412        if self.eat(kind) {
413            Ok(())
414        }
415        else {
416            let err = OakError::expected_token(format!("{:?}", kind), self.current_offset(), self.source.source_id());
417            self.errors.push(err.clone());
418            Err(err)
419        }
420    }
421
422    /// Tries to parse a construct with a backtracking point.
423    pub fn try_parse<T, F>(&mut self, parser: F) -> Result<T, OakError>
424    where
425        F: FnOnce(&mut Self) -> Result<T, OakError>,
426    {
427        let checkpoint = self.checkpoint();
428        match parser(self) {
429            Ok(value) => Ok(value),
430            Err(err) => {
431                self.restore(checkpoint);
432                Err(err)
433            }
434        }
435    }
436
437    /// Tries to parse a construct with a backtracking point, preserving the error if it fails.
438    ///
439    /// This is similar to `try_parse`, but it keeps the error generated by the parser
440    /// instead of potentially discarding it or generating a generic one.
441    pub fn try_parse_with_error<T, F>(&mut self, parser: F) -> Result<T, OakError>
442    where
443        F: FnOnce(&mut Self) -> Result<T, OakError>,
444    {
445        let checkpoint = self.checkpoint();
446        match parser(self) {
447            Ok(value) => Ok(value),
448            Err(err) => {
449                self.restore(checkpoint);
450                Err(err)
451            }
452        }
453    }
454
455    // --- Tree Construction ---
456
457    /// Creates a checkpoint in the tree construction process, which can be used to finish a node later.
458    /// This checkpoint includes both the token index and the tree sink's current position for backtracking.
459    pub fn checkpoint(&self) -> (usize, usize) {
460        (self.tokens.index(), self.sink.checkpoint())
461    }
462
463    /// Restores the parser state to a previous checkpoint.
464    pub fn restore(&mut self, (token_index, tree_checkpoint): (usize, usize)) {
465        self.tokens.set_index(token_index);
466        self.sink.restore(tree_checkpoint);
467    }
468
469    /// Finishes a node starting from the given checkpoint and adds it as a child.
470    pub fn finish_at(&mut self, checkpoint: (usize, usize), kind: L::ElementType) -> &'a GreenNode<'a, L> {
471        self.sink.finish_node(checkpoint.1, kind)
472    }
473
474    /// Adds an existing node as a child to the current node.
475    pub fn push_child(&mut self, node: &'a GreenNode<'a, L>) {
476        self.sink.push_node(node);
477    }
478
479    /// Attempts to reuse a previous syntax node at the current position.
480    /// This is the core of incremental parsing.
481    pub fn try_reuse(&mut self, kind: L::ElementType) -> bool {
482        let Some(inc) = &mut self.incremental
483        else {
484            return false;
485        };
486        let current_index = self.tokens.index();
487        let new_pos = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());
488
489        #[cfg(test)]
490        println!("try_reuse: kind={:?}, new_pos={}", kind, new_pos);
491
492        let Some(target_old_pos) = inc.map_new_to_old(new_pos)
493        else {
494            return false;
495        };
496
497        #[cfg(test)]
498        println!("Trying to reuse node at pos {}: {:?}", new_pos, kind);
499
500        let mut steps = 0;
501        const MAX_STEPS: usize = 100;
502
503        loop {
504            let start = inc.cursor.offset();
505            let end = inc.cursor.end_offset();
506
507            if start == target_old_pos {
508                if let Some(node) = inc.cursor.as_node() {
509                    if node.kind == kind {
510                        if !inc.is_dirty(start, end) {
511                            // Verify tokens
512                            let node_len = node.text_len() as usize;
513
514                            // Check token stream
515                            let mut lookahead = 0;
516                            let mut current_pos = new_pos;
517                            let target_new_end = new_pos + node_len;
518
519                            while let Some(t) = self.tokens.peek_at(lookahead) {
520                                if t.span.start != current_pos {
521                                    // Token stream doesn't match expected positions
522                                    break;
523                                }
524                                current_pos = t.span.end;
525
526                                if t.span.end == target_new_end {
527                                    // Found the end!
528                                    let tokens_consumed = lookahead + 1;
529                                    let new_node = deep_clone_node(node, self.sink.arena);
530                                    self.sink.push_node(new_node);
531                                    self.tokens.set_index(current_index + tokens_consumed);
532                                    inc.cursor.step_over();
533                                    return true;
534                                }
535                                else if t.span.end > target_new_end {
536                                    break;
537                                }
538                                lookahead += 1;
539                            }
540                        }
541                    }
542                }
543                if !inc.cursor.step_into() && !inc.cursor.step_next() {
544                    return false;
545                }
546            }
547            else if start < target_old_pos && end > target_old_pos {
548                if !inc.cursor.step_into() && !inc.cursor.step_next() {
549                    return false;
550                }
551            }
552            else if end <= target_old_pos {
553                if !inc.cursor.step_next() {
554                    return false;
555                }
556            }
557            else {
558                return false;
559            }
560
561            steps += 1;
562            if steps > MAX_STEPS {
563                return false;
564            }
565        }
566    }
567
568    /// Finishes the parsing process and returns the final output.
569    pub fn finish(self, result: Result<&'a GreenNode<'a, L>, OakError>) -> crate::parser::ParseOutput<'a, L> {
570        crate::errors::OakDiagnostics { result, diagnostics: self.errors }
571    }
572
573    /// Wraps the parsing of a node with incremental reuse support.
574    ///
575    /// This method first attempts to reuse an existing node of the given kind from the previous tree.
576    /// If successful, it skips the provided closure and returns `Ok(())`.
577    /// Otherwise, it creates a checkpoint, runs the closure to parse the node's content,
578    /// and then finishes the node with the specified kind.
579    pub fn incremental_node<F>(&mut self, kind: L::ElementType, f: F) -> Result<(), OakError>
580    where
581        F: FnOnce(&mut Self) -> Result<(), OakError>,
582    {
583        if self.try_reuse(kind) {
584            return Ok(());
585        }
586
587        let checkpoint = self.checkpoint();
588        let res = f(self);
589        self.finish_at(checkpoint, kind);
590        res
591    }
592
593    /// Wraps the parsing of an optional node with incremental reuse support.
594    ///
595    /// Similar to `incremental_node`, but the closure returns a boolean indicating
596    /// if the node was actually present.
597    pub fn incremental_opt<F>(&mut self, kind: L::ElementType, f: F) -> Result<bool, OakError>
598    where
599        F: FnOnce(&mut Self) -> Result<bool, OakError>,
600    {
601        if self.try_reuse(kind) {
602            return Ok(true);
603        }
604
605        let checkpoint = self.checkpoint();
606        match f(self) {
607            Ok(true) => {
608                self.finish_at(checkpoint, kind);
609                Ok(true)
610            }
611            Ok(false) => Ok(false),
612            Err(e) => {
613                self.finish_at(checkpoint, kind);
614                Err(e)
615            }
616        }
617    }
618}