oak_core/parser/state.rs

use crate::{
    Language,
    errors::OakError,
    lexer::{LexOutput, Token, Tokens},
    memory::arena::SyntaxArena,
    source::Source,
    tree::{Cursor, GreenLeaf, GreenNode, GreenTree},
};
use core::range::Range;
use triomphe::Arc;

/// Deep-clones a node into the given arena.
///
/// This is used to "promote" nodes from a previous generation's arena to the current one,
/// ensuring that the new tree is self-contained and has good memory locality.
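///
/// # Example
///
/// A minimal sketch, assuming `MyLang` is a `Language` implementation and
/// `SyntaxArena::new()` constructs an empty arena (both names are
/// illustrative, not part of this module):
///
/// ```rust,ignore
/// let new_arena = SyntaxArena::new();
/// // `old_node` was allocated in a previous generation's arena.
/// let promoted = deep_clone_node(old_node, &new_arena);
/// // The clone preserves structure but lives entirely in `new_arena`.
/// assert_eq!(promoted.kind, old_node.kind);
/// ```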
pub fn deep_clone_node<'a, L: Language>(node: &'a GreenNode<'a, L>, arena: &'a SyntaxArena) -> &'a GreenNode<'a, L> {
    // Recursively clone child nodes; leaves are `Copy` and can be reused directly.
    let children_iter = node.children.iter().map(|child| match child {
        GreenTree::Node(n) => GreenTree::Node(deep_clone_node(n, arena)),
        GreenTree::Leaf(l) => GreenTree::Leaf(*l),
    });

    // Allocate the cloned children, then the new node itself, in the target arena.
    let slice = arena.alloc_slice_fill_iter(node.children.len(), children_iter);
    let new_node = GreenNode::new(node.kind, slice);
    arena.alloc(new_node)
}

/// Provides tokens to the parser.
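///
/// # Example
///
/// A sketch of cursor-style iteration, assuming `tokens` is a `Tokens<MyLang>`
/// produced by the lexer (`MyLang` is a hypothetical `Language`):
///
/// ```rust,ignore
/// let mut source = TokenSource::new(tokens);
/// while let Some(token) = source.current() {
///     // ... inspect `token` ...
///     source.advance();
/// }
/// assert!(source.is_end());
/// ```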
pub struct TokenSource<L: Language> {
    tokens: Tokens<L>,
    index: usize,
}

impl<L: Language> Clone for TokenSource<L> {
    fn clone(&self) -> Self {
        Self { tokens: self.tokens.clone(), index: self.index }
    }
}
impl<L: Language> TokenSource<L> {
    /// Creates a new token source.
    pub fn new(tokens: Tokens<L>) -> Self {
        Self { tokens, index: 0 }
    }

    /// Gets the current token.
    #[inline]
    pub fn current(&self) -> Option<&Token<L::TokenType>> {
        self.tokens.get(self.index)
    }

    /// Peeks at a token at the specified offset from the current position.
    #[inline]
    pub fn peek_at(&self, offset: usize) -> Option<&Token<L::TokenType>> {
        self.tokens.get(self.index + offset)
    }

    /// Advances the current position to the next token.
    #[inline]
    pub fn advance(&mut self) {
        self.index += 1;
    }

    /// Checks if the token source has reached the end.
    #[inline]
    pub fn is_end(&self) -> bool {
        self.index >= self.tokens.len()
    }

    /// Gets the current index.
    #[inline]
    pub fn index(&self) -> usize {
        self.index
    }

    /// Sets the current index.
    #[inline]
    pub fn set_index(&mut self, index: usize) {
        self.index = index;
    }
}

/// Collects the results of the parsing process.
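///
/// # Example
///
/// A minimal sketch of building a one-node tree; `MyLang`, `MyToken`, and
/// `MyElement` are hypothetical kinds, not part of this module:
///
/// ```rust,ignore
/// let mut sink = TreeSink::<MyLang>::new(&arena, 16);
/// let cp = sink.checkpoint();
/// sink.push_leaf(MyToken::Ident, 3); // a 3-byte identifier token
/// let node = sink.finish_node(cp, MyElement::Name);
/// assert_eq!(node.children.len(), 1);
/// ```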
pub struct TreeSink<'a, L: Language> {
    arena: &'a SyntaxArena,
    children: Vec<GreenTree<'a, L>>,
}

impl<'a, L: Language> TreeSink<'a, L> {
    /// Creates a new tree sink.
    pub fn new(arena: &'a SyntaxArena, capacity_hint: usize) -> Self {
        Self { arena, children: Vec::with_capacity(capacity_hint) }
    }

    /// Pushes a leaf node (token) to the current list of children.
    pub fn push_leaf(&mut self, kind: L::TokenType, len: usize) {
        self.children.push(GreenTree::Leaf(GreenLeaf::new(kind, len as u32)));
    }

    /// Pushes an existing node to the current list of children.
    pub fn push_node(&mut self, node: &'a GreenNode<'a, L>) {
        self.children.push(GreenTree::Node(node));
    }

    /// Returns the current number of children, serving as a checkpoint for finishing a node later.
    pub fn checkpoint(&self) -> usize {
        self.children.len()
    }

    /// Finishes a node starting from the given checkpoint and adds it as a child.
    pub fn finish_node(&mut self, checkpoint: usize, kind: L::ElementType) -> &'a GreenNode<'a, L> {
        // Move the children gathered since `checkpoint` into the arena, then
        // replace them with the single finished node that now owns them.
        let children_slice = self.arena.alloc_slice_copy(&self.children[checkpoint..]);
        self.children.truncate(checkpoint);
        let node = GreenNode::new(kind, children_slice);
        let node_ref = self.arena.alloc(node);
        self.children.push(GreenTree::Node(node_ref));
        node_ref
    }
}

/// Encapsulates incremental parsing logic.
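///
/// Delta bookkeeping: edits are kept sorted by start position, and each entry
/// stores the *cumulative* byte delta after that edit and all earlier ones.
/// For example, replacing old bytes `[10, 15)` with 8 bytes (delta +3) and
/// then deleting `[30, 40)` (delta -10) is stored as
/// `[(10..15, +3), (30..40, -7)]`.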
pub struct IncrementalContext<'a, L: Language> {
    cursor: Cursor<'a, L>,
    /// Sorted list of edits with their accumulated deltas.
    /// Each entry contains the old range and the cumulative delta *after* this edit.
    edits: Vec<(Range<usize>, isize)>,
}

impl<'a, L: Language> IncrementalContext<'a, L> {
    /// Creates a new incremental parsing context.
    pub fn new(old_root: &'a GreenNode<'a, L>, edits: &[crate::source::TextEdit]) -> Self {
        let mut sorted_edits: Vec<_> = edits.iter().collect();
        sorted_edits.sort_by_key(|e| e.span.start);

        let mut cumulative_delta = 0isize;
        let mut processed_edits = Vec::with_capacity(edits.len());

        for edit in sorted_edits {
            let old_len = edit.span.end - edit.span.start;
            let new_len = edit.text.len();
            cumulative_delta += new_len as isize - old_len as isize;
            processed_edits.push((edit.span, cumulative_delta));
        }

        Self { cursor: Cursor::new(old_root), edits: processed_edits }
    }

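    /// Maps a byte position in the *new* text back to the corresponding
    /// position in the *old* text, or `None` if the position falls inside an
    /// edited (dirty) range. With the single edit `[10, 15)` -> 8 bytes from
    /// the example above, new position 20 maps to old position 17, while new
    /// position 12 lies inside the edit and yields `None`.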
    fn map_new_to_old(&self, new_pos: usize) -> Option<usize> {
        let mut current_delta = 0isize;

        for (old_range, cumulative_delta) in &self.edits {
            // This edit's extent in *new* coordinates.
            let new_start = (old_range.start as isize + current_delta) as usize;
            let edit_delta = *cumulative_delta - current_delta;
            let new_end = (old_range.end as isize + current_delta + edit_delta) as usize;

            if new_pos < new_start {
                // Before this edit: only earlier edits shift the position.
                return Some((new_pos as isize - current_delta) as usize);
            }
            else if new_pos < new_end {
                // Inside a dirty range
                return None;
            }

            current_delta = *cumulative_delta;
        }

        Some((new_pos as isize - current_delta) as usize)
    }

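    /// Returns `true` if the old-tree byte range `[old_start, old_end)`
    /// overlaps any edited range.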
    fn is_dirty(&self, old_start: usize, old_end: usize) -> bool {
        for (old_range, _) in &self.edits {
            // Check for overlap between [old_start, old_end) and [old_range.start, old_range.end)
            if old_start < old_range.end && old_end > old_range.start {
                return true;
            }
        }
        false
    }
}

/// High-level API for parsers, coordinating token supply and tree construction.
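///
/// # Example
///
/// A sketch of a typical driver loop. `MyLang`, `MyElement`, and `lex` are
/// hypothetical stand-ins for a concrete `Language`, its element kinds, and a
/// lexer entry point:
///
/// ```rust,ignore
/// let arena = SyntaxArena::new();
/// let mut state = ParserState::new(&arena, lex(&source), &source, 1024);
/// let cp = state.checkpoint();
/// while state.not_at_end() {
///     state.bump();
/// }
/// let root = state.finish_at(cp, MyElement::Root);
/// let output = state.finish(Ok(root));
/// ```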
pub struct ParserState<'a, L: Language, S: Source + ?Sized = crate::source::SourceText> {
    /// The token stream being parsed.
    pub tokens: TokenSource<L>,

    /// The sink where the syntax tree is constructed.
    pub sink: TreeSink<'a, L>,

    /// Optional context for incremental parsing.
    pub incremental: Option<IncrementalContext<'a, L>>,

    /// Collection of errors encountered during parsing.
    pub errors: Vec<OakError>,
    /// Reference to the source text, kept for error reporting and offset calculation.
    pub source: &'a S,
}

impl<'a, L: Language, S: Source + ?Sized> ParserState<'a, L, S> {
    /// Returns the source file's URL, if available.
    pub fn source_url(&self) -> Option<crate::source::Url> {
        self.source.url()
    }

    /// Creates a new parser state.
    ///
    /// # Arguments
    /// * `arena` - The syntax arena to allocate nodes in.
    /// * `lex_output` - The output from the lexer, including tokens and any lexical errors.
    /// * `source` - The source text being parsed.
    /// * `capacity_hint` - Initial capacity hint for the tree sink's child vector.
    pub fn new(arena: &'a SyntaxArena, lex_output: LexOutput<L>, source: &'a S, capacity_hint: usize) -> Self {
        // A failed lex yields an empty token stream; its error is still recorded.
        let (tokens, mut errors) = match lex_output.result {
            Ok(tokens) => (tokens, Vec::new()),
            Err(e) => (Arc::from(Vec::new()), vec![e]),
        };
        errors.extend(lex_output.diagnostics);

        Self { tokens: TokenSource::new(tokens), sink: TreeSink::new(arena, capacity_hint), incremental: None, errors, source }
    }

    /// Creates a nested parser state that shares the same arena and source.
    /// This is useful for parsing sub-structures that should be independent but part of the same overall tree.
    pub fn nested(&self) -> ParserState<'a, L, S> {
        ParserState { tokens: self.tokens.clone(), sink: TreeSink::new(self.sink.arena, 1024), incremental: None, errors: Vec::new(), source: self.source }
    }

    /// Returns the text content of the current token.
    pub fn peek_text(&self) -> Option<std::borrow::Cow<'_, str>> {
        self.current().map(|t| self.source.get_text_in(t.span))
    }

    /// Manually promotes a node from a previous generation to the current one.
    /// This is useful for improving memory locality.
    pub fn promote(&mut self, node: &'a GreenNode<'a, L>) -> &'a GreenNode<'a, L> {
        deep_clone_node(node, self.sink.arena)
    }

    /// Sets the incremental parsing context.
    ///
    /// # Arguments
    /// * `old` - The root of the previous version of the syntax tree.
    /// * `edits` - The edits that were applied to the source text.
    pub fn set_incremental(&mut self, old: &'a GreenNode<'a, L>, edits: &[crate::source::TextEdit]) {
        self.incremental = Some(IncrementalContext::new(old, edits));
    }

    // --- Error Reporting ---

    /// Returns the current byte offset.
    pub fn current_offset(&self) -> usize {
        self.tokens.current().map(|t| t.span.start).unwrap_or_else(|| self.source.length())
    }

    /// Records a syntax error with the given message.
    pub fn syntax_error(&mut self, message: impl Into<String>) -> OakError {
        let err = OakError::syntax_error(message, self.current_offset(), self.source.url());
        self.errors.push(err.clone());
        err
    }

    /// Records an unexpected token error.
    pub fn record_unexpected_token(&mut self, token: impl Into<String>) {
        let err = OakError::unexpected_token(token, self.current_offset(), self.source.url());
        self.errors.push(err);
    }

    /// Records an expected token error.
    pub fn record_expected(&mut self, expected: impl Into<String>) {
        let err = OakError::expected_token(expected, self.current_offset(), self.source.url());
        self.errors.push(err);
    }

    /// Records an expected name error.
    pub fn record_expected_name(&mut self, name_kind: impl Into<String>) {
        let err = OakError::expected_name(name_kind, self.current_offset(), self.source.url());
        self.errors.push(err);
    }

    /// Records a "trailing comma not allowed" error.
    pub fn record_trailing_comma_not_allowed(&mut self) {
        let err = OakError::trailing_comma_not_allowed(self.current_offset(), self.source.url());
        self.errors.push(err);
    }

    /// Records an unexpected end of file error.
    pub fn record_unexpected_eof(&mut self) {
        let err = OakError::unexpected_eof(self.source.length(), self.source.url());
        self.errors.push(err);
    }

    /// Records an unexpected end of file error and returns it.
    pub fn unexpected_eof(&mut self) -> OakError {
        let err = OakError::unexpected_eof(self.source.length(), self.source.url());
        self.errors.push(err.clone());
        err
    }

    // --- Token Navigation ---

    /// Returns the current token.
    #[inline]
    pub fn current(&self) -> Option<&Token<L::TokenType>> {
        self.tokens.current()
    }

    /// Returns the kind of the current token.
    #[inline]
    pub fn peek_kind(&self) -> Option<L::TokenType> {
        self.current().map(|t| t.kind)
    }

    /// Peeks at a token at the specified offset from the current position.
    #[inline]
    pub fn peek_at(&self, offset: usize) -> Option<&Token<L::TokenType>> {
        self.tokens.peek_at(offset)
    }

    /// Returns the kind of the token at the specified offset from the current position.
    #[inline]
    pub fn peek_kind_at(&self, offset: usize) -> Option<L::TokenType> {
        self.tokens.peek_at(offset).map(|t| t.kind)
    }

    /// Checks if the parser has not yet reached the end of the token stream.
    #[inline]
    pub fn not_at_end(&self) -> bool {
        !self.tokens.is_end()
    }

    /// Checks if the current token is of the specified kind.
    #[inline]
    pub fn at(&self, kind: L::TokenType) -> bool {
        self.peek_kind() == Some(kind)
    }

    /// Advances the current position to the next token.
    pub fn advance(&mut self) {
        self.tokens.advance();
    }

    /// Advances until a token of the specified kind is found, or the end of the token stream is reached.
    pub fn advance_until(&mut self, kind: L::TokenType) {
        while self.not_at_end() && !self.at(kind) {
            self.advance();
        }
    }

    /// Advances until any token of the specified kinds is found, or the end of the token stream is reached.
    pub fn advance_until_any(&mut self, kinds: &[L::TokenType]) {
        while self.not_at_end() && !kinds.iter().any(|&k| self.at(k)) {
            self.advance();
        }
    }

    /// Consumes the current token and adds it to the syntax tree as a leaf node.
    pub fn bump(&mut self)
    where
        L::ElementType: From<L::TokenType>,
    {
        if let Some(token) = self.current() {
            self.sink.push_leaf(token.kind, token.length());
            self.tokens.advance();
        }
    }

    /// Consumes the current token if it matches the specified kind.
    /// Returns `true` if the token was consumed, `false` otherwise.
    pub fn eat(&mut self, kind: L::TokenType) -> bool
    where
        L::ElementType: From<L::TokenType>,
    {
        if self.at(kind) {
            self.bump();
            true
        }
        else {
            false
        }
    }

    /// Expects the current token to be of the specified kind, consuming it if it matches.
    /// If the token does not match, an error is recorded and returned.
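    ///
    /// # Example
    ///
    /// A sketch of the usual `eat`/`expect` pattern, with hypothetical
    /// `MyToken` kinds:
    ///
    /// ```rust,ignore
    /// // Optional leading keyword, then a mandatory identifier.
    /// state.eat(MyToken::Pub);
    /// state.expect(MyToken::Ident)?;
    /// ```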
    pub fn expect(&mut self, kind: L::TokenType) -> Result<(), OakError>
    where
        L::ElementType: From<L::TokenType>,
    {
        if self.eat(kind) {
            Ok(())
        }
        else {
            let err = OakError::expected_token(format!("{:?}", kind), self.current_offset(), self.source.url());
            self.errors.push(err.clone());
            Err(err)
        }
    }

    // --- Tree Construction ---

    /// Creates a checkpoint in the tree construction process, which can be used to finish a node later.
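    ///
    /// # Example
    ///
    /// A sketch of the checkpoint/finish pattern, with hypothetical `MyToken`
    /// and `MyElement` kinds:
    ///
    /// ```rust,ignore
    /// let cp = state.checkpoint();
    /// state.expect(MyToken::LParen)?;
    /// while state.not_at_end() && !state.at(MyToken::RParen) {
    ///     state.bump();
    /// }
    /// state.expect(MyToken::RParen)?;
    /// state.finish_at(cp, MyElement::ParenExpr);
    /// ```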
    pub fn checkpoint(&self) -> usize {
        self.sink.checkpoint()
    }

    /// Finishes a node starting from the given checkpoint and adds it as a child.
    pub fn finish_at(&mut self, checkpoint: usize, kind: L::ElementType) -> &'a GreenNode<'a, L> {
        self.sink.finish_node(checkpoint, kind)
    }

    /// Adds an existing node as a child to the current node.
    pub fn push_child(&mut self, node: &'a GreenNode<'a, L>) {
        self.sink.push_node(node);
    }

    /// Attempts to reuse a previous syntax node at the current position.
    /// This is the core of incremental parsing.
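    ///
    /// Reuse succeeds only when all of the following hold: the current
    /// position maps cleanly back to the old tree (it is not inside an edited
    /// range), the old tree has a node of the same `kind` starting at that
    /// position, the node does not overlap any edit, and the new token stream
    /// tiles the node's byte range exactly. On success the node is promoted
    /// into the current arena and the token cursor skips past it. The walk
    /// through the old tree is bounded to avoid pathological searches.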
    pub fn try_reuse(&mut self, kind: L::ElementType) -> bool {
        let Some(inc) = &mut self.incremental
        else {
            return false;
        };
        let current_index = self.tokens.index();
        let new_pos = self.tokens.current().map(|t| t.span.start).unwrap_or(self.source.length());

        #[cfg(test)]
        println!("try_reuse: kind={:?}, new_pos={}", kind, new_pos);

        // A position inside an edited range can never be reused.
        let Some(target_old_pos) = inc.map_new_to_old(new_pos)
        else {
            return false;
        };

        // Bounded walk through the old tree looking for a matching node.
        let mut steps = 0;
        const MAX_STEPS: usize = 100;

        loop {
            let start = inc.cursor.offset();
            let end = inc.cursor.end_offset();

            if start == target_old_pos {
                if let Some(node) = inc.cursor.as_node() {
                    if node.kind == kind {
                        if !inc.is_dirty(start, end) {
                            // Verify that the new token stream tiles the node's
                            // byte range exactly before committing to reuse.
                            let node_len = node.text_len() as usize;

                            let mut lookahead = 0;
                            let mut current_pos = new_pos;
                            let target_new_end = new_pos + node_len;

                            while let Some(t) = self.tokens.peek_at(lookahead) {
                                if t.span.start != current_pos {
                                    // Token stream doesn't match expected positions.
                                    break;
                                }
                                current_pos = t.span.end;

                                if t.span.end == target_new_end {
                                    // Found the end: promote the node and skip its tokens.
                                    let tokens_consumed = lookahead + 1;
                                    let new_node = deep_clone_node(node, self.sink.arena);
                                    self.sink.push_node(new_node);
                                    self.tokens.set_index(current_index + tokens_consumed);
                                    inc.cursor.step_over();
                                    return true;
                                }
                                else if t.span.end > target_new_end {
                                    break;
                                }
                                lookahead += 1;
                            }
                        }
                    }
                }
                // Same position but no reusable match: descend or move to a sibling.
                if !inc.cursor.step_into() && !inc.cursor.step_next() {
                    return false;
                }
            }
            else if start < target_old_pos && end > target_old_pos {
                // Target lies inside this node: descend, or move on.
                if !inc.cursor.step_into() && !inc.cursor.step_next() {
                    return false;
                }
            }
            else if end <= target_old_pos {
                // Target lies past this node: advance to the next sibling.
                if !inc.cursor.step_next() {
                    return false;
                }
            }
            else {
                // The cursor has overshot the target; give up.
                return false;
            }

            steps += 1;
            if steps > MAX_STEPS {
                return false;
            }
        }
    }

    /// Finishes the parsing process and returns the final output.
    pub fn finish(self, result: Result<&'a GreenNode<'a, L>, OakError>) -> crate::parser::ParseOutput<'a, L> {
        crate::errors::OakDiagnostics { result, diagnostics: self.errors }
    }

    /// Wraps the parsing of a node with incremental reuse support.
    ///
    /// This method first attempts to reuse an existing node of the given kind from the previous tree.
    /// If successful, it skips the provided closure and returns `Ok(())`.
    /// Otherwise, it creates a checkpoint, runs the closure to parse the node's content,
    /// and then finishes the node with the specified kind.
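    ///
    /// # Example
    ///
    /// A sketch with a hypothetical `MyElement` kind and `parse_fn_body`
    /// helper:
    ///
    /// ```rust,ignore
    /// state.incremental_node(MyElement::FnBody, |s| parse_fn_body(s))?;
    /// ```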
    pub fn incremental_node<F>(&mut self, kind: L::ElementType, f: F) -> Result<(), OakError>
    where
        F: FnOnce(&mut Self) -> Result<(), OakError>,
    {
        if self.try_reuse(kind) {
            return Ok(());
        }

        let checkpoint = self.checkpoint();
        let res = f(self);
        self.finish_at(checkpoint, kind);
        res
    }

    /// Wraps the parsing of an optional node with incremental reuse support.
    ///
    /// Similar to `incremental_node`, but the closure returns a boolean indicating
    /// whether the node was actually present.
    pub fn incremental_opt<F>(&mut self, kind: L::ElementType, f: F) -> Result<bool, OakError>
    where
        F: FnOnce(&mut Self) -> Result<bool, OakError>,
    {
        if self.try_reuse(kind) {
            return Ok(true);
        }

        let checkpoint = self.checkpoint();
        match f(self) {
            Ok(true) => {
                self.finish_at(checkpoint, kind);
                Ok(true)
            }
            Ok(false) => Ok(false),
            Err(e) => {
                // Finish the node even on error so partially parsed children stay wrapped.
                self.finish_at(checkpoint, kind);
                Err(e)
            }
        }
    }
}