// sipha_parse/parser.rs

1//! High-level parser that evaluates grammar rules.
2//!
3//! The `Parser` is the main entry point for parsing token streams into concrete
4//! syntax trees (CST). It supports grammar-based parsing.
5
6use std::collections::{HashMap, HashSet};
7
8use crate::grammar::{GrammarRule, GrammarRuleParser, Precedence};
9use crate::state::ParserState;
10use builder_pattern::Builder;
11use sipha_core::traits::{GrammarContext, NodeId, RuleId, TokenKind};
12use sipha_error::ParseError;
13use sipha_tree::{CstKind, NodeChildren};
14
/// Main parser entry point.
///
/// The `Parser` evaluates registered grammar rules against a token stream and
/// builds a concrete syntax tree (CST). Rule results are memoized per rule
/// and position, and basic error recovery (skipping ahead to configured sync
/// tokens) is supported during sequence parsing.
///
/// NOTE(review): the docs advertise Pratt parsing, but `parse_pratt_expr`
/// currently always returns an error ("not yet integrated") — confirm before
/// relying on operator-precedence parsing.
///
/// # Performance Characteristics
///
/// - **Time Complexity**: O(n) for most grammars, where n is the number of tokens
/// - **Space Complexity**: O(n) for the CST arena
/// - **Memoization**: Enabled by default for recursive grammars
///
/// # Example
///
/// ```rust
/// use sipha_core::{Parser, helpers, ParserState, cst::{NodeArena, RawNodeId}};
///
/// # #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
/// # enum Token { Ident, Plus }
/// # impl sipha_core::traits::TokenKind for Token {
/// #     fn is_trivia(&self) -> bool { false }
/// # }
/// # #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
/// # enum Rule { Expr }
/// let mut parser = Parser::create();
/// parser.register_rule(
///     Rule::Expr,
///     helpers::seq(vec![
///         helpers::token(Token::Ident),
///         helpers::token(Token::Plus),
///         helpers::token(Token::Ident),
///     ]),
/// );
/// ```
#[derive(Builder)]
pub struct Parser<K: TokenKind, R: RuleId> {
    /// Grammar rules registered with the parser, keyed by rule identifier.
    #[default(HashMap::new())]
    grammar_rules: HashMap<R, GrammarRule<K, R>>,
    /// Synchronization tokens for error recovery; when empty, recovery falls
    /// back to skipping a bounded number of trivia tokens.
    #[default(HashSet::new())]
    sync_tokens: HashSet<K>,
    /// Maximum depth for error recovery attempts; recovery is refused beyond
    /// this depth to bound worst-case work.
    #[default(10)]
    max_recovery_depth: usize,
}
61
62impl<K: TokenKind, R: RuleId> Parser<K, R> {
63    /// Create a new parser with default settings.
64    ///
65    /// For builder pattern usage, use `Parser::new()` which returns a builder.
66    pub fn create() -> Self {
67        Self::default()
68    }
69
70    /// Set the maximum depth for error recovery attempts.
71    pub fn set_max_recovery_depth(&mut self, depth: usize) {
72        self.max_recovery_depth = depth;
73    }
74
75    /// Register a grammar rule.
76    ///
77    /// Validates the rule before registration (e.g., ensures range min <= max).
78    /// Invalid rules will cause a panic to prevent silent failures.
79    pub fn register_rule(&mut self, rule_id: R, rule: GrammarRule<K, R>) {
80        if let Err(err) = crate::grammar::validate_rule(&rule) {
81            panic!("Invalid grammar rule registered: {:?}", err);
82        }
83        self.grammar_rules.insert(rule_id, rule);
84    }
85
86    /// Configure synchronization tokens for error recovery.
87    pub fn set_sync_tokens(&mut self, tokens: Vec<K>) {
88        self.sync_tokens = tokens.into_iter().collect();
89    }
90
91    /// Parse a top-level rule by identifier.
92    #[must_use = "parse results should be checked for errors"]
93    pub fn parse_rule<'tokens, 'arena, N, Ctx>(
94        &mut self,
95        rule_id: R,
96        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
97    ) -> Result<N, ParseError<K, R>>
98    where
99        N: NodeId,
100        Ctx: GrammarContext,
101    {
102        let start_pos = state.position();
103        if let Some(memo_entry) = state.check_memo(rule_id) {
104            match memo_entry {
105                sipha_memo::MemoEntry::Success { node, consumed } => {
106                    let node = *node;
107                    let consumed = *consumed;
108                    state.reset(start_pos + consumed);
109                    return Ok(node);
110                }
111                sipha_memo::MemoEntry::Failure { error, consumed } => {
112                    let error = error.clone();
113                    let consumed = *consumed;
114                    state.reset(start_pos + consumed);
115                    return Err(error);
116                }
117            }
118        }
119
120        state.push_rule(rule_id);
121        let result = {
122            let rule = self.grammar_rules.get(&rule_id).cloned().ok_or_else(|| {
123                ParseError::InvalidContext {
124                    rule: "unknown rule",
125                    span: state.current_span(),
126                    rule_context: state.current_rule(),
127                    context_stack: state.build_context_stack(),
128                }
129            })?;
130            self.parse_grammar_rule(rule_id, &rule, state)
131        };
132
133        let consumed = state.position() - start_pos;
134        match &result {
135            Ok(node) => state.store_memo_success(rule_id, start_pos, *node, consumed),
136            Err(error) => state.store_memo_failure(rule_id, start_pos, error.clone(), consumed),
137        }
138
139        state.pop_rule();
140        result
141    }
142
143    /// Attempt to recover from a parse error by scanning for sync tokens.
144    fn try_recover<'tokens, 'arena, N, Ctx>(
145        &self,
146        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
147        recovery_depth: usize,
148    ) -> bool
149    where
150        N: NodeId,
151        Ctx: GrammarContext,
152    {
153        if recovery_depth >= self.max_recovery_depth {
154            return false;
155        }
156
157        let start_pos = state.position();
158
159        if !self.sync_tokens.is_empty() {
160            let mut offset = 0;
161            let max_scan = 100;
162
163            while offset < max_scan {
164                if let Some(token) = state.peek(offset) {
165                    if self.sync_tokens.contains(&token.kind) {
166                        state.reset(start_pos + offset);
167                        return true;
168                    }
169                    offset += 1;
170                } else {
171                    break;
172                }
173            }
174        }
175
176        let mut offset = 1;
177        let max_skip = 10;
178
179        while offset < max_skip {
180            if let Some(token) = state.peek(offset) {
181                if !token.kind.is_trivia() {
182                    state.reset(start_pos + offset);
183                    return true;
184                }
185                offset += 1;
186            } else {
187                break;
188            }
189        }
190
191        false
192    }
193
194    fn parse_sequence<'tokens, 'arena, N, Ctx>(
195        &mut self,
196        rule_id: R,
197        rules: &[GrammarRule<K, R>],
198        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
199    ) -> Result<N, ParseError<K, R>>
200    where
201        N: NodeId,
202        Ctx: GrammarContext,
203    {
204        self.parse_sequence_with_recovery(rule_id, rules, state, 0)
205    }
206
207    fn parse_sequence_with_recovery<'tokens, 'arena, N, Ctx>(
208        &mut self,
209        rule_id: R,
210        rules: &[GrammarRule<K, R>],
211        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
212        recovery_depth: usize,
213    ) -> Result<N, ParseError<K, R>>
214    where
215        N: NodeId,
216        Ctx: GrammarContext,
217    {
218        let start_pos = state.position();
219        let mut builder = state.start_node(rule_id);
220        for rule in rules {
221            match self.parse_grammar_rule(rule_id, rule, state) {
222                Ok(child) => {
223                    builder.node(child);
224                }
225                Err(err) => {
226                    let error_span = state.current_span();
227                    if self.try_recover(state, recovery_depth) {
228                        let error_node = state.alloc_node(
229                            CstKind::Error("parse error"),
230                            error_span,
231                            NodeChildren::new(),
232                        );
233                        builder.node(error_node);
234                    } else {
235                        state.reset(start_pos);
236                        return Err(err);
237                    }
238                }
239            }
240        }
241        Ok(builder.finish(state))
242    }
243
244    fn parse_choice<'tokens, 'arena, N, Ctx>(
245        &mut self,
246        rule_id: R,
247        rules: &[GrammarRule<K, R>],
248        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
249    ) -> Result<N, ParseError<K, R>>
250    where
251        N: NodeId,
252        Ctx: GrammarContext,
253    {
254        if rules.is_empty() {
255            return Err(ParseError::InvalidContext {
256                rule: "empty choice",
257                span: state.current_span(),
258                rule_context: state.current_rule(),
259                context_stack: state.build_context_stack(),
260            });
261        }
262
263        let start = state.position();
264        let mut errors: Vec<ParseError<K, R>> = Vec::new();
265
266        for rule in rules {
267            if let GrammarRule::Token(expected_kind) = rule {
268                if state.at(*expected_kind) {
269                    match self.parse_grammar_rule(rule_id, rule, state) {
270                        Ok(node) => return Ok(node),
271                        Err(err) => {
272                            errors.push(err);
273                            state.reset(start);
274                        }
275                    }
276                    continue;
277                } else {
278                    errors.push(ParseError::Expected {
279                        expected: sipha_error::Expected::Single(*expected_kind),
280                        found: state.peek(0).map(|t| t.kind),
281                        span: state.current_span(),
282                        rule_context: state.current_rule(),
283                        context_stack: state.build_context_stack(),
284                    });
285                    continue;
286                }
287            }
288
289            match self.parse_grammar_rule(rule_id, rule, state) {
290                Ok(node) => return Ok(node),
291                Err(err) => {
292                    errors.push(err);
293                    state.reset(start);
294                }
295            }
296        }
297
298        if errors.is_empty() {
299            Err(ParseError::UnexpectedEof {
300                span: state.current_span(),
301                rule_context: state.current_rule(),
302                context_stack: state.build_context_stack(),
303            })
304        } else if errors.len() == 1 {
305            Err(errors.into_iter().next().unwrap())
306        } else {
307            Err(ParseError::MultipleAlternatives {
308                alternatives: errors,
309                span: state.current_span(),
310                rule_context: state.current_rule(),
311                context_stack: state.build_context_stack(),
312            })
313        }
314    }
315
316    fn parse_optional<'tokens, 'arena, N, Ctx>(
317        &mut self,
318        rule_id: R,
319        rule: &GrammarRule<K, R>,
320        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
321    ) -> Result<N, ParseError<K, R>>
322    where
323        N: NodeId,
324        Ctx: GrammarContext,
325    {
326        let start = state.position();
327        match self.parse_grammar_rule(rule_id, rule, state) {
328            Ok(node) => Ok(node),
329            Err(_) => {
330                state.reset(start);
331                Ok(state.alloc_node(
332                    CstKind::Rule(rule_id),
333                    state.current_span(),
334                    NodeChildren::new(),
335                ))
336            }
337        }
338    }
339
340    fn parse_zero_or_more<'tokens, 'arena, N, Ctx>(
341        &mut self,
342        rule_id: R,
343        rule: &GrammarRule<K, R>,
344        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
345    ) -> Result<N, ParseError<K, R>>
346    where
347        N: NodeId,
348        Ctx: GrammarContext,
349    {
350        let mut builder = state.start_node(rule_id);
351        while let Ok(child) = self.parse_grammar_rule(rule_id, rule, state) {
352            builder.node(child);
353        }
354        Ok(builder.finish(state))
355    }
356
357    fn parse_one_or_more<'tokens, 'arena, N, Ctx>(
358        &mut self,
359        rule_id: R,
360        rule: &GrammarRule<K, R>,
361        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
362    ) -> Result<N, ParseError<K, R>>
363    where
364        N: NodeId,
365        Ctx: GrammarContext,
366    {
367        let mut builder = state.start_node(rule_id);
368        let first = self.parse_grammar_rule(rule_id, rule, state)?;
369        builder.node(first);
370        while let Ok(child) = self.parse_grammar_rule(rule_id, rule, state) {
371            builder.node(child);
372        }
373        Ok(builder.finish(state))
374    }
375
376    fn parse_range<'tokens, 'arena, N, Ctx>(
377        &mut self,
378        rule_id: R,
379        rule: &GrammarRule<K, R>,
380        min: usize,
381        max: usize,
382        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
383    ) -> Result<N, ParseError<K, R>>
384    where
385        N: NodeId,
386        Ctx: GrammarContext,
387    {
388        if min > max {
389            return Err(ParseError::InvalidContext {
390                rule: "range min > max",
391                span: state.current_span(),
392                rule_context: state.current_rule(),
393                context_stack: state.build_context_stack(),
394            });
395        }
396        let mut builder = state.start_node(rule_id);
397        let mut count = 0;
398
399        while count < max {
400            match self.parse_grammar_rule(rule_id, rule, state) {
401                Ok(node) => {
402                    builder.node(node);
403                    count += 1;
404                }
405                Err(err) => {
406                    if count < min {
407                        return Err(err);
408                    }
409                    break;
410                }
411            }
412        }
413
414        if count < min {
415            return Err(ParseError::RangeError {
416                min,
417                max,
418                found: count,
419                span: state.current_span(),
420                rule_context: state.current_rule(),
421                context_stack: state.build_context_stack(),
422            });
423        }
424
425        Ok(builder.finish(state))
426    }
427
428    fn parse_pratt_expr<'tokens, 'arena, N, Ctx>(
429        &mut self,
430        _rule_id: R,
431        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
432        _min_prec: Precedence,
433    ) -> Result<N, ParseError<K, R>>
434    where
435        N: NodeId,
436        Ctx: GrammarContext,
437    {
438        // TODO: Integrate with sipha-pratt when it's created
439        Err(ParseError::InvalidContext {
440            rule: "Pratt parsing not yet integrated",
441            span: state.current_span(),
442            rule_context: state.current_rule(),
443            context_stack: state.build_context_stack(),
444        })
445    }
446
447    fn parse_negative_lookahead<'tokens, 'arena, N, Ctx>(
448        &mut self,
449        rule_id: R,
450        rule: &GrammarRule<K, R>,
451        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
452    ) -> Result<N, ParseError<K, R>>
453    where
454        N: NodeId,
455        Ctx: GrammarContext,
456    {
457        let start = state.position();
458        match self.parse_grammar_rule(rule_id, rule, state) {
459            Ok(_) => {
460                state.reset(start);
461                let token_kind =
462                    state
463                        .peek(0)
464                        .map(|t| t.kind)
465                        .ok_or_else(|| ParseError::UnexpectedEof {
466                            span: state.current_span(),
467                            rule_context: state.current_rule(),
468                            context_stack: state.build_context_stack(),
469                        })?;
470                Err(ParseError::UnexpectedToken {
471                    kind: token_kind,
472                    span: state.current_span(),
473                    rule_context: state.current_rule(),
474                    context_stack: state.build_context_stack(),
475                })
476            }
477            Err(_) => {
478                state.reset(start);
479                let builder = state.start_node(rule_id);
480                Ok(builder.finish(state))
481            }
482        }
483    }
484
485    fn parse_positive_lookahead<'tokens, 'arena, N, Ctx>(
486        &mut self,
487        rule_id: R,
488        rule: &GrammarRule<K, R>,
489        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
490    ) -> Result<N, ParseError<K, R>>
491    where
492        N: NodeId,
493        Ctx: GrammarContext,
494    {
495        let start = state.position();
496        match self.parse_grammar_rule(rule_id, rule, state) {
497            Ok(_) => {
498                state.reset(start);
499                let builder = state.start_node(rule_id);
500                Ok(builder.finish(state))
501            }
502            Err(err) => {
503                state.reset(start);
504                Err(err)
505            }
506        }
507    }
508}
509
510impl<K: TokenKind, R: RuleId> GrammarRuleParser<K, R> for Parser<K, R> {
511    fn parse_grammar_rule<'tokens, 'arena, N, Ctx>(
512        &mut self,
513        rule_id: R,
514        rule: &GrammarRule<K, R>,
515        state: &mut ParserState<'tokens, 'arena, K, R, N, Ctx>,
516    ) -> Result<N, ParseError<K, R>>
517    where
518        N: NodeId,
519        Ctx: GrammarContext,
520    {
521        match rule {
522            GrammarRule::Sequence(rules) => self.parse_sequence(rule_id, rules, state),
523            GrammarRule::Choice(rules) => self.parse_choice(rule_id, rules, state),
524            GrammarRule::Optional(inner) => self.parse_optional(rule_id, inner, state),
525            GrammarRule::ZeroOrMore(inner) => self.parse_zero_or_more(rule_id, inner, state),
526            GrammarRule::OneOrMore(inner) => self.parse_one_or_more(rule_id, inner, state),
527            GrammarRule::Range {
528                rule: inner,
529                min,
530                max,
531            } => self.parse_range(rule_id, inner, *min, *max, state),
532            GrammarRule::PrattExpr(min_prec) => self.parse_pratt_expr(rule_id, state, *min_prec),
533            GrammarRule::Token(kind) => state.expect_node(*kind),
534            GrammarRule::Rule(id) => self.parse_rule(*id, state),
535            GrammarRule::Conditional {
536                rule: inner,
537                condition,
538            } => {
539                if condition(state.position()) {
540                    self.parse_grammar_rule(rule_id, inner, state)
541                } else {
542                    Err(ParseError::InvalidContext {
543                        rule: "conditional",
544                        span: state.current_span(),
545                        rule_context: state.current_rule(),
546                        context_stack: state.build_context_stack(),
547                    })
548                }
549            }
550            GrammarRule::NegativeLookahead(rule) => {
551                self.parse_negative_lookahead(rule_id, rule, state)
552            }
553            GrammarRule::PositiveLookahead(rule) => {
554                self.parse_positive_lookahead(rule_id, rule, state)
555            }
556        }
557    }
558}
559
560impl<K: TokenKind, R: RuleId> Default for Parser<K, R> {
561    fn default() -> Self {
562        Parser::new().build()
563    }
564}