mpl 0.3.1 - Docs.rs

//! Parse

use crate::input::Input;
use crate::output::Output;
use crate::position::Position;
use crate::rules::Rules;
use crate::span::Span;
use crate::symbols::{Equivalence, Metasymbol, Terminal, TerminalSymbol, Variable, E};
use crate::trees::{AST, CST};
use std::collections::HashMap;
use std::hash::Hash;

/// Memoization table for [`Parser::packrat_parse`].
///
/// Stores only the parse outcome at each `(variable, start position)`:
/// * `None` — the rule has been tried at this position and failed.
/// * `Some(end_pos)` — the rule has been tried and succeeded, ending at `end_pos`.
///
/// AST nodes are intentionally **not** cached. Failed paths short-circuit on a
/// cache hit (the main packrat benefit during backtracking); successful paths
/// re-execute the rule body to rebuild the AST, but their sub-rule calls hit
/// the cache and avoid redundant work.
pub type PackratMemo<V, P> = HashMap<(V, P), Option<P>>;

/// Memoization entry for [`Parser::squirrel_parse`].
///
/// Squirrel-style left-recursion-aware packrat:
/// - `InProgress { seed }` — this rule is currently being grown at this
///   position. A recursive call to the same rule returns `seed` directly,
///   acting as the recursion base case for the seed-growing loop.
/// - `Done { result }` — the seed has converged.
///
/// The algorithm is Warth et al.'s (2008) seed-growing iteration:
/// 1. Start with `seed = Err(failure_ast)`.
/// 2. Evaluate the rule body. Recursive calls hitting `InProgress` see
///    the current seed instead of looping.
/// 3. If the new result extends past the previous seed, update seed and
///    iterate. Otherwise convergence — store as `Done`.
///
/// Hutchison's Squirrel (arXiv 2601.05012, 2026) generalises this with
/// per-entry version counters for uniform handling of indirect / hidden
/// / mutual left recursion. This implementation handles direct left
/// recursion robustly; mutual is best-effort.
pub enum SquirrelEntry<V, S, O> {
    InProgress {
        seed: Result<AST<V, S, O>, AST<V, S, O>>,
    },
    Done {
        result: Result<AST<V, S, O>, AST<V, S, O>>,
    },
}

pub type SquirrelMemo<V, P, S, O> = HashMap<(V, P), SquirrelEntry<V, S, O>>;

/// Memoization entry for the recognition-only Squirrel path
/// ([`Parser::squirrel_recognize`]).
///
/// Stores end positions only — no AST. This matches peg's
/// `#[cache_left_rec]` calling convention (`RuleResult<()>`) and is
/// what makes left-recursive recognition genuinely O(n): no value is
/// cloned across the seed-growing iterations. The trade-off is that
/// this path cannot return a parse tree.
pub enum SquirrelRecEntry<P> {
    InProgress { seed: Option<P> },
    Done { result: Option<P> },
}

pub type SquirrelRecMemo<V, P> = HashMap<(V, P), SquirrelRecEntry<P>>;

/// Types that can be parsed.
///
/// I is Input.
/// T is terminal symbols.
/// V is (enum of) Variables.
/// S is Span.
/// P is position.
/// R is Rules.
/// O is output type.
// TODO: Create Error types
pub trait Parser<'i, I, T, V, S, P, R, O = ()>
where
    I: Input + ?Sized,
    T: Terminal<'i, I, V, S, P, O>,
    V: Variable,
    S: Span<I, P>,
    P: Position,
    R: Rules<T, V>,
    O: Output<'i, I, V, S>,
{
    /// Minimal parse.
    ///
    /// # Warning
    ///
    /// `all_of_the_span.hi(self)` must be smaller than its length.
    fn parse(
        &self,
        input: &'i I,
        rules: &R,
        start_variable: &V,
        all_of_the_span: &S,
    ) -> Result<AST<V, S, O>, AST<V, S, O>> {
        let ast = self.eval(
            input,
            &all_of_the_span.lo(input),
            rules,
            start_variable,
            &all_of_the_span.hi(input),
        )?;

        if &ast.span == all_of_the_span {
            Ok(ast)
        } else {
            Err(ast)
        }
    }

    fn to_empty_ast(&self, input: &'i I, pos: P) -> Result<AST<V, S, O>, AST<V, S, O>> {
        Ok(AST::from_leaf(
            Metasymbol::Empty.into(),
            Span::from_lo_hi(pos.clone(), pos, input),
        ))
    }

    fn to_failure_ast(&self, input: &'i I, pos: P) -> Result<AST<V, S, O>, AST<V, S, O>> {
        Err(AST::from_leaf(
            Metasymbol::Failure.into(),
            Span::from_lo_hi(pos.clone(), pos, input),
        ))
    }

    // TODO: Decide return Any or Failure
    fn to_any_ast(
        &self,
        input: &'i I,
        pos: P,
        max_pos: &P,
        n: usize,
    ) -> Result<AST<V, S, O>, AST<V, S, O>> {
        let span_with_len_added = S::from_lo_len(pos, n, input);
        let hi = span_with_len_added.hi(input);
        let ast = AST::from_leaf(Metasymbol::Any(n).into(), span_with_len_added);
        if &hi <= max_pos {
            Ok(ast)
        } else {
            Err(ast)
        }
    }

    fn to_all_ast(&self, input: &'i I, pos: P, max_pos: P) -> Result<AST<V, S, O>, AST<V, S, O>> {
        Ok(AST::from_leaf(
            Metasymbol::All.into(),
            Span::from_lo_hi(pos, max_pos, input),
        ))
    }

    fn eval_terminal_symbol(
        &self,
        input: &'i I,
        terminal_symbol: &TerminalSymbol<T>,
        pos: P,
        max_pos: &P,
    ) -> Result<AST<V, S, O>, AST<V, S, O>> {
        match terminal_symbol {
            TerminalSymbol::Original(t) => t.eval(input, pos, max_pos),
            TerminalSymbol::Metasymbol(metasymbol) => match metasymbol {
                Metasymbol::Empty => self.to_empty_ast(input, pos),
                Metasymbol::Failure => self.to_failure_ast(input, pos),
                Metasymbol::Any(n) => self.to_any_ast(input, pos, max_pos, *n),
                Metasymbol::All => self.to_all_ast(input, pos, max_pos.clone()),
                Metasymbol::Omit => unimplemented!(),
            },
        }
    }

    fn eval(
        &self,
        input: &'i I,
        pos: &P,
        rules: &R,
        variable: &V,
        max_pos: &P,
    ) -> Result<AST<V, S, O>, AST<V, S, O>> {
        let right_rule = rules.get(variable).expect("right_rule from a variable");

        // First choice
        // left-hand side of first choice
        let left_ast: Result<AST<V, S, O>, AST<V, S, O>> = match &right_rule.first.lhs {
            E::T(terminal_symbol) => {
                self.eval_terminal_symbol(input, terminal_symbol, pos.clone(), max_pos)
            }
            E::V(lhs_of_fc_v) => self.eval(input, pos, rules, lhs_of_fc_v, max_pos),
        };

        if let Ok(left_ast) = left_ast {
            // right-hand side of first choice
            let right_ast: Result<AST<V, S, O>, AST<V, S, O>> = match &right_rule.first.rhs {
                E::T(terminal_symbol) => self.eval_terminal_symbol(
                    input,
                    terminal_symbol,
                    left_ast.span.hi(input),
                    max_pos,
                ),
                E::V(rhs_of_fc_v) => {
                    self.eval(input, &left_ast.span.hi(input), rules, rhs_of_fc_v, max_pos)
                }
            };

            if let Ok(right_ast) = right_ast {
                let merged_span = Span::merge_lhs_and_rhs(&left_ast.span, &right_ast.span, input);

                let variable_and_choice =
                    Equivalence::new(variable.clone(), (left_ast, right_ast).into());

                let cst = CST::new(variable_and_choice, merged_span);

                let output_ast = O::output_ast(input, cst);

                return Ok(output_ast);
            }
        }

        // Second choice
        match &right_rule.second.0 {
            E::T(terminal_symbol) => {
                self.eval_terminal_symbol(input, terminal_symbol, pos.clone(), max_pos)
            }
            E::V(sc_v) => {
                let ast = self.eval(input, pos, rules, sc_v, max_pos)?;
                let span = ast.span.clone();

                let variable_and_choice = Equivalence::new(variable.clone(), ast.into());

                let cst = CST::new(variable_and_choice, span);

                let output_ast = O::output_ast(input, cst);

                Ok(output_ast)
            }
        }
    }

    /// Packrat-memoized parse with a position-only cache.
    ///
    /// Equivalent in result to [`parse`](Self::parse) but caches the outcome of
    /// each `(variable, start position)` pair so that:
    /// * Failed paths short-circuit on subsequent visits — this is what avoids
    ///   the exponential-time blowup that PEG/TDPL backtracking can cause.
    /// * Successful paths re-execute the rule body to rebuild the AST, but
    ///   their sub-rule calls reuse cache entries.
    ///
    /// AST nodes are not stored in the cache, so memory usage is bounded by
    /// `O(n * |V|)` and there is no per-entry deep clone.
    fn packrat_parse(
        &self,
        input: &'i I,
        rules: &R,
        start_variable: &V,
        all_of_the_span: &S,
    ) -> Result<AST<V, S, O>, AST<V, S, O>>
    where
        V: Eq + Hash,
        P: Eq + Hash,
    {
        let mut memo: PackratMemo<V, P> = HashMap::new();
        let ast = self.packrat_eval(
            input,
            &all_of_the_span.lo(input),
            rules,
            start_variable,
            &all_of_the_span.hi(input),
            &mut memo,
        )?;

        if &ast.span == all_of_the_span {
            Ok(ast)
        } else {
            Err(ast)
        }
    }

    /// Memoized counterpart of [`eval`](Self::eval).
    ///
    /// On a cached failure for `(variable, *pos)`, returns a fresh failure AST
    /// without re-executing the rule. On a cached success, falls through to
    /// rule evaluation (sub-rule lookups still benefit from the cache). On a
    /// miss, evaluates and records only the resulting end position (or
    /// `None` for failure).
    fn packrat_eval(
        &self,
        input: &'i I,
        pos: &P,
        rules: &R,
        variable: &V,
        max_pos: &P,
        memo: &mut PackratMemo<V, P>,
    ) -> Result<AST<V, S, O>, AST<V, S, O>>
    where
        V: Eq + Hash,
        P: Eq + Hash,
    {
        let key = (variable.clone(), pos.clone());
        if let Some(cached) = memo.get(&key) {
            if cached.is_none() {
                return self.to_failure_ast(input, pos.clone());
            }
        }

        let result = self.packrat_eval_uncached(input, pos, rules, variable, max_pos, memo);

        memo.entry(key).or_insert_with(|| match &result {
            Ok(ast) => Some(ast.span.hi(input)),
            Err(_) => None,
        });

        result
    }

    fn packrat_eval_uncached(
        &self,
        input: &'i I,
        pos: &P,
        rules: &R,
        variable: &V,
        max_pos: &P,
        memo: &mut PackratMemo<V, P>,
    ) -> Result<AST<V, S, O>, AST<V, S, O>>
    where
        V: Eq + Hash,
        P: Eq + Hash,
    {
        let right_rule = rules.get(variable).expect("right_rule from a variable");

        let left_ast: Result<AST<V, S, O>, AST<V, S, O>> = match &right_rule.first.lhs {
            E::T(terminal_symbol) => {
                self.eval_terminal_symbol(input, terminal_symbol, pos.clone(), max_pos)
            }
            E::V(lhs_of_fc_v) => self.packrat_eval(input, pos, rules, lhs_of_fc_v, max_pos, memo),
        };

        if let Ok(left_ast) = left_ast {
            let right_ast: Result<AST<V, S, O>, AST<V, S, O>> = match &right_rule.first.rhs {
                E::T(terminal_symbol) => self.eval_terminal_symbol(
                    input,
                    terminal_symbol,
                    left_ast.span.hi(input),
                    max_pos,
                ),
                E::V(rhs_of_fc_v) => self.packrat_eval(
                    input,
                    &left_ast.span.hi(input),
                    rules,
                    rhs_of_fc_v,
                    max_pos,
                    memo,
                ),
            };

            if let Ok(right_ast) = right_ast {
                let merged_span = Span::merge_lhs_and_rhs(&left_ast.span, &right_ast.span, input);

                let variable_and_choice =
                    Equivalence::new(variable.clone(), (left_ast, right_ast).into());

                let cst = CST::new(variable_and_choice, merged_span);

                let output_ast = O::output_ast(input, cst);

                return Ok(output_ast);
            }
        }

        match &right_rule.second.0 {
            E::T(terminal_symbol) => {
                self.eval_terminal_symbol(input, terminal_symbol, pos.clone(), max_pos)
            }
            E::V(sc_v) => {
                let ast = self.packrat_eval(input, pos, rules, sc_v, max_pos, memo)?;
                let span = ast.span.clone();

                let variable_and_choice = Equivalence::new(variable.clone(), ast.into());

                let cst = CST::new(variable_and_choice, span);

                let output_ast = O::output_ast(input, cst);

                Ok(output_ast)
            }
        }
    }

    /// Squirrel-inspired left-recursion-aware parse.
    ///
    /// Built on the seed-growing iteration of Warth et al. (2008) with a
    /// trim toward the structure of Hutchison's Squirrel (arXiv 2601.05012).
    /// On a non-left-recursive grammar this produces the same result as
    /// [`packrat_parse`](Self::packrat_parse). On a directly left-recursive
    /// grammar it terminates by growing the seed — the rule's best known
    /// match length — until convergence.
    ///
    /// Requires `O: Clone, V: Clone, S: Clone` because the seed AST is
    /// cloned both into the memo table and out as the recursion stub.
    fn squirrel_parse(
        &self,
        input: &'i I,
        rules: &R,
        start_variable: &V,
        all_of_the_span: &S,
    ) -> Result<AST<V, S, O>, AST<V, S, O>>
    where
        V: Eq + Hash,
        P: Eq + Hash,
        O: Clone,
    {
        let mut memo: SquirrelMemo<V, P, S, O> = HashMap::new();
        let ast = self.squirrel_eval(
            input,
            &all_of_the_span.lo(input),
            rules,
            start_variable,
            &all_of_the_span.hi(input),
            &mut memo,
        )?;
        if &ast.span == all_of_the_span {
            Ok(ast)
        } else {
            Err(ast)
        }
    }

    /// Compare two parse outcomes by progress: `Ok` further into the
    /// input wins; any `Ok` beats `Err`. Used to decide whether the
    /// seed-growing loop should iterate again.
    fn squirrel_grew(
        new: &Result<AST<V, S, O>, AST<V, S, O>>,
        old: &Result<AST<V, S, O>, AST<V, S, O>>,
        input: &'i I,
    ) -> bool {
        match (new, old) {
            (Ok(_), Err(_)) => true,
            (Err(_), Ok(_)) => false,
            (Ok(n), Ok(o)) => n.span.hi(input) > o.span.hi(input),
            (Err(_), Err(_)) => false,
        }
    }

    fn squirrel_eval(
        &self,
        input: &'i I,
        pos: &P,
        rules: &R,
        variable: &V,
        max_pos: &P,
        memo: &mut SquirrelMemo<V, P, S, O>,
    ) -> Result<AST<V, S, O>, AST<V, S, O>>
    where
        V: Eq + Hash,
        P: Eq + Hash,
        O: Clone,
    {
        let key = (variable.clone(), pos.clone());

        // Cache hit?
        if let Some(entry) = memo.get(&key) {
            match entry {
                SquirrelEntry::Done { result } => return result.clone(),
                // Recursive call mid-grow: hand back the seed so the parent
                // rule treats this position as already matched up to seed.
                SquirrelEntry::InProgress { seed } => return seed.clone(),
            }
        }

        // Initial seed: failure. The first iteration's recursive calls all
        // see this as failure, so the rule's non-recursive alternative
        // takes over to produce the base case.
        let initial_seed = self.to_failure_ast(input, pos.clone());
        memo.insert(
            key.clone(),
            SquirrelEntry::InProgress {
                seed: initial_seed.clone(),
            },
        );

        let mut best = initial_seed;
        loop {
            let result = self.squirrel_eval_uncached(input, pos, rules, variable, max_pos, memo);
            if Self::squirrel_grew(&result, &best, input) {
                best = result;
                memo.insert(
                    key.clone(),
                    SquirrelEntry::InProgress { seed: best.clone() },
                );
            } else {
                break;
            }
        }

        memo.insert(
            key,
            SquirrelEntry::Done {
                result: best.clone(),
            },
        );
        best
    }

    fn squirrel_eval_uncached(
        &self,
        input: &'i I,
        pos: &P,
        rules: &R,
        variable: &V,
        max_pos: &P,
        memo: &mut SquirrelMemo<V, P, S, O>,
    ) -> Result<AST<V, S, O>, AST<V, S, O>>
    where
        V: Eq + Hash,
        P: Eq + Hash,
        O: Clone,
    {
        let right_rule = rules.get(variable).expect("right_rule from a variable");

        let left_ast: Result<AST<V, S, O>, AST<V, S, O>> = match &right_rule.first.lhs {
            E::T(t) => self.eval_terminal_symbol(input, t, pos.clone(), max_pos),
            E::V(lhs) => self.squirrel_eval(input, pos, rules, lhs, max_pos, memo),
        };

        if let Ok(left_ast) = left_ast {
            let right_ast: Result<AST<V, S, O>, AST<V, S, O>> = match &right_rule.first.rhs {
                E::T(t) => self.eval_terminal_symbol(input, t, left_ast.span.hi(input), max_pos),
                E::V(rhs) => {
                    self.squirrel_eval(input, &left_ast.span.hi(input), rules, rhs, max_pos, memo)
                }
            };

            if let Ok(right_ast) = right_ast {
                let merged = Span::merge_lhs_and_rhs(&left_ast.span, &right_ast.span, input);
                let value = Equivalence::new(variable.clone(), (left_ast, right_ast).into());
                let cst = CST::new(value, merged);
                return Ok(O::output_ast(input, cst));
            }
        }

        // Second choice
        match &right_rule.second.0 {
            E::T(t) => self.eval_terminal_symbol(input, t, pos.clone(), max_pos),
            E::V(sc) => {
                let ast = self.squirrel_eval(input, pos, rules, sc, max_pos, memo)?;
                let span = ast.span.clone();
                let value = Equivalence::new(variable.clone(), ast.into());
                let cst = CST::new(value, span);
                Ok(O::output_ast(input, cst))
            }
        }
    }

    /// Recognise the input via Squirrel-style seed-growing, returning
    /// only `bool`. No AST is built; the memo stores end positions
    /// (`Option<P>`) only.
    ///
    /// This path achieves linear time on left-recursive grammars in the
    /// same family as peg's `#[cache_left_rec]` (van Rossum) — every
    /// `(variable, position)` pair completes in O(1) per iteration and
    /// the seed-growing loop iterates at most O(input length) times.
    ///
    /// Use [`squirrel_parse`](Self::squirrel_parse) instead when you
    /// need the parse tree; that path trades O(n²) AST-clone overhead
    /// in exchange for the AST output.
    fn squirrel_recognize(
        &self,
        input: &'i I,
        rules: &R,
        start_variable: &V,
        all_of_the_span: &S,
    ) -> bool
    where
        V: Eq + Hash,
        P: Eq + Hash,
    {
        let mut memo: SquirrelRecMemo<V, P> = HashMap::new();
        let end = self.squirrel_rec_eval(
            input,
            &all_of_the_span.lo(input),
            rules,
            start_variable,
            &all_of_the_span.hi(input),
            &mut memo,
        );
        match end {
            Some(end_pos) => end_pos == all_of_the_span.hi(input),
            None => false,
        }
    }

    fn squirrel_rec_eval(
        &self,
        input: &'i I,
        pos: &P,
        rules: &R,
        variable: &V,
        max_pos: &P,
        memo: &mut SquirrelRecMemo<V, P>,
    ) -> Option<P>
    where
        V: Eq + Hash,
        P: Eq + Hash,
    {
        let key = (variable.clone(), pos.clone());

        if let Some(entry) = memo.get(&key) {
            match entry {
                SquirrelRecEntry::Done { result } => return result.clone(),
                SquirrelRecEntry::InProgress { seed } => return seed.clone(),
            }
        }

        memo.insert(key.clone(), SquirrelRecEntry::InProgress { seed: None });

        let mut best: Option<P> = None;
        loop {
            let result =
                self.squirrel_rec_eval_uncached(input, pos, rules, variable, max_pos, memo);
            let grew = match (&result, &best) {
                (Some(_), None) => true,
                (Some(n), Some(o)) => n > o,
                _ => false,
            };
            if !grew {
                break;
            }
            best = result;
            memo.insert(
                key.clone(),
                SquirrelRecEntry::InProgress { seed: best.clone() },
            );
        }

        memo.insert(
            key,
            SquirrelRecEntry::Done {
                result: best.clone(),
            },
        );
        best
    }

    fn squirrel_rec_eval_uncached(
        &self,
        input: &'i I,
        pos: &P,
        rules: &R,
        variable: &V,
        max_pos: &P,
        memo: &mut SquirrelRecMemo<V, P>,
    ) -> Option<P>
    where
        V: Eq + Hash,
        P: Eq + Hash,
    {
        let right_rule = rules.get(variable).expect("right_rule from a variable");

        // First choice: lhs followed by rhs.
        let lhs_end = match &right_rule.first.lhs {
            E::T(t) => self
                .eval_terminal_symbol(input, t, pos.clone(), max_pos)
                .ok()
                .map(|ast| ast.span.hi(input)),
            E::V(v) => self.squirrel_rec_eval(input, pos, rules, v, max_pos, memo),
        };

        if let Some(end_lhs) = lhs_end {
            let rhs_end = match &right_rule.first.rhs {
                E::T(t) => self
                    .eval_terminal_symbol(input, t, end_lhs.clone(), max_pos)
                    .ok()
                    .map(|ast| ast.span.hi(input)),
                E::V(v) => self.squirrel_rec_eval(input, &end_lhs, rules, v, max_pos, memo),
            };
            if let Some(end_rhs) = rhs_end {
                return Some(end_rhs);
            }
        }

        // Second choice.
        match &right_rule.second.0 {
            E::T(t) => self
                .eval_terminal_symbol(input, t, pos.clone(), max_pos)
                .ok()
                .map(|ast| ast.span.hi(input)),
            E::V(v) => self.squirrel_rec_eval(input, pos, rules, v, max_pos, memo),
        }
    }
}