mpl-macro 0.2.1

Derive parsers from MPL's one-rule grammar files. Includes the FastParse static-codegen backend (cascade detection, first-byte dispatch, Squirrel left recursion).
Documentation
//! Cascade detection and classification.
//!
//! Many practical grammars contain `A = X1 () / Y` chains where each `Xi`
//! is some sub-rule. The runtime cost is one function call per alternative
//! tried — for a 6-10 element cascade, every match pays for the failed
//! attempts before the right one.
//!
//! This module identifies such cascades at compile time and classifies
//! them so the codegen can emit either:
//! * **Byte-table fast path** when each alternative resolves to a single
//!   byte (e.g. `Variable = UppercaseX () / Variable1` with 10
//!   single-character variants); or
//! * **First-byte dispatch** when alternatives have mostly disjoint
//!   FIRST sets (e.g. `Atom = ExprInParens () / FloatLiteral () / ...`
//!   where '(' routes to ExprInParens, digits to Float/Int, etc.).
//!
//! The emit-mode path is left untouched so the original AST shape is
//! preserved. Detection is purely structural — it works for any user
//! grammar that follows the cascade pattern.

use crate::generator::cut::FirstSet;
use mpl::rules::Rule;
use mpl::symbols::{Metasymbol, TerminalSymbol, E};
use std::collections::{BTreeSet, HashMap};

/// Try to extract `Char('c')` → byte from the original-symbol raw text
/// (e.g. `Char('X')`, `Char(' ')`).
fn extract_char_byte(raw: &str) -> Option<u8> {
    let parsed: syn::Expr = syn::parse_str(raw).ok()?;
    let call = match parsed {
        syn::Expr::Call(c) => c,
        _ => return None,
    };
    let path = match &*call.func {
        syn::Expr::Path(p) => &p.path,
        _ => return None,
    };
    if !path.is_ident("Char") {
        return None;
    }
    let arg = call.args.first()?;
    let lit = match arg {
        syn::Expr::Lit(syn::ExprLit {
            lit: syn::Lit::Char(c),
            ..
        }) => c,
        _ => return None,
    };
    let v = lit.value() as u32;
    if v < 256 {
        Some(v as u8)
    } else {
        None
    }
}

fn is_empty_meta(e: &E<&str, &str>) -> bool {
    matches!(e, E::T(TerminalSymbol::Metasymbol(Metasymbol::Empty)))
}

fn is_failure_meta(e: &E<&str, &str>) -> bool {
    matches!(e, E::T(TerminalSymbol::Metasymbol(Metasymbol::Failure)))
}

/// One pass of the analysis. Returns `Some(byte_set)` if `e` already
/// resolves to a finite single-byte set under the current `result` map,
/// `None` if not (yet) — a later iteration may resolve it.
fn try_resolve_byte_set<'a>(
    e: &E<&'a str, &'a str>,
    result: &HashMap<&'a str, BTreeSet<u8>>,
) -> Option<BTreeSet<u8>> {
    match e {
        E::T(TerminalSymbol::Original(raw)) => extract_char_byte(raw).map(|b| {
            let mut s = BTreeSet::new();
            s.insert(b);
            s
        }),
        E::V(name) => result.get(name).cloned(),
        _ => None,
    }
}

/// Decide whether a rule is a char-class cascade and, if so, the byte
/// set it accepts. Returns `None` if the rule does not match the pattern
/// or its alternatives have not yet been classified.
///
/// Pattern: `R = X () / Y` where:
/// * `X` resolves to a single-byte terminal (directly or via another
///   char-class cascade variable).
/// * `Y` is `f`, or also resolves to a single-byte set.
fn classify_one<'a>(
    rule: &Rule<&'a str, &'a str>,
    result: &HashMap<&'a str, BTreeSet<u8>>,
) -> Option<BTreeSet<u8>> {
    if !is_empty_meta(&rule.equal.first.rhs) {
        return None;
    }
    let mut bytes = try_resolve_byte_set(&rule.equal.first.lhs, result)?;

    let second = &rule.equal.second.0;
    if !is_failure_meta(second) {
        let extra = try_resolve_byte_set(second, result)?;
        bytes.extend(extra);
    }

    Some(bytes)
}

/// True when a rule directly left-recurses on itself in the first
/// choice's left position: `R = R X / Y`. Such rules need
/// Squirrel-style seed-growing iteration to terminate.
pub fn is_left_recursive(rule: &Rule<&str, &str>) -> bool {
    matches!(&rule.equal.first.lhs, E::V(name) if *name == rule.value)
}

/// True when a rule has the shape `R = X R / ()` — an "zero-or-more"
/// tail recursion: match `X`, then recurse to self, with empty-match
/// fallback. peg's codegen emits these as `loop {}` rather than
/// recursion because LLVM does not reliably perform tail-call
/// elimination across `&mut State + Result<u32, ()>` returns.
pub fn is_tail_loop(rule: &Rule<&str, &str>) -> bool {
    let rhs_self = match &rule.equal.first.rhs {
        E::V(name) => *name == rule.value,
        _ => false,
    };
    if !rhs_self {
        return false;
    }
    matches!(
        &rule.equal.second.0,
        E::T(TerminalSymbol::Metasymbol(Metasymbol::Empty))
    )
}

/// Compute the char-class byte set for every rule that is a char-class
/// cascade, by fixed-point iteration over `classify_one`.
pub fn analyze_char_classes<'a>(
    rules: &[&'a Rule<&'a str, &'a str>],
) -> HashMap<&'a str, BTreeSet<u8>> {
    let mut result: HashMap<&'a str, BTreeSet<u8>> = HashMap::new();
    loop {
        let mut changed = false;
        for r in rules {
            if result.contains_key(r.value) {
                continue;
            }
            if let Some(set) = classify_one(r, &result) {
                result.insert(r.value, set);
                changed = true;
            }
        }
        if !changed {
            break;
        }
    }
    result
}

/// Result of classifying a rule as a first-byte-dispatch cascade. The
/// caller iterates over `alternatives` in order, generating a `match`
/// that routes each FIRST byte to the corresponding sub-parser.
pub struct DispatchCascade<'a> {
    /// Alternatives in cascade-priority order. Each entry is a variable
    /// reference (the LHS of `Vi () / Vi+1`); a byte present in earlier
    /// alternative's FIRST set takes precedence.
    pub alternatives: Vec<DispatchAlt<'a>>,
}

pub struct DispatchAlt<'a> {
    /// Variable name to call when this alternative is taken.
    pub rule_name: &'a str,
    /// FIRST set of that variable.
    pub first: FirstSet,
}

/// Walk the cascade chain of `rule_name` and collect every alternative.
/// Only succeeds when:
/// * Every link has shape `R = V () / Z` with `V` a variable reference
///   (no inline terminals, no metasymbols on the LHS).
/// * `Z` is `f` or another cascade variable.
fn collect_cascade_alternatives<'a>(
    rule_name: &'a str,
    rules: &HashMap<&'a str, &'a Rule<&'a str, &'a str>>,
    visited: &mut BTreeSet<&'a str>,
    out: &mut Vec<&'a str>,
) -> bool {
    if !visited.insert(rule_name) {
        // Cycle: bail out.
        return false;
    }
    let rule = match rules.get(rule_name) {
        Some(r) => r,
        None => return false,
    };
    if !is_empty_meta(&rule.equal.first.rhs) {
        return false;
    }
    let lhs_name = match &rule.equal.first.lhs {
        E::V(v) => *v,
        _ => return false,
    };
    out.push(lhs_name);

    match &rule.equal.second.0 {
        E::T(TerminalSymbol::Metasymbol(Metasymbol::Failure)) => true,
        E::V(next) => collect_cascade_alternatives(next, rules, visited, out),
        _ => false,
    }
}

/// Identify cascades that benefit from first-byte dispatch. Heuristic:
/// the cascade must have at least 5 alternatives, and at least one
/// alternative's FIRST set must contain ≥ 3 distinct bytes (i.e. the
/// cascade is dispatching on a meaningful char class, not just ordinary
/// keyword alternates which the cascade form already handles efficiently).
pub fn analyze_first_byte_dispatch<'a>(
    rules: &[&'a Rule<&'a str, &'a str>],
    firsts: &HashMap<String, FirstSet>,
    char_classes: &HashMap<&'a str, BTreeSet<u8>>,
) -> HashMap<&'a str, DispatchCascade<'a>> {
    let rule_map: HashMap<&str, &Rule<&str, &str>> = rules.iter().map(|r| (r.value, *r)).collect();

    let mut result = HashMap::new();
    for r in rules {
        // Skip rules that already have a byte-table fast path — the
        // table is strictly cheaper than `match`-with-arms-per-byte.
        if char_classes.contains_key(r.value) {
            continue;
        }

        let mut alts = Vec::new();
        let mut visited = BTreeSet::new();
        if !collect_cascade_alternatives(r.value, &rule_map, &mut visited, &mut alts) {
            continue;
        }

        if alts.len() < 5 {
            continue;
        }

        let alt_firsts: Vec<DispatchAlt> = alts
            .into_iter()
            .filter_map(|name| {
                let f = firsts.get(name)?.clone();
                // Skip if FIRST is unbounded (any_byte) — dispatch can't
                // help; `_ => fallback` would dominate.
                if f.any_byte {
                    return None;
                }
                Some(DispatchAlt {
                    rule_name: name,
                    first: f,
                })
            })
            .collect();

        // Need a useful char class somewhere — otherwise this is just
        // a list of single-byte literals and the original cascade with
        // `#[inline]` is already fine (Comparison/Function pattern).
        let any_class = alt_firsts.iter().any(|a| a.first.byte_count() >= 3);
        if !any_class {
            continue;
        }

        // Need every alternative to have a non-empty FIRST set so we
        // can reach it from the dispatch table.
        if alt_firsts.iter().any(|a| a.first.byte_count() == 0) {
            continue;
        }

        result.insert(
            r.value,
            DispatchCascade {
                alternatives: alt_firsts,
            },
        );
    }
    result
}