mpl-macro 0.2.0

Derive parsers from MPL's one-rule grammar files. Includes the FastParse static-codegen backend (cascade detection, first-byte dispatch, Squirrel left recursion).
Documentation
//! Mizushima-style FIRST-set analysis and cut classification.
//!
//! For each rule `A = B C / D` we compute `FIRST(B C)` and `FIRST(D)` over
//! a fixpoint and decide whether the choice is LL(1)-equivalent (i.e. their
//! FIRST sets are disjoint and not both nullable). Such rules can be parsed
//! without memoisation; the rest are candidates for packrat / Squirrel.
//!
//! The output is a per-grammar `needs_memo[rule] -> bool` table that the
//! `FastParse` codegen embeds as a constant. A future memoising backend
//! consults the same table to decide where to allocate cache slots.

use mpl::rules::Rule;
use mpl::symbols::{Metasymbol, TerminalSymbol, E};
use std::collections::HashMap;

/// FIRST set for an expression. Tracks which leading bytes can begin it,
/// whether it can match the empty string, and a wildcard flag for `Any`/`All`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct FirstSet {
    /// `true` for each byte value that can start a match.
    bytes: [bool; 256],
    /// `true` if any byte can start a match (e.g. `?` / `*` metasymbols).
    pub any_byte: bool,
    /// `true` if the expression can match the empty input.
    pub nullable: bool,
}

impl Default for FirstSet {
    fn default() -> Self {
        Self {
            bytes: [false; 256],
            any_byte: false,
            nullable: false,
        }
    }
}

impl FirstSet {
    /// Iterate every byte that is in this FIRST set. Skipped if `any_byte`
    /// — callers must check that flag separately when relevant.
    pub fn iter_bytes(&self) -> impl Iterator<Item = u8> + '_ {
        (0u32..256).filter_map(move |i| {
            if self.bytes[i as usize] {
                Some(i as u8)
            } else {
                None
            }
        })
    }

    pub fn byte_count(&self) -> usize {
        self.bytes.iter().filter(|&&b| b).count()
    }

    fn add_byte(&mut self, b: u8) -> bool {
        if !self.bytes[b as usize] {
            self.bytes[b as usize] = true;
            true
        } else {
            false
        }
    }

    fn union_in(&mut self, other: &FirstSet) -> bool {
        let mut changed = false;
        for i in 0..256 {
            if other.bytes[i] && !self.bytes[i] {
                self.bytes[i] = true;
                changed = true;
            }
        }
        if other.any_byte && !self.any_byte {
            self.any_byte = true;
            changed = true;
        }
        if other.nullable && !self.nullable {
            self.nullable = true;
            changed = true;
        }
        changed
    }

    /// Disjointness test in the LL(1) sense: the two FIRST sets share no
    /// leading byte and are not both nullable. A wildcard byte clashes
    /// with any non-empty FIRST set.
    pub fn is_disjoint(&self, other: &FirstSet) -> bool {
        if self.nullable && other.nullable {
            return false;
        }
        if self.any_byte && (other.any_byte || other.bytes.iter().any(|&b| b)) {
            return false;
        }
        if other.any_byte && self.bytes.iter().any(|&b| b) {
            return false;
        }
        for i in 0..256 {
            if self.bytes[i] && other.bytes[i] {
                return false;
            }
        }
        true
    }
}

/// Compute FIRST(e) at one fixpoint step. Returns the set; the caller
/// decides whether the parent set actually grew.
fn first_of_e(e: &E<&str, &str>, firsts: &HashMap<String, FirstSet>) -> FirstSet {
    match e {
        E::V(v) => firsts.get(*v).cloned().unwrap_or_default(),
        E::T(TerminalSymbol::Metasymbol(m)) => match m {
            Metasymbol::Empty => FirstSet {
                nullable: true,
                ..Default::default()
            },
            Metasymbol::Failure => FirstSet::default(),
            Metasymbol::Any(n) => FirstSet {
                any_byte: *n > 0,
                nullable: *n == 0,
                ..Default::default()
            },
            Metasymbol::All => FirstSet {
                any_byte: true,
                nullable: true,
                ..Default::default()
            },
            Metasymbol::Omit => FirstSet::default(),
        },
        E::T(TerminalSymbol::Original(raw)) => first_of_original(raw),
    }
}

/// Best-effort byte FIRST set for an original-symbol expression like
/// `Char('c')` or `Str("abc")`. Anything else conservatively widens to
/// `any_byte = true` (memoisation will not be elided for that rule).
fn first_of_original(raw: &str) -> FirstSet {
    let parsed = match syn::parse_str::<syn::Expr>(raw) {
        Ok(e) => e,
        Err(_) => {
            return FirstSet {
                any_byte: true,
                ..Default::default()
            };
        }
    };
    if let syn::Expr::Call(call) = &parsed {
        if let syn::Expr::Path(p) = &*call.func {
            if p.path.is_ident("Char") {
                if let Some(syn::Expr::Lit(syn::ExprLit {
                    lit: syn::Lit::Char(c),
                    ..
                })) = call.args.first()
                {
                    let mut fs = FirstSet::default();
                    fs.add_byte(c.value() as u32 as u8);
                    return fs;
                }
            } else if p.path.is_ident("Str") {
                if let Some(syn::Expr::Lit(syn::ExprLit {
                    lit: syn::Lit::Str(s),
                    ..
                })) = call.args.first()
                {
                    let bytes = s.value();
                    let mut fs = FirstSet::default();
                    if let Some(&b) = bytes.as_bytes().first() {
                        fs.add_byte(b);
                    } else {
                        fs.nullable = true;
                    }
                    return fs;
                }
            }
        }
    }
    FirstSet {
        any_byte: true,
        ..Default::default()
    }
}

/// FIRST(B C): FIRST(B) plus, if B is nullable, FIRST(C). Nullable iff both.
fn first_of_seq(
    b: &E<&str, &str>,
    c: &E<&str, &str>,
    firsts: &HashMap<String, FirstSet>,
) -> FirstSet {
    let fb = first_of_e(b, firsts);
    let fc = first_of_e(c, firsts);
    let mut out = fb.clone();
    if fb.nullable {
        out.union_in(&fc);
    }
    out.nullable = fb.nullable && fc.nullable;
    out
}

/// Run the FIRST fixpoint over all rules and return the final map.
pub fn compute_first_sets<'a>(rules: &[&'a Rule<&'a str, &'a str>]) -> HashMap<String, FirstSet> {
    let mut firsts: HashMap<String, FirstSet> = HashMap::new();
    for r in rules {
        firsts.insert(r.value.to_string(), FirstSet::default());
    }

    loop {
        let mut changed = false;
        for r in rules {
            let first_choice = first_of_seq(&r.equal.first.lhs, &r.equal.first.rhs, &firsts);
            let second_choice = first_of_e(&r.equal.second.0, &firsts);
            let mut union = first_choice;
            union.union_in(&second_choice);

            let cur = firsts.get_mut(r.value).unwrap();
            if cur.union_in(&union) {
                changed = true;
            }
        }
        if !changed {
            break;
        }
    }

    firsts
}

/// For each rule, decide whether memoisation is required at its choice
/// point. `false` ⇒ FIRST(first choice) and FIRST(second choice) are
/// disjoint and not both nullable, so the rule is LL(1)-equivalent and
/// will never re-evaluate at the same position.
pub fn classify_rules<'a>(
    rules: &[&'a Rule<&'a str, &'a str>],
    firsts: &HashMap<String, FirstSet>,
) -> Vec<(String, bool)> {
    rules
        .iter()
        .map(|r| {
            let first_choice = first_of_seq(&r.equal.first.lhs, &r.equal.first.rhs, firsts);
            let second_choice = first_of_e(&r.equal.second.0, firsts);
            let needs_memo = !first_choice.is_disjoint(&second_choice);
            (r.value.to_string(), needs_memo)
        })
        .collect()
}