shiftkit 0.2.0

A Rust library for building parsers and grammars
Documentation
//! Implements LR(0), LR(1), and LALR(1) items and item sets
//!
//! Includes closure/goto operations for parse table generation.

use std::collections::{BTreeSet, HashMap, HashSet};
use std::ops::{Deref, DerefMut};

use crate::grammar::{AstNodeType, Grammar, GrammarItem, GrammarRuleId, HasTokenType, TokenType};

/// Internal token representation used during parser generation
///
/// This allows the parser generator to use special sentinel tokens (EOF, Special)
/// without conflicts with user-defined token types.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum InternalToken {
    /// A user-defined token type from the grammar
    User(TokenType),
    /// End-of-file sentinel token
    Eof,
    /// Special token used for lookahead propagation in LALR construction
    Special,
}

impl From<TokenType> for InternalToken {
    fn from(token: TokenType) -> Self {
        Self::User(token)
    }
}

/// Represents an LR(0) item: a grammar rule with a position marker (dot)
/// Example: "E -> E . + T" (rule for "E -> E + T" with dot before "+")
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Lr0Item {
    /// Grammar rule represented
    pub rule: GrammarRuleId,
    /// Index into this grammar rule (position of the dot)
    pub index: usize,
}

impl Lr0Item {
    /// Create a new `Lr0Item`
    #[must_use]
    pub const fn new(rule: GrammarRuleId, index: usize) -> Self {
        Self { rule, index }
    }
}

/// A set of LR(0) items, representing the kernel of an LR(0) state
#[derive(Debug, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct Lr0Kernel(pub BTreeSet<Lr0Item>);

impl Deref for Lr0Kernel {
    type Target = BTreeSet<Lr0Item>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for Lr0Kernel {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl<'a> IntoIterator for &'a Lr0Kernel {
    type Item = &'a Lr0Item;
    type IntoIter = std::collections::btree_set::Iter<'a, Lr0Item>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.iter()
    }
}

impl IntoIterator for Lr0Kernel {
    type Item = Lr0Item;
    type IntoIter = std::collections::btree_set::IntoIter<Lr0Item>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

/// A set of LR(0) items, representing the full closure of an LR(0) state
#[derive(Debug, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct Lr0Closure(pub BTreeSet<Lr0Item>);

impl Lr0Closure {
    /// Compute the GOTO function for the current LR(0) closure and a given grammar item
    ///
    /// This determines the next LR(0) kernel state reachable from the current closure
    /// by processing `next_entry`.
    #[must_use]
    pub fn goto<T: HasTokenType, A>(
        &self,
        grammar: &Grammar<T, A>,
        next_entry: &GrammarItem,
    ) -> Lr0Kernel {
        let mut new_kernel = Lr0Kernel::default();

        for &Lr0Item { rule, index } in &self.0 {
            let components = &grammar.get_rule(rule).components;
            if index < components.len() && components[index] == *next_entry {
                new_kernel.insert(Lr0Item {
                    rule,
                    index: index + 1,
                });
            }
        }

        new_kernel
    }
}

impl Deref for Lr0Closure {
    type Target = BTreeSet<Lr0Item>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for Lr0Closure {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl<'a> IntoIterator for &'a Lr0Closure {
    type Item = &'a Lr0Item;
    type IntoIter = std::collections::btree_set::Iter<'a, Lr0Item>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.iter()
    }
}

impl IntoIterator for Lr0Closure {
    type Item = Lr0Item;
    type IntoIter = std::collections::btree_set::IntoIter<Lr0Item>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

/// Represents an LR(1) item: an LR(0) item with a single lookahead token
/// Example: "E -> E . + T", "$"  (with "$" as lookahead)
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Lr1Item {
    /// Grammar rule represented
    pub rule: GrammarRuleId,
    /// Index into this grammar rule (position of the dot)
    pub index: usize,
    /// Lookahead token (can be user token, EOF, or special marker)
    pub lookahead: InternalToken,
}

impl Lr1Item {
    /// Create a new LR(1) item
    #[must_use]
    pub const fn new(rule: GrammarRuleId, index: usize, lookahead: InternalToken) -> Self {
        Self {
            rule,
            index,
            lookahead,
        }
    }

    /// Compute the LR(1) closure for this item
    ///
    /// The closure includes all items reachable by applying the closure operation
    /// (adding new items based on non-terminals immediately following the dot)
    /// and propagating lookaheads.
    #[must_use]
    pub fn closure<T: HasTokenType, A>(
        &self,
        grammar: &Grammar<T, A>,
        generators: &HashMap<AstNodeType, Vec<GrammarRuleId>>,
        first_sets: &HashMap<AstNodeType, HashSet<TokenType>>,
    ) -> Lr1Closure {
        let mut closure = Lr1Closure(BTreeSet::from([self.clone()]));
        let mut queue = vec![self.clone()];

        while let Some(Self {
            rule,
            index,
            lookahead,
        }) = queue.pop()
        {
            let components = &grammar.get_rule(rule).components;

            if let Some(GrammarItem::AstNode(node_type)) = components.get(index)
                && let Some(generators) = generators.get(node_type)
            {
                let lookaheads: Vec<InternalToken> = match components.get(index + 1) {
                    Some(GrammarItem::Token(token_type)) => vec![InternalToken::User(*token_type)],
                    Some(GrammarItem::AstNode(node_type)) => first_sets
                        .get(node_type)
                        .cloned()
                        .map(|set| set.into_iter().map(InternalToken::User).collect())
                        .unwrap_or_default(),
                    None => vec![lookahead],
                };
                for &rule in generators {
                    for &lookahead in &lookaheads {
                        let new_item = Self::new(rule, 0, lookahead);
                        if closure.insert(new_item.clone()) {
                            queue.push(new_item);
                        }
                    }
                }
            }
        }

        closure
    }
}

/// A set of LR(1) items, representing the kernel of an LR(1) state
#[derive(Debug, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct Lr1Kernel(pub BTreeSet<Lr1Item>);

impl Deref for Lr1Kernel {
    type Target = BTreeSet<Lr1Item>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for Lr1Kernel {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl<'a> IntoIterator for &'a Lr1Kernel {
    type Item = &'a Lr1Item;
    type IntoIter = std::collections::btree_set::Iter<'a, Lr1Item>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.iter()
    }
}

impl IntoIterator for Lr1Kernel {
    type Item = Lr1Item;
    type IntoIter = std::collections::btree_set::IntoIter<Lr1Item>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

/// A set of LR(1) items, representing the full closure of an LR(1) state
#[derive(Debug, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct Lr1Closure(pub BTreeSet<Lr1Item>);

impl Deref for Lr1Closure {
    type Target = BTreeSet<Lr1Item>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for Lr1Closure {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl<'a> IntoIterator for &'a Lr1Closure {
    type Item = &'a Lr1Item;
    type IntoIter = std::collections::btree_set::Iter<'a, Lr1Item>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.iter()
    }
}

impl IntoIterator for Lr1Closure {
    type Item = Lr1Item;
    type IntoIter = std::collections::btree_set::IntoIter<Lr1Item>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

/// Represents a LALR(1) item: an LR(0) item with a set of lookahead tokens
/// This is what LALR(1) parsers use - combining LR(1) states with the same core
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct LalrItem {
    /// Grammar rule represented
    pub rule: GrammarRuleId,
    /// Index into this grammar rule (position of the dot)
    pub index: usize,
    /// Lookahead tokens (merged from multiple LR(1) items with same core)
    pub lookaheads: BTreeSet<InternalToken>,
}

impl LalrItem {
    /// Create a new LALR(1) item with multiple lookaheads
    #[must_use]
    pub const fn new(
        rule: GrammarRuleId,
        index: usize,
        lookaheads: BTreeSet<InternalToken>,
    ) -> Self {
        Self {
            rule,
            index,
            lookaheads,
        }
    }
}

/// A set of LALR(1) items, representing the kernel of an LALR(1) state
#[derive(Debug, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct LalrKernel(pub BTreeSet<LalrItem>);

impl Deref for LalrKernel {
    type Target = BTreeSet<LalrItem>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for LalrKernel {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl<'a> IntoIterator for &'a LalrKernel {
    type Item = &'a LalrItem;
    type IntoIter = std::collections::btree_set::Iter<'a, LalrItem>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.iter()
    }
}

impl IntoIterator for LalrKernel {
    type Item = LalrItem;
    type IntoIter = std::collections::btree_set::IntoIter<LalrItem>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

/// A set of LALR(1) items, representing the full closure of an LALR(1) state
#[derive(Debug, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct LalrClosure(pub BTreeSet<LalrItem>);

impl Deref for LalrClosure {
    type Target = BTreeSet<LalrItem>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for LalrClosure {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl<'a> IntoIterator for &'a LalrClosure {
    type Item = &'a LalrItem;
    type IntoIter = std::collections::btree_set::Iter<'a, LalrItem>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.iter()
    }
}

impl IntoIterator for LalrClosure {
    type Item = LalrItem;
    type IntoIter = std::collections::btree_set::IntoIter<LalrItem>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}