shiftkit 0.2.0

A Rust library for building parsers and grammars
Documentation
//! Provides the parser definition, actions, error types, and the parser generator

use std::collections::HashMap;

use crate::grammar::{
    AstNodeId, AstNodeType, Grammar, GrammarRuleId, HasTokenType, Index, ReductionResult, TokenType,
};
use crate::lr::InternalToken;

pub mod generator;
pub use generator::{Ambiguity, AmbiguityKind};

/// Represents a unique identifier for a parser state
pub type StateId = usize;

/// An action the parser can take
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Action {
    /// Shift the current token onto the stack and go to the specified state
    Shift(StateId),
    /// Reduce a sequence of grammar items on the stack using the specified grammar rule
    Reduce(GrammarRuleId),
    /// Accept the input, indicating successful parsing
    Accept,
}

/// An error that occurred during parsing
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// An unexpected token was encountered
    UnexpectedToken {
        /// The ID of the parser state when the error occurred
        state: StateId,
        /// The type of the unexpected token
        token_type: TokenType,
        /// The position in the input token stream where the error occurred
        position: usize,
    },
    /// The end of the input was reached unexpectedly
    UnexpectedEof {
        /// The ID of the parser state when the error occurred
        state: StateId,
    },
}

/// An LALR(1) parser that can parse a stream of tokens into an Abstract Syntax Tree
#[derive(Clone)]
pub struct Parser<T: HasTokenType, A> {
    /// The grammar used by this parser
    pub grammar: Grammar<T, A>,
    /// The GOTO table, mapping (state, AST node type) to a new state
    pub goto_table: Vec<HashMap<AstNodeType, usize>>,
    /// The ACTION table, mapping (state, internal token) to a parser action (`Shift`/`Reduce`/`Accept`)
    pub action_table: Vec<HashMap<InternalToken, Action>>,
}

impl<T: HasTokenType, A> Parser<T, A> {
    /// Parse a sequence of tokens using the LALR parser
    ///
    /// # Errors
    ///
    /// Returns `ParseError::UnexpectedToken` if an unexpected token is encountered.
    /// Returns `ParseError::UnexpectedEof` if the end of input is reached unexpectedly.
    ///
    /// # Panics
    ///
    /// This function should not panic under normal operation.
    /// Internal assertions expect the state and value stacks to be non-empty during parsing,
    /// which is guaranteed by the parser's initialization and operation.
    pub fn parse(&self, tokens: &[T]) -> Result<(AstNodeId, Vec<A>), ParseError> {
        let mut state_stack: Vec<StateId> = vec![0];
        let mut value_stack: Vec<Index> = Vec::new();
        let mut ast_nodes: Vec<A> = Vec::new();
        let mut position = 0;

        loop {
            let current_state = *state_stack.last().expect("State stack should not be empty");
            let lookahead = if position < tokens.len() {
                InternalToken::User(tokens[position].token_type())
            } else {
                InternalToken::Eof
            };

            let action = self.action_table[current_state]
                .get(&lookahead)
                .ok_or_else(|| {
                    if lookahead == InternalToken::Eof {
                        ParseError::UnexpectedEof {
                            state: current_state,
                        }
                    } else if let InternalToken::User(token_type) = lookahead {
                        ParseError::UnexpectedToken {
                            state: current_state,
                            token_type,
                            position,
                        }
                    } else {
                        unreachable!("Special token should never appear during parsing")
                    }
                })?;

            match action {
                Action::Shift(next_state) => {
                    value_stack.push(Index::Token(crate::grammar::TokenId(position)));
                    state_stack.push(*next_state);
                    position += 1;
                }
                Action::Reduce(rule_id) => {
                    let rule = self.grammar.get_rule(*rule_id);
                    let component_count = rule.components.len();

                    // Pop states and values
                    state_stack.truncate(state_stack.len() - component_count);
                    let popped_indices: Vec<Index> = value_stack
                        .drain(value_stack.len() - component_count..)
                        .collect();

                    // Execute reduction function
                    let result_node_id = match (rule.reduction)(&popped_indices, tokens, &ast_nodes)
                    {
                        ReductionResult::NewNode(node) => {
                            let node_id = AstNodeId(ast_nodes.len());
                            ast_nodes.push(node);
                            node_id
                        }
                        ReductionResult::Forward(node_id) => node_id,
                    };

                    // Push the result onto value stack
                    value_stack.push(Index::AstNode(result_node_id));

                    // Goto next state
                    let current_state =
                        *state_stack.last().expect("State stack should not be empty");
                    let next_state = self.goto_table[current_state]
                        .get(&rule.result)
                        .expect("Goto table should have entry for reduction result");
                    state_stack.push(*next_state);
                }
                Action::Accept => {
                    // The final value should be the root AST node
                    let root_id = value_stack
                        .last()
                        .expect("Value stack should contain result")
                        .as_ast_node_id();
                    return Ok((root_id, ast_nodes));
                }
            }
        }
    }
}