ielr 0.1.1

Table generation backend for LR parser generators
Documentation
/*
 * Copyright 2022 Arnaud Golfouse
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

#![doc = include_str!("../README.md")]
#![deny(unsafe_code, clippy::wildcard_dependencies, clippy::mod_module_files)]
#![warn(
    // Check that all public items are documented.
    missing_docs,
    // Some Rc are used in this crate, so clearer cloning helps somewhat.
    clippy::clone_on_ref_ptr,
    // There is some amount of indexing in this crate, so a better error message is preferable.
    clippy::get_unwrap,
)]

// Notes about this implementation:
//
// - In the original paper, wording is a bit different. In particular, we use 'node' for
// 'non terminal', 'token' for 'terminal', and 'lookahead' for 'extended terminal'.
// - This implementation does _not_ use a grammar augmented with a `S' → S` production.

#[cfg(any(doc, test))]
pub mod examples;
mod indices;
pub mod input;
pub mod output;

mod grammar_data;
mod ielr;
mod lr;
mod structures;
#[cfg(test)]
mod test_common;

use self::{
    grammar_data::GrammarData,
    indices::{GotoIdx, ItemIdx, StateIdx},
    input::Node,
    output::Table,
};
use std::rc::Rc;

/// What algorithm to use.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Algorithm {
    /// Use the LALR(k) algorithm.
    ///
    /// Conflict resolution may not be used with LALR.
    Lalr(std::num::NonZeroU8),
    /// Use the LR(k) algorithm.
    Lr(std::num::NonZeroU8),
}

impl Default for Algorithm {
    fn default() -> Self {
        Self::Lalr(std::num::NonZeroU8::new(1).unwrap())
    }
}

impl PartialOrd for Algorithm {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        match (self, other) {
            (Algorithm::Lalr(k1), Algorithm::Lalr(k2)) | (Algorithm::Lr(k1), Algorithm::Lr(k2)) => {
                k1.partial_cmp(k2)
            }
            (Algorithm::Lalr(k1), Algorithm::Lr(k2)) if k1 < k2 => Some(std::cmp::Ordering::Less),
            (Algorithm::Lr(k1), Algorithm::Lalr(k2)) if k1 > k2 => {
                Some(std::cmp::Ordering::Greater)
            }
            _ => None,
        }
    }
}

/// Main function of this crate.
///
/// This computes the LR table for a given grammar.
/// # Parameters
/// - `algorithm`: which [`Algorithm`] to use to build the LR table.
///   
///   The actual computation may use a less powerful algorithm: in this case, you
/// need to check [`algorithm_needed`] field of [`Statistics`].
/// - `max_states`: the maximum number of states the algorithm is allowed to compute.
///
///   This is to prevent the algorithm from taking too many time. Most real-life
/// programming language generate no more than 5000 states. Anything higher than
/// `1_000_000` will certainly take way too many time and memory in practice.
/// - `grammar`: the specification of the grammar used. See [`input`] to build it.
/// - `start_nodes`: which nodes may be used to start parsing.
///
/// # Panics
/// At the moment, any algorithm different from [`Algorithm::Lalr(1)`] or
/// [`Algorithm::Lr(1)`] will result in a panic.
///
/// [`algorithm_needed`]: output::Statistics::algorithm_needed
/// [`Statistics`]: output::Statistics
/// [`Algorithm::Lr(1)`]: Algorithm::Lr
/// [`Algorithm::Lalr(1)`]: Algorithm::Lalr
pub fn compute_table(
    algorithm: Algorithm,
    grammar: &input::Grammar,
    start_nodes: impl IntoIterator<Item = Node>,
) -> Result<(output::Table, output::Statistics), output::Error> {
    // Reduce monomorphisation
    fn inner(
        algorithm: Algorithm,
        grammar: &input::Grammar,
        mut start_nodes: Vec<Node>,
    ) -> Result<(output::Table, output::Statistics), output::Error> {
        // deduplicate nodes.
        let start_nodes = {
            start_nodes.sort_unstable();
            start_nodes.dedup();
            start_nodes
        };
        let (is_lalr, k) = match algorithm {
            Algorithm::Lalr(k) => (true, k.get()),
            Algorithm::Lr(k) => (false, k.get()),
        };

        let grammar_data = GrammarData::new(grammar)?;
        let mut lalr_table =
            lr::LRTable::new_lr0(&grammar_data, start_nodes.iter().copied().collect())?;
        let phase0 = ielr::Phase0::new(&grammar_data, &mut lalr_table);
        let phase1 = ielr::Phase1::new(phase0);
        let mut phase2 = ielr::Phase2::new(phase1);
        if is_lalr && k == 1 {
            let mut conflicts = Vec::new();
            for state_conflicts in &phase2.conflict_lists {
                for conflict in state_conflicts {
                    conflicts.push(Rc::clone(conflict));
                }
            }
            if !conflicts.is_empty() {
                let conflict = Rc::clone(&conflicts[0]);
                let (state, conflict) = (
                    conflict.s,
                    output::error::Conflict {
                        lookahead: conflict.lookahead,
                        contributions: conflict.contributions.clone(),
                    },
                );
                let lalr_tables = phase2.get_table();
                return Err(output::Error::Conflict {
                    lalr_tables,
                    state: state.into(),
                    conflict,
                });
            }
            let stats = phase2.get_statistics();
            let mut table = phase2.get_table();
            table.finish(&grammar_data);
            // LALR(1) lookaheads are already computed.
            return Ok((table, stats));
        } else if is_lalr && k > 1 {
            todo!("LALR(k)");
        }
        let has_conflicts = phase2
            .conflict_lists
            .iter()
            .any(|conflicts| !conflicts.is_empty());
        if !has_conflicts {
            let stats = phase2.get_statistics();
            let mut table = phase2.get_table();
            table.finish(&grammar_data);
            return Ok((table, stats));
        }
        // We will use LR(1) now.
        phase2.statistics.algorithm_needed = Algorithm::Lr(std::num::NonZeroU8::new(1).unwrap());
        let mut phase3 = ielr::Phase3::new(phase2);
        if let Err(error) = phase3.regenerate() {
            if k > 1 {
                todo!("LR(k)")
            } else {
                return Err(error);
            }
        };
        let stats = phase3.get_statistics();
        let mut phase4 = ielr::Phase4::new(phase3);
        phase4.compute_lookaheads();
        let mut table: Table = phase4.lr1_table.into();
        table.finish(&grammar_data);
        Ok((table, stats))
    }

    inner(algorithm, grammar, start_nodes.into_iter().collect())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::input::Grammar;
    use std::num::NonZeroU8;

    #[test]
    fn empty_grammar_starting_nodes() {
        let grammar = Grammar::new();
        let algorithm = Algorithm::Lr(NonZeroU8::new(1).unwrap());
        let (table, _) = compute_table(algorithm, &grammar, []).unwrap();
        assert_eq!(table.states.len(), 0);
        assert_eq!(table.starting_states.len(), 0);
    }

    #[test]
    #[should_panic]
    fn lr_k_unimplemented() {
        let mut grammar = Grammar::new();
        grammar.add_production(Node(0), vec![]).unwrap();
        grammar.add_production(Node(0), vec![]).unwrap();
        let algorithm = Algorithm::Lr(NonZeroU8::new(2).unwrap());
        let _ = compute_table(algorithm, &grammar, [Node(0)]);
    }

    #[test]
    #[should_panic]
    fn lalr_k_unimplemented() {
        let mut grammar = Grammar::new();
        grammar.add_production(Node(0), vec![]).unwrap();
        grammar.add_production(Node(0), vec![]).unwrap();
        let algorithm = Algorithm::Lalr(NonZeroU8::new(2).unwrap());
        let _ = compute_table(algorithm, &grammar, [Node(0)]);
    }

    #[test]
    fn expression_conflict_solution() {
        use crate::{
            input::{
                ConflictSolution, ConflictingAction,
                Symbol::{Node as N, Token as T},
            },
            output::Lookahead,
            test_common::{
                nodes::{E, START},
                tokens::{INT, PLUS},
            },
        };

        let mut grammar = Grammar::new();
        for (lhs, rhs) in [(START, vec![N(E)]), (E, vec![T(INT)])] {
            grammar.add_production(lhs, rhs).unwrap();
        }
        let addition_production = grammar
            .add_production(E, vec![N(E), T(PLUS), N(E)])
            .unwrap();

        compute_table(Algorithm::Lr(NonZeroU8::new(1).unwrap()), &grammar, [START]).unwrap_err();

        grammar.add_conflict_solution(ConflictSolution {
            prefer: ConflictingAction::Reduce(addition_production),
            over: ConflictingAction::Shift(Lookahead::Token(PLUS)),
        });
        compute_table(Algorithm::Lr(NonZeroU8::new(1).unwrap()), &grammar, [START]).unwrap();
    }
}