1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
/*
 * Copyright 2022 Arnaud Golfouse
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

#![doc = include_str!("../README.md")]
#![deny(unsafe_code, clippy::wildcard_dependencies, clippy::mod_module_files)]
#![warn(
    // Check that all public items are documented.
    missing_docs,
    // Some Rc are used in this crate, so clearer cloning helps somewhat.
    clippy::clone_on_ref_ptr,
    // There is some amount of indexing in this crate, so a better error message is preferable.
    clippy::get_unwrap,
)]

// Notes about this implementation:
//
// - In the original paper, wording is a bit different. In particular, we use 'node' for
// 'non terminal', 'token' for 'terminal', and 'lookahead' for 'extended terminal'.
// - This implementation does _not_ use a grammar augmented with a `S' → S` production.

#[cfg(any(doc, test))]
pub mod examples;
mod indices;
pub mod input;
pub mod output;

mod grammar_data;
mod ielr;
mod lr;
mod structures;
#[cfg(test)]
mod test_common;

use self::{
    grammar_data::GrammarData,
    indices::{GotoIdx, ItemIdx, StateIdx},
    input::Node,
    output::Table,
};
use std::rc::Rc;

/// What algorithm to use.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Algorithm {
    /// Use the LALR(k) algorithm.
    ///
    /// Conflict resolution may not be used with LALR.
    Lalr(std::num::NonZeroU8),
    /// Use the LR(k) algorithm.
    Lr(std::num::NonZeroU8),
}

impl Default for Algorithm {
    fn default() -> Self {
        Self::Lalr(std::num::NonZeroU8::new(1).unwrap())
    }
}

impl PartialOrd for Algorithm {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        match (self, other) {
            (Algorithm::Lalr(k1), Algorithm::Lalr(k2)) | (Algorithm::Lr(k1), Algorithm::Lr(k2)) => {
                k1.partial_cmp(k2)
            }
            (Algorithm::Lalr(k1), Algorithm::Lr(k2)) if k1 < k2 => Some(std::cmp::Ordering::Less),
            (Algorithm::Lr(k1), Algorithm::Lalr(k2)) if k1 > k2 => {
                Some(std::cmp::Ordering::Greater)
            }
            _ => None,
        }
    }
}

/// Main function of this crate.
///
/// This computes the LR table for a given grammar.
/// # Parameters
/// - `algorithm`: which [`Algorithm`] to use to build the LR table.
///   
///   The actual computation may use a less powerful algorithm: in this case, you
/// need to check [`algorithm_needed`] field of [`Statistics`].
/// - `max_states`: the maximum number of states the algorithm is allowed to compute.
///
///   This is to prevent the algorithm from taking too many time. Most real-life
/// programming language generate no more than 5000 states. Anything higher than
/// `1_000_000` will certainly take way too many time and memory in practice.
/// - `grammar`: the specification of the grammar used. See [`input`] to build it.
/// - `start_nodes`: which nodes may be used to start parsing.
///
/// # Panics
/// At the moment, any algorithm different from [`Algorithm::Lalr(1)`] or
/// [`Algorithm::Lr(1)`] will result in a panic.
///
/// [`algorithm_needed`]: output::Statistics::algorithm_needed
/// [`Statistics`]: output::Statistics
/// [`Algorithm::Lr(1)`]: Algorithm::Lr
/// [`Algorithm::Lalr(1)`]: Algorithm::Lalr
pub fn compute_table(
    algorithm: Algorithm,
    grammar: &input::Grammar,
    start_nodes: impl IntoIterator<Item = Node>,
) -> Result<(output::Table, output::Statistics), output::Error> {
    // Reduce monomorphisation
    fn inner(
        algorithm: Algorithm,
        grammar: &input::Grammar,
        mut start_nodes: Vec<Node>,
    ) -> Result<(output::Table, output::Statistics), output::Error> {
        // deduplicate nodes.
        let start_nodes = {
            start_nodes.sort_unstable();
            start_nodes.dedup();
            start_nodes
        };
        let (is_lalr, k) = match algorithm {
            Algorithm::Lalr(k) => (true, k.get()),
            Algorithm::Lr(k) => (false, k.get()),
        };

        let grammar_data = GrammarData::new(grammar)?;
        let mut lalr_table =
            lr::LRTable::new_lr0(&grammar_data, start_nodes.iter().copied().collect())?;
        let phase0 = ielr::Phase0::new(&grammar_data, &mut lalr_table);
        let phase1 = ielr::Phase1::new(phase0);
        let mut phase2 = ielr::Phase2::new(phase1);
        if is_lalr && k == 1 {
            let mut conflicts = Vec::new();
            for state_conflicts in &phase2.conflict_lists {
                for conflict in state_conflicts {
                    conflicts.push(Rc::clone(conflict));
                }
            }
            if !conflicts.is_empty() {
                let conflict = Rc::clone(&conflicts[0]);
                let (state, conflict) = (
                    conflict.s,
                    output::error::Conflict {
                        lookahead: conflict.lookahead,
                        contributions: conflict.contributions.clone(),
                    },
                );
                let lalr_tables = phase2.get_table();
                return Err(output::Error::Conflict {
                    lalr_tables,
                    state: state.into(),
                    conflict,
                });
            }
            let stats = phase2.get_statistics();
            let mut table = phase2.get_table();
            table.finish(&grammar_data);
            // LALR(1) lookaheads are already computed.
            return Ok((table, stats));
        } else if is_lalr && k > 1 {
            todo!("LALR(k)");
        }
        let has_conflicts = phase2
            .conflict_lists
            .iter()
            .any(|conflicts| !conflicts.is_empty());
        if !has_conflicts {
            let stats = phase2.get_statistics();
            let mut table = phase2.get_table();
            table.finish(&grammar_data);
            return Ok((table, stats));
        }
        // We will use LR(1) now.
        phase2.statistics.algorithm_needed = Algorithm::Lr(std::num::NonZeroU8::new(1).unwrap());
        let mut phase3 = ielr::Phase3::new(phase2);
        if let Err(error) = phase3.regenerate() {
            if k > 1 {
                todo!("LR(k)")
            } else {
                return Err(error);
            }
        };
        let stats = phase3.get_statistics();
        let mut phase4 = ielr::Phase4::new(phase3);
        phase4.compute_lookaheads();
        let mut table: Table = phase4.lr1_table.into();
        table.finish(&grammar_data);
        Ok((table, stats))
    }

    inner(algorithm, grammar, start_nodes.into_iter().collect())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::input::Grammar;
    use std::num::NonZeroU8;

    #[test]
    fn empty_grammar_starting_nodes() {
        let grammar = Grammar::new();
        let algorithm = Algorithm::Lr(NonZeroU8::new(1).unwrap());
        let (table, _) = compute_table(algorithm, &grammar, []).unwrap();
        assert_eq!(table.states.len(), 0);
        assert_eq!(table.starting_states.len(), 0);
    }

    #[test]
    #[should_panic]
    fn lr_k_unimplemented() {
        let mut grammar = Grammar::new();
        grammar.add_production(Node(0), vec![]).unwrap();
        grammar.add_production(Node(0), vec![]).unwrap();
        let algorithm = Algorithm::Lr(NonZeroU8::new(2).unwrap());
        let _ = compute_table(algorithm, &grammar, [Node(0)]);
    }

    #[test]
    #[should_panic]
    fn lalr_k_unimplemented() {
        let mut grammar = Grammar::new();
        grammar.add_production(Node(0), vec![]).unwrap();
        grammar.add_production(Node(0), vec![]).unwrap();
        let algorithm = Algorithm::Lalr(NonZeroU8::new(2).unwrap());
        let _ = compute_table(algorithm, &grammar, [Node(0)]);
    }

    #[test]
    fn expression_conflict_solution() {
        use crate::{
            input::{
                ConflictSolution, ConflictingAction,
                Symbol::{Node as N, Token as T},
            },
            output::Lookahead,
            test_common::{
                nodes::{E, START},
                tokens::{INT, PLUS},
            },
        };

        let mut grammar = Grammar::new();
        for (lhs, rhs) in [(START, vec![N(E)]), (E, vec![T(INT)])] {
            grammar.add_production(lhs, rhs).unwrap();
        }
        let addition_production = grammar
            .add_production(E, vec![N(E), T(PLUS), N(E)])
            .unwrap();

        compute_table(Algorithm::Lr(NonZeroU8::new(1).unwrap()), &grammar, [START]).unwrap_err();

        grammar.add_conflict_solution(ConflictSolution {
            prefer: ConflictingAction::Reduce(addition_production),
            over: ConflictingAction::Shift(Lookahead::Token(PLUS)),
        });
        compute_table(Algorithm::Lr(NonZeroU8::new(1).unwrap()), &grammar, [START]).unwrap();
    }
}