bnf/earley/
mod.rs

1mod grammar;
2mod input_range;
3mod traversal;
4
5use crate::{tracing, ParseTree, ParseTreeNode, Term};
6use grammar::{ParseGrammar, Production};
7use input_range::InputRange;
8use std::collections::{BTreeSet, HashMap, HashSet, VecDeque};
9use traversal::{TermMatch, Traversal, TraversalId, TraversalTree};
10
11pub fn parse<'gram>(
12    grammar: &'gram crate::Grammar,
13    input: &'gram str,
14) -> impl Iterator<Item = ParseTree<'gram>> {
15    ParseTreeIter::new(grammar, input)
16}
17
18/// A queue of [`TraversalId`] for processing, with repetitions ignored.
19#[derive(Debug, Default)]
20struct TraversalQueue {
21    processed: HashSet<TraversalId>,
22    queue: VecDeque<TraversalId>,
23}
24
25impl TraversalQueue {
26    pub fn pop_front(&mut self) -> Option<TraversalId> {
27        self.queue.pop_front()
28    }
29    pub fn push_back(&mut self, id: TraversalId) {
30        if self.processed.insert(id) {
31            self.queue.push_back(id);
32        }
33    }
34    /// Add starting traversals to back of queue
35    pub fn push_back_starting<'gram>(
36        &mut self,
37        traversal_tree: &mut TraversalTree<'gram>,
38        grammar: &ParseGrammar<'gram>,
39        starting_term: &'gram Term,
40        input: &InputRange<'gram>,
41    ) {
42        for starting_prod in grammar.get_productions_by_lhs(starting_term) {
43            let traversal = traversal_tree.predict_starting(starting_prod, input);
44            self.push_back(traversal.id);
45        }
46    }
47}
48
49/// Create a [ParseTree] starting at the root [`TraversalId`].
50fn parse_tree<'gram>(
51    traversal_tree: &TraversalTree<'gram>,
52    grammar: &ParseGrammar<'gram>,
53    traversal_id: TraversalId,
54) -> ParseTree<'gram> {
55    let production = {
56        let traversal = traversal_tree.get(traversal_id);
57        grammar.get_production_by_id(traversal.production_id)
58    };
59    let rhs = traversal_tree
60        .get_matched(traversal_id)
61        .map(|term_match| match term_match {
62            TermMatch::Terminal(term) => ParseTreeNode::Terminal(term),
63            TermMatch::Nonterminal(traversal_id) => {
64                ParseTreeNode::Nonterminal(parse_tree(traversal_tree, grammar, *traversal_id))
65            }
66        })
67        .collect::<Vec<ParseTreeNode>>();
68
69    ParseTree::new(production.lhs, rhs)
70}
71
72/// Pops [Traversal] from the provided queue, and follows
73/// the core [Earley parsing](https://en.wikipedia.org/wiki/Earley_parser) algorithm.
74fn earley<'gram>(
75    queue: &mut TraversalQueue,
76    traversal_tree: &mut TraversalTree<'gram>,
77    completions: &mut CompletionMap<'gram>,
78    grammar: &ParseGrammar<'gram>,
79) -> Option<TraversalId> {
80    let _span = tracing::span!(tracing::Level::DEBUG, "earley").entered();
81    while let Some(traversal_id) = queue.pop_front() {
82        tracing::event!(
83            tracing::Level::TRACE,
84            "earley queue pop: {:#?}",
85            traversal_tree.get(traversal_id)
86        );
87
88        match traversal_tree.get_matching(traversal_id) {
89            Some(nonterminal @ Term::Nonterminal(_)) => {
90                let _span = tracing::span!(tracing::Level::DEBUG, "Predict").entered();
91
92                let traversal = traversal_tree.get(traversal_id);
93                let lhs = grammar.get_production_by_id(traversal.production_id).lhs;
94
95                completions.insert(traversal, lhs);
96
97                let input_range = traversal.input_range.clone();
98
99                for production in grammar.get_productions_by_lhs(nonterminal) {
100                    let predicted = traversal_tree.predict(production, &input_range);
101                    tracing::event!(tracing::Level::TRACE, "predicted: {predicted:#?}");
102                    queue.push_back(predicted.id);
103                }
104
105                for completed in completions.get_complete(nonterminal, &input_range) {
106                    let term_match = TermMatch::Nonterminal(completed);
107                    let prior_completed = traversal_tree.match_term(traversal_id, term_match);
108                    tracing::event!(
109                        tracing::Level::TRACE,
110                        "prior_completed: {prior_completed:#?}"
111                    );
112                    queue.push_back(prior_completed.id);
113                }
114            }
115            Some(Term::Terminal(term)) => {
116                let _span = tracing::span!(tracing::Level::DEBUG, "Scan").entered();
117                let traversal = traversal_tree.get(traversal_id);
118                if traversal.input_range.next().starts_with(term) {
119                    let term_match = TermMatch::Terminal(term);
120                    let scanned = traversal_tree.match_term(traversal_id, term_match);
121                    tracing::event!(tracing::Level::TRACE, "scanned: {scanned:#?}");
122                    queue.push_back(scanned.id);
123                }
124            }
125            None => {
126                let _span = tracing::span!(tracing::Level::DEBUG, "Complete").entered();
127
128                let traversal = traversal_tree.get(traversal_id);
129                let is_full_traversal =
130                    traversal.is_starting && traversal.input_range.is_complete();
131                let lhs = grammar.get_production_by_id(traversal.production_id).lhs;
132
133                completions.insert(traversal, lhs);
134
135                for incomplete_traversal_id in completions.get_incomplete(lhs, traversal) {
136                    let term_match = TermMatch::Nonterminal(traversal_id);
137                    let completed = traversal_tree.match_term(incomplete_traversal_id, term_match);
138
139                    tracing::event!(tracing::Level::TRACE, "completed: {completed:#?}");
140                    queue.push_back(completed.id);
141                }
142
143                if is_full_traversal {
144                    return Some(traversal_id);
145                }
146            }
147        }
148    }
149
150    None
151}
152
153#[derive(Debug)]
154struct ParseTreeIter<'gram> {
155    traversal_tree: TraversalTree<'gram>,
156    grammar: ParseGrammar<'gram>,
157    queue: TraversalQueue,
158    completions: CompletionMap<'gram>,
159}
160
161impl<'gram> ParseTreeIter<'gram> {
162    pub fn new(grammar: &'gram crate::Grammar, input: &'gram str) -> Self {
163        let starting_term = grammar
164            .starting_term()
165            .expect("Grammar must have one production to parse");
166
167        let grammar = ParseGrammar::new(grammar);
168
169        let input = InputRange::new(input);
170        let mut traversal_tree = TraversalTree::default();
171        let mut queue = TraversalQueue::default();
172        let completions = CompletionMap::default();
173
174        queue.push_back_starting(&mut traversal_tree, &grammar, starting_term, &input);
175
176        Self {
177            traversal_tree,
178            grammar,
179            queue,
180            completions,
181        }
182    }
183}
184
185impl<'gram> Iterator for ParseTreeIter<'gram> {
186    type Item = ParseTree<'gram>;
187    fn next(&mut self) -> Option<Self::Item> {
188        let Self {
189            queue,
190            completions,
191            grammar,
192            traversal_tree,
193        } = self;
194
195        earley(queue, traversal_tree, completions, grammar).map(|traversal_id| {
196            let _span = tracing::span!(tracing::Level::DEBUG, "next_parse_tree").entered();
197            let parse_tree = parse_tree(traversal_tree, grammar, traversal_id);
198            tracing::event!(tracing::Level::TRACE, "\n{parse_tree}");
199            parse_tree
200        })
201    }
202}
203/// Key used for "incomplete" [`Traversal`] in [CompletionMap]
204#[derive(Debug, PartialEq, Eq, Hash)]
205pub(crate) struct CompletionKey<'gram> {
206    term: &'gram Term,
207    input_start: usize,
208}
209
210impl<'gram> CompletionKey<'gram> {
211    pub fn new_start(term: &'gram Term, input: &InputRange<'gram>) -> Self {
212        Self {
213            term,
214            input_start: input.offset.start,
215        }
216    }
217    pub fn new_total(term: &'gram Term, input: &InputRange<'gram>) -> Self {
218        Self {
219            term,
220            input_start: input.offset.total_len(),
221        }
222    }
223}
224
225#[derive(Debug, Default)]
226pub(crate) struct CompletionMap<'gram> {
227    incomplete: HashMap<CompletionKey<'gram>, BTreeSet<TraversalId>>,
228    complete: HashMap<CompletionKey<'gram>, BTreeSet<TraversalId>>,
229}
230
231impl<'gram> CompletionMap<'gram> {
232    pub fn get_incomplete(
233        &'_ self,
234        term: &'gram Term,
235        complete_traversal: &Traversal<'gram>,
236    ) -> impl Iterator<Item = TraversalId> + '_ {
237        let _span = tracing::span!(tracing::Level::DEBUG, "get_incomplete").entered();
238        let key = CompletionKey::new_start(term, &complete_traversal.input_range);
239        self.incomplete.get(&key).into_iter().flatten().cloned()
240    }
241    pub fn get_complete(
242        &'_ self,
243        term: &'gram Term,
244        input_range: &InputRange<'gram>,
245    ) -> impl Iterator<Item = TraversalId> + '_ {
246        let _span = tracing::span!(tracing::Level::DEBUG, "get_complete").entered();
247        let key = CompletionKey::new_total(term, input_range);
248        self.complete.get(&key).into_iter().flatten().cloned()
249    }
250    pub fn insert(&mut self, traversal: &Traversal<'gram>, lhs: &'gram Term) {
251        let _span = tracing::span!(tracing::Level::DEBUG, "insert").entered();
252        match traversal.next_unmatched() {
253            Some(Term::Terminal(_)) => {
254                // do nothing, because terminals are irrelevant to completion
255            }
256            Some(unmatched @ Term::Nonterminal(_)) => {
257                let key = CompletionKey::new_total(unmatched, &traversal.input_range);
258                self.incomplete.entry(key).or_default().insert(traversal.id);
259            }
260            None => {
261                let key = CompletionKey::new_start(lhs, &traversal.input_range);
262                self.complete.entry(key).or_default().insert(traversal.id);
263            }
264        }
265    }
266}
267
268#[cfg(test)]
269mod tests {
270    use super::*;
271    use crate::Grammar;
272    use quickcheck::{Arbitrary, Gen, QuickCheck, TestResult};
273
274    #[derive(Debug, Clone)]
275    struct NestedEmptyGrammar(Grammar);
276    impl Arbitrary for NestedEmptyGrammar {
277        fn arbitrary(g: &mut Gen) -> Self {
278            let mut grammar: Grammar = "
279            <start> ::= <a> <empty>
280            <a> ::= 'a' <empty>"
281                .parse()
282                .unwrap();
283
284            let mut expressions: Vec<_> = grammar
285                .productions_iter_mut()
286                .flat_map(|prod| prod.rhs_iter_mut())
287                .collect();
288
289            let expr_indexes: Vec<usize> = (0..expressions.len()).collect();
290            let expr_choice_index = g.choose(&expr_indexes).unwrap();
291            let expr_choice: &mut crate::Expression = expressions[*expr_choice_index];
292
293            let term_choice_indexes: Vec<usize> = (0..expr_choice.terms.len()).collect();
294            let term_choice_index = g.choose(&term_choice_indexes).unwrap();
295
296            expr_choice
297                .terms
298                .insert(*term_choice_index, Term::Nonterminal(String::from("empty")));
299
300            grammar.add_production("<empty> ::= ''".parse().unwrap());
301
302            Self(grammar)
303        }
304    }
305
306    fn prop_empty_rules_allow_parse(grammar: NestedEmptyGrammar) -> TestResult {
307        let input = "a";
308
309        let mut parses = parse(&grammar.0, input);
310        TestResult::from_bool(parses.next().is_some())
311    }
312
313    #[test]
314    fn empty_rules_allow_parse() {
315        QuickCheck::new()
316            .tests(1000)
317            .quickcheck(prop_empty_rules_allow_parse as fn(NestedEmptyGrammar) -> TestResult)
318    }
319}