// lrtable/pager.rs

1use std::{
2    collections::{HashSet, hash_map::HashMap},
3    hash::Hash,
4};
5
6use cfgrammar::{SIdx, Symbol, yacc::YaccGrammar};
7use num_traits::{AsPrimitive, PrimInt, Unsigned};
8use vob::Vob;
9
10use crate::{StIdx, itemset::Itemset, stategraph::StateGraph};
11
12// This file creates stategraphs from grammars. Unfortunately there is no perfect guide to how to
13// do this that I know of -- certainly not one that talks about sensible ways to arrange data and
14// low-level optimisations. That said, the general algorithms that form the basis of what's used in
15// this file can be found in:
16//
17//   A Practical General Method for Constructing LR(k) Parsers
18//     David Pager, Acta Informatica 7, 249--268, 1977
19//
20// However Pager's paper is dense, and doesn't name sub-parts of the algorithm. We mostly reference
21// the (still incomplete, but less incomplete) version of the algorithm found in:
22//
23//   Measuring and extending LR(1) parser generation
24//     Xin Chen, PhD thesis, University of Hawaii, 2009
25
impl<StorageT: Hash + PrimInt + Unsigned> Itemset<StorageT> {
    /// Return true if `other` is weakly compatible with `self`.
    ///
    /// `self` and `other` must be *core* itemsets (i.e. contain only core configurations);
    /// weak compatibility in Pager's sense means the two states can be merged without
    /// introducing conflicts that neither state had on its own.
    fn weakly_compatible(&self, other: &Self) -> bool {
        // The weakly compatible check is one of the three core parts of Pager's algorithm
        // (along with merging and change propagation). In essence, there are three conditions
        // which, if satisfied, guarantee that `other` is weakly compatible with `self`
        // (p255 of Pager's paper, and p50 of Chen's dissertation).
        //
        // Since neither Pager nor Chen talk about data-structures, they don't provide an algorithm
        // for sensibly checking these three conditions. The approach in this function is 1) try
        // and fail at the earliest point that we notice that the two itemsets can't be weakly
        // compatible 2) perform the checks of all three conditions in one go.

        // The two itemsets must have the same number of core configurations. Since our itemsets
        // only store core configurations, two itemsets with different numbers of items can't
        // possibly be weakly compatible.
        let len = self.items.len();
        if len != other.items.len() {
            return false;
        }

        // Check that each itemset has the same core configuration.
        // (Since the lengths match, checking that every key of `self` occurs in `other` is
        // sufficient to prove the key sets are identical.)
        for &(pidx, dot) in self.items.keys() {
            if !other.items.contains_key(&(pidx, dot)) {
                return false;
            }
        }

        // If there's only one core configuration to deal with -- which happens surprisingly often
        // -- then the for loop below will always return true, so we save ourselves allocating
        // memory and bail out early.
        if len == 1 {
            return true;
        }

        // Pager's conditions rely on itemsets being ordered. However, ours are stored as hashmaps:
        // iterating over self.items and other.items will not generally give the same order of
        // configurations.
        //
        // The most practical thing we can do is to convert one of the itemsets's keys into a list
        // and use that as a stable ordering mechanism. This uses more hash lookups than we might
        // like, but we're helped by the fact that most itemsets are smallish in number.
        let keys: Vec<_> = self.items.keys().collect();
        // Examine every unordered pair (i, j) of core configurations, i < j.
        for (i, i_key) in keys.iter().enumerate().take(len - 1) {
            for j_key in keys.iter().take(len).skip(i + 1) {
                // Condition 1 in the Pager paper
                // (if the cross lookahead contexts don't intersect, this pair can't cause a
                // conflict after merging, so it is trivially fine).
                if !(vob_intersect(&self.items[*i_key], &other.items[*j_key])
                    || vob_intersect(&self.items[*j_key], &other.items[*i_key]))
                {
                    continue;
                }
                // Conditions 2 and 3 in the Pager paper
                // (a cross intersection is still acceptable if the same pair already
                // intersects within `self` or within `other`).
                if vob_intersect(&self.items[*i_key], &self.items[*j_key])
                    || vob_intersect(&other.items[*i_key], &other.items[*j_key])
                {
                    continue;
                }
                return false;
            }
        }

        true
    }

    /// Merge `other` into `self`, returning `true` if this led to any changes. If `other` is not
    /// weakly compatible with `self`, this function's effects and return value are undefined.
    fn weakly_merge(&mut self, other: &Self) -> bool {
        let mut changed = false;
        // Both itemsets have identical core configurations (guaranteed by the weak
        // compatibility precondition), so indexing `other.items` by our own keys is safe.
        // `Vob::or` ors `other`'s lookahead context into ours; the usage here relies on it
        // returning whether any bit in `ctx` changed.
        for (&(pidx, dot), ctx) in &mut self.items {
            if ctx.or(&other.items[&(pidx, dot)]) {
                changed = true;
            }
        }
        changed
    }
}
102
103/// Returns true if two identically sized bitvecs intersect.
104fn vob_intersect(v1: &Vob, v2: &Vob) -> bool {
105    // Iterating over integer sized blocks allows us to do this operation very quickly. Note that
106    // the Vob implementation guarantees that the last block's unused bits will be zeroed out.
107    for (b1, b2) in v1.iter_storage().zip(v2.iter_storage()) {
108        if b1 & b2 != 0 {
109            return true;
110        }
111    }
112    false
113}
114
/// Create a `StateGraph` from 'grm'.
///
/// Builds the stategraph via Pager's weakly-compatible state merging, garbage collects any
/// unreachable states, and only then converts state indexes from `usize` to `StorageT`.
///
/// # Panics
///
/// Panics if the number of states (post-GC) does not fit in `StorageT`.
pub(crate) fn pager_stategraph<StorageT: 'static + Hash + PrimInt + Unsigned>(
    grm: &YaccGrammar<StorageT>,
) -> StateGraph<StorageT>
where
    usize: AsPrimitive<StorageT>,
{
    // This function can be seen as a modified version of items() from Chen's dissertation.

    let firsts = grm.firsts();
    // closed_states and core_states are both equally sized vectors of states. Core states are
    // smaller, and used for the weakly compatible checks, but we ultimately need to return
    // closed states. Closed states which are None are those which require processing; thus
    // closed_states also implicitly serves as a todo list.
    let mut closed_states = Vec::new();
    let mut core_states = Vec::new();
    let mut edges: Vec<HashMap<Symbol<StorageT>, StIdx<usize>>> = Vec::new();

    // Because we GC states later, it's possible that we will end up with more states before GC
    // than `StorageT` can hold. We thus do all our calculations in this function in terms of
    // `usize`s before converting them to `StorageT` later.
    let start_state = StIdx(0usize);
    // Seed state 0 with the start production, dot at position 0, and a lookahead context
    // containing only the EOF token.
    let mut state0 = Itemset::new(grm);
    let mut ctx = Vob::from_elem(false, usize::from(grm.tokens_len()));
    ctx.set(usize::from(grm.eof_token_idx()), true);
    state0.add(grm.start_prod(), SIdx(StorageT::zero()), &ctx);
    closed_states.push(None);
    core_states.push(state0);
    edges.push(HashMap::new());

    // We maintain two lists of which rules and tokens we've seen; when processing a given
    // state there's no point processing a rule or token more than once.
    let mut seen_rules = Vob::from_elem(false, usize::from(grm.rules_len()));
    let mut seen_tokens = Vob::from_elem(false, usize::from(grm.tokens_len()));
    // new_states is used to separate out iterating over states vs. mutating it
    let mut new_states = Vec::new();
    // cnd_[rule|token]_weaklies represent which states are possible weakly compatible
    // matches for a given symbol.
    // NOTE(review): cnd_token_weaklies is sized tokens_len() + 1 whereas cnd_rule_weaklies is
    // sized exactly rules_len(); presumably the extra slot is headroom related to the EOF
    // token -- confirm against cfgrammar's token indexing.
    let mut cnd_rule_weaklies: Vec<Vec<StIdx<usize>>> =
        vec![Vec::new(); usize::from(grm.rules_len())];
    let mut cnd_token_weaklies: Vec<Vec<StIdx<usize>>> =
        vec![Vec::new(); usize::from(grm.tokens_len()).checked_add(1).unwrap()];

    let mut todo = 1; // How many None values are there in closed_states?
    let mut todo_off = 0; // Offset in closed states to start searching for the next todo.
    while todo > 0 {
        debug_assert_eq!(core_states.len(), closed_states.len());
        // state_i is the next item to process. We don't want to continually search for the
        // next None from the beginning, so we remember where we last saw a None (todo_off) and
        // search from that point onwards, wrapping as necessary. Since processing a state x
        // disproportionately causes state x + 1 to require processing, this prevents the
        // search from becoming horribly non-linear.
        let state_i = match closed_states
            .iter()
            .skip(todo_off)
            .position(Option::is_none)
        {
            Some(i) => todo_off + i,
            None => closed_states.iter().position(Option::is_none).unwrap(),
        };
        todo_off = state_i + 1;
        todo -= 1;

        {
            // Close the state, then compute, for every symbol which appears after a dot in
            // any of its items, the goto state reachable over that symbol.
            closed_states[state_i] = Some(core_states[state_i].close(grm, &firsts));
            let cl_state = &closed_states[state_i].as_ref().unwrap();
            seen_rules.set_all(false);
            seen_tokens.set_all(false);
            for &(pidx, dot) in cl_state.items.keys() {
                let prod = grm.prod(pidx);
                // Items with the dot at the end of the production have no successor symbol.
                if dot == grm.prod_len(pidx) {
                    continue;
                }
                let sym = prod[usize::from(dot)];
                // Skip symbols we've already computed a goto for in this state.
                match sym {
                    Symbol::Rule(s_ridx) => {
                        if seen_rules[usize::from(s_ridx)] {
                            continue;
                        }
                        seen_rules.set(usize::from(s_ridx), true);
                    }
                    Symbol::Token(s_tidx) => {
                        if seen_tokens[usize::from(s_tidx)] {
                            continue;
                        }
                        seen_tokens.set(usize::from(s_tidx), true);
                    }
                }
                let nstate = cl_state.goto(grm, &sym);
                new_states.push((sym, nstate));
            }
        }

        'a: for (sym, nstate) in new_states.drain(..) {
            let mut m = None;
            {
                // Try and compatible match for this state.
                let cnd_states = match sym {
                    Symbol::Rule(s_ridx) => &cnd_rule_weaklies[usize::from(s_ridx)],
                    Symbol::Token(s_tidx) => &cnd_token_weaklies[usize::from(s_tidx)],
                };
                // First of all see if any of the candidate states are exactly the same as the
                // new state, in which case we only need to add an edge to the candidate
                // state. This isn't just an optimisation (though it does avoid the expense
                // of change propagation), but has a correctness aspect: there's no guarantee
                // that the weakly compatible check is reflexive (i.e. a state may not be
                // weakly compatible with itself).
                for cnd in cnd_states.iter().cloned() {
                    if core_states[usize::from(cnd)] == nstate {
                        edges[state_i].insert(sym, cnd);
                        continue 'a;
                    }
                }
                // No candidate states were equal to the new state, so we need to look for a
                // candidate state which is weakly compatible.
                for cnd in cnd_states.iter().cloned() {
                    if core_states[usize::from(cnd)].weakly_compatible(&nstate) {
                        m = Some(cnd);
                        break;
                    }
                }
            }
            match m {
                Some(k) => {
                    // A weakly compatible match has been found.
                    edges[state_i].insert(sym, k);
                    if core_states[usize::from(k)].weakly_merge(&nstate) {
                        // We only do the simplest change propagation, forcing possibly
                        // affected sets to be entirely reprocessed (which will recursively
                        // force propagation too). Even though this does unnecessary
                        // computation, it is still pretty fast.
                        //
                        // Note also that edges[k] will be completely regenerated, overwriting
                        // all existing entries and possibly adding new ones. We thus don't
                        // need to clear it manually.
                        if closed_states[usize::from(k)].is_some() {
                            closed_states[usize::from(k)] = None;
                            todo += 1;
                        }
                    }
                }
                None => {
                    // No match at all: `nstate` becomes a brand new state.
                    // Check that StorageT is big enough to hold RIdx/PIdx/SIdx/TIdx values; after these
                    // checks we can guarantee that things like RIdx(ast.rules.len().as_()) are safe.
                    if core_states.len() >= num_traits::cast(StorageT::max_value()).unwrap() {
                        panic!("StorageT is not big enough to store this stategraph.");
                    }
                    // The condition above guarantees that core_states.len() will fit in a StorageT
                    // and thus the implicit StorageT -> usize cast therein is safe.
                    let stidx = StIdx(core_states.len());
                    // Register the new state as a future merge candidate for `sym`.
                    match sym {
                        Symbol::Rule(s_ridx) => {
                            cnd_rule_weaklies[usize::from(s_ridx)].push(stidx);
                        }
                        Symbol::Token(s_tidx) => {
                            cnd_token_weaklies[usize::from(s_tidx)].push(stidx);
                        }
                    }
                    edges[state_i].insert(sym, stidx);
                    edges.push(HashMap::new());
                    closed_states.push(None);
                    core_states.push(nstate);
                    todo += 1;
                }
            }
        }
    }

    // Although the Pager paper doesn't talk about it, the algorithm above can create
    // unreachable states due to the non-determinism inherent in working with hashsets. Indeed,
    // this can even happen with the example from Pager's paper (on perhaps 1 out of
    // 100 runs, 24 or 25 states will be created instead of 23). We thus need to weed out
    // unreachable states and update edges accordingly.
    debug_assert_eq!(core_states.len(), closed_states.len());
    let (gc_states, gc_edges) = gc(
        core_states
            .drain(..)
            .zip(closed_states.drain(..).map(Option::unwrap))
            .collect(),
        start_state,
        edges,
    );

    // Check that StorageT is big enough to hold RIdx/PIdx/SIdx/TIdx values; after these
    // checks we can guarantee that things like RIdx(ast.rules.len().as_()) are safe.
    if gc_states.len() > num_traits::cast(StorageT::max_value()).unwrap() {
        panic!("StorageT is not big enough to store this stategraph.");
    }

    // Convert every state index from the `usize` working representation to `StorageT`,
    // which the check above has just shown to be safe.
    let start_state = StIdx::<StorageT>(start_state.as_storaget().as_());
    let mut gc_edges_storaget = Vec::with_capacity(gc_edges.len());
    for x in gc_edges {
        let mut m = HashMap::with_capacity(x.len());
        for (k, v) in x {
            m.insert(k, StIdx::<StorageT>(v.as_storaget().as_()));
        }
        gc_edges_storaget.push(m);
    }

    StateGraph::new(gc_states, start_state, gc_edges_storaget)
}
316
317/// Garbage collect `zip_states` (of `(core_states, closed_state)`) and `edges`. Returns a new pair
318/// with unused states and their corresponding edges removed.
319fn gc<StorageT: 'static + Eq + Hash + PrimInt + Unsigned>(
320    mut states: Vec<(Itemset<StorageT>, Itemset<StorageT>)>,
321    start_state: StIdx<usize>,
322    mut edges: Vec<HashMap<Symbol<StorageT>, StIdx<usize>>>,
323) -> (
324    Vec<(Itemset<StorageT>, Itemset<StorageT>)>,
325    Vec<HashMap<Symbol<StorageT>, StIdx<usize>>>,
326)
327where
328    usize: AsPrimitive<StorageT>,
329{
330    // First of all, do a simple pass over all states. All state indexes reachable from the
331    // start state will be inserted into the 'seen' set.
332    let mut todo = HashSet::new();
333    todo.insert(start_state);
334    let mut seen = HashSet::new();
335    while !todo.is_empty() {
336        // XXX This is the clumsy way we're forced to do what we'd prefer to be:
337        //     "let &(pidx, dot) = todo.pop()"
338        let state_i = *todo.iter().next().unwrap();
339        todo.remove(&state_i);
340        seen.insert(state_i);
341
342        todo.extend(
343            edges[usize::from(state_i)]
344                .values()
345                .filter(|x| !seen.contains(x)),
346        );
347    }
348
349    if states.len() == seen.len() {
350        // Nothing to garbage collect.
351        return (states, edges);
352    }
353
354    // Imagine we started with 3 states and their edges:
355    //   states: [0, 1, 2]
356    //   edges : [[_ => 2]]
357    //
358    // At this point, 'seen' will be the set {0, 2}. What we need to do is to create a new list
359    // of states that doesn't have state 1 in it. That will cause state 2 to become to state 1,
360    // meaning that we need to adjust edges so that the pointer to state 2 is updated to state
361    // 1. In other words we want to achieve this output:
362    //   states: [0, 2]
363    //   edges : [_ => 1]
364    //
365    // The way we do this is to first iterate over all states, working out what the mapping
366    // from seen states to their new offsets is.
367    let mut gc_states = Vec::with_capacity(seen.len());
368    let mut offsets = Vec::with_capacity(states.len());
369    let mut offset = 0;
370    for (state_i, zstate) in states.drain(..).enumerate() {
371        offsets.push(StIdx(state_i - offset));
372        if !seen.contains(&StIdx(state_i)) {
373            offset += 1;
374            continue;
375        }
376        gc_states.push(zstate);
377    }
378
379    // At this point the offsets list will be [0, 1, 1]. We now create new edges where each
380    // offset is corrected by looking it up in the offsets list.
381    let mut gc_edges = Vec::with_capacity(seen.len());
382    for (st_edge_i, st_edges) in edges.drain(..).enumerate() {
383        if !seen.contains(&StIdx(st_edge_i)) {
384            continue;
385        }
386        gc_edges.push(
387            st_edges
388                .iter()
389                .map(|(&k, &v)| (k, offsets[usize::from(v)]))
390                .collect(),
391        );
392    }
393
394    (gc_states, gc_edges)
395}
396
#[cfg(test)]
mod test {
    use vob::Vob;

    use crate::{StIdx, pager::pager_stategraph, stategraph::state_exists};
    use cfgrammar::{
        SIdx, Symbol,
        yacc::{YaccGrammar, YaccKind, YaccOriginalActionKind},
    };

    use super::vob_intersect;

    #[test]
    fn test_vob_intersect() {
        let mut b1 = Vob::from_elem(false, 8);
        let mut b2 = Vob::from_elem(false, 8);
        assert!(!vob_intersect(&b1, &b2));
        // Check that partial blocks (i.e. when only part of a word is used in the bitvec for
        // storage) maintain the expected guarantees.
        b1.push(false);
        b2.push(false);
        assert!(!vob_intersect(&b1, &b2));
        b1.push(true);
        b2.push(true);
        assert!(vob_intersect(&b1, &b2));

        // Check the case where the intersecting bit is the first bit of a second storage
        // block (assuming 64-bit storage words).
        b1 = Vob::from_elem(false, 64);
        b2 = Vob::from_elem(false, 64);
        b1.push(true);
        b2.push(true);
        for _ in 0..63 {
            b1.push(false);
            b2.push(false);
        }
        assert!(vob_intersect(&b1, &b2));
    }

    // GrammarAST from 'LR(k) Analyse fuer Pragmatiker'
    // Z : S
    // S : Sb
    //     bAa
    // A : aSc
    //     a
    //     aSb
    fn grammar3() -> YaccGrammar {
        YaccGrammar::new(
            YaccKind::Original(YaccOriginalActionKind::GenericParseTree),
            "
          %start S
          %token a b c d
          %%
          S: S 'b' | 'b' A 'a';
          A: 'a' S 'c' | 'a' | 'a' S 'b';
          ",
        )
        .unwrap()
    }

    #[test]
    #[rustfmt::skip]
    fn test_stategraph() {
        let grm = grammar3();
        let sg = pager_stategraph(&grm);

        assert_eq!(sg.all_states_len(), StIdx(10));
        assert_eq!(sg.all_edges_len(), 10);

        assert_eq!(sg.closed_state(sg.start_state()).items.len(), 3);
        state_exists(&grm, sg.closed_state(sg.start_state()), "^", 0, SIdx(0), vec!["$"]);
        state_exists(&grm, sg.closed_state(sg.start_state()), "S", 0, SIdx(0), vec!["$", "b"]);
        state_exists(&grm, sg.closed_state(sg.start_state()), "S", 1, SIdx(0), vec!["$", "b"]);

        let s1 = sg.edge(sg.start_state(), Symbol::Rule(grm.rule_idx("S").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s1).items.len(), 2);
        state_exists(&grm, sg.closed_state(s1), "^", 0, SIdx(1), vec!["$"]);
        state_exists(&grm, sg.closed_state(s1), "S", 0, SIdx(1), vec!["$", "b"]);

        let s2 = sg.edge(s1, Symbol::Token(grm.token_idx("b").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s2).items.len(), 1);
        state_exists(&grm, sg.closed_state(s2), "S", 0, SIdx(2), vec!["$", "b"]);

        let s3 = sg.edge(sg.start_state(), Symbol::Token(grm.token_idx("b").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s3).items.len(), 4);
        state_exists(&grm, sg.closed_state(s3), "S", 1, SIdx(1), vec!["$", "b", "c"]);
        state_exists(&grm, sg.closed_state(s3), "A", 0, SIdx(0), vec!["a"]);
        state_exists(&grm, sg.closed_state(s3), "A", 1, SIdx(0), vec!["a"]);
        state_exists(&grm, sg.closed_state(s3), "A", 2, SIdx(0), vec!["a"]);

        let s4 = sg.edge(s3, Symbol::Rule(grm.rule_idx("A").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s4).items.len(), 1);
        state_exists(&grm, sg.closed_state(s4), "S", 1, SIdx(2), vec!["$", "b", "c"]);

        let s5 = sg.edge(s4, Symbol::Token(grm.token_idx("a").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s5).items.len(), 1);
        state_exists(&grm, sg.closed_state(s5), "S", 1, SIdx(3), vec!["$", "b", "c"]);

        let s6 = sg.edge(s3, Symbol::Token(grm.token_idx("a").unwrap())).unwrap();
        // result from merging 10 into 3
        assert_eq!(s3, sg.edge(s6, Symbol::Token(grm.token_idx("b").unwrap())).unwrap());
        assert_eq!(sg.closed_state(s6).items.len(), 5);
        state_exists(&grm, sg.closed_state(s6), "A", 0, SIdx(1), vec!["a"]);
        state_exists(&grm, sg.closed_state(s6), "A", 1, SIdx(1), vec!["a"]);
        state_exists(&grm, sg.closed_state(s6), "A", 2, SIdx(1), vec!["a"]);
        state_exists(&grm, sg.closed_state(s6), "S", 0, SIdx(0), vec!["b", "c"]);
        state_exists(&grm, sg.closed_state(s6), "S", 1, SIdx(0), vec!["b", "c"]);

        let s7 = sg.edge(s6, Symbol::Rule(grm.rule_idx("S").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s7).items.len(), 3);
        state_exists(&grm, sg.closed_state(s7), "A", 0, SIdx(2), vec!["a"]);
        state_exists(&grm, sg.closed_state(s7), "A", 2, SIdx(2), vec!["a"]);
        state_exists(&grm, sg.closed_state(s7), "S", 0, SIdx(1), vec!["b", "c"]);

        let s8 = sg.edge(s7, Symbol::Token(grm.token_idx("c").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s8).items.len(), 1);
        state_exists(&grm, sg.closed_state(s8), "A", 0, SIdx(3), vec!["a"]);

        let s9 = sg.edge(s7, Symbol::Token(grm.token_idx("b").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s9).items.len(), 2);
        state_exists(&grm, sg.closed_state(s9), "A", 2, SIdx(3), vec!["a"]);
        state_exists(&grm, sg.closed_state(s9), "S", 0, SIdx(2), vec!["b", "c"]);
    }

    // Pager grammar
    fn grammar_pager() -> YaccGrammar {
        YaccGrammar::new(
            YaccKind::Original(YaccOriginalActionKind::GenericParseTree),
            "
            %start X
            %%
             X : 'a' Y 'd' | 'a' Z 'c' | 'a' T | 'b' Y 'e' | 'b' Z 'd' | 'b' T;
             Y : 't' W | 'u' X;
             Z : 't' 'u';
             T : 'u' X 'a';
             W : 'u' V;
             V : ;
          ",
        )
        .unwrap()
    }

    // Check the stategraph built from `grammar_pager` matches the (23 state) graph in
    // Pager's paper. Not itself a #[test]: it is run repeatedly by test_pager_graph_and_gc.
    #[rustfmt::skip]
    fn test_pager_graph(grm: &YaccGrammar) {
        let sg = pager_stategraph(grm);

        assert_eq!(sg.all_states_len(), StIdx(23));
        assert_eq!(sg.all_edges_len(), 27);

        // State 0
        assert_eq!(sg.closed_state(sg.start_state()).items.len(), 7);
        state_exists(grm, sg.closed_state(sg.start_state()), "^", 0, SIdx(0), vec!["$"]);
        state_exists(grm, sg.closed_state(sg.start_state()), "X", 0, SIdx(0), vec!["$"]);
        state_exists(grm, sg.closed_state(sg.start_state()), "X", 1, SIdx(0), vec!["$"]);
        state_exists(grm, sg.closed_state(sg.start_state()), "X", 2, SIdx(0), vec!["$"]);
        state_exists(grm, sg.closed_state(sg.start_state()), "X", 3, SIdx(0), vec!["$"]);
        state_exists(grm, sg.closed_state(sg.start_state()), "X", 4, SIdx(0), vec!["$"]);
        state_exists(grm, sg.closed_state(sg.start_state()), "X", 5, SIdx(0), vec!["$"]);

        let s1 = sg.edge(sg.start_state(), Symbol::Token(grm.token_idx("a").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s1).items.len(), 7);
        state_exists(grm, sg.closed_state(s1), "X", 0, SIdx(1), vec!["a", "d", "e", "$"]);
        state_exists(grm, sg.closed_state(s1), "X", 1, SIdx(1), vec!["a", "d", "e", "$"]);
        state_exists(grm, sg.closed_state(s1), "X", 2, SIdx(1), vec!["a", "d", "e", "$"]);
        state_exists(grm, sg.closed_state(s1), "Y", 0, SIdx(0), vec!["d"]);
        state_exists(grm, sg.closed_state(s1), "Y", 1, SIdx(0), vec!["d"]);
        state_exists(grm, sg.closed_state(s1), "Z", 0, SIdx(0), vec!["c"]);
        state_exists(grm, sg.closed_state(s1), "T", 0, SIdx(0), vec!["a", "d", "e", "$"]);

        let s7 = sg.edge(sg.start_state(), Symbol::Token(grm.token_idx("b").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s7).items.len(), 7);
        state_exists(grm, sg.closed_state(s7), "X", 3, SIdx(1), vec!["a", "d", "e", "$"]);
        state_exists(grm, sg.closed_state(s7), "X", 4, SIdx(1), vec!["a", "d", "e", "$"]);
        state_exists(grm, sg.closed_state(s7), "X", 5, SIdx(1), vec!["a", "d", "e", "$"]);
        // NOTE(review): the following four assertions check s1, not s7, and duplicate the
        // s1 block above verbatim -- this looks like a copy-paste slip. Presumably s7's own
        // "Y"/"Z"/"T" items (with lookaheads derived from the 'b' alternatives) were
        // intended; confirm the expected lookahead sets before changing.
        state_exists(grm, sg.closed_state(s1), "Y", 0, SIdx(0), vec!["d"]);
        state_exists(grm, sg.closed_state(s1), "Y", 1, SIdx(0), vec!["d"]);
        state_exists(grm, sg.closed_state(s1), "Z", 0, SIdx(0), vec!["c"]);
        state_exists(grm, sg.closed_state(s1), "T", 0, SIdx(0), vec!["a", "d", "e", "$"]);

        // State 4 is shared between s1 and s7 (the merged 'u' successor).
        let s4 = sg.edge(s1, Symbol::Token(grm.token_idx("u").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s4).items.len(), 8);
        assert_eq!(s4, sg.edge(s7, Symbol::Token(grm.token_idx("u").unwrap())).unwrap());
        state_exists(grm, sg.closed_state(s4), "Y", 1, SIdx(1), vec!["d", "e"]);
        state_exists(grm, sg.closed_state(s4), "T", 0, SIdx(1), vec!["a", "d", "e", "$"]);
        state_exists(grm, sg.closed_state(s4), "X", 0, SIdx(0), vec!["a", "d", "e"]);
        state_exists(grm, sg.closed_state(s4), "X", 1, SIdx(0), vec!["a", "d", "e"]);
        state_exists(grm, sg.closed_state(s4), "X", 2, SIdx(0), vec!["a", "d", "e"]);
        state_exists(grm, sg.closed_state(s4), "X", 3, SIdx(0), vec!["a", "d", "e"]);
        state_exists(grm, sg.closed_state(s4), "X", 4, SIdx(0), vec!["a", "d", "e"]);
        state_exists(grm, sg.closed_state(s4), "X", 5, SIdx(0), vec!["a", "d", "e"]);

        assert_eq!(s1, sg.edge(s4, Symbol::Token(grm.token_idx("a").unwrap())).unwrap());
        assert_eq!(s7, sg.edge(s4, Symbol::Token(grm.token_idx("b").unwrap())).unwrap());

        let s2 = sg.edge(s1, Symbol::Token(grm.token_idx("t").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s2).items.len(), 3);
        state_exists(grm, sg.closed_state(s2), "Y", 0, SIdx(1), vec!["d"]);
        state_exists(grm, sg.closed_state(s2), "Z", 0, SIdx(1), vec!["c"]);
        state_exists(grm, sg.closed_state(s2), "W", 0, SIdx(0), vec!["d"]);

        let s3 = sg.edge(s2, Symbol::Token(grm.token_idx("u").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s3).items.len(), 3);
        state_exists(grm, sg.closed_state(s3), "Z", 0, SIdx(2), vec!["c"]);
        state_exists(grm, sg.closed_state(s3), "W", 0, SIdx(1), vec!["d"]);
        state_exists(grm, sg.closed_state(s3), "V", 0, SIdx(0), vec!["d"]);

        let s5 = sg.edge(s4, Symbol::Rule(grm.rule_idx("X").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s5).items.len(), 2);
        state_exists(grm, sg.closed_state(s5), "Y", 1, SIdx(2), vec!["d", "e"]);
        state_exists(grm, sg.closed_state(s5), "T", 0, SIdx(2), vec!["a", "d", "e", "$"]);

        let s6 = sg.edge(s5, Symbol::Token(grm.token_idx("a").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s6).items.len(), 1);
        state_exists(grm, sg.closed_state(s6), "T", 0, SIdx(3), vec!["a", "d", "e", "$"]);

        let s8 = sg.edge(s7, Symbol::Token(grm.token_idx("t").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s8).items.len(), 3);
        state_exists(grm, sg.closed_state(s8), "Y", 0, SIdx(1), vec!["e"]);
        state_exists(grm, sg.closed_state(s8), "Z", 0, SIdx(1), vec!["d"]);
        state_exists(grm, sg.closed_state(s8), "W", 0, SIdx(0), vec!["e"]);

        let s9 = sg.edge(s8, Symbol::Token(grm.token_idx("u").unwrap())).unwrap();
        assert_eq!(sg.closed_state(s9).items.len(), 3);
        state_exists(grm, sg.closed_state(s9), "Z", 0, SIdx(2), vec!["d"]);
        state_exists(grm, sg.closed_state(s9), "W", 0, SIdx(1), vec!["e"]);
        // NOTE(review): this assertion checks s3 and duplicates the s3 "V" assertion above;
        // presumably s9's "V" item (lookahead "e", matching s9's "W" item above) was
        // intended -- confirm before changing.
        state_exists(grm, sg.closed_state(s3), "V", 0, SIdx(0), vec!["d"]);

        // Omitted successors from the graph in Fig.3

        // X-successor of S0
        let s0x = sg.edge(sg.start_state(), Symbol::Rule(grm.rule_idx("X").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s0x), "^", 0, SIdx(1), vec!["$"]);

        // Y-successor of S1 (and its d-successor)
        let s1y = sg.edge(s1, Symbol::Rule(grm.rule_idx("Y").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s1y), "X", 0, SIdx(2), vec!["a", "d", "e", "$"]);
        let s1yd = sg.edge(s1y, Symbol::Token(grm.token_idx("d").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s1yd), "X", 0, SIdx(3), vec!["a", "d", "e", "$"]);

        // Z-successor of S1 (and its successor)
        let s1z = sg.edge(s1, Symbol::Rule(grm.rule_idx("Z").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s1z), "X", 1, SIdx(2), vec!["a", "d", "e", "$"]);
        let s1zc = sg.edge(s1z, Symbol::Token(grm.token_idx("c").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s1zc), "X", 1, SIdx(3), vec!["a", "d", "e", "$"]);

        // T-successor of S1
        let s1t = sg.edge(s1, Symbol::Rule(grm.rule_idx("T").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s1t), "X", 2, SIdx(2), vec!["a", "d", "e", "$"]);

        // Y-successor of S7 (and its d-successor)
        let s7y = sg.edge(s7, Symbol::Rule(grm.rule_idx("Y").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s7y), "X", 3, SIdx(2), vec!["a", "d", "e", "$"]);
        let s7ye = sg.edge(s7y, Symbol::Token(grm.token_idx("e").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s7ye), "X", 3, SIdx(3), vec!["a", "d", "e", "$"]);

        // Z-successor of S7 (and its successor)
        let s7z = sg.edge(s7, Symbol::Rule(grm.rule_idx("Z").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s7z), "X", 4, SIdx(2), vec!["a", "d", "e", "$"]);
        let s7zd = sg.edge(s7z, Symbol::Token(grm.token_idx("d").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s7zd), "X", 4, SIdx(3), vec!["a", "d", "e", "$"]);

        // T-successor of S7
        let s7t = sg.edge(s7, Symbol::Rule(grm.rule_idx("T").unwrap())).unwrap();
        state_exists(grm, sg.closed_state(s7t), "X", 5, SIdx(2), vec!["a", "d", "e", "$"]);

        // W-successor of S2 and S8 (merged)
        let s8w = sg.edge(s8, Symbol::Rule(grm.rule_idx("W").unwrap())).unwrap();
        assert_eq!(s8w, sg.edge(s2, Symbol::Rule(grm.rule_idx("W").unwrap())).unwrap());
        state_exists(grm, sg.closed_state(s8w), "Y", 0, SIdx(2), vec!["d", "e"]);

        // V-successor of S3 and S9 (merged)
        let s9v = sg.edge(s9, Symbol::Rule(grm.rule_idx("V").unwrap())).unwrap();
        assert_eq!(s9v, sg.edge(s3, Symbol::Rule(grm.rule_idx("V").unwrap())).unwrap());
        state_exists(grm, sg.closed_state(s9v), "W", 0, SIdx(2), vec!["d", "e"]);
    }

    #[test]
    fn test_pager_graph_and_gc() {
        // The example from Pager's paper (in test_pager_graph) occasionally creates unreachable
        // states, depending on the non-determinism inherent in iterating over sets in our
        // implementation, causing the test to fail. This happens in roughly 1 every 100 executions
        // if GC isn't present. So we run this test a lot of times on the basis that if the GC
        // fails to work correctly, this will eventually trigger.
        //
        // It goes without saying that this is not the ideal way of testing this, but if you can
        // distil this down to a small, deterministic example, you're a better person than I am.
        let grm = grammar_pager();
        for _ in 0..750 {
            test_pager_graph(&grm);
        }
    }

    #[test]
    #[rustfmt::skip]
    fn test_pager_graph_core_states() {
        let grm = grammar_pager();
        let sg = pager_stategraph(&grm);

        // State 0: the core (as opposed to closed) state contains only the start item.
        assert_eq!(sg.core_state(sg.start_state()).items.len(), 1);
        state_exists(&grm, sg.core_state(sg.start_state()), "^", 0, SIdx(0), vec!["$"]);
    }
}