1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
use std::{
    collections::hash_map::{Entry, HashMap},
    hash::{BuildHasherDefault, Hash},
};

use cfgrammar::{yacc::YaccGrammar, PIdx, SIdx, Symbol};
use fnv::FnvHasher;
use num_traits::{AsPrimitive, PrimInt, Unsigned};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use vob::Vob;

use cfgrammar::yacc::firsts::YaccFirsts;

/// The type of "context" (also known as "lookaheads")
pub type Ctx = Vob;

#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Itemset<StorageT: Eq + Hash> {
    pub items: HashMap<(PIdx<StorageT>, SIdx<StorageT>), Ctx, BuildHasherDefault<FnvHasher>>,
}

impl<StorageT: 'static + Hash + PrimInt + Unsigned> Itemset<StorageT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Create a blank Itemset.
    pub fn new(_: &YaccGrammar<StorageT>) -> Self {
        Itemset {
            items: HashMap::with_hasher(BuildHasherDefault::<FnvHasher>::default()),
        }
    }

    /// Add an item `(prod, dot)` with context `ctx` to this itemset. Returns true if this led to
    /// any changes in the itemset.
    pub fn add(&mut self, pidx: PIdx<StorageT>, dot: SIdx<StorageT>, ctx: &Ctx) -> bool {
        let entry = self.items.entry((pidx, dot));
        match entry {
            Entry::Occupied(mut e) => e.get_mut().or(ctx),
            Entry::Vacant(e) => {
                e.insert(ctx.clone());
                true
            }
        }
    }

    /// Create a new itemset which is a closed version of `self`.
    pub fn close(&self, grm: &YaccGrammar<StorageT>, firsts: &YaccFirsts<StorageT>) -> Self {
        // This function can be seen as a merger of getClosure and getContext from Chen's
        // dissertation.

        let mut new_is = self.clone(); // The new itemset we're building up

        // In a typical description of this algorithm, one would have a todo set which contains
        // pairs (pidx, dot). Unfortunately this is a slow way of doing things. Searching the set
        // for the next item and removing it is slow; and, since we don't know how many potential
        // dots there are in a production, the set is of potentially unbounded size, so we can end
        // up resizing memory. Since this function is the most expensive in the table generation,
        // using a HashSet (which is the "obvious" solution) is slow.
        //
        // However, we can reduce these costs through two observations:
        //   1) The initial todo set is populated with (pidx, dot) pairs that all come from
        //      self.items.keys(). There's no point copying these into a todo list.
        //   2) All subsequent todo items are of the form (prod_off, 0). Since the dot in these
        //      cases is always 0, we don't need to store pairs: simply knowing which prod_off's we
        //      need to look at is sufficient. We can represent these with a fixed-size bitfield.
        // All we need to do is first iterate through the items in 1 and, when it's exhausted,
        // continually iterate over the bitfield from 2 until no new items have been added.

        let mut keys_iter = self.items.keys(); // The initial todo list
        let mut zero_todos = Vob::from_elem(usize::from(grm.prods_len()), false); // Subsequent todos
        let mut new_ctx = Vob::from_elem(usize::from(grm.tokens_len()), false);
        loop {
            let pidx;
            let dot;
            // Find the next todo item or, if there are none, break the loop. First of all we try
            // pumping keys_iter for its next value. If there is none (i.e. we've exhausted that
            // part of the todo set), we iterate over zero_todos.
            match keys_iter.next() {
                Some(&(x, y)) => {
                    pidx = x;
                    dot = y;
                }
                None => {
                    match zero_todos.iter_set_bits(..).next() {
                        Some(i) => {
                            // Since zero_todos.len() == grm.prods_len, the call to as_ is safe.
                            pidx = PIdx(i.as_())
                        }
                        None => break,
                    }
                    dot = SIdx(StorageT::zero());
                    zero_todos.set(pidx.into(), false);
                }
            }
            let prod = grm.prod(pidx);
            if dot == grm.prod_len(pidx) {
                continue;
            }
            if let Symbol::Rule(s_ridx) = prod[usize::from(dot)] {
                // This if statement is, in essence, a fast version of what's called getContext in
                // Chen's dissertation, folding in getTHeads at the same time. The particular
                // formulation here is based as much on
                // http://binarysculpting.com/2012/02/04/computing-lr1-closure/ as it is any of the
                // several versions of getTHeads in Chen's dissertation. Nevertheless, the mapping
                // between the two different formulations is fairly straight-forward.
                new_ctx.set_all(false);
                let mut nullable = true;
                for sym in prod.iter().skip(usize::from(dot) + 1) {
                    match *sym {
                        Symbol::Token(s_tidx) => {
                            new_ctx.set(usize::from(s_tidx), true);
                            nullable = false;
                            break;
                        }
                        Symbol::Rule(s_ridx) => {
                            new_ctx.or(firsts.firsts(s_ridx));
                            if !firsts.is_epsilon_set(s_ridx) {
                                nullable = false;
                                break;
                            }
                        }
                    }
                }
                if nullable {
                    new_ctx.or(&new_is.items[&(pidx, dot)]);
                }

                for ref_pidx in grm.rule_to_prods(s_ridx).iter() {
                    if new_is.add(*ref_pidx, SIdx(StorageT::zero()), &new_ctx) {
                        zero_todos.set(usize::from(*ref_pidx), true);
                    }
                }
            }
        }
        new_is
    }

    /// Create a new Itemset based on calculating the goto of 'sym' on the current Itemset.
    pub fn goto(&self, grm: &YaccGrammar<StorageT>, sym: &Symbol<StorageT>) -> Self {
        // This is called 'transition' in Chen's dissertation, though note that the definition
        // therein appears to get the dot in the input/output the wrong way around.
        let mut newis = Itemset::new(grm);
        for (&(pidx, dot), ctx) in &self.items {
            let prod = grm.prod(pidx);
            if dot == grm.prod_len(pidx) {
                continue;
            }
            if sym == &prod[usize::from(dot)] {
                newis.add(pidx, SIdx(dot.as_storaget() + StorageT::one()), ctx);
            }
        }
        newis
    }
}

#[cfg(test)]
mod test {
    use cfgrammar::{
        yacc::{YaccGrammar, YaccKind, YaccOriginalActionKind},
        SIdx, Symbol,
    };
    use vob::Vob;

    use super::Itemset;
    use crate::stategraph::state_exists;

    #[test]
    #[rustfmt::skip]
    fn test_dragon_grammar() {
        // From http://binarysculpting.com/2012/02/04/computing-lr1-closure/
        let grm = &YaccGrammar::new(
            YaccKind::Original(YaccOriginalActionKind::GenericParseTree),
            &"
          %start S
          %%
          S: L '=' R | R;
          L: '*' R | 'id';
          R: L;
          "
        ).unwrap();
        let firsts = grm.firsts();

        let mut is = Itemset::new(&grm);
        let mut la = Vob::from_elem(usize::from(grm.tokens_len()), false);
        la.set(usize::from(grm.eof_token_idx()), true);
        is.add(grm.rule_to_prods(grm.rule_idx("^").unwrap())[0], SIdx(0), &la);
        let cls_is = is.close(&grm, &firsts);
        println!("{:?}", cls_is);
        assert_eq!(cls_is.items.len(), 6);
        state_exists(&grm, &cls_is, "^", 0, SIdx(0), vec!["$"]);
        state_exists(&grm, &cls_is, "S", 0, SIdx(0), vec!["$"]);
        state_exists(&grm, &cls_is, "S", 1, SIdx(0), vec!["$"]);
        state_exists(&grm, &cls_is, "L", 0, SIdx(0), vec!["$", "="]);
        state_exists(&grm, &cls_is, "L", 1, SIdx(0), vec!["$", "="]);
        state_exists(&grm, &cls_is, "R", 0, SIdx(0), vec!["$"]);
    }

    fn eco_grammar() -> YaccGrammar {
        YaccGrammar::new(
            YaccKind::Original(YaccOriginalActionKind::GenericParseTree),
            &"
          %start S
          %token a b c d f
          %%
          S: S 'b' | 'b' A 'a' | 'a';
          A: 'a' S 'c' | 'a' | 'a' S 'b';
          B: A S;
          C: D A;
          D: 'd' | ;
          F: C D 'f';
          ",
        )
        .unwrap()
    }

    #[test]
    #[rustfmt::skip]
    fn test_closure1_ecogrm() {
        let grm = eco_grammar();
        let firsts = grm.firsts();

        let mut is = Itemset::new(&grm);
        let mut la = Vob::from_elem(usize::from(grm.tokens_len()), false);
        la.set(usize::from(grm.eof_token_idx()), true);
        is.add(grm.rule_to_prods(grm.rule_idx("^").unwrap())[0], SIdx(0), &la);
        let mut cls_is = is.close(&grm, &firsts);

        state_exists(&grm, &cls_is, "^", 0, SIdx(0), vec!["$"]);
        state_exists(&grm, &cls_is, "S", 0, SIdx(0), vec!["b", "$"]);
        state_exists(&grm, &cls_is, "S", 1, SIdx(0), vec!["b", "$"]);
        state_exists(&grm, &cls_is, "S", 2, SIdx(0), vec!["b", "$"]);

        is = Itemset::new(&grm);
        is.add(grm.rule_to_prods(grm.rule_idx("F").unwrap())[0], SIdx(0), &la);
        cls_is = is.close(&grm, &firsts);
        state_exists(&grm, &cls_is, "F", 0, SIdx(0), vec!["$"]);
        state_exists(&grm, &cls_is, "C", 0, SIdx(0), vec!["d", "f"]);
        state_exists(&grm, &cls_is, "D", 0, SIdx(0), vec!["a"]);
        state_exists(&grm, &cls_is, "D", 1, SIdx(0), vec!["a"]);
    }

    // GrammarAST from 'LR(k) Analyse fuer Pragmatiker'
    // Z : S
    // S : Sb
    //     bAa
    // A : aSc
    //     a
    //     aSb
    fn grammar3() -> YaccGrammar {
        YaccGrammar::new(
            YaccKind::Original(YaccOriginalActionKind::GenericParseTree),
            &"
          %start S
          %token a b c d
          %%
          S: S 'b' | 'b' A 'a';
          A: 'a' S 'c' | 'a' | 'a' S 'b';
          ",
        )
        .unwrap()
    }

    #[test]
    #[rustfmt::skip]
    fn test_closure1_grm3() {
        let grm = grammar3();
        let firsts = grm.firsts();

        let mut is = Itemset::new(&grm);
        let mut la = Vob::from_elem(usize::from(grm.tokens_len()), false);
        la.set(usize::from(grm.eof_token_idx()), true);
        is.add(grm.rule_to_prods(grm.rule_idx("^").unwrap())[0], SIdx(0), &la);
        let mut cls_is = is.close(&grm, &firsts);

        state_exists(&grm, &cls_is, "^", 0, SIdx(0), vec!["$"]);
        state_exists(&grm, &cls_is, "S", 0, SIdx(0), vec!["b", "$"]);
        state_exists(&grm, &cls_is, "S", 1, SIdx(0), vec!["b", "$"]);

        is = Itemset::new(&grm);
        la = Vob::from_elem(usize::from(grm.tokens_len()), false);
        la.set(usize::from(grm.token_idx("b").unwrap()), true);
        la.set(usize::from(grm.eof_token_idx()), true);
        is.add(grm.rule_to_prods(grm.rule_idx("S").unwrap())[1], SIdx(1), &la);
        cls_is = is.close(&grm, &firsts);
        state_exists(&grm, &cls_is, "A", 0, SIdx(0), vec!["a"]);
        state_exists(&grm, &cls_is, "A", 1, SIdx(0), vec!["a"]);
        state_exists(&grm, &cls_is, "A", 2, SIdx(0), vec!["a"]);

        is = Itemset::new(&grm);
        la = Vob::from_elem(usize::from(grm.tokens_len()), false);
        la.set(usize::from(grm.token_idx("a").unwrap()), true);
        is.add(grm.rule_to_prods(grm.rule_idx("A").unwrap())[0], SIdx(1), &la);
        cls_is = is.close(&grm, &firsts);
        state_exists(&grm, &cls_is, "S", 0, SIdx(0), vec!["b", "c"]);
        state_exists(&grm, &cls_is, "S", 1, SIdx(0), vec!["b", "c"]);
    }

    #[test]
    #[rustfmt::skip]
    fn test_goto1() {
        let grm = grammar3();
        let firsts = grm.firsts();

        let mut is = Itemset::new(&grm);
        let mut la = Vob::from_elem(usize::from(grm.tokens_len()), false);
        la.set(usize::from(grm.eof_token_idx()), true);
        is.add(grm.rule_to_prods(grm.rule_idx("^").unwrap())[0], SIdx(0), &la);
        let cls_is = is.close(&grm, &firsts);

        let goto1 = cls_is.goto(&grm, &Symbol::Rule(grm.rule_idx("S").unwrap()));
        state_exists(&grm, &goto1, "^", 0, SIdx(1), vec!["$"]);
        state_exists(&grm, &goto1, "S", 0, SIdx(1), vec!["$", "b"]);

        // follow 'b' from start set
        let goto2 = cls_is.goto(&grm, &Symbol::Token(grm.token_idx("b").unwrap()));
        state_exists(&grm, &goto2, "S", 1, SIdx(1), vec!["$", "b"]);

        // continue by following 'a' from last goto, after it's been closed
        let goto3 = goto2.close(&grm, &firsts).goto(&grm, &Symbol::Token(grm.token_idx("a").unwrap()));
        state_exists(&grm, &goto3, "A", 1, SIdx(1), vec!["a"]);
        state_exists(&grm, &goto3, "A", 2, SIdx(1), vec!["a"]);
    }
}