cfg_grammar/
cfg.rs

1use std::borrow::Cow;
2use std::cell::RefCell;
3use std::fmt::Write;
4use std::rc::Rc;
5use std::{cmp, fmt};
6use std::{mem, ops};
7
8use cfg_history::earley::History;
9use cfg_symbol::SymbolName;
10
11use occurence_map::OccurenceMap;
12use rule_builder::RuleBuilder;
13
14use crate::local_prelude::*;
15use cfg_history::{
16    BinarizedRhsRange::*, HistoryNodeBinarize, HistoryNodeEliminateNulling, RootHistoryNode,
17};
18
19/// Representation of context-free grammars.
20#[derive(Clone, Debug)]
21pub struct Cfg {
22    /// The symbol source.
23    sym_source: SymbolSource,
24    /// The list of lexemes.
25    lexemes: SymbolBitSet,
26    /// The array of rules.
27    rules: Vec<CfgRule>,
28    /// Start symbols.
29    roots: MaybeSmallVec<Symbol>,
30    wrapped_roots: MaybeSmallVec<WrappedRoot>,
31    rhs_len_invariant: Option<usize>,
32    eliminate_nulling: bool,
33    tmp_stack: RefCell<Vec<Symbol>>,
34}
35
36/// Your standard grammar rule representation.
37#[derive(Clone, Debug, Eq, PartialEq)]
38pub struct CfgRule {
39    /// The rule's left-hand side symbol.
40    pub lhs: Symbol,
41    /// The rule's right-hand side symbols.
42    pub rhs: Rc<[Symbol]>,
43    /// The rule's history.
44    pub history: History,
45}
46
47/// Your standard grammar rule representation.
48#[derive(Clone)]
49pub struct NamedCfgRule {
50    /// The rule's left-hand side symbol.
51    pub lhs: Symbol,
52    /// The rule's right-hand side symbols.
53    pub rhs: Rc<[Symbol]>,
54    /// The rule's history.
55    pub history: Option<History>,
56    /// Collection of symbol names.
57    pub names: Vec<Option<SymbolName>>,
58}
59
60#[derive(Clone, Copy, Debug)]
61pub struct WrappedRoot {
62    pub start_of_input: Symbol,
63    pub inner_root: Symbol,
64    pub end_of_input: Symbol,
65    pub root: Symbol,
66}
67
68#[derive(Eq, PartialEq, Clone, Copy)]
69pub enum RhsPropertyMode {
70    All,
71    Any,
72}
73
74#[derive(Clone, Copy, Debug)]
75pub struct DotInfo {
76    pub lhs: Symbol,
77    pub predot: Option<Symbol>,
78    pub postdot: Option<Symbol>,
79    pub earley: Option<earley::rule_dot::RuleDot>,
80}
81
82impl Default for Cfg {
83    fn default() -> Self {
84        Self::with_sym_source(SymbolSource::new())
85    }
86}
87
88impl Cfg {
89    /// Creates an empty context-free grammar.
90    #[inline]
91    pub fn new() -> Self {
92        Self::default()
93    }
94
95    pub fn with_sym_source(sym_source: SymbolSource) -> Self {
96        Cfg {
97            sym_source,
98            lexemes: SymbolBitSet::new(),
99            rules: vec![],
100            roots: MaybeSmallVec::new(),
101            wrapped_roots: MaybeSmallVec::new(),
102            rhs_len_invariant: None,
103            eliminate_nulling: false,
104            tmp_stack: RefCell::new(vec![]),
105        }
106    }
107
108    /// Returns generated symbols.
109    pub fn sym<const N: usize>(&mut self) -> [Symbol; N] {
110        self.sym_source_mut().sym()
111    }
112
113    /// Generates a new unique symbol.
114    pub fn next_sym(&mut self, name: Option<Cow<str>>) -> Symbol {
115        self.sym_source_mut().next_sym(name)
116    }
117
118    /// Generates a new unique symbol.
119    pub fn lexeme(&mut self, name: Option<Cow<str>>) -> Symbol {
120        self.lexemes.reserve(self.num_syms() + 1);
121        let result = self.sym_source_mut().next_sym(name);
122        self.lexemes.set(result, true);
123        result
124    }
125
126    pub fn sym_at<const N: usize>(at: usize) -> [Symbol; N] {
127        let mut sym_source = SymbolSource::new();
128        for _ in 0..at {
129            sym_source.next_sym(None);
130        }
131        sym_source.sym()
132    }
133
134    /// Returns the number of symbols in use.
135    pub fn num_syms(&self) -> usize {
136        self.sym_source().num_syms()
137    }
138
139    pub fn set_roots(&mut self, roots: impl AsRef<[Symbol]>) {
140        self.roots = roots.as_ref().iter().copied().collect();
141    }
142
143    pub fn roots(&self) -> &[Symbol] {
144        &self.roots[..]
145    }
146
147    pub fn wrapped_roots(&self) -> &[WrappedRoot] {
148        &self.wrapped_roots[..]
149    }
150
151    pub fn set_wrapped_roots(&mut self, wrapped_roots: &[WrappedRoot]) {
152        self.wrapped_roots = wrapped_roots.into();
153    }
154
155    pub fn has_roots(&self) -> bool {
156        !self.roots.is_empty()
157    }
158
159    /// Modifies this grammar to its weak equivalent.
160    ///
161    /// # Invariants
162    ///
163    /// All rule RHS' have at most `n` symbols at all times until another
164    /// call to this method or a call to [`fn binarize_and_eliminate_nulling_rules`].
165    ///
166    /// [`fn binarize_and_eliminate_nulling_rules`]: Self::binarize_and_eliminate_nulling_rules
167    pub fn limit_rhs_len(&mut self, limit: Option<usize>) {
168        self.rhs_len_invariant = limit;
169        let mut container = mem::take(&mut self.rules);
170        container.retain(|rule| self.maybe_process_rule(rule));
171        self.rules.extend(container);
172    }
173
174    pub fn rule_rhs_len_allowed_range(&self) -> ops::Range<usize> {
175        self.eliminate_nulling as usize..self.rhs_len_invariant.unwrap_or(usize::MAX)
176    }
177
178    /// Sorts the rule array.
179    pub fn sort(&mut self) {
180        self.rules.sort_by_key(|rule| (rule.lhs, rule.rhs.clone()));
181    }
182
183    /// Sorts the rule array in place, using the argument to compare elements.
184    pub fn sort_by(&mut self, compare: impl FnMut(&CfgRule, &CfgRule) -> cmp::Ordering) {
185        self.rules.sort_by(compare);
186    }
187
188    /// Removes consecutive duplicate rules.
189    pub fn dedup(&mut self) {
190        self.rules.dedup_by_key(|rule| (rule.lhs, rule.rhs.clone()));
191    }
192
193    pub fn extend(&mut self, other: &Cfg) {
194        self.rules.extend(other.rules.iter().cloned());
195    }
196
197    /// Ensures the grammar is binarized and eliminates all nulling rules, which have the
198    /// form `A ::= epsilon`. Returns the eliminated parts of the grammar as a nulling subgrammar.
199    ///
200    /// In other words, this method splits off the nulling parts of the grammar.
201    ///
202    /// The language represented by the grammar is preserved, except for the possible lack of
203    /// the empty string. Unproductive rules aren't preserved.
204    ///
205    /// # Invariants
206    ///
207    /// All rule RHS' have at least 1 symbol and at most 2 symbols at all times until
208    /// another call to this method or a call to [`fn limit_rhs_len`].
209    ///
210    /// [`fn limit_rhs_len`]: Self::limit_rhs_len
211    pub fn binarize_and_eliminate_nulling_rules(&mut self) -> Cfg {
212        self.limit_rhs_len(Some(2));
213
214        let mut result = Cfg::with_sym_source(self.sym_source.clone());
215
216        let mut nullable = self.nulling_symbols();
217        self.rhs_closure_for_all(&mut nullable);
218        if nullable.iter().count() == 0 {
219            return result;
220        }
221
222        let mut rewritten_work = Cfg::new();
223        for rule in self.rules() {
224            let is_nullable = |sym: &Symbol| nullable[*sym];
225            let maybe_which = match (
226                rule.rhs.get(0).map(is_nullable),
227                rule.rhs.get(1).map(is_nullable),
228            ) {
229                (Some(true), Some(true)) => Some(All(2)),
230                (Some(true), None) => Some(All(1)),
231                (None, None) => Some(All(0)),
232                (Some(true), Some(false)) => Some(Right),
233                (Some(false), Some(true)) => Some(Left),
234                _ => None,
235            };
236            if let Some(which) = maybe_which {
237                match which {
238                    All(num) => {
239                        // nulling
240                        if num == 2 {
241                            let history = HistoryNodeEliminateNulling {
242                                prev: rule.history,
243                                rhs0: rule.rhs.get(0).cloned(),
244                                rhs1: rule.rhs.get(1).cloned(),
245                                which,
246                            }
247                            .into();
248                            rewritten_work
249                                .rule(rule.lhs)
250                                .history(history)
251                                .rhs(&rule.rhs[0..1]);
252                            rewritten_work
253                                .rule(rule.lhs)
254                                .history(history)
255                                .rhs(&rule.rhs[1..2]);
256                        }
257                        let history = HistoryNodeEliminateNulling {
258                            prev: rule.history,
259                            rhs0: rule.rhs.get(0).cloned(),
260                            rhs1: rule.rhs.get(1).cloned(),
261                            which,
262                        }
263                        .into();
264                        result
265                            .rule(rule.lhs)
266                            .history(history)
267                            .rhs(&rule.rhs[which.as_range()]);
268                    }
269                    Left | Right => {
270                        let history: History = HistoryNodeEliminateNulling {
271                            prev: rule.history,
272                            rhs0: rule.rhs.get(0).cloned(),
273                            rhs1: rule.rhs.get(1).cloned(),
274                            which,
275                        }
276                        .into();
277                        println!("{:?}", history.nullable());
278                        rewritten_work
279                            .rule(rule.lhs)
280                            .history(history)
281                            .rhs(&rule.rhs[which.as_range()]);
282                    }
283                }
284            }
285        }
286
287        self.extend(&rewritten_work);
288
289        self.rules.retain(|rule| rule.rhs.len() != 0);
290
291        let mut productive = SymbolBitSet::new();
292        // TODO check if correct
293        productive.terminal(&*self);
294        productive.subtract_productive(&result);
295
296        self.rhs_closure_for_all(&mut productive);
297        self.rules.retain(|rule| {
298            // Retain the rule only if it's productive. We have to, in order to remove rules
299            // that were made unproductive as a result of `A ::= epsilon` rule elimination.
300            // Otherwise, some of our nonterminal symbols might be terminal.
301            productive[rule.lhs]
302        });
303
304        result
305    }
306
307    pub fn rules<'a>(&'a self) -> impl Iterator<Item = &'a CfgRule>
308    where
309        Self: 'a,
310    {
311        self.rules.iter()
312    }
313
314    pub fn column(&self, col: usize) -> impl Iterator<Item = DotInfo> + '_ {
315        let mapper = move |rule: &CfgRule| DotInfo {
316            lhs: rule.lhs,
317            predot: rule.rhs.get(col.wrapping_sub(1)).copied(),
318            postdot: rule.rhs.get(col).copied(),
319            earley: Some(rule.history.dot(col)),
320        };
321        self.rules().map(mapper)
322    }
323
324    pub fn sym_source(&self) -> &SymbolSource {
325        &self.sym_source
326    }
327
328    pub fn sym_source_mut(&mut self) -> &mut SymbolSource {
329        &mut self.sym_source
330    }
331
332    pub fn retain(&mut self, f: impl FnMut(&CfgRule) -> bool) {
333        self.rules.retain(f);
334    }
335
336    fn maybe_process_rule(&mut self, rule: &CfgRule) -> bool {
337        if self.rule_rhs_len_allowed_range().contains(&rule.rhs.len()) {
338            return true;
339        }
340
341        // Rewrite to a set of binarized rules.
342        // From `LHS ⸬= A B C … X Y Z` to:
343        // ____________________
344        // | LHS ⸬= S0  Z
345        // | S0  ⸬= S1  Y
346        // | S1  ⸬= S2  X
347        // | …
348        // | Sm  ⸬= Sn  C
349        // | Sn  ⸬= A   B
350        let mut rhs_rev = rule.rhs.to_vec();
351        rhs_rev.reverse();
352        let mut tail = Vec::new();
353        let mut i: u32 = 0;
354        while !rhs_rev.is_empty() {
355            let tail_idx = rhs_rev
356                .len()
357                .saturating_sub(self.rule_rhs_len_allowed_range().end);
358            tail.extend(rhs_rev.drain(tail_idx..));
359            tail.reverse();
360            let lhs;
361            if rhs_rev.is_empty() {
362                lhs = rule.lhs;
363            } else {
364                lhs = self.next_sym(None);
365                rhs_rev.push(lhs);
366            }
367            let history;
368            if i == 0 && rhs_rev.is_empty() {
369                history = rule.history;
370            } else {
371                let history_node_binarize = HistoryNodeBinarize {
372                    prev: rule.history,
373                    height: i,
374                    full_len: rule.rhs.len(),
375                    is_top: rhs_rev.is_empty(),
376                };
377                println!("{:?}", rule.rhs);
378                history = history_node_binarize.into();
379            }
380            self.rules.push(CfgRule::new(lhs, &tail[..], history));
381            tail.clear();
382            i += 1;
383        }
384
385        false
386    }
387
388    pub fn add_rule(&mut self, rule: CfgRule) {
389        if self.maybe_process_rule(&rule) {
390            self.rules.push(rule);
391        }
392    }
393
394    pub fn clear_rules(&mut self) {
395        self.rules.clear();
396    }
397
398    /// Reverses the grammar.
399    pub fn reverse(&mut self) {
400        for rule in &mut self.rules {
401            rule.rhs = rule.rhs.iter().copied().rev().collect();
402        }
403    }
404
405    /// Starts building a new rule.
406    pub fn rule(&mut self, lhs: Symbol) -> RuleBuilder<'_> {
407        RuleBuilder::new(self).rule(lhs)
408    }
409
410    /// Starts building a new precedenced rule.
411    pub fn precedenced_rule(&mut self, lhs: Symbol) -> PrecedencedRuleBuilder<'_> {
412        PrecedencedRuleBuilder::new(self, lhs)
413    }
414
415    pub fn rhs_closure_for_all(&self, property: &mut SymbolBitSet) {
416        self.rhs_closure(property, RhsPropertyMode::All)
417    }
418
419    pub fn rhs_closure_for_any(&self, property: &mut SymbolBitSet) {
420        self.rhs_closure(property, RhsPropertyMode::Any)
421    }
422
423    pub fn rhs_closure(&self, property: &mut SymbolBitSet, property_mode: RhsPropertyMode) {
424        let mut tmp_stack = self.tmp_stack.borrow_mut();
425        tmp_stack.extend(property.iter());
426
427        let occurence_map = OccurenceMap::from_rules(self.rules());
428
429        while let Some(work_sym) = tmp_stack.pop() {
430            for &rule_id in occurence_map.get(work_sym).rhs() {
431                let rule = &self.rules[rule_id];
432                let mut rhs_iter = rule.rhs.iter();
433                let get_property = |&sym: &Symbol| property[sym];
434                let rhs_satifies_property = match property_mode {
435                    RhsPropertyMode::All => rhs_iter.all(get_property),
436                    RhsPropertyMode::Any => rhs_iter.any(get_property),
437                };
438                if !property[rule.lhs] && rhs_satifies_property {
439                    property.set(rule.lhs, true);
440                    tmp_stack.push(rule.lhs);
441                }
442            }
443        }
444
445        tmp_stack.clear();
446    }
447
448    pub fn rhs_closure_with_values(&mut self, value: &mut [Option<u32>]) {
449        let mut tmp_stack = self.tmp_stack.borrow_mut();
450        for (maybe_sym_value, sym) in value.iter().zip(SymbolSource::generate_fresh()) {
451            if maybe_sym_value.is_some() {
452                tmp_stack.push(sym);
453            }
454        }
455
456        let occurence_map = OccurenceMap::from_rules(self.rules());
457
458        while let Some(work_sym) = tmp_stack.pop() {
459            let rules = occurence_map
460                .get(work_sym)
461                .rhs()
462                .iter()
463                .map(|&rule_id| &self.rules[rule_id]);
464            for rule in rules {
465                let maybe_work_value = rule
466                    .rhs
467                    .iter()
468                    .try_fold(0, |acc, elem| value[elem.usize()].map(|val| acc + val));
469                if let Some(work_value) = maybe_work_value {
470                    if let Some(current_value) = value[rule.lhs.usize()] {
471                        if current_value <= work_value {
472                            continue;
473                        }
474                    }
475                    value[rule.lhs.usize()] = Some(work_value);
476                    tmp_stack.push(rule.lhs);
477                }
478            }
479        }
480
481        tmp_stack.clear();
482    }
483
484    pub fn wrap_input(&mut self) {
485        self.wrapped_roots.clear();
486        let roots_len = self.roots.len();
487        let roots = mem::replace(&mut self.roots, MaybeSmallVec::with_capacity(roots_len));
488        for inner_root in roots {
489            let [root, start_of_input, end_of_input] = self.sym_source.with_names([
490                Some("root"),
491                Some("start_of_input"),
492                Some("end_of_input"),
493            ]);
494            self.add_rule(CfgRule {
495                lhs: root,
496                rhs: [start_of_input, inner_root, end_of_input].into(),
497                history: RootHistoryNode::Rule { lhs: root }.into(),
498            });
499            self.wrapped_roots.push(WrappedRoot {
500                root,
501                start_of_input,
502                inner_root,
503                end_of_input,
504            });
505            self.roots.push(root);
506        }
507    }
508
509    pub fn is_empty(&self) -> bool {
510        if self.wrapped_roots.is_empty() {
511            self.rules.is_empty()
512        } else {
513            let mut roots = self.roots.clone();
514            roots.sort();
515            self.rules()
516                .all(|rule| roots.binary_search(&rule.lhs).is_ok())
517        }
518    }
519
520    pub fn stringify_to_bnf(&self) -> String {
521        let mut result = String::new();
522        for rule in self.rules() {
523            let stringify_sym = |sym| format!("{}({})", self.sym_source.name_of(sym), sym.usize());
524            let lhs = stringify_sym(rule.lhs);
525            let rhs = if rule.rhs.is_empty() {
526                "()".into()
527            } else {
528                rule.rhs
529                    .iter()
530                    .copied()
531                    .map(stringify_sym)
532                    .enumerate()
533                    .map(|(i, elem)| {
534                        if i == 0 {
535                            elem.to_string()
536                        } else {
537                            format!(" ~ {}", elem)
538                        }
539                    })
540                    .collect::<String>()
541            };
542            writeln!(&mut result, "{} ::= {};", lhs, rhs).expect("writing to String failed");
543        }
544        result
545    }
546}
547
548impl CfgRule {
549    /// Creates a new rule.
550    pub fn new(lhs: Symbol, rhs: impl AsRef<[Symbol]>, history: History) -> Self {
551        CfgRule {
552            lhs,
553            rhs: rhs.as_ref().into(),
554            history,
555        }
556    }
557
558    pub fn named(&self, sym_source: &SymbolSource) -> NamedCfgRule {
559        NamedCfgRule {
560            lhs: self.lhs,
561            rhs: self.rhs.clone(),
562            history: Some(self.history),
563            names: sym_source.names(),
564        }
565    }
566}
567
568impl NamedCfgRule {
569    pub fn new(names: Vec<Option<SymbolName>>) -> Self {
570        let mut iter = SymbolSource::generate_fresh();
571        NamedCfgRule {
572            lhs: iter.next().unwrap(),
573            rhs: iter.take(names.len() - 1).collect::<Vec<_>>().into(),
574            history: None,
575            names,
576        }
577    }
578
579    pub fn with_history(names: Vec<Option<SymbolName>>, history: History) -> Self {
580        let mut iter = SymbolSource::generate_fresh();
581        NamedCfgRule {
582            lhs: iter.next().unwrap(),
583            rhs: iter.take(names.len() - 1).collect::<Vec<_>>().into(),
584            history: Some(history),
585            names,
586        }
587    }
588}
589
590#[macro_export]
591macro_rules! named_cfg_rule {
592    ($lhs:ident ::= $($rhs:ident)*) => {
593        {
594            use std::rc::Rc;
595            NamedCfgRule::new(vec![Some(stringify!($lhs).into()), $(Some(stringify!($rhs).into())),*])
596        }
597    };
598}
599
600impl Eq for NamedCfgRule {
601    fn assert_receiver_is_total_eq(&self) {}
602}
603
604impl PartialEq for NamedCfgRule {
605    fn eq(&self, other: &Self) -> bool {
606        self.names[self.lhs.usize()] == other.names[other.lhs.usize()]
607            && self.rhs.len() == other.rhs.len()
608            && self
609                .rhs
610                .iter()
611                .zip(other.rhs.iter())
612                .all(|(sym_a, sym_b)| self.names[sym_a.usize()] == other.names[sym_b.usize()])
613            && (self.history.is_none() || other.history.is_none() || self.history == other.history)
614    }
615}
616
617impl fmt::Debug for NamedCfgRule {
618    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
619        let gensym = &"gensym".to_string();
620        let lhs = self.names[self.lhs.usize()].as_deref().unwrap_or(gensym);
621        let rhs = self
622            .rhs
623            .iter()
624            .map(|sym| self.names[sym.usize()].as_deref().unwrap_or(gensym))
625            .collect::<Vec<_>>();
626        f.debug_struct("NamedCfgRule")
627            .field("lhs", &lhs)
628            .field("rhs", &rhs)
629            .field("history", &self.history)
630            .finish()
631    }
632}