kbnf_syntax/
validated_grammar.rs

1use std::{iter::zip, mem};
2
3use kbnf_regex_automata::dfa;
4use rustc_hash::{FxHashMap, FxHashSet};
5use string_interner::{backend::StringBackend, symbol::SymbolU32, StringInterner, Symbol};
6
7use crate::{
8    config::CompressionConfig,
9    expression::ExpressionWithID,
10    node::{Alternation, NoNestingNode, NodeWithID, OperatorFlattenedNode, Rhs},
11    regex::{FiniteStateAutomaton, FiniteStateAutomatonConfig},
12    simplified_grammar::SimplifiedGrammar,
13    suffix_automaton::SuffixAutomaton,
14    utils::from_terminals_to_regex_string,
15    InternedStrings, RegexExtKind, SymbolKind,
16};
17
18#[derive(Debug, Clone)]
19pub struct ValidatedGrammar {
20    pub expressions: Vec<ExpressionWithID>,
21    pub interned_strings: InternedStrings,
22    pub start_symbol: SymbolU32,
23    pub id_to_regex: FxHashMap<SymbolU32, FiniteStateAutomaton>,
24    pub id_to_suffix_automaton: FxHashMap<SymbolU32, SuffixAutomaton>,
25}
26
27impl ValidatedGrammar {
28    pub fn simplify_grammar(
29        mut self,
30        config: CompressionConfig,
31        regex_start_config: &kbnf_regex_automata::util::start::Config,
32    ) -> SimplifiedGrammar {
33        let expressions = Self::remove_unused_rules(self.expressions, self.start_symbol);
34        let (expressions, mut special_nonterminals) =
35            Self::flatten_nested_rules_with_nonterminals(expressions, &mut self.interned_strings);
36        let expressions = Self::flatten_operators(expressions);
37        let expressions = Self::group_same_lhs_together(expressions);
38        let expressions = Self::deduplicate_alternations(expressions);
39        let expressions = Self::remove_unit_production(
40            expressions,
41            &mut self.start_symbol,
42            &mut special_nonterminals,
43        );
44        let expressions =
45            Self::merge_consecutive_terminals(expressions, &mut self.interned_strings);
46        let expressions = Self::expand_special_nonterminals(
47            expressions,
48            special_nonterminals,
49            &mut self.interned_strings,
50        );
51        let expressions = Self::merge_identical_rhs_across_nonterminals(expressions);
52        let expressions = Self::remove_nullable_rules(
53            expressions,
54            &self.interned_strings,
55            &self.id_to_regex,
56            &self.id_to_suffix_automaton,
57            regex_start_config,
58        );
59        let expressions = Self::remove_unit_production(
60            expressions,
61            &mut self.start_symbol,
62            &mut FxHashMap::default(),
63        );
64        let expressions =
65            Self::merge_consecutive_terminals(expressions, &mut self.interned_strings);
66        let expressions = Self::remove_fixed_point_production(expressions);
67        let expressions = Self::compress_terminals(
68            expressions,
69            &mut self.interned_strings,
70            config,
71            &mut self.id_to_regex,
72        );
73        let (interned_strings, id_to_regexes, id_to_suffix_automaton, expressions, start_symbol) =
74            Self::compact_interned(
75                self.start_symbol,
76                expressions,
77                self.interned_strings,
78                self.id_to_regex,
79                self.id_to_suffix_automaton,
80            );
81        SimplifiedGrammar {
82            expressions,
83            start_symbol,
84            interned_strings,
85            id_to_regex: id_to_regexes,
86            id_to_suffix_automaton,
87        }
88    }
89
90    fn remove_unused_rules(
91        expressions: Vec<ExpressionWithID>,
92        start_symbol: SymbolU32,
93    ) -> Vec<ExpressionWithID> {
94        let mut used_nonterminals = FxHashSet::default();
95        let mut stack = vec![];
96        used_nonterminals.insert(start_symbol);
97        for ExpressionWithID { lhs, rhs: node } in &expressions {
98            if *lhs == start_symbol {
99                stack.push(node);
100            }
101        }
102        while let Some(node) = stack.pop() {
103            match node {
104                NodeWithID::Terminal(_) => {}
105                NodeWithID::RegexString(_) => {}
106                NodeWithID::EarlyEndRegexString(_) => {}
107                NodeWithID::Substrings(_) => {}
108                NodeWithID::RegexComplement(_) => {}
109                NodeWithID::Nonterminal(nonterminal) => {
110                    if used_nonterminals.contains(nonterminal) {
111                        continue;
112                    }
113                    used_nonterminals.insert(*nonterminal);
114                    for ExpressionWithID { lhs, rhs: node } in &expressions {
115                        if *lhs == *nonterminal {
116                            stack.push(node);
117                        }
118                    }
119                }
120                NodeWithID::Multiple(nodes) => {
121                    for node in nodes {
122                        stack.push(node);
123                    }
124                }
125                NodeWithID::RegexExt(node, _) => {
126                    stack.push(node);
127                }
128                NodeWithID::Symbol(lhs, _, rhs) => {
129                    stack.push(lhs);
130                    stack.push(rhs);
131                }
132                NodeWithID::Group(node) => {
133                    stack.push(node);
134                }
135                NodeWithID::Unknown => {
136                    unreachable!("Unknown node. This should not happen.")
137                }
138            }
139        }
140        expressions
141            .into_iter()
142            .filter(|expression| used_nonterminals.contains(&expression.lhs))
143            .collect()
144    }
145
146    fn flatten_nested_rules_with_nonterminals(
147        mut rules: Vec<ExpressionWithID>,
148        interned_strings: &mut InternedStrings,
149    ) -> (
150        Vec<(SymbolU32, NoNestingNode)>,
151        FxHashMap<SymbolU32, RegexExtKind>,
152    ) {
153        let mut flattened_rules: Vec<(SymbolU32, NoNestingNode)> = Vec::with_capacity(rules.len());
154        let mut special_nonterminals: FxHashMap<SymbolU32, RegexExtKind> = FxHashMap::default();
155        let get_new_nonterminal_name =
156            |nonterminal: SymbolU32, identifier: &str, interned_strings: &mut InternedStrings| {
157                let mut i = 0;
158                loop {
159                    let nonterminal = interned_strings.nonterminals.resolve(nonterminal).unwrap();
160                    let new_nonterminal = format!("{nonterminal}_{identifier}_{i}");
161                    if interned_strings
162                        .nonterminals
163                        .get(&new_nonterminal)
164                        .is_none()
165                    {
166                        return interned_strings.nonterminals.get_or_intern(new_nonterminal);
167                    }
168                    i += 1;
169                }
170            };
171        let mut add_new_rule = |lhs: SymbolU32,
172                                identifier: &str,
173                                parent: &mut NoNestingNode,
174                                node: NodeWithID,
175                                rules: &mut Vec<ExpressionWithID>,
176                                kind: Option<RegexExtKind>| {
177            let name = get_new_nonterminal_name(lhs, identifier, interned_strings);
178            *parent = NoNestingNode::Nonterminal(name);
179            if let Some(kind) = kind {
180                special_nonterminals.insert(name, kind);
181            }
182            rules.push(ExpressionWithID {
183                lhs: name,
184                rhs: node,
185            });
186        };
187        fn get_slice(nodes: &[NodeWithID]) -> Vec<NoNestingNode> {
188            let mut buffer = Vec::with_capacity(nodes.len());
189            buffer.resize(nodes.len(), NoNestingNode::Unknown);
190            buffer
191        }
192        while !rules.is_empty() {
193            let length = rules.len();
194            for i in (0..length).rev() {
195                let mut stack: Vec<(NodeWithID, &mut NoNestingNode)> = vec![];
196                let mut root = NoNestingNode::Unknown;
197                let ExpressionWithID { lhs, rhs } = rules.swap_remove(i);
198                stack.push((rhs, &mut root));
199                while let Some((mut old_node, new_parent)) = stack.pop() {
200                    match &mut old_node {
201                        NodeWithID::Terminal(value) => {
202                            *new_parent = NoNestingNode::Terminal(*value);
203                        }
204                        NodeWithID::RegexString(value) => {
205                            *new_parent = NoNestingNode::RegexString(*value);
206                        }
207                        NodeWithID::Nonterminal(value) => {
208                            *new_parent = NoNestingNode::Nonterminal(*value);
209                        }
210                        NodeWithID::RegexComplement(value) => {
211                            *new_parent = NoNestingNode::RegexComplement(*value);
212                        }
213                        NodeWithID::Multiple(nodes) => {
214                            *new_parent = NoNestingNode::Concatenations(get_slice(nodes));
215                            match new_parent {
216                                NoNestingNode::Concatenations(new_nodes) => {
217                                    for (node, new_parent) in
218                                        zip(nodes.iter_mut(), new_nodes.iter_mut())
219                                    {
220                                        stack.push((
221                                            mem::replace(node, NodeWithID::Unknown),
222                                            new_parent,
223                                        ));
224                                    }
225                                }
226                                _ => unreachable!(),
227                            }
228                        }
229                        NodeWithID::RegexExt(node, ext) => {
230                            add_new_rule(
231                                lhs,
232                                match ext {
233                                    RegexExtKind::Repeat0 => "repeat0",
234                                    RegexExtKind::Repeat1 => "repeat1",
235                                    RegexExtKind::Optional => "optional",
236                                },
237                                new_parent,
238                                mem::replace(node.as_mut(), NodeWithID::Unknown),
239                                &mut rules,
240                                Some(*ext),
241                            );
242                        }
243                        NodeWithID::Symbol(l, symbol, r) => {
244                            let nodes = vec![
245                                mem::replace(l.as_mut(), NodeWithID::Unknown),
246                                mem::replace(r.as_mut(), NodeWithID::Unknown),
247                            ];
248                            match symbol {
249                                SymbolKind::Concatenation => {
250                                    *new_parent = NoNestingNode::Concatenations(get_slice(&nodes));
251                                    match new_parent {
252                                        NoNestingNode::Concatenations(new_nodes) => {
253                                            for (node, new_parent) in
254                                                zip(nodes.into_iter(), new_nodes.iter_mut())
255                                            {
256                                                stack.push((node, new_parent));
257                                            }
258                                        }
259                                        _ => unreachable!(),
260                                    }
261                                }
262                                SymbolKind::Alternation => {
263                                    *new_parent = NoNestingNode::Alternations(get_slice(&nodes));
264                                    match new_parent {
265                                        NoNestingNode::Alternations(new_nodes) => {
266                                            for (node, new_parent) in
267                                                zip(nodes.into_iter(), new_nodes.iter_mut())
268                                            {
269                                                stack.push((node, new_parent));
270                                            }
271                                        }
272                                        _ => unreachable!(),
273                                    }
274                                }
275                            }
276                        }
277                        NodeWithID::Group(node) => {
278                            add_new_rule(
279                                lhs,
280                                "group",
281                                new_parent,
282                                mem::replace(node.as_mut(), NodeWithID::Unknown),
283                                &mut rules,
284                                None,
285                            );
286                        }
287                        NodeWithID::EarlyEndRegexString(value) => {
288                            *new_parent = NoNestingNode::EarlyEndRegexString(*value);
289                        }
290                        NodeWithID::Substrings(value) => {
291                            *new_parent = NoNestingNode::Substrings(*value);
292                        }
293                        NodeWithID::Unknown => {
294                            unreachable!("Unknown node. This should not happen.")
295                        }
296                    }
297                }
298                flattened_rules.push((lhs, root));
299            }
300        }
301        (flattened_rules, special_nonterminals)
302    }
303
304    fn flatten_operators(rules: Vec<(SymbolU32, NoNestingNode)>) -> Vec<(SymbolU32, Rhs)> {
305        let mut flattened_rules: Vec<(SymbolU32, Rhs)> =
306            Vec::<(SymbolU32, Rhs)>::with_capacity(rules.len());
307        for (lhs, node) in rules {
308            let mut rhs = Rhs {
309                alternations: vec![Alternation {
310                    concatenations: vec![],
311                }],
312            };
313            let mut stack = vec![(node, 0usize)];
314            while let Some((node, index)) = stack.pop() {
315                match node {
316                    NoNestingNode::Unknown => unreachable!("Unknown node. This should not happen."),
317                    NoNestingNode::Terminal(value) => {
318                        rhs.alternations[index]
319                            .concatenations
320                            .push(OperatorFlattenedNode::Terminal(value));
321                    }
322                    NoNestingNode::RegexComplement(value) => {
323                        rhs.alternations[index]
324                            .concatenations
325                            .push(OperatorFlattenedNode::RegexComplement(value));
326                    }
327                    NoNestingNode::RegexString(value) => {
328                        rhs.alternations[index]
329                            .concatenations
330                            .push(OperatorFlattenedNode::RegexString(value));
331                    }
332                    NoNestingNode::Substrings(value) => {
333                        rhs.alternations[index]
334                            .concatenations
335                            .push(OperatorFlattenedNode::Substrings(value));
336                    }
337                    NoNestingNode::Nonterminal(value) => {
338                        rhs.alternations[index]
339                            .concatenations
340                            .push(OperatorFlattenedNode::Nonterminal(value));
341                    }
342                    NoNestingNode::Concatenations(mut nodes) => {
343                        if nodes.is_empty() {
344                            continue;
345                        }
346                        if index != usize::MAX {
347                            nodes.reverse();
348                        }
349                        let new_node = nodes.pop().unwrap();
350                        stack.push((
351                            NoNestingNode::Concatenations(nodes),
352                            usize::MAX, // This is a signal not to reverse the nodes.
353                        ));
354                        stack.push((new_node, rhs.alternations.len() - 1));
355                    }
356
357                    NoNestingNode::Alternations(mut nodes) => {
358                        assert!(
359                            nodes.len() == 2,
360                            "Alternations should only have two elements."
361                        );
362                        let r = nodes.pop().unwrap();
363                        let l = nodes.pop().unwrap();
364                        // Due to the way the EBNF parser is implemented, we can assume alternations only have two elements.
365                        rhs.alternations.push(Alternation {
366                            concatenations: vec![],
367                        });
368                        stack.push((r, rhs.alternations.len() - 1)); // put the right node to the new alternation
369                        stack.push((l, rhs.alternations.len() - 2)); // put the left node to the previous alternation
370                    }
371                    NoNestingNode::EarlyEndRegexString(value) => {
372                        rhs.alternations[index]
373                            .concatenations
374                            .push(OperatorFlattenedNode::EarlyEndRegexString(value));
375                    }
376                }
377            }
378            flattened_rules.push((lhs, rhs));
379        }
380        flattened_rules
381    }
382
383    fn group_same_lhs_together(rules: Vec<(SymbolU32, Rhs)>) -> FxHashMap<SymbolU32, Rhs> {
384        let mut new_rules: FxHashMap<SymbolU32, Rhs> = FxHashMap::default();
385        for (lhs, rhs) in rules {
386            let entry = new_rules.entry(lhs).or_insert(Rhs {
387                alternations: vec![],
388            });
389            entry.alternations.extend(rhs.alternations);
390        }
391        new_rules
392    }
393
394    fn merge_consecutive_terminals(
395        rules: FxHashMap<SymbolU32, Rhs>,
396        interned_strings: &mut InternedStrings,
397    ) -> FxHashMap<SymbolU32, Rhs> {
398        rules
399            .into_iter()
400            .map(|(lhs, rhs)| {
401                (
402                    lhs,
403                    Rhs {
404                        alternations: rhs
405                            .alternations
406                            .into_iter()
407                            .map(|a| Alternation {
408                                concatenations: a.concatenations.into_iter().fold(
409                                    vec![],
410                                    |mut acc, c| {
411                                        if let OperatorFlattenedNode::Terminal(value) = c {
412                                            if let Some(OperatorFlattenedNode::Terminal(
413                                                last_value,
414                                            )) = acc.last()
415                                            {
416                                                let last_value = interned_strings
417                                                    .terminals
418                                                    .resolve(*last_value)
419                                                    .unwrap();
420                                                let value = interned_strings
421                                                    .terminals
422                                                    .resolve(value)
423                                                    .unwrap();
424                                                let new_value = format!("{}{}", last_value, value);
425                                                let new_value = interned_strings
426                                                    .terminals
427                                                    .get_or_intern(new_value);
428                                                acc.pop();
429                                                acc.push(OperatorFlattenedNode::Terminal(
430                                                    new_value,
431                                                ));
432                                            } else {
433                                                acc.push(c);
434                                            }
435                                        } else {
436                                            acc.push(c);
437                                        }
438                                        acc
439                                    },
440                                ),
441                            })
442                            .collect(),
443                    },
444                )
445            })
446            .collect()
447    }
448
449    fn deduplicate_alternations(rules: FxHashMap<SymbolU32, Rhs>) -> FxHashMap<SymbolU32, Rhs> {
450        let mut new_rules: FxHashMap<SymbolU32, FxHashSet<Alternation>> = FxHashMap::default();
451        for (lhs, rhs) in rules {
452            let entry = new_rules.entry(lhs).or_default();
453            entry.extend(rhs.alternations.into_iter());
454        }
455        new_rules
456            .into_iter()
457            .map(|(lhs, rhs)| {
458                (
459                    lhs,
460                    Rhs {
461                        alternations: rhs.into_iter().collect(),
462                    },
463                )
464            })
465            .collect()
466    }
467
468    fn remove_unit_production(
469        rules: FxHashMap<SymbolU32, Rhs>,
470        start_nonterminal: &mut SymbolU32,
471        special_nonterminals: &mut FxHashMap<SymbolU32, RegexExtKind>,
472    ) -> FxHashMap<SymbolU32, Rhs> {
473        fn find_unit_chain<'a>(
474            rules: &'a FxHashMap<SymbolU32, Rhs>,
475            nonterminal_node: &'a OperatorFlattenedNode,
476            nonterminal: SymbolU32,
477            special_nonterminals: &mut FxHashMap<SymbolU32, RegexExtKind>,
478        ) -> Vec<&'a OperatorFlattenedNode> {
479            let mut last_nonterminal = nonterminal;
480            let mut chain = vec![nonterminal_node];
481            loop {
482                let rhs = rules.get(&last_nonterminal).unwrap();
483                if rhs.alternations.len() != 1 {
484                    break;
485                }
486                let altercation = rhs.alternations.first().unwrap();
487                if altercation.concatenations.len() != 1 {
488                    break;
489                }
490                let node = altercation.concatenations.first().unwrap();
491                match node {
492                    OperatorFlattenedNode::Nonterminal(next_nonterminal) => {
493                        if special_nonterminals.contains_key(&last_nonterminal)
494                            ^ special_nonterminals.contains_key(next_nonterminal)
495                        {
496                            break;
497                        }
498                        if next_nonterminal == &last_nonterminal {
499                            break;
500                        }
501                        chain.push(node);
502                        if let Some(e1) = special_nonterminals.get(&last_nonterminal) {
503                            if let Some(e2) = special_nonterminals.get(next_nonterminal) {
504                                match (e1, e2) {
505                                    (RegexExtKind::Repeat0, RegexExtKind::Repeat0) => {}
506                                    (RegexExtKind::Repeat1, RegexExtKind::Repeat1) => {}
507                                    (RegexExtKind::Optional, RegexExtKind::Optional) => {}
508                                    (RegexExtKind::Repeat0, RegexExtKind::Repeat1) => {
509                                        *special_nonterminals.get_mut(next_nonterminal).unwrap() =
510                                            RegexExtKind::Repeat0;
511                                    }
512                                    (RegexExtKind::Repeat1, RegexExtKind::Repeat0) => {}
513                                    (RegexExtKind::Repeat0, RegexExtKind::Optional) => {
514                                        *special_nonterminals.get_mut(next_nonterminal).unwrap() =
515                                            RegexExtKind::Repeat0;
516                                    }
517                                    (RegexExtKind::Optional, RegexExtKind::Repeat0) => {
518                                        *special_nonterminals.get_mut(next_nonterminal).unwrap() =
519                                            RegexExtKind::Repeat0;
520                                    }
521                                    (RegexExtKind::Repeat1, RegexExtKind::Optional) => {
522                                        *special_nonterminals.get_mut(next_nonterminal).unwrap() =
523                                            RegexExtKind::Repeat0;
524                                    }
525                                    (RegexExtKind::Optional, RegexExtKind::Repeat1) => {
526                                        *special_nonterminals.get_mut(next_nonterminal).unwrap() =
527                                            RegexExtKind::Repeat0;
528                                    }
529                                };
530                            }
531                        }
532                        last_nonterminal = *next_nonterminal;
533                    }
534                    _ => {
535                        if !special_nonterminals.contains_key(&last_nonterminal) {
536                            chain.push(node);
537                        }
538                        break;
539                    }
540                }
541            }
542            chain
543        }
544        fn update_nonterminal<'a>(
545            rules: &'a FxHashMap<SymbolU32, Rhs>,
546            nonterminal_node: &'a OperatorFlattenedNode,
547            nonterminal: SymbolU32,
548            visited: &mut FxHashSet<SymbolU32>,
549            src_to_dst: &mut FxHashMap<SymbolU32, OperatorFlattenedNode>,
550            stack: &mut Vec<SymbolU32>,
551            special_nonterminals: &mut FxHashMap<SymbolU32, RegexExtKind>,
552        ) {
553            let chain = find_unit_chain(rules, nonterminal_node, nonterminal, special_nonterminals);
554            visited.extend(chain.iter().take(chain.len() - 1).map(|node| match node {
555                OperatorFlattenedNode::Nonterminal(nonterminal) => *nonterminal,
556                _ => unreachable!(),
557            }));
558            if chain.len() > 1 {
559                if let OperatorFlattenedNode::Nonterminal(nonterminal) = chain.last().unwrap() {
560                    stack.push(*nonterminal);
561                }
562                match chain.as_slice() {
563                    [first @ .., last] => {
564                        for node in first {
565                            match node {
566                                OperatorFlattenedNode::Nonterminal(nonterminal) => {
567                                    src_to_dst.insert(*nonterminal, (*last).clone());
568                                }
569                                _ => unreachable!(),
570                            }
571                        }
572                    }
573                    _ => unreachable!(),
574                }
575            } else {
576                stack.push(nonterminal);
577            }
578        }
579        let mut stack = vec![*start_nonterminal];
580        let mut chains: FxHashMap<SymbolU32, OperatorFlattenedNode> = FxHashMap::default();
581        let mut visited = FxHashSet::default();
582        while let Some(nonterminal) = stack.pop() {
583            let rhs = rules.get(&nonterminal).unwrap();
584            for a in rhs.alternations.iter() {
585                for c in a.concatenations.iter() {
586                    if let &OperatorFlattenedNode::Nonterminal(nonterminal) = c {
587                        if visited.contains(&nonterminal) {
588                            continue;
589                        }
590
591                        update_nonterminal(
592                            &rules,
593                            c,
594                            nonterminal,
595                            &mut visited,
596                            &mut chains,
597                            &mut stack,
598                            special_nonterminals,
599                        );
600                    }
601                }
602            }
603            visited.insert(nonterminal);
604        }
605        if let Some(value) = chains.get(start_nonterminal) {
606            match value {
607                OperatorFlattenedNode::Nonterminal(nonterminal) => {
608                    *start_nonterminal = *nonterminal;
609                }
610                _ => unreachable!(),
611            }
612        }
613        rules
614            .into_iter()
615            .filter_map(|(lhs, rhs)| {
616                if chains.contains_key(&lhs) {
617                    None
618                } else {
619                    Some((
620                        lhs,
621                        Rhs {
622                            alternations: rhs
623                                .alternations
624                                .into_iter()
625                                .map(|a| Alternation {
626                                    concatenations: a
627                                        .concatenations
628                                        .into_iter()
629                                        .map(|c| match c {
630                                            OperatorFlattenedNode::Nonterminal(nonterminal) => {
631                                                chains.get(&nonterminal).unwrap_or(&c).clone()
632                                            }
633                                            _ => c,
634                                        })
635                                        .collect::<Vec<OperatorFlattenedNode>>(),
636                                })
637                                .collect(),
638                        },
639                    ))
640                }
641            })
642            .collect()
643    }
644
645    fn expand_special_nonterminals(
646        rules: FxHashMap<SymbolU32, Rhs>,
647        special_nonterminals: FxHashMap<SymbolU32, RegexExtKind>,
648        interned_strings: &mut InternedStrings,
649    ) -> FxHashMap<SymbolU32, Rhs> {
650        rules
651            .into_iter()
652            .map(|(lhs, mut rhs)| {
653                if let Some(kind) = special_nonterminals.get(&lhs) {
654                    match kind {
655                        RegexExtKind::Optional => {
656                            rhs.alternations.push(Alternation {
657                                concatenations: vec![OperatorFlattenedNode::Terminal(
658                                    interned_strings.terminals.get_or_intern(""),
659                                )],
660                            });
661                            (lhs, rhs)
662                        }
663                        RegexExtKind::Repeat0 => {
664                            let iter = rhs
665                                .alternations
666                                .iter()
667                                .cloned()
668                                .map(|mut a| {
669                                    a.concatenations
670                                        .insert(0, OperatorFlattenedNode::Nonterminal(lhs));
671                                    a
672                                })
673                                .collect::<Vec<_>>();
674                            rhs.alternations.extend(iter);
675                            rhs.alternations.push(Alternation {
676                                concatenations: vec![OperatorFlattenedNode::Terminal(
677                                    interned_strings.terminals.get_or_intern(""),
678                                )],
679                            });
680                            (lhs, rhs)
681                        }
682                        RegexExtKind::Repeat1 => {
683                            let iter = rhs
684                                .alternations
685                                .iter()
686                                .cloned()
687                                .map(|mut a| {
688                                    a.concatenations
689                                        .insert(0, OperatorFlattenedNode::Nonterminal(lhs));
690                                    a
691                                })
692                                .collect::<Vec<_>>();
693                            rhs.alternations.extend(iter);
694                            (lhs, rhs)
695                        }
696                    }
697                } else {
698                    (lhs, rhs)
699                }
700            })
701            .collect()
702    }
703
704    fn merge_identical_rhs_across_nonterminals(
705        mut rules: FxHashMap<SymbolU32, Rhs>,
706    ) -> FxHashMap<SymbolU32, Rhs> {
707        loop {
708            // In worst case it has O(n^2logn) complexity. I am curious whether there exists a better solution.
709            let mut updated = false;
710            let mut merged_rhs_to_lhses = FxHashMap::default();
711            let mut lhs_to_lhs = FxHashMap::default();
712            for (lhs, mut rhs) in rules {
713                rhs.alternations.sort();
714                match merged_rhs_to_lhses.entry(rhs) {
715                    std::collections::hash_map::Entry::Occupied(entry) => {
716                        lhs_to_lhs.insert(lhs, *entry.get());
717                        updated = true;
718                    }
719                    std::collections::hash_map::Entry::Vacant(entry) => {
720                        entry.insert(lhs);
721                    }
722                }
723            }
724            rules = merged_rhs_to_lhses
725                .into_iter()
726                .map(|(rhs, lhs)| {
727                    (
728                        lhs,
729                        Rhs {
730                            alternations: rhs
731                                .alternations
732                                .into_iter()
733                                .map(|alternation| Alternation {
734                                    concatenations: alternation
735                                        .concatenations
736                                        .into_iter()
737                                        .map(|concatenation| match concatenation {
738                                            OperatorFlattenedNode::Nonterminal(nonterminal) => {
739                                                OperatorFlattenedNode::Nonterminal(
740                                                    *lhs_to_lhs
741                                                        .get(&nonterminal)
742                                                        .unwrap_or(&nonterminal),
743                                                )
744                                            }
745                                            _ => concatenation,
746                                        })
747                                        .collect(),
748                                })
749                                .collect(),
750                        },
751                    )
752                })
753                .collect();
754            if !updated {
755                break;
756            }
757        }
758        rules.into_iter().collect()
759    }
760
761    fn remove_nullable_rules(
762        rules: FxHashMap<SymbolU32, Rhs>,
763        interned_strings: &InternedStrings,
764        id_to_regex: &FxHashMap<SymbolU32, FiniteStateAutomaton>,
765        id_to_suffix_automaton: &FxHashMap<SymbolU32, SuffixAutomaton>,
766        regex_start_config: &kbnf_regex_automata::util::start::Config,
767    ) -> FxHashMap<SymbolU32, Rhs> {
768        fn find_nullable_nonterminals(
769            rules: &FxHashMap<SymbolU32, Rhs>,
770            interned_strings: &InternedStrings,
771            id_to_regex: &FxHashMap<SymbolU32, FiniteStateAutomaton>,
772            id_to_suffix_automaton: &FxHashMap<SymbolU32, SuffixAutomaton>,
773            regex_start_config: &kbnf_regex_automata::util::start::Config,
774        ) -> (
775            FxHashSet<OperatorFlattenedNode>,
776            FxHashSet<OperatorFlattenedNode>,
777        ) {
778            let mut nullable_symbols: FxHashSet<OperatorFlattenedNode> = FxHashSet::default(); // The symbol can produce empty string.
779            let mut null_symbols: FxHashSet<OperatorFlattenedNode> = FxHashSet::default(); // The symbol always produce empty string.
780            loop {
781                // In worst case it has O(n^2) complexity. I am curious whether there exists a better solution.
782                let mut updated = false;
783                for (lhs, rhs) in rules {
784                    if nullable_symbols.contains(&OperatorFlattenedNode::Nonterminal(*lhs)) {
785                        continue;
786                    }
787                    let mut null_lhs = true;
788                    let mut nullable_lhs = false;
789                    for a in &rhs.alternations {
790                        let mut nullable = true;
791                        let mut null = true;
792                        for c in &a.concatenations {
793                            if null_symbols.contains(c) {
794                                null &= true;
795                                nullable &= true;
796                            } else if nullable_symbols.contains(c) {
797                                nullable &= true;
798                                null &= false;
799                            } else {
800                                match c {
801                                    OperatorFlattenedNode::Terminal(terminal) => {
802                                        let empty = interned_strings
803                                            .terminals
804                                            .resolve(*terminal)
805                                            .unwrap()
806                                            .is_empty();
807                                        if empty {
808                                            null &= true;
809                                            nullable &= true;
810                                            null_symbols.insert(c.clone());
811                                            nullable_symbols.insert(c.clone());
812                                        } else {
813                                            null &= false;
814                                            nullable &= false;
815                                        }
816                                    }
817                                    OperatorFlattenedNode::RegexString(regex) => {
818                                        let automaton = id_to_regex.get(regex).unwrap();
819                                        if automaton.only_empty(regex_start_config) {
820                                            null &= true;
821                                            nullable &= true;
822                                            null_symbols.insert(c.clone());
823                                            nullable_symbols.insert(c.clone());
824                                        } else if automaton.has_empty() {
825                                            nullable &= true;
826                                            nullable_symbols.insert(c.clone());
827                                        } else {
828                                            null &= false;
829                                            nullable &= false;
830                                        }
831                                    }
832                                    OperatorFlattenedNode::Substrings(substrings) => {
833                                        let automaton = id_to_suffix_automaton.get(substrings).unwrap();
834                                        if automaton.num_of_nodes() == 2 { // one root and one nil
835                                            null &= true;
836                                            nullable &= true;
837                                            null_symbols.insert(c.clone());
838                                            nullable_symbols.insert(c.clone());
839                                        } else {
840                                            nullable &= true;
841                                            nullable_symbols.insert(c.clone());
842                                        }
843                                    }
844                                    _ => {
845                                        null &= false;
846                                        nullable &= false;
847                                    }
848                                }
849                            }
850                        }
851                        null_lhs &= null;
852                        nullable_lhs |= nullable;
853                    }
854                    if null_lhs {
855                        null_symbols.insert(OperatorFlattenedNode::Nonterminal(*lhs));
856                        updated = true;
857                    }
858                    if nullable_lhs {
859                        nullable_symbols.insert(OperatorFlattenedNode::Nonterminal(*lhs));
860                        updated = true;
861                    }
862                }
863                if !updated {
864                    break;
865                }
866            }
867            (nullable_symbols, null_symbols)
868        }
869        let (nullable_nodes, null_nodes) = find_nullable_nonterminals(
870            &rules,
871            interned_strings,
872            id_to_regex,
873            id_to_suffix_automaton,
874            regex_start_config,
875        );
876        let mut new_rules = FxHashMap::default();
877        for (lhs, Rhs { alternations }) in rules {
878            let mut new_alterations = FxHashSet::default();
879            for Alternation { concatenations } in alternations {
880                let mut stack = vec![(vec![], concatenations.into_iter())];
881                while let Some((mut prefix, mut iter)) = stack.pop() {
882                    if let Some(node) = iter.next() {
883                        if null_nodes.contains(&node) {
884                            stack.push((prefix, iter));
885                        } else if nullable_nodes.contains(&node) {
886                            let without = prefix.clone();
887                            prefix.push(node);
888                            stack.push((prefix, iter.clone()));
889                            stack.push((without, iter));
890                        } else {
891                            prefix.push(node);
892                            stack.push((prefix, iter));
893                        }
894                    } else if !prefix.is_empty() {
895                        new_alterations.insert(Alternation {
896                            concatenations: prefix,
897                        });
898                    }
899                }
900            }
901            new_rules.insert(
902                lhs,
903                Rhs {
904                    alternations: new_alterations.into_iter().collect(),
905                },
906            );
907        }
908        new_rules
909    }
910
911    fn remove_fixed_point_production(
912        rules: FxHashMap<SymbolU32, Rhs>,
913    ) -> FxHashMap<SymbolU32, Rhs> {
914        rules
915            .into_iter()
916            .filter_map(|(lhs, rhs)| {
917                let new_rhs = Rhs {
918                    alternations: rhs
919                        .alternations
920                        .into_iter()
921                        .filter_map(|a| {
922                            if a.concatenations.len() == 1 {
923                                match a.concatenations.first().unwrap() {
924                                    OperatorFlattenedNode::Nonterminal(nonterminal) => {
925                                        if *nonterminal == lhs {
926                                            None
927                                        } else {
928                                            Some(a)
929                                        }
930                                    }
931                                    _ => Some(a),
932                                }
933                            } else {
934                                Some(a)
935                            }
936                        })
937                        .collect(),
938                };
939                if new_rhs.alternations.is_empty() {
940                    None
941                } else {
942                    Some((lhs, new_rhs))
943                }
944            })
945            .collect()
946    }
947
948    fn compress_terminals(
949        rules: FxHashMap<SymbolU32, Rhs>,
950        interned_strings: &mut InternedStrings,
951        config: CompressionConfig,
952        id_to_regex: &mut FxHashMap<SymbolU32, FiniteStateAutomaton>,
953    ) -> FxHashMap<SymbolU32, Rhs> {
954        rules
955            .into_iter()
956            .map(|(lhs, rhs)| {
957                let (terminals, mut remaining): (Vec<_>, Vec<_>) =
958                    rhs.alternations.into_iter().partition(|x| {
959                        matches!(
960                            x.concatenations.as_slice(),
961                            [OperatorFlattenedNode::Terminal(_)]
962                        )
963                    });
964                let alternations = if terminals.len() >= config.min_terminals {
965                    let regex_string = from_terminals_to_regex_string(&terminals, interned_strings);
966                    let id = interned_strings
967                        .regex_strings
968                        .get(&regex_string)
969                        .unwrap_or_else(|| match &config.regex_config {
970                            FiniteStateAutomatonConfig::Dfa(config) => {
971                                let dfa = dfa::dense::Builder::new()
972                                    .configure(config.clone())
973                                    .build(&regex_string)
974                                    .unwrap();
975                                let id = interned_strings.regex_strings.get_or_intern(regex_string);
976                                id_to_regex.insert(id, FiniteStateAutomaton::Dfa(dfa));
977                                id
978                            }
979                        });
980                    remaining.push(Alternation {
981                        concatenations: vec![OperatorFlattenedNode::RegexString(id)],
982                    });
983                    remaining
984                } else {
985                    remaining.extend(terminals);
986                    remaining
987                };
988                (lhs, Rhs { alternations })
989            })
990            .collect()
991    }
992
993    fn compact_interned(
994        mut start_symbol: SymbolU32,
995        rules: FxHashMap<SymbolU32, Rhs>,
996        interned: InternedStrings,
997        id_to_regex: FxHashMap<SymbolU32, FiniteStateAutomaton>,
998        id_to_suffix_automaton: FxHashMap<SymbolU32, SuffixAutomaton>,
999    ) -> (
1000        InternedStrings,
1001        Vec<FiniteStateAutomaton>,
1002        Vec<SuffixAutomaton>,
1003        Vec<Rhs>,
1004        SymbolU32,
1005    ) {
1006        let mut interned_nonterminals: StringInterner<StringBackend> = StringInterner::default();
1007        let mut interned_terminals: StringInterner<StringBackend> = StringInterner::default();
1008        let mut interned_regexes: StringInterner<StringBackend> = StringInterner::default();
1009        let mut interned_sub_strings: StringInterner<StringBackend> = StringInterner::default();
1010        let mut new_id_to_regex = Vec::with_capacity(id_to_regex.len());
1011        let mut new_id_to_suffix_automaton = Vec::with_capacity(id_to_suffix_automaton.len());
1012        let mut new_rules: Vec<_> = Vec::with_capacity(rules.len());
1013        let mut start_symbol_updated = false;
1014        for (lhs, rhs) in rules.into_iter() {
1015            let id =
1016                interned_nonterminals.get_or_intern(interned.nonterminals.resolve(lhs).unwrap());
1017            if lhs == start_symbol && !start_symbol_updated {
1018                start_symbol = id;
1019                start_symbol_updated = true;
1020            }
1021            assert!(id.to_usize() == new_rules.len());
1022            new_rules.push(rhs);
1023        }
1024        for rhs in new_rules.iter_mut() {
1025            for Alternation { concatenations } in &mut rhs.alternations {
1026                for concatenation in concatenations {
1027                    match concatenation {
1028                        OperatorFlattenedNode::Nonterminal(nonterminal) => {
1029                            *nonterminal = interned_nonterminals.get_or_intern(
1030                                interned.nonterminals.resolve(*nonterminal).unwrap(),
1031                            );
1032                        }
1033                        OperatorFlattenedNode::Terminal(terminal) => {
1034                            *terminal = interned_terminals
1035                                .get_or_intern(interned.terminals.resolve(*terminal).unwrap());
1036                        }
1037                        OperatorFlattenedNode::RegexString(regex)
1038                        | OperatorFlattenedNode::EarlyEndRegexString(regex)
1039                        | OperatorFlattenedNode::RegexComplement(regex) => {
1040                            let new_id = interned_regexes
1041                                .get_or_intern(interned.regex_strings.resolve(*regex).unwrap());
1042                            if new_id.to_usize() == new_id_to_regex.len() {
1043                                new_id_to_regex.push(id_to_regex.get(regex).unwrap().clone());
1044                            }
1045                            *regex = new_id;
1046                            // Should not fail since StringBackend is contiguous.
1047                        }
1048                        OperatorFlattenedNode::Substrings(substrings) => {
1049                            let new_id = interned_sub_strings
1050                                .get_or_intern(interned.sub_strings.resolve(*substrings).unwrap());
1051                            if new_id.to_usize() == new_id_to_suffix_automaton.len() {
1052                                new_id_to_suffix_automaton
1053                                    .push(id_to_suffix_automaton.get(substrings).unwrap().clone());
1054                            }
1055                            *substrings = new_id;
1056                        }
1057                    }
1058                }
1059            }
1060        }
1061        (
1062            InternedStrings {
1063                nonterminals: interned_nonterminals,
1064                terminals: interned_terminals,
1065                regex_strings: interned_regexes,
1066                sub_strings: interned_sub_strings,
1067            },
1068            new_id_to_regex,
1069            new_id_to_suffix_automaton,
1070            new_rules,
1071            start_symbol,
1072        )
1073    }
1074}