1use std::{iter::zip, mem};
2
3use kbnf_regex_automata::dfa;
4use rustc_hash::{FxHashMap, FxHashSet};
5use string_interner::{backend::StringBackend, symbol::SymbolU32, StringInterner, Symbol};
6
7use crate::{
8 config::CompressionConfig,
9 expression::ExpressionWithID,
10 node::{Alternation, NoNestingNode, NodeWithID, OperatorFlattenedNode, Rhs},
11 regex::{FiniteStateAutomaton, FiniteStateAutomatonConfig},
12 simplified_grammar::SimplifiedGrammar,
13 suffix_automaton::SuffixAutomaton,
14 utils::from_terminals_to_regex_string,
15 InternedStrings, RegexExtKind, SymbolKind,
16};
17
18#[derive(Debug, Clone)]
19pub struct ValidatedGrammar {
20 pub expressions: Vec<ExpressionWithID>,
21 pub interned_strings: InternedStrings,
22 pub start_symbol: SymbolU32,
23 pub id_to_regex: FxHashMap<SymbolU32, FiniteStateAutomaton>,
24 pub id_to_suffix_automaton: FxHashMap<SymbolU32, SuffixAutomaton>,
25}
26
27impl ValidatedGrammar {
28 pub fn simplify_grammar(
29 mut self,
30 config: CompressionConfig,
31 regex_start_config: &kbnf_regex_automata::util::start::Config,
32 ) -> SimplifiedGrammar {
33 let expressions = Self::remove_unused_rules(self.expressions, self.start_symbol);
34 let (expressions, mut special_nonterminals) =
35 Self::flatten_nested_rules_with_nonterminals(expressions, &mut self.interned_strings);
36 let expressions = Self::flatten_operators(expressions);
37 let expressions = Self::group_same_lhs_together(expressions);
38 let expressions = Self::deduplicate_alternations(expressions);
39 let expressions = Self::remove_unit_production(
40 expressions,
41 &mut self.start_symbol,
42 &mut special_nonterminals,
43 );
44 let expressions =
45 Self::merge_consecutive_terminals(expressions, &mut self.interned_strings);
46 let expressions = Self::expand_special_nonterminals(
47 expressions,
48 special_nonterminals,
49 &mut self.interned_strings,
50 );
51 let expressions = Self::merge_identical_rhs_across_nonterminals(expressions);
52 let expressions = Self::remove_nullable_rules(
53 expressions,
54 &self.interned_strings,
55 &self.id_to_regex,
56 &self.id_to_suffix_automaton,
57 regex_start_config,
58 );
59 let expressions = Self::remove_unit_production(
60 expressions,
61 &mut self.start_symbol,
62 &mut FxHashMap::default(),
63 );
64 let expressions =
65 Self::merge_consecutive_terminals(expressions, &mut self.interned_strings);
66 let expressions = Self::remove_fixed_point_production(expressions);
67 let expressions = Self::compress_terminals(
68 expressions,
69 &mut self.interned_strings,
70 config,
71 &mut self.id_to_regex,
72 );
73 let (interned_strings, id_to_regexes, id_to_suffix_automaton, expressions, start_symbol) =
74 Self::compact_interned(
75 self.start_symbol,
76 expressions,
77 self.interned_strings,
78 self.id_to_regex,
79 self.id_to_suffix_automaton,
80 );
81 SimplifiedGrammar {
82 expressions,
83 start_symbol,
84 interned_strings,
85 id_to_regex: id_to_regexes,
86 id_to_suffix_automaton,
87 }
88 }
89
90 fn remove_unused_rules(
91 expressions: Vec<ExpressionWithID>,
92 start_symbol: SymbolU32,
93 ) -> Vec<ExpressionWithID> {
94 let mut used_nonterminals = FxHashSet::default();
95 let mut stack = vec![];
96 used_nonterminals.insert(start_symbol);
97 for ExpressionWithID { lhs, rhs: node } in &expressions {
98 if *lhs == start_symbol {
99 stack.push(node);
100 }
101 }
102 while let Some(node) = stack.pop() {
103 match node {
104 NodeWithID::Terminal(_) => {}
105 NodeWithID::RegexString(_) => {}
106 NodeWithID::EarlyEndRegexString(_) => {}
107 NodeWithID::Substrings(_) => {}
108 NodeWithID::RegexComplement(_) => {}
109 NodeWithID::Nonterminal(nonterminal) => {
110 if used_nonterminals.contains(nonterminal) {
111 continue;
112 }
113 used_nonterminals.insert(*nonterminal);
114 for ExpressionWithID { lhs, rhs: node } in &expressions {
115 if *lhs == *nonterminal {
116 stack.push(node);
117 }
118 }
119 }
120 NodeWithID::Multiple(nodes) => {
121 for node in nodes {
122 stack.push(node);
123 }
124 }
125 NodeWithID::RegexExt(node, _) => {
126 stack.push(node);
127 }
128 NodeWithID::Symbol(lhs, _, rhs) => {
129 stack.push(lhs);
130 stack.push(rhs);
131 }
132 NodeWithID::Group(node) => {
133 stack.push(node);
134 }
135 NodeWithID::Unknown => {
136 unreachable!("Unknown node. This should not happen.")
137 }
138 }
139 }
140 expressions
141 .into_iter()
142 .filter(|expression| used_nonterminals.contains(&expression.lhs))
143 .collect()
144 }
145
146 fn flatten_nested_rules_with_nonterminals(
147 mut rules: Vec<ExpressionWithID>,
148 interned_strings: &mut InternedStrings,
149 ) -> (
150 Vec<(SymbolU32, NoNestingNode)>,
151 FxHashMap<SymbolU32, RegexExtKind>,
152 ) {
153 let mut flattened_rules: Vec<(SymbolU32, NoNestingNode)> = Vec::with_capacity(rules.len());
154 let mut special_nonterminals: FxHashMap<SymbolU32, RegexExtKind> = FxHashMap::default();
155 let get_new_nonterminal_name =
156 |nonterminal: SymbolU32, identifier: &str, interned_strings: &mut InternedStrings| {
157 let mut i = 0;
158 loop {
159 let nonterminal = interned_strings.nonterminals.resolve(nonterminal).unwrap();
160 let new_nonterminal = format!("{nonterminal}_{identifier}_{i}");
161 if interned_strings
162 .nonterminals
163 .get(&new_nonterminal)
164 .is_none()
165 {
166 return interned_strings.nonterminals.get_or_intern(new_nonterminal);
167 }
168 i += 1;
169 }
170 };
171 let mut add_new_rule = |lhs: SymbolU32,
172 identifier: &str,
173 parent: &mut NoNestingNode,
174 node: NodeWithID,
175 rules: &mut Vec<ExpressionWithID>,
176 kind: Option<RegexExtKind>| {
177 let name = get_new_nonterminal_name(lhs, identifier, interned_strings);
178 *parent = NoNestingNode::Nonterminal(name);
179 if let Some(kind) = kind {
180 special_nonterminals.insert(name, kind);
181 }
182 rules.push(ExpressionWithID {
183 lhs: name,
184 rhs: node,
185 });
186 };
187 fn get_slice(nodes: &[NodeWithID]) -> Vec<NoNestingNode> {
188 let mut buffer = Vec::with_capacity(nodes.len());
189 buffer.resize(nodes.len(), NoNestingNode::Unknown);
190 buffer
191 }
192 while !rules.is_empty() {
193 let length = rules.len();
194 for i in (0..length).rev() {
195 let mut stack: Vec<(NodeWithID, &mut NoNestingNode)> = vec![];
196 let mut root = NoNestingNode::Unknown;
197 let ExpressionWithID { lhs, rhs } = rules.swap_remove(i);
198 stack.push((rhs, &mut root));
199 while let Some((mut old_node, new_parent)) = stack.pop() {
200 match &mut old_node {
201 NodeWithID::Terminal(value) => {
202 *new_parent = NoNestingNode::Terminal(*value);
203 }
204 NodeWithID::RegexString(value) => {
205 *new_parent = NoNestingNode::RegexString(*value);
206 }
207 NodeWithID::Nonterminal(value) => {
208 *new_parent = NoNestingNode::Nonterminal(*value);
209 }
210 NodeWithID::RegexComplement(value) => {
211 *new_parent = NoNestingNode::RegexComplement(*value);
212 }
213 NodeWithID::Multiple(nodes) => {
214 *new_parent = NoNestingNode::Concatenations(get_slice(nodes));
215 match new_parent {
216 NoNestingNode::Concatenations(new_nodes) => {
217 for (node, new_parent) in
218 zip(nodes.iter_mut(), new_nodes.iter_mut())
219 {
220 stack.push((
221 mem::replace(node, NodeWithID::Unknown),
222 new_parent,
223 ));
224 }
225 }
226 _ => unreachable!(),
227 }
228 }
229 NodeWithID::RegexExt(node, ext) => {
230 add_new_rule(
231 lhs,
232 match ext {
233 RegexExtKind::Repeat0 => "repeat0",
234 RegexExtKind::Repeat1 => "repeat1",
235 RegexExtKind::Optional => "optional",
236 },
237 new_parent,
238 mem::replace(node.as_mut(), NodeWithID::Unknown),
239 &mut rules,
240 Some(*ext),
241 );
242 }
243 NodeWithID::Symbol(l, symbol, r) => {
244 let nodes = vec![
245 mem::replace(l.as_mut(), NodeWithID::Unknown),
246 mem::replace(r.as_mut(), NodeWithID::Unknown),
247 ];
248 match symbol {
249 SymbolKind::Concatenation => {
250 *new_parent = NoNestingNode::Concatenations(get_slice(&nodes));
251 match new_parent {
252 NoNestingNode::Concatenations(new_nodes) => {
253 for (node, new_parent) in
254 zip(nodes.into_iter(), new_nodes.iter_mut())
255 {
256 stack.push((node, new_parent));
257 }
258 }
259 _ => unreachable!(),
260 }
261 }
262 SymbolKind::Alternation => {
263 *new_parent = NoNestingNode::Alternations(get_slice(&nodes));
264 match new_parent {
265 NoNestingNode::Alternations(new_nodes) => {
266 for (node, new_parent) in
267 zip(nodes.into_iter(), new_nodes.iter_mut())
268 {
269 stack.push((node, new_parent));
270 }
271 }
272 _ => unreachable!(),
273 }
274 }
275 }
276 }
277 NodeWithID::Group(node) => {
278 add_new_rule(
279 lhs,
280 "group",
281 new_parent,
282 mem::replace(node.as_mut(), NodeWithID::Unknown),
283 &mut rules,
284 None,
285 );
286 }
287 NodeWithID::EarlyEndRegexString(value) => {
288 *new_parent = NoNestingNode::EarlyEndRegexString(*value);
289 }
290 NodeWithID::Substrings(value) => {
291 *new_parent = NoNestingNode::Substrings(*value);
292 }
293 NodeWithID::Unknown => {
294 unreachable!("Unknown node. This should not happen.")
295 }
296 }
297 }
298 flattened_rules.push((lhs, root));
299 }
300 }
301 (flattened_rules, special_nonterminals)
302 }
303
304 fn flatten_operators(rules: Vec<(SymbolU32, NoNestingNode)>) -> Vec<(SymbolU32, Rhs)> {
305 let mut flattened_rules: Vec<(SymbolU32, Rhs)> =
306 Vec::<(SymbolU32, Rhs)>::with_capacity(rules.len());
307 for (lhs, node) in rules {
308 let mut rhs = Rhs {
309 alternations: vec![Alternation {
310 concatenations: vec![],
311 }],
312 };
313 let mut stack = vec![(node, 0usize)];
314 while let Some((node, index)) = stack.pop() {
315 match node {
316 NoNestingNode::Unknown => unreachable!("Unknown node. This should not happen."),
317 NoNestingNode::Terminal(value) => {
318 rhs.alternations[index]
319 .concatenations
320 .push(OperatorFlattenedNode::Terminal(value));
321 }
322 NoNestingNode::RegexComplement(value) => {
323 rhs.alternations[index]
324 .concatenations
325 .push(OperatorFlattenedNode::RegexComplement(value));
326 }
327 NoNestingNode::RegexString(value) => {
328 rhs.alternations[index]
329 .concatenations
330 .push(OperatorFlattenedNode::RegexString(value));
331 }
332 NoNestingNode::Substrings(value) => {
333 rhs.alternations[index]
334 .concatenations
335 .push(OperatorFlattenedNode::Substrings(value));
336 }
337 NoNestingNode::Nonterminal(value) => {
338 rhs.alternations[index]
339 .concatenations
340 .push(OperatorFlattenedNode::Nonterminal(value));
341 }
342 NoNestingNode::Concatenations(mut nodes) => {
343 if nodes.is_empty() {
344 continue;
345 }
346 if index != usize::MAX {
347 nodes.reverse();
348 }
349 let new_node = nodes.pop().unwrap();
350 stack.push((
351 NoNestingNode::Concatenations(nodes),
352 usize::MAX, ));
354 stack.push((new_node, rhs.alternations.len() - 1));
355 }
356
357 NoNestingNode::Alternations(mut nodes) => {
358 assert!(
359 nodes.len() == 2,
360 "Alternations should only have two elements."
361 );
362 let r = nodes.pop().unwrap();
363 let l = nodes.pop().unwrap();
364 rhs.alternations.push(Alternation {
366 concatenations: vec![],
367 });
368 stack.push((r, rhs.alternations.len() - 1)); stack.push((l, rhs.alternations.len() - 2)); }
371 NoNestingNode::EarlyEndRegexString(value) => {
372 rhs.alternations[index]
373 .concatenations
374 .push(OperatorFlattenedNode::EarlyEndRegexString(value));
375 }
376 }
377 }
378 flattened_rules.push((lhs, rhs));
379 }
380 flattened_rules
381 }
382
383 fn group_same_lhs_together(rules: Vec<(SymbolU32, Rhs)>) -> FxHashMap<SymbolU32, Rhs> {
384 let mut new_rules: FxHashMap<SymbolU32, Rhs> = FxHashMap::default();
385 for (lhs, rhs) in rules {
386 let entry = new_rules.entry(lhs).or_insert(Rhs {
387 alternations: vec![],
388 });
389 entry.alternations.extend(rhs.alternations);
390 }
391 new_rules
392 }
393
394 fn merge_consecutive_terminals(
395 rules: FxHashMap<SymbolU32, Rhs>,
396 interned_strings: &mut InternedStrings,
397 ) -> FxHashMap<SymbolU32, Rhs> {
398 rules
399 .into_iter()
400 .map(|(lhs, rhs)| {
401 (
402 lhs,
403 Rhs {
404 alternations: rhs
405 .alternations
406 .into_iter()
407 .map(|a| Alternation {
408 concatenations: a.concatenations.into_iter().fold(
409 vec![],
410 |mut acc, c| {
411 if let OperatorFlattenedNode::Terminal(value) = c {
412 if let Some(OperatorFlattenedNode::Terminal(
413 last_value,
414 )) = acc.last()
415 {
416 let last_value = interned_strings
417 .terminals
418 .resolve(*last_value)
419 .unwrap();
420 let value = interned_strings
421 .terminals
422 .resolve(value)
423 .unwrap();
424 let new_value = format!("{}{}", last_value, value);
425 let new_value = interned_strings
426 .terminals
427 .get_or_intern(new_value);
428 acc.pop();
429 acc.push(OperatorFlattenedNode::Terminal(
430 new_value,
431 ));
432 } else {
433 acc.push(c);
434 }
435 } else {
436 acc.push(c);
437 }
438 acc
439 },
440 ),
441 })
442 .collect(),
443 },
444 )
445 })
446 .collect()
447 }
448
449 fn deduplicate_alternations(rules: FxHashMap<SymbolU32, Rhs>) -> FxHashMap<SymbolU32, Rhs> {
450 let mut new_rules: FxHashMap<SymbolU32, FxHashSet<Alternation>> = FxHashMap::default();
451 for (lhs, rhs) in rules {
452 let entry = new_rules.entry(lhs).or_default();
453 entry.extend(rhs.alternations.into_iter());
454 }
455 new_rules
456 .into_iter()
457 .map(|(lhs, rhs)| {
458 (
459 lhs,
460 Rhs {
461 alternations: rhs.into_iter().collect(),
462 },
463 )
464 })
465 .collect()
466 }
467
468 fn remove_unit_production(
469 rules: FxHashMap<SymbolU32, Rhs>,
470 start_nonterminal: &mut SymbolU32,
471 special_nonterminals: &mut FxHashMap<SymbolU32, RegexExtKind>,
472 ) -> FxHashMap<SymbolU32, Rhs> {
473 fn find_unit_chain<'a>(
474 rules: &'a FxHashMap<SymbolU32, Rhs>,
475 nonterminal_node: &'a OperatorFlattenedNode,
476 nonterminal: SymbolU32,
477 special_nonterminals: &mut FxHashMap<SymbolU32, RegexExtKind>,
478 ) -> Vec<&'a OperatorFlattenedNode> {
479 let mut last_nonterminal = nonterminal;
480 let mut chain = vec![nonterminal_node];
481 loop {
482 let rhs = rules.get(&last_nonterminal).unwrap();
483 if rhs.alternations.len() != 1 {
484 break;
485 }
486 let altercation = rhs.alternations.first().unwrap();
487 if altercation.concatenations.len() != 1 {
488 break;
489 }
490 let node = altercation.concatenations.first().unwrap();
491 match node {
492 OperatorFlattenedNode::Nonterminal(next_nonterminal) => {
493 if special_nonterminals.contains_key(&last_nonterminal)
494 ^ special_nonterminals.contains_key(next_nonterminal)
495 {
496 break;
497 }
498 if next_nonterminal == &last_nonterminal {
499 break;
500 }
501 chain.push(node);
502 if let Some(e1) = special_nonterminals.get(&last_nonterminal) {
503 if let Some(e2) = special_nonterminals.get(next_nonterminal) {
504 match (e1, e2) {
505 (RegexExtKind::Repeat0, RegexExtKind::Repeat0) => {}
506 (RegexExtKind::Repeat1, RegexExtKind::Repeat1) => {}
507 (RegexExtKind::Optional, RegexExtKind::Optional) => {}
508 (RegexExtKind::Repeat0, RegexExtKind::Repeat1) => {
509 *special_nonterminals.get_mut(next_nonterminal).unwrap() =
510 RegexExtKind::Repeat0;
511 }
512 (RegexExtKind::Repeat1, RegexExtKind::Repeat0) => {}
513 (RegexExtKind::Repeat0, RegexExtKind::Optional) => {
514 *special_nonterminals.get_mut(next_nonterminal).unwrap() =
515 RegexExtKind::Repeat0;
516 }
517 (RegexExtKind::Optional, RegexExtKind::Repeat0) => {
518 *special_nonterminals.get_mut(next_nonterminal).unwrap() =
519 RegexExtKind::Repeat0;
520 }
521 (RegexExtKind::Repeat1, RegexExtKind::Optional) => {
522 *special_nonterminals.get_mut(next_nonterminal).unwrap() =
523 RegexExtKind::Repeat0;
524 }
525 (RegexExtKind::Optional, RegexExtKind::Repeat1) => {
526 *special_nonterminals.get_mut(next_nonterminal).unwrap() =
527 RegexExtKind::Repeat0;
528 }
529 };
530 }
531 }
532 last_nonterminal = *next_nonterminal;
533 }
534 _ => {
535 if !special_nonterminals.contains_key(&last_nonterminal) {
536 chain.push(node);
537 }
538 break;
539 }
540 }
541 }
542 chain
543 }
544 fn update_nonterminal<'a>(
545 rules: &'a FxHashMap<SymbolU32, Rhs>,
546 nonterminal_node: &'a OperatorFlattenedNode,
547 nonterminal: SymbolU32,
548 visited: &mut FxHashSet<SymbolU32>,
549 src_to_dst: &mut FxHashMap<SymbolU32, OperatorFlattenedNode>,
550 stack: &mut Vec<SymbolU32>,
551 special_nonterminals: &mut FxHashMap<SymbolU32, RegexExtKind>,
552 ) {
553 let chain = find_unit_chain(rules, nonterminal_node, nonterminal, special_nonterminals);
554 visited.extend(chain.iter().take(chain.len() - 1).map(|node| match node {
555 OperatorFlattenedNode::Nonterminal(nonterminal) => *nonterminal,
556 _ => unreachable!(),
557 }));
558 if chain.len() > 1 {
559 if let OperatorFlattenedNode::Nonterminal(nonterminal) = chain.last().unwrap() {
560 stack.push(*nonterminal);
561 }
562 match chain.as_slice() {
563 [first @ .., last] => {
564 for node in first {
565 match node {
566 OperatorFlattenedNode::Nonterminal(nonterminal) => {
567 src_to_dst.insert(*nonterminal, (*last).clone());
568 }
569 _ => unreachable!(),
570 }
571 }
572 }
573 _ => unreachable!(),
574 }
575 } else {
576 stack.push(nonterminal);
577 }
578 }
579 let mut stack = vec![*start_nonterminal];
580 let mut chains: FxHashMap<SymbolU32, OperatorFlattenedNode> = FxHashMap::default();
581 let mut visited = FxHashSet::default();
582 while let Some(nonterminal) = stack.pop() {
583 let rhs = rules.get(&nonterminal).unwrap();
584 for a in rhs.alternations.iter() {
585 for c in a.concatenations.iter() {
586 if let &OperatorFlattenedNode::Nonterminal(nonterminal) = c {
587 if visited.contains(&nonterminal) {
588 continue;
589 }
590
591 update_nonterminal(
592 &rules,
593 c,
594 nonterminal,
595 &mut visited,
596 &mut chains,
597 &mut stack,
598 special_nonterminals,
599 );
600 }
601 }
602 }
603 visited.insert(nonterminal);
604 }
605 if let Some(value) = chains.get(start_nonterminal) {
606 match value {
607 OperatorFlattenedNode::Nonterminal(nonterminal) => {
608 *start_nonterminal = *nonterminal;
609 }
610 _ => unreachable!(),
611 }
612 }
613 rules
614 .into_iter()
615 .filter_map(|(lhs, rhs)| {
616 if chains.contains_key(&lhs) {
617 None
618 } else {
619 Some((
620 lhs,
621 Rhs {
622 alternations: rhs
623 .alternations
624 .into_iter()
625 .map(|a| Alternation {
626 concatenations: a
627 .concatenations
628 .into_iter()
629 .map(|c| match c {
630 OperatorFlattenedNode::Nonterminal(nonterminal) => {
631 chains.get(&nonterminal).unwrap_or(&c).clone()
632 }
633 _ => c,
634 })
635 .collect::<Vec<OperatorFlattenedNode>>(),
636 })
637 .collect(),
638 },
639 ))
640 }
641 })
642 .collect()
643 }
644
645 fn expand_special_nonterminals(
646 rules: FxHashMap<SymbolU32, Rhs>,
647 special_nonterminals: FxHashMap<SymbolU32, RegexExtKind>,
648 interned_strings: &mut InternedStrings,
649 ) -> FxHashMap<SymbolU32, Rhs> {
650 rules
651 .into_iter()
652 .map(|(lhs, mut rhs)| {
653 if let Some(kind) = special_nonterminals.get(&lhs) {
654 match kind {
655 RegexExtKind::Optional => {
656 rhs.alternations.push(Alternation {
657 concatenations: vec![OperatorFlattenedNode::Terminal(
658 interned_strings.terminals.get_or_intern(""),
659 )],
660 });
661 (lhs, rhs)
662 }
663 RegexExtKind::Repeat0 => {
664 let iter = rhs
665 .alternations
666 .iter()
667 .cloned()
668 .map(|mut a| {
669 a.concatenations
670 .insert(0, OperatorFlattenedNode::Nonterminal(lhs));
671 a
672 })
673 .collect::<Vec<_>>();
674 rhs.alternations.extend(iter);
675 rhs.alternations.push(Alternation {
676 concatenations: vec![OperatorFlattenedNode::Terminal(
677 interned_strings.terminals.get_or_intern(""),
678 )],
679 });
680 (lhs, rhs)
681 }
682 RegexExtKind::Repeat1 => {
683 let iter = rhs
684 .alternations
685 .iter()
686 .cloned()
687 .map(|mut a| {
688 a.concatenations
689 .insert(0, OperatorFlattenedNode::Nonterminal(lhs));
690 a
691 })
692 .collect::<Vec<_>>();
693 rhs.alternations.extend(iter);
694 (lhs, rhs)
695 }
696 }
697 } else {
698 (lhs, rhs)
699 }
700 })
701 .collect()
702 }
703
704 fn merge_identical_rhs_across_nonterminals(
705 mut rules: FxHashMap<SymbolU32, Rhs>,
706 ) -> FxHashMap<SymbolU32, Rhs> {
707 loop {
708 let mut updated = false;
710 let mut merged_rhs_to_lhses = FxHashMap::default();
711 let mut lhs_to_lhs = FxHashMap::default();
712 for (lhs, mut rhs) in rules {
713 rhs.alternations.sort();
714 match merged_rhs_to_lhses.entry(rhs) {
715 std::collections::hash_map::Entry::Occupied(entry) => {
716 lhs_to_lhs.insert(lhs, *entry.get());
717 updated = true;
718 }
719 std::collections::hash_map::Entry::Vacant(entry) => {
720 entry.insert(lhs);
721 }
722 }
723 }
724 rules = merged_rhs_to_lhses
725 .into_iter()
726 .map(|(rhs, lhs)| {
727 (
728 lhs,
729 Rhs {
730 alternations: rhs
731 .alternations
732 .into_iter()
733 .map(|alternation| Alternation {
734 concatenations: alternation
735 .concatenations
736 .into_iter()
737 .map(|concatenation| match concatenation {
738 OperatorFlattenedNode::Nonterminal(nonterminal) => {
739 OperatorFlattenedNode::Nonterminal(
740 *lhs_to_lhs
741 .get(&nonterminal)
742 .unwrap_or(&nonterminal),
743 )
744 }
745 _ => concatenation,
746 })
747 .collect(),
748 })
749 .collect(),
750 },
751 )
752 })
753 .collect();
754 if !updated {
755 break;
756 }
757 }
758 rules.into_iter().collect()
759 }
760
761 fn remove_nullable_rules(
762 rules: FxHashMap<SymbolU32, Rhs>,
763 interned_strings: &InternedStrings,
764 id_to_regex: &FxHashMap<SymbolU32, FiniteStateAutomaton>,
765 id_to_suffix_automaton: &FxHashMap<SymbolU32, SuffixAutomaton>,
766 regex_start_config: &kbnf_regex_automata::util::start::Config,
767 ) -> FxHashMap<SymbolU32, Rhs> {
768 fn find_nullable_nonterminals(
769 rules: &FxHashMap<SymbolU32, Rhs>,
770 interned_strings: &InternedStrings,
771 id_to_regex: &FxHashMap<SymbolU32, FiniteStateAutomaton>,
772 id_to_suffix_automaton: &FxHashMap<SymbolU32, SuffixAutomaton>,
773 regex_start_config: &kbnf_regex_automata::util::start::Config,
774 ) -> (
775 FxHashSet<OperatorFlattenedNode>,
776 FxHashSet<OperatorFlattenedNode>,
777 ) {
778 let mut nullable_symbols: FxHashSet<OperatorFlattenedNode> = FxHashSet::default(); let mut null_symbols: FxHashSet<OperatorFlattenedNode> = FxHashSet::default(); loop {
781 let mut updated = false;
783 for (lhs, rhs) in rules {
784 if nullable_symbols.contains(&OperatorFlattenedNode::Nonterminal(*lhs)) {
785 continue;
786 }
787 let mut null_lhs = true;
788 let mut nullable_lhs = false;
789 for a in &rhs.alternations {
790 let mut nullable = true;
791 let mut null = true;
792 for c in &a.concatenations {
793 if null_symbols.contains(c) {
794 null &= true;
795 nullable &= true;
796 } else if nullable_symbols.contains(c) {
797 nullable &= true;
798 null &= false;
799 } else {
800 match c {
801 OperatorFlattenedNode::Terminal(terminal) => {
802 let empty = interned_strings
803 .terminals
804 .resolve(*terminal)
805 .unwrap()
806 .is_empty();
807 if empty {
808 null &= true;
809 nullable &= true;
810 null_symbols.insert(c.clone());
811 nullable_symbols.insert(c.clone());
812 } else {
813 null &= false;
814 nullable &= false;
815 }
816 }
817 OperatorFlattenedNode::RegexString(regex) => {
818 let automaton = id_to_regex.get(regex).unwrap();
819 if automaton.only_empty(regex_start_config) {
820 null &= true;
821 nullable &= true;
822 null_symbols.insert(c.clone());
823 nullable_symbols.insert(c.clone());
824 } else if automaton.has_empty() {
825 nullable &= true;
826 nullable_symbols.insert(c.clone());
827 } else {
828 null &= false;
829 nullable &= false;
830 }
831 }
832 OperatorFlattenedNode::Substrings(substrings) => {
833 let automaton = id_to_suffix_automaton.get(substrings).unwrap();
834 if automaton.num_of_nodes() == 2 { null &= true;
836 nullable &= true;
837 null_symbols.insert(c.clone());
838 nullable_symbols.insert(c.clone());
839 } else {
840 nullable &= true;
841 nullable_symbols.insert(c.clone());
842 }
843 }
844 _ => {
845 null &= false;
846 nullable &= false;
847 }
848 }
849 }
850 }
851 null_lhs &= null;
852 nullable_lhs |= nullable;
853 }
854 if null_lhs {
855 null_symbols.insert(OperatorFlattenedNode::Nonterminal(*lhs));
856 updated = true;
857 }
858 if nullable_lhs {
859 nullable_symbols.insert(OperatorFlattenedNode::Nonterminal(*lhs));
860 updated = true;
861 }
862 }
863 if !updated {
864 break;
865 }
866 }
867 (nullable_symbols, null_symbols)
868 }
869 let (nullable_nodes, null_nodes) = find_nullable_nonterminals(
870 &rules,
871 interned_strings,
872 id_to_regex,
873 id_to_suffix_automaton,
874 regex_start_config,
875 );
876 let mut new_rules = FxHashMap::default();
877 for (lhs, Rhs { alternations }) in rules {
878 let mut new_alterations = FxHashSet::default();
879 for Alternation { concatenations } in alternations {
880 let mut stack = vec![(vec![], concatenations.into_iter())];
881 while let Some((mut prefix, mut iter)) = stack.pop() {
882 if let Some(node) = iter.next() {
883 if null_nodes.contains(&node) {
884 stack.push((prefix, iter));
885 } else if nullable_nodes.contains(&node) {
886 let without = prefix.clone();
887 prefix.push(node);
888 stack.push((prefix, iter.clone()));
889 stack.push((without, iter));
890 } else {
891 prefix.push(node);
892 stack.push((prefix, iter));
893 }
894 } else if !prefix.is_empty() {
895 new_alterations.insert(Alternation {
896 concatenations: prefix,
897 });
898 }
899 }
900 }
901 new_rules.insert(
902 lhs,
903 Rhs {
904 alternations: new_alterations.into_iter().collect(),
905 },
906 );
907 }
908 new_rules
909 }
910
911 fn remove_fixed_point_production(
912 rules: FxHashMap<SymbolU32, Rhs>,
913 ) -> FxHashMap<SymbolU32, Rhs> {
914 rules
915 .into_iter()
916 .filter_map(|(lhs, rhs)| {
917 let new_rhs = Rhs {
918 alternations: rhs
919 .alternations
920 .into_iter()
921 .filter_map(|a| {
922 if a.concatenations.len() == 1 {
923 match a.concatenations.first().unwrap() {
924 OperatorFlattenedNode::Nonterminal(nonterminal) => {
925 if *nonterminal == lhs {
926 None
927 } else {
928 Some(a)
929 }
930 }
931 _ => Some(a),
932 }
933 } else {
934 Some(a)
935 }
936 })
937 .collect(),
938 };
939 if new_rhs.alternations.is_empty() {
940 None
941 } else {
942 Some((lhs, new_rhs))
943 }
944 })
945 .collect()
946 }
947
948 fn compress_terminals(
949 rules: FxHashMap<SymbolU32, Rhs>,
950 interned_strings: &mut InternedStrings,
951 config: CompressionConfig,
952 id_to_regex: &mut FxHashMap<SymbolU32, FiniteStateAutomaton>,
953 ) -> FxHashMap<SymbolU32, Rhs> {
954 rules
955 .into_iter()
956 .map(|(lhs, rhs)| {
957 let (terminals, mut remaining): (Vec<_>, Vec<_>) =
958 rhs.alternations.into_iter().partition(|x| {
959 matches!(
960 x.concatenations.as_slice(),
961 [OperatorFlattenedNode::Terminal(_)]
962 )
963 });
964 let alternations = if terminals.len() >= config.min_terminals {
965 let regex_string = from_terminals_to_regex_string(&terminals, interned_strings);
966 let id = interned_strings
967 .regex_strings
968 .get(®ex_string)
969 .unwrap_or_else(|| match &config.regex_config {
970 FiniteStateAutomatonConfig::Dfa(config) => {
971 let dfa = dfa::dense::Builder::new()
972 .configure(config.clone())
973 .build(®ex_string)
974 .unwrap();
975 let id = interned_strings.regex_strings.get_or_intern(regex_string);
976 id_to_regex.insert(id, FiniteStateAutomaton::Dfa(dfa));
977 id
978 }
979 });
980 remaining.push(Alternation {
981 concatenations: vec![OperatorFlattenedNode::RegexString(id)],
982 });
983 remaining
984 } else {
985 remaining.extend(terminals);
986 remaining
987 };
988 (lhs, Rhs { alternations })
989 })
990 .collect()
991 }
992
993 fn compact_interned(
994 mut start_symbol: SymbolU32,
995 rules: FxHashMap<SymbolU32, Rhs>,
996 interned: InternedStrings,
997 id_to_regex: FxHashMap<SymbolU32, FiniteStateAutomaton>,
998 id_to_suffix_automaton: FxHashMap<SymbolU32, SuffixAutomaton>,
999 ) -> (
1000 InternedStrings,
1001 Vec<FiniteStateAutomaton>,
1002 Vec<SuffixAutomaton>,
1003 Vec<Rhs>,
1004 SymbolU32,
1005 ) {
1006 let mut interned_nonterminals: StringInterner<StringBackend> = StringInterner::default();
1007 let mut interned_terminals: StringInterner<StringBackend> = StringInterner::default();
1008 let mut interned_regexes: StringInterner<StringBackend> = StringInterner::default();
1009 let mut interned_sub_strings: StringInterner<StringBackend> = StringInterner::default();
1010 let mut new_id_to_regex = Vec::with_capacity(id_to_regex.len());
1011 let mut new_id_to_suffix_automaton = Vec::with_capacity(id_to_suffix_automaton.len());
1012 let mut new_rules: Vec<_> = Vec::with_capacity(rules.len());
1013 let mut start_symbol_updated = false;
1014 for (lhs, rhs) in rules.into_iter() {
1015 let id =
1016 interned_nonterminals.get_or_intern(interned.nonterminals.resolve(lhs).unwrap());
1017 if lhs == start_symbol && !start_symbol_updated {
1018 start_symbol = id;
1019 start_symbol_updated = true;
1020 }
1021 assert!(id.to_usize() == new_rules.len());
1022 new_rules.push(rhs);
1023 }
1024 for rhs in new_rules.iter_mut() {
1025 for Alternation { concatenations } in &mut rhs.alternations {
1026 for concatenation in concatenations {
1027 match concatenation {
1028 OperatorFlattenedNode::Nonterminal(nonterminal) => {
1029 *nonterminal = interned_nonterminals.get_or_intern(
1030 interned.nonterminals.resolve(*nonterminal).unwrap(),
1031 );
1032 }
1033 OperatorFlattenedNode::Terminal(terminal) => {
1034 *terminal = interned_terminals
1035 .get_or_intern(interned.terminals.resolve(*terminal).unwrap());
1036 }
1037 OperatorFlattenedNode::RegexString(regex)
1038 | OperatorFlattenedNode::EarlyEndRegexString(regex)
1039 | OperatorFlattenedNode::RegexComplement(regex) => {
1040 let new_id = interned_regexes
1041 .get_or_intern(interned.regex_strings.resolve(*regex).unwrap());
1042 if new_id.to_usize() == new_id_to_regex.len() {
1043 new_id_to_regex.push(id_to_regex.get(regex).unwrap().clone());
1044 }
1045 *regex = new_id;
1046 }
1048 OperatorFlattenedNode::Substrings(substrings) => {
1049 let new_id = interned_sub_strings
1050 .get_or_intern(interned.sub_strings.resolve(*substrings).unwrap());
1051 if new_id.to_usize() == new_id_to_suffix_automaton.len() {
1052 new_id_to_suffix_automaton
1053 .push(id_to_suffix_automaton.get(substrings).unwrap().clone());
1054 }
1055 *substrings = new_id;
1056 }
1057 }
1058 }
1059 }
1060 }
1061 (
1062 InternedStrings {
1063 nonterminals: interned_nonterminals,
1064 terminals: interned_terminals,
1065 regex_strings: interned_regexes,
1066 sub_strings: interned_sub_strings,
1067 },
1068 new_id_to_regex,
1069 new_id_to_suffix_automaton,
1070 new_rules,
1071 start_symbol,
1072 )
1073 }
1074}