kbnf_syntax/
grammar.rs

1use std::fmt::Debug;
2
3use alloc::vec::Vec;
4
5use general_sam::GeneralSam;
6use rustc_hash::{FxHashMap, FxHashSet};
7use string_interner::symbol::SymbolU32;
8
9use crate::{
10    expression::ExpressionWithID,
11    node::NodeWithID,
12    regex::{FiniteStateAutomaton, FiniteStateAutomatonConfig},
13    semantic_error::SemanticError,
14    suffix_automaton::SuffixAutomaton,
15    utils::compile_one_regex_string,
16    validated_grammar::ValidatedGrammar,
17    InternedStrings,
18};
19
20#[derive(Debug, Clone)]
21pub struct Grammar {
22    pub expressions: Vec<ExpressionWithID>,
23    pub interned_strings: InternedStrings,
24}
25
26impl Grammar {
27    pub fn validate_grammar(
28        self,
29        start_nonterminal: &str,
30        regex_config: FiniteStateAutomatonConfig,
31    ) -> Result<ValidatedGrammar, Box<SemanticError>> {
32        let start = self
33            .interned_strings
34            .nonterminals
35            .get(start_nonterminal)
36            .ok_or(SemanticError::UndefinedNonterminal(start_nonterminal.to_string()))?;
37        self.check_undefined_nonterminal(start_nonterminal)?;
38        let regexes = self.compile_regex_string(regex_config)?;
39        let suffix_automata = self.compile_suffix_automaton();
40        Ok(ValidatedGrammar {
41            expressions: self.expressions,
42            start_symbol: start,
43            interned_strings: self.interned_strings,
44            id_to_regex: regexes,
45            id_to_suffix_automaton: suffix_automata,
46        })
47    }
48
49    fn check_undefined_nonterminal(&self, start_symbol: &str) -> Result<(), Box<SemanticError>> {
50        fn check_defined_nonterminals(
51            defined_nonterminals: &FxHashSet<SymbolU32>,
52            expressions: &[ExpressionWithID],
53            interned_strings: &InternedStrings,
54        ) -> Result<(), Box<SemanticError>> {
55            for expression in expressions.iter() {
56                let mut stack = vec![&expression.rhs];
57                while let Some(node) = stack.pop() {
58                    match node {
59                        NodeWithID::Terminal(_) => {}
60                        NodeWithID::RegexString(_) => {}
61                        NodeWithID::EarlyEndRegexString(_) => {}
62                        NodeWithID::Substrings(_) => {}
63                        NodeWithID::RegexComplement(_) => {}
64                        NodeWithID::Nonterminal(nonterminal) => {
65                            if !defined_nonterminals.contains(nonterminal) {
66                                return Err(Box::new(SemanticError::UndefinedNonterminal(
67                                    interned_strings
68                                        .nonterminals
69                                        .resolve(*nonterminal)
70                                        .unwrap()
71                                        .to_string(),
72                                )));
73                            }
74                        }
75                        NodeWithID::Multiple(nodes) => {
76                            stack.extend(nodes);
77                        }
78                        NodeWithID::RegexExt(node, _) => {
79                            stack.push(node);
80                        }
81                        NodeWithID::Symbol(lhs, _, rhs) => {
82                            stack.push(lhs);
83                            stack.push(rhs);
84                        }
85                        NodeWithID::Group(node) => {
86                            stack.push(node);
87                        }
88                        NodeWithID::Unknown => unreachable!(),
89                    }
90                }
91            }
92            Ok(())
93        }
94        let defined_nonterminals = self
95            .expressions
96            .iter()
97            .map(|expression| expression.lhs)
98            .collect::<FxHashSet<SymbolU32>>();
99        self.interned_strings
100            .nonterminals
101            .get(start_symbol)
102            .ok_or_else(|| SemanticError::UndefinedNonterminal(start_symbol.to_string()))?;
103        check_defined_nonterminals(
104            &defined_nonterminals,
105            &self.expressions,
106            &self.interned_strings,
107        )
108    }
109
110    fn compile_regex_string(
111        &self,
112        config: FiniteStateAutomatonConfig,
113    ) -> Result<FxHashMap<SymbolU32, FiniteStateAutomaton>, Box<SemanticError>> {
114        let mut regexes = FxHashMap::default();
115        for (id, regex_string) in &self.interned_strings.regex_strings {
116            let regex: Result<FiniteStateAutomaton, SemanticError> =
117                compile_one_regex_string(regex_string, config.clone());
118            let regex = match regex {
119                Ok(x) => x,
120                Err(e) => return Err(Box::new(e)),
121            };
122            regexes.insert(id, regex);
123        }
124        Ok(regexes)
125    }
126
127    fn compile_suffix_automaton(&self) -> FxHashMap<SymbolU32, SuffixAutomaton> {
128        let mut suffix_automata = FxHashMap::default();
129        for (id, full_string) in &self.interned_strings.sub_strings {
130            let suffix_automaton = GeneralSam::from_bytes(full_string);
131            suffix_automata.insert(id, suffix_automaton);
132        }
133        suffix_automata
134    }
135}
136
137#[cfg(test)]
138mod test {
139    use insta::assert_snapshot;
140    use kbnf_regex_automata::dfa::dense::Config;
141
142    use crate::{config::CompressionConfig, get_grammar};
143    #[test]
144    #[should_panic]
145    fn undefined_nonterminal() {
146        let source = r#"
147            except ::= A;
148        "#;
149        let _ = get_grammar(source)
150            .unwrap()
151            .validate_grammar(
152                "except",
153                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
154            )
155            .unwrap();
156    }
157    #[test]
158    #[should_panic]
159    fn undefined_nonterminal2() {
160        let source = r#"
161             except ::= except!(A);
162        "#;
163        let _ = get_grammar(source)
164            .unwrap()
165            .validate_grammar(
166                "except",
167                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
168            )
169            .unwrap();
170    }
171    #[test]
172    #[should_panic]
173    fn undefined_nonterminal3() {
174        let source = r#"
175             except ::= except!(A);
176        "#;
177        let _ = get_grammar(source)
178            .unwrap()
179            .validate_grammar(
180                "A",
181                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
182            )
183            .unwrap();
184    }
185    #[test]
186    #[should_panic]
187    fn invalid_excepted_nonterminal() {
188        let source = r#"
189             except ::= except!(A);
190             A ::= 'a'|('a'|'b');
191        "#;
192        let _ = get_grammar(source)
193            .unwrap()
194            .validate_grammar(
195                "A",
196                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
197            )
198            .unwrap();
199    }
200    #[test]
201    #[should_panic]
202    fn invalid_excepted_nonterminal2() {
203        let source = r#"
204             except ::= except!(A);
205             A ::= B;
206             B ::= 'c';
207        "#;
208        let _ = get_grammar(source)
209            .unwrap()
210            .validate_grammar(
211                "A",
212                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
213            )
214            .unwrap();
215    }
216    #[test]
217    #[should_panic]
218    fn invalid_excepted_terminal() {
219        let source = r#"
220             except ::= except!('');
221             A ::= 'a'|'';
222        "#;
223        let _ = get_grammar(source)
224            .unwrap()
225            .validate_grammar(
226                "A",
227                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
228            )
229            .unwrap();
230    }
231
232    #[test]
233    fn simplify_grammar2() {
234        let source = r#"
235            S ::= (A)? (A)? (A)? (A)? (A)? B;
236            A ::= 'cd';
237            B ::= A'a';
238        "#;
239        let result = get_grammar(source)
240            .unwrap()
241            .validate_grammar(
242                "S",
243                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
244            )
245            .unwrap()
246            .simplify_grammar(
247                CompressionConfig {
248                    min_terminals: 2,
249                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
250                },
251                &kbnf_regex_automata::util::start::Config::new(),
252            );
253        assert_snapshot!(format!("{:?}", result))
254    }
255
256    #[test]
257    fn simplify_grammar3() {
258        let source = r#"
259            S ::= 'a'? 'b'? 'c'? 'd'? 'e'?;
260            A ::= 'cd';
261        "#;
262        let result = get_grammar(source)
263            .unwrap()
264            .validate_grammar(
265                "S",
266                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
267            )
268            .unwrap()
269            .simplify_grammar(
270                CompressionConfig {
271                    min_terminals: 2,
272                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
273                },
274                &kbnf_regex_automata::util::start::Config::new(),
275            );
276        assert_snapshot!(format!("{:?}", result))
277    }
278
279    #[test]
280    fn simplify_grammar4() {
281        let source = r#"
282            S ::= ((A?)*)+;
283            A ::= 'cd'?;
284        "#;
285        let result = get_grammar(source)
286            .unwrap()
287            .validate_grammar(
288                "S",
289                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
290            )
291            .unwrap()
292            .simplify_grammar(
293                CompressionConfig {
294                    min_terminals: 2,
295                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
296                },
297                &kbnf_regex_automata::util::start::Config::new(),
298            );
299        assert_snapshot!(format!("{:?}", result))
300    }
301
302    #[test]
303    fn simplify_grammar5() {
304        let source = r#"
305            S ::= 'ab'S? | 'jk'(((A)));
306            A ::= 'cd'|'cd'|A'c'|'Ac';
307        "#;
308        let result = get_grammar(source)
309            .unwrap()
310            .validate_grammar(
311                "S",
312                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
313            )
314            .unwrap()
315            .simplify_grammar(
316                CompressionConfig {
317                    min_terminals: 2,
318                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
319                },
320                &kbnf_regex_automata::util::start::Config::new(),
321            );
322        assert_snapshot!(format!("{:?}", result))
323    }
324
325    #[test]
326    fn simplify_grammar6() {
327        let source = r#"
328            S ::= 'cd'A B;
329            A ::= #"";
330            B ::= #e'cd';
331        "#;
332        let result = get_grammar(source)
333            .unwrap()
334            .validate_grammar(
335                "S",
336                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
337            )
338            .unwrap()
339            .simplify_grammar(
340                CompressionConfig {
341                    min_terminals: 2,
342                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
343                },
344                &kbnf_regex_automata::util::start::Config::new(),
345            );
346        assert_snapshot!(format!("{:?}", result))
347    }
348
349    #[test]
350    fn simplify_grammar9() {
351        let source = r#"
352            S ::= 'c'|'a'|'b'|'d';
353            A ::= #"";
354        "#;
355        let result = get_grammar(source)
356            .unwrap()
357            .validate_grammar(
358                "S",
359                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
360            )
361            .unwrap()
362            .simplify_grammar(
363                CompressionConfig {
364                    min_terminals: 2,
365                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
366                },
367                &kbnf_regex_automata::util::start::Config::new(),
368            );
369        assert_snapshot!(format!("{:?}", result))
370    }
371    #[test]
372    fn simplify_grammar11() {
373        let source = r#"
374__choice_food_0 ::= 'railroad' | 'orange' | 'banana';
375__regex_0_0 ::= #'[0-9]+';
376__regex_1_0 ::= #'[a-z]+';
377__choice_ID_0 ::= __regex_0_0 | __regex_1_0;
378integer ::= #"-?(0|[1-9]\\d*)";
379number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?";
380string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{4})*"';
381boolean ::= "true"|"false";
382null ::= "null";
383array ::= array_begin (json_value (comma json_value)*)? array_end;
384object ::= object_begin (string colon json_value (comma string colon json_value)*)? object_end;
385json_value ::= number|string|boolean|null|array|object;
386comma ::= #"(\u0020|\u000A|\u000D|\u0009)*,(\u0020|\u000A|\u000D|\u0009)*";
387colon ::= #"(\u0020|\u000A|\u000D|\u0009)*:(\u0020|\u000A|\u000D|\u0009)*";
388object_begin ::= #"\\{(\u0020|\u000A|\u000D|\u0009)*";
389object_end ::= #"(\u0020|\u000A|\u000D|\u0009)*\\}";
390array_begin ::= #"\\[(\u0020|\u000A|\u000D|\u0009)*";
391array_end ::= #"(\u0020|\u000A|\u000D|\u0009)*\\]";
392__schema_json_0 ::= object_begin 'name' colon __schema_json_0_name comma 'weight' colon __schema_json_0_weight comma 'color' colon __schema_json_0_color object_end;
393__schema_json_0_color ::= string;
394__schema_json_0_weight ::= number;
395__schema_json_0_name ::= string;
396
397start ::= 'Today, I want to eat ' __choice_food_0 '\n' "My food's ID is " __choice_ID_0 '.\n' "\nWhat's more, indentations\nare handled\nappropriately." "Let's end with a json: " __schema_json_0 '\n';
398        "#;
399        let result = get_grammar(source)
400            .unwrap()
401            .validate_grammar(
402                "start",
403                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
404            )
405            .unwrap()
406            .simplify_grammar(
407                CompressionConfig {
408                    min_terminals: 3,
409                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
410                },
411                &kbnf_regex_automata::util::start::Config::new(),
412            );
413        assert_snapshot!(format!("{:?}", result))
414    }
415    #[test]
416    fn unit_production_for_start_symbol() {
417        let source = r#"
418            S ::= 'a';
419        "#;
420        let result = get_grammar(source)
421            .unwrap()
422            .validate_grammar(
423                "S",
424                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
425            )
426            .unwrap()
427            .simplify_grammar(
428                CompressionConfig {
429                    min_terminals: 2,
430                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
431                },
432                &kbnf_regex_automata::util::start::Config::new(),
433            );
434        assert_snapshot!(format!("{:?}", result))
435    }
436    #[test]
437    fn empty_grammar() {
438        let source = r#"
439            S ::= '';
440        "#;
441        let result = get_grammar(source)
442            .unwrap()
443            .validate_grammar(
444                "S",
445                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
446            )
447            .unwrap()
448            .simplify_grammar(
449                CompressionConfig {
450                    min_terminals: 2,
451                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
452                },
453                &kbnf_regex_automata::util::start::Config::new(),
454            );
455        assert_snapshot!(format!("{:?}", result))
456    }
457    #[test]
458    fn empty_grammar2() {
459        let source = r#"
460            S ::= S;
461        "#;
462        let result = get_grammar(source)
463            .unwrap()
464            .validate_grammar(
465                "S",
466                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
467            )
468            .unwrap()
469            .simplify_grammar(
470                CompressionConfig {
471                    min_terminals: 2,
472                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
473                },
474                &kbnf_regex_automata::util::start::Config::new(),
475            );
476        assert_snapshot!(format!("{:?}", result))
477    }
478
479    #[test]
480    fn empty_grammar3() {
481        let source = r#"
482            S ::= ''|#'^$';
483        "#;
484        let result = get_grammar(source)
485            .unwrap()
486            .validate_grammar(
487                "S",
488                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
489            )
490            .unwrap()
491            .simplify_grammar(
492                CompressionConfig {
493                    min_terminals: 2,
494                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
495                },
496                &kbnf_regex_automata::util::start::Config::new()
497                    .anchored(kbnf_regex_automata::Anchored::Yes),
498            );
499        assert_snapshot!(format!("{:?}", result))
500    }
501    #[test]
502    fn indirect_right_recursive_grammar() {
503        let source = "start::=A'\n';
504        A::='x'|'x' B;
505        B::='y'|'y' A;";
506        let result = get_grammar(source)
507            .unwrap()
508            .validate_grammar(
509                "start",
510                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
511            )
512            .unwrap()
513            .simplify_grammar(
514                CompressionConfig {
515                    min_terminals: 3,
516                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
517                },
518                &kbnf_regex_automata::util::start::Config::new()
519                    .anchored(kbnf_regex_automata::Anchored::Yes),
520            );
521        assert_snapshot!(format!("{:?}", result))
522    }
523    #[test]
524    fn linked_list() {
525        let source = r#"__schema_json_1_next_0 ::= __schema_json_1;
526
527start ::= "```json\n"__schema_json_1"```\n";
528
529__schema_json_1 ::= 
530    #"\\A\\{( |\n|\r|\t)*\\z" 
531    "\"value\""
532    #"\\A( |\n|\r|\t)*:( |\n|\r|\t)*\\z" 
533    #"\\A-?(0|[1-9]\\d*)\\z" 
534    #"\\A( |\n|\r|\t)*,( |\n|\r|\t)*\\z" 
535    "\"next\"" 
536    #"\\A( |\n|\r|\t)*:( |\n|\r|\t)*\\z" 
537    __schema_json_1_next
538    #"\\A( |\n|\r|\t)*\\}\\z";
539
540__schema_json_1_next ::= 
541    "null"
542    | __schema_json_1_next_0;"#;
543        let result = get_grammar(source)
544            .unwrap()
545            .validate_grammar(
546                "start",
547                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
548            )
549            .unwrap()
550            .simplify_grammar(
551                CompressionConfig {
552                    min_terminals: 3,
553                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
554                },
555                &kbnf_regex_automata::util::start::Config::new()
556                    .anchored(kbnf_regex_automata::Anchored::Yes),
557            );
558        assert_snapshot!(format!("{:?}", result))
559    }
560
561    #[test]
562    fn sub_strings() {
563        let source = r#"
564            S ::= B #substrs"abc" "d" | #substrs"A" "e";
565            B ::= #substrs"";
566        "#;
567        let result = get_grammar(source)
568            .unwrap()
569            .validate_grammar(
570                "S",
571                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
572            )
573            .unwrap()
574            .simplify_grammar(
575                CompressionConfig {
576                    min_terminals: 2,
577                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
578                },
579                &kbnf_regex_automata::util::start::Config::new()
580                    .anchored(kbnf_regex_automata::Anchored::Yes),
581            );
582        assert_snapshot!(format!("{:?}", result))
583    }
584
585    #[test]
586    fn regex_complement() {
587        let source = r#"
588            filter ::= #ex"[a-z]+";
589        "#;
590        let result = get_grammar(source).unwrap().validate_grammar(
591            "filter",
592                crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
593            )
594            .unwrap()
595            .simplify_grammar(
596                CompressionConfig {
597                    min_terminals: 2,
598                    regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
599                },
600                &kbnf_regex_automata::util::start::Config::new(),
601            );
602        assert_snapshot!(format!("{:?}", result))
603    }
604}