1use std::fmt::Debug;
2
3use alloc::vec::Vec;
4
5use general_sam::GeneralSam;
6use rustc_hash::{FxHashMap, FxHashSet};
7use string_interner::symbol::SymbolU32;
8
9use crate::{
10 expression::ExpressionWithID,
11 node::NodeWithID,
12 regex::{FiniteStateAutomaton, FiniteStateAutomatonConfig},
13 semantic_error::SemanticError,
14 suffix_automaton::SuffixAutomaton,
15 utils::compile_one_regex_string,
16 validated_grammar::ValidatedGrammar,
17 InternedStrings,
18};
19
20#[derive(Debug, Clone)]
21pub struct Grammar {
22 pub expressions: Vec<ExpressionWithID>,
23 pub interned_strings: InternedStrings,
24}
25
26impl Grammar {
27 pub fn validate_grammar(
28 self,
29 start_nonterminal: &str,
30 regex_config: FiniteStateAutomatonConfig,
31 ) -> Result<ValidatedGrammar, Box<SemanticError>> {
32 let start = self
33 .interned_strings
34 .nonterminals
35 .get(start_nonterminal)
36 .ok_or(SemanticError::UndefinedNonterminal(start_nonterminal.to_string()))?;
37 self.check_undefined_nonterminal(start_nonterminal)?;
38 let regexes = self.compile_regex_string(regex_config)?;
39 let suffix_automata = self.compile_suffix_automaton();
40 Ok(ValidatedGrammar {
41 expressions: self.expressions,
42 start_symbol: start,
43 interned_strings: self.interned_strings,
44 id_to_regex: regexes,
45 id_to_suffix_automaton: suffix_automata,
46 })
47 }
48
49 fn check_undefined_nonterminal(&self, start_symbol: &str) -> Result<(), Box<SemanticError>> {
50 fn check_defined_nonterminals(
51 defined_nonterminals: &FxHashSet<SymbolU32>,
52 expressions: &[ExpressionWithID],
53 interned_strings: &InternedStrings,
54 ) -> Result<(), Box<SemanticError>> {
55 for expression in expressions.iter() {
56 let mut stack = vec![&expression.rhs];
57 while let Some(node) = stack.pop() {
58 match node {
59 NodeWithID::Terminal(_) => {}
60 NodeWithID::RegexString(_) => {}
61 NodeWithID::EarlyEndRegexString(_) => {}
62 NodeWithID::Substrings(_) => {}
63 NodeWithID::RegexComplement(_) => {}
64 NodeWithID::Nonterminal(nonterminal) => {
65 if !defined_nonterminals.contains(nonterminal) {
66 return Err(Box::new(SemanticError::UndefinedNonterminal(
67 interned_strings
68 .nonterminals
69 .resolve(*nonterminal)
70 .unwrap()
71 .to_string(),
72 )));
73 }
74 }
75 NodeWithID::Multiple(nodes) => {
76 stack.extend(nodes);
77 }
78 NodeWithID::RegexExt(node, _) => {
79 stack.push(node);
80 }
81 NodeWithID::Symbol(lhs, _, rhs) => {
82 stack.push(lhs);
83 stack.push(rhs);
84 }
85 NodeWithID::Group(node) => {
86 stack.push(node);
87 }
88 NodeWithID::Unknown => unreachable!(),
89 }
90 }
91 }
92 Ok(())
93 }
94 let defined_nonterminals = self
95 .expressions
96 .iter()
97 .map(|expression| expression.lhs)
98 .collect::<FxHashSet<SymbolU32>>();
99 self.interned_strings
100 .nonterminals
101 .get(start_symbol)
102 .ok_or_else(|| SemanticError::UndefinedNonterminal(start_symbol.to_string()))?;
103 check_defined_nonterminals(
104 &defined_nonterminals,
105 &self.expressions,
106 &self.interned_strings,
107 )
108 }
109
110 fn compile_regex_string(
111 &self,
112 config: FiniteStateAutomatonConfig,
113 ) -> Result<FxHashMap<SymbolU32, FiniteStateAutomaton>, Box<SemanticError>> {
114 let mut regexes = FxHashMap::default();
115 for (id, regex_string) in &self.interned_strings.regex_strings {
116 let regex: Result<FiniteStateAutomaton, SemanticError> =
117 compile_one_regex_string(regex_string, config.clone());
118 let regex = match regex {
119 Ok(x) => x,
120 Err(e) => return Err(Box::new(e)),
121 };
122 regexes.insert(id, regex);
123 }
124 Ok(regexes)
125 }
126
127 fn compile_suffix_automaton(&self) -> FxHashMap<SymbolU32, SuffixAutomaton> {
128 let mut suffix_automata = FxHashMap::default();
129 for (id, full_string) in &self.interned_strings.sub_strings {
130 let suffix_automaton = GeneralSam::from_bytes(full_string);
131 suffix_automata.insert(id, suffix_automaton);
132 }
133 suffix_automata
134 }
135}
136
137#[cfg(test)]
138mod test {
139 use insta::assert_snapshot;
140 use kbnf_regex_automata::dfa::dense::Config;
141
142 use crate::{config::CompressionConfig, get_grammar};
143 #[test]
144 #[should_panic]
145 fn undefined_nonterminal() {
146 let source = r#"
147 except ::= A;
148 "#;
149 let _ = get_grammar(source)
150 .unwrap()
151 .validate_grammar(
152 "except",
153 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
154 )
155 .unwrap();
156 }
157 #[test]
158 #[should_panic]
159 fn undefined_nonterminal2() {
160 let source = r#"
161 except ::= except!(A);
162 "#;
163 let _ = get_grammar(source)
164 .unwrap()
165 .validate_grammar(
166 "except",
167 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
168 )
169 .unwrap();
170 }
171 #[test]
172 #[should_panic]
173 fn undefined_nonterminal3() {
174 let source = r#"
175 except ::= except!(A);
176 "#;
177 let _ = get_grammar(source)
178 .unwrap()
179 .validate_grammar(
180 "A",
181 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
182 )
183 .unwrap();
184 }
185 #[test]
186 #[should_panic]
187 fn invalid_excepted_nonterminal() {
188 let source = r#"
189 except ::= except!(A);
190 A ::= 'a'|('a'|'b');
191 "#;
192 let _ = get_grammar(source)
193 .unwrap()
194 .validate_grammar(
195 "A",
196 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
197 )
198 .unwrap();
199 }
200 #[test]
201 #[should_panic]
202 fn invalid_excepted_nonterminal2() {
203 let source = r#"
204 except ::= except!(A);
205 A ::= B;
206 B ::= 'c';
207 "#;
208 let _ = get_grammar(source)
209 .unwrap()
210 .validate_grammar(
211 "A",
212 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
213 )
214 .unwrap();
215 }
216 #[test]
217 #[should_panic]
218 fn invalid_excepted_terminal() {
219 let source = r#"
220 except ::= except!('');
221 A ::= 'a'|'';
222 "#;
223 let _ = get_grammar(source)
224 .unwrap()
225 .validate_grammar(
226 "A",
227 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
228 )
229 .unwrap();
230 }
231
232 #[test]
233 fn simplify_grammar2() {
234 let source = r#"
235 S ::= (A)? (A)? (A)? (A)? (A)? B;
236 A ::= 'cd';
237 B ::= A'a';
238 "#;
239 let result = get_grammar(source)
240 .unwrap()
241 .validate_grammar(
242 "S",
243 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
244 )
245 .unwrap()
246 .simplify_grammar(
247 CompressionConfig {
248 min_terminals: 2,
249 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
250 },
251 &kbnf_regex_automata::util::start::Config::new(),
252 );
253 assert_snapshot!(format!("{:?}", result))
254 }
255
256 #[test]
257 fn simplify_grammar3() {
258 let source = r#"
259 S ::= 'a'? 'b'? 'c'? 'd'? 'e'?;
260 A ::= 'cd';
261 "#;
262 let result = get_grammar(source)
263 .unwrap()
264 .validate_grammar(
265 "S",
266 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
267 )
268 .unwrap()
269 .simplify_grammar(
270 CompressionConfig {
271 min_terminals: 2,
272 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
273 },
274 &kbnf_regex_automata::util::start::Config::new(),
275 );
276 assert_snapshot!(format!("{:?}", result))
277 }
278
279 #[test]
280 fn simplify_grammar4() {
281 let source = r#"
282 S ::= ((A?)*)+;
283 A ::= 'cd'?;
284 "#;
285 let result = get_grammar(source)
286 .unwrap()
287 .validate_grammar(
288 "S",
289 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
290 )
291 .unwrap()
292 .simplify_grammar(
293 CompressionConfig {
294 min_terminals: 2,
295 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
296 },
297 &kbnf_regex_automata::util::start::Config::new(),
298 );
299 assert_snapshot!(format!("{:?}", result))
300 }
301
302 #[test]
303 fn simplify_grammar5() {
304 let source = r#"
305 S ::= 'ab'S? | 'jk'(((A)));
306 A ::= 'cd'|'cd'|A'c'|'Ac';
307 "#;
308 let result = get_grammar(source)
309 .unwrap()
310 .validate_grammar(
311 "S",
312 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
313 )
314 .unwrap()
315 .simplify_grammar(
316 CompressionConfig {
317 min_terminals: 2,
318 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
319 },
320 &kbnf_regex_automata::util::start::Config::new(),
321 );
322 assert_snapshot!(format!("{:?}", result))
323 }
324
325 #[test]
326 fn simplify_grammar6() {
327 let source = r#"
328 S ::= 'cd'A B;
329 A ::= #"";
330 B ::= #e'cd';
331 "#;
332 let result = get_grammar(source)
333 .unwrap()
334 .validate_grammar(
335 "S",
336 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
337 )
338 .unwrap()
339 .simplify_grammar(
340 CompressionConfig {
341 min_terminals: 2,
342 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
343 },
344 &kbnf_regex_automata::util::start::Config::new(),
345 );
346 assert_snapshot!(format!("{:?}", result))
347 }
348
349 #[test]
350 fn simplify_grammar9() {
351 let source = r#"
352 S ::= 'c'|'a'|'b'|'d';
353 A ::= #"";
354 "#;
355 let result = get_grammar(source)
356 .unwrap()
357 .validate_grammar(
358 "S",
359 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
360 )
361 .unwrap()
362 .simplify_grammar(
363 CompressionConfig {
364 min_terminals: 2,
365 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
366 },
367 &kbnf_regex_automata::util::start::Config::new(),
368 );
369 assert_snapshot!(format!("{:?}", result))
370 }
371 #[test]
372 fn simplify_grammar11() {
373 let source = r#"
374__choice_food_0 ::= 'railroad' | 'orange' | 'banana';
375__regex_0_0 ::= #'[0-9]+';
376__regex_1_0 ::= #'[a-z]+';
377__choice_ID_0 ::= __regex_0_0 | __regex_1_0;
378integer ::= #"-?(0|[1-9]\\d*)";
379number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?";
380string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{4})*"';
381boolean ::= "true"|"false";
382null ::= "null";
383array ::= array_begin (json_value (comma json_value)*)? array_end;
384object ::= object_begin (string colon json_value (comma string colon json_value)*)? object_end;
385json_value ::= number|string|boolean|null|array|object;
386comma ::= #"(\u0020|\u000A|\u000D|\u0009)*,(\u0020|\u000A|\u000D|\u0009)*";
387colon ::= #"(\u0020|\u000A|\u000D|\u0009)*:(\u0020|\u000A|\u000D|\u0009)*";
388object_begin ::= #"\\{(\u0020|\u000A|\u000D|\u0009)*";
389object_end ::= #"(\u0020|\u000A|\u000D|\u0009)*\\}";
390array_begin ::= #"\\[(\u0020|\u000A|\u000D|\u0009)*";
391array_end ::= #"(\u0020|\u000A|\u000D|\u0009)*\\]";
392__schema_json_0 ::= object_begin 'name' colon __schema_json_0_name comma 'weight' colon __schema_json_0_weight comma 'color' colon __schema_json_0_color object_end;
393__schema_json_0_color ::= string;
394__schema_json_0_weight ::= number;
395__schema_json_0_name ::= string;
396
397start ::= 'Today, I want to eat ' __choice_food_0 '\n' "My food's ID is " __choice_ID_0 '.\n' "\nWhat's more, indentations\nare handled\nappropriately." "Let's end with a json: " __schema_json_0 '\n';
398 "#;
399 let result = get_grammar(source)
400 .unwrap()
401 .validate_grammar(
402 "start",
403 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
404 )
405 .unwrap()
406 .simplify_grammar(
407 CompressionConfig {
408 min_terminals: 3,
409 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
410 },
411 &kbnf_regex_automata::util::start::Config::new(),
412 );
413 assert_snapshot!(format!("{:?}", result))
414 }
415 #[test]
416 fn unit_production_for_start_symbol() {
417 let source = r#"
418 S ::= 'a';
419 "#;
420 let result = get_grammar(source)
421 .unwrap()
422 .validate_grammar(
423 "S",
424 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
425 )
426 .unwrap()
427 .simplify_grammar(
428 CompressionConfig {
429 min_terminals: 2,
430 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
431 },
432 &kbnf_regex_automata::util::start::Config::new(),
433 );
434 assert_snapshot!(format!("{:?}", result))
435 }
436 #[test]
437 fn empty_grammar() {
438 let source = r#"
439 S ::= '';
440 "#;
441 let result = get_grammar(source)
442 .unwrap()
443 .validate_grammar(
444 "S",
445 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
446 )
447 .unwrap()
448 .simplify_grammar(
449 CompressionConfig {
450 min_terminals: 2,
451 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
452 },
453 &kbnf_regex_automata::util::start::Config::new(),
454 );
455 assert_snapshot!(format!("{:?}", result))
456 }
457 #[test]
458 fn empty_grammar2() {
459 let source = r#"
460 S ::= S;
461 "#;
462 let result = get_grammar(source)
463 .unwrap()
464 .validate_grammar(
465 "S",
466 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
467 )
468 .unwrap()
469 .simplify_grammar(
470 CompressionConfig {
471 min_terminals: 2,
472 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
473 },
474 &kbnf_regex_automata::util::start::Config::new(),
475 );
476 assert_snapshot!(format!("{:?}", result))
477 }
478
479 #[test]
480 fn empty_grammar3() {
481 let source = r#"
482 S ::= ''|#'^$';
483 "#;
484 let result = get_grammar(source)
485 .unwrap()
486 .validate_grammar(
487 "S",
488 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
489 )
490 .unwrap()
491 .simplify_grammar(
492 CompressionConfig {
493 min_terminals: 2,
494 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
495 },
496 &kbnf_regex_automata::util::start::Config::new()
497 .anchored(kbnf_regex_automata::Anchored::Yes),
498 );
499 assert_snapshot!(format!("{:?}", result))
500 }
501 #[test]
502 fn indirect_right_recursive_grammar() {
503 let source = "start::=A'\n';
504 A::='x'|'x' B;
505 B::='y'|'y' A;";
506 let result = get_grammar(source)
507 .unwrap()
508 .validate_grammar(
509 "start",
510 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
511 )
512 .unwrap()
513 .simplify_grammar(
514 CompressionConfig {
515 min_terminals: 3,
516 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
517 },
518 &kbnf_regex_automata::util::start::Config::new()
519 .anchored(kbnf_regex_automata::Anchored::Yes),
520 );
521 assert_snapshot!(format!("{:?}", result))
522 }
523 #[test]
524 fn linked_list() {
525 let source = r#"__schema_json_1_next_0 ::= __schema_json_1;
526
527start ::= "```json\n"__schema_json_1"```\n";
528
529__schema_json_1 ::=
530 #"\\A\\{( |\n|\r|\t)*\\z"
531 "\"value\""
532 #"\\A( |\n|\r|\t)*:( |\n|\r|\t)*\\z"
533 #"\\A-?(0|[1-9]\\d*)\\z"
534 #"\\A( |\n|\r|\t)*,( |\n|\r|\t)*\\z"
535 "\"next\""
536 #"\\A( |\n|\r|\t)*:( |\n|\r|\t)*\\z"
537 __schema_json_1_next
538 #"\\A( |\n|\r|\t)*\\}\\z";
539
540__schema_json_1_next ::=
541 "null"
542 | __schema_json_1_next_0;"#;
543 let result = get_grammar(source)
544 .unwrap()
545 .validate_grammar(
546 "start",
547 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
548 )
549 .unwrap()
550 .simplify_grammar(
551 CompressionConfig {
552 min_terminals: 3,
553 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
554 },
555 &kbnf_regex_automata::util::start::Config::new()
556 .anchored(kbnf_regex_automata::Anchored::Yes),
557 );
558 assert_snapshot!(format!("{:?}", result))
559 }
560
561 #[test]
562 fn sub_strings() {
563 let source = r#"
564 S ::= B #substrs"abc" "d" | #substrs"A" "e";
565 B ::= #substrs"";
566 "#;
567 let result = get_grammar(source)
568 .unwrap()
569 .validate_grammar(
570 "S",
571 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
572 )
573 .unwrap()
574 .simplify_grammar(
575 CompressionConfig {
576 min_terminals: 2,
577 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
578 },
579 &kbnf_regex_automata::util::start::Config::new()
580 .anchored(kbnf_regex_automata::Anchored::Yes),
581 );
582 assert_snapshot!(format!("{:?}", result))
583 }
584
585 #[test]
586 fn regex_complement() {
587 let source = r#"
588 filter ::= #ex"[a-z]+";
589 "#;
590 let result = get_grammar(source).unwrap().validate_grammar(
591 "filter",
592 crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
593 )
594 .unwrap()
595 .simplify_grammar(
596 CompressionConfig {
597 min_terminals: 2,
598 regex_config: crate::regex::FiniteStateAutomatonConfig::Dfa(Config::default()),
599 },
600 &kbnf_regex_automata::util::start::Config::new(),
601 );
602 assert_snapshot!(format!("{:?}", result))
603 }
604}