kbnf_syntax/
lib.rs

1//! ebnf - A successor bnf parsing library of bnf parsing library, for parsing Extended Backus–Naur form context-free grammars
2//!
3//! The code is available on [GitHub](https://github.com/ChAoSUnItY/ebnf)
4//!
5//! ## Disclaimer:
6//! There are various variants of EBNF, which uses somewhat different syntactic conventions. This library
7//! takes [EBNF Evaluator](https://mdkrajnak.github.io/ebnftest/)'s example code as standard, which has
8//! almost most syntactic conventions on Wikipedia's page.
9//!
10//! ## What does a valid EBNF grammar looks like?
11//!
12//! The following example is taken from EBNF Evaluator:
13//!
14//! ```ebnf
15//! filter ::= ( first ' ' )? ( number '~ ' )? ( number '-' number ) ( ' ' number '~' )? ( ' hz' )? ( ' b' )? ( ' i' )? ( ' a' )?;
16//! first  ::= #'[a-za-z][a-za-z0-9_+]*';
17//! number ::= digits ( ( '.' | ',' ) digits? )?;
18//! digits ::= #'[0-9]+';
19//! ```
20//!
21//! ## How to use this library?
22//!
23//! ```rust
24//! extern crate kbnf_syntax;
25//!
26//! fn main() {
27//!     let source = r"
28//!         filter ::= ( first ' ' )? ( number '~ ' )? ( number '-' number ) ( ' ' number '~' )? ( ' hz' )? ( ' b' )? ( ' i' )? ( ' a' )?;
29//!         first  ::= #'[a-za-z][a-za-z0-9_+]*';
30//!         number ::= digits ( ( '.' | ',' ) digits? )?;
31//!         digits ::= #'[0-9]+';
32//!     ";
33//!
34//!     let result = kbnf_syntax::get_grammar(source);
35//! }
36//! ```
37
38extern crate alloc;
39extern crate nom;
40extern crate parse_hyperlinks;
41use std::{iter::zip, mem};
42
43use expression::{Expression, ExpressionWithID};
44pub use grammar::Grammar;
45use node::NodeWithID;
46pub use node::{Node, RegexExtKind, SymbolKind};
47use string_interner::{backend::StringBackend, symbol::SymbolU32, StringInterner};
48
49pub mod config;
50mod expression;
51pub mod grammar;
52pub mod node;
53mod parser;
54pub mod regex;
55pub mod semantic_error;
56pub mod simplified_grammar;
57pub mod suffix_automaton;
58pub mod utils;
59pub mod validated_grammar;
60
61#[derive(Debug, Clone)]
62pub struct InternedStrings {
63    pub nonterminals: StringInterner<StringBackend<SymbolU32>>,
64    pub terminals: StringInterner<StringBackend<SymbolU32>>,
65    pub regex_strings: StringInterner<StringBackend<SymbolU32>>,
66    pub sub_strings: StringInterner<StringBackend<SymbolU32>>,
67}
68
69/// Get and parse EBNF grammar source into [Grammar], returns [Err] when given grammar is invalid.
70///
71/// # Example
72///
73/// ```rust
74/// use kbnf_syntax::get_grammar;
75///
76/// let grammar_literal = r"
77///     term ::= '1';
78/// ";
79/// let grammar = get_grammar(grammar_literal)?;
80///
81/// # Ok::<(), nom::Err<nom::error::VerboseError<&str>>>(())
82/// ```
83pub fn get_grammar(input: &str) -> Result<Grammar, nom::Err<nom::error::VerboseError<&str>>> {
84    let (interned_strings, expressions) = intern_strings(parser::parse_expressions(input)?.1);
85    Ok(Grammar {
86        interned_strings,
87        expressions,
88    })
89}
90
91fn intern_strings(expressions: Vec<Expression>) -> (InternedStrings, Vec<ExpressionWithID>) {
92    let mut nonterminals = StringInterner::<StringBackend<SymbolU32>>::new();
93    let mut terminals = StringInterner::<StringBackend<SymbolU32>>::new();
94    let mut regex_strings = StringInterner::<StringBackend<SymbolU32>>::new();
95    let mut sub_strings = StringInterner::<StringBackend<SymbolU32>>::new();
96    let mut new_expressions = vec![];
97    for expression in expressions {
98        let lhs = nonterminals.get_or_intern(expression.lhs);
99        let mut rhs = NodeWithID::Unknown;
100        let node = expression.rhs;
101        let mut stack = vec![(node, &mut rhs)];
102        while let Some((mut node, parent)) = stack.pop() {
103            match &mut node {
104                Node::Terminal(terminal) => {
105                    let terminal = std::mem::take(terminal);
106                    *parent = NodeWithID::Terminal(terminals.get_or_intern(&terminal));
107                }
108                Node::RegexString(regex_string) => {
109                    let regex_string = std::mem::take(regex_string);
110                    *parent = NodeWithID::RegexString(regex_strings.get_or_intern(&regex_string));
111                }
112                Node::EarlyEndRegexString(regex_string) => {
113                    let regex_string = std::mem::take(regex_string);
114                    *parent =
115                        NodeWithID::EarlyEndRegexString(regex_strings.get_or_intern(&regex_string));
116                }
117                Node::RegexComplement(regex_complement) => {
118                    let regex_complement = std::mem::take(regex_complement);
119                    *parent = NodeWithID::RegexComplement(
120                        regex_strings.get_or_intern(&regex_complement),
121                    );
122                }
123                Node::Substrings(substrings) => {
124                    let substrings = std::mem::take(substrings);
125                    *parent = NodeWithID::Substrings(sub_strings.get_or_intern(&substrings));
126                }
127                Node::Nonterminal(nonterminal) => {
128                    let nonterminal = std::mem::take(nonterminal);
129                    *parent = NodeWithID::Nonterminal(nonterminals.get_or_intern(&nonterminal));
130                }
131                Node::Multiple(nodes) => {
132                    let nodes = std::mem::take(nodes);
133                    let mut buffer = Vec::with_capacity(nodes.len());
134                    buffer.resize(nodes.len(), NodeWithID::Unknown);
135                    *parent = NodeWithID::Multiple(buffer);
136                    match parent {
137                        NodeWithID::Multiple(new_nodes) => {
138                            for (node, new_parent) in zip(nodes, new_nodes.iter_mut()) {
139                                stack.push((node, new_parent));
140                            }
141                        }
142                        _ => unreachable!(),
143                    }
144                }
145                Node::RegexExt(node, e) => {
146                    let node = mem::replace(node, Box::new(Node::Terminal(String::new())));
147                    *parent = NodeWithID::RegexExt(Box::new(NodeWithID::Unknown), *e);
148                    match parent {
149                        NodeWithID::RegexExt(new_node, _) => {
150                            stack.push((*node, new_node));
151                        }
152                        _ => unreachable!(),
153                    }
154                }
155                Node::Symbol(lhs, symbol, rhs) => {
156                    let lhs = mem::replace(lhs, Box::new(Node::Terminal(String::new())));
157                    // SAFETY: The `mem::forget(node)` after the match ensures we do not double free the rhs box
158                    let rhs = mem::replace(rhs, Box::new(Node::Terminal(String::new())));
159                    *parent = NodeWithID::Symbol(
160                        Box::new(NodeWithID::Unknown),
161                        *symbol,
162                        Box::new(NodeWithID::Unknown),
163                    );
164                    match parent {
165                        NodeWithID::Symbol(l, _, r) => {
166                            stack.push((*lhs, l));
167                            stack.push((*rhs, r));
168                        }
169                        _ => unreachable!(),
170                    }
171                }
172                Node::Group(node) => {
173                    let node = mem::replace(node, Box::new(Node::Terminal(String::new())));
174                    *parent = NodeWithID::Group(Box::new(NodeWithID::Unknown));
175                    match parent {
176                        NodeWithID::Group(new_node) => {
177                            stack.push((*node, new_node));
178                        }
179                        _ => unreachable!(),
180                    }
181                }
182            }
183        }
184        new_expressions.push((lhs, rhs));
185    }
186    (
187        InternedStrings {
188            nonterminals,
189            terminals,
190            regex_strings,
191            sub_strings,
192        },
193        new_expressions
194            .into_iter()
195            .map(|(lhs, rhs)| ExpressionWithID { lhs, rhs })
196            .collect(),
197    )
198}