//! # Predicated Recursive Descent
//!
//! aka pred recdec
//!
//! It is **VERY HIGHLY RECOMMENDED** that you use the mimalloc crate (or some other high-performance memory allocator) when using this library:
//!
//! ```
//! # {} /*
//! use mimalloc::MiMalloc;
//! #[global_allocator]
//! static GLOBAL: MiMalloc = MiMalloc;
//! # */
//! ```
//!
//! You want:
//! - [bnf::bnf_to_grammar]
//! - [bnf::Grammar]
//! - [bnf::tokenize]
//! - [bnf::Token]
//! - [ast::ASTNode]
//! - [ast::parse]
//!
//! This library provides a way to write and run [BNF](https://en.wikipedia.org/wiki/Backus–Naur_form) grammars with annotations that make them behave like a handwritten recursive descent parser.
//!
//! The structure of the BNF corresponds to the structure of the resulting ASTs. Tokenization is handled automatically; you do not need to write a lexical grammar. The tokenizer handles comments, whitespace, maximal munch, and even regex tests. Unless you're trying to do something silly like Lua's context-sensitive long brackets for comments, that's all you need.
//!
//! The resulting parser is scannerful but tolerant of soft keywords. It performs no memoization or backtracking. Impure hooks are safe. There's no lookahead generation or guessing: the parser only does exactly what you specify in the BNF, in order.
//!
//! This is strong enough to parse C. Yes, the language with the super horrible grammar that needs arbitrary lookahead and state munging to parse correctly.
//!
//! This is *not* a PEG or packrat thing.
//!
//! Performance: between 50% and 400% slower than a handwritten native-machine-code parser; the more complex the grammar, the smaller the loss. Totally usable!
//!
//! Simple example (totally not lisp), capable of parsing the input `(a b (q x)kfwaiei i  9 (af0f1a) () () )`:
//!
//! ```text
//! S ::= @peek(0, "(") parenexpr
//! parenexpr ::=
//!     @peek(1, ")") "(" ")"
//!     | "(" itemlist ")"
//! itemlist ::=
//!     @peek(0, ")") #end of list
//!     | item $become itemlist
//! item ::=
//!     @peek(0, "(") parenexpr
//!     | @auto r`[a-zA-Z_][a-zA-Z_0-9]*|[0-9\.]*`r
//! ```
//!
//! ## Example usage
//!
//! ```
//! # {} /*
//! let mut g = bnf_to_grammar(&grammar_source).unwrap();
//! let tokens = tokenize(&mut g, &test_source);
//! let tokens = tokens.unwrap();
//!
//! use std::rc::Rc;
//! let ast = parse(&g,
//!     "S", // Name of your root-most rule
//!     &tokens[..],
//!     Rc::new(<_>::default()), // guards (see test in `src/c.rs` for a detailed usage example)
//!     Rc::new(<_>::default()) // hooks (same)
//! );
//!
//! if let Ok(ast) = &ast
//! {
//!     print_ast_pred_recdec(ast, &g.string_cache_inv, 0);
//! }
//! drop(ast.unwrap());
//! # */
//! ```
//!
//! ## BNF Extensions
//!
//! Mini glossary: nonterminal = "call of another rule", terminal = "immediate match of a token's contents".
//!
//! Common EBNF syntax like `[]` or `()?` is not supported. Don't worry, you can make flat lists just fine.
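//!
//! For example, EBNF's `[]` (optionality) can be written as a guarded alternative with an empty fallback rule, and `()*` (repetition) as a `$become` tail call like the `itemlist` rule above; both use extensions described below. A minimal sketch of the optional case, with illustrative rule names:
//!
//! ```text
//! else_opt ::=
//!     @auto "else" block
//!     | NOTHING
//! NOTHING ::=
//! ```
//!
//! (If the empty `NOTHING` node bothers you, `$drop_empty` it; see the directives list below.)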
//!
//! Extensions from pure BNF are:
//!
//! `\` - End-of-line escaping, so you can write multiline regexes.
//!
//! `#` - Comments until the end of the line, outside of strings/regexes.
//!
//! `"...` - Terms starting with `"` are inline strings. They register with the tokenizer and check the entire body of a token. (Strings are interned, so this is O(1).)
//!
//! Terms of the form ```r`...`r```, ```R`...`r```, or ```A`...`r``` are inline regexes:
//! - ```r`...`r``` registers with the tokenizer and does a full token match (i.e. it's given an implicit trailing `\z` during token matching).
//! - ```R`...`r``` DOES NOT register with the tokenizer, but does a full token match (i.e. it's given an implicit trailing `\z`). This is provided as an optimization: sometimes you want to combine tokenizer regexes into a single regex for performance reasons, which should be done manually.
//! - ```A`...`r``` DOES NOT register with the tokenizer, and only checks against the start of the token. This is provided purely as an optimization: ```R`...`r``` is strictly more powerful.
//!
//! Regex results are cached, so checking them is amortized O(1).
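//!
//! For illustration, here is one hypothetical rule per terminal form (the rule names and regexes are made up):
//!
//! ```text
//! number ::= @auto r`[0-9]+\.?[0-9]*`r  # registered with the tokenizer, full-token match
//! ident ::= R`[a-zA-Z_][a-zA-Z_0-9]*`r  # full-token match, NOT registered with the tokenizer
//! identish ::= A`[a-zA-Z_]`r  # prefix-only check: any token that starts like an identifier
//! loop_kw ::= "while"  # interned string terminal
//! ```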
//!
//! Terms beginning with `!` currently have only one kind:
//! - `!hook`, e.g. `!hook(fix_infix_expr)` - A call to user-provided code. This is **allowed to be impure**, e.g. for managing **typedef symbol tables**.
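//!
//! A hook call sits inline in a production and runs when the parse reaches that point. An illustrative sketch (the rule and hook names here are made up; see the test in `src/c.rs` for real usage):
//!
//! ```text
//! declaration ::= decl_specifiers init_declarator_list ";" !hook(register_typedef_names)
//! ```
//!
//! The code behind the name is supplied through the hooks argument of [ast::parse].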
//!
//! Terms beginning with `@` are guards/predicates; a combined example follows the list. They determine, at the start of a given alternation/production, whether it is valid. If true, that production is taken, and none of the others will be attempted (at this location). Remember, there's no backtracking or magic.
//! - `@peek(N, STRING)` - Checks whether the token `N` ahead of the parse position contains the given text.
//! - `@peekr(N, REGEX)` - Same, but with a regex.
//! - `@auto item` - Desugars to `@peek(0, item) item` or `@peekr(0, item) item` automatically.
//! - `@eof` - True only at the end of the token stream (see the `EOF ::= @eof` rules in the tests below).
//! - `@guard(name)` - Calls user-provided code to determine if the production is accepted. This is allowed to be impure, but you should be careful with it, since no path has been committed yet.
//! - `@recover` - Pseudo-guard that is never attempted as a predicate. Instead, it tells the associated grammar rule that if it fails, it's allowed to recover (into a poisoned state) by seeking for a particular set of tokens.
//! - `@recover_before` - Same, but stops right before accepting any seek token instead of consuming it.
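//!
//! Putting the predicates together, a statement rule might look like this (hypothetical rule and guard names):
//!
//! ```text
//! statement ::=
//!     @peek(0, "{") block
//!     | @auto "if" if_tail
//!     | @guard(looks_like_decl) declaration
//!     | expr_statement
//! ```
//!
//! Alternatives are tried top to bottom; the first one whose predicate holds is committed to, and a predicate-free alternative like the last one acts as the fallback.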
//!
//! Terms starting with `$` are directives (a small example follows the list):
//! - `$become nonterminal` performs a tail call, keeping the current AST node name. This can be used to build lists without smashing the stack.
//! - `$become_as nonterminal` performs a tail call, replacing the current AST node's name with that of the target.
//! - `$any` matches and includes any one token as a terminal.
//! - `$pruned` specifies that this particular production doesn't generate AST nodes for bare terminals. This is useful for reducing AST bloat. For example, `@peek(1, ")") "(" ")" $pruned` is a non-empty production but produces zero AST nodes.
//! - `$hoist` deletes the most recent child and moves its children into the current AST node's child list.
//! - `$hoist_unit` does the same, but only if the child has exactly one child.
//! - `$drop` deletes the most recent child.
//! - `$drop_empty` does the same, but only if the child has exactly zero children.
//! - `$rename nonterminal` renames the current AST node, giving it the same name as `nonterminal`, but does NOT invoke a parse of `nonterminal` (i.e. `nonterminal` itself is skipped over).
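//!
//! As a small example of how directives shape the AST (hypothetical rules):
//!
//! ```text
//! call ::= ident "(" args ")" $pruned  # "(" and ")" emit no AST leaves
//! args ::=
//!     @peek(0, ")") #end of the argument list
//!     | expr $become args  # tail call: arguments become a flat sibling list
//! ```
//!
//! Here `call` ends up with one `ident` child and one `args` child whose children are the arguments laid out flat, with no punctuation leaves and no right-leaning chain of nested `args` nodes.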
//!
//! You'll note that there's no "negative rule-match-check predicate" extension (e.g. no "parse A, but only if it doesn't also parse B"). This is by design. Rule-level negation is way too powerful, and requires an extremely sophisticated parser generator (e.g. packrat) to handle both correctly and cheaply. For any reasonably simple implementation, it would be incompatible with impure hooks. `__RESERVED_WORDS`, described below, is the only exception, because it's easy to define in a sane way.
//!
//! For negative token predicates (peeks), you can refactor the grammar slightly or, if it's a particularly complicated peek, write a custom guard. So this isn't a limitation.
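//!
//! For example, "consume every token up to, but not including, the next `}`" needs no negative predicate; handle the excluded token in an earlier guarded alternative and let the fallback see everything else (illustrative rules):
//!
//! ```text
//! items ::=
//!     @peek(0, "}") #stop: the excluded case is handled first
//!     | $any $become items
//! ```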
//!
//! ## Magic pseudo-rules
//!
//! The following magic pseudo-rule names are available (e.g. `__COMMENTS ::= //`); an example prologue follows the list:
//!
//! - `__BRACKET_PAIRS` e.g. `::= ( ) | { }` - Tell the tokenizer to pair up these tokens with each other so that hooks can skip over their contents in O(1) time.
//! - `__COMMENTS` e.g. `::= "//" | "#"` - Tell the tokenizer that this is a kind of single-line comment.
//! - `__COMMENT_PAIRS` e.g. `::= /* */` - Tell the tokenizer that this is a kind of pair-based comment.
//! - `__COMMENT_PAIRS_NESTED` - Same, but nestable, like in Rust.
//! - `__COMMENT_REGEXES` - Same, but formed as a regex. These are slower than the above, because the Rust `regex` crate doesn't have a JIT.
//! - `__RESERVED_WORDS` e.g. `::= auto break case` - Specifies a list of token contents that are not allowed to be "accepted" by regex terminals like ```r`[a-zA-Z_]+`r```.
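//!
//! For a C-like language, a grammar prologue using these pseudo-rules might look like this (an illustrative fragment, not a complete grammar):
//!
//! ```text
//! __BRACKET_PAIRS ::= ( ) | [ ] | { }
//! __COMMENTS ::= "//"
//! __COMMENT_PAIRS ::= /* */
//! __RESERVED_WORDS ::= auto break case char const continue default do
//! ```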

pub mod bnf;
pub mod ast;
mod json;

// Fast, non-cryptographic hash builder used for the crate's internal hash maps.
pub(crate) use rustc_hash::FxBuildHasher as HashBuilder;

#[cfg(test)]
mod test {
    #[test]
    fn test_lib() {
        use crate::*;
        use bnf::*;
        use ast::*;
        
        let grammar_source = r#"
    __BRACKET_PAIRS ::= ( )
    S ::= @peek(0, "(") parenexpr $become EOF
    EOF ::= @eof
    parenexpr ::=
        @peek(1, ")") "(" ")" $pruned
        | "(" $become itemlist $pruned
    itemlist ::=
        @peek(0, ")") $pruned ")"
        | @peek(0, "(") parenexpr $become itemlist
        | @auto r`[a-zA-Z_][a-zA-Z_0-9]*|[0-9\.]*`r $become itemlist
        "#;
        
        let test_source = r#"
        (a b (q x)kfwaiei i  9 (af0f1a) () () )
        "#;
        
        let mut g = bnf_to_grammar(&grammar_source).unwrap();
        let tokens = tokenize(&mut g, &test_source);
        let tokens = tokens.unwrap();
        
        use std::rc::Rc;
        let ast = parse(&g, "S", &tokens[..], Rc::new(<_>::default()), Rc::new(<_>::default()));
        
        if let Ok(ast) = &ast
        {
            print_ast_pred_recdec(ast, &g.string_cache_inv, 0);
        }
        drop(ast.unwrap());
        
        
        
        let test_source = r#"
        ( a ) a
        "#;
        
        let tokens = tokenize(&mut g, &test_source);
        let tokens = tokens.unwrap();
        let ast = parse(&g, "S", &tokens[..], Rc::new(<_>::default()), Rc::new(<_>::default()));
        //println!("{:?}", ast);
        let ast = ast.unwrap_err();
        assert_eq!(ast.token_index, 3);
        assert_eq!(ast.rule, 2);
        assert_eq!(ast.in_alt, 0);
        assert_eq!(ast.alt_progress, Some(1));
        assert_eq!(ast.on_behalf_of_rule, 1);
        
        
        
        let test_source = r#"    ( a ) ) "#;
        
        let tokens = tokenize(&mut g, &test_source);
        let tokens = tokens.unwrap_err();
        //println!("{:?}", tokens);
        assert_eq!(tokens.produced, 3);
        assert_eq!(tokens.location, 10);
        assert_eq!(tokens.pairing_error, Some((")".to_string(), false)));
        
        
        
        let grammar_source = r#"
    S ::= NUL a2 a $drop a a $hoist a $drop_empty NUL $drop_empty a2 a $hoist_unit a2 $hoist_unit EOF $drop
    EOF ::= @eof
    NUL ::=
    a ::= "a"
    a2 ::= "a" "a" $rename ax
    ax ::= #dummy
        "#;
        let mut g = bnf_to_grammar(&grammar_source).unwrap();
        
        let test_source = r#"aaa a a aa a aaa"#;
        
        let tokens = tokenize(&mut g, &test_source);
        let tokens = tokens.unwrap();
        let ast = parse(&g, "S", &tokens[..], Rc::new(<_>::default()), Rc::new(<_>::default()));
        let ast = ast.unwrap();
        print_ast_pred_recdec(&ast, &g.string_cache_inv, 0);
        let s = ast_to_shape_string(&ast);
        println!("{}", s);
        assert_eq!(s, "++-+..-+.-.+..-.-");
        
        assert_eq!(*g.string_cache_inv[ast.children.unwrap()[1].text as usize], "ax");
    }
}