pred_recdec/ast.rs

// Predicated Recursive Descent

use std::rc::Rc;

type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;

use crate::bnf::*;

#[derive(Clone, Debug, Default)]
/// AST node. Grammar rules have children, tokens do not.
///
/// The total structure of the AST is defined by the grammar that it was parsed with.
pub struct ASTNode {
    /// If `Some`, this node is a parent/nonterminal. If `None`, this node is a token/leaf/terminal.
    pub children : Option<Vec<ASTNode>>,
    /// Due to error recovery, AST nodes can be marked as "poisoned".
    ///
    /// When a node is poisoned, its token count is XOR'd with !0u32 (all one-bits).
    pub token_count : u32, // IF POISONED: xor with !0u32 (all one-bits)
    /// Index into grammar.string_cache_inv, giving an `Rc<String>`.
    ///
    /// For parents, it's the name of the associated grammar rule.
    ///
    /// For tokens, it's the token content (the *actual* token text, not the pattern it was matched against; i.e. it's not a regex).
    pub text : u32,
}

impl ASTNode {
    pub fn is_poisoned(&self) -> bool { self.token_count >= 0x80000000 }
    pub fn get_real_token_count(&self) -> u32 { if self.token_count >= 0x80000000 { self.token_count ^ !0u32 } else { self.token_count } }
}
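
// Illustrative sketch (not part of the original source): how the poison encoding
// behaves. A poisoned node stores its real token count XOR'd with !0u32, so the
// high bit is set and `get_real_token_count` undoes the XOR.
//
//     let healthy = ASTNode { children : None, token_count : 3, text : 0 };
//     let poisoned = ASTNode { children : Some(vec!()), token_count : 3 ^ !0u32, text : 0 };
//     assert!(!healthy.is_poisoned() && healthy.get_real_token_count() == 3);
//     assert!(poisoned.is_poisoned() && poisoned.get_real_token_count() == 3);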

// ASTs can be deeply recursive, so we need to avoid destroying them recursively.
// Collect all transitive children into self.
// This is slower than normal dropping (around 2x the runtime) but much safer.
/*
impl Drop for ASTNode {
    fn drop(&mut self)
    {
        if let Some(collected) = self.children.as_mut()
        {
            let mut i = 0;
            while i < collected.len()
            {
                if let Some(mut c) = collected[i].children.take()
                {
                    collected.append(&mut c);
                }
                i += 1;
            }
        }
    }
}
*/

/// Result of a [`Guard`] checking if a given alternation should be taken or not.
pub enum GuardResult {
    /// The alternation is chosen.
    #[allow(unused)] Accept,
    /// The alternation is rejected.
    #[allow(unused)] Reject,
    /// The guard has preemptively decided that the parse is invalid at this state and position.
    #[allow(unused)] HardError(String)
}
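
// Illustrative sketch (not part of the original source): a guard exercising all
// three results. It rejects its alternation at end of input, hard-errors on a
// hypothetical forbidden token, and accepts otherwise. Resolving interned token
// text through `global.g.string_cache_inv` assumes crate-internal field access.
//
//     let example_guard : Guard = Rc::new(|global : &mut PrdGlobal, tokens : &[Token], pos : usize| {
//         match tokens.get(pos) {
//             None => GuardResult::Reject,
//             Some(t) if &**global.g.string_cache_inv[t.text as usize] == "@" =>
//                 GuardResult::HardError("stray `@` is never valid here".to_string()),
//             Some(_) => GuardResult::Accept,
//         }
//     });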

/// Standard "AnyMap" for putting whatever you want in it.
#[derive(Default)]
pub struct AnyMap { map: HashMap<std::any::TypeId, Box<dyn std::any::Any>> }

impl AnyMap {
    #[allow(unused)] pub fn insert<T: 'static>(&mut self, value: T) { self.map.insert(std::any::TypeId::of::<T>(), Box::new(value)); }
    #[allow(unused)] pub fn get<T: 'static>(&self) -> Option<&T> { self.map.get(&std::any::TypeId::of::<T>())?.downcast_ref::<T>() }
    #[allow(unused)] pub fn get_mut<T: 'static>(&mut self) -> Option<&mut T> { self.map.get_mut(&std::any::TypeId::of::<T>())?.downcast_mut::<T>() }
}
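
// Illustrative sketch (not part of the original source): stashing arbitrary typed
// state in an AnyMap, keyed by type. `TypedefNames` is a hypothetical user type,
// e.g. something a C-parsing guard might consult via `PrdGlobal::udata`.
//
//     struct TypedefNames(Vec<String>);
//     let mut map = AnyMap::default();
//     map.insert(TypedefNames(vec!()));
//     if let Some(names) = map.get_mut::<TypedefNames>() { names.0.push("size_t".to_string()); }
//     assert!(map.get::<TypedefNames>().is_some());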

/// Exposable parts of the current parser state.
pub struct PrdGlobal<'a> {
    pub (crate) guards : Rc<HashMap<String, Rc<dyn Fn(&mut PrdGlobal, &[Token], usize) -> GuardResult>>>,
    pub (crate) hooks : Rc<HashMap<String, Rc<dyn Fn(&mut PrdGlobal, &[Token], usize, &mut Vec<ASTNode>) -> Result<usize, String>>>>,

    /// Put your impure data here.
    #[allow(unused)] pub udata : AnyMap,
    /// Put your impure data here (simple path for cached regexes).
    #[allow(unused)] pub udata_r : HashMap<usize, RegexCacher>,

    #[allow(unused)] pub (crate) g : &'a Grammar,
}

pub (crate) fn pred_recdec_parse_impl_recursive(
    global : &mut PrdGlobal,
    gp_id : usize, tokens : &[Token], token_start : usize,
    depth : usize
) -> Result<ASTNode, String>
{
    const DEPTH_LIMIT : usize = 1000;
    if depth > DEPTH_LIMIT { return Err(format!("Exceeded recursion depth limit of {DEPTH_LIMIT}.")); }
    let mut g_item = &global.g.points[gp_id];
    let mut chosen_name_id = g_item.name_id;

    let mut children = vec!();
    let mut i = token_start;

    let mut poisoned = false;

    #[cfg(feature = "parse_trace")] { println!("entered {} at {i}, depth {depth}", global.g.string_cache_inv[chosen_name_id as usize]); }

    // Structured this way for the sake of $become
    // 1) We can't use an iterator because then we can't go back to alternation 0.
    // 2) We can't have a "find alt" loop followed by a non-loop process block because then we can't go back to the "find alt" loop during $BECOME.
    let mut alt_id = 0;
    'top: while alt_id < g_item.forms.len()
    {
        let alt = &g_item.forms[alt_id];
        alt_id += 1;

        if alt.matching_terms.len() == 0
        {
            return Ok(ASTNode {
                text : chosen_name_id,
                children : Some(children),
                token_count : (i - token_start) as u32,
            });
        }

        let mut term_idx = 0;

        let mut accepted = true;
        let first_term = alt.matching_terms.get(0);
        let first_term = first_term.as_ref().unwrap();
        match &first_term.t
        {
            MatchingTermE::Guard(guard) =>
            {
                accepted = false;
                if let Some(f) = global.guards.get(&**guard)
                {
                    let f = Rc::clone(&f);
                    match f(global, tokens, i)
                    {
                        GuardResult::Accept => accepted = true,
                        GuardResult::HardError(e) => { return Err(e); }
                        _ => {}
                    }
                }
                else
                {
                    return Err(format!("Unknown guard {guard}"));
                }
                term_idx += 1;
            }
            MatchingTermE::Peek(loc, tester) =>
            {
                accepted = false;
                let loc = (i as isize + loc) as usize;
                if loc < tokens.len() && tokens[loc].text == *tester
                {
                    accepted = true;
                }
                term_idx += 1;
            }
            MatchingTermE::PeekR(loc, tester) =>
            {
                accepted = false;
                let loc = (i as isize + loc) as usize;
                //if loc < tokens.len() && tester.is_match(&global.g.string_cache_inv[tokens[loc].text as usize])
                if loc < tokens.len() && tester.is_match_interned(tokens[loc].text, &global.g.string_cache_inv)
                {
                    accepted = true;
                }
                term_idx += 1;
            }
            MatchingTermE::PeekRes(loc, tester) =>
            {
                accepted = false;
                let loc = (i as isize + loc) as usize;
                //if loc < tokens.len() && tester.is_match(&global.g.string_cache_inv[tokens[loc].text as usize])
                if loc < tokens.len() && tester.is_match_interned(tokens[loc].text, &global.g.string_cache_inv)
                {
                    accepted = true;
                    if let Some(r) = &global.g.reserved
                    {
                        if regex_is_match(r, &global.g.string_cache_inv[tokens[loc].text as usize]) { accepted = false; }
                    }
                }
                term_idx += 1;
            }
            MatchingTermE::Eof =>
            {
                accepted = i == tokens.len();
                term_idx += 1;
            }
            _ => {}
        }

        if !accepted { continue; }

        #[cfg(feature = "parse_trace")] { println!("chose variant {}", alt_id-1); }

        if children.capacity() == 0
        {
            children.reserve_exact(alt.matching_terms.len());
        }

        while term_idx < alt.matching_terms.len()
        {
            let term = &alt.matching_terms[term_idx];
            let mut matched = false;
            match &term.t
            {
                MatchingTermE::Rule(id) =>
                {
                    let mut child = pred_recdec_parse_impl_recursive(global, *id, tokens, i, depth + 1);
                    child = std::hint::black_box(child);
                    if child.is_err() && global.g.points[*id].recover.is_some()
                    {
                        if let Some((r, after)) = &global.g.points[*id].recover
                        {
                            let mut j = i + 1;
                            while j < tokens.len() && !r.is_match(&global.g.string_cache_inv[tokens[j].text as usize])
                            {
                                j += 1;
                            }
                            if j < tokens.len()
                            {
                                if *after { j += 1; }
                                child = Ok(ASTNode {
                                    text : global.g.points[*id].name_id,
                                    children : Some(vec!()),
                                    token_count : (j - i) as u32 ^ !0u32,
                                });
                            }
                        }
                    }
                    let child = child.map_err(|e| format!("In {}: {e}", g_item.name))?;
                    if child.is_poisoned()
                    {
                        poisoned = true;
                    }
                    i += child.get_real_token_count() as usize;
                    children.push(child);
                    matched = true;
                }
                MatchingTermE::TermLit(lit) =>
                {
                    if i < tokens.len() && tokens[i].text == *lit
                    {
                        if !alt.pruned
                        {
                            children.push(ASTNode {
                                text : tokens[i].text.clone(), children : None,
                                token_count : 1,
                            });
                        }
                        //println!("munched {lit} at {i}");
                        i += 1;
                        matched = true;
                    }
                }
                MatchingTermE::TermRegex(regex) => if i < tokens.len() && regex.is_match_interned(tokens[i].text, &global.g.string_cache_inv)
                {
                    if !alt.pruned
                    {
                        children.push(ASTNode {
                            text : tokens[i].text.clone(), children : None,
                            token_count : 1,
                        });
                    }
                    //println!("munched {} at {i}", tokens[i].text);
                    i += 1;
                    matched = true;
                }
                MatchingTermE::Directive(d) =>
                {
                    match d
                    {
                        MatchDirective::Become | MatchDirective::BecomeAs =>
                        {
                            if let Some(MatchingTermE::Rule(id)) = alt.matching_terms.get(term_idx + 1).map(|x| &x.t)
                            {
                                g_item = &global.g.points[*id];
                                #[cfg(feature = "parse_trace")] { println!("became {} at {i}, depth {depth}", g_item.name); }
                                alt_id = 0;
                                if matches!(d, MatchDirective::BecomeAs)
                                {
                                    chosen_name_id = g_item.name_id;
                                }
                                continue 'top;
                            }
                        }
                        MatchDirective::Any => if i < tokens.len()
                        {
                            children.push(ASTNode {
                                text : tokens[i].text.clone(), children : None,
                                token_count : 1,
                            });
                            matched = true;
                            i += 1;
                        }
                        _ => panic!("TODO: {:?}", d), // also TODO: combine into parent match once all implemented
                    }
                }
                MatchingTermE::Hook(name) =>
                {
                    if let Some(f) = global.hooks.get(&**name)
                    {
                        let f = Rc::clone(&f);
                        match f(global, tokens, i, &mut children)
                        {
                            Ok(consumed) => { i += consumed; }
                            Err(e) => Err(e)?
                        }
                    }
                    else
                    {
                        Err(format!("Unknown custom hook {:?} inside of {}", name, global.g.string_cache_inv[chosen_name_id as usize]))?
                    }
                    matched = true;
                }
                _ => Err(format!("Term type {:?} not supported in this position in a rule (context: {})", term, global.g.string_cache_inv[chosen_name_id as usize]))?
            }
            if !matched
            {
                Err(format!("Failed to match token at {i} in rule {} alt {alt_id}. Token is `{:?}`.\n{:?}",
                    global.g.string_cache_inv[chosen_name_id as usize],
                    tokens.get(i).map(|x| global.g.string_cache_inv[x.text as usize].clone()),
                    tokens[token_start..tokens.len().min(token_start+15)].iter().map(|x| global.g.string_cache_inv[x.text as usize].clone()).collect::<Vec<_>>()))?
            }
            term_idx += 1;
        }

        #[cfg(feature = "parse_trace")] { println!("accepted {} from {token_start} to {i}, depth {depth}", global.g.string_cache_inv[chosen_name_id as usize]); }
        let mut token_count = (i - token_start) as u32;
        if poisoned
        {
            token_count = token_count ^ !0u32;
        }
        return Ok(ASTNode {
            text : chosen_name_id,
            children : Some(children),
            token_count,
        });
    }

    Err(format!("Failed to match rule {} at token position {token_start}\n{:?}", global.g.string_cache_inv[chosen_name_id as usize],
        tokens[token_start..tokens.len().min(token_start+15)].iter().map(|x| global.g.string_cache_inv[x.text as usize].clone()).collect::<Vec<_>>()))
}

/// Visit the AST with a possibly-impure callback. The AST itself cannot be modified this way.
///
/// Takes a function that returns whether it's OK to also visit the children of this node.
#[allow(unused)]
pub fn visit_ast(n : &ASTNode, f : &mut dyn FnMut(&ASTNode) -> bool)
{
    let flag = f(n);
    if flag && let Some(c) = &n.children
    {
        for c in c.iter()
        {
            visit_ast(c, f);
        }
    }
}
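
// Illustrative sketch (not part of the original source): counting poisoned nodes
// with `visit_ast`, skipping the subtrees underneath them. `ast` here is assumed
// to be an `ASTNode` produced by [`parse`].
//
//     let mut poisoned_nodes = 0;
//     visit_ast(&ast, &mut |n| {
//         if n.is_poisoned() { poisoned_nodes += 1; }
//         !n.is_poisoned() // only descend into healthy subtrees
//     });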

/// Arguments:
/// - `&mut PrdGlobal` - context
/// - `&[Token]` - tokenstream
/// - `usize` - position in tokenstream.
pub type Guard = Rc<dyn Fn(&mut PrdGlobal, &[Token], usize) -> GuardResult>;
/// Arguments:
/// - `&mut PrdGlobal` - context
/// - `&[Token]` - tokenstream
/// - `usize` - position in tokenstream.
/// - `&mut Vec<ASTNode>` - current partially-produced AST item.
///
/// Returns `Ok(n)`, where `n` is the number of tokens the hook consumed.
pub type Hook = Rc<dyn Fn(&mut PrdGlobal, &[Token], usize, &mut Vec<ASTNode>) -> Result<usize, String>>;
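
// Illustrative sketch (not part of the original source): registering a hook under
// a hypothetical name. This one pushes a synthetic leaf node and reports that it
// consumed one token; a real hook could also stash state in `PrdGlobal::udata`.
//
//     let mut hooks : HashMap<String, Hook> = HashMap::default();
//     hooks.insert("eat_one_token".to_string(), Rc::new(
//         |_global : &mut PrdGlobal, tokens : &[Token], pos : usize, children : &mut Vec<ASTNode>| {
//             let tok = tokens.get(pos).ok_or_else(|| "hook ran at end of input".to_string())?;
//             children.push(ASTNode { text : tok.text, children : None, token_count : 1 });
//             Ok(1)
//         }));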

#[allow(unused)]
/// Parse the given token stream (produced by [`bnf::tokenize`](`super::bnf::tokenize`)) into an AST, using the given [`bnf::Grammar`](`super::bnf::Grammar`), and taking the given root rule name as the starting point.
///
/// Guards and hooks are for advanced usage (e.g. parsing C).
/// If a hook named `init` is registered, it is called once before parsing begins (at position 0, with a throwaway node list) and its result is ignored.
///
/// See also: [`ASTNode`]
pub fn parse(
    g : &Grammar, root_rule_name : &str, tokens : &[Token],
    guards : Rc<HashMap<String, Guard>>,
    hooks : Rc<HashMap<String, Hook>>,
) -> Result<ASTNode, String>
{
    let gp_id = g.by_name.get(root_rule_name).ok_or_else(|| format!("Unknown root rule `{root_rule_name}`"))?;
    let mut global = PrdGlobal {
        guards,
        hooks,
        udata : <_>::default(),
        udata_r : <_>::default(),
        g,
    };

    if let Some(f) = global.hooks.get("init")
    {
        let f = Rc::clone(&f);
        let _ = f(&mut global, tokens, 0, &mut vec!());
    }

    pred_recdec_parse_impl_recursive(&mut global, *gp_id, tokens, 0, 0)
}
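
// Illustrative sketch (not part of the original source): driving `parse` end to
// end. How `g : Grammar` and `tokens : Vec<Token>` are produced is elided here
// (see the `bnf` module); "program" is a hypothetical root rule name.
//
//     let guards : Rc<HashMap<String, Guard>> = Rc::new(HashMap::default());
//     let hooks : Rc<HashMap<String, Hook>> = Rc::new(HashMap::default());
//     match parse(&g, "program", &tokens, guards, hooks) {
//         Ok(ast) => print_ast_pred_recdec(&ast, &g.string_cache_inv, 0),
//         Err(e) => eprintln!("parse error: {e}"),
//     }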


#[allow(unused)]
/// For debugging only: print out the given AST.
pub fn print_ast_pred_recdec(ast : &ASTNode, string_cache_inv : &Vec<Rc<String>>, indent : usize)
{
    print!("{}", " ".repeat(indent));
    if let Some(c) = &ast.children
    {
        if ast.is_poisoned() { println!("{} (POISONED) {{", string_cache_inv[ast.text as usize]); } else
        { println!("{} {{", string_cache_inv[ast.text as usize]); }
        for c in c.iter()
        {
            print_ast_pred_recdec(c, string_cache_inv, indent+1);
        }
        print!("{}", " ".repeat(indent));
        println!("}};");
    }
    else
    {
        println!("{}", string_cache_inv[ast.text as usize]);
    }
}