Skip to main content

pred_recdec/
ast.rs

1// Predicated Recursive Descent
2
3use std::rc::Rc;
4
5type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;
6
7use crate::bnf::*;
8
9#[derive(Clone, Debug, Default)]
10/// AST node. Grammar rules have children, tokens do not.
11///
12/// The total structure of the AST is defined by the grammar that it was parsed with.
13///
14/// Why isn't this an enum? `Option<Vec<ASTNode>>` and other two-variant enums containing a Vec undergo niche optimization. If this were `enum ASTNode { Rule{...}, Token(u32) }` then it would look like you can just add a third variant (e.g. poisoned) without issue. However, doing that would actually increase the size of the ASTNode from 32 bytes to 40 bytes.
15/// 
16/// If the size is ever forced above 32 bytes (e.g. increasing token count from u32 to u64) then I'll probably change it to an enum.
17#[non_exhaustive]
18pub struct ASTNode {
19    /// If `Some`, this node is a parent/nonterminal. If `None`, this node is a token/leaf/terminal.
20    pub children : Option<Vec<ASTNode>>,
21    /// Due to error recovery, AST nodes can be marked as "poisoned".
22    ///
23    /// When a node is poisoned, its token count is XOR'd with !0u32 (all one-bits).
24    pub token_count : u32, // IF POISONED: xor with !0u32 (all one-bits)
25    /// Index into grammar.string_cache_inv, giving an `Rc<String>`.
26    ///
27    /// For parents, it's the name of the associated grammar rule.
28    ///
29    /// For tokens, it's the token content (the *actual* token contents, but what it was matched with, i.e. it's not a regex).
30    pub text : u32,
31}
32
33impl ASTNode {
34    pub fn new(children : Option<Vec<ASTNode>>, token_count : u32, text : u32) -> Self { Self { children, token_count, text } }
35    pub fn is_poisoned(&self) -> bool { self.token_count >= 0x80000000 }
36    pub fn get_real_token_count(&self) -> u32 { if self.token_count >= 0x80000000 { self.token_count ^ !0u32 } else { self.token_count } }
37}
38
39// ASTs can be deeply recursive, so we need to avoid destroying them recursively.
40// Collect all transitive children into self.
41// This is slower than normal dropping (around 2x the runtime) but much safer.
42/*
43impl Drop for ASTNode {
44    fn drop(&mut self)
45    {
46        if let Some(collected) = self.children.as_mut()
47        {
48            let mut i = 0;
49            while i < collected.len()
50            {
51                if let Some(mut c) = collected[i].children.take()
52                {
53                    collected.append(&mut c);
54                }
55                i += 1;
56            }
57        }
58    }
59}
60*/
61
62/// Result of a [`Guard`] checking if a given alternation should be taken or not.
63pub enum GuardResult {
64    /// The alternation is chosen.
65    #[allow(unused)] Accept,
66    /// The alternation is rejected.
67    #[allow(unused)] Reject,
68    /// The guard has preemptively decided the the parse is invalid at this state and position.
69    #[allow(unused)] HardError(String)
70}
71
72/// Standard "AnyMap" for putting whatever you want in it.
73#[derive(Default)]
74pub struct AnyMap { map: HashMap<std::any::TypeId, Box<dyn std::any::Any>> }
75
76impl AnyMap {
77    #[allow(unused)] pub fn insert<T: 'static>(&mut self, value: T) { self.map.insert(std::any::TypeId::of::<T>(), Box::new(value)); }
78    #[allow(unused)] pub fn get<T: 'static>(&self) -> Option<&T> { self.map.get(&std::any::TypeId::of::<T>())?.downcast_ref::<T>() }
79    #[allow(unused)] pub fn get_mut<T: 'static>(&mut self) -> Option<&mut T> { self.map.get_mut(&std::any::TypeId::of::<T>())?.downcast_mut::<T>() }
80}
81
82/// Exposable parts of current parser state.
83pub struct PrdGlobal<'a> {
84    pub (crate) guards : Rc<HashMap<String, Rc<dyn Fn(&mut PrdGlobal, &[Token], usize) -> GuardResult>>>,
85    pub (crate) hooks : Rc<HashMap<String, Rc<dyn Fn(&mut PrdGlobal, &[Token], usize, &mut Vec<ASTNode>) -> Result<usize, String>>>>,
86    
87    /// Put your impure data here.
88    #[allow(unused)] pub udata : AnyMap,
89    /// Put your impure data here (simple path for cached regexes).
90    #[allow(unused)] pub udata_r : HashMap<usize, RegexCacher>,
91    
92    #[allow(unused)] pub g : &'a Grammar,
93}
94
95/// Parser error state.
96#[derive(Clone, Debug)]
97pub struct PrdError {
98    /// Human-readable error message. Format is not guaranteed and may change arbitrarily.
99    #[allow(unused)] pub err_message : String,
100    /// How far into the token stream did we successfully parse? Note: this is NOT
101    #[allow(unused)] pub token_index : usize,
102    /// Which rule's alternations were we inside of? (Index into [`bnf::Grammar::points`](`super::bnf::Grammar::points`))
103    #[allow(unused)] pub rule : u32,
104    /// Which alternation was it?
105    ///
106    /// If `u16::MAX`, we errored out before entering an alternation.
107    ///
108    /// If past the end of the grammar point's list of alternations, then every alternation was checked and failed.
109    #[allow(unused)] pub in_alt : u16,
110    /// Where were we inside of that alternation?
111    ///
112    /// If "None", we errored on a guard/lookahead test.
113    #[allow(unused)] pub alt_progress : Option<u16>,
114    /// On behalf of what rule were we parsing for? (e.g. the parent of a `$become`)
115    #[allow(unused)] pub on_behalf_of_rule : u32,
116}
117
118pub (crate) fn pred_recdec_parse_impl_recursive(
119    global : &mut PrdGlobal,
120    gp_id : usize, tokens : &[Token], token_start : usize,
121    depth : usize
122) -> Result<ASTNode, PrdError>
123{
124    const DEPTH_LIMIT : usize = if cfg!(debug_assertions) { 300 } else { 1200 };
125    let mut g_item = &global.g.points[gp_id];
126    let mut chosen_name_id = g_item.name_id;
127    
128    let mut children = vec!();
129    let mut i = token_start;
130    
131    let mut poisoned = false;
132    let mut alt_id : usize = 0;
133    
134    macro_rules! build_err { ($x:expr, $prog:expr) => {
135        Err(PrdError {
136            err_message : $x, token_index : i, rule : g_item.name_id, on_behalf_of_rule : chosen_name_id,
137            in_alt : alt_id.wrapping_sub(1) as u16, alt_progress : $prog
138        })
139    } }
140    
141    if depth > DEPTH_LIMIT { return build_err!(format!("Exceeded recursion depth limit of {DEPTH_LIMIT}."), None); }
142    
143    #[cfg(feature = "parse_trace")] { println!("entered {} at {i}, depth {depth}", global.g.string_cache_inv[chosen_name_id as usize]); }
144    
145    // Structured this way for the sake of $become
146    // 1) We can't use an iterator because then we can't go back to alternation 0.
147    // 2) We can't have a "find alt" loop followed by a non-loop process block because then we can't go back to the "find alt" loop during $BECOME.
148    'top: while alt_id < g_item.forms.len()
149    {
150        let alt = &g_item.forms[alt_id];
151        alt_id += 1;
152        
153        if alt.matching_terms.len() == 0
154        {
155            return Ok(ASTNode::new(Some(children), (i - token_start) as u32, chosen_name_id));
156        }
157        
158        let mut term_idx = 0;
159        
160        let mut accepted = true;
161        let first_term = alt.matching_terms.get(0);
162        let first_term = first_term.as_ref().unwrap();
163        match &first_term.t
164        {
165            MatchingTermE::Guard(guard) =>
166            {
167                accepted = false;
168                if let Some(f) = global.guards.get(&**guard)
169                {
170                    let f = Rc::clone(&f);
171                    match f(global, tokens, i)
172                    {
173                        GuardResult::Accept => accepted = true,
174                        GuardResult::HardError(e) => build_err!(e, None)?,
175                        _ => {}
176                    }
177                }
178                else
179                {
180                    return build_err!(format!("Unknown guard {guard}"), None);
181                }
182                term_idx += 1;
183            }
184            MatchingTermE::Peek(loc, tester) =>
185            {
186                accepted = false;
187                let loc = (i as isize + loc) as usize;
188                if loc < tokens.len() && tokens[loc].text == *tester
189                {
190                    accepted = true;
191                }
192                term_idx += 1;
193            }
194            MatchingTermE::PeekR(loc, tester) =>
195            {
196                accepted = false;
197                let loc = (i as isize + loc) as usize;
198                //if loc < tokens.len() && tester.is_match(&global.g.string_cache_inv[tokens[loc].text as usize])
199                if loc < tokens.len() && tester.is_match_interned(tokens[loc].text, &global.g.string_cache_inv)
200                {
201                    accepted = true;
202                }
203                term_idx += 1;
204            }
205            MatchingTermE::PeekRes(loc, tester) =>
206            {
207                accepted = false;
208                let loc = (i as isize + loc) as usize;
209                //if loc < tokens.len() && tester.is_match(&global.g.string_cache_inv[tokens[loc].text as usize])
210                if loc < tokens.len() && tester.is_match_interned(tokens[loc].text, &global.g.string_cache_inv)
211                {
212                    accepted = true;
213                    if let Some(r) = &global.g.reserved
214                    {
215                        if regex_is_match(r, &global.g.string_cache_inv[tokens[loc].text as usize]) { accepted = false; }
216                    }
217                }
218                term_idx += 1;
219            }
220            MatchingTermE::Eof =>
221            {
222                accepted = i == tokens.len();
223                term_idx += 1;
224            }
225            _ => {}
226        }
227        
228        if !accepted { continue; }
229        
230        #[cfg(feature = "parse_trace")] { println!("chose variant {}", alt_id-1); }
231        
232        if children.capacity() == 0
233        {
234            children.reserve_exact(alt.matching_terms.len());
235        }
236        
237        while term_idx < alt.matching_terms.len()
238        {
239            let term = &alt.matching_terms[term_idx];
240            let mut matched = false;
241            match &term.t
242            {
243                MatchingTermE::Rule(id) =>
244                {
245                    let mut child = pred_recdec_parse_impl_recursive(global, *id, tokens, i, depth + 1);
246                    child = std::hint::black_box(child);
247                    if child.is_err() && global.g.points[*id].recover.is_some()
248                    {
249                        if let Some((r, after)) = &global.g.points[*id].recover
250                        {
251                            let mut j = i + 1;
252                            while j < tokens.len() && !r.is_match(&global.g.string_cache_inv[tokens[j].text as usize])
253                            {
254                                j += 1;
255                            }
256                            if j < tokens.len()
257                            {
258                                if *after { j += 1; }
259                                child = Ok(ASTNode::new(Some(vec!()), (j - i) as u32 ^ !0u32, global.g.points[*id].name_id));
260                            }
261                        }
262                    }
263                    const FAT_ERRS : bool = false;
264                    if FAT_ERRS
265                    {
266                        child = child.map_err(|e| {
267                            let mut e2 = e.clone(); e2.err_message = format!("In {}: {}", g_item.name, e2.err_message); e2
268                        });
269                    }
270                    let child = child?;
271                    if child.is_poisoned()
272                    {
273                        poisoned = true;
274                    }
275                    i += child.get_real_token_count() as usize;
276                    children.push(child);
277                    matched = true;
278                }
279                MatchingTermE::TermLit(lit) =>
280                {
281                    if i < tokens.len() && tokens[i].text == *lit
282                    {
283                        if !alt.pruned
284                        {
285                            children.push(ASTNode::new(None, 1, tokens[i].text));
286                        }
287                        //println!("munched {lit} at {i}");
288                        i += 1;
289                        matched = true;
290                    }
291                }
292                MatchingTermE::TermRegex(regex) => if i < tokens.len() && regex.is_match_interned(tokens[i].text, &global.g.string_cache_inv)
293                {
294                    if !alt.pruned
295                    {
296                        children.push(ASTNode::new(None, 1, tokens[i].text));
297                    }
298                    //println!("munched {} at {i}", tokens[i].text);
299                    i += 1;
300                    matched = true;
301                }
302                MatchingTermE::Directive(d) =>
303                {
304                    match d
305                    {
306                        MatchDirective::Become | MatchDirective::BecomeAs =>
307                        {
308                            if let Some(MatchingTermE::Rule(id)) = alt.matching_terms.get(term_idx + 1).map(|x| &x.t)
309                            {
310                                g_item = &global.g.points[*id];
311                                #[cfg(feature = "parse_trace")] { println!("became {} at {i}, depth {depth}", g_item.name); }
312                                alt_id = 0;
313                                if matches!(d, MatchDirective::BecomeAs)
314                                {
315                                    chosen_name_id = g_item.name_id;
316                                }
317                                continue 'top;
318                            }
319                        }
320                        MatchDirective::Rename => if let Some(MatchingTermE::Rule(id)) = alt.matching_terms.get(term_idx + 1).map(|x| &x.t)
321                        {
322                            chosen_name_id = *id as u32;
323                            term_idx += 1;
324                            matched = true;
325                        }
326                        MatchDirective::Drop => if children.len() > 0
327                        {
328                            children.pop();
329                            matched = true;
330                        }
331                        MatchDirective::DropIfEmpty => if children.len() > 0
332                        {
333                            if matches!(children.last().unwrap().children, Some(_))
334                            {
335                                children.pop();
336                            }
337                            matched = true;
338                        }
339                        MatchDirective::Hoist => if children.len() > 0
340                        {
341                            let x = children.pop().unwrap();
342                            if let Some(mut c) = x.children
343                            {
344                                children.append(&mut c);
345                            }
346                            matched = true;
347                        }
348                        MatchDirective::HoistIfUnit => if children.len() > 0
349                        {
350                            let x = children.pop().unwrap();
351                            if let Some(mut c) = x.children && c.len() == 1
352                            {
353                                children.append(&mut c);
354                            }
355                            matched = true;
356                        }
357                        MatchDirective::Any => if i < tokens.len()
358                        {
359                            children.push(ASTNode::new(None, 1, tokens[i].text));
360                            matched = true;
361                            i += 1;
362                        }
363                    }
364                }
365                MatchingTermE::Hook(name) =>
366                {
367                    if let Some(f) = global.hooks.get(&**name)
368                    {
369                        let f = Rc::clone(&f);
370                        match f(global, tokens, i, &mut children)
371                        {
372                            Ok(consumed) => { i += consumed; }
373                            Err(e) => build_err!(e, Some(term_idx as u16))?
374                        }
375                    }
376                    else
377                    {
378                        build_err!(
379                            format!("Unknown custom hook {:?} inside of {}",
380                                name, global.g.string_cache_inv[chosen_name_id as usize]
381                            ),
382                            Some(term_idx as u16)
383                        )?
384                    }
385                    matched = true;
386                }
387                _ => build_err!(
388                        format!("Term type {:?} not supported in this position in a rule (context: {})",
389                            term, global.g.string_cache_inv[chosen_name_id as usize]
390                        ),
391                        Some(term_idx as u16)
392                    )?
393            }
394            if !matched
395            {
396                build_err!(
397                    format!("Failed to match token at {i} in rule {} alt {alt_id}. Token is `{:?}`.\n{:?}",
398                        global.g.string_cache_inv[chosen_name_id as usize],
399                        tokens.get(i).map(|x| global.g.string_cache_inv[x.text as usize].clone()),
400                        tokens[token_start..tokens.len().min(token_start+15)].iter()
401                            .map(|x| global.g.string_cache_inv[x.text as usize].clone()).collect::<Vec<_>>()
402                    ),
403                    Some(term_idx as u16)
404                )?
405            }
406            term_idx += 1;
407        }
408        
409        #[cfg(feature = "parse_trace")] { println!("accepted {} from {token_start} to {i}, depth {depth}", global.g.string_cache_inv[chosen_name_id as usize]); }
410        let mut token_count = (i - token_start) as u32;
411        if poisoned
412        {
413            token_count = token_count ^ !0u32;
414        }
415        return Ok(ASTNode::new(Some(children), token_count, chosen_name_id));
416    }
417    
418    build_err!(
419        format!("Failed to match rule {} at token position {token_start}\n{:?}", global.g.string_cache_inv[chosen_name_id as usize],
420            tokens[token_start..tokens.len().min(token_start+15)].iter().map(|x| global.g.string_cache_inv[x.text as usize].clone()).collect::<Vec<_>>()
421        ),
422        Some(g_item.forms.len() as u16)
423    )
424}
425
426/// Visit the AST with a possibly-impure callback. The AST itself cannot be modified this way.
427///
428/// Takes a function that returns whether it's OK to also visit the children of this node.
429#[allow(unused)]
430pub fn visit_ast(n : &ASTNode, f : &mut dyn FnMut(&ASTNode) -> bool)
431{
432    let flag = f(n);
433    if flag && let Some(c) = &n.children
434    {
435        for c in c.iter()
436        {
437            visit_ast(c, f);
438        }
439    }
440}
441
442/// Arguments:
443/// - `&mut PrdGlobal` - context
444/// - `&[Token]` - tokenstream
445/// - `usize` - position in tokenstream.
446pub type Guard = Rc<dyn Fn(&mut PrdGlobal, &[Token], usize) -> GuardResult>;
447/// Arguments:
448/// - `&mut PrdGlobal` - context
449/// - `&[Token]` - tokenstream
450/// - `usize` - position in tokenstream.
451/// - `&mut Vec<ASTNode>` - current partially-produced AST item.
452pub type Hook = Rc<dyn Fn(&mut PrdGlobal, &[Token], usize, &mut Vec<ASTNode>) -> Result<usize, String>>;
453
454#[allow(unused)]
455/// Parse the given token stream (produced by [`bnf::tokenize`](`super::bnf::tokenize`)) into an AST, using the given [`bnf::Grammar`](`super::bnf::Grammar`), and taking the given root rule name as the starting point.
456///
457/// Guards and hooks are for advanced usage (e.g. parsing C).
458/// 
459/// See also: [`ASTNode`]
460pub fn parse(
461    g : &Grammar, root_rule_name : &str, tokens : &[Token],
462    guards : Rc<HashMap<String, Guard>>,
463    hooks : Rc<HashMap<String, Hook>>,
464) -> Result<ASTNode, PrdError>
465{
466    let gp_id = g.by_name.get(root_rule_name).unwrap();
467    let mut global = PrdGlobal {
468        guards,
469        hooks,
470        udata : <_>::default(),
471        udata_r : <_>::default(),
472        g,
473    };
474    
475    if let Some(f) = global.hooks.get("init")
476    {
477        let f = Rc::clone(&f);
478        let _ = f(&mut global, tokens, 0, &mut vec!());
479    }
480    
481    pred_recdec_parse_impl_recursive(&mut global, *gp_id, tokens, 0, 0)
482}
483
484
485#[allow(unused)]
486/// For debugging only: print out the given AST.
487pub fn print_ast_pred_recdec(ast : &ASTNode, string_cache_inv : &Vec<Rc<String>>, indent : usize)
488{
489    print!("{}", " ".repeat(indent));
490    if let Some(c) = &ast.children
491    {
492        if ast.is_poisoned() { println!("{} (POISONED) {{", ast.text); } else
493        { println!("{} {{", string_cache_inv[ast.text as usize]); }
494        for c in c.iter()
495        {
496            print_ast_pred_recdec(c, string_cache_inv, indent+1);
497        }
498        print!("{}", " ".repeat(indent));
499        println!("}};");
500    }
501    else
502    {
503        println!("{}", string_cache_inv[ast.text as usize]);
504    }
505}
506
507#[allow(unused)]
508/// For testing only: convert the AST's shape to a string.
509///
510/// Parents turn into `+<contents>-`. If poisoned (i.e. containing any error recovery), they're prefixed with `p`.
511///
512/// Leaves turn into `.`
513pub fn ast_to_shape_string(ast : &ASTNode) -> String
514{
515    let mut s = "".to_string();
516    if let Some(c) = &ast.children
517    {
518        if ast.is_poisoned() { s += "p"; }
519        s += "+";
520        for c in c.iter()
521        {
522            s += &ast_to_shape_string(c);
523        }
524        s += "-";
525    }
526    else
527    {
528        s += ".";
529    }
530    s
531}