//! pred_recdec/bnf.rs
1type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;
2type HashSet<K> = std::collections::HashSet::<K, crate::HashBuilder>;
3use std::rc::Rc;
4use std::cell::RefCell;
5use regex::Regex as Regex;
6//use regex::bytes::Regex as Regex;
7//use pcre2::bytes::Regex as Regex;
8
9// Yes this is normal for low-level parsers.
10// No the rust stdlib does not have an equivalent function.
/// Decode the single `char` that starts at byte offset `i` of `s`.
///
/// Panics if `i` is out of bounds or not on a UTF-8 character boundary —
/// callers are expected to pass boundary offsets they computed themselves.
pub (crate) fn get_char_at_byte(s : &str, i : usize) -> char
{
    let tail = &s[i..];
    tail.chars().next().unwrap()
}
/// Compile `s` into a [`Regex`].
///
/// Single construction point so the regex backend (regex / regex::bytes /
/// pcre2) can be swapped by toggling the commented lines here and in the
/// `Regex` import at the top of the file.
pub (crate) fn new_regex(s : &str) -> Result<Regex, regex::Error>
//pub (crate) fn new_regex(s : &str) -> Result<Regex, pcre2::Error>
{
    Regex::new(s)
    //regex::RegexBuilder::new(s).unicode(false).build()
    //pcre2::bytes::RegexBuilder::new().jit(true).build(s)
}
22
/// Find the first match of `r` in `s`, if any.
///
/// Thin wrapper so the regex backend can be swapped in one place; the
/// commented signatures/bodies are the alternate backends.
pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::Match<'a>>
//pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<pcre2::bytes::Match<'a>>
//pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::bytes::Match<'a>>
{
    //r.find(s.as_bytes())
    //r.find(s.as_bytes()).unwrap()
    r.find(s)
}
31pub (crate) fn regex_is_match<'a>(r : &Regex, s : &'a str) -> bool
32{
33    //r.is_match(s.as_bytes())
34    //r.is_match(s.as_bytes()).unwrap()
35    r.is_match(s)
36}
37
38
#[derive(Debug, Clone, Hash)]
/// Hashable key wrapper around `Rc<T>`, used to key match caches by interned string.
///
/// The derived `Hash` delegates to `T`'s hash; equality (implemented below in
/// this file) adds a pointer-identity fast path, which stays consistent with
/// the derived hash because pointer-equal `Rc`s hold the same value.
pub (crate) struct K<T> {
    // The wrapped shared value.
    pub (crate) k : Rc<T>
}
43
44impl<T : PartialEq> PartialEq for K<T> {
45    fn eq(&self, other: &Self) -> bool {
46        if Rc::as_ptr(&self.k) == Rc::as_ptr(&other.k) { return true; }
47        self.k == other.k
48    }
49}
50impl<T : Eq> Eq for K<T> { }
51
#[derive(Debug, Clone)]
/// A [`Regex`] bundled with memoization tables for match results.
///
/// Cloning is cheap in the sense that the caches live behind `Rc`, so clones
/// share cached results (the `Regex` itself is cloned).
pub struct RegexCacher {
    // The compiled regex whose results are memoized.
    r : Regex,
    // is_match results keyed by string value, with a pointer fast path via [`K`].
    cache : Rc<RefCell<HashMap<K<String>, bool>>>,
    // is_match results keyed by interned string ID.
    cache2 : Rc<RefCell<HashMap<u32, bool>>>,
}
58
59impl RegexCacher {
60    /// Wrap a Regex with a fast-path cache that checks if a given known input is already known.
61    ///
62    /// This is faster than whatever caching Regex does internally because of string interning.
63    #[allow(unused)]
64    pub fn new(r : Regex) -> RegexCacher
65    {
66        let cache = Rc::new(RefCell::new(HashMap::default()));
67        let cache2 = Rc::new(RefCell::new(HashMap::default()));
68        RegexCacher { r, cache, cache2 }
69    }
70    pub (crate) fn new_with_pool(r : Regex, cache_pool : &mut HashMap<String, (Rc<RefCell<HashMap<K<String>, bool>>>, Rc<RefCell<HashMap<u32, bool>>>)>) -> RegexCacher
71    {
72        let s = r.as_str().to_string();
73        let mut cache = Rc::new(RefCell::new(HashMap::default()));
74        let mut cache2 = Rc::new(RefCell::new(HashMap::default()));
75        if let Some(cached) = cache_pool.get(&s)
76        {
77            cache = Rc::clone(&cached.0);
78            cache2 = Rc::clone(&cached.1);
79        }
80        else
81        {
82            cache_pool.insert(s.clone(), (cache.clone(), cache2.clone()));
83        }
84        
85        RegexCacher { r, cache, cache2 }
86    }
87    /// Does the regex match the string?
88    pub fn is_match(&self, s : &Rc<String>) -> bool
89    {
90        let mut cache = self.cache.borrow_mut();
91        let k = K { k : Rc::clone(s) };
92        if let Some(result) = cache.get(&k) { return *result; }
93        let ret = regex_is_match(&self.r, &*s);
94        cache.insert(k, ret);
95        ret
96    }
97    /// Does the regex match the string based on its interned ID? See also: [`Grammar::string_cache_inv`]
98    pub fn is_match_interned(&self, i : u32, string_cache_inv : &Vec<Rc<String>>) -> bool
99    {
100        let mut cache = self.cache2.borrow_mut();
101        if let Some(result) = cache.get(&i) { return *result; }
102        let s = &string_cache_inv[i as usize];
103        let ret = regex_is_match(&self.r, &*s);
104        cache.insert(i, ret);
105        ret
106    }
107}
108
#[derive(Default)]
/// Produced by [`bnf_to_grammar`].
///
/// Next step: [`tokenize`].
pub struct Grammar {
    /// Arena for all grammar points in the grammar.
    pub points: Vec<GrammarPoint>,
    /// Lookup table for grammar point IDs (indexes in the arena) by their name.
    pub by_name: HashMap<String, usize>,
    
    // All string-literal terminals collected from the rules (sorted); consumed by the tokenizer.
    pub (crate) literals: Vec<String>,
    // For each tokenizing r`..`r terminal: (start-anchored regex used while scanning,
    // full-match RegexCacher used while parsing). Built at the end of grammar_convert.
    pub (crate) regexes: Vec<(Regex, RegexCacher)>,
    
    /// String interning cache: from string to interned ID.
    pub string_cache : HashMap<String, u32>,
    /// Inverse string interning cache. Index = string ID. The given `Rc<String>` is the canonical object for that interned string.
    pub string_cache_inv : Vec<Rc<String>>,
    
    // (opener, closer) literal pairs from the __BRACKET_PAIRS pseudo-rule.
    pub (crate) bracket_pairs : Vec<(String, String)>,
    // Comment opener literals from __COMMENTS. NOTE(review): presumably
    // rest-of-line comments — confirm against the tokenizer's comment handling.
    pub (crate) comments : Vec<String>,
    // (opener, closer) comment delimiter pairs from __COMMENT_PAIRS.
    pub (crate) comment_pairs : Vec<(String, String)>,
    // Like comment_pairs but from __COMMENT_PAIRS_NESTED; presumably allows nesting — confirm in tokenizer.
    pub (crate) comment_pairs_nested : Vec<(String, String)>,
    // Start-anchored (\A-prefixed) regexes from __COMMENT_REGEXES.
    pub (crate) comment_regexes : Vec<Regex>,
    // Longest-first, fully-anchored alternation regex over __RESERVED_WORDS, if that pseudo-rule was given.
    pub (crate) reserved : Option<Regex>,
}
134
#[derive(Debug, Clone)]
/// A grammar rule.
/// 
/// More specifically, every production/alternation associated with a given name, in order. Alternations are stored and tested in the same order as written in the grammar.
pub struct GrammarPoint {
    /// Name of the grammar point (LHS).
    pub name: Rc<String>,
    /// Interned string ID of the name (index into [`Grammar::string_cache_inv`]).
    /// Note: NOT the arena index in [`Grammar::points`]; other strings may be
    /// interned before rule names, so the two numberings differ.
    pub name_id: u32,
    /// List of productions/alternations under this grammar point's LHS
    pub forms: Vec<Alternation>,
    // Error-recovery regex from an @recover / @recover_before term in any of this
    // rule's alternations; the bool is true for the @recover spelling (see grammar_convert).
    pub (crate) recover: Option<(RegexCacher, bool)>,
}
148
#[derive(Debug, Clone)]
/// A particular production of a grammar rule.
///
/// Named "Alternation" because they're tested in order.
pub struct Alternation {
    /// List of terms, in order, for this alternation.
    pub matching_terms: Vec<MatchingTerm>,
    /// Does this alternation want terminals to be added to it, or dropped?
    // Set by a `$pruned` term in the alternation (see grammar_convert).
    pub (crate) pruned: bool,
}
159
#[derive(Debug, Clone)]
/// Tree-shaping directives written as `$NAME` terms in the grammar
/// (e.g. `$any`, `$become`, `$become_as`). Only `Any`, `Become`, and
/// `BecomeAs` are constructed in this module; the rest are presumably
/// produced/consumed elsewhere — their semantics live in the parser.
pub(crate) enum MatchDirective {
    Any, Become, BecomeAs, Hoist, Drop, DropIfEmpty, Rename,
}
164
#[derive(Debug, Clone)]
/// Intentionally opaque for API stability reasons. Don't worry, it's just a single enum internally.
// Constructed via MatchingTermE::to().
pub struct MatchingTerm { pub(crate) t : MatchingTermE }
168
#[derive(Debug, Clone)]
#[non_exhaustive]
/// Internal representation of a single term in an alternation.
/// Built by `grammar_convert`; matching semantics live in the parser.
pub(crate) enum MatchingTermE {
    // Reference to another grammar point, by arena index into Grammar::points.
    Rule(usize),
    // String-literal terminal, as an interned string ID.
    TermLit(u32),
    // Regex terminal (r`..`r / R`..`r / A`..`r in the grammar).
    TermRegex(RegexCacher),
    // Tree-shaping directive ($any, $become, ...).
    Directive(MatchDirective),
    // User hook by name, from !hook(name).
    Hook(Rc<String>),
    // Placeholder emitted for @auto; rewritten into Peek/PeekR + Any in grammar_convert.
    _AutoTemp,
    
    // From @eof.
    Eof,
    // From @peek(n, "lit"): relative offset plus interned literal ID.
    Peek(isize, u32),
    // From @peekr(n, regex): relative offset plus regex.
    PeekR(isize, RegexCacher),
    // From @peekres(n, regex): variant of PeekR; exact semantics are in the parser.
    PeekRes(isize, RegexCacher),
    // From @guard(name): named guard, checked by the parser.
    Guard(Rc<String>),
}
185impl MatchingTermE { pub(crate) fn to(self) -> MatchingTerm { MatchingTerm { t : self } } }
186
/// Look up (or insert) a string in the interning cache.
///
/// Returns the canonical `Rc<String>` for the string together with its stable
/// interned ID. On a miss, the string is added with the next sequential ID.
pub fn string_cache_lookup(
    string_cache : &mut HashMap<String, u32>,
    string_cache_inv : &mut Vec<Rc<String>>,
    s : &str) -> (Rc<String>, u32)
{
    match string_cache.get(s)
    {
        // Hit: hand back the canonical Rc for this ID.
        Some(&id) => (Rc::clone(&string_cache_inv[id as usize]), id),
        // Miss: intern it; the ID is the next slot in the inverse table.
        None =>
        {
            let id : u32 = string_cache_inv.len().try_into().unwrap();
            let canonical = Rc::new(s.to_string());
            string_cache.insert(s.to_string(), id);
            string_cache_inv.push(Rc::clone(&canonical));
            (canonical, id)
        }
    }
}
/// Look up a string in the string interning cache, but skip the `Rc::clone()` overhead because we only need the ID.
///
/// Same interning behavior as [`string_cache_lookup`]: a miss inserts the
/// string with the next sequential ID.
pub fn string_cache_lookup_id(
    string_cache : &mut HashMap<String, u32>,
    string_cache_inv : &mut Vec<Rc<String>>,
    s : &str) -> u32
{
    if let Some(&id) = string_cache.get(s)
    {
        return id;
    }
    // Miss: intern it without returning the canonical Rc.
    let id : u32 = string_cache_inv.len().try_into().unwrap();
    string_cache.insert(s.to_string(), id);
    string_cache_inv.push(Rc::new(s.to_string()));
    id
}
219
220pub (crate) fn bnf_parse(input: &str) -> Result<Vec<(String, Vec<Vec<String>>)>, String>
221{
222    let mut rules = Vec::new();
223    
224    let mut metalist = Vec::new();
225    let mut current = Vec::new();
226    
227    let mut name : Option<String> = None;
228    let mut found_separator = false;
229    
230    let lines = input.lines().map(|x| x.to_string()).collect::<Vec<_>>();
231    let mut lines2 : Vec<String> = vec!();
232    for l in lines
233    {
234        if let Some(l0) = lines2.last_mut()
235        {
236            if l0.ends_with("\\")
237            {
238                *l0 = format!("{}{l}", &l0[..l0.len()-1]);
239                continue;
240            }
241        }
242        
243        lines2.push(l);
244    }
245    for (mut linenum, rest) in lines2.iter().enumerate()
246    {
247        let mut rest : &str = rest;
248        linenum += 1; // user-facing line numbers are 1-indexed
249        
250        let _split = rest.trim().split_whitespace().collect::<Vec<_>>();
251        
252        if _split.get(1) == Some(&"::=") // ::= is only allowed as the second token on a line and must be space-separated
253        {
254            if name.is_some()
255            {
256                metalist.push(current);
257                rules.push((name.unwrap(), metalist));
258            }
259            
260            name = None;
261            found_separator = false;
262            metalist = Vec::new();
263            current = Vec::new();
264        }
265        
266        while !rest.is_empty()
267        {
268            // skip extra whitespace
269            if get_char_at_byte(rest, 0).is_whitespace()
270            {
271                rest = rest.trim_start();
272            }
273            // comment
274            else if rest.starts_with("#")
275            {
276                break;
277            }
278            // literal
279            else if rest.starts_with("\"")
280            {
281                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
282                let mut len = 1;
283                let mut in_escape = false;
284                let mut found_exit = false;
285                while len < rest.len()
286                {
287                    let c = get_char_at_byte(rest, len);
288                    len += c.len_utf8();
289                    if !in_escape && c == '\\'
290                    {
291                        in_escape = true;
292                        continue;
293                    }
294                    if !in_escape && c == '"' { found_exit = true; break; }
295                    in_escape = false;
296                }
297                if !found_exit || len == 2 { return Err(format!("Broken literal text rule on line {linenum}")); }
298                current.push(rest[..len].to_string());
299                rest = &rest[len..];
300            }
301            // regex
302            else if rest.starts_with("r`") || rest.starts_with("R`") || rest.starts_with("A`")
303            {
304                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
305                let end = rest[2..].find("`r").expect(&format!("Unterminated regex on line {linenum}"));
306                let len = end + 4;
307                current.push(rest[..len].to_string());
308                rest = &rest[len..];
309            }
310            // split
311            else if rest.starts_with("::=")
312            {
313                if found_separator { return Err(format!("Unexpected ::= on line {linenum}")); }
314                if name.is_none() { return Err(format!("missing name on line {linenum}")); }
315                found_separator = true;
316                rest = &rest[3..];
317            }
318            // operators
319            else if  rest.starts_with("(") || rest.starts_with(")") || rest.starts_with(",")
320            {
321                current.push(rest[0..1].to_string());
322                rest = &rest[1..];
323            }
324            else if rest.starts_with("|")
325            {
326                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
327                metalist.push(current);
328                current = vec!();
329                rest = &rest[1..];
330            }
331            // name
332            else
333            {
334                let mut end = rest.len();
335                for (i, ch) in rest.char_indices()
336                {
337                    if ch.is_whitespace() || ch == '|' || ch == '(' || ch == ')' || ch == ',' || ch == '"' || ch == '#'
338                        || rest[i..].starts_with("::=") || rest[i..].starts_with("r`")
339                    {
340                        end = i;
341                        break;
342                    }
343                }
344                if name.is_none()
345                {
346                    name = Some(rest[..end].to_string());
347                }
348                else
349                {
350                    if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
351                    current.push(rest[..end].to_string());
352                }
353                rest = &rest[end..];
354            }
355        }
356    }
357    
358    if name.is_some()
359    {
360        metalist.push(current);
361        rules.push((name.unwrap(), metalist));
362    }
363    
364    Ok(rules)
365}
366
/// Convert the raw output of [`bnf_parse`] into a [`Grammar`].
///
/// Pass 1 assigns an arena index to every ordinary rule name (rejecting
/// duplicates). Pass 2 translates each rule's alternations into
/// [`MatchingTerm`]s, interning strings and compiling/pool-caching regexes
/// along the way. `__`-prefixed pseudo-rules (`__BRACKET_PAIRS`,
/// `__COMMENTS`, ...) configure the tokenizer instead of becoming grammar points.
pub (crate) fn grammar_convert(input: &Vec<(String, Vec<Vec<String>>)>) -> Result<Grammar, String>
{
    // Pass 1: name -> arena index, so rule references can be resolved in pass 2.
    let mut by_name = HashMap::default();
    for (name, _) in input.iter()
    {
        if matches!(&**name, "__BRACKET_PAIRS" | "__COMMENT_PAIRS" | "__COMMENT_PAIRS_NESTED" | "__COMMENT_REGEXES" | "__COMMENTS" | "__RESERVED_WORDS") { continue; }
        if by_name.insert(name.clone(), by_name.len()).is_some()
        {
            return Err(format!("Duplicate rule {name}; use alternations (e.g. x ::= a | b), not additional definitions (like x ::= a [...] x ::= b)"));
        }
    }
    
    // Shared state for pass 2.
    let mut string_cache = HashMap::default();
    let mut string_cache_inv = Vec::new();
    let mut points = Vec::new();
    let mut literals = HashSet::default();
    // tokenizing regexes: start-anchored pattern string -> full-match cacher
    let mut lex_regexes = HashMap::default();
    
    let mut bracket_pairs = Vec::new();
    let mut comment_pairs = Vec::new();
    let mut comment_pairs_nested = Vec::new();
    let mut comment_regexes = Vec::new();
    let mut comments = Vec::new();
    
    // Shared cache pool so identical patterns get one RegexCacher cache.
    let mut cache_pool = HashMap::<String, _>::default();
    
    let mut reserved = None;
    for (name, raw_forms) in input.iter()
    {
        // __RESERVED_WORDS: bare words combined into one longest-first, fully-anchored regex.
        if name == "__RESERVED_WORDS"
        {
            let mut set = Vec::new();
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                set.push(s.clone());
            }
            reserved = Some(build_literal_regex(&set, true));
            continue;
        }
        // Paired-delimiter pseudo-rules: each alternation must be exactly (left, right).
        if name == "__BRACKET_PAIRS" || name == "__COMMENT_PAIRS" || name == "__COMMENT_PAIRS_NESTED"
        {
            for raw_alt in raw_forms
            {
                match (raw_alt.get(0), raw_alt.get(1))
                {
                    (Some(l), Some(r)) =>
                    {
                        if name == "__BRACKET_PAIRS" { bracket_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS" { comment_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS_NESTED" { comment_pairs_nested.push((l.clone(), r.clone())); }
                    }
                    _ => Err(format!("Alternations of __BRACKET_PAIRS must all contain two bare string items"))?
                }
            }
            continue;
        }
        // Comment pseudo-rules: regexes get \A-anchored; plain strings are unquoted.
        if name == "__COMMENT_REGEXES" || name == "__COMMENTS"
        {
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                if name == "__COMMENT_REGEXES" && s.starts_with("r`") && s.ends_with("`r") && s.len() >= 4
                {
                    let pattern = &s[2..s.len() - 2];
                    let pattern = format!("\\A{pattern}");
                    let re = new_regex(&pattern).map_err(|e| format!("Invalid regex '{}': {}", pattern, e))?;
                    comment_regexes.push(re);
                }
                if name == "__COMMENTS" && s.starts_with("\"") && s.ends_with("\"") && s.len() >= 3
                {
                    let pattern = &s[1..s.len() - 1];
                    comments.push(pattern.to_string());
                }
            }
            continue;
        }
        // Ordinary rule: translate every alternation's tokens into MatchingTerms.
        let index = *by_name.get(name).unwrap();
        
        let mut forms = Vec::new();
        let mut recover = None;
        
        for raw_alt in raw_forms
        {
            //println!("{:?}", raw_alt);
            let mut matching_terms = Vec::new();
            let mut pruned = false;
            
            let mut i = 0;
            while i < raw_alt.len()
            {
                let term_str = &raw_alt[i];
                i += 1;
                
                // "..." literal terminal: unescape, intern, and remember for the tokenizer.
                if term_str.starts_with('"') && term_str.ends_with('"') && term_str.len() >= 2
                {
                    let literal = term_str[1..term_str.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                    matching_terms.push(MatchingTermE::TermLit(string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal)).to());
                    
                    literals.insert(literal.clone());
                    continue;
                }
                // r`..`r: tokenizing regex — compiled twice (anchored-start for the
                // tokenizer, full-match for the parser) and registered in lex_regexes.
                if term_str.starts_with("r`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z"); // full match (for parsing)
                    let pattern = format!("\\A(?:{pattern})"); // at start (for tokenization)
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    let re2 = RegexCacher::new_with_pool(re2, &mut cache_pool);
                    lex_regexes.insert(pattern, re2.clone());
                    matching_terms.push(MatchingTermE::TermRegex(re2).to());
                    continue;
                }
                // non-tokenizing regex (optimization: hide redundant regexes from the tokenizer)
                if term_str.starts_with("R`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z"); // full match (for parsing)
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // A`..`r: like R`..`r but anchored only at the start (no \z).
                if term_str.starts_with("A`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // @recover / @recover_before <regex>: per-rule recovery regex (stored on the
                // GrammarPoint, not as a term); bool records which spelling was used.
                if matches!(&**term_str, "@RECOVER" | "@recover" | "@RECOVER_BEFORE" | "@recover_before") && i < raw_alt.len()
                {
                    let pattern = &raw_alt[i];
                    if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                        || !pattern.ends_with("`r")
                    {
                        Err(format!("@recover guards only accept regex strings"))?
                    }
                    let no_z = pattern.starts_with("A`");
                    let pattern = &pattern[2..pattern.len() - 2];
                    let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                    if no_z { pattern_all = format!("\\A(?:{})", pattern); }
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    // TODO: make regex cachers use interior mutability and share the cache
                    if recover.is_some() { Err(format!("Rule {name} has multiple @recover items. Only one is supported."))? }
                    recover = Some((RegexCacher::new_with_pool(re2, &mut cache_pool), matches!(&**term_str, "@RECOVER" | "@recover")));
                    i += 1;
                    continue;
                }
                if matches!(&**term_str, "@EOF" | "@eof")
                {
                    matching_terms.push(MatchingTermE::Eof.to());
                    continue;
                }
                // @auto: placeholder, rewritten below once the following term is known.
                if matches!(&**term_str, "@AUTO" | "@auto")
                {
                    matching_terms.push(MatchingTermE::_AutoTemp.to());
                    continue;
                }
                // @peek/@peekr/@peekres(n, x): 5 following tokens: ( n , x )
                if (term_str == "@PEEK" || term_str == "@peek"
                    || term_str == "@PEEKR" || term_str == "@peekr"
                    || term_str == "@PEEKRES" || term_str == "@peekres") && i + 4 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != "," || raw_alt[i+4] != ")" { Err(format!("Invalid peek syntax: must be @peek(num, str)"))? }
                    let n = raw_alt[i+1].parse::<isize>().map_err(|_| format!("Not a supported peek distance: {}", raw_alt[i+1]))?;
                    if term_str == "@PEEK" || term_str == "@peek"
                    {
                        // @peek takes a plain string literal, interned like any terminal.
                        let literal = &raw_alt[i+3];
                        if literal.len() < 2 || !literal.starts_with("\"") || !literal.ends_with("\"")
                        {
                            return Err(format!("@peek guards only accept plain strings"));
                        }
                        let literal = literal[1..literal.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                        let s = string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal);
                        matching_terms.push(MatchingTermE::Peek(n, s).to());
                    }
                    else
                    {
                        // @peekr / @peekres take a regex, anchored like @recover's.
                        let pattern = &raw_alt[i+3];
                        if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                            || !pattern.ends_with("`r")
                        {
                            Err(format!("@recover guards only accept regex strings"))?
                        }
                        let no_z = pattern.starts_with("A`");
                        let pattern = &pattern[2..pattern.len() - 2];
                        let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                        if no_z { pattern_all = format!("\\A(?:{})", pattern); }
                        
                        let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                        // TODO: make regex cachers use interior mutability and share the cache
                        if term_str == "@PEEKRES" || term_str == "@peekres"
                        {
                            matching_terms.push(MatchingTermE::PeekRes(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                        else
                        {
                            matching_terms.push(MatchingTermE::PeekR(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                    }
                    i += 5;
                    continue;
                }
                // @guard(name): 3 following tokens: ( name )
                if (term_str == "@GUARD" || term_str == "@guard") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid guard syntax: must be @guard(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Guard(s).to());
                    i += 3;
                    continue;
                }
                // !hook(name): 3 following tokens: ( name )
                if (term_str == "!HOOK" || term_str == "!hook") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid hook syntax: must be @hook(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Hook(s).to());
                    i += 3;
                    continue;
                }
                // $-directives.
                if matches!(&**term_str, "$BECOME" | "$become")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Become).to());
                    continue;
                }
                if matches!(&**term_str, "$BECOME_AS" | "$become_as")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::BecomeAs).to());
                    continue;
                }
                if matches!(&**term_str, "$ANY" | "$any")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Any).to());
                    continue;
                }
                if matches!(&**term_str, "$PRUNED" | "$pruned")
                {
                    pruned = true;
                    continue;
                }
                // Anything else must be a reference to another rule.
                let id = by_name.get(term_str).ok_or_else(|| format!("Not a defined grammar rule: '{term_str}' (context: '{name}')"))?;
                matching_terms.push(MatchingTermE::Rule(*id).to());
            }
            if matching_terms.len() > 60000
            {
                Err(format!("More than 60k items in an alternation of {name}. Factor them out, dummy!"))?
            }
            // Rewrite a leading @auto into a peek guard on the following terminal plus $any.
            if let Some(MatchingTermE::_AutoTemp) = matching_terms.get(0).map(|x| &x.t)
            {
                match matching_terms.get(1).map(|x| &x.t)
                {
                    Some(MatchingTermE::TermLit(s)) =>
                    {
                        matching_terms[0] = MatchingTermE::Peek(0, *s).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    Some(MatchingTermE::TermRegex(r)) =>
                    {
                        let r = r.clone();
                        matching_terms[0] = MatchingTermE::PeekR(0, r).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    _ => Err(format!("@auto must be followed by a string literal or regex literal (context: {name})"))?
                }
            }
            // TODO: check for illegally-placed become, becomeas, hoist
            forms.push(Alternation { matching_terms, pruned });
        }
        if forms.len() > 60000
        {
            Err(format!("More than 60k alternations in {name}. Factor them out, dummy!"))?
        }
        // Warn when an alternation follows an unguarded one: alternations are
        // tried in order, so anything after the first non-guarded form can never match.
        let mut num_nonguards = 0;
        for f in &forms
        {
            if num_nonguards != 0
            {
                eprintln!("!!!!!!\n!!!!!! Warning: rule {name} has at least one alternation that is inaccessible!\n!!!!!!");
                break;
            }
            if !(if let Some(x) = f.matching_terms.get(0) // early 2026: working around not-yet-supported syntax
                && matches!(x.t, MatchingTermE::Peek(_, _) | MatchingTermE::PeekR(_, _) | MatchingTermE::Guard(_) | MatchingTermE::Eof) { true } else { false })
            { num_nonguards += 1; }
        }
        
        if index > 4000000000
        {
            Err(format!("More than 4 billion grammar terms in grammar. What are you doing??? STOP!!!!! (╯°□°)╯︵ ┻━┻"))?
        }
        // name_id is the interned string ID of the name, not the arena index.
        let (name, name_id) = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &name);
        points.push(GrammarPoint
        {
            name,
            name_id,
            //id: index as u32,
            forms,
            recover,
        });
    }
    
    // Deterministic literal order for the tokenizer.
    let mut literals = literals.into_iter().collect::<Vec<_>>();
    literals.sort();
    
    // Compile the start-anchored tokenizer regex for each pooled pattern,
    // paired with its full-match cacher.
    let mut regexes = Vec::new();
    for (r, r2) in lex_regexes
    {
        regexes.push((new_regex(&r).map_err(|e| format!("Invalid regex '{}': {}", r, e))?, r2));
    }
    Ok(Grammar { points, by_name, literals, regexes, string_cache, string_cache_inv, bracket_pairs, comments, comment_pairs, comment_regexes, reserved, comment_pairs_nested })
}
682
683/// Turns a BNF string into a [`Grammar`]. See the comments at [the crate root](super) for syntax notes. The basic parts are standard BNF.
684///
685/// Next step: [`tokenize`].
686pub fn bnf_to_grammar(s : &str) -> Result<Grammar, String>
687{
688    grammar_convert(&bnf_parse(s)?)
689}
690
#[derive(Debug, Clone, Default)]
/// Produced by [`tokenize`].
///
/// Next step: [`ast::parse`](`super::ast::parse`).
pub struct Token {
    /// Interned string ID, see [`Grammar::string_cache_inv`]
    pub text : u32,
    /// For bracket pairs: how far away, in which direction, is the paired bracket? In terms of tokens.
    // NOTE(review): presumably 0 (the Default) for non-bracket tokens — confirm against the tokenizer.
    pub pair : isize,
}
701
702// Sort literals from grammar by length and combine them into a single match-longest regex.
703pub (crate) fn build_literal_regex(literals : &Vec<String>, terminated : bool) -> Regex
704{
705    let mut text_token_regex_s = "\\A(?:".to_string();
706    
707    let mut lits = literals.clone();
708    lits.sort_by(|a, b| b.len().cmp(&a.len()));
709    for text in lits.iter()
710    {
711        let s2 = regex::escape(&text);
712        text_token_regex_s += &s2;
713        text_token_regex_s += "|";
714    }
715    if lits.len() > 0 { text_token_regex_s.pop(); }
716    text_token_regex_s += ")";
717    if terminated { text_token_regex_s += "\\z"; }
718    let text_token_regex = new_regex(&text_token_regex_s).unwrap();
719    text_token_regex
720}
721
722/// Scans the given string and produces a stream of [`Token`]s.
723///
724/// Next step: [`ast::parse`](`super::ast::parse`).
725///
726/// The scanner performs maximal munch between all string literals and r``r terminals in the grammar. It also skips whitespace, and comments (as defined in the grammar). The produced tokens use string interning and can be bracket-paired. See [the crate root](super) for more details.
727pub fn tokenize(
728    g : &mut Grammar,
729    mut s : &str
730) -> Result<Vec<Token>, String>
731{
732    let s_orig = s;
733    let mut tokens = Vec::<Token>::new();
734    tokens.reserve(s.len()/16 + 1);
735    
736    let mut lit_filtered = Vec::new();
737    for l in g.literals.iter()
738    {
739        let mut covered = false;
740        for r in &g.regexes
741        {
742            if let Some(loc) = regex_find(&r.0, l).map(|x| x.end() - x.start())
743            {
744                if loc == l.len()
745                {
746                    covered = true;
747                    break;
748                }
749            }
750        }
751        if !covered
752        {
753            lit_filtered.push(l.clone());
754        }
755    }
756    let all_literals_regex = if lit_filtered.len() > 0
757    {
758        Some(build_literal_regex(&lit_filtered, false))
759    }
760    else
761    {
762        None
763    };
764    //println!("{}", all_literals_regex);
765    
766    for text in g.literals.iter()
767    {
768        string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, text);
769    }
770    for point in g.points.iter()
771    {
772        string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &point.name);
773    }
774    
775    let mut openers = HashSet::default();
776    let mut closers = HashMap::default();
777    let mut stacks = HashMap::default();
778    let mut any_paired = false;
779    for (l, r) in &g.bracket_pairs
780    {
781        let lsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &l);
782        let rsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &r);
783        openers.insert(lsc);
784        closers.insert(rsc, lsc);
785        stacks.insert(lsc, Vec::<usize>::new());
786        any_paired = true;
787    }
788    
789    /*
790    println!("num regexes to check: {}", g.regexes.len());
791    for r in &g.regexes
792    {
793        println!("{:?}", r.as_str());
794    }
795    */
796    
797    'top: while !s.is_empty()
798    {
799        if get_char_at_byte(s, 0) as u32 <= 0x20
800        {
801            if matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
802            {
803                while !s.is_empty() && matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
804                {
805                    s = &s[1..]; // ascii whitespace is always 1 byte long
806                }
807                if s.is_empty() { break; }
808                continue 'top;
809            }
810        }
811        
812        // Pure regex comments
813        for re in &g.comment_regexes
814        {
815            if let Some(x) = regex_find(re, s)
816            {
817                //s = &s[x.len()..];
818                s = &s[x.end() - x.start()..];
819                continue 'top;
820            }
821        }
822        // Comments with escapable newline handling
823        for c in &g.comments
824        {
825            if s.starts_with(c)
826            {
827                s = &s[c.len()..];
828                while s.len() > 0 && get_char_at_byte(s, 0) != '\n'
829                {
830                    if get_char_at_byte(s, 0) == '\\'
831                    {
832                        s = &s[get_char_at_byte(s, 0).len_utf8()..]; // extra skip
833                    }
834                    if s.len() > 0
835                    {
836                        s = &s[get_char_at_byte(s, 0).len_utf8()..];
837                    }
838                }
839                continue 'top;
840            }
841        }
842        // Pair comments with nesting
843        for (l, r) in &g.comment_pairs_nested
844        {
845            if s.starts_with(l)
846            {
847                s = &s[l.len()..];
848                let mut nest = 1;
849                while s.len() > 0 && nest > 0
850                {
851                    if s.starts_with(l) { nest += 1; }
852                    s = &s[get_char_at_byte(s, 0).len_utf8()..];
853                    if s.starts_with(r) { nest -= 1; }
854                }
855                if s.starts_with(r) { s = &s[r.len()..]; }
856                continue 'top;
857            }
858        }
859        // Pair comments without nesting
860        for (l, r) in &g.comment_pairs
861        {
862            if s.starts_with(l)
863            {
864                s = &s[l.len()..];
865                while s.len() > 0 && !s.starts_with(r)
866                {
867                    s = &s[get_char_at_byte(s, 0).len_utf8()..];
868                }
869                s = &s[r.len()..];
870                continue 'top;
871            }
872        }
873        // Maximal munch: Regex pass
874        let mut longest = 0;
875        //let mut found_regex = None;
876        for r in &g.regexes
877        {
878            if let Some(loc) = regex_find(&r.0, s)
879            {
880                let len = loc.end() - loc.start();
881                //found_regex = Some(&r.1);
882                longest = longest.max(len);
883            }
884        }
885        // Literals pass.
886        if let Some(all_literals_regex) = &all_literals_regex && let Some(loc) = regex_find(&all_literals_regex, s)
887        {
888            let len = loc.end() - loc.start();
889            //found_regex = None;
890            longest = longest.max(len);
891        }
892        if longest == 0
893        {
894            let sn : String = s.chars().take(5).collect();
895            return Err(format!("Failed to tokenize at index {}:{}[...]", s_orig.len()-s.len(), sn));
896        }
897        
898        //let text_info = string_cache_lookup(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
899        //let text = text_info.1;
900        let text = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
901        let mut token = Token { text, pair : 0 };
902        
903        /*
904        if let Some(r2) = found_regex
905        {
906            r2.cache2.borrow_mut().insert(text, true);
907            r2.cache.borrow_mut().insert(text_info.0, true);
908        }
909        */
910        
911        if any_paired
912        {
913            if openers.contains(&text) && let Some(s) = stacks.get_mut(&text)
914            {
915                s.push(tokens.len());
916            }
917            if let Some(l) = closers.get(&text) && let Some(s) = stacks.get_mut(l)
918            {
919                let n = s.pop().ok_or_else(|| format!("Unmatched delimiter at {}: {}", s_orig.len() - s.len(), g.string_cache_inv.get(text as usize).unwrap()))?;
920                let me = tokens.len();
921                let diff = me.checked_signed_diff(n).ok_or_else(|| format!("Input too long"))?;
922                token.pair = -diff;
923                tokens[n].pair = diff;
924            }
925        }
926        
927        s = &s[longest..];
928        tokens.push(token);
929    }
930    Ok(tokens)
931}