Skip to main content

pred_recdec/
bnf.rs

// Crate-wide aliases that swap std's default SipHash hasher for the crate's own
// `HashBuilder` — presumably a faster non-DoS-resistant hasher; TODO(review) confirm.
type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;
type HashSet<K> = std::collections::HashSet::<K, crate::HashBuilder>;
use std::rc::Rc;
use std::cell::RefCell;
// Regex backend selection: the commented alternatives swap in the byte-oriented
// `regex` API or PCRE2 (matching commented code exists in the wrappers below).
use regex::Regex as Regex;
//use regex::bytes::Regex as Regex;
//use pcre2::bytes::Regex as Regex;
8
// Decode the char whose UTF-8 encoding begins at byte offset `i` of `s`.
// Panics when `i` is out of range or not on a character boundary — which is
// normal for low-level parsers; the Rust stdlib has no direct equivalent.
pub (crate) fn get_char_at_byte(s : &str, i : usize) -> char
{
    let mut tail = s[i..].chars();
    tail.next().unwrap()
}
15/// One-to-one wrapper around Regex::new().
16pub fn new_regex(s : &str) -> Result<Regex, regex::Error>
17//pub (crate) fn new_regex(s : &str) -> Result<Regex, pcre2::Error>
18{
19    Regex::new(s)
20    //regex::RegexBuilder::new(s).unicode(false).build()
21    //pcre2::bytes::RegexBuilder::new().jit(true).build(s)
22}
23
24pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::Match<'a>>
25//pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<pcre2::bytes::Match<'a>>
26//pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::bytes::Match<'a>>
27{
28    //r.find(s.as_bytes())
29    //r.find(s.as_bytes()).unwrap()
30    r.find(s)
31}
32pub (crate) fn regex_is_match<'a>(r : &Regex, s : &'a str) -> bool
33{
34    //r.is_match(s.as_bytes())
35    //r.is_match(s.as_bytes()).unwrap()
36    r.is_match(s)
37}
38
39
#[derive(Debug, Clone, Hash)]
/// Hashable `Rc`-wrapping map key whose equality fast-paths on pointer
/// identity before falling back to comparing the wrapped values.
pub (crate) struct K<T> {
    // The shared (canonical) value; hashing is delegated to the inner `T`.
    pub (crate) k : Rc<T>
}
44
45impl<T : PartialEq> PartialEq for K<T> {
46    fn eq(&self, other: &Self) -> bool {
47        if Rc::as_ptr(&self.k) == Rc::as_ptr(&other.k) { return true; }
48        self.k == other.k
49    }
50}
51impl<T : Eq> Eq for K<T> { }
52
#[derive(Debug, Clone)]
/// A compiled [`Regex`] bundled with shared memoization caches of past
/// is-match results (see [`RegexCacher::is_match`] / [`RegexCacher::is_match_interned`]).
pub struct RegexCacher {
    // The wrapped compiled regex.
    r : Regex,
    // Match results keyed by canonical interned string; [`K`] gives a pointer-compare fast path.
    cache : Rc<RefCell<HashMap<K<String>, bool>>>,
    // Match results keyed by interned string ID.
    cache2 : Rc<RefCell<HashMap<u32, bool>>>,
}
59
60impl RegexCacher {
61    /// Wrap a Regex with a fast-path cache that checks if a given known input is already known.
62    ///
63    /// This is faster than whatever caching Regex does internally because of string interning.
64    #[allow(unused)]
65    pub fn new(r : Regex) -> RegexCacher
66    {
67        let cache = Rc::new(RefCell::new(HashMap::default()));
68        let cache2 = Rc::new(RefCell::new(HashMap::default()));
69        RegexCacher { r, cache, cache2 }
70    }
71    pub (crate) fn new_with_pool(r : Regex, cache_pool : &mut HashMap<String, (Rc<RefCell<HashMap<K<String>, bool>>>, Rc<RefCell<HashMap<u32, bool>>>)>) -> RegexCacher
72    {
73        let s = r.as_str().to_string();
74        let mut cache = Rc::new(RefCell::new(HashMap::default()));
75        let mut cache2 = Rc::new(RefCell::new(HashMap::default()));
76        if let Some(cached) = cache_pool.get(&s)
77        {
78            cache = Rc::clone(&cached.0);
79            cache2 = Rc::clone(&cached.1);
80        }
81        else
82        {
83            cache_pool.insert(s.clone(), (cache.clone(), cache2.clone()));
84        }
85        
86        RegexCacher { r, cache, cache2 }
87    }
88    /// Does the regex match the string?
89    pub fn is_match(&self, s : &Rc<String>) -> bool
90    {
91        let mut cache = self.cache.borrow_mut();
92        let k = K { k : Rc::clone(s) };
93        if let Some(result) = cache.get(&k) { return *result; }
94        let ret = regex_is_match(&self.r, &*s);
95        cache.insert(k, ret);
96        ret
97    }
98    /// Does the regex match the string based on its interned ID? See also: [`Grammar::string_cache_inv`]
99    pub fn is_match_interned(&self, i : u32, string_cache_inv : &Vec<Rc<String>>) -> bool
100    {
101        let mut cache = self.cache2.borrow_mut();
102        if let Some(result) = cache.get(&i) { return *result; }
103        let s = &string_cache_inv[i as usize];
104        let ret = regex_is_match(&self.r, &*s);
105        cache.insert(i, ret);
106        ret
107    }
108}
109
#[derive(Default)]
/// Produced by [`bnf_to_grammar`].
///
/// Next step: [`tokenize`].
pub struct Grammar {
    /// Arena for all grammar points in the grammar.
    pub points: Vec<GrammarPoint>,
    /// Lookup table for grammar point IDs (indexes in the arena) by their name.
    pub by_name: HashMap<String, usize>,
    
    // Every distinct bare string literal used by the grammar, deduplicated and sorted.
    pub (crate) literals: Vec<String>,
    // One entry per distinct lexer regex: (start-anchored regex used while
    // tokenizing, cached full-match regex used while parsing).
    pub (crate) regexes: Vec<(Regex, RegexCacher)>,
    
    /// String interning cache: from string to interned ID.
    pub string_cache : HashMap<String, u32>,
    /// Inverse string interning cache. Index = string ID. The given `Rc<String>` is the canonical object for that interned string.
    pub string_cache_inv : Vec<Rc<String>>,
    
    // (open, close) bracket token pairs, from the `__BRACKET_PAIRS` pseudo-rule.
    pub (crate) bracket_pairs : Vec<(String, String)>,
    // Comment openers from `__COMMENTS` (surrounding quotes stripped) — presumably line comments; TODO(review) confirm in tokenizer.
    pub (crate) comments : Vec<String>,
    // (open, close) comment delimiter pairs, from `__COMMENT_PAIRS`.
    pub (crate) comment_pairs : Vec<(String, String)>,
    // (open, close) delimiter pairs for nestable comments, from `__COMMENT_PAIRS_NESTED`.
    pub (crate) comment_pairs_nested : Vec<(String, String)>,
    // Start-anchored (`\A`-prefixed) comment-matching regexes, from `__COMMENT_REGEXES`.
    pub (crate) comment_regexes : Vec<Regex>,
    // Single longest-match regex over all `__RESERVED_WORDS` entries, if any were given.
    pub (crate) reserved : Option<Regex>,
}
135
#[derive(Debug, Clone)]
/// A grammar rule.
/// 
/// More specifically, every production/alternation associated with a given name, in order. Alternations are stored and tested in the same order as written in the grammar.
pub struct GrammarPoint {
    /// Name of the grammar point (LHS).
    pub name: Rc<String>,
    /// Interned string ID of `name` (index into [`Grammar::string_cache_inv`]).
    ///
    /// NOTE(review): a previous comment called this the index in [`Grammar::points`],
    /// but `grammar_convert` assigns it from the string interner, whose IDs are also
    /// shared with literals — so it generally differs from the arena index.
    pub name_id: u32,
    /// List of productions/alternations under this grammar point's LHS
    pub forms: Vec<Alternation>,
    // Error-recovery regex from `@recover`/`@recover_before`; the bool is true
    // for the `@recover` spelling and false for `@recover_before`.
    pub (crate) recover: Option<(RegexCacher, bool)>,
}
149
#[derive(Debug, Clone)]
/// A particular production of a grammar rule.
///
/// Named "Alternation" because they're tested in order.
pub struct Alternation {
    /// List of terms, in order, for this alternation.
    pub matching_terms: Vec<MatchingTerm>,
    /// Does this alternation want terminals to be added to it, or dropped?
    // Set by the `$pruned` directive during grammar conversion.
    pub (crate) pruned: bool,
}
160
#[derive(Debug, Clone)]
// AST post-processing directives, parsed from `$...` terms in the grammar:
// $any, $become, $become_as, $hoist, $hoist_unit, $drop, $drop_empty, $rename.
// Their effects are applied by the parser elsewhere — TODO(review): confirm exact semantics there.
pub(crate) enum MatchDirective {
    Any, Become, BecomeAs, Hoist, HoistIfUnit, Drop, DropIfEmpty, Rename,
}
165
#[derive(Debug, Clone)]
/// Intentionally opaque for API stability reasons. Don't worry, it's just a single enum internally.
/// Constructed internally via [`MatchingTermE::to`].
pub struct MatchingTerm { pub(crate) t : MatchingTermE }
169
#[derive(Debug, Clone)]
#[non_exhaustive]
// Internal representation of a single term in an alternation (see `grammar_convert`
// for how each variant is produced from the raw BNF tokens).
pub(crate) enum MatchingTermE {
    // Reference to another rule, by arena index into `Grammar::points`.
    Rule(usize),
    // Literal token, by interned string ID.
    TermLit(u32),
    // Regex token (with pool-shared match cache).
    TermRegex(RegexCacher),
    // `$...` post-processing directive.
    Directive(MatchDirective),
    // `!hook(name)` — named hook; the Rc is the canonical interned string.
    Hook(Rc<String>),
    // Placeholder emitted for `@auto`; rewritten into Peek/PeekR + Directive(Any)
    // at the end of alternation processing in `grammar_convert`.
    _AutoTemp,
    
    // `@eof` marker.
    Eof,
    // `@peek(offset, "literal")` — guard on the token at a relative offset, by interned ID.
    Peek(isize, u32),
    // `@peekr(offset, regex)` — like `Peek`, but matched with a regex.
    PeekR(isize, RegexCacher),
    // `@peekres(offset, regex)` — regex peek variant; semantics differ from PeekR
    // in the parser — TODO(review) confirm there.
    PeekRes(isize, RegexCacher),
    // `@guard(name)` — named guard; the Rc is the canonical interned string.
    Guard(Rc<String>),
}
186impl MatchingTermE { pub(crate) fn to(self) -> MatchingTerm { MatchingTerm { t : self } } }
187
188/// Look up a string in the string interning cache.
189pub fn string_cache_lookup(
190    string_cache : &mut HashMap<String, u32>,
191    string_cache_inv : &mut Vec<Rc<String>>,
192    s : &str) -> (Rc<String>, u32)
193{
194    if let Some(sn) = string_cache.get(s)
195    {
196        return (Rc::clone(&string_cache_inv[*sn as usize]), *sn);
197    }
198    let rc = Rc::new(s.to_string());
199    let n = string_cache_inv.len().try_into().unwrap();
200    string_cache.insert(s.to_string(), n);
201    string_cache_inv.push(Rc::clone(&rc));
202    (rc, n)
203}
204/// Look up a string in the string interning cache, but skip the `Rc::clone()` overhead because we only need the ID.
205pub fn string_cache_lookup_id(
206    string_cache : &mut HashMap<String, u32>,
207    string_cache_inv : &mut Vec<Rc<String>>,
208    s : &str) -> u32
209{
210    if let Some(sn) = string_cache.get(s)
211    {
212        return *sn;
213    }
214    let rc = Rc::new(s.to_string());
215    let n = string_cache_inv.len().try_into().unwrap();
216    string_cache.insert(s.to_string(), n);
217    string_cache_inv.push(rc);
218    n
219}
220
221pub (crate) fn bnf_parse(input: &str) -> Result<Vec<(String, Vec<Vec<String>>)>, String>
222{
223    let mut rules = Vec::new();
224    
225    let mut metalist = Vec::new();
226    let mut current = Vec::new();
227    
228    let mut name : Option<String> = None;
229    let mut found_separator = false;
230    
231    let lines = input.lines().map(|x| x.to_string()).collect::<Vec<_>>();
232    let mut lines2 : Vec<String> = vec!();
233    for l in lines
234    {
235        if let Some(l0) = lines2.last_mut()
236        {
237            if l0.ends_with("\\")
238            {
239                *l0 = format!("{}{l}", &l0[..l0.len()-1]);
240                continue;
241            }
242        }
243        
244        lines2.push(l);
245    }
246    for (mut linenum, rest) in lines2.iter().enumerate()
247    {
248        let mut rest : &str = rest;
249        linenum += 1; // user-facing line numbers are 1-indexed
250        
251        let _split = rest.trim().split_whitespace().collect::<Vec<_>>();
252        
253        if _split.get(1) == Some(&"::=") // ::= is only allowed as the second token on a line and must be space-separated
254        {
255            if name.is_some()
256            {
257                metalist.push(current);
258                rules.push((name.unwrap(), metalist));
259            }
260            
261            name = None;
262            found_separator = false;
263            metalist = Vec::new();
264            current = Vec::new();
265        }
266        
267        while !rest.is_empty()
268        {
269            // skip extra whitespace
270            if get_char_at_byte(rest, 0).is_whitespace()
271            {
272                rest = rest.trim_start();
273            }
274            // comment
275            else if rest.starts_with("#")
276            {
277                break;
278            }
279            // literal
280            else if rest.starts_with("\"")
281            {
282                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
283                let mut len = 1;
284                let mut in_escape = false;
285                let mut found_exit = false;
286                while len < rest.len()
287                {
288                    let c = get_char_at_byte(rest, len);
289                    len += c.len_utf8();
290                    if !in_escape && c == '\\'
291                    {
292                        in_escape = true;
293                        continue;
294                    }
295                    if !in_escape && c == '"' { found_exit = true; break; }
296                    in_escape = false;
297                }
298                if !found_exit || len == 2 { return Err(format!("Broken literal text rule on line {linenum}")); }
299                current.push(rest[..len].to_string());
300                rest = &rest[len..];
301            }
302            // regex
303            else if rest.starts_with("r`") || rest.starts_with("R`") || rest.starts_with("A`")
304            {
305                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
306                let end = rest[2..].find("`r").expect(&format!("Unterminated regex on line {linenum}"));
307                let len = end + 4;
308                current.push(rest[..len].to_string());
309                rest = &rest[len..];
310            }
311            // split
312            else if rest.starts_with("::=")
313            {
314                if found_separator { return Err(format!("Unexpected ::= on line {linenum}")); }
315                if name.is_none() { return Err(format!("missing name on line {linenum}")); }
316                found_separator = true;
317                rest = &rest[3..];
318            }
319            // operators
320            else if  rest.starts_with("(") || rest.starts_with(")") || rest.starts_with(",")
321            {
322                current.push(rest[0..1].to_string());
323                rest = &rest[1..];
324            }
325            else if rest.starts_with("|")
326            {
327                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
328                metalist.push(current);
329                current = vec!();
330                rest = &rest[1..];
331            }
332            // name
333            else
334            {
335                let mut end = rest.len();
336                for (i, ch) in rest.char_indices()
337                {
338                    if ch.is_whitespace() || ch == '|' || ch == '(' || ch == ')' || ch == ',' || ch == '"' || ch == '#'
339                        || rest[i..].starts_with("::=") || rest[i..].starts_with("r`")
340                    {
341                        end = i;
342                        break;
343                    }
344                }
345                if name.is_none()
346                {
347                    name = Some(rest[..end].to_string());
348                }
349                else
350                {
351                    if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
352                    current.push(rest[..end].to_string());
353                }
354                rest = &rest[end..];
355            }
356        }
357    }
358    
359    if name.is_some()
360    {
361        metalist.push(current);
362        rules.push((name.unwrap(), metalist));
363    }
364    
365    Ok(rules)
366}
367
368pub (crate) fn grammar_convert(input: &Vec<(String, Vec<Vec<String>>)>) -> Result<Grammar, String>
369{
370    let mut by_name = HashMap::default();
371    for (name, _) in input.iter()
372    {
373        if matches!(&**name, "__BRACKET_PAIRS" | "__COMMENT_PAIRS" | "__COMMENT_PAIRS_NESTED" | "__COMMENT_REGEXES" | "__COMMENTS" | "__RESERVED_WORDS") { continue; }
374        if by_name.insert(name.clone(), by_name.len()).is_some()
375        {
376            return Err(format!("Duplicate rule {name}; use alternations (e.g. x ::= a | b), not additional definitions (like x ::= a [...] x ::= b)"));
377        }
378    }
379    
380    let mut string_cache = HashMap::default();
381    let mut string_cache_inv = Vec::new();
382    let mut points = Vec::new();
383    let mut literals = HashSet::default();
384    let mut lex_regexes = HashMap::default();
385    
386    let mut bracket_pairs = Vec::new();
387    let mut comment_pairs = Vec::new();
388    let mut comment_pairs_nested = Vec::new();
389    let mut comment_regexes = Vec::new();
390    let mut comments = Vec::new();
391    
392    let mut cache_pool = HashMap::<String, _>::default();
393    
394    let mut reserved = None;
395    for (name, raw_forms) in input.iter()
396    {
397        if name == "__RESERVED_WORDS"
398        {
399            let mut set = Vec::new();
400            for s in raw_forms.iter().map(|x| x.iter()).flatten()
401            {
402                set.push(s.clone());
403            }
404            reserved = Some(build_literal_regex(&set, true));
405            continue;
406        }
407        if name == "__BRACKET_PAIRS" || name == "__COMMENT_PAIRS" || name == "__COMMENT_PAIRS_NESTED"
408        {
409            for raw_alt in raw_forms
410            {
411                match (raw_alt.get(0), raw_alt.get(1))
412                {
413                    (Some(l), Some(r)) =>
414                    {
415                        if name == "__BRACKET_PAIRS" { bracket_pairs.push((l.clone(), r.clone())); }
416                        if name == "__COMMENT_PAIRS" { comment_pairs.push((l.clone(), r.clone())); }
417                        if name == "__COMMENT_PAIRS_NESTED" { comment_pairs_nested.push((l.clone(), r.clone())); }
418                    }
419                    _ => Err(format!("Alternations of __BRACKET_PAIRS must all contain two bare string items"))?
420                }
421            }
422            continue;
423        }
424        if name == "__COMMENT_REGEXES" || name == "__COMMENTS"
425        {
426            for s in raw_forms.iter().map(|x| x.iter()).flatten()
427            {
428                if name == "__COMMENT_REGEXES" && s.starts_with("r`") && s.ends_with("`r") && s.len() >= 4
429                {
430                    let pattern = &s[2..s.len() - 2];
431                    let pattern = format!("\\A{pattern}");
432                    let re = new_regex(&pattern).map_err(|e| format!("Invalid regex '{}': {}", pattern, e))?;
433                    comment_regexes.push(re);
434                }
435                if name == "__COMMENTS" && s.starts_with("\"") && s.ends_with("\"") && s.len() >= 3
436                {
437                    let pattern = &s[1..s.len() - 1];
438                    comments.push(pattern.to_string());
439                }
440            }
441            continue;
442        }
443        let index = *by_name.get(name).unwrap();
444        
445        let mut forms = Vec::new();
446        let mut recover = None;
447        
448        for raw_alt in raw_forms
449        {
450            //println!("{:?}", raw_alt);
451            let mut matching_terms = Vec::new();
452            let mut pruned = false;
453            
454            let mut i = 0;
455            while i < raw_alt.len()
456            {
457                let term_str = &raw_alt[i];
458                i += 1;
459                
460                if term_str.starts_with('"') && term_str.ends_with('"') && term_str.len() >= 2
461                {
462                    let literal = term_str[1..term_str.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
463                    matching_terms.push(MatchingTermE::TermLit(string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal)).to());
464                    
465                    literals.insert(literal.clone());
466                    continue;
467                }
468                if term_str.starts_with("r`") && term_str.ends_with("`r") && term_str.len() >= 4
469                {
470                    let pattern = &term_str[2..term_str.len() - 2];
471                    let pattern_all = format!("\\A(?:{pattern})\\z"); // full match (for parsing)
472                    let pattern = format!("\\A(?:{pattern})"); // at start (for tokenization)
473                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
474                    let re2 = RegexCacher::new_with_pool(re2, &mut cache_pool);
475                    lex_regexes.insert(pattern, re2.clone());
476                    matching_terms.push(MatchingTermE::TermRegex(re2).to());
477                    continue;
478                }
479                // non-tokenizing regex (optimization: hide redundant regexes from the tokenizer)
480                if term_str.starts_with("R`") && term_str.ends_with("`r") && term_str.len() >= 4
481                {
482                    let pattern = &term_str[2..term_str.len() - 2];
483                    let pattern_all = format!("\\A(?:{pattern})\\z"); // full match (for parsing)
484                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
485                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
486                    continue;
487                }
488                if term_str.starts_with("A`") && term_str.ends_with("`r") && term_str.len() >= 4
489                {
490                    let pattern = &term_str[2..term_str.len() - 2];
491                    let pattern_all = format!("\\A(?:{pattern})");
492                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
493                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
494                    continue;
495                }
496                if matches!(&**term_str, "@RECOVER" | "@recover" | "@RECOVER_BEFORE" | "@recover_before") && i < raw_alt.len()
497                {
498                    let pattern = &raw_alt[i];
499                    if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
500                        || !pattern.ends_with("`r")
501                    {
502                        Err(format!("@recover guards only accept regex strings"))?
503                    }
504                    let no_z = pattern.starts_with("A`");
505                    let pattern = &pattern[2..pattern.len() - 2];
506                    let mut pattern_all = format!("\\A(?:{})\\z", pattern);
507                    if no_z { pattern_all = format!("\\A(?:{})", pattern); }
508                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
509                    // TODO: make regex cachers use interior mutability and share the cache
510                    if recover.is_some() { Err(format!("Rule {name} has multiple @recover items. Only one is supported."))? }
511                    recover = Some((RegexCacher::new_with_pool(re2, &mut cache_pool), matches!(&**term_str, "@RECOVER" | "@recover")));
512                    i += 1;
513                    continue;
514                }
515                if matches!(&**term_str, "@EOF" | "@eof")
516                {
517                    matching_terms.push(MatchingTermE::Eof.to());
518                    continue;
519                }
520                if matches!(&**term_str, "@AUTO" | "@auto")
521                {
522                    matching_terms.push(MatchingTermE::_AutoTemp.to());
523                    continue;
524                }
525                if (term_str == "@PEEK" || term_str == "@peek"
526                    || term_str == "@PEEKR" || term_str == "@peekr"
527                    || term_str == "@PEEKRES" || term_str == "@peekres") && i + 4 < raw_alt.len()
528                {
529                    if raw_alt[i] != "(" || raw_alt[i+2] != "," || raw_alt[i+4] != ")" { Err(format!("Invalid peek syntax: must be @peek(num, str)"))? }
530                    let n = raw_alt[i+1].parse::<isize>().map_err(|_| format!("Not a supported peek distance: {}", raw_alt[i+1]))?;
531                    if term_str == "@PEEK" || term_str == "@peek"
532                    {
533                        let literal = &raw_alt[i+3];
534                        if literal.len() < 2 || !literal.starts_with("\"") || !literal.ends_with("\"")
535                        {
536                            return Err(format!("@peek guards only accept plain strings"));
537                        }
538                        let literal = literal[1..literal.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
539                        let s = string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal);
540                        matching_terms.push(MatchingTermE::Peek(n, s).to());
541                    }
542                    else
543                    {
544                        let pattern = &raw_alt[i+3];
545                        if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
546                            || !pattern.ends_with("`r")
547                        {
548                            Err(format!("@recover guards only accept regex strings"))?
549                        }
550                        let no_z = pattern.starts_with("A`");
551                        let pattern = &pattern[2..pattern.len() - 2];
552                        let mut pattern_all = format!("\\A(?:{})\\z", pattern);
553                        if no_z { pattern_all = format!("\\A(?:{})", pattern); }
554                        
555                        let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
556                        // TODO: make regex cachers use interior mutability and share the cache
557                        if term_str == "@PEEKRES" || term_str == "@peekres"
558                        {
559                            matching_terms.push(MatchingTermE::PeekRes(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
560                        }
561                        else
562                        {
563                            matching_terms.push(MatchingTermE::PeekR(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
564                        }
565                    }
566                    i += 5;
567                    continue;
568                }
569                if (term_str == "@GUARD" || term_str == "@guard") && i + 2 < raw_alt.len()
570                {
571                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
572                    {
573                        return Err(format!("Invalid guard syntax: must be @guard(name)"));
574                    }
575                    let literal = &raw_alt[i+1];
576                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
577                    matching_terms.push(MatchingTermE::Guard(s).to());
578                    i += 3;
579                    continue;
580                }
581                if (term_str == "!HOOK" || term_str == "!hook") && i + 2 < raw_alt.len()
582                {
583                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
584                    {
585                        return Err(format!("Invalid hook syntax: must be @hook(name)"));
586                    }
587                    let literal = &raw_alt[i+1];
588                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
589                    matching_terms.push(MatchingTermE::Hook(s).to());
590                    i += 3;
591                    continue;
592                }
593                if matches!(&**term_str, "$BECOME" | "$become")
594                {
595                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Become).to());
596                    continue;
597                }
598                if matches!(&**term_str, "$BECOME_AS" | "$become_as")
599                {
600                    matching_terms.push(MatchingTermE::Directive(MatchDirective::BecomeAs).to());
601                    continue;
602                }
603                if matches!(&**term_str, "$HOIST" | "$hoist")
604                {
605                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Hoist).to());
606                    continue;
607                }
608                if matches!(&**term_str, "$HOIST_UNIT" | "$hoist_unit")
609                {
610                    matching_terms.push(MatchingTermE::Directive(MatchDirective::HoistIfUnit).to());
611                    continue;
612                }
613                if matches!(&**term_str, "$DROP" | "$drop")
614                {
615                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Drop).to());
616                    continue;
617                }
618                if matches!(&**term_str, "$DROP_EMPTY" | "$drop_empty")
619                {
620                    matching_terms.push(MatchingTermE::Directive(MatchDirective::DropIfEmpty).to());
621                    continue;
622                }
623                if matches!(&**term_str, "$RENAME" | "$rename")
624                {
625                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Rename).to());
626                    continue;
627                }
628                if matches!(&**term_str, "$ANY" | "$any")
629                {
630                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Any).to());
631                    continue;
632                }
633                if matches!(&**term_str, "$PRUNED" | "$pruned")
634                {
635                    pruned = true;
636                    continue;
637                }
638                let id = by_name.get(term_str).ok_or_else(|| format!("Not a defined grammar rule: '{term_str}' (context: '{name}')"))?;
639                matching_terms.push(MatchingTermE::Rule(*id).to());
640            }
641            if matching_terms.len() > 60000
642            {
643                Err(format!("More than 60k items in an alternation of {name}. Factor them out, dummy!"))?
644            }
645            if let Some(MatchingTermE::_AutoTemp) = matching_terms.get(0).map(|x| &x.t)
646            {
647                match matching_terms.get(1).map(|x| &x.t)
648                {
649                    Some(MatchingTermE::TermLit(s)) =>
650                    {
651                        matching_terms[0] = MatchingTermE::Peek(0, *s).to();
652                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
653                    }
654                    Some(MatchingTermE::TermRegex(r)) =>
655                    {
656                        let r = r.clone();
657                        matching_terms[0] = MatchingTermE::PeekR(0, r).to();
658                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
659                    }
660                    _ => Err(format!("@auto must be followed by a string literal or regex literal (context: {name})"))?
661                }
662            }
663            // TODO: check for illegally-placed become, becomeas, hoist
664            forms.push(Alternation { matching_terms, pruned });
665        }
666        if forms.len() > 60000
667        {
668            Err(format!("More than 60k alternations in {name}. Factor them out, dummy!"))?
669        }
670        let mut num_nonguards = 0;
671        for f in &forms
672        {
673            if num_nonguards != 0
674            {
675                eprintln!("!!!!!!\n!!!!!! Warning: rule {name} has at least one alternation that is inaccessible!\n!!!!!!");
676                break;
677            }
678            if !(if let Some(x) = f.matching_terms.get(0) // early 2026: working around not-yet-supported syntax
679                && matches!(x.t, MatchingTermE::Peek(_, _) | MatchingTermE::PeekR(_, _) | MatchingTermE::Guard(_) | MatchingTermE::Eof) { true } else { false })
680            { num_nonguards += 1; }
681        }
682        
683        if index > 4000000000
684        {
685            Err(format!("More than 4 billion grammar terms in grammar. What are you doing??? STOP!!!!! (╯°□°)╯︵ ┻━┻"))?
686        }
687        let (name, name_id) = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &name);
688        points.push(GrammarPoint
689        {
690            name,
691            name_id,
692            //id: index as u32,
693            forms,
694            recover,
695        });
696    }
697    
698    let mut literals = literals.into_iter().collect::<Vec<_>>();
699    literals.sort();
700    
701    let mut regexes = Vec::new();
702    for (r, r2) in lex_regexes
703    {
704        regexes.push((new_regex(&r).map_err(|e| format!("Invalid regex '{}': {}", r, e))?, r2));
705    }
706    Ok(Grammar { points, by_name, literals, regexes, string_cache, string_cache_inv, bracket_pairs, comments, comment_pairs, comment_regexes, reserved, comment_pairs_nested })
707}
708
709/// Turns a BNF string into a [`Grammar`]. See the comments at [the crate root](super) for syntax notes. The basic parts are standard BNF.
710///
711/// Next step: [`tokenize`].
712pub fn bnf_to_grammar(s : &str) -> Result<Grammar, String>
713{
714    grammar_convert(&bnf_parse(s)?)
715}
716
#[derive(Debug, Clone, Default)]
/// Produced by [`tokenize`].
///
/// Next step: [`ast::parse`](`super::ast::parse`).
pub struct Token {
    /// Interned string ID, see [`Grammar::string_cache_inv`]
    pub text : u32,
    /// For bracket pairs: how far away, in which direction, is the paired bracket? In terms of tokens.
    // NOTE(review): presumably 0 for tokens that are not brackets — confirm in `tokenize`.
    pub pair : isize,
}
727
728// Sort literals from grammar by length and combine them into a single match-longest regex.
729pub (crate) fn build_literal_regex(literals : &Vec<String>, terminated : bool) -> Regex
730{
731    let mut text_token_regex_s = "\\A(?:".to_string();
732    
733    let mut lits = literals.clone();
734    lits.sort_by(|a, b| b.len().cmp(&a.len()));
735    for text in lits.iter()
736    {
737        let s2 = regex::escape(&text);
738        text_token_regex_s += &s2;
739        text_token_regex_s += "|";
740    }
741    if lits.len() > 0 { text_token_regex_s.pop(); }
742    text_token_regex_s += ")";
743    if terminated { text_token_regex_s += "\\z"; }
744    let text_token_regex = new_regex(&text_token_regex_s).unwrap();
745    text_token_regex
746}
747
/// Tokenizer error state. Returned in the `Err` case of [`tokenize`].
#[derive(Clone, Debug)]
pub struct TokError {
    /// Human-readable error message. Format is not guaranteed and may change arbitrarily.
    #[allow(unused)] pub err_message : String,
    /// Number of tokens produced so far.
    #[allow(unused)] pub produced : usize,
    /// At what byte index in the input string did tokenization fail to continue?
    #[allow(unused)] pub location : usize,
    /// Is `Some` if tokenization failed because of failing to pair brackets:
    ///
    /// - `String`: responsible token text
    /// - `bool`: openness
    ///
    /// If the bool is true, then the failed pairing is "open", like the string `()(`. Otherwise, it's "closed", like `())`.
    ///
    /// The responsible token is the token where the error was noticed, not the token that's "missing". E.g. for `())` it's `")"`.
    ///
    /// It is currently unspecified whether non-nested overlaps like `( { ) }` trigger an error.
    ///
    /// KNOWN BUG: "open" failed pairings like `()(` do not currently produce errors. They will become errors in a future version.
    #[allow(unused)] pub pairing_error : Option<(String, bool)>,
}
771
772/// Scans the given string and produces a stream of [`Token`]s.
773///
774/// Next step: [`ast::parse`](`super::ast::parse`).
775///
776/// The scanner performs maximal munch between all string literals and r``r terminals in the grammar. It also skips whitespace, and comments (as defined in the grammar). The produced tokens use string interning and can be bracket-paired. See [the crate root](super) for more details.
777pub fn tokenize(
778    g : &mut Grammar,
779    mut s : &str
780) -> Result<Vec<Token>, TokError>
781{
782    let s_orig = s;
783    let mut tokens = Vec::<Token>::new();
784    tokens.reserve(s.len()/16 + 1);
785    
786    let mut lit_filtered = Vec::new();
787    for l in g.literals.iter()
788    {
789        let mut covered = false;
790        for r in &g.regexes
791        {
792            if let Some(loc) = regex_find(&r.0, l).map(|x| x.end() - x.start())
793            {
794                if loc == l.len()
795                {
796                    covered = true;
797                    break;
798                }
799            }
800        }
801        if !covered
802        {
803            lit_filtered.push(l.clone());
804        }
805    }
806    let all_literals_regex = if lit_filtered.len() > 0
807    {
808        Some(build_literal_regex(&lit_filtered, false))
809    }
810    else
811    {
812        None
813    };
814    //println!("{}", all_literals_regex);
815    
816    for text in g.literals.iter()
817    {
818        string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, text);
819    }
820    for point in g.points.iter()
821    {
822        string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &point.name);
823    }
824    
825    let mut openers = HashSet::default();
826    let mut closers = HashMap::default();
827    let mut stacks = HashMap::default();
828    let mut any_paired = false;
829    for (l, r) in &g.bracket_pairs
830    {
831        let lsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &l);
832        let rsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &r);
833        openers.insert(lsc);
834        closers.insert(rsc, lsc);
835        stacks.insert(lsc, Vec::<usize>::new());
836        any_paired = true;
837    }
838    
839    /*
840    println!("num regexes to check: {}", g.regexes.len());
841    for r in &g.regexes
842    {
843        println!("{:?}", r.as_str());
844    }
845    */
846    
847    'top: while !s.is_empty()
848    {
849        if get_char_at_byte(s, 0) as u32 <= 0x20
850        {
851            if matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
852            {
853                while !s.is_empty() && matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
854                {
855                    s = &s[1..]; // ascii whitespace is always 1 byte long
856                }
857                if s.is_empty() { break; }
858                continue 'top;
859            }
860        }
861        
862        // Pure regex comments
863        for re in &g.comment_regexes
864        {
865            if let Some(x) = regex_find(re, s)
866            {
867                //s = &s[x.len()..];
868                s = &s[x.end() - x.start()..];
869                continue 'top;
870            }
871        }
872        // Comments with escapable newline handling
873        for c in &g.comments
874        {
875            if s.starts_with(c)
876            {
877                s = &s[c.len()..];
878                while s.len() > 0 && get_char_at_byte(s, 0) != '\n'
879                {
880                    if get_char_at_byte(s, 0) == '\\'
881                    {
882                        s = &s[get_char_at_byte(s, 0).len_utf8()..]; // extra skip
883                    }
884                    if s.len() > 0
885                    {
886                        s = &s[get_char_at_byte(s, 0).len_utf8()..];
887                    }
888                }
889                continue 'top;
890            }
891        }
892        // Pair comments with nesting
893        for (l, r) in &g.comment_pairs_nested
894        {
895            if s.starts_with(l)
896            {
897                s = &s[l.len()..];
898                let mut nest = 1;
899                while s.len() > 0 && nest > 0
900                {
901                    if s.starts_with(l) { nest += 1; }
902                    s = &s[get_char_at_byte(s, 0).len_utf8()..];
903                    if s.starts_with(r) { nest -= 1; }
904                }
905                if s.starts_with(r) { s = &s[r.len()..]; }
906                continue 'top;
907            }
908        }
909        // Pair comments without nesting
910        for (l, r) in &g.comment_pairs
911        {
912            if s.starts_with(l)
913            {
914                s = &s[l.len()..];
915                while s.len() > 0 && !s.starts_with(r)
916                {
917                    s = &s[get_char_at_byte(s, 0).len_utf8()..];
918                }
919                s = &s[r.len()..];
920                continue 'top;
921            }
922        }
923        // Maximal munch: Regex pass
924        let mut longest = 0;
925        //let mut found_regex = None;
926        for r in &g.regexes
927        {
928            if let Some(loc) = regex_find(&r.0, s)
929            {
930                let len = loc.end() - loc.start();
931                //found_regex = Some(&r.1);
932                longest = longest.max(len);
933            }
934        }
935        // Literals pass.
936        if let Some(all_literals_regex) = &all_literals_regex && let Some(loc) = regex_find(&all_literals_regex, s)
937        {
938            let len = loc.end() - loc.start();
939            //found_regex = None;
940            longest = longest.max(len);
941        }
942        if longest == 0
943        {
944            let sn : String = s.chars().take(5).collect();
945            return Err(TokError {
946                err_message: format!("Failed to tokenize at index {}:{}[...]", s_orig.len()-s.len(), sn),
947                produced : tokens.len(),
948                location : s_orig.len() - s.len(),
949                pairing_error : None,
950            });
951        }
952        
953        //let text_info = string_cache_lookup(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
954        //let text = text_info.1;
955        let text = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
956        let mut token = Token { text, pair : 0 };
957        
958        /*
959        if let Some(r2) = found_regex
960        {
961            r2.cache2.borrow_mut().insert(text, true);
962            r2.cache.borrow_mut().insert(text_info.0, true);
963        }
964        */
965        
966        if any_paired
967        {
968            if openers.contains(&text) && let Some(s) = stacks.get_mut(&text)
969            {
970                s.push(tokens.len());
971            }
972            if let Some(l) = closers.get(&text) && let Some(stack) = stacks.get_mut(l)
973            {
974                let n = match stack.pop() {
975                    Some(n) => n,
976                    None => Err(TokError {
977                        err_message: format!("Unmatched delimiter at {}: {}", s_orig.len() - s.len(), g.string_cache_inv.get(text as usize).unwrap()),
978                        produced : tokens.len(),
979                        location : s_orig.len() - s.len(),
980                        pairing_error : Some((s[..longest].to_string(), false)),
981                    })?,
982                };
983                //.ok_or_else(|| )?;
984                
985                let me = tokens.len();
986                let diff = match me.checked_signed_diff(n) {
987                    Some(n) => n,
988                    None => Err(TokError {
989                        err_message: format!("Input too long"),
990                        produced : tokens.len(),
991                        location : s_orig.len() - s.len(),
992                        pairing_error : None,
993                    })?,
994                };
995                token.pair = -diff;
996                tokens[n].pair = diff;
997            }
998        }
999        
1000        s = &s[longest..];
1001        tokens.push(token);
1002    }
1003    Ok(tokens)
1004}