// pred_recdec/bnf.rs
1type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;
2type HashSet<K> = std::collections::HashSet::<K, crate::HashBuilder>;
3use std::rc::Rc;
4use std::cell::RefCell;
5use regex::Regex as Regex;
6//use regex::bytes::Regex as Regex;
7//use pcre2::bytes::Regex as Regex;
8
/// Return the `char` whose UTF-8 encoding begins at byte offset `i` of `s`.
///
/// Panics if `i` is past the end of `s` or not on a character boundary.
/// Yes this is normal for low-level parsers.
/// No the rust stdlib does not have an equivalent function.
pub (crate) fn get_char_at_byte(s : &str, i : usize) -> char
{
    let mut tail = s[i..].chars();
    tail.next().unwrap()
}
/// One-to-one wrapper around Regex::new().
///
/// Kept as a single choke point so the regex backend can be swapped by editing
/// this function (and the matching commented-out signature/bodies) instead of
/// every call site. See the alternate `use` lines at the top of the file.
pub fn new_regex(s : &str) -> Result<Regex, regex::Error>
// Alternate signature for the pcre2 backend:
//pub (crate) fn new_regex(s : &str) -> Result<Regex, pcre2::Error>
{
    Regex::new(s)
    // Alternate backends, kept in sync with the imports above:
    //regex::RegexBuilder::new(s).unicode(false).build()
    //pcre2::bytes::RegexBuilder::new().jit(true).build(s)
}
23
/// Find the first match of `r` in `s`.
///
/// Backend-swap choke point like [`new_regex`]; the commented-out signatures
/// and bodies are the pcre2 / bytes-regex variants.
pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::Match<'a>>
//pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<pcre2::bytes::Match<'a>>
//pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::bytes::Match<'a>>
{
    //r.find(s.as_bytes())
    //r.find(s.as_bytes()).unwrap()
    r.find(s)
}
32pub (crate) fn regex_is_match<'a>(r : &Regex, s : &'a str) -> bool
33{
34    //r.is_match(s.as_bytes())
35    //r.is_match(s.as_bytes()).unwrap()
36    r.is_match(s)
37}
38
39
#[derive(Debug, Clone, Hash)]
/// Hashable key wrapper around an `Rc<T>` whose equality check fast-paths on
/// pointer identity (see the `PartialEq` impl below). Used so interned
/// `Rc<String>`s can key the [`RegexCacher`] result caches cheaply.
pub (crate) struct K<T> {
    // The shared value; `Rc` identity doubles as an equality fast path.
    pub (crate) k : Rc<T>
}
44
45impl<T : PartialEq> PartialEq for K<T> {
46    fn eq(&self, other: &Self) -> bool {
47        if Rc::as_ptr(&self.k) == Rc::as_ptr(&other.k) { return true; }
48        self.k == other.k
49    }
50}
51impl<T : Eq> Eq for K<T> { }
52
#[derive(Debug, Clone)]
/// A `Regex` bundled with shared match-result caches (see [`RegexCacher::new`]).
pub struct RegexCacher {
    // The wrapped regex.
    r : Regex,
    // Match results keyed by interned string object (pointer-equality fast path via `K`).
    // `Rc<RefCell<...>>` so clones of this cacher (and pool-mates) share one cache.
    cache : Rc<RefCell<HashMap<K<String>, bool>>>,
    // Match results keyed by interned string ID (see `is_match_interned`).
    cache2 : Rc<RefCell<HashMap<u32, bool>>>,
}
59
60impl RegexCacher {
61    /// Wrap a Regex with a fast-path cache that checks if a given known input is already known.
62    ///
63    /// This is faster than whatever caching Regex does internally because of string interning.
64    #[allow(unused)]
65    pub fn new(r : Regex) -> RegexCacher
66    {
67        let cache = Rc::new(RefCell::new(HashMap::default()));
68        let cache2 = Rc::new(RefCell::new(HashMap::default()));
69        RegexCacher { r, cache, cache2 }
70    }
71    pub (crate) fn new_with_pool(r : Regex, cache_pool : &mut HashMap<String, (Rc<RefCell<HashMap<K<String>, bool>>>, Rc<RefCell<HashMap<u32, bool>>>)>) -> RegexCacher
72    {
73        let s = r.as_str().to_string();
74        let mut cache = Rc::new(RefCell::new(HashMap::default()));
75        let mut cache2 = Rc::new(RefCell::new(HashMap::default()));
76        if let Some(cached) = cache_pool.get(&s)
77        {
78            cache = Rc::clone(&cached.0);
79            cache2 = Rc::clone(&cached.1);
80        }
81        else
82        {
83            cache_pool.insert(s.clone(), (cache.clone(), cache2.clone()));
84        }
85        
86        RegexCacher { r, cache, cache2 }
87    }
88    /// Does the regex match the string?
89    #[inline(never)]
90    pub fn is_match(&self, s : &Rc<String>) -> bool
91    {
92        let mut cache = self.cache.borrow_mut();
93        let k = K { k : Rc::clone(s) };
94        if let Some(result) = cache.get(&k) { return *result; }
95        let ret = regex_is_match(&self.r, &*s);
96        cache.insert(k, ret);
97        ret
98    }
99    /// Does the regex match the string based on its interned ID? See also: [`Grammar::string_cache_inv`]
100    #[inline(never)]
101    pub fn is_match_interned(&self, i : u32, string_cache_inv : &Vec<Rc<String>>) -> bool
102    {
103        let mut cache = self.cache2.borrow_mut();
104        if let Some(result) = cache.get(&i) { return *result; }
105        let s = &string_cache_inv[i as usize];
106        let ret = regex_is_match(&self.r, &*s);
107        cache.insert(i, ret);
108        ret
109    }
110}
111
#[derive(Default)]
/// Produced by [`bnf_to_grammar`].
///
/// Next step: [`tokenize`].
pub struct Grammar {
    /// Arena for all grammar points in the grammar.
    pub points: Vec<GrammarPoint>,
    /// Lookup table for grammar point IDs (indexes in the arena) by their name.
    pub by_name: HashMap<String, usize>,
    
    /// Every distinct literal terminal seen in the grammar, sorted.
    pub (crate) literals: Vec<String>,
    /// Tokenizer regexes: (start-anchored regex for tokenization, cached
    /// full-match regex for parsing) — see `grammar_convert`.
    pub (crate) regexes: Vec<(Regex, RegexCacher)>,
    
    /// String interning cache: from string to interned ID.
    pub string_cache : HashMap<String, u32>,
    /// Inverse string interning cache. Index = string ID. The given `Rc<String>` is the canonical object for that interned string.
    pub string_cache_inv : Vec<Rc<String>>,
    
    /// (open, close) pairs declared under `__BRACKET_PAIRS`.
    pub (crate) bracket_pairs : Vec<(String, String)>,
    /// Comment opener strings declared under `__COMMENTS`.
    pub (crate) comments : Vec<String>,
    /// (open, close) comment delimiters declared under `__COMMENT_PAIRS`.
    pub (crate) comment_pairs : Vec<(String, String)>,
    /// Like `comment_pairs` but from `__COMMENT_PAIRS_NESTED` (nestable).
    pub (crate) comment_pairs_nested : Vec<(String, String)>,
    /// `\A`-anchored regexes declared under `__COMMENT_REGEXES`.
    pub (crate) comment_regexes : Vec<Regex>,
    /// Single longest-first regex over `__RESERVED_WORDS`, if any were declared.
    pub (crate) reserved : Option<Regex>,
}
137
#[derive(Debug, Clone)]
/// A grammar rule.
/// 
/// More specifically, every production/alternation associated with a given name, in order. Alternations are stored and tested in the same order as written in the grammar.
pub struct GrammarPoint {
    /// Name of the grammar point (LHS).
    pub name: Rc<String>,
    /// Interned string ID of the name (an index into [`Grammar::string_cache_inv`],
    /// NOT into [`Grammar::points`] — it is assigned by `string_cache_lookup`
    /// in `grammar_convert`, so it generally differs from the arena index).
    pub name_id: u32,
    /// List of productions/alternations under this grammar point's LHS
    pub forms: Vec<Alternation>,
    /// Error-recovery regex from a `@recover`/`@recover_before` directive;
    /// the bool is true for `@recover`, false for `@recover_before`.
    pub (crate) recover: Option<(RegexCacher, bool)>,
}
151
#[derive(Debug, Clone)]
/// A particular production of a grammar rule.
///
/// Named "Alternation" because they're tested in order.
pub struct Alternation {
    /// List of terms, in order, for this alternation.
    pub matching_terms: Vec<MatchingTerm>,
    /// Does this alternation want terminals to be added to it, or dropped?
    /// Set to true by the `$pruned` directive in the grammar.
    pub (crate) pruned: bool,
}
162
#[derive(Debug, Clone)]
/// AST post-processing directives. Each variant corresponds to a `$…` keyword
/// in a grammar alternation (`$any`, `$become`, `$become_as`, `$hoist`,
/// `$hoist_unit`, `$drop`, `$drop_empty`, `$rename`); see `grammar_convert`
/// for the mapping.
pub(crate) enum MatchDirective {
    Any, Become, BecomeAs, Hoist, HoistIfUnit, Drop, DropIfEmpty, Rename,
}
167
#[derive(Debug, Clone)]
/// One term of an [`Alternation`].
///
/// Intentionally opaque for API stability reasons. Don't worry, it's just a single enum internally.
pub struct MatchingTerm { pub(crate) t : MatchingTermE }
171
#[derive(Debug, Clone)]
#[non_exhaustive]
pub(crate) enum MatchingTermE {
    /// Reference to another rule, by index into [`Grammar::points`].
    Rule(usize),
    /// Literal terminal, as an interned string ID.
    TermLit(u32),
    /// Regex terminal (cache shared per pattern via `new_with_pool`).
    TermRegex(RegexCacher),
    /// AST post-processing directive (`$become`, `$hoist`, ...).
    Directive(MatchDirective),
    /// `!hook(name)` marker.
    Hook(Rc<String>),
    /// Placeholder emitted for `@auto`; rewritten by `grammar_convert` into a
    /// `Peek`/`PeekR` plus a `Directive(Any)` before the grammar is returned.
    _AutoTemp,
    
    /// `@eof` marker.
    Eof,
    /// `@peek(distance, "literal")`: lookahead against an interned literal.
    Peek(isize, u32),
    /// `@peekr(distance, regex)`: lookahead against a regex.
    PeekR(isize, RegexCacher),
    /// `@peekres(distance, regex)`: variant of `PeekR` built from `@peekres`.
    PeekRes(isize, RegexCacher),
    /// `@guard(name)` marker.
    Guard(Rc<String>),
}
188impl MatchingTermE { pub(crate) fn to(self) -> MatchingTerm { MatchingTerm { t : self } } }
189
/// Look up a string in the string interning cache.
///
/// Returns the canonical `Rc<String>` and its interned ID, inserting `s`
/// (and assigning it the next ID) on first sight.
pub fn string_cache_lookup(
    string_cache : &mut HashMap<String, u32>,
    string_cache_inv : &mut Vec<Rc<String>>,
    s : &str) -> (Rc<String>, u32)
{
    match string_cache.get(s)
    {
        // Already interned: hand back the canonical Rc and its ID.
        Some(&id) => (Rc::clone(&string_cache_inv[id as usize]), id),
        None =>
        {
            // New string: the next ID is the current table length.
            let id : u32 = string_cache_inv.len().try_into().unwrap();
            let rc = Rc::new(s.to_string());
            string_cache.insert(s.to_string(), id);
            string_cache_inv.push(Rc::clone(&rc));
            (rc, id)
        }
    }
}
/// Look up a string in the string interning cache, but skip the `Rc::clone()` overhead because we only need the ID.
pub fn string_cache_lookup_id(
    string_cache : &mut HashMap<String, u32>,
    string_cache_inv : &mut Vec<Rc<String>>,
    s : &str) -> u32
{
    // Fast path: already interned.
    if let Some(&id) = string_cache.get(s) { return id; }
    // New string: assign the next ID and record it in both directions.
    let id : u32 = string_cache_inv.len().try_into().unwrap();
    string_cache.insert(s.to_string(), id);
    string_cache_inv.push(Rc::new(s.to_string()));
    id
}
222
223pub (crate) fn bnf_parse(input: &str) -> Result<Vec<(String, Vec<Vec<String>>)>, String>
224{
225    let mut rules = Vec::new();
226    
227    let mut metalist = Vec::new();
228    let mut current = Vec::new();
229    
230    let mut name : Option<String> = None;
231    let mut found_separator = false;
232    
233    let lines = input.lines().map(|x| x.to_string()).collect::<Vec<_>>();
234    let mut lines2 : Vec<String> = vec!();
235    for l in lines
236    {
237        if let Some(l0) = lines2.last_mut()
238        {
239            if l0.ends_with("\\")
240            {
241                *l0 = format!("{}{l}", &l0[..l0.len()-1]);
242                continue;
243            }
244        }
245        
246        lines2.push(l);
247    }
248    for (mut linenum, rest) in lines2.iter().enumerate()
249    {
250        let mut rest : &str = rest;
251        linenum += 1; // user-facing line numbers are 1-indexed
252        
253        let _split = rest.trim().split_whitespace().collect::<Vec<_>>();
254        
255        if _split.get(1) == Some(&"::=") // ::= is only allowed as the second token on a line and must be space-separated
256        {
257            if name.is_some()
258            {
259                metalist.push(current);
260                rules.push((name.unwrap(), metalist));
261            }
262            
263            name = None;
264            found_separator = false;
265            metalist = Vec::new();
266            current = Vec::new();
267        }
268        
269        while !rest.is_empty()
270        {
271            // skip extra whitespace
272            if get_char_at_byte(rest, 0).is_whitespace()
273            {
274                rest = rest.trim_start();
275            }
276            // comment
277            else if rest.starts_with("#")
278            {
279                break;
280            }
281            // literal
282            else if rest.starts_with("\"")
283            {
284                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
285                let mut len = 1;
286                let mut in_escape = false;
287                let mut found_exit = false;
288                while len < rest.len()
289                {
290                    let c = get_char_at_byte(rest, len);
291                    len += c.len_utf8();
292                    if !in_escape && c == '\\'
293                    {
294                        in_escape = true;
295                        continue;
296                    }
297                    if !in_escape && c == '"' { found_exit = true; break; }
298                    in_escape = false;
299                }
300                if !found_exit || len == 2 { return Err(format!("Broken literal text rule on line {linenum}")); }
301                current.push(rest[..len].to_string());
302                rest = &rest[len..];
303            }
304            // regex
305            else if rest.starts_with("r`") || rest.starts_with("R`") || rest.starts_with("A`")
306            {
307                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
308                let end = rest[2..].find("`r").expect(&format!("Unterminated regex on line {linenum}"));
309                let len = end + 4;
310                current.push(rest[..len].to_string());
311                rest = &rest[len..];
312            }
313            // split
314            else if rest.starts_with("::=")
315            {
316                if found_separator { return Err(format!("Unexpected ::= on line {linenum}")); }
317                if name.is_none() { return Err(format!("missing name on line {linenum}")); }
318                found_separator = true;
319                rest = &rest[3..];
320            }
321            // operators
322            else if  rest.starts_with("(") || rest.starts_with(")") || rest.starts_with(",")
323            {
324                current.push(rest[0..1].to_string());
325                rest = &rest[1..];
326            }
327            else if rest.starts_with("|")
328            {
329                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
330                metalist.push(current);
331                current = vec!();
332                rest = &rest[1..];
333            }
334            // name
335            else
336            {
337                let mut end = rest.len();
338                for (i, ch) in rest.char_indices()
339                {
340                    if ch.is_whitespace() || ch == '|' || ch == '(' || ch == ')' || ch == ',' || ch == '"' || ch == '#'
341                        || rest[i..].starts_with("::=") || rest[i..].starts_with("r`")
342                    {
343                        end = i;
344                        break;
345                    }
346                }
347                if name.is_none()
348                {
349                    name = Some(rest[..end].to_string());
350                }
351                else
352                {
353                    if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
354                    current.push(rest[..end].to_string());
355                }
356                rest = &rest[end..];
357            }
358        }
359    }
360    
361    if name.is_some()
362    {
363        metalist.push(current);
364        rules.push((name.unwrap(), metalist));
365    }
366    
367    Ok(rules)
368}
369
/// Convert the raw rule list from [`bnf_parse`] into a [`Grammar`]:
/// assign rule indices, intern strings, compile regexes (with shared caches),
/// process `__BRACKET_PAIRS`/`__COMMENT*`/`__RESERVED_WORDS` pseudo-rules,
/// and translate `@...`/`$...`/`!hook` keywords into [`MatchingTermE`] terms.
pub (crate) fn grammar_convert(input: &Vec<(String, Vec<Vec<String>>)>) -> Result<Grammar, String>
{
    // First pass: assign each real (non-pseudo) rule an index by declaration order.
    let mut by_name = HashMap::default();
    for (name, _) in input.iter()
    {
        if matches!(&**name, "__BRACKET_PAIRS" | "__COMMENT_PAIRS" | "__COMMENT_PAIRS_NESTED" | "__COMMENT_REGEXES" | "__COMMENTS" | "__RESERVED_WORDS") { continue; }
        if by_name.insert(name.clone(), by_name.len()).is_some()
        {
            return Err(format!("Duplicate rule {name}; use alternations (e.g. x ::= a | b), not additional definitions (like x ::= a [...] x ::= b)"));
        }
    }
    
    let mut string_cache = HashMap::default();
    let mut string_cache_inv = Vec::new();
    let mut points = Vec::new();
    let mut literals = HashSet::default();
    let mut lex_regexes = HashMap::default();
    
    let mut bracket_pairs = Vec::new();
    let mut comment_pairs = Vec::new();
    let mut comment_pairs_nested = Vec::new();
    let mut comment_regexes = Vec::new();
    let mut comments = Vec::new();
    
    // Shared cache pool so identical regex patterns share one RegexCacher cache.
    let mut cache_pool = HashMap::<String, _>::default();
    
    let mut reserved = None;
    // Second pass: translate each rule (or pseudo-rule) into its final form.
    for (name, raw_forms) in input.iter()
    {
        if name == "__RESERVED_WORDS"
        {
            let mut set = Vec::new();
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                set.push(s.clone());
            }
            reserved = Some(build_literal_regex(&set, true));
            continue;
        }
        if name == "__BRACKET_PAIRS" || name == "__COMMENT_PAIRS" || name == "__COMMENT_PAIRS_NESTED"
        {
            for raw_alt in raw_forms
            {
                match (raw_alt.get(0), raw_alt.get(1))
                {
                    (Some(l), Some(r)) =>
                    {
                        if name == "__BRACKET_PAIRS" { bracket_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS" { comment_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS_NESTED" { comment_pairs_nested.push((l.clone(), r.clone())); }
                    }
                    _ => Err(format!("Alternations of __BRACKET_PAIRS must all contain two bare string items"))?
                }
            }
            continue;
        }
        if name == "__COMMENT_REGEXES" || name == "__COMMENTS"
        {
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                if name == "__COMMENT_REGEXES" && s.starts_with("r`") && s.ends_with("`r") && s.len() >= 4
                {
                    let pattern = &s[2..s.len() - 2];
                    let pattern = format!("\\A{pattern}");
                    let re = new_regex(&pattern).map_err(|e| format!("Invalid regex '{}': {}", pattern, e))?;
                    comment_regexes.push(re);
                }
                if name == "__COMMENTS" && s.starts_with("\"") && s.ends_with("\"") && s.len() >= 3
                {
                    let pattern = &s[1..s.len() - 1];
                    comments.push(pattern.to_string());
                }
            }
            continue;
        }
        // Normal rule: its arena index was assigned in the first pass.
        let index = *by_name.get(name).unwrap();
        
        let mut forms = Vec::new();
        let mut recover = None;
        
        for raw_alt in raw_forms
        {
            //println!("{:?}", raw_alt);
            let mut matching_terms = Vec::new();
            let mut pruned = false;
            
            // `i` walks the raw token list; multi-token constructs (@peek, @guard,
            // !hook, @recover) consume extra tokens and advance it themselves.
            let mut i = 0;
            while i < raw_alt.len()
            {
                let term_str = &raw_alt[i];
                i += 1;
                
                // "..." literal terminal (with \" \\ \n unescaped, then interned).
                if term_str.starts_with('"') && term_str.ends_with('"') && term_str.len() >= 2
                {
                    let literal = term_str[1..term_str.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                    matching_terms.push(MatchingTermE::TermLit(string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal)).to());
                    
                    literals.insert(literal.clone());
                    continue;
                }
                // r`...`r regex terminal: also registered with the tokenizer.
                if term_str.starts_with("r`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z"); // full match (for parsing)
                    let pattern = format!("\\A(?:{pattern})"); // at start (for tokenization)
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    let re2 = RegexCacher::new_with_pool(re2, &mut cache_pool);
                    lex_regexes.insert(pattern, re2.clone());
                    matching_terms.push(MatchingTermE::TermRegex(re2).to());
                    continue;
                }
                // non-tokenizing regex (optimization: hide redundant regexes from the tokenizer)
                if term_str.starts_with("R`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z"); // full match (for parsing)
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // A`...`r regex: anchored at start only (no \z), also non-tokenizing.
                if term_str.starts_with("A`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // @recover / @recover_before: consumes the following regex token.
                if matches!(&**term_str, "@RECOVER" | "@recover" | "@RECOVER_BEFORE" | "@recover_before") && i < raw_alt.len()
                {
                    let pattern = &raw_alt[i];
                    if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                        || !pattern.ends_with("`r")
                    {
                        Err(format!("@recover guards only accept regex strings"))?
                    }
                    let no_z = pattern.starts_with("A`");
                    let pattern = &pattern[2..pattern.len() - 2];
                    let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                    if no_z { pattern_all = format!("\\A(?:{})", pattern); }
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    // TODO: make regex cachers use interior mutability and share the cache
                    if recover.is_some() { Err(format!("Rule {name} has multiple @recover items. Only one is supported."))? }
                    // The bool records whether this was plain @recover (vs @recover_before).
                    recover = Some((RegexCacher::new_with_pool(re2, &mut cache_pool), matches!(&**term_str, "@RECOVER" | "@recover")));
                    i += 1;
                    continue;
                }
                if matches!(&**term_str, "@EOF" | "@eof")
                {
                    matching_terms.push(MatchingTermE::Eof.to());
                    continue;
                }
                // @auto: placeholder, rewritten after the alternation is complete (below).
                if matches!(&**term_str, "@AUTO" | "@auto")
                {
                    matching_terms.push(MatchingTermE::_AutoTemp.to());
                    continue;
                }
                // @peek / @peekr / @peekres: consumes the 5 tokens `( num , arg )`.
                if (term_str == "@PEEK" || term_str == "@peek"
                    || term_str == "@PEEKR" || term_str == "@peekr"
                    || term_str == "@PEEKRES" || term_str == "@peekres") && i + 4 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != "," || raw_alt[i+4] != ")" { Err(format!("Invalid peek syntax: must be @peek(num, str)"))? }
                    let n = raw_alt[i+1].parse::<isize>().map_err(|_| format!("Not a supported peek distance: {}", raw_alt[i+1]))?;
                    if term_str == "@PEEK" || term_str == "@peek"
                    {
                        let literal = &raw_alt[i+3];
                        if literal.len() < 2 || !literal.starts_with("\"") || !literal.ends_with("\"")
                        {
                            return Err(format!("@peek guards only accept plain strings"));
                        }
                        let literal = literal[1..literal.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                        let s = string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal);
                        matching_terms.push(MatchingTermE::Peek(n, s).to());
                    }
                    else
                    {
                        let pattern = &raw_alt[i+3];
                        if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                            || !pattern.ends_with("`r")
                        {
                            Err(format!("@recover guards only accept regex strings"))?
                        }
                        let no_z = pattern.starts_with("A`");
                        let pattern = &pattern[2..pattern.len() - 2];
                        let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                        if no_z { pattern_all = format!("\\A(?:{})", pattern); }
                        
                        let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                        // TODO: make regex cachers use interior mutability and share the cache
                        if term_str == "@PEEKRES" || term_str == "@peekres"
                        {
                            matching_terms.push(MatchingTermE::PeekRes(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                        else
                        {
                            matching_terms.push(MatchingTermE::PeekR(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                    }
                    i += 5;
                    continue;
                }
                // @guard(name): consumes the 3 tokens `( name )`.
                if (term_str == "@GUARD" || term_str == "@guard") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid guard syntax: must be @guard(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Guard(s).to());
                    i += 3;
                    continue;
                }
                // !hook(name): consumes the 3 tokens `( name )`.
                if (term_str == "!HOOK" || term_str == "!hook") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid hook syntax: must be @hook(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Hook(s).to());
                    i += 3;
                    continue;
                }
                // $... directives map one-to-one onto MatchDirective variants.
                if matches!(&**term_str, "$BECOME" | "$become")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Become).to());
                    continue;
                }
                if matches!(&**term_str, "$BECOME_AS" | "$become_as")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::BecomeAs).to());
                    continue;
                }
                if matches!(&**term_str, "$HOIST" | "$hoist")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Hoist).to());
                    continue;
                }
                if matches!(&**term_str, "$HOIST_UNIT" | "$hoist_unit")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::HoistIfUnit).to());
                    continue;
                }
                if matches!(&**term_str, "$DROP" | "$drop")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Drop).to());
                    continue;
                }
                if matches!(&**term_str, "$DROP_EMPTY" | "$drop_empty")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::DropIfEmpty).to());
                    continue;
                }
                if matches!(&**term_str, "$RENAME" | "$rename")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Rename).to());
                    continue;
                }
                if matches!(&**term_str, "$ANY" | "$any")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Any).to());
                    continue;
                }
                if matches!(&**term_str, "$PRUNED" | "$pruned")
                {
                    pruned = true;
                    continue;
                }
                // Anything else must be a reference to another rule.
                let id = by_name.get(term_str).ok_or_else(|| format!("Not a defined grammar rule: '{term_str}' (context: '{name}')"))?;
                matching_terms.push(MatchingTermE::Rule(*id).to());
            }
            if matching_terms.len() > 60000
            {
                Err(format!("More than 60k items in an alternation of {name}. Factor them out, dummy!"))?
            }
            // Rewrite a leading @auto into a peek on the following terminal plus $any.
            if let Some(MatchingTermE::_AutoTemp) = matching_terms.get(0).map(|x| &x.t)
            {
                match matching_terms.get(1).map(|x| &x.t)
                {
                    Some(MatchingTermE::TermLit(s)) =>
                    {
                        matching_terms[0] = MatchingTermE::Peek(0, *s).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    Some(MatchingTermE::TermRegex(r)) =>
                    {
                        let r = r.clone();
                        matching_terms[0] = MatchingTermE::PeekR(0, r).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    _ => Err(format!("@auto must be followed by a string literal or regex literal (context: {name})"))?
                }
            }
            // TODO: check for illegally-placed become, becomeas, hoist
            forms.push(Alternation { matching_terms, pruned });
        }
        if forms.len() > 60000
        {
            Err(format!("More than 60k alternations in {name}. Factor them out, dummy!"))?
        }
        // Warn if any alternation comes after a non-guarded one: alternations are
        // tried in order, so everything after the first unguarded form is unreachable.
        let mut num_nonguards = 0;
        for f in &forms
        {
            if num_nonguards != 0
            {
                eprintln!("!!!!!!\n!!!!!! Warning: rule {name} has at least one alternation that is inaccessible!\n!!!!!!");
                break;
            }
            if !(if let Some(x) = f.matching_terms.get(0) // early 2026: working around not-yet-supported syntax
                && matches!(x.t, MatchingTermE::Peek(_, _) | MatchingTermE::PeekR(_, _) | MatchingTermE::Guard(_) | MatchingTermE::Eof) { true } else { false })
            { num_nonguards += 1; }
        }
        
        if index > 4000000000
        {
            Err(format!("More than 4 billion grammar terms in grammar. What are you doing??? STOP!!!!! (╯°□°)╯︵ ┻━┻"))?
        }
        // NOTE: name_id is the interned-string ID, not the arena index.
        let (name, name_id) = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &name);
        points.push(GrammarPoint
        {
            name,
            name_id,
            //id: index as u32,
            forms,
            recover,
        });
    }
    
    let mut literals = literals.into_iter().collect::<Vec<_>>();
    literals.sort();
    
    // Recompile the tokenizer-side (start-anchored) regexes, pairing each with
    // the cached full-match regex built above.
    let mut regexes = Vec::new();
    for (r, r2) in lex_regexes
    {
        regexes.push((new_regex(&r).map_err(|e| format!("Invalid regex '{}': {}", r, e))?, r2));
    }
    Ok(Grammar { points, by_name, literals, regexes, string_cache, string_cache_inv, bracket_pairs, comments, comment_pairs, comment_regexes, reserved, comment_pairs_nested })
}
710
711/// Turns a BNF string into a [`Grammar`]. See the comments at [the crate root](super) for syntax notes. The basic parts are standard BNF.
712///
713/// Next step: [`tokenize`].
714pub fn bnf_to_grammar(s : &str) -> Result<Grammar, String>
715{
716    grammar_convert(&bnf_parse(s)?)
717}
718
#[derive(Debug, Clone, Default)]
/// Produced by [`tokenize`].
///
/// Next step: [`ast::parse`](`super::ast::parse`).
pub struct Token {
    /// Interned string ID, see [`Grammar::string_cache_inv`]
    pub text : u32,
    /// For bracket pairs: how far away, in which direction, is the paired bracket? In terms of tokens.
    /// (NOTE(review): presumably 0 for non-bracket tokens, matching the `Default` derive — confirm in `tokenize`.)
    pub pair : isize,
}
729
730// Sort literals from grammar by length and combine them into a single match-longest regex.
731pub (crate) fn build_literal_regex(literals : &Vec<String>, terminated : bool) -> Regex
732{
733    let mut text_token_regex_s = "\\A(?:".to_string();
734    
735    let mut lits = literals.clone();
736    lits.sort_by(|a, b| b.len().cmp(&a.len()));
737    for text in lits.iter()
738    {
739        let s2 = regex::escape(&text);
740        text_token_regex_s += &s2;
741        text_token_regex_s += "|";
742    }
743    if lits.len() > 0 { text_token_regex_s.pop(); }
744    text_token_regex_s += ")";
745    if terminated { text_token_regex_s += "\\z"; }
746    let text_token_regex = new_regex(&text_token_regex_s).unwrap();
747    text_token_regex
748}
749
/// Tokenizer error state. Returned in the `Err` arm of [`tokenize`] when
/// scanning or bracket pairing fails.
#[derive(Clone, Debug)]
pub struct TokError {
    /// Human-readable error message. Format is not guaranteed and may change arbitrarily.
    #[allow(unused)] pub err_message : String,
    /// Number of tokens produced so far.
    #[allow(unused)] pub produced : usize,
    /// At what byte index in the input string did tokenization fail to continue?
    #[allow(unused)] pub location : usize,
    /// Is `Some` if tokenization failed because of failing to pair brackets:
    ///
    /// - `String`: responsible token text
    /// - `bool`: openness
    ///
    /// If the bool is true, then the failed pairing is "open", like the string `()(`. Otherwise, it's "closed", like `())`.
    ///
    /// The responsible token is the token where the error was noticed, not the token that's "missing". E.g. for `())` it's `")"`.
    ///
    /// It is currently unspecified whether non-nested overlaps like `( { ) }` trigger an error.
    ///
    /// KNOWN BUG: "open" failed pairings like `()(` do not currently produce errors. They will become errors in a future version.
    #[allow(unused)] pub pairing_error : Option<(String, bool)>,
}
773
774/// Scans the given string and produces a stream of [`Token`]s.
775///
776/// Next step: [`ast::parse`](`super::ast::parse`).
777///
778/// The scanner performs maximal munch between all string literals and r``r terminals in the grammar. It also skips whitespace, and comments (as defined in the grammar). The produced tokens use string interning and can be bracket-paired. See [the crate root](super) for more details.
779pub fn tokenize(
780    g : &mut Grammar,
781    mut s : &str
782) -> Result<Vec<Token>, TokError>
783{
784    let s_orig = s;
785    let mut tokens = Vec::<Token>::new();
786    tokens.reserve(s.len()/16 + 1);
787    
788    let mut lit_filtered = Vec::new();
789    for l in g.literals.iter()
790    {
791        let mut covered = false;
792        for r in &g.regexes
793        {
794            if let Some(loc) = regex_find(&r.0, l).map(|x| x.end() - x.start())
795            {
796                if loc == l.len()
797                {
798                    covered = true;
799                    break;
800                }
801            }
802        }
803        if !covered
804        {
805            lit_filtered.push(l.clone());
806        }
807    }
808    let all_literals_regex = if lit_filtered.len() > 0
809    {
810        Some(build_literal_regex(&lit_filtered, false))
811    }
812    else
813    {
814        None
815    };
816    //println!("{}", all_literals_regex);
817    
818    for text in g.literals.iter()
819    {
820        string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, text);
821    }
822    for point in g.points.iter()
823    {
824        string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &point.name);
825    }
826    
827    let mut openers = HashSet::default();
828    let mut closers = HashMap::default();
829    let mut stacks = HashMap::default();
830    let mut any_paired = false;
831    for (l, r) in &g.bracket_pairs
832    {
833        let lsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &l);
834        let rsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &r);
835        openers.insert(lsc);
836        closers.insert(rsc, lsc);
837        stacks.insert(lsc, Vec::<usize>::new());
838        any_paired = true;
839    }
840    
841    /*
842    println!("num regexes to check: {}", g.regexes.len());
843    for r in &g.regexes
844    {
845        println!("{:?}", r.as_str());
846    }
847    */
848    
849    'top: while !s.is_empty()
850    {
851        if get_char_at_byte(s, 0) as u32 <= 0x20
852        {
853            if matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
854            {
855                while !s.is_empty() && matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
856                {
857                    s = &s[1..]; // ascii whitespace is always 1 byte long
858                }
859                if s.is_empty() { break; }
860                continue 'top;
861            }
862        }
863        
864        // Pure regex comments
865        for re in &g.comment_regexes
866        {
867            if let Some(x) = regex_find(re, s)
868            {
869                //s = &s[x.len()..];
870                s = &s[x.end() - x.start()..];
871                continue 'top;
872            }
873        }
874        // Comments with escapable newline handling
875        for c in &g.comments
876        {
877            if s.starts_with(c)
878            {
879                s = &s[c.len()..];
880                while s.len() > 0 && get_char_at_byte(s, 0) != '\n'
881                {
882                    if get_char_at_byte(s, 0) == '\\'
883                    {
884                        s = &s[get_char_at_byte(s, 0).len_utf8()..]; // extra skip
885                    }
886                    if s.len() > 0
887                    {
888                        s = &s[get_char_at_byte(s, 0).len_utf8()..];
889                    }
890                }
891                continue 'top;
892            }
893        }
894        // Pair comments with nesting
895        for (l, r) in &g.comment_pairs_nested
896        {
897            if s.starts_with(l)
898            {
899                s = &s[l.len()..];
900                let mut nest = 1;
901                while s.len() > 0 && nest > 0
902                {
903                    if s.starts_with(l) { nest += 1; }
904                    s = &s[get_char_at_byte(s, 0).len_utf8()..];
905                    if s.starts_with(r) { nest -= 1; }
906                }
907                if s.starts_with(r) { s = &s[r.len()..]; }
908                continue 'top;
909            }
910        }
911        // Pair comments without nesting
912        for (l, r) in &g.comment_pairs
913        {
914            if s.starts_with(l)
915            {
916                s = &s[l.len()..];
917                while s.len() > 0 && !s.starts_with(r)
918                {
919                    s = &s[get_char_at_byte(s, 0).len_utf8()..];
920                }
921                s = &s[r.len()..];
922                continue 'top;
923            }
924        }
925        // Maximal munch: Regex pass
926        let mut longest = 0;
927        //let mut found_regex = None;
928        for r in &g.regexes
929        {
930            if let Some(loc) = regex_find(&r.0, s)
931            {
932                let len = loc.end() - loc.start();
933                //found_regex = Some(&r.1);
934                longest = longest.max(len);
935            }
936        }
937        // Literals pass.
938        if let Some(all_literals_regex) = &all_literals_regex && let Some(loc) = regex_find(&all_literals_regex, s)
939        {
940            let len = loc.end() - loc.start();
941            //found_regex = None;
942            longest = longest.max(len);
943        }
944        if longest == 0
945        {
946            let sn : String = s.chars().take(5).collect();
947            return Err(TokError {
948                err_message: format!("Failed to tokenize at index {}:{}[...]", s_orig.len()-s.len(), sn),
949                produced : tokens.len(),
950                location : s_orig.len() - s.len(),
951                pairing_error : None,
952            });
953        }
954        
955        //let text_info = string_cache_lookup(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
956        //let text = text_info.1;
957        let text = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
958        let mut token = Token { text, pair : 0 };
959        
960        /*
961        if let Some(r2) = found_regex
962        {
963            r2.cache2.borrow_mut().insert(text, true);
964            r2.cache.borrow_mut().insert(text_info.0, true);
965        }
966        */
967        
968        if any_paired
969        {
970            if openers.contains(&text) && let Some(s) = stacks.get_mut(&text)
971            {
972                s.push(tokens.len());
973            }
974            if let Some(l) = closers.get(&text) && let Some(stack) = stacks.get_mut(l)
975            {
976                let n = match stack.pop() {
977                    Some(n) => n,
978                    None => Err(TokError {
979                        err_message: format!("Unmatched delimiter at {}: {}", s_orig.len() - s.len(), g.string_cache_inv.get(text as usize).unwrap()),
980                        produced : tokens.len(),
981                        location : s_orig.len() - s.len(),
982                        pairing_error : Some((s[..longest].to_string(), false)),
983                    })?,
984                };
985                //.ok_or_else(|| )?;
986                
987                let me = tokens.len();
988                let diff = match me.checked_signed_diff(n) {
989                    Some(n) => n,
990                    None => Err(TokError {
991                        err_message: format!("Input too long"),
992                        produced : tokens.len(),
993                        location : s_orig.len() - s.len(),
994                        pairing_error : None,
995                    })?,
996                };
997                token.pair = -diff;
998                tokens[n].pair = diff;
999            }
1000        }
1001        
1002        s = &s[longest..];
1003        tokens.push(token);
1004    }
1005    Ok(tokens)
1006}