1type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;
2type HashSet<K> = std::collections::HashSet::<K, crate::HashBuilder>;
3use std::rc::Rc;
4use std::cell::RefCell;
5use regex::Regex as Regex;
/// Returns the character that begins at byte offset `i` of `s`.
///
/// Panics if `i` is past the end of `s` or not on a UTF-8 boundary.
pub (crate) fn get_char_at_byte(s : &str, i : usize) -> char
{
    let tail = &s[i..];
    let mut chars = tail.chars();
    chars.next().unwrap()
}
15pub fn new_regex(s : &str) -> Result<Regex, regex::Error>
17{
19 Regex::new(s)
20 }
23
24pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::Match<'a>>
25{
28 r.find(s)
31}
32pub (crate) fn regex_is_match<'a>(r : &Regex, s : &'a str) -> bool
33{
34 r.is_match(s)
37}
38
39
/// Shared-ownership hash-map key: wraps an `Rc<T>` so cached lookups can
/// compare by pointer identity first before falling back to value equality
/// (see the `PartialEq` impl below). `Hash` forwards to `T` through the `Rc`.
#[derive(Debug, Clone, Hash)]
pub (crate) struct K<T> {
    // Shared handle to the key value.
    pub (crate) k : Rc<T>
}
44
45impl<T : PartialEq> PartialEq for K<T> {
46 fn eq(&self, other: &Self) -> bool {
47 if Rc::as_ptr(&self.k) == Rc::as_ptr(&other.k) { return true; }
48 self.k == other.k
49 }
50}
51impl<T : Eq> Eq for K<T> { }
52
/// A compiled regex bundled with shared memoization caches for match results.
#[derive(Debug, Clone)]
pub struct RegexCacher {
    // The compiled pattern being cached.
    r : Regex,
    // Memoized `is_match` results keyed by the matched string.
    cache : Rc<RefCell<HashMap<K<String>, bool>>>,
    // Memoized `is_match` results keyed by interned-string id.
    cache2 : Rc<RefCell<HashMap<u32, bool>>>,
}
59
60impl RegexCacher {
61 #[allow(unused)]
65 pub fn new(r : Regex) -> RegexCacher
66 {
67 let cache = Rc::new(RefCell::new(HashMap::default()));
68 let cache2 = Rc::new(RefCell::new(HashMap::default()));
69 RegexCacher { r, cache, cache2 }
70 }
71 pub (crate) fn new_with_pool(r : Regex, cache_pool : &mut HashMap<String, (Rc<RefCell<HashMap<K<String>, bool>>>, Rc<RefCell<HashMap<u32, bool>>>)>) -> RegexCacher
72 {
73 let s = r.as_str().to_string();
74 let mut cache = Rc::new(RefCell::new(HashMap::default()));
75 let mut cache2 = Rc::new(RefCell::new(HashMap::default()));
76 if let Some(cached) = cache_pool.get(&s)
77 {
78 cache = Rc::clone(&cached.0);
79 cache2 = Rc::clone(&cached.1);
80 }
81 else
82 {
83 cache_pool.insert(s.clone(), (cache.clone(), cache2.clone()));
84 }
85
86 RegexCacher { r, cache, cache2 }
87 }
88 #[inline(never)]
90 pub fn is_match(&self, s : &Rc<String>) -> bool
91 {
92 let mut cache = self.cache.borrow_mut();
93 let k = K { k : Rc::clone(s) };
94 if let Some(result) = cache.get(&k) { return *result; }
95 let ret = regex_is_match(&self.r, &*s);
96 cache.insert(k, ret);
97 ret
98 }
99 #[inline(never)]
101 pub fn is_match_interned(&self, i : u32, string_cache_inv : &Vec<Rc<String>>) -> bool
102 {
103 let mut cache = self.cache2.borrow_mut();
104 if let Some(result) = cache.get(&i) { return *result; }
105 let s = &string_cache_inv[i as usize];
106 let ret = regex_is_match(&self.r, &*s);
107 cache.insert(i, ret);
108 ret
109 }
110}
111
/// A fully-built grammar: rules, string-interning tables, lexer data, and
/// comment/bracket configuration, as produced by `grammar_convert`.
#[derive(Default)]
pub struct Grammar {
    // All grammar rules; `by_name` maps into this vector.
    pub points: Vec<GrammarPoint>,
    // Rule name -> index into `points`.
    pub by_name: HashMap<String, usize>,

    // Sorted list of every literal terminal appearing in the grammar.
    pub (crate) literals: Vec<String>,
    // Lexer regexes paired with their whole-token match cachers.
    pub (crate) regexes: Vec<(Regex, RegexCacher)>,

    // String interning: text -> id ...
    pub string_cache : HashMap<String, u32>,
    // ... and id -> shared text.
    pub string_cache_inv : Vec<Rc<String>>,

    // Bracket pairs whose tokens get linked during tokenization.
    pub (crate) bracket_pairs : Vec<(String, String)>,
    // Line-comment openers.
    pub (crate) comments : Vec<String>,
    // Block-comment delimiter pairs (non-nesting variant).
    pub (crate) comment_pairs : Vec<(String, String)>,
    // Block-comment delimiter pairs (nesting variant).
    pub (crate) comment_pairs_nested : Vec<(String, String)>,
    // Regex-defined comments.
    pub (crate) comment_regexes : Vec<Regex>,
    // Regex matching reserved words, if `__RESERVED_WORDS` was given.
    pub (crate) reserved : Option<Regex>,
}
137
/// One grammar rule: a named, interned set of alternations.
#[derive(Debug, Clone)]
pub struct GrammarPoint {
    // Interned rule name ...
    pub name: Rc<String>,
    // ... and its interned-string id.
    pub name_id: u32,
    // The rule's alternations.
    pub forms: Vec<Alternation>,
    // Optional error-recovery regex from `@recover`/`@recover_before`;
    // the bool is true for the `@recover` spelling.
    pub (crate) recover: Option<(RegexCacher, bool)>,
}
151
/// One alternation (sequence of terms) of a grammar rule.
#[derive(Debug, Clone)]
pub struct Alternation {
    // The terms, in match order.
    pub matching_terms: Vec<MatchingTerm>,
    // Set by the `$PRUNED` directive.
    pub (crate) pruned: bool,
}
162
/// Node-building directives introduced by `$`-prefixed terms in the grammar
/// (e.g. `$become`, `$hoist`, `$drop`).
#[derive(Debug, Clone)]
pub(crate) enum MatchDirective {
    Any, Become, BecomeAs, Hoist, HoistIfUnit, Drop, DropIfEmpty, Rename,
}
167
/// Newtype wrapper around `MatchingTermE` (see `MatchingTermE::to`).
#[derive(Debug, Clone)]
pub struct MatchingTerm { pub(crate) t : MatchingTermE }
171
/// The kinds of term an alternation can contain.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub(crate) enum MatchingTermE {
    // Reference to another rule, by index into `Grammar::points`.
    Rule(usize),
    // Literal terminal, by interned-string id.
    TermLit(u32),
    // Regex terminal.
    TermRegex(RegexCacher),
    // `$`-directive controlling node construction.
    Directive(MatchDirective),
    // `!hook(name)` marker.
    Hook(Rc<String>),
    // Placeholder for `@auto`; rewritten during grammar conversion.
    _AutoTemp,

    // `@eof` guard.
    Eof,
    // `@peek(dist, literal)` lookahead guard (literal as interned id).
    Peek(isize, u32),
    // `@peekr(dist, regex)` lookahead guard.
    PeekR(isize, RegexCacher),
    // `@peekres(dist, regex)` lookahead guard.
    PeekRes(isize, RegexCacher),
    // `@guard(name)` named guard.
    Guard(Rc<String>),
}
188impl MatchingTermE { pub(crate) fn to(self) -> MatchingTerm { MatchingTerm { t : self } } }
189
190pub fn string_cache_lookup(
192 string_cache : &mut HashMap<String, u32>,
193 string_cache_inv : &mut Vec<Rc<String>>,
194 s : &str) -> (Rc<String>, u32)
195{
196 if let Some(sn) = string_cache.get(s)
197 {
198 return (Rc::clone(&string_cache_inv[*sn as usize]), *sn);
199 }
200 let rc = Rc::new(s.to_string());
201 let n = string_cache_inv.len().try_into().unwrap();
202 string_cache.insert(s.to_string(), n);
203 string_cache_inv.push(Rc::clone(&rc));
204 (rc, n)
205}
206pub fn string_cache_lookup_id(
208 string_cache : &mut HashMap<String, u32>,
209 string_cache_inv : &mut Vec<Rc<String>>,
210 s : &str) -> u32
211{
212 if let Some(sn) = string_cache.get(s)
213 {
214 return *sn;
215 }
216 let rc = Rc::new(s.to_string());
217 let n = string_cache_inv.len().try_into().unwrap();
218 string_cache.insert(s.to_string(), n);
219 string_cache_inv.push(rc);
220 n
221}
222
223pub (crate) fn bnf_parse(input: &str) -> Result<Vec<(String, Vec<Vec<String>>)>, String>
224{
225 let mut rules = Vec::new();
226
227 let mut metalist = Vec::new();
228 let mut current = Vec::new();
229
230 let mut name : Option<String> = None;
231 let mut found_separator = false;
232
233 let lines = input.lines().map(|x| x.to_string()).collect::<Vec<_>>();
234 let mut lines2 : Vec<String> = vec!();
235 for l in lines
236 {
237 if let Some(l0) = lines2.last_mut()
238 {
239 if l0.ends_with("\\")
240 {
241 *l0 = format!("{}{l}", &l0[..l0.len()-1]);
242 continue;
243 }
244 }
245
246 lines2.push(l);
247 }
248 for (mut linenum, rest) in lines2.iter().enumerate()
249 {
250 let mut rest : &str = rest;
251 linenum += 1; let _split = rest.trim().split_whitespace().collect::<Vec<_>>();
254
255 if _split.get(1) == Some(&"::=") {
257 if name.is_some()
258 {
259 metalist.push(current);
260 rules.push((name.unwrap(), metalist));
261 }
262
263 name = None;
264 found_separator = false;
265 metalist = Vec::new();
266 current = Vec::new();
267 }
268
269 while !rest.is_empty()
270 {
271 if get_char_at_byte(rest, 0).is_whitespace()
273 {
274 rest = rest.trim_start();
275 }
276 else if rest.starts_with("#")
278 {
279 break;
280 }
281 else if rest.starts_with("\"")
283 {
284 if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
285 let mut len = 1;
286 let mut in_escape = false;
287 let mut found_exit = false;
288 while len < rest.len()
289 {
290 let c = get_char_at_byte(rest, len);
291 len += c.len_utf8();
292 if !in_escape && c == '\\'
293 {
294 in_escape = true;
295 continue;
296 }
297 if !in_escape && c == '"' { found_exit = true; break; }
298 in_escape = false;
299 }
300 if !found_exit || len == 2 { return Err(format!("Broken literal text rule on line {linenum}")); }
301 current.push(rest[..len].to_string());
302 rest = &rest[len..];
303 }
304 else if rest.starts_with("r`") || rest.starts_with("R`") || rest.starts_with("A`")
306 {
307 if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
308 let end = rest[2..].find("`r").expect(&format!("Unterminated regex on line {linenum}"));
309 let len = end + 4;
310 current.push(rest[..len].to_string());
311 rest = &rest[len..];
312 }
313 else if rest.starts_with("::=")
315 {
316 if found_separator { return Err(format!("Unexpected ::= on line {linenum}")); }
317 if name.is_none() { return Err(format!("missing name on line {linenum}")); }
318 found_separator = true;
319 rest = &rest[3..];
320 }
321 else if rest.starts_with("(") || rest.starts_with(")") || rest.starts_with(",")
323 {
324 current.push(rest[0..1].to_string());
325 rest = &rest[1..];
326 }
327 else if rest.starts_with("|")
328 {
329 if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
330 metalist.push(current);
331 current = vec!();
332 rest = &rest[1..];
333 }
334 else
336 {
337 let mut end = rest.len();
338 for (i, ch) in rest.char_indices()
339 {
340 if ch.is_whitespace() || ch == '|' || ch == '(' || ch == ')' || ch == ',' || ch == '"' || ch == '#'
341 || rest[i..].starts_with("::=") || rest[i..].starts_with("r`")
342 {
343 end = i;
344 break;
345 }
346 }
347 if name.is_none()
348 {
349 name = Some(rest[..end].to_string());
350 }
351 else
352 {
353 if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
354 current.push(rest[..end].to_string());
355 }
356 rest = &rest[end..];
357 }
358 }
359 }
360
361 if name.is_some()
362 {
363 metalist.push(current);
364 rules.push((name.unwrap(), metalist));
365 }
366
367 Ok(rules)
368}
369
/// Converts the raw output of `bnf_parse` into a `Grammar`: assigns rule
/// indices, interns strings, compiles regexes with pooled result caches,
/// and peels off the special `__*` configuration pseudo-rules.
pub (crate) fn grammar_convert(input: &Vec<(String, Vec<Vec<String>>)>) -> Result<Grammar, String>
{
    // Pass 1: map every real (non-`__*`) rule name to its index.
    let mut by_name = HashMap::default();
    for (name, _) in input.iter()
    {
        if matches!(&**name, "__BRACKET_PAIRS" | "__COMMENT_PAIRS" | "__COMMENT_PAIRS_NESTED" | "__COMMENT_REGEXES" | "__COMMENTS" | "__RESERVED_WORDS") { continue; }
        if by_name.insert(name.clone(), by_name.len()).is_some()
        {
            return Err(format!("Duplicate rule {name}; use alternations (e.g. x ::= a | b), not additional definitions (like x ::= a [...] x ::= b)"));
        }
    }

    let mut string_cache = HashMap::default();
    let mut string_cache_inv = Vec::new();
    let mut points = Vec::new();
    let mut literals = HashSet::default();
    let mut lex_regexes = HashMap::default();

    let mut bracket_pairs = Vec::new();
    let mut comment_pairs = Vec::new();
    let mut comment_pairs_nested = Vec::new();
    let mut comment_regexes = Vec::new();
    let mut comments = Vec::new();

    // Shared regex-result caches, keyed by pattern string, so identical
    // patterns appearing in different rules share one cache.
    let mut cache_pool = HashMap::<String, _>::default();

    let mut reserved = None;
    // Pass 2: build each rule (or consume the `__*` pseudo-rules).
    for (name, raw_forms) in input.iter()
    {
        if name == "__RESERVED_WORDS"
        {
            let mut set = Vec::new();
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                set.push(s.clone());
            }
            reserved = Some(build_literal_regex(&set, true));
            continue;
        }
        if name == "__BRACKET_PAIRS" || name == "__COMMENT_PAIRS" || name == "__COMMENT_PAIRS_NESTED"
        {
            // Each alternation must be a two-item (left, right) delimiter pair.
            for raw_alt in raw_forms
            {
                match (raw_alt.get(0), raw_alt.get(1))
                {
                    (Some(l), Some(r)) =>
                    {
                        if name == "__BRACKET_PAIRS" { bracket_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS" { comment_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS_NESTED" { comment_pairs_nested.push((l.clone(), r.clone())); }
                    }
                    _ => Err(format!("Alternations of __BRACKET_PAIRS must all contain two bare string items"))?
                }
            }
            continue;
        }
        if name == "__COMMENT_REGEXES" || name == "__COMMENTS"
        {
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                if name == "__COMMENT_REGEXES" && s.starts_with("r`") && s.ends_with("`r") && s.len() >= 4
                {
                    // Strip the r`...`r delimiters and anchor at the start.
                    let pattern = &s[2..s.len() - 2];
                    let pattern = format!("\\A{pattern}");
                    let re = new_regex(&pattern).map_err(|e| format!("Invalid regex '{}': {}", pattern, e))?;
                    comment_regexes.push(re);
                }
                if name == "__COMMENTS" && s.starts_with("\"") && s.ends_with("\"") && s.len() >= 3
                {
                    // Strip the surrounding quotes.
                    let pattern = &s[1..s.len() - 1];
                    comments.push(pattern.to_string());
                }
            }
            continue;
        }
        let index = *by_name.get(name).unwrap();

        let mut forms = Vec::new();
        let mut recover = None;

        for raw_alt in raw_forms
        {
            let mut matching_terms = Vec::new();
            let mut pruned = false;

            let mut i = 0;
            while i < raw_alt.len()
            {
                let term_str = &raw_alt[i];
                i += 1;

                // "..." literal terminal (with \" \\ \n escapes).
                if term_str.starts_with('"') && term_str.ends_with('"') && term_str.len() >= 2
                {
                    let literal = term_str[1..term_str.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                    matching_terms.push(MatchingTermE::TermLit(string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal)).to());

                    literals.insert(literal.clone());
                    continue;
                }
                // r`...`r regex terminal: the \A-only form is also registered
                // for the lexer pass; the \A..\z form matches whole tokens.
                if term_str.starts_with("r`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z");
                    let pattern = format!("\\A(?:{pattern})");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    let re2 = RegexCacher::new_with_pool(re2, &mut cache_pool);
                    lex_regexes.insert(pattern, re2.clone());
                    matching_terms.push(MatchingTermE::TermRegex(re2).to());
                    continue;
                }
                // R`...`r: whole-token regex only (no lexer registration).
                if term_str.starts_with("R`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // A`...`r: prefix-anchored regex (no \z).
                if term_str.starts_with("A`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // @recover / @recover_before <regex>: error-recovery anchor.
                if matches!(&**term_str, "@RECOVER" | "@recover" | "@RECOVER_BEFORE" | "@recover_before") && i < raw_alt.len()
                {
                    let pattern = &raw_alt[i];
                    if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                        || !pattern.ends_with("`r")
                    {
                        Err(format!("@recover guards only accept regex strings"))?
                    }
                    let no_z = pattern.starts_with("A`");
                    let pattern = &pattern[2..pattern.len() - 2];
                    let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                    if no_z { pattern_all = format!("\\A(?:{})", pattern); }
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    if recover.is_some() { Err(format!("Rule {name} has multiple @recover items. Only one is supported."))? }
                    // The bool records which spelling was used (@recover vs @recover_before).
                    recover = Some((RegexCacher::new_with_pool(re2, &mut cache_pool), matches!(&**term_str, "@RECOVER" | "@recover")));
                    i += 1;
                    continue;
                }
                if matches!(&**term_str, "@EOF" | "@eof")
                {
                    matching_terms.push(MatchingTermE::Eof.to());
                    continue;
                }
                // @auto: placeholder, rewritten below once the next term is known.
                if matches!(&**term_str, "@AUTO" | "@auto")
                {
                    matching_terms.push(MatchingTermE::_AutoTemp.to());
                    continue;
                }
                // @peek/@peekr/@peekres(num, str-or-regex): lookahead guards,
                // spelled as five raw terms: ( num , arg ).
                if (term_str == "@PEEK" || term_str == "@peek"
                    || term_str == "@PEEKR" || term_str == "@peekr"
                    || term_str == "@PEEKRES" || term_str == "@peekres") && i + 4 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != "," || raw_alt[i+4] != ")" { Err(format!("Invalid peek syntax: must be @peek(num, str)"))? }
                    let n = raw_alt[i+1].parse::<isize>().map_err(|_| format!("Not a supported peek distance: {}", raw_alt[i+1]))?;
                    if term_str == "@PEEK" || term_str == "@peek"
                    {
                        let literal = &raw_alt[i+3];
                        if literal.len() < 2 || !literal.starts_with("\"") || !literal.ends_with("\"")
                        {
                            return Err(format!("@peek guards only accept plain strings"));
                        }
                        let literal = literal[1..literal.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                        let s = string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal);
                        matching_terms.push(MatchingTermE::Peek(n, s).to());
                    }
                    else
                    {
                        let pattern = &raw_alt[i+3];
                        if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                            || !pattern.ends_with("`r")
                        {
                            Err(format!("@recover guards only accept regex strings"))?
                        }
                        let no_z = pattern.starts_with("A`");
                        let pattern = &pattern[2..pattern.len() - 2];
                        let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                        if no_z { pattern_all = format!("\\A(?:{})", pattern); }

                        let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                        if term_str == "@PEEKRES" || term_str == "@peekres"
                        {
                            matching_terms.push(MatchingTermE::PeekRes(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                        else
                        {
                            matching_terms.push(MatchingTermE::PeekR(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                    }
                    // Consume `( num , arg )`.
                    i += 5;
                    continue;
                }
                // @guard(name): three raw terms consumed: ( name ).
                if (term_str == "@GUARD" || term_str == "@guard") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid guard syntax: must be @guard(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Guard(s).to());
                    i += 3;
                    continue;
                }
                // !hook(name): three raw terms consumed: ( name ).
                if (term_str == "!HOOK" || term_str == "!hook") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid hook syntax: must be @hook(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Hook(s).to());
                    i += 3;
                    continue;
                }
                // $-directives translate 1:1 to MatchDirective variants.
                if matches!(&**term_str, "$BECOME" | "$become")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Become).to());
                    continue;
                }
                if matches!(&**term_str, "$BECOME_AS" | "$become_as")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::BecomeAs).to());
                    continue;
                }
                if matches!(&**term_str, "$HOIST" | "$hoist")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Hoist).to());
                    continue;
                }
                if matches!(&**term_str, "$HOIST_UNIT" | "$hoist_unit")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::HoistIfUnit).to());
                    continue;
                }
                if matches!(&**term_str, "$DROP" | "$drop")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Drop).to());
                    continue;
                }
                if matches!(&**term_str, "$DROP_EMPTY" | "$drop_empty")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::DropIfEmpty).to());
                    continue;
                }
                if matches!(&**term_str, "$RENAME" | "$rename")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Rename).to());
                    continue;
                }
                if matches!(&**term_str, "$ANY" | "$any")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Any).to());
                    continue;
                }
                if matches!(&**term_str, "$PRUNED" | "$pruned")
                {
                    pruned = true;
                    continue;
                }
                // Anything else must be a reference to another rule.
                let id = by_name.get(term_str).ok_or_else(|| format!("Not a defined grammar rule: '{term_str}' (context: '{name}')"))?;
                matching_terms.push(MatchingTermE::Rule(*id).to());
            }
            if matching_terms.len() > 60000
            {
                Err(format!("More than 60k items in an alternation of {name}. Factor them out, dummy!"))?
            }
            // Rewrite a leading @auto into a peek guard derived from the
            // terminal that follows it.
            if let Some(MatchingTermE::_AutoTemp) = matching_terms.get(0).map(|x| &x.t)
            {
                match matching_terms.get(1).map(|x| &x.t)
                {
                    Some(MatchingTermE::TermLit(s)) =>
                    {
                        matching_terms[0] = MatchingTermE::Peek(0, *s).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    Some(MatchingTermE::TermRegex(r)) =>
                    {
                        let r = r.clone();
                        matching_terms[0] = MatchingTermE::PeekR(0, r).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    _ => Err(format!("@auto must be followed by a string literal or regex literal (context: {name})"))?
                }
            }
            forms.push(Alternation { matching_terms, pruned });
        }
        if forms.len() > 60000
        {
            Err(format!("More than 60k alternations in {name}. Factor them out, dummy!"))?
        }
        // Warn when an alternation follows one that does not begin with a
        // guard term: that earlier form always gets first shot.
        let mut num_nonguards = 0;
        for f in &forms
        {
            if num_nonguards != 0
            {
                eprintln!("!!!!!!\n!!!!!! Warning: rule {name} has at least one alternation that is inaccessible!\n!!!!!!");
                break;
            }
            if !(if let Some(x) = f.matching_terms.get(0) && matches!(x.t, MatchingTermE::Peek(_, _) | MatchingTermE::PeekR(_, _) | MatchingTermE::Guard(_) | MatchingTermE::Eof) { true } else { false })
            { num_nonguards += 1; }
        }

        if index > 4000000000
        {
            Err(format!("More than 4 billion grammar terms in grammar. What are you doing??? STOP!!!!! (╯°□°)╯︵ ┻━┻"))?
        }
        // Shadowing: `name` becomes the interned Rc<String> from here on.
        let (name, name_id) = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &name);
        points.push(GrammarPoint
        {
            name,
            name_id,
            forms,
            recover,
        });
    }

    // Deterministic literal order for the lexer.
    let mut literals = literals.into_iter().collect::<Vec<_>>();
    literals.sort();

    // Compile the lexer-side (\A-only) patterns, pairing each with the
    // cacher built from its whole-token counterpart.
    let mut regexes = Vec::new();
    for (r, r2) in lex_regexes
    {
        regexes.push((new_regex(&r).map_err(|e| format!("Invalid regex '{}': {}", r, e))?, r2));
    }
    Ok(Grammar { points, by_name, literals, regexes, string_cache, string_cache_inv, bracket_pairs, comments, comment_pairs, comment_regexes, reserved, comment_pairs_nested })
}
710
711pub fn bnf_to_grammar(s : &str) -> Result<Grammar, String>
715{
716 grammar_convert(&bnf_parse(s)?)
717}
718
/// A lexed token: an interned-string id plus bracket-pairing info.
#[derive(Debug, Clone, Default)]
pub struct Token {
    // Interned id of the token's text (index into `Grammar::string_cache_inv`).
    pub text : u32,
    // Relative offset to the matching bracket token; 0 when unpaired.
    pub pair : isize,
}
729
730pub (crate) fn build_literal_regex(literals : &Vec<String>, terminated : bool) -> Regex
732{
733 let mut text_token_regex_s = "\\A(?:".to_string();
734
735 let mut lits = literals.clone();
736 lits.sort_by(|a, b| b.len().cmp(&a.len()));
737 for text in lits.iter()
738 {
739 let s2 = regex::escape(&text);
740 text_token_regex_s += &s2;
741 text_token_regex_s += "|";
742 }
743 if lits.len() > 0 { text_token_regex_s.pop(); }
744 text_token_regex_s += ")";
745 if terminated { text_token_regex_s += "\\z"; }
746 let text_token_regex = new_regex(&text_token_regex_s).unwrap();
747 text_token_regex
748}
749
/// Error produced by `tokenize`.
#[derive(Clone, Debug)]
pub struct TokError {
    // Human-readable description of the failure.
    #[allow(unused)] pub err_message : String,
    // How many tokens had already been produced when the failure occurred.
    #[allow(unused)] pub produced : usize,
    // Byte offset into the input where tokenization failed.
    #[allow(unused)] pub location : usize,
    // For delimiter-pairing failures: the offending delimiter text plus a
    // flag (only ever constructed as `false` in this file — TODO confirm
    // the bool's intended meaning).
    #[allow(unused)] pub pairing_error : Option<(String, bool)>,
}
773
/// Tokenizes `s` against grammar `g`: skips whitespace and comments, applies
/// the grammar's lexer regexes and literal table (longest match wins),
/// interns each token's text, and links bracket pairs by relative offset.
///
/// Errors carry the byte offset, the number of tokens already produced, and
/// (for delimiter problems) which delimiter failed to pair.
pub fn tokenize(
    g : &mut Grammar,
    mut s : &str
) -> Result<Vec<Token>, TokError>
{
    let s_orig = s;
    let mut tokens = Vec::<Token>::new();
    tokens.reserve(s.len()/16 + 1);

    // Literals already fully matched by some lexer regex are dropped from
    // the combined literal regex — the regex pass will find them anyway.
    let mut lit_filtered = Vec::new();
    for l in g.literals.iter()
    {
        let mut covered = false;
        for r in &g.regexes
        {
            if let Some(loc) = regex_find(&r.0, l).map(|x| x.end() - x.start())
            {
                if loc == l.len()
                {
                    covered = true;
                    break;
                }
            }
        }
        if !covered
        {
            lit_filtered.push(l.clone());
        }
    }
    let all_literals_regex = if lit_filtered.len() > 0
    {
        Some(build_literal_regex(&lit_filtered, false))
    }
    else
    {
        None
    };
    // Pre-intern every literal and rule name so later lookups are cache hits.
    for text in g.literals.iter()
    {
        string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, text);
    }
    for point in g.points.iter()
    {
        string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &point.name);
    }

    // Bracket-pairing state: opener ids, closer -> opener map, and a stack
    // of opener token indices per opener id.
    let mut openers = HashSet::default();
    let mut closers = HashMap::default();
    let mut stacks = HashMap::default();
    let mut any_paired = false;
    for (l, r) in &g.bracket_pairs
    {
        let lsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &l);
        let rsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &r);
        openers.insert(lsc);
        closers.insert(rsc, lsc);
        stacks.insert(lsc, Vec::<usize>::new());
        any_paired = true;
    }

    'top: while !s.is_empty()
    {
        // Skip runs of ASCII whitespace (all single-byte, hence &s[1..]).
        if get_char_at_byte(s, 0) as u32 <= 0x20
        {
            if matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
            {
                while !s.is_empty() && matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
                {
                    s = &s[1..];
                }
                if s.is_empty() { break; }
                continue 'top;
            }
        }

        // Regex-defined comments (patterns were \A-anchored at build time).
        for re in &g.comment_regexes
        {
            if let Some(x) = regex_find(re, s)
            {
                s = &s[x.end() - x.start()..];
                continue 'top;
            }
        }
        // Line comments: run to end of line; a `\` skips the next character.
        for c in &g.comments
        {
            if s.starts_with(c)
            {
                s = &s[c.len()..];
                while s.len() > 0 && get_char_at_byte(s, 0) != '\n'
                {
                    if get_char_at_byte(s, 0) == '\\'
                    {
                        s = &s[get_char_at_byte(s, 0).len_utf8()..];
                    }
                    if s.len() > 0
                    {
                        s = &s[get_char_at_byte(s, 0).len_utf8()..];
                    }
                }
                continue 'top;
            }
        }
        // Nestable block comments: count opener/closer depth.
        for (l, r) in &g.comment_pairs_nested
        {
            if s.starts_with(l)
            {
                s = &s[l.len()..];
                let mut nest = 1;
                while s.len() > 0 && nest > 0
                {
                    if s.starts_with(l) { nest += 1; }
                    s = &s[get_char_at_byte(s, 0).len_utf8()..];
                    if s.starts_with(r) { nest -= 1; }
                }
                if s.starts_with(r) { s = &s[r.len()..]; }
                continue 'top;
            }
        }
        // Non-nesting block comments.
        for (l, r) in &g.comment_pairs
        {
            if s.starts_with(l)
            {
                s = &s[l.len()..];
                while s.len() > 0 && !s.starts_with(r)
                {
                    s = &s[get_char_at_byte(s, 0).len_utf8()..];
                }
                // NOTE(review): if the closing delimiter never appears, `s`
                // is empty here and this slice would panic for a non-empty
                // `r` — confirm unterminated block comments can't reach this.
                s = &s[r.len()..];
                continue 'top;
            }
        }
        // Longest match across all lexer regexes and the literal table.
        let mut longest = 0;
        for r in &g.regexes
        {
            if let Some(loc) = regex_find(&r.0, s)
            {
                let len = loc.end() - loc.start();
                longest = longest.max(len);
            }
        }
        if let Some(all_literals_regex) = &all_literals_regex && let Some(loc) = regex_find(&all_literals_regex, s)
        {
            let len = loc.end() - loc.start();
            longest = longest.max(len);
        }
        if longest == 0
        {
            // Nothing matched: report the next few characters for context.
            let sn : String = s.chars().take(5).collect();
            return Err(TokError {
                err_message: format!("Failed to tokenize at index {}:{}[...]", s_orig.len()-s.len(), sn),
                produced : tokens.len(),
                location : s_orig.len() - s.len(),
                pairing_error : None,
            });
        }

        let text = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
        let mut token = Token { text, pair : 0 };

        if any_paired
        {
            // Openers push their token index (note: `s` is shadowed by the
            // stack binding inside this branch).
            if openers.contains(&text) && let Some(s) = stacks.get_mut(&text)
            {
                s.push(tokens.len());
            }
            // Closers pop their opener's stack and link the two tokens by
            // signed relative offset.
            if let Some(l) = closers.get(&text) && let Some(stack) = stacks.get_mut(l)
            {
                let n = match stack.pop() {
                    Some(n) => n,
                    None => Err(TokError {
                        err_message: format!("Unmatched delimiter at {}: {}", s_orig.len() - s.len(), g.string_cache_inv.get(text as usize).unwrap()),
                        produced : tokens.len(),
                        location : s_orig.len() - s.len(),
                        pairing_error : Some((s[..longest].to_string(), false)),
                    })?,
                };
                let me = tokens.len();
                let diff = match me.checked_signed_diff(n) {
                    Some(n) => n,
                    None => Err(TokError {
                        err_message: format!("Input too long"),
                        produced : tokens.len(),
                        location : s_orig.len() - s.len(),
                        pairing_error : None,
                    })?,
                };
                token.pair = -diff;
                tokens[n].pair = diff;
            }
        }

        s = &s[longest..];
        tokens.push(token);
    }
    Ok(tokens)
}