// Crate-wide hash containers use the project's custom hasher (crate::HashBuilder)
// instead of the default SipHash state.
type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;
type HashSet<K> = std::collections::HashSet::<K, crate::HashBuilder>;
use std::rc::Rc;
use std::cell::RefCell;
use regex::Regex as Regex;
/// Returns the character whose UTF-8 encoding starts at byte offset `i` of `s`.
///
/// Panics if `i` is out of bounds or not on a character boundary — the same
/// panics slice indexing would produce.
pub (crate) fn get_char_at_byte(s : &str, i : usize) -> char
{
    let (_, tail) = s.split_at(i);
    tail.chars().next().unwrap()
}
/// Compiles `s` into a `Regex`; kept as the single construction point for
/// regexes in this module.
pub fn new_regex(s : &str) -> Result<Regex, regex::Error>
{
    Regex::new(s)
}
23
/// Returns the first (leftmost) match of `r` in `s`, if any.
pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::Match<'a>>
{
    r.find(s)
}
32pub (crate) fn regex_is_match<'a>(r : &Regex, s : &'a str) -> bool
33{
34 r.is_match(s)
37}
38
39
/// Hash-map key wrapping an `Rc`; `Hash` goes through the pointee, and the
/// `PartialEq` impl short-circuits when both keys share the same allocation.
#[derive(Debug, Clone, Hash)]
pub (crate) struct K<T> {
    pub (crate) k : Rc<T>
}
44
45impl<T : PartialEq> PartialEq for K<T> {
46 fn eq(&self, other: &Self) -> bool {
47 if Rc::as_ptr(&self.k) == Rc::as_ptr(&other.k) { return true; }
48 self.k == other.k
49 }
50}
51impl<T : Eq> Eq for K<T> { }
52
/// A compiled regex plus memoized `is_match` results. Clones share the same
/// caches (both are behind `Rc<RefCell<..>>`).
#[derive(Debug, Clone)]
pub struct RegexCacher {
    r : Regex,
    // Results keyed by the tested string itself (Rc-shared key).
    cache : Rc<RefCell<HashMap<K<String>, bool>>>,
    // Results keyed by interned string id.
    cache2 : Rc<RefCell<HashMap<u32, bool>>>,
}
59
60impl RegexCacher {
61 #[allow(unused)]
65 pub fn new(r : Regex) -> RegexCacher
66 {
67 let cache = Rc::new(RefCell::new(HashMap::default()));
68 let cache2 = Rc::new(RefCell::new(HashMap::default()));
69 RegexCacher { r, cache, cache2 }
70 }
71 pub (crate) fn new_with_pool(r : Regex, cache_pool : &mut HashMap<String, (Rc<RefCell<HashMap<K<String>, bool>>>, Rc<RefCell<HashMap<u32, bool>>>)>) -> RegexCacher
72 {
73 let s = r.as_str().to_string();
74 let mut cache = Rc::new(RefCell::new(HashMap::default()));
75 let mut cache2 = Rc::new(RefCell::new(HashMap::default()));
76 if let Some(cached) = cache_pool.get(&s)
77 {
78 cache = Rc::clone(&cached.0);
79 cache2 = Rc::clone(&cached.1);
80 }
81 else
82 {
83 cache_pool.insert(s.clone(), (cache.clone(), cache2.clone()));
84 }
85
86 RegexCacher { r, cache, cache2 }
87 }
88 pub fn is_match(&self, s : &Rc<String>) -> bool
90 {
91 let mut cache = self.cache.borrow_mut();
92 let k = K { k : Rc::clone(s) };
93 if let Some(result) = cache.get(&k) { return *result; }
94 let ret = regex_is_match(&self.r, &*s);
95 cache.insert(k, ret);
96 ret
97 }
98 pub fn is_match_interned(&self, i : u32, string_cache_inv : &Vec<Rc<String>>) -> bool
100 {
101 let mut cache = self.cache2.borrow_mut();
102 if let Some(result) = cache.get(&i) { return *result; }
103 let s = &string_cache_inv[i as usize];
104 let ret = regex_is_match(&self.r, &*s);
105 cache.insert(i, ret);
106 ret
107 }
108}
109
/// A compiled grammar: rules, lexer data, the string interner, and the
/// bracket/comment configuration extracted from the `__*` pseudo-rules.
#[derive(Default)]
pub struct Grammar {
    // All rules, indexed by the ids stored in `by_name`.
    pub points: Vec<GrammarPoint>,
    // Rule name -> index into `points`.
    pub by_name: HashMap<String, usize>,

    // Every distinct decoded string literal used by any rule, sorted.
    pub (crate) literals: Vec<String>,
    // Lexer regexes, each paired with its shared-result cacher.
    pub (crate) regexes: Vec<(Regex, RegexCacher)>,

    // String interner: text -> id ...
    pub string_cache : HashMap<String, u32>,
    // ... and id -> shared text.
    pub string_cache_inv : Vec<Rc<String>>,

    // (open, close) bracket tokens from __BRACKET_PAIRS.
    pub (crate) bracket_pairs : Vec<(String, String)>,
    // Line-comment openers from __COMMENTS.
    pub (crate) comments : Vec<String>,
    // Non-nesting block comments from __COMMENT_PAIRS.
    pub (crate) comment_pairs : Vec<(String, String)>,
    // Nestable block comments from __COMMENT_PAIRS_NESTED.
    pub (crate) comment_pairs_nested : Vec<(String, String)>,
    // \A-anchored comment regexes from __COMMENT_REGEXES.
    pub (crate) comment_regexes : Vec<Regex>,
    // Combined whole-token regex of __RESERVED_WORDS, if declared.
    pub (crate) reserved : Option<Regex>,
}
135
/// One grammar rule: a named list of alternations.
#[derive(Debug, Clone)]
pub struct GrammarPoint {
    // Interned rule name ...
    pub name: Rc<String>,
    // ... and its string-cache id.
    pub name_id: u32,
    // The rule's alternations, in declaration order.
    pub forms: Vec<Alternation>,
    // Optional @recover regex; the bool is true for @RECOVER/@recover
    // (as opposed to @RECOVER_BEFORE/@recover_before).
    pub (crate) recover: Option<(RegexCacher, bool)>,
}
149
/// A single `|`-separated branch of a rule.
#[derive(Debug, Clone)]
pub struct Alternation {
    // The branch's terms, in match order.
    pub matching_terms: Vec<MatchingTerm>,
    // Set when the branch carries the $PRUNED/$pruned directive.
    pub (crate) pruned: bool,
}
160
/// Tree-shaping directives produced by the `$`-prefixed grammar keywords
/// ($ANY, $BECOME, $HOIST, $DROP, $RENAME, ...).
#[derive(Debug, Clone)]
pub(crate) enum MatchDirective {
    Any, Become, BecomeAs, Hoist, HoistIfUnit, Drop, DropIfEmpty, Rename,
}
165
/// Public wrapper around the crate-internal term enum.
#[derive(Debug, Clone)]
pub struct MatchingTerm { pub(crate) t : MatchingTermE }
169
/// One term of an alternation, as produced by `grammar_convert`.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub(crate) enum MatchingTermE {
    // Reference to another rule, by index into Grammar::points.
    Rule(usize),
    // Literal token ("..." in the grammar), by interned string id.
    TermLit(u32),
    // Regex token (r`..`r / R`..`r / A`..`r in the grammar).
    TermRegex(RegexCacher),
    // $-directive.
    Directive(MatchDirective),
    // !hook(name) callback, by interned name.
    Hook(Rc<String>),
    // Placeholder for @auto; rewritten into Peek/PeekR during conversion.
    _AutoTemp,

    // @eof.
    Eof,
    // @peek(n, "lit"): literal lookahead at relative offset n.
    Peek(isize, u32),
    // @peekr(n, regex): regex lookahead.
    PeekR(isize, RegexCacher),
    // @peekres(n, regex): regex lookahead variant — semantics decided by the
    // parser, not visible in this file.
    PeekRes(isize, RegexCacher),
    // @guard(name), by interned name.
    Guard(Rc<String>),
}
/// Wraps a raw term enum value into the public `MatchingTerm` struct.
impl MatchingTermE { pub(crate) fn to(self) -> MatchingTerm { MatchingTerm { t : self } } }
187
188pub fn string_cache_lookup(
190 string_cache : &mut HashMap<String, u32>,
191 string_cache_inv : &mut Vec<Rc<String>>,
192 s : &str) -> (Rc<String>, u32)
193{
194 if let Some(sn) = string_cache.get(s)
195 {
196 return (Rc::clone(&string_cache_inv[*sn as usize]), *sn);
197 }
198 let rc = Rc::new(s.to_string());
199 let n = string_cache_inv.len().try_into().unwrap();
200 string_cache.insert(s.to_string(), n);
201 string_cache_inv.push(Rc::clone(&rc));
202 (rc, n)
203}
204pub fn string_cache_lookup_id(
206 string_cache : &mut HashMap<String, u32>,
207 string_cache_inv : &mut Vec<Rc<String>>,
208 s : &str) -> u32
209{
210 if let Some(sn) = string_cache.get(s)
211 {
212 return *sn;
213 }
214 let rc = Rc::new(s.to_string());
215 let n = string_cache_inv.len().try_into().unwrap();
216 string_cache.insert(s.to_string(), n);
217 string_cache_inv.push(rc);
218 n
219}
220
221pub (crate) fn bnf_parse(input: &str) -> Result<Vec<(String, Vec<Vec<String>>)>, String>
222{
223 let mut rules = Vec::new();
224
225 let mut metalist = Vec::new();
226 let mut current = Vec::new();
227
228 let mut name : Option<String> = None;
229 let mut found_separator = false;
230
231 let lines = input.lines().map(|x| x.to_string()).collect::<Vec<_>>();
232 let mut lines2 : Vec<String> = vec!();
233 for l in lines
234 {
235 if let Some(l0) = lines2.last_mut()
236 {
237 if l0.ends_with("\\")
238 {
239 *l0 = format!("{}{l}", &l0[..l0.len()-1]);
240 continue;
241 }
242 }
243
244 lines2.push(l);
245 }
246 for (mut linenum, rest) in lines2.iter().enumerate()
247 {
248 let mut rest : &str = rest;
249 linenum += 1; let _split = rest.trim().split_whitespace().collect::<Vec<_>>();
252
253 if _split.get(1) == Some(&"::=") {
255 if name.is_some()
256 {
257 metalist.push(current);
258 rules.push((name.unwrap(), metalist));
259 }
260
261 name = None;
262 found_separator = false;
263 metalist = Vec::new();
264 current = Vec::new();
265 }
266
267 while !rest.is_empty()
268 {
269 if get_char_at_byte(rest, 0).is_whitespace()
271 {
272 rest = rest.trim_start();
273 }
274 else if rest.starts_with("#")
276 {
277 break;
278 }
279 else if rest.starts_with("\"")
281 {
282 if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
283 let mut len = 1;
284 let mut in_escape = false;
285 let mut found_exit = false;
286 while len < rest.len()
287 {
288 let c = get_char_at_byte(rest, len);
289 len += c.len_utf8();
290 if !in_escape && c == '\\'
291 {
292 in_escape = true;
293 continue;
294 }
295 if !in_escape && c == '"' { found_exit = true; break; }
296 in_escape = false;
297 }
298 if !found_exit || len == 2 { return Err(format!("Broken literal text rule on line {linenum}")); }
299 current.push(rest[..len].to_string());
300 rest = &rest[len..];
301 }
302 else if rest.starts_with("r`") || rest.starts_with("R`") || rest.starts_with("A`")
304 {
305 if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
306 let end = rest[2..].find("`r").expect(&format!("Unterminated regex on line {linenum}"));
307 let len = end + 4;
308 current.push(rest[..len].to_string());
309 rest = &rest[len..];
310 }
311 else if rest.starts_with("::=")
313 {
314 if found_separator { return Err(format!("Unexpected ::= on line {linenum}")); }
315 if name.is_none() { return Err(format!("missing name on line {linenum}")); }
316 found_separator = true;
317 rest = &rest[3..];
318 }
319 else if rest.starts_with("(") || rest.starts_with(")") || rest.starts_with(",")
321 {
322 current.push(rest[0..1].to_string());
323 rest = &rest[1..];
324 }
325 else if rest.starts_with("|")
326 {
327 if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
328 metalist.push(current);
329 current = vec!();
330 rest = &rest[1..];
331 }
332 else
334 {
335 let mut end = rest.len();
336 for (i, ch) in rest.char_indices()
337 {
338 if ch.is_whitespace() || ch == '|' || ch == '(' || ch == ')' || ch == ',' || ch == '"' || ch == '#'
339 || rest[i..].starts_with("::=") || rest[i..].starts_with("r`")
340 {
341 end = i;
342 break;
343 }
344 }
345 if name.is_none()
346 {
347 name = Some(rest[..end].to_string());
348 }
349 else
350 {
351 if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
352 current.push(rest[..end].to_string());
353 }
354 rest = &rest[end..];
355 }
356 }
357 }
358
359 if name.is_some()
360 {
361 metalist.push(current);
362 rules.push((name.unwrap(), metalist));
363 }
364
365 Ok(rules)
366}
367
/// Converts `bnf_parse` output into a `Grammar`: assigns dense rule indices,
/// interns names and literals, compiles regex terms (with pooled, shared
/// result caches), and peels off the special `__BRACKET_PAIRS` /
/// `__COMMENT*` / `__RESERVED_WORDS` pseudo-rules into dedicated fields.
pub (crate) fn grammar_convert(input: &Vec<(String, Vec<Vec<String>>)>) -> Result<Grammar, String>
{
    // First pass: assign a dense index to every real (non-pseudo) rule name.
    let mut by_name = HashMap::default();
    for (name, _) in input.iter()
    {
        if matches!(&**name, "__BRACKET_PAIRS" | "__COMMENT_PAIRS" | "__COMMENT_PAIRS_NESTED" | "__COMMENT_REGEXES" | "__COMMENTS" | "__RESERVED_WORDS") { continue; }
        if by_name.insert(name.clone(), by_name.len()).is_some()
        {
            return Err(format!("Duplicate rule {name}; use alternations (e.g. x ::= a | b), not additional definitions (like x ::= a [...] x ::= b)"));
        }
    }

    let mut string_cache = HashMap::default();
    let mut string_cache_inv = Vec::new();
    let mut points = Vec::new();
    let mut literals = HashSet::default();
    // pattern text -> RegexCacher; becomes Grammar::regexes at the end.
    let mut lex_regexes = HashMap::default();

    let mut bracket_pairs = Vec::new();
    let mut comment_pairs = Vec::new();
    let mut comment_pairs_nested = Vec::new();
    let mut comment_regexes = Vec::new();
    let mut comments = Vec::new();

    // Shared match-result caches, keyed by pattern text (see RegexCacher::new_with_pool).
    let mut cache_pool = HashMap::<String, _>::default();

    let mut reserved = None;
    for (name, raw_forms) in input.iter()
    {
        // __RESERVED_WORDS: all items combine into one whole-token literal regex.
        if name == "__RESERVED_WORDS"
        {
            let mut set = Vec::new();
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                set.push(s.clone());
            }
            reserved = Some(build_literal_regex(&set, true));
            continue;
        }
        // Pair pseudo-rules: each alternation must be an (open, close) pair.
        if name == "__BRACKET_PAIRS" || name == "__COMMENT_PAIRS" || name == "__COMMENT_PAIRS_NESTED"
        {
            for raw_alt in raw_forms
            {
                match (raw_alt.get(0), raw_alt.get(1))
                {
                    (Some(l), Some(r)) =>
                    {
                        if name == "__BRACKET_PAIRS" { bracket_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS" { comment_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS_NESTED" { comment_pairs_nested.push((l.clone(), r.clone())); }
                    }
                    _ => Err(format!("Alternations of __BRACKET_PAIRS must all contain two bare string items"))?
                }
            }
            continue;
        }
        if name == "__COMMENT_REGEXES" || name == "__COMMENTS"
        {
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                // Comment regexes get anchored at the current position with \A.
                if name == "__COMMENT_REGEXES" && s.starts_with("r`") && s.ends_with("`r") && s.len() >= 4
                {
                    let pattern = &s[2..s.len() - 2];
                    let pattern = format!("\\A{pattern}");
                    let re = new_regex(&pattern).map_err(|e| format!("Invalid regex '{}': {}", pattern, e))?;
                    comment_regexes.push(re);
                }
                if name == "__COMMENTS" && s.starts_with("\"") && s.ends_with("\"") && s.len() >= 3
                {
                    let pattern = &s[1..s.len() - 1];
                    comments.push(pattern.to_string());
                }
            }
            continue;
        }
        // Ordinary rule: translate each raw term string into a MatchingTerm.
        let index = *by_name.get(name).unwrap();

        let mut forms = Vec::new();
        let mut recover = None;

        for raw_alt in raw_forms
        {
            let mut matching_terms = Vec::new();
            let mut pruned = false;

            let mut i = 0;
            while i < raw_alt.len()
            {
                let term_str = &raw_alt[i];
                i += 1;

                // "literal" -> interned TermLit (escape sequences decoded).
                if term_str.starts_with('"') && term_str.ends_with('"') && term_str.len() >= 2
                {
                    let literal = term_str[1..term_str.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                    matching_terms.push(MatchingTermE::TermLit(string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal)).to());

                    literals.insert(literal.clone());
                    continue;
                }
                // r`...`r: whole-token regex that is ALSO registered as a lexer regex.
                if term_str.starts_with("r`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z"); let pattern = format!("\\A(?:{pattern})"); let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    let re2 = RegexCacher::new_with_pool(re2, &mut cache_pool);
                    lex_regexes.insert(pattern, re2.clone());
                    matching_terms.push(MatchingTermE::TermRegex(re2).to());
                    continue;
                }
                // R`...`r: whole-token regex (\A..\z), not added to the lexer.
                if term_str.starts_with("R`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z"); let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // A`...`r: prefix-anchored regex (\A only, no \z).
                if term_str.starts_with("A`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // @recover <regex>: per-rule error recovery; consumes the next term.
                if matches!(&**term_str, "@RECOVER" | "@recover" | "@RECOVER_BEFORE" | "@recover_before") && i < raw_alt.len()
                {
                    let pattern = &raw_alt[i];
                    if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                        || !pattern.ends_with("`r")
                    {
                        Err(format!("@recover guards only accept regex strings"))?
                    }
                    // A`...` keeps only the \A anchor; r`/R` get \A..\z.
                    let no_z = pattern.starts_with("A`");
                    let pattern = &pattern[2..pattern.len() - 2];
                    let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                    if no_z { pattern_all = format!("\\A(?:{})", pattern); }
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    if recover.is_some() { Err(format!("Rule {name} has multiple @recover items. Only one is supported."))? }
                    // Second tuple element: true for @RECOVER, false for @RECOVER_BEFORE.
                    recover = Some((RegexCacher::new_with_pool(re2, &mut cache_pool), matches!(&**term_str, "@RECOVER" | "@recover")));
                    i += 1;
                    continue;
                }
                if matches!(&**term_str, "@EOF" | "@eof")
                {
                    matching_terms.push(MatchingTermE::Eof.to());
                    continue;
                }
                if matches!(&**term_str, "@AUTO" | "@auto")
                {
                    // Placeholder; rewritten below once the following term is known.
                    matching_terms.push(MatchingTermE::_AutoTemp.to());
                    continue;
                }
                // @peek/@peekr/@peekres: consumes the 5-token tail `( num , term )`.
                if (term_str == "@PEEK" || term_str == "@peek"
                    || term_str == "@PEEKR" || term_str == "@peekr"
                    || term_str == "@PEEKRES" || term_str == "@peekres") && i + 4 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != "," || raw_alt[i+4] != ")" { Err(format!("Invalid peek syntax: must be @peek(num, str)"))? }
                    let n = raw_alt[i+1].parse::<isize>().map_err(|_| format!("Not a supported peek distance: {}", raw_alt[i+1]))?;
                    if term_str == "@PEEK" || term_str == "@peek"
                    {
                        let literal = &raw_alt[i+3];
                        if literal.len() < 2 || !literal.starts_with("\"") || !literal.ends_with("\"")
                        {
                            return Err(format!("@peek guards only accept plain strings"));
                        }
                        let literal = literal[1..literal.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                        let s = string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal);
                        matching_terms.push(MatchingTermE::Peek(n, s).to());
                    }
                    else
                    {
                        let pattern = &raw_alt[i+3];
                        if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                            || !pattern.ends_with("`r")
                        {
                            Err(format!("@recover guards only accept regex strings"))?
                        }
                        let no_z = pattern.starts_with("A`");
                        let pattern = &pattern[2..pattern.len() - 2];
                        let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                        if no_z { pattern_all = format!("\\A(?:{})", pattern); }

                        let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                        if term_str == "@PEEKRES" || term_str == "@peekres"
                        {
                            matching_terms.push(MatchingTermE::PeekRes(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                        else
                        {
                            matching_terms.push(MatchingTermE::PeekR(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                    }
                    i += 5;
                    continue;
                }
                // @guard(name): consumes the 3-token tail `( name )`.
                if (term_str == "@GUARD" || term_str == "@guard") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid guard syntax: must be @guard(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Guard(s).to());
                    i += 3;
                    continue;
                }
                // !hook(name): consumes the 3-token tail `( name )`.
                if (term_str == "!HOOK" || term_str == "!hook") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid hook syntax: must be @hook(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Hook(s).to());
                    i += 3;
                    continue;
                }
                // $-directives map 1:1 onto MatchDirective variants.
                if matches!(&**term_str, "$BECOME" | "$become")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Become).to());
                    continue;
                }
                if matches!(&**term_str, "$BECOME_AS" | "$become_as")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::BecomeAs).to());
                    continue;
                }
                if matches!(&**term_str, "$HOIST" | "$hoist")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Hoist).to());
                    continue;
                }
                if matches!(&**term_str, "$HOIST_UNIT" | "$hoist_unit")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::HoistIfUnit).to());
                    continue;
                }
                if matches!(&**term_str, "$DROP" | "$drop")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Drop).to());
                    continue;
                }
                if matches!(&**term_str, "$DROP_EMPTY" | "$drop_empty")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::DropIfEmpty).to());
                    continue;
                }
                if matches!(&**term_str, "$RENAME" | "$rename")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Rename).to());
                    continue;
                }
                if matches!(&**term_str, "$ANY" | "$any")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Any).to());
                    continue;
                }
                if matches!(&**term_str, "$PRUNED" | "$pruned")
                {
                    pruned = true;
                    continue;
                }
                // Anything else must be a reference to another rule.
                let id = by_name.get(term_str).ok_or_else(|| format!("Not a defined grammar rule: '{term_str}' (context: '{name}')"))?;
                matching_terms.push(MatchingTermE::Rule(*id).to());
            }
            if matching_terms.len() > 60000
            {
                Err(format!("More than 60k items in an alternation of {name}. Factor them out, dummy!"))?
            }
            // Rewrite a leading @auto into a peek(0) of the following term.
            if let Some(MatchingTermE::_AutoTemp) = matching_terms.get(0).map(|x| &x.t)
            {
                match matching_terms.get(1).map(|x| &x.t)
                {
                    Some(MatchingTermE::TermLit(s)) =>
                    {
                        matching_terms[0] = MatchingTermE::Peek(0, *s).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    Some(MatchingTermE::TermRegex(r)) =>
                    {
                        let r = r.clone();
                        matching_terms[0] = MatchingTermE::PeekR(0, r).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    _ => Err(format!("@auto must be followed by a string literal or regex literal (context: {name})"))?
                }
            }
            forms.push(Alternation { matching_terms, pruned });
        }
        if forms.len() > 60000
        {
            Err(format!("More than 60k alternations in {name}. Factor them out, dummy!"))?
        }
        // Warn when an alternation can never be reached: once a form that does
        // NOT start with a guard/peek has been seen, any later form is dead.
        let mut num_nonguards = 0;
        for f in &forms
        {
            if num_nonguards != 0
            {
                eprintln!("!!!!!!\n!!!!!! Warning: rule {name} has at least one alternation that is inaccessible!\n!!!!!!");
                break;
            }
            // NOTE(review): PeekRes is not counted as a guard here, unlike
            // Peek/PeekR/Guard/Eof — confirm that is intentional.
            if !(if let Some(x) = f.matching_terms.get(0) && matches!(x.t, MatchingTermE::Peek(_, _) | MatchingTermE::PeekR(_, _) | MatchingTermE::Guard(_) | MatchingTermE::Eof) { true } else { false })
            { num_nonguards += 1; }
        }

        if index > 4000000000
        {
            Err(format!("More than 4 billion grammar terms in grammar. What are you doing??? STOP!!!!! (╯°□°)╯︵ ┻━┻"))?
        }
        let (name, name_id) = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &name);
        points.push(GrammarPoint
        {
            name,
            name_id,
            forms,
            recover,
        });
    }

    // Deterministic literal ordering for the Grammar.
    let mut literals = literals.into_iter().collect::<Vec<_>>();
    literals.sort();

    // Materialize the lexer regex list; result cachers remain pool-shared.
    let mut regexes = Vec::new();
    for (r, r2) in lex_regexes
    {
        regexes.push((new_regex(&r).map_err(|e| format!("Invalid regex '{}': {}", r, e))?, r2));
    }
    Ok(Grammar { points, by_name, literals, regexes, string_cache, string_cache_inv, bracket_pairs, comments, comment_pairs, comment_regexes, reserved, comment_pairs_nested })
}
708
709pub fn bnf_to_grammar(s : &str) -> Result<Grammar, String>
713{
714 grammar_convert(&bnf_parse(s)?)
715}
716
/// One lexed token.
#[derive(Debug, Clone, Default)]
pub struct Token {
    // Interned id of the token's text (index into Grammar::string_cache_inv).
    pub text : u32,
    // Bracket pairing as a relative token offset: positive on the opener
    // (distance to its closer), negative on the closer; 0 when unpaired.
    pub pair : isize,
}
727
728pub (crate) fn build_literal_regex(literals : &Vec<String>, terminated : bool) -> Regex
730{
731 let mut text_token_regex_s = "\\A(?:".to_string();
732
733 let mut lits = literals.clone();
734 lits.sort_by(|a, b| b.len().cmp(&a.len()));
735 for text in lits.iter()
736 {
737 let s2 = regex::escape(&text);
738 text_token_regex_s += &s2;
739 text_token_regex_s += "|";
740 }
741 if lits.len() > 0 { text_token_regex_s.pop(); }
742 text_token_regex_s += ")";
743 if terminated { text_token_regex_s += "\\z"; }
744 let text_token_regex = new_regex(&text_token_regex_s).unwrap();
745 text_token_regex
746}
747
/// Error payload produced by `tokenize`.
#[derive(Clone, Debug)]
pub struct TokError {
    // Human-readable description of the failure.
    #[allow(unused)] pub err_message : String,
    // Number of tokens successfully produced before the failure.
    #[allow(unused)] pub produced : usize,
    // Byte offset into the original input where tokenization failed.
    #[allow(unused)] pub location : usize,
    // Set for delimiter-pairing failures: the offending token's text plus a
    // flag that is always false in this file — presumably distinguishes
    // open/close at the consumer; TODO confirm.
    #[allow(unused)] pub pairing_error : Option<(String, bool)>,
}
771
772pub fn tokenize(
778 g : &mut Grammar,
779 mut s : &str
780) -> Result<Vec<Token>, TokError>
781{
782 let s_orig = s;
783 let mut tokens = Vec::<Token>::new();
784 tokens.reserve(s.len()/16 + 1);
785
786 let mut lit_filtered = Vec::new();
787 for l in g.literals.iter()
788 {
789 let mut covered = false;
790 for r in &g.regexes
791 {
792 if let Some(loc) = regex_find(&r.0, l).map(|x| x.end() - x.start())
793 {
794 if loc == l.len()
795 {
796 covered = true;
797 break;
798 }
799 }
800 }
801 if !covered
802 {
803 lit_filtered.push(l.clone());
804 }
805 }
806 let all_literals_regex = if lit_filtered.len() > 0
807 {
808 Some(build_literal_regex(&lit_filtered, false))
809 }
810 else
811 {
812 None
813 };
814 for text in g.literals.iter()
817 {
818 string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, text);
819 }
820 for point in g.points.iter()
821 {
822 string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &point.name);
823 }
824
825 let mut openers = HashSet::default();
826 let mut closers = HashMap::default();
827 let mut stacks = HashMap::default();
828 let mut any_paired = false;
829 for (l, r) in &g.bracket_pairs
830 {
831 let lsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &l);
832 let rsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &r);
833 openers.insert(lsc);
834 closers.insert(rsc, lsc);
835 stacks.insert(lsc, Vec::<usize>::new());
836 any_paired = true;
837 }
838
839 'top: while !s.is_empty()
848 {
849 if get_char_at_byte(s, 0) as u32 <= 0x20
850 {
851 if matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
852 {
853 while !s.is_empty() && matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
854 {
855 s = &s[1..]; }
857 if s.is_empty() { break; }
858 continue 'top;
859 }
860 }
861
862 for re in &g.comment_regexes
864 {
865 if let Some(x) = regex_find(re, s)
866 {
867 s = &s[x.end() - x.start()..];
869 continue 'top;
870 }
871 }
872 for c in &g.comments
874 {
875 if s.starts_with(c)
876 {
877 s = &s[c.len()..];
878 while s.len() > 0 && get_char_at_byte(s, 0) != '\n'
879 {
880 if get_char_at_byte(s, 0) == '\\'
881 {
882 s = &s[get_char_at_byte(s, 0).len_utf8()..]; }
884 if s.len() > 0
885 {
886 s = &s[get_char_at_byte(s, 0).len_utf8()..];
887 }
888 }
889 continue 'top;
890 }
891 }
892 for (l, r) in &g.comment_pairs_nested
894 {
895 if s.starts_with(l)
896 {
897 s = &s[l.len()..];
898 let mut nest = 1;
899 while s.len() > 0 && nest > 0
900 {
901 if s.starts_with(l) { nest += 1; }
902 s = &s[get_char_at_byte(s, 0).len_utf8()..];
903 if s.starts_with(r) { nest -= 1; }
904 }
905 if s.starts_with(r) { s = &s[r.len()..]; }
906 continue 'top;
907 }
908 }
909 for (l, r) in &g.comment_pairs
911 {
912 if s.starts_with(l)
913 {
914 s = &s[l.len()..];
915 while s.len() > 0 && !s.starts_with(r)
916 {
917 s = &s[get_char_at_byte(s, 0).len_utf8()..];
918 }
919 s = &s[r.len()..];
920 continue 'top;
921 }
922 }
923 let mut longest = 0;
925 for r in &g.regexes
927 {
928 if let Some(loc) = regex_find(&r.0, s)
929 {
930 let len = loc.end() - loc.start();
931 longest = longest.max(len);
933 }
934 }
935 if let Some(all_literals_regex) = &all_literals_regex && let Some(loc) = regex_find(&all_literals_regex, s)
937 {
938 let len = loc.end() - loc.start();
939 longest = longest.max(len);
941 }
942 if longest == 0
943 {
944 let sn : String = s.chars().take(5).collect();
945 return Err(TokError {
946 err_message: format!("Failed to tokenize at index {}:{}[...]", s_orig.len()-s.len(), sn),
947 produced : tokens.len(),
948 location : s_orig.len() - s.len(),
949 pairing_error : None,
950 });
951 }
952
953 let text = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
956 let mut token = Token { text, pair : 0 };
957
958 if any_paired
967 {
968 if openers.contains(&text) && let Some(s) = stacks.get_mut(&text)
969 {
970 s.push(tokens.len());
971 }
972 if let Some(l) = closers.get(&text) && let Some(stack) = stacks.get_mut(l)
973 {
974 let n = match stack.pop() {
975 Some(n) => n,
976 None => Err(TokError {
977 err_message: format!("Unmatched delimiter at {}: {}", s_orig.len() - s.len(), g.string_cache_inv.get(text as usize).unwrap()),
978 produced : tokens.len(),
979 location : s_orig.len() - s.len(),
980 pairing_error : Some((s[..longest].to_string(), false)),
981 })?,
982 };
983 let me = tokens.len();
986 let diff = match me.checked_signed_diff(n) {
987 Some(n) => n,
988 None => Err(TokError {
989 err_message: format!("Input too long"),
990 produced : tokens.len(),
991 location : s_orig.len() - s.len(),
992 pairing_error : None,
993 })?,
994 };
995 token.pair = -diff;
996 tokens[n].pair = diff;
997 }
998 }
999
1000 s = &s[longest..];
1001 tokens.push(token);
1002 }
1003 Ok(tokens)
1004}