1type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;
2type HashSet<K> = std::collections::HashSet::<K, crate::HashBuilder>;
3use std::rc::Rc;
4use std::cell::RefCell;
5use regex::Regex as Regex;
/// Return the character whose UTF-8 encoding begins at byte offset `i` of `s`.
/// Panics if `i` is out of bounds or does not land on a character boundary.
pub (crate) fn get_char_at_byte(s : &str, i : usize) -> char
{
    let mut tail = s[i..].chars();
    tail.next().unwrap()
}
15pub (crate) fn new_regex(s : &str) -> Result<Regex, regex::Error>
16{
18 Regex::new(s)
19 }
22
23pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::Match<'a>>
24{
27 r.find(s)
30}
31pub (crate) fn regex_is_match<'a>(r : &Regex, s : &'a str) -> bool
32{
33 r.is_match(s)
36}
37
38
/// Hashable cache-key wrapper around `Rc<T>`. Its manual `PartialEq` (below)
/// short-circuits on pointer identity before comparing the pointed-to values.
#[derive(Debug, Clone, Hash)]
pub (crate) struct K<T> {
    /// The shared value this key wraps.
    pub (crate) k : Rc<T>
}
43
44impl<T : PartialEq> PartialEq for K<T> {
45 fn eq(&self, other: &Self) -> bool {
46 if Rc::as_ptr(&self.k) == Rc::as_ptr(&other.k) { return true; }
47 self.k == other.k
48 }
49}
50impl<T : Eq> Eq for K<T> { }
51
/// A compiled `Regex` bundled with shared memoization tables for match
/// results, keyed either by string value (`cache`) or by interned string id
/// (`cache2`). Clones share the same tables via `Rc`.
#[derive(Debug, Clone)]
pub struct RegexCacher {
    /// The compiled pattern.
    r : Regex,
    /// Memoized `is_match` results keyed by string (see `K`'s fast path).
    cache : Rc<RefCell<HashMap<K<String>, bool>>>,
    /// Memoized `is_match` results keyed by interned string id.
    cache2 : Rc<RefCell<HashMap<u32, bool>>>,
}
58
59impl RegexCacher {
60 #[allow(unused)]
64 pub fn new(r : Regex) -> RegexCacher
65 {
66 let cache = Rc::new(RefCell::new(HashMap::default()));
67 let cache2 = Rc::new(RefCell::new(HashMap::default()));
68 RegexCacher { r, cache, cache2 }
69 }
70 pub (crate) fn new_with_pool(r : Regex, cache_pool : &mut HashMap<String, (Rc<RefCell<HashMap<K<String>, bool>>>, Rc<RefCell<HashMap<u32, bool>>>)>) -> RegexCacher
71 {
72 let s = r.as_str().to_string();
73 let mut cache = Rc::new(RefCell::new(HashMap::default()));
74 let mut cache2 = Rc::new(RefCell::new(HashMap::default()));
75 if let Some(cached) = cache_pool.get(&s)
76 {
77 cache = Rc::clone(&cached.0);
78 cache2 = Rc::clone(&cached.1);
79 }
80 else
81 {
82 cache_pool.insert(s.clone(), (cache.clone(), cache2.clone()));
83 }
84
85 RegexCacher { r, cache, cache2 }
86 }
87 pub fn is_match(&self, s : &Rc<String>) -> bool
89 {
90 let mut cache = self.cache.borrow_mut();
91 let k = K { k : Rc::clone(s) };
92 if let Some(result) = cache.get(&k) { return *result; }
93 let ret = regex_is_match(&self.r, &*s);
94 cache.insert(k, ret);
95 ret
96 }
97 pub fn is_match_interned(&self, i : u32, string_cache_inv : &Vec<Rc<String>>) -> bool
99 {
100 let mut cache = self.cache2.borrow_mut();
101 if let Some(result) = cache.get(&i) { return *result; }
102 let s = &string_cache_inv[i as usize];
103 let ret = regex_is_match(&self.r, &*s);
104 cache.insert(i, ret);
105 ret
106 }
107}
108
/// A fully converted grammar: rules, lexer inputs, interning tables, and the
/// bracket/comment metadata collected from the `__*` pseudo-rules.
#[derive(Default)]
pub struct Grammar {
    /// All grammar rules; `by_name` maps into this vector.
    pub points: Vec<GrammarPoint>,
    /// Rule name -> index into `points`.
    pub by_name: HashMap<String, usize>,

    /// Sorted, deduplicated literal token texts used by the grammar.
    pub (crate) literals: Vec<String>,
    /// Each entry pairs a prefix-anchored (`\A`) lexer regex with the cached
    /// full-match (`\A...\z`) checker built from the same r`...`r pattern.
    pub (crate) regexes: Vec<(Regex, RegexCacher)>,

    /// String interning table: text -> id.
    pub string_cache : HashMap<String, u32>,
    /// Inverse interning table: id -> shared text.
    pub string_cache_inv : Vec<Rc<String>>,

    /// (opener, closer) pairs from `__BRACKET_PAIRS`.
    pub (crate) bracket_pairs : Vec<(String, String)>,
    /// Line-comment openers from `__COMMENTS` (quotes stripped).
    pub (crate) comments : Vec<String>,
    /// (opener, closer) block comments from `__COMMENT_PAIRS`.
    pub (crate) comment_pairs : Vec<(String, String)>,
    /// (opener, closer) nestable block comments from `__COMMENT_PAIRS_NESTED`.
    pub (crate) comment_pairs_nested : Vec<(String, String)>,
    /// `\A`-anchored comment matchers from `__COMMENT_REGEXES`.
    pub (crate) comment_regexes : Vec<Regex>,
    /// Regex matching all `__RESERVED_WORDS`, if that pseudo-rule was given.
    pub (crate) reserved : Option<Regex>,
}
134
/// A single named grammar rule and its alternations.
#[derive(Debug, Clone)]
pub struct GrammarPoint {
    /// Rule name (interned).
    pub name: Rc<String>,
    /// Interned string id of `name`.
    pub name_id: u32,
    /// The `|`-separated alternations of this rule.
    pub forms: Vec<Alternation>,
    /// `@recover`/`@recover_before` regex, if given. The bool is true for
    /// `@recover` and false for `@recover_before`.
    pub (crate) recover: Option<(RegexCacher, bool)>,
}
148
/// One alternation (one `|`-separated branch) of a grammar rule.
#[derive(Debug, Clone)]
pub struct Alternation {
    /// The sequence of terms that must match, in order.
    pub matching_terms: Vec<MatchingTerm>,
    // Set by the `$PRUNED` directive; consumed elsewhere in the crate —
    // presumably marks this branch's output for pruning (TODO confirm).
    pub (crate) pruned: bool,
}
159
/// Tree-shaping directives that can appear inside an alternation.
#[derive(Debug, Clone)]
pub(crate) enum MatchDirective {
    // This module only constructs Any / Become / BecomeAs (from `$ANY`,
    // `$BECOME`, `$BECOME_AS` and the `@auto` rewrite); the remaining
    // variants are built/consumed elsewhere in the crate.
    Any, Become, BecomeAs, Hoist, Drop, DropIfEmpty, Rename,
}
164
/// Public newtype around the crate-private `MatchingTermE` enum.
#[derive(Debug, Clone)]
pub struct MatchingTerm { pub(crate) t : MatchingTermE }
168
/// One element of an alternation, after conversion from raw BNF tokens.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub(crate) enum MatchingTermE {
    /// Reference to another rule, by index into `Grammar::points`.
    Rule(usize),
    /// Literal token, by interned string id.
    TermLit(u32),
    /// Token matched by a (memoizing) regex.
    TermRegex(RegexCacher),
    /// Tree-shaping directive such as `$become` / `$any`.
    Directive(MatchDirective),
    /// Named hook (`!hook(name)`); the name is the interned string.
    Hook(Rc<String>),
    /// Placeholder for `@auto`; rewritten into a Peek/PeekR during conversion.
    _AutoTemp,

    /// Matches only at end of input (`@eof`).
    Eof,
    /// `@peek(offset, "lit")`: literal lookahead at a relative offset.
    Peek(isize, u32),
    /// `@peekr(offset, regex)`: regex lookahead at a relative offset.
    PeekR(isize, RegexCacher),
    // `@peekres(offset, regex)`: regex lookahead variant; how it differs
    // from PeekR is decided elsewhere in the crate (TODO confirm semantics).
    PeekRes(isize, RegexCacher),
    /// `@guard(name)`: named guard; the name is the interned string.
    Guard(Rc<String>),
}
185impl MatchingTermE { pub(crate) fn to(self) -> MatchingTerm { MatchingTerm { t : self } } }
186
/// Intern `s`: return its shared `Rc<String>` and stable numeric id, adding a
/// new entry to both tables if the string has not been seen before.
///
/// (Deliberately uses get-then-insert rather than the entry API so the hit
/// path performs no allocation.)
pub fn string_cache_lookup(
    string_cache : &mut HashMap<String, u32>,
    string_cache_inv : &mut Vec<Rc<String>>,
    s : &str) -> (Rc<String>, u32)
{
    // Fast path: already interned.
    if let Some(&id) = string_cache.get(s)
    {
        return (Rc::clone(&string_cache_inv[id as usize]), id);
    }
    // Slow path: allocate and register in both directions.
    let id : u32 = string_cache_inv.len().try_into().unwrap();
    let rc = Rc::new(s.to_string());
    string_cache.insert(s.to_string(), id);
    string_cache_inv.push(Rc::clone(&rc));
    (rc, id)
}
203pub fn string_cache_lookup_id(
205 string_cache : &mut HashMap<String, u32>,
206 string_cache_inv : &mut Vec<Rc<String>>,
207 s : &str) -> u32
208{
209 if let Some(sn) = string_cache.get(s)
210 {
211 return *sn;
212 }
213 let rc = Rc::new(s.to_string());
214 let n = string_cache_inv.len().try_into().unwrap();
215 string_cache.insert(s.to_string(), n);
216 string_cache_inv.push(rc);
217 n
218}
219
/// Parse BNF-like grammar text into a list of `(rule name, alternations)`,
/// where each alternation is a list of raw token strings (literals keep their
/// quotes, regexes keep their r`...`r delimiters; unescaping happens later in
/// `grammar_convert`).
///
/// Lines ending in `\` are joined with the following physical line before
/// parsing; `#` starts a comment running to end of line. Returns `Err` with a
/// human-readable message (including a 1-based line number) on malformed
/// input.
///
/// Fix vs. the original: an unterminated r`...`r regex used to panic via
/// `.expect(...)`; it now returns `Err` like every other syntax error.
pub (crate) fn bnf_parse(input: &str) -> Result<Vec<(String, Vec<Vec<String>>)>, String>
{
    let mut rules = Vec::new();

    // Alternations collected so far for the rule currently being parsed.
    let mut metalist = Vec::new();
    // Terms of the alternation currently being parsed.
    let mut current = Vec::new();

    let mut name : Option<String> = None;
    let mut found_separator = false;

    // Join physical lines ending in a backslash into single logical lines.
    let lines = input.lines().map(|x| x.to_string()).collect::<Vec<_>>();
    let mut lines2 : Vec<String> = vec!();
    for l in lines
    {
        if let Some(l0) = lines2.last_mut()
        {
            if l0.ends_with("\\")
            {
                *l0 = format!("{}{l}", &l0[..l0.len()-1]);
                continue;
            }
        }

        lines2.push(l);
    }
    for (mut linenum, rest) in lines2.iter().enumerate()
    {
        let mut rest : &str = rest;
        // 1-based for error messages. NOTE(review): counts logical (joined)
        // lines, so numbering can drift after `\` continuations.
        linenum += 1;
        let _split = rest.trim().split_whitespace().collect::<Vec<_>>();

        // A line shaped like `name ::= ...` starts a new rule: flush the
        // previous rule (if any) and reset the accumulators.
        if _split.get(1) == Some(&"::=") {
            if name.is_some()
            {
                metalist.push(current);
                rules.push((name.unwrap(), metalist));
            }

            name = None;
            found_separator = false;
            metalist = Vec::new();
            current = Vec::new();
        }

        while !rest.is_empty()
        {
            if rest.starts_with(char::is_whitespace)
            {
                rest = rest.trim_start();
            }
            // Comment: skip the rest of the line.
            else if rest.starts_with("#")
            {
                break;
            }
            // Quoted literal; backslash escapes the next character.
            else if rest.starts_with("\"")
            {
                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
                let mut len = 1;
                let mut in_escape = false;
                let mut found_exit = false;
                while len < rest.len()
                {
                    let c = rest[len..].chars().next().unwrap();
                    len += c.len_utf8();
                    if !in_escape && c == '\\'
                    {
                        in_escape = true;
                        continue;
                    }
                    if !in_escape && c == '"' { found_exit = true; break; }
                    in_escape = false;
                }
                // len == 2 is the empty literal `""`, which is rejected.
                if !found_exit || len == 2 { return Err(format!("Broken literal text rule on line {linenum}")); }
                current.push(rest[..len].to_string());
                rest = &rest[len..];
            }
            // Regex literal: r`...`r (also R`/A` variants).
            else if rest.starts_with("r`") || rest.starts_with("R`") || rest.starts_with("A`")
            {
                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
                // Was a panicking `.expect(...)`; report like other errors.
                let end = rest[2..].find("`r").ok_or_else(|| format!("Unterminated regex on line {linenum}"))?;
                let len = end + 4;
                current.push(rest[..len].to_string());
                rest = &rest[len..];
            }
            else if rest.starts_with("::=")
            {
                if found_separator { return Err(format!("Unexpected ::= on line {linenum}")); }
                if name.is_none() { return Err(format!("missing name on line {linenum}")); }
                found_separator = true;
                rest = &rest[3..];
            }
            // Punctuation tokens pass through as single-character terms.
            else if rest.starts_with("(") || rest.starts_with(")") || rest.starts_with(",")
            {
                current.push(rest[0..1].to_string());
                rest = &rest[1..];
            }
            // `|` closes the current alternation and starts the next one.
            else if rest.starts_with("|")
            {
                if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
                metalist.push(current);
                current = vec!();
                rest = &rest[1..];
            }
            // Bare token: the rule's name (before ::=) or a term.
            else
            {
                let mut end = rest.len();
                for (i, ch) in rest.char_indices()
                {
                    if ch.is_whitespace() || ch == '|' || ch == '(' || ch == ')' || ch == ',' || ch == '"' || ch == '#'
                        || rest[i..].starts_with("::=") || rest[i..].starts_with("r`")
                    {
                        end = i;
                        break;
                    }
                }
                if name.is_none()
                {
                    name = Some(rest[..end].to_string());
                }
                else
                {
                    if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
                    current.push(rest[..end].to_string());
                }
                rest = &rest[end..];
            }
        }
    }

    // Flush the final rule.
    if name.is_some()
    {
        metalist.push(current);
        rules.push((name.unwrap(), metalist));
    }

    Ok(rules)
}
366
/// Convert the raw output of `bnf_parse` into a `Grammar`: resolve rule names
/// to indices, compile regexes (with pooled match caches), intern strings,
/// and collect the special `__*` pseudo-rules into grammar metadata.
pub (crate) fn grammar_convert(input: &Vec<(String, Vec<Vec<String>>)>) -> Result<Grammar, String>
{
    // First pass: assign an index to every real (non-`__*`) rule name.
    let mut by_name = HashMap::default();
    for (name, _) in input.iter()
    {
        if matches!(&**name, "__BRACKET_PAIRS" | "__COMMENT_PAIRS" | "__COMMENT_PAIRS_NESTED" | "__COMMENT_REGEXES" | "__COMMENTS" | "__RESERVED_WORDS") { continue; }
        if by_name.insert(name.clone(), by_name.len()).is_some()
        {
            return Err(format!("Duplicate rule {name}; use alternations (e.g. x ::= a | b), not additional definitions (like x ::= a [...] x ::= b)"));
        }
    }

    let mut string_cache = HashMap::default();
    let mut string_cache_inv = Vec::new();
    let mut points = Vec::new();
    let mut literals = HashSet::default();
    // Prefix-anchored pattern text -> cached full-match regex; becomes
    // `Grammar::regexes` at the end.
    let mut lex_regexes = HashMap::default();

    let mut bracket_pairs = Vec::new();
    let mut comment_pairs = Vec::new();
    let mut comment_pairs_nested = Vec::new();
    let mut comment_regexes = Vec::new();
    let mut comments = Vec::new();

    // Shared memoization tables, keyed by pattern, for `RegexCacher`s.
    let mut cache_pool = HashMap::<String, _>::default();

    let mut reserved = None;
    for (name, raw_forms) in input.iter()
    {
        // `__RESERVED_WORDS`: one terminated literal-alternation regex.
        if name == "__RESERVED_WORDS"
        {
            let mut set = Vec::new();
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                set.push(s.clone());
            }
            reserved = Some(build_literal_regex(&set, true));
            continue;
        }
        // Pair-style pseudo-rules: each alternation gives (opener, closer).
        if name == "__BRACKET_PAIRS" || name == "__COMMENT_PAIRS" || name == "__COMMENT_PAIRS_NESTED"
        {
            for raw_alt in raw_forms
            {
                match (raw_alt.get(0), raw_alt.get(1))
                {
                    (Some(l), Some(r)) =>
                    {
                        if name == "__BRACKET_PAIRS" { bracket_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS" { comment_pairs.push((l.clone(), r.clone())); }
                        if name == "__COMMENT_PAIRS_NESTED" { comment_pairs_nested.push((l.clone(), r.clone())); }
                    }
                    _ => Err(format!("Alternations of __BRACKET_PAIRS must all contain two bare string items"))?
                }
            }
            continue;
        }
        // Comment openers: regexes (re-anchored with \A) or plain strings
        // (surrounding quotes stripped).
        if name == "__COMMENT_REGEXES" || name == "__COMMENTS"
        {
            for s in raw_forms.iter().map(|x| x.iter()).flatten()
            {
                if name == "__COMMENT_REGEXES" && s.starts_with("r`") && s.ends_with("`r") && s.len() >= 4
                {
                    let pattern = &s[2..s.len() - 2];
                    let pattern = format!("\\A{pattern}");
                    let re = new_regex(&pattern).map_err(|e| format!("Invalid regex '{}': {}", pattern, e))?;
                    comment_regexes.push(re);
                }
                if name == "__COMMENTS" && s.starts_with("\"") && s.ends_with("\"") && s.len() >= 3
                {
                    let pattern = &s[1..s.len() - 1];
                    comments.push(pattern.to_string());
                }
            }
            continue;
        }
        // A real grammar rule.
        let index = *by_name.get(name).unwrap();

        let mut forms = Vec::new();
        let mut recover = None;

        for raw_alt in raw_forms
        {
            let mut matching_terms = Vec::new();
            let mut pruned = false;

            let mut i = 0;
            while i < raw_alt.len()
            {
                let term_str = &raw_alt[i];
                i += 1;

                // "..." literal: unescape, intern, remember for the lexer.
                if term_str.starts_with('"') && term_str.ends_with('"') && term_str.len() >= 2
                {
                    let literal = term_str[1..term_str.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                    matching_terms.push(MatchingTermE::TermLit(string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal)).to());

                    literals.insert(literal.clone());
                    continue;
                }
                // r`...`r: full-match (\A...\z) cacher for parsing, plus a
                // prefix-anchored copy registered for the lexer.
                if term_str.starts_with("r`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z");
                    let pattern = format!("\\A(?:{pattern})");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    let re2 = RegexCacher::new_with_pool(re2, &mut cache_pool);
                    lex_regexes.insert(pattern, re2.clone());
                    matching_terms.push(MatchingTermE::TermRegex(re2).to());
                    continue;
                }
                // R`...`r: full-match regex, NOT registered with the lexer.
                if term_str.starts_with("R`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})\\z");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // A`...`r: prefix-anchored only (no \z), not lexer-registered.
                if term_str.starts_with("A`") && term_str.ends_with("`r") && term_str.len() >= 4
                {
                    let pattern = &term_str[2..term_str.len() - 2];
                    let pattern_all = format!("\\A(?:{pattern})");
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                    continue;
                }
                // @recover / @recover_before followed by a regex token.
                if matches!(&**term_str, "@RECOVER" | "@recover" | "@RECOVER_BEFORE" | "@recover_before") && i < raw_alt.len()
                {
                    let pattern = &raw_alt[i];
                    if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                        || !pattern.ends_with("`r")
                    {
                        Err(format!("@recover guards only accept regex strings"))?
                    }
                    let no_z = pattern.starts_with("A`");
                    let pattern = &pattern[2..pattern.len() - 2];
                    let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                    if no_z { pattern_all = format!("\\A(?:{})", pattern); }
                    let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                    if recover.is_some() { Err(format!("Rule {name} has multiple @recover items. Only one is supported."))? }
                    // The bool records the flavor: true for @recover,
                    // false for @recover_before.
                    recover = Some((RegexCacher::new_with_pool(re2, &mut cache_pool), matches!(&**term_str, "@RECOVER" | "@recover")));
                    i += 1;
                    continue;
                }
                if matches!(&**term_str, "@EOF" | "@eof")
                {
                    matching_terms.push(MatchingTermE::Eof.to());
                    continue;
                }
                // @auto: placeholder, rewritten below once the next term is known.
                if matches!(&**term_str, "@AUTO" | "@auto")
                {
                    matching_terms.push(MatchingTermE::_AutoTemp.to());
                    continue;
                }
                // @peek/@peekr/@peekres: five tokens follow: ( num , item ).
                if (term_str == "@PEEK" || term_str == "@peek"
                    || term_str == "@PEEKR" || term_str == "@peekr"
                    || term_str == "@PEEKRES" || term_str == "@peekres") && i + 4 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != "," || raw_alt[i+4] != ")" { Err(format!("Invalid peek syntax: must be @peek(num, str)"))? }
                    let n = raw_alt[i+1].parse::<isize>().map_err(|_| format!("Not a supported peek distance: {}", raw_alt[i+1]))?;
                    if term_str == "@PEEK" || term_str == "@peek"
                    {
                        let literal = &raw_alt[i+3];
                        if literal.len() < 2 || !literal.starts_with("\"") || !literal.ends_with("\"")
                        {
                            return Err(format!("@peek guards only accept plain strings"));
                        }
                        let literal = literal[1..literal.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
                        let s = string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal);
                        matching_terms.push(MatchingTermE::Peek(n, s).to());
                    }
                    else
                    {
                        let pattern = &raw_alt[i+3];
                        // NOTE(review): error message says @recover, but this
                        // is the @peekr/@peekres path.
                        if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
                            || !pattern.ends_with("`r")
                        {
                            Err(format!("@recover guards only accept regex strings"))?
                        }
                        let no_z = pattern.starts_with("A`");
                        let pattern = &pattern[2..pattern.len() - 2];
                        let mut pattern_all = format!("\\A(?:{})\\z", pattern);
                        if no_z { pattern_all = format!("\\A(?:{})", pattern); }

                        let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
                        if term_str == "@PEEKRES" || term_str == "@peekres"
                        {
                            matching_terms.push(MatchingTermE::PeekRes(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                        else
                        {
                            matching_terms.push(MatchingTermE::PeekR(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
                        }
                    }
                    i += 5;
                    continue;
                }
                // @guard(name): three tokens follow: ( name ).
                if (term_str == "@GUARD" || term_str == "@guard") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid guard syntax: must be @guard(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Guard(s).to());
                    i += 3;
                    continue;
                }
                // !hook(name). NOTE(review): the error message says @hook,
                // but the recognized spelling is !HOOK/!hook.
                if (term_str == "!HOOK" || term_str == "!hook") && i + 2 < raw_alt.len()
                {
                    if raw_alt[i] != "(" || raw_alt[i+2] != ")"
                    {
                        return Err(format!("Invalid hook syntax: must be @hook(name)"));
                    }
                    let literal = &raw_alt[i+1];
                    let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
                    matching_terms.push(MatchingTermE::Hook(s).to());
                    i += 3;
                    continue;
                }
                if matches!(&**term_str, "$BECOME" | "$become")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Become).to());
                    continue;
                }
                if matches!(&**term_str, "$BECOME_AS" | "$become_as")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::BecomeAs).to());
                    continue;
                }
                if matches!(&**term_str, "$ANY" | "$any")
                {
                    matching_terms.push(MatchingTermE::Directive(MatchDirective::Any).to());
                    continue;
                }
                if matches!(&**term_str, "$PRUNED" | "$pruned")
                {
                    pruned = true;
                    continue;
                }
                // Anything else must be a reference to another rule.
                let id = by_name.get(term_str).ok_or_else(|| format!("Not a defined grammar rule: '{term_str}' (context: '{name}')"))?;
                matching_terms.push(MatchingTermE::Rule(*id).to());
            }
            if matching_terms.len() > 60000
            {
                Err(format!("More than 60k items in an alternation of {name}. Factor them out, dummy!"))?
            }
            // Rewrite a leading @auto into a peek of the following term.
            if let Some(MatchingTermE::_AutoTemp) = matching_terms.get(0).map(|x| &x.t)
            {
                match matching_terms.get(1).map(|x| &x.t)
                {
                    Some(MatchingTermE::TermLit(s)) =>
                    {
                        matching_terms[0] = MatchingTermE::Peek(0, *s).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    Some(MatchingTermE::TermRegex(r)) =>
                    {
                        let r = r.clone();
                        matching_terms[0] = MatchingTermE::PeekR(0, r).to();
                        matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
                    }
                    _ => Err(format!("@auto must be followed by a string literal or regex literal (context: {name})"))?
                }
            }
            forms.push(Alternation { matching_terms, pruned });
        }
        if forms.len() > 60000
        {
            Err(format!("More than 60k alternations in {name}. Factor them out, dummy!"))?
        }
        // Warn when a form not starting with a lookahead/guard term precedes
        // further forms: those later forms can never be reached.
        let mut num_nonguards = 0;
        for f in &forms
        {
            if num_nonguards != 0
            {
                eprintln!("!!!!!!\n!!!!!! Warning: rule {name} has at least one alternation that is inaccessible!\n!!!!!!");
                break;
            }
            if !(if let Some(x) = f.matching_terms.get(0) && matches!(x.t, MatchingTermE::Peek(_, _) | MatchingTermE::PeekR(_, _) | MatchingTermE::Guard(_) | MatchingTermE::Eof) { true } else { false })
            { num_nonguards += 1; }
        }

        if index > 4000000000
        {
            Err(format!("More than 4 billion grammar terms in grammar. What are you doing??? STOP!!!!! (╯°□°)╯︵ ┻━┻"))?
        }
        let (name, name_id) = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &name);
        points.push(GrammarPoint
        {
            name,
            name_id,
            forms,
            recover,
        });
    }

    let mut literals = literals.into_iter().collect::<Vec<_>>();
    literals.sort();

    // Compile the lexer (prefix-anchored) variants of the r`...`r regexes,
    // pairing each with its cached full-match checker.
    let mut regexes = Vec::new();
    for (r, r2) in lex_regexes
    {
        regexes.push((new_regex(&r).map_err(|e| format!("Invalid regex '{}': {}", r, e))?, r2));
    }
    Ok(Grammar { points, by_name, literals, regexes, string_cache, string_cache_inv, bracket_pairs, comments, comment_pairs, comment_regexes, reserved, comment_pairs_nested })
}
682
683pub fn bnf_to_grammar(s : &str) -> Result<Grammar, String>
687{
688 grammar_convert(&bnf_parse(s)?)
689}
690
/// One lexed token: an interned text id plus bracket-pairing info.
#[derive(Debug, Clone, Default)]
pub struct Token {
    /// Interned string id; resolve via `Grammar::string_cache_inv`.
    pub text : u32,
    /// Relative index of the matching bracket token: positive on an opener,
    /// negative on a closer, 0 when not part of a bracket pair.
    pub pair : isize,
}
701
702pub (crate) fn build_literal_regex(literals : &Vec<String>, terminated : bool) -> Regex
704{
705 let mut text_token_regex_s = "\\A(?:".to_string();
706
707 let mut lits = literals.clone();
708 lits.sort_by(|a, b| b.len().cmp(&a.len()));
709 for text in lits.iter()
710 {
711 let s2 = regex::escape(&text);
712 text_token_regex_s += &s2;
713 text_token_regex_s += "|";
714 }
715 if lits.len() > 0 { text_token_regex_s.pop(); }
716 text_token_regex_s += ")";
717 if terminated { text_token_regex_s += "\\z"; }
718 let text_token_regex = new_regex(&text_token_regex_s).unwrap();
719 text_token_regex
720}
721
722pub fn tokenize(
728 g : &mut Grammar,
729 mut s : &str
730) -> Result<Vec<Token>, String>
731{
732 let s_orig = s;
733 let mut tokens = Vec::<Token>::new();
734 tokens.reserve(s.len()/16 + 1);
735
736 let mut lit_filtered = Vec::new();
737 for l in g.literals.iter()
738 {
739 let mut covered = false;
740 for r in &g.regexes
741 {
742 if let Some(loc) = regex_find(&r.0, l).map(|x| x.end() - x.start())
743 {
744 if loc == l.len()
745 {
746 covered = true;
747 break;
748 }
749 }
750 }
751 if !covered
752 {
753 lit_filtered.push(l.clone());
754 }
755 }
756 let all_literals_regex = if lit_filtered.len() > 0
757 {
758 Some(build_literal_regex(&lit_filtered, false))
759 }
760 else
761 {
762 None
763 };
764 for text in g.literals.iter()
767 {
768 string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, text);
769 }
770 for point in g.points.iter()
771 {
772 string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &point.name);
773 }
774
775 let mut openers = HashSet::default();
776 let mut closers = HashMap::default();
777 let mut stacks = HashMap::default();
778 let mut any_paired = false;
779 for (l, r) in &g.bracket_pairs
780 {
781 let lsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &l);
782 let rsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &r);
783 openers.insert(lsc);
784 closers.insert(rsc, lsc);
785 stacks.insert(lsc, Vec::<usize>::new());
786 any_paired = true;
787 }
788
789 'top: while !s.is_empty()
798 {
799 if get_char_at_byte(s, 0) as u32 <= 0x20
800 {
801 if matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
802 {
803 while !s.is_empty() && matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
804 {
805 s = &s[1..]; }
807 if s.is_empty() { break; }
808 continue 'top;
809 }
810 }
811
812 for re in &g.comment_regexes
814 {
815 if let Some(x) = regex_find(re, s)
816 {
817 s = &s[x.end() - x.start()..];
819 continue 'top;
820 }
821 }
822 for c in &g.comments
824 {
825 if s.starts_with(c)
826 {
827 s = &s[c.len()..];
828 while s.len() > 0 && get_char_at_byte(s, 0) != '\n'
829 {
830 if get_char_at_byte(s, 0) == '\\'
831 {
832 s = &s[get_char_at_byte(s, 0).len_utf8()..]; }
834 if s.len() > 0
835 {
836 s = &s[get_char_at_byte(s, 0).len_utf8()..];
837 }
838 }
839 continue 'top;
840 }
841 }
842 for (l, r) in &g.comment_pairs_nested
844 {
845 if s.starts_with(l)
846 {
847 s = &s[l.len()..];
848 let mut nest = 1;
849 while s.len() > 0 && nest > 0
850 {
851 if s.starts_with(l) { nest += 1; }
852 s = &s[get_char_at_byte(s, 0).len_utf8()..];
853 if s.starts_with(r) { nest -= 1; }
854 }
855 if s.starts_with(r) { s = &s[r.len()..]; }
856 continue 'top;
857 }
858 }
859 for (l, r) in &g.comment_pairs
861 {
862 if s.starts_with(l)
863 {
864 s = &s[l.len()..];
865 while s.len() > 0 && !s.starts_with(r)
866 {
867 s = &s[get_char_at_byte(s, 0).len_utf8()..];
868 }
869 s = &s[r.len()..];
870 continue 'top;
871 }
872 }
873 let mut longest = 0;
875 for r in &g.regexes
877 {
878 if let Some(loc) = regex_find(&r.0, s)
879 {
880 let len = loc.end() - loc.start();
881 longest = longest.max(len);
883 }
884 }
885 if let Some(all_literals_regex) = &all_literals_regex && let Some(loc) = regex_find(&all_literals_regex, s)
887 {
888 let len = loc.end() - loc.start();
889 longest = longest.max(len);
891 }
892 if longest == 0
893 {
894 let sn : String = s.chars().take(5).collect();
895 return Err(format!("Failed to tokenize at index {}:{}[...]", s_orig.len()-s.len(), sn));
896 }
897
898 let text = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
901 let mut token = Token { text, pair : 0 };
902
903 if any_paired
912 {
913 if openers.contains(&text) && let Some(s) = stacks.get_mut(&text)
914 {
915 s.push(tokens.len());
916 }
917 if let Some(l) = closers.get(&text) && let Some(s) = stacks.get_mut(l)
918 {
919 let n = s.pop().ok_or_else(|| format!("Unmatched delimiter at {}: {}", s_orig.len() - s.len(), g.string_cache_inv.get(text as usize).unwrap()))?;
920 let me = tokens.len();
921 let diff = me.checked_signed_diff(n).ok_or_else(|| format!("Input too long"))?;
922 token.pair = -diff;
923 tokens[n].pair = diff;
924 }
925 }
926
927 s = &s[longest..];
928 tokens.push(token);
929 }
930 Ok(tokens)
931}