type HashMap<K, V> = std::collections::HashMap::<K, V, crate::HashBuilder>;
type HashSet<K> = std::collections::HashSet::<K, crate::HashBuilder>;
use std::rc::Rc;
use std::cell::RefCell;
use regex::Regex as Regex;
pub (crate) fn get_char_at_byte(s : &str, i : usize) -> char
{
s[i..].chars().next().unwrap()
}
pub fn new_regex(s : &str) -> Result<Regex, regex::Error>
{
Regex::new(s)
}
pub (crate) fn regex_find<'a>(r : &Regex, s : &'a str) -> Option<regex::Match<'a>>
{
r.find(s)
}
pub (crate) fn regex_is_match<'a>(r : &Regex, s : &'a str) -> bool
{
r.is_match(s)
}
#[derive(Debug, Clone, Hash)]
pub (crate) struct K<T> {
pub (crate) k : Rc<T>
}
impl<T : PartialEq> PartialEq for K<T> {
fn eq(&self, other: &Self) -> bool {
if Rc::as_ptr(&self.k) == Rc::as_ptr(&other.k) { return true; }
self.k == other.k
}
}
impl<T : Eq> Eq for K<T> { }
#[derive(Debug, Clone)]
pub struct RegexCacher {
r : Regex,
cache : Rc<RefCell<HashMap<K<String>, bool>>>,
cache2 : Rc<RefCell<HashMap<u32, bool>>>,
}
impl RegexCacher {
#[allow(unused)]
pub fn new(r : Regex) -> RegexCacher
{
let cache = Rc::new(RefCell::new(HashMap::default()));
let cache2 = Rc::new(RefCell::new(HashMap::default()));
RegexCacher { r, cache, cache2 }
}
pub (crate) fn new_with_pool(r : Regex, cache_pool : &mut HashMap<String, (Rc<RefCell<HashMap<K<String>, bool>>>, Rc<RefCell<HashMap<u32, bool>>>)>) -> RegexCacher
{
let s = r.as_str().to_string();
let mut cache = Rc::new(RefCell::new(HashMap::default()));
let mut cache2 = Rc::new(RefCell::new(HashMap::default()));
if let Some(cached) = cache_pool.get(&s)
{
cache = Rc::clone(&cached.0);
cache2 = Rc::clone(&cached.1);
}
else
{
cache_pool.insert(s.clone(), (cache.clone(), cache2.clone()));
}
RegexCacher { r, cache, cache2 }
}
#[inline(never)]
pub fn is_match(&self, s : &Rc<String>) -> bool
{
let mut cache = self.cache.borrow_mut();
let k = K { k : Rc::clone(s) };
if let Some(result) = cache.get(&k) { return *result; }
let ret = regex_is_match(&self.r, &*s);
cache.insert(k, ret);
ret
}
#[inline(never)]
pub fn is_match_interned(&self, i : u32, string_cache_inv : &Vec<Rc<String>>) -> bool
{
let mut cache = self.cache2.borrow_mut();
if let Some(result) = cache.get(&i) { return *result; }
let s = &string_cache_inv[i as usize];
let ret = regex_is_match(&self.r, &*s);
cache.insert(i, ret);
ret
}
}
#[derive(Default)]
pub struct Grammar {
pub points: Vec<GrammarPoint>,
pub by_name: HashMap<String, usize>,
pub (crate) literals: Vec<String>,
pub (crate) regexes: Vec<(Regex, RegexCacher)>,
pub string_cache : HashMap<String, u32>,
pub string_cache_inv : Vec<Rc<String>>,
pub (crate) bracket_pairs : Vec<(String, String)>,
pub (crate) comments : Vec<String>,
pub (crate) comment_pairs : Vec<(String, String)>,
pub (crate) comment_pairs_nested : Vec<(String, String)>,
pub (crate) comment_regexes : Vec<Regex>,
pub (crate) reserved : Option<Regex>,
}
#[derive(Debug, Clone)]
pub struct GrammarPoint {
pub name: Rc<String>,
pub name_id: u32,
pub forms: Vec<Alternation>,
pub (crate) recover: Option<(RegexCacher, bool)>,
}
#[derive(Debug, Clone)]
pub struct Alternation {
pub matching_terms: Vec<MatchingTerm>,
pub (crate) pruned: bool,
}
#[derive(Debug, Clone)]
pub(crate) enum MatchDirective {
Any, Become, BecomeAs, Hoist, HoistIfUnit, Drop, DropIfEmpty, Rename,
}
#[derive(Debug, Clone)]
pub struct MatchingTerm { pub(crate) t : MatchingTermE }
#[derive(Debug, Clone)]
#[non_exhaustive]
pub(crate) enum MatchingTermE {
Rule(usize),
TermLit(u32),
TermRegex(RegexCacher),
Directive(MatchDirective),
Hook(Rc<String>),
_AutoTemp,
Eof,
Peek(isize, u32),
PeekR(isize, RegexCacher),
PeekRes(isize, RegexCacher),
Guard(Rc<String>),
}
impl MatchingTermE { pub(crate) fn to(self) -> MatchingTerm { MatchingTerm { t : self } } }
pub fn string_cache_lookup(
string_cache : &mut HashMap<String, u32>,
string_cache_inv : &mut Vec<Rc<String>>,
s : &str) -> (Rc<String>, u32)
{
if let Some(sn) = string_cache.get(s)
{
return (Rc::clone(&string_cache_inv[*sn as usize]), *sn);
}
let rc = Rc::new(s.to_string());
let n = string_cache_inv.len().try_into().unwrap();
string_cache.insert(s.to_string(), n);
string_cache_inv.push(Rc::clone(&rc));
(rc, n)
}
pub fn string_cache_lookup_id(
string_cache : &mut HashMap<String, u32>,
string_cache_inv : &mut Vec<Rc<String>>,
s : &str) -> u32
{
if let Some(sn) = string_cache.get(s)
{
return *sn;
}
let rc = Rc::new(s.to_string());
let n = string_cache_inv.len().try_into().unwrap();
string_cache.insert(s.to_string(), n);
string_cache_inv.push(rc);
n
}
pub (crate) fn bnf_parse(input: &str) -> Result<Vec<(String, Vec<Vec<String>>)>, String>
{
let mut rules = Vec::new();
let mut metalist = Vec::new();
let mut current = Vec::new();
let mut name : Option<String> = None;
let mut found_separator = false;
let lines = input.lines().map(|x| x.to_string()).collect::<Vec<_>>();
let mut lines2 : Vec<String> = vec!();
for l in lines
{
if let Some(l0) = lines2.last_mut()
{
if l0.ends_with("\\")
{
*l0 = format!("{}{l}", &l0[..l0.len()-1]);
continue;
}
}
lines2.push(l);
}
for (mut linenum, rest) in lines2.iter().enumerate()
{
let mut rest : &str = rest;
linenum += 1;
let _split = rest.trim().split_whitespace().collect::<Vec<_>>();
if _split.get(1) == Some(&"::=") {
if name.is_some()
{
metalist.push(current);
rules.push((name.unwrap(), metalist));
}
name = None;
found_separator = false;
metalist = Vec::new();
current = Vec::new();
}
while !rest.is_empty()
{
if get_char_at_byte(rest, 0).is_whitespace()
{
rest = rest.trim_start();
}
else if rest.starts_with("#")
{
break;
}
else if rest.starts_with("\"")
{
if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
let mut len = 1;
let mut in_escape = false;
let mut found_exit = false;
while len < rest.len()
{
let c = get_char_at_byte(rest, len);
len += c.len_utf8();
if !in_escape && c == '\\'
{
in_escape = true;
continue;
}
if !in_escape && c == '"' { found_exit = true; break; }
in_escape = false;
}
if !found_exit || len == 2 { return Err(format!("Broken literal text rule on line {linenum}")); }
current.push(rest[..len].to_string());
rest = &rest[len..];
}
else if rest.starts_with("r`") || rest.starts_with("R`") || rest.starts_with("A`")
{
if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
let end = rest[2..].find("`r").expect(&format!("Unterminated regex on line {linenum}"));
let len = end + 4;
current.push(rest[..len].to_string());
rest = &rest[len..];
}
else if rest.starts_with("::=")
{
if found_separator { return Err(format!("Unexpected ::= on line {linenum}")); }
if name.is_none() { return Err(format!("missing name on line {linenum}")); }
found_separator = true;
rest = &rest[3..];
}
else if rest.starts_with("(") || rest.starts_with(")") || rest.starts_with(",")
{
current.push(rest[0..1].to_string());
rest = &rest[1..];
}
else if rest.starts_with("|")
{
if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
metalist.push(current);
current = vec!();
rest = &rest[1..];
}
else
{
let mut end = rest.len();
for (i, ch) in rest.char_indices()
{
if ch.is_whitespace() || ch == '|' || ch == '(' || ch == ')' || ch == ',' || ch == '"' || ch == '#'
|| rest[i..].starts_with("::=") || rest[i..].starts_with("r`")
{
end = i;
break;
}
}
if name.is_none()
{
name = Some(rest[..end].to_string());
}
else
{
if !found_separator { return Err(format!("Missing ::= on line {linenum}")); }
current.push(rest[..end].to_string());
}
rest = &rest[end..];
}
}
}
if name.is_some()
{
metalist.push(current);
rules.push((name.unwrap(), metalist));
}
Ok(rules)
}
pub (crate) fn grammar_convert(input: &Vec<(String, Vec<Vec<String>>)>) -> Result<Grammar, String>
{
let mut by_name = HashMap::default();
for (name, _) in input.iter()
{
if matches!(&**name, "__BRACKET_PAIRS" | "__COMMENT_PAIRS" | "__COMMENT_PAIRS_NESTED" | "__COMMENT_REGEXES" | "__COMMENTS" | "__RESERVED_WORDS") { continue; }
if by_name.insert(name.clone(), by_name.len()).is_some()
{
return Err(format!("Duplicate rule {name}; use alternations (e.g. x ::= a | b), not additional definitions (like x ::= a [...] x ::= b)"));
}
}
let mut string_cache = HashMap::default();
let mut string_cache_inv = Vec::new();
let mut points = Vec::new();
let mut literals = HashSet::default();
let mut lex_regexes = HashMap::default();
let mut bracket_pairs = Vec::new();
let mut comment_pairs = Vec::new();
let mut comment_pairs_nested = Vec::new();
let mut comment_regexes = Vec::new();
let mut comments = Vec::new();
let mut cache_pool = HashMap::<String, _>::default();
let mut reserved = None;
for (name, raw_forms) in input.iter()
{
if name == "__RESERVED_WORDS"
{
let mut set = Vec::new();
for s in raw_forms.iter().map(|x| x.iter()).flatten()
{
set.push(s.clone());
}
reserved = Some(build_literal_regex(&set, true));
continue;
}
if name == "__BRACKET_PAIRS" || name == "__COMMENT_PAIRS" || name == "__COMMENT_PAIRS_NESTED"
{
for raw_alt in raw_forms
{
match (raw_alt.get(0), raw_alt.get(1))
{
(Some(l), Some(r)) =>
{
if name == "__BRACKET_PAIRS" { bracket_pairs.push((l.clone(), r.clone())); }
if name == "__COMMENT_PAIRS" { comment_pairs.push((l.clone(), r.clone())); }
if name == "__COMMENT_PAIRS_NESTED" { comment_pairs_nested.push((l.clone(), r.clone())); }
}
_ => Err(format!("Alternations of __BRACKET_PAIRS must all contain two bare string items"))?
}
}
continue;
}
if name == "__COMMENT_REGEXES" || name == "__COMMENTS"
{
for s in raw_forms.iter().map(|x| x.iter()).flatten()
{
if name == "__COMMENT_REGEXES" && s.starts_with("r`") && s.ends_with("`r") && s.len() >= 4
{
let pattern = &s[2..s.len() - 2];
let pattern = format!("\\A{pattern}");
let re = new_regex(&pattern).map_err(|e| format!("Invalid regex '{}': {}", pattern, e))?;
comment_regexes.push(re);
}
if name == "__COMMENTS" && s.starts_with("\"") && s.ends_with("\"") && s.len() >= 3
{
let pattern = &s[1..s.len() - 1];
comments.push(pattern.to_string());
}
}
continue;
}
let index = *by_name.get(name).unwrap();
let mut forms = Vec::new();
let mut recover = None;
for raw_alt in raw_forms
{
let mut matching_terms = Vec::new();
let mut pruned = false;
let mut i = 0;
while i < raw_alt.len()
{
let term_str = &raw_alt[i];
i += 1;
if term_str.starts_with('"') && term_str.ends_with('"') && term_str.len() >= 2
{
let literal = term_str[1..term_str.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
matching_terms.push(MatchingTermE::TermLit(string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal)).to());
literals.insert(literal.clone());
continue;
}
if term_str.starts_with("r`") && term_str.ends_with("`r") && term_str.len() >= 4
{
let pattern = &term_str[2..term_str.len() - 2];
let pattern_all = format!("\\A(?:{pattern})\\z"); let pattern = format!("\\A(?:{pattern})"); let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
let re2 = RegexCacher::new_with_pool(re2, &mut cache_pool);
lex_regexes.insert(pattern, re2.clone());
matching_terms.push(MatchingTermE::TermRegex(re2).to());
continue;
}
if term_str.starts_with("R`") && term_str.ends_with("`r") && term_str.len() >= 4
{
let pattern = &term_str[2..term_str.len() - 2];
let pattern_all = format!("\\A(?:{pattern})\\z"); let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
continue;
}
if term_str.starts_with("A`") && term_str.ends_with("`r") && term_str.len() >= 4
{
let pattern = &term_str[2..term_str.len() - 2];
let pattern_all = format!("\\A(?:{pattern})");
let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
matching_terms.push(MatchingTermE::TermRegex(RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
continue;
}
if matches!(&**term_str, "@RECOVER" | "@recover" | "@RECOVER_BEFORE" | "@recover_before") && i < raw_alt.len()
{
let pattern = &raw_alt[i];
if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
|| !pattern.ends_with("`r")
{
Err(format!("@recover guards only accept regex strings"))?
}
let no_z = pattern.starts_with("A`");
let pattern = &pattern[2..pattern.len() - 2];
let mut pattern_all = format!("\\A(?:{})\\z", pattern);
if no_z { pattern_all = format!("\\A(?:{})", pattern); }
let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
if recover.is_some() { Err(format!("Rule {name} has multiple @recover items. Only one is supported."))? }
recover = Some((RegexCacher::new_with_pool(re2, &mut cache_pool), matches!(&**term_str, "@RECOVER" | "@recover")));
i += 1;
continue;
}
if matches!(&**term_str, "@EOF" | "@eof")
{
matching_terms.push(MatchingTermE::Eof.to());
continue;
}
if matches!(&**term_str, "@AUTO" | "@auto")
{
matching_terms.push(MatchingTermE::_AutoTemp.to());
continue;
}
if (term_str == "@PEEK" || term_str == "@peek"
|| term_str == "@PEEKR" || term_str == "@peekr"
|| term_str == "@PEEKRES" || term_str == "@peekres") && i + 4 < raw_alt.len()
{
if raw_alt[i] != "(" || raw_alt[i+2] != "," || raw_alt[i+4] != ")" { Err(format!("Invalid peek syntax: must be @peek(num, str)"))? }
let n = raw_alt[i+1].parse::<isize>().map_err(|_| format!("Not a supported peek distance: {}", raw_alt[i+1]))?;
if term_str == "@PEEK" || term_str == "@peek"
{
let literal = &raw_alt[i+3];
if literal.len() < 2 || !literal.starts_with("\"") || !literal.ends_with("\"")
{
return Err(format!("@peek guards only accept plain strings"));
}
let literal = literal[1..literal.len() - 1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n");
let s = string_cache_lookup_id(&mut string_cache, &mut string_cache_inv, &literal);
matching_terms.push(MatchingTermE::Peek(n, s).to());
}
else
{
let pattern = &raw_alt[i+3];
if !(pattern.starts_with("r`") || pattern.starts_with("R`") || pattern.starts_with("A`"))
|| !pattern.ends_with("`r")
{
Err(format!("@recover guards only accept regex strings"))?
}
let no_z = pattern.starts_with("A`");
let pattern = &pattern[2..pattern.len() - 2];
let mut pattern_all = format!("\\A(?:{})\\z", pattern);
if no_z { pattern_all = format!("\\A(?:{})", pattern); }
let re2 = new_regex(&pattern_all).map_err(|e| format!("Invalid regex '{}': {}", pattern_all, e))?;
if term_str == "@PEEKRES" || term_str == "@peekres"
{
matching_terms.push(MatchingTermE::PeekRes(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
}
else
{
matching_terms.push(MatchingTermE::PeekR(n, RegexCacher::new_with_pool(re2, &mut cache_pool)).to());
}
}
i += 5;
continue;
}
if (term_str == "@GUARD" || term_str == "@guard") && i + 2 < raw_alt.len()
{
if raw_alt[i] != "(" || raw_alt[i+2] != ")"
{
return Err(format!("Invalid guard syntax: must be @guard(name)"));
}
let literal = &raw_alt[i+1];
let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
matching_terms.push(MatchingTermE::Guard(s).to());
i += 3;
continue;
}
if (term_str == "!HOOK" || term_str == "!hook") && i + 2 < raw_alt.len()
{
if raw_alt[i] != "(" || raw_alt[i+2] != ")"
{
return Err(format!("Invalid hook syntax: must be @hook(name)"));
}
let literal = &raw_alt[i+1];
let s = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &literal).0;
matching_terms.push(MatchingTermE::Hook(s).to());
i += 3;
continue;
}
if matches!(&**term_str, "$BECOME" | "$become")
{
matching_terms.push(MatchingTermE::Directive(MatchDirective::Become).to());
continue;
}
if matches!(&**term_str, "$BECOME_AS" | "$become_as")
{
matching_terms.push(MatchingTermE::Directive(MatchDirective::BecomeAs).to());
continue;
}
if matches!(&**term_str, "$HOIST" | "$hoist")
{
matching_terms.push(MatchingTermE::Directive(MatchDirective::Hoist).to());
continue;
}
if matches!(&**term_str, "$HOIST_UNIT" | "$hoist_unit")
{
matching_terms.push(MatchingTermE::Directive(MatchDirective::HoistIfUnit).to());
continue;
}
if matches!(&**term_str, "$DROP" | "$drop")
{
matching_terms.push(MatchingTermE::Directive(MatchDirective::Drop).to());
continue;
}
if matches!(&**term_str, "$DROP_EMPTY" | "$drop_empty")
{
matching_terms.push(MatchingTermE::Directive(MatchDirective::DropIfEmpty).to());
continue;
}
if matches!(&**term_str, "$RENAME" | "$rename")
{
matching_terms.push(MatchingTermE::Directive(MatchDirective::Rename).to());
continue;
}
if matches!(&**term_str, "$ANY" | "$any")
{
matching_terms.push(MatchingTermE::Directive(MatchDirective::Any).to());
continue;
}
if matches!(&**term_str, "$PRUNED" | "$pruned")
{
pruned = true;
continue;
}
let id = by_name.get(term_str).ok_or_else(|| format!("Not a defined grammar rule: '{term_str}' (context: '{name}')"))?;
matching_terms.push(MatchingTermE::Rule(*id).to());
}
if matching_terms.len() > 60000
{
Err(format!("More than 60k items in an alternation of {name}. Factor them out, dummy!"))?
}
if let Some(MatchingTermE::_AutoTemp) = matching_terms.get(0).map(|x| &x.t)
{
match matching_terms.get(1).map(|x| &x.t)
{
Some(MatchingTermE::TermLit(s)) =>
{
matching_terms[0] = MatchingTermE::Peek(0, *s).to();
matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
}
Some(MatchingTermE::TermRegex(r)) =>
{
let r = r.clone();
matching_terms[0] = MatchingTermE::PeekR(0, r).to();
matching_terms[1] = MatchingTermE::Directive(MatchDirective::Any).to();
}
_ => Err(format!("@auto must be followed by a string literal or regex literal (context: {name})"))?
}
}
forms.push(Alternation { matching_terms, pruned });
}
if forms.len() > 60000
{
Err(format!("More than 60k alternations in {name}. Factor them out, dummy!"))?
}
let mut num_nonguards = 0;
for f in &forms
{
if num_nonguards != 0
{
eprintln!("!!!!!!\n!!!!!! Warning: rule {name} has at least one alternation that is inaccessible!\n!!!!!!");
break;
}
if !(if let Some(x) = f.matching_terms.get(0) && matches!(x.t, MatchingTermE::Peek(_, _) | MatchingTermE::PeekR(_, _) | MatchingTermE::Guard(_) | MatchingTermE::Eof) { true } else { false })
{ num_nonguards += 1; }
}
if index > 4000000000
{
Err(format!("More than 4 billion grammar terms in grammar. What are you doing??? STOP!!!!! (╯°□°)╯︵ ┻━┻"))?
}
let (name, name_id) = string_cache_lookup(&mut string_cache, &mut string_cache_inv, &name);
points.push(GrammarPoint
{
name,
name_id,
forms,
recover,
});
}
let mut literals = literals.into_iter().collect::<Vec<_>>();
literals.sort();
let mut regexes = Vec::new();
for (r, r2) in lex_regexes
{
regexes.push((new_regex(&r).map_err(|e| format!("Invalid regex '{}': {}", r, e))?, r2));
}
Ok(Grammar { points, by_name, literals, regexes, string_cache, string_cache_inv, bracket_pairs, comments, comment_pairs, comment_regexes, reserved, comment_pairs_nested })
}
pub fn bnf_to_grammar(s : &str) -> Result<Grammar, String>
{
grammar_convert(&bnf_parse(s)?)
}
#[derive(Debug, Clone, Default)]
pub struct Token {
pub text : u32,
pub pair : isize,
}
pub (crate) fn build_literal_regex(literals : &Vec<String>, terminated : bool) -> Regex
{
let mut text_token_regex_s = "\\A(?:".to_string();
let mut lits = literals.clone();
lits.sort_by(|a, b| b.len().cmp(&a.len()));
for text in lits.iter()
{
let s2 = regex::escape(&text);
text_token_regex_s += &s2;
text_token_regex_s += "|";
}
if lits.len() > 0 { text_token_regex_s.pop(); }
text_token_regex_s += ")";
if terminated { text_token_regex_s += "\\z"; }
let text_token_regex = new_regex(&text_token_regex_s).unwrap();
text_token_regex
}
#[derive(Clone, Debug)]
pub struct TokError {
#[allow(unused)] pub err_message : String,
#[allow(unused)] pub produced : usize,
#[allow(unused)] pub location : usize,
#[allow(unused)] pub pairing_error : Option<(String, bool)>,
}
pub fn tokenize(
g : &mut Grammar,
mut s : &str
) -> Result<Vec<Token>, TokError>
{
let s_orig = s;
let mut tokens = Vec::<Token>::new();
tokens.reserve(s.len()/16 + 1);
let mut lit_filtered = Vec::new();
for l in g.literals.iter()
{
let mut covered = false;
for r in &g.regexes
{
if let Some(loc) = regex_find(&r.0, l).map(|x| x.end() - x.start())
{
if loc == l.len()
{
covered = true;
break;
}
}
}
if !covered
{
lit_filtered.push(l.clone());
}
}
let all_literals_regex = if lit_filtered.len() > 0
{
Some(build_literal_regex(&lit_filtered, false))
}
else
{
None
};
for text in g.literals.iter()
{
string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, text);
}
for point in g.points.iter()
{
string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &point.name);
}
let mut openers = HashSet::default();
let mut closers = HashMap::default();
let mut stacks = HashMap::default();
let mut any_paired = false;
for (l, r) in &g.bracket_pairs
{
let lsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &l);
let rsc = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &r);
openers.insert(lsc);
closers.insert(rsc, lsc);
stacks.insert(lsc, Vec::<usize>::new());
any_paired = true;
}
'top: while !s.is_empty()
{
if get_char_at_byte(s, 0) as u32 <= 0x20
{
if matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
{
while !s.is_empty() && matches!(get_char_at_byte(s, 0), ' ' | '\r' | '\n' | '\t')
{
s = &s[1..]; }
if s.is_empty() { break; }
continue 'top;
}
}
for re in &g.comment_regexes
{
if let Some(x) = regex_find(re, s)
{
s = &s[x.end() - x.start()..];
continue 'top;
}
}
for c in &g.comments
{
if s.starts_with(c)
{
s = &s[c.len()..];
while s.len() > 0 && get_char_at_byte(s, 0) != '\n'
{
if get_char_at_byte(s, 0) == '\\'
{
s = &s[get_char_at_byte(s, 0).len_utf8()..]; }
if s.len() > 0
{
s = &s[get_char_at_byte(s, 0).len_utf8()..];
}
}
continue 'top;
}
}
for (l, r) in &g.comment_pairs_nested
{
if s.starts_with(l)
{
s = &s[l.len()..];
let mut nest = 1;
while s.len() > 0 && nest > 0
{
if s.starts_with(l) { nest += 1; }
s = &s[get_char_at_byte(s, 0).len_utf8()..];
if s.starts_with(r) { nest -= 1; }
}
if s.starts_with(r) { s = &s[r.len()..]; }
continue 'top;
}
}
for (l, r) in &g.comment_pairs
{
if s.starts_with(l)
{
s = &s[l.len()..];
while s.len() > 0 && !s.starts_with(r)
{
s = &s[get_char_at_byte(s, 0).len_utf8()..];
}
s = &s[r.len()..];
continue 'top;
}
}
let mut longest = 0;
for r in &g.regexes
{
if let Some(loc) = regex_find(&r.0, s)
{
let len = loc.end() - loc.start();
longest = longest.max(len);
}
}
if let Some(all_literals_regex) = &all_literals_regex && let Some(loc) = regex_find(&all_literals_regex, s)
{
let len = loc.end() - loc.start();
longest = longest.max(len);
}
if longest == 0
{
let sn : String = s.chars().take(5).collect();
return Err(TokError {
err_message: format!("Failed to tokenize at index {}:{}[...]", s_orig.len()-s.len(), sn),
produced : tokens.len(),
location : s_orig.len() - s.len(),
pairing_error : None,
});
}
let text = string_cache_lookup_id(&mut g.string_cache, &mut g.string_cache_inv, &s[..longest]);
let mut token = Token { text, pair : 0 };
if any_paired
{
if openers.contains(&text) && let Some(s) = stacks.get_mut(&text)
{
s.push(tokens.len());
}
if let Some(l) = closers.get(&text) && let Some(stack) = stacks.get_mut(l)
{
let n = match stack.pop() {
Some(n) => n,
None => Err(TokError {
err_message: format!("Unmatched delimiter at {}: {}", s_orig.len() - s.len(), g.string_cache_inv.get(text as usize).unwrap()),
produced : tokens.len(),
location : s_orig.len() - s.len(),
pairing_error : Some((s[..longest].to_string(), false)),
})?,
};
let me = tokens.len();
let diff = match me.checked_signed_diff(n) {
Some(n) => n,
None => Err(TokError {
err_message: format!("Input too long"),
produced : tokens.len(),
location : s_orig.len() - s.len(),
pairing_error : None,
})?,
};
token.pair = -diff;
tokens[n].pair = diff;
}
}
s = &s[longest..];
tokens.push(token);
}
Ok(tokens)
}