use regex::Regex;
use std::collections::HashMap;
use std::sync::{Arc, Mutex, OnceLock};
use crate::parse::Parser;
use crate::state::{ParserState, Span};
use aho_corasick::{AhoCorasickBuilder, Anchored, Input, MatchKind, StartKind};
/// Returns a shared, compiled [`Regex`] for `pattern`.
///
/// Compiled expressions are stored in a process-wide cache keyed by the pattern text, so
/// repeated calls with the same pattern hand back clones of the same `Arc`. Entries are
/// never evicted.
///
/// Compilation happens *outside* the cache lock: a pattern that fails to compile
/// therefore cannot poison the mutex, and slow compilations do not block lookups of
/// other patterns. If two threads race on the same new pattern, both compile it but
/// the first inserted value is the one every caller receives.
///
/// # Panics
///
/// Panics if `pattern` is not a valid regular expression.
pub fn cached_regex(pattern: &str) -> Arc<Regex> {
    static CACHE: OnceLock<Mutex<HashMap<String, Arc<Regex>>>> = OnceLock::new();
    let cache = CACHE.get_or_init(|| Mutex::new(HashMap::new()));

    // Fast path: the pattern has been compiled before. The guard is confined to this
    // scope so the lock is released before any compilation starts.
    {
        // A poisoned lock only means some thread panicked while holding it; the map
        // itself is still structurally valid, so keep using it.
        let map = cache.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
        if let Some(re) = map.get(pattern) {
            return Arc::clone(re);
        }
    }

    let compiled = Arc::new(
        Regex::new(pattern).unwrap_or_else(|_| panic!("Failed to compile regex: {}", pattern)),
    );

    let mut map = cache.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
    // `or_insert` keeps an entry another thread may have added in the meantime.
    let shared = map.entry(pattern.to_owned()).or_insert(compiled);
    Arc::clone(shared)
}
/// Counts the ASCII whitespace bytes (`' '`, `'\t'`, `'\n'`, `'\r'`) that start at
/// `state.offset` in `state.src_bytes`.
///
/// The state is only read; the caller decides whether to advance the offset by the
/// returned count. Returns `0` when the offset is at or past the end of the input or
/// when the byte at the offset is not whitespace.
///
/// The bulk of the scan uses `std::simd` to test 16 bytes at a time, followed by a
/// scalar loop for the remaining tail.
#[inline(always)]
pub fn trim_leading_whitespace(state: &ParserState<'_>) -> usize {
let bytes = state.src_bytes;
let mut i = state.offset;
let end = bytes.len();
// Fast exit for the common case where there is nothing to skip.
// SAFETY: `get_unchecked(i)` is only evaluated when `i >= end` was false
// (short-circuit `||`), so `i < bytes.len()`.
if i >= end || !matches!(unsafe { *bytes.get_unchecked(i) }, b' ' | b'\t' | b'\n' | b'\r') {
return 0;
}
// The byte at the original offset is known to be whitespace.
i += 1;
{
use std::simd::prelude::*;
// Vectorised scan: process whole 16-byte chunks while they fit in the input.
while i + 16 <= end {
let chunk = u8x16::from_slice(&bytes[i..i + 16]);
// Lane-wise "is this byte one of the four whitespace bytes?"
let mask = chunk.simd_eq(u8x16::splat(b' '))
| chunk.simd_eq(u8x16::splat(b'\t'))
| chunk.simd_eq(u8x16::splat(b'\n'))
| chunk.simd_eq(u8x16::splat(b'\r'));
if mask.all() {
i += 16;
continue;
}
// At least one lane is not whitespace: the lowest set bit of the inverted mask
// is the index of the first such lane within this chunk.
let first_non_ws = (!mask).to_bitmask().trailing_zeros() as usize;
return i + first_non_ws - state.offset;
}
}
// Scalar tail: fewer than 16 bytes remain.
while i < end {
// SAFETY: the loop condition guarantees `i < end == bytes.len()`.
match unsafe { *bytes.get_unchecked(i) } {
b' ' | b'\t' | b'\n' | b'\r' => i += 1,
_ => break,
}
}
i - state.offset
}
/// Advances `state.offset` past the whitespace run reported by
/// [`trim_leading_whitespace`] for the current position.
#[inline(always)]
pub fn trim_leading_whitespace_mut(state: &mut ParserState<'_>) {
    state.offset += trim_leading_whitespace(state);
}
/// Builds a parser that always succeeds with `()` and never reads or modifies the
/// parser state.
#[inline]
pub fn epsilon<'a>() -> Parser<'a, ()> {
    Parser::new(|_state: &mut ParserState<'a>| Some(()))
}
/// Tries to match the literal bytes `s_bytes` at the current offset.
///
/// `end` is the number of bytes to consume on success; the callers in this file pass
/// `s_bytes.len()`. When `*end` is zero an empty span at the current offset is returned
/// without inspecting the input. On a match the offset is advanced by `*end` and the
/// covered span is returned; otherwise the state is left untouched and `None` is
/// returned.
///
/// # Panics
///
/// Panics if `*end` is non-zero while `s_bytes` is empty.
#[inline(always)]
pub fn string_impl<'a>(
    s_bytes: &[u8],
    end: &usize,
    state: &mut ParserState<'a>,
) -> Option<Span<'a>> {
    let needle_len = *end;
    let begin = state.offset;
    if needle_len == 0 {
        return Some(Span::new(begin, begin, state.src));
    }

    let rest = state.src_bytes.get(begin..)?;
    // Compare the first byte on its own before the remainder, so most mismatches are
    // rejected without the slice comparison.
    let is_match = rest.len() >= needle_len
        && rest[0] == s_bytes[0]
        && rest[1..needle_len].starts_with(&s_bytes[1..]);
    if !is_match {
        return None;
    }

    state.offset = begin + needle_len;
    Some(Span::new(begin, state.offset, state.src))
}
/// Builds a parser that matches the literal `s` at the current offset and yields the
/// matched span as a string slice.
///
/// With the `diagnostics` feature enabled, a failed match records the quoted literal as
/// an expected token. The label is created with `Box::leak`, so each call to this
/// constructor leaks one small allocation in that configuration.
#[inline(always)]
// The failure path carries a cfg-gated side effect, so it is not a plain `Option::map`.
#[allow(clippy::manual_map)]
pub fn string<'a>(s: &'a str) -> Parser<'a, &'a str> {
    let needle = s.as_bytes();
    let needle_len = needle.len();
    #[cfg(feature = "diagnostics")]
    let label: &'static str = Box::leak(format!("\"{}\"", s).into_boxed_str());

    Parser::new(move |state: &mut ParserState<'a>| {
        if let Some(span) = string_impl(needle, &needle_len, state) {
            return Some(span.as_str());
        }
        #[cfg(feature = "diagnostics")]
        state.add_expected(label);
        None
    })
}
/// Builds a parser that matches the literal `s` at the current offset and yields the
/// [`Span`] it covers.
///
/// With the `diagnostics` feature enabled, a failed match records the quoted literal as
/// an expected token. The label is created with `Box::leak`, so each call to this
/// constructor leaks one small allocation in that configuration.
#[inline(always)]
// The failure path carries a cfg-gated side effect, so it is not a plain `Option::map`.
#[allow(clippy::manual_map)]
pub fn string_span<'a>(s: &'a str) -> Parser<'a, Span<'a>> {
    let needle = s.as_bytes();
    let needle_len = needle.len();
    #[cfg(feature = "diagnostics")]
    let label: &'static str = Box::leak(format!("\"{}\"", s).into_boxed_str());

    Parser::new(move |state: &mut ParserState<'a>| {
        if let Some(span) = string_impl(needle, &needle_len, state) {
            return Some(span);
        }
        #[cfg(feature = "diagnostics")]
        state.add_expected(label);
        None
    })
}
/// Tries to match `re` starting exactly at the current offset.
///
/// The regex is run against the remaining input only (`state.src[state.offset..]`), and
/// a match is accepted only when it begins at position 0 of that slice. On success the
/// offset is advanced past the match and its span is returned.
///
/// Returns `None` without touching the state when `state.is_at_end()` is true, when the
/// offset does not yield a valid string slice, or when the leftmost match does not start
/// at the current offset.
#[inline(always)]
fn regex_impl<'a>(re: &Regex, state: &mut ParserState<'a>) -> Option<Span<'a>> {
    if state.is_at_end() {
        return None;
    }

    let rest = state.src.get(state.offset..)?;
    // The search is unanchored, so reject a hit that starts later in the input.
    let found = re.find_at(rest, 0).filter(|m| m.start() == 0)?;

    let begin = state.offset;
    state.offset = begin + found.end();
    Some(Span::new(begin, state.offset, state.src))
}
/// Builds a parser that matches the regular expression `r` at the current offset and
/// yields the matched span as a string slice.
///
/// The compiled expression comes from [`cached_regex`], so this constructor panics if
/// `r` is not a valid pattern. With the `diagnostics` feature enabled, a failed match
/// records `/r/` as an expected token; that label is created with `Box::leak`.
#[inline(always)]
// The failure path carries a cfg-gated side effect, so it is not a plain `Option::map`.
#[allow(clippy::manual_map)]
pub fn regex<'a>(r: &'a str) -> Parser<'a, &'a str> {
    let compiled = cached_regex(r);
    #[cfg(feature = "diagnostics")]
    let label: &'static str = Box::leak(format!("/{}/", r).into_boxed_str());

    Parser::new(move |state: &mut ParserState<'a>| {
        if let Some(span) = regex_impl(&compiled, state) {
            return Some(span.as_str());
        }
        #[cfg(feature = "diagnostics")]
        state.add_expected(label);
        None
    })
}
/// Builds a parser that matches the regular expression `r` at the current offset and
/// yields the [`Span`] it covers.
///
/// The compiled expression comes from [`cached_regex`], so this constructor panics if
/// `r` is not a valid pattern. With the `diagnostics` feature enabled, a failed match
/// records `/r/` as an expected token; that label is created with `Box::leak`.
#[inline(always)]
// The failure path carries a cfg-gated side effect, so it is not a plain `Option::map`.
#[allow(clippy::manual_map)]
pub fn regex_span<'a>(r: &'a str) -> Parser<'a, Span<'a>> {
    let compiled = cached_regex(r);
    #[cfg(feature = "diagnostics")]
    let label: &'static str = Box::leak(format!("/{}/", r).into_boxed_str());

    Parser::new(move |state: &mut ParserState<'a>| {
        if let Some(span) = regex_impl(&compiled, state) {
            return Some(span);
        }
        #[cfg(feature = "diagnostics")]
        state.add_expected(label);
        None
    })
}
/// Builds a parser that consumes the longest non-empty run of characters accepted by
/// `f`, starting at the current offset, and yields the span of that run.
///
/// Fails (leaving the state untouched) when the first character is rejected, when no
/// input remains, or when the offset does not yield a valid string slice. With the
/// `diagnostics` feature enabled, the first two cases record `"matching character"` as
/// expected.
#[inline]
pub fn take_while_span<'a, F>(f: F) -> Parser<'a, Span<'a>>
where
    F: Fn(char) -> bool + 'a,
{
    Parser::new(move |state: &mut ParserState<'a>| {
        let rest = state.src.get(state.offset..)?;
        // The byte index of the first rejected character is the length of the accepted
        // run; if every character is accepted the run covers the whole remainder.
        let accepted = rest
            .char_indices()
            .find(|&(_, c)| !f(c))
            .map_or(rest.len(), |(idx, _)| idx);

        if accepted == 0 {
            #[cfg(feature = "diagnostics")]
            state.add_expected("matching character");
            return None;
        }

        let begin = state.offset;
        state.offset = begin + accepted;
        Some(Span::new(begin, state.offset, state.src))
    })
}
/// Builds a parser that consumes the longest non-empty run of bytes accepted by `f`,
/// starting at the current offset, and yields the span of that run.
///
/// The scan works on raw bytes and performs no UTF-8 boundary check of its own. Fails
/// (leaving the state untouched) when no byte is accepted; with the `diagnostics`
/// feature enabled this records `"matching byte"` as expected.
#[inline]
pub fn take_while_byte_span<'a>(f: fn(u8) -> bool) -> Parser<'a, Span<'a>> {
    Parser::new(move |state: &mut ParserState<'a>| {
        let begin = state.offset;
        // An offset past the end yields no slice and therefore an empty run.
        let accepted = state
            .src_bytes
            .get(begin..)
            .map_or(0, |rest| rest.iter().take_while(|&&byte| f(byte)).count());

        if accepted == 0 {
            #[cfg(feature = "diagnostics")]
            state.add_expected("matching byte");
            return None;
        }

        let stop = begin + accepted;
        state.offset = stop;
        Some(Span::new(begin, stop, state.src))
    })
}
/// Builds a parser that consumes bytes up to (but not including) the first byte that
/// appears in `excluded`, or up to the end of input if none appears, and yields the
/// span of the consumed bytes.
///
/// The scan strategy is chosen once, at construction time, from the number of distinct
/// bytes in `excluded`: one, two or three distinct bytes use the corresponding `memchr`
/// routine; any other count (including zero) uses a 256-entry lookup table.
///
/// Fails (leaving the state untouched) when nothing can be consumed, i.e. at or past the
/// end of input or when the very first byte is excluded. With the `diagnostics` feature
/// enabled, failure records a label listing the excluded bytes.
#[inline]
pub fn take_until_any_span<'a>(excluded: &'static [u8]) -> Parser<'a, Span<'a>> {
    /// How the closure searches for the first excluded byte.
    enum Scan {
        One(u8),
        Two(u8, u8),
        Three(u8, u8, u8),
        Table(Box<[bool; 256]>),
    }

    // Deduplicate `excluded`, remembering the first three distinct bytes in order of
    // appearance and counting how many distinct bytes there are in total.
    let mut is_excluded = [false; 256];
    let mut firsts = [0u8; 3];
    let mut distinct = 0usize;
    for &b in excluded {
        let seen = &mut is_excluded[usize::from(b)];
        if *seen {
            continue;
        }
        *seen = true;
        if let Some(slot) = firsts.get_mut(distinct) {
            *slot = b;
        }
        distinct += 1;
    }

    let scan = match (distinct, firsts) {
        (1, [a, _, _]) => Scan::One(a),
        (2, [a, b, _]) => Scan::Two(a, b),
        (3, [a, b, c]) => Scan::Three(a, b, c),
        _ => Scan::Table(Box::new(is_excluded)),
    };

    #[cfg(feature = "diagnostics")]
    let label: &'static str = {
        let chars: String = excluded.iter().map(|&b| b as char).collect();
        Box::leak(format!("any byte not in [{}]", chars).into_boxed_str())
    };

    Parser::new(move |state: &mut ParserState<'a>| {
        let begin = state.offset;
        // An offset at or past the end gives an empty remainder, which falls through
        // to the "nothing consumed" failure below.
        let rest = state.src_bytes.get(begin..).unwrap_or_default();

        let first_excluded = match &scan {
            Scan::One(a) => memchr::memchr(*a, rest),
            Scan::Two(a, b) => memchr::memchr2(*a, *b, rest),
            Scan::Three(a, b, c) => memchr::memchr3(*a, *b, *c, rest),
            Scan::Table(table) => rest.iter().position(|&byte| table[usize::from(byte)]),
        };
        let taken = first_excluded.unwrap_or(rest.len());

        if taken == 0 {
            #[cfg(feature = "diagnostics")]
            state.add_expected(label);
            return None;
        }

        let stop = begin + taken;
        state.offset = stop;
        Some(Span::new(begin, stop, state.src))
    })
}
/// Builds a parser that consumes exactly `amount` bytes from the current offset and
/// yields the span covering them.
///
/// Fails (leaving the state untouched) when fewer than `amount` bytes remain. The end
/// position is computed with checked arithmetic, so an `amount` large enough to
/// overflow `usize` is treated as "not enough input" instead of panicking in debug
/// builds or wrapping around in release builds.
///
/// No UTF-8 boundary check is performed here.
#[inline]
pub fn next_span<'a>(amount: usize) -> Parser<'a, Span<'a>> {
    Parser::new(move |state: &mut ParserState<'a>| {
        let start = state.offset;
        let new_offset = start
            .checked_add(amount)
            .filter(|&end| end <= state.src.len())?;
        state.offset = new_offset;
        Some(Span::new(start, new_offset, state.src))
    })
}
/// Builds a parser that matches any one of `patterns` at the current offset and yields
/// the span of the match.
///
/// The patterns are compiled into an anchored Aho-Corasick automaton with
/// leftmost-first match semantics, so among patterns that match at the current offset
/// the one listed earliest in `patterns` is chosen.
///
/// With the `diagnostics` feature enabled, a failed match records the pattern list as
/// expected. If the offset does not yield a valid string slice the parser returns `None`
/// without recording anything.
///
/// # Panics
///
/// Panics if the automaton cannot be built from `patterns`.
pub fn any_span<'a>(patterns: &[&'a str]) -> Parser<'a, Span<'a>> {
    let automaton = AhoCorasickBuilder::new()
        .match_kind(MatchKind::LeftmostFirst)
        .start_kind(StartKind::Anchored)
        .build(patterns)
        .expect("failed to build aho-corasick automaton");
    #[cfg(feature = "diagnostics")]
    let label: &'static str = Box::leak(format!("one of {:?}", patterns).into_boxed_str());

    Parser::new(move |state: &mut ParserState<'a>| {
        let rest = state.src.get(state.offset..)?;
        let search = Input::new(rest).anchored(Anchored::Yes);

        if let Some(found) = automaton.find(search) {
            let begin = state.offset;
            state.offset = begin + found.end();
            return Some(Span::new(begin, state.offset, state.src));
        }

        #[cfg(feature = "diagnostics")]
        state.add_expected(label);
        None
    })
}
/// Builds a parser that tries each of `parsers` in order and returns the first success.
///
/// After every failed alternative, `state.furthest_offset` is raised to the offset that
/// alternative had reached (if larger), and the offset is rewound to where the
/// alternative started. Returns `None` when every alternative fails.
pub fn one_of<'a, O: 'a>(parsers: Vec<Parser<'a, O>>) -> Parser<'a, O> {
    Parser::new(move |state: &mut ParserState<'a>| {
        parsers.iter().find_map(|alternative| {
            let checkpoint = state.offset;
            let outcome = alternative.call(state);
            if outcome.is_none() {
                // Remember how far this alternative got before backtracking.
                state.furthest_offset = state.furthest_offset.max(state.offset);
                state.offset = checkpoint;
            }
            outcome
        })
    })
}
/// Builds a parser that inspects the byte at the current offset and delegates to the
/// parser registered for that byte in `table`.
///
/// The dispatch byte is only peeked at: the selected parser is called with the offset
/// unchanged. If a byte appears in more than one entry, the last entry wins.
///
/// Fails when there is no byte at the current offset or when no entry is registered for
/// it. With the `diagnostics` feature enabled, both cases record the set of dispatch
/// bytes as expected, so running out of input is reported like any other mismatch.
///
/// # Panics
///
/// Panics at construction time if `table` has an entry whose index does not fit in a
/// `u16` (more than 65 536 entries); such an index would otherwise be truncated and
/// dispatch to the wrong parser.
pub fn dispatch_byte<'a, O: 'a>(table: Vec<(u8, Parser<'a, O>)>) -> Parser<'a, O> {
    let mut lut: [Option<u16>; 256] = [None; 256];
    for (i, (byte, _)) in table.iter().enumerate() {
        assert!(
            i <= usize::from(u16::MAX),
            "dispatch_byte: table index {} does not fit in u16",
            i
        );
        // Cannot truncate: guarded by the assertion above.
        lut[usize::from(*byte)] = Some(i as u16);
    }
    #[cfg(feature = "diagnostics")]
    let label: &'static str = {
        let chars: Vec<char> = table.iter().map(|(b, _)| *b as char).collect();
        Box::leak(format!("one of {:?}", chars).into_boxed_str())
    };
    Parser::new(move |state: &mut ParserState<'a>| {
        // `None` covers both "no byte left" and "no parser registered for this byte".
        let target = state
            .src_bytes
            .get(state.offset)
            .and_then(|&byte| lut[usize::from(byte)]);
        if let Some(idx) = target {
            table[usize::from(idx)].1.call(state)
        } else {
            #[cfg(feature = "diagnostics")]
            state.add_expected(label);
            None
        }
    })
}
/// Builds a parser that inspects the byte at the current offset and delegates to the
/// parser whose byte set contains it.
///
/// Each entry of `table` pairs a set of dispatch bytes with one parser. The dispatch
/// byte is only peeked at: the selected parser is called with the offset unchanged. If
/// a byte appears in more than one entry, the last entry wins.
///
/// Fails when there is no byte at the current offset or when no entry is registered for
/// it. With the `diagnostics` feature enabled, both cases record the set of dispatch
/// bytes as expected, so running out of input is reported like any other mismatch.
///
/// # Panics
///
/// Panics at construction time if `table` has an entry whose index does not fit in a
/// `u16` (more than 65 536 entries); such an index would otherwise be truncated and
/// dispatch to the wrong parser.
pub fn dispatch_byte_multi<'a, O: 'a>(table: Vec<(&[u8], Parser<'a, O>)>) -> Parser<'a, O> {
    let mut lut: [Option<u16>; 256] = [None; 256];
    let mut parsers: Vec<Parser<'a, O>> = Vec::with_capacity(table.len());
    #[cfg(feature = "diagnostics")]
    let mut all_bytes: Vec<u8> = Vec::new();
    for (bytes, parser) in table {
        let position = parsers.len();
        assert!(
            position <= usize::from(u16::MAX),
            "dispatch_byte_multi: table index {} does not fit in u16",
            position
        );
        // Cannot truncate: guarded by the assertion above.
        let idx = position as u16;
        parsers.push(parser);
        for &byte in bytes {
            lut[usize::from(byte)] = Some(idx);
            #[cfg(feature = "diagnostics")]
            all_bytes.push(byte);
        }
    }
    #[cfg(feature = "diagnostics")]
    let label: &'static str = {
        let chars: Vec<char> = all_bytes.iter().map(|b| *b as char).collect();
        Box::leak(format!("one of {:?}", chars).into_boxed_str())
    };
    Parser::new(move |state: &mut ParserState<'a>| {
        // `None` covers both "no byte left" and "no parser registered for this byte".
        let target = state
            .src_bytes
            .get(state.offset)
            .and_then(|&byte| lut[usize::from(byte)]);
        if let Some(idx) = target {
            parsers[usize::from(idx)].call(state)
        } else {
            #[cfg(feature = "diagnostics")]
            state.add_expected(label);
            None
        }
    })
}