use proc_macro::TokenStream;
use quote::quote;
extern crate proc_macro;
pub mod config;
pub mod fixed_offset;
pub mod nfa_static;
pub mod one_pass_u8;
pub mod parse_tree;
pub mod pike_vm;
pub mod pike_vm_u8;
pub mod simplified_tree;
pub mod visualization;
pub mod working_nfa;
pub mod working_u8_nfa;
/// A compiled regular expression backed by a pair of plain function pointers.
///
/// `N` is the length of the array returned by [`Regex::exec`]
/// (presumably the number of capture groups — confirm against the code generators).
pub struct Regex<const N: usize = 1> {
    /// Decides whether a text matches.
    test_fn: fn(&str) -> bool,
    /// Produces the `N` optional sub-slices of the text on a match, or `None` otherwise.
    exec_fn: for<'a> fn(&'a str) -> Option<[Option<&'a str>; N]>,
}

impl<const N: usize> Regex<N> {
    /// Runs the stored `test` function on `text` and returns its verdict.
    #[inline]
    pub fn test(&self, text: &str) -> bool {
        let matcher = self.test_fn;
        matcher(text)
    }

    /// Runs the stored `exec` function on `text`.
    ///
    /// The returned slices borrow from `text`; the result is whatever the stored
    /// function yields (`None`, or an array of `N` optional slices).
    #[inline]
    pub fn exec<'a>(&self, text: &'a str) -> Option<[Option<&'a str>; N]> {
        let extractor = self.exec_fn;
        extractor(text)
    }
}
/// Wraps a `(test, exec)` function pair into a [`Regex`].
///
/// This is the function that the token streams produced by the `__compile_regex*`
/// functions call (as `::ere::__construct_regex`); it is `const` so it can be
/// evaluated in constant contexts.
#[inline]
pub const fn __construct_regex<const N: usize>(
    fn_pair: (
        fn(&str) -> bool,
        for<'a> fn(&'a str) -> Option<[Option<&'a str>; N]>,
    ),
) -> Regex<N> {
    let (test_fn, exec_fn) = fn_pair;
    Regex { test_fn, exec_fn }
}
/// Lowers `ere` through every intermediate representation and picks the base engine.
///
/// Selection order:
/// 1. the one-pass `u8` engine, if `one_pass_u8::serialize_one_pass_token_stream` succeeds;
/// 2. the `u8` Pike VM, if every transition range in the char NFA ends on an ASCII char;
/// 3. the `char` Pike VM otherwise.
///
/// Returns, in order: the serialized engine tokens, the simplified tree, the char NFA,
/// the `u8` NFA, and a markdown description of the chosen engine (used as generated docs).
fn pick_base_engine(
    ere: parse_tree::ERE,
) -> (
    proc_macro2::TokenStream,
    simplified_tree::SimplifiedTreeNode,
    working_nfa::WorkingNFA,
    working_u8_nfa::U8NFA,
    &'static str,
) {
    const ONE_PASS_U8_DESC: &str = "This regular expression is [one-pass](https://swtch.com/~rsc/regexp/regexp3.html#:~:text=Let%27s%20define%20a%20%E2%80%9Cone%2Dpass%20regular%20expression%E2%80%9D).\n\
        This allows us to use an efficient [`::ere::one_pass_u8`] implementation.";
    const PIKE_VM_U8_DESC: &str =
        "Uses a general-case [`::ere::pike_vm_u8`] implementation over `u8`s.";
    const PIKE_VM_DESC: &str =
        "Uses a general-case [`::ere::pike_vm`] implementation over `char`s.";

    /// Returns `true` when every symbol range on the state's transitions ends on an ASCII char.
    fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
        return state
            .transitions
            .iter()
            .flat_map(|t| t.symbol.to_ranges())
            .all(|range| range.end().is_ascii());
    }

    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
    let nfa = working_nfa::WorkingNFA::new(&tree);
    let is_ascii = nfa.states.iter().all(is_state_ascii);
    let u8_nfa = working_u8_nfa::U8NFA::new(&nfa);

    let (base_engine, description) =
        if let Some(engine) = one_pass_u8::serialize_one_pass_token_stream(&u8_nfa) {
            (engine, ONE_PASS_U8_DESC)
        } else if is_ascii {
            (
                pike_vm_u8::serialize_pike_vm_token_stream(&u8_nfa),
                PIKE_VM_U8_DESC,
            )
        } else {
            (pike_vm::serialize_pike_vm_token_stream(&nfa), PIKE_VM_DESC)
        };
    return (base_engine, tree, nfa, u8_nfa, description);
}
/// Chooses the final engine for `ere` and a markdown description of that choice.
///
/// Starts from the engine chosen by `pick_base_engine`. When
/// `fixed_offset::get_fixed_offsets` reports offsets for the `u8` NFA, the base engine
/// is wrapped by the fixed-offset serializer and the description is extended accordingly;
/// otherwise the base engine and its description are returned unchanged.
fn pick_engine(ere: parse_tree::ERE) -> (proc_macro2::TokenStream, String) {
    let (base_engine, _, _, u8_nfa, base_description) = pick_base_engine(ere);

    // No fixed offsets: the base engine is used as-is.
    let Some(offsets) = fixed_offset::get_fixed_offsets(&u8_nfa) else {
        return (base_engine, base_description.to_string());
    };

    let wrapped_engine = fixed_offset::serialize_fixed_offset_token_stream(
        base_engine,
        offsets,
        u8_nfa.num_capture_groups(),
    );
    let wrapped_description = format!(
        "This regular expression's capture groups are always at fixed offsets.\n\
         Because of this, we can skip a complex `exec` implementation, and instead simply run `test` then index into the string.\n\
         ### Details on the `test` implementation:\n\
         {base_description}"
    );
    (wrapped_engine, wrapped_description)
}
pub fn __compile_regex(stream: TokenStream) -> TokenStream {
let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
let (fn_pair, _) = pick_engine(ere);
return quote! {
{
::ere::__construct_regex(#fn_pair)
}
}
.into();
}
pub fn __compile_regex_engine_pike_vm(stream: TokenStream) -> TokenStream {
let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
let tree = simplified_tree::SimplifiedTreeNode::from(ere);
let nfa = working_nfa::WorkingNFA::new(&tree);
let fn_pair = pike_vm::serialize_pike_vm_token_stream(&nfa);
return quote! {
::ere::__construct_regex(#fn_pair)
}
.into();
}
pub fn __compile_regex_engine_pike_vm_u8(stream: TokenStream) -> TokenStream {
let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
let tree = simplified_tree::SimplifiedTreeNode::from(ere);
let nfa = working_nfa::WorkingNFA::new(&tree);
let nfa = working_u8_nfa::U8NFA::new(&nfa);
let fn_pair = pike_vm_u8::serialize_pike_vm_token_stream(&nfa);
return quote! {
::ere::__construct_regex(#fn_pair)
}
.into();
}
pub fn __compile_regex_engine_one_pass_u8(stream: TokenStream) -> TokenStream {
let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
let tree = simplified_tree::SimplifiedTreeNode::from(ere);
let nfa = working_nfa::WorkingNFA::new(&tree);
let nfa = working_u8_nfa::U8NFA::new(&nfa);
let Some(fn_pair) = one_pass_u8::serialize_one_pass_token_stream(&nfa) else {
return syn::parse::Error::new(
proc_macro2::Span::call_site(),
"Regex was not one-pass and could not be optimized to become one pass.
Try using a different engine.",
)
.to_compile_error()
.into();
};
return quote! {
::ere::__construct_regex(#fn_pair)
}
.into();
}
pub fn __compile_regex_engine_fixed_offset(stream: TokenStream) -> TokenStream {
let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
let (base_engine, _, _, u8_nfa, _) = pick_base_engine(ere);
let Some(offsets) = fixed_offset::get_fixed_offsets(&u8_nfa) else {
return syn::parse::Error::new(
proc_macro2::Span::call_site(),
"Regex capture groups were not fixed offset. Try using a different engine.",
)
.to_compile_error()
.into();
};
let fn_pair = fixed_offset::serialize_fixed_offset_token_stream(
base_engine,
offsets,
u8_nfa.num_capture_groups(),
);
return quote! {
::ere::__construct_regex(#fn_pair)
}
.into();
}
/// Implements the attribute form: `attr` is a string literal holding the regular
/// expression and `input` is the annotated item.
///
/// The annotated item must be a tuple struct with exactly one field per capture group
/// of the regular expression. The output is the original item followed by an
/// `impl<'a>` block for it containing:
/// - an `ENGINE` associated const holding the `(test, exec)` pair from `pick_engine`;
/// - `test(text) -> bool`;
/// - `exec(text) -> Option<Self>`, which fills each field from the matching capture
///   group. Groups reported as non-optional by the NFA are unwrapped with `expect`.
///
/// Any of the following expands to a compile error instead: `attr` is not a string
/// literal, the regular expression fails to parse, `input` is not a struct, the struct
/// is not a tuple struct, or the field count differs from the capture-group count.
#[cfg(feature = "unstable-attr-regex")]
pub fn __compile_regex_attr(attr: TokenStream, input: TokenStream) -> TokenStream {
    let ere_litstr: syn::LitStr = syn::parse_macro_input!(attr);
    let ere_str = ere_litstr.value();
    let ere = match parse_tree::ERE::parse_str_syn(&ere_str, ere_litstr.span()) {
        Ok(ere) => ere,
        Err(compile_err) => return compile_err.into_compile_error().into(),
    };

    // `ere` is cloned because `pick_engine` below consumes it, while the NFA built
    // here is needed to inspect the capture groups.
    let tree = simplified_tree::SimplifiedTreeNode::from(ere.clone());
    let nfa = working_nfa::WorkingNFA::new(&tree);
    let capture_groups = nfa.num_capture_groups();
    let optional_captures: Vec<bool> = (0..capture_groups)
        .map(|group_num| nfa.capture_group_is_optional(group_num))
        .collect();

    // Parse a copy so the untouched `input` can be re-emitted verbatim at the end.
    let input_copy = input.clone();
    let regex_struct: syn::DeriveInput = syn::parse_macro_input!(input_copy);
    let syn::Data::Struct(data_struct) = regex_struct.data else {
        return syn::parse::Error::new_spanned(
            regex_struct,
            "Attribute regexes currently only support structs.",
        )
        .to_compile_error()
        .into();
    };
    let syn::Fields::Unnamed(fields) = data_struct.fields else {
        return syn::parse::Error::new_spanned(
            data_struct.fields,
            "Attribute regexes currently require unnamed structs (tuple syntax).",
        )
        .to_compile_error()
        .into();
    };
    if fields.unnamed.len() != optional_captures.len() {
        return syn::parse::Error::new_spanned(
            fields.unnamed,
            format!(
                "Expected struct to have {} unnamed fields, based on number of captures in regular expression.",
                optional_captures.len()
            ),
        )
        .to_compile_error()
        .into();
    }

    let mut out: proc_macro2::TokenStream = input.into();

    // One constructor argument per capture group: optional groups are passed through
    // as `Option`s, non-optional groups are unwrapped in the generated code.
    let struct_args: proc_macro2::TokenStream = optional_captures
        .iter()
        .enumerate()
        .map(|(group_num, opt)| if *opt {
            quote! { result[#group_num], }
        } else {
            quote! {
                result[#group_num]
                    .expect(
                        "If you are seeing this, there is probably an internal bug in the `ere-core` crate where a capture group was mistakenly marked as non-optional. Please report the bug."
                    ),
            }
        })
        .collect();

    let (fn_pair, description) = pick_engine(ere);
    let struct_name = regex_struct.ident;
    let ere_display_doc = format!("`{ere_str}`");
    let struct_name_link_doc = format!("[`{struct_name}`]");
    let implementation = quote! {
        impl<'a> #struct_name<'a> {
            const ENGINE: (
                fn(&str) -> bool,
                fn(&'a str) -> ::core::option::Option<[::core::option::Option<&'a str>; #capture_groups]>,
            ) = #fn_pair;
            #[doc = #ere_display_doc]
            #[doc = #description]
            #[inline]
            pub fn test(text: &str) -> bool {
                return (Self::ENGINE.0)(text);
            }
            #[doc = #struct_name_link_doc]
            #[doc = #ere_display_doc]
            #[doc = #description]
            pub fn exec(text: &'a str) -> ::core::option::Option<#struct_name<'a>> {
                let result: [::core::option::Option<&'a str>; #capture_groups] = (Self::ENGINE.1)(text)?;
                return ::core::option::Option::<#struct_name<'a>>::Some(#struct_name(
                    #struct_args
                ));
            }
        }
    };
    out.extend(implementation);
    return out.into();
}