1use proc_macro::TokenStream;
4use quote::quote;
5extern crate proc_macro;
6
7pub mod config;
8mod engines;
9mod epsilon_propogation;
10pub mod parse_tree;
11pub mod simplified_tree;
12pub mod visualization;
13pub mod working_nfa;
14pub mod working_u8_dfa;
15pub mod working_u8_nfa;
16
17pub use engines::*;
18
19pub struct Regex<const N: usize = 1> {
26 test_fn: fn(&str) -> bool,
27 exec_fn: for<'a> fn(&'a str) -> Option<[Option<&'a str>; N]>,
28}
29impl<const N: usize> Regex<N> {
30 #[inline]
32 pub fn test(&self, text: &str) -> bool {
33 return (self.test_fn)(text);
34 }
35 #[inline]
36 pub fn exec<'a>(&self, text: &'a str) -> Option<[Option<&'a str>; N]> {
37 return (self.exec_fn)(text);
38 }
39}
40
41#[inline]
43pub const fn __construct_regex<const N: usize>(
44 fn_pair: (
45 fn(&str) -> bool,
46 for<'a> fn(&'a str) -> Option<[Option<&'a str>; N]>,
47 ),
48) -> Regex<N> {
49 return Regex {
50 test_fn: fn_pair.0,
51 exec_fn: fn_pair.1,
52 };
53}
54
55fn pick_base_engine(
59 ere: parse_tree::ERE,
60) -> (
61 proc_macro2::TokenStream,
62 simplified_tree::SimplifiedTreeNode,
63 working_nfa::WorkingNFA,
64 working_u8_nfa::U8NFA,
65 &'static str,
66) {
67 let tree = simplified_tree::SimplifiedTreeNode::from(ere);
68 let nfa = working_nfa::WorkingNFA::new(&tree);
69
70 fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
72 return state
73 .transitions
74 .iter()
75 .flat_map(|t| t.symbol.to_ranges())
76 .all(|range| range.end().is_ascii());
77 }
78 let is_ascii = nfa.states.iter().all(is_state_ascii);
79
80 let u8_nfa = working_u8_nfa::U8NFA::new(&nfa);
81
82 const ONE_PASS_U8_DESC: &str = "This regular expression is [one-pass](https://swtch.com/~rsc/regexp/regexp3.html#:~:text=Let%27s%20define%20a%20%E2%80%9Cone%2Dpass%20regular%20expression%E2%80%9D).
83This allows us to use an efficient [`::ere::one_pass_u8`] implementation.";
84 const FLAT_LOCKSTEP_NFA_U8_DESC: &str =
85 "Uses a general-case [`::ere::flat_lockstep_nfa_u8`] implementatation over `u8`s.";
86 const FLAT_LOCKSTEP_NFA_DESC: &str =
87 "Uses a general-case [`::ere::flat_lockstep_nfa`] implementatation over `char`s.";
88 const DFA_DESC: &str = "Uses a [`::ere::dfa::U8DFA`] implementation over `u8`s.";
89
90 let pick_base_engine_inner = || -> (_, &str) {
91 if let Some(engine) = one_pass_u8::serialize_one_pass_token_stream(&u8_nfa) {
92 return (engine, ONE_PASS_U8_DESC);
93 }
94 let dfa_bound = working_u8_dfa::U8DFA::default_bound(u8_nfa.states.len());
95 if let Some(dfa) = working_u8_dfa::U8DFA::from_nfa(&u8_nfa, dfa_bound) {
96 return (dfa_u8::serialize_u8_dfa_token_stream(&dfa), DFA_DESC);
97 }
98 if is_ascii {
99 return (
100 flat_lockstep_nfa_u8::serialize_flat_lockstep_nfa_u8_token_stream(&u8_nfa),
101 FLAT_LOCKSTEP_NFA_U8_DESC,
102 );
103 }
104 return (
105 flat_lockstep_nfa::serialize_flat_lockstep_nfa_token_stream(&nfa),
106 FLAT_LOCKSTEP_NFA_DESC,
107 );
108 };
109
110 let (base_engine, description) = pick_base_engine_inner();
111 return (base_engine, tree, nfa, u8_nfa, description);
112}
113
114fn pick_engine(ere: parse_tree::ERE) -> (proc_macro2::TokenStream, String) {
118 let (base_engine, _, _, u8_nfa, base_description) = pick_base_engine(ere);
119
120 if let Some(offsets) = fixed_offset::get_fixed_offsets(&u8_nfa) {
122 let engine = fixed_offset::serialize_fixed_offset_token_stream(
123 base_engine,
124 offsets,
125 u8_nfa.num_capture_groups(),
126 );
127 return (
128 engine,
129 format!(
130 "This regular expression's capture groups are always at fixed offsets.
131Because of this, we can skip a complex `exec` implementation, and instead simply run `test` then index into the string.
132
133### Details on the `test` implementation:
134
135{base_description}"
136 ),
137 );
138 };
139 return (base_engine, base_description.to_string());
140}
141
142pub fn __compile_regex(stream: TokenStream) -> TokenStream {
144 let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
145 let (fn_pair, _) = pick_engine(ere);
146 return quote! {
147 {
148 ::ere::__construct_regex(#fn_pair)
149 }
150 }
151 .into();
152}
153
154pub fn __compile_regex_engine_dfa_u8(stream: TokenStream) -> TokenStream {
156 let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
157 let tree = simplified_tree::SimplifiedTreeNode::from(ere);
158 let nfa = working_nfa::WorkingNFA::new(&tree);
159 let nfa = working_u8_nfa::U8NFA::new(&nfa);
160 let dfa_state_limit = working_u8_dfa::U8DFA::default_bound(nfa.states.len());
161 let dfa = working_u8_dfa::U8DFA::from_nfa(&nfa, dfa_state_limit);
162 let Some(dfa) = dfa else {
163 return syn::Error::new(
164 proc_macro2::Span::call_site(),
165 format!(
166 "Failed to convert NFA into DFA: exceeded DFA state limit of {},
167 compilation time and binary size could be exponential.",
168 dfa_state_limit
169 ),
170 )
171 .into_compile_error()
172 .into();
173 };
174 let fn_pair = dfa_u8::serialize_u8_dfa_token_stream(&dfa);
175 return quote! {
176 ::ere::__construct_regex(#fn_pair)
177 }
178 .into();
179}
180
181pub fn __compile_regex_engine_flat_lockstep_nfa(stream: TokenStream) -> TokenStream {
183 let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
184 let tree = simplified_tree::SimplifiedTreeNode::from(ere);
185 let nfa = working_nfa::WorkingNFA::new(&tree);
186 let fn_pair = flat_lockstep_nfa::serialize_flat_lockstep_nfa_token_stream(&nfa);
187 return quote! {
188 ::ere::__construct_regex(#fn_pair)
189 }
190 .into();
191}
192
193pub fn __compile_regex_engine_flat_lockstep_nfa_u8(stream: TokenStream) -> TokenStream {
195 let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
196 let tree = simplified_tree::SimplifiedTreeNode::from(ere);
197 let nfa = working_nfa::WorkingNFA::new(&tree);
198 let nfa = working_u8_nfa::U8NFA::new(&nfa);
199 let fn_pair = flat_lockstep_nfa_u8::serialize_flat_lockstep_nfa_u8_token_stream(&nfa);
200 return quote! {
201 ::ere::__construct_regex(#fn_pair)
202 }
203 .into();
204}
205
206pub fn __compile_regex_engine_one_pass_u8(stream: TokenStream) -> TokenStream {
210 let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
211 let tree = simplified_tree::SimplifiedTreeNode::from(ere);
212 let nfa = working_nfa::WorkingNFA::new(&tree);
213 let nfa = working_u8_nfa::U8NFA::new(&nfa);
214 let Some(fn_pair) = one_pass_u8::serialize_one_pass_token_stream(&nfa) else {
215 return syn::parse::Error::new(
216 proc_macro2::Span::call_site(),
217 "Regex was not one-pass and could not be optimized to become one pass.
218Try using a different engine.",
219 )
220 .to_compile_error()
221 .into();
222 };
223 return quote! {
224 ::ere::__construct_regex(#fn_pair)
225 }
226 .into();
227}
228
229pub fn __compile_regex_engine_fixed_offset(stream: TokenStream) -> TokenStream {
233 let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
234 let (base_engine, _, _, u8_nfa, _) = pick_base_engine(ere);
235
236 let Some(offsets) = fixed_offset::get_fixed_offsets(&u8_nfa) else {
237 return syn::parse::Error::new(
238 proc_macro2::Span::call_site(),
239 "Regex capture groups were not fixed offset. Try using a different engine.",
240 )
241 .to_compile_error()
242 .into();
243 };
244 let fn_pair = fixed_offset::serialize_fixed_offset_token_stream(
245 base_engine,
246 offsets,
247 u8_nfa.num_capture_groups(),
248 );
249 return quote! {
250 ::ere::__construct_regex(#fn_pair)
251 }
252 .into();
253}
254
255#[cfg(feature = "unstable-attr-regex")]
256pub fn __compile_regex_attr(attr: TokenStream, input: TokenStream) -> TokenStream {
257 let ere_litstr: syn::LitStr = syn::parse_macro_input!(attr);
258 let ere_str = ere_litstr.value();
259 let ere = match parse_tree::ERE::parse_str_syn(&ere_str, ere_litstr.span()) {
260 Ok(ere) => ere,
261 Err(compile_err) => return compile_err.into_compile_error().into(),
262 };
263
264 let tree = simplified_tree::SimplifiedTreeNode::from(ere.clone());
265 let nfa = working_nfa::WorkingNFA::new(&tree);
266
267 let capture_groups = nfa.num_capture_groups();
268 let optional_captures: Vec<bool> = (0..capture_groups)
269 .map(|group_num| nfa.capture_group_is_optional(group_num))
270 .collect();
271
272 let input_copy = input.clone();
273 let regex_struct: syn::DeriveInput = syn::parse_macro_input!(input_copy);
274 let syn::Data::Struct(data_struct) = regex_struct.data else {
275 return syn::parse::Error::new_spanned(
276 regex_struct,
277 "Attribute regexes currently only support structs.",
278 )
279 .to_compile_error()
280 .into();
281 };
282 let syn::Fields::Unnamed(fields) = data_struct.fields else {
283 return syn::parse::Error::new_spanned(
284 data_struct.fields,
285 "Attribute regexes currently require unnamed structs (tuple syntax).",
286 )
287 .to_compile_error()
288 .into();
289 };
290 if fields.unnamed.len() != optional_captures.len() {
291 return syn::parse::Error::new_spanned(
292 fields.unnamed,
293 format!(
294 "Expected struct to have {} unnamed fields, based on number of captures in regular expression.",
295 optional_captures.len()
296 ),
297 )
298 .to_compile_error()
299 .into();
300 }
301 let mut out: proc_macro2::TokenStream = input.into();
310
311 fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
313 return state
314 .transitions
315 .iter()
316 .flat_map(|t| t.symbol.to_ranges())
317 .all(|range| range.end().is_ascii());
318 }
319 let is_ascii = nfa.states.iter().all(is_state_ascii);
320
321 let struct_args: proc_macro2::TokenStream = optional_captures
322 .iter()
323 .enumerate()
324 .map(|(group_num, opt)| if *opt {
325 quote! { result[#group_num], }
326 } else {
327 quote! {
328 result[#group_num]
329 .expect(
330 "If you are seeing this, there is probably an internal bug in the `ere-core` crate where a capture group was mistakenly marked as non-optional. Please report the bug."
331 ),
332 }
333 })
334 .collect();
335
336 let (fn_pair, description) = pick_engine(ere);
338 let struct_name = regex_struct.ident;
339
340 let ere_display_doc = format!("`{ere_str}`");
341 let struct_name_link_doc = format!("[`{}`]", struct_name.to_string());
342 let implementation = quote! {
343 impl<'a> #struct_name<'a> {
344 const ENGINE: (
345 fn(&str) -> bool,
346 fn(&'a str) -> ::core::option::Option<[::core::option::Option<&'a str>; #capture_groups]>,
347 ) = #fn_pair;
348 #[doc = #ere_display_doc]
350 #[doc = #description]
355 #[inline]
356 pub fn test(text: &str) -> bool {
357 return (Self::ENGINE.0)(text);
358 }
359 #[doc = #struct_name_link_doc]
361 #[doc = #ere_display_doc]
363 #[doc = #description]
368 pub fn exec(text: &'a str) -> ::core::option::Option<#struct_name<'a>> {
369 let result: [::core::option::Option<&'a str>; #capture_groups] = (Self::ENGINE.1)(text)?;
370 return ::core::option::Option::<#struct_name<'a>>::Some(#struct_name(
371 #struct_args
372 ));
373 }
374 }
375 };
376 out.extend(implementation);
377
378 return out.into();
379}