ere_core/
lib.rs

1//! This crate provides the core functionality to the `ere` crate.
2
3use proc_macro::TokenStream;
4use quote::quote;
5extern crate proc_macro;
6
7pub mod config;
8mod engines;
9mod epsilon_propogation;
10pub mod parse_tree;
11pub mod simplified_tree;
12pub mod visualization;
13pub mod working_nfa;
14pub mod working_u8_dfa;
15pub mod working_u8_nfa;
16
17pub use engines::*;
18
19/// A regular expression (specifically, a [POSIX ERE](https://en.wikibooks.org/wiki/Regular_Expressions/POSIX-Extended_Regular_Expressions)).
20///
21/// Internally, this may contain one of several engines depending on the expression.
22///
23/// The const generic `N` represents the number of capture groups (including capture group 0 which is the entire expression).
24/// It defaults to `1` (for just capture group 0), but you will need to specify it in the type for expressions with more capture groups.
25pub struct Regex<const N: usize = 1> {
26    test_fn: fn(&str) -> bool,
27    exec_fn: for<'a> fn(&'a str) -> Option<[Option<&'a str>; N]>,
28}
29impl<const N: usize> Regex<N> {
30    /// Returns whether or not the text is matched by the regular expression.
31    #[inline]
32    pub fn test(&self, text: &str) -> bool {
33        return (self.test_fn)(text);
34    }
35    #[inline]
36    pub fn exec<'a>(&self, text: &'a str) -> Option<[Option<&'a str>; N]> {
37        return (self.exec_fn)(text);
38    }
39}
40
41/// Intended to be used in macros only.
42#[inline]
43pub const fn __construct_regex<const N: usize>(
44    fn_pair: (
45        fn(&str) -> bool,
46        for<'a> fn(&'a str) -> Option<[Option<&'a str>; N]>,
47    ),
48) -> Regex<N> {
49    return Regex {
50        test_fn: fn_pair.0,
51        exec_fn: fn_pair.1,
52    };
53}
54
55/// Tries to pick the best engine that doesn't rely on sub-engines.
56///
57/// Returns a stream that evaluates to a pair `(test_fn, exec_fn)`
58fn pick_base_engine(
59    ere: parse_tree::ERE,
60) -> (
61    proc_macro2::TokenStream,
62    simplified_tree::SimplifiedTreeNode,
63    working_nfa::WorkingNFA,
64    working_u8_nfa::U8NFA,
65    &'static str,
66) {
67    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
68    let nfa = working_nfa::WorkingNFA::new(&tree);
69
70    // Currently use a conservative check: only use u8 engines when it will only match ascii strings
71    fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
72        return state
73            .transitions
74            .iter()
75            .flat_map(|t| t.symbol.to_ranges())
76            .all(|range| range.end().is_ascii());
77    }
78    let is_ascii = nfa.states.iter().all(is_state_ascii);
79
80    let u8_nfa = working_u8_nfa::U8NFA::new(&nfa);
81
82    const ONE_PASS_U8_DESC: &str = "This regular expression is [one-pass](https://swtch.com/~rsc/regexp/regexp3.html#:~:text=Let%27s%20define%20a%20%E2%80%9Cone%2Dpass%20regular%20expression%E2%80%9D).
83This allows us to use an efficient [`::ere::one_pass_u8`] implementation.";
84    const FLAT_LOCKSTEP_NFA_U8_DESC: &str =
85        "Uses a general-case [`::ere::flat_lockstep_nfa_u8`] implementatation over `u8`s.";
86    const FLAT_LOCKSTEP_NFA_DESC: &str =
87        "Uses a general-case [`::ere::flat_lockstep_nfa`] implementatation over `char`s.";
88    const DFA_DESC: &str = "Uses a [`::ere::dfa::U8DFA`] implementation over `u8`s.";
89
90    let pick_base_engine_inner = || -> (_, &str) {
91        if let Some(engine) = one_pass_u8::serialize_one_pass_token_stream(&u8_nfa) {
92            return (engine, ONE_PASS_U8_DESC);
93        }
94        let dfa_bound = working_u8_dfa::U8DFA::default_bound(u8_nfa.states.len());
95        if let Some(dfa) = working_u8_dfa::U8DFA::from_nfa(&u8_nfa, dfa_bound) {
96            return (dfa_u8::serialize_u8_dfa_token_stream(&dfa), DFA_DESC);
97        }
98        if is_ascii {
99            return (
100                flat_lockstep_nfa_u8::serialize_flat_lockstep_nfa_u8_token_stream(&u8_nfa),
101                FLAT_LOCKSTEP_NFA_U8_DESC,
102            );
103        }
104        return (
105            flat_lockstep_nfa::serialize_flat_lockstep_nfa_token_stream(&nfa),
106            FLAT_LOCKSTEP_NFA_DESC,
107        );
108    };
109
110    let (base_engine, description) = pick_base_engine_inner();
111    return (base_engine, tree, nfa, u8_nfa, description);
112}
113
114/// Tries to pick the best engine that doesn't rely on sub-engines.
115///
116/// Returns a stream that evaluates to a pair `(test_fn, exec_fn)`
117fn pick_engine(ere: parse_tree::ERE) -> (proc_macro2::TokenStream, String) {
118    let (base_engine, _, _, u8_nfa, base_description) = pick_base_engine(ere);
119
120    // Consider nested engines
121    if let Some(offsets) = fixed_offset::get_fixed_offsets(&u8_nfa) {
122        let engine = fixed_offset::serialize_fixed_offset_token_stream(
123            base_engine,
124            offsets,
125            u8_nfa.num_capture_groups(),
126        );
127        return (
128            engine,
129            format!(
130                "This regular expression's capture groups are always at fixed offsets.
131Because of this, we can skip a complex `exec` implementation, and instead simply run `test` then index into the string.
132
133### Details on the `test` implementation:
134
135{base_description}"
136            ),
137        );
138    };
139    return (base_engine, base_description.to_string());
140}
141
142/// Tries to pick the best engine.
143pub fn __compile_regex(stream: TokenStream) -> TokenStream {
144    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
145    let (fn_pair, _) = pick_engine(ere);
146    return quote! {
147        {
148            ::ere::__construct_regex(#fn_pair)
149        }
150    }
151    .into();
152}
153
154/// Always uses the [`dfa_u8`] engine
155pub fn __compile_regex_engine_dfa_u8(stream: TokenStream) -> TokenStream {
156    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
157    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
158    let nfa = working_nfa::WorkingNFA::new(&tree);
159    let nfa = working_u8_nfa::U8NFA::new(&nfa);
160    let dfa_state_limit = working_u8_dfa::U8DFA::default_bound(nfa.states.len());
161    let dfa = working_u8_dfa::U8DFA::from_nfa(&nfa, dfa_state_limit);
162    let Some(dfa) = dfa else {
163        return syn::Error::new(
164            proc_macro2::Span::call_site(),
165            format!(
166                "Failed to convert NFA into DFA: exceeded DFA state limit of {},
167                compilation time and binary size could be exponential.",
168                dfa_state_limit
169            ),
170        )
171        .into_compile_error()
172        .into();
173    };
174    let fn_pair = dfa_u8::serialize_u8_dfa_token_stream(&dfa);
175    return quote! {
176        ::ere::__construct_regex(#fn_pair)
177    }
178    .into();
179}
180
181/// Always uses the [`flat_lockstep_nfa`] engine
182pub fn __compile_regex_engine_flat_lockstep_nfa(stream: TokenStream) -> TokenStream {
183    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
184    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
185    let nfa = working_nfa::WorkingNFA::new(&tree);
186    let fn_pair = flat_lockstep_nfa::serialize_flat_lockstep_nfa_token_stream(&nfa);
187    return quote! {
188        ::ere::__construct_regex(#fn_pair)
189    }
190    .into();
191}
192
193/// Always uses the [`flat_lockstep_nfa_u8`] engine
194pub fn __compile_regex_engine_flat_lockstep_nfa_u8(stream: TokenStream) -> TokenStream {
195    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
196    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
197    let nfa = working_nfa::WorkingNFA::new(&tree);
198    let nfa = working_u8_nfa::U8NFA::new(&nfa);
199    let fn_pair = flat_lockstep_nfa_u8::serialize_flat_lockstep_nfa_u8_token_stream(&nfa);
200    return quote! {
201        ::ere::__construct_regex(#fn_pair)
202    }
203    .into();
204}
205
206/// Always uses the [`one_pass_u8`]
207///
208/// Will return a compiler error if regex was not one-pass and could not be optimized to become one-pass.
209pub fn __compile_regex_engine_one_pass_u8(stream: TokenStream) -> TokenStream {
210    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
211    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
212    let nfa = working_nfa::WorkingNFA::new(&tree);
213    let nfa = working_u8_nfa::U8NFA::new(&nfa);
214    let Some(fn_pair) = one_pass_u8::serialize_one_pass_token_stream(&nfa) else {
215        return syn::parse::Error::new(
216            proc_macro2::Span::call_site(),
217            "Regex was not one-pass and could not be optimized to become one pass. 
218Try using a different engine.",
219        )
220        .to_compile_error()
221        .into();
222    };
223    return quote! {
224        ::ere::__construct_regex(#fn_pair)
225    }
226    .into();
227}
228
229/// Always uses the [`fixed_offset`]
230///
231/// Will return a compiler error if regex was not fixed offset.
232pub fn __compile_regex_engine_fixed_offset(stream: TokenStream) -> TokenStream {
233    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
234    let (base_engine, _, _, u8_nfa, _) = pick_base_engine(ere);
235
236    let Some(offsets) = fixed_offset::get_fixed_offsets(&u8_nfa) else {
237        return syn::parse::Error::new(
238            proc_macro2::Span::call_site(),
239            "Regex capture groups were not fixed offset. Try using a different engine.",
240        )
241        .to_compile_error()
242        .into();
243    };
244    let fn_pair = fixed_offset::serialize_fixed_offset_token_stream(
245        base_engine,
246        offsets,
247        u8_nfa.num_capture_groups(),
248    );
249    return quote! {
250        ::ere::__construct_regex(#fn_pair)
251    }
252    .into();
253}
254
255#[cfg(feature = "unstable-attr-regex")]
256pub fn __compile_regex_attr(attr: TokenStream, input: TokenStream) -> TokenStream {
257    let ere_litstr: syn::LitStr = syn::parse_macro_input!(attr);
258    let ere_str = ere_litstr.value();
259    let ere = match parse_tree::ERE::parse_str_syn(&ere_str, ere_litstr.span()) {
260        Ok(ere) => ere,
261        Err(compile_err) => return compile_err.into_compile_error().into(),
262    };
263
264    let tree = simplified_tree::SimplifiedTreeNode::from(ere.clone());
265    let nfa = working_nfa::WorkingNFA::new(&tree);
266
267    let capture_groups = nfa.num_capture_groups();
268    let optional_captures: Vec<bool> = (0..capture_groups)
269        .map(|group_num| nfa.capture_group_is_optional(group_num))
270        .collect();
271
272    let input_copy = input.clone();
273    let regex_struct: syn::DeriveInput = syn::parse_macro_input!(input_copy);
274    let syn::Data::Struct(data_struct) = regex_struct.data else {
275        return syn::parse::Error::new_spanned(
276            regex_struct,
277            "Attribute regexes currently only support structs.",
278        )
279        .to_compile_error()
280        .into();
281    };
282    let syn::Fields::Unnamed(fields) = data_struct.fields else {
283        return syn::parse::Error::new_spanned(
284            data_struct.fields,
285            "Attribute regexes currently require unnamed structs (tuple syntax).",
286        )
287        .to_compile_error()
288        .into();
289    };
290    if fields.unnamed.len() != optional_captures.len() {
291        return syn::parse::Error::new_spanned(
292            fields.unnamed,
293            format!(
294                "Expected struct to have {} unnamed fields, based on number of captures in regular expression.",
295                optional_captures.len()
296            ),
297        )
298        .to_compile_error()
299        .into();
300    }
301    // for field in &fields.unnamed {
302    //     if let syn::Type::Reference(ty) = &field.ty {
303    //         if matches!(*ty.elem, syn::parse_quote!(str)) {
304    //             continue;
305    //         }
306    //     }
307    // }
308
309    let mut out: proc_macro2::TokenStream = input.into();
310
311    // Currently use a conservative check: only use u8 engines when it will only match ascii strings
312    fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
313        return state
314            .transitions
315            .iter()
316            .flat_map(|t| t.symbol.to_ranges())
317            .all(|range| range.end().is_ascii());
318    }
319    let is_ascii = nfa.states.iter().all(is_state_ascii);
320
321    let struct_args: proc_macro2::TokenStream = optional_captures
322        .iter()
323        .enumerate()
324        .map(|(group_num, opt)| if *opt {
325            quote! { result[#group_num], }
326        } else {
327            quote! {
328                result[#group_num]
329                .expect(
330                    "If you are seeing this, there is probably an internal bug in the `ere-core` crate where a capture group was mistakenly marked as non-optional. Please report the bug."
331                ),
332            }
333        })
334        .collect();
335
336    // TODO: is it possible to more naturally extract struct args as optional or not?
337    let (fn_pair, description) = pick_engine(ere);
338    let struct_name = regex_struct.ident;
339
340    let ere_display_doc = format!("`{ere_str}`");
341    let struct_name_link_doc = format!("[`{}`]", struct_name.to_string());
342    let implementation = quote! {
343        impl<'a> #struct_name<'a> {
344            const ENGINE: (
345                fn(&str) -> bool,
346                fn(&'a str) -> ::core::option::Option<[::core::option::Option<&'a str>; #capture_groups]>,
347            ) = #fn_pair;
348            /// Returns `true` if the regular expression
349            #[doc = #ere_display_doc]
350            /// matches the string.
351            /// Otherwise, returns `false`
352            ///
353            /// ## Implementation
354            #[doc = #description]
355            #[inline]
356            pub fn test(text: &str) -> bool {
357                return (Self::ENGINE.0)(text);
358            }
359            /// Returns an instance of
360            #[doc = #struct_name_link_doc]
361            /// containing capture groups if
362            #[doc = #ere_display_doc]
363            /// matches the string.
364            /// Otherwise, returns `None`.
365            ///
366            /// ## Implementation
367            #[doc = #description]
368            pub fn exec(text: &'a str) -> ::core::option::Option<#struct_name<'a>> {
369                let result: [::core::option::Option<&'a str>; #capture_groups] = (Self::ENGINE.1)(text)?;
370                return ::core::option::Option::<#struct_name<'a>>::Some(#struct_name(
371                    #struct_args
372                ));
373            }
374        }
375    };
376    out.extend(implementation);
377
378    return out.into();
379}