ere_core/
lib.rs

1//! This crate provides the core functionality to the `ere` crate.
2
3use proc_macro::TokenStream;
4use quote::quote;
5extern crate proc_macro;
6
7pub mod config;
8pub mod fixed_offset;
9pub mod nfa_static;
10pub mod one_pass_u8;
11pub mod parse_tree;
12pub mod pike_vm;
13pub mod pike_vm_u8;
14pub mod simplified_tree;
15pub mod visualization;
16pub mod working_nfa;
17pub mod working_u8_nfa;
18
/// A regular expression (specifically, a [POSIX ERE](https://en.wikibooks.org/wiki/Regular_Expressions/POSIX-Extended_Regular_Expressions)).
///
/// Internally, this may contain one of several engines depending on the expression.
///
/// The const generic `N` represents the number of capture groups (including capture group 0 which is the entire expression).
/// It defaults to `1` (for just capture group 0), but you will need to specify it in the type for expressions with more capture groups.
///
/// A `Regex` only stores two function pointers, so it is `Copy` and can be passed around by value.
#[derive(Clone, Copy)]
pub struct Regex<const N: usize = 1> {
    /// Engine entry point backing [`Regex::test`].
    test_fn: fn(&str) -> bool,
    /// Engine entry point backing [`Regex::exec`]; the array has one slot per capture group.
    exec_fn: for<'a> fn(&'a str) -> Option<[Option<&'a str>; N]>,
}
29impl<const N: usize> Regex<N> {
30    /// Returns whether or not the text is matched by the regular expression.
31    #[inline]
32    pub fn test(&self, text: &str) -> bool {
33        return (self.test_fn)(text);
34    }
35    #[inline]
36    pub fn exec<'a>(&self, text: &'a str) -> Option<[Option<&'a str>; N]> {
37        return (self.exec_fn)(text);
38    }
39}
40
41/// Intended to be used in macros only.
42#[inline]
43pub const fn __construct_regex<const N: usize>(
44    fn_pair: (
45        fn(&str) -> bool,
46        for<'a> fn(&'a str) -> Option<[Option<&'a str>; N]>,
47    ),
48) -> Regex<N> {
49    return Regex {
50        test_fn: fn_pair.0,
51        exec_fn: fn_pair.1,
52    };
53}
54
55/// Tries to pick the best engine that doesn't rely on sub-engines.
56///
57/// Returns a stream that evaluates to a pair `(test_fn, exec_fn)`
58fn pick_base_engine(
59    ere: parse_tree::ERE,
60) -> (
61    proc_macro2::TokenStream,
62    simplified_tree::SimplifiedTreeNode,
63    working_nfa::WorkingNFA,
64    working_u8_nfa::U8NFA,
65    &'static str,
66) {
67    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
68    let nfa = working_nfa::WorkingNFA::new(&tree);
69
70    // Currently use a conservative check: only use u8 engines when it will only match ascii strings
71    fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
72        return state
73            .transitions
74            .iter()
75            .flat_map(|t| t.symbol.to_ranges())
76            .all(|range| range.end().is_ascii());
77    }
78    let is_ascii = nfa.states.iter().all(is_state_ascii);
79
80    let u8_nfa = working_u8_nfa::U8NFA::new(&nfa);
81
82    const ONE_PASS_U8_DESC: &str = "This regular expression is [one-pass](https://swtch.com/~rsc/regexp/regexp3.html#:~:text=Let%27s%20define%20a%20%E2%80%9Cone%2Dpass%20regular%20expression%E2%80%9D).
83This allows us to use an efficient [`::ere::one_pass_u8`] implementation.";
84    const PIKE_VM_U8_DESC: &str =
85        "Uses a general-case [`::ere::pike_vm_u8`] implementatation over `u8`s.";
86    const PIKE_VM_DESC: &str =
87        "Uses a general-case [`::ere::pike_vm`] implementatation over `char`s.";
88
89    let (base_engine, description) =
90        if let Some(engine) = one_pass_u8::serialize_one_pass_token_stream(&u8_nfa) {
91            (engine, ONE_PASS_U8_DESC)
92        } else if is_ascii {
93            (
94                pike_vm_u8::serialize_pike_vm_token_stream(&u8_nfa),
95                PIKE_VM_U8_DESC,
96            )
97        } else {
98            (pike_vm::serialize_pike_vm_token_stream(&nfa), PIKE_VM_DESC)
99        };
100    return (base_engine, tree, nfa, u8_nfa, description);
101}
102
103/// Tries to pick the best engine that doesn't rely on sub-engines.
104///
105/// Returns a stream that evaluates to a pair `(test_fn, exec_fn)`
106fn pick_engine(ere: parse_tree::ERE) -> (proc_macro2::TokenStream, String) {
107    let (base_engine, _, _, u8_nfa, base_description) = pick_base_engine(ere);
108
109    // Consider nested engines
110    if let Some(offsets) = fixed_offset::get_fixed_offsets(&u8_nfa) {
111        let engine = fixed_offset::serialize_fixed_offset_token_stream(
112            base_engine,
113            offsets,
114            u8_nfa.num_capture_groups(),
115        );
116        return (
117            engine,
118            format!(
119                "This regular expression's capture groups are always at fixed offsets.
120Because of this, we can skip a complex `exec` implementation, and instead simply run `test` then index into the string.
121
122### Details on the `test` implementation:
123
124{base_description}"
125            ),
126        );
127    };
128    return (base_engine, base_description.to_string());
129}
130
131/// Tries to pick the best engine.
132pub fn __compile_regex(stream: TokenStream) -> TokenStream {
133    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
134    let (fn_pair, _) = pick_engine(ere);
135    return quote! {
136        {
137            ::ere::__construct_regex(#fn_pair)
138        }
139    }
140    .into();
141}
142
143/// Always uses the [`pike_vm`] engine
144pub fn __compile_regex_engine_pike_vm(stream: TokenStream) -> TokenStream {
145    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
146    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
147    let nfa = working_nfa::WorkingNFA::new(&tree);
148    let fn_pair = pike_vm::serialize_pike_vm_token_stream(&nfa);
149    return quote! {
150        ::ere::__construct_regex(#fn_pair)
151    }
152    .into();
153}
154
155/// Always uses the [`pike_vm_u8`] engine
156pub fn __compile_regex_engine_pike_vm_u8(stream: TokenStream) -> TokenStream {
157    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
158    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
159    let nfa = working_nfa::WorkingNFA::new(&tree);
160    let nfa = working_u8_nfa::U8NFA::new(&nfa);
161    let fn_pair = pike_vm_u8::serialize_pike_vm_token_stream(&nfa);
162    return quote! {
163        ::ere::__construct_regex(#fn_pair)
164    }
165    .into();
166}
167
168/// Always uses the [`one_pass_u8`]
169///
170/// Will return a compiler error if regex was not one-pass and could not be optimized to become one-pass.
171pub fn __compile_regex_engine_one_pass_u8(stream: TokenStream) -> TokenStream {
172    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
173    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
174    let nfa = working_nfa::WorkingNFA::new(&tree);
175    let nfa = working_u8_nfa::U8NFA::new(&nfa);
176    let Some(fn_pair) = one_pass_u8::serialize_one_pass_token_stream(&nfa) else {
177        return syn::parse::Error::new(
178            proc_macro2::Span::call_site(),
179            "Regex was not one-pass and could not be optimized to become one pass. 
180Try using a different engine.",
181        )
182        .to_compile_error()
183        .into();
184    };
185    return quote! {
186        ::ere::__construct_regex(#fn_pair)
187    }
188    .into();
189}
190
191/// Always uses the [`fixed_offset`]
192///
193/// Will return a compiler error if regex was not fixed offset.
194pub fn __compile_regex_engine_fixed_offset(stream: TokenStream) -> TokenStream {
195    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
196    let (base_engine, _, _, u8_nfa, _) = pick_base_engine(ere);
197
198    let Some(offsets) = fixed_offset::get_fixed_offsets(&u8_nfa) else {
199        return syn::parse::Error::new(
200            proc_macro2::Span::call_site(),
201            "Regex capture groups were not fixed offset. Try using a different engine.",
202        )
203        .to_compile_error()
204        .into();
205    };
206    let fn_pair = fixed_offset::serialize_fixed_offset_token_stream(
207        base_engine,
208        offsets,
209        u8_nfa.num_capture_groups(),
210    );
211    return quote! {
212        ::ere::__construct_regex(#fn_pair)
213    }
214    .into();
215}
216
/// Expands an attribute-style regex attached to a tuple struct.
///
/// `attr` must be a string literal containing the regular expression and
/// `input` must be a struct with unnamed fields, one per capture group
/// (including capture group 0).
///
/// The output is the original `input` followed by an `impl` block that adds:
/// - an `ENGINE` associated constant holding the selected `(test_fn, exec_fn)` pair,
/// - `test(text)`, which forwards to `ENGINE.0`,
/// - `exec(text)`, which forwards to `ENGINE.1` and builds the struct from the
///   capture groups; fields for groups not reported as optional by the NFA are
///   unwrapped with `expect`.
///
/// A compiler error is emitted instead if the expression fails to parse, the
/// input is not a struct with unnamed fields, or the number of fields differs
/// from the number of capture groups.
#[cfg(feature = "unstable-attr-regex")]
pub fn __compile_regex_attr(attr: TokenStream, input: TokenStream) -> TokenStream {
    let ere_litstr: syn::LitStr = syn::parse_macro_input!(attr);
    let ere_str = ere_litstr.value();
    let ere = match parse_tree::ERE::parse_str_syn(&ere_str, ere_litstr.span()) {
        Ok(ere) => ere,
        Err(compile_err) => return compile_err.into_compile_error().into(),
    };

    // The NFA is only needed here to inspect the capture groups; `pick_engine`
    // below consumes `ere` and builds its own representations, hence the clone.
    let tree = simplified_tree::SimplifiedTreeNode::from(ere.clone());
    let nfa = working_nfa::WorkingNFA::new(&tree);

    let capture_groups = nfa.num_capture_groups();
    let optional_captures: Vec<bool> = (0..capture_groups)
        .map(|group_num| nfa.capture_group_is_optional(group_num))
        .collect();

    // `parse_macro_input!` needs an identifier, and `input` itself is re-emitted
    // unchanged in the output, so parse a copy.
    let input_copy = input.clone();
    let regex_struct: syn::DeriveInput = syn::parse_macro_input!(input_copy);
    let syn::Data::Struct(data_struct) = regex_struct.data else {
        return syn::parse::Error::new_spanned(
            regex_struct,
            "Attribute regexes currently only support structs.",
        )
        .to_compile_error()
        .into();
    };
    let syn::Fields::Unnamed(fields) = data_struct.fields else {
        return syn::parse::Error::new_spanned(
            data_struct.fields,
            "Attribute regexes currently require unnamed structs (tuple syntax).",
        )
        .to_compile_error()
        .into();
    };
    if fields.unnamed.len() != optional_captures.len() {
        return syn::parse::Error::new_spanned(
            fields.unnamed,
            format!(
                "Expected struct to have {} unnamed fields, based on number of captures in regular expression.",
                optional_captures.len()
            ),
        )
        .to_compile_error()
        .into();
    }
    // TODO: the field types themselves are not validated here yet.

    let mut out: proc_macro2::TokenStream = input.into();

    // One constructor argument per capture group: optional groups are passed
    // through as-is, the others are unwrapped.
    let struct_args: proc_macro2::TokenStream = optional_captures
        .iter()
        .enumerate()
        .map(|(group_num, opt)| if *opt {
            quote! { result[#group_num], }
        } else {
            quote! {
                result[#group_num]
                .expect(
                    "If you are seeing this, there is probably an internal bug in the `ere-core` crate where a capture group was mistakenly marked as non-optional. Please report the bug."
                ),
            }
        })
        .collect();

    // TODO: is it possible to more naturally extract struct args as optional or not?
    let (fn_pair, description) = pick_engine(ere);
    let struct_name = regex_struct.ident;

    let ere_display_doc = format!("`{ere_str}`");
    let struct_name_link_doc = format!("[`{struct_name}`]");
    let implementation = quote! {
        impl<'a> #struct_name<'a> {
            const ENGINE: (
                fn(&str) -> bool,
                fn(&'a str) -> ::core::option::Option<[::core::option::Option<&'a str>; #capture_groups]>,
            ) = #fn_pair;
            /// Returns `true` if the regular expression
            #[doc = #ere_display_doc]
            /// matches the string.
            /// Otherwise, returns `false`
            ///
            /// ## Implementation
            #[doc = #description]
            #[inline]
            pub fn test(text: &str) -> bool {
                return (Self::ENGINE.0)(text);
            }
            /// Returns an instance of
            #[doc = #struct_name_link_doc]
            /// containing capture groups if
            #[doc = #ere_display_doc]
            /// matches the string.
            /// Otherwise, returns `None`.
            ///
            /// ## Implementation
            #[doc = #description]
            pub fn exec(text: &'a str) -> ::core::option::Option<#struct_name<'a>> {
                let result: [::core::option::Option<&'a str>; #capture_groups] = (Self::ENGINE.1)(text)?;
                return ::core::option::Option::<#struct_name<'a>>::Some(#struct_name(
                    #struct_args
                ));
            }
        }
    };
    out.extend(implementation);

    out.into()
}
341}