ere_core/
lib.rs

//! This crate provides the core functionality for the `ere` crate.
2
3use proc_macro::TokenStream;
4use quote::quote;
5extern crate proc_macro;
6
7pub mod config;
8pub mod nfa_static;
9pub mod one_pass_u8;
10pub mod parse_tree;
11pub mod pike_vm;
12pub mod pike_vm_u8;
13pub mod simplified_tree;
14pub mod visualization;
15pub mod working_nfa;
16pub mod working_u8_nfa;
17
/// The concrete matching engine stored inside a [`Regex`].
///
/// Each variant wraps exactly one engine type; [`Regex`] forwards every call
/// to whichever engine it was constructed with.
enum RegexEngines<const N: usize> {
    /// Engine from [`nfa_static`]. Note that [`Regex::exec`] is not implemented for this variant.
    NFA(nfa_static::NFAStatic<N>),
    /// Engine from [`pike_vm`].
    PikeVM(pike_vm::PikeVM<N>),
    /// Engine from [`pike_vm_u8`].
    U8PikeVM(pike_vm_u8::U8PikeVM<N>),
    /// Engine from [`one_pass_u8`].
    U8OnePass(one_pass_u8::U8OnePass<N>),
}
24
25/// A regular expression (specifically, a [POSIX ERE](https://en.wikibooks.org/wiki/Regular_Expressions/POSIX-Extended_Regular_Expressions)).
26///
27/// Internally, this may contain one of several engines depending on the expression.
28///
29/// The const generic `N` represents the number of capture groups (including capture group 0 which is the entire expression).
30/// It defaults to `1` (for just capture group 0), but you will need to specify it in the type for expressions with more capture groups.
31pub struct Regex<const N: usize = 1>(RegexEngines<N>);
32impl<const N: usize> Regex<N> {
33    /// Returns whether or not the text is matched by the regular expression.
34    pub fn test(&self, text: &str) -> bool {
35        return match &self.0 {
36            RegexEngines::NFA(nfa) => nfa.test(text),
37            RegexEngines::PikeVM(pike_vm) => pike_vm.test(text),
38            RegexEngines::U8PikeVM(pike_vm) => pike_vm.test(text),
39            RegexEngines::U8OnePass(one_pass) => one_pass.test(text),
40        };
41    }
42
43    pub fn exec<'a>(&self, text: &'a str) -> Option<[Option<&'a str>; N]> {
44        return match &self.0 {
45            RegexEngines::NFA(nfa) => unimplemented!(),
46            RegexEngines::PikeVM(pike_vm) => pike_vm.exec(text),
47            RegexEngines::U8PikeVM(pike_vm) => pike_vm.exec(text),
48            RegexEngines::U8OnePass(one_pass) => one_pass.exec(text),
49        };
50    }
51}
52impl<const N: usize> std::fmt::Display for Regex<N> {
53    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
54        return match &self.0 {
55            RegexEngines::NFA(nfastatic) => nfastatic.fmt(f),
56            RegexEngines::PikeVM(_) => f.write_str("<Compiled VM>"),
57            RegexEngines::U8PikeVM(_) => f.write_str("<Compiled VM>"),
58            RegexEngines::U8OnePass(_) => f.write_str("<Compiled VM>"),
59        };
60    }
61}
62
63pub const fn __construct_pikevm_regex<const N: usize>(vm: pike_vm::PikeVM<N>) -> Regex<N> {
64    return Regex(RegexEngines::PikeVM(vm));
65}
66pub const fn __construct_u8pikevm_regex<const N: usize>(vm: pike_vm_u8::U8PikeVM<N>) -> Regex<N> {
67    return Regex(RegexEngines::U8PikeVM(vm));
68}
69pub const fn __construct_nfa_regex<const N: usize>(nfa: nfa_static::NFAStatic<N>) -> Regex<N> {
70    return Regex(RegexEngines::NFA(nfa));
71}
72pub const fn __construct_u8onepass_regex<const N: usize>(
73    nfa: one_pass_u8::U8OnePass<N>,
74) -> Regex<N> {
75    return Regex(RegexEngines::U8OnePass(nfa));
76}
77
78/// Tries to pick the best engine.
79pub fn __compile_regex(stream: TokenStream) -> TokenStream {
80    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
81    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
82    let nfa = working_nfa::WorkingNFA::new(&tree);
83
84    // Currently use a conservative check: only use u8 engines when it will only match ascii strings
85    fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
86        return state
87            .transitions
88            .iter()
89            .flat_map(|t| t.symbol.to_ranges())
90            .all(|range| range.end().is_ascii());
91    }
92    let is_ascii = nfa.states.iter().all(is_state_ascii);
93
94    let u8_nfa = working_u8_nfa::U8NFA::new(&nfa);
95
96    if let Some(engine) = one_pass_u8::serialize_one_pass_token_stream(&u8_nfa) {
97        return quote! { ::ere_core::__construct_u8onepass_regex(#engine) }.into();
98    }
99
100    if is_ascii {
101        let engine = pike_vm_u8::serialize_pike_vm_token_stream(&u8_nfa);
102        return quote! { ::ere_core::__construct_u8pikevm_regex(#engine) }.into();
103    } else if true {
104        let engine = pike_vm::serialize_pike_vm_token_stream(&nfa);
105        return quote! { ::ere_core::__construct_pikevm_regex(#engine) }.into();
106    } else {
107        let engine = nfa_static::serialize_nfa_as_token_stream(&nfa);
108        return quote! { ::ere_core::__construct_nfa_regex(#engine) }.into();
109    };
110}
111
112/// Always uses the [`pike_vm::PikeVM`] engine and returns an tokenized instance of it
113/// instead of [`Regex`]
114pub fn __compile_regex_engine_pike_vm(stream: TokenStream) -> TokenStream {
115    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
116    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
117    let nfa = working_nfa::WorkingNFA::new(&tree);
118    return pike_vm::serialize_pike_vm_token_stream(&nfa).into();
119}
120
121/// Always uses the [`pike_vm_u8::U8PikeVM`] engine and returns an tokenized instance of it
122/// instead of [`Regex`]
123pub fn __compile_regex_engine_pike_vm_u8(stream: TokenStream) -> TokenStream {
124    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
125    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
126    let nfa = working_nfa::WorkingNFA::new(&tree);
127    let nfa = working_u8_nfa::U8NFA::new(&nfa);
128    return pike_vm_u8::serialize_pike_vm_token_stream(&nfa).into();
129}
130
131/// Always uses the [`one_pass_u8::U8OnePass`] engine and returns an tokenized instance of it
132/// instead of [`Regex`].
133///
134/// Will return a compiler error if regex was not one-pass and could not be optimized to become one-pass.
135pub fn __compile_regex_engine_one_pass_u8(stream: TokenStream) -> TokenStream {
136    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
137    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
138    let nfa = working_nfa::WorkingNFA::new(&tree);
139    let nfa = working_u8_nfa::U8NFA::new(&nfa);
140    return one_pass_u8::serialize_one_pass_token_stream(&nfa)
141        .unwrap_or(
142            syn::parse::Error::new(
143                proc_macro2::Span::call_site(),
144                "Regex was not one-pass and could not be optimized to become one pass. 
145Try using a different engine.",
146            )
147            .to_compile_error(),
148        )
149        .into();
150}
151
/// Implements the attribute form of the regex macro.
///
/// `attr` holds the regular expression and `input` the annotated item. The item must be
/// a tuple struct with exactly one lifetime parameter and exactly one unnamed field per
/// capture group (including group 0); otherwise a compile error is returned.
///
/// On success the output is the original item followed by an `impl` block providing:
/// - an associated `ENGINE` constant (the `u8` Pike VM when every transition symbol range
///   ends in an ASCII character, the regular Pike VM otherwise),
/// - `test(text: &str) -> bool`,
/// - `exec(text: &'a str) -> Option<Self>`, which fills the struct's fields from the
///   capture groups. Optional groups are passed through as `Option<&str>`, while
///   non-optional groups are unwrapped with `expect`.
#[cfg(feature = "unstable-attr-regex")]
pub fn __compile_regex_attr(attr: TokenStream, input: TokenStream) -> TokenStream {
    let ere: parse_tree::ERE = syn::parse_macro_input!(attr);
    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
    let nfa = working_nfa::WorkingNFA::new(&tree);

    let capture_groups = nfa.num_capture_groups();
    let optional_captures: Vec<bool> = (0..capture_groups)
        .map(|group_num| nfa.capture_group_is_optional(group_num))
        .collect();

    // Parse a copy: the untouched `input` is re-emitted verbatim in the output below.
    let input_copy = input.clone();
    let regex_struct: syn::DeriveInput = syn::parse_macro_input!(input_copy);
    let syn::Data::Struct(data_struct) = regex_struct.data else {
        return syn::parse::Error::new_spanned(
            regex_struct,
            "Attribute regexes currently only support structs.",
        )
        .to_compile_error()
        .into();
    };
    let syn::Fields::Unnamed(fields) = data_struct.fields else {
        return syn::parse::Error::new_spanned(
            data_struct.fields,
            "Attribute regexes currently require unnamed structs (tuple syntax).",
        )
        .to_compile_error()
        .into();
    };
    if fields.unnamed.len() != optional_captures.len() {
        return syn::parse::Error::new_spanned(
            fields.unnamed,
            format!(
                "Expected struct to have {} unnamed fields, based on number of captures in regular expression.",
                optional_captures.len()
            ),
        )
        .to_compile_error()
        .into();
    }
    // The generated code is `impl<'a> Name<'a> { .. }`, which can only compile when the
    // struct declares exactly one lifetime parameter. Report that up front instead of
    // letting the user hit an opaque error inside macro-generated code.
    if regex_struct.generics.lifetimes().count() != 1 {
        return syn::parse::Error::new_spanned(
            &regex_struct.ident,
            "Attribute regexes currently require the struct to declare exactly one lifetime parameter (e.g. `struct Name<'a>(&'a str);`).",
        )
        .to_compile_error()
        .into();
    }
    // TODO: validate the field types as well (`&str` for non-optional capture groups,
    // `Option<&str>` for optional ones).

    let mut out: proc_macro2::TokenStream = input.into();

    // Currently use a conservative check: only use u8 engines when it will only match ascii strings
    fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
        state
            .transitions
            .iter()
            .flat_map(|t| t.symbol.to_ranges())
            .all(|range| range.end().is_ascii())
    }
    let is_ascii = nfa.states.iter().all(is_state_ascii);

    // One constructor argument per capture group, in order.
    let struct_args: proc_macro2::TokenStream = optional_captures
        .iter()
        .enumerate()
        .map(|(group_num, opt)| if *opt {
            quote! { result[#group_num], }
        } else {
            quote! {
                result[#group_num]
                .expect(
                    "If you are seeing this, there is probably an internal bug in the `ere-core` crate where a capture group was mistakenly marked as non-optional. Please report the bug."
                ),
            }
        })
        .collect();

    // TODO: is it possible to avoid all this wrapping?
    let struct_name = regex_struct.ident;

    // Only the engine constant differs between the two engines; the rest of the
    // generated `impl` is shared.
    let engine_const = if is_ascii {
        let u8_nfa = working_u8_nfa::U8NFA::new(&nfa);
        let engine = pike_vm_u8::serialize_pike_vm_token_stream(&u8_nfa);
        quote! {
            const ENGINE: ::ere_core::pike_vm_u8::U8PikeVM::<#capture_groups> = #engine;
        }
    } else {
        let engine = pike_vm::serialize_pike_vm_token_stream(&nfa);
        quote! {
            const ENGINE: ::ere_core::pike_vm::PikeVM::<#capture_groups> = #engine;
        }
    };

    out.extend(quote! {
        impl<'a> #struct_name<'a> {
            #engine_const
            pub fn test(text: &str) -> bool {
                return Self::ENGINE.test(text);
            }
            pub fn exec(text: &'a str) -> ::core::option::Option<#struct_name<'a>> {
                let result: [::core::option::Option<&'a str>; #capture_groups] = Self::ENGINE.exec(text)?;
                return ::core::option::Option::<#struct_name<'a>>::Some(#struct_name(
                    #struct_args
                ));
            }
        }
    });

    out.into()
}