ere_core/
lib.rs

1//! This crate provides the core functionality to the `ere` crate.
2
3use proc_macro::TokenStream;
4use quote::quote;
5extern crate proc_macro;
6
7pub mod config;
8pub mod nfa_static;
9pub mod parse_tree;
10pub mod pike_vm;
11pub mod pike_vm_u8;
12pub mod simplified_tree;
13pub mod visualization;
14pub mod working_nfa;
15pub mod working_u8_nfa;
16
17enum RegexEngines<const N: usize> {
18    NFA(nfa_static::NFAStatic<N>),
19    PikeVM(pike_vm::PikeVM<N>),
20    U8PikeVM(pike_vm_u8::U8PikeVM<N>),
21}
22
23/// A regular expression (specifically, a [POSIX ERE](https://en.wikibooks.org/wiki/Regular_Expressions/POSIX-Extended_Regular_Expressions)).
24///
25/// Internally, this may contain one of several engines depending on the expression.
26///
27/// The const generic `N` represents the number of capture groups (including capture group 0 which is the entire expression).
28/// It defaults to `1` (for just capture group 0), but you will need to specify it in the type for expressions with more capture groups.
29pub struct Regex<const N: usize = 1>(RegexEngines<N>);
30impl<const N: usize> Regex<N> {
31    /// Returns whether or not the text is matched by the regular expression.
32    pub fn test(&self, text: &str) -> bool {
33        return match &self.0 {
34            RegexEngines::NFA(nfa) => nfa.test(text),
35            RegexEngines::PikeVM(pike_vm) => pike_vm.test(text),
36            RegexEngines::U8PikeVM(pike_vm) => pike_vm.test(text),
37        };
38    }
39
40    pub fn exec<'a>(&self, text: &'a str) -> Option<[Option<&'a str>; N]> {
41        return match &self.0 {
42            RegexEngines::NFA(nfa) => unimplemented!(),
43            RegexEngines::PikeVM(pike_vm) => pike_vm.exec(text),
44            RegexEngines::U8PikeVM(pike_vm) => pike_vm.exec(text),
45        };
46    }
47}
48impl<const N: usize> std::fmt::Display for Regex<N> {
49    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
50        return match &self.0 {
51            RegexEngines::NFA(nfastatic) => nfastatic.fmt(f),
52            RegexEngines::PikeVM(_) => f.write_str("<Compiled VM>"),
53            RegexEngines::U8PikeVM(_) => f.write_str("<Compiled VM>"),
54        };
55    }
56}
57
58pub const fn __construct_pikevm_regex<const N: usize>(vm: pike_vm::PikeVM<N>) -> Regex<N> {
59    return Regex(RegexEngines::PikeVM(vm));
60}
61pub const fn __construct_u8pikevm_regex<const N: usize>(vm: pike_vm_u8::U8PikeVM<N>) -> Regex<N> {
62    return Regex(RegexEngines::U8PikeVM(vm));
63}
64pub const fn __construct_nfa_regex<const N: usize>(nfa: nfa_static::NFAStatic<N>) -> Regex<N> {
65    return Regex(RegexEngines::NFA(nfa));
66}
67
68pub fn __compile_regex(stream: TokenStream) -> TokenStream {
69    let ere: parse_tree::ERE = syn::parse_macro_input!(stream);
70    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
71    let nfa = working_nfa::WorkingNFA::new(&tree);
72    // println!("{}", nfa.to_tikz(true));
73
74    // Currently use a conservative check: only use u8 engines when it will only match ascii strings
75    fn is_state_ascii(state: &working_nfa::WorkingState) -> bool {
76        return state
77            .transitions
78            .iter()
79            .flat_map(|t| t.symbol.to_ranges())
80            .all(|range| range.end().is_ascii());
81    }
82    let is_ascii = nfa.states.iter().all(is_state_ascii);
83
84    if is_ascii {
85        let nfa = working_u8_nfa::U8NFA::new(&nfa);
86        let engine = pike_vm_u8::serialize_pike_vm_token_stream(&nfa);
87        return quote! { ::ere_core::__construct_u8pikevm_regex(#engine) }.into();
88    } else if true {
89        let engine = pike_vm::serialize_pike_vm_token_stream(&nfa);
90        return quote! { ::ere_core::__construct_pikevm_regex(#engine) }.into();
91    } else {
92        let engine = nfa_static::serialize_nfa_as_token_stream(&nfa);
93        return quote! { ::ere_core::__construct_nfa_regex(#engine) }.into();
94    };
95}
96
#[cfg(feature = "unstable-attr-regex")]
/// Expands an attribute-style regex applied to a tuple struct.
///
/// `attr` holds the regular expression and `input` the annotated item. The item is emitted
/// unchanged, followed by an `impl` block that provides:
/// - an associated `ENGINE` constant holding the serialized Pike VM,
/// - `test(text: &str) -> bool`, and
/// - `exec(text: &'a str) -> Option<Self>`, which fills one tuple field per capture group.
///
/// A compile error is returned instead when the item is not a struct, is not a tuple
/// struct, or its field count differs from the number of capture groups.
///
/// NOTE(review): the generated `impl<'a> Name<'a>` assumes the struct declares exactly one
/// lifetime parameter; nothing here validates that.
pub fn __compile_regex_attr(attr: TokenStream, input: TokenStream) -> TokenStream {
    let ere: parse_tree::ERE = syn::parse_macro_input!(attr);
    let tree = simplified_tree::SimplifiedTreeNode::from(ere);
    let nfa = working_nfa::WorkingNFA::new(&tree);

    let capture_groups = nfa.num_capture_groups();
    let optional_captures: Vec<bool> = (0..capture_groups)
        .map(|group| nfa.capture_group_is_optional(group))
        .collect();

    // Keep the annotated item verbatim; the generated `impl` is appended to it at the end.
    let mut expanded: proc_macro2::TokenStream = input.clone().into();
    let regex_struct: syn::DeriveInput = syn::parse_macro_input!(input);

    let syn::Data::Struct(data_struct) = regex_struct.data else {
        let error = syn::parse::Error::new_spanned(
            regex_struct,
            "Attribute regexes currently only support structs.",
        );
        return error.to_compile_error().into();
    };
    let syn::Fields::Unnamed(fields) = data_struct.fields else {
        let error = syn::parse::Error::new_spanned(
            data_struct.fields,
            "Attribute regexes currently require unnamed structs (tuple syntax).",
        );
        return error.to_compile_error().into();
    };
    let expected_fields = optional_captures.len();
    if fields.unnamed.len() != expected_fields {
        let message = format!(
            "Expected struct to have {} unnamed fields, based on number of captures in regular expression.",
            expected_fields
        );
        return syn::parse::Error::new_spanned(fields.unnamed, message)
            .to_compile_error()
            .into();
    }
    // TODO: the field types themselves are not validated yet (e.g. that each one is a
    // reference to `str`).

    // Currently use a conservative check: only use u8 engines when it will only match ascii strings
    let is_ascii = nfa.states.iter().all(|state| {
        state
            .transitions
            .iter()
            .flat_map(|t| t.symbol.to_ranges())
            .all(|range| range.end().is_ascii())
    });

    // One constructor argument per capture group: optional groups are passed through as
    // `Option`s, the others are unwrapped inside the generated code.
    let mut struct_args = proc_macro2::TokenStream::new();
    for (group_num, &is_optional) in optional_captures.iter().enumerate() {
        let argument = if is_optional {
            quote! { result[#group_num], }
        } else {
            quote! {
                result[#group_num]
                .expect(
                    "If you are seeing this, there is probably an internal bug in the `ere-core` crate where a capture group was mistakenly marked as non-optional. Please report the bug."
                ),
            }
        };
        struct_args.extend(argument);
    }

    // Only the `ENGINE` constant differs between the two engines; the rest of the
    // generated `impl` is shared.
    let engine_const = if is_ascii {
        let u8_nfa = working_u8_nfa::U8NFA::new(&nfa);
        let engine = pike_vm_u8::serialize_pike_vm_token_stream(&u8_nfa);
        quote! {
            const ENGINE: ::ere_core::pike_vm_u8::U8PikeVM::<#capture_groups> = #engine;
        }
    } else {
        let engine = pike_vm::serialize_pike_vm_token_stream(&nfa);
        quote! {
            const ENGINE: ::ere_core::pike_vm::PikeVM::<#capture_groups> = #engine;
        }
    };

    // TODO: is it possible to avoid all this wrapping?
    let struct_name = regex_struct.ident;
    expanded.extend(quote! {
        impl<'a> #struct_name<'a> {
            #engine_const
            pub fn test(text: &str) -> bool {
                return Self::ENGINE.test(text);
            }
            pub fn exec(text: &'a str) -> ::core::option::Option<#struct_name<'a>> {
                let result: [::core::option::Option<&'a str>; #capture_groups] = Self::ENGINE.exec(text)?;
                return ::core::option::Option::<#struct_name<'a>>::Some(#struct_name(
                    #struct_args
                ));
            }
        }
    });

    expanded.into()
}