ere_core/
fixed_offset.rs

1//! This is a highly-efficient implementation
2//! for regexes where capture groups always have the same offset and the same length (in bytes).
3//! This also means the text has a fixed length.
4//!
5//! This means that no variable quantifiers are allowed,
6//! and alternations must have the same length in each case.
7//! Additionally, capture groups cannot occur within alternations.
8//!
9//! The strategy is essentially to only ever run `test` then to index fixed offsets
10//!
11//! ## Examples
12//! - `^#([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})$` for hex colors
13//! - `^\+1 ([0-9]{3})-([0-9]{3})-([0-9]{4})$` for US phone numbers
14
15use quote::quote;
16
17use crate::working_u8_nfa::U8NFA;
18
19/// If the offsets are fixed, returns them for each capture group.
20/// Value is [`usize::MAX`] if a capture group never matches (usually shouldn't happen)
21pub(crate) fn get_fixed_offsets(nfa: &U8NFA) -> Option<Vec<(usize, usize)>> {
22    let Some(_) = nfa.topological_ordering() else {
23        // there are loops
24        return None;
25    };
26
27    let num_capture_groups = nfa.num_capture_groups();
28    let mut final_capture_groups = None;
29    let mut stack = vec![(
30        0usize,
31        0usize,
32        vec![(usize::MAX, usize::MAX); num_capture_groups],
33    )];
34
35    while let Some((state_idx, offset, capture_offsets)) = stack.pop() {
36        let state = &nfa.states[state_idx];
37
38        // If at accept state
39        if state_idx + 1 == nfa.states.len() {
40            match &final_capture_groups {
41                None => {
42                    final_capture_groups = Some(capture_offsets.clone());
43                }
44                Some(final_capture_groups) if *final_capture_groups != capture_offsets => {
45                    return None;
46                }
47                _ => {}
48            }
49        }
50
51        // Handle transitions
52        for tr in &state.transitions {
53            stack.push((tr.to, offset + 1, capture_offsets.clone()));
54        }
55        for ep in &state.epsilons {
56            let mut capture_offsets = capture_offsets.clone();
57            match ep.special {
58                crate::working_nfa::EpsilonType::None => {}
59                crate::working_nfa::EpsilonType::StartAnchor => {}
60                crate::working_nfa::EpsilonType::EndAnchor => {}
61                crate::working_nfa::EpsilonType::StartCapture(group) => {
62                    capture_offsets[group].0 = offset
63                }
64                crate::working_nfa::EpsilonType::EndCapture(group) => {
65                    capture_offsets[group].1 = offset
66                }
67            }
68            stack.push((ep.to, offset, capture_offsets));
69        }
70    }
71    return final_capture_groups;
72}
73
74/// Uses the `test` function from an inner engine for the exec
75/// and extracts capture groups with fixed offsets
76pub(crate) fn serialize_fixed_offset_token_stream(
77    inner_engine: proc_macro2::TokenStream,
78    offsets: Vec<(usize, usize)>,
79    capture_groups: usize,
80) -> proc_macro2::TokenStream {
81    let extract_offsets: proc_macro2::TokenStream = offsets
82        .iter()
83        .map(|(start, end)| {
84            if *start == usize::MAX || *end == usize::MAX {
85                return quote! { ::core::option::Option::None, };
86            }
87            return quote! { ::core::option::Option::Some(&text[#start..#end]), };
88        })
89        .collect();
90
91    return quote! {{
92        const TEST_FN: fn(&str) -> bool = #inner_engine.0;
93        #[inline]
94        fn exec<'a>(text: &'a str) -> ::core::option::Option<[::core::option::Option<&'a str>; #capture_groups]> {
95            if !(TEST_FN)(text) {
96                return ::core::option::Option::None;
97            }
98            return ::core::option::Option::Some(
99                [
100                    #extract_offsets
101                ]
102            );
103        }
104        (TEST_FN, exec)
105    }};
106}