syntax_parser_generator/lex/
regex.rs

1use crate::automata::nfa::{Nfa, NfaState};
2use crate::handles::Handle;
3use crate::handles::specials::AutomaticallyHandled;
4
5/// A regular-expression pattern over raw bytes.
6///
7/// In practice, you won't need to create instances of this type directly. Check out the [Regex]
8/// API and the high-level factory methods it offers.
9#[derive(Clone)]
10pub enum Regex {
11    /// Only matches a single hardcoded byte.
12    SingleCharacter {
13        /// The matching byte.
14        value: u8,
15    },
16
17    /// Matches each one of the specified patterns.
18    Union {
19        /// The possible matching patterns.
20        options: Vec<Regex>,
21    },
22
23    /// Matches a concatenation of the specified patterns.
24    Concat {
25        /// The concatenated patterns.
26        parts: Vec<Regex>,
27    },
28
29    /// Matches a concatenation of zero or more repetitions of the specified pattern.
30    Star {
31        /// The repeated pattern.
32        repeated_pattern: Box<Regex>,
33    },
34}
35
36impl Regex {
37    /// Creates a pattern that only matches the specified character.
38    ///
39    /// # Panics
40    ///
41    /// If the character is not ASCII (cannot be represented by a single byte).
42    pub fn single_char(value: char) -> Regex {
43        Regex::SingleCharacter {
44            value: value.try_into().unwrap_or_else(|_| {
45                panic!(
46                    "Cannot create a single-character regex from {:?}, as it's not 1-byte long",
47                    value
48                )
49            }),
50        }
51    }
52
53    /// Creates a pattern that matches each of the specified patterns.
54    pub fn union(options: Vec<Regex>) -> Regex {
55        Regex::Union { options }
56    }
57
58    /// Creates a pattern that matches a concatenation of the specified patterns.
59    pub fn concat(parts: Vec<Regex>) -> Regex {
60        Regex::Concat { parts }
61    }
62
63    /// Creates a pattern that matches zero or more repetitions of the specified pattern.
64    pub fn star_from(repeated_pattern: Regex) -> Regex {
65        Regex::Star {
66            repeated_pattern: Box::new(repeated_pattern),
67        }
68    }
69
70    /// Creates a pattern that matches one or more repetitions of the specified pattern.
71    pub fn plus_from(repeated_pattern: Regex) -> Regex {
72        let star_pattern = Regex::star_from(repeated_pattern.clone());
73        Regex::concat(vec![repeated_pattern, star_pattern])
74    }
75
76    /// Creates a pattern that matches a single white-space character.
77    pub fn white_space() -> Regex {
78        let white_space_characters = vec![' ', '\t', '\n', '\r', '\x0B', '\x0C'];
79        Regex::union(
80            white_space_characters
81                .into_iter()
82                .map(Regex::single_char)
83                .collect(),
84        )
85    }
86
87    /// Creates a pattern that matches a hard-coded sequence of characters.
88    pub fn constant_string(string: &str) -> Regex {
89        Regex::concat(string.chars().map(Regex::single_char).collect())
90    }
91
92    /// Creates a pattern that matches any single character between the specified couple of
93    /// characters (inclusive).
94    pub fn character_range(start: char, end: char) -> Regex {
95        Regex::union((start..=end).map(Regex::single_char).collect())
96    }
97
98    /// Creates a pattern that matches the specified pattern, and an empty sequence of bytes.
99    pub fn optional(option: Regex) -> Regex {
100        Regex::union(vec![option, Regex::epsilon()])
101    }
102
103    /// Creates a pattern that only matches an empty sequence of characters.
104    pub fn epsilon() -> Regex {
105        Regex::concat(vec![])
106    }
107
108    pub(crate) fn build_into_nfa<Label>(
109        &self,
110        nfa: &mut Nfa<u8, Label>,
111    ) -> (Handle<NfaState<u8, Label>>, Handle<NfaState<u8, Label>>)
112    where
113    {
114        match self {
115            Regex::SingleCharacter { value } => {
116                let start = nfa.new_state();
117                let end = nfa.new_state();
118                nfa.link(start, end, Some(value.handle()));
119                (start, end)
120            }
121            Regex::Union { options } => {
122                let start = nfa.new_state();
123                let end = nfa.new_state();
124                for option in options {
125                    let (option_start, option_end) = option.build_into_nfa(nfa);
126                    nfa.link(start, option_start, None);
127                    nfa.link(option_end, end, None);
128                }
129                (start, end)
130            }
131            Regex::Concat { parts } => {
132                let start = nfa.new_state();
133                let end = nfa.new_state();
134                let mut curr = start;
135                for part in parts {
136                    let (part_start, part_end) = part.build_into_nfa(nfa);
137                    nfa.link(curr, part_start, None);
138                    curr = part_end;
139                }
140                nfa.link(curr, end, None);
141                (start, end)
142            }
143            Regex::Star { repeated_pattern } => {
144                let start = nfa.new_state();
145                let end = nfa.new_state();
146                let (repeated_pattern_start, repeated_pattern_end) =
147                    repeated_pattern.build_into_nfa(nfa);
148
149                nfa.link(start, repeated_pattern_start, None);
150                nfa.link(start, end, None);
151                nfa.link(repeated_pattern_end, end, None);
152                nfa.link(repeated_pattern_end, repeated_pattern_start, None);
153
154                (start, end)
155            }
156        }
157    }
158}
159
160#[cfg(test)]
161mod tests {
162    use crate::automata::dfa::Dfa;
163
164    use super::*;
165
166    fn create_dfa_for_regex(pattern: Regex) -> Dfa<u8, ()> {
167        let mut nfa = Nfa::new();
168        let (start, end) = pattern.build_into_nfa(&mut nfa);
169        nfa.label(end, Some(()));
170        nfa.set_initial_state(start);
171
172        let dfa = nfa
173            .compile_to_dfa(|labels| if labels.is_empty() { None } else { Some(()) })
174            .minimize();
175
176        return dfa;
177    }
178
179    fn is_string_in(dfa: &Dfa<u8, ()>, data: &str) -> bool {
180        match dfa.scan(
181            String::from(data)
182                .into_bytes()
183                .into_iter()
184                .map(|x| x.handle()),
185        ) {
186            None => false,
187            Some(end_state) => !dfa.get_label(end_state).is_none(),
188        }
189    }
190    #[test]
191    fn test_single_char() {
192        let pattern = Regex::single_char('a');
193        let dfa = create_dfa_for_regex(pattern);
194
195        assert_eq!(is_string_in(&dfa, "a"), true);
196        assert_eq!(is_string_in(&dfa, ""), false);
197        assert_eq!(is_string_in(&dfa, "aa"), false);
198    }
199
200    #[test]
201    fn test_union() {
202        let pattern = Regex::union(vec![
203            Regex::single_char('a'),
204            Regex::single_char('b'),
205            Regex::single_char('c'),
206        ]);
207        let dfa = create_dfa_for_regex(pattern);
208
209        assert_eq!(is_string_in(&dfa, "a"), true);
210        assert_eq!(is_string_in(&dfa, "b"), true);
211        assert_eq!(is_string_in(&dfa, "c"), true);
212        assert_eq!(is_string_in(&dfa, ""), false);
213        assert_eq!(is_string_in(&dfa, "aa"), false);
214        assert_eq!(is_string_in(&dfa, "d"), false);
215    }
216
217    #[test]
218    fn test_concat() {
219        let pattern = Regex::concat(vec![
220            Regex::single_char('a'),
221            Regex::single_char('b'),
222            Regex::single_char('c'),
223        ]);
224        let dfa = create_dfa_for_regex(pattern);
225
226        assert_eq!(is_string_in(&dfa, "abc"), true);
227        assert_eq!(is_string_in(&dfa, ""), false);
228        assert_eq!(is_string_in(&dfa, "a"), false);
229        assert_eq!(is_string_in(&dfa, "bc"), false);
230    }
231
232    //noinspection ALL
233    #[test]
234    fn test_star() {
235        let pattern = Regex::star_from(Regex::single_char('a'));
236        let dfa = create_dfa_for_regex(pattern);
237
238        assert_eq!(is_string_in(&dfa, ""), true);
239        assert_eq!(is_string_in(&dfa, "a"), true);
240        assert_eq!(is_string_in(&dfa, "aa"), true);
241        assert_eq!(is_string_in(&dfa, "aaaaaaa"), true);
242        assert_eq!(is_string_in(&dfa, "b"), false);
243        assert_eq!(is_string_in(&dfa, "ab"), false);
244    }
245
246    #[test]
247    fn test_plus() {
248        let pattern = Regex::plus_from(Regex::single_char('a'));
249        let dfa = create_dfa_for_regex(pattern);
250
251        assert_eq!(is_string_in(&dfa, ""), false);
252        assert_eq!(is_string_in(&dfa, "a"), true);
253        assert_eq!(is_string_in(&dfa, "aa"), true);
254        assert_eq!(is_string_in(&dfa, "aaaaaaa"), true);
255        assert_eq!(is_string_in(&dfa, "b"), false);
256        assert_eq!(is_string_in(&dfa, "ab"), false);
257    }
258
259    #[test]
260    fn test_complex() {
261        let pattern = Regex::concat(vec![
262            Regex::union(vec![
263                Regex::character_range('a', 'z'),
264                Regex::character_range('A', 'Z'),
265                Regex::single_char('_'),
266            ]),
267            Regex::star_from(Regex::union(vec![
268                Regex::character_range('a', 'z'),
269                Regex::character_range('A', 'Z'),
270                Regex::character_range('0', '9'),
271                Regex::single_char('_'),
272            ])),
273        ]);
274        let dfa = create_dfa_for_regex(pattern);
275
276        assert_eq!(is_string_in(&dfa, "MyThing"), true);
277        assert_eq!(is_string_in(&dfa, "our_thing_12"), true);
278        assert_eq!(is_string_in(&dfa, "i"), true);
279        assert_eq!(is_string_in(&dfa, "a1jh2b45"), true);
280        assert_eq!(is_string_in(&dfa, ""), false);
281        assert_eq!(is_string_in(&dfa, "mine()"), false);
282        assert_eq!(is_string_in(&dfa, "12"), false);
283        assert_eq!(is_string_in(&dfa, "1ours"), false);
284    }
285}