syntax_parser_generator/lex/
regex.rs1use crate::automata::nfa::{Nfa, NfaState};
2use crate::handles::Handle;
3use crate::handles::specials::AutomaticallyHandled;
4
5#[derive(Clone)]
10pub enum Regex {
11 SingleCharacter {
13 value: u8,
15 },
16
17 Union {
19 options: Vec<Regex>,
21 },
22
23 Concat {
25 parts: Vec<Regex>,
27 },
28
29 Star {
31 repeated_pattern: Box<Regex>,
33 },
34}
35
36impl Regex {
37 pub fn single_char(value: char) -> Regex {
43 Regex::SingleCharacter {
44 value: value.try_into().unwrap_or_else(|_| {
45 panic!(
46 "Cannot create a single-character regex from {:?}, as it's not 1-byte long",
47 value
48 )
49 }),
50 }
51 }
52
53 pub fn union(options: Vec<Regex>) -> Regex {
55 Regex::Union { options }
56 }
57
58 pub fn concat(parts: Vec<Regex>) -> Regex {
60 Regex::Concat { parts }
61 }
62
63 pub fn star_from(repeated_pattern: Regex) -> Regex {
65 Regex::Star {
66 repeated_pattern: Box::new(repeated_pattern),
67 }
68 }
69
70 pub fn plus_from(repeated_pattern: Regex) -> Regex {
72 let star_pattern = Regex::star_from(repeated_pattern.clone());
73 Regex::concat(vec![repeated_pattern, star_pattern])
74 }
75
76 pub fn white_space() -> Regex {
78 let white_space_characters = vec![' ', '\t', '\n', '\r', '\x0B', '\x0C'];
79 Regex::union(
80 white_space_characters
81 .into_iter()
82 .map(Regex::single_char)
83 .collect(),
84 )
85 }
86
87 pub fn constant_string(string: &str) -> Regex {
89 Regex::concat(string.chars().map(Regex::single_char).collect())
90 }
91
92 pub fn character_range(start: char, end: char) -> Regex {
95 Regex::union((start..=end).map(Regex::single_char).collect())
96 }
97
98 pub fn optional(option: Regex) -> Regex {
100 Regex::union(vec![option, Regex::epsilon()])
101 }
102
103 pub fn epsilon() -> Regex {
105 Regex::concat(vec![])
106 }
107
108 pub(crate) fn build_into_nfa<Label>(
109 &self,
110 nfa: &mut Nfa<u8, Label>,
111 ) -> (Handle<NfaState<u8, Label>>, Handle<NfaState<u8, Label>>)
112 where
113 {
114 match self {
115 Regex::SingleCharacter { value } => {
116 let start = nfa.new_state();
117 let end = nfa.new_state();
118 nfa.link(start, end, Some(value.handle()));
119 (start, end)
120 }
121 Regex::Union { options } => {
122 let start = nfa.new_state();
123 let end = nfa.new_state();
124 for option in options {
125 let (option_start, option_end) = option.build_into_nfa(nfa);
126 nfa.link(start, option_start, None);
127 nfa.link(option_end, end, None);
128 }
129 (start, end)
130 }
131 Regex::Concat { parts } => {
132 let start = nfa.new_state();
133 let end = nfa.new_state();
134 let mut curr = start;
135 for part in parts {
136 let (part_start, part_end) = part.build_into_nfa(nfa);
137 nfa.link(curr, part_start, None);
138 curr = part_end;
139 }
140 nfa.link(curr, end, None);
141 (start, end)
142 }
143 Regex::Star { repeated_pattern } => {
144 let start = nfa.new_state();
145 let end = nfa.new_state();
146 let (repeated_pattern_start, repeated_pattern_end) =
147 repeated_pattern.build_into_nfa(nfa);
148
149 nfa.link(start, repeated_pattern_start, None);
150 nfa.link(start, end, None);
151 nfa.link(repeated_pattern_end, end, None);
152 nfa.link(repeated_pattern_end, repeated_pattern_start, None);
153
154 (start, end)
155 }
156 }
157 }
158}
159
160#[cfg(test)]
161mod tests {
162 use crate::automata::dfa::Dfa;
163
164 use super::*;
165
166 fn create_dfa_for_regex(pattern: Regex) -> Dfa<u8, ()> {
167 let mut nfa = Nfa::new();
168 let (start, end) = pattern.build_into_nfa(&mut nfa);
169 nfa.label(end, Some(()));
170 nfa.set_initial_state(start);
171
172 let dfa = nfa
173 .compile_to_dfa(|labels| if labels.is_empty() { None } else { Some(()) })
174 .minimize();
175
176 return dfa;
177 }
178
179 fn is_string_in(dfa: &Dfa<u8, ()>, data: &str) -> bool {
180 match dfa.scan(
181 String::from(data)
182 .into_bytes()
183 .into_iter()
184 .map(|x| x.handle()),
185 ) {
186 None => false,
187 Some(end_state) => !dfa.get_label(end_state).is_none(),
188 }
189 }
190 #[test]
191 fn test_single_char() {
192 let pattern = Regex::single_char('a');
193 let dfa = create_dfa_for_regex(pattern);
194
195 assert_eq!(is_string_in(&dfa, "a"), true);
196 assert_eq!(is_string_in(&dfa, ""), false);
197 assert_eq!(is_string_in(&dfa, "aa"), false);
198 }
199
200 #[test]
201 fn test_union() {
202 let pattern = Regex::union(vec![
203 Regex::single_char('a'),
204 Regex::single_char('b'),
205 Regex::single_char('c'),
206 ]);
207 let dfa = create_dfa_for_regex(pattern);
208
209 assert_eq!(is_string_in(&dfa, "a"), true);
210 assert_eq!(is_string_in(&dfa, "b"), true);
211 assert_eq!(is_string_in(&dfa, "c"), true);
212 assert_eq!(is_string_in(&dfa, ""), false);
213 assert_eq!(is_string_in(&dfa, "aa"), false);
214 assert_eq!(is_string_in(&dfa, "d"), false);
215 }
216
217 #[test]
218 fn test_concat() {
219 let pattern = Regex::concat(vec![
220 Regex::single_char('a'),
221 Regex::single_char('b'),
222 Regex::single_char('c'),
223 ]);
224 let dfa = create_dfa_for_regex(pattern);
225
226 assert_eq!(is_string_in(&dfa, "abc"), true);
227 assert_eq!(is_string_in(&dfa, ""), false);
228 assert_eq!(is_string_in(&dfa, "a"), false);
229 assert_eq!(is_string_in(&dfa, "bc"), false);
230 }
231
232 #[test]
234 fn test_star() {
235 let pattern = Regex::star_from(Regex::single_char('a'));
236 let dfa = create_dfa_for_regex(pattern);
237
238 assert_eq!(is_string_in(&dfa, ""), true);
239 assert_eq!(is_string_in(&dfa, "a"), true);
240 assert_eq!(is_string_in(&dfa, "aa"), true);
241 assert_eq!(is_string_in(&dfa, "aaaaaaa"), true);
242 assert_eq!(is_string_in(&dfa, "b"), false);
243 assert_eq!(is_string_in(&dfa, "ab"), false);
244 }
245
246 #[test]
247 fn test_plus() {
248 let pattern = Regex::plus_from(Regex::single_char('a'));
249 let dfa = create_dfa_for_regex(pattern);
250
251 assert_eq!(is_string_in(&dfa, ""), false);
252 assert_eq!(is_string_in(&dfa, "a"), true);
253 assert_eq!(is_string_in(&dfa, "aa"), true);
254 assert_eq!(is_string_in(&dfa, "aaaaaaa"), true);
255 assert_eq!(is_string_in(&dfa, "b"), false);
256 assert_eq!(is_string_in(&dfa, "ab"), false);
257 }
258
259 #[test]
260 fn test_complex() {
261 let pattern = Regex::concat(vec![
262 Regex::union(vec![
263 Regex::character_range('a', 'z'),
264 Regex::character_range('A', 'Z'),
265 Regex::single_char('_'),
266 ]),
267 Regex::star_from(Regex::union(vec![
268 Regex::character_range('a', 'z'),
269 Regex::character_range('A', 'Z'),
270 Regex::character_range('0', '9'),
271 Regex::single_char('_'),
272 ])),
273 ]);
274 let dfa = create_dfa_for_regex(pattern);
275
276 assert_eq!(is_string_in(&dfa, "MyThing"), true);
277 assert_eq!(is_string_in(&dfa, "our_thing_12"), true);
278 assert_eq!(is_string_in(&dfa, "i"), true);
279 assert_eq!(is_string_in(&dfa, "a1jh2b45"), true);
280 assert_eq!(is_string_in(&dfa, ""), false);
281 assert_eq!(is_string_in(&dfa, "mine()"), false);
282 assert_eq!(is_string_in(&dfa, "12"), false);
283 assert_eq!(is_string_in(&dfa, "1ours"), false);
284 }
285}