1use id_arena::{Arena, Id};
2
3type AsciiString = Vec<u8>;
4pub type AsciiStringRef<'a> = &'a [u8];
5
6type NFAStateId = Id<NFAState>;
7type VisitedNodes = Vec<NFAStateId>;
8
9struct NFAState {
10 is_end: bool,
11 char_transition: Vec<Option<NFAStateId>>,
12 epsilon_transition: Vec<NFAStateId>,
13}
14
15impl NFAState {
16 fn new() -> Self {
17 NFAState {
18 is_end: false,
19 char_transition: vec![None; 256],
20 epsilon_transition: vec![],
21 }
22 }
23
24 fn add_epsilon(&mut self, to: NFAStateId) {
25 self.epsilon_transition.push(to)
26 }
27}
28
29struct NFAFragment {
30 start: NFAStateId,
31 out: NFAStateId,
32}
33
34impl NFAFragment {
35 fn new(arena: &mut Arena<NFAState>) -> Self {
36 let start = arena.alloc(NFAState::new());
37 let out = arena.alloc(NFAState::new());
38
39 arena[start].is_end = false;
40 arena[out].is_end = true;
41
42 NFAFragment { start, out }
43 }
44}
45
46fn insert_concat_operator(regexp_bytes: AsciiStringRef) -> AsciiString {
47 let n = regexp_bytes.len();
48 let mut result = Vec::with_capacity(n + n + 1);
49
50 for i in 0..n {
51 let at = regexp_bytes[i];
52 result.push(at);
53
54 if at == ('(' as u8) || at == ('|' as u8) {
55 continue;
56 }
57
58 if i < n - 1 {
59 let next = regexp_bytes[i + 1];
60
61 if next == ('|' as u8)
62 || next == ('+' as u8)
63 || next == ('*' as u8)
64 || next == (')' as u8)
65 || next == ('?' as u8)
66 {
67 continue;
68 }
69
70 result.push('.' as u8);
71 }
72 }
73
74 result
75}
76
77fn operator_precedence(c: u8) -> u8 {
78 match c as char {
79 '.' => 0,
80 '|' => 1,
81 '+' => 2,
82 '?' => 2,
83 '*' => 2,
84 _ => 0,
85 }
86}
87
88fn is_operator(c: u8) -> bool {
89 (c == '|' as u8) || (c == '+' as u8) || (c == '.' as u8) || (c == '*' as u8) || (c == '?' as u8)
90}
91
92fn regexp_to_postfix(regexp: AsciiStringRef) -> AsciiString {
93 let n = regexp.len();
94 let mut result: Vec<u8> = Vec::with_capacity(n + n + 1);
95 let mut stack: Vec<u8> = Vec::with_capacity(n + n + 1);
96
97 for i in 0..n {
98 let token: u8 = regexp[i];
99
100 if is_operator(token) {
101 let precedence = operator_precedence(token);
102
103 while let Some(c) = stack.last() {
104 if *c != '(' as u8 && operator_precedence(stack[0]) >= precedence {
105 result.push(*c);
106 stack.pop();
107 } else {
108 break;
109 }
110 }
111
112 stack.push(token);
113 } else if token == '(' as u8 {
114 stack.push(token);
115 } else if token == ')' as u8 {
116 while let Some(c) = stack.last() {
117 if *c != '(' as u8 {
118 result.push(*c);
119 stack.pop();
120 } else {
121 break;
122 }
123 }
124 stack.pop();
125 } else {
126 result.push(token);
127 }
128 }
129
130 while let Some(c) = stack.last() {
131 result.push(*c);
132 stack.pop();
133 }
134
135 result
136}
137
138fn postfix_to_nfa(arena: &mut Arena<NFAState>, postfix_regexp: AsciiStringRef) -> NFAFragment {
139 let n = postfix_regexp.len();
140 let mut stack: Vec<NFAFragment> = Vec::with_capacity(n);
141
142 for i in 0..n {
143 let at = postfix_regexp[i];
144 match at as char {
145 '.' => {
146 let right = stack.pop().unwrap();
147 let left = stack.pop().unwrap();
148
149 arena[left.out].is_end = false;
150 arena[left.out].add_epsilon(right.start);
151
152 let mut frag = NFAFragment::new(arena);
153 frag.start = left.start;
154 frag.out = right.out;
155 stack.push(frag);
156 }
157 '|' => {
158 let right = stack.pop().unwrap();
159 let left = stack.pop().unwrap();
160
161 let frag = NFAFragment::new(arena);
162 arena[frag.start].add_epsilon(right.start);
163 arena[frag.start].add_epsilon(left.start);
164
165 arena[left.out].is_end = false;
166 arena[left.out].add_epsilon(frag.out);
167
168 arena[right.out].is_end = false;
169 arena[right.out].add_epsilon(frag.out);
170
171 stack.push(frag);
172 }
173 '?' => {
174 let op = stack.pop().unwrap();
175
176 let frag = NFAFragment::new(arena);
177 arena[frag.start].add_epsilon(frag.out);
178 arena[frag.start].add_epsilon(op.start);
179 arena[op.out].add_epsilon(frag.out);
180 arena[op.out].is_end = false;
181
182 stack.push(frag);
183 }
184 '+' => {
185 let op = stack.pop().unwrap();
186
187 let frag = NFAFragment::new(arena);
188 arena[frag.start].add_epsilon(op.start);
189 arena[op.out].add_epsilon(op.start);
190 arena[op.out].add_epsilon(frag.out);
191 arena[op.out].is_end = false;
192
193 stack.push(frag);
194 }
195 '*' => {
196 let op = stack.pop().unwrap();
197
198 let frag = NFAFragment::new(arena);
199 arena[frag.start].add_epsilon(op.start);
200 arena[frag.start].add_epsilon(frag.out);
201 arena[op.out].add_epsilon(op.start);
202 arena[op.out].add_epsilon(frag.out);
203 arena[op.out].is_end = false;
204
205 stack.push(frag);
206 }
207 _ => {
208 let frag = NFAFragment::new(arena);
209 arena[frag.start].char_transition[at as usize] = Some(frag.out);
210 stack.push(frag);
211 }
212 }
213 }
214
215 let result = stack.pop().unwrap();
216 result
217}
218
219fn already_visited(v: &VisitedNodes, node: NFAStateId) -> bool {
220 for e in v {
221 if *e == node {
222 return true;
223 }
224 }
225
226 false
227}
228
229fn dfs(
230 arena: &Arena<NFAState>,
231 root: NFAStateId,
232 word: AsciiStringRef,
233 matched_num: usize,
234 visited: &mut VisitedNodes,
235) -> bool {
236 if already_visited(visited, root) {
237 return false;
238 }
239
240 visited.push(root);
241 if word.len() == matched_num {
242 if arena[root].is_end {
243 return true;
244 }
245
246 for v in &arena[root].epsilon_transition {
247 if dfs(arena, *v, word, matched_num, visited) {
248 return true;
249 }
250 }
251 } else {
252 if let Some(transition) = arena[root].char_transition[word[matched_num] as usize % 128] {
253 let mut visited = VisitedNodes::with_capacity(256);
254 if dfs(arena, transition, word, matched_num + 1, &mut visited) {
255 return true;
256 }
257 } else {
258 for v in &arena[root].epsilon_transition {
259 if dfs(arena, *v, word, matched_num, visited) {
260 return true;
261 }
262 }
263 }
264 }
265
266 false
267}
268
269pub struct Regex {
270 fragment: NFAFragment,
271 arena: Arena<NFAState>,
272}
273
274impl Regex {
275 pub fn new(regexp: AsciiStringRef) -> Option<Self> {
276 let mut arena: Arena<NFAState> = Arena::new();
277 let concatted = insert_concat_operator(regexp);
278 let postfix = regexp_to_postfix(&concatted);
279 let result = postfix_to_nfa(&mut arena, &postfix);
280 Some(Regex {
281 fragment: result,
282 arena,
283 })
284 }
285
286 pub fn is_match(&self, search: AsciiStringRef) -> bool {
287 let mut visited = vec![];
288 dfs(&self.arena, self.fragment.start, search, 0, &mut visited)
289 }
290}
291
292pub fn regex_match(regexp: &str, search: &str) -> bool {
293 let r = Regex::new(regexp.as_bytes()).unwrap();
294 r.is_match(search.as_bytes())
295}
296
297#[cfg(test)]
298mod tests {
299 use super::*;
300
301 #[test]
302 fn test_regex_match() {
303 assert_eq!(true, regex_match("(zz)+", "zz"));
304 assert_eq!(true, regex_match("(x|y)*z", "xyxyyyxxxz"));
305 assert_eq!(false, regex_match("(x|y)*z+", "xy"));
306 assert_eq!(true, regex_match("(x|y)*z+", "xyzzz"));
307 assert_eq!(true, regex_match("(1|2|3|4|5|6|7|8|9)+", "1423"));
308 assert_eq!(false, regex_match("(1|2|3|4|5|6|7|8|9)+", "123abc"));
309 assert_eq!(true, regex_match("a?", ""));
310 assert_eq!(true, regex_match("a?", "a"));
311 assert_eq!(false, regex_match("a?", "aa"));
312 assert_eq!(true, regex_match("hell(a|o)?", "hello"));
313 assert_eq!(true, regex_match("(a|b)?", "a"));
314 }
315}