mr_regex/
lib.rs

1use id_arena::{Arena, Id};
2
3type AsciiString = Vec<u8>;
4pub type AsciiStringRef<'a> = &'a [u8];
5
6type NFAStateId = Id<NFAState>;
7type VisitedNodes = Vec<NFAStateId>;
8
9struct NFAState {
10    is_end: bool,
11    char_transition: Vec<Option<NFAStateId>>,
12    epsilon_transition: Vec<NFAStateId>,
13}
14
15impl NFAState {
16    fn new() -> Self {
17        NFAState {
18            is_end: false,
19            char_transition: vec![None; 256],
20            epsilon_transition: vec![],
21        }
22    }
23
24    fn add_epsilon(&mut self, to: NFAStateId) {
25        self.epsilon_transition.push(to)
26    }
27}
28
29struct NFAFragment {
30    start: NFAStateId,
31    out: NFAStateId,
32}
33
34impl NFAFragment {
35    fn new(arena: &mut Arena<NFAState>) -> Self {
36        let start = arena.alloc(NFAState::new());
37        let out = arena.alloc(NFAState::new());
38
39        arena[start].is_end = false;
40        arena[out].is_end = true;
41
42        NFAFragment { start, out }
43    }
44}
45
46fn insert_concat_operator(regexp_bytes: AsciiStringRef) -> AsciiString {
47    let n = regexp_bytes.len();
48    let mut result = Vec::with_capacity(n + n + 1);
49
50    for i in 0..n {
51        let at = regexp_bytes[i];
52        result.push(at);
53
54        if at == ('(' as u8) || at == ('|' as u8) {
55            continue;
56        }
57
58        if i < n - 1 {
59            let next = regexp_bytes[i + 1];
60
61            if next == ('|' as u8)
62                || next == ('+' as u8)
63                || next == ('*' as u8)
64                || next == (')' as u8)
65                || next == ('?' as u8)
66            {
67                continue;
68            }
69
70            result.push('.' as u8);
71        }
72    }
73
74    result
75}
76
77fn operator_precedence(c: u8) -> u8 {
78    match c as char {
79        '.' => 0,
80        '|' => 1,
81        '+' => 2,
82        '?' => 2,
83        '*' => 2,
84        _ => 0,
85    }
86}
87
88fn is_operator(c: u8) -> bool {
89    (c == '|' as u8) || (c == '+' as u8) || (c == '.' as u8) || (c == '*' as u8) || (c == '?' as u8)
90}
91
92fn regexp_to_postfix(regexp: AsciiStringRef) -> AsciiString {
93    let n = regexp.len();
94    let mut result: Vec<u8> = Vec::with_capacity(n + n + 1);
95    let mut stack: Vec<u8> = Vec::with_capacity(n + n + 1);
96
97    for i in 0..n {
98        let token: u8 = regexp[i];
99
100        if is_operator(token) {
101            let precedence = operator_precedence(token);
102
103            while let Some(c) = stack.last() {
104                if *c != '(' as u8 && operator_precedence(stack[0]) >= precedence {
105                    result.push(*c);
106                    stack.pop();
107                } else {
108                    break;
109                }
110            }
111
112            stack.push(token);
113        } else if token == '(' as u8 {
114            stack.push(token);
115        } else if token == ')' as u8 {
116            while let Some(c) = stack.last() {
117                if *c != '(' as u8 {
118                    result.push(*c);
119                    stack.pop();
120                } else {
121                    break;
122                }
123            }
124            stack.pop();
125        } else {
126            result.push(token);
127        }
128    }
129
130    while let Some(c) = stack.last() {
131        result.push(*c);
132        stack.pop();
133    }
134
135    result
136}
137
138fn postfix_to_nfa(arena: &mut Arena<NFAState>, postfix_regexp: AsciiStringRef) -> NFAFragment {
139    let n = postfix_regexp.len();
140    let mut stack: Vec<NFAFragment> = Vec::with_capacity(n);
141
142    for i in 0..n {
143        let at = postfix_regexp[i];
144        match at as char {
145            '.' => {
146                let right = stack.pop().unwrap();
147                let left = stack.pop().unwrap();
148
149                arena[left.out].is_end = false;
150                arena[left.out].add_epsilon(right.start);
151
152                let mut frag = NFAFragment::new(arena);
153                frag.start = left.start;
154                frag.out = right.out;
155                stack.push(frag);
156            }
157            '|' => {
158                let right = stack.pop().unwrap();
159                let left = stack.pop().unwrap();
160
161                let frag = NFAFragment::new(arena);
162                arena[frag.start].add_epsilon(right.start);
163                arena[frag.start].add_epsilon(left.start);
164
165                arena[left.out].is_end = false;
166                arena[left.out].add_epsilon(frag.out);
167
168                arena[right.out].is_end = false;
169                arena[right.out].add_epsilon(frag.out);
170
171                stack.push(frag);
172            }
173            '?' => {
174                let op = stack.pop().unwrap();
175
176                let frag = NFAFragment::new(arena);
177                arena[frag.start].add_epsilon(frag.out);
178                arena[frag.start].add_epsilon(op.start);
179                arena[op.out].add_epsilon(frag.out);
180                arena[op.out].is_end = false;
181
182                stack.push(frag);
183            }
184            '+' => {
185                let op = stack.pop().unwrap();
186
187                let frag = NFAFragment::new(arena);
188                arena[frag.start].add_epsilon(op.start);
189                arena[op.out].add_epsilon(op.start);
190                arena[op.out].add_epsilon(frag.out);
191                arena[op.out].is_end = false;
192
193                stack.push(frag);
194            }
195            '*' => {
196                let op = stack.pop().unwrap();
197
198                let frag = NFAFragment::new(arena);
199                arena[frag.start].add_epsilon(op.start);
200                arena[frag.start].add_epsilon(frag.out);
201                arena[op.out].add_epsilon(op.start);
202                arena[op.out].add_epsilon(frag.out);
203                arena[op.out].is_end = false;
204
205                stack.push(frag);
206            }
207            _ => {
208                let frag = NFAFragment::new(arena);
209                arena[frag.start].char_transition[at as usize] = Some(frag.out);
210                stack.push(frag);
211            }
212        }
213    }
214
215    let result = stack.pop().unwrap();
216    result
217}
218
219fn already_visited(v: &VisitedNodes, node: NFAStateId) -> bool {
220    for e in v {
221        if *e == node {
222            return true;
223        }
224    }
225
226    false
227}
228
229fn dfs(
230    arena: &Arena<NFAState>,
231    root: NFAStateId,
232    word: AsciiStringRef,
233    matched_num: usize,
234    visited: &mut VisitedNodes,
235) -> bool {
236    if already_visited(visited, root) {
237        return false;
238    }
239
240    visited.push(root);
241    if word.len() == matched_num {
242        if arena[root].is_end {
243            return true;
244        }
245
246        for v in &arena[root].epsilon_transition {
247            if dfs(arena, *v, word, matched_num, visited) {
248                return true;
249            }
250        }
251    } else {
252        if let Some(transition) = arena[root].char_transition[word[matched_num] as usize % 128] {
253            let mut visited = VisitedNodes::with_capacity(256);
254            if dfs(arena, transition, word, matched_num + 1, &mut visited) {
255                return true;
256            }
257        } else {
258            for v in &arena[root].epsilon_transition {
259                if dfs(arena, *v, word, matched_num, visited) {
260                    return true;
261                }
262            }
263        }
264    }
265
266    false
267}
268
269pub struct Regex {
270    fragment: NFAFragment,
271    arena: Arena<NFAState>,
272}
273
274impl Regex {
275    pub fn new(regexp: AsciiStringRef) -> Option<Self> {
276        let mut arena: Arena<NFAState> = Arena::new();
277        let concatted = insert_concat_operator(regexp);
278        let postfix = regexp_to_postfix(&concatted);
279        let result = postfix_to_nfa(&mut arena, &postfix);
280        Some(Regex {
281            fragment: result,
282            arena,
283        })
284    }
285
286    pub fn is_match(&self, search: AsciiStringRef) -> bool {
287        let mut visited = vec![];
288        dfs(&self.arena, self.fragment.start, search, 0, &mut visited)
289    }
290}
291
292pub fn regex_match(regexp: &str, search: &str) -> bool {
293    let r = Regex::new(regexp.as_bytes()).unwrap();
294    r.is_match(search.as_bytes())
295}
296
297#[cfg(test)]
298mod tests {
299    use super::*;
300
301    #[test]
302    fn test_regex_match() {
303        assert_eq!(true, regex_match("(zz)+", "zz"));
304        assert_eq!(true, regex_match("(x|y)*z", "xyxyyyxxxz"));
305        assert_eq!(false, regex_match("(x|y)*z+", "xy"));
306        assert_eq!(true, regex_match("(x|y)*z+", "xyzzz"));
307        assert_eq!(true, regex_match("(1|2|3|4|5|6|7|8|9)+", "1423"));
308        assert_eq!(false, regex_match("(1|2|3|4|5|6|7|8|9)+", "123abc"));
309        assert_eq!(true, regex_match("a?", ""));
310        assert_eq!(true, regex_match("a?", "a"));
311        assert_eq!(false, regex_match("a?", "aa"));
312        assert_eq!(true, regex_match("hell(a|o)?", "hello"));
313        assert_eq!(true, regex_match("(a|b)?", "a"));
314    }
315}