Skip to main content

pipa/regexp/
fast_engine.rs

1use super::compiler::{Program, compile};
2use super::dfa::SimpleDFA;
3use super::engine::{Match, execute as nfa_execute};
4use super::fast_class::FastClassMatcher;
5use super::optimizer::{OptimizedPattern, analyze_pattern};
6use super::parser::parse;
7
8fn is_pure_literal(pattern: &str) -> bool {
9    for c in pattern.chars() {
10        match c {
11            '\\' | '.' | '+' | '*' | '?' | '^' | '$' | '(' | ')' | '[' | ']' | '{' | '}' | '|' => {
12                return false;
13            }
14            _ => {}
15        }
16    }
17    true
18}
19
20#[derive(Debug, Clone, Copy, PartialEq)]
21pub enum ExecMode {
22    LiteralFast,
23
24    FastClass,
25
26    LiteralDFA,
27
28    ClassDFA,
29
30    OptimizedNFA,
31
32    FullNFA,
33}
34
35#[derive(Clone)]
36pub struct FastRegex {
37    mode: ExecMode,
38
39    fast_class: Option<FastClassMatcher>,
40
41    dfa: Option<SimpleDFA>,
42
43    program: Option<Program>,
44}
45
46impl FastRegex {
47    pub fn new(pattern: &str, flags: u16) -> Result<Self, String> {
48        if let Some(fast_class) = FastClassMatcher::from_pattern(pattern) {
49            return Ok(Self {
50                mode: ExecMode::FastClass,
51                fast_class: Some(fast_class),
52                dfa: None,
53                program: None,
54            });
55        }
56
57        if is_pure_literal(pattern) {
58            if let Some(dfa) = SimpleDFA::new(pattern) {
59                return Ok(Self {
60                    mode: ExecMode::LiteralFast,
61                    fast_class: None,
62                    dfa: Some(dfa),
63                    program: None,
64                });
65            }
66        }
67
68        if let Some(dfa) = SimpleDFA::new(pattern) {
69            return Ok(Self {
70                mode: ExecMode::ClassDFA,
71                fast_class: None,
72                dfa: Some(dfa),
73                program: None,
74            });
75        }
76
77        let ast = parse(pattern, flags)?;
78
79        let optimized = analyze_pattern(&ast);
80
81        match optimized {
82            OptimizedPattern::LiteralChar(c) => {
83                let mut literal_str = String::new();
84                literal_str.push(c);
85                let dfa = SimpleDFA::new(&literal_str).ok_or("Failed to create literal DFA")?;
86                Ok(Self {
87                    mode: ExecMode::LiteralFast,
88                    fast_class: None,
89                    dfa: Some(dfa),
90                    program: None,
91                })
92            }
93            OptimizedPattern::LiteralString(s) => {
94                let dfa = SimpleDFA::new(&s).ok_or("Failed to create literal DFA")?;
95                Ok(Self {
96                    mode: ExecMode::LiteralFast,
97                    fast_class: None,
98                    dfa: Some(dfa),
99                    program: None,
100                })
101            }
102            OptimizedPattern::CharClass(_) => {
103                let program = compile(&ast, flags)?;
104                Ok(Self {
105                    mode: ExecMode::FullNFA,
106                    fast_class: None,
107                    dfa: None,
108                    program: Some(program),
109                })
110            }
111            OptimizedPattern::Simple(ast) => {
112                let program = compile(&ast, flags).ok();
113                Ok(Self {
114                    mode: if program.is_some() {
115                        ExecMode::OptimizedNFA
116                    } else {
117                        ExecMode::FullNFA
118                    },
119                    fast_class: None,
120                    dfa: None,
121                    program,
122                })
123            }
124            OptimizedPattern::Complex(ast) => {
125                let program = compile(&ast, flags)?;
126                Ok(Self {
127                    mode: ExecMode::FullNFA,
128                    fast_class: None,
129                    dfa: None,
130                    program: Some(program),
131                })
132            }
133        }
134    }
135
136    pub fn find(&self, input: &str) -> Option<Match> {
137        match self.mode {
138            ExecMode::LiteralFast => self.dfa.as_ref().and_then(|dfa| dfa.find(input)),
139            ExecMode::FastClass => {
140                if let Some(ref fast_class) = self.fast_class {
141                    fast_class.find(input)
142                } else {
143                    None
144                }
145            }
146            ExecMode::LiteralDFA | ExecMode::ClassDFA => {
147                if let Some(ref dfa) = self.dfa {
148                    dfa.find(input)
149                } else {
150                    None
151                }
152            }
153            ExecMode::OptimizedNFA => {
154                if let Some(ref dfa) = self.dfa {
155                    if let Some(m) = dfa.find(input) {
156                        return Some(m);
157                    }
158                }
159
160                if let Some(ref program) = self.program {
161                    nfa_execute(program, input, 0)
162                } else {
163                    None
164                }
165            }
166            ExecMode::FullNFA => {
167                if let Some(ref program) = self.program {
168                    nfa_execute(program, input, 0)
169                } else {
170                    None
171                }
172            }
173        }
174    }
175
176    #[inline(always)]
177    pub fn is_match(&self, input: &str) -> bool {
178        match self.mode {
179            ExecMode::LiteralFast => self.dfa.as_ref().map_or(false, |dfa| dfa.is_match(input)),
180            ExecMode::FastClass => {
181                if let Some(ref fast_class) = self.fast_class {
182                    fast_class.is_match(input)
183                } else {
184                    false
185                }
186            }
187            ExecMode::LiteralDFA | ExecMode::ClassDFA => {
188                if let Some(ref dfa) = self.dfa {
189                    dfa.is_match(input)
190                } else {
191                    false
192                }
193            }
194            ExecMode::OptimizedNFA | ExecMode::FullNFA => self.find(input).is_some(),
195        }
196    }
197
198    pub fn find_all(&self, input: &str) -> Vec<Match> {
199        if let Some(ref dfa) = self.dfa {
200            return dfa.find_all(input);
201        }
202
203        if let Some(ref fast_class) = self.fast_class {
204            return fast_class.find_all(input);
205        }
206
207        let mut matches = Vec::new();
208        let mut pos = 0;
209
210        while pos <= input.len() {
211            if let Some(m) = self.find_from(input, pos) {
212                let match_end = m.end;
213                matches.push(m);
214
215                if match_end <= pos {
216                    pos += 1;
217                } else {
218                    pos = match_end;
219                }
220            } else {
221                break;
222            }
223        }
224
225        matches
226    }
227
228    fn find_from(&self, input: &str, start: usize) -> Option<Match> {
229        if start >= input.len() {
230            return None;
231        }
232
233        let slice = &input[start..];
234
235        match self.mode {
236            ExecMode::LiteralDFA | ExecMode::ClassDFA => {
237                if let Some(ref dfa) = self.dfa {
238                    dfa.find(slice).map(|m| Match {
239                        start: m.start + start,
240                        end: m.end + start,
241                        captures: m
242                            .captures
243                            .iter()
244                            .map(|(s, e)| (s.map(|x| x + start), e.map(|x| x + start)))
245                            .collect(),
246                    })
247                } else {
248                    None
249                }
250            }
251            _ => {
252                if let Some(ref program) = self.program {
253                    nfa_execute(program, input, start)
254                } else {
255                    None
256                }
257            }
258        }
259    }
260
261    pub fn mode(&self) -> ExecMode {
262        self.mode
263    }
264
265    pub fn dfa(&self) -> Option<&SimpleDFA> {
266        self.dfa.as_ref()
267    }
268}
269
270#[cfg(test)]
271mod tests {
272    use super::*;
273
274    #[test]
275    fn test_fast_literal() {
276        let re = FastRegex::new("hello", 0).unwrap();
277
278        assert_eq!(re.mode(), ExecMode::LiteralFast);
279        assert!(re.is_match("hello world"));
280        assert!(!re.is_match("goodbye"));
281    }
282
283    #[test]
284    fn test_fast_literal_find() {
285        let re = FastRegex::new("world", 0).unwrap();
286        let m = re.find("hello world").unwrap();
287        assert_eq!(m.start, 6);
288        assert_eq!(m.end, 11);
289    }
290
291    #[test]
292    fn test_fast_char_class() {
293        let re = FastRegex::new(r"\w+", 0).unwrap();
294        assert_eq!(
295            re.mode(),
296            ExecMode::FastClass,
297            "\\w+ should use FastClass mode"
298        );
299
300        let m = re.find("hello world").unwrap();
301        assert_eq!(m.start, 0);
302        assert_eq!(m.end, 5);
303
304        let re2 = FastRegex::new(r"\d+", 0).unwrap();
305        assert_eq!(
306            re2.mode(),
307            ExecMode::FastClass,
308            "\\d+ should use FastClass mode"
309        );
310
311        let m2 = re2.find("abc123def").unwrap();
312        assert_eq!(m2.start, 3);
313        assert_eq!(m2.end, 6);
314
315        let re3 = FastRegex::new(r"\s+", 0).unwrap();
316        assert_eq!(
317            re3.mode(),
318            ExecMode::FastClass,
319            "\\s+ should use FastClass mode"
320        );
321
322        let m3 = re3.find("hello   world").unwrap();
323        assert_eq!(m3.start, 5);
324        assert_eq!(m3.end, 8);
325    }
326
327    #[test]
328    fn test_find_all() {
329        let re = FastRegex::new("a", 0).unwrap();
330        let matches = re.find_all("banana");
331        assert_eq!(matches.len(), 3);
332    }
333
334    #[test]
335    fn test_is_match_fast() {
336        let re = FastRegex::new(r"\d+", 0).unwrap();
337        assert!(re.is_match("abc123def"));
338        assert!(!re.is_match("abcdef"));
339    }
340
341    #[test]
342    fn test_fast_literal_is_match() {
343        let re = FastRegex::new("hello", 0).unwrap();
344        assert!(re.is_match("hello world"));
345        assert!(re.is_match("say hello"));
346        assert!(!re.is_match("goodbye"));
347    }
348
349    #[test]
350    fn test_fast_empty_pattern() {
351        let re = FastRegex::new("", 0).unwrap();
352        assert!(re.is_match("anything"));
353        assert!(re.is_match(""));
354    }
355
356    #[test]
357    fn test_fast_single_char() {
358        let re = FastRegex::new("x", 0).unwrap();
359        assert!(re.is_match("xyz"));
360        assert!(re.is_match("abc x def"));
361        assert!(!re.is_match("abc"));
362    }
363
364    #[test]
365    fn test_fast_anchors() {
366        let re_start = FastRegex::new("^hello", 0).unwrap();
367        assert!(re_start.is_match("hello world"));
368        assert!(!re_start.is_match("say hello"));
369
370        let re_end = FastRegex::new("world$", 0).unwrap();
371        assert!(re_end.is_match("hello world"));
372        assert!(!re_end.is_match("worldly"));
373    }
374
375    #[test]
376    fn test_fast_star_plus() {
377        let re_star = FastRegex::new("a*", 0).unwrap();
378        assert!(re_star.is_match(""));
379        assert!(re_star.is_match("a"));
380        assert!(re_star.is_match("aaa"));
381        assert!(re_star.is_match("baaa"));
382
383        let re_plus = FastRegex::new("a+", 0).unwrap();
384        assert!(!re_plus.is_match(""));
385        assert!(!re_plus.is_match("bbb"));
386        assert!(re_plus.is_match("a"));
387        assert!(re_plus.is_match("aaa"));
388    }
389
390    #[test]
391    fn test_find_all_words() {
392        let re = FastRegex::new(r"\w+", 0).unwrap();
393        let matches = re.find_all("hello world test");
394        assert_eq!(matches.len(), 3);
395        assert_eq!(matches[0].start, 0);
396        assert_eq!(matches[0].end, 5);
397        assert_eq!(matches[1].start, 6);
398        assert_eq!(matches[1].end, 11);
399        assert_eq!(matches[2].start, 12);
400        assert_eq!(matches[2].end, 16);
401    }
402
403    #[test]
404    fn test_find_all_digits() {
405        let re = FastRegex::new(r"\d+", 0).unwrap();
406        let matches = re.find_all("a1b22c333d");
407        assert_eq!(matches.len(), 3);
408        assert_eq!(matches[0].as_str("a1b22c333d"), "1");
409        assert_eq!(matches[1].as_str("a1b22c333d"), "22");
410        assert_eq!(matches[2].as_str("a1b22c333d"), "333");
411    }
412
413    #[test]
414    fn test_find_all_no_matches() {
415        let re = FastRegex::new("xyz", 0).unwrap();
416        let matches = re.find_all("abc");
417        assert!(matches.is_empty());
418    }
419
420    #[test]
421    fn test_fast_unicode() {
422        let re = FastRegex::new("hello", 0).unwrap();
423        assert!(re.is_match("你好 hello 世界"));
424
425        let m = re.find("你好 hello 世界").unwrap();
426        assert_eq!(m.as_str("你好 hello 世界"), "hello");
427    }
428
429    #[test]
430    fn test_fast_pattern_with_special_chars() {
431        let re_dot = FastRegex::new("a.b", 0).unwrap();
432        assert!(re_dot.is_match("axb"));
433        assert!(!re_dot.is_match("ab"));
434
435        let re_alt = FastRegex::new("a|b", 0).unwrap();
436        assert!(re_alt.is_match("a"));
437        assert!(re_alt.is_match("b"));
438        assert!(!re_alt.is_match("c"));
439    }
440
441    #[test]
442    fn test_exec_mode_display() {
443        assert_eq!(format!("{:?}", ExecMode::LiteralFast), "LiteralFast");
444        assert_eq!(format!("{:?}", ExecMode::FastClass), "FastClass");
445        assert_eq!(format!("{:?}", ExecMode::ClassDFA), "ClassDFA");
446    }
447}