Skip to main content

pipa/regexp/
optimizer.rs

1use super::ast::Ast;
2use super::charclass::CharRange;
3
4#[derive(Debug, Clone)]
5pub enum OptimizedPattern {
6    LiteralChar(char),
7
8    LiteralString(String),
9
10    CharClass(CharRange),
11
12    Simple(Box<Ast>),
13
14    Complex(Box<Ast>),
15}
16
17pub fn analyze_pattern(ast: &Ast) -> OptimizedPattern {
18    match ast {
19        Ast::Char(c) => OptimizedPattern::LiteralChar(*c),
20        Ast::Concat(nodes) if nodes.len() == 1 => analyze_pattern(&nodes[0]),
21        Ast::Concat(nodes) => {
22            let mut literal = String::new();
23            for node in nodes {
24                match node {
25                    Ast::Char(c) => literal.push(*c),
26                    _ => return OptimizedPattern::Complex(Box::new(ast.clone())),
27                }
28            }
29            if literal.len() == 1 {
30                OptimizedPattern::LiteralChar(literal.chars().next().unwrap())
31            } else {
32                OptimizedPattern::LiteralString(literal)
33            }
34        }
35        Ast::Class(class) => OptimizedPattern::CharClass(class.ranges.clone()),
36        Ast::Any => OptimizedPattern::Simple(Box::new(ast.clone())),
37        _ => OptimizedPattern::Complex(Box::new(ast.clone())),
38    }
39}
40
41pub fn fast_literal_search(haystack: &str, needle: &str) -> Option<usize> {
42    if needle.len() == 1 {
43        let c = needle.chars().next().unwrap();
44        fast_char_search(haystack, c)
45    } else {
46        boyer_moore_search(haystack, needle)
47    }
48}
49
50#[inline(always)]
51pub fn fast_char_search(haystack: &str, needle: char) -> Option<usize> {
52    if needle.is_ascii() {
53        let needle_byte = needle as u8;
54        haystack.bytes().position(|b| b == needle_byte)
55    } else {
56        haystack.chars().position(|c| c == needle)
57    }
58}
59
60pub fn boyer_moore_search(haystack: &str, needle: &str) -> Option<usize> {
61    if needle.is_empty() {
62        return Some(0);
63    }
64    if haystack.len() < needle.len() {
65        return None;
66    }
67
68    if needle.len() < 4 {
69        return naive_search(haystack, needle);
70    }
71
72    let mut bad_char: [usize; 256] = [0; 256];
73    let needle_bytes = needle.as_bytes();
74    let needle_len = needle_bytes.len();
75
76    for (i, &b) in needle_bytes.iter().enumerate() {
77        bad_char[b as usize] = i + 1;
78    }
79
80    let haystack_bytes = haystack.as_bytes();
81    let mut i = 0;
82
83    while i <= haystack_bytes.len() - needle_len {
84        let mut j = needle_len;
85
86        while j > 0 && haystack_bytes[i + j - 1] == needle_bytes[j - 1] {
87            j -= 1;
88        }
89
90        if j == 0 {
91            return Some(i);
92        }
93
94        let bad = bad_char[haystack_bytes[i + needle_len - 1] as usize];
95        if bad == 0 {
96            i += needle_len;
97        } else {
98            i += needle_len - bad + 1;
99        }
100    }
101
102    None
103}
104
105#[inline(always)]
106fn naive_search(haystack: &str, needle: &str) -> Option<usize> {
107    let haystack_bytes = haystack.as_bytes();
108    let needle_bytes = needle.as_bytes();
109
110    haystack_bytes
111        .windows(needle_bytes.len())
112        .position(|window| window == needle_bytes)
113}
114
115pub fn fast_byte_search(haystack: &str, needle: u8) -> Option<usize> {
116    haystack.bytes().position(|b| b == needle)
117}
118
119#[derive(Clone)]
120pub struct SimpleDFA {
121    literal: Option<String>,
122
123    char_class: Option<CharRange>,
124}
125
126impl SimpleDFA {
127    pub fn new(pattern: &Ast) -> Option<Self> {
128        match analyze_pattern(pattern) {
129            OptimizedPattern::LiteralChar(c) => Some(Self {
130                literal: Some(c.to_string()),
131                char_class: None,
132            }),
133            OptimizedPattern::LiteralString(s) => Some(Self {
134                literal: Some(s),
135                char_class: None,
136            }),
137            OptimizedPattern::CharClass(class) => Some(Self {
138                literal: None,
139                char_class: Some(class),
140            }),
141            _ => None,
142        }
143    }
144
145    pub fn find(&self, input: &str) -> Option<(usize, usize)> {
146        if let Some(ref literal) = self.literal {
147            fast_literal_search(input, literal).map(|pos| (pos, pos + literal.len()))
148        } else if let Some(ref class) = self.char_class {
149            for (pos, c) in input.char_indices() {
150                if class.contains(c as u32) {
151                    let end = pos + c.len_utf8();
152                    return Some((pos, end));
153                }
154            }
155            None
156        } else {
157            None
158        }
159    }
160
161    pub fn match_at(&self, input: &str, pos: usize) -> Option<usize> {
162        if pos >= input.len() {
163            return None;
164        }
165
166        if let Some(ref literal) = self.literal {
167            let remaining = &input[pos..];
168            if remaining.starts_with(literal) {
169                Some(pos + literal.len())
170            } else {
171                None
172            }
173        } else if let Some(ref class) = self.char_class {
174            let remaining = &input[pos..];
175            if let Some(c) = remaining.chars().next() {
176                if class.contains(c as u32) {
177                    Some(pos + c.len_utf8())
178                } else {
179                    None
180                }
181            } else {
182                None
183            }
184        } else {
185            None
186        }
187    }
188}
189
190pub struct MatchStats {
191    pub attempts: u64,
192
193    pub successes: u64,
194
195    pub backtracks: u64,
196}
197
198impl MatchStats {
199    pub fn new() -> Self {
200        Self {
201            attempts: 0,
202            successes: 0,
203            backtracks: 0,
204        }
205    }
206
207    pub fn should_use_dfa(&self) -> bool {
208        if self.attempts < 100 {
209            return false;
210        }
211        let backtrack_ratio = self.backtracks as f64 / self.attempts as f64;
212        backtrack_ratio < 0.1
213    }
214}
215
216impl Default for MatchStats {
217    fn default() -> Self {
218        Self::new()
219    }
220}
221
222#[cfg(test)]
223mod tests {
224    use super::*;
225
226    #[test]
227    fn test_char_search() {
228        assert_eq!(fast_char_search("hello", 'l'), Some(2));
229        assert_eq!(fast_char_search("hello", 'x'), None);
230    }
231
232    #[test]
233    fn test_boyer_moore() {
234        let text = "The quick brown fox jumps over the lazy dog";
235        assert_eq!(boyer_moore_search(text, "fox"), Some(16));
236        assert_eq!(boyer_moore_search(text, "cat"), None);
237    }
238
239    #[test]
240    fn test_naive_search() {
241        assert_eq!(naive_search("hello", "ll"), Some(2));
242        assert_eq!(naive_search("hello", "abc"), None);
243    }
244
245    #[test]
246    fn test_dfa_literal() {
247        let ast = Ast::Char('a');
248        let dfa = SimpleDFA::new(&ast).unwrap();
249        assert_eq!(dfa.find("abc"), Some((0, 1)));
250        assert_eq!(dfa.find("bbc"), None);
251    }
252
253    #[test]
254    fn test_dfa_match_at() {
255        let ast = Ast::Char('a');
256        let dfa = SimpleDFA::new(&ast).unwrap();
257        assert_eq!(dfa.match_at("abc", 0), Some(1));
258        assert_eq!(dfa.match_at("abc", 1), None);
259    }
260}