1use super::ast::Ast;
2use super::charclass::CharRange;
3
4#[derive(Debug, Clone)]
5pub enum OptimizedPattern {
6 LiteralChar(char),
7
8 LiteralString(String),
9
10 CharClass(CharRange),
11
12 Simple(Box<Ast>),
13
14 Complex(Box<Ast>),
15}
16
17pub fn analyze_pattern(ast: &Ast) -> OptimizedPattern {
18 match ast {
19 Ast::Char(c) => OptimizedPattern::LiteralChar(*c),
20 Ast::Concat(nodes) if nodes.len() == 1 => analyze_pattern(&nodes[0]),
21 Ast::Concat(nodes) => {
22 let mut literal = String::new();
23 for node in nodes {
24 match node {
25 Ast::Char(c) => literal.push(*c),
26 _ => return OptimizedPattern::Complex(Box::new(ast.clone())),
27 }
28 }
29 if literal.len() == 1 {
30 OptimizedPattern::LiteralChar(literal.chars().next().unwrap())
31 } else {
32 OptimizedPattern::LiteralString(literal)
33 }
34 }
35 Ast::Class(class) => OptimizedPattern::CharClass(class.ranges.clone()),
36 Ast::Any => OptimizedPattern::Simple(Box::new(ast.clone())),
37 _ => OptimizedPattern::Complex(Box::new(ast.clone())),
38 }
39}
40
41pub fn fast_literal_search(haystack: &str, needle: &str) -> Option<usize> {
42 if needle.len() == 1 {
43 let c = needle.chars().next().unwrap();
44 fast_char_search(haystack, c)
45 } else {
46 boyer_moore_search(haystack, needle)
47 }
48}
49
50#[inline(always)]
51pub fn fast_char_search(haystack: &str, needle: char) -> Option<usize> {
52 if needle.is_ascii() {
53 let needle_byte = needle as u8;
54 haystack.bytes().position(|b| b == needle_byte)
55 } else {
56 haystack.chars().position(|c| c == needle)
57 }
58}
59
60pub fn boyer_moore_search(haystack: &str, needle: &str) -> Option<usize> {
61 if needle.is_empty() {
62 return Some(0);
63 }
64 if haystack.len() < needle.len() {
65 return None;
66 }
67
68 if needle.len() < 4 {
69 return naive_search(haystack, needle);
70 }
71
72 let mut bad_char: [usize; 256] = [0; 256];
73 let needle_bytes = needle.as_bytes();
74 let needle_len = needle_bytes.len();
75
76 for (i, &b) in needle_bytes.iter().enumerate() {
77 bad_char[b as usize] = i + 1;
78 }
79
80 let haystack_bytes = haystack.as_bytes();
81 let mut i = 0;
82
83 while i <= haystack_bytes.len() - needle_len {
84 let mut j = needle_len;
85
86 while j > 0 && haystack_bytes[i + j - 1] == needle_bytes[j - 1] {
87 j -= 1;
88 }
89
90 if j == 0 {
91 return Some(i);
92 }
93
94 let bad = bad_char[haystack_bytes[i + needle_len - 1] as usize];
95 if bad == 0 {
96 i += needle_len;
97 } else {
98 i += needle_len - bad + 1;
99 }
100 }
101
102 None
103}
104
105#[inline(always)]
106fn naive_search(haystack: &str, needle: &str) -> Option<usize> {
107 let haystack_bytes = haystack.as_bytes();
108 let needle_bytes = needle.as_bytes();
109
110 haystack_bytes
111 .windows(needle_bytes.len())
112 .position(|window| window == needle_bytes)
113}
114
115pub fn fast_byte_search(haystack: &str, needle: u8) -> Option<usize> {
116 haystack.bytes().position(|b| b == needle)
117}
118
119#[derive(Clone)]
120pub struct SimpleDFA {
121 literal: Option<String>,
122
123 char_class: Option<CharRange>,
124}
125
126impl SimpleDFA {
127 pub fn new(pattern: &Ast) -> Option<Self> {
128 match analyze_pattern(pattern) {
129 OptimizedPattern::LiteralChar(c) => Some(Self {
130 literal: Some(c.to_string()),
131 char_class: None,
132 }),
133 OptimizedPattern::LiteralString(s) => Some(Self {
134 literal: Some(s),
135 char_class: None,
136 }),
137 OptimizedPattern::CharClass(class) => Some(Self {
138 literal: None,
139 char_class: Some(class),
140 }),
141 _ => None,
142 }
143 }
144
145 pub fn find(&self, input: &str) -> Option<(usize, usize)> {
146 if let Some(ref literal) = self.literal {
147 fast_literal_search(input, literal).map(|pos| (pos, pos + literal.len()))
148 } else if let Some(ref class) = self.char_class {
149 for (pos, c) in input.char_indices() {
150 if class.contains(c as u32) {
151 let end = pos + c.len_utf8();
152 return Some((pos, end));
153 }
154 }
155 None
156 } else {
157 None
158 }
159 }
160
161 pub fn match_at(&self, input: &str, pos: usize) -> Option<usize> {
162 if pos >= input.len() {
163 return None;
164 }
165
166 if let Some(ref literal) = self.literal {
167 let remaining = &input[pos..];
168 if remaining.starts_with(literal) {
169 Some(pos + literal.len())
170 } else {
171 None
172 }
173 } else if let Some(ref class) = self.char_class {
174 let remaining = &input[pos..];
175 if let Some(c) = remaining.chars().next() {
176 if class.contains(c as u32) {
177 Some(pos + c.len_utf8())
178 } else {
179 None
180 }
181 } else {
182 None
183 }
184 } else {
185 None
186 }
187 }
188}
189
190pub struct MatchStats {
191 pub attempts: u64,
192
193 pub successes: u64,
194
195 pub backtracks: u64,
196}
197
198impl MatchStats {
199 pub fn new() -> Self {
200 Self {
201 attempts: 0,
202 successes: 0,
203 backtracks: 0,
204 }
205 }
206
207 pub fn should_use_dfa(&self) -> bool {
208 if self.attempts < 100 {
209 return false;
210 }
211 let backtrack_ratio = self.backtracks as f64 / self.attempts as f64;
212 backtrack_ratio < 0.1
213 }
214}
215
216impl Default for MatchStats {
217 fn default() -> Self {
218 Self::new()
219 }
220}
221
222#[cfg(test)]
223mod tests {
224 use super::*;
225
226 #[test]
227 fn test_char_search() {
228 assert_eq!(fast_char_search("hello", 'l'), Some(2));
229 assert_eq!(fast_char_search("hello", 'x'), None);
230 }
231
232 #[test]
233 fn test_boyer_moore() {
234 let text = "The quick brown fox jumps over the lazy dog";
235 assert_eq!(boyer_moore_search(text, "fox"), Some(16));
236 assert_eq!(boyer_moore_search(text, "cat"), None);
237 }
238
239 #[test]
240 fn test_naive_search() {
241 assert_eq!(naive_search("hello", "ll"), Some(2));
242 assert_eq!(naive_search("hello", "abc"), None);
243 }
244
245 #[test]
246 fn test_dfa_literal() {
247 let ast = Ast::Char('a');
248 let dfa = SimpleDFA::new(&ast).unwrap();
249 assert_eq!(dfa.find("abc"), Some((0, 1)));
250 assert_eq!(dfa.find("bbc"), None);
251 }
252
253 #[test]
254 fn test_dfa_match_at() {
255 let ast = Ast::Char('a');
256 let dfa = SimpleDFA::new(&ast).unwrap();
257 assert_eq!(dfa.match_at("abc", 0), Some(1));
258 assert_eq!(dfa.match_at("abc", 1), None);
259 }
260}