rust_rule_engine/parser/
simd_search.rs1use aho_corasick::AhoCorasick;
2use memchr;
13
14#[inline]
18pub fn find_byte_simd(haystack: &[u8], needle: u8) -> Option<usize> {
19 memchr::memchr(needle, haystack)
20}
21
22#[inline]
26pub fn find_either_byte_simd(haystack: &[u8], byte1: u8, byte2: u8) -> Option<usize> {
27 memchr::memchr2(byte1, byte2, haystack)
28}
29
30#[inline]
32pub fn find_any_of_three_simd(haystack: &[u8], byte1: u8, byte2: u8, byte3: u8) -> Option<usize> {
33 memchr::memchr3(byte1, byte2, byte3, haystack)
34}
35
36#[inline]
38pub fn find_newline_simd(haystack: &[u8]) -> Option<usize> {
39 memchr::memchr2(b'\r', b'\n', haystack)
40}
41
42pub fn parse_rule_header_simd(text: &str) -> Option<(String, usize)> {
46 let bytes = text.as_bytes();
47
48 let mut pos = 0;
50 while pos < bytes.len() {
51 pos += memchr::memchr(b'r', &bytes[pos..])?;
53
54 if pos + 4 <= bytes.len() && &bytes[pos..pos + 4] == b"rule" {
56 if pos > 0 && bytes[pos - 1].is_ascii_alphanumeric() {
58 pos += 1;
59 continue;
60 }
61
62 if pos + 4 < bytes.len() && bytes[pos + 4].is_ascii_alphanumeric() {
64 pos += 1;
65 continue;
66 }
67
68 let after_rule = &text[pos + 4..];
70 let name_start = after_rule.find(|c: char| !c.is_whitespace())?;
71 let after_ws = &after_rule[name_start..];
72
73 if after_ws.starts_with('"') {
75 let end_quote = memchr::memchr(b'"', &after_ws.as_bytes()[1..])?;
76 let name = after_ws[1..end_quote + 1].to_string();
77 let consumed = pos + 4 + name_start + end_quote + 2;
78 return Some((name, consumed));
79 }
80
81 let name_end = after_ws
83 .find(|c: char| !c.is_alphanumeric() && c != '_')
84 .unwrap_or(after_ws.len());
85
86 if name_end > 0 {
87 let name = after_ws[..name_end].to_string();
88 let consumed = pos + 4 + name_start + name_end;
89 return Some((name, consumed));
90 }
91 }
92
93 pos += 1;
94 }
95
96 None
97}
98
99pub fn find_then_keyword_simd(text: &str) -> Option<usize> {
103 let bytes = text.as_bytes();
104 let mut pos = 0;
105 let mut brace_depth = 0;
106 let mut paren_depth = 0;
107 let mut in_string = false;
108
109 while pos < bytes.len() {
110 let search_result = memchr::memchr3(b't', b'"', b'{', &bytes[pos..]);
112
113 if let Some(offset) = search_result {
114 pos += offset;
115
116 match bytes[pos] {
117 b'"' if !in_string => {
118 in_string = true;
119 pos += 1;
120 }
121 b'"' if in_string => {
122 if pos > 0 && bytes[pos - 1] == b'\\' {
124 pos += 1;
125 continue;
126 }
127 in_string = false;
128 pos += 1;
129 }
130 b'{' if !in_string => {
131 brace_depth += 1;
132 pos += 1;
133 }
134 b'}' if !in_string => {
135 brace_depth -= 1;
136 pos += 1;
137 }
138 b't' if !in_string && brace_depth == 0 && paren_depth == 0 => {
139 if pos + 4 <= bytes.len() && &bytes[pos..pos + 4] == b"then" {
141 let before_ok = pos == 0 || !bytes[pos - 1].is_ascii_alphanumeric();
143 let after_ok =
144 pos + 4 >= bytes.len() || !bytes[pos + 4].is_ascii_alphanumeric();
145 if before_ok && after_ok {
146 return Some(pos);
147 }
148 }
149 pos += 1;
150 }
151 _ => pos += 1,
152 }
153 } else {
154 break;
155 }
156
157 while pos < bytes.len() {
159 if bytes[pos] == b'(' && !in_string {
160 paren_depth += 1;
161 } else if bytes[pos] == b')' && !in_string {
162 paren_depth -= 1;
163 } else if memchr::memchr3(b't', b'"', b'{', &bytes[pos..pos + 1]).is_some() {
164 break;
165 }
166 pos += 1;
167 }
168 }
169
170 None
171}
172
173pub fn find_keywords_simd<'a>(text: &str, keywords: &'a [&str]) -> Vec<(usize, &'a str)> {
177 if keywords.is_empty() {
178 return Vec::new();
179 }
180
181 let ac = AhoCorasick::new(keywords).unwrap();
183
184 ac.find_iter(text)
186 .map(|mat| (mat.start(), keywords[mat.pattern().as_usize()]))
187 .collect()
188}
189
190pub fn count_lines_simd(text: &str) -> usize {
194 let bytes = text.as_bytes();
195 let mut count = 0;
196 let mut pos = 0;
197
198 while pos < bytes.len() {
199 if let Some(offset) = memchr::memchr2(b'\r', b'\n', &bytes[pos..]) {
200 pos += offset;
201
202 if bytes[pos] == b'\r' && pos + 1 < bytes.len() && bytes[pos + 1] == b'\n' {
204 pos += 2;
205 } else {
206 pos += 1;
207 }
208 count += 1;
209 } else {
210 break;
211 }
212 }
213
214 count
215}
216
217pub fn skip_whitespace_simd(text: &str) -> usize {
221 let bytes = text.as_bytes();
222
223 for (i, &byte) in bytes.iter().enumerate() {
225 if !matches!(byte, b' ' | b'\t' | b'\r' | b'\n') {
226 return i;
227 }
228 }
229
230 text.len()
231}
232
233pub fn extract_identifier_simd(text: &str) -> Option<String> {
237 let bytes = text.as_bytes();
238
239 if bytes.is_empty() {
240 return None;
241 }
242
243 if !bytes[0].is_ascii_alphabetic() && bytes[0] != b'_' {
245 return None;
246 }
247
248 let mut end = 1;
250 while end < bytes.len() {
251 let byte = bytes[end];
252 if !byte.is_ascii_alphanumeric() && byte != b'_' {
253 break;
254 }
255 end += 1;
256 }
257
258 Some(text[..end].to_string())
259}
260
261pub fn split_into_rules_simd(grl_text: &str) -> Vec<String> {
265 let bytes = grl_text.as_bytes();
266 let mut rules = Vec::new();
267 let mut pos = 0;
268
269 while pos < bytes.len() {
270 if let Some(offset) = memchr::memchr(b'r', &bytes[pos..]) {
272 let rule_pos = pos + offset;
273
274 if rule_pos + 5 <= bytes.len() && &bytes[rule_pos..rule_pos + 5] == b"rule " {
276 if let Some(brace_offset) = memchr::memchr(b'{', &bytes[rule_pos..]) {
278 let brace_pos = rule_pos + brace_offset;
279
280 if let Some(close_pos) = find_matching_brace_simd(grl_text, brace_pos) {
282 let rule_text = &grl_text[rule_pos..=close_pos];
283 rules.push(rule_text.to_string());
284 pos = close_pos + 1;
285 continue;
286 }
287 }
288 }
289
290 pos = rule_pos + 1;
291 } else {
292 break;
293 }
294 }
295
296 rules
297}
298
299pub fn find_matching_brace_simd(text: &str, open_pos: usize) -> Option<usize> {
303 let bytes = text.as_bytes();
304
305 if open_pos >= bytes.len() || bytes[open_pos] != b'{' {
306 return None;
307 }
308
309 let mut depth = 1;
310 let mut pos = open_pos + 1;
311 let mut in_string = false;
312 let mut escape_next = false;
313
314 while pos < bytes.len() {
315 let search = if in_string {
317 memchr::memchr2(b'"', b'\\', &bytes[pos..])
318 } else {
319 memchr::memchr3(b'{', b'}', b'"', &bytes[pos..])
320 };
321
322 if let Some(offset) = search {
323 pos += offset;
324
325 if escape_next {
326 escape_next = false;
327 pos += 1;
328 continue;
329 }
330
331 match bytes[pos] {
332 b'\\' if in_string => escape_next = true,
333 b'"' => in_string = !in_string,
334 b'{' if !in_string => depth += 1,
335 b'}' if !in_string => {
336 depth -= 1;
337 if depth == 0 {
338 return Some(pos);
339 }
340 }
341 _ => {}
342 }
343
344 pos += 1;
345 } else {
346 break;
347 }
348 }
349
350 None
351}
352
353#[cfg(test)]
354mod tests {
355 use super::*;
356
357 #[test]
358 fn test_find_byte_simd() {
359 assert_eq!(find_byte_simd(b"hello world", b'w'), Some(6));
360 assert_eq!(find_byte_simd(b"hello world", b'x'), None);
361 }
362
363 #[test]
364 fn test_find_either_byte_simd() {
365 assert_eq!(find_either_byte_simd(b"hello world", b'w', b'l'), Some(2));
366 assert_eq!(find_either_byte_simd(b"hello world", b'x', b'y'), None);
367 }
368
369 #[test]
370 fn test_parse_rule_header_simd() {
371 let (name, _) = parse_rule_header_simd(r#"rule "MyRule" {"#).unwrap();
372 assert_eq!(name, "MyRule");
373
374 let (name2, _) = parse_rule_header_simd("rule SimpleRule {").unwrap();
375 assert_eq!(name2, "SimpleRule");
376 }
377
378 #[test]
379 fn test_find_then_keyword_simd() {
380 let text = "when X > 5 then Y = 10";
381 let pos = find_then_keyword_simd(text).unwrap();
382 assert_eq!(&text[pos..pos + 4], "then");
383 }
384
385 #[test]
386 fn test_count_lines_simd() {
387 assert_eq!(count_lines_simd("line1\nline2\nline3"), 2);
388 assert_eq!(count_lines_simd("line1\r\nline2\r\nline3"), 2);
389 assert_eq!(count_lines_simd("single line"), 0);
390 }
391
392 #[test]
393 fn test_skip_whitespace_simd() {
394 assert_eq!(skip_whitespace_simd(" hello"), 3);
395 assert_eq!(skip_whitespace_simd("\t\n world"), 4);
396 assert_eq!(skip_whitespace_simd("no_space"), 0);
397 }
398
399 #[test]
400 fn test_extract_identifier_simd() {
401 assert_eq!(
402 extract_identifier_simd("hello world"),
403 Some("hello".to_string())
404 );
405 assert_eq!(
406 extract_identifier_simd("_test123"),
407 Some("_test123".to_string())
408 );
409 assert_eq!(extract_identifier_simd("123invalid"), None);
410 }
411
412 #[test]
413 fn test_split_into_rules_simd() {
414 let grl = r#"
415rule "Rule1" { when X > 5 then Y = 10 }
416rule "Rule2" { when A < 3 then B = 7 }
417 "#;
418 let rules = split_into_rules_simd(grl);
419 assert_eq!(rules.len(), 2);
420 assert!(rules[0].contains("Rule1"));
421 assert!(rules[1].contains("Rule2"));
422 }
423
424 #[test]
425 fn test_find_matching_brace_simd() {
426 let text = "{ nested { braces } here }";
427 let close = find_matching_brace_simd(text, 0).unwrap();
428 assert_eq!(text.chars().nth(close).unwrap(), '}');
429 assert_eq!(close, text.len() - 1);
430 }
431
432 #[test]
433 fn test_find_keywords_simd() {
434 let text = "when X > 5 then Y = 10 and Z = 20";
435 let keywords = vec!["when", "then", "and"];
436 let matches = find_keywords_simd(text, &keywords);
437
438 assert_eq!(matches.len(), 3);
439 assert_eq!(matches[0].1, "when");
440 assert_eq!(matches[1].1, "then");
441 assert_eq!(matches[2].1, "and");
442 }
443}