1#![allow(dead_code)]
9#![allow(clippy::clone_on_copy)]
10#![allow(clippy::match_like_matches_macro)]
11#![allow(clippy::type_complexity)]
12#![allow(clippy::upper_case_acronyms)]
13#![allow(clippy::redundant_closure)]
14#![allow(clippy::len_without_is_empty)]
15#![allow(clippy::too_many_arguments)]
16#![allow(clippy::comparison_chain)]
17#![allow(clippy::manual_range_contains)]
18#![allow(clippy::large_enum_variant)]
19#![allow(clippy::manual_strip)]
20#![allow(clippy::needless_range_loop)]
21#![allow(clippy::string_slice)]
22#![allow(clippy::needless_late_init)]
23#![allow(clippy::manual_is_ascii_check)]
24#![allow(clippy::sliced_string_as_bytes)]
25mod advanced; mod engine; pub mod optimization; mod parser; use aho_corasick::AhoCorasick;
146use memchr::memmem;
147use std::collections::HashMap;
148use std::sync::{Mutex, OnceLock};
149
150use advanced::{Lookaround, LookaroundType};
152use engine::DFA;
153use parser::{
154 is_sequence_pattern, parse_escape, parse_quantified_pattern, parse_sequence,
155 starts_with_escape, BoundaryType, CharClass, Flags, Group, QuantifiedPattern, Sequence,
156};
157
158pub use advanced::{CaptureGroup, Captures};
160pub use optimization::{literal, prefilter};
161
162#[derive(Debug, Clone)]
164pub struct Pattern {
165 matcher: Matcher,
166 prefilter: Option<(
167 optimization::prefilter::Prefilter,
168 optimization::literal::LiteralKind,
169 )>,
170 fast_path: Option<optimization::fast_path::FastPath>, #[allow(dead_code)]
172 flags: Flags, }
174
175pub type ReXile = Pattern;
177
178#[inline]
180fn safe_slice(text: &str, start: usize) -> Option<&str> {
181 text.get(start..)
182}
183
184#[inline]
185fn safe_slice_range(text: &str, start: usize, end: usize) -> Option<&str> {
186 text.get(start..end)
187}
188
189#[inline]
191#[allow(dead_code)]
192fn char_boundaries(text: &str, start_pos: usize) -> impl Iterator<Item = usize> + '_ {
193 (start_pos..=text.len()).filter(|&i| text.is_char_boundary(i))
194}
195
196impl Pattern {
197 pub fn new(pattern: &str) -> Result<Self, PatternError> {
198 let (flags, effective_pattern) =
200 if let Some((parsed_flags, rest)) = Flags::parse_from_pattern(pattern) {
201 (parsed_flags, rest)
202 } else {
203 (Flags::new(), pattern)
204 };
205
206 let has_start_anchor = effective_pattern.starts_with('^');
208 let has_end_anchor =
209 effective_pattern.ends_with('$') && !effective_pattern.ends_with("\\$");
210
211 let inner_pattern = {
213 let mut p = effective_pattern;
214 if has_start_anchor {
215 p = p.strip_prefix('^').unwrap_or(p);
216 }
217 if has_end_anchor {
218 p = p.strip_suffix('$').unwrap_or(p);
219 }
220 p
221 };
222
223 let has_captures = inner_pattern.contains('(')
225 && !inner_pattern.contains("(?:")
226 && !inner_pattern.contains("(?=")
227 && !inner_pattern.contains("(?!")
228 && !inner_pattern.contains("(?<=")
229 && !inner_pattern.contains("(?<!");
230
231 let inner_ast = if has_captures {
233 parse_pattern_with_captures_with_flags(inner_pattern, &flags)?
234 } else {
235 parse_pattern_with_flags(inner_pattern, &flags)?
236 };
237
238 let ast = if has_start_anchor || has_end_anchor {
240 Ast::AnchoredPattern {
241 inner: Box::new(inner_ast),
242 start: has_start_anchor,
243 end: has_end_anchor,
244 }
245 } else {
246 inner_ast
247 };
248 let mut matcher = compile_ast(&ast)?;
249
250 if flags.case_insensitive && !matches!(matcher, Matcher::CaseInsensitive(_)) {
252 matcher = Matcher::CaseInsensitive(Box::new(matcher));
253 }
254
255 let fast_path = if flags.any_set() {
258 None
259 } else {
260 if let Matcher::PatternWithCaptures { ref elements, .. } = matcher {
262 if let Some(dfa) = engine::capture_dfa::compile_capture_pattern(elements) {
264 Some(optimization::fast_path::FastPath::CaptureDFA(
266 std::sync::Arc::new(dfa),
267 ))
268 } else {
269 optimization::fast_path::detect_fast_path(effective_pattern)
271 }
272 } else {
273 optimization::fast_path::detect_fast_path(effective_pattern)
275 }
276 };
277
278 let literals = optimization::literal::extract_from_pattern(effective_pattern);
280
281 let has_groups = effective_pattern.contains("(?:")
286 || (effective_pattern.contains('(') && !effective_pattern.contains("(?"));
287 let prefilter = if !literals.is_empty()
288 && literals.kind == optimization::literal::LiteralKind::Prefix
289 && !has_groups
290 && !flags.any_set()
291 {
292 let pf = optimization::prefilter::Prefilter::from_literals(&literals);
293 if pf.is_available() {
294 Some((pf, literals.kind))
295 } else {
296 None
297 }
298 } else {
299 None
300 };
301
302 Ok(Pattern {
303 matcher,
304 prefilter,
305 fast_path,
306 flags,
307 })
308 }
309
310 pub fn is_match(&self, text: &str) -> bool {
311 if let Some(ref fp) = self.fast_path {
313 return fp.find(text).is_some();
314 }
315
316 if let Some((ref prefilter, literal_kind)) = self.prefilter {
318 return self.is_match_with_prefilter(text, prefilter, literal_kind);
319 }
320
321 self.matcher.is_match(text)
323 }
324
325 fn is_match_with_prefilter(
327 &self,
328 text: &str,
329 prefilter: &prefilter::Prefilter,
330 literal_kind: literal::LiteralKind,
331 ) -> bool {
332 let bytes = text.as_bytes();
333
334 let max_lookback = match literal_kind {
336 literal::LiteralKind::Prefix => 10, literal::LiteralKind::Inner => 30, literal::LiteralKind::Suffix => 50, literal::LiteralKind::None => return self.matcher.is_match(text),
340 };
341
342 for candidate_pos in prefilter.candidates(bytes) {
343 let lookback = candidate_pos.min(max_lookback);
344
345 for offset in 0..=lookback {
346 let start_pos = candidate_pos - offset;
347 if self
348 .matcher
349 .is_match(safe_slice(text, start_pos).unwrap_or(""))
350 {
351 return true;
352 }
353 }
354 }
355
356 false
357 }
358
359 pub fn find(&self, text: &str) -> Option<(usize, usize)> {
360 if let Some(ref fp) = self.fast_path {
362 return fp.find(text);
363 }
364
365 if let Some((ref prefilter, literal_kind)) = self.prefilter {
367 return self.find_with_prefilter(text, prefilter, literal_kind);
368 }
369
370 self.matcher.find(text)
372 }
373
374 fn find_with_prefilter(
376 &self,
377 text: &str,
378 prefilter: &prefilter::Prefilter,
379 literal_kind: literal::LiteralKind,
380 ) -> Option<(usize, usize)> {
381 let bytes = text.as_bytes();
382 let mut earliest_match: Option<(usize, usize)> = None;
383
384 let max_lookback = match literal_kind {
386 literal::LiteralKind::Prefix => 10,
387 literal::LiteralKind::Inner => 30,
388 literal::LiteralKind::Suffix => 50,
389 literal::LiteralKind::None => return self.matcher.find(text),
390 };
391
392 for candidate_pos in prefilter.candidates(bytes) {
394 if let Some((start, _)) = earliest_match {
396 if start < candidate_pos {
397 return earliest_match;
398 }
399 }
400
401 let lookback = candidate_pos.min(max_lookback);
402
403 for offset in 0..=lookback {
404 let start_pos = candidate_pos - offset;
405
406 if let Some((match_start, match_end)) =
408 self.matcher.find(safe_slice(text, start_pos).unwrap_or(""))
409 {
410 let abs_start = start_pos + match_start;
411 let abs_end = start_pos + match_end;
412
413 if earliest_match.is_none() || abs_start < earliest_match.unwrap().0 {
415 earliest_match = Some((abs_start, abs_end));
416 }
417 break;
418 }
419 }
420 }
421
422 earliest_match
423 }
424
425 pub fn find_all(&self, text: &str) -> Vec<(usize, usize)> {
426 if let Some(ref fp) = self.fast_path {
428 return fp.find_all(text);
429 }
430
431 match &self.matcher {
433 Matcher::Literal(lit) => {
434 memmem::find_iter(text.as_bytes(), lit.as_bytes())
436 .map(|pos| (pos, pos + lit.len()))
437 .collect()
438 }
439 Matcher::MultiLiteral(ac) => {
440 ac.find_iter(text)
442 .map(|mat| (mat.start(), mat.end()))
443 .collect()
444 }
445 Matcher::Sequence(seq) => {
446 seq.find_all(text)
448 }
449 _ => {
450 self.find_iter(text).map(|m| (m.start(), m.end())).collect()
452 }
453 }
454 }
455
456 pub fn find_iter<'a>(&'a self, text: &'a str) -> FindIter<'a> {
458 FindIter {
459 matcher: &self.matcher,
460 fast_path: &self.fast_path,
461 text,
462 pos: 0,
463 }
464 }
465
466 pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
484 if let Matcher::PatternWithCaptures {
486 elements,
487 total_groups,
488 } = &self.matcher
489 {
490 for start_pos in 0..=text.len() {
492 if let Some((end_pos, capture_list)) =
493 Matcher::match_elements_with_backtrack_and_captures(text, start_pos, elements)
494 {
495 if end_pos > start_pos || elements.is_empty() {
496 let mut caps = Captures::new(text, (start_pos, end_pos), *total_groups);
498
499 for (group_num, cap_start, cap_end) in capture_list {
501 caps.set(group_num, cap_start, cap_end);
502 }
503
504 return Some(caps);
505 }
506 }
507 }
508 None
509 } else if let Matcher::Capture(inner_matcher, group_index) = &self.matcher {
510 let total_groups =
512 if let Matcher::PatternWithCaptures { total_groups, .. } = **inner_matcher {
513 total_groups
514 } else {
515 *group_index };
517
518 if let Some((start, end)) = inner_matcher.find(text) {
519 let mut caps = Captures::new(text, (start, end), total_groups);
520
521 caps.set(*group_index, start, end);
523
524 let nested = inner_matcher.extract_nested_captures(text, start);
526 for (group_num, cap_start, cap_end) in nested {
527 caps.set(group_num, cap_start, cap_end);
528 }
529
530 Some(caps)
531 } else {
532 None
533 }
534 } else if let Matcher::AnchoredPattern { inner, start, end } = &self.matcher {
535 if let Matcher::PatternWithCaptures {
538 elements,
539 total_groups,
540 } = inner.as_ref()
541 {
542 let check_anchor = |match_start: usize, match_end: usize| -> bool {
544 let start_ok = !*start || match_start == 0;
545 let end_ok = !*end || match_end == text.len();
546 start_ok && end_ok
547 };
548
549 for start_pos in 0..=text.len() {
551 if *start && start_pos != 0 {
553 continue;
554 }
555
556 if let Some((end_pos, capture_list)) =
557 Matcher::match_elements_with_backtrack_and_captures(
558 text, start_pos, elements,
559 )
560 {
561 if (end_pos > start_pos || elements.is_empty())
562 && check_anchor(start_pos, end_pos)
563 {
564 let mut caps = Captures::new(text, (start_pos, end_pos), *total_groups);
566
567 for (group_num, cap_start, cap_end) in capture_list {
569 caps.set(group_num, cap_start, cap_end);
570 }
571
572 return Some(caps);
573 }
574 }
575 }
576 None
577 } else {
578 self.find(text).map(|(match_start, match_end)| {
580 Captures::new(text, (match_start, match_end), 0)
581 })
582 }
583 } else {
584 self.find(text)
586 .map(|(start, end)| Captures::new(text, (start, end), 0))
587 }
588 }
589
590 pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CapturesIter<'r, 't> {
604 CapturesIter {
605 pattern: self,
606 text,
607 pos: 0,
608 }
609 }
610
611 pub fn replace(&self, text: &str, replacement: &str) -> String {
629 let has_captures = replacement.contains('$');
631
632 if !has_captures {
633 if let Some((start, end)) = self.find(text) {
635 let mut result = String::new();
636 result.push_str(&text[..start]);
637 result.push_str(replacement);
638 result.push_str(&text[end..]);
639 result
640 } else {
641 text.to_string()
643 }
644 } else {
645 if let Some(caps) = self.captures(text) {
647 let match_start = caps.pos(0).unwrap().0;
648 let match_end = caps.pos(0).unwrap().1;
649
650 let mut result = String::new();
651 result.push_str(&text[..match_start]);
652
653 let mut chars = replacement.chars().peekable();
655 while let Some(ch) = chars.next() {
656 if ch == '$' {
657 if let Some(&next_ch) = chars.peek() {
659 if next_ch.is_ascii_digit() {
660 chars.next(); let group_num = next_ch.to_digit(10).unwrap() as usize;
662
663 if let Some(group_text) = caps.get(group_num) {
665 result.push_str(group_text);
666 }
667 } else {
668 result.push('$');
669 }
670 } else {
671 result.push('$');
672 }
673 } else {
674 result.push(ch);
675 }
676 }
677
678 result.push_str(&text[match_end..]);
679 result
680 } else {
681 text.to_string()
683 }
684 }
685 }
686
687 pub fn replace_all(&self, text: &str, replacement: &str) -> String {
700 let has_captures = replacement.contains('$');
702
703 if !has_captures {
704 let mut result = String::new();
706 let mut last_end = 0;
707
708 for (start, end) in self.find_all(text) {
709 result.push_str(&text[last_end..start]);
710 result.push_str(replacement);
711 last_end = end;
712 }
713 result.push_str(&text[last_end..]);
714 return result;
715 }
716
717 let mut result = String::new();
719 let mut last_end = 0;
720
721 for caps in self.captures_iter(text) {
722 let _full_match = caps.get(0).unwrap();
723 let match_start = caps.pos(0).unwrap().0;
724 let match_end = caps.pos(0).unwrap().1;
725
726 result.push_str(&text[last_end..match_start]);
728
729 let mut chars = replacement.chars().peekable();
731 while let Some(ch) = chars.next() {
732 if ch == '$' {
733 if let Some(&next_ch) = chars.peek() {
735 if next_ch.is_ascii_digit() {
736 chars.next(); let group_num = next_ch.to_digit(10).unwrap() as usize;
738
739 if let Some(group_text) = caps.get(group_num) {
741 result.push_str(group_text);
742 }
743 } else {
745 result.push('$');
747 }
748 } else {
749 result.push('$');
751 }
752 } else {
753 result.push(ch);
754 }
755 }
756
757 last_end = match_end;
758 }
759
760 result.push_str(&text[last_end..]);
762 result
763 }
764
765 pub fn split<'r, 't>(&'r self, text: &'t str) -> SplitIter<'r, 't> {
776 SplitIter {
777 pattern: self,
778 text,
779 pos: 0,
780 finished: false,
781 }
782 }
783}
784
785#[derive(Debug, Clone, Copy, PartialEq, Eq)]
790pub struct Match<'t> {
791 text: &'t str,
792 start: usize,
793 end: usize,
794}
795
796impl<'t> Match<'t> {
797 #[inline]
799 pub fn new(text: &'t str, start: usize, end: usize) -> Self {
800 Self { text, start, end }
801 }
802
803 #[inline]
805 pub fn start(&self) -> usize {
806 self.start
807 }
808
809 #[inline]
811 pub fn end(&self) -> usize {
812 self.end
813 }
814
815 #[inline]
817 pub fn as_str(&self) -> &'t str {
818 &self.text[self.start..self.end]
819 }
820
821 #[inline]
823 pub fn range(&self) -> std::ops::Range<usize> {
824 self.start..self.end
825 }
826
827 #[inline]
829 pub fn len(&self) -> usize {
830 self.end - self.start
831 }
832
833 #[inline]
835 pub fn is_empty(&self) -> bool {
836 self.start == self.end
837 }
838}
839
840pub struct FindIter<'a> {
842 matcher: &'a Matcher,
843 fast_path: &'a Option<optimization::fast_path::FastPath>,
844 text: &'a str,
845 pos: usize,
846}
847
848impl<'a> Iterator for FindIter<'a> {
849 type Item = Match<'a>;
850
851 fn next(&mut self) -> Option<Self::Item> {
852 if self.pos >= self.text.len() {
854 return None;
855 }
856
857 if let Some(ref fast_path) = self.fast_path {
859 if let Some((start, end)) = fast_path.find_at(self.text, self.pos) {
860 self.pos = end.max(self.pos + 1);
862 return Some(Match::new(self.text, start, end));
863 } else {
864 return None;
866 }
867 }
868
869 let remaining = &self.text[self.pos..];
871 if let Some((rel_start, rel_end)) = self.matcher.find(remaining) {
872 let abs_start = self.pos + rel_start;
873 let abs_end = self.pos + rel_end;
874
875 self.pos = abs_end.max(self.pos + 1);
877
878 Some(Match::new(self.text, abs_start, abs_end))
879 } else {
880 None
881 }
882 }
883}
884
885pub struct CapturesIter<'r, 't> {
887 pattern: &'r Pattern,
888 text: &'t str,
889 pos: usize,
890}
891
892impl<'r, 't> Iterator for CapturesIter<'r, 't> {
893 type Item = Captures<'t>;
894
895 fn next(&mut self) -> Option<Self::Item> {
896 if self.pos >= self.text.len() {
897 return None;
898 }
899
900 if let Matcher::PatternWithCaptures {
902 elements,
903 total_groups,
904 } = &self.pattern.matcher
905 {
906 let remaining = &self.text[self.pos..];
908
909 let char_indices: Vec<usize> = remaining.char_indices().map(|(i, _)| i).collect();
911 let search_positions: Vec<usize> = if char_indices.is_empty() {
912 vec![0]
913 } else {
914 char_indices
915 .into_iter()
916 .chain(std::iter::once(remaining.len()))
917 .collect()
918 };
919
920 for &start_offset in &search_positions {
921 if start_offset >= remaining.len() {
922 break;
923 }
924
925 let mut pos = start_offset;
926 let mut capture_positions: Vec<(usize, usize)> = Vec::new();
927 let mut all_matched = true;
928
929 for element in elements {
930 let (matcher, group_num_opt) = match element {
931 CompiledCaptureElement::Capture(m, num) => (m, Some(*num)),
932 CompiledCaptureElement::NonCapture(m) => (m, None),
933 };
934
935 if let Some((rel_start, rel_end)) = matcher.find(&remaining[pos..]) {
936 if rel_start != 0 {
937 all_matched = false;
939 break;
940 }
941
942 let abs_start = pos;
943 let abs_end = pos + rel_end;
944
945 if let Some(group_num) = group_num_opt {
947 while capture_positions.len() < group_num {
949 capture_positions.push((0, 0));
950 }
951 capture_positions[group_num - 1] = (abs_start, abs_end);
952 }
953
954 pos = abs_end;
955 } else {
956 all_matched = false;
957 break;
958 }
959 }
960
961 if all_matched {
962 let abs_start = self.pos + start_offset;
964 let abs_end = self.pos + pos;
965
966 self.pos = abs_end.max(self.pos + 1);
968
969 let mut caps = Captures::new(self.text, (abs_start, abs_end), *total_groups);
971
972 for (i, &(start, end)) in capture_positions.iter().enumerate() {
974 caps.set(i + 1, self.pos - pos + start, self.pos - pos + end);
975 }
976
977 return Some(caps);
978 }
979 }
980 None
981 } else {
982 let remaining = &self.text[self.pos..];
984 if let Some((rel_start, rel_end)) = self.pattern.matcher.find(remaining) {
985 let abs_start = self.pos + rel_start;
986 let abs_end = self.pos + rel_end;
987
988 self.pos = abs_end.max(self.pos + 1);
990
991 Some(Captures::new(self.text, (abs_start, abs_end), 0))
993 } else {
994 None
995 }
996 }
997 }
998}
999
1000pub struct SplitIter<'r, 't> {
1002 pattern: &'r Pattern,
1003 text: &'t str,
1004 pos: usize,
1005 finished: bool,
1006}
1007
1008impl<'r, 't> Iterator for SplitIter<'r, 't> {
1009 type Item = &'t str;
1010
1011 fn next(&mut self) -> Option<Self::Item> {
1012 if self.finished {
1013 return None;
1014 }
1015
1016 let remaining = &self.text[self.pos..];
1018 if let Some((rel_start, rel_end)) = self.pattern.matcher.find(remaining) {
1019 let abs_start = self.pos + rel_start;
1020 let abs_end = self.pos + rel_end;
1021
1022 let result = &self.text[self.pos..abs_start];
1024 self.pos = abs_end;
1025
1026 Some(result)
1027 } else {
1028 self.finished = true;
1030 Some(&self.text[self.pos..])
1031 }
1032 }
1033}
1034
1035static CACHE: OnceLock<Mutex<HashMap<String, Pattern>>> = OnceLock::new();
1036
1037fn get_cache() -> &'static Mutex<HashMap<String, Pattern>> {
1038 CACHE.get_or_init(|| Mutex::new(HashMap::new()))
1039}
1040
1041pub fn get_pattern(pattern: &str) -> Result<Pattern, PatternError> {
1042 let mut cache = get_cache().lock().unwrap();
1043 if let Some(p) = cache.get(pattern) {
1044 return Ok(p.clone());
1045 }
1046 let compiled = Pattern::new(pattern)?;
1047 cache.insert(pattern.to_string(), compiled.clone());
1048 Ok(compiled)
1049}
1050
1051pub fn is_match(pattern: &str, text: &str) -> Result<bool, PatternError> {
1052 Ok(get_pattern(pattern)?.is_match(text))
1053}
1054
1055pub fn find(pattern: &str, text: &str) -> Result<Option<(usize, usize)>, PatternError> {
1056 Ok(get_pattern(pattern)?.find(text))
1057}
1058
1059#[derive(Debug, Clone, PartialEq, Eq)]
1060pub enum PatternError {
1061 ParseError(String),
1062 UnsupportedFeature(String),
1063}
1064
1065impl std::fmt::Display for PatternError {
1066 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1067 match self {
1068 PatternError::ParseError(msg) => write!(f, "Parse error: {}", msg),
1069 PatternError::UnsupportedFeature(msg) => write!(f, "Unsupported: {}", msg),
1070 }
1071 }
1072}
1073
1074impl std::error::Error for PatternError {}
1075
1076#[derive(Debug, Clone, PartialEq)]
1077enum Ast {
1078 Literal(String),
1079 Dot, DotAll, Alternation(Vec<String>),
1082 Anchored {
1083 literal: String,
1084 start: bool,
1085 end: bool,
1086 },
1087 AnchoredGroup {
1088 group: Group,
1089 start: bool,
1090 end: bool,
1091 },
1092 AnchoredPattern {
1093 inner: Box<Ast>,
1094 start: bool,
1095 end: bool,
1096 },
1097 CharClass(CharClass),
1098 Quantified(QuantifiedPattern),
1099 Sequence(Sequence),
1100 SequenceWithFlags(Sequence, Flags), Group(Group),
1102 Boundary(BoundaryType), Lookaround(Lookaround), Capture(Box<Ast>, usize), QuantifiedCapture(Box<Ast>, parser::quantifier::Quantifier), CombinedWithLookaround {
1107 prefix: Box<Ast>,
1108 lookaround: Lookaround,
1109 }, LookbehindWithSuffix {
1111 lookbehind: Lookaround,
1112 suffix: Box<Ast>,
1113 }, PatternWithCaptures {
1115 elements: Vec<CaptureElement>,
1116 total_groups: usize,
1117 }, AlternationWithCaptures {
1119 branches: Vec<Ast>,
1120 total_groups: usize,
1121 }, Backreference(usize), CaseInsensitive(Box<Ast>), }
1125
1126fn parse_pattern_with_groups(pattern: &str) -> Result<Ast, PatternError> {
1129 if pattern.matches('(').count() > 1 && !pattern.contains('|') {
1131 let mut combined_literals = Vec::new();
1132 let mut pos = 0;
1133 let mut all_parsed = true;
1134
1135 while pos < pattern.len() && pattern[pos..].starts_with('(') {
1136 match parser::group::parse_group(&pattern[pos..]) {
1137 Ok((group, bytes_consumed)) => {
1138 match &group.content {
1140 parser::group::GroupContent::Single(s) => {
1141 combined_literals.push(s.clone());
1142 }
1143 parser::group::GroupContent::Sequence(seq) => {
1144 let mut literal = String::new();
1146 let mut is_simple = true;
1147
1148 for elem in &seq.elements {
1149 match elem {
1150 crate::parser::sequence::SequenceElement::Char(ch) => {
1151 literal.push(*ch);
1152 }
1153 crate::parser::sequence::SequenceElement::Literal(lit) => {
1154 literal.push_str(lit);
1155 }
1156 _ => {
1157 is_simple = false;
1159 break;
1160 }
1161 }
1162 }
1163
1164 if is_simple {
1165 combined_literals.push(literal);
1166 } else {
1167 all_parsed = false;
1168 break;
1169 }
1170 }
1171 parser::group::GroupContent::Alternation(_)
1172 | parser::group::GroupContent::ParsedAlternation(_) => {
1173 all_parsed = false;
1175 break;
1176 }
1177 }
1178 pos += bytes_consumed;
1179 }
1180 Err(_) => {
1181 all_parsed = false;
1182 break;
1183 }
1184 }
1185 }
1186
1187 if all_parsed && pos == pattern.len() && !combined_literals.is_empty() {
1188 use crate::parser::sequence::{Sequence, SequenceElement};
1191
1192 let mut elements = Vec::new();
1193 for literal in combined_literals {
1194 elements.push(SequenceElement::Literal(literal));
1196 }
1197
1198 let seq = Sequence::new(elements);
1199 return Ok(Ast::Sequence(seq));
1200 }
1201 }
1202
1203 if pattern.starts_with("^(") || pattern.ends_with(")$") {
1205 let has_start = pattern.starts_with('^');
1206 let has_end = pattern.ends_with('$');
1207
1208 let mut inner = pattern;
1210 if has_start {
1211 inner = &inner[1..]; }
1213 if has_end {
1214 inner = &inner[..inner.len() - 1]; }
1216
1217 if inner.starts_with('(') {
1218 if let Ok((group, bytes_consumed)) = parser::group::parse_group(inner) {
1219 if bytes_consumed == inner.len() {
1220 let group_literal = match &group.content {
1222 parser::group::GroupContent::Single(s) => Some(s.clone()),
1223 parser::group::GroupContent::Sequence(seq) => {
1224 let mut literal = String::new();
1226 let mut is_simple = true;
1227
1228 for elem in &seq.elements {
1229 match elem {
1230 crate::parser::sequence::SequenceElement::Char(ch) => {
1231 literal.push(*ch);
1232 }
1233 crate::parser::sequence::SequenceElement::Literal(lit) => {
1234 literal.push_str(lit);
1235 }
1236 _ => {
1237 is_simple = false;
1239 break;
1240 }
1241 }
1242 }
1243
1244 if is_simple {
1245 Some(literal)
1246 } else {
1247 None
1248 }
1249 }
1250 parser::group::GroupContent::Alternation(_)
1251 | parser::group::GroupContent::ParsedAlternation(_) => {
1252 None
1254 }
1255 };
1256
1257 if let Some(lit) = group_literal {
1258 return Ok(Ast::Anchored {
1259 literal: lit,
1260 start: has_start,
1261 end: has_end,
1262 });
1263 } else {
1264 return Ok(Ast::AnchoredGroup {
1266 group,
1267 start: has_start,
1268 end: has_end,
1269 });
1270 }
1271 }
1272 }
1273 }
1274 }
1275
1276 if pattern.starts_with('(') {
1278 if let Ok((group, bytes_consumed)) = parser::group::parse_group(pattern) {
1279 if bytes_consumed == pattern.len() {
1280 return Ok(Ast::Group(group));
1281 }
1282
1283 if bytes_consumed < pattern.len() {
1285 let suffix = &pattern[bytes_consumed..];
1286 match &group.content {
1289 parser::group::GroupContent::Alternation(parts) => {
1290 let expanded: Vec<String> =
1291 parts.iter().map(|p| format!("{}{}", p, suffix)).collect();
1292 return Ok(Ast::Alternation(expanded));
1293 }
1294 parser::group::GroupContent::Sequence(seq) => {
1295 use crate::parser::sequence::{Sequence, SequenceElement};
1298
1299 let mut new_elements = seq.elements.clone();
1300 for ch in suffix.chars() {
1302 new_elements.push(SequenceElement::Char(ch));
1303 }
1304
1305 let combined_seq = Sequence::new(new_elements);
1306 return Ok(Ast::Sequence(combined_seq));
1307 }
1308 parser::group::GroupContent::Single(s) => {
1309 let combined = format!("{}{}", s, suffix);
1311 return Ok(Ast::Literal(combined));
1312 }
1313 parser::group::GroupContent::ParsedAlternation(_) => {
1314 }
1316 }
1317 }
1318 }
1319 }
1320
1321 if let Some(group_start) = pattern.find('(') {
1323 if group_start > 0 {
1324 let prefix = &pattern[..group_start];
1325 if prefix != "^" && prefix != "$" {
1327 let group_part = &pattern[group_start..];
1328
1329 if let Ok((group, bytes_consumed)) = parser::group::parse_group(group_part) {
1330 if bytes_consumed == group_part.len() {
1331 match &group.content {
1333 parser::group::GroupContent::Alternation(parts) => {
1334 let expanded: Vec<String> =
1335 parts.iter().map(|p| format!("{}{}", prefix, p)).collect();
1336 return Ok(Ast::Alternation(expanded));
1337 }
1338 _ => {
1339 return Ok(Ast::Group(group));
1341 }
1342 }
1343 }
1344 }
1345 }
1346 }
1347 }
1348
1349 Err(PatternError::ParseError(
1350 "Complex group pattern not fully supported".to_string(),
1351 ))
1352}
1353
1354fn parse_pattern(pattern: &str) -> Result<Ast, PatternError> {
1355 parse_pattern_with_depth(pattern, 0)
1356}
1357
1358const MAX_RECURSION_DEPTH: usize = 100;
1359
1360fn parse_pattern_with_depth(pattern: &str, depth: usize) -> Result<Ast, PatternError> {
1361 if depth > MAX_RECURSION_DEPTH {
1362 return Err(PatternError::ParseError(
1363 "Pattern too complex: recursion depth exceeded".to_string(),
1364 ));
1365 }
1366
1367 if pattern.is_empty() {
1368 return Ok(Ast::Literal(String::new()));
1369 }
1370
1371 if pattern.starts_with("(?=")
1373 || pattern.starts_with("(?!")
1374 || pattern.starts_with("(?<=")
1375 || pattern.starts_with("(?<!")
1376 {
1377 return parse_lookaround(pattern, depth);
1378 }
1379
1380 if pattern.contains("(?=")
1382 || pattern.contains("(?!")
1383 || pattern.contains("(?<=")
1384 || pattern.contains("(?<!")
1385 {
1386 if let Ok(ast) = parse_combined_with_lookaround(pattern, depth) {
1388 return Ok(ast);
1389 }
1390 }
1391
1392 if pattern.starts_with('(') && !pattern.starts_with("(?") {
1395 if let Some(close_idx) = find_matching_paren(pattern, 0) {
1397 if close_idx == pattern.len() - 1 {
1398 let inner = &pattern[1..close_idx];
1400
1401 if !contains_unescaped_paren(inner) || inner.starts_with("(?") {
1403 let inner_ast = parse_pattern_with_depth(inner, depth + 1)?;
1405 return Ok(Ast::Capture(Box::new(inner_ast), 1)); }
1407 }
1409 }
1410 }
1411
1412 let is_quantified_group = pattern.starts_with('(')
1417 && if let Some(close_idx) = find_matching_paren(pattern, 0) {
1418 close_idx == pattern.len() - 2
1419 && (pattern.ends_with('?') || pattern.ends_with('*') || pattern.ends_with('+'))
1420 } else {
1421 false
1422 };
1423
1424 let is_bounded_quantified_group = pattern.starts_with('(')
1425 && if let Some(close_idx) = find_matching_paren(pattern, 0) {
1426 close_idx < pattern.len() - 1 && pattern[close_idx + 1..].starts_with('{')
1427 } else {
1428 false
1429 };
1430
1431 if contains_unescaped_paren(pattern)
1432 && !pattern.starts_with('^')
1433 && !pattern.ends_with('$')
1434 && !is_quantified_group
1435 && !is_bounded_quantified_group
1436 && !pattern.contains("(?=")
1437 && !pattern.contains("(?!")
1438 && !pattern.contains("(?<=")
1439 && !pattern.contains("(?<!")
1440 {
1441 if let Ok(ast) = parse_pattern_with_captures(pattern) {
1443 return Ok(ast);
1444 }
1445 }
1446
1447 if contains_unescaped_paren(pattern) {
1450 if let Ok(ast) = parse_pattern_with_groups(pattern) {
1452 return Ok(ast);
1453 }
1454 }
1455
1456 let has_start_anchor = pattern.starts_with('^');
1458 let has_end_anchor = pattern.ends_with('$');
1459
1460 if has_start_anchor || has_end_anchor {
1461 let mut literal = pattern;
1463 if has_start_anchor {
1464 literal = literal.strip_prefix('^').unwrap();
1465 }
1466 if has_end_anchor {
1467 literal = literal.strip_suffix('$').unwrap();
1468 }
1469
1470 return Ok(Ast::Anchored {
1472 literal: literal.to_string(),
1473 start: has_start_anchor,
1474 end: has_end_anchor,
1475 });
1476 }
1477
1478 if pattern.contains('|') && !pattern.contains('[') {
1480 let parts: Vec<String> = pattern.split('|').map(|s| s.to_string()).collect();
1481 return Ok(Ast::Alternation(parts));
1482 }
1483
1484 if is_sequence_pattern(pattern) {
1486 match parse_sequence(pattern) {
1487 Ok(seq) => return Ok(Ast::Sequence(seq)),
1488 Err(_) => {
1489 }
1491 }
1492 }
1493
1494 if starts_with_escape(pattern) {
1496 match parse_escape(pattern) {
1497 Ok((seq, bytes_consumed)) => {
1498 if bytes_consumed == pattern.len() {
1500 if let Some(boundary_type) = seq.to_boundary() {
1502 return Ok(Ast::Boundary(boundary_type));
1503 }
1504 if let Some(cc) = seq.to_char_class() {
1506 return Ok(Ast::CharClass(cc));
1507 }
1508 if let Some(ch) = seq.to_char() {
1510 return Ok(Ast::Literal(ch.to_string()));
1511 }
1512 }
1513 let remaining = &pattern[bytes_consumed..];
1515 if !remaining.is_empty() {
1516 if let Some(q_char) = remaining.chars().next() {
1517 if q_char == '*' || q_char == '+' || q_char == '?' || q_char == '{' {
1518 if let Ok(qp) = parse_quantified_pattern(pattern) {
1520 return Ok(Ast::Quantified(qp));
1521 }
1522 }
1523 }
1524 }
1525 }
1526 Err(e) => return Err(PatternError::ParseError(e)),
1527 }
1528 }
1529
1530 let has_quantifier = pattern.ends_with('*')
1532 || pattern.ends_with('+')
1533 || pattern.ends_with('?')
1534 || (pattern.contains('{') && pattern.ends_with('}'));
1535
1536 if has_quantifier {
1537 match parse_quantified_pattern(pattern) {
1539 Ok(qp) => return Ok(Ast::Quantified(qp)),
1540 Err(_) => {
1541 }
1543 }
1544 }
1545
1546 if pattern.starts_with('[') && pattern.contains(']') {
1548 let end_idx = pattern.find(']').unwrap();
1549 if end_idx == pattern.len() - 1 {
1550 let class_content = &pattern[1..end_idx];
1552 let char_class = CharClass::parse(class_content).map_err(PatternError::ParseError)?;
1553 return Ok(Ast::CharClass(char_class));
1554 }
1555 }
1557
1558 if pattern == "." {
1560 return Ok(Ast::Dot);
1561 }
1562
1563 if pattern.contains('.') {
1565 use crate::parser::sequence::{Sequence, SequenceElement};
1567 let mut elements = Vec::new();
1568
1569 for ch in pattern.chars() {
1570 if ch == '.' {
1571 elements.push(SequenceElement::Dot);
1572 } else {
1573 elements.push(SequenceElement::Char(ch));
1574 }
1575 }
1576
1577 return Ok(Ast::Sequence(Sequence::new(elements)));
1578 }
1579
1580 Ok(Ast::Literal(pattern.to_string()))
1582}
1583
1584fn parse_pattern_with_flags(pattern: &str, flags: &Flags) -> Result<Ast, PatternError> {
1587 if flags.dot_matches_newline {
1591 let ast = parse_pattern_dotall(pattern, flags)?;
1593 if flags.case_insensitive {
1594 return Ok(Ast::CaseInsensitive(Box::new(ast)));
1595 }
1596 return Ok(ast);
1597 }
1598
1599 let ast = parse_pattern(pattern)?;
1601 if flags.case_insensitive {
1602 return Ok(Ast::CaseInsensitive(Box::new(ast)));
1603 }
1604 Ok(ast)
1605}
1606
1607fn parse_pattern_dotall(pattern: &str, flags: &Flags) -> Result<Ast, PatternError> {
1609 if pattern.is_empty() {
1610 return Ok(Ast::Literal(String::new()));
1611 }
1612
1613 if pattern == "." {
1615 return Ok(Ast::DotAll);
1616 }
1617
1618 if pattern.contains('.') {
1620 if is_sequence_pattern(pattern) {
1622 match parse_sequence(pattern) {
1624 Ok(seq) => return Ok(Ast::SequenceWithFlags(seq, *flags)),
1625 Err(_) => {
1626 }
1628 }
1629 }
1630
1631 use crate::parser::sequence::{Sequence, SequenceElement};
1633 let mut elements = Vec::new();
1634
1635 for ch in pattern.chars() {
1636 if ch == '.' {
1637 elements.push(SequenceElement::Dot);
1639 } else {
1640 elements.push(SequenceElement::Char(ch));
1641 }
1642 }
1643
1644 return Ok(Ast::SequenceWithFlags(Sequence::new(elements), *flags));
1645 }
1646
1647 parse_pattern(pattern)
1649}
1650
1651fn parse_pattern_with_captures_with_flags(
1653 pattern: &str,
1654 flags: &Flags,
1655) -> Result<Ast, PatternError> {
1656 let ast = parse_pattern_with_captures(pattern)?;
1659
1660 if flags.case_insensitive {
1661 return Ok(Ast::CaseInsensitive(Box::new(ast)));
1662 }
1663
1664 Ok(ast)
1667}
1668
1669#[derive(Debug, Clone)]
1670enum Matcher {
1671 Literal(String),
1672 MultiLiteral(AhoCorasick),
1673 AnchoredLiteral {
1674 literal: String,
1675 start: bool,
1676 end: bool,
1677 },
1678 AnchoredGroup {
1679 group: Group,
1680 start: bool,
1681 end: bool,
1682 },
1683 AnchoredPattern {
1684 inner: Box<Matcher>,
1685 start: bool,
1686 end: bool,
1687 },
1688 CharClass(CharClass),
1689 Quantified(QuantifiedPattern),
1690 Sequence(Sequence),
1691 SequenceWithFlags(Sequence, Flags), Group(Group),
1693 DigitRun, WordRun, Boundary(BoundaryType), Lookaround(Box<Lookaround>, Box<Matcher>), Capture(Box<Matcher>, usize), QuantifiedCapture(Box<Matcher>, parser::quantifier::Quantifier), CombinedWithLookaround {
1700 prefix: Box<Matcher>,
1701 lookaround: Box<Lookaround>,
1702 lookaround_matcher: Box<Matcher>,
1703 }, LookbehindWithSuffix {
1705 lookbehind: Box<Lookaround>,
1706 lookbehind_matcher: Box<Matcher>,
1707 suffix: Box<Matcher>,
1708 }, PatternWithCaptures {
1710 elements: Vec<CompiledCaptureElement>,
1711 total_groups: usize,
1712 }, AlternationWithCaptures {
1714 branches: Vec<Matcher>,
1715 #[allow(dead_code)]
1716 total_groups: usize,
1717 }, Backreference(usize), DFA(DFA), LazyDFA(engine::lazy_dfa::LazyDFA), CaseInsensitive(Box<Matcher>), }
1723
1724#[derive(Debug, Clone)]
1726enum CompiledCaptureElement {
1727 Capture(Matcher, usize), NonCapture(Matcher), }
1730
1731impl Matcher {
1732 fn is_match(&self, text: &str) -> bool {
1733 match self {
1734 Matcher::Literal(lit) => memmem::find(text.as_bytes(), lit.as_bytes()).is_some(),
1735 Matcher::MultiLiteral(ac) => ac.is_match(text),
1736 Matcher::AnchoredLiteral {
1737 literal,
1738 start,
1739 end,
1740 } => match (start, end) {
1741 (true, true) => text == literal,
1742 (true, false) => text.starts_with(literal),
1743 (false, true) => text.ends_with(literal),
1744 _ => unreachable!(),
1745 },
1746 Matcher::AnchoredGroup { group, start, end } => {
1747 match (start, end) {
1749 (true, true) => {
1750 group
1752 .match_at(text, 0)
1753 .map(|len| len == text.len())
1754 .unwrap_or(false)
1755 }
1756 (true, false) => {
1757 group.match_at(text, 0).is_some()
1759 }
1760 (false, true) => {
1761 if let Some((_start_pos, end_pos)) = group.find(text) {
1763 end_pos == text.len()
1764 } else {
1765 false
1766 }
1767 }
1768 _ => unreachable!(),
1769 }
1770 }
1771 Matcher::AnchoredPattern { inner, start, end } => {
1772 match (start, end) {
1774 (true, true) => {
1775 if let Some((match_start, match_end)) = inner.find(text) {
1777 match_start == 0 && match_end == text.len()
1778 } else {
1779 false
1780 }
1781 }
1782 (true, false) => {
1783 if let Some((match_start, _)) = inner.find(text) {
1785 match_start == 0
1786 } else {
1787 false
1788 }
1789 }
1790 (false, true) => {
1791 if let Some((_, match_end)) = inner.find(text) {
1793 match_end == text.len()
1794 } else {
1795 false
1796 }
1797 }
1798 _ => unreachable!(),
1799 }
1800 }
1801 Matcher::CharClass(cc) => {
1802 cc.find_first(text).is_some()
1804 }
1805 Matcher::Quantified(qp) => {
1806 if let crate::parser::quantifier::QuantifiedElement::CharClass(cc) = &qp.element {
1807 if let Some(bitmap) = cc.get_ascii_bitmap() {
1808 let negated = cc.negated;
1809 let min = qp.quantifier.min_matches();
1810 let bytes = text.as_bytes();
1811
1812 if min <= 1 {
1813 for &byte in bytes {
1815 if byte < 128 {
1816 let idx = byte as usize;
1817 let bit_set = (bitmap[idx / 64] & (1u64 << (idx % 64))) != 0;
1818 if bit_set != negated {
1819 return true;
1820 }
1821 }
1822 }
1823 return false;
1824 } else {
1825 let mut run = 0usize;
1827 for &byte in bytes {
1828 if byte < 128 {
1829 let idx = byte as usize;
1830 let bit_set = (bitmap[idx / 64] & (1u64 << (idx % 64))) != 0;
1831 if bit_set != negated {
1832 run += 1;
1833 if run >= min {
1834 return true;
1835 }
1836 continue;
1837 }
1838 }
1839 run = 0;
1840 }
1841 return false;
1842 }
1843 }
1844 }
1845 qp.is_match(text)
1846 }
1847 Matcher::Sequence(seq) => seq.is_match(text), Matcher::Group(group) => group.is_match(text), Matcher::DigitRun => Self::digit_run_is_match(text), Matcher::WordRun => Self::word_run_is_match(text), Matcher::Boundary(boundary_type) => boundary_type.find_first(text).is_some(),
1852 Matcher::Lookaround(lookaround, inner_matcher) => {
1853 for pos in 0..=text.len() {
1855 if lookaround.matches_at(text, pos, inner_matcher) {
1856 return true;
1857 }
1858 }
1859 false
1860 }
1861 Matcher::Capture(inner_matcher, _group_index) => {
1862 inner_matcher.is_match(text)
1864 }
1865 Matcher::QuantifiedCapture(inner_matcher, quantifier) => {
1866 Self::quantified_is_match(text, inner_matcher, quantifier)
1868 }
1869 Matcher::CombinedWithLookaround {
1870 prefix,
1871 lookaround,
1872 lookaround_matcher,
1873 } => {
1874 if let Some((_start, end)) = prefix.find(text) {
1876 lookaround.matches_at(text, end, lookaround_matcher)
1878 } else {
1879 false
1880 }
1881 }
1882 Matcher::LookbehindWithSuffix {
1883 lookbehind,
1884 lookbehind_matcher,
1885 suffix,
1886 } => {
1887 if let Some((start, _end)) = suffix.find(text) {
1890 lookbehind.matches_at(text, start, lookbehind_matcher)
1892 } else {
1893 false
1894 }
1895 }
1896 Matcher::PatternWithCaptures { .. } => {
1897 self.find(text).is_some()
1899 }
1900 Matcher::Backreference(_) => {
1901 false
1905 }
1906 Matcher::DFA(dfa) => {
1907 dfa.is_match(text)
1909 }
1910 Matcher::LazyDFA(lazy_dfa) => {
1911 let mut dfa = lazy_dfa.clone();
1913 dfa.find(text).is_some()
1914 }
1915 Matcher::SequenceWithFlags(seq, flags) => {
1916 seq.is_match_with_flags(text, flags)
1918 }
1919 Matcher::AlternationWithCaptures { branches, .. } => {
1920 for branch in branches {
1922 if branch.is_match(text) {
1923 return true;
1924 }
1925 }
1926 false
1927 }
1928 Matcher::CaseInsensitive(inner) => {
1929 if let Matcher::Literal(needle) = inner.as_ref() {
1932 let needle_bytes = needle.as_bytes();
1933 let text_bytes = text.as_bytes();
1934 let needle_is_ascii = needle_bytes.iter().all(|&b| b < 128);
1935 if needle_is_ascii
1936 && !needle_bytes.is_empty()
1937 && needle_bytes.len() <= text_bytes.len()
1938 {
1939 let first_lower = needle_bytes[0];
1940 let first_upper = if first_lower >= b'a' && first_lower <= b'z' {
1941 first_lower - 32
1942 } else {
1943 first_lower
1944 };
1945 for i in 0..=(text_bytes.len() - needle_bytes.len()) {
1946 let b = text_bytes[i];
1947 if b == first_lower || b == first_upper {
1948 let mut matched = true;
1949 for j in 1..needle_bytes.len() {
1950 let tb = text_bytes[i + j];
1951 let nb = needle_bytes[j];
1952 if tb >= 128 || nb >= 128 {
1954 matched = false;
1955 break;
1956 }
1957 if tb != nb && (tb ^ 32) != nb {
1958 matched = false;
1959 break;
1960 }
1961 }
1962 if matched {
1963 return true;
1964 }
1965 }
1966 }
1967 return false;
1968 }
1969 }
1970 if let Matcher::MultiLiteral(ac) = inner.as_ref() {
1972 let bytes = text.as_bytes();
1973 let len = bytes.len();
1974 if len <= 256 {
1975 let mut buf = [0u8; 256];
1976 let mut all_ascii = true;
1977 for i in 0..len {
1978 let b = bytes[i];
1979 if b >= 128 {
1980 all_ascii = false;
1981 break;
1982 }
1983 buf[i] = if b >= b'A' && b <= b'Z' { b + 32 } else { b };
1984 }
1985 if all_ascii {
1986 let lower = unsafe { std::str::from_utf8_unchecked(&buf[..len]) };
1987 return ac.is_match(lower);
1988 }
1989 }
1990 }
1991 let bytes = text.as_bytes();
1993 let len = bytes.len();
1994 if len <= 256 {
1995 let mut buf = [0u8; 256];
1996 let mut all_ascii = true;
1997 for i in 0..len {
1998 let b = bytes[i];
1999 if b >= 128 {
2000 all_ascii = false;
2001 break;
2002 }
2003 buf[i] = if b >= b'A' && b <= b'Z' { b + 32 } else { b };
2004 }
2005 if all_ascii {
2006 let lower = unsafe { std::str::from_utf8_unchecked(&buf[..len]) };
2007 return inner.is_match(lower);
2008 }
2009 }
2010 let lower_text = text.to_lowercase();
2011 inner.is_match(&lower_text)
2012 }
2013 }
2014 }
2015
2016 fn extract_nested_captures(&self, text: &str, start_pos: usize) -> Vec<(usize, usize, usize)> {
2019 let mut captures = Vec::new();
2020
2021 match self {
2022 Matcher::PatternWithCaptures { elements, .. } => {
2023 let mut pos = start_pos;
2024
2025 for element in elements {
2026 match element {
2027 CompiledCaptureElement::Capture(inner_matcher, group_num) => {
2028 if let Some((rel_start, rel_end)) =
2029 inner_matcher.find(safe_slice(text, pos).unwrap_or(""))
2030 {
2031 if rel_start == 0 {
2032 let abs_start = pos;
2033 let abs_end = pos + rel_end;
2034
2035 captures.push((*group_num, abs_start, abs_end));
2037
2038 let nested =
2040 inner_matcher.extract_nested_captures(text, abs_start);
2041 captures.extend(nested);
2042
2043 pos = abs_end;
2044 } else {
2045 break;
2046 }
2047 } else {
2048 break;
2049 }
2050 }
2051 CompiledCaptureElement::NonCapture(inner_matcher) => {
2052 if let Some((rel_start, rel_end)) =
2053 inner_matcher.find(safe_slice(text, pos).unwrap_or(""))
2054 {
2055 if rel_start == 0 {
2056 let abs_start = pos;
2057
2058 let nested =
2060 inner_matcher.extract_nested_captures(text, abs_start);
2061 captures.extend(nested);
2062
2063 pos += rel_end;
2064 } else {
2065 break;
2066 }
2067 } else {
2068 break;
2069 }
2070 }
2071 }
2072 }
2073 }
2074 Matcher::Capture(inner_matcher, group_num) => {
2075 if let Some((rel_start, rel_end)) =
2077 inner_matcher.find(safe_slice(text, start_pos).unwrap_or(""))
2078 {
2079 let abs_start = start_pos + rel_start;
2080 let abs_end = start_pos + rel_end;
2081
2082 captures.push((*group_num, abs_start, abs_end));
2084
2085 let nested = inner_matcher.extract_nested_captures(text, abs_start);
2087 captures.extend(nested);
2088 }
2089 }
2090 Matcher::AlternationWithCaptures { branches, .. } => {
2091 for branch in branches {
2093 if let Some((rel_start, _rel_end)) =
2094 branch.find(safe_slice(text, start_pos).unwrap_or(""))
2095 {
2096 if rel_start == 0 {
2097 let abs_start = start_pos;
2098 let nested = branch.extract_nested_captures(text, abs_start);
2100 captures.extend(nested);
2101 break; }
2103 }
2104 }
2105 }
2106 _ => {
2107 }
2109 }
2110
2111 captures
2112 }
2113
2114 fn match_pattern_with_backreferences(
2117 text: &str,
2118 start_pos: usize,
2119 elements: &[CompiledCaptureElement],
2120 ) -> Option<usize> {
2121 let mut pos = start_pos;
2122 let mut capture_positions: Vec<(usize, usize)> = Vec::new();
2123
2124 for element in elements {
2125 match element {
2126 CompiledCaptureElement::Capture(m, num) => {
2127 if let Some((rel_start, rel_end)) = m.find(safe_slice(text, pos).unwrap_or(""))
2128 {
2129 if rel_start != 0 {
2130 return None; }
2132
2133 let abs_start = pos;
2134 let abs_end = pos + rel_end;
2135
2136 while capture_positions.len() < *num {
2138 capture_positions.push((0, 0));
2139 }
2140 capture_positions[*num - 1] = (abs_start, abs_end);
2141 pos = abs_end;
2142 } else {
2143 return None;
2144 }
2145 }
2146 CompiledCaptureElement::NonCapture(m) => {
2147 if let Matcher::Backreference(ref_num) = m {
2149 if *ref_num > 0 && *ref_num <= capture_positions.len() {
2151 let (cap_start, cap_end) = capture_positions[*ref_num - 1];
2152 let captured_text = &text[cap_start..cap_end];
2153
2154 if text[pos..].starts_with(captured_text) {
2156 pos += captured_text.len();
2157 } else {
2158 return None;
2159 }
2160 } else {
2161 return None;
2163 }
2164 } else {
2165 if let Some((rel_start, rel_end)) =
2167 m.find(safe_slice(text, pos).unwrap_or(""))
2168 {
2169 if rel_start != 0 {
2170 return None;
2171 }
2172 pos += rel_end;
2173 } else {
2174 return None;
2175 }
2176 }
2177 }
2178 }
2179 }
2180
2181 Some(pos)
2182 }
2183
2184 #[inline(always)]
2186 fn digit_run_is_match(text: &str) -> bool {
2187 let bytes = text.as_bytes();
2188 if bytes.is_empty() {
2189 return false;
2190 }
2191
2192 bytes.iter().any(|&b| b.is_ascii_digit())
2194 }
2195
2196 #[inline(always)]
2198 fn word_run_is_match(text: &str) -> bool {
2199 let bytes = text.as_bytes();
2200 if bytes.is_empty() {
2201 return false;
2202 }
2203
2204 bytes.iter().any(|&b| {
2206 b.is_ascii_lowercase() || b.is_ascii_uppercase() || b.is_ascii_digit() || b == b'_'
2207 })
2208 }
2209
2210 fn quantified_is_match(
2212 text: &str,
2213 inner_matcher: &Matcher,
2214 quantifier: &parser::quantifier::Quantifier,
2215 ) -> bool {
2216 Self::quantified_find(text, inner_matcher, quantifier).is_some()
2217 }
2218
2219 fn quantified_find(
2221 text: &str,
2222 inner_matcher: &Matcher,
2223 quantifier: &parser::quantifier::Quantifier,
2224 ) -> Option<(usize, usize)> {
2225 let (min, max) = quantifier_bounds(quantifier);
2226
2227 if text.is_empty() {
2229 return if min == 0 { Some((0, 0)) } else { None };
2230 }
2231
2232 for start_pos in 0..text.len() {
2234 let mut pos = start_pos;
2235 let mut count = 0;
2236
2237 while count < max && pos < text.len() {
2239 if let Some((rel_start, rel_end)) =
2240 inner_matcher.find(safe_slice(text, pos).unwrap_or(""))
2241 {
2242 if rel_start != 0 {
2244 break;
2245 }
2246 if rel_end == 0 {
2247 break; }
2249 pos += rel_end;
2250 count += 1;
2251 } else {
2252 break;
2253 }
2254 }
2255
2256 if count >= min {
2257 return Some((start_pos, pos));
2258 }
2259 }
2260
2261 None
2262 }
2263
2264 fn contains_quantified(matcher: &Matcher) -> bool {
2267 match matcher {
2268 Matcher::Quantified(_) | Matcher::QuantifiedCapture(_, _) => true,
2269 Matcher::Capture(inner, _) => Self::contains_quantified(inner),
2270 Matcher::PatternWithCaptures { elements, .. } => {
2271 elements.iter().any(|elem| match elem {
2274 CompiledCaptureElement::Capture(m, _)
2275 | CompiledCaptureElement::NonCapture(m) => {
2276 match m {
2278 Matcher::AlternationWithCaptures { .. } => false,
2279 _ => Self::contains_quantified(m),
2280 }
2281 }
2282 })
2283 }
2284 Matcher::AlternationWithCaptures { .. } => false,
2287 _ => false,
2288 }
2289 }
2290
2291 fn match_elements_with_backtrack_and_captures(
2294 text: &str,
2295 start_pos: usize,
2296 elements: &[CompiledCaptureElement],
2297 ) -> Option<(usize, Vec<(usize, usize, usize)>)> {
2298 if elements.is_empty() {
2300 return Some((start_pos, Vec::new()));
2301 }
2302
2303 let first_element = &elements[0];
2305
2306 let needs_backtracking = if elements.len() <= 1 {
2308 false
2309 } else {
2310 match first_element {
2311 CompiledCaptureElement::Capture(m, _) | CompiledCaptureElement::NonCapture(m) => {
2312 Self::contains_quantified(m)
2313 }
2314 }
2315 };
2316
2317 if needs_backtracking {
2318 let remaining_text = safe_slice(text, start_pos).unwrap_or("");
2320 let remaining_len = remaining_text.len();
2321
2322 for try_len in (0..=remaining_len).rev() {
2325 let next_pos = start_pos + try_len;
2326
2327 if let Some((final_pos, mut remaining_caps)) =
2329 Self::match_elements_with_backtrack_and_captures(text, next_pos, &elements[1..])
2330 {
2331 if try_len == 0 {
2333 match first_element {
2335 CompiledCaptureElement::Capture(m, num) => {
2336 if let Some((rel_start, rel_end)) = m.find("") {
2337 if rel_start == 0 && rel_end == 0 {
2338 let mut caps = vec![(*num, start_pos, start_pos)];
2340 caps.append(&mut remaining_caps);
2341 return Some((final_pos, caps));
2342 }
2343 }
2344 }
2345 CompiledCaptureElement::NonCapture(m) => {
2346 if let Some((rel_start, rel_end)) = m.find("") {
2347 if rel_start == 0 && rel_end == 0 {
2348 return Some((final_pos, remaining_caps));
2350 }
2351 }
2352 }
2353 }
2354 } else {
2355 let substring = safe_slice_range(text, start_pos, next_pos).unwrap_or("");
2357
2358 match first_element {
2360 CompiledCaptureElement::Capture(m, num) => {
2361 if let Some((rel_start, rel_end)) = m.find(substring) {
2362 if rel_start == 0 && rel_end == substring.len() {
2363 let mut caps = vec![(*num, start_pos, next_pos)];
2365 caps.append(&mut remaining_caps);
2366 return Some((final_pos, caps));
2367 }
2368 }
2369 }
2370 CompiledCaptureElement::NonCapture(m) => {
2371 if let Some((rel_start, rel_end)) = m.find(substring) {
2372 if rel_start == 0 && rel_end == substring.len() {
2373 let nested_caps =
2375 m.extract_nested_captures(text, start_pos);
2376 let mut all_caps = nested_caps;
2377 all_caps.extend(remaining_caps);
2378 return Some((final_pos, all_caps));
2379 }
2380 }
2381 }
2382 }
2383 }
2384 }
2385 }
2386 None
2387 } else {
2388 match first_element {
2390 CompiledCaptureElement::Capture(m, num) => {
2391 if let Some((rel_start, rel_end)) =
2392 m.find(safe_slice(text, start_pos).unwrap_or(""))
2393 {
2394 if rel_start == 0 {
2395 let next_pos = start_pos + rel_end;
2396 if let Some((final_pos, mut remaining_caps)) =
2397 Self::match_elements_with_backtrack_and_captures(
2398 text,
2399 next_pos,
2400 &elements[1..],
2401 )
2402 {
2403 let mut caps = vec![(*num, start_pos, next_pos)];
2404 caps.append(&mut remaining_caps);
2405 return Some((final_pos, caps));
2406 }
2407 }
2408 }
2409 None
2410 }
2411 CompiledCaptureElement::NonCapture(m) => {
2412 if let Some((rel_start, rel_end)) =
2413 m.find(safe_slice(text, start_pos).unwrap_or(""))
2414 {
2415 if rel_start == 0 {
2416 let next_pos = start_pos + rel_end;
2417 if let Some((final_pos, remaining_caps)) =
2418 Self::match_elements_with_backtrack_and_captures(
2419 text,
2420 next_pos,
2421 &elements[1..],
2422 )
2423 {
2424 let nested_caps = m.extract_nested_captures(text, start_pos);
2426 let mut all_caps = nested_caps;
2427 all_caps.extend(remaining_caps);
2428 return Some((final_pos, all_caps));
2429 }
2430 }
2431 }
2432 None
2433 }
2434 }
2435 }
2436 }
2437
2438 fn match_elements_with_backtrack(
2441 text: &str,
2442 start_pos: usize,
2443 elements: &[CompiledCaptureElement],
2444 ) -> Option<usize> {
2445 if elements.is_empty() {
2447 return Some(start_pos);
2448 }
2449
2450 let first_element = &elements[0];
2452 let first_matcher = match first_element {
2453 CompiledCaptureElement::Capture(m, _) => m,
2454 CompiledCaptureElement::NonCapture(m) => m,
2455 };
2456
2457 let needs_backtracking = if elements.len() <= 1 {
2460 false } else {
2462 Self::contains_quantified(first_matcher)
2464 };
2465
2466 if needs_backtracking {
2467 let remaining_text = safe_slice(text, start_pos).unwrap_or("");
2471 let remaining_len = remaining_text.len();
2472
2473 for try_len in (0..=remaining_len).rev() {
2476 let next_pos = start_pos + try_len;
2477
2478 if let Some(final_pos) =
2480 Self::match_elements_with_backtrack(text, next_pos, &elements[1..])
2481 {
2482 let substring = safe_slice_range(text, start_pos, next_pos).unwrap_or("");
2484
2485 if try_len == 0 {
2489 if let Some((rel_start, rel_end)) = first_matcher.find("") {
2492 if rel_start == 0 && rel_end == 0 {
2493 return Some(final_pos);
2494 }
2495 }
2496 } else {
2497 if let Some((rel_start, rel_end)) = first_matcher.find(substring) {
2499 if rel_start == 0 && rel_end == substring.len() {
2500 return Some(final_pos);
2501 }
2502 }
2503 }
2504 }
2505 }
2506
2507 None
2508 } else {
2509 if let Matcher::AlternationWithCaptures { branches, .. } = first_matcher {
2513 for branch in branches {
2515 if let Some((rel_start, rel_end)) =
2516 branch.find(safe_slice(text, start_pos).unwrap_or(""))
2517 {
2518 if rel_start == 0 {
2519 let next_pos = start_pos + rel_end;
2520 if let Some(final_pos) =
2522 Self::match_elements_with_backtrack(text, next_pos, &elements[1..])
2523 {
2524 return Some(final_pos);
2525 }
2526 }
2528 }
2529 }
2530 return None;
2531 }
2532
2533 if let Some((rel_start, rel_end)) =
2535 first_matcher.find(safe_slice(text, start_pos).unwrap_or(""))
2536 {
2537 if rel_start == 0 {
2538 let next_pos = start_pos + rel_end;
2539 return Self::match_elements_with_backtrack(text, next_pos, &elements[1..]);
2541 }
2542 }
2543 None
2544 }
2545 }
2546
2547 fn quantified_find_all(
2549 text: &str,
2550 inner_matcher: &Matcher,
2551 quantifier: &parser::quantifier::Quantifier,
2552 ) -> Vec<(usize, usize)> {
2553 let mut matches = Vec::new();
2554 let mut search_pos = 0;
2555
2556 while search_pos < text.len() {
2557 if let Some((start, end)) =
2558 Self::quantified_find(&text[search_pos..], inner_matcher, quantifier)
2559 {
2560 matches.push((search_pos + start, search_pos + end));
2561 search_pos += start + 1; if start == end {
2563 search_pos += 1; }
2565 } else {
2566 break;
2567 }
2568 }
2569
2570 matches
2571 }
2572
2573 fn find(&self, text: &str) -> Option<(usize, usize)> {
2574 match self {
2575 Matcher::Literal(lit) => {
2576 let pos = memmem::find(text.as_bytes(), lit.as_bytes())?;
2577 Some((pos, pos + lit.len()))
2578 }
2579 Matcher::MultiLiteral(ac) => {
2580 let mat = ac.find(text)?;
2581 Some((mat.start(), mat.end()))
2582 }
2583 Matcher::AnchoredLiteral {
2584 literal,
2585 start,
2586 end,
2587 } => match (start, end) {
2588 (true, true) => (text == literal).then_some((0, text.len())),
2589 (true, false) => text.starts_with(literal).then_some((0, literal.len())),
2590 (false, true) => text
2591 .ends_with(literal)
2592 .then(|| (text.len() - literal.len(), text.len())),
2593 _ => unreachable!(),
2594 },
2595 Matcher::AnchoredGroup { group, start, end } => {
2596 match (start, end) {
2597 (true, true) => {
2598 group.match_at(text, 0).and_then(|len| {
2600 if len == text.len() {
2601 Some((0, len))
2602 } else {
2603 None
2604 }
2605 })
2606 }
2607 (true, false) => {
2608 group.match_at(text, 0).map(|len| (0, len))
2610 }
2611 (false, true) => {
2612 group.find(text).and_then(|(start_pos, end_pos)| {
2614 if end_pos == text.len() {
2615 Some((start_pos, end_pos))
2616 } else {
2617 None
2618 }
2619 })
2620 }
2621 _ => unreachable!(),
2622 }
2623 }
2624 Matcher::AnchoredPattern { inner, start, end } => {
2625 match (start, end) {
2626 (true, true) => {
2627 inner.find(text).and_then(|(match_start, match_end)| {
2629 if match_start == 0 && match_end == text.len() {
2630 Some((0, text.len()))
2631 } else {
2632 None
2633 }
2634 })
2635 }
2636 (true, false) => {
2637 inner.find(text).and_then(|(match_start, match_end)| {
2639 if match_start == 0 {
2640 Some((0, match_end))
2641 } else {
2642 None
2643 }
2644 })
2645 }
2646 (false, true) => {
2647 inner.find(text).and_then(|(match_start, match_end)| {
2649 if match_end == text.len() {
2650 Some((match_start, match_end))
2651 } else {
2652 None
2653 }
2654 })
2655 }
2656 _ => unreachable!(),
2657 }
2658 }
2659 Matcher::CharClass(cc) => {
2660 for (idx, ch) in text.char_indices() {
2662 if cc.matches(ch) {
2663 return Some((idx, idx + ch.len_utf8()));
2664 }
2665 }
2666 None
2667 }
2668 Matcher::Quantified(qp) => qp.find(text),
2669 Matcher::Sequence(seq) => seq.find(text),
2670 Matcher::Group(group) => group.find(text),
2671 Matcher::DigitRun => Self::digit_run_find(text), Matcher::WordRun => Self::word_run_find(text), Matcher::Boundary(boundary_type) => {
2674 boundary_type.find_first(text).map(|pos| (pos, pos))
2676 }
2677 Matcher::Lookaround(lookaround, inner_matcher) => {
2678 for pos in 0..=text.len() {
2680 if lookaround.matches_at(text, pos, inner_matcher) {
2681 return Some((pos, pos)); }
2683 }
2684 None
2685 }
2686 Matcher::Capture(inner_matcher, _group_index) => {
2687 inner_matcher.find(text)
2689 }
2690 Matcher::QuantifiedCapture(inner_matcher, quantifier) => {
2691 Self::quantified_find(text, inner_matcher, quantifier)
2693 }
2694 Matcher::CombinedWithLookaround {
2695 prefix,
2696 lookaround,
2697 lookaround_matcher,
2698 } => {
2699 let mut search_pos = 0;
2701 while search_pos < text.len() {
2702 let remaining = &text[search_pos..];
2703 if let Some((rel_start, rel_end)) = prefix.find(remaining) {
2704 let abs_start = search_pos + rel_start;
2705 let abs_end = search_pos + rel_end;
2706
2707 if lookaround.matches_at(text, abs_end, lookaround_matcher) {
2709 return Some((abs_start, abs_end));
2710 }
2711
2712 search_pos = abs_start + 1;
2714 } else {
2715 break;
2716 }
2717 }
2718 None
2719 }
2720 Matcher::LookbehindWithSuffix {
2721 lookbehind,
2722 lookbehind_matcher,
2723 suffix,
2724 } => {
2725 let mut search_pos = 0;
2727 while search_pos < text.len() {
2728 let remaining = &text[search_pos..];
2729 if let Some((rel_start, rel_end)) = suffix.find(remaining) {
2730 let abs_start = search_pos + rel_start;
2731 let abs_end = search_pos + rel_end;
2732
2733 if lookbehind.matches_at(text, abs_start, lookbehind_matcher) {
2735 return Some((abs_start, abs_end));
2736 }
2737
2738 search_pos = abs_start + 1;
2740 } else {
2741 break;
2742 }
2743 }
2744 None
2745 }
2746 Matcher::PatternWithCaptures { elements, .. } => {
2747 if elements.len() == 1 {
2749 let matcher = match &elements[0] {
2750 CompiledCaptureElement::Capture(m, _) => m,
2751 CompiledCaptureElement::NonCapture(m) => m,
2752 };
2753 return matcher.find(text);
2754 }
2755
2756 let has_backrefs = elements.iter().any(|elem| {
2758 matches!(
2759 elem,
2760 CompiledCaptureElement::NonCapture(Matcher::Backreference(_))
2761 )
2762 });
2763
2764 if has_backrefs {
2765 for start_pos in 0..=text.len() {
2767 if let Some(end_pos) =
2768 Self::match_pattern_with_backreferences(text, start_pos, elements)
2769 {
2770 if end_pos > start_pos || elements.is_empty() {
2771 return Some((start_pos, end_pos));
2772 }
2773 }
2774 }
2775 return None;
2776 }
2777
2778 if let Some(dfa) = crate::engine::capture_dfa::compile_capture_pattern(elements) {
2780 return dfa.find(text);
2781 }
2782
2783 for start_pos in 0..=text.len() {
2785 if let Some(end_pos) =
2786 Self::match_elements_with_backtrack(text, start_pos, elements)
2787 {
2788 if end_pos > start_pos || elements.is_empty() {
2789 return Some((start_pos, end_pos));
2790 }
2791 }
2792 }
2793
2794 None
2795 }
2796 Matcher::Backreference(_) => {
2797 None
2799 }
2800 Matcher::DFA(dfa) => {
2801 dfa.find(text)
2803 }
2804 Matcher::LazyDFA(lazy_dfa) => {
2805 let mut dfa = lazy_dfa.clone();
2807 dfa.find(text)
2808 }
2809 Matcher::SequenceWithFlags(seq, flags) => {
2810 seq.find_with_flags(text, flags)
2812 }
2813 Matcher::AlternationWithCaptures { branches, .. } => {
2814 let mut best_match: Option<(usize, usize)> = None;
2816
2817 for branch in branches {
2818 if let Some((start, end)) = branch.find(text) {
2819 if best_match.is_none() || start < best_match.unwrap().0 {
2821 best_match = Some((start, end));
2822 }
2823 }
2824 }
2825 best_match
2826 }
2827 Matcher::CaseInsensitive(inner) => {
2828 let bytes = text.as_bytes();
2829 let len = bytes.len();
2830 if len <= 256 {
2831 let mut buf = [0u8; 256];
2832 let mut all_ascii = true;
2833 for i in 0..len {
2834 let b = bytes[i];
2835 if b >= 128 {
2836 all_ascii = false;
2837 break;
2838 }
2839 buf[i] = if b >= b'A' && b <= b'Z' { b + 32 } else { b };
2840 }
2841 if all_ascii {
2842 let lower = unsafe { std::str::from_utf8_unchecked(&buf[..len]) };
2843 return inner.find(lower);
2844 }
2845 }
2846 let lower_text = text.to_lowercase();
2847 inner.find(&lower_text)
2848 }
2849 }
2850 }
2851
2852 #[inline(always)]
2854 fn digit_run_find(text: &str) -> Option<(usize, usize)> {
2855 let bytes = text.as_bytes();
2856
2857 let mut start = None;
2859 for (i, &b) in bytes.iter().enumerate() {
2860 if b.is_ascii_digit() {
2861 start = Some(i);
2862 break;
2863 }
2864 }
2865
2866 let start_idx = start?;
2867
2868 let mut end_idx = bytes.len();
2870 for (i, &b) in bytes[start_idx..].iter().enumerate() {
2871 if !b.is_ascii_digit() {
2872 end_idx = start_idx + i;
2873 break;
2874 }
2875 }
2876
2877 Some((start_idx, end_idx))
2878 }
2879
2880 #[inline(always)]
2882 fn word_run_find(text: &str) -> Option<(usize, usize)> {
2883 let bytes = text.as_bytes();
2884
2885 let mut start = None;
2887 for (i, &b) in bytes.iter().enumerate() {
2888 if b.is_ascii_lowercase() || b.is_ascii_uppercase() || b.is_ascii_digit() || b == b'_' {
2889 start = Some(i);
2890 break;
2891 }
2892 }
2893
2894 let start_idx = start?;
2895
2896 let mut end_idx = bytes.len();
2898 for (i, &b) in bytes[start_idx..].iter().enumerate() {
2899 if !(b.is_ascii_lowercase()
2900 || b.is_ascii_uppercase()
2901 || b.is_ascii_digit()
2902 || b == b'_')
2903 {
2904 end_idx = start_idx + i;
2905 break;
2906 }
2907 }
2908
2909 Some((start_idx, end_idx))
2910 }
2911
2912 fn find_all(&self, text: &str) -> Vec<(usize, usize)> {
2913 match self {
2914 Matcher::Literal(lit) => {
2915 let finder = memmem::Finder::new(lit.as_bytes());
2916 finder
2917 .find_iter(text.as_bytes())
2918 .map(|pos| (pos, pos + lit.len()))
2919 .collect()
2920 }
2921 Matcher::MultiLiteral(ac) => ac
2922 .find_iter(text)
2923 .map(|mat| (mat.start(), mat.end()))
2924 .collect(),
2925 Matcher::AnchoredLiteral { .. } => {
2926 if let Some(m) = self.find(text) {
2927 vec![m]
2928 } else {
2929 vec![]
2930 }
2931 }
2932 Matcher::AnchoredGroup { .. } => {
2933 if let Some(m) = self.find(text) {
2935 vec![m]
2936 } else {
2937 vec![]
2938 }
2939 }
2940 Matcher::AnchoredPattern { .. } => {
2941 if let Some(m) = self.find(text) {
2943 vec![m]
2944 } else {
2945 vec![]
2946 }
2947 }
2948 Matcher::CharClass(cc) => {
2949 text.char_indices()
2951 .filter(|(_, ch)| cc.matches(*ch))
2952 .map(|(idx, ch)| (idx, idx + ch.len_utf8()))
2953 .collect()
2954 }
2955 Matcher::Quantified(qp) => qp.find_all(text),
2956 Matcher::Sequence(seq) => seq.find_all(text),
2957 Matcher::Group(group) => group.find_all(text),
2958 Matcher::DigitRun => Self::digit_run_find_all(text), Matcher::WordRun => Self::word_run_find_all(text), Matcher::Boundary(boundary_type) => {
2961 boundary_type
2963 .find_all(text)
2964 .into_iter()
2965 .map(|pos| (pos, pos))
2966 .collect()
2967 }
2968 Matcher::Lookaround(lookaround, inner_matcher) => {
2969 (0..=text.len())
2971 .filter(|&pos| lookaround.matches_at(text, pos, inner_matcher))
2972 .map(|pos| (pos, pos)) .collect()
2974 }
2975 Matcher::Capture(inner_matcher, _group_index) => {
2976 inner_matcher.find_all(text)
2978 }
2979 Matcher::QuantifiedCapture(inner_matcher, quantifier) => {
2980 Self::quantified_find_all(text, inner_matcher, quantifier)
2982 }
2983 Matcher::CombinedWithLookaround {
2984 prefix,
2985 lookaround,
2986 lookaround_matcher,
2987 } => {
2988 let mut matches = Vec::new();
2990 let mut search_pos = 0;
2991
2992 while search_pos < text.len() {
2993 let remaining = &text[search_pos..];
2994 if let Some((rel_start, rel_end)) = prefix.find(remaining) {
2995 let abs_start = search_pos + rel_start;
2996 let abs_end = search_pos + rel_end;
2997
2998 if lookaround.matches_at(text, abs_end, lookaround_matcher) {
3000 matches.push((abs_start, abs_end));
3001 }
3002
3003 search_pos = abs_start + 1;
3005 } else {
3006 break;
3007 }
3008 }
3009
3010 matches
3011 }
3012 Matcher::LookbehindWithSuffix {
3013 lookbehind,
3014 lookbehind_matcher,
3015 suffix,
3016 } => {
3017 let mut matches = Vec::new();
3019 let mut search_pos = 0;
3020
3021 while search_pos < text.len() {
3022 let remaining = &text[search_pos..];
3023 if let Some((rel_start, rel_end)) = suffix.find(remaining) {
3024 let abs_start = search_pos + rel_start;
3025 let abs_end = search_pos + rel_end;
3026
3027 if lookbehind.matches_at(text, abs_start, lookbehind_matcher) {
3029 matches.push((abs_start, abs_end));
3030 }
3031
3032 search_pos = abs_start + 1;
3034 } else {
3035 break;
3036 }
3037 }
3038
3039 matches
3040 }
3041 Matcher::PatternWithCaptures { elements, .. } => {
3042 let mut matches = Vec::new();
3044 let mut start_pos = 0;
3045
3046 while start_pos < text.len() {
3047 let mut pos = start_pos;
3048 let mut all_matched = true;
3049
3050 for element in elements {
3051 let matcher = match element {
3052 CompiledCaptureElement::Capture(m, _) => m,
3053 CompiledCaptureElement::NonCapture(m) => m,
3054 };
3055
3056 if let Some((rel_start, rel_end)) =
3057 matcher.find(safe_slice(text, pos).unwrap_or(""))
3058 {
3059 if rel_start != 0 {
3060 all_matched = false;
3062 break;
3063 }
3064 pos += rel_end;
3065 } else {
3066 all_matched = false;
3067 break;
3068 }
3069 }
3070
3071 if all_matched {
3072 matches.push((start_pos, pos));
3073 start_pos = pos.max(start_pos + 1); } else {
3075 start_pos += 1;
3076 }
3077 }
3078
3079 matches
3080 }
3081 Matcher::Backreference(_) => {
3082 vec![]
3084 }
3085 Matcher::DFA(dfa) => {
3086 let mut matches = Vec::new();
3088 let mut search_start = 0;
3089
3090 while search_start < text.len() {
3091 if let Some((start, end)) = dfa.find(&text[search_start..]) {
3092 let abs_start = search_start + start;
3093 let abs_end = search_start + end;
3094 matches.push((abs_start, abs_end));
3095 search_start = abs_end.max(abs_start + 1);
3096 } else {
3097 break;
3098 }
3099 }
3100
3101 matches
3102 }
3103 Matcher::LazyDFA(lazy_dfa) => {
3104 let mut dfa = lazy_dfa.clone();
3106 let mut matches = Vec::new();
3107 let mut search_start = 0;
3108
3109 while search_start < text.len() {
3110 if let Some((start, end)) = dfa.find(&text[search_start..]) {
3111 let abs_start = search_start + start;
3112 let abs_end = search_start + end;
3113 matches.push((abs_start, abs_end));
3114 search_start = abs_end.max(abs_start + 1);
3115 } else {
3116 break;
3117 }
3118 }
3119
3120 matches
3121 }
3122 Matcher::SequenceWithFlags(seq, flags) => {
3123 let mut matches = Vec::new();
3125 let mut search_start = 0;
3126
3127 while search_start < text.len() {
3128 if let Some((start, end)) = seq.find_with_flags(&text[search_start..], flags) {
3129 let abs_start = search_start + start;
3130 let abs_end = search_start + end;
3131 matches.push((abs_start, abs_end));
3132 search_start = abs_end.max(abs_start + 1);
3133 } else {
3134 break;
3135 }
3136 }
3137
3138 matches
3139 }
3140 Matcher::AlternationWithCaptures { branches, .. } => {
3141 let mut matches = Vec::new();
3143 let mut search_start = 0;
3144
3145 while search_start < text.len() {
3146 let mut best_match: Option<(usize, usize)> = None;
3148
3149 for branch in branches {
3150 if let Some((start, end)) = branch.find(&text[search_start..]) {
3151 let abs_start = search_start + start;
3152 let abs_end = search_start + end;
3153
3154 if best_match.is_none() || abs_start < best_match.unwrap().0 {
3155 best_match = Some((abs_start, abs_end));
3156 }
3157 }
3158 }
3159
3160 if let Some((start, end)) = best_match {
3161 matches.push((start, end));
3162 search_start = end.max(start + 1);
3163 } else {
3164 break;
3165 }
3166 }
3167
3168 matches
3169 }
3170 Matcher::CaseInsensitive(inner) => {
3171 let bytes = text.as_bytes();
3172 let len = bytes.len();
3173 if len <= 256 {
3174 let mut buf = [0u8; 256];
3175 let mut all_ascii = true;
3176 for i in 0..len {
3177 let b = bytes[i];
3178 if b >= 128 {
3179 all_ascii = false;
3180 break;
3181 }
3182 buf[i] = if b >= b'A' && b <= b'Z' { b + 32 } else { b };
3183 }
3184 if all_ascii {
3185 let lower = unsafe { std::str::from_utf8_unchecked(&buf[..len]) };
3186 return inner.find_all(lower);
3187 }
3188 }
3189 let lower_text = text.to_lowercase();
3190 inner.find_all(&lower_text)
3191 }
3192 }
3193 }
3194
3195 #[inline]
3197 fn digit_run_find_all(text: &str) -> Vec<(usize, usize)> {
3198 let bytes = text.as_bytes();
3199 let mut matches = Vec::new();
3200 let mut i = 0;
3201
3202 while i < bytes.len() {
3203 while i < bytes.len() && (bytes[i] < b'0' || bytes[i] > b'9') {
3205 i += 1;
3206 }
3207
3208 if i >= bytes.len() {
3209 break;
3210 }
3211
3212 let start = i;
3214
3215 while i < bytes.len() && bytes[i] >= b'0' && bytes[i] <= b'9' {
3217 i += 1;
3218 }
3219
3220 matches.push((start, i));
3221 }
3222
3223 matches
3224 }
3225
3226 #[inline]
3228 fn word_run_find_all(text: &str) -> Vec<(usize, usize)> {
3229 let bytes = text.as_bytes();
3230 let mut matches = Vec::new();
3231 let mut i = 0;
3232
3233 while i < bytes.len() {
3234 while i < bytes.len() {
3236 let b = bytes[i];
3237 if b.is_ascii_lowercase()
3238 || b.is_ascii_uppercase()
3239 || b.is_ascii_digit()
3240 || b == b'_'
3241 {
3242 break;
3243 }
3244 i += 1;
3245 }
3246
3247 if i >= bytes.len() {
3248 break;
3249 }
3250
3251 let start = i;
3253
3254 while i < bytes.len() {
3256 let b = bytes[i];
3257 if !(b.is_ascii_lowercase()
3258 || b.is_ascii_uppercase()
3259 || b.is_ascii_digit()
3260 || b == b'_')
3261 {
3262 break;
3263 }
3264 i += 1;
3265 }
3266
3267 matches.push((start, i));
3268 }
3269
3270 matches
3271 }
3272}
3273
3274fn compile_ast(ast: &Ast) -> Result<Matcher, PatternError> {
3275 match ast {
3276 Ast::Literal(lit) => Ok(Matcher::Literal(lit.clone())),
3277 Ast::Dot => {
3278 use crate::parser::charclass::CharClass;
3281 let char_class = CharClass::parse(r"^\n")
3282 .map_err(|e| PatternError::ParseError(format!("Dot charclass: {}", e)))?;
3283 Ok(Matcher::CharClass(char_class))
3284 }
3285 Ast::Alternation(parts) => {
3286 use aho_corasick::MatchKind;
3287 let ac = AhoCorasick::builder()
3288 .match_kind(MatchKind::LeftmostFirst)
3289 .build(parts)
3290 .map_err(|e| PatternError::ParseError(format!("Aho-Corasick: {}", e)))?;
3291 Ok(Matcher::MultiLiteral(ac))
3292 }
3293 Ast::Anchored {
3294 literal,
3295 start,
3296 end,
3297 } => Ok(Matcher::AnchoredLiteral {
3298 literal: literal.clone(),
3299 start: *start,
3300 end: *end,
3301 }),
3302 Ast::AnchoredGroup { group, start, end } => Ok(Matcher::AnchoredGroup {
3303 group: group.clone(),
3304 start: *start,
3305 end: *end,
3306 }),
3307 Ast::AnchoredPattern { inner, start, end } => {
3308 let inner_matcher = compile_ast(inner)?;
3309 Ok(Matcher::AnchoredPattern {
3310 inner: Box::new(inner_matcher),
3311 start: *start,
3312 end: *end,
3313 })
3314 }
3315 Ast::CharClass(cc) => Ok(Matcher::CharClass(cc.clone())),
3316 Ast::Quantified(qp) => {
3317 if let crate::parser::quantifier::Quantifier::OneOrMore = qp.quantifier {
3319 if let crate::parser::quantifier::QuantifiedElement::CharClass(ref cc) = qp.element
3320 {
3321 if is_digit_charclass(cc) {
3323 return Ok(Matcher::DigitRun);
3324 }
3325 if is_word_charclass(cc) {
3327 return Ok(Matcher::WordRun);
3328 }
3329 }
3330 }
3331 Ok(Matcher::Quantified(qp.clone()))
3332 }
3333 Ast::Sequence(seq) => {
3334 if let Some(dfa) = engine::dfa::DFA::try_compile(seq) {
3342 return Ok(Matcher::DFA(dfa));
3343 }
3344 Ok(Matcher::Sequence(seq.clone()))
3346 }
3347 Ast::Group(group) => Ok(Matcher::Group(group.clone())),
3348 Ast::Boundary(boundary_type) => Ok(Matcher::Boundary(*boundary_type)),
3349 Ast::Lookaround(lookaround) => {
3350 let inner_matcher = compile_ast(&lookaround.pattern)?;
3352 Ok(Matcher::Lookaround(
3353 Box::new(lookaround.clone()),
3354 Box::new(inner_matcher),
3355 ))
3356 }
3357 Ast::Capture(inner_ast, group_index) => {
3358 let inner_matcher = compile_ast(inner_ast)?;
3360 Ok(Matcher::Capture(Box::new(inner_matcher), *group_index))
3361 }
3362 Ast::QuantifiedCapture(inner_ast, quantifier) => {
3363 let inner_matcher = compile_ast(inner_ast)?;
3365 Ok(Matcher::QuantifiedCapture(
3366 Box::new(inner_matcher),
3367 quantifier.clone(),
3368 ))
3369 }
3370 Ast::CombinedWithLookaround { prefix, lookaround } => {
3371 let prefix_matcher = compile_ast(prefix)?;
3373 let lookaround_inner = compile_ast(&lookaround.pattern)?;
3374 Ok(Matcher::CombinedWithLookaround {
3375 prefix: Box::new(prefix_matcher),
3376 lookaround: Box::new(lookaround.clone()),
3377 lookaround_matcher: Box::new(lookaround_inner),
3378 })
3379 }
3380 Ast::LookbehindWithSuffix { lookbehind, suffix } => {
3381 let lookbehind_inner = compile_ast(&lookbehind.pattern)?;
3383 let suffix_matcher = compile_ast(suffix)?;
3384 Ok(Matcher::LookbehindWithSuffix {
3385 lookbehind: Box::new(lookbehind.clone()),
3386 lookbehind_matcher: Box::new(lookbehind_inner),
3387 suffix: Box::new(suffix_matcher),
3388 })
3389 }
3390 Ast::PatternWithCaptures {
3391 elements,
3392 total_groups,
3393 } => {
3394 let mut compiled_elements = Vec::new();
3396 for elem in elements {
3397 match elem {
3398 CaptureElement::Capture(ast, group_num) => {
3399 let matcher = compile_ast(ast)?;
3400 compiled_elements
3401 .push(CompiledCaptureElement::Capture(matcher, *group_num));
3402 }
3403 CaptureElement::NonCapture(ast) => {
3404 let matcher = compile_ast(ast)?;
3405 compiled_elements.push(CompiledCaptureElement::NonCapture(matcher));
3406 }
3407 }
3408 }
3409 Ok(Matcher::PatternWithCaptures {
3410 elements: compiled_elements,
3411 total_groups: *total_groups,
3412 })
3413 }
3414 Ast::AlternationWithCaptures {
3415 branches,
3416 total_groups,
3417 } => {
3418 let mut compiled_branches = Vec::new();
3420 for branch_ast in branches {
3421 let branch_matcher = compile_ast(branch_ast)?;
3422 compiled_branches.push(branch_matcher);
3423 }
3424 Ok(Matcher::AlternationWithCaptures {
3425 branches: compiled_branches,
3426 total_groups: *total_groups,
3427 })
3428 }
3429 Ast::Backreference(group_num) => Ok(Matcher::Backreference(*group_num)),
3430 Ast::DotAll => {
3431 use crate::parser::charclass::CharClass;
3434 let mut char_class = CharClass::new();
3436 char_class.add_range('\0', char::MAX); char_class.finalize();
3438 Ok(Matcher::CharClass(char_class))
3439 }
3440 Ast::SequenceWithFlags(seq, flags) => {
3441 Ok(Matcher::SequenceWithFlags(seq.clone(), *flags))
3443 }
3444 Ast::CaseInsensitive(inner) => {
3445 let lowercased = lowercase_ast(inner);
3447 let inner_matcher = compile_ast(&lowercased)?;
3448 Ok(Matcher::CaseInsensitive(Box::new(inner_matcher)))
3449 }
3450 }
3451}
3452
3453fn lowercase_ast(ast: &Ast) -> Ast {
3455 match ast {
3456 Ast::Literal(s) => Ast::Literal(s.to_lowercase()),
3457 Ast::Alternation(branches) => {
3458 Ast::Alternation(branches.iter().map(|s| s.to_lowercase()).collect())
3459 }
3460 Ast::Group(g) => {
3461 let mut new_group = g.clone();
3463 new_group.content = match &g.content {
3464 parser::group::GroupContent::Single(s) => {
3465 parser::group::GroupContent::Single(s.to_lowercase())
3466 }
3467 parser::group::GroupContent::Alternation(branches) => {
3468 parser::group::GroupContent::Alternation(
3469 branches.iter().map(|s| s.to_lowercase()).collect(),
3470 )
3471 }
3472 parser::group::GroupContent::Sequence(seq) => {
3473 let mut new_seq = seq.clone();
3474 new_seq.elements = new_seq
3475 .elements
3476 .into_iter()
3477 .map(|elem| match elem {
3478 parser::sequence::SequenceElement::Literal(s) => {
3479 parser::sequence::SequenceElement::Literal(s.to_lowercase())
3480 }
3481 parser::sequence::SequenceElement::Char(c) => {
3482 let lowered: String = c.to_lowercase().collect();
3483 if lowered.len() == 1 {
3484 parser::sequence::SequenceElement::Char(
3485 lowered.chars().next().unwrap(),
3486 )
3487 } else {
3488 parser::sequence::SequenceElement::Literal(lowered)
3489 }
3490 }
3491 other => other,
3492 })
3493 .collect();
3494 parser::group::GroupContent::Sequence(new_seq)
3495 }
3496 parser::group::GroupContent::ParsedAlternation(sequences) => {
3497 let new_sequences: Vec<_> = sequences
3498 .iter()
3499 .map(|seq| {
3500 let mut new_seq = seq.clone();
3501 new_seq.elements = new_seq
3502 .elements
3503 .into_iter()
3504 .map(|elem| match elem {
3505 parser::sequence::SequenceElement::Literal(s) => {
3506 parser::sequence::SequenceElement::Literal(s.to_lowercase())
3507 }
3508 parser::sequence::SequenceElement::Char(c) => {
3509 let lowered: String = c.to_lowercase().collect();
3510 if lowered.len() == 1 {
3511 parser::sequence::SequenceElement::Char(
3512 lowered.chars().next().unwrap(),
3513 )
3514 } else {
3515 parser::sequence::SequenceElement::Literal(lowered)
3516 }
3517 }
3518 other => other,
3519 })
3520 .collect();
3521 new_seq
3522 })
3523 .collect();
3524 parser::group::GroupContent::ParsedAlternation(new_sequences)
3525 }
3526 };
3527 Ast::Group(new_group)
3528 }
3529 Ast::Sequence(seq) => {
3530 let new_elements: Vec<_> = seq
3533 .elements
3534 .iter()
3535 .map(|elem| match elem {
3536 parser::sequence::SequenceElement::Literal(s) => {
3537 parser::sequence::SequenceElement::Literal(s.to_lowercase())
3538 }
3539 parser::sequence::SequenceElement::Char(c) => {
3540 let lowered: String = c.to_lowercase().collect();
3541 if lowered.len() == 1 {
3542 parser::sequence::SequenceElement::Char(lowered.chars().next().unwrap())
3543 } else {
3544 parser::sequence::SequenceElement::Literal(lowered)
3545 }
3546 }
3547 other => other.clone(),
3548 })
3549 .collect();
3550 Ast::Sequence(parser::sequence::Sequence::new(new_elements))
3551 }
3552 Ast::Anchored {
3553 literal,
3554 start,
3555 end,
3556 } => Ast::Anchored {
3557 literal: literal.to_lowercase(),
3558 start: *start,
3559 end: *end,
3560 },
3561 Ast::AnchoredPattern { inner, start, end } => Ast::AnchoredPattern {
3562 inner: Box::new(lowercase_ast(inner)),
3563 start: *start,
3564 end: *end,
3565 },
3566 Ast::SequenceWithFlags(seq, flags) => {
3567 let mut new_seq = seq.clone();
3568 new_seq.elements = new_seq
3569 .elements
3570 .into_iter()
3571 .map(|elem| match elem {
3572 parser::sequence::SequenceElement::Literal(s) => {
3573 parser::sequence::SequenceElement::Literal(s.to_lowercase())
3574 }
3575 parser::sequence::SequenceElement::Char(c) => {
3576 let lowered: String = c.to_lowercase().collect();
3577 if lowered.len() == 1 {
3578 parser::sequence::SequenceElement::Char(lowered.chars().next().unwrap())
3579 } else {
3580 parser::sequence::SequenceElement::Literal(lowered)
3581 }
3582 }
3583 other => other,
3584 })
3585 .collect();
3586 Ast::SequenceWithFlags(new_seq, *flags)
3587 }
3588 Ast::CaseInsensitive(inner) => {
3589 Ast::CaseInsensitive(Box::new(lowercase_ast(inner)))
3591 }
3592 Ast::PatternWithCaptures {
3593 elements,
3594 total_groups,
3595 } => {
3596 let new_elements = elements
3598 .iter()
3599 .map(|elem| match elem {
3600 CaptureElement::NonCapture(ast) => {
3601 CaptureElement::NonCapture(lowercase_ast(ast))
3602 }
3603 CaptureElement::Capture(ast, group_num) => {
3604 CaptureElement::Capture(lowercase_ast(ast), *group_num)
3605 }
3606 })
3607 .collect();
3608 Ast::PatternWithCaptures {
3609 elements: new_elements,
3610 total_groups: *total_groups,
3611 }
3612 }
3613 Ast::AlternationWithCaptures {
3614 branches,
3615 total_groups,
3616 } => Ast::AlternationWithCaptures {
3617 branches: branches.iter().map(lowercase_ast).collect(),
3618 total_groups: *total_groups,
3619 },
3620 Ast::Capture(inner, group_index) => {
3621 Ast::Capture(Box::new(lowercase_ast(inner)), *group_index)
3623 }
3624 _ => ast.clone(),
3626 }
3627}
3628
3629fn quantifier_bounds(q: &parser::quantifier::Quantifier) -> (usize, usize) {
3631 use parser::quantifier::Quantifier;
3632 match q {
3633 Quantifier::ZeroOrMore | Quantifier::ZeroOrMoreLazy => (0, usize::MAX),
3634 Quantifier::OneOrMore | Quantifier::OneOrMoreLazy => (1, usize::MAX),
3635 Quantifier::ZeroOrOne | Quantifier::ZeroOrOneLazy => (0, 1),
3636 Quantifier::Exactly(n) => (*n, *n),
3637 Quantifier::AtLeast(n) => (*n, usize::MAX),
3638 Quantifier::Between(n, m) => (*n, *m),
3639 }
3640}
3641
3642fn is_digit_charclass(cc: &CharClass) -> bool {
3644 cc.ranges.len() == 1 && cc.ranges[0] == ('0', '9') && cc.chars.is_empty() && !cc.negated
3646}
3647
3648fn is_word_charclass(cc: &CharClass) -> bool {
3650 if cc.negated || cc.ranges.len() != 3 {
3652 return false;
3653 }
3654
3655 let mut has_lower = false;
3656 let mut has_upper = false;
3657 let mut has_digit = false;
3658
3659 for &(start, end) in &cc.ranges {
3660 if start == 'a' && end == 'z' {
3661 has_lower = true;
3662 } else if start == 'A' && end == 'Z' {
3663 has_upper = true;
3664 } else if start == '0' && end == '9' {
3665 has_digit = true;
3666 }
3667 }
3668
3669 has_lower && has_upper && has_digit && cc.chars.len() == 1 && cc.chars[0] == '_'
3670}
3671
3672fn parse_lookaround(pattern: &str, depth: usize) -> Result<Ast, PatternError> {
3674 let lookaround_type = if pattern.starts_with("(?=") {
3675 LookaroundType::PositiveLookahead
3676 } else if pattern.starts_with("(?!") {
3677 LookaroundType::NegativeLookahead
3678 } else if pattern.starts_with("(?<=") {
3679 LookaroundType::PositiveLookbehind
3680 } else if pattern.starts_with("(?<!") {
3681 LookaroundType::NegativeLookbehind
3682 } else {
3683 return Err(PatternError::ParseError(
3684 "Invalid lookaround syntax".to_string(),
3685 ));
3686 };
3687
3688 let prefix_len = if pattern.starts_with("(?<=") || pattern.starts_with("(?<!") {
3690 4 } else {
3692 3 };
3694
3695 if let Some(close_idx) = find_matching_paren(pattern, 0) {
3696 let inner = &pattern[prefix_len..close_idx];
3697 let inner_ast = parse_pattern_with_depth(inner, depth + 1)?;
3698
3699 if close_idx != pattern.len() - 1 {
3701 let suffix = &pattern[close_idx + 1..];
3703 let suffix_ast = parse_pattern_with_depth(suffix, depth + 1)?;
3704
3705 if matches!(
3708 lookaround_type,
3709 LookaroundType::PositiveLookbehind | LookaroundType::NegativeLookbehind
3710 ) {
3711 let lookbehind = Lookaround::new(lookaround_type, inner_ast);
3712 return Ok(Ast::LookbehindWithSuffix {
3713 lookbehind,
3714 suffix: Box::new(suffix_ast),
3715 });
3716 } else {
3717 return Err(PatternError::ParseError(
3718 "Lookahead cannot have suffix pattern after it".to_string(),
3719 ));
3720 }
3721 }
3722
3723 Ok(Ast::Lookaround(Lookaround::new(lookaround_type, inner_ast)))
3724 } else {
3725 Err(PatternError::ParseError(
3726 "Unmatched parenthesis in lookaround".to_string(),
3727 ))
3728 }
3729}
3730
3731fn parse_combined_with_lookaround(pattern: &str, depth: usize) -> Result<Ast, PatternError> {
3733 let lookaround_patterns = ["(?=", "(?!", "(?<=", "(?<!"];
3735
3736 for lookaround_start in lookaround_patterns {
3737 if let Some(pos) = pattern.find(lookaround_start) {
3738 if pos == 0 {
3739 continue;
3741 }
3742
3743 let prefix = &pattern[..pos];
3745 let lookaround_part = &pattern[pos..];
3746
3747 let prefix_ast = parse_pattern_with_depth(prefix, depth + 1)?;
3749
3750 let lookaround_type = if lookaround_start == "(?=" {
3752 LookaroundType::PositiveLookahead
3753 } else if lookaround_start == "(?!" {
3754 LookaroundType::NegativeLookahead
3755 } else if lookaround_start == "(?<=" {
3756 LookaroundType::PositiveLookbehind
3757 } else {
3758 LookaroundType::NegativeLookbehind
3759 };
3760
3761 let prefix_len = lookaround_start.len();
3762 if let Some(close_idx) = find_matching_paren(lookaround_part, 0) {
3763 if close_idx != lookaround_part.len() - 1 {
3764 return Err(PatternError::ParseError(
3765 "Extra characters after lookaround".to_string(),
3766 ));
3767 }
3768
3769 let inner = &lookaround_part[prefix_len..close_idx];
3770 let inner_ast = parse_pattern_with_depth(inner, depth + 1)?;
3771
3772 let lookaround = Lookaround::new(lookaround_type, inner_ast);
3773
3774 return Ok(Ast::CombinedWithLookaround {
3775 prefix: Box::new(prefix_ast),
3776 lookaround,
3777 });
3778 } else {
3779 return Err(PatternError::ParseError(
3780 "Unmatched parenthesis in lookaround".to_string(),
3781 ));
3782 }
3783 }
3784 }
3785
3786 Err(PatternError::ParseError(
3787 "No lookaround found in pattern".to_string(),
3788 ))
3789}
3790
3791fn contains_unescaped_paren(pattern: &str) -> bool {
3795 let bytes = pattern.as_bytes();
3796 let mut i = 0;
3797 while i < bytes.len() {
3798 if bytes[i] == b'\\' && i + 1 < bytes.len() {
3799 i += 2; } else if bytes[i] == b'[' {
3801 i += 1;
3803 if i < bytes.len() && bytes[i] == b'^' {
3804 i += 1;
3805 }
3806 while i < bytes.len() {
3807 if bytes[i] == b'\\' {
3808 i += 2;
3809 } else if bytes[i] == b']' {
3810 i += 1;
3811 break;
3812 } else {
3813 i += 1;
3814 }
3815 }
3816 } else if bytes[i] == b'(' || bytes[i] == b')' {
3817 return true;
3818 } else {
3819 i += 1;
3820 }
3821 }
3822 false
3823}
3824
3825fn find_matching_paren(pattern: &str, start: usize) -> Option<usize> {
3826 let bytes = pattern.as_bytes();
3827 if start >= bytes.len() || bytes[start] != b'(' {
3828 return None;
3829 }
3830
3831 let mut depth = 0;
3832 let mut i = 0;
3833 while i < bytes[start..].len() {
3834 match bytes[start + i] {
3835 b'\\' => {
3836 i += 2;
3838 continue;
3839 }
3840 b'[' => {
3841 i += 1;
3843 if i < bytes[start..].len() && bytes[start + i] == b'^' {
3845 i += 1;
3846 }
3847 while i < bytes[start..].len() {
3849 if bytes[start + i] == b'\\' {
3850 i += 2; } else if bytes[start + i] == b']' {
3852 i += 1;
3853 break;
3854 } else {
3855 i += 1;
3856 }
3857 }
3858 continue;
3859 }
3860 b'(' => depth += 1,
3861 b')' => {
3862 depth -= 1;
3863 if depth == 0 {
3864 return Some(start + i);
3865 }
3866 }
3867 _ => {}
3868 }
3869 i += 1;
3870 }
3871
3872 None }
3874
3875fn parse_pattern_with_captures(pattern: &str) -> Result<Ast, PatternError> {
3878 let mut group_counter = 1;
3879 let (ast, _total_groups) = parse_pattern_with_captures_inner(pattern, &mut group_counter)?;
3880 Ok(ast)
3881}
3882
3883fn split_by_alternation(pattern: &str) -> Option<Vec<String>> {
3886 let mut branches = Vec::new();
3887 let mut current = String::new();
3888 let mut depth = 0;
3889 let mut chars = pattern.chars().peekable();
3890
3891 while let Some(ch) = chars.next() {
3892 match ch {
3893 '\\' => {
3894 current.push(ch);
3896 if let Some(next) = chars.next() {
3897 current.push(next);
3898 }
3899 }
3900 '[' => {
3901 current.push(ch);
3903 if chars.peek() == Some(&'^') {
3905 current.push(chars.next().unwrap());
3906 }
3907 while let Some(c) = chars.next() {
3909 current.push(c);
3910 if c == '\\' {
3911 if let Some(next) = chars.next() {
3912 current.push(next);
3913 }
3914 } else if c == ']' {
3915 break;
3916 }
3917 }
3918 }
3919 '(' => {
3920 depth += 1;
3921 current.push(ch);
3922 }
3923 ')' => {
3924 depth -= 1;
3925 current.push(ch);
3926 }
3927 '|' if depth == 0 => {
3928 branches.push(current.clone());
3930 current.clear();
3931 }
3932 _ => {
3933 current.push(ch);
3934 }
3935 }
3936 }
3937
3938 if !current.is_empty() || !branches.is_empty() {
3940 branches.push(current);
3941 }
3942
3943 if branches.len() > 1 {
3945 Some(branches)
3946 } else {
3947 None
3948 }
3949}
3950
3951fn parse_pattern_with_captures_inner(
3953 pattern: &str,
3954 group_counter: &mut usize,
3955) -> Result<(Ast, usize), PatternError> {
3956 if let Some(branches) = split_by_alternation(pattern) {
3958 let _start_group = *group_counter;
3960 let mut parsed_branches = Vec::new();
3961
3962 for branch in branches {
3963 let (branch_ast, _) = parse_pattern_with_captures_inner(&branch, group_counter)?;
3965 parsed_branches.push(branch_ast);
3966 }
3967
3968 let total_groups = *group_counter - 1;
3969
3970 let all_literals = parsed_branches
3975 .iter()
3976 .all(|ast| matches!(ast, Ast::Literal(_)));
3977
3978 if all_literals {
3979 let literals: Vec<String> = parsed_branches
3981 .into_iter()
3982 .filter_map(|ast| {
3983 if let Ast::Literal(s) = ast {
3984 Some(s)
3985 } else {
3986 None
3987 }
3988 })
3989 .collect();
3990 return Ok((Ast::Alternation(literals), total_groups));
3991 } else {
3992 let mut sequences = Vec::new();
3995 for branch_ast in &parsed_branches {
3996 if let Ast::Sequence(seq) = branch_ast {
3998 sequences.push(seq.clone());
3999 } else {
4000 break;
4002 }
4003 }
4004
4005 if sequences.len() == parsed_branches.len() {
4006 use crate::parser::group::{Group, GroupContent};
4008 return Ok((
4009 Ast::Group(Group::new_non_capturing(GroupContent::ParsedAlternation(
4010 sequences,
4011 ))),
4012 total_groups,
4013 ));
4014 } else {
4015 return Ok((
4018 Ast::AlternationWithCaptures {
4019 branches: parsed_branches,
4020 total_groups,
4021 },
4022 total_groups,
4023 ));
4024 }
4025 }
4026 }
4027
4028 let mut elements: Vec<CaptureElement> = Vec::new();
4030 let mut pos = 0;
4031 let start_group = *group_counter;
4032
4033 while pos < pattern.len() {
4034 if pattern[pos..].starts_with("(?:") {
4035 if let Some(close_idx) = find_matching_paren(pattern, pos) {
4037 let inner = &pattern[pos + 3..close_idx]; let (inner_ast, _) = parse_pattern_with_captures_inner(inner, group_counter)?;
4040
4041 let mut after_group = close_idx + 1;
4043 let mut quantifier: Option<parser::quantifier::Quantifier> = None;
4044
4045 if after_group < pattern.len() {
4046 let remaining = &pattern[after_group..];
4047 let chars: Vec<char> = remaining.chars().take(2).collect();
4048 if !chars.is_empty() {
4049 let first = chars[0];
4050 let has_lazy = chars.len() > 1 && chars[1] == '?';
4051
4052 match first {
4053 '*' if has_lazy => {
4054 quantifier = Some(parser::quantifier::Quantifier::ZeroOrMoreLazy);
4055 after_group += 2;
4056 }
4057 '*' => {
4058 quantifier = Some(parser::quantifier::Quantifier::ZeroOrMore);
4059 after_group += 1;
4060 }
4061 '+' if has_lazy => {
4062 quantifier = Some(parser::quantifier::Quantifier::OneOrMoreLazy);
4063 after_group += 2;
4064 }
4065 '+' => {
4066 quantifier = Some(parser::quantifier::Quantifier::OneOrMore);
4067 after_group += 1;
4068 }
4069 '?' if has_lazy => {
4070 quantifier = Some(parser::quantifier::Quantifier::ZeroOrOneLazy);
4071 after_group += 2;
4072 }
4073 '?' => {
4074 quantifier = Some(parser::quantifier::Quantifier::ZeroOrOne);
4075 after_group += 1;
4076 }
4077 _ => {}
4078 }
4079 }
4080 }
4081
4082 if let Some(q) = quantifier {
4084 elements.push(CaptureElement::NonCapture(Ast::QuantifiedCapture(
4086 Box::new(inner_ast),
4087 q,
4088 )));
4089 } else {
4090 elements.push(CaptureElement::NonCapture(inner_ast));
4091 }
4092 pos = after_group;
4093 } else {
4094 return Err(PatternError::ParseError(
4095 "Unmatched parenthesis".to_string(),
4096 ));
4097 }
4098 } else if pattern[pos..].starts_with('(') && !pattern[pos..].starts_with("(?") {
4099 if let Some(close_idx) = find_matching_paren(pattern, pos) {
4101 let my_group_num = *group_counter;
4102 *group_counter += 1;
4103
4104 let inner = &pattern[pos + 1..close_idx];
4106 let (inner_ast, _) = parse_pattern_with_captures_inner(inner, group_counter)?;
4107
4108 let mut after_group = close_idx + 1;
4110 let mut quantifier: Option<parser::quantifier::Quantifier> = None;
4111
4112 if after_group < pattern.len() {
4113 let remaining = &pattern[after_group..];
4114 let chars: Vec<char> = remaining.chars().take(2).collect();
4115 if !chars.is_empty() {
4116 let first = chars[0];
4117 let has_lazy = chars.len() > 1 && chars[1] == '?';
4118
4119 match first {
4120 '*' if has_lazy => {
4121 quantifier = Some(parser::quantifier::Quantifier::ZeroOrMoreLazy);
4122 after_group += 2;
4123 }
4124 '*' => {
4125 quantifier = Some(parser::quantifier::Quantifier::ZeroOrMore);
4126 after_group += 1;
4127 }
4128 '+' if has_lazy => {
4129 quantifier = Some(parser::quantifier::Quantifier::OneOrMoreLazy);
4130 after_group += 2;
4131 }
4132 '+' => {
4133 quantifier = Some(parser::quantifier::Quantifier::OneOrMore);
4134 after_group += 1;
4135 }
4136 '?' if has_lazy => {
4137 quantifier = Some(parser::quantifier::Quantifier::ZeroOrOneLazy);
4138 after_group += 2;
4139 }
4140 '?' => {
4141 quantifier = Some(parser::quantifier::Quantifier::ZeroOrOne);
4142 after_group += 1;
4143 }
4144 _ => {}
4145 }
4146 }
4147 }
4148
4149 if let Some(q) = quantifier {
4151 elements.push(CaptureElement::Capture(
4152 Ast::QuantifiedCapture(Box::new(inner_ast), q),
4153 my_group_num,
4154 ));
4155 } else {
4156 elements.push(CaptureElement::Capture(inner_ast, my_group_num));
4157 }
4158 pos = after_group;
4159 } else {
4160 return Err(PatternError::ParseError(
4161 "Unmatched parenthesis".to_string(),
4162 ));
4163 }
4164 } else {
4165 if pattern[pos..].starts_with('\\') && pos + 1 < pattern.len() {
4167 let next_char = pattern.chars().nth(pos + 1);
4168 if let Some(ch) = next_char {
4169 if ch.is_ascii_digit() {
4170 let digit = ch.to_digit(10).unwrap() as usize;
4172 elements.push(CaptureElement::NonCapture(Ast::Backreference(digit)));
4173 pos += 2; continue;
4175 }
4176 }
4177 }
4178
4179 let next_paren = {
4182 let mut search_pos = pos;
4183 let mut result = pattern.len();
4184 let bytes = pattern.as_bytes();
4185 while search_pos < bytes.len() {
4186 if bytes[search_pos] == b'\\' && search_pos + 1 < bytes.len() {
4187 search_pos += 2; } else if bytes[search_pos] == b'[' {
4189 search_pos += 1;
4191 if search_pos < bytes.len() && bytes[search_pos] == b'^' {
4192 search_pos += 1;
4193 }
4194 while search_pos < bytes.len() {
4195 if bytes[search_pos] == b'\\' {
4196 search_pos += 2;
4197 } else if bytes[search_pos] == b']' {
4198 search_pos += 1;
4199 break;
4200 } else {
4201 search_pos += 1;
4202 }
4203 }
4204 } else if bytes[search_pos] == b'(' {
4205 result = search_pos;
4206 break;
4207 } else {
4208 search_pos += 1;
4209 }
4210 }
4211 result
4212 };
4213
4214 let mut search_pos = pos;
4216 let mut next_backref = pattern.len();
4217
4218 while search_pos < pattern.len() {
4219 if pattern[search_pos..].starts_with('\\') && search_pos + 1 < pattern.len() {
4220 let next_ch = pattern.chars().nth(search_pos + 1);
4221 if next_ch.map(|c| c.is_ascii_digit()).unwrap_or(false) {
4222 next_backref = search_pos;
4223 break;
4224 }
4225 search_pos += 2; } else {
4227 search_pos += 1;
4228 }
4229 }
4230
4231 let next_boundary = next_paren.min(next_backref);
4233
4234 if next_boundary > pos {
4235 let segment = &pattern[pos..next_boundary];
4237
4238 let segment_ast = if segment.is_empty() {
4240 Ast::Literal(String::new())
4241 } else {
4242 parse_pattern(segment)?
4244 };
4245
4246 elements.push(CaptureElement::NonCapture(segment_ast));
4247 pos = next_boundary;
4248 } else {
4249 pos += 1;
4251 }
4252 }
4253 }
4254
4255 let total_groups = *group_counter - 1;
4256
4257 if elements.len() == 1 {
4259 if let CaptureElement::Capture(ast, num) = &elements[0] {
4260 return Ok((Ast::Capture(Box::new(ast.clone()), *num), total_groups));
4261 }
4262 }
4263
4264 Ok((
4266 Ast::PatternWithCaptures {
4267 elements,
4268 total_groups,
4269 },
4270 *group_counter - start_group,
4271 ))
4272}
4273
4274#[derive(Debug, Clone, PartialEq)]
4276enum CaptureElement {
4277 Capture(Ast, usize), NonCapture(Ast), }
4280
4281#[cfg(test)]
4282mod tests {
4283 use super::*;
4284
4285 #[test]
4286 fn literal() {
4287 let p = Pattern::new("hello").unwrap();
4288 assert!(p.is_match("hello world"));
4289 assert!(!p.is_match("goodbye"));
4290 }
4291
4292 #[test]
4293 fn alternation() {
4294 let p = Pattern::new("foo|bar|baz").unwrap();
4295 assert!(p.is_match("foo"));
4296 assert!(p.is_match("bar"));
4297 assert!(!p.is_match("qux"));
4298 }
4299
4300 #[test]
4301 fn anchors() {
4302 let p = Pattern::new("^hello$").unwrap();
4303 assert!(p.is_match("hello"));
4304 assert!(!p.is_match("hello world"));
4305 }
4306
4307 #[test]
4308 fn find_test() {
4309 let p = Pattern::new("world").unwrap();
4310 assert_eq!(p.find("hello world"), Some((6, 11)));
4311 }
4312
4313 #[test]
4314 fn cached() {
4315 assert!(is_match("test", "this is a test").unwrap());
4316 }
4317}
4318
4319#[test]
4320fn char_class_simple() {
4321 let p = Pattern::new("[abc]").unwrap();
4322 assert!(p.is_match("a"));
4323 assert!(p.is_match("apple"));
4324 assert!(p.is_match("cab"));
4325 assert!(!p.is_match("xyz"));
4326}
4327
4328#[test]
4329fn char_class_range() {
4330 let p = Pattern::new("[a-z]").unwrap();
4331 assert!(p.is_match("hello"));
4332 assert!(p.is_match("xyz"));
4333 assert!(!p.is_match("HELLO"));
4334 assert!(!p.is_match("123"));
4335}
4336
4337#[test]
4338fn char_class_multiple_ranges() {
4339 let p = Pattern::new("[a-zA-Z0-9]").unwrap();
4340 assert!(p.is_match("hello"));
4341 assert!(p.is_match("WORLD"));
4342 assert!(p.is_match("test123"));
4343 assert!(!p.is_match("!!!"));
4344}
4345
4346#[test]
4347fn char_class_negated() {
4348 let p = Pattern::new("[^0-9]").unwrap();
4349 assert!(p.is_match("abc"));
4350 assert!(!p.is_match("123"));
4351 assert!(p.is_match("a1b")); }
4353
4354#[test]
4355fn char_class_find() {
4356 let p = Pattern::new("[0-9]").unwrap();
4357 assert_eq!(p.find("abc123"), Some((3, 4))); let matches = p.find_all("a1b2c3");
4360 assert_eq!(matches, vec![(1, 2), (3, 4), (5, 6)]);
4361}
4362
4363#[test]
4364fn debug_parse_group() {
4365 let pattern = "(foo|bar)+";
4366 match parser::group::parse_group(pattern) {
4367 Ok((group, bytes_consumed)) => {
4368 eprintln!("bytes_consumed: {}", bytes_consumed);
4369 eprintln!("pattern.len(): {}", pattern.len());
4370 eprintln!("group: {:?}", group);
4371 assert_eq!(
4372 bytes_consumed,
4373 pattern.len(),
4374 "Group should consume entire pattern"
4375 );
4376 }
4377 Err(e) => {
4378 panic!("Error: {}", e);
4379 }
4380 }
4381
4382 eprintln!("\n--- Testing Pattern::new ---");
4384 let re = Pattern::new(pattern).unwrap();
4385 eprintln!("Pattern created: {:?}", re);
4386 eprintln!("is_match('foo'): {}", re.is_match("foo"));
4387 eprintln!("is_match('bar'): {}", re.is_match("bar"));
4388 eprintln!("is_match('foobar'): {}", re.is_match("foobar"));
4389}