1#![allow(
7 clippy::needless_range_loop,
8 clippy::match_same_arms,
9 clippy::similar_names,
10 clippy::too_many_lines,
11 clippy::missing_panics_doc,
12 clippy::missing_errors_doc,
13 clippy::items_after_statements,
14 clippy::inline_always,
15 clippy::float_cmp,
16 clippy::allow_attributes,
17 let_underscore_drop
18)]
19
20use std::collections::HashMap;
23
24use memchr::{memchr, memmem};
25
26use super::fuzzy_bridge::FuzzyBridge;
27use super::hash::FxHashMap;
28use super::simd_class::AsciiClassBitmap;
29use crate::ir::{Nfa, State, StateId};
30use crate::parser::Anchor;
31
32#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
34struct ExtendedState {
35 state_id: StateId,
37 literal_offset: Option<usize>,
40}
41
42impl ExtendedState {
43 fn new(state_id: StateId) -> Self {
44 ExtendedState {
45 state_id,
46 literal_offset: None,
47 }
48 }
49
50 fn with_offset(state_id: StateId, offset: usize) -> Self {
51 ExtendedState {
52 state_id,
53 literal_offset: Some(offset),
54 }
55 }
56}
57
58#[derive(Debug, Clone, PartialEq, Eq, Hash)]
60struct NfaStateSet(Vec<ExtendedState>);
61
62impl NfaStateSet {
63 fn new() -> Self {
64 NfaStateSet(Vec::new())
65 }
66
67 fn with_capacity(cap: usize) -> Self {
68 NfaStateSet(Vec::with_capacity(cap))
69 }
70
71 fn insert(&mut self, state: ExtendedState) {
72 if let Err(pos) = self.0.binary_search(&state) {
73 self.0.insert(pos, state);
74 }
75 }
76
77 fn contains(&self, state: &ExtendedState) -> bool {
78 self.0.binary_search(state).is_ok()
79 }
80
81 fn is_empty(&self) -> bool {
82 self.0.is_empty()
83 }
84
85 fn iter(&self) -> impl Iterator<Item = &ExtendedState> {
86 self.0.iter()
87 }
88}
89
90type DfaStateId = u32;
92
93const TRANS_UNKNOWN: u32 = u32::MAX;
95const TRANS_DEAD: u32 = u32::MAX - 1;
97
98#[derive(Clone)]
101struct AsciiTransitions {
102 table: Box<[u32; 128]>,
105}
106
107impl AsciiTransitions {
108 fn new() -> Self {
109 AsciiTransitions {
110 table: Box::new([TRANS_UNKNOWN; 128]),
111 }
112 }
113
114 #[inline]
115 fn get(&self, byte: u8) -> u32 {
116 self.table[byte as usize]
117 }
118
119 #[inline]
120 fn set(&mut self, byte: u8, state: u32) {
121 self.table[byte as usize] = state;
122 }
123}
124
125impl std::fmt::Debug for AsciiTransitions {
126 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
127 let computed_count = self.table.iter().filter(|&&v| v != TRANS_UNKNOWN).count();
129 f.debug_struct("AsciiTransitions")
130 .field("computed_count", &computed_count)
131 .finish()
132 }
133}
134
135#[derive(Debug, Clone)]
137struct DfaState {
138 nfa_states: NfaStateSet,
140 is_accept: bool,
142 has_start_anchor: bool,
144 has_end_anchor: bool,
146 ascii_transitions: AsciiTransitions,
148 unicode_transitions: FxHashMap<char, u32>,
150}
151
152#[derive(Debug, Clone)]
154enum DfaPrefilter {
155 None,
157 SingleByte(u8),
159 TwoBytes(u8, u8),
161 ThreeBytes(u8, u8, u8),
163 ManyBytes(Vec<u8>),
165 Literal(Vec<u8>),
167}
168
169#[derive(Debug)]
171#[allow(clippy::struct_excessive_bools)]
172pub struct Dfa {
173 nfa: Nfa,
175 literal_texts: Vec<String>,
177 states: Vec<DfaState>,
179 state_cache: HashMap<NfaStateSet, DfaStateId>,
181 start: DfaStateId,
183 anchored_start: bool,
185 anchored_end: bool,
187 case_insensitive: bool,
189 multi_line: bool,
191 char_class_bitmaps: Vec<Option<AsciiClassBitmap>>,
194 prefilter: DfaPrefilter,
196}
197
198#[derive(Debug, Clone)]
200pub struct DfaMatch {
201 pub start: usize,
203 pub end: usize,
205}
206
207impl Dfa {
208 #[must_use]
216 pub fn from_nfa(
217 nfa: &Nfa,
218 bridge: Option<&FuzzyBridge>,
219 case_insensitive: bool,
220 multi_line: bool,
221 ) -> Option<Self> {
222 if !Self::is_dfa_compatible(nfa, bridge) {
224 return None;
225 }
226
227 let literal_texts = if let Some(b) = bridge {
229 (0..b.pattern_count())
230 .filter_map(|i| b.pattern_text(i).map(ToString::to_string))
231 .collect()
232 } else {
233 Vec::new()
234 };
235
236 let char_class_bitmaps: Vec<Option<AsciiClassBitmap>> = nfa
238 .states
239 .iter()
240 .map(|state| {
241 if let State::Char { class, .. } = state {
242 Some(AsciiClassBitmap::from_hir_class(class))
243 } else {
244 None
245 }
246 })
247 .collect();
248
249 let prefilter = Self::extract_prefilter(nfa, &literal_texts, case_insensitive);
251
252 let mut dfa = Dfa {
253 nfa: nfa.clone(),
254 literal_texts,
255 states: Vec::new(),
256 state_cache: HashMap::new(),
257 start: 0,
258 anchored_start: false,
259 anchored_end: false,
260 case_insensitive,
261 multi_line,
262 char_class_bitmaps,
263 prefilter,
264 };
265
266 let mut start_set = NfaStateSet::new();
268 dfa.epsilon_closure(nfa.start, &mut start_set);
269
270 dfa.anchored_start = start_set.iter().any(|s| {
272 matches!(
273 &nfa.states[s.state_id],
274 State::Anchor {
275 kind: Anchor::Start,
276 ..
277 }
278 )
279 });
280
281 dfa.anchored_end = nfa.states.iter().any(|s| {
283 matches!(
284 s,
285 State::Anchor {
286 kind: Anchor::End,
287 ..
288 }
289 )
290 });
291
292 let start_id = dfa.get_or_create_state(start_set);
294 dfa.start = start_id;
295
296 Some(dfa)
297 }
298
299 fn extract_prefilter(
301 nfa: &Nfa,
302 literal_texts: &[String],
303 case_insensitive: bool,
304 ) -> DfaPrefilter {
305 let mut prefix = Vec::new();
306 let mut visited = vec![false; nfa.states.len()];
307 let mut current = nfa.start;
308
309 loop {
311 if visited[current] {
312 break;
313 }
314 visited[current] = true;
315
316 match &nfa.states[current] {
317 State::Epsilon { targets } if targets.len() == 1 => {
318 current = targets[0];
319 }
320 State::Anchor { next, .. }
321 | State::CaptureStart { next, .. }
322 | State::CaptureEnd { next, .. } => {
323 current = *next;
324 }
325 State::Char { class, .. } => {
326 if class.chars.len() == 1
328 && class.ranges.is_empty()
329 && class.named.is_empty()
330 && !class.negated
331 {
332 let ch = class.chars[0];
333 if ch.is_ascii() {
334 prefix.push(ch as u8);
335 break;
338 }
339 }
340 break;
341 }
342 State::FuzzyLiteral { pattern_index, .. } => {
343 if let Some(text) = literal_texts.get(*pattern_index) {
345 prefix.extend(text.as_bytes().iter().take(8)); }
347 break;
348 }
349 State::Split { branches, .. } => {
350 let first_bytes =
352 Self::collect_first_bytes_from_branches(nfa, branches, literal_texts);
353 if !first_bytes.is_empty() {
354 return Self::make_multi_byte_prefilter(&first_bytes, case_insensitive);
355 }
356 break;
357 }
358 _ => break,
359 }
360 }
361
362 if prefix.is_empty() {
363 return DfaPrefilter::None;
364 }
365
366 if case_insensitive {
369 let b = prefix[0];
370 if b.is_ascii_alphabetic() {
371 return DfaPrefilter::TwoBytes(b.to_ascii_lowercase(), b.to_ascii_uppercase());
372 }
373 return DfaPrefilter::SingleByte(b);
375 }
376
377 if prefix.len() == 1 {
379 return DfaPrefilter::SingleByte(prefix[0]);
380 }
381
382 DfaPrefilter::Literal(prefix)
384 }
385
386 fn collect_first_bytes_from_branches(
388 nfa: &Nfa,
389 branches: &[StateId],
390 literal_texts: &[String],
391 ) -> Vec<u8> {
392 let mut first_bytes = Vec::new();
393
394 for &branch in branches {
395 if let Some(byte) = Self::get_first_byte(nfa, branch, literal_texts)
396 && !first_bytes.contains(&byte)
397 {
398 first_bytes.push(byte);
399 }
400 }
401
402 first_bytes
403 }
404
405 fn get_first_byte(nfa: &Nfa, start: StateId, literal_texts: &[String]) -> Option<u8> {
407 let mut visited = vec![false; nfa.states.len()];
408 let mut current = start;
409
410 loop {
411 if current >= nfa.states.len() || visited[current] {
412 return None;
413 }
414 visited[current] = true;
415
416 match &nfa.states[current] {
417 State::Epsilon { targets } if !targets.is_empty() => {
418 current = targets[0];
419 }
420 State::CaptureStart { next, .. } | State::CaptureEnd { next, .. } => {
421 current = *next;
422 }
423 State::Char { class, .. } => {
424 if !class.chars.is_empty() {
426 let ch = class.chars[0];
427 let mut buf = [0u8; 4];
429 let encoded = ch.encode_utf8(&mut buf);
430 return Some(encoded.as_bytes()[0]);
431 }
432 if !class.ranges.is_empty() {
433 let (start, _) = class.ranges[0];
434 if start.is_ascii() {
435 return Some(start as u8);
436 }
437 let mut buf = [0u8; 4];
439 let encoded = start.encode_utf8(&mut buf);
440 return Some(encoded.as_bytes()[0]);
441 }
442 return None;
443 }
444 State::FuzzyLiteral { pattern_index, .. } => {
445 if let Some(text) = literal_texts.get(*pattern_index) {
446 return text.as_bytes().first().copied();
447 }
448 return None;
449 }
450 _ => return None,
451 }
452 }
453 }
454
455 fn make_multi_byte_prefilter(bytes: &[u8], case_insensitive: bool) -> DfaPrefilter {
457 if bytes.is_empty() {
458 return DfaPrefilter::None;
459 }
460
461 let expanded: Vec<u8> = if case_insensitive {
463 let mut result = Vec::new();
464 for &b in bytes {
465 if b.is_ascii_alphabetic() {
466 let lower = b.to_ascii_lowercase();
467 let upper = b.to_ascii_uppercase();
468 if !result.contains(&lower) {
469 result.push(lower);
470 }
471 if !result.contains(&upper) {
472 result.push(upper);
473 }
474 } else if !result.contains(&b) {
475 result.push(b);
476 }
477 }
478 result
479 } else {
480 bytes.to_vec()
481 };
482
483 match expanded.len() {
484 0 => DfaPrefilter::None,
485 1 => DfaPrefilter::SingleByte(expanded[0]),
486 2 => DfaPrefilter::TwoBytes(expanded[0], expanded[1]),
487 3 => DfaPrefilter::ThreeBytes(expanded[0], expanded[1], expanded[2]),
488 _ => {
489 DfaPrefilter::ManyBytes(expanded)
491 }
492 }
493 }
494
495 fn is_dfa_compatible(nfa: &Nfa, bridge: Option<&FuzzyBridge>) -> bool {
497 for state in &nfa.states {
498 match state {
499 State::Accept
500 | State::Epsilon { .. }
501 | State::ResetMatchStart { .. }
502 | State::Char { .. }
503 | State::Split { .. }
504 | State::CaptureStart { .. }
505 | State::CaptureEnd { .. } => {}
506
507 State::Anchor { kind, .. } => {
510 use crate::parser::ast::Anchor;
511 match kind {
512 Anchor::Start | Anchor::End => {}
513 Anchor::WordBoundary | Anchor::NotWordBoundary => return false,
514 }
515 }
516
517 State::FuzzyLiteral {
519 limits,
520 pattern_index,
521 ..
522 } => {
523 let max_edits = limits.as_ref().map(|l| {
524 l.get_edits().unwrap_or_else(|| {
525 let i = l.get_insertions().unwrap_or(0);
526 let d = l.get_deletions().unwrap_or(0);
527 let s = l.get_substitutions().unwrap_or(0);
528 let t = l.get_swaps().unwrap_or(0);
529 i.saturating_add(d).saturating_add(s).saturating_add(t)
530 })
531 });
532 if (max_edits.is_none() || max_edits == Some(0))
534 && bridge.is_some_and(|b| b.pattern_text(*pattern_index).is_some())
535 {
536 continue;
537 }
538 return false;
539 }
540
541 State::FuzzyChar { .. }
543 | State::Lookahead { .. }
544 | State::Lookbehind { .. }
545 | State::Backreference { .. }
546 | State::AtomicGroup { .. }
547 | State::RecursivePattern { .. }
548 | State::RecursiveGroup { .. }
549 | State::RecursiveNamedGroup { .. } => return false,
550 }
551 }
552 true
553 }
554
555 fn epsilon_closure(&self, state: StateId, result: &mut NfaStateSet) {
557 self.epsilon_closure_ext(ExtendedState::new(state), result);
558 }
559
560 fn epsilon_closure_ext(&self, ext_state: ExtendedState, result: &mut NfaStateSet) {
562 if result.contains(&ext_state) {
563 return;
564 }
565
566 let state = ext_state.state_id;
567
568 match &self.nfa.states[state] {
569 State::Epsilon { targets } => {
570 result.insert(ext_state);
571 for &target in targets {
572 self.epsilon_closure_ext(ExtendedState::new(target), result);
573 }
574 }
575 State::Split { branches, .. } => {
576 result.insert(ext_state);
577 for &branch in branches {
578 self.epsilon_closure_ext(ExtendedState::new(branch), result);
579 }
580 }
581 State::CaptureStart { next, .. } | State::CaptureEnd { next, .. } => {
582 result.insert(ext_state);
584 self.epsilon_closure_ext(ExtendedState::new(*next), result);
585 }
586 State::Anchor { next, .. } => {
587 result.insert(ext_state);
588 self.epsilon_closure_ext(ExtendedState::new(*next), result);
591 }
592 State::FuzzyLiteral { .. } => {
593 if ext_state.literal_offset.is_none() {
595 result.insert(ExtendedState::with_offset(state, 0));
597 } else {
598 result.insert(ext_state);
600 }
601 }
602 _ => {
603 result.insert(ext_state);
604 }
605 }
606 }
607
608 fn get_or_create_state(&mut self, nfa_states: NfaStateSet) -> DfaStateId {
610 if let Some(&id) = self.state_cache.get(&nfa_states) {
611 return id;
612 }
613
614 let is_accept = nfa_states
615 .iter()
616 .any(|ext| matches!(&self.nfa.states[ext.state_id], State::Accept));
617
618 let has_start_anchor = nfa_states.iter().any(|ext| {
619 matches!(
620 &self.nfa.states[ext.state_id],
621 State::Anchor {
622 kind: Anchor::Start,
623 ..
624 }
625 )
626 });
627
628 let has_end_anchor = nfa_states.iter().any(|ext| {
629 matches!(
630 &self.nfa.states[ext.state_id],
631 State::Anchor {
632 kind: Anchor::End,
633 ..
634 }
635 )
636 });
637
638 let id = self.states.len() as DfaStateId;
639 self.states.push(DfaState {
640 nfa_states: nfa_states.clone(),
641 is_accept,
642 has_start_anchor,
643 has_end_anchor,
644 ascii_transitions: AsciiTransitions::new(),
645 unicode_transitions: FxHashMap::default(),
646 });
647 self.state_cache.insert(nfa_states, id);
648 id
649 }
650
651 #[inline(always)]
654 fn chars_match<const CASE_INSENSITIVE: bool>(text_char: char, pattern_char: char) -> bool {
655 if CASE_INSENSITIVE {
656 if text_char.is_ascii() && pattern_char.is_ascii() {
659 text_char.eq_ignore_ascii_case(&pattern_char)
660 } else {
661 text_char.to_lowercase().eq(pattern_char.to_lowercase())
663 }
664 } else {
665 text_char == pattern_char
666 }
667 }
668
669 #[inline(always)]
672 fn next_state(&mut self, state_id: DfaStateId, ch: char) -> Option<DfaStateId> {
673 if self.case_insensitive {
674 self.next_state_impl::<true>(state_id, ch)
675 } else {
676 self.next_state_impl::<false>(state_id, ch)
677 }
678 }
679
680 #[inline(always)]
683 fn next_state_impl<const CASE_INSENSITIVE: bool>(
684 &mut self,
685 state_id: DfaStateId,
686 ch: char,
687 ) -> Option<DfaStateId> {
688 let state_idx = state_id as usize;
689
690 if ch.is_ascii() {
692 let byte = ch as u8;
693 let cached = self.states[state_idx].ascii_transitions.get(byte);
694 if cached != TRANS_UNKNOWN {
695 return if cached == TRANS_DEAD {
696 None
697 } else {
698 Some(cached)
699 };
700 }
701 } else {
702 if let Some(&cached) = self.states[state_idx].unicode_transitions.get(&ch) {
704 return if cached == TRANS_DEAD {
705 None
706 } else {
707 Some(cached)
708 };
709 }
710 }
711
712 let current = &self.states[state_idx];
714 let mut next_set = NfaStateSet::with_capacity(current.nfa_states.0.len());
715
716 let nfa_states: Vec<_> = current.nfa_states.0.clone();
718
719 for ext_state in nfa_states {
720 let nfa_state = ext_state.state_id;
721 match &self.nfa.states[nfa_state] {
722 State::Char { class, next } => {
723 let matches = if ch.is_ascii() {
727 if let Some(bitmap) = &self.char_class_bitmaps[nfa_state] {
728 if CASE_INSENSITIVE {
729 bitmap.contains(ch as u8)
730 || bitmap.contains((ch as u8).to_ascii_lowercase())
731 || bitmap.contains((ch as u8).to_ascii_uppercase())
732 } else {
733 bitmap.contains(ch as u8)
734 }
735 } else if CASE_INSENSITIVE {
736 class.matches(ch)
737 || class.matches(ch.to_ascii_lowercase())
738 || class.matches(ch.to_ascii_uppercase())
739 } else {
740 class.matches(ch)
741 }
742 } else {
743 if CASE_INSENSITIVE {
745 class.matches(ch)
746 || class.matches(ch.to_ascii_lowercase())
747 || class.matches(ch.to_ascii_uppercase())
748 } else {
749 class.matches(ch)
750 }
751 };
752 if matches {
753 self.epsilon_closure(*next, &mut next_set);
754 }
755 }
756 State::FuzzyLiteral {
757 pattern_index,
758 next,
759 ..
760 } => {
761 let offset = ext_state.literal_offset.unwrap_or(0);
763 if let Some(pattern) = self.literal_texts.get(*pattern_index) {
764 let pattern_chars: Vec<char> = pattern.chars().collect();
765 if offset < pattern_chars.len()
766 && Self::chars_match::<CASE_INSENSITIVE>(ch, pattern_chars[offset])
767 {
768 if offset + 1 == pattern_chars.len() {
770 self.epsilon_closure(*next, &mut next_set);
772 } else {
773 next_set.insert(ExtendedState::with_offset(nfa_state, offset + 1));
775 }
776 }
777 }
778 }
779 _ => {
780 }
783 }
784 }
785
786 let next_id = if next_set.is_empty() {
787 TRANS_DEAD
788 } else {
789 self.get_or_create_state(next_set)
790 };
791
792 if ch.is_ascii() {
794 self.states[state_idx]
795 .ascii_transitions
796 .set(ch as u8, next_id);
797 if CASE_INSENSITIVE {
799 let lower = ch.to_ascii_lowercase() as u8;
800 let upper = ch.to_ascii_uppercase() as u8;
801 if lower != ch as u8 {
802 self.states[state_idx].ascii_transitions.set(lower, next_id);
803 }
804 if upper != ch as u8 {
805 self.states[state_idx].ascii_transitions.set(upper, next_id);
806 }
807 }
808 } else {
809 self.states[state_idx]
810 .unicode_transitions
811 .insert(ch, next_id);
812 if CASE_INSENSITIVE {
814 for variant in ch.to_lowercase() {
815 if variant != ch {
816 self.states[state_idx]
817 .unicode_transitions
818 .insert(variant, next_id);
819 }
820 }
821 for variant in ch.to_uppercase() {
822 if variant != ch {
823 self.states[state_idx]
824 .unicode_transitions
825 .insert(variant, next_id);
826 }
827 }
828 }
829 }
830
831 if next_id == TRANS_DEAD {
832 None
833 } else {
834 Some(next_id)
835 }
836 }
837
838 pub fn find(&mut self, text: &str) -> Option<DfaMatch> {
840 if self.anchored_start && !self.multi_line {
841 self.find_at(text, 0)
843 } else if self.anchored_start && self.multi_line {
844 self.find_multiline_anchored(text)
846 } else {
847 self.find_with_prefilter(text)
849 }
850 }
851
852 fn find_multiline_anchored(&mut self, text: &str) -> Option<DfaMatch> {
855 if let Some(m) = self.find_at(text, 0) {
857 return Some(m);
858 }
859
860 let bytes = text.as_bytes();
862 let mut offset = 0;
863 while let Some(pos) = memchr(b'\n', &bytes[offset..]) {
864 let start = offset + pos + 1; if start < bytes.len()
866 && let Some(m) = self.find_at(text, start)
867 {
868 return Some(m);
869 }
870 offset = start;
871 }
872 None
873 }
874
875 fn find_with_prefilter(&mut self, text: &str) -> Option<DfaMatch> {
877 let bytes = text.as_bytes();
878 let accepts_empty = self.states[self.start as usize].is_accept;
879
880 if accepts_empty && bytes.is_empty() {
886 return Some(DfaMatch { start: 0, end: 0 });
888 }
889
890 if accepts_empty && !self.anchored_end {
893 if let Some(m) = self.find_at(text, 0) {
894 return Some(m);
895 }
896 return Some(DfaMatch { start: 0, end: 0 });
898 }
899
900 let prefilter = self.prefilter.clone();
905
906 match prefilter {
907 DfaPrefilter::None => {
908 self.find_linear(text)
910 }
911 DfaPrefilter::SingleByte(needle) => {
912 let mut offset = 0;
913 while let Some(pos) = memchr(needle, &bytes[offset..]) {
914 let start = offset + pos;
915 if let Some(m) = self.find_at(text, start) {
916 return Some(m);
917 }
918 offset = start + 1;
920 }
921 if accepts_empty {
923 return Some(DfaMatch { start: 0, end: 0 });
924 }
925 None
926 }
927 DfaPrefilter::TwoBytes(needle1, needle2) => {
928 let mut offset = 0;
929 while let Some(pos) = memchr::memchr2(needle1, needle2, &bytes[offset..]) {
930 let start = offset + pos;
931 if let Some(m) = self.find_at(text, start) {
932 return Some(m);
933 }
934 offset = start + 1;
935 }
936 if accepts_empty {
937 return Some(DfaMatch { start: 0, end: 0 });
938 }
939 None
940 }
941 DfaPrefilter::ThreeBytes(needle1, needle2, needle3) => {
942 let mut offset = 0;
943 while let Some(pos) = memchr::memchr3(needle1, needle2, needle3, &bytes[offset..]) {
944 let start = offset + pos;
945 if let Some(m) = self.find_at(text, start) {
946 return Some(m);
947 }
948 offset = start + 1;
949 }
950 if accepts_empty {
951 return Some(DfaMatch { start: 0, end: 0 });
952 }
953 None
954 }
955 DfaPrefilter::ManyBytes(ref needles) => {
956 let mut byte_set = [false; 256];
958 for &b in needles {
959 byte_set[b as usize] = true;
960 }
961 let mut offset = 0;
962 while offset < bytes.len() {
963 if let Some(pos) = bytes[offset..].iter().position(|&b| byte_set[b as usize]) {
965 let start = offset + pos;
966 if let Some(m) = self.find_at(text, start) {
967 return Some(m);
968 }
969 offset = start + 1;
970 } else {
971 break;
972 }
973 }
974 if accepts_empty {
975 return Some(DfaMatch { start: 0, end: 0 });
976 }
977 None
978 }
979 DfaPrefilter::Literal(ref lit) => {
980 let finder = memmem::Finder::new(lit);
981 let mut offset = 0;
982 while let Some(pos) = finder.find(&bytes[offset..]) {
983 let start = offset + pos;
984 if let Some(m) = self.find_at(text, start) {
985 return Some(m);
986 }
987 offset = start + 1;
988 }
989 if accepts_empty {
990 return Some(DfaMatch { start: 0, end: 0 });
991 }
992 None
993 }
994 }
995 }
996
997 fn find_linear(&mut self, text: &str) -> Option<DfaMatch> {
999 for (start_pos, _) in text.char_indices() {
1000 if let Some(m) = self.find_at(text, start_pos) {
1001 return Some(m);
1002 }
1003 }
1004 self.find_at(text, text.len())
1006 }
1007
1008 #[inline(always)]
1011 fn find_at(&mut self, text: &str, start: usize) -> Option<DfaMatch> {
1012 if self.multi_line {
1013 self.find_at_impl::<true>(text, start)
1014 } else {
1015 self.find_at_impl::<false>(text, start)
1016 }
1017 }
1018
1019 #[inline(always)]
1022 fn find_at_impl<const MULTI_LINE: bool>(
1023 &mut self,
1024 text: &str,
1025 start: usize,
1026 ) -> Option<DfaMatch> {
1027 let bytes = text.as_bytes();
1028 let mut state_id = self.start;
1029 let mut last_accept: Option<usize> = None;
1030
1031 let start_anchor_ok = Self::is_start_anchor_satisfied::<MULTI_LINE>(bytes, start);
1033
1034 if self.states[state_id as usize].is_accept {
1036 if !self.anchored_start || start_anchor_ok {
1038 let end_anchor_ok = Self::is_end_anchor_satisfied::<MULTI_LINE>(bytes, start);
1040 if !self.anchored_end || end_anchor_ok {
1041 last_accept = Some(start);
1042 }
1043 }
1044 }
1045
1046 if self.states[state_id as usize].has_start_anchor && !start_anchor_ok {
1048 return None;
1049 }
1050
1051 let mut pos = start;
1052 for ch in text[start..].chars() {
1053 let next = self.next_state(state_id, ch);
1055
1056 match next {
1057 Some(next_id) => {
1058 state_id = next_id;
1059 pos += ch.len_utf8();
1060
1061 if self.states[state_id as usize].is_accept {
1063 let end_anchor_ok = Self::is_end_anchor_satisfied::<MULTI_LINE>(bytes, pos);
1065 if !self.states[state_id as usize].has_end_anchor || end_anchor_ok {
1066 last_accept = Some(pos);
1067 }
1068 }
1069 }
1070 None => {
1071 break;
1073 }
1074 }
1075 }
1076
1077 if self.states[state_id as usize].has_end_anchor {
1079 let end_anchor_ok = Self::is_end_anchor_satisfied::<MULTI_LINE>(bytes, pos);
1080 if end_anchor_ok && self.states[state_id as usize].is_accept {
1081 last_accept = Some(pos);
1082 }
1083 }
1084
1085 last_accept.map(|end| DfaMatch { start, end })
1086 }
1087
1088 #[inline(always)]
1091 fn is_start_anchor_satisfied<const MULTI_LINE: bool>(bytes: &[u8], pos: usize) -> bool {
1092 if pos == 0 {
1093 return true;
1094 }
1095 if MULTI_LINE && bytes[pos - 1] == b'\n' {
1096 return true;
1097 }
1098 false
1099 }
1100
1101 #[inline(always)]
1104 fn is_end_anchor_satisfied<const MULTI_LINE: bool>(bytes: &[u8], pos: usize) -> bool {
1105 if pos == bytes.len() {
1106 return true;
1107 }
1108 if MULTI_LINE && bytes[pos] == b'\n' {
1109 return true;
1110 }
1111 false
1112 }
1113
1114 pub fn find_all(&mut self, text: &str) -> Vec<DfaMatch> {
1116 let mut matches = Vec::new();
1117 let mut pos = 0;
1118
1119 while pos <= text.len() {
1120 if let Some(m) = self.find_at(text, pos) {
1121 let end = m.end;
1122 matches.push(m);
1123 pos = if end > pos {
1125 end
1126 } else {
1127 text[pos..]
1129 .chars()
1130 .next()
1131 .map_or(text.len() + 1, |c| pos + c.len_utf8())
1132 };
1133 } else {
1134 pos = text[pos..]
1136 .chars()
1137 .next()
1138 .map_or(text.len() + 1, |c| pos + c.len_utf8());
1139 }
1140 }
1141
1142 matches
1143 }
1144
1145 pub fn find_n(&mut self, text: &str, n: usize) -> Vec<DfaMatch> {
1147 if n == 0 {
1148 return Vec::new();
1149 }
1150
1151 let mut matches = Vec::with_capacity(n);
1152 let mut pos = 0;
1153
1154 while pos <= text.len() && matches.len() < n {
1155 if let Some(m) = self.find_at(text, pos) {
1156 let end = m.end;
1157 matches.push(m);
1158 pos = if end > pos {
1160 end
1161 } else {
1162 text[pos..]
1164 .chars()
1165 .next()
1166 .map_or(text.len() + 1, |c| pos + c.len_utf8())
1167 };
1168 } else {
1169 pos = text[pos..]
1171 .chars()
1172 .next()
1173 .map_or(text.len() + 1, |c| pos + c.len_utf8());
1174 }
1175 }
1176
1177 matches
1178 }
1179
1180 #[must_use]
1182 pub fn is_anchored_start(&self) -> bool {
1183 self.anchored_start
1184 }
1185
1186 #[must_use]
1188 pub fn is_anchored_end(&self) -> bool {
1189 self.anchored_end
1190 }
1191
1192 #[must_use]
1194 pub fn state_count(&self) -> usize {
1195 self.states.len()
1196 }
1197
1198 fn complete(&mut self) {
1201 let mut i = 0;
1204 while i < self.states.len() {
1205 for byte in 0u8..128 {
1207 let ch = byte as char;
1208 let _ = self.next_state(i as DfaStateId, ch);
1209 }
1210 i += 1;
1211 }
1212 }
1213
1214 #[allow(clippy::mut_range_bound)] #[allow(clippy::needless_range_loop)] pub fn minimize(&mut self) -> usize {
1219 let original_count = self.states.len();
1220 if original_count <= 1 {
1221 return 0;
1222 }
1223
1224 self.complete();
1226
1227 let mut alphabet: Vec<char> = Vec::new();
1229 for byte in 0u8..128 {
1230 alphabet.push(byte as char);
1231 }
1232 for state in &self.states {
1234 for &ch in state.unicode_transitions.keys() {
1235 if !alphabet.contains(&ch) {
1236 alphabet.push(ch);
1237 }
1238 }
1239 }
1240
1241 let mut partition: Vec<usize> = vec![0; self.states.len()];
1244 let mut num_partitions = 0;
1245
1246 let mut partition_map: HashMap<(bool, bool, bool), usize> = HashMap::new();
1248 for (i, state) in self.states.iter().enumerate() {
1249 let key = (
1250 state.is_accept,
1251 state.has_start_anchor,
1252 state.has_end_anchor,
1253 );
1254 let p = *partition_map.entry(key).or_insert_with(|| {
1255 let p = num_partitions;
1256 num_partitions += 1;
1257 p
1258 });
1259 partition[i] = p;
1260 }
1261
1262 let mut changed = true;
1264 while changed {
1265 changed = false;
1266
1267 for p in 0..num_partitions {
1269 let states_in_p: Vec<usize> = partition
1271 .iter()
1272 .enumerate()
1273 .filter(|&(_, part)| *part == p)
1274 .map(|(i, _)| i)
1275 .collect();
1276
1277 if states_in_p.len() <= 1 {
1278 continue;
1279 }
1280
1281 let first = states_in_p[0];
1285 let mut to_split: Vec<usize> = Vec::new();
1286
1287 for &state_idx in &states_in_p[1..] {
1288 let mut same = true;
1289 for &ch in &alphabet {
1290 let t1 = self.get_transition(first, ch);
1291 let t2 = self.get_transition(state_idx, ch);
1292
1293 let p1 = t1.map(|s| partition[s as usize]);
1294 let p2 = t2.map(|s| partition[s as usize]);
1295
1296 if p1 != p2 {
1297 same = false;
1298 break;
1299 }
1300 }
1301 if !same {
1302 to_split.push(state_idx);
1303 }
1304 }
1305
1306 if !to_split.is_empty() {
1308 let new_partition = num_partitions;
1309 num_partitions += 1;
1310 for &state_idx in &to_split {
1311 partition[state_idx] = new_partition;
1312 }
1313 changed = true;
1314 }
1315 }
1316 }
1317
1318 if num_partitions == original_count {
1320 return 0;
1321 }
1322
1323 let mut new_states: Vec<DfaState> = Vec::with_capacity(num_partitions);
1326 let mut old_to_new: Vec<DfaStateId> = vec![0; original_count];
1327
1328 for p in 0..num_partitions {
1330 let repr = partition
1332 .iter()
1333 .enumerate()
1334 .find(|&(_, part)| *part == p)
1335 .map(|(i, _)| i)
1336 .unwrap();
1337
1338 old_to_new[repr] = p as DfaStateId;
1339
1340 let old_state = &self.states[repr];
1341 new_states.push(DfaState {
1342 nfa_states: old_state.nfa_states.clone(),
1343 is_accept: old_state.is_accept,
1344 has_start_anchor: old_state.has_start_anchor,
1345 has_end_anchor: old_state.has_end_anchor,
1346 ascii_transitions: AsciiTransitions::new(),
1347 unicode_transitions: FxHashMap::default(),
1348 });
1349 }
1350
1351 for (i, &p) in partition.iter().enumerate() {
1353 old_to_new[i] = p as DfaStateId;
1354 }
1355
1356 for p in 0..num_partitions {
1358 let repr = partition
1360 .iter()
1361 .enumerate()
1362 .find(|&(_, part)| *part == p)
1363 .map(|(i, _)| i)
1364 .unwrap();
1365
1366 let old_state = &self.states[repr];
1367
1368 for byte in 0u8..128 {
1370 let old_target = old_state.ascii_transitions.get(byte);
1371 let new_target = if old_target == TRANS_UNKNOWN || old_target == TRANS_DEAD {
1372 old_target
1373 } else {
1374 old_to_new[old_target as usize]
1375 };
1376 new_states[p].ascii_transitions.set(byte, new_target);
1377 }
1378
1379 for (&ch, &old_target) in &old_state.unicode_transitions {
1381 let new_target = if old_target == TRANS_DEAD {
1382 old_target
1383 } else {
1384 old_to_new[old_target as usize]
1385 };
1386 new_states[p].unicode_transitions.insert(ch, new_target);
1387 }
1388 }
1389
1390 let new_start = old_to_new[self.start as usize];
1392 self.states = new_states;
1393 self.start = new_start;
1394
1395 self.state_cache.clear();
1397 for (i, state) in self.states.iter().enumerate() {
1398 self.state_cache
1399 .insert(state.nfa_states.clone(), i as DfaStateId);
1400 }
1401
1402 original_count - self.states.len()
1403 }
1404
1405 fn get_transition(&self, state_idx: usize, ch: char) -> Option<DfaStateId> {
1407 let state = &self.states[state_idx];
1408 if ch.is_ascii() {
1409 let cached = state.ascii_transitions.get(ch as u8);
1410 if cached == TRANS_UNKNOWN || cached == TRANS_DEAD {
1411 None
1412 } else {
1413 Some(cached)
1414 }
1415 } else {
1416 state
1417 .unicode_transitions
1418 .get(&ch)
1419 .copied()
1420 .filter(|&t| t != TRANS_DEAD)
1421 }
1422 }
1423}
1424
1425#[cfg(test)]
1426mod tests {
1427 use super::*;
1428 use crate::compiler::build_nfa;
1429 use crate::ir::lower;
1430 use crate::parser::parse;
1431
1432 fn make_dfa(pattern: &str) -> Option<Dfa> {
1433 make_dfa_ci(pattern, false)
1434 }
1435
1436 fn make_dfa_ci(pattern: &str, case_insensitive: bool) -> Option<Dfa> {
1437 make_dfa_full(pattern, case_insensitive, false)
1438 }
1439
1440 fn make_dfa_full(pattern: &str, case_insensitive: bool, multi_line: bool) -> Option<Dfa> {
1441 let ast = parse(pattern).unwrap();
1442 let hir = lower(&ast, 0);
1443 let (nfa, literals) = build_nfa(&hir);
1444
1445 let bridge = if literals.is_empty() {
1447 None
1448 } else {
1449 FuzzyBridge::new(&literals, None, None, case_insensitive)
1450 };
1451
1452 Dfa::from_nfa(&nfa, bridge.as_ref(), case_insensitive, multi_line)
1453 }
1454
1455 #[test]
1456 fn test_simple_literal() {
1457 let mut dfa = make_dfa("hello").unwrap();
1458 let m = dfa.find("hello world").unwrap();
1459 assert_eq!(m.start, 0);
1460 assert_eq!(m.end, 5);
1461
1462 let m = dfa.find("say hello").unwrap();
1463 assert_eq!(m.start, 4);
1464 assert_eq!(m.end, 9);
1465 }
1466
1467 #[test]
1468 fn test_char_class() {
1469 let mut dfa = make_dfa("[a-z]+").unwrap();
1470 let m = dfa.find("123abc456").unwrap();
1471 assert_eq!(m.start, 3);
1472 assert_eq!(m.end, 6);
1473 }
1474
1475 #[test]
1476 fn test_start_anchor() {
1477 let mut dfa = make_dfa("^hello").unwrap();
1478 assert!(dfa.find("hello world").is_some());
1479 assert!(dfa.find("say hello").is_none());
1480 }
1481
1482 #[test]
1483 fn test_end_anchor() {
1484 let mut dfa = make_dfa("world$").unwrap();
1485 let m = dfa.find("hello world").unwrap();
1486 assert_eq!(m.start, 6);
1487 assert_eq!(m.end, 11);
1488
1489 assert!(dfa.find("world hello").is_none());
1490 }
1491
1492 #[test]
1493 fn test_alternation() {
1494 let mut dfa = make_dfa("cat|dog").unwrap();
1495 assert!(dfa.find("I have a cat").is_some());
1496 assert!(dfa.find("I have a dog").is_some());
1497 assert!(dfa.find("I have a bird").is_none());
1498 }
1499
1500 #[test]
1501 fn test_quantifiers() {
1502 let mut dfa = make_dfa("a+").unwrap();
1503 let m = dfa.find("baaab").unwrap();
1504 assert_eq!(m.start, 1);
1505 assert_eq!(m.end, 4);
1506
1507 let mut dfa = make_dfa("a*").unwrap();
1508 let m = dfa.find("baaab").unwrap();
1509 assert_eq!(m.start, 0); assert_eq!(m.end, 0);
1511
1512 let mut dfa = make_dfa("a?").unwrap();
1513 let m = dfa.find("baaab").unwrap();
1514 assert_eq!(m.start, 0); }
1516
1517 #[test]
1518 fn test_find_all() {
1519 let mut dfa = make_dfa("[a-z]+").unwrap();
1520 let matches = dfa.find_all("abc 123 def 456 ghi");
1521 assert_eq!(matches.len(), 3);
1522 assert_eq!(matches[0].start, 0);
1523 assert_eq!(matches[0].end, 3);
1524 assert_eq!(matches[1].start, 8);
1525 assert_eq!(matches[1].end, 11);
1526 }
1527
1528 #[test]
1529 fn test_fuzzy_not_compatible() {
1530 assert!(make_dfa("(?:hello){e<=1}").is_none());
1532 assert!(make_dfa("hello~1").is_none());
1533 }
1534
1535 #[test]
1537 fn test_case_insensitive_literal() {
1538 let mut dfa = make_dfa_ci("hello", true).unwrap();
1539
1540 assert!(dfa.find("hello").is_some());
1542 assert!(dfa.find("HELLO").is_some());
1543 assert!(dfa.find("HeLLo").is_some());
1544 assert!(dfa.find("hElLo").is_some());
1545
1546 let m = dfa.find("say HELLO world").unwrap();
1548 assert_eq!(m.start, 4);
1549 assert_eq!(m.end, 9);
1550 }
1551
1552 #[test]
1553 fn test_case_insensitive_char_class() {
1554 let mut dfa = make_dfa_ci("[a-z]+", true).unwrap();
1555
1556 let m = dfa.find("123ABC456").unwrap();
1558 assert_eq!(m.start, 3);
1559 assert_eq!(m.end, 6);
1560
1561 let m = dfa.find("123AbC456").unwrap();
1563 assert_eq!(m.start, 3);
1564 assert_eq!(m.end, 6);
1565 }
1566
1567 #[test]
1568 fn test_case_insensitive_find_all() {
1569 let mut dfa = make_dfa_ci("hello", true).unwrap();
1570 let matches = dfa.find_all("hello HELLO HeLLo");
1571 assert_eq!(matches.len(), 3);
1572 }
1573
1574 #[test]
1575 fn test_case_sensitive_does_not_match_wrong_case() {
1576 let mut dfa = make_dfa_ci("hello", false).unwrap();
1577
1578 assert!(dfa.find("hello").is_some());
1579 assert!(dfa.find("HELLO").is_none());
1580 assert!(dfa.find("HeLLo").is_none());
1581 }
1582
1583 #[test]
1585 fn test_minimize_simple() {
1586 let mut dfa = make_dfa("hello").unwrap();
1587 let before = dfa.state_count();
1588 let removed = dfa.minimize();
1589 let after = dfa.state_count();
1590
1591 assert_eq!(removed, before - after);
1592
1593 assert!(dfa.find("hello").is_some());
1595 assert!(dfa.find("world").is_none());
1596 let m = dfa.find("say hello").unwrap();
1597 assert_eq!(m.start, 4);
1598 assert_eq!(m.end, 9);
1599 }
1600
1601 #[test]
1602 fn test_minimize_alternation() {
1603 let mut dfa = make_dfa("cat|car|cap").unwrap();
1605 dfa.complete();
1607 let before = dfa.state_count();
1608 let removed = dfa.minimize();
1609 let after = dfa.state_count();
1610
1611 println!("cat|car|cap: Before={before}, After={after}, Removed={removed}");
1613
1614 assert!(dfa.find("cat").is_some());
1616 assert!(dfa.find("car").is_some());
1617 assert!(dfa.find("cap").is_some());
1618 assert!(dfa.find("cab").is_none());
1619 }
1620
1621 #[test]
1622 fn test_minimize_char_class() {
1623 let mut dfa = make_dfa("[abc]+").unwrap();
1624 dfa.complete();
1626 let before = dfa.state_count();
1627 let removed = dfa.minimize();
1628 let after = dfa.state_count();
1629
1630 println!("[abc]+ states: Before={before}, After={after}, Removed={removed}");
1631
1632 let m = dfa.find("xyzabc123").unwrap();
1634 assert_eq!(m.start, 3);
1635 assert_eq!(m.end, 6);
1636 }
1637
1638 #[test]
1639 fn test_minimize_preserves_anchors() {
1640 let mut dfa = make_dfa("^hello$").unwrap();
1641 dfa.minimize();
1642
1643 assert!(dfa.find("hello").is_some());
1645 assert!(dfa.find("hello world").is_none()); assert!(dfa.find("say hello").is_none()); }
1648
1649 #[test]
1650 fn test_minimize_case_insensitive() {
1651 let mut dfa = make_dfa_ci("hello", true).unwrap();
1652 dfa.complete();
1653 let before = dfa.state_count();
1654 let removed = dfa.minimize();
1655 let after = dfa.state_count();
1656
1657 println!("CI 'hello' states: Before={before}, After={after}, Removed={removed}");
1658
1659 assert!(dfa.find("hello").is_some());
1661 assert!(dfa.find("HELLO").is_some());
1662 assert!(dfa.find("HeLLo").is_some());
1663 }
1664
1665 #[test]
1666 fn test_state_count() {
1667 let dfa = make_dfa("a").unwrap();
1668 assert!(dfa.state_count() >= 1);
1669
1670 let dfa = make_dfa("abc").unwrap();
1671 assert!(dfa.state_count() >= 1);
1672 }
1673
1674 #[test]
1675 fn test_minimize_with_bounded_quantifier() {
1676 let mut dfa = make_dfa("^a{1,3}b$").unwrap();
1678 dfa.complete();
1679 let before = dfa.state_count();
1680 let removed = dfa.minimize();
1681 let after = dfa.state_count();
1682
1683 println!("^a{{1,3}}b$: Before={before}, After={after}, Removed={removed}");
1684
1685 assert!(dfa.find("ab").is_some());
1687 assert!(dfa.find("aab").is_some());
1688 assert!(dfa.find("aaab").is_some());
1689 assert!(dfa.find("aaaab").is_none()); assert!(dfa.find("b").is_none()); }
1692
1693 #[test]
1694 fn test_minimize_complex_alternation() {
1695 let mut dfa = make_dfa("abc|abd|abe|abf").unwrap();
1697 dfa.complete();
1698 let before = dfa.state_count();
1699 let removed = dfa.minimize();
1700 let after = dfa.state_count();
1701
1702 println!("abc|abd|abe|abf: Before={before}, After={after}, Removed={removed}");
1703
1704 assert!(dfa.find("abc").is_some());
1706 assert!(dfa.find("abd").is_some());
1707 assert!(dfa.find("abe").is_some());
1708 assert!(dfa.find("abf").is_some());
1709 assert!(dfa.find("abg").is_none());
1710 }
1711
1712 #[test]
1714 fn test_multiline_start_anchor() {
1715 let mut dfa = make_dfa_full("^line", false, false).unwrap();
1717 assert!(dfa.find("line1\nline2").is_some());
1718 assert!(dfa.find("first\nline2").is_none()); let mut dfa_ml = make_dfa_full("^line", false, true).unwrap();
1722 assert!(dfa_ml.find("line1\nline2").is_some());
1723 let m = dfa_ml.find("first\nline2").unwrap();
1724 assert_eq!(m.start, 6); assert_eq!(m.end, 10);
1726 }
1727
1728 #[test]
1729 fn test_multiline_end_anchor() {
1730 let mut dfa = make_dfa_full("end$", false, false).unwrap();
1732 assert!(dfa.find("the end").is_some());
1733 assert!(dfa.find("end\nnext").is_none()); let mut dfa_ml = make_dfa_full("end$", false, true).unwrap();
1737 assert!(dfa_ml.find("the end").is_some());
1738 let m = dfa_ml.find("end\nnext").unwrap();
1739 assert_eq!(m.start, 0);
1740 assert_eq!(m.end, 3);
1741 }
1742
1743 #[test]
1744 fn test_multiline_both_anchors() {
1745 let mut dfa_ml = make_dfa_full("^line$", false, true).unwrap();
1747
1748 let m = dfa_ml.find("line\nother").unwrap();
1750 assert_eq!(m.start, 0);
1751 assert_eq!(m.end, 4);
1752
1753 let m = dfa_ml.find("first\nline\nlast").unwrap();
1755 assert_eq!(m.start, 6);
1756 assert_eq!(m.end, 10);
1757
1758 let m = dfa_ml.find("other\nline").unwrap();
1760 assert_eq!(m.start, 6);
1761 assert_eq!(m.end, 10);
1762 }
1763
1764 #[test]
1765 fn test_multiline_find_all() {
1766 let mut dfa_ml = make_dfa_full("^[a-z]+$", false, true).unwrap();
1767 let matches = dfa_ml.find_all("hello\n123\nworld");
1768
1769 assert_eq!(matches.len(), 2);
1770 assert_eq!(matches[0].start, 0);
1771 assert_eq!(matches[0].end, 5);
1772 assert_eq!(matches[1].start, 10);
1773 assert_eq!(matches[1].end, 15);
1774 }
1775}