fuzzy_regex/engine/
dfa.rs

1//! DFA (Deterministic Finite Automaton) for fast exact matching.
2//!
3//! This module implements lazy DFA construction from NFA using subset construction.
4//! For patterns without fuzzy matching, DFA provides O(1) per character matching.
5
6#![allow(
7    clippy::needless_range_loop,
8    clippy::match_same_arms,
9    clippy::similar_names,
10    clippy::too_many_lines,
11    clippy::missing_panics_doc,
12    clippy::missing_errors_doc,
13    clippy::items_after_statements,
14    clippy::inline_always,
15    clippy::float_cmp,
16    clippy::allow_attributes,
17    let_underscore_drop
18)]
19
20// Note: iter_over_flow_is_subslice is not a valid lint
21
22use std::collections::HashMap;
23
24use memchr::{memchr, memmem};
25
26use super::fuzzy_bridge::FuzzyBridge;
27use super::hash::FxHashMap;
28use super::simd_class::AsciiClassBitmap;
29use crate::ir::{Nfa, State, StateId};
30use crate::parser::Anchor;
31
32/// An extended NFA state that can track position within a `FuzzyLiteral`.
33#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
34struct ExtendedState {
35    /// The NFA state ID.
36    state_id: StateId,
37    /// Offset within a `FuzzyLiteral` pattern (0 means at start of pattern).
38    /// None for non-FuzzyLiteral states.
39    literal_offset: Option<usize>,
40}
41
42impl ExtendedState {
43    fn new(state_id: StateId) -> Self {
44        ExtendedState {
45            state_id,
46            literal_offset: None,
47        }
48    }
49
50    fn with_offset(state_id: StateId, offset: usize) -> Self {
51        ExtendedState {
52            state_id,
53            literal_offset: Some(offset),
54        }
55    }
56}
57
58/// A set of NFA states, represented as a sorted vector for hashing.
59#[derive(Debug, Clone, PartialEq, Eq, Hash)]
60struct NfaStateSet(Vec<ExtendedState>);
61
62impl NfaStateSet {
63    fn new() -> Self {
64        NfaStateSet(Vec::new())
65    }
66
67    fn with_capacity(cap: usize) -> Self {
68        NfaStateSet(Vec::with_capacity(cap))
69    }
70
71    fn insert(&mut self, state: ExtendedState) {
72        if let Err(pos) = self.0.binary_search(&state) {
73            self.0.insert(pos, state);
74        }
75    }
76
77    fn contains(&self, state: &ExtendedState) -> bool {
78        self.0.binary_search(state).is_ok()
79    }
80
81    fn is_empty(&self) -> bool {
82        self.0.is_empty()
83    }
84
85    fn iter(&self) -> impl Iterator<Item = &ExtendedState> {
86        self.0.iter()
87    }
88}
89
90/// DFA state ID.
91type DfaStateId = u32;
92
93/// Sentinel value meaning "transition not yet computed".
94const TRANS_UNKNOWN: u32 = u32::MAX;
95/// Sentinel value meaning "dead state (no valid transition)".
96const TRANS_DEAD: u32 = u32::MAX - 1;
97
98/// Dense transition table for ASCII characters (0-127).
99/// Uses u32 state IDs with sentinel values for unknown/dead transitions.
100#[derive(Clone)]
101struct AsciiTransitions {
102    /// Transitions for ASCII bytes 0-127.
103    /// `TRANS_UNKNOWN` = not computed, `TRANS_DEAD` = dead state.
104    table: Box<[u32; 128]>,
105}
106
107impl AsciiTransitions {
108    fn new() -> Self {
109        AsciiTransitions {
110            table: Box::new([TRANS_UNKNOWN; 128]),
111        }
112    }
113
114    #[inline]
115    fn get(&self, byte: u8) -> u32 {
116        self.table[byte as usize]
117    }
118
119    #[inline]
120    fn set(&mut self, byte: u8, state: u32) {
121        self.table[byte as usize] = state;
122    }
123}
124
125impl std::fmt::Debug for AsciiTransitions {
126    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
127        // Only show non-UNKNOWN transitions for brevity
128        let computed_count = self.table.iter().filter(|&&v| v != TRANS_UNKNOWN).count();
129        f.debug_struct("AsciiTransitions")
130            .field("computed_count", &computed_count)
131            .finish()
132    }
133}
134
135/// A DFA state with transitions.
136#[derive(Debug, Clone)]
137struct DfaState {
138    /// The set of NFA states this DFA state represents.
139    nfa_states: NfaStateSet,
140    /// Whether this state is accepting.
141    is_accept: bool,
142    /// Whether this state contains a start anchor (^).
143    has_start_anchor: bool,
144    /// Whether this state contains an end anchor ($).
145    has_end_anchor: bool,
146    /// Dense transition table for ASCII characters (fast path).
147    ascii_transitions: AsciiTransitions,
148    /// Sparse transitions for non-ASCII characters.
149    unicode_transitions: FxHashMap<char, u32>,
150}
151
152/// Prefilter for fast candidate position detection.
153#[derive(Debug, Clone)]
154enum DfaPrefilter {
155    /// No prefiltering - scan every position.
156    None,
157    /// Single byte prefilter using memchr.
158    SingleByte(u8),
159    /// Two bytes using memchr2.
160    TwoBytes(u8, u8),
161    /// Three bytes using memchr3 (for alternations).
162    ThreeBytes(u8, u8, u8),
163    /// More than 3 bytes - fallback to byte set scanning.
164    ManyBytes(Vec<u8>),
165    /// Literal prefix using memmem.
166    Literal(Vec<u8>),
167}
168
169/// Lazy DFA for fast exact matching.
170#[derive(Debug)]
171#[allow(clippy::struct_excessive_bools)]
172pub struct Dfa {
173    /// The source NFA.
174    nfa: Nfa,
175    /// Literal patterns for `FuzzyLiteral` expansion.
176    literal_texts: Vec<String>,
177    /// DFA states.
178    states: Vec<DfaState>,
179    /// Map from NFA state set to DFA state ID.
180    state_cache: HashMap<NfaStateSet, DfaStateId>,
181    /// The start state ID.
182    start: DfaStateId,
183    /// Whether the pattern is anchored at the start (^).
184    anchored_start: bool,
185    /// Whether the pattern is anchored at the end ($).
186    anchored_end: bool,
187    /// Whether matching is case-insensitive.
188    case_insensitive: bool,
189    /// Whether multiline mode is enabled (^ and $ match at line boundaries).
190    multi_line: bool,
191    /// Pre-compiled character class bitmaps indexed by NFA state ID.
192    /// Enables O(1) ASCII character class membership testing.
193    char_class_bitmaps: Vec<Option<AsciiClassBitmap>>,
194    /// Prefilter for fast candidate position detection.
195    prefilter: DfaPrefilter,
196}
197
198/// Result of a DFA match.
199#[derive(Debug, Clone)]
200pub struct DfaMatch {
201    /// Start position (byte index).
202    pub start: usize,
203    /// End position (byte index).
204    pub end: usize,
205}
206
207impl Dfa {
208    /// Create a new DFA from an NFA.
209    /// Returns None if the NFA contains states that can't be converted to DFA
210    /// (fuzzy matching with edits, lookahead/lookbehind, backreferences).
211    ///
212    /// If `bridge` is provided, exact `FuzzyLiteral` states will be expanded.
213    /// If `case_insensitive` is true, matching will be case-insensitive.
214    /// If `multi_line` is true, ^ and $ will match at line boundaries.
215    #[must_use]
216    pub fn from_nfa(
217        nfa: &Nfa,
218        bridge: Option<&FuzzyBridge>,
219        case_insensitive: bool,
220        multi_line: bool,
221    ) -> Option<Self> {
222        // Check if NFA is DFA-compatible
223        if !Self::is_dfa_compatible(nfa, bridge) {
224            return None;
225        }
226
227        // Extract literal texts from bridge
228        let literal_texts = if let Some(b) = bridge {
229            (0..b.pattern_count())
230                .filter_map(|i| b.pattern_text(i).map(ToString::to_string))
231                .collect()
232        } else {
233            Vec::new()
234        };
235
236        // Pre-compile character class bitmaps for fast matching
237        let char_class_bitmaps: Vec<Option<AsciiClassBitmap>> = nfa
238            .states
239            .iter()
240            .map(|state| {
241                if let State::Char { class, .. } = state {
242                    Some(AsciiClassBitmap::from_hir_class(class))
243                } else {
244                    None
245                }
246            })
247            .collect();
248
249        // Extract literal prefix for prefiltering
250        let prefilter = Self::extract_prefilter(nfa, &literal_texts, case_insensitive);
251
252        let mut dfa = Dfa {
253            nfa: nfa.clone(),
254            literal_texts,
255            states: Vec::new(),
256            state_cache: HashMap::new(),
257            start: 0,
258            anchored_start: false,
259            anchored_end: false,
260            case_insensitive,
261            multi_line,
262            char_class_bitmaps,
263            prefilter,
264        };
265
266        // Compute epsilon closure of start state
267        let mut start_set = NfaStateSet::new();
268        dfa.epsilon_closure(nfa.start, &mut start_set);
269
270        // Check for start anchor
271        dfa.anchored_start = start_set.iter().any(|s| {
272            matches!(
273                &nfa.states[s.state_id],
274                State::Anchor {
275                    kind: Anchor::Start,
276                    ..
277                }
278            )
279        });
280
281        // Check for end anchor - scan all NFA states
282        dfa.anchored_end = nfa.states.iter().any(|s| {
283            matches!(
284                s,
285                State::Anchor {
286                    kind: Anchor::End,
287                    ..
288                }
289            )
290        });
291
292        // Create start state
293        let start_id = dfa.get_or_create_state(start_set);
294        dfa.start = start_id;
295
296        Some(dfa)
297    }
298
299    /// Extract a literal prefix from the NFA for prefiltering.
300    fn extract_prefilter(
301        nfa: &Nfa,
302        literal_texts: &[String],
303        case_insensitive: bool,
304    ) -> DfaPrefilter {
305        let mut prefix = Vec::new();
306        let mut visited = vec![false; nfa.states.len()];
307        let mut current = nfa.start;
308
309        // Follow the NFA from start, collecting literal bytes
310        loop {
311            if visited[current] {
312                break;
313            }
314            visited[current] = true;
315
316            match &nfa.states[current] {
317                State::Epsilon { targets } if targets.len() == 1 => {
318                    current = targets[0];
319                }
320                State::Anchor { next, .. }
321                | State::CaptureStart { next, .. }
322                | State::CaptureEnd { next, .. } => {
323                    current = *next;
324                }
325                State::Char { class, .. } => {
326                    // Check if this is a single character class
327                    if class.chars.len() == 1
328                        && class.ranges.is_empty()
329                        && class.named.is_empty()
330                        && !class.negated
331                    {
332                        let ch = class.chars[0];
333                        if ch.is_ascii() {
334                            prefix.push(ch as u8);
335                            // Continue to next state would require computing epsilon closure
336                            // which is complex, so we stop after first char for now
337                            break;
338                        }
339                    }
340                    break;
341                }
342                State::FuzzyLiteral { pattern_index, .. } => {
343                    // Get the literal text
344                    if let Some(text) = literal_texts.get(*pattern_index) {
345                        prefix.extend(text.as_bytes().iter().take(8)); // Cap at 8 bytes
346                    }
347                    break;
348                }
349                State::Split { branches, .. } => {
350                    // For alternations, collect first bytes from each branch
351                    let first_bytes =
352                        Self::collect_first_bytes_from_branches(nfa, branches, literal_texts);
353                    if !first_bytes.is_empty() {
354                        return Self::make_multi_byte_prefilter(&first_bytes, case_insensitive);
355                    }
356                    break;
357                }
358                _ => break,
359            }
360        }
361
362        if prefix.is_empty() {
363            return DfaPrefilter::None;
364        }
365
366        // For case-insensitive, we can only use single-byte or two-byte prefilters
367        // because memmem is case-sensitive
368        if case_insensitive {
369            let b = prefix[0];
370            if b.is_ascii_alphabetic() {
371                return DfaPrefilter::TwoBytes(b.to_ascii_lowercase(), b.to_ascii_uppercase());
372            }
373            // Non-alphabetic first byte - can use single byte
374            return DfaPrefilter::SingleByte(b);
375        }
376
377        // For case-sensitive single byte, use SingleByte
378        if prefix.len() == 1 {
379            return DfaPrefilter::SingleByte(prefix[0]);
380        }
381
382        // For multi-byte prefix, use Literal
383        DfaPrefilter::Literal(prefix)
384    }
385
386    /// Collect first bytes from all branches of a Split state.
387    fn collect_first_bytes_from_branches(
388        nfa: &Nfa,
389        branches: &[StateId],
390        literal_texts: &[String],
391    ) -> Vec<u8> {
392        let mut first_bytes = Vec::new();
393
394        for &branch in branches {
395            if let Some(byte) = Self::get_first_byte(nfa, branch, literal_texts)
396                && !first_bytes.contains(&byte)
397            {
398                first_bytes.push(byte);
399            }
400        }
401
402        first_bytes
403    }
404
405    /// Get the first byte of a pattern starting at a given NFA state.
406    fn get_first_byte(nfa: &Nfa, start: StateId, literal_texts: &[String]) -> Option<u8> {
407        let mut visited = vec![false; nfa.states.len()];
408        let mut current = start;
409
410        loop {
411            if current >= nfa.states.len() || visited[current] {
412                return None;
413            }
414            visited[current] = true;
415
416            match &nfa.states[current] {
417                State::Epsilon { targets } if !targets.is_empty() => {
418                    current = targets[0];
419                }
420                State::CaptureStart { next, .. } | State::CaptureEnd { next, .. } => {
421                    current = *next;
422                }
423                State::Char { class, .. } => {
424                    // Get first byte from character class
425                    if !class.chars.is_empty() {
426                        let ch = class.chars[0];
427                        // Return the first UTF-8 byte for any character (ASCII or non-ASCII)
428                        let mut buf = [0u8; 4];
429                        let encoded = ch.encode_utf8(&mut buf);
430                        return Some(encoded.as_bytes()[0]);
431                    }
432                    if !class.ranges.is_empty() {
433                        let (start, _) = class.ranges[0];
434                        if start.is_ascii() {
435                            return Some(start as u8);
436                        }
437                        // For non-ASCII ranges, return the first byte of the range start
438                        let mut buf = [0u8; 4];
439                        let encoded = start.encode_utf8(&mut buf);
440                        return Some(encoded.as_bytes()[0]);
441                    }
442                    return None;
443                }
444                State::FuzzyLiteral { pattern_index, .. } => {
445                    if let Some(text) = literal_texts.get(*pattern_index) {
446                        return text.as_bytes().first().copied();
447                    }
448                    return None;
449                }
450                _ => return None,
451            }
452        }
453    }
454
455    /// Create a prefilter for multiple first bytes.
456    fn make_multi_byte_prefilter(bytes: &[u8], case_insensitive: bool) -> DfaPrefilter {
457        if bytes.is_empty() {
458            return DfaPrefilter::None;
459        }
460
461        // Expand bytes for case-insensitive matching
462        let expanded: Vec<u8> = if case_insensitive {
463            let mut result = Vec::new();
464            for &b in bytes {
465                if b.is_ascii_alphabetic() {
466                    let lower = b.to_ascii_lowercase();
467                    let upper = b.to_ascii_uppercase();
468                    if !result.contains(&lower) {
469                        result.push(lower);
470                    }
471                    if !result.contains(&upper) {
472                        result.push(upper);
473                    }
474                } else if !result.contains(&b) {
475                    result.push(b);
476                }
477            }
478            result
479        } else {
480            bytes.to_vec()
481        };
482
483        match expanded.len() {
484            0 => DfaPrefilter::None,
485            1 => DfaPrefilter::SingleByte(expanded[0]),
486            2 => DfaPrefilter::TwoBytes(expanded[0], expanded[1]),
487            3 => DfaPrefilter::ThreeBytes(expanded[0], expanded[1], expanded[2]),
488            _ => {
489                // More than 3 bytes - use byte set scanning
490                DfaPrefilter::ManyBytes(expanded)
491            }
492        }
493    }
494
495    /// Check if an NFA can be converted to DFA.
496    fn is_dfa_compatible(nfa: &Nfa, bridge: Option<&FuzzyBridge>) -> bool {
497        for state in &nfa.states {
498            match state {
499                State::Accept
500                | State::Epsilon { .. }
501                | State::ResetMatchStart { .. }
502                | State::Char { .. }
503                | State::Split { .. }
504                | State::CaptureStart { .. }
505                | State::CaptureEnd { .. } => {}
506
507                // Only Start and End anchors are supported by DFA
508                // Word boundaries require NFA matching
509                State::Anchor { kind, .. } => {
510                    use crate::parser::ast::Anchor;
511                    match kind {
512                        Anchor::Start | Anchor::End => {}
513                        Anchor::WordBoundary | Anchor::NotWordBoundary => return false,
514                    }
515                }
516
517                // FuzzyLiteral is OK if it's exact (no edits)
518                State::FuzzyLiteral {
519                    limits,
520                    pattern_index,
521                    ..
522                } => {
523                    let max_edits = limits.as_ref().map(|l| {
524                        l.get_edits().unwrap_or_else(|| {
525                            let i = l.get_insertions().unwrap_or(0);
526                            let d = l.get_deletions().unwrap_or(0);
527                            let s = l.get_substitutions().unwrap_or(0);
528                            let t = l.get_swaps().unwrap_or(0);
529                            i.saturating_add(d).saturating_add(s).saturating_add(t)
530                        })
531                    });
532                    // Only compatible if exact and we have the pattern text
533                    if (max_edits.is_none() || max_edits == Some(0))
534                        && bridge.is_some_and(|b| b.pattern_text(*pattern_index).is_some())
535                    {
536                        continue;
537                    }
538                    return false;
539                }
540
541                // These can't be converted to DFA
542                State::FuzzyChar { .. }
543                | State::Lookahead { .. }
544                | State::Lookbehind { .. }
545                | State::Backreference { .. }
546                | State::AtomicGroup { .. }
547                | State::RecursivePattern { .. }
548                | State::RecursiveGroup { .. }
549                | State::RecursiveNamedGroup { .. } => return false,
550            }
551        }
552        true
553    }
554
555    /// Compute the epsilon closure of an NFA state.
556    fn epsilon_closure(&self, state: StateId, result: &mut NfaStateSet) {
557        self.epsilon_closure_ext(ExtendedState::new(state), result);
558    }
559
560    /// Compute the epsilon closure of an extended NFA state.
561    fn epsilon_closure_ext(&self, ext_state: ExtendedState, result: &mut NfaStateSet) {
562        if result.contains(&ext_state) {
563            return;
564        }
565
566        let state = ext_state.state_id;
567
568        match &self.nfa.states[state] {
569            State::Epsilon { targets } => {
570                result.insert(ext_state);
571                for &target in targets {
572                    self.epsilon_closure_ext(ExtendedState::new(target), result);
573                }
574            }
575            State::Split { branches, .. } => {
576                result.insert(ext_state);
577                for &branch in branches {
578                    self.epsilon_closure_ext(ExtendedState::new(branch), result);
579                }
580            }
581            State::CaptureStart { next, .. } | State::CaptureEnd { next, .. } => {
582                // Skip capture markers (they don't consume input)
583                result.insert(ext_state);
584                self.epsilon_closure_ext(ExtendedState::new(*next), result);
585            }
586            State::Anchor { next, .. } => {
587                result.insert(ext_state);
588                // Anchors don't consume input - follow to next state
589                // We keep the anchor in the set so we can check it at match time
590                self.epsilon_closure_ext(ExtendedState::new(*next), result);
591            }
592            State::FuzzyLiteral { .. } => {
593                // For FuzzyLiteral, we start at offset 0
594                if ext_state.literal_offset.is_none() {
595                    // Initial entry - start at offset 0
596                    result.insert(ExtendedState::with_offset(state, 0));
597                } else {
598                    // Already have an offset, keep it
599                    result.insert(ext_state);
600                }
601            }
602            _ => {
603                result.insert(ext_state);
604            }
605        }
606    }
607
608    /// Get or create a DFA state for the given NFA state set.
609    fn get_or_create_state(&mut self, nfa_states: NfaStateSet) -> DfaStateId {
610        if let Some(&id) = self.state_cache.get(&nfa_states) {
611            return id;
612        }
613
614        let is_accept = nfa_states
615            .iter()
616            .any(|ext| matches!(&self.nfa.states[ext.state_id], State::Accept));
617
618        let has_start_anchor = nfa_states.iter().any(|ext| {
619            matches!(
620                &self.nfa.states[ext.state_id],
621                State::Anchor {
622                    kind: Anchor::Start,
623                    ..
624                }
625            )
626        });
627
628        let has_end_anchor = nfa_states.iter().any(|ext| {
629            matches!(
630                &self.nfa.states[ext.state_id],
631                State::Anchor {
632                    kind: Anchor::End,
633                    ..
634                }
635            )
636        });
637
638        let id = self.states.len() as DfaStateId;
639        self.states.push(DfaState {
640            nfa_states: nfa_states.clone(),
641            is_accept,
642            has_start_anchor,
643            has_end_anchor,
644            ascii_transitions: AsciiTransitions::new(),
645            unicode_transitions: FxHashMap::default(),
646        });
647        self.state_cache.insert(nfa_states, id);
648        id
649    }
650
651    /// Check if two characters match, considering case-insensitivity.
652    /// Uses const generic to eliminate branch at compile time.
653    #[inline(always)]
654    fn chars_match<const CASE_INSENSITIVE: bool>(text_char: char, pattern_char: char) -> bool {
655        if CASE_INSENSITIVE {
656            // Use Unicode case folding for proper case-insensitive matching
657            // For ASCII, use the fast path; for non-ASCII, use to_lowercase()
658            if text_char.is_ascii() && pattern_char.is_ascii() {
659                text_char.eq_ignore_ascii_case(&pattern_char)
660            } else {
661                // Unicode case folding: compare lowercase forms
662                text_char.to_lowercase().eq(pattern_char.to_lowercase())
663            }
664        } else {
665            text_char == pattern_char
666        }
667    }
668
669    /// Compute the next DFA state for a given character.
670    /// Dispatches to const generic implementation for branch elimination.
671    #[inline(always)]
672    fn next_state(&mut self, state_id: DfaStateId, ch: char) -> Option<DfaStateId> {
673        if self.case_insensitive {
674            self.next_state_impl::<true>(state_id, ch)
675        } else {
676            self.next_state_impl::<false>(state_id, ch)
677        }
678    }
679
680    /// Const generic implementation of `next_state`.
681    /// Uses dense ASCII table for fast O(1) lookups on ASCII characters.
682    #[inline(always)]
683    fn next_state_impl<const CASE_INSENSITIVE: bool>(
684        &mut self,
685        state_id: DfaStateId,
686        ch: char,
687    ) -> Option<DfaStateId> {
688        let state_idx = state_id as usize;
689
690        // Fast path: ASCII character with dense table lookup
691        if ch.is_ascii() {
692            let byte = ch as u8;
693            let cached = self.states[state_idx].ascii_transitions.get(byte);
694            if cached != TRANS_UNKNOWN {
695                return if cached == TRANS_DEAD {
696                    None
697                } else {
698                    Some(cached)
699                };
700            }
701        } else {
702            // Non-ASCII: check sparse table
703            if let Some(&cached) = self.states[state_idx].unicode_transitions.get(&ch) {
704                return if cached == TRANS_DEAD {
705                    None
706                } else {
707                    Some(cached)
708                };
709            }
710        }
711
712        // Cache miss - compute the transition
713        let current = &self.states[state_idx];
714        let mut next_set = NfaStateSet::with_capacity(current.nfa_states.0.len());
715
716        // Collect states to process (to avoid borrowing issues)
717        let nfa_states: Vec<_> = current.nfa_states.0.clone();
718
719        for ext_state in nfa_states {
720            let nfa_state = ext_state.state_id;
721            match &self.nfa.states[nfa_state] {
722                State::Char { class, next } => {
723                    // Use pre-compiled bitmap for fast ASCII matching only.
724                    // For non-ASCII, use the original class matcher which handles
725                    // full Unicode character comparison correctly.
726                    let matches = if ch.is_ascii() {
727                        if let Some(bitmap) = &self.char_class_bitmaps[nfa_state] {
728                            if CASE_INSENSITIVE {
729                                bitmap.contains(ch as u8)
730                                    || bitmap.contains((ch as u8).to_ascii_lowercase())
731                                    || bitmap.contains((ch as u8).to_ascii_uppercase())
732                            } else {
733                                bitmap.contains(ch as u8)
734                            }
735                        } else if CASE_INSENSITIVE {
736                            class.matches(ch)
737                                || class.matches(ch.to_ascii_lowercase())
738                                || class.matches(ch.to_ascii_uppercase())
739                        } else {
740                            class.matches(ch)
741                        }
742                    } else {
743                        // Non-ASCII: use original class matcher
744                        if CASE_INSENSITIVE {
745                            class.matches(ch)
746                                || class.matches(ch.to_ascii_lowercase())
747                                || class.matches(ch.to_ascii_uppercase())
748                        } else {
749                            class.matches(ch)
750                        }
751                    };
752                    if matches {
753                        self.epsilon_closure(*next, &mut next_set);
754                    }
755                }
756                State::FuzzyLiteral {
757                    pattern_index,
758                    next,
759                    ..
760                } => {
761                    // Handle FuzzyLiteral with offset tracking
762                    let offset = ext_state.literal_offset.unwrap_or(0);
763                    if let Some(pattern) = self.literal_texts.get(*pattern_index) {
764                        let pattern_chars: Vec<char> = pattern.chars().collect();
765                        if offset < pattern_chars.len()
766                            && Self::chars_match::<CASE_INSENSITIVE>(ch, pattern_chars[offset])
767                        {
768                            // Character matches - advance offset or move to next state
769                            if offset + 1 == pattern_chars.len() {
770                                // Finished matching the literal - go to next state
771                                self.epsilon_closure(*next, &mut next_set);
772                            } else {
773                                // Still matching - advance offset
774                                next_set.insert(ExtendedState::with_offset(nfa_state, offset + 1));
775                            }
776                        }
777                    }
778                }
779                _ => {
780                    // Other state types don't consume characters
781                    // Anchors, Accept, Split, etc. are handled via epsilon_closure
782                }
783            }
784        }
785
786        let next_id = if next_set.is_empty() {
787            TRANS_DEAD
788        } else {
789            self.get_or_create_state(next_set)
790        };
791
792        // Cache the transition
793        if ch.is_ascii() {
794            self.states[state_idx]
795                .ascii_transitions
796                .set(ch as u8, next_id);
797            // For case-insensitive matching, also cache the other case variant
798            if CASE_INSENSITIVE {
799                let lower = ch.to_ascii_lowercase() as u8;
800                let upper = ch.to_ascii_uppercase() as u8;
801                if lower != ch as u8 {
802                    self.states[state_idx].ascii_transitions.set(lower, next_id);
803                }
804                if upper != ch as u8 {
805                    self.states[state_idx].ascii_transitions.set(upper, next_id);
806                }
807            }
808        } else {
809            self.states[state_idx]
810                .unicode_transitions
811                .insert(ch, next_id);
812            // For case-insensitive non-ASCII, cache both variants
813            if CASE_INSENSITIVE {
814                for variant in ch.to_lowercase() {
815                    if variant != ch {
816                        self.states[state_idx]
817                            .unicode_transitions
818                            .insert(variant, next_id);
819                    }
820                }
821                for variant in ch.to_uppercase() {
822                    if variant != ch {
823                        self.states[state_idx]
824                            .unicode_transitions
825                            .insert(variant, next_id);
826                    }
827                }
828            }
829        }
830
831        if next_id == TRANS_DEAD {
832            None
833        } else {
834            Some(next_id)
835        }
836    }
837
838    /// Find the first match in the text.
839    pub fn find(&mut self, text: &str) -> Option<DfaMatch> {
840        if self.anchored_start && !self.multi_line {
841            // Only try at position 0 (non-multiline anchored pattern)
842            self.find_at(text, 0)
843        } else if self.anchored_start && self.multi_line {
844            // In multiline mode, try at position 0 and after each newline
845            self.find_multiline_anchored(text)
846        } else {
847            // Use prefilter to find candidate positions
848            self.find_with_prefilter(text)
849        }
850    }
851
852    /// Find match for start-anchored patterns in multiline mode.
853    /// Tries at position 0 and after each newline.
854    fn find_multiline_anchored(&mut self, text: &str) -> Option<DfaMatch> {
855        // Try at position 0
856        if let Some(m) = self.find_at(text, 0) {
857            return Some(m);
858        }
859
860        // Try after each newline
861        let bytes = text.as_bytes();
862        let mut offset = 0;
863        while let Some(pos) = memchr(b'\n', &bytes[offset..]) {
864            let start = offset + pos + 1; // Position after the newline
865            if start < bytes.len()
866                && let Some(m) = self.find_at(text, start)
867            {
868                return Some(m);
869            }
870            offset = start;
871        }
872        None
873    }
874
875    /// Find using prefilter to skip non-candidate positions.
876    fn find_with_prefilter(&mut self, text: &str) -> Option<DfaMatch> {
877        let bytes = text.as_bytes();
878        let accepts_empty = self.states[self.start as usize].is_accept;
879
880        // For patterns that can match empty strings:
881        // - End-anchored patterns (like ^$) can only match empty if text is empty
882        // - Other patterns should try to find a longer match first via find_at
883        // We handle the empty-only case by trying find_at(0) first, which will
884        // return the longest match starting at position 0.
885        if accepts_empty && bytes.is_empty() {
886            // Empty text with pattern that accepts empty - return empty match
887            return Some(DfaMatch { start: 0, end: 0 });
888        }
889
890        // For non-empty text, try to find the longest match at position 0 first
891        // This handles patterns like a* that should match "aaa" not ""
892        if accepts_empty && !self.anchored_end {
893            if let Some(m) = self.find_at(text, 0) {
894                return Some(m);
895            }
896            // If no match at position 0, return empty match as fallback
897            return Some(DfaMatch { start: 0, end: 0 });
898        }
899
900        // For end-anchored patterns on non-empty text, fall through to regular matching
901        // (the end anchor check will reject empty matches at position 0)
902
903        // Clone prefilter data to avoid borrow conflicts
904        let prefilter = self.prefilter.clone();
905
906        match prefilter {
907            DfaPrefilter::None => {
908                // No prefilter - scan every position
909                self.find_linear(text)
910            }
911            DfaPrefilter::SingleByte(needle) => {
912                let mut offset = 0;
913                while let Some(pos) = memchr(needle, &bytes[offset..]) {
914                    let start = offset + pos;
915                    if let Some(m) = self.find_at(text, start) {
916                        return Some(m);
917                    }
918                    // Move past this position
919                    offset = start + 1;
920                }
921                // Handle patterns that accept empty string
922                if accepts_empty {
923                    return Some(DfaMatch { start: 0, end: 0 });
924                }
925                None
926            }
927            DfaPrefilter::TwoBytes(needle1, needle2) => {
928                let mut offset = 0;
929                while let Some(pos) = memchr::memchr2(needle1, needle2, &bytes[offset..]) {
930                    let start = offset + pos;
931                    if let Some(m) = self.find_at(text, start) {
932                        return Some(m);
933                    }
934                    offset = start + 1;
935                }
936                if accepts_empty {
937                    return Some(DfaMatch { start: 0, end: 0 });
938                }
939                None
940            }
941            DfaPrefilter::ThreeBytes(needle1, needle2, needle3) => {
942                let mut offset = 0;
943                while let Some(pos) = memchr::memchr3(needle1, needle2, needle3, &bytes[offset..]) {
944                    let start = offset + pos;
945                    if let Some(m) = self.find_at(text, start) {
946                        return Some(m);
947                    }
948                    offset = start + 1;
949                }
950                if accepts_empty {
951                    return Some(DfaMatch { start: 0, end: 0 });
952                }
953                None
954            }
955            DfaPrefilter::ManyBytes(ref needles) => {
956                // Create a byte set for O(1) lookup
957                let mut byte_set = [false; 256];
958                for &b in needles {
959                    byte_set[b as usize] = true;
960                }
961                let mut offset = 0;
962                while offset < bytes.len() {
963                    // Find next byte in our set
964                    if let Some(pos) = bytes[offset..].iter().position(|&b| byte_set[b as usize]) {
965                        let start = offset + pos;
966                        if let Some(m) = self.find_at(text, start) {
967                            return Some(m);
968                        }
969                        offset = start + 1;
970                    } else {
971                        break;
972                    }
973                }
974                if accepts_empty {
975                    return Some(DfaMatch { start: 0, end: 0 });
976                }
977                None
978            }
979            DfaPrefilter::Literal(ref lit) => {
980                let finder = memmem::Finder::new(lit);
981                let mut offset = 0;
982                while let Some(pos) = finder.find(&bytes[offset..]) {
983                    let start = offset + pos;
984                    if let Some(m) = self.find_at(text, start) {
985                        return Some(m);
986                    }
987                    offset = start + 1;
988                }
989                if accepts_empty {
990                    return Some(DfaMatch { start: 0, end: 0 });
991                }
992                None
993            }
994        }
995    }
996
997    /// Linear scan through all positions (fallback when no prefilter).
998    fn find_linear(&mut self, text: &str) -> Option<DfaMatch> {
999        for (start_pos, _) in text.char_indices() {
1000            if let Some(m) = self.find_at(text, start_pos) {
1001                return Some(m);
1002            }
1003        }
1004        // Try at end for patterns that match empty string
1005        self.find_at(text, text.len())
1006    }
1007
1008    /// Find a match starting at a specific position.
1009    /// Dispatches to const generic implementation for branch elimination.
1010    #[inline(always)]
1011    fn find_at(&mut self, text: &str, start: usize) -> Option<DfaMatch> {
1012        if self.multi_line {
1013            self.find_at_impl::<true>(text, start)
1014        } else {
1015            self.find_at_impl::<false>(text, start)
1016        }
1017    }
1018
1019    /// Const generic implementation of `find_at`.
1020    /// `MULTI_LINE` is a compile-time constant, eliminating runtime branches.
1021    #[inline(always)]
1022    fn find_at_impl<const MULTI_LINE: bool>(
1023        &mut self,
1024        text: &str,
1025        start: usize,
1026    ) -> Option<DfaMatch> {
1027        let bytes = text.as_bytes();
1028        let mut state_id = self.start;
1029        let mut last_accept: Option<usize> = None;
1030
1031        // Check if start position satisfies start anchor (^ constraint)
1032        let start_anchor_ok = Self::is_start_anchor_satisfied::<MULTI_LINE>(bytes, start);
1033
1034        // Check if start state is accepting
1035        if self.states[state_id as usize].is_accept {
1036            // For anchored_start patterns, check the start anchor
1037            if !self.anchored_start || start_anchor_ok {
1038                // For end-anchored patterns, also check the end anchor
1039                let end_anchor_ok = Self::is_end_anchor_satisfied::<MULTI_LINE>(bytes, start);
1040                if !self.anchored_end || end_anchor_ok {
1041                    last_accept = Some(start);
1042                }
1043            }
1044        }
1045
1046        // Handle start anchor - must be satisfied
1047        if self.states[state_id as usize].has_start_anchor && !start_anchor_ok {
1048            return None;
1049        }
1050
1051        let mut pos = start;
1052        for ch in text[start..].chars() {
1053            // Compute next state
1054            let next = self.next_state(state_id, ch);
1055
1056            match next {
1057                Some(next_id) => {
1058                    state_id = next_id;
1059                    pos += ch.len_utf8();
1060
1061                    // Check if this is an accepting state
1062                    if self.states[state_id as usize].is_accept {
1063                        // For end-anchored patterns, check end anchor constraint
1064                        let end_anchor_ok = Self::is_end_anchor_satisfied::<MULTI_LINE>(bytes, pos);
1065                        if !self.states[state_id as usize].has_end_anchor || end_anchor_ok {
1066                            last_accept = Some(pos);
1067                        }
1068                    }
1069                }
1070                None => {
1071                    // Dead state - return last accepting position if any
1072                    break;
1073                }
1074            }
1075        }
1076
1077        // Check for end anchor at final position
1078        if self.states[state_id as usize].has_end_anchor {
1079            let end_anchor_ok = Self::is_end_anchor_satisfied::<MULTI_LINE>(bytes, pos);
1080            if end_anchor_ok && self.states[state_id as usize].is_accept {
1081                last_accept = Some(pos);
1082            }
1083        }
1084
1085        last_accept.map(|end| DfaMatch { start, end })
1086    }
1087
1088    /// Check if start anchor (^) is satisfied at the given position.
1089    /// Uses const generic to eliminate multiline branch at compile time.
1090    #[inline(always)]
1091    fn is_start_anchor_satisfied<const MULTI_LINE: bool>(bytes: &[u8], pos: usize) -> bool {
1092        if pos == 0 {
1093            return true;
1094        }
1095        if MULTI_LINE && bytes[pos - 1] == b'\n' {
1096            return true;
1097        }
1098        false
1099    }
1100
1101    /// Check if end anchor ($) is satisfied at the given position.
1102    /// Uses const generic to eliminate multiline branch at compile time.
1103    #[inline(always)]
1104    fn is_end_anchor_satisfied<const MULTI_LINE: bool>(bytes: &[u8], pos: usize) -> bool {
1105        if pos == bytes.len() {
1106            return true;
1107        }
1108        if MULTI_LINE && bytes[pos] == b'\n' {
1109            return true;
1110        }
1111        false
1112    }
1113
1114    /// Find all non-overlapping matches.
1115    pub fn find_all(&mut self, text: &str) -> Vec<DfaMatch> {
1116        let mut matches = Vec::new();
1117        let mut pos = 0;
1118
1119        while pos <= text.len() {
1120            if let Some(m) = self.find_at(text, pos) {
1121                let end = m.end;
1122                matches.push(m);
1123                // Move past this match
1124                pos = if end > pos {
1125                    end
1126                } else {
1127                    // Empty match - advance by one character
1128                    text[pos..]
1129                        .chars()
1130                        .next()
1131                        .map_or(text.len() + 1, |c| pos + c.len_utf8())
1132                };
1133            } else {
1134                // No match at this position - advance
1135                pos = text[pos..]
1136                    .chars()
1137                    .next()
1138                    .map_or(text.len() + 1, |c| pos + c.len_utf8());
1139            }
1140        }
1141
1142        matches
1143    }
1144
1145    /// Find the first `n` non-overlapping matches.
1146    pub fn find_n(&mut self, text: &str, n: usize) -> Vec<DfaMatch> {
1147        if n == 0 {
1148            return Vec::new();
1149        }
1150
1151        let mut matches = Vec::with_capacity(n);
1152        let mut pos = 0;
1153
1154        while pos <= text.len() && matches.len() < n {
1155            if let Some(m) = self.find_at(text, pos) {
1156                let end = m.end;
1157                matches.push(m);
1158                // Move past this match
1159                pos = if end > pos {
1160                    end
1161                } else {
1162                    // Empty match - advance by one character
1163                    text[pos..]
1164                        .chars()
1165                        .next()
1166                        .map_or(text.len() + 1, |c| pos + c.len_utf8())
1167                };
1168            } else {
1169                // No match at this position - advance
1170                pos = text[pos..]
1171                    .chars()
1172                    .next()
1173                    .map_or(text.len() + 1, |c| pos + c.len_utf8());
1174            }
1175        }
1176
1177        matches
1178    }
1179
1180    /// Check if the DFA is anchored at start.
1181    #[must_use]
1182    pub fn is_anchored_start(&self) -> bool {
1183        self.anchored_start
1184    }
1185
1186    /// Check if the DFA is anchored at end.
1187    #[must_use]
1188    pub fn is_anchored_end(&self) -> bool {
1189        self.anchored_end
1190    }
1191
1192    /// Get the number of states in the DFA.
1193    #[must_use]
1194    pub fn state_count(&self) -> usize {
1195        self.states.len()
1196    }
1197
1198    /// Complete the DFA by computing all reachable transitions.
1199    /// This is required before minimization.
1200    fn complete(&mut self) {
1201        // Compute all transitions for all states by iterating over ASCII chars
1202        // We iterate until no new states are discovered
1203        let mut i = 0;
1204        while i < self.states.len() {
1205            // Compute transitions for all printable ASCII characters
1206            for byte in 0u8..128 {
1207                let ch = byte as char;
1208                let _ = self.next_state(i as DfaStateId, ch);
1209            }
1210            i += 1;
1211        }
1212    }
1213
1214    /// Minimize the DFA using Hopcroft's algorithm.
1215    /// Returns the number of states removed.
1216    #[allow(clippy::mut_range_bound)] // Intentional: num_partitions grows during iteration
1217    #[allow(clippy::needless_range_loop)] // Clearer to use index for partition mapping
1218    pub fn minimize(&mut self) -> usize {
1219        let original_count = self.states.len();
1220        if original_count <= 1 {
1221            return 0;
1222        }
1223
1224        // First, complete the DFA to ensure all transitions are computed
1225        self.complete();
1226
1227        // Collect all characters that have transitions
1228        let mut alphabet: Vec<char> = Vec::new();
1229        for byte in 0u8..128 {
1230            alphabet.push(byte as char);
1231        }
1232        // Also collect non-ASCII characters from unicode_transitions
1233        for state in &self.states {
1234            for &ch in state.unicode_transitions.keys() {
1235                if !alphabet.contains(&ch) {
1236                    alphabet.push(ch);
1237                }
1238            }
1239        }
1240
1241        // Initialize partitions: separate accepting from non-accepting states
1242        // Also consider anchor states as different equivalence classes
1243        let mut partition: Vec<usize> = vec![0; self.states.len()];
1244        let mut num_partitions = 0;
1245
1246        // Create initial partitions based on (is_accept, has_start_anchor, has_end_anchor)
1247        let mut partition_map: HashMap<(bool, bool, bool), usize> = HashMap::new();
1248        for (i, state) in self.states.iter().enumerate() {
1249            let key = (
1250                state.is_accept,
1251                state.has_start_anchor,
1252                state.has_end_anchor,
1253            );
1254            let p = *partition_map.entry(key).or_insert_with(|| {
1255                let p = num_partitions;
1256                num_partitions += 1;
1257                p
1258            });
1259            partition[i] = p;
1260        }
1261
1262        // Hopcroft's algorithm: refine partitions until stable
1263        let mut changed = true;
1264        while changed {
1265            changed = false;
1266
1267            // For each partition, check if it needs to be split
1268            for p in 0..num_partitions {
1269                // Get states in this partition
1270                let states_in_p: Vec<usize> = partition
1271                    .iter()
1272                    .enumerate()
1273                    .filter(|&(_, part)| *part == p)
1274                    .map(|(i, _)| i)
1275                    .collect();
1276
1277                if states_in_p.len() <= 1 {
1278                    continue;
1279                }
1280
1281                // Check if all states in this partition have the same behavior
1282                // Two states are equivalent if for all input symbols,
1283                // they transition to states in the same partition
1284                let first = states_in_p[0];
1285                let mut to_split: Vec<usize> = Vec::new();
1286
1287                for &state_idx in &states_in_p[1..] {
1288                    let mut same = true;
1289                    for &ch in &alphabet {
1290                        let t1 = self.get_transition(first, ch);
1291                        let t2 = self.get_transition(state_idx, ch);
1292
1293                        let p1 = t1.map(|s| partition[s as usize]);
1294                        let p2 = t2.map(|s| partition[s as usize]);
1295
1296                        if p1 != p2 {
1297                            same = false;
1298                            break;
1299                        }
1300                    }
1301                    if !same {
1302                        to_split.push(state_idx);
1303                    }
1304                }
1305
1306                // Split the partition if needed
1307                if !to_split.is_empty() {
1308                    let new_partition = num_partitions;
1309                    num_partitions += 1;
1310                    for &state_idx in &to_split {
1311                        partition[state_idx] = new_partition;
1312                    }
1313                    changed = true;
1314                }
1315            }
1316        }
1317
1318        // Check if any minimization is possible
1319        if num_partitions == original_count {
1320            return 0;
1321        }
1322
1323        // Build the minimized DFA
1324        // Each partition becomes a new state
1325        let mut new_states: Vec<DfaState> = Vec::with_capacity(num_partitions);
1326        let mut old_to_new: Vec<DfaStateId> = vec![0; original_count];
1327
1328        // Create new states (one per partition)
1329        for p in 0..num_partitions {
1330            // Find a representative state for this partition
1331            let repr = partition
1332                .iter()
1333                .enumerate()
1334                .find(|&(_, part)| *part == p)
1335                .map(|(i, _)| i)
1336                .unwrap();
1337
1338            old_to_new[repr] = p as DfaStateId;
1339
1340            let old_state = &self.states[repr];
1341            new_states.push(DfaState {
1342                nfa_states: old_state.nfa_states.clone(),
1343                is_accept: old_state.is_accept,
1344                has_start_anchor: old_state.has_start_anchor,
1345                has_end_anchor: old_state.has_end_anchor,
1346                ascii_transitions: AsciiTransitions::new(),
1347                unicode_transitions: FxHashMap::default(),
1348            });
1349        }
1350
1351        // Map all old states to their new partition IDs
1352        for (i, &p) in partition.iter().enumerate() {
1353            old_to_new[i] = p as DfaStateId;
1354        }
1355
1356        // Remap transitions
1357        for p in 0..num_partitions {
1358            // Find representative for this partition
1359            let repr = partition
1360                .iter()
1361                .enumerate()
1362                .find(|&(_, part)| *part == p)
1363                .map(|(i, _)| i)
1364                .unwrap();
1365
1366            let old_state = &self.states[repr];
1367
1368            // Remap ASCII transitions
1369            for byte in 0u8..128 {
1370                let old_target = old_state.ascii_transitions.get(byte);
1371                let new_target = if old_target == TRANS_UNKNOWN || old_target == TRANS_DEAD {
1372                    old_target
1373                } else {
1374                    old_to_new[old_target as usize]
1375                };
1376                new_states[p].ascii_transitions.set(byte, new_target);
1377            }
1378
1379            // Remap unicode transitions
1380            for (&ch, &old_target) in &old_state.unicode_transitions {
1381                let new_target = if old_target == TRANS_DEAD {
1382                    old_target
1383                } else {
1384                    old_to_new[old_target as usize]
1385                };
1386                new_states[p].unicode_transitions.insert(ch, new_target);
1387            }
1388        }
1389
1390        // Update the DFA
1391        let new_start = old_to_new[self.start as usize];
1392        self.states = new_states;
1393        self.start = new_start;
1394
1395        // Rebuild state cache
1396        self.state_cache.clear();
1397        for (i, state) in self.states.iter().enumerate() {
1398            self.state_cache
1399                .insert(state.nfa_states.clone(), i as DfaStateId);
1400        }
1401
1402        original_count - self.states.len()
1403    }
1404
1405    /// Get transition for a state and character (without computing new transitions).
1406    fn get_transition(&self, state_idx: usize, ch: char) -> Option<DfaStateId> {
1407        let state = &self.states[state_idx];
1408        if ch.is_ascii() {
1409            let cached = state.ascii_transitions.get(ch as u8);
1410            if cached == TRANS_UNKNOWN || cached == TRANS_DEAD {
1411                None
1412            } else {
1413                Some(cached)
1414            }
1415        } else {
1416            state
1417                .unicode_transitions
1418                .get(&ch)
1419                .copied()
1420                .filter(|&t| t != TRANS_DEAD)
1421        }
1422    }
1423}
1424
1425#[cfg(test)]
1426mod tests {
1427    use super::*;
1428    use crate::compiler::build_nfa;
1429    use crate::ir::lower;
1430    use crate::parser::parse;
1431
1432    fn make_dfa(pattern: &str) -> Option<Dfa> {
1433        make_dfa_ci(pattern, false)
1434    }
1435
1436    fn make_dfa_ci(pattern: &str, case_insensitive: bool) -> Option<Dfa> {
1437        make_dfa_full(pattern, case_insensitive, false)
1438    }
1439
1440    fn make_dfa_full(pattern: &str, case_insensitive: bool, multi_line: bool) -> Option<Dfa> {
1441        let ast = parse(pattern).unwrap();
1442        let hir = lower(&ast, 0);
1443        let (nfa, literals) = build_nfa(&hir);
1444
1445        // Create FuzzyBridge if we have literals
1446        let bridge = if literals.is_empty() {
1447            None
1448        } else {
1449            FuzzyBridge::new(&literals, None, None, case_insensitive)
1450        };
1451
1452        Dfa::from_nfa(&nfa, bridge.as_ref(), case_insensitive, multi_line)
1453    }
1454
1455    #[test]
1456    fn test_simple_literal() {
1457        let mut dfa = make_dfa("hello").unwrap();
1458        let m = dfa.find("hello world").unwrap();
1459        assert_eq!(m.start, 0);
1460        assert_eq!(m.end, 5);
1461
1462        let m = dfa.find("say hello").unwrap();
1463        assert_eq!(m.start, 4);
1464        assert_eq!(m.end, 9);
1465    }
1466
1467    #[test]
1468    fn test_char_class() {
1469        let mut dfa = make_dfa("[a-z]+").unwrap();
1470        let m = dfa.find("123abc456").unwrap();
1471        assert_eq!(m.start, 3);
1472        assert_eq!(m.end, 6);
1473    }
1474
1475    #[test]
1476    fn test_start_anchor() {
1477        let mut dfa = make_dfa("^hello").unwrap();
1478        assert!(dfa.find("hello world").is_some());
1479        assert!(dfa.find("say hello").is_none());
1480    }
1481
1482    #[test]
1483    fn test_end_anchor() {
1484        let mut dfa = make_dfa("world$").unwrap();
1485        let m = dfa.find("hello world").unwrap();
1486        assert_eq!(m.start, 6);
1487        assert_eq!(m.end, 11);
1488
1489        assert!(dfa.find("world hello").is_none());
1490    }
1491
1492    #[test]
1493    fn test_alternation() {
1494        let mut dfa = make_dfa("cat|dog").unwrap();
1495        assert!(dfa.find("I have a cat").is_some());
1496        assert!(dfa.find("I have a dog").is_some());
1497        assert!(dfa.find("I have a bird").is_none());
1498    }
1499
1500    #[test]
1501    fn test_quantifiers() {
1502        let mut dfa = make_dfa("a+").unwrap();
1503        let m = dfa.find("baaab").unwrap();
1504        assert_eq!(m.start, 1);
1505        assert_eq!(m.end, 4);
1506
1507        let mut dfa = make_dfa("a*").unwrap();
1508        let m = dfa.find("baaab").unwrap();
1509        assert_eq!(m.start, 0); // Empty match at start
1510        assert_eq!(m.end, 0);
1511
1512        let mut dfa = make_dfa("a?").unwrap();
1513        let m = dfa.find("baaab").unwrap();
1514        assert_eq!(m.start, 0); // Empty match at start
1515    }
1516
1517    #[test]
1518    fn test_find_all() {
1519        let mut dfa = make_dfa("[a-z]+").unwrap();
1520        let matches = dfa.find_all("abc 123 def 456 ghi");
1521        assert_eq!(matches.len(), 3);
1522        assert_eq!(matches[0].start, 0);
1523        assert_eq!(matches[0].end, 3);
1524        assert_eq!(matches[1].start, 8);
1525        assert_eq!(matches[1].end, 11);
1526    }
1527
1528    #[test]
1529    fn test_fuzzy_not_compatible() {
1530        // Fuzzy patterns should return None
1531        assert!(make_dfa("(?:hello){e<=1}").is_none());
1532        assert!(make_dfa("hello~1").is_none());
1533    }
1534
1535    // Case-insensitive tests
1536    #[test]
1537    fn test_case_insensitive_literal() {
1538        let mut dfa = make_dfa_ci("hello", true).unwrap();
1539
1540        // Should match all case variants
1541        assert!(dfa.find("hello").is_some());
1542        assert!(dfa.find("HELLO").is_some());
1543        assert!(dfa.find("HeLLo").is_some());
1544        assert!(dfa.find("hElLo").is_some());
1545
1546        // Should find in mixed text
1547        let m = dfa.find("say HELLO world").unwrap();
1548        assert_eq!(m.start, 4);
1549        assert_eq!(m.end, 9);
1550    }
1551
1552    #[test]
1553    fn test_case_insensitive_char_class() {
1554        let mut dfa = make_dfa_ci("[a-z]+", true).unwrap();
1555
1556        // Should match uppercase too
1557        let m = dfa.find("123ABC456").unwrap();
1558        assert_eq!(m.start, 3);
1559        assert_eq!(m.end, 6);
1560
1561        // Mixed case
1562        let m = dfa.find("123AbC456").unwrap();
1563        assert_eq!(m.start, 3);
1564        assert_eq!(m.end, 6);
1565    }
1566
1567    #[test]
1568    fn test_case_insensitive_find_all() {
1569        let mut dfa = make_dfa_ci("hello", true).unwrap();
1570        let matches = dfa.find_all("hello HELLO HeLLo");
1571        assert_eq!(matches.len(), 3);
1572    }
1573
1574    #[test]
1575    fn test_case_sensitive_does_not_match_wrong_case() {
1576        let mut dfa = make_dfa_ci("hello", false).unwrap();
1577
1578        assert!(dfa.find("hello").is_some());
1579        assert!(dfa.find("HELLO").is_none());
1580        assert!(dfa.find("HeLLo").is_none());
1581    }
1582
1583    // Minimization tests
1584    #[test]
1585    fn test_minimize_simple() {
1586        let mut dfa = make_dfa("hello").unwrap();
1587        let before = dfa.state_count();
1588        let removed = dfa.minimize();
1589        let after = dfa.state_count();
1590
1591        assert_eq!(removed, before - after);
1592
1593        // Should still match correctly after minimization
1594        assert!(dfa.find("hello").is_some());
1595        assert!(dfa.find("world").is_none());
1596        let m = dfa.find("say hello").unwrap();
1597        assert_eq!(m.start, 4);
1598        assert_eq!(m.end, 9);
1599    }
1600
1601    #[test]
1602    fn test_minimize_alternation() {
1603        // Alternation often creates mergeable states
1604        let mut dfa = make_dfa("cat|car|cap").unwrap();
1605        // Complete the DFA first to see all states
1606        dfa.complete();
1607        let before = dfa.state_count();
1608        let removed = dfa.minimize();
1609        let after = dfa.state_count();
1610
1611        // States sharing "ca" prefix should be merged
1612        println!("cat|car|cap: Before={before}, After={after}, Removed={removed}");
1613
1614        // Should still match correctly
1615        assert!(dfa.find("cat").is_some());
1616        assert!(dfa.find("car").is_some());
1617        assert!(dfa.find("cap").is_some());
1618        assert!(dfa.find("cab").is_none());
1619    }
1620
1621    #[test]
1622    fn test_minimize_char_class() {
1623        let mut dfa = make_dfa("[abc]+").unwrap();
1624        // Complete the DFA first to see all states
1625        dfa.complete();
1626        let before = dfa.state_count();
1627        let removed = dfa.minimize();
1628        let after = dfa.state_count();
1629
1630        println!("[abc]+ states: Before={before}, After={after}, Removed={removed}");
1631
1632        // Should still work correctly
1633        let m = dfa.find("xyzabc123").unwrap();
1634        assert_eq!(m.start, 3);
1635        assert_eq!(m.end, 6);
1636    }
1637
1638    #[test]
1639    fn test_minimize_preserves_anchors() {
1640        let mut dfa = make_dfa("^hello$").unwrap();
1641        dfa.minimize();
1642
1643        // Anchors should still work
1644        assert!(dfa.find("hello").is_some());
1645        assert!(dfa.find("hello world").is_none()); // $ anchor
1646        assert!(dfa.find("say hello").is_none()); // ^ anchor
1647    }
1648
1649    #[test]
1650    fn test_minimize_case_insensitive() {
1651        let mut dfa = make_dfa_ci("hello", true).unwrap();
1652        dfa.complete();
1653        let before = dfa.state_count();
1654        let removed = dfa.minimize();
1655        let after = dfa.state_count();
1656
1657        println!("CI 'hello' states: Before={before}, After={after}, Removed={removed}");
1658
1659        // Should still match case-insensitively
1660        assert!(dfa.find("hello").is_some());
1661        assert!(dfa.find("HELLO").is_some());
1662        assert!(dfa.find("HeLLo").is_some());
1663    }
1664
1665    #[test]
1666    fn test_state_count() {
1667        let dfa = make_dfa("a").unwrap();
1668        assert!(dfa.state_count() >= 1);
1669
1670        let dfa = make_dfa("abc").unwrap();
1671        assert!(dfa.state_count() >= 1);
1672    }
1673
1674    #[test]
1675    fn test_minimize_with_bounded_quantifier() {
1676        // Pattern with bounded quantifier
1677        let mut dfa = make_dfa("^a{1,3}b$").unwrap();
1678        dfa.complete();
1679        let before = dfa.state_count();
1680        let removed = dfa.minimize();
1681        let after = dfa.state_count();
1682
1683        println!("^a{{1,3}}b$: Before={before}, After={after}, Removed={removed}");
1684
1685        // Should still match correctly with anchors
1686        assert!(dfa.find("ab").is_some());
1687        assert!(dfa.find("aab").is_some());
1688        assert!(dfa.find("aaab").is_some());
1689        assert!(dfa.find("aaaab").is_none()); // 4 a's - doesn't match
1690        assert!(dfa.find("b").is_none()); // no a's - doesn't match
1691    }
1692
1693    #[test]
1694    fn test_minimize_complex_alternation() {
1695        // Many alternations might create some equivalent states
1696        let mut dfa = make_dfa("abc|abd|abe|abf").unwrap();
1697        dfa.complete();
1698        let before = dfa.state_count();
1699        let removed = dfa.minimize();
1700        let after = dfa.state_count();
1701
1702        println!("abc|abd|abe|abf: Before={before}, After={after}, Removed={removed}");
1703
1704        // Should still match
1705        assert!(dfa.find("abc").is_some());
1706        assert!(dfa.find("abd").is_some());
1707        assert!(dfa.find("abe").is_some());
1708        assert!(dfa.find("abf").is_some());
1709        assert!(dfa.find("abg").is_none());
1710    }
1711
1712    // Multiline tests
1713    #[test]
1714    fn test_multiline_start_anchor() {
1715        // Without multiline: ^ only matches at position 0
1716        let mut dfa = make_dfa_full("^line", false, false).unwrap();
1717        assert!(dfa.find("line1\nline2").is_some());
1718        assert!(dfa.find("first\nline2").is_none()); // ^ doesn't match after \n
1719
1720        // With multiline: ^ matches at start OR after \n
1721        let mut dfa_ml = make_dfa_full("^line", false, true).unwrap();
1722        assert!(dfa_ml.find("line1\nline2").is_some());
1723        let m = dfa_ml.find("first\nline2").unwrap();
1724        assert_eq!(m.start, 6); // matches "line" after newline
1725        assert_eq!(m.end, 10);
1726    }
1727
1728    #[test]
1729    fn test_multiline_end_anchor() {
1730        // Without multiline: $ only matches at end of string
1731        let mut dfa = make_dfa_full("end$", false, false).unwrap();
1732        assert!(dfa.find("the end").is_some());
1733        assert!(dfa.find("end\nnext").is_none()); // $ doesn't match before \n
1734
1735        // With multiline: $ matches at end OR before \n
1736        let mut dfa_ml = make_dfa_full("end$", false, true).unwrap();
1737        assert!(dfa_ml.find("the end").is_some());
1738        let m = dfa_ml.find("end\nnext").unwrap();
1739        assert_eq!(m.start, 0);
1740        assert_eq!(m.end, 3);
1741    }
1742
1743    #[test]
1744    fn test_multiline_both_anchors() {
1745        // Pattern ^line$ should match whole lines in multiline mode
1746        let mut dfa_ml = make_dfa_full("^line$", false, true).unwrap();
1747
1748        // Match first line
1749        let m = dfa_ml.find("line\nother").unwrap();
1750        assert_eq!(m.start, 0);
1751        assert_eq!(m.end, 4);
1752
1753        // Match middle line
1754        let m = dfa_ml.find("first\nline\nlast").unwrap();
1755        assert_eq!(m.start, 6);
1756        assert_eq!(m.end, 10);
1757
1758        // Match last line
1759        let m = dfa_ml.find("other\nline").unwrap();
1760        assert_eq!(m.start, 6);
1761        assert_eq!(m.end, 10);
1762    }
1763
1764    #[test]
1765    fn test_multiline_find_all() {
1766        let mut dfa_ml = make_dfa_full("^[a-z]+$", false, true).unwrap();
1767        let matches = dfa_ml.find_all("hello\n123\nworld");
1768
1769        assert_eq!(matches.len(), 2);
1770        assert_eq!(matches[0].start, 0);
1771        assert_eq!(matches[0].end, 5);
1772        assert_eq!(matches[1].start, 10);
1773        assert_eq!(matches[1].end, 15);
1774    }
1775}
fuzzy_regex/engine/dfa.rs

fuzzy_regex/engine/
dfa.rs