Skip to main content

fuzzy_regex/ir/
nfa.rs

1//! NFA (Non-deterministic Finite Automaton) representation.
2//!
3//! The NFA is used for matching regex patterns. It supports:
4//! - Fuzzy literal matching via Levenshtein automata
5//! - Exact character and class matching
6//! - Epsilon transitions for quantifiers and alternation
7//! - Capture groups
8//! - Anchors and assertions
9
10use std::sync::Arc;
11
12use crate::engine::fuzzy_bridge::FuzzyBridge;
13use crate::types::FuzzyLimits;
14
15use super::hir::HirClass;
16use crate::parser::Anchor;
17
18/// Cost constraint for fuzzy matching.
19/// Allows specifying custom costs for each edit operation type.
20#[derive(Debug, Clone, Default)]
21pub struct CostConstraint {
22    /// Cost per insertion (default: 1).
23    pub insertion_cost: u8,
24    /// Cost per deletion (default: 1).
25    pub deletion_cost: u8,
26    /// Cost per substitution (default: 1).
27    pub substitution_cost: u8,
28    /// Cost per transposition (default: 1).
29    pub transposition_cost: u8,
30    /// Maximum total cost allowed.
31    pub max_cost: u8,
32}
33
34impl CostConstraint {
35    /// Check if the given edit counts satisfy this cost constraint.
36    #[must_use]
37    pub fn is_satisfied(
38        &self,
39        insertions: u8,
40        deletions: u8,
41        substitutions: u8,
42        transpositions: u8,
43    ) -> bool {
44        let total_cost = (u16::from(insertions) * u16::from(self.insertion_cost))
45            + (u16::from(deletions) * u16::from(self.deletion_cost))
46            + (u16::from(substitutions) * u16::from(self.substitution_cost))
47            + (u16::from(transpositions) * u16::from(self.transposition_cost));
48        total_cost < u16::from(self.max_cost)
49    }
50}
51
52/// State identifier in the NFA.
53pub type StateId = usize;
54
55/// Pattern index for fuzzy literals.
56pub type PatternIndex = usize;
57
58/// NFA representation for fuzzy regex.
59#[derive(Debug, Clone)]
60pub struct Nfa {
61    /// All states in the NFA.
62    pub states: Vec<State>,
63    /// The start state.
64    pub start: StateId,
65    /// Group sub-NFAs for recursion support.
66    /// Maps group index -> (`start_state`, `end_state`) for the group's content.
67    pub group_states: Vec<(StateId, StateId)>,
68    /// Named group state ranges for recursion.
69    /// Maps group name -> (`start_state`, `end_state`).
70    pub named_group_states: std::collections::HashMap<String, (StateId, StateId)>,
71}
72
73impl Nfa {
74    /// Create a new NFA with a single start state.
75    #[must_use]
76    pub fn new() -> Self {
77        Nfa {
78            states: vec![State::Accept],
79            start: 0,
80            group_states: Vec::new(),
81            named_group_states: std::collections::HashMap::new(),
82        }
83    }
84
85    /// Check if this NFA is "simple" - just a single `FuzzyLiteral` leading to Accept.
86    /// Simple NFAs don't need full NFA simulation; we can use Bitap result directly.
87    ///
88    /// Returns true if the NFA is:
89    /// - Start → (optional Epsilon) → `FuzzyLiteral` → (optional Epsilon) → Accept
90    /// - No captures, no `min_edits`, no `cost_constraint`
91    #[must_use]
92    pub fn is_simple_fuzzy_only(&self) -> bool {
93        let mut visited = vec![false; self.states.len()];
94        self.check_simple_fuzzy_only(self.start, &mut visited, false)
95    }
96
97    /// Check if this NFA is a "simple alternation" - a Split of `FuzzyLiterals` all leading to Accept.
98    ///
99    /// Returns true if the NFA is:
100    /// - Start → Split { branches: [`FuzzyLiteral` → Accept, `FuzzyLiteral` → Accept, ...] }
101    /// - No captures, no `min_edits`, no `cost_constraint` in any branch
102    ///
103    /// This allows using a fast multi-pattern Bitap search instead of full NFA simulation.
104    #[must_use]
105    pub fn is_simple_alternation(&self) -> bool {
106        !self.get_alternation_pattern_indices().is_empty()
107    }
108
109    /// Get the pattern indices for a simple alternation.
110    ///
111    /// Returns a vector of pattern indices if the NFA is a simple alternation of `FuzzyLiterals`,
112    /// or an empty vector if the NFA is not a simple alternation.
113    #[must_use]
114    pub fn get_alternation_pattern_indices(&self) -> Vec<PatternIndex> {
115        // Follow through initial epsilon transitions
116        let mut state_id = self.start;
117        loop {
118            match &self.states[state_id] {
119                State::Epsilon { targets } if targets.len() == 1 => {
120                    state_id = targets[0];
121                }
122                _ => break,
123            }
124        }
125
126        // Must be a Split state with multiple branches
127        let branches = match &self.states[state_id] {
128            State::Split { branches, .. } if branches.len() >= 2 => branches,
129            _ => return Vec::new(),
130        };
131
132        let mut pattern_indices = Vec::with_capacity(branches.len());
133
134        for &branch_id in branches {
135            // Each branch must be FuzzyLiteral → (epsilon*) → Accept
136            // with no min_edits or cost_constraint
137            if let Some(idx) = self.check_simple_alternation_branch(branch_id) {
138                pattern_indices.push(idx);
139            } else {
140                return Vec::new(); // Not a simple alternation
141            }
142        }
143
144        pattern_indices
145    }
146
147    /// Check if a branch is a simple `FuzzyLiteral` → Accept path.
148    /// Returns the pattern index if valid, None otherwise.
149    fn check_simple_alternation_branch(&self, mut state_id: StateId) -> Option<PatternIndex> {
150        // Skip initial epsilon transitions
151        loop {
152            match &self.states[state_id] {
153                State::Epsilon { targets } if targets.len() == 1 => {
154                    state_id = targets[0];
155                }
156                _ => break,
157            }
158        }
159
160        // Must be FuzzyLiteral with no constraints
161        let (pattern_index, next) = match &self.states[state_id] {
162            State::FuzzyLiteral {
163                pattern_index,
164                next,
165                min_edits: None,
166                cost_constraint: None,
167                ..
168            } => (*pattern_index, *next),
169            _ => return None,
170        };
171
172        // Follow to Accept (through epsilon transitions)
173        let mut state_id = next;
174        loop {
175            match &self.states[state_id] {
176                State::Accept => return Some(pattern_index),
177                State::Epsilon { targets } if targets.len() == 1 => {
178                    state_id = targets[0];
179                }
180                _ => return None,
181            }
182        }
183    }
184
185    /// Recursive helper for `is_simple_fuzzy_only`.
186    /// `seen_fuzzy` tracks whether we've already seen a `FuzzyLiteral`.
187    fn check_simple_fuzzy_only(
188        &self,
189        state_id: StateId,
190        visited: &mut [bool],
191        seen_fuzzy: bool,
192    ) -> bool {
193        if visited[state_id] {
194            return false; // Cycle detected - not simple
195        }
196        visited[state_id] = true;
197
198        match &self.states[state_id] {
199            State::Accept => seen_fuzzy, // Must have seen exactly one FuzzyLiteral
200
201            State::Epsilon { targets } => {
202                // All targets must be simple
203                if targets.len() != 1 {
204                    return false; // Multiple branches - not simple
205                }
206                self.check_simple_fuzzy_only(targets[0], visited, seen_fuzzy)
207            }
208
209            State::FuzzyLiteral {
210                next,
211                min_edits,
212                cost_constraint,
213                ..
214            } => {
215                // Only simple if no constraints and haven't seen another FuzzyLiteral
216                if seen_fuzzy || min_edits.is_some() || cost_constraint.is_some() {
217                    return false;
218                }
219                self.check_simple_fuzzy_only(*next, visited, true)
220            }
221
222            // Any other state type makes it not simple
223            State::Char { .. }
224            | State::FuzzyChar { .. }
225            | State::CaptureStart { .. }
226            | State::CaptureEnd { .. }
227            | State::Anchor { .. }
228            | State::Lookahead { .. }
229            | State::Lookbehind { .. }
230            | State::Backreference { .. }
231            | State::Split { .. }
232            | State::ResetMatchStart { .. }
233            | State::AtomicGroup { .. }
234            | State::RecursivePattern { .. }
235            | State::RecursiveGroup { .. }
236            | State::RecursiveNamedGroup { .. } => false,
237        }
238    }
239
240    /// Add a new state and return its ID.
241    pub fn add_state(&mut self, state: State) -> StateId {
242        let id = self.states.len();
243        self.states.push(state);
244        id
245    }
246
247    /// Get a reference to a state.
248    #[must_use]
249    pub fn state(&self, id: StateId) -> &State {
250        &self.states[id]
251    }
252
253    /// Get a mutable reference to a state.
254    pub fn state_mut(&mut self, id: StateId) -> &mut State {
255        &mut self.states[id]
256    }
257
258    /// Check if a state is accepting.
259    #[must_use]
260    pub fn is_accepting(&self, id: StateId) -> bool {
261        matches!(self.states[id], State::Accept)
262    }
263
264    /// Get the total number of states.
265    #[must_use]
266    pub fn state_count(&self) -> usize {
267        self.states.len()
268    }
269
270    /// Check if this NFA contains any lazy (non-greedy) quantifiers.
271    ///
272    /// Returns true if any Split state has greedy=false.
273    /// Used to determine whether to prefer shorter matches.
274    #[must_use]
275    pub fn has_lazy_quantifiers(&self) -> bool {
276        self.states
277            .iter()
278            .any(|state| matches!(state, State::Split { greedy: false, .. }))
279    }
280
281    /// Check if this NFA contains a `ResetMatchStart` state (\K).
282    /// Used to skip DFA which doesn't properly handle \K reset.
283    #[must_use]
284    pub fn has_reset_match_start(&self) -> bool {
285        self.states
286            .iter()
287            .any(|state| matches!(state, State::ResetMatchStart { .. }))
288    }
289
290    /// Check if this NFA contains any lookahead or lookbehind assertions.
291    #[must_use]
292    pub fn has_lookahead(&self) -> bool {
293        self.states
294            .iter()
295            .any(|state| matches!(state, State::Lookahead { .. }))
296    }
297
298    /// Check if this NFA contains word boundary anchors.
299    #[must_use]
300    pub fn has_word_boundary(&self) -> bool {
301        self.states.iter().any(|state| {
302            if let State::Anchor { kind, .. } = state {
303                matches!(kind, Anchor::WordBoundary | Anchor::NotWordBoundary)
304            } else {
305                false
306            }
307        })
308    }
309
310    /// Check if this NFA contains any recursive patterns.
311    /// Used to determine whether to use backtracking engine.
312    #[must_use]
313    pub fn has_recursion(&self) -> bool {
314        self.states.iter().any(|state| {
315            matches!(
316                state,
317                State::RecursivePattern { .. }
318                    | State::RecursiveGroup { .. }
319                    | State::RecursiveNamedGroup { .. }
320            )
321        })
322    }
323
324    /// Check if this NFA is a simple word-bounded literal pattern like `\bword\b`.
325    ///
326    /// Returns true if the pattern is essentially:
327    /// - `\b` at start
328    /// - A single `FuzzyLiteral`
329    /// - `\b` at end
330    ///
331    /// This enables an optimization where we can find literal positions and
332    /// filter by word boundary instead of full NFA simulation.
333    #[must_use]
334    pub fn is_word_bounded_literal(&self) -> bool {
335        let mut visited = vec![false; self.states.len()];
336        self.check_word_bounded_literal(self.start, &mut visited, false, false, false)
337    }
338
339    /// Check if this NFA is a word-bounded literal pattern like `\bword\b`.
340    ///
341    /// Returns true if the pattern is exactly:
342    /// - `\b` at start
343    /// - A single `FuzzyLiteral`
344    /// - `\b` at end
345    ///
346    /// This enables an optimization where we can find literal positions and
347    /// filter by word boundary instead of full NFA simulation.
348    fn check_word_bounded_literal(
349        &self,
350        state_id: StateId,
351        visited: &mut [bool],
352        seen_start_boundary: bool,
353        seen_end_boundary: bool,
354        seen_literal: bool,
355    ) -> bool {
356        if visited[state_id] {
357            return false;
358        }
359        visited[state_id] = true;
360
361        match &self.states[state_id] {
362            State::Accept => {
363                // Valid only if we saw BOTH start and end boundaries with literal in between
364                seen_start_boundary && seen_end_boundary && seen_literal
365            }
366            State::Epsilon { targets } if targets.len() == 1 => self.check_word_bounded_literal(
367                targets[0],
368                visited,
369                seen_start_boundary,
370                seen_end_boundary,
371                seen_literal,
372            ),
373            State::Anchor {
374                kind: crate::parser::Anchor::WordBoundary,
375                next,
376            } => {
377                if seen_literal && !seen_end_boundary {
378                    // This is the second boundary (end boundary)
379                    // Only valid if we already saw a start boundary before the literal
380                    if seen_start_boundary {
381                        self.check_word_bounded_literal(
382                            *next,
383                            visited,
384                            seen_start_boundary,
385                            true, // Now we've seen the end boundary
386                            seen_literal,
387                        )
388                    } else {
389                        false
390                    }
391                } else if !seen_start_boundary && !seen_literal && !seen_end_boundary {
392                    // This is the first boundary (start boundary)
393                    self.check_word_bounded_literal(
394                        *next, visited, true, // Start boundary seen
395                        false, false,
396                    )
397                } else {
398                    false // Invalid state
399                }
400            }
401            State::FuzzyLiteral { next, .. } => {
402                if seen_start_boundary && !seen_literal && !seen_end_boundary {
403                    // We have start boundary, now seeing literal
404                    self.check_word_bounded_literal(
405                        *next,
406                        visited,
407                        seen_start_boundary,
408                        false,
409                        true, // Literal seen
410                    )
411                } else {
412                    false
413                }
414            }
415            _ => false,
416        }
417    }
418
419    /// Extract the first character class that must match for this NFA.
420    ///
421    /// This is used for quick rejection - if the first character doesn't match,
422    /// we can skip NFA simulation entirely.
423    ///
424    /// Returns None if:
425    /// - The pattern can match empty string
426    /// - The pattern starts with an anchor
427    /// - The first character is ambiguous (multiple branches with different first chars)
428    #[must_use]
429    pub fn first_char_class(&self) -> Option<HirClass> {
430        let mut visited = vec![false; self.states.len()];
431        self.first_char_class_from(self.start, &mut visited)
432    }
433
434    fn first_char_class_from(&self, state_id: StateId, visited: &mut [bool]) -> Option<HirClass> {
435        if visited[state_id] {
436            return None; // Cycle
437        }
438        visited[state_id] = true;
439
440        match &self.states[state_id] {
441            State::Epsilon { targets } => {
442                if targets.len() == 1 {
443                    self.first_char_class_from(targets[0], visited)
444                } else {
445                    None // Multiple paths - ambiguous
446                }
447            }
448            State::Char { class, .. } => Some(class.clone()),
449            State::FuzzyChar { class, limits, .. } => {
450                // Only use as prefilter if no deletions allowed
451                // (deletion would allow skipping the first char)
452                let max_deletions = limits
453                    .as_ref()
454                    .and_then(FuzzyLimits::get_deletions)
455                    .unwrap_or(0);
456                if max_deletions == 0 {
457                    Some(class.clone())
458                } else {
459                    None
460                }
461            }
462            State::CaptureStart { next, .. } | State::CaptureEnd { next, .. } => {
463                self.first_char_class_from(*next, visited)
464            }
465            State::Split { branches, .. } => {
466                // Check if all branches have the same first char class
467                if branches.is_empty() {
468                    return None;
469                }
470                let first = self.first_char_class_from(branches[0], visited)?;
471                for &branch in &branches[1..] {
472                    let branch_class = self.first_char_class_from(branch, visited)?;
473                    // For simplicity, only use if all branches have exactly the same class
474                    // (could be more sophisticated but this handles common cases)
475                    if branch_class.chars != first.chars
476                        || branch_class.ranges != first.ranges
477                        || branch_class.negated != first.negated
478                    {
479                        return None;
480                    }
481                }
482                Some(first)
483            }
484            _ => None,
485        }
486    }
487
488    /// Create a sub-NFA (used for lookahead/lookbehind).
489    #[must_use]
490    pub fn into_sub_nfa(self) -> Box<Nfa> {
491        Box::new(self)
492    }
493
494    /// Check if this NFA ends with an End anchor (`$`).
495    ///
496    /// For patterns like `\.$`, this allows the matcher to search from the end
497    /// of the text instead of scanning from the beginning.
498    ///
499    /// Returns true if all paths to Accept go through an End anchor.
500    #[must_use]
501    pub fn ends_with_end_anchor(&self) -> bool {
502        let mut visited = vec![false; self.states.len()];
503        self.check_ends_with_end_anchor(self.start, &mut visited)
504    }
505
506    /// Recursive helper for `ends_with_end_anchor`.
507    fn check_ends_with_end_anchor(&self, state_id: StateId, visited: &mut [bool]) -> bool {
508        if visited[state_id] {
509            return false; // Cycle - assume false
510        }
511        visited[state_id] = true;
512
513        #[allow(clippy::match_same_arms)]
514        match &self.states[state_id] {
515            // Accept without going through End anchor
516            State::Accept => false,
517
518            // Non-consuming states - check successors
519            State::Epsilon { targets } => {
520                // All targets must end with anchor
521                !targets.is_empty()
522                    && targets
523                        .iter()
524                        .all(|&t| self.check_ends_with_end_anchor(t, visited))
525            }
526
527            State::Split { branches, .. } => {
528                // All branches must end with anchor
529                !branches.is_empty()
530                    && branches
531                        .iter()
532                        .all(|&b| self.check_ends_with_end_anchor(b, visited))
533            }
534
535            // Check anchors
536            State::Anchor {
537                kind: Anchor::End,
538                next,
539            } => {
540                // Found End anchor - check that the path continues to Accept
541                self.path_reaches_accept(*next, &mut vec![false; self.states.len()])
542            }
543
544            State::Anchor { next, .. } => {
545                // Other anchor - continue
546                self.check_ends_with_end_anchor(*next, visited)
547            }
548
549            // Consuming states - check successor
550            State::Char { next, .. }
551            | State::FuzzyChar { next, .. }
552            | State::FuzzyLiteral { next, .. }
553            | State::CaptureStart { next, .. }
554            | State::CaptureEnd { next, .. }
555            | State::Backreference { next, .. }
556            | State::ResetMatchStart { next }
557            | State::Lookahead { next, .. }
558            | State::Lookbehind { next, .. }
559            | State::AtomicGroup { next, .. }
560            | State::RecursivePattern { next, .. }
561            | State::RecursiveGroup { next, .. }
562            | State::RecursiveNamedGroup { next, .. } => {
563                self.check_ends_with_end_anchor(*next, visited)
564            }
565        }
566    }
567
568    /// Check if a path reaches Accept (for end anchor validation).
569    fn path_reaches_accept(&self, state_id: StateId, visited: &mut [bool]) -> bool {
570        if visited[state_id] {
571            return false;
572        }
573        visited[state_id] = true;
574
575        match &self.states[state_id] {
576            State::Accept => true,
577            State::Epsilon { targets } => targets
578                .iter()
579                .any(|&t| self.path_reaches_accept(t, visited)),
580            State::CaptureEnd { next, .. } => self.path_reaches_accept(*next, visited),
581            _ => false, // Any consuming state after End anchor would be invalid
582        }
583    }
584
585    /// Get the maximum match length for simple (non-fuzzy) patterns.
586    ///
587    /// Returns None if the pattern has unbounded length (e.g., `.*`, `+`) or
588    /// contains `FuzzyLiteral` states that need bridge info.
589    ///
590    /// For end-anchored patterns, this helps limit the search range.
591    #[must_use]
592    pub fn max_simple_length(&self) -> Option<usize> {
593        let mut visited = vec![false; self.states.len()];
594        self.max_simple_length_from(self.start, &mut visited)
595    }
596
597    fn max_simple_length_from(&self, state_id: StateId, visited: &mut [bool]) -> Option<usize> {
598        if visited[state_id] {
599            return None; // Cycle means unbounded
600        }
601        visited[state_id] = true;
602
603        #[allow(clippy::match_same_arms)]
604        let result = match &self.states[state_id] {
605            State::Accept => Some(0),
606
607            State::Epsilon { targets } => {
608                // Max across all targets
609                let mut max = Some(0);
610                for &target in targets {
611                    let t_max = self.max_simple_length_from(target, visited)?;
612                    max = max.map(|m| m.max(t_max));
613                }
614                max
615            }
616
617            State::Char { next, .. } => self.max_simple_length_from(*next, visited).map(|m| m + 1),
618
619            State::FuzzyChar { next, limits, .. } => {
620                // With fuzzy, the match length can vary
621                // Max is still 1 char for the pattern position
622                self.max_simple_length_from(*next, visited).map(|m| {
623                    m + 1
624                        + limits
625                            .as_ref()
626                            .and_then(FuzzyLimits::get_insertions)
627                            .unwrap_or(0) as usize
628                })
629            }
630
631            State::FuzzyLiteral { .. } => None, // Need bridge info for accurate length
632
633            State::CaptureStart { next, .. }
634            | State::CaptureEnd { next, .. }
635            | State::Anchor { next, .. } => self.max_simple_length_from(*next, visited),
636
637            State::Split { branches, .. } => {
638                // Max across all branches
639                let mut max = Some(0);
640                for &branch in branches {
641                    let b_max = self.max_simple_length_from(branch, visited)?;
642                    max = max.map(|m| m.max(b_max));
643                }
644                max
645            }
646
647            State::Lookahead { next, .. } | State::Lookbehind { next, .. } => {
648                // Assertions don't consume
649                self.max_simple_length_from(*next, visited)
650            }
651
652            State::Backreference { .. } => None, // Unknown length
653
654            State::AtomicGroup { .. } => None, // Unknown length
655
656            State::ResetMatchStart { .. } => None, // \K - length becomes unknown after reset
657
658            State::RecursivePattern { .. } => None, // Recursive - unknown length
659            State::RecursiveGroup { .. } => None,   // Recursive - unknown length
660            State::RecursiveNamedGroup { .. } => None, // Recursive - unknown length
661        };
662
663        visited[state_id] = false; // Reset for other paths
664        result
665    }
666
667    /// Calculate the minimum and maximum possible match lengths for this NFA.
668    ///
669    /// For lookbehind assertions, this helps determine how far back to search.
670    /// Returns (`min_length`, `max_length`) where `max_length` is None if unbounded.
671    ///
672    /// The `pattern_lengths` callback provides (`char_len`, `max_edits`) for `FuzzyLiteral` patterns.
673    pub fn length_range<F>(&self, pattern_lengths: F) -> (usize, Option<usize>)
674    where
675        F: Fn(usize) -> Option<(usize, u8)>,
676    {
677        let mut visited = vec![false; self.states.len()];
678        let mut memo: Vec<Option<(usize, Option<usize>)>> = vec![None; self.states.len()];
679        self.length_range_state(self.start, &pattern_lengths, &mut visited, &mut memo)
680    }
681
682    /// Recursive helper for `length_range`.
683    fn length_range_state<F>(
684        &self,
685        state_id: StateId,
686        pattern_lengths: &F,
687        visited: &mut [bool],
688        memo: &mut [Option<(usize, Option<usize>)>],
689    ) -> (usize, Option<usize>)
690    where
691        F: Fn(usize) -> Option<(usize, u8)>,
692    {
693        // Return cached result if available
694        if let Some(result) = memo[state_id] {
695            return result;
696        }
697
698        // Cycle detection - return (0, unbounded) for cycles
699        if visited[state_id] {
700            return (0, None);
701        }
702        visited[state_id] = true;
703
704        let result = match &self.states[state_id] {
705            State::Accept | State::ResetMatchStart { .. } => (0, Some(0)),
706
707            State::Epsilon { targets } => {
708                // Min/max across all targets
709                let mut min = usize::MAX;
710                let mut max: Option<usize> = Some(0);
711                for &target in targets {
712                    let (t_min, t_max) =
713                        self.length_range_state(target, pattern_lengths, visited, memo);
714                    min = min.min(t_min);
715                    max = match (max, t_max) {
716                        (Some(a), Some(b)) => Some(a.max(b)),
717                        _ => None,
718                    };
719                }
720                if min == usize::MAX {
721                    min = 0;
722                }
723                (min, max)
724            }
725
726            State::Char { next, .. } => {
727                let (next_min, next_max) =
728                    self.length_range_state(*next, pattern_lengths, visited, memo);
729                (next_min + 1, next_max.map(|m| m + 1))
730            }
731
732            State::FuzzyChar { next, limits, .. } => {
733                // FuzzyChar can match 0-2 characters depending on edits:
734                // - Deletion: 0 chars (pattern char skipped)
735                // - Exact/Substitution: 1 char
736                // - (Insertion handled elsewhere in text loop)
737                let (next_min, next_max) =
738                    self.length_range_state(*next, pattern_lengths, visited, memo);
739                let max_edits = limits
740                    .as_ref()
741                    .and_then(FuzzyLimits::get_edits)
742                    .unwrap_or(0) as usize;
743                // With deletion allowed, can consume 0 chars; otherwise 1 char
744                let char_min = usize::from(max_edits == 0);
745                (next_min + char_min, next_max.map(|m| m + 1))
746            }
747
748            State::FuzzyLiteral {
749                pattern_index,
750                next,
751                ..
752            } => {
753                let (next_min, next_max) =
754                    self.length_range_state(*next, pattern_lengths, visited, memo);
755                if let Some((pat_len, max_edits)) = pattern_lengths(*pattern_index) {
756                    // With fuzzy matching, length can vary by edits:
757                    // - Insertions add chars to match
758                    // - Deletions remove chars from match
759                    let edits = max_edits as usize;
760                    let fuzzy_min = pat_len.saturating_sub(edits);
761                    let fuzzy_max = pat_len + edits;
762                    (next_min + fuzzy_min, next_max.map(|m| m + fuzzy_max))
763                } else {
764                    // Unknown pattern - assume arbitrary length
765                    (next_min, None)
766                }
767            }
768
769            State::CaptureStart { next, .. }
770            | State::CaptureEnd { next, .. }
771            | State::Anchor { next, .. }
772            | State::Lookahead { next, .. }
773            | State::Lookbehind { next, .. }
774            | State::AtomicGroup { next, .. } => {
775                self.length_range_state(*next, pattern_lengths, visited, memo)
776            }
777
778            State::Backreference { next, .. } => {
779                // Backreferences have unknown length (depends on captured text)
780                let _ = self.length_range_state(*next, pattern_lengths, visited, memo);
781                (0, None)
782            }
783
784            State::Split { branches, .. } => {
785                // Min/max across all branches
786                let mut min = usize::MAX;
787                let mut max: Option<usize> = Some(0);
788                for &branch in branches {
789                    let (b_min, b_max) =
790                        self.length_range_state(branch, pattern_lengths, visited, memo);
791                    min = min.min(b_min);
792                    max = match (max, b_max) {
793                        (Some(a), Some(b)) => Some(a.max(b)),
794                        _ => None,
795                    };
796                }
797                if min == usize::MAX {
798                    min = 0;
799                }
800                (min, max)
801            }
802
803            State::RecursivePattern { .. }
804            | State::RecursiveGroup { .. }
805            | State::RecursiveNamedGroup { .. } => (0, None), // Recursive - unknown
806        };
807
808        visited[state_id] = false;
809        memo[state_id] = Some(result);
810        result
811    }
812}
813
814impl Default for Nfa {
815    fn default() -> Self {
816        Self::new()
817    }
818}
819
820/// A state in the NFA.
821#[derive(Debug, Clone)]
822pub enum State {
823    /// Accept state - match succeeded.
824    Accept,
825
826    /// Epsilon transition - no input consumed.
827    Epsilon {
828        /// Target states (multiple for splits).
829        targets: Vec<StateId>,
830    },
831
832    /// Match a single character from a class.
833    Char {
834        /// The character class to match.
835        class: HirClass,
836        /// Next state after matching.
837        next: StateId,
838    },
839
840    /// Match a single character from a class with fuzzy matching support.
841    /// Used for character classes inside fuzzy groups like `(?:[a-z])~1`.
842    FuzzyChar {
843        /// The character class to match.
844        class: HirClass,
845        /// Fuzzy matching limits (insertions, deletions, substitutions).
846        limits: Option<FuzzyLimits>,
847        /// Minimum edits required (for exclusive lower bounds).
848        min_edits: Option<u8>,
849        /// Cost constraint (optional).
850        cost_constraint: Option<CostConstraint>,
851        /// Next state after matching.
852        next: StateId,
853    },
854
855    /// Match a literal string with fuzzy matching.
856    /// Uses Levenshtein automata for the match.
857    FuzzyLiteral {
858        /// Index into the pre-built pattern list.
859        pattern_index: PatternIndex,
860        /// Per-pattern fuzzy limits.
861        limits: Option<FuzzyLimits>,
862        /// Minimum edits required (for exclusive lower bounds like `{0<e<5}`).
863        min_edits: Option<u8>,
864        /// Cost constraint (optional).
865        cost_constraint: Option<CostConstraint>,
866        /// Next state after matching.
867        next: StateId,
868    },
869
870    /// Start of a capture group.
871    CaptureStart {
872        /// Capture group index (1-based).
873        index: usize,
874        /// Next state.
875        next: StateId,
876    },
877
878    /// End of a capture group.
879    CaptureEnd {
880        /// Capture group index (1-based).
881        index: usize,
882        /// Next state.
883        next: StateId,
884    },
885
886    /// Anchor assertion.
887    Anchor {
888        /// The type of anchor.
889        kind: Anchor,
890        /// Next state if anchor matches.
891        next: StateId,
892    },
893
894    /// Lookahead assertion.
895    Lookahead {
896        /// True for positive lookahead, false for negative.
897        positive: bool,
898        /// Sub-NFA to evaluate.
899        nfa: Box<Nfa>,
900        /// Literal patterns used by the sub-NFA.
901        literals: Vec<LiteralPattern>,
902        /// Next state if assertion passes.
903        next: StateId,
904    },
905
906    /// Lookbehind assertion.
907    Lookbehind {
908        /// True for positive lookbehind, false for negative.
909        positive: bool,
910        /// Sub-NFA to evaluate.
911        nfa: Box<Nfa>,
912        /// Literal patterns used by the sub-NFA.
913        literals: Vec<LiteralPattern>,
914        /// Pre-built `FuzzyBridge` for efficient matching (shared via Arc for Clone).
915        bridge: Option<Arc<FuzzyBridge>>,
916        /// Next state if assertion passes.
917        next: StateId,
918    },
919
920    /// Backreference - match the same text as a capture group.
921    Backreference {
922        /// The capture group to reference.
923        group: usize,
924        /// Optional fuzzy limits for fuzzy backreference matching.
925        limits: Option<FuzzyLimits>,
926        /// Next state after matching.
927        next: StateId,
928    },
929
930    /// Split state for alternation (prioritized).
931    /// Tries branches in order for greedy/non-greedy semantics.
932    Split {
933        /// Branch states in priority order.
934        branches: Vec<StateId>,
935        /// Whether this split is greedy (try first branch first) or non-greedy (try last branch first).
936        /// For quantifiers like *, +, ?, this determines match preference.
937        greedy: bool,
938    },
939
940    /// Reset match start - \K
941    /// Resets the match start position to the current position.
942    /// Everything before \K is matched but excluded from the final match.
943    ResetMatchStart {
944        /// Next state after the reset.
945        next: StateId,
946    },
947
948    /// Atomic group - (?>expr)
949    /// Once matched, prevents backtracking within the group.
950    AtomicGroup {
951        /// The sub-NFA for the expression inside the group.
952        nfa: Box<Nfa>,
953        /// Next state after the atomic group.
954        next: StateId,
955    },
956
957    /// Recursive pattern - (?R)
958    /// Recursively matches the entire pattern.
959    RecursivePattern {
960        /// Next state after the recursive call.
961        next: StateId,
962    },
963
964    /// Recursive numbered group - (?1), (?2), etc.
965    RecursiveGroup {
966        /// The capture group number to recurse into.
967        group: usize,
968        /// Next state after the recursive call.
969        next: StateId,
970    },
971
972    /// Recursive named group - (?&name) or (?P>name)
973    RecursiveNamedGroup {
974        /// The name of the capture group to recurse into.
975        name: String,
976        /// Next state after the recursive call.
977        next: StateId,
978    },
979}
980
981impl State {
982    /// Create an epsilon transition to a single target.
983    #[must_use]
984    pub fn epsilon(target: StateId) -> Self {
985        State::Epsilon {
986            targets: vec![target],
987        }
988    }
989
990    /// Create an epsilon transition to multiple targets.
991    #[must_use]
992    pub fn epsilon_multi(targets: Vec<StateId>) -> Self {
993        State::Epsilon { targets }
994    }
995
996    /// Create a character matching state.
997    #[must_use]
998    pub fn char_match(class: HirClass, next: StateId) -> Self {
999        State::Char { class, next }
1000    }
1001
1002    /// Create a fuzzy literal state.
1003    #[must_use]
1004    pub fn fuzzy_literal(
1005        pattern_index: PatternIndex,
1006        limits: Option<FuzzyLimits>,
1007        min_edits: Option<u8>,
1008        cost_constraint: Option<CostConstraint>,
1009        next: StateId,
1010    ) -> Self {
1011        State::FuzzyLiteral {
1012            pattern_index,
1013            limits,
1014            min_edits,
1015            cost_constraint,
1016            next,
1017        }
1018    }
1019
1020    /// Create a capture start state.
1021    #[must_use]
1022    pub fn capture_start(index: usize, next: StateId) -> Self {
1023        State::CaptureStart { index, next }
1024    }
1025
1026    /// Create a capture end state.
1027    #[must_use]
1028    pub fn capture_end(index: usize, next: StateId) -> Self {
1029        State::CaptureEnd { index, next }
1030    }
1031
1032    /// Create an anchor state.
1033    #[must_use]
1034    pub fn anchor(kind: Anchor, next: StateId) -> Self {
1035        State::Anchor { kind, next }
1036    }
1037
1038    /// Create a split state. Defaults to greedy.
1039    #[must_use]
1040    pub fn split(branches: Vec<StateId>) -> Self {
1041        State::Split {
1042            branches,
1043            greedy: true,
1044        }
1045    }
1046
1047    /// Create a lookahead state.
1048    #[must_use]
1049    pub fn lookahead(
1050        positive: bool,
1051        nfa: Box<Nfa>,
1052        literals: Vec<LiteralPattern>,
1053        next: StateId,
1054    ) -> Self {
1055        State::Lookahead {
1056            positive,
1057            nfa,
1058            literals,
1059            next,
1060        }
1061    }
1062
1063    /// Create a lookbehind state with pre-built `FuzzyBridge`.
1064    pub fn lookbehind(
1065        positive: bool,
1066        nfa: Box<Nfa>,
1067        literals: Vec<LiteralPattern>,
1068        next: StateId,
1069    ) -> Self {
1070        // Pre-build the FuzzyBridge for efficient matching
1071        let bridge = if literals.is_empty() {
1072            None
1073        } else {
1074            FuzzyBridge::new(&literals, None, None, false).map(Arc::new)
1075        };
1076        State::Lookbehind {
1077            positive,
1078            nfa,
1079            literals,
1080            bridge,
1081            next,
1082        }
1083    }
1084
1085    /// Create a backreference state.
1086    #[must_use]
1087    pub fn backreference(group: usize, limits: Option<FuzzyLimits>, next: StateId) -> Self {
1088        State::Backreference {
1089            group,
1090            limits,
1091            next,
1092        }
1093    }
1094
1095    /// Get the next state(s) from this state.
1096    #[must_use]
1097    pub fn next_states(&self) -> Vec<StateId> {
1098        #[allow(clippy::match_same_arms)]
1099        match self {
1100            State::Accept => vec![],
1101            State::Epsilon { targets } => targets.clone(),
1102            State::Char { next, .. }
1103            | State::FuzzyChar { next, .. }
1104            | State::FuzzyLiteral { next, .. }
1105            | State::CaptureStart { next, .. }
1106            | State::CaptureEnd { next, .. }
1107            | State::Anchor { next, .. }
1108            | State::Lookahead { next, .. }
1109            | State::Lookbehind { next, .. }
1110            | State::Backreference { next, .. }
1111            | State::AtomicGroup { next, .. }
1112            | State::RecursivePattern { next, .. }
1113            | State::RecursiveGroup { next, .. }
1114            | State::RecursiveNamedGroup { next, .. } => vec![*next],
1115            State::Split { branches, .. } => branches.clone(),
1116            State::ResetMatchStart { .. } => vec![],
1117        }
1118    }
1119}
1120
1121/// Fragment of an NFA being built (used during construction).
1122#[derive(Debug, Clone)]
1123pub struct NfaFragment {
1124    /// Entry state of the fragment.
1125    pub start: StateId,
1126    /// Exit states of the fragment (to be patched).
1127    pub ends: Vec<StateId>,
1128}
1129
1130impl NfaFragment {
1131    /// Create a new fragment with given start and ends.
1132    #[must_use]
1133    pub fn new(start: StateId, ends: Vec<StateId>) -> Self {
1134        NfaFragment { start, ends }
1135    }
1136
1137    /// Create a fragment with a single end state.
1138    #[must_use]
1139    pub fn single(start: StateId, end: StateId) -> Self {
1140        NfaFragment {
1141            start,
1142            ends: vec![end],
1143        }
1144    }
1145}
1146
1147/// Character class restriction for fuzzy edits.
1148/// When set, edits (insertions, substitutions, etc.) must involve characters from this class.
1149#[derive(Debug, Clone)]
1150pub struct EditCharRestriction {
1151    /// Characters that are allowed in edits.
1152    pub chars: Vec<char>,
1153    /// Character ranges allowed in edits.
1154    pub ranges: Vec<(char, char)>,
1155}
1156
1157impl EditCharRestriction {
1158    /// Create a new edit character restriction.
1159    #[must_use]
1160    pub fn new(chars: Vec<char>, ranges: Vec<(char, char)>) -> Self {
1161        EditCharRestriction { chars, ranges }
1162    }
1163
1164    /// Check if a character is allowed by this restriction.
1165    #[must_use]
1166    pub fn allows(&self, ch: char) -> bool {
1167        self.chars.contains(&ch)
1168            || self
1169                .ranges
1170                .iter()
1171                .any(|&(start, end)| ch >= start && ch <= end)
1172    }
1173}
1174
1175/// A literal pattern extracted from the HIR for fuzzy matching.
1176#[derive(Debug, Clone)]
1177pub struct LiteralPattern {
1178    /// The literal text.
1179    pub text: String,
1180    /// Fuzzy limits for this pattern.
1181    pub limits: Option<FuzzyLimits>,
1182    /// Minimum edits required (for exclusive lower bounds like `{0<e<5}`).
1183    pub min_edits: Option<u8>,
1184    /// Character class restriction for edits.
1185    /// If set, all edit characters must be from this class.
1186    pub edit_chars: Option<EditCharRestriction>,
1187}
1188
1189impl LiteralPattern {
1190    /// Create a new literal pattern.
1191    #[must_use]
1192    pub fn new(text: String, limits: Option<FuzzyLimits>, min_edits: Option<u8>) -> Self {
1193        LiteralPattern {
1194            text,
1195            limits,
1196            min_edits,
1197            edit_chars: None,
1198        }
1199    }
1200
1201    /// Create a new literal pattern with character class restriction.
1202    #[must_use]
1203    pub fn with_edit_chars(
1204        text: String,
1205        limits: Option<FuzzyLimits>,
1206        min_edits: Option<u8>,
1207        edit_chars: Option<EditCharRestriction>,
1208    ) -> Self {
1209        LiteralPattern {
1210            text,
1211            limits,
1212            min_edits,
1213            edit_chars,
1214        }
1215    }
1216}