fuzzy_regex/engine/
matcher.rs

1//! Core matching engine using NFA simulation.
2
3#![allow(
4    clippy::needless_range_loop,
5    clippy::match_same_arms,
6    clippy::too_many_lines,
7    clippy::similar_names,
8    clippy::missing_panics_doc,
9    clippy::missing_errors_doc,
10    clippy::ref_option_ref,
11    clippy::let_underscore_untyped,
12    clippy::items_after_statements,
13    clippy::float_cmp,
14    clippy::allow_attributes,
15    let_underscore_drop
16)]
17
18use std::sync::Arc;
19
20use super::captures::CaptureState;
21use super::fuzzy_bridge::{CachedMatches, FuzzyBridge, FuzzyMatchResult};
22use crate::ir::{LiteralPattern, Nfa, State, StateId};
23use crate::parser::Anchor;
24use crate::types::{Distance, FuzzyLimits, NumEdits};
25
26/// A thread in the NFA simulation.
27#[derive(Debug, Clone)]
28struct Thread {
29    /// Current NFA state.
30    state: StateId,
31    /// Current position in the text (byte index).
32    pos: usize,
33    /// Current match start position (can be reset by \K).
34    match_start: usize,
35    /// Capture state.
36    captures: CaptureState,
37    /// Accumulated similarity score.
38    similarity: f32,
39    /// Total edits.
40    edits: EditCounts,
41}
42
43impl Default for Thread {
44    fn default() -> Self {
45        Thread {
46            state: 0,
47            pos: 0,
48            match_start: 0,
49            captures: CaptureState::new(0),
50            similarity: 1.0,
51            edits: EditCounts::default(),
52        }
53    }
54}
55
56/// Edit operation counts.
57#[derive(Debug, Clone, Default)]
58pub struct EditCounts {
59    /// Number of character insertions.
60    pub insertions: u8,
61    /// Number of character deletions.
62    pub deletions: u8,
63    /// Number of character substitutions.
64    pub substitutions: u8,
65    /// Number of adjacent character swaps (transpositions).
66    pub swaps: u8,
67}
68
69impl EditCounts {
70    /// Get total edit count.
71    #[must_use]
72    pub fn total(&self) -> u8 {
73        self.insertions
74            .saturating_add(self.deletions)
75            .saturating_add(self.substitutions)
76            .saturating_add(self.swaps)
77    }
78
79    /// Calculate weighted cost of edits.
80    /// Default cost is 1 for each edit type (equivalent to `total()`).
81    #[must_use]
82    pub fn cost(&self, i_cost: u8, d_cost: u8, s_cost: u8, t_cost: u8) -> u16 {
83        u16::from(self.insertions) * u16::from(i_cost)
84            + u16::from(self.deletions) * u16::from(d_cost)
85            + u16::from(self.substitutions) * u16::from(s_cost)
86            + u16::from(self.swaps) * u16::from(t_cost)
87    }
88
89    /// Merge with another edit count.
90    #[must_use]
91    pub fn merge(&self, other: &EditCounts) -> Self {
92        EditCounts {
93            insertions: self.insertions.saturating_add(other.insertions),
94            deletions: self.deletions.saturating_add(other.deletions),
95            substitutions: self.substitutions.saturating_add(other.substitutions),
96            swaps: self.swaps.saturating_add(other.swaps),
97        }
98    }
99
100    /// Create from a fuzzy match result.
101    #[must_use]
102    pub fn from_fuzzy_result(result: &FuzzyMatchResult) -> Self {
103        EditCounts {
104            insertions: result.insertions,
105            deletions: result.deletions,
106            substitutions: result.substitutions,
107            swaps: result.swaps,
108        }
109    }
110}
111
112/// A match result.
113#[derive(Debug, Clone)]
114pub struct MatchResult {
115    /// Start position (byte index).
116    pub start: usize,
117    /// End position (byte index).
118    pub end: usize,
119    /// Similarity score (0.0 - 1.0).
120    pub similarity: f32,
121    /// Edit counts.
122    pub edits: EditCounts,
123    /// Capture groups.
124    pub captures: CaptureState,
125}
126
127impl MatchResult {
128    /// Get the matched text.
129    #[must_use]
130    pub fn as_str<'a>(&self, text: &'a str) -> &'a str {
131        &text[self.start..self.end]
132    }
133}
134
135/// Configuration for the matcher.
136#[derive(Debug, Clone)]
137pub struct MatcherConfig {
138    /// Similarity threshold (0.0 - 1.0).
139    pub threshold: f32,
140    /// Maximum number of threads (beam width).
141    pub max_threads: usize,
142    /// Whether to search for matches anywhere (true) or only at start (false).
143    pub unanchored: bool,
144    /// BESTMATCH mode - find best match instead of first match.
145    pub best_match: bool,
146    /// ENHANCEMATCH mode - improve the fit of the found match.
147    pub enhance_match: bool,
148    /// POSIX mode - find longest match at leftmost position.
149    pub posix: bool,
150    /// Global mode - find all matches instead of stopping at first.
151    /// When false (default), stops at first valid match (faster).
152    pub global: bool,
153    /// Multi-line mode - `^` and `$` match at line boundaries.
154    pub multi_line: bool,
155    /// Prefer shortest matches (for patterns with lazy quantifiers).
156    /// When true, prefers shorter matches over longer ones at the same similarity.
157    pub prefer_shortest: bool,
158    /// Unicode mode - enable Unicode character classes (\w, \d, \s match Unicode).
159    pub unicode: bool,
160    /// Greedy first-match mode - return first match found (faster).
161    /// Similar to mrab-regex behavior.
162    pub greedy_first: bool,
163}
164
165impl Default for MatcherConfig {
166    fn default() -> Self {
167        MatcherConfig {
168            threshold: 0.8,
169            max_threads: 1000,
170            unanchored: true,
171            best_match: false,
172            enhance_match: false,
173            posix: false,
174            global: true, // Default: full NFA simulation for correctness
175            multi_line: false,
176            prefer_shortest: false,
177            unicode: false,
178            greedy_first: false,
179        }
180    }
181}
182
183/// The NFA-based matching engine.
184pub struct Matcher<'a> {
185    /// The NFA to execute.
186    nfa: &'a Nfa,
187    /// Bridge to fuzzy matching (Levenshtein automata).
188    fuzzy_bridge: Option<&'a FuzzyBridge>,
189    /// Number of capture groups.
190    capture_count: usize,
191    /// Configuration.
192    config: MatcherConfig,
193    /// Prefilter for fast candidate position detection (Arc for cheap cloning).
194    prefilter: Arc<super::prefilter::Prefilter>,
195    /// Whether this is a simple fuzzy-only pattern (can skip NFA simulation).
196    is_simple_fuzzy: bool,
197    /// Pattern indices for simple alternations (enables multi-pattern fast path).
198    simple_alternation_indices: Option<Vec<usize>>,
199    /// Multi-pattern prefilter for simple alternations.
200    multi_prefilter: Option<super::prefilter::Prefilter>,
201    /// First character class for quick rejection (when no prefilter is available).
202    first_char_class: Option<crate::ir::hir::HirClass>,
203    /// Whether the pattern ends with an End anchor ($).
204    /// Enables reverse search optimization.
205    ends_with_end_anchor: bool,
206    /// Maximum match length for simple patterns (used with end anchor optimization).
207    max_simple_length: Option<usize>,
208}
209
210impl<'a> Matcher<'a> {
211    /// Create a new matcher.
212    #[must_use]
213    pub fn new(
214        nfa: &'a Nfa,
215        fuzzy_bridge: Option<&'a FuzzyBridge>,
216        capture_count: usize,
217        config: MatcherConfig,
218    ) -> Self {
219        let is_simple_fuzzy =
220            nfa.is_simple_fuzzy_only() && fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1);
221        let first_char_class = nfa.first_char_class();
222        let ends_with_end_anchor = nfa.ends_with_end_anchor();
223        let max_simple_length = Self::calculate_max_length(nfa, fuzzy_bridge);
224        Matcher {
225            nfa,
226            fuzzy_bridge,
227            capture_count,
228            config,
229            prefilter: Arc::new(super::prefilter::Prefilter::None),
230            is_simple_fuzzy,
231            simple_alternation_indices: None,
232            multi_prefilter: None,
233            first_char_class,
234            ends_with_end_anchor,
235            max_simple_length,
236        }
237    }
238
239    /// Create a new matcher with a prefilter.
240    #[must_use]
241    pub fn with_prefilter(
242        nfa: &'a Nfa,
243        fuzzy_bridge: Option<&'a FuzzyBridge>,
244        capture_count: usize,
245        config: MatcherConfig,
246        prefilter: Arc<super::prefilter::Prefilter>,
247    ) -> Self {
248        let is_simple_fuzzy =
249            nfa.is_simple_fuzzy_only() && fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1);
250
251        // Detect simple alternations for multi-pattern fast path
252        let (simple_alternation_indices, multi_prefilter) =
253            Self::detect_simple_alternation(nfa, fuzzy_bridge);
254
255        // Extract first char class for quick rejection (when no prefilter)
256        let first_char_class = nfa.first_char_class();
257
258        // Check if pattern ends with $ anchor
259        let ends_with_end_anchor = nfa.ends_with_end_anchor();
260
261        // Get max length for end-anchor optimization
262        let max_simple_length = Self::calculate_max_length(nfa, fuzzy_bridge);
263
264        Matcher {
265            nfa,
266            fuzzy_bridge,
267            capture_count,
268            config,
269            prefilter,
270            is_simple_fuzzy,
271            simple_alternation_indices,
272            multi_prefilter,
273            first_char_class,
274            ends_with_end_anchor,
275            max_simple_length,
276        }
277    }
278
279    /// Calculate the maximum match length using the bridge for `FuzzyLiteral` info.
280    fn calculate_max_length(nfa: &Nfa, fuzzy_bridge: Option<&FuzzyBridge>) -> Option<usize> {
281        // First try simple calculation (for patterns without FuzzyLiteral)
282        if let Some(len) = nfa.max_simple_length() {
283            return Some(len);
284        }
285
286        // Use length_range with bridge callback
287        let (_, max_len) = nfa.length_range(|pattern_idx| {
288            fuzzy_bridge.and_then(|b| {
289                let char_len = b.pattern_char_len(pattern_idx)?;
290                let max_edits = b.pattern_max_edits(pattern_idx).unwrap_or(0);
291                Some((char_len, max_edits))
292            })
293        });
294
295        max_len
296    }
297
298    /// Detect if the NFA is a simple alternation and build a multi-pattern prefilter.
299    /// Returns None if any pattern has fuzzy edits (multi-pattern fast path doesn't support fuzzy yet).
300    fn detect_simple_alternation(
301        nfa: &Nfa,
302        fuzzy_bridge: Option<&FuzzyBridge>,
303    ) -> (Option<Vec<usize>>, Option<super::prefilter::Prefilter>) {
304        let indices = nfa.get_alternation_pattern_indices();
305        if indices.is_empty() {
306            return (None, None);
307        }
308
309        let Some(bridge) = fuzzy_bridge else {
310            return (None, None);
311        };
312
313        // Build multi-pattern prefilter
314        let mut literals: Vec<(&str, u8)> = Vec::with_capacity(indices.len());
315        for &idx in &indices {
316            if let Some(text) = bridge.pattern_text(idx) {
317                let max_edits = bridge.pattern_max_edits(idx).unwrap_or(0);
318                literals.push((text, max_edits));
319            }
320        }
321
322        if literals.is_empty() {
323            return (None, None);
324        }
325
326        let multi_pf = super::prefilter::Prefilter::multi_fuzzy(&literals, false);
327
328        // Return indices even if prefilter is inactive - allows fallback to individual streaming
329        if multi_pf.is_active() {
330            (Some(indices), Some(multi_pf))
331        } else {
332            (Some(indices), None)
333        }
334    }
335
336    /// Find the first match in the text (or best match in BESTMATCH mode).
337    #[must_use]
338    pub fn find(&self, text: &str) -> Option<MatchResult> {
339        // Multi-pattern fast path: for simple alternations (cat|bat|rat),
340        // use parallel Bitap search instead of full NFA simulation.
341        // Skip for POSIX mode which needs to find the longest match.
342        if !self.config.global
343            && !self.config.best_match
344            && !self.config.enhance_match
345            && !self.config.posix
346            && self.config.unanchored
347            && self.simple_alternation_indices.is_some()
348            && self.multi_prefilter.is_some()
349        {
350            return self.find_multi_pattern_fast(text);
351        }
352
353        // Fallback for multi-pattern when prefilter is inactive (e.g., short patterns like 'a' with e<=1).
354        // Use individual streaming Bitap search for each pattern.
355        // Skip for POSIX mode which needs to find the longest match.
356        if !self.config.global
357            && !self.config.best_match
358            && !self.config.enhance_match
359            && !self.config.posix
360            && self.config.unanchored
361            && self.simple_alternation_indices.is_some()
362            && self.multi_prefilter.is_none()
363        {
364            return self.find_multi_pattern_individual_fallback(text);
365        }
366
367        // Non-global mode: search position by position, return on first match.
368        // This is similar to mrab-regex behavior and much faster when matches exist early.
369        // Also use this path when greedy_first is explicitly enabled.
370        if !self.config.global
371            && !self.config.best_match
372            && !self.config.enhance_match
373            && self.config.unanchored
374            && (self.prefilter.is_active() || self.config.greedy_first)
375            && self.fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1)
376        {
377            return self.find_greedy_first(text);
378        }
379
380        // Streaming fallback for simple fuzzy patterns without prefilter (e.g., 'a' with e<=1).
381        // Uses streaming Bitap which is O(N) instead of O(N*P) search_all.
382        // Only used for simple fuzzy patterns where the literal IS the whole pattern.
383        // Also use when greedy_first is enabled.
384        if !self.config.global
385            && !self.config.best_match
386            && !self.config.enhance_match
387            && self.config.unanchored
388            && (!self.prefilter.is_active() || self.config.greedy_first)
389            && self.is_simple_fuzzy
390            && self.fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1)
391        {
392            return self.find_streaming_fallback(text);
393        }
394
395        // Fast path for single-pattern first-match without prefilter.
396        // Use search_first for early termination instead of search_all.
397        // Skip if capture groups are needed (they require full search).
398        if !self.config.global
399            && !self.config.best_match
400            && !self.config.enhance_match
401            && self.config.unanchored
402            && !self.prefilter.is_active()
403            && self.fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1)
404            && self.capture_count == 0
405        {
406            // Use search_first for fast first-match
407            if let Some(bridge) = self.fuzzy_bridge {
408                if let Some(result) = bridge.search_first(text, self.config.threshold, 0) {
409                    let mut captures = CaptureState::new(self.capture_count);
410                    captures.set_full_match(result.start, result.end);
411                    return Some(MatchResult {
412                        start: result.start,
413                        end: result.end,
414                        similarity: result.similarity,
415                        edits: EditCounts {
416                            insertions: result.insertions,
417                            deletions: result.deletions,
418                            substitutions: result.substitutions,
419                            swaps: result.swaps,
420                        },
421                        captures,
422                    });
423                }
424                return None;
425            }
426        }
427
428        // Fast path for anchored patterns: only check position 0.
429        // For patterns without fuzzy literals, skip cache entirely.
430        // For patterns with multiple fuzzy literals, we need full search because
431        // different literals can match at different positions.
432        if !self.config.unanchored && !self.config.best_match && !self.config.enhance_match {
433            // Check if we can skip fuzzy search entirely
434            let needs_fuzzy_cache = self.fuzzy_bridge.is_some_and(|b| !b.is_all_exact());
435
436            if !needs_fuzzy_cache {
437                // No fuzzy matching needed - just do NFA simulation at position 0
438                return self.find_at_with_cache(text, 0, None);
439            }
440
441            // For single fuzzy literal, we can search only at position 0
442            // For multiple literals, we need full search since they match at different positions
443            let cached = self.fuzzy_bridge.map(|b| {
444                if b.pattern_count() == 1 {
445                    b.search_cached_at_position(text, 0, self.config.threshold)
446                } else {
447                    // Multiple patterns - need to search all positions
448                    b.search_all(text, self.config.threshold)
449                }
450            });
451            return self.find_at_with_cache(text, 0, cached.as_ref());
452        }
453
454        // Build fuzzy cache. For single-literal patterns, we can use the prefilter
455        // to only search candidate positions. For multi-literal patterns, we must
456        // search all positions since each literal can match at different offsets.
457        let cached = self.fuzzy_bridge.map(|b| {
458            if self.prefilter.is_active() && b.pattern_count() == 1 {
459                b.search_all_with_prefilter(text, self.config.threshold, &self.prefilter)
460            } else {
461                b.search_all(text, self.config.threshold)
462            }
463        });
464
465        if self.config.best_match {
466            // BESTMATCH mode needs full search to find the best match
467            self.find_best_with_cache(text, cached.as_ref())
468        } else if self.config.posix {
469            // POSIX mode: find longest match at leftmost position
470            self.find_posix_with_cache(text, cached.as_ref())
471        } else if self.config.unanchored {
472            // For first-match, use prefilter to skip impossible positions
473            if self.prefilter.is_active() {
474                let mut last_tried = None;
475                let max_offset = self.prefilter.max_offset();
476                for candidate in self.prefilter.find_candidates(text.as_bytes()) {
477                    // Prefilter returns (found - max_offset), so we need to try positions
478                    // from candidate to candidate + max_offset
479                    for offset in 0..=max_offset {
480                        let pos = candidate + offset;
481                        if pos > text.len() {
482                            continue;
483                        }
484                        // Ensure we're on a char boundary and haven't already tried this position
485                        let idx = Self::snap_to_char_boundary(text, pos);
486                        if last_tried == Some(idx) {
487                            continue;
488                        }
489                        last_tried = Some(idx);
490
491                        if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
492                            return Some(m);
493                        }
494                    }
495                }
496                // Try at end for empty patterns
497                self.find_at_with_cache(text, text.len(), cached.as_ref())
498            } else if self.ends_with_end_anchor && !self.config.multi_line {
499                // Pattern ends with $ - search from end (much faster for patterns like \.$)
500                // (disabled in multiline mode where $ can match at any line boundary)
501                if let Some(max_len) = self.max_simple_length {
502                    // Only check last `max_len` character positions (very fast for short patterns)
503                    // Iterate backwards from end, collecting only the positions we need
504                    let bytes = text.as_bytes();
505                    let mut positions = Vec::with_capacity(max_len + 1);
506                    let mut byte_pos = bytes.len();
507                    let mut chars_counted = 0;
508
509                    while byte_pos > 0 && chars_counted < max_len {
510                        byte_pos -= 1;
511                        // Check if this is a UTF-8 start byte (not continuation byte 10xxxxxx)
512                        if bytes[byte_pos] & 0b1100_0000 != 0b1000_0000 {
513                            positions.push(byte_pos);
514                            chars_counted += 1;
515                        }
516                    }
517
518                    // Try positions from end (which is the start of positions since we collected backwards)
519                    for &idx in &positions {
520                        if let Some(ref fcc) = self.first_char_class {
521                            let ch = text[idx..].chars().next().unwrap();
522                            if !fcc.matches(ch) {
523                                continue;
524                            }
525                        }
526                        if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
527                            return Some(m);
528                        }
529                    }
530                    self.find_at_with_cache(text, text.len(), cached.as_ref())
531                } else {
532                    // Unknown max length - collect all positions and search in reverse
533                    let positions: Vec<_> = text.char_indices().map(|(idx, _)| idx).collect();
534                    for &idx in positions.iter().rev() {
535                        if let Some(ref fcc) = self.first_char_class {
536                            let ch = text[idx..].chars().next().unwrap();
537                            if !fcc.matches(ch) {
538                                continue;
539                            }
540                        }
541                        if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
542                            return Some(m);
543                        }
544                    }
545                    self.find_at_with_cache(text, text.len(), cached.as_ref())
546                }
547            } else {
548                // No prefilter - try every position, but use first_char_class for quick rejection
549                for (idx, ch) in text.char_indices() {
550                    // Quick reject: if first_char_class is set and doesn't match, skip
551                    if let Some(ref fcc) = self.first_char_class
552                        && !fcc.matches(ch)
553                    {
554                        continue;
555                    }
556                    if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
557                        return Some(m);
558                    }
559                }
560                self.find_at_with_cache(text, text.len(), cached.as_ref())
561            }
562        } else {
563            self.find_at_with_cache(text, 0, cached.as_ref())
564        }
565    }
566
567    /// Greedy first-match: search position by position, return on first match found.
568    /// Much faster when matches exist early in text (similar to mrab-regex behavior).
569    fn find_greedy_first(&self, text: &str) -> Option<MatchResult> {
570        let bridge = self.fuzzy_bridge?;
571        let max_offset = self.prefilter.max_offset();
572        let text_bytes = text.as_bytes();
573
574        // Try Guard NFA first - fastest path for single pattern with no prefilter needed
575        if self.is_simple_fuzzy && !self.prefilter.is_active() {
576            if let Some((start, result)) = bridge.find_first_guard_nfa(text, self.config.threshold)
577            {
578                let mut captures = CaptureState::new(self.capture_count);
579                captures.set_full_match(start, result.end);
580                return Some(MatchResult {
581                    start,
582                    end: result.end,
583                    similarity: result.similarity,
584                    edits: EditCounts {
585                        insertions: result.insertions,
586                        deletions: result.deletions,
587                        substitutions: result.substitutions,
588                        swaps: result.swaps,
589                    },
590                    captures,
591                });
592            }
593            return None;
594        }
595
596        // Try lazy streaming search FIRST - best for early matches (like mrab-regex)
597        // This processes candidates one at a time without collecting them all upfront
598        let use_prefilter =
599            self.prefilter.is_active() && self.prefilter.is_selective() && self.is_simple_fuzzy;
600        if use_prefilter {
601            if let Some((start, result)) =
602                bridge.find_first_lazy(text_bytes, self.config.threshold, &self.prefilter)
603            {
604                let mut captures = CaptureState::new(self.capture_count);
605                captures.set_full_match(start, result.end);
606                return Some(MatchResult {
607                    start,
608                    end: result.end,
609                    similarity: result.similarity,
610                    edits: EditCounts::from_fuzzy_result(&result),
611                    captures,
612                });
613            }
614            // Prefilter searched all candidates and found nothing - no match exists
615            return None;
616        }
617
618        // Fall back to streaming Bitap search (scans every character) - only when no prefilter
619        if let Some((start, result)) =
620            bridge.find_first_boyer_moore(text_bytes, self.config.threshold, max_offset)
621        {
622            // Fast path: for simple fuzzy-only patterns, skip NFA simulation entirely
623            // The Bitap result already has all the information we need
624            if self.is_simple_fuzzy {
625                let mut captures = CaptureState::new(self.capture_count);
626                captures.set_full_match(start, result.end);
627                return Some(MatchResult {
628                    start,
629                    end: result.end,
630                    similarity: result.similarity,
631                    edits: EditCounts::from_fuzzy_result(&result),
632                    captures,
633                });
634            }
635
636            // Complex pattern: need NFA simulation
637            let mut cached = CachedMatches::default();
638            cached.insert(0, start, result);
639
640            if let Some(m) = self.find_at_with_cache(text, start, Some(&cached)) {
641                return Some(m);
642            }
643        }
644
645        // Fall back to prefilter-based search for edge cases
646        let mut last_tried = None;
647        for candidate in self.prefilter.find_candidates(text_bytes) {
648            for offset in 0..=max_offset {
649                let pos = candidate + offset;
650                if pos > text.len() {
651                    continue;
652                }
653
654                let idx = Self::snap_to_char_boundary(text, pos);
655                if last_tried == Some(idx) {
656                    continue;
657                }
658                last_tried = Some(idx);
659
660                // Fall back to slower path (e.g., for patterns longer than 64 chars)
661                if let Some((start, result)) =
662                    bridge.search_at_position(text, idx, self.config.threshold)
663                {
664                    // Fast path for simple patterns
665                    if self.is_simple_fuzzy {
666                        let mut captures = CaptureState::new(self.capture_count);
667                        captures.set_full_match(start, result.end);
668                        return Some(MatchResult {
669                            start,
670                            end: result.end,
671                            similarity: result.similarity,
672                            edits: EditCounts::from_fuzzy_result(&result),
673                            captures,
674                        });
675                    }
676
677                    let mut cached = CachedMatches::default();
678                    cached.insert(0, start, result);
679
680                    if let Some(m) = self.find_at_with_cache(text, start, Some(&cached)) {
681                        return Some(m);
682                    }
683                }
684            }
685        }
686
687        None
688    }
689
690    /// Multi-pattern fast path: for simple alternations like (cat|bat|rat),
691    /// use parallel Bitap search instead of full NFA simulation.
692    fn find_multi_pattern_fast(&self, text: &str) -> Option<MatchResult> {
693        let bridge = self.fuzzy_bridge?;
694        let indices = self.simple_alternation_indices.as_ref()?;
695        let multi_prefilter = self.multi_prefilter.as_ref()?;
696
697        let text_bytes = text.as_bytes();
698
699        // Check if any pattern has fuzzy edits
700        // Fuzzy patterns MUST use individual streaming because the prefilter-based
701        // approach relies on exact byte matches which miss fuzzy variants
702        let has_fuzzy_edits = indices
703            .iter()
704            .any(|&idx| bridge.pattern_max_edits(idx).unwrap_or(0) > 0);
705
706        // Decide between combined prefilter vs individual pattern streaming.
707        // Individual streaming is better when:
708        // 1. Any pattern has fuzzy edits (prefilter can miss fuzzy variants), OR
709        // 2. The prefilter is not selective AND patterns are long
710        let use_individual = if has_fuzzy_edits {
711            // Fuzzy patterns must use streaming - prefilter misses fuzzy variants
712            true
713        } else if multi_prefilter.is_selective() {
714            false
715        } else {
716            // Check if patterns are long enough to benefit from individual streaming
717            let min_pattern_len = indices
718                .iter()
719                .filter_map(|&idx| bridge.pattern_text(idx))
720                .map(str::len)
721                .min()
722                .unwrap_or(0);
723            // Long patterns (>= 8 chars) benefit from individual streaming
724            min_pattern_len >= 8
725        };
726
727        let result = if use_individual {
728            bridge.find_first_multi_pattern_individual(text_bytes, self.config.threshold, indices)
729        } else {
730            bridge.find_first_multi_pattern(
731                text_bytes,
732                self.config.threshold,
733                indices,
734                multi_prefilter,
735            )
736        };
737
738        if let Some((_pattern_idx, start, result)) = result {
739            let mut captures = CaptureState::new(self.capture_count);
740            captures.set_full_match(start, result.end);
741            return Some(MatchResult {
742                start,
743                end: result.end,
744                similarity: result.similarity,
745                edits: EditCounts::from_fuzzy_result(&result),
746                captures,
747            });
748        }
749
750        None
751    }
752
753    /// Fallback for multi-pattern search when prefilter is inactive.
754    /// Uses individual streaming Bitap search for each pattern.
755    fn find_multi_pattern_individual_fallback(&self, text: &str) -> Option<MatchResult> {
756        let bridge = self.fuzzy_bridge?;
757        let indices = self.simple_alternation_indices.as_ref()?;
758
759        let text_bytes = text.as_bytes();
760
761        // Use individual streaming search for each pattern
762        if let Some((_pattern_idx, start, result)) =
763            bridge.find_first_multi_pattern_individual(text_bytes, self.config.threshold, indices)
764        {
765            let mut captures = CaptureState::new(self.capture_count);
766            captures.set_full_match(start, result.end);
767            return Some(MatchResult {
768                start,
769                end: result.end,
770                similarity: result.similarity,
771                edits: EditCounts::from_fuzzy_result(&result),
772                captures,
773            });
774        }
775
776        None
777    }
778
779    /// Streaming fallback for single pattern without prefilter.
780    /// Uses streaming Bitap which is O(N) instead of O(N*P) `search_all`.
781    fn find_streaming_fallback(&self, text: &str) -> Option<MatchResult> {
782        let bridge = self.fuzzy_bridge?;
783        let text_bytes = text.as_bytes();
784
785        // Use streaming Bitap for single pattern
786        if let Some((start, result)) = bridge
787            .find_first_multi_pattern_individual(text_bytes, self.config.threshold, &[0])
788            .map(|(_, start, result)| (start, result))
789        {
790            let mut captures = CaptureState::new(self.capture_count);
791            captures.set_full_match(start, result.end);
792            return Some(MatchResult {
793                start,
794                end: result.end,
795                similarity: result.similarity,
796                edits: EditCounts::from_fuzzy_result(&result),
797                captures,
798            });
799        }
800
801        None
802    }
803
804    /// Snap a byte position to the nearest valid char boundary.
805    #[inline]
806    fn snap_to_char_boundary(text: &str, pos: usize) -> usize {
807        if pos >= text.len() {
808            return text.len();
809        }
810        // Find the start of the character at or after this position
811        let bytes = text.as_bytes();
812        let mut p = pos;
813        while p < bytes.len() && (bytes[p] & 0b1100_0000) == 0b1000_0000 {
814            p += 1;
815        }
816        p
817    }
818
819    /// Find (without caching, for debugging).
820    ///
821    /// This is useful for debugging and testing when you want to verify
822    /// results without the caching optimization.
823    #[cfg(test)]
824    fn find_no_cache(&self, text: &str) -> Option<MatchResult> {
825        if self.config.unanchored {
826            for (idx, _) in text.char_indices() {
827                if let Some(m) = self.find_at(text, idx) {
828                    return Some(m);
829                }
830            }
831            self.find_at(text, text.len())
832        } else {
833            self.find_at(text, 0)
834        }
835    }
836
837    /// Find the best match in the text (BESTMATCH mode) using cached fuzzy matches.
838    fn find_best_with_cache(
839        &self,
840        text: &str,
841        cached: Option<&CachedMatches>,
842    ) -> Option<MatchResult> {
843        let mut best: Option<MatchResult> = None;
844
845        // BESTMATCH comparison: lowest cost wins (following mrab-regex semantics)
846        // Uses default cost of 1 for all edit types.
847        // Tiebreakers: higher similarity, then longer match, then earlier start
848        let is_better_match = |m: &MatchResult, current: &MatchResult| -> bool {
849            // Primary: lower cost is better (cost(1,1,1,1) = total edits with equal weights)
850            let m_cost = m.edits.cost(1, 1, 1, 1);
851            let current_cost = current.edits.cost(1, 1, 1, 1);
852            if m_cost != current_cost {
853                return m_cost < current_cost;
854            }
855
856            // Secondary: higher similarity is better
857            if (m.similarity - current.similarity).abs() > f32::EPSILON {
858                return m.similarity > current.similarity;
859            }
860
861            // Tertiary: longer match is better
862            let m_len = m.end - m.start;
863            let current_len = current.end - current.start;
864            if m_len != current_len {
865                return m_len > current_len;
866            }
867
868            // Quaternary: earlier start is better
869            m.start < current.start
870        };
871
872        // Try starting at each position
873        for (idx, _) in text.char_indices() {
874            if let Some(m) = self.find_at_with_cache(text, idx, cached) {
875                // Perfect match (0 cost) - return immediately
876                if m.edits.cost(1, 1, 1, 1) == 0 {
877                    return Some(m);
878                }
879
880                let dominated = best
881                    .as_ref()
882                    .is_some_and(|current| !is_better_match(&m, current));
883                if !dominated {
884                    best = Some(m);
885                }
886            }
887        }
888
889        // Try at end for empty patterns
890        if let Some(m) = self.find_at_with_cache(text, text.len(), cached) {
891            if m.edits.total() == 0 {
892                return Some(m);
893            }
894
895            let dominated = best
896                .as_ref()
897                .is_some_and(|current| !is_better_match(&m, current));
898            if !dominated {
899                best = Some(m);
900            }
901        }
902
903        best
904    }
905
906    /// Find the longest match at the leftmost position (POSIX mode) using cached fuzzy matches.
907    fn find_posix_with_cache(
908        &self,
909        text: &str,
910        _cached: Option<&CachedMatches>,
911    ) -> Option<MatchResult> {
912        // POSIX: find leftmost-longest
913        // We need overlapping matches, so we try each position and get the longest at each position
914
915        let is_all_exact = self.fuzzy_bridge.is_none_or(FuzzyBridge::is_all_exact);
916        let fuzzy_cached = if is_all_exact {
917            None
918        } else {
919            self.fuzzy_bridge
920                .map(|b| b.search_all(text, self.config.threshold))
921        };
922
923        let mut best: Option<MatchResult> = None;
924
925        // Try each starting position (overlapping)
926        for (idx, _) in text.char_indices() {
927            if let Some(m) = self.find_at_with_cache(text, idx, fuzzy_cached.as_ref()) {
928                if m.start != idx {
929                    continue;
930                }
931
932                match &best {
933                    None => best = Some(m),
934                    Some(current) => {
935                        if m.start < current.start
936                            || (m.start == current.start && m.end > current.end)
937                        {
938                            best = Some(m);
939                        }
940                    }
941                }
942            }
943        }
944
945        best
946    }
947
948    /// Find all non-overlapping matches.
949    #[must_use]
950    pub fn find_all(&self, text: &str) -> Vec<MatchResult> {
951        // For patterns where all literals are exact (no fuzzy edits), skip the cache entirely.
952        // This avoids the overhead of building and querying the cache for common characters.
953        let is_all_exact = self.fuzzy_bridge.is_none_or(FuzzyBridge::is_all_exact);
954
955        // Search all fuzzy matches once upfront - O(N) instead of O(N²)
956        // Skip for exact patterns to avoid overhead
957        let cached = if is_all_exact {
958            None
959        } else {
960            self.fuzzy_bridge
961                .map(|b| b.search_all(text, self.config.threshold))
962        };
963
964        let mut matches = Vec::new();
965
966        // Optimization for end-anchored patterns: only check positions near the end
967        // (disabled in multiline mode where $ can match at any line boundary)
968        if self.ends_with_end_anchor
969            && !self.config.multi_line
970            && let Some(max_len) = self.max_simple_length
971        {
972            // Only check last `max_len` character positions
973            let bytes = text.as_bytes();
974            let mut positions = Vec::with_capacity(max_len + 1);
975            let mut byte_pos = bytes.len();
976            let mut chars_counted = 0;
977
978            while byte_pos > 0 && chars_counted < max_len {
979                byte_pos -= 1;
980                if bytes[byte_pos] & 0b1100_0000 != 0b1000_0000 {
981                    positions.push(byte_pos);
982                    chars_counted += 1;
983                }
984            }
985
986            // Try positions from end (positions are already in reverse order)
987            for &idx in &positions {
988                if let Some(ref fcc) = self.first_char_class {
989                    let ch = text[idx..].chars().next().unwrap();
990                    if !fcc.matches(ch) {
991                        continue;
992                    }
993                }
994                if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
995                    matches.push(m);
996                    break; // End-anchored pattern can only match once
997                }
998            }
999
1000            // Try at end position for patterns that match empty string at end
1001            if matches.is_empty()
1002                && let Some(m) = self.find_at_with_cache(text, text.len(), cached.as_ref())
1003            {
1004                matches.push(m);
1005            }
1006
1007            return matches;
1008        }
1009
1010        // Optimization: Use cached literal positions to guide search
1011        // If we have literal matches in the cache, only try positions within
1012        // MAX_LOOKBACK of a literal position
1013        if let Some(ref cache) = cached
1014            && !cache.is_empty()
1015        {
1016            return self.find_all_with_literal_guide(text, cache);
1017        }
1018
1019        let mut pos = 0;
1020
1021        while pos < text.len() {
1022            // Get the character at the current position for quick rejection
1023            let ch = text[pos..].chars().next();
1024
1025            // Quick reject: if first_char_class is set and doesn't match, skip
1026            let should_try = match (&self.first_char_class, ch) {
1027                (Some(fcc), Some(c)) => fcc.matches(c),
1028                _ => true, // No first_char_class or at end, try anyway
1029            };
1030
1031            if should_try {
1032                let result = self.find_at_with_cache(text, pos, cached.as_ref());
1033                if let Some(m) = result {
1034                    let end = m.end;
1035                    matches.push(m);
1036                    // Move past this match
1037                    pos = if end > pos { end } else { pos + 1 };
1038                    continue;
1039                }
1040            }
1041
1042            // Move to next character
1043            pos = text[pos..]
1044                .char_indices()
1045                .nth(1)
1046                .map_or(text.len() + 1, |(i, _)| pos + i);
1047        }
1048
1049        // Try at end for empty patterns (only if no first_char_class restriction)
1050        if self.first_char_class.is_none()
1051            && let Some(m) = self.find_at_with_cache(text, text.len(), cached.as_ref())
1052        {
1053            matches.push(m);
1054        }
1055
1056        matches
1057    }
1058
1059    /// Find up to `n` non-overlapping matches.
1060    ///
1061    /// This is more efficient than `find_all` when only a limited number of matches is needed,
1062    /// as it stops searching after finding `n` matches.
1063    #[must_use]
1064    pub fn find_n(&self, text: &str, n: usize) -> Vec<MatchResult> {
1065        if n == 0 {
1066            return Vec::new();
1067        }
1068
1069        // For patterns where all literals are exact (no fuzzy edits), skip the cache entirely.
1070        let is_all_exact = self.fuzzy_bridge.is_none_or(FuzzyBridge::is_all_exact);
1071
1072        // Search all fuzzy matches once upfront
1073        let cached = if is_all_exact {
1074            None
1075        } else {
1076            self.fuzzy_bridge
1077                .map(|b| b.search_all(text, self.config.threshold))
1078        };
1079
1080        let mut matches = Vec::with_capacity(n);
1081        let mut pos = 0;
1082
1083        while pos < text.len() && matches.len() < n {
1084            // Get the character at the current position for quick rejection
1085            let ch = text[pos..].chars().next();
1086
1087            // Quick reject: if first_char_class is set and doesn't match, skip
1088            let should_try = match (&self.first_char_class, ch) {
1089                (Some(fcc), Some(c)) => fcc.matches(c),
1090                _ => true,
1091            };
1092
1093            if should_try && let Some(m) = self.find_at_with_cache(text, pos, cached.as_ref()) {
1094                let end = m.end;
1095                matches.push(m);
1096                // Move past this match
1097                pos = if end > pos { end } else { pos + 1 };
1098                continue;
1099            }
1100
1101            // Move to next character
1102            pos = text[pos..]
1103                .char_indices()
1104                .nth(1)
1105                .map_or(text.len() + 1, |(i, _)| pos + i);
1106        }
1107
1108        matches
1109    }
1110
1111    /// Find all matches using literal positions as a guide.
1112    /// Only tries positions that could reach a required literal.
1113    fn find_all_with_literal_guide(&self, text: &str, cached: &CachedMatches) -> Vec<MatchResult> {
1114        let mut matches = Vec::new();
1115
1116        // Collect all literal positions sorted by start
1117        let mut literal_positions: Vec<usize> = cached
1118            .iter()
1119            .flat_map(|((_, start), _)| std::iter::once(start))
1120            .collect();
1121        literal_positions.sort_unstable();
1122        literal_positions.dedup();
1123
1124        if literal_positions.is_empty() {
1125            return matches;
1126        }
1127
1128        // Maximum lookback from a literal to where a match could start
1129        // This is a heuristic - for unbounded patterns, we use a large value
1130        const MAX_LOOKBACK: usize = 256;
1131
1132        let mut pos = 0;
1133        let mut lit_idx = 0;
1134
1135        while pos < text.len() && lit_idx < literal_positions.len() {
1136            let next_lit_pos = literal_positions[lit_idx];
1137
1138            // Skip to position within MAX_LOOKBACK of the next literal
1139            if pos + MAX_LOOKBACK < next_lit_pos {
1140                // Jump to MAX_LOOKBACK before the literal
1141                pos = next_lit_pos.saturating_sub(MAX_LOOKBACK);
1142                // Snap to char boundary
1143                pos = Self::snap_to_char_boundary(text, pos);
1144            }
1145
1146            // Get the character at the current position for quick rejection
1147            let ch = text[pos..].chars().next();
1148
1149            // Quick reject: if first_char_class is set and doesn't match, skip
1150            let should_try = match (&self.first_char_class, ch) {
1151                (Some(fcc), Some(c)) => fcc.matches(c),
1152                _ => true,
1153            };
1154
1155            if should_try && let Some(m) = self.find_at_with_cache(text, pos, Some(cached)) {
1156                let end = m.end;
1157                matches.push(m);
1158                // Move past this match
1159                pos = if end > pos { end } else { pos + 1 };
1160                // Advance lit_idx past positions we've covered
1161                while lit_idx < literal_positions.len() && literal_positions[lit_idx] < pos {
1162                    lit_idx += 1;
1163                }
1164                continue;
1165            }
1166
1167            // Move to next character
1168            let next_pos = text[pos..]
1169                .char_indices()
1170                .nth(1)
1171                .map_or(text.len() + 1, |(i, _)| pos + i);
1172
1173            // If we've passed the current literal, move to the next one
1174            if next_pos > next_lit_pos {
1175                lit_idx += 1;
1176            }
1177
1178            pos = next_pos;
1179        }
1180
1181        matches
1182    }
1183
1184    /// Try to find a match starting at a specific position.
1185    ///
1186    /// Unlike `find()` which searches the entire text, this starts the search
1187    /// at the given position. The full text is still used for boundary checks
1188    /// (e.g., `\b` word boundaries).
1189    #[must_use]
1190    pub fn find_at(&self, text: &str, start: usize) -> Option<MatchResult> {
1191        self.find_at_with_cache(text, start, None)
1192    }
1193
1194    /// Build the fuzzy cache for the given text.
1195    ///
1196    /// This can be used for lazy iteration where we want to compute the cache
1197    /// once upfront and reuse it for multiple `find_at_with_cache` calls.
1198    pub fn build_cache(&self, text: &str) -> Option<CachedMatches> {
1199        // For patterns where all literals are exact (no fuzzy edits), skip the cache
1200        let is_all_exact = self.fuzzy_bridge.is_none_or(FuzzyBridge::is_all_exact);
1201        if is_all_exact {
1202            return None;
1203        }
1204        self.fuzzy_bridge
1205            .map(|b| b.search_all(text, self.config.threshold))
1206    }
1207
1208    /// Try to find a match starting at a specific position using cached fuzzy matches.
1209    #[must_use]
1210    pub fn find_at_with_cache(
1211        &self,
1212        text: &str,
1213        start: usize,
1214        cached: Option<&CachedMatches>,
1215    ) -> Option<MatchResult> {
1216        use super::hash::FxHashSet;
1217
1218        let mut threads = vec![Thread {
1219            state: self.nfa.start,
1220            pos: start,
1221            match_start: start,
1222            captures: CaptureState::new(self.capture_count),
1223            similarity: 1.0,
1224            edits: EditCounts::default(),
1225        }];
1226
1227        let mut best_match: Option<MatchResult> = None;
1228
1229        // Track visited (state, pos) pairs to avoid redundant exploration
1230        let mut visited: FxHashSet<(StateId, usize)> = FxHashSet::default();
1231        visited.insert((self.nfa.start, start));
1232
1233        while !threads.is_empty() {
1234            // Early termination for lazy quantifiers: once we have a match, stop exploring
1235            if self.config.prefer_shortest && best_match.is_some() {
1236                break;
1237            }
1238
1239            let mut next_threads = Vec::new();
1240
1241            for thread in threads {
1242                self.step_thread_with_cache(
1243                    text,
1244                    start,
1245                    thread,
1246                    &mut next_threads,
1247                    &mut best_match,
1248                    cached,
1249                );
1250            }
1251
1252            // Deduplicate threads by (state, pos)
1253            // In ENHANCEMATCH mode: keep the thread with LOWEST COST (fewest edits) at each position
1254            // In normal mode: keep the FIRST thread (for performance and correct "first match" semantics)
1255            let mut deduped = Vec::with_capacity(next_threads.len());
1256            if self.config.enhance_match {
1257                // ENHANCEMATCH: keep best thread at each (state, pos)
1258                // Best = lowest cost (using default cost of 1 for all edit types)
1259                use super::hash::FxHashMap;
1260                let mut best_at_pos: FxHashMap<(StateId, usize), usize> = FxHashMap::default();
1261                for (idx, thread) in next_threads.iter().enumerate() {
1262                    let key = (thread.state, thread.pos);
1263                    if let Some(&existing_idx) = best_at_pos.get(&key) {
1264                        // Keep the one with lower cost (fewer edits)
1265                        // Using cost(1,1,1,1) is equivalent to total() but more explicit
1266                        let new_cost = thread.edits.cost(1, 1, 1, 1);
1267                        let existing_cost = next_threads[existing_idx].edits.cost(1, 1, 1, 1);
1268                        if new_cost < existing_cost {
1269                            best_at_pos.insert(key, idx);
1270                        }
1271                    } else if visited.insert(key) {
1272                        best_at_pos.insert(key, idx);
1273                    }
1274                }
1275                // Collect the best threads
1276                let mut indices: Vec<_> = best_at_pos.values().copied().collect();
1277                indices.sort_unstable();
1278                for idx in indices {
1279                    deduped.push(next_threads[idx].clone());
1280                }
1281            } else {
1282                // Normal mode: keep first thread at each (state, pos)
1283                for thread in next_threads {
1284                    let key = (thread.state, thread.pos);
1285                    if visited.insert(key) {
1286                        deduped.push(thread);
1287                    }
1288                }
1289            }
1290            next_threads = deduped;
1291
1292            // Prune threads if too many
1293            if next_threads.len() > self.config.max_threads {
1294                next_threads.sort_by(|a, b| {
1295                    b.similarity
1296                        .partial_cmp(&a.similarity)
1297                        .unwrap_or(std::cmp::Ordering::Equal)
1298                });
1299                next_threads.truncate(self.config.max_threads);
1300            }
1301
1302            threads = next_threads;
1303        }
1304
1305        best_match
1306    }
1307
1308    /// Process a single thread for one step, using cached fuzzy matches.
1309    fn step_thread_with_cache(
1310        &self,
1311        text: &str,
1312        _start: usize,
1313        thread: Thread,
1314        next_threads: &mut Vec<Thread>,
1315        best_match: &mut Option<MatchResult>,
1316        cached: Option<&CachedMatches>,
1317    ) {
1318        let state = &self.nfa.states[thread.state];
1319
1320        match state {
1321            State::Accept => {
1322                let mut captures = thread.captures.clone();
1323                captures.set_full_match(thread.match_start, thread.pos);
1324
1325                let m = MatchResult {
1326                    start: thread.match_start,
1327                    end: thread.pos,
1328                    similarity: thread.similarity,
1329                    edits: thread.edits.clone(),
1330                    captures,
1331                };
1332
1333                // Keep best match:
1334                // - Earlier start position always wins
1335                // - At same start position:
1336                //   - In BESTMATCH/ENHANCEMATCH mode: prefer higher similarity, then longer
1337                //   - For simple alternations without fuzzy modifier: first match wins
1338                //   - For other patterns: prefer higher similarity, then longer (default behavior)
1339                let prefer_shorter = self.config.prefer_shortest;
1340                // Check if this is a simple alternation without any fuzzy edits
1341                let has_alternation = self.nfa.is_simple_alternation();
1342                // Check if any pattern in the alternation has fuzzy edits
1343                let has_fuzzy = self
1344                    .simple_alternation_indices
1345                    .as_ref()
1346                    .is_some_and(|indices| {
1347                        self.fuzzy_bridge.is_some_and(|bridge| {
1348                            indices
1349                                .iter()
1350                                .any(|&idx| bridge.pattern_max_edits(idx).unwrap_or(0) > 0)
1351                        })
1352                    });
1353                let is_simple_alt = has_alternation && !has_fuzzy;
1354                let is_special_mode = self.config.best_match || self.config.enhance_match;
1355                if best_match.as_ref().is_none_or(|best| {
1356                    // Earlier start wins
1357                    m.start < best.start
1358                        // At same start:
1359                        || (m.start == best.start && {
1360                        if is_special_mode {
1361                            // BESTMATCH/ENHANCEMATCH: prefer higher similarity/length
1362                            m.similarity > best.similarity
1363                                || (m.similarity == best.similarity && if prefer_shorter {
1364                                m.end - m.start < best.end - best.start
1365                            } else {
1366                                m.end - m.start > best.end - best.start
1367                            })
1368                        } else if is_simple_alt {
1369                            // Simple alternation (no fuzzy/captures): first match wins
1370                            false
1371                        } else {
1372                            // Default behavior: prefer higher similarity, then longer
1373                            m.similarity > best.similarity
1374                                || (m.similarity == best.similarity && if prefer_shorter {
1375                                m.end - m.start < best.end - best.start
1376                            } else {
1377                                m.end - m.start > best.end - best.start
1378                            })
1379                        }
1380                    })
1381                }) {
1382                    *best_match = Some(m);
1383                }
1384            }
1385
1386            State::Epsilon { targets } => {
1387                for &target in targets {
1388                    next_threads.push(Thread {
1389                        state: target,
1390                        ..thread.clone()
1391                    });
1392                }
1393            }
1394
1395            State::Split { branches, greedy } => {
1396                // For greedy: try branches in order (first branch is match/continue)
1397                // For non-greedy: try branches in reverse order (last branch is exit/skip)
1398                if *greedy {
1399                    for &branch in branches {
1400                        next_threads.push(Thread {
1401                            state: branch,
1402                            ..thread.clone()
1403                        });
1404                    }
1405                } else {
1406                    for &branch in branches.iter().rev() {
1407                        next_threads.push(Thread {
1408                            state: branch,
1409                            ..thread.clone()
1410                        });
1411                    }
1412                }
1413            }
1414
1415            State::Char { class, next } => {
1416                if thread.pos < text.len()
1417                    && let Some(ch) = text[thread.pos..].chars().next()
1418                    && class.matches(ch)
1419                {
1420                    next_threads.push(Thread {
1421                        state: *next,
1422                        pos: thread.pos + ch.len_utf8(),
1423                        ..thread
1424                    });
1425                }
1426            }
1427
1428            State::FuzzyChar {
1429                class,
1430                limits,
1431                min_edits: _,
1432                cost_constraint: _,
1433                next,
1434            } => {
1435                // Calculate edit budget
1436                let max_edits = limits
1437                    .as_ref()
1438                    .and_then(FuzzyLimits::get_edits)
1439                    .unwrap_or(u8::MAX);
1440                let max_deletions = limits
1441                    .as_ref()
1442                    .and_then(FuzzyLimits::get_deletions)
1443                    .unwrap_or(max_edits);
1444                let max_substitutions = limits
1445                    .as_ref()
1446                    .and_then(FuzzyLimits::get_substitutions)
1447                    .unwrap_or(max_edits);
1448
1449                let current_edits = thread.edits.total();
1450
1451                // Calculate similarity penalty per edit
1452                let edit_penalty = if max_edits > 0 {
1453                    1.0 / (f32::from(max_edits) + 1.0)
1454                } else {
1455                    1.0
1456                };
1457
1458                // 1. Try exact match or substitution
1459                if thread.pos < text.len()
1460                    && let Some(ch) = text[thread.pos..].chars().next()
1461                {
1462                    if class.matches(ch) {
1463                        // Exact match - advance both pattern and text
1464                        next_threads.push(Thread {
1465                            state: *next,
1466                            pos: thread.pos + ch.len_utf8(),
1467                            ..thread.clone()
1468                        });
1469                    } else if current_edits < max_edits
1470                        && thread.edits.substitutions < max_substitutions
1471                    {
1472                        // Substitution - consume mismatched char
1473                        let mut new_edits = thread.edits.clone();
1474                        new_edits.substitutions += 1;
1475                        next_threads.push(Thread {
1476                            state: *next,
1477                            pos: thread.pos + ch.len_utf8(),
1478                            similarity: thread.similarity * (1.0 - edit_penalty),
1479                            edits: new_edits,
1480                            ..thread.clone()
1481                        });
1482                    }
1483                }
1484
1485                // 2. Try deletion (skip this pattern char without consuming text)
1486                if current_edits < max_edits && thread.edits.deletions < max_deletions {
1487                    let mut new_edits = thread.edits.clone();
1488                    new_edits.deletions += 1;
1489                    next_threads.push(Thread {
1490                        state: *next,
1491                        pos: thread.pos, // Don't consume text
1492                        similarity: thread.similarity * (1.0 - edit_penalty),
1493                        edits: new_edits,
1494                        ..thread.clone()
1495                    });
1496                }
1497            }
1498
1499            State::FuzzyLiteral {
1500                pattern_index,
1501                next,
1502                limits,
1503                min_edits,
1504                cost_constraint,
1505            } => {
1506                if let Some(bridge) = self.fuzzy_bridge {
1507                    // Fast path for exact matching (no fuzzy edits)
1508                    // This avoids the full bridge lookup overhead for patterns without edits
1509                    let max_edits = limits.as_ref().map(|l| {
1510                        l.get_edits().unwrap_or_else(|| {
1511                            let i = l.get_insertions().unwrap_or(0);
1512                            let d = l.get_deletions().unwrap_or(0);
1513                            let s = l.get_substitutions().unwrap_or(0);
1514                            let t = l.get_swaps().unwrap_or(0);
1515                            i.saturating_add(d).saturating_add(s).saturating_add(t)
1516                        })
1517                    });
1518
1519                    if max_edits.is_none() || max_edits == Some(0) {
1520                        // Exact match fast path - direct string comparison
1521                        if let Some(pattern_text) = bridge.pattern_text(*pattern_index)
1522                            && thread.pos + pattern_text.len() <= text.len()
1523                            && text[thread.pos..].starts_with(pattern_text)
1524                        {
1525                            let new_end = thread.pos + pattern_text.len();
1526                            // Only accept if we consume at least one character
1527                            if new_end > thread.pos {
1528                                next_threads.push(Thread {
1529                                    state: *next,
1530                                    pos: new_end,
1531                                    similarity: thread.similarity,
1532                                    edits: thread.edits.clone(),
1533                                    captures: thread.captures.clone(),
1534                                    ..thread
1535                                });
1536                            }
1537                        }
1538                        // Skip the full fuzzy matching path for exact patterns
1539                    } else {
1540                        let expected_end = self.find_expected_end(*next, text.len());
1541
1542                        let meets_min_edits = |result: &FuzzyMatchResult| -> bool {
1543                            match min_edits {
1544                                Some(min) => result.total_edits() >= *min,
1545                                None => true,
1546                            }
1547                        };
1548
1549                        let meets_cost_constraint = |result: &FuzzyMatchResult| -> bool {
1550                            match cost_constraint {
1551                                Some(constraint) => constraint.is_satisfied(
1552                                    result.insertions,
1553                                    result.deletions,
1554                                    result.substitutions,
1555                                    result.swaps,
1556                                ),
1557                                None => true,
1558                            }
1559                        };
1560
1561                        let meets_all_constraints = |result: &FuzzyMatchResult| -> bool {
1562                            meets_min_edits(result) && meets_cost_constraint(result)
1563                        };
1564
1565                        // Use cached results if available, otherwise fall back to direct search
1566                        let result = if let Some(cache) = cached {
1567                            bridge.find_at_cached(cache, *pattern_index, thread.pos)
1568                        } else {
1569                            bridge.find_at(text, *pattern_index, thread.pos, self.config.threshold)
1570                        };
1571
1572                        if let Some(result) = result {
1573                            let should_try_boundary = expected_end.is_some()
1574                                && result.end < expected_end.unwrap()
1575                                && limits.as_ref().is_some_and(|l| {
1576                                    l.get_insertions().unwrap_or(0) > 0
1577                                        || l.get_edits().unwrap_or(0) > 0
1578                                });
1579
1580                            if should_try_boundary
1581                                && let Some(boundary_result) = bridge.find_with_boundary_insertions(
1582                                    text,
1583                                    *pattern_index,
1584                                    thread.pos,
1585                                    expected_end,
1586                                    self.config.threshold,
1587                                    cached,
1588                                )
1589                                && meets_all_constraints(&boundary_result)
1590                            {
1591                                next_threads.push(Thread {
1592                                    state: *next,
1593                                    pos: boundary_result.end,
1594                                    similarity: thread.similarity * boundary_result.similarity,
1595                                    edits: thread
1596                                        .edits
1597                                        .merge(&EditCounts::from_fuzzy_result(&boundary_result)),
1598                                    captures: thread.captures.clone(),
1599                                    ..thread
1600                                });
1601                            }
1602
1603                            // Only accept matches that consume at least one character
1604                            // to prevent infinite loops with quantifiers when fuzzy
1605                            // match can delete entire pattern
1606                            if meets_all_constraints(&result) && result.end > thread.pos {
1607                                next_threads.push(Thread {
1608                                    state: *next,
1609                                    pos: result.end,
1610                                    similarity: thread.similarity * result.similarity,
1611                                    edits: thread
1612                                        .edits
1613                                        .merge(&EditCounts::from_fuzzy_result(&result)),
1614                                    captures: thread.captures.clone(),
1615                                    ..thread
1616                                });
1617                            }
1618                        } else {
1619                            let can_use_boundary = limits.as_ref().is_some_and(|l| {
1620                                l.get_insertions().unwrap_or(0) > 0
1621                                    || l.get_edits().unwrap_or(0) > 0
1622                            });
1623
1624                            if can_use_boundary
1625                                && let Some(result) = bridge.find_with_boundary_insertions(
1626                                    text,
1627                                    *pattern_index,
1628                                    thread.pos,
1629                                    expected_end,
1630                                    self.config.threshold,
1631                                    cached,
1632                                )
1633                            {
1634                                // Must consume at least one character
1635                                if meets_all_constraints(&result) && result.end > thread.pos {
1636                                    next_threads.push(Thread {
1637                                        state: *next,
1638                                        pos: result.end,
1639                                        similarity: thread.similarity * result.similarity,
1640                                        edits: thread
1641                                            .edits
1642                                            .merge(&EditCounts::from_fuzzy_result(&result)),
1643                                        captures: thread.captures.clone(),
1644                                        ..thread
1645                                    });
1646                                }
1647                            }
1648                        }
1649
1650                        // Also try pure deletion path: skip entire pattern without consuming text
1651                        // This allows patterns like (?:b){e<=1}(?:c){e<=1} to match "c"
1652                        // by deleting 'b' and exactly matching 'c'
1653                        // Only do this when:
1654                        // 1. We've already consumed some text (thread.pos > 0) - prevents spurious empty matches at start
1655                        // 2. AND there's still text remaining (thread.pos < text.len()) - at end, find_at handles deletion
1656                        let max_edits_for_deletion = max_edits.unwrap_or(0);
1657                        let max_deletions = limits
1658                            .as_ref()
1659                            .and_then(FuzzyLimits::get_deletions)
1660                            .unwrap_or(max_edits_for_deletion);
1661
1662                        // Only allow pure deletion when we're in the middle of matching (not at start)
1663                        // and there's still text to match by subsequent patterns
1664                        if max_deletions > 0 && thread.pos > 0 && thread.pos < text.len() {
1665                            // Get pattern length to determine deletion cost
1666                            if let Some(pattern_char_len) = bridge.pattern_char_len(*pattern_index)
1667                            {
1668                                let pattern_len = pattern_char_len as u8;
1669                                let current_deletions = thread.edits.deletions;
1670
1671                                // Can we delete the entire pattern?
1672                                if current_deletions + pattern_len <= max_deletions
1673                                    && (thread.edits.total() as usize + pattern_len as usize)
1674                                        <= max_edits_for_deletion as usize
1675                                {
1676                                    // Calculate similarity for pure deletion using same formula as Bitap:
1677                                    // similarity = 1 - total_edits / max_len
1678                                    // For pure deletion, match_len=0, so max_len = pattern_len
1679                                    let similarity = (1.0
1680                                        - f32::from(pattern_len) / f32::from(pattern_len))
1681                                    .max(0.0);
1682
1683                                    // Check min_edits constraint
1684                                    let deletion_result = FuzzyMatchResult {
1685                                        end: thread.pos,
1686                                        similarity,
1687                                        insertions: 0,
1688                                        deletions: pattern_len,
1689                                        substitutions: 0,
1690                                        swaps: 0,
1691                                    };
1692
1693                                    if meets_all_constraints(&deletion_result) {
1694                                        let mut new_edits = thread.edits.clone();
1695                                        new_edits.deletions += pattern_len;
1696                                        next_threads.push(Thread {
1697                                            state: *next,
1698                                            pos: thread.pos, // Don't consume any text
1699                                            similarity: thread.similarity
1700                                                * deletion_result.similarity,
1701                                            edits: new_edits,
1702                                            captures: thread.captures.clone(),
1703                                            ..thread
1704                                        });
1705                                    }
1706                                }
1707                            }
1708                        }
1709                    } // Close the else block for fuzzy matching
1710                }
1711            }
1712
1713            State::CaptureStart { index, next } => {
1714                let mut captures = thread.captures.clone();
1715                captures.start_capture(*index, thread.pos);
1716                next_threads.push(Thread {
1717                    state: *next,
1718                    captures,
1719                    ..thread
1720                });
1721            }
1722
1723            State::CaptureEnd { index, next } => {
1724                let mut captures = thread.captures.clone();
1725                captures.end_capture(*index, thread.pos);
1726                next_threads.push(Thread {
1727                    state: *next,
1728                    captures,
1729                    ..thread
1730                });
1731            }
1732
1733            State::Anchor { kind, next } => {
1734                let matches = match kind {
1735                    Anchor::Start => {
1736                        if self.config.multi_line {
1737                            Self::is_line_start(text, thread.pos)
1738                        } else {
1739                            thread.pos == 0
1740                        }
1741                    }
1742                    Anchor::End => {
1743                        if self.config.multi_line {
1744                            Self::is_line_end(text, thread.pos)
1745                        } else {
1746                            thread.pos == text.len()
1747                        }
1748                    }
1749                    Anchor::WordBoundary => Self::is_word_boundary(text, thread.pos),
1750                    Anchor::NotWordBoundary => !Self::is_word_boundary(text, thread.pos),
1751                };
1752
1753                if matches {
1754                    next_threads.push(Thread {
1755                        state: *next,
1756                        ..thread
1757                    });
1758                }
1759            }
1760
1761            State::Lookahead {
1762                positive,
1763                nfa,
1764                literals,
1765                next,
1766            } => {
1767                // Build a fuzzy bridge for the lookahead from its literals
1768                let lookahead_bridge = if literals.is_empty() {
1769                    None
1770                } else {
1771                    FuzzyBridge::new(literals, None, None, false)
1772                };
1773
1774                // Create a sub-matcher for the lookahead with its own bridge
1775                let sub_matcher = Matcher::new(
1776                    nfa,
1777                    lookahead_bridge.as_ref(),
1778                    0,
1779                    MatcherConfig {
1780                        unanchored: false,
1781                        ..self.config.clone()
1782                    },
1783                );
1784
1785                let sub_result = sub_matcher.find_at(text, thread.pos);
1786
1787                let has_match = sub_result.is_some();
1788
1789                if has_match == *positive {
1790                    next_threads.push(Thread {
1791                        state: *next,
1792                        ..thread
1793                    });
1794                }
1795            }
1796
1797            State::Lookbehind {
1798                positive,
1799                nfa,
1800                literals,
1801                bridge,
1802                next,
1803            } => {
1804                let has_match =
1805                    self.try_lookbehind(text, thread.pos, nfa, literals, bridge.as_deref());
1806
1807                if has_match == *positive {
1808                    next_threads.push(Thread {
1809                        state: *next,
1810                        ..thread
1811                    });
1812                }
1813            }
1814
1815            State::Backreference {
1816                group,
1817                next,
1818                limits,
1819            } => {
1820                if let Some((cap_start, cap_end)) = thread.captures.get(*group) {
1821                    let captured = &text[cap_start..cap_end];
1822                    let remaining = &text[thread.pos..];
1823
1824                    if let Some(limits) = limits {
1825                        let max_edits = limits.get_edits().unwrap_or(
1826                            limits.get_insertions().unwrap_or(0)
1827                                + limits.get_deletions().unwrap_or(0)
1828                                + limits.get_substitutions().unwrap_or(0),
1829                        );
1830
1831                        let cap_len = captured.len();
1832                        let min_len = cap_len.saturating_sub(max_edits as usize);
1833                        let max_len = cap_len.saturating_add(max_edits as usize);
1834
1835                        for end_len in min_len..=max_len.min(remaining.len()) {
1836                            if !remaining.is_char_boundary(end_len) {
1837                                continue;
1838                            }
1839                            let candidate = &remaining[..end_len];
1840                            let distance = edit_distance_bounded(captured, candidate, max_edits);
1841
1842                            if distance != Distance::MAX && distance <= max_edits.into() {
1843                                next_threads.push(Thread {
1844                                    state: *next,
1845                                    pos: thread.pos + end_len,
1846                                    similarity: thread.similarity
1847                                        * (1.0
1848                                            - distance as f32 / cap_len.max(end_len).max(1) as f32),
1849                                    edits: thread.edits.merge(&EditCounts {
1850                                        substitutions: distance as u8,
1851                                        ..Default::default()
1852                                    }),
1853                                    captures: thread.captures.clone(),
1854                                    ..thread
1855                                });
1856                            }
1857                        }
1858                    } else if remaining.starts_with(captured) {
1859                        next_threads.push(Thread {
1860                            state: *next,
1861                            pos: thread.pos + captured.len(),
1862                            ..thread
1863                        });
1864                    }
1865                }
1866            }
1867
1868            // New NFA states - \K resets match start
1869            State::ResetMatchStart { next } => {
1870                let mut new_thread = thread.clone();
1871                new_thread.match_start = thread.pos;
1872                new_thread.state = *next;
1873                next_threads.push(new_thread);
1874            }
1875            State::AtomicGroup { .. }
1876            | State::RecursivePattern { .. }
1877            | State::RecursiveGroup { .. }
1878            | State::RecursiveNamedGroup { .. } => {}
1879        }
1880    }
1881
1882    /// Check if position is at a word boundary.
1883    /// Optimized to avoid O(n) scan - walks backwards at most 4 bytes (max UTF-8 char length).
1884    #[inline]
1885    fn is_word_boundary(text: &str, pos: usize) -> bool {
1886        let bytes = text.as_bytes();
1887
1888        // Get character before pos (walk backwards to find UTF-8 char start)
1889        let before_is_word = if pos > 0 {
1890            // UTF-8 continuation bytes have pattern 10xxxxxx (0x80-0xBF)
1891            // Walk back at most 4 bytes to find the leading byte
1892            let mut start = pos - 1;
1893            while start > 0 && (bytes[start] & 0xC0) == 0x80 {
1894                start -= 1;
1895            }
1896            // Decode the single character
1897            text[start..pos]
1898                .chars()
1899                .next()
1900                .is_some_and(|c| c.is_alphanumeric() || c == '_')
1901        } else {
1902            false
1903        };
1904
1905        // Get character at pos
1906        let after_is_word = text[pos..]
1907            .chars()
1908            .next()
1909            .is_some_and(|c| c.is_alphanumeric() || c == '_');
1910
1911        before_is_word != after_is_word
1912    }
1913
1914    /// Check if position is at the start of a line (after newline or at string start).
1915    #[inline]
1916    fn is_line_start(text: &str, pos: usize) -> bool {
1917        if pos == 0 {
1918            return true;
1919        }
1920        // Check if the character before pos is a newline
1921        let bytes = text.as_bytes();
1922        bytes[pos - 1] == b'\n'
1923    }
1924
1925    /// Check if position is at the end of a line (before newline or at string end).
1926    #[inline]
1927    fn is_line_end(text: &str, pos: usize) -> bool {
1928        if pos == text.len() {
1929            return true;
1930        }
1931        // Check if the character at pos is a newline
1932        let bytes = text.as_bytes();
1933        bytes[pos] == b'\n'
1934    }
1935
1936    /// Check if the path from a state leads to an End anchor (through non-consuming states).
1937    /// Returns `Some(text_len)` if we should expect the match to extend to end of text.
1938    fn find_expected_end(&self, state_id: StateId, text_len: usize) -> Option<usize> {
1939        let mut visited = vec![false; self.nfa.states.len()];
1940        self.find_expected_end_recursive(state_id, text_len, &mut visited)
1941    }
1942
1943    /// Recursive helper for `find_expected_end`.
1944    fn find_expected_end_recursive(
1945        &self,
1946        state_id: StateId,
1947        text_len: usize,
1948        visited: &mut Vec<bool>,
1949    ) -> Option<usize> {
1950        if visited[state_id] {
1951            return None;
1952        }
1953        visited[state_id] = true;
1954
1955        match &self.nfa.states[state_id] {
1956            // Found End anchor - return the expected end position
1957            State::Anchor {
1958                kind: Anchor::End,
1959                ..
1960            } => Some(text_len),
1961
1962            State::CaptureEnd { next, .. } => {
1963                self.find_expected_end_recursive(*next, text_len, visited)
1964            }
1965
1966            // Other anchors (like word boundary) - continue through
1967            State::Anchor { next, .. } => self.find_expected_end_recursive(*next, text_len, visited),
1968
1969            // Accept without End anchor - no expected end
1970            State::Accept
1971            | State::Epsilon { .. }
1972            | State::Split { .. }
1973            // Consuming states - stop searching
1974            | State::Char { .. }
1975            | State::FuzzyChar { .. }
1976            | State::FuzzyLiteral { .. }
1977            | State::Backreference { .. }
1978            | State::Lookahead { .. }
1979            | State::Lookbehind { .. }
1980            | State::CaptureStart { .. }
1981            | State::ResetMatchStart { .. }
1982            | State::AtomicGroup { .. }
1983            | State::RecursivePattern { .. }
1984            | State::RecursiveGroup { .. }
1985            | State::RecursiveNamedGroup { .. } => None,
1986        }
1987    }
1988
1989    /// Try to match a lookbehind pattern.
1990    ///
1991    /// Uses pattern length calculation to efficiently search only valid positions.
1992    /// `pos` is a byte index in the text.
1993    /// `bridge` is the pre-built `FuzzyBridge` for the lookbehind's literals.
1994    fn try_lookbehind(
1995        &self,
1996        text: &str,
1997        pos: usize,
1998        nfa: &Nfa,
1999        literals: &[LiteralPattern],
2000        bridge: Option<&FuzzyBridge>,
2001    ) -> bool {
2002        // Fast path: single exact literal lookbehind (no fuzzy matching)
2003        // This is very common and can be done with simple string comparison
2004        if literals.len() == 1 {
2005            let lit = &literals[0];
2006            let max_edits = lit
2007                .limits
2008                .as_ref()
2009                .map_or(0, |l| l.get_edits().unwrap_or(0));
2010
2011            if max_edits == 0 {
2012                // Exact match - just compare bytes
2013                let pattern = &lit.text;
2014                let pattern_bytes = pattern.as_bytes();
2015                if pos >= pattern_bytes.len() {
2016                    let start = pos - pattern_bytes.len();
2017                    if &text.as_bytes()[start..pos] == pattern_bytes {
2018                        return true;
2019                    }
2020                }
2021                return false;
2022            }
2023        }
2024
2025        // Calculate the length range using pre-built bridge
2026        let (min_char_len, max_char_len) = nfa.length_range(|pattern_idx| {
2027            bridge
2028                .and_then(|b| {
2029                    let char_len = b.pattern_char_len(pattern_idx)?;
2030                    let max_edits = b.pattern_max_edits(pattern_idx).unwrap_or(0);
2031                    Some((char_len, max_edits))
2032                })
2033                .or_else(|| {
2034                    // For Char states (not in bridge), use pattern length with 0 edits
2035                    literals
2036                        .get(pattern_idx)
2037                        .map(|l| (l.text.chars().count(), 0))
2038                })
2039        });
2040
2041        let sub_matcher = Matcher::new(
2042            nfa,
2043            bridge,
2044            0,
2045            MatcherConfig {
2046                unanchored: false,
2047                ..self.config.clone()
2048            },
2049        );
2050
2051        // For fixed-length patterns, only try exact position
2052        // This is O(1) - we just need to find the byte offset for one position
2053        if let Some(max) = max_char_len
2054            && min_char_len == max
2055        {
2056            // Fixed length - compute the exact starting byte position
2057            // by counting back min_char_len characters from pos
2058            let text_before = &text[..pos];
2059            let start_byte = Self::nth_char_from_end_byte_offset(text_before, min_char_len);
2060            if let Some(start) = start_byte
2061                && let Some(m) = sub_matcher.find_at(text, start)
2062                && m.end == pos
2063            {
2064                return true;
2065            }
2066            return false;
2067        }
2068
2069        // Variable length - need to compute char offsets for the search range
2070        let text_before = &text[..pos];
2071        let num_chars = text_before.chars().count();
2072
2073        // Quick reject
2074        if min_char_len > num_chars {
2075            return false;
2076        }
2077
2078        let max_char_search = max_char_len.unwrap_or(num_chars.min(256));
2079
2080        // Only compute char_offsets for the positions we need to check
2081        let search_start_char = num_chars.saturating_sub(max_char_search);
2082        let search_end_char = num_chars.saturating_sub(min_char_len);
2083
2084        // Collect only the byte offsets we need
2085        let mut char_count = 0;
2086        let mut positions = Vec::new();
2087        for (byte_idx, _) in text_before.char_indices() {
2088            if char_count >= search_start_char && char_count <= search_end_char {
2089                positions.push(byte_idx);
2090            }
2091            char_count += 1;
2092            if char_count > search_end_char {
2093                break;
2094            }
2095        }
2096
2097        // Try positions from longest match first
2098        for start_byte in positions.into_iter().rev() {
2099            if let Some(m) = sub_matcher.find_at(text, start_byte)
2100                && m.end == pos
2101            {
2102                return true;
2103            }
2104        }
2105
2106        false
2107    }
2108
2109    /// Get the byte offset of the nth character from the end of a string.
2110    /// Returns None if there aren't enough characters.
2111    /// This is O(n) where n is the lookbehind length, not the text length.
2112    fn nth_char_from_end_byte_offset(s: &str, n: usize) -> Option<usize> {
2113        if n == 0 {
2114            return Some(s.len());
2115        }
2116        // Walk backwards from the end, counting characters
2117        // This is O(n) where n is the number of chars to skip
2118        let bytes = s.as_bytes();
2119        let mut byte_pos = bytes.len();
2120        let mut chars_seen = 0;
2121
2122        while byte_pos > 0 && chars_seen < n {
2123            byte_pos -= 1;
2124            // Check if this is a UTF-8 start byte (not a continuation byte 10xxxxxx)
2125            if bytes[byte_pos] & 0b1100_0000 != 0b1000_0000 {
2126                chars_seen += 1;
2127            }
2128        }
2129
2130        if chars_seen == n {
2131            Some(byte_pos)
2132        } else {
2133            None
2134        }
2135    }
2136}
2137
2138/// Compute the Levenshtein edit distance between two strings.
2139/// Optimized with stack allocation for small strings (common in backreferences).
2140#[cfg(test)]
2141#[inline]
2142fn edit_distance(s1: &str, s2: &str) -> usize {
2143    edit_distance_bounded(s1, s2, NumEdits::MAX).into()
2144}
2145
2146/// Bounded edit distance with early termination when distance exceeds `max_edits`.
2147/// Returns `usize::MAX` if distance would exceed `max_edits`.
2148#[inline]
2149fn edit_distance_bounded(s1: &str, s2: &str, max_edits: NumEdits) -> Distance {
2150    // Quick check: length difference gives lower bound on edit distance
2151    let len_diff = s1.len().abs_diff(s2.len());
2152    if len_diff > max_edits as usize {
2153        return Distance::MAX;
2154    }
2155
2156    // Fast path for ASCII strings (common case)
2157    if s1.is_ascii() && s2.is_ascii() {
2158        return edit_distance_ascii_bounded(s1.as_bytes(), s2.as_bytes(), max_edits);
2159    }
2160
2161    // Unicode path
2162    let s1_chars: Vec<char> = s1.chars().collect();
2163    let s2_chars: Vec<char> = s2.chars().collect();
2164
2165    // Recheck with char length
2166    let char_diff = s1_chars.len().abs_diff(s2_chars.len());
2167    if char_diff > max_edits as usize {
2168        return Distance::MAX;
2169    }
2170
2171    edit_distance_slice_bounded(&s1_chars, &s2_chars, max_edits as usize)
2172}
2173
2174/// Bounded edit distance for ASCII byte slices with early termination.
2175#[inline]
2176fn edit_distance_ascii_bounded(s1: &[u8], s2: &[u8], max_edits: NumEdits) -> Distance {
2177    let m = s1.len();
2178    let n = s2.len();
2179
2180    if m == 0 {
2181        return if n <= max_edits.into() {
2182            n as Distance
2183        } else {
2184            Distance::MAX
2185        };
2186    }
2187    if n == 0 {
2188        return if m <= max_edits.into() {
2189            m as Distance
2190        } else {
2191            Distance::MAX
2192        };
2193    }
2194
2195    // Stack allocation for small strings (covers most backreference cases)
2196    const STACK_LIMIT: usize = 64;
2197    if n < STACK_LIMIT {
2198        let mut prev = [0usize; STACK_LIMIT];
2199        let mut curr = [0usize; STACK_LIMIT];
2200
2201        for j in 0..=n {
2202            prev[j] = j;
2203        }
2204
2205        for i in 1..=m {
2206            curr[0] = i;
2207            let mut row_min = curr[0];
2208            for j in 1..=n {
2209                let cost = usize::from(s1[i - 1] != s2[j - 1]);
2210                curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
2211                row_min = row_min.min(curr[j]);
2212            }
2213            // Early termination: if minimum in row exceeds max_edits, distance will too
2214            if row_min > max_edits.into() {
2215                return Distance::MAX;
2216            }
2217            std::mem::swap(&mut prev, &mut curr);
2218        }
2219        return prev[n] as Distance;
2220    }
2221
2222    // Heap allocation for larger strings
2223    let mut prev: Vec<usize> = (0..=n).collect();
2224    let mut curr = vec![0; n + 1];
2225
2226    for i in 1..=m {
2227        curr[0] = i;
2228        let mut row_min = curr[0];
2229        for j in 1..=n {
2230            let cost = usize::from(s1[i - 1] != s2[j - 1]);
2231            curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
2232            row_min = row_min.min(curr[j]);
2233        }
2234        if row_min > max_edits.into() {
2235            return Distance::MAX;
2236        }
2237        std::mem::swap(&mut prev, &mut curr);
2238    }
2239    prev[n] as Distance
2240}
2241
2242/// Bounded edit distance for char slices with early termination.
2243#[inline]
2244fn edit_distance_slice_bounded(s1: &[char], s2: &[char], max_edits: usize) -> Distance {
2245    let m = s1.len();
2246    let n = s2.len();
2247
2248    if m == 0 {
2249        return if n <= max_edits {
2250            n as Distance
2251        } else {
2252            Distance::MAX
2253        };
2254    }
2255    if n == 0 {
2256        return if m <= max_edits {
2257            m as Distance
2258        } else {
2259            Distance::MAX
2260        };
2261    }
2262
2263    // Stack allocation for small strings
2264    const STACK_LIMIT: usize = 64;
2265    if n < STACK_LIMIT {
2266        let mut prev = [0usize; STACK_LIMIT];
2267        let mut curr = [0usize; STACK_LIMIT];
2268
2269        for j in 0..=n {
2270            prev[j] = j;
2271        }
2272
2273        for i in 1..=m {
2274            curr[0] = i;
2275            let mut row_min = curr[0];
2276            for j in 1..=n {
2277                let cost = usize::from(s1[i - 1] != s2[j - 1]);
2278                curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
2279                row_min = row_min.min(curr[j]);
2280            }
2281            if row_min > max_edits {
2282                return Distance::MAX;
2283            }
2284            std::mem::swap(&mut prev, &mut curr);
2285        }
2286        return prev[n] as Distance;
2287    }
2288
2289    // Heap allocation for larger strings
2290    let mut prev: Vec<usize> = (0..=n).collect();
2291    let mut curr = vec![0; n + 1];
2292
2293    for i in 1..=m {
2294        curr[0] = i;
2295        let mut row_min = curr[0];
2296        for j in 1..=n {
2297            let cost = usize::from(s1[i - 1] != s2[j - 1]);
2298            curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
2299            row_min = row_min.min(curr[j]);
2300        }
2301        if row_min > max_edits {
2302            return Distance::MAX;
2303        }
2304        std::mem::swap(&mut prev, &mut curr);
2305    }
2306    prev[n] as Distance
2307}
2308
2309#[cfg(test)]
2310mod tests {
2311    use super::*;
2312    use crate::compiler::build_nfa;
2313    use crate::ir::lower;
2314    use crate::parser::parse;
2315
2316    fn make_matcher(pattern: &str) -> (Nfa, Option<FuzzyBridge>, usize) {
2317        let ast = parse(pattern).unwrap();
2318        let hir = lower(&ast, 0);
2319        let (nfa, literals) = build_nfa(&hir);
2320
2321        let bridge = FuzzyBridge::new(&literals, None, None, false);
2322
2323        // Count capture groups from AST
2324        let capture_count = count_captures(&ast);
2325
2326        (nfa, bridge, capture_count)
2327    }
2328
2329    fn count_captures(ast: &crate::parser::Ast) -> usize {
2330        use crate::parser::Ast;
2331        match ast {
2332            Ast::Group { index, expr, .. } => (*index).max(count_captures(expr)),
2333            Ast::NonCapturingGroup { expr, .. } => count_captures(expr),
2334            Ast::Concat(parts) => parts.iter().map(count_captures).max().unwrap_or(0),
2335            Ast::Alternation(alts) => alts.iter().map(count_captures).max().unwrap_or(0),
2336            Ast::Quantified { expr, .. } => count_captures(expr),
2337            Ast::Lookahead { expr, .. } => count_captures(expr),
2338            Ast::Lookbehind { expr, .. } => count_captures(expr),
2339            _ => 0,
2340        }
2341    }
2342
2343    #[test]
2344    fn test_simple_match() {
2345        let (nfa, bridge, captures) = make_matcher("hello");
2346        let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2347
2348        let result = matcher.find("hello world");
2349        assert!(result.is_some());
2350        let m = result.unwrap();
2351        assert_eq!(m.start, 0);
2352        assert_eq!(m.end, 5);
2353    }
2354
2355    #[test]
2356    fn test_find_no_cache() {
2357        let (nfa, bridge, captures) = make_matcher("hello");
2358        let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2359
2360        let result = matcher.find_no_cache("hello world");
2361        assert!(result.is_some());
2362        let m = result.unwrap();
2363        assert_eq!(m.start, 0);
2364        assert_eq!(m.end, 5);
2365    }
2366
2367    #[test]
2368    fn test_find_no_cache_fuzzy() {
2369        let (nfa, bridge, captures) = make_matcher("hello~1");
2370        let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2371
2372        let result = matcher.find_no_cache("hallo world");
2373        assert!(result.is_some());
2374        let m = result.unwrap();
2375        assert_eq!(m.start, 0);
2376        assert_eq!(m.end, 5);
2377    }
2378
2379    #[test]
2380    fn test_char_class() {
2381        let (nfa, bridge, captures) = make_matcher("[a-z]+");
2382        let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2383
2384        let result = matcher.find("123abc456");
2385        assert!(result.is_some());
2386        let m = result.unwrap();
2387        assert_eq!(&"123abc456"[m.start..m.end], "abc");
2388    }
2389
2390    #[test]
2391    fn test_capture_group() {
2392        let (nfa, bridge, captures) = make_matcher("(abc)");
2393        let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2394
2395        let result = matcher.find("xyzabc123");
2396        assert!(result.is_some());
2397        let m = result.unwrap();
2398        assert!(m.captures.get(1).is_some());
2399    }
2400
2401    #[test]
2402    fn test_anchors() {
2403        let (nfa, bridge, captures) = make_matcher("^hello");
2404        let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2405
2406        assert!(matcher.find("hello world").is_some());
2407        assert!(matcher.find("say hello").is_none());
2408    }
2409
2410    #[test]
2411    fn test_alternation() {
2412        let (nfa, bridge, captures) = make_matcher("cat|dog");
2413        let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2414
2415        assert!(matcher.find("I have a cat").is_some());
2416        assert!(matcher.find("I have a dog").is_some());
2417        assert!(matcher.find("I have a bird").is_none());
2418    }
2419
2420    #[test]
2421    fn test_edit_distance() {
2422        // Exact matches
2423        assert_eq!(edit_distance("hello", "hello"), 0);
2424        assert_eq!(edit_distance("", ""), 0);
2425
2426        // Single edits
2427        assert_eq!(edit_distance("hello", "hallo"), 1); // substitution
2428        assert_eq!(edit_distance("hello", "hell"), 1); // deletion
2429        assert_eq!(edit_distance("hello", "helloo"), 1); // insertion
2430        assert_eq!(edit_distance("cat", "hat"), 1); // substitution
2431
2432        // Multiple edits
2433        assert_eq!(edit_distance("kitten", "sitting"), 3);
2434        assert_eq!(edit_distance("saturday", "sunday"), 3);
2435
2436        // Edge cases
2437        assert_eq!(edit_distance("", "abc"), 3);
2438        assert_eq!(edit_distance("abc", ""), 3);
2439        assert_eq!(edit_distance("a", "b"), 1);
2440
2441        // Verify Bitap matches DP for various lengths
2442        for (s1, s2, expected) in [
2443            ("fox", "box", 1),
2444            ("quick", "quack", 1),
2445            ("brown", "brawn", 1),
2446            ("test", "taste", 2),
2447            ("abc", "xyz", 3),
2448        ] {
2449            assert_eq!(edit_distance(s1, s2), expected, "{s1} vs {s2}");
2450        }
2451    }
2452}
fuzzy_regex/engine/matcher.rs

fuzzy_regex/engine/
matcher.rs