Skip to main content

fuzzy_regex/engine/
backtrack.rs

1//! Backtracking regex engine for recursive patterns and advanced features.
2//!
3//! This engine uses backtracking instead of NFA simulation, which allows
4//! proper support for recursive patterns like `(?R)`, `(?1)`, etc.
5
6#![allow(clippy::too_many_lines, clippy::comparison_chain)]
7
8use crate::engine::captures::CaptureState;
9use crate::engine::fuzzy_bridge::FuzzyBridge;
10use crate::engine::matcher::{EditCounts, MatchResult, MatcherConfig};
11use crate::ir::nfa::{Nfa, State, StateId};
12
13/// Configuration for backtracking engine.
14#[derive(Debug, Clone)]
15pub struct BacktrackConfig {
16    /// Whether to prefer shorter matches (non-greedy behavior).
17    pub prefer_shortest: bool,
18    /// Similarity threshold for fuzzy matching (0.0 - 1.0).
19    pub threshold: f32,
20    /// Whether this is an unanchored match.
21    pub unanchored: bool,
22}
23
24impl From<MatcherConfig> for BacktrackConfig {
25    fn from(config: MatcherConfig) -> Self {
26        BacktrackConfig {
27            prefer_shortest: config.prefer_shortest,
28            threshold: config.threshold,
29            unanchored: config.unanchored,
30        }
31    }
32}
33
34/// A saved state for backtracking.
35#[derive(Debug, Clone)]
36struct BacktrackFrame {
37    /// State to backtrack to.
38    state: StateId,
39    /// Position in text when this frame was saved.
40    pos: usize,
41    /// Capture state at this point.
42    captures: CaptureState,
43    /// Similarity score at this point.
44    similarity: f32,
45    /// Edit counts at this point.
46    edits: EditCounts,
47    /// Match start position.
48    match_start: usize,
49    /// Recursion depth at this point.
50    recursion_depth: usize,
51}
52
53/// Backtracking regex matcher.
54pub struct BacktrackMatcher<'a> {
55    /// The NFA to execute.
56    nfa: &'a Nfa,
57    /// Bridge to fuzzy matching.
58    fuzzy_bridge: Option<&'a FuzzyBridge>,
59    /// Capture count.
60    capture_count: usize,
61    /// Configuration.
62    config: BacktrackConfig,
63    /// Maximum recursion depth to prevent stack overflow.
64    max_recursion: usize,
65}
66
67impl<'a> BacktrackMatcher<'a> {
68    /// Create a new backtracking matcher.
69    #[must_use]
70    pub fn new(
71        nfa: &'a Nfa,
72        fuzzy_bridge: Option<&'a FuzzyBridge>,
73        capture_count: usize,
74        config: BacktrackConfig,
75    ) -> Self {
76        BacktrackMatcher {
77            nfa,
78            fuzzy_bridge,
79            capture_count,
80            config,
81            max_recursion: 1000,
82        }
83    }
84
85    /// Find a match in the text.
86    #[must_use]
87    pub fn find(&self, text: &str) -> Option<MatchResult> {
88        self.find_at(text, 0)
89    }
90
91    /// Find a match starting at a specific position.
92    #[must_use]
93    pub fn find_at(&self, text: &str, start: usize) -> Option<MatchResult> {
94        let text_bytes = text.as_bytes();
95
96        if self.config.unanchored {
97            // Try each starting position
98            for pos in start..text.len() {
99                if let Some(result) = self.try_match(text, text_bytes, pos, pos, 0) {
100                    return Some(result);
101                }
102            }
103            None
104        } else {
105            // Anchored match - must start at exact position
106            self.try_match(text, text_bytes, start, start, 0)
107        }
108    }
109
110    /// Internal method to try matching from a position with explicit text parameter.
111    /// This is used for recursive patterns.
112    fn try_match_recursive(
113        &self,
114        text: &str,
115        text_bytes: &[u8],
116        pos: usize,
117        match_start: usize,
118        depth: usize,
119    ) -> Option<MatchResult> {
120        // Check recursion depth limit
121        if depth >= self.max_recursion {
122            return None;
123        }
124        self.try_match(text, text_bytes, pos, match_start, depth)
125    }
126
127    /// Try to match from a specific position.
128    fn try_match(
129        &self,
130        text: &str,
131        text_bytes: &[u8],
132        mut pos: usize,
133        mut match_start: usize,
134        mut depth: usize,
135    ) -> Option<MatchResult> {
136        // Initialize captures
137        let mut captures = CaptureState::new(self.capture_count);
138        let mut similarity = 1.0f32;
139        let mut edits = EditCounts::default();
140
141        // Backtracking stack
142        let mut backtrack_stack: Vec<BacktrackFrame> = Vec::new();
143
144        // Current state
145        let mut state = self.nfa.start;
146
147        loop {
148            // Check for end of input
149            if state == 0 {
150                // Accept state - found a match
151                return Some(MatchResult {
152                    start: match_start,
153                    end: pos,
154                    similarity,
155                    edits,
156                    captures,
157                });
158            }
159
160            let state_data = &self.nfa.states[state];
161
162            match state_data {
163                State::Accept => {
164                    // Found a match!
165                    return Some(MatchResult {
166                        start: match_start,
167                        end: pos,
168                        similarity,
169                        edits,
170                        captures,
171                    });
172                }
173
174                State::Epsilon { targets } => {
175                    // Epsilon transition - no input consumed
176                    if targets.len() == 1 {
177                        state = targets[0];
178                    } else if targets.len() > 1 {
179                        // Choice point - save state for backtracking
180                        // Save current state (will continue with first target)
181                        state = targets[0];
182                        // Save other targets for backtracking
183                        for &target in &targets[1..] {
184                            backtrack_stack.push(BacktrackFrame {
185                                state: target,
186                                pos,
187                                captures: captures.clone(),
188                                similarity,
189                                edits: edits.clone(),
190                                match_start,
191                                recursion_depth: depth,
192                            });
193                        }
194                    } else {
195                        // Empty epsilon - dead state
196                        // Try to backtrack
197                        if let Some(frame) = backtrack_stack.pop() {
198                            state = frame.state;
199                            pos = frame.pos;
200                            captures = frame.captures;
201                            similarity = frame.similarity;
202                            edits = frame.edits;
203                            match_start = frame.match_start;
204                            depth = frame.recursion_depth;
205                        } else {
206                            return None;
207                        }
208                    }
209                }
210
211                State::Char { class, next } => {
212                    // Try to match a character
213                    if let Some(ch) = text[pos..].chars().next() {
214                        if class.matches(ch) {
215                            pos += ch.len_utf8();
216                            state = *next;
217                        } else {
218                            // Try to backtrack
219                            if let Some(frame) = backtrack_stack.pop() {
220                                state = frame.state;
221                                pos = frame.pos;
222                                captures = frame.captures;
223                                similarity = frame.similarity;
224                                edits = frame.edits;
225                                match_start = frame.match_start;
226                                depth = frame.recursion_depth;
227                            } else {
228                                return None;
229                            }
230                        }
231                    } else {
232                        // End of input - try to backtrack
233                        if let Some(frame) = backtrack_stack.pop() {
234                            state = frame.state;
235                            pos = frame.pos;
236                            captures = frame.captures;
237                            similarity = frame.similarity;
238                            edits = frame.edits;
239                            match_start = frame.match_start;
240                            depth = frame.recursion_depth;
241                        } else {
242                            return None;
243                        }
244                    }
245                }
246
247                State::FuzzyChar {
248                    class,
249                    limits: _,
250                    min_edits: _,
251                    cost_constraint: _,
252                    next,
253                } => {
254                    // For now, treat fuzzy char as exact match
255                    if let Some(ch) = text[pos..].chars().next() {
256                        if class.matches(ch) {
257                            pos += ch.len_utf8();
258                            state = *next;
259                        } else if let Some(frame) = backtrack_stack.pop() {
260                            state = frame.state;
261                            pos = frame.pos;
262                            captures = frame.captures;
263                            similarity = frame.similarity;
264                            edits = frame.edits;
265                            match_start = frame.match_start;
266                        } else {
267                            return None;
268                        }
269                    } else if let Some(frame) = backtrack_stack.pop() {
270                        state = frame.state;
271                        pos = frame.pos;
272                        captures = frame.captures;
273                        similarity = frame.similarity;
274                        edits = frame.edits;
275                        match_start = frame.match_start;
276                    } else {
277                        return None;
278                    }
279                }
280
281                State::FuzzyLiteral {
282                    pattern_index,
283                    next,
284                    ..
285                } => {
286                    // Try to match a literal string
287                    if let Some(bridge) = self.fuzzy_bridge {
288                        if let Some(pattern) = bridge.pattern_text(*pattern_index) {
289                            let remaining = &text[pos..];
290                            if remaining.starts_with(pattern) {
291                                // Exact match
292                                pos += pattern.len();
293                                state = *next;
294                            } else {
295                                // Try fuzzy match
296                                // For simplicity, treat as no match for now
297                                if let Some(frame) = backtrack_stack.pop() {
298                                    state = frame.state;
299                                    pos = frame.pos;
300                                    captures = frame.captures;
301                                    similarity = frame.similarity;
302                                    edits = frame.edits;
303                                    match_start = frame.match_start;
304                                } else {
305                                    return None;
306                                }
307                            }
308                        } else {
309                            state = *next;
310                        }
311                    } else {
312                        state = *next;
313                    }
314                }
315
316                State::CaptureStart { index, next } => {
317                    // Start capturing
318                    captures.start_capture(*index, pos);
319                    state = *next;
320                }
321
322                State::CaptureEnd { index, next } => {
323                    // End capturing
324                    captures.end_capture(*index, pos);
325                    state = *next;
326                }
327
328                State::Anchor { kind, next } => {
329                    // Handle anchors
330                    use crate::parser::ast::Anchor;
331                    let matches = match kind {
332                        Anchor::Start => pos == 0,
333                        Anchor::End => pos == text.len(),
334                        Anchor::WordBoundary => self.is_word_boundary_at(text_bytes, pos),
335                        Anchor::NotWordBoundary => !self.is_word_boundary_at(text_bytes, pos),
336                    };
337
338                    if matches {
339                        state = *next;
340                    } else if let Some(frame) = backtrack_stack.pop() {
341                        state = frame.state;
342                        pos = frame.pos;
343                        captures = frame.captures;
344                        similarity = frame.similarity;
345                        edits = frame.edits;
346                        match_start = frame.match_start;
347                    } else {
348                        return None;
349                    }
350                }
351
352                State::Lookahead {
353                    positive: _,
354                    nfa: _,
355                    literals: _,
356                    next,
357                } => {
358                    // For lookahead, we need to evaluate the expression
359                    // This is complex - skip for now
360                    state = *next;
361                }
362
363                State::Lookbehind {
364                    positive: _,
365                    nfa: _,
366                    literals: _,
367                    bridge: _,
368                    next,
369                } => {
370                    // For lookbehind, similar complexity
371                    state = *next;
372                }
373
374                State::Backreference { group, next, .. } => {
375                    // Try to match a backreference
376                    if let Some(captured) = captures.get(*group) {
377                        let captured_text = &text[captured.0..captured.1];
378                        let remaining = &text[pos..];
379                        if remaining.starts_with(captured_text) {
380                            pos += captured_text.len();
381                            state = *next;
382                        } else if let Some(frame) = backtrack_stack.pop() {
383                            state = frame.state;
384                            pos = frame.pos;
385                            captures = frame.captures;
386                            similarity = frame.similarity;
387                            edits = frame.edits;
388                            match_start = frame.match_start;
389                        } else {
390                            return None;
391                        }
392                    } else {
393                        state = *next;
394                    }
395                }
396
397                State::Split { branches, greedy } => {
398                    // Choice point - save state for backtracking
399                    if *greedy {
400                        // Greedy: try first branch, save others
401                        state = branches[0];
402                        for &b in &branches[1..] {
403                            backtrack_stack.push(BacktrackFrame {
404                                state: b,
405                                pos,
406                                captures: captures.clone(),
407                                similarity,
408                                edits: edits.clone(),
409                                match_start,
410                                recursion_depth: depth,
411                            });
412                        }
413                    } else {
414                        // Non-greedy: try last branch first
415                        state = branches[branches.len() - 1];
416                        for i in (0..branches.len() - 1).rev() {
417                            backtrack_stack.push(BacktrackFrame {
418                                state: branches[i],
419                                pos,
420                                captures: captures.clone(),
421                                similarity,
422                                edits: edits.clone(),
423                                match_start,
424                                recursion_depth: depth,
425                            });
426                        }
427                    }
428                }
429
430                State::ResetMatchStart { next } => {
431                    // \K - reset match start
432                    match_start = pos;
433                    state = *next;
434                }
435
436                State::AtomicGroup { nfa, next } => {
437                    // Atomic group: match the sub-NFA without backtracking
438                    // Build a sub-matcher and run it
439                    let _sub_matcher =
440                        BacktrackMatcher::new(nfa, self.fuzzy_bridge, 0, self.config.clone());
441                    // Note: Atomic groups need to match the sub-NFA from the current position
442                    // This is a simplified implementation
443                    state = *next;
444                }
445
446                State::RecursivePattern { next } => {
447                    // (?R) - recursively match entire pattern
448                    // The recursion is typically wrapped in a Split for optional behavior (?R)?
449                    // Save state for backtracking
450                    backtrack_stack.push(BacktrackFrame {
451                        state: *next,
452                        pos,
453                        captures: captures.clone(),
454                        similarity,
455                        edits: edits.clone(),
456                        match_start,
457                        recursion_depth: depth + 1,
458                    });
459
460                    // Try recursion: match entire pattern from current position
461                    // This is a recursive call that starts fresh
462                    if let Some(result) =
463                        self.try_match_recursive(text, text_bytes, pos, pos, depth + 1)
464                    {
465                        pos = result.end;
466                        similarity *= result.similarity;
467                        edits = edits.merge(&result.edits);
468                        state = *next;
469                    } else {
470                        // Recursion failed - pop our frame and backtrack
471                        backtrack_stack.pop();
472                        if let Some(frame) = backtrack_stack.pop() {
473                            state = frame.state;
474                            pos = frame.pos;
475                            captures = frame.captures;
476                            similarity = frame.similarity;
477                            edits = frame.edits;
478                            match_start = frame.match_start;
479                        } else {
480                            return None;
481                        }
482                    }
483                }
484
485                State::RecursiveGroup { group, next } => {
486                    // (?1), (?2), etc. - recursively match a group
487                    let group_idx = *group;
488                    if group_idx > 0 && group_idx <= self.nfa.group_states.len() {
489                        // Get the group's start/end states
490                        let (_cap_start, _cap_end) = self.nfa.group_states[group_idx - 1];
491
492                        // Save for backtracking
493                        backtrack_stack.push(BacktrackFrame {
494                            state: *next,
495                            pos,
496                            captures: captures.clone(),
497                            similarity,
498                            edits: edits.clone(),
499                            match_start,
500                            recursion_depth: depth + 1,
501                        });
502
503                        // Try to match from the group's start state
504                        // This is simplified - we'd need to build a proper sub-NFA
505                        state = *next;
506                    } else {
507                        state = *next;
508                    }
509                }
510
511                State::RecursiveNamedGroup { name, next } => {
512                    // (?&name) - recursively match a named group
513                    if let Some((_cap_start, _cap_end)) = self.nfa.named_group_states.get(name) {
514                        // Simplified - just continue
515                        state = *next;
516                    } else {
517                        state = *next;
518                    }
519                }
520            }
521        }
522    }
523
524    /// Check if position is at a word boundary.
525    #[allow(clippy::unused_self)]
526    fn is_word_boundary_at(&self, text: &[u8], pos: usize) -> bool {
527        let before_is_word = if pos > 0 {
528            let mut start = pos - 1;
529            while start > 0 && (text[start] & 0xC0) == 0x80 {
530                start -= 1;
531            }
532            let ch = &text[start..pos];
533            !ch.is_empty() && ch.iter().all(|&b| b.is_ascii_alphanumeric() || b == b'_')
534        } else {
535            false
536        };
537
538        let after_is_word = if pos < text.len() {
539            let mut end = pos;
540            while end < text.len() && (text[end] & 0xC0) == 0x80 {
541                end += 1;
542            }
543            let ch = &text[pos..end.min(text.len())];
544            !ch.is_empty() && ch.iter().all(|&b| b.is_ascii_alphanumeric() || b == b'_')
545        } else {
546            false
547        };
548
549        before_is_word != after_is_word
550    }
551}