fuzzy_regex/engine/backtrack.rs
1//! Backtracking regex engine for recursive patterns and advanced features.
2//!
3//! This engine uses backtracking instead of NFA simulation, which allows
4//! proper support for recursive patterns like `(?R)`, `(?1)`, etc.
5
6#![allow(clippy::too_many_lines, clippy::comparison_chain)]
7
8use crate::engine::captures::CaptureState;
9use crate::engine::fuzzy_bridge::FuzzyBridge;
10use crate::engine::matcher::{EditCounts, MatchResult, MatcherConfig};
11use crate::ir::nfa::{Nfa, State, StateId};
12
13/// Configuration for backtracking engine.
14#[derive(Debug, Clone)]
15pub struct BacktrackConfig {
16 /// Whether to prefer shorter matches (non-greedy behavior).
17 pub prefer_shortest: bool,
18 /// Similarity threshold for fuzzy matching (0.0 - 1.0).
19 pub threshold: f32,
20 /// Whether this is an unanchored match.
21 pub unanchored: bool,
22}
23
24impl From<MatcherConfig> for BacktrackConfig {
25 fn from(config: MatcherConfig) -> Self {
26 BacktrackConfig {
27 prefer_shortest: config.prefer_shortest,
28 threshold: config.threshold,
29 unanchored: config.unanchored,
30 }
31 }
32}
33
34/// A saved state for backtracking.
35#[derive(Debug, Clone)]
36struct BacktrackFrame {
37 /// State to backtrack to.
38 state: StateId,
39 /// Position in text when this frame was saved.
40 pos: usize,
41 /// Capture state at this point.
42 captures: CaptureState,
43 /// Similarity score at this point.
44 similarity: f32,
45 /// Edit counts at this point.
46 edits: EditCounts,
47 /// Match start position.
48 match_start: usize,
49 /// Recursion depth at this point.
50 recursion_depth: usize,
51}
52
53/// Backtracking regex matcher.
54pub struct BacktrackMatcher<'a> {
55 /// The NFA to execute.
56 nfa: &'a Nfa,
57 /// Bridge to fuzzy matching.
58 fuzzy_bridge: Option<&'a FuzzyBridge>,
59 /// Capture count.
60 capture_count: usize,
61 /// Configuration.
62 config: BacktrackConfig,
63 /// Maximum recursion depth to prevent stack overflow.
64 max_recursion: usize,
65}
66
67impl<'a> BacktrackMatcher<'a> {
68 /// Create a new backtracking matcher.
69 #[must_use]
70 pub fn new(
71 nfa: &'a Nfa,
72 fuzzy_bridge: Option<&'a FuzzyBridge>,
73 capture_count: usize,
74 config: BacktrackConfig,
75 ) -> Self {
76 BacktrackMatcher {
77 nfa,
78 fuzzy_bridge,
79 capture_count,
80 config,
81 max_recursion: 1000,
82 }
83 }
84
85 /// Find a match in the text.
86 #[must_use]
87 pub fn find(&self, text: &str) -> Option<MatchResult> {
88 self.find_at(text, 0)
89 }
90
91 /// Find a match starting at a specific position.
92 #[must_use]
93 pub fn find_at(&self, text: &str, start: usize) -> Option<MatchResult> {
94 let text_bytes = text.as_bytes();
95
96 if self.config.unanchored {
97 // Try each starting position
98 for pos in start..text.len() {
99 if let Some(result) = self.try_match(text, text_bytes, pos, pos, 0) {
100 return Some(result);
101 }
102 }
103 None
104 } else {
105 // Anchored match - must start at exact position
106 self.try_match(text, text_bytes, start, start, 0)
107 }
108 }
109
110 /// Internal method to try matching from a position with explicit text parameter.
111 /// This is used for recursive patterns.
112 fn try_match_recursive(
113 &self,
114 text: &str,
115 text_bytes: &[u8],
116 pos: usize,
117 match_start: usize,
118 depth: usize,
119 ) -> Option<MatchResult> {
120 // Check recursion depth limit
121 if depth >= self.max_recursion {
122 return None;
123 }
124 self.try_match(text, text_bytes, pos, match_start, depth)
125 }
126
127 /// Try to match from a specific position.
128 fn try_match(
129 &self,
130 text: &str,
131 text_bytes: &[u8],
132 mut pos: usize,
133 mut match_start: usize,
134 mut depth: usize,
135 ) -> Option<MatchResult> {
136 // Initialize captures
137 let mut captures = CaptureState::new(self.capture_count);
138 let mut similarity = 1.0f32;
139 let mut edits = EditCounts::default();
140
141 // Backtracking stack
142 let mut backtrack_stack: Vec<BacktrackFrame> = Vec::new();
143
144 // Current state
145 let mut state = self.nfa.start;
146
147 loop {
148 // Check for end of input
149 if state == 0 {
150 // Accept state - found a match
151 return Some(MatchResult {
152 start: match_start,
153 end: pos,
154 similarity,
155 edits,
156 captures,
157 });
158 }
159
160 let state_data = &self.nfa.states[state];
161
162 match state_data {
163 State::Accept => {
164 // Found a match!
165 return Some(MatchResult {
166 start: match_start,
167 end: pos,
168 similarity,
169 edits,
170 captures,
171 });
172 }
173
174 State::Epsilon { targets } => {
175 // Epsilon transition - no input consumed
176 if targets.len() == 1 {
177 state = targets[0];
178 } else if targets.len() > 1 {
179 // Choice point - save state for backtracking
180 // Save current state (will continue with first target)
181 state = targets[0];
182 // Save other targets for backtracking
183 for &target in &targets[1..] {
184 backtrack_stack.push(BacktrackFrame {
185 state: target,
186 pos,
187 captures: captures.clone(),
188 similarity,
189 edits: edits.clone(),
190 match_start,
191 recursion_depth: depth,
192 });
193 }
194 } else {
195 // Empty epsilon - dead state
196 // Try to backtrack
197 if let Some(frame) = backtrack_stack.pop() {
198 state = frame.state;
199 pos = frame.pos;
200 captures = frame.captures;
201 similarity = frame.similarity;
202 edits = frame.edits;
203 match_start = frame.match_start;
204 depth = frame.recursion_depth;
205 } else {
206 return None;
207 }
208 }
209 }
210
211 State::Char { class, next } => {
212 // Try to match a character
213 if let Some(ch) = text[pos..].chars().next() {
214 if class.matches(ch) {
215 pos += ch.len_utf8();
216 state = *next;
217 } else {
218 // Try to backtrack
219 if let Some(frame) = backtrack_stack.pop() {
220 state = frame.state;
221 pos = frame.pos;
222 captures = frame.captures;
223 similarity = frame.similarity;
224 edits = frame.edits;
225 match_start = frame.match_start;
226 depth = frame.recursion_depth;
227 } else {
228 return None;
229 }
230 }
231 } else {
232 // End of input - try to backtrack
233 if let Some(frame) = backtrack_stack.pop() {
234 state = frame.state;
235 pos = frame.pos;
236 captures = frame.captures;
237 similarity = frame.similarity;
238 edits = frame.edits;
239 match_start = frame.match_start;
240 depth = frame.recursion_depth;
241 } else {
242 return None;
243 }
244 }
245 }
246
247 State::FuzzyChar {
248 class,
249 limits: _,
250 min_edits: _,
251 cost_constraint: _,
252 next,
253 } => {
254 // For now, treat fuzzy char as exact match
255 if let Some(ch) = text[pos..].chars().next() {
256 if class.matches(ch) {
257 pos += ch.len_utf8();
258 state = *next;
259 } else if let Some(frame) = backtrack_stack.pop() {
260 state = frame.state;
261 pos = frame.pos;
262 captures = frame.captures;
263 similarity = frame.similarity;
264 edits = frame.edits;
265 match_start = frame.match_start;
266 } else {
267 return None;
268 }
269 } else if let Some(frame) = backtrack_stack.pop() {
270 state = frame.state;
271 pos = frame.pos;
272 captures = frame.captures;
273 similarity = frame.similarity;
274 edits = frame.edits;
275 match_start = frame.match_start;
276 } else {
277 return None;
278 }
279 }
280
281 State::FuzzyLiteral {
282 pattern_index,
283 next,
284 ..
285 } => {
286 // Try to match a literal string
287 if let Some(bridge) = self.fuzzy_bridge {
288 if let Some(pattern) = bridge.pattern_text(*pattern_index) {
289 let remaining = &text[pos..];
290 if remaining.starts_with(pattern) {
291 // Exact match
292 pos += pattern.len();
293 state = *next;
294 } else {
295 // Try fuzzy match
296 // For simplicity, treat as no match for now
297 if let Some(frame) = backtrack_stack.pop() {
298 state = frame.state;
299 pos = frame.pos;
300 captures = frame.captures;
301 similarity = frame.similarity;
302 edits = frame.edits;
303 match_start = frame.match_start;
304 } else {
305 return None;
306 }
307 }
308 } else {
309 state = *next;
310 }
311 } else {
312 state = *next;
313 }
314 }
315
316 State::CaptureStart { index, next } => {
317 // Start capturing
318 captures.start_capture(*index, pos);
319 state = *next;
320 }
321
322 State::CaptureEnd { index, next } => {
323 // End capturing
324 captures.end_capture(*index, pos);
325 state = *next;
326 }
327
328 State::Anchor { kind, next } => {
329 // Handle anchors
330 use crate::parser::ast::Anchor;
331 let matches = match kind {
332 Anchor::Start => pos == 0,
333 Anchor::End => pos == text.len(),
334 Anchor::WordBoundary => self.is_word_boundary_at(text_bytes, pos),
335 Anchor::NotWordBoundary => !self.is_word_boundary_at(text_bytes, pos),
336 };
337
338 if matches {
339 state = *next;
340 } else if let Some(frame) = backtrack_stack.pop() {
341 state = frame.state;
342 pos = frame.pos;
343 captures = frame.captures;
344 similarity = frame.similarity;
345 edits = frame.edits;
346 match_start = frame.match_start;
347 } else {
348 return None;
349 }
350 }
351
352 State::Lookahead {
353 positive: _,
354 nfa: _,
355 literals: _,
356 next,
357 } => {
358 // For lookahead, we need to evaluate the expression
359 // This is complex - skip for now
360 state = *next;
361 }
362
363 State::Lookbehind {
364 positive: _,
365 nfa: _,
366 literals: _,
367 bridge: _,
368 next,
369 } => {
370 // For lookbehind, similar complexity
371 state = *next;
372 }
373
374 State::Backreference { group, next, .. } => {
375 // Try to match a backreference
376 if let Some(captured) = captures.get(*group) {
377 let captured_text = &text[captured.0..captured.1];
378 let remaining = &text[pos..];
379 if remaining.starts_with(captured_text) {
380 pos += captured_text.len();
381 state = *next;
382 } else if let Some(frame) = backtrack_stack.pop() {
383 state = frame.state;
384 pos = frame.pos;
385 captures = frame.captures;
386 similarity = frame.similarity;
387 edits = frame.edits;
388 match_start = frame.match_start;
389 } else {
390 return None;
391 }
392 } else {
393 state = *next;
394 }
395 }
396
397 State::Split { branches, greedy } => {
398 // Choice point - save state for backtracking
399 if *greedy {
400 // Greedy: try first branch, save others
401 state = branches[0];
402 for &b in &branches[1..] {
403 backtrack_stack.push(BacktrackFrame {
404 state: b,
405 pos,
406 captures: captures.clone(),
407 similarity,
408 edits: edits.clone(),
409 match_start,
410 recursion_depth: depth,
411 });
412 }
413 } else {
414 // Non-greedy: try last branch first
415 state = branches[branches.len() - 1];
416 for i in (0..branches.len() - 1).rev() {
417 backtrack_stack.push(BacktrackFrame {
418 state: branches[i],
419 pos,
420 captures: captures.clone(),
421 similarity,
422 edits: edits.clone(),
423 match_start,
424 recursion_depth: depth,
425 });
426 }
427 }
428 }
429
430 State::ResetMatchStart { next } => {
431 // \K - reset match start
432 match_start = pos;
433 state = *next;
434 }
435
436 State::AtomicGroup { nfa, next } => {
437 // Atomic group: match the sub-NFA without backtracking
438 // Build a sub-matcher and run it
439 let _sub_matcher =
440 BacktrackMatcher::new(nfa, self.fuzzy_bridge, 0, self.config.clone());
441 // Note: Atomic groups need to match the sub-NFA from the current position
442 // This is a simplified implementation
443 state = *next;
444 }
445
446 State::RecursivePattern { next } => {
447 // (?R) - recursively match entire pattern
448 // The recursion is typically wrapped in a Split for optional behavior (?R)?
449 // Save state for backtracking
450 backtrack_stack.push(BacktrackFrame {
451 state: *next,
452 pos,
453 captures: captures.clone(),
454 similarity,
455 edits: edits.clone(),
456 match_start,
457 recursion_depth: depth + 1,
458 });
459
460 // Try recursion: match entire pattern from current position
461 // This is a recursive call that starts fresh
462 if let Some(result) =
463 self.try_match_recursive(text, text_bytes, pos, pos, depth + 1)
464 {
465 pos = result.end;
466 similarity *= result.similarity;
467 edits = edits.merge(&result.edits);
468 state = *next;
469 } else {
470 // Recursion failed - pop our frame and backtrack
471 backtrack_stack.pop();
472 if let Some(frame) = backtrack_stack.pop() {
473 state = frame.state;
474 pos = frame.pos;
475 captures = frame.captures;
476 similarity = frame.similarity;
477 edits = frame.edits;
478 match_start = frame.match_start;
479 } else {
480 return None;
481 }
482 }
483 }
484
485 State::RecursiveGroup { group, next } => {
486 // (?1), (?2), etc. - recursively match a group
487 let group_idx = *group;
488 if group_idx > 0 && group_idx <= self.nfa.group_states.len() {
489 // Get the group's start/end states
490 let (_cap_start, _cap_end) = self.nfa.group_states[group_idx - 1];
491
492 // Save for backtracking
493 backtrack_stack.push(BacktrackFrame {
494 state: *next,
495 pos,
496 captures: captures.clone(),
497 similarity,
498 edits: edits.clone(),
499 match_start,
500 recursion_depth: depth + 1,
501 });
502
503 // Try to match from the group's start state
504 // This is simplified - we'd need to build a proper sub-NFA
505 state = *next;
506 } else {
507 state = *next;
508 }
509 }
510
511 State::RecursiveNamedGroup { name, next } => {
512 // (?&name) - recursively match a named group
513 if let Some((_cap_start, _cap_end)) = self.nfa.named_group_states.get(name) {
514 // Simplified - just continue
515 state = *next;
516 } else {
517 state = *next;
518 }
519 }
520 }
521 }
522 }
523
524 /// Check if position is at a word boundary.
525 #[allow(clippy::unused_self)]
526 fn is_word_boundary_at(&self, text: &[u8], pos: usize) -> bool {
527 let before_is_word = if pos > 0 {
528 let mut start = pos - 1;
529 while start > 0 && (text[start] & 0xC0) == 0x80 {
530 start -= 1;
531 }
532 let ch = &text[start..pos];
533 !ch.is_empty() && ch.iter().all(|&b| b.is_ascii_alphanumeric() || b == b'_')
534 } else {
535 false
536 };
537
538 let after_is_word = if pos < text.len() {
539 let mut end = pos;
540 while end < text.len() && (text[end] & 0xC0) == 0x80 {
541 end += 1;
542 }
543 let ch = &text[pos..end.min(text.len())];
544 !ch.is_empty() && ch.iter().all(|&b| b.is_ascii_alphanumeric() || b == b'_')
545 } else {
546 false
547 };
548
549 before_is_word != after_is_word
550 }
551}