1#![allow(
4 clippy::needless_range_loop,
5 clippy::match_same_arms,
6 clippy::too_many_lines,
7 clippy::similar_names,
8 clippy::missing_panics_doc,
9 clippy::missing_errors_doc,
10 clippy::ref_option_ref,
11 clippy::let_underscore_untyped,
12 clippy::items_after_statements,
13 clippy::float_cmp,
14 clippy::allow_attributes,
15 let_underscore_drop
16)]
17
18use std::sync::Arc;
19
20use super::captures::CaptureState;
21use super::fuzzy_bridge::{CachedMatches, FuzzyBridge, FuzzyMatchResult};
22use crate::ir::{LiteralPattern, Nfa, State, StateId};
23use crate::parser::Anchor;
24use crate::types::{Distance, FuzzyLimits, NumEdits};
25
26#[derive(Debug, Clone)]
28struct Thread {
29 state: StateId,
31 pos: usize,
33 match_start: usize,
35 captures: CaptureState,
37 similarity: f32,
39 edits: EditCounts,
41}
42
43impl Default for Thread {
44 fn default() -> Self {
45 Thread {
46 state: 0,
47 pos: 0,
48 match_start: 0,
49 captures: CaptureState::new(0),
50 similarity: 1.0,
51 edits: EditCounts::default(),
52 }
53 }
54}
55
56#[derive(Debug, Clone, Default)]
58pub struct EditCounts {
59 pub insertions: u8,
61 pub deletions: u8,
63 pub substitutions: u8,
65 pub swaps: u8,
67}
68
69impl EditCounts {
70 #[must_use]
72 pub fn total(&self) -> u8 {
73 self.insertions
74 .saturating_add(self.deletions)
75 .saturating_add(self.substitutions)
76 .saturating_add(self.swaps)
77 }
78
79 #[must_use]
82 pub fn cost(&self, i_cost: u8, d_cost: u8, s_cost: u8, t_cost: u8) -> u16 {
83 u16::from(self.insertions) * u16::from(i_cost)
84 + u16::from(self.deletions) * u16::from(d_cost)
85 + u16::from(self.substitutions) * u16::from(s_cost)
86 + u16::from(self.swaps) * u16::from(t_cost)
87 }
88
89 #[must_use]
91 pub fn merge(&self, other: &EditCounts) -> Self {
92 EditCounts {
93 insertions: self.insertions.saturating_add(other.insertions),
94 deletions: self.deletions.saturating_add(other.deletions),
95 substitutions: self.substitutions.saturating_add(other.substitutions),
96 swaps: self.swaps.saturating_add(other.swaps),
97 }
98 }
99
100 #[must_use]
102 pub fn from_fuzzy_result(result: &FuzzyMatchResult) -> Self {
103 EditCounts {
104 insertions: result.insertions,
105 deletions: result.deletions,
106 substitutions: result.substitutions,
107 swaps: result.swaps,
108 }
109 }
110}
111
112#[derive(Debug, Clone)]
114pub struct MatchResult {
115 pub start: usize,
117 pub end: usize,
119 pub similarity: f32,
121 pub edits: EditCounts,
123 pub captures: CaptureState,
125}
126
127impl MatchResult {
128 #[must_use]
130 pub fn as_str<'a>(&self, text: &'a str) -> &'a str {
131 &text[self.start..self.end]
132 }
133}
134
135#[derive(Debug, Clone)]
137pub struct MatcherConfig {
138 pub threshold: f32,
140 pub max_threads: usize,
142 pub unanchored: bool,
144 pub best_match: bool,
146 pub enhance_match: bool,
148 pub posix: bool,
150 pub global: bool,
153 pub multi_line: bool,
155 pub prefer_shortest: bool,
158 pub unicode: bool,
160 pub greedy_first: bool,
163}
164
165impl Default for MatcherConfig {
166 fn default() -> Self {
167 MatcherConfig {
168 threshold: 0.8,
169 max_threads: 1000,
170 unanchored: true,
171 best_match: false,
172 enhance_match: false,
173 posix: false,
174 global: true, multi_line: false,
176 prefer_shortest: false,
177 unicode: false,
178 greedy_first: false,
179 }
180 }
181}
182
183pub struct Matcher<'a> {
185 nfa: &'a Nfa,
187 fuzzy_bridge: Option<&'a FuzzyBridge>,
189 capture_count: usize,
191 config: MatcherConfig,
193 prefilter: Arc<super::prefilter::Prefilter>,
195 is_simple_fuzzy: bool,
197 simple_alternation_indices: Option<Vec<usize>>,
199 multi_prefilter: Option<super::prefilter::Prefilter>,
201 first_char_class: Option<crate::ir::hir::HirClass>,
203 ends_with_end_anchor: bool,
206 max_simple_length: Option<usize>,
208}
209
210impl<'a> Matcher<'a> {
211 #[must_use]
213 pub fn new(
214 nfa: &'a Nfa,
215 fuzzy_bridge: Option<&'a FuzzyBridge>,
216 capture_count: usize,
217 config: MatcherConfig,
218 ) -> Self {
219 let is_simple_fuzzy =
220 nfa.is_simple_fuzzy_only() && fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1);
221 let first_char_class = nfa.first_char_class();
222 let ends_with_end_anchor = nfa.ends_with_end_anchor();
223 let max_simple_length = Self::calculate_max_length(nfa, fuzzy_bridge);
224 Matcher {
225 nfa,
226 fuzzy_bridge,
227 capture_count,
228 config,
229 prefilter: Arc::new(super::prefilter::Prefilter::None),
230 is_simple_fuzzy,
231 simple_alternation_indices: None,
232 multi_prefilter: None,
233 first_char_class,
234 ends_with_end_anchor,
235 max_simple_length,
236 }
237 }
238
239 #[must_use]
241 pub fn with_prefilter(
242 nfa: &'a Nfa,
243 fuzzy_bridge: Option<&'a FuzzyBridge>,
244 capture_count: usize,
245 config: MatcherConfig,
246 prefilter: Arc<super::prefilter::Prefilter>,
247 ) -> Self {
248 let is_simple_fuzzy =
249 nfa.is_simple_fuzzy_only() && fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1);
250
251 let (simple_alternation_indices, multi_prefilter) =
253 Self::detect_simple_alternation(nfa, fuzzy_bridge);
254
255 let first_char_class = nfa.first_char_class();
257
258 let ends_with_end_anchor = nfa.ends_with_end_anchor();
260
261 let max_simple_length = Self::calculate_max_length(nfa, fuzzy_bridge);
263
264 Matcher {
265 nfa,
266 fuzzy_bridge,
267 capture_count,
268 config,
269 prefilter,
270 is_simple_fuzzy,
271 simple_alternation_indices,
272 multi_prefilter,
273 first_char_class,
274 ends_with_end_anchor,
275 max_simple_length,
276 }
277 }
278
279 fn calculate_max_length(nfa: &Nfa, fuzzy_bridge: Option<&FuzzyBridge>) -> Option<usize> {
281 if let Some(len) = nfa.max_simple_length() {
283 return Some(len);
284 }
285
286 let (_, max_len) = nfa.length_range(|pattern_idx| {
288 fuzzy_bridge.and_then(|b| {
289 let char_len = b.pattern_char_len(pattern_idx)?;
290 let max_edits = b.pattern_max_edits(pattern_idx).unwrap_or(0);
291 Some((char_len, max_edits))
292 })
293 });
294
295 max_len
296 }
297
298 fn detect_simple_alternation(
301 nfa: &Nfa,
302 fuzzy_bridge: Option<&FuzzyBridge>,
303 ) -> (Option<Vec<usize>>, Option<super::prefilter::Prefilter>) {
304 let indices = nfa.get_alternation_pattern_indices();
305 if indices.is_empty() {
306 return (None, None);
307 }
308
309 let Some(bridge) = fuzzy_bridge else {
310 return (None, None);
311 };
312
313 let mut literals: Vec<(&str, u8)> = Vec::with_capacity(indices.len());
315 for &idx in &indices {
316 if let Some(text) = bridge.pattern_text(idx) {
317 let max_edits = bridge.pattern_max_edits(idx).unwrap_or(0);
318 literals.push((text, max_edits));
319 }
320 }
321
322 if literals.is_empty() {
323 return (None, None);
324 }
325
326 let multi_pf = super::prefilter::Prefilter::multi_fuzzy(&literals, false);
327
328 if multi_pf.is_active() {
330 (Some(indices), Some(multi_pf))
331 } else {
332 (Some(indices), None)
333 }
334 }
335
336 #[must_use]
338 pub fn find(&self, text: &str) -> Option<MatchResult> {
339 if !self.config.global
343 && !self.config.best_match
344 && !self.config.enhance_match
345 && !self.config.posix
346 && self.config.unanchored
347 && self.simple_alternation_indices.is_some()
348 && self.multi_prefilter.is_some()
349 {
350 return self.find_multi_pattern_fast(text);
351 }
352
353 if !self.config.global
357 && !self.config.best_match
358 && !self.config.enhance_match
359 && !self.config.posix
360 && self.config.unanchored
361 && self.simple_alternation_indices.is_some()
362 && self.multi_prefilter.is_none()
363 {
364 return self.find_multi_pattern_individual_fallback(text);
365 }
366
367 if !self.config.global
371 && !self.config.best_match
372 && !self.config.enhance_match
373 && self.config.unanchored
374 && (self.prefilter.is_active() || self.config.greedy_first)
375 && self.fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1)
376 {
377 return self.find_greedy_first(text);
378 }
379
380 if !self.config.global
385 && !self.config.best_match
386 && !self.config.enhance_match
387 && self.config.unanchored
388 && (!self.prefilter.is_active() || self.config.greedy_first)
389 && self.is_simple_fuzzy
390 && self.fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1)
391 {
392 return self.find_streaming_fallback(text);
393 }
394
395 if !self.config.global
399 && !self.config.best_match
400 && !self.config.enhance_match
401 && self.config.unanchored
402 && !self.prefilter.is_active()
403 && self.fuzzy_bridge.is_some_and(|b| b.pattern_count() == 1)
404 && self.capture_count == 0
405 {
406 if let Some(bridge) = self.fuzzy_bridge {
408 if let Some(result) = bridge.search_first(text, self.config.threshold, 0) {
409 let mut captures = CaptureState::new(self.capture_count);
410 captures.set_full_match(result.start, result.end);
411 return Some(MatchResult {
412 start: result.start,
413 end: result.end,
414 similarity: result.similarity,
415 edits: EditCounts {
416 insertions: result.insertions,
417 deletions: result.deletions,
418 substitutions: result.substitutions,
419 swaps: result.swaps,
420 },
421 captures,
422 });
423 }
424 return None;
425 }
426 }
427
428 if !self.config.unanchored && !self.config.best_match && !self.config.enhance_match {
433 let needs_fuzzy_cache = self.fuzzy_bridge.is_some_and(|b| !b.is_all_exact());
435
436 if !needs_fuzzy_cache {
437 return self.find_at_with_cache(text, 0, None);
439 }
440
441 let cached = self.fuzzy_bridge.map(|b| {
444 if b.pattern_count() == 1 {
445 b.search_cached_at_position(text, 0, self.config.threshold)
446 } else {
447 b.search_all(text, self.config.threshold)
449 }
450 });
451 return self.find_at_with_cache(text, 0, cached.as_ref());
452 }
453
454 let cached = self.fuzzy_bridge.map(|b| {
458 if self.prefilter.is_active() && b.pattern_count() == 1 {
459 b.search_all_with_prefilter(text, self.config.threshold, &self.prefilter)
460 } else {
461 b.search_all(text, self.config.threshold)
462 }
463 });
464
465 if self.config.best_match {
466 self.find_best_with_cache(text, cached.as_ref())
468 } else if self.config.posix {
469 self.find_posix_with_cache(text, cached.as_ref())
471 } else if self.config.unanchored {
472 if self.prefilter.is_active() {
474 let mut last_tried = None;
475 let max_offset = self.prefilter.max_offset();
476 for candidate in self.prefilter.find_candidates(text.as_bytes()) {
477 for offset in 0..=max_offset {
480 let pos = candidate + offset;
481 if pos > text.len() {
482 continue;
483 }
484 let idx = Self::snap_to_char_boundary(text, pos);
486 if last_tried == Some(idx) {
487 continue;
488 }
489 last_tried = Some(idx);
490
491 if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
492 return Some(m);
493 }
494 }
495 }
496 self.find_at_with_cache(text, text.len(), cached.as_ref())
498 } else if self.ends_with_end_anchor && !self.config.multi_line {
499 if let Some(max_len) = self.max_simple_length {
502 let bytes = text.as_bytes();
505 let mut positions = Vec::with_capacity(max_len + 1);
506 let mut byte_pos = bytes.len();
507 let mut chars_counted = 0;
508
509 while byte_pos > 0 && chars_counted < max_len {
510 byte_pos -= 1;
511 if bytes[byte_pos] & 0b1100_0000 != 0b1000_0000 {
513 positions.push(byte_pos);
514 chars_counted += 1;
515 }
516 }
517
518 for &idx in &positions {
520 if let Some(ref fcc) = self.first_char_class {
521 let ch = text[idx..].chars().next().unwrap();
522 if !fcc.matches(ch) {
523 continue;
524 }
525 }
526 if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
527 return Some(m);
528 }
529 }
530 self.find_at_with_cache(text, text.len(), cached.as_ref())
531 } else {
532 let positions: Vec<_> = text.char_indices().map(|(idx, _)| idx).collect();
534 for &idx in positions.iter().rev() {
535 if let Some(ref fcc) = self.first_char_class {
536 let ch = text[idx..].chars().next().unwrap();
537 if !fcc.matches(ch) {
538 continue;
539 }
540 }
541 if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
542 return Some(m);
543 }
544 }
545 self.find_at_with_cache(text, text.len(), cached.as_ref())
546 }
547 } else {
548 for (idx, ch) in text.char_indices() {
550 if let Some(ref fcc) = self.first_char_class
552 && !fcc.matches(ch)
553 {
554 continue;
555 }
556 if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
557 return Some(m);
558 }
559 }
560 self.find_at_with_cache(text, text.len(), cached.as_ref())
561 }
562 } else {
563 self.find_at_with_cache(text, 0, cached.as_ref())
564 }
565 }
566
567 fn find_greedy_first(&self, text: &str) -> Option<MatchResult> {
570 let bridge = self.fuzzy_bridge?;
571 let max_offset = self.prefilter.max_offset();
572 let text_bytes = text.as_bytes();
573
574 if self.is_simple_fuzzy && !self.prefilter.is_active() {
576 if let Some((start, result)) = bridge.find_first_guard_nfa(text, self.config.threshold)
577 {
578 let mut captures = CaptureState::new(self.capture_count);
579 captures.set_full_match(start, result.end);
580 return Some(MatchResult {
581 start,
582 end: result.end,
583 similarity: result.similarity,
584 edits: EditCounts {
585 insertions: result.insertions,
586 deletions: result.deletions,
587 substitutions: result.substitutions,
588 swaps: result.swaps,
589 },
590 captures,
591 });
592 }
593 return None;
594 }
595
596 let use_prefilter =
599 self.prefilter.is_active() && self.prefilter.is_selective() && self.is_simple_fuzzy;
600 if use_prefilter {
601 if let Some((start, result)) =
602 bridge.find_first_lazy(text_bytes, self.config.threshold, &self.prefilter)
603 {
604 let mut captures = CaptureState::new(self.capture_count);
605 captures.set_full_match(start, result.end);
606 return Some(MatchResult {
607 start,
608 end: result.end,
609 similarity: result.similarity,
610 edits: EditCounts::from_fuzzy_result(&result),
611 captures,
612 });
613 }
614 return None;
616 }
617
618 if let Some((start, result)) =
620 bridge.find_first_boyer_moore(text_bytes, self.config.threshold, max_offset)
621 {
622 if self.is_simple_fuzzy {
625 let mut captures = CaptureState::new(self.capture_count);
626 captures.set_full_match(start, result.end);
627 return Some(MatchResult {
628 start,
629 end: result.end,
630 similarity: result.similarity,
631 edits: EditCounts::from_fuzzy_result(&result),
632 captures,
633 });
634 }
635
636 let mut cached = CachedMatches::default();
638 cached.insert(0, start, result);
639
640 if let Some(m) = self.find_at_with_cache(text, start, Some(&cached)) {
641 return Some(m);
642 }
643 }
644
645 let mut last_tried = None;
647 for candidate in self.prefilter.find_candidates(text_bytes) {
648 for offset in 0..=max_offset {
649 let pos = candidate + offset;
650 if pos > text.len() {
651 continue;
652 }
653
654 let idx = Self::snap_to_char_boundary(text, pos);
655 if last_tried == Some(idx) {
656 continue;
657 }
658 last_tried = Some(idx);
659
660 if let Some((start, result)) =
662 bridge.search_at_position(text, idx, self.config.threshold)
663 {
664 if self.is_simple_fuzzy {
666 let mut captures = CaptureState::new(self.capture_count);
667 captures.set_full_match(start, result.end);
668 return Some(MatchResult {
669 start,
670 end: result.end,
671 similarity: result.similarity,
672 edits: EditCounts::from_fuzzy_result(&result),
673 captures,
674 });
675 }
676
677 let mut cached = CachedMatches::default();
678 cached.insert(0, start, result);
679
680 if let Some(m) = self.find_at_with_cache(text, start, Some(&cached)) {
681 return Some(m);
682 }
683 }
684 }
685 }
686
687 None
688 }
689
690 fn find_multi_pattern_fast(&self, text: &str) -> Option<MatchResult> {
693 let bridge = self.fuzzy_bridge?;
694 let indices = self.simple_alternation_indices.as_ref()?;
695 let multi_prefilter = self.multi_prefilter.as_ref()?;
696
697 let text_bytes = text.as_bytes();
698
699 let has_fuzzy_edits = indices
703 .iter()
704 .any(|&idx| bridge.pattern_max_edits(idx).unwrap_or(0) > 0);
705
706 let use_individual = if has_fuzzy_edits {
711 true
713 } else if multi_prefilter.is_selective() {
714 false
715 } else {
716 let min_pattern_len = indices
718 .iter()
719 .filter_map(|&idx| bridge.pattern_text(idx))
720 .map(str::len)
721 .min()
722 .unwrap_or(0);
723 min_pattern_len >= 8
725 };
726
727 let result = if use_individual {
728 bridge.find_first_multi_pattern_individual(text_bytes, self.config.threshold, indices)
729 } else {
730 bridge.find_first_multi_pattern(
731 text_bytes,
732 self.config.threshold,
733 indices,
734 multi_prefilter,
735 )
736 };
737
738 if let Some((_pattern_idx, start, result)) = result {
739 let mut captures = CaptureState::new(self.capture_count);
740 captures.set_full_match(start, result.end);
741 return Some(MatchResult {
742 start,
743 end: result.end,
744 similarity: result.similarity,
745 edits: EditCounts::from_fuzzy_result(&result),
746 captures,
747 });
748 }
749
750 None
751 }
752
753 fn find_multi_pattern_individual_fallback(&self, text: &str) -> Option<MatchResult> {
756 let bridge = self.fuzzy_bridge?;
757 let indices = self.simple_alternation_indices.as_ref()?;
758
759 let text_bytes = text.as_bytes();
760
761 if let Some((_pattern_idx, start, result)) =
763 bridge.find_first_multi_pattern_individual(text_bytes, self.config.threshold, indices)
764 {
765 let mut captures = CaptureState::new(self.capture_count);
766 captures.set_full_match(start, result.end);
767 return Some(MatchResult {
768 start,
769 end: result.end,
770 similarity: result.similarity,
771 edits: EditCounts::from_fuzzy_result(&result),
772 captures,
773 });
774 }
775
776 None
777 }
778
779 fn find_streaming_fallback(&self, text: &str) -> Option<MatchResult> {
782 let bridge = self.fuzzy_bridge?;
783 let text_bytes = text.as_bytes();
784
785 if let Some((start, result)) = bridge
787 .find_first_multi_pattern_individual(text_bytes, self.config.threshold, &[0])
788 .map(|(_, start, result)| (start, result))
789 {
790 let mut captures = CaptureState::new(self.capture_count);
791 captures.set_full_match(start, result.end);
792 return Some(MatchResult {
793 start,
794 end: result.end,
795 similarity: result.similarity,
796 edits: EditCounts::from_fuzzy_result(&result),
797 captures,
798 });
799 }
800
801 None
802 }
803
804 #[inline]
806 fn snap_to_char_boundary(text: &str, pos: usize) -> usize {
807 if pos >= text.len() {
808 return text.len();
809 }
810 let bytes = text.as_bytes();
812 let mut p = pos;
813 while p < bytes.len() && (bytes[p] & 0b1100_0000) == 0b1000_0000 {
814 p += 1;
815 }
816 p
817 }
818
819 #[cfg(test)]
824 fn find_no_cache(&self, text: &str) -> Option<MatchResult> {
825 if self.config.unanchored {
826 for (idx, _) in text.char_indices() {
827 if let Some(m) = self.find_at(text, idx) {
828 return Some(m);
829 }
830 }
831 self.find_at(text, text.len())
832 } else {
833 self.find_at(text, 0)
834 }
835 }
836
837 fn find_best_with_cache(
839 &self,
840 text: &str,
841 cached: Option<&CachedMatches>,
842 ) -> Option<MatchResult> {
843 let mut best: Option<MatchResult> = None;
844
845 let is_better_match = |m: &MatchResult, current: &MatchResult| -> bool {
849 let m_cost = m.edits.cost(1, 1, 1, 1);
851 let current_cost = current.edits.cost(1, 1, 1, 1);
852 if m_cost != current_cost {
853 return m_cost < current_cost;
854 }
855
856 if (m.similarity - current.similarity).abs() > f32::EPSILON {
858 return m.similarity > current.similarity;
859 }
860
861 let m_len = m.end - m.start;
863 let current_len = current.end - current.start;
864 if m_len != current_len {
865 return m_len > current_len;
866 }
867
868 m.start < current.start
870 };
871
872 for (idx, _) in text.char_indices() {
874 if let Some(m) = self.find_at_with_cache(text, idx, cached) {
875 if m.edits.cost(1, 1, 1, 1) == 0 {
877 return Some(m);
878 }
879
880 let dominated = best
881 .as_ref()
882 .is_some_and(|current| !is_better_match(&m, current));
883 if !dominated {
884 best = Some(m);
885 }
886 }
887 }
888
889 if let Some(m) = self.find_at_with_cache(text, text.len(), cached) {
891 if m.edits.total() == 0 {
892 return Some(m);
893 }
894
895 let dominated = best
896 .as_ref()
897 .is_some_and(|current| !is_better_match(&m, current));
898 if !dominated {
899 best = Some(m);
900 }
901 }
902
903 best
904 }
905
906 fn find_posix_with_cache(
908 &self,
909 text: &str,
910 _cached: Option<&CachedMatches>,
911 ) -> Option<MatchResult> {
912 let is_all_exact = self.fuzzy_bridge.is_none_or(FuzzyBridge::is_all_exact);
916 let fuzzy_cached = if is_all_exact {
917 None
918 } else {
919 self.fuzzy_bridge
920 .map(|b| b.search_all(text, self.config.threshold))
921 };
922
923 let mut best: Option<MatchResult> = None;
924
925 for (idx, _) in text.char_indices() {
927 if let Some(m) = self.find_at_with_cache(text, idx, fuzzy_cached.as_ref()) {
928 if m.start != idx {
929 continue;
930 }
931
932 match &best {
933 None => best = Some(m),
934 Some(current) => {
935 if m.start < current.start
936 || (m.start == current.start && m.end > current.end)
937 {
938 best = Some(m);
939 }
940 }
941 }
942 }
943 }
944
945 best
946 }
947
948 #[must_use]
950 pub fn find_all(&self, text: &str) -> Vec<MatchResult> {
951 let is_all_exact = self.fuzzy_bridge.is_none_or(FuzzyBridge::is_all_exact);
954
955 let cached = if is_all_exact {
958 None
959 } else {
960 self.fuzzy_bridge
961 .map(|b| b.search_all(text, self.config.threshold))
962 };
963
964 let mut matches = Vec::new();
965
966 if self.ends_with_end_anchor
969 && !self.config.multi_line
970 && let Some(max_len) = self.max_simple_length
971 {
972 let bytes = text.as_bytes();
974 let mut positions = Vec::with_capacity(max_len + 1);
975 let mut byte_pos = bytes.len();
976 let mut chars_counted = 0;
977
978 while byte_pos > 0 && chars_counted < max_len {
979 byte_pos -= 1;
980 if bytes[byte_pos] & 0b1100_0000 != 0b1000_0000 {
981 positions.push(byte_pos);
982 chars_counted += 1;
983 }
984 }
985
986 for &idx in &positions {
988 if let Some(ref fcc) = self.first_char_class {
989 let ch = text[idx..].chars().next().unwrap();
990 if !fcc.matches(ch) {
991 continue;
992 }
993 }
994 if let Some(m) = self.find_at_with_cache(text, idx, cached.as_ref()) {
995 matches.push(m);
996 break; }
998 }
999
1000 if matches.is_empty()
1002 && let Some(m) = self.find_at_with_cache(text, text.len(), cached.as_ref())
1003 {
1004 matches.push(m);
1005 }
1006
1007 return matches;
1008 }
1009
1010 if let Some(ref cache) = cached
1014 && !cache.is_empty()
1015 {
1016 return self.find_all_with_literal_guide(text, cache);
1017 }
1018
1019 let mut pos = 0;
1020
1021 while pos < text.len() {
1022 let ch = text[pos..].chars().next();
1024
1025 let should_try = match (&self.first_char_class, ch) {
1027 (Some(fcc), Some(c)) => fcc.matches(c),
1028 _ => true, };
1030
1031 if should_try {
1032 let result = self.find_at_with_cache(text, pos, cached.as_ref());
1033 if let Some(m) = result {
1034 let end = m.end;
1035 matches.push(m);
1036 pos = if end > pos { end } else { pos + 1 };
1038 continue;
1039 }
1040 }
1041
1042 pos = text[pos..]
1044 .char_indices()
1045 .nth(1)
1046 .map_or(text.len() + 1, |(i, _)| pos + i);
1047 }
1048
1049 if self.first_char_class.is_none()
1051 && let Some(m) = self.find_at_with_cache(text, text.len(), cached.as_ref())
1052 {
1053 matches.push(m);
1054 }
1055
1056 matches
1057 }
1058
1059 #[must_use]
1064 pub fn find_n(&self, text: &str, n: usize) -> Vec<MatchResult> {
1065 if n == 0 {
1066 return Vec::new();
1067 }
1068
1069 let is_all_exact = self.fuzzy_bridge.is_none_or(FuzzyBridge::is_all_exact);
1071
1072 let cached = if is_all_exact {
1074 None
1075 } else {
1076 self.fuzzy_bridge
1077 .map(|b| b.search_all(text, self.config.threshold))
1078 };
1079
1080 let mut matches = Vec::with_capacity(n);
1081 let mut pos = 0;
1082
1083 while pos < text.len() && matches.len() < n {
1084 let ch = text[pos..].chars().next();
1086
1087 let should_try = match (&self.first_char_class, ch) {
1089 (Some(fcc), Some(c)) => fcc.matches(c),
1090 _ => true,
1091 };
1092
1093 if should_try && let Some(m) = self.find_at_with_cache(text, pos, cached.as_ref()) {
1094 let end = m.end;
1095 matches.push(m);
1096 pos = if end > pos { end } else { pos + 1 };
1098 continue;
1099 }
1100
1101 pos = text[pos..]
1103 .char_indices()
1104 .nth(1)
1105 .map_or(text.len() + 1, |(i, _)| pos + i);
1106 }
1107
1108 matches
1109 }
1110
1111 fn find_all_with_literal_guide(&self, text: &str, cached: &CachedMatches) -> Vec<MatchResult> {
1114 let mut matches = Vec::new();
1115
1116 let mut literal_positions: Vec<usize> = cached
1118 .iter()
1119 .flat_map(|((_, start), _)| std::iter::once(start))
1120 .collect();
1121 literal_positions.sort_unstable();
1122 literal_positions.dedup();
1123
1124 if literal_positions.is_empty() {
1125 return matches;
1126 }
1127
1128 const MAX_LOOKBACK: usize = 256;
1131
1132 let mut pos = 0;
1133 let mut lit_idx = 0;
1134
1135 while pos < text.len() && lit_idx < literal_positions.len() {
1136 let next_lit_pos = literal_positions[lit_idx];
1137
1138 if pos + MAX_LOOKBACK < next_lit_pos {
1140 pos = next_lit_pos.saturating_sub(MAX_LOOKBACK);
1142 pos = Self::snap_to_char_boundary(text, pos);
1144 }
1145
1146 let ch = text[pos..].chars().next();
1148
1149 let should_try = match (&self.first_char_class, ch) {
1151 (Some(fcc), Some(c)) => fcc.matches(c),
1152 _ => true,
1153 };
1154
1155 if should_try && let Some(m) = self.find_at_with_cache(text, pos, Some(cached)) {
1156 let end = m.end;
1157 matches.push(m);
1158 pos = if end > pos { end } else { pos + 1 };
1160 while lit_idx < literal_positions.len() && literal_positions[lit_idx] < pos {
1162 lit_idx += 1;
1163 }
1164 continue;
1165 }
1166
1167 let next_pos = text[pos..]
1169 .char_indices()
1170 .nth(1)
1171 .map_or(text.len() + 1, |(i, _)| pos + i);
1172
1173 if next_pos > next_lit_pos {
1175 lit_idx += 1;
1176 }
1177
1178 pos = next_pos;
1179 }
1180
1181 matches
1182 }
1183
1184 #[must_use]
1190 pub fn find_at(&self, text: &str, start: usize) -> Option<MatchResult> {
1191 self.find_at_with_cache(text, start, None)
1192 }
1193
1194 pub fn build_cache(&self, text: &str) -> Option<CachedMatches> {
1199 let is_all_exact = self.fuzzy_bridge.is_none_or(FuzzyBridge::is_all_exact);
1201 if is_all_exact {
1202 return None;
1203 }
1204 self.fuzzy_bridge
1205 .map(|b| b.search_all(text, self.config.threshold))
1206 }
1207
1208 #[must_use]
1210 pub fn find_at_with_cache(
1211 &self,
1212 text: &str,
1213 start: usize,
1214 cached: Option<&CachedMatches>,
1215 ) -> Option<MatchResult> {
1216 use super::hash::FxHashSet;
1217
1218 let mut threads = vec![Thread {
1219 state: self.nfa.start,
1220 pos: start,
1221 match_start: start,
1222 captures: CaptureState::new(self.capture_count),
1223 similarity: 1.0,
1224 edits: EditCounts::default(),
1225 }];
1226
1227 let mut best_match: Option<MatchResult> = None;
1228
1229 let mut visited: FxHashSet<(StateId, usize)> = FxHashSet::default();
1231 visited.insert((self.nfa.start, start));
1232
1233 while !threads.is_empty() {
1234 if self.config.prefer_shortest && best_match.is_some() {
1236 break;
1237 }
1238
1239 let mut next_threads = Vec::new();
1240
1241 for thread in threads {
1242 self.step_thread_with_cache(
1243 text,
1244 start,
1245 thread,
1246 &mut next_threads,
1247 &mut best_match,
1248 cached,
1249 );
1250 }
1251
1252 let mut deduped = Vec::with_capacity(next_threads.len());
1256 if self.config.enhance_match {
1257 use super::hash::FxHashMap;
1260 let mut best_at_pos: FxHashMap<(StateId, usize), usize> = FxHashMap::default();
1261 for (idx, thread) in next_threads.iter().enumerate() {
1262 let key = (thread.state, thread.pos);
1263 if let Some(&existing_idx) = best_at_pos.get(&key) {
1264 let new_cost = thread.edits.cost(1, 1, 1, 1);
1267 let existing_cost = next_threads[existing_idx].edits.cost(1, 1, 1, 1);
1268 if new_cost < existing_cost {
1269 best_at_pos.insert(key, idx);
1270 }
1271 } else if visited.insert(key) {
1272 best_at_pos.insert(key, idx);
1273 }
1274 }
1275 let mut indices: Vec<_> = best_at_pos.values().copied().collect();
1277 indices.sort_unstable();
1278 for idx in indices {
1279 deduped.push(next_threads[idx].clone());
1280 }
1281 } else {
1282 for thread in next_threads {
1284 let key = (thread.state, thread.pos);
1285 if visited.insert(key) {
1286 deduped.push(thread);
1287 }
1288 }
1289 }
1290 next_threads = deduped;
1291
1292 if next_threads.len() > self.config.max_threads {
1294 next_threads.sort_by(|a, b| {
1295 b.similarity
1296 .partial_cmp(&a.similarity)
1297 .unwrap_or(std::cmp::Ordering::Equal)
1298 });
1299 next_threads.truncate(self.config.max_threads);
1300 }
1301
1302 threads = next_threads;
1303 }
1304
1305 best_match
1306 }
1307
1308 fn step_thread_with_cache(
1310 &self,
1311 text: &str,
1312 _start: usize,
1313 thread: Thread,
1314 next_threads: &mut Vec<Thread>,
1315 best_match: &mut Option<MatchResult>,
1316 cached: Option<&CachedMatches>,
1317 ) {
1318 let state = &self.nfa.states[thread.state];
1319
1320 match state {
1321 State::Accept => {
1322 let mut captures = thread.captures.clone();
1323 captures.set_full_match(thread.match_start, thread.pos);
1324
1325 let m = MatchResult {
1326 start: thread.match_start,
1327 end: thread.pos,
1328 similarity: thread.similarity,
1329 edits: thread.edits.clone(),
1330 captures,
1331 };
1332
1333 let prefer_shorter = self.config.prefer_shortest;
1340 let has_alternation = self.nfa.is_simple_alternation();
1342 let has_fuzzy = self
1344 .simple_alternation_indices
1345 .as_ref()
1346 .is_some_and(|indices| {
1347 self.fuzzy_bridge.is_some_and(|bridge| {
1348 indices
1349 .iter()
1350 .any(|&idx| bridge.pattern_max_edits(idx).unwrap_or(0) > 0)
1351 })
1352 });
1353 let is_simple_alt = has_alternation && !has_fuzzy;
1354 let is_special_mode = self.config.best_match || self.config.enhance_match;
1355 if best_match.as_ref().is_none_or(|best| {
1356 m.start < best.start
1358 || (m.start == best.start && {
1360 if is_special_mode {
1361 m.similarity > best.similarity
1363 || (m.similarity == best.similarity && if prefer_shorter {
1364 m.end - m.start < best.end - best.start
1365 } else {
1366 m.end - m.start > best.end - best.start
1367 })
1368 } else if is_simple_alt {
1369 false
1371 } else {
1372 m.similarity > best.similarity
1374 || (m.similarity == best.similarity && if prefer_shorter {
1375 m.end - m.start < best.end - best.start
1376 } else {
1377 m.end - m.start > best.end - best.start
1378 })
1379 }
1380 })
1381 }) {
1382 *best_match = Some(m);
1383 }
1384 }
1385
1386 State::Epsilon { targets } => {
1387 for &target in targets {
1388 next_threads.push(Thread {
1389 state: target,
1390 ..thread.clone()
1391 });
1392 }
1393 }
1394
1395 State::Split { branches, greedy } => {
1396 if *greedy {
1399 for &branch in branches {
1400 next_threads.push(Thread {
1401 state: branch,
1402 ..thread.clone()
1403 });
1404 }
1405 } else {
1406 for &branch in branches.iter().rev() {
1407 next_threads.push(Thread {
1408 state: branch,
1409 ..thread.clone()
1410 });
1411 }
1412 }
1413 }
1414
1415 State::Char { class, next } => {
1416 if thread.pos < text.len()
1417 && let Some(ch) = text[thread.pos..].chars().next()
1418 && class.matches(ch)
1419 {
1420 next_threads.push(Thread {
1421 state: *next,
1422 pos: thread.pos + ch.len_utf8(),
1423 ..thread
1424 });
1425 }
1426 }
1427
1428 State::FuzzyChar {
1429 class,
1430 limits,
1431 min_edits: _,
1432 cost_constraint: _,
1433 next,
1434 } => {
1435 let max_edits = limits
1437 .as_ref()
1438 .and_then(FuzzyLimits::get_edits)
1439 .unwrap_or(u8::MAX);
1440 let max_deletions = limits
1441 .as_ref()
1442 .and_then(FuzzyLimits::get_deletions)
1443 .unwrap_or(max_edits);
1444 let max_substitutions = limits
1445 .as_ref()
1446 .and_then(FuzzyLimits::get_substitutions)
1447 .unwrap_or(max_edits);
1448
1449 let current_edits = thread.edits.total();
1450
1451 let edit_penalty = if max_edits > 0 {
1453 1.0 / (f32::from(max_edits) + 1.0)
1454 } else {
1455 1.0
1456 };
1457
1458 if thread.pos < text.len()
1460 && let Some(ch) = text[thread.pos..].chars().next()
1461 {
1462 if class.matches(ch) {
1463 next_threads.push(Thread {
1465 state: *next,
1466 pos: thread.pos + ch.len_utf8(),
1467 ..thread.clone()
1468 });
1469 } else if current_edits < max_edits
1470 && thread.edits.substitutions < max_substitutions
1471 {
1472 let mut new_edits = thread.edits.clone();
1474 new_edits.substitutions += 1;
1475 next_threads.push(Thread {
1476 state: *next,
1477 pos: thread.pos + ch.len_utf8(),
1478 similarity: thread.similarity * (1.0 - edit_penalty),
1479 edits: new_edits,
1480 ..thread.clone()
1481 });
1482 }
1483 }
1484
1485 if current_edits < max_edits && thread.edits.deletions < max_deletions {
1487 let mut new_edits = thread.edits.clone();
1488 new_edits.deletions += 1;
1489 next_threads.push(Thread {
1490 state: *next,
1491 pos: thread.pos, similarity: thread.similarity * (1.0 - edit_penalty),
1493 edits: new_edits,
1494 ..thread.clone()
1495 });
1496 }
1497 }
1498
1499 State::FuzzyLiteral {
1500 pattern_index,
1501 next,
1502 limits,
1503 min_edits,
1504 cost_constraint,
1505 } => {
1506 if let Some(bridge) = self.fuzzy_bridge {
1507 let max_edits = limits.as_ref().map(|l| {
1510 l.get_edits().unwrap_or_else(|| {
1511 let i = l.get_insertions().unwrap_or(0);
1512 let d = l.get_deletions().unwrap_or(0);
1513 let s = l.get_substitutions().unwrap_or(0);
1514 let t = l.get_swaps().unwrap_or(0);
1515 i.saturating_add(d).saturating_add(s).saturating_add(t)
1516 })
1517 });
1518
1519 if max_edits.is_none() || max_edits == Some(0) {
1520 if let Some(pattern_text) = bridge.pattern_text(*pattern_index)
1522 && thread.pos + pattern_text.len() <= text.len()
1523 && text[thread.pos..].starts_with(pattern_text)
1524 {
1525 let new_end = thread.pos + pattern_text.len();
1526 if new_end > thread.pos {
1528 next_threads.push(Thread {
1529 state: *next,
1530 pos: new_end,
1531 similarity: thread.similarity,
1532 edits: thread.edits.clone(),
1533 captures: thread.captures.clone(),
1534 ..thread
1535 });
1536 }
1537 }
1538 } else {
1540 let expected_end = self.find_expected_end(*next, text.len());
1541
1542 let meets_min_edits = |result: &FuzzyMatchResult| -> bool {
1543 match min_edits {
1544 Some(min) => result.total_edits() >= *min,
1545 None => true,
1546 }
1547 };
1548
1549 let meets_cost_constraint = |result: &FuzzyMatchResult| -> bool {
1550 match cost_constraint {
1551 Some(constraint) => constraint.is_satisfied(
1552 result.insertions,
1553 result.deletions,
1554 result.substitutions,
1555 result.swaps,
1556 ),
1557 None => true,
1558 }
1559 };
1560
1561 let meets_all_constraints = |result: &FuzzyMatchResult| -> bool {
1562 meets_min_edits(result) && meets_cost_constraint(result)
1563 };
1564
1565 let result = if let Some(cache) = cached {
1567 bridge.find_at_cached(cache, *pattern_index, thread.pos)
1568 } else {
1569 bridge.find_at(text, *pattern_index, thread.pos, self.config.threshold)
1570 };
1571
1572 if let Some(result) = result {
1573 let should_try_boundary = expected_end.is_some()
1574 && result.end < expected_end.unwrap()
1575 && limits.as_ref().is_some_and(|l| {
1576 l.get_insertions().unwrap_or(0) > 0
1577 || l.get_edits().unwrap_or(0) > 0
1578 });
1579
1580 if should_try_boundary
1581 && let Some(boundary_result) = bridge.find_with_boundary_insertions(
1582 text,
1583 *pattern_index,
1584 thread.pos,
1585 expected_end,
1586 self.config.threshold,
1587 cached,
1588 )
1589 && meets_all_constraints(&boundary_result)
1590 {
1591 next_threads.push(Thread {
1592 state: *next,
1593 pos: boundary_result.end,
1594 similarity: thread.similarity * boundary_result.similarity,
1595 edits: thread
1596 .edits
1597 .merge(&EditCounts::from_fuzzy_result(&boundary_result)),
1598 captures: thread.captures.clone(),
1599 ..thread
1600 });
1601 }
1602
1603 if meets_all_constraints(&result) && result.end > thread.pos {
1607 next_threads.push(Thread {
1608 state: *next,
1609 pos: result.end,
1610 similarity: thread.similarity * result.similarity,
1611 edits: thread
1612 .edits
1613 .merge(&EditCounts::from_fuzzy_result(&result)),
1614 captures: thread.captures.clone(),
1615 ..thread
1616 });
1617 }
1618 } else {
1619 let can_use_boundary = limits.as_ref().is_some_and(|l| {
1620 l.get_insertions().unwrap_or(0) > 0
1621 || l.get_edits().unwrap_or(0) > 0
1622 });
1623
1624 if can_use_boundary
1625 && let Some(result) = bridge.find_with_boundary_insertions(
1626 text,
1627 *pattern_index,
1628 thread.pos,
1629 expected_end,
1630 self.config.threshold,
1631 cached,
1632 )
1633 {
1634 if meets_all_constraints(&result) && result.end > thread.pos {
1636 next_threads.push(Thread {
1637 state: *next,
1638 pos: result.end,
1639 similarity: thread.similarity * result.similarity,
1640 edits: thread
1641 .edits
1642 .merge(&EditCounts::from_fuzzy_result(&result)),
1643 captures: thread.captures.clone(),
1644 ..thread
1645 });
1646 }
1647 }
1648 }
1649
1650 let max_edits_for_deletion = max_edits.unwrap_or(0);
1657 let max_deletions = limits
1658 .as_ref()
1659 .and_then(FuzzyLimits::get_deletions)
1660 .unwrap_or(max_edits_for_deletion);
1661
1662 if max_deletions > 0 && thread.pos > 0 && thread.pos < text.len() {
1665 if let Some(pattern_char_len) = bridge.pattern_char_len(*pattern_index)
1667 {
1668 let pattern_len = pattern_char_len as u8;
1669 let current_deletions = thread.edits.deletions;
1670
1671 if current_deletions + pattern_len <= max_deletions
1673 && (thread.edits.total() as usize + pattern_len as usize)
1674 <= max_edits_for_deletion as usize
1675 {
1676 let similarity = (1.0
1680 - f32::from(pattern_len) / f32::from(pattern_len))
1681 .max(0.0);
1682
1683 let deletion_result = FuzzyMatchResult {
1685 end: thread.pos,
1686 similarity,
1687 insertions: 0,
1688 deletions: pattern_len,
1689 substitutions: 0,
1690 swaps: 0,
1691 };
1692
1693 if meets_all_constraints(&deletion_result) {
1694 let mut new_edits = thread.edits.clone();
1695 new_edits.deletions += pattern_len;
1696 next_threads.push(Thread {
1697 state: *next,
1698 pos: thread.pos, similarity: thread.similarity
1700 * deletion_result.similarity,
1701 edits: new_edits,
1702 captures: thread.captures.clone(),
1703 ..thread
1704 });
1705 }
1706 }
1707 }
1708 }
1709 } }
1711 }
1712
1713 State::CaptureStart { index, next } => {
1714 let mut captures = thread.captures.clone();
1715 captures.start_capture(*index, thread.pos);
1716 next_threads.push(Thread {
1717 state: *next,
1718 captures,
1719 ..thread
1720 });
1721 }
1722
1723 State::CaptureEnd { index, next } => {
1724 let mut captures = thread.captures.clone();
1725 captures.end_capture(*index, thread.pos);
1726 next_threads.push(Thread {
1727 state: *next,
1728 captures,
1729 ..thread
1730 });
1731 }
1732
1733 State::Anchor { kind, next } => {
1734 let matches = match kind {
1735 Anchor::Start => {
1736 if self.config.multi_line {
1737 Self::is_line_start(text, thread.pos)
1738 } else {
1739 thread.pos == 0
1740 }
1741 }
1742 Anchor::End => {
1743 if self.config.multi_line {
1744 Self::is_line_end(text, thread.pos)
1745 } else {
1746 thread.pos == text.len()
1747 }
1748 }
1749 Anchor::WordBoundary => Self::is_word_boundary(text, thread.pos),
1750 Anchor::NotWordBoundary => !Self::is_word_boundary(text, thread.pos),
1751 };
1752
1753 if matches {
1754 next_threads.push(Thread {
1755 state: *next,
1756 ..thread
1757 });
1758 }
1759 }
1760
1761 State::Lookahead {
1762 positive,
1763 nfa,
1764 literals,
1765 next,
1766 } => {
1767 let lookahead_bridge = if literals.is_empty() {
1769 None
1770 } else {
1771 FuzzyBridge::new(literals, None, None, false)
1772 };
1773
1774 let sub_matcher = Matcher::new(
1776 nfa,
1777 lookahead_bridge.as_ref(),
1778 0,
1779 MatcherConfig {
1780 unanchored: false,
1781 ..self.config.clone()
1782 },
1783 );
1784
1785 let sub_result = sub_matcher.find_at(text, thread.pos);
1786
1787 let has_match = sub_result.is_some();
1788
1789 if has_match == *positive {
1790 next_threads.push(Thread {
1791 state: *next,
1792 ..thread
1793 });
1794 }
1795 }
1796
1797 State::Lookbehind {
1798 positive,
1799 nfa,
1800 literals,
1801 bridge,
1802 next,
1803 } => {
1804 let has_match =
1805 self.try_lookbehind(text, thread.pos, nfa, literals, bridge.as_deref());
1806
1807 if has_match == *positive {
1808 next_threads.push(Thread {
1809 state: *next,
1810 ..thread
1811 });
1812 }
1813 }
1814
1815 State::Backreference {
1816 group,
1817 next,
1818 limits,
1819 } => {
1820 if let Some((cap_start, cap_end)) = thread.captures.get(*group) {
1821 let captured = &text[cap_start..cap_end];
1822 let remaining = &text[thread.pos..];
1823
1824 if let Some(limits) = limits {
1825 let max_edits = limits.get_edits().unwrap_or(
1826 limits.get_insertions().unwrap_or(0)
1827 + limits.get_deletions().unwrap_or(0)
1828 + limits.get_substitutions().unwrap_or(0),
1829 );
1830
1831 let cap_len = captured.len();
1832 let min_len = cap_len.saturating_sub(max_edits as usize);
1833 let max_len = cap_len.saturating_add(max_edits as usize);
1834
1835 for end_len in min_len..=max_len.min(remaining.len()) {
1836 if !remaining.is_char_boundary(end_len) {
1837 continue;
1838 }
1839 let candidate = &remaining[..end_len];
1840 let distance = edit_distance_bounded(captured, candidate, max_edits);
1841
1842 if distance != Distance::MAX && distance <= max_edits.into() {
1843 next_threads.push(Thread {
1844 state: *next,
1845 pos: thread.pos + end_len,
1846 similarity: thread.similarity
1847 * (1.0
1848 - distance as f32 / cap_len.max(end_len).max(1) as f32),
1849 edits: thread.edits.merge(&EditCounts {
1850 substitutions: distance as u8,
1851 ..Default::default()
1852 }),
1853 captures: thread.captures.clone(),
1854 ..thread
1855 });
1856 }
1857 }
1858 } else if remaining.starts_with(captured) {
1859 next_threads.push(Thread {
1860 state: *next,
1861 pos: thread.pos + captured.len(),
1862 ..thread
1863 });
1864 }
1865 }
1866 }
1867
1868 State::ResetMatchStart { next } => {
1870 let mut new_thread = thread.clone();
1871 new_thread.match_start = thread.pos;
1872 new_thread.state = *next;
1873 next_threads.push(new_thread);
1874 }
1875 State::AtomicGroup { .. }
1876 | State::RecursivePattern { .. }
1877 | State::RecursiveGroup { .. }
1878 | State::RecursiveNamedGroup { .. } => {}
1879 }
1880 }
1881
1882 #[inline]
1885 fn is_word_boundary(text: &str, pos: usize) -> bool {
1886 let bytes = text.as_bytes();
1887
1888 let before_is_word = if pos > 0 {
1890 let mut start = pos - 1;
1893 while start > 0 && (bytes[start] & 0xC0) == 0x80 {
1894 start -= 1;
1895 }
1896 text[start..pos]
1898 .chars()
1899 .next()
1900 .is_some_and(|c| c.is_alphanumeric() || c == '_')
1901 } else {
1902 false
1903 };
1904
1905 let after_is_word = text[pos..]
1907 .chars()
1908 .next()
1909 .is_some_and(|c| c.is_alphanumeric() || c == '_');
1910
1911 before_is_word != after_is_word
1912 }
1913
1914 #[inline]
1916 fn is_line_start(text: &str, pos: usize) -> bool {
1917 if pos == 0 {
1918 return true;
1919 }
1920 let bytes = text.as_bytes();
1922 bytes[pos - 1] == b'\n'
1923 }
1924
1925 #[inline]
1927 fn is_line_end(text: &str, pos: usize) -> bool {
1928 if pos == text.len() {
1929 return true;
1930 }
1931 let bytes = text.as_bytes();
1933 bytes[pos] == b'\n'
1934 }
1935
1936 fn find_expected_end(&self, state_id: StateId, text_len: usize) -> Option<usize> {
1939 let mut visited = vec![false; self.nfa.states.len()];
1940 self.find_expected_end_recursive(state_id, text_len, &mut visited)
1941 }
1942
1943 fn find_expected_end_recursive(
1945 &self,
1946 state_id: StateId,
1947 text_len: usize,
1948 visited: &mut Vec<bool>,
1949 ) -> Option<usize> {
1950 if visited[state_id] {
1951 return None;
1952 }
1953 visited[state_id] = true;
1954
1955 match &self.nfa.states[state_id] {
1956 State::Anchor {
1958 kind: Anchor::End,
1959 ..
1960 } => Some(text_len),
1961
1962 State::CaptureEnd { next, .. } => {
1963 self.find_expected_end_recursive(*next, text_len, visited)
1964 }
1965
1966 State::Anchor { next, .. } => self.find_expected_end_recursive(*next, text_len, visited),
1968
1969 State::Accept
1971 | State::Epsilon { .. }
1972 | State::Split { .. }
1973 | State::Char { .. }
1975 | State::FuzzyChar { .. }
1976 | State::FuzzyLiteral { .. }
1977 | State::Backreference { .. }
1978 | State::Lookahead { .. }
1979 | State::Lookbehind { .. }
1980 | State::CaptureStart { .. }
1981 | State::ResetMatchStart { .. }
1982 | State::AtomicGroup { .. }
1983 | State::RecursivePattern { .. }
1984 | State::RecursiveGroup { .. }
1985 | State::RecursiveNamedGroup { .. } => None,
1986 }
1987 }
1988
1989 fn try_lookbehind(
1995 &self,
1996 text: &str,
1997 pos: usize,
1998 nfa: &Nfa,
1999 literals: &[LiteralPattern],
2000 bridge: Option<&FuzzyBridge>,
2001 ) -> bool {
2002 if literals.len() == 1 {
2005 let lit = &literals[0];
2006 let max_edits = lit
2007 .limits
2008 .as_ref()
2009 .map_or(0, |l| l.get_edits().unwrap_or(0));
2010
2011 if max_edits == 0 {
2012 let pattern = &lit.text;
2014 let pattern_bytes = pattern.as_bytes();
2015 if pos >= pattern_bytes.len() {
2016 let start = pos - pattern_bytes.len();
2017 if &text.as_bytes()[start..pos] == pattern_bytes {
2018 return true;
2019 }
2020 }
2021 return false;
2022 }
2023 }
2024
2025 let (min_char_len, max_char_len) = nfa.length_range(|pattern_idx| {
2027 bridge
2028 .and_then(|b| {
2029 let char_len = b.pattern_char_len(pattern_idx)?;
2030 let max_edits = b.pattern_max_edits(pattern_idx).unwrap_or(0);
2031 Some((char_len, max_edits))
2032 })
2033 .or_else(|| {
2034 literals
2036 .get(pattern_idx)
2037 .map(|l| (l.text.chars().count(), 0))
2038 })
2039 });
2040
2041 let sub_matcher = Matcher::new(
2042 nfa,
2043 bridge,
2044 0,
2045 MatcherConfig {
2046 unanchored: false,
2047 ..self.config.clone()
2048 },
2049 );
2050
2051 if let Some(max) = max_char_len
2054 && min_char_len == max
2055 {
2056 let text_before = &text[..pos];
2059 let start_byte = Self::nth_char_from_end_byte_offset(text_before, min_char_len);
2060 if let Some(start) = start_byte
2061 && let Some(m) = sub_matcher.find_at(text, start)
2062 && m.end == pos
2063 {
2064 return true;
2065 }
2066 return false;
2067 }
2068
2069 let text_before = &text[..pos];
2071 let num_chars = text_before.chars().count();
2072
2073 if min_char_len > num_chars {
2075 return false;
2076 }
2077
2078 let max_char_search = max_char_len.unwrap_or(num_chars.min(256));
2079
2080 let search_start_char = num_chars.saturating_sub(max_char_search);
2082 let search_end_char = num_chars.saturating_sub(min_char_len);
2083
2084 let mut char_count = 0;
2086 let mut positions = Vec::new();
2087 for (byte_idx, _) in text_before.char_indices() {
2088 if char_count >= search_start_char && char_count <= search_end_char {
2089 positions.push(byte_idx);
2090 }
2091 char_count += 1;
2092 if char_count > search_end_char {
2093 break;
2094 }
2095 }
2096
2097 for start_byte in positions.into_iter().rev() {
2099 if let Some(m) = sub_matcher.find_at(text, start_byte)
2100 && m.end == pos
2101 {
2102 return true;
2103 }
2104 }
2105
2106 false
2107 }
2108
2109 fn nth_char_from_end_byte_offset(s: &str, n: usize) -> Option<usize> {
2113 if n == 0 {
2114 return Some(s.len());
2115 }
2116 let bytes = s.as_bytes();
2119 let mut byte_pos = bytes.len();
2120 let mut chars_seen = 0;
2121
2122 while byte_pos > 0 && chars_seen < n {
2123 byte_pos -= 1;
2124 if bytes[byte_pos] & 0b1100_0000 != 0b1000_0000 {
2126 chars_seen += 1;
2127 }
2128 }
2129
2130 if chars_seen == n {
2131 Some(byte_pos)
2132 } else {
2133 None
2134 }
2135 }
2136}
2137
2138#[cfg(test)]
2141#[inline]
2142fn edit_distance(s1: &str, s2: &str) -> usize {
2143 edit_distance_bounded(s1, s2, NumEdits::MAX).into()
2144}
2145
2146#[inline]
2149fn edit_distance_bounded(s1: &str, s2: &str, max_edits: NumEdits) -> Distance {
2150 let len_diff = s1.len().abs_diff(s2.len());
2152 if len_diff > max_edits as usize {
2153 return Distance::MAX;
2154 }
2155
2156 if s1.is_ascii() && s2.is_ascii() {
2158 return edit_distance_ascii_bounded(s1.as_bytes(), s2.as_bytes(), max_edits);
2159 }
2160
2161 let s1_chars: Vec<char> = s1.chars().collect();
2163 let s2_chars: Vec<char> = s2.chars().collect();
2164
2165 let char_diff = s1_chars.len().abs_diff(s2_chars.len());
2167 if char_diff > max_edits as usize {
2168 return Distance::MAX;
2169 }
2170
2171 edit_distance_slice_bounded(&s1_chars, &s2_chars, max_edits as usize)
2172}
2173
2174#[inline]
2176fn edit_distance_ascii_bounded(s1: &[u8], s2: &[u8], max_edits: NumEdits) -> Distance {
2177 let m = s1.len();
2178 let n = s2.len();
2179
2180 if m == 0 {
2181 return if n <= max_edits.into() {
2182 n as Distance
2183 } else {
2184 Distance::MAX
2185 };
2186 }
2187 if n == 0 {
2188 return if m <= max_edits.into() {
2189 m as Distance
2190 } else {
2191 Distance::MAX
2192 };
2193 }
2194
2195 const STACK_LIMIT: usize = 64;
2197 if n < STACK_LIMIT {
2198 let mut prev = [0usize; STACK_LIMIT];
2199 let mut curr = [0usize; STACK_LIMIT];
2200
2201 for j in 0..=n {
2202 prev[j] = j;
2203 }
2204
2205 for i in 1..=m {
2206 curr[0] = i;
2207 let mut row_min = curr[0];
2208 for j in 1..=n {
2209 let cost = usize::from(s1[i - 1] != s2[j - 1]);
2210 curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
2211 row_min = row_min.min(curr[j]);
2212 }
2213 if row_min > max_edits.into() {
2215 return Distance::MAX;
2216 }
2217 std::mem::swap(&mut prev, &mut curr);
2218 }
2219 return prev[n] as Distance;
2220 }
2221
2222 let mut prev: Vec<usize> = (0..=n).collect();
2224 let mut curr = vec![0; n + 1];
2225
2226 for i in 1..=m {
2227 curr[0] = i;
2228 let mut row_min = curr[0];
2229 for j in 1..=n {
2230 let cost = usize::from(s1[i - 1] != s2[j - 1]);
2231 curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
2232 row_min = row_min.min(curr[j]);
2233 }
2234 if row_min > max_edits.into() {
2235 return Distance::MAX;
2236 }
2237 std::mem::swap(&mut prev, &mut curr);
2238 }
2239 prev[n] as Distance
2240}
2241
2242#[inline]
2244fn edit_distance_slice_bounded(s1: &[char], s2: &[char], max_edits: usize) -> Distance {
2245 let m = s1.len();
2246 let n = s2.len();
2247
2248 if m == 0 {
2249 return if n <= max_edits {
2250 n as Distance
2251 } else {
2252 Distance::MAX
2253 };
2254 }
2255 if n == 0 {
2256 return if m <= max_edits {
2257 m as Distance
2258 } else {
2259 Distance::MAX
2260 };
2261 }
2262
2263 const STACK_LIMIT: usize = 64;
2265 if n < STACK_LIMIT {
2266 let mut prev = [0usize; STACK_LIMIT];
2267 let mut curr = [0usize; STACK_LIMIT];
2268
2269 for j in 0..=n {
2270 prev[j] = j;
2271 }
2272
2273 for i in 1..=m {
2274 curr[0] = i;
2275 let mut row_min = curr[0];
2276 for j in 1..=n {
2277 let cost = usize::from(s1[i - 1] != s2[j - 1]);
2278 curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
2279 row_min = row_min.min(curr[j]);
2280 }
2281 if row_min > max_edits {
2282 return Distance::MAX;
2283 }
2284 std::mem::swap(&mut prev, &mut curr);
2285 }
2286 return prev[n] as Distance;
2287 }
2288
2289 let mut prev: Vec<usize> = (0..=n).collect();
2291 let mut curr = vec![0; n + 1];
2292
2293 for i in 1..=m {
2294 curr[0] = i;
2295 let mut row_min = curr[0];
2296 for j in 1..=n {
2297 let cost = usize::from(s1[i - 1] != s2[j - 1]);
2298 curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
2299 row_min = row_min.min(curr[j]);
2300 }
2301 if row_min > max_edits {
2302 return Distance::MAX;
2303 }
2304 std::mem::swap(&mut prev, &mut curr);
2305 }
2306 prev[n] as Distance
2307}
2308
2309#[cfg(test)]
2310mod tests {
2311 use super::*;
2312 use crate::compiler::build_nfa;
2313 use crate::ir::lower;
2314 use crate::parser::parse;
2315
2316 fn make_matcher(pattern: &str) -> (Nfa, Option<FuzzyBridge>, usize) {
2317 let ast = parse(pattern).unwrap();
2318 let hir = lower(&ast, 0);
2319 let (nfa, literals) = build_nfa(&hir);
2320
2321 let bridge = FuzzyBridge::new(&literals, None, None, false);
2322
2323 let capture_count = count_captures(&ast);
2325
2326 (nfa, bridge, capture_count)
2327 }
2328
2329 fn count_captures(ast: &crate::parser::Ast) -> usize {
2330 use crate::parser::Ast;
2331 match ast {
2332 Ast::Group { index, expr, .. } => (*index).max(count_captures(expr)),
2333 Ast::NonCapturingGroup { expr, .. } => count_captures(expr),
2334 Ast::Concat(parts) => parts.iter().map(count_captures).max().unwrap_or(0),
2335 Ast::Alternation(alts) => alts.iter().map(count_captures).max().unwrap_or(0),
2336 Ast::Quantified { expr, .. } => count_captures(expr),
2337 Ast::Lookahead { expr, .. } => count_captures(expr),
2338 Ast::Lookbehind { expr, .. } => count_captures(expr),
2339 _ => 0,
2340 }
2341 }
2342
2343 #[test]
2344 fn test_simple_match() {
2345 let (nfa, bridge, captures) = make_matcher("hello");
2346 let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2347
2348 let result = matcher.find("hello world");
2349 assert!(result.is_some());
2350 let m = result.unwrap();
2351 assert_eq!(m.start, 0);
2352 assert_eq!(m.end, 5);
2353 }
2354
2355 #[test]
2356 fn test_find_no_cache() {
2357 let (nfa, bridge, captures) = make_matcher("hello");
2358 let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2359
2360 let result = matcher.find_no_cache("hello world");
2361 assert!(result.is_some());
2362 let m = result.unwrap();
2363 assert_eq!(m.start, 0);
2364 assert_eq!(m.end, 5);
2365 }
2366
2367 #[test]
2368 fn test_find_no_cache_fuzzy() {
2369 let (nfa, bridge, captures) = make_matcher("hello~1");
2370 let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2371
2372 let result = matcher.find_no_cache("hallo world");
2373 assert!(result.is_some());
2374 let m = result.unwrap();
2375 assert_eq!(m.start, 0);
2376 assert_eq!(m.end, 5);
2377 }
2378
2379 #[test]
2380 fn test_char_class() {
2381 let (nfa, bridge, captures) = make_matcher("[a-z]+");
2382 let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2383
2384 let result = matcher.find("123abc456");
2385 assert!(result.is_some());
2386 let m = result.unwrap();
2387 assert_eq!(&"123abc456"[m.start..m.end], "abc");
2388 }
2389
2390 #[test]
2391 fn test_capture_group() {
2392 let (nfa, bridge, captures) = make_matcher("(abc)");
2393 let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2394
2395 let result = matcher.find("xyzabc123");
2396 assert!(result.is_some());
2397 let m = result.unwrap();
2398 assert!(m.captures.get(1).is_some());
2399 }
2400
2401 #[test]
2402 fn test_anchors() {
2403 let (nfa, bridge, captures) = make_matcher("^hello");
2404 let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2405
2406 assert!(matcher.find("hello world").is_some());
2407 assert!(matcher.find("say hello").is_none());
2408 }
2409
2410 #[test]
2411 fn test_alternation() {
2412 let (nfa, bridge, captures) = make_matcher("cat|dog");
2413 let matcher = Matcher::new(&nfa, bridge.as_ref(), captures, MatcherConfig::default());
2414
2415 assert!(matcher.find("I have a cat").is_some());
2416 assert!(matcher.find("I have a dog").is_some());
2417 assert!(matcher.find("I have a bird").is_none());
2418 }
2419
2420 #[test]
2421 fn test_edit_distance() {
2422 assert_eq!(edit_distance("hello", "hello"), 0);
2424 assert_eq!(edit_distance("", ""), 0);
2425
2426 assert_eq!(edit_distance("hello", "hallo"), 1); assert_eq!(edit_distance("hello", "hell"), 1); assert_eq!(edit_distance("hello", "helloo"), 1); assert_eq!(edit_distance("cat", "hat"), 1); assert_eq!(edit_distance("kitten", "sitting"), 3);
2434 assert_eq!(edit_distance("saturday", "sunday"), 3);
2435
2436 assert_eq!(edit_distance("", "abc"), 3);
2438 assert_eq!(edit_distance("abc", ""), 3);
2439 assert_eq!(edit_distance("a", "b"), 1);
2440
2441 for (s1, s2, expected) in [
2443 ("fox", "box", 1),
2444 ("quick", "quack", 1),
2445 ("brown", "brawn", 1),
2446 ("test", "taste", 2),
2447 ("abc", "xyz", 3),
2448 ] {
2449 assert_eq!(edit_distance(s1, s2), expected, "{s1} vs {s2}");
2450 }
2451 }
2452}