1#![allow(
7 clippy::needless_range_loop,
8 clippy::items_after_statements,
9 clippy::similar_names,
10 clippy::too_many_lines
11)]
12
13use super::hash::{FxHashMap, FxHashSet};
14
15#[derive(Default, Debug)]
18pub struct SearchBuffers {
19 active: Vec<ActiveState>,
21 next_active: Vec<ActiveState>,
23 seen_set: FxHashSet<(State, usize)>,
25 deduped: FxHashMap<(usize, usize, bool), ActiveState>,
27 matches: FxHashMap<(usize, usize), DamLevMatch>,
29}
30
31impl SearchBuffers {
32 #[must_use]
34 pub fn new() -> Self {
35 SearchBuffers {
36 active: Vec::with_capacity(32),
37 next_active: Vec::with_capacity(32),
38 seen_set: FxHashSet::default(),
39 deduped: FxHashMap::default(),
40 matches: FxHashMap::default(),
41 }
42 }
43
44 pub fn clear(&mut self) {
46 self.active.clear();
47 self.next_active.clear();
48 self.seen_set.clear();
49 self.deduped.clear();
50 self.matches.clear();
51 }
52}
53
54#[derive(Debug, Clone, Default)]
56pub struct EditLimits {
57 pub max_edits: u8,
59 pub max_insertions: Option<u8>,
61 pub max_deletions: Option<u8>,
63 pub max_substitutions: Option<u8>,
65 pub max_swaps: Option<u8>,
67}
68
69impl EditLimits {
70 #[must_use]
72 pub fn new(max_edits: u8) -> Self {
73 EditLimits {
74 max_edits,
75 max_insertions: None,
76 max_deletions: None,
77 max_substitutions: None,
78 max_swaps: None,
79 }
80 }
81
82 #[must_use]
84 pub fn with_limits(
85 max_edits: u8,
86 max_insertions: Option<u8>,
87 max_deletions: Option<u8>,
88 max_substitutions: Option<u8>,
89 max_swaps: Option<u8>,
90 ) -> Self {
91 EditLimits {
92 max_edits,
93 max_insertions,
94 max_deletions,
95 max_substitutions,
96 max_swaps,
97 }
98 }
99}
100
101#[derive(Debug, Clone)]
103pub struct DamLevMatch {
104 pub start: usize,
106 pub end: usize,
108 pub insertions: u8,
110 pub deletions: u8,
112 pub substitutions: u8,
114 pub swaps: u8,
116 pub similarity: f32,
118}
119
120impl DamLevMatch {
121 #[must_use]
123 pub fn total_edits(&self) -> u8 {
124 self.insertions
125 .saturating_add(self.deletions)
126 .saturating_add(self.substitutions)
127 .saturating_add(self.swaps)
128 }
129}
130
131#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
133struct State {
134 pos: usize,
136 insertions: u8,
138 deletions: u8,
140 substitutions: u8,
142 swaps: u8,
144 skip_next: bool,
146}
147
148impl State {
149 fn new(pos: usize) -> Self {
150 State {
151 pos,
152 insertions: 0,
153 deletions: 0,
154 substitutions: 0,
155 swaps: 0,
156 skip_next: false,
157 }
158 }
159
160 fn total_edits(&self) -> u8 {
161 self.insertions
162 .saturating_add(self.deletions)
163 .saturating_add(self.substitutions)
164 .saturating_add(self.swaps)
165 }
166
167 fn advance_match(&self) -> Self {
168 State {
169 pos: self.pos + 1,
170 skip_next: false,
171 ..*self
172 }
173 }
174
175 fn advance_substitution(&self) -> Self {
176 State {
177 pos: self.pos + 1,
178 substitutions: self.substitutions + 1,
179 skip_next: false,
180 ..*self
181 }
182 }
183
184 fn advance_deletion(&self) -> Self {
185 State {
186 pos: self.pos + 1,
187 deletions: self.deletions + 1,
188 skip_next: false,
189 ..*self
190 }
191 }
192
193 fn advance_insertion(&self) -> Self {
194 State {
195 insertions: self.insertions + 1,
196 skip_next: false,
197 ..*self
198 }
199 }
200
201 fn advance_swap(&self) -> Self {
204 State {
205 pos: self.pos + 2,
206 swaps: self.swaps + 1,
207 skip_next: true, ..*self
209 }
210 }
211}
212
213#[derive(Debug, Clone)]
215struct ActiveState {
216 state: State,
217 start_byte: usize,
218 start_char: usize,
219}
220
221#[derive(Debug)]
223pub struct DamLevNfa {
224 pattern: String,
225 pattern_chars: Vec<char>,
226 limits: EditLimits,
227 case_insensitive: bool,
228 beam_width: usize,
231}
232
233impl DamLevNfa {
234 #[must_use]
236 pub fn new(pattern: &str, limits: EditLimits, case_insensitive: bool) -> Self {
237 let pattern_chars: Vec<char> = if case_insensitive {
238 pattern.to_lowercase().chars().collect()
239 } else {
240 pattern.chars().collect()
241 };
242
243 let beam_width =
246 ((pattern_chars.len() + 1) * (limits.max_edits as usize + 1) * 2).clamp(32, 256);
247
248 DamLevNfa {
249 pattern: pattern.to_string(),
250 pattern_chars,
251 limits,
252 case_insensitive,
253 beam_width,
254 }
255 }
256
257 #[must_use]
259 pub fn pattern(&self) -> &str {
260 &self.pattern
261 }
262
263 fn can_insert(&self, state: &State) -> bool {
265 if state.total_edits() >= self.limits.max_edits {
266 return false;
267 }
268 self.limits
269 .max_insertions
270 .is_none_or(|max| state.insertions < max)
271 }
272
273 fn can_delete(&self, state: &State) -> bool {
275 if state.total_edits() >= self.limits.max_edits {
276 return false;
277 }
278 self.limits
279 .max_deletions
280 .is_none_or(|max| state.deletions < max)
281 }
282
283 fn can_substitute(&self, state: &State) -> bool {
285 if state.total_edits() >= self.limits.max_edits {
286 return false;
287 }
288 self.limits
289 .max_substitutions
290 .is_none_or(|max| state.substitutions < max)
291 }
292
293 fn can_swap(&self, state: &State) -> bool {
295 if state.total_edits() >= self.limits.max_edits {
296 return false;
297 }
298 self.limits.max_swaps.is_none_or(|max| state.swaps < max)
299 }
300
301 fn epsilon_closure(&self, states: &mut Vec<ActiveState>, seen: &mut FxHashSet<(State, usize)>) {
305 self.epsilon_closure_from(states, 0, seen);
306 }
307
308 fn epsilon_closure_from(
311 &self,
312 states: &mut Vec<ActiveState>,
313 start_idx: usize,
314 seen: &mut FxHashSet<(State, usize)>,
315 ) {
316 let mut i = start_idx;
317 while i < states.len() {
318 let active = &states[i];
319 let state = active.state;
320
321 if state.pos < self.pattern_chars.len() && self.can_delete(&state) {
323 let new_state = state.advance_deletion();
324 let key = (new_state, active.start_byte);
325
326 if !seen.contains(&key) {
327 seen.insert(key);
328 states.push(ActiveState {
329 state: new_state,
330 start_byte: active.start_byte,
331 start_char: active.start_char,
332 });
333 }
334 }
335
336 i += 1;
337 }
338 }
339
340 fn calc_similarity(&self, state: &State) -> f32 {
342 let pattern_len = self.pattern_chars.len() as f32;
343 if pattern_len == 0.0 {
344 return 1.0;
345 }
346
347 let edit_distance = f32::from(state.total_edits());
348 let matched_len = pattern_len + f32::from(state.insertions) - f32::from(state.deletions);
350 let max_len = pattern_len.max(matched_len).max(1.0);
351
352 (1.0 - edit_distance / max_len).max(0.0)
354 }
355
356 #[inline]
366 fn max_possible_similarity(&self, state: &State) -> f32 {
367 let pattern_len = self.pattern_chars.len() as f32;
368 if pattern_len == 0.0 {
369 return 1.0;
370 }
371
372 let min_edits = f32::from(state.total_edits());
373
374 let current_matched_len =
376 pattern_len + f32::from(state.insertions) - f32::from(state.deletions);
377
378 let remaining_edits = f32::from(self.limits.max_edits - state.total_edits());
381 let max_denominator = pattern_len.max(current_matched_len + remaining_edits);
382
383 (1.0 - min_edits / max_denominator).max(0.0)
385 }
386
387 #[inline]
390 fn beam_prune(&self, states: &mut Vec<ActiveState>) {
391 if states.len() > self.beam_width * 2 {
392 states.sort_by_key(|s| s.state.total_edits());
394 states.truncate(self.beam_width);
395 }
396 }
397
398 #[must_use]
402 pub fn find_first(&self, text: &str, threshold: f32) -> Option<DamLevMatch> {
403 self.find_all(text, threshold)
406 .into_iter()
407 .min_by_key(|m| m.start)
408 }
409
410 #[must_use]
414 pub fn find_all(&self, text: &str, threshold: f32) -> Vec<DamLevMatch> {
415 if self.pattern_chars.is_empty() {
416 return vec![DamLevMatch {
418 start: 0,
419 end: 0,
420 insertions: 0,
421 deletions: 0,
422 substitutions: 0,
423 swaps: 0,
424 similarity: 1.0,
425 }];
426 }
427
428 let text_chars: Vec<(usize, char)> = text.char_indices().collect();
429 let mut matches: FxHashMap<(usize, usize), DamLevMatch> = FxHashMap::default();
430
431 let mut seen_set: FxHashSet<(State, usize)> = FxHashSet::default();
433
434 let mut deduped: FxHashMap<(usize, usize, bool), ActiveState> = FxHashMap::default();
436
437 let mut active: Vec<ActiveState> = Vec::new();
439
440 for (char_idx, &(byte_pos, text_char)) in text_chars.iter().enumerate() {
442 let text_char = if self.case_insensitive {
443 text_char.to_lowercase().next().unwrap_or(text_char)
444 } else {
445 text_char
446 };
447
448 let next_text_char = text_chars.get(char_idx + 1).map(|&(_, c)| {
450 if self.case_insensitive {
451 c.to_lowercase().next().unwrap_or(c)
452 } else {
453 c
454 }
455 });
456
457 let initial = ActiveState {
459 state: State::new(0),
460 start_byte: byte_pos,
461 start_char: char_idx,
462 };
463 active.push(initial);
464
465 {
467 let start_idx = active.len() - 1;
469 seen_set.clear();
471 for a in &active {
472 seen_set.insert((a.state, a.start_byte));
473 }
474 self.epsilon_closure_from(&mut active, start_idx, &mut seen_set);
476 }
477
478 let mut next_active: Vec<ActiveState> = Vec::new();
480
481 for active_state in &active {
482 let state = active_state.state;
483
484 if state.skip_next {
486 let mut continued = state;
487 continued.skip_next = false;
488 next_active.push(ActiveState {
489 state: continued,
490 start_byte: active_state.start_byte,
491 start_char: active_state.start_char,
492 });
493 continue;
494 }
495
496 if state.pos < self.pattern_chars.len() {
497 let pattern_char = self.pattern_chars[state.pos];
498
499 if text_char == pattern_char {
501 let new_state = state.advance_match();
502 next_active.push(ActiveState {
503 state: new_state,
504 start_byte: active_state.start_byte,
505 start_char: active_state.start_char,
506 });
507 }
508
509 if text_char != pattern_char && self.can_substitute(&state) {
511 let new_state = state.advance_substitution();
512 next_active.push(ActiveState {
513 state: new_state,
514 start_byte: active_state.start_byte,
515 start_char: active_state.start_char,
516 });
517 }
518
519 if let Some(next_char) = next_text_char
522 && state.pos + 1 < self.pattern_chars.len()
523 && self.can_swap(&state)
524 {
525 let next_pattern_char = self.pattern_chars[state.pos + 1];
526 if pattern_char == next_char
528 && next_pattern_char == text_char
529 && pattern_char != next_pattern_char
530 {
531 let new_state = state.advance_swap();
532 next_active.push(ActiveState {
533 state: new_state,
534 start_byte: active_state.start_byte,
535 start_char: active_state.start_char,
536 });
537 }
538 }
539 }
540
541 if self.can_insert(&state) {
543 let new_state = state.advance_insertion();
544 next_active.push(ActiveState {
545 state: new_state,
546 start_byte: active_state.start_byte,
547 start_char: active_state.start_char,
548 });
549 }
550 }
551
552 seen_set.clear();
554 self.epsilon_closure(&mut next_active, &mut seen_set);
555
556 deduped.clear();
558 for active_state in next_active {
559 if self.max_possible_similarity(&active_state.state) < threshold {
561 continue;
562 }
563
564 let key = (
565 active_state.state.pos,
566 active_state.start_byte,
567 active_state.state.skip_next,
568 );
569 deduped
570 .entry(key)
571 .and_modify(|existing| {
572 if active_state.state.total_edits() < existing.state.total_edits() {
574 *existing = active_state.clone();
575 }
576 })
577 .or_insert(active_state);
578 }
579
580 active.clear();
581 active.extend(deduped.values().cloned());
582
583 self.beam_prune(&mut active);
585
586 let end_byte = text_chars.get(char_idx + 1).map_or(text.len(), |(b, _)| *b);
588
589 for active_state in &active {
590 if active_state.state.pos == self.pattern_chars.len()
591 && !active_state.state.skip_next
592 {
593 let sim = self.calc_similarity(&active_state.state);
594 if sim >= threshold {
595 let key = (active_state.start_byte, end_byte);
596 let m = DamLevMatch {
597 start: active_state.start_byte,
598 end: end_byte,
599 insertions: active_state.state.insertions,
600 deletions: active_state.state.deletions,
601 substitutions: active_state.state.substitutions,
602 swaps: active_state.state.swaps,
603 similarity: sim,
604 };
605
606 matches
607 .entry(key)
608 .and_modify(|existing| {
609 if m.similarity > existing.similarity {
610 *existing = m.clone();
611 }
612 })
613 .or_insert(m);
614 }
615 }
616 }
617
618 let max_window = self.pattern_chars.len() + self.limits.max_edits as usize;
621 active.retain(|a| {
622 (a.state.pos < self.pattern_chars.len() || a.state.skip_next)
623 && (char_idx + 1 - a.start_char) <= max_window
624 });
625 }
626
627 for active_state in &active {
629 let state = active_state.state;
630 if state.skip_next {
631 continue; }
633 let remaining = (self.pattern_chars.len() - state.pos) as u8;
634
635 if remaining <= self.limits.max_edits - state.total_edits() {
637 let dels_needed = remaining;
638 let total_dels = state.deletions + dels_needed;
639
640 if self
641 .limits
642 .max_deletions
643 .is_none_or(|max| total_dels <= max)
644 {
645 let final_state = State {
646 pos: self.pattern_chars.len(),
647 deletions: total_dels,
648 ..state
649 };
650
651 let sim = self.calc_similarity(&final_state);
652 if sim >= threshold {
653 let key = (active_state.start_byte, text.len());
654 let m = DamLevMatch {
655 start: active_state.start_byte,
656 end: text.len(),
657 insertions: final_state.insertions,
658 deletions: final_state.deletions,
659 substitutions: final_state.substitutions,
660 swaps: final_state.swaps,
661 similarity: sim,
662 };
663
664 matches
665 .entry(key)
666 .and_modify(|existing| {
667 if m.similarity > existing.similarity {
668 *existing = m.clone();
669 }
670 })
671 .or_insert(m);
672 }
673 }
674 }
675 }
676
677 matches.into_values().collect()
678 }
679
680 #[must_use]
685 pub fn find_n(&self, text: &str, threshold: f32, n: usize) -> Vec<DamLevMatch> {
686 if n == 0 {
687 return Vec::new();
688 }
689
690 let mut all_matches = self.find_all(text, threshold);
691
692 all_matches.sort_by_key(|m| m.start);
694
695 let mut result = Vec::with_capacity(n.min(all_matches.len()));
697 let mut last_end = 0;
698
699 for m in all_matches {
700 if m.start >= last_end {
701 last_end = m.end;
702 result.push(m);
703 if result.len() >= n {
704 break;
705 }
706 }
707 }
708
709 result
710 }
711
712 #[must_use]
717 pub fn find_all_with_candidates(
718 &self,
719 text: &str,
720 threshold: f32,
721 candidates: &FxHashSet<usize>,
722 ) -> Vec<DamLevMatch> {
723 let mut buffers = SearchBuffers::new();
724 self.find_all_with_candidates_buffered(text, threshold, candidates, &mut buffers)
725 }
726
727 pub fn find_all_with_candidates_buffered(
729 &self,
730 text: &str,
731 threshold: f32,
732 candidates: &FxHashSet<usize>,
733 buffers: &mut SearchBuffers,
734 ) -> Vec<DamLevMatch> {
735 buffers.clear();
737
738 if self.pattern_chars.is_empty() {
739 return vec![DamLevMatch {
740 start: 0,
741 end: 0,
742 insertions: 0,
743 deletions: 0,
744 substitutions: 0,
745 swaps: 0,
746 similarity: 1.0,
747 }];
748 }
749
750 let mut char_iter = text.char_indices().peekable();
752
753 let mut char_idx = 0usize;
754 while let Some((byte_pos, raw_char)) = char_iter.next() {
755 let text_char = if self.case_insensitive {
756 raw_char.to_lowercase().next().unwrap_or(raw_char)
757 } else {
758 raw_char
759 };
760
761 let next_info = char_iter.peek().map(|&(next_byte, next_char)| {
763 let c = if self.case_insensitive {
764 next_char.to_lowercase().next().unwrap_or(next_char)
765 } else {
766 next_char
767 };
768 (next_byte, c)
769 });
770 let next_text_char = next_info.map(|(_, c)| c);
771 let end_byte = next_info.map_or(text.len(), |(b, _)| b);
772
773 if candidates.contains(&byte_pos) {
775 let initial = ActiveState {
776 state: State::new(0),
777 start_byte: byte_pos,
778 start_char: char_idx,
779 };
780 buffers.active.push(initial);
781
782 {
784 let start_idx = buffers.active.len() - 1;
785 buffers.seen_set.clear();
786 for a in &buffers.active {
787 buffers.seen_set.insert((a.state, a.start_byte));
788 }
789 self.epsilon_closure_from(
790 &mut buffers.active,
791 start_idx,
792 &mut buffers.seen_set,
793 );
794 }
795 }
796
797 buffers.next_active.clear();
799
800 for active_state in &buffers.active {
801 let state = active_state.state;
802
803 if state.skip_next {
805 let mut continued = state;
806 continued.skip_next = false;
807 buffers.next_active.push(ActiveState {
808 state: continued,
809 start_byte: active_state.start_byte,
810 start_char: active_state.start_char,
811 });
812 continue;
813 }
814
815 if state.pos < self.pattern_chars.len() {
816 let pattern_char = self.pattern_chars[state.pos];
817
818 if text_char == pattern_char {
819 buffers.next_active.push(ActiveState {
820 state: state.advance_match(),
821 start_byte: active_state.start_byte,
822 start_char: active_state.start_char,
823 });
824 }
825
826 if text_char != pattern_char && self.can_substitute(&state) {
827 buffers.next_active.push(ActiveState {
828 state: state.advance_substitution(),
829 start_byte: active_state.start_byte,
830 start_char: active_state.start_char,
831 });
832 }
833
834 if let Some(next_char) = next_text_char
836 && state.pos + 1 < self.pattern_chars.len()
837 && self.can_swap(&state)
838 {
839 let next_pattern_char = self.pattern_chars[state.pos + 1];
840 if pattern_char == next_char
841 && next_pattern_char == text_char
842 && pattern_char != next_pattern_char
843 {
844 buffers.next_active.push(ActiveState {
845 state: state.advance_swap(),
846 start_byte: active_state.start_byte,
847 start_char: active_state.start_char,
848 });
849 }
850 }
851 }
852
853 if self.can_insert(&state) {
854 buffers.next_active.push(ActiveState {
855 state: state.advance_insertion(),
856 start_byte: active_state.start_byte,
857 start_char: active_state.start_char,
858 });
859 }
860 }
861
862 buffers.seen_set.clear();
863 self.epsilon_closure(&mut buffers.next_active, &mut buffers.seen_set);
864
865 buffers.deduped.clear();
867 for active_state in buffers.next_active.drain(..) {
868 if self.max_possible_similarity(&active_state.state) < threshold {
870 continue;
871 }
872
873 let key = (
874 active_state.state.pos,
875 active_state.start_byte,
876 active_state.state.skip_next,
877 );
878 buffers
879 .deduped
880 .entry(key)
881 .and_modify(|existing| {
882 if active_state.state.total_edits() < existing.state.total_edits() {
883 *existing = active_state.clone();
884 }
885 })
886 .or_insert(active_state);
887 }
888
889 buffers.active.clear();
890 buffers.active.extend(buffers.deduped.values().cloned());
891
892 self.beam_prune(&mut buffers.active);
894
895 for active_state in &buffers.active {
897 if active_state.state.pos == self.pattern_chars.len()
898 && !active_state.state.skip_next
899 {
900 let sim = self.calc_similarity(&active_state.state);
901 if sim >= threshold {
902 let key = (active_state.start_byte, end_byte);
903 let m = DamLevMatch {
904 start: active_state.start_byte,
905 end: end_byte,
906 insertions: active_state.state.insertions,
907 deletions: active_state.state.deletions,
908 substitutions: active_state.state.substitutions,
909 swaps: active_state.state.swaps,
910 similarity: sim,
911 };
912
913 buffers
914 .matches
915 .entry(key)
916 .and_modify(|existing| {
917 if m.similarity > existing.similarity {
918 *existing = m.clone();
919 }
920 })
921 .or_insert(m);
922 }
923 }
924 }
925
926 let max_window = self.pattern_chars.len() + self.limits.max_edits as usize;
928 buffers.active.retain(|a| {
929 (a.state.pos < self.pattern_chars.len() || a.state.skip_next)
930 && (char_idx + 1 - a.start_char) <= max_window
931 });
932
933 char_idx += 1;
934 }
935
936 for active_state in &buffers.active {
938 let state = active_state.state;
939 if state.skip_next {
940 continue;
941 }
942 let remaining = self.pattern_chars.len() - state.pos;
943
944 if remaining as u8 <= self.limits.max_edits - state.total_edits() {
945 let dels_needed = remaining as u8;
946 let total_dels = state.deletions + dels_needed;
947
948 if self
949 .limits
950 .max_deletions
951 .is_none_or(|max| total_dels <= max)
952 {
953 let final_state = State {
954 pos: self.pattern_chars.len(),
955 deletions: total_dels,
956 ..state
957 };
958
959 let sim = self.calc_similarity(&final_state);
960 if sim >= threshold {
961 let key = (active_state.start_byte, text.len());
962 let m = DamLevMatch {
963 start: active_state.start_byte,
964 end: text.len(),
965 insertions: final_state.insertions,
966 deletions: final_state.deletions,
967 substitutions: final_state.substitutions,
968 swaps: final_state.swaps,
969 similarity: sim,
970 };
971
972 buffers
973 .matches
974 .entry(key)
975 .and_modify(|existing| {
976 if m.similarity > existing.similarity {
977 *existing = m.clone();
978 }
979 })
980 .or_insert(m);
981 }
982 }
983 }
984 }
985
986 buffers.matches.drain().map(|(_, v)| v).collect()
987 }
988
989 #[must_use]
994 pub fn find_first_with_candidates(
995 &self,
996 text: &str,
997 threshold: f32,
998 candidates: &FxHashSet<usize>,
999 ) -> Option<DamLevMatch> {
1000 if self.pattern_chars.is_empty() {
1001 return Some(DamLevMatch {
1002 start: 0,
1003 end: 0,
1004 insertions: 0,
1005 deletions: 0,
1006 substitutions: 0,
1007 swaps: 0,
1008 similarity: 1.0,
1009 });
1010 }
1011
1012 let text_chars: Vec<(usize, char)> = text.char_indices().collect();
1013 let mut active: Vec<ActiveState> = Vec::new();
1014
1015 let mut seen_set: FxHashSet<(State, usize)> = FxHashSet::default();
1017
1018 let mut deduped: FxHashMap<(usize, usize, bool), ActiveState> = FxHashMap::default();
1020
1021 for (char_idx, &(byte_pos, text_char)) in text_chars.iter().enumerate() {
1022 let text_char = if self.case_insensitive {
1023 text_char.to_lowercase().next().unwrap_or(text_char)
1024 } else {
1025 text_char
1026 };
1027
1028 let next_text_char = text_chars.get(char_idx + 1).map(|&(_, c)| {
1030 if self.case_insensitive {
1031 c.to_lowercase().next().unwrap_or(c)
1032 } else {
1033 c
1034 }
1035 });
1036
1037 if candidates.contains(&byte_pos) {
1039 let initial = ActiveState {
1040 state: State::new(0),
1041 start_byte: byte_pos,
1042 start_char: char_idx,
1043 };
1044 active.push(initial);
1045
1046 let start_idx = active.len() - 1;
1048 seen_set.clear();
1049 for a in &active {
1050 seen_set.insert((a.state, a.start_byte));
1051 }
1052 self.epsilon_closure_from(&mut active, start_idx, &mut seen_set);
1053 }
1054
1055 if active.is_empty() {
1057 continue;
1058 }
1059
1060 let mut next_active: Vec<ActiveState> = Vec::new();
1062
1063 for active_state in &active {
1064 let state = active_state.state;
1065
1066 if state.skip_next {
1068 let mut continued = state;
1069 continued.skip_next = false;
1070 next_active.push(ActiveState {
1071 state: continued,
1072 start_byte: active_state.start_byte,
1073 start_char: active_state.start_char,
1074 });
1075 continue;
1076 }
1077
1078 if state.pos < self.pattern_chars.len() {
1079 let pattern_char = self.pattern_chars[state.pos];
1080
1081 if text_char == pattern_char {
1082 next_active.push(ActiveState {
1083 state: state.advance_match(),
1084 start_byte: active_state.start_byte,
1085 start_char: active_state.start_char,
1086 });
1087 }
1088
1089 if text_char != pattern_char && self.can_substitute(&state) {
1090 next_active.push(ActiveState {
1091 state: state.advance_substitution(),
1092 start_byte: active_state.start_byte,
1093 start_char: active_state.start_char,
1094 });
1095 }
1096
1097 if let Some(next_char) = next_text_char
1099 && state.pos + 1 < self.pattern_chars.len()
1100 && self.can_swap(&state)
1101 {
1102 let next_pattern_char = self.pattern_chars[state.pos + 1];
1103 if pattern_char == next_char
1104 && next_pattern_char == text_char
1105 && pattern_char != next_pattern_char
1106 {
1107 next_active.push(ActiveState {
1108 state: state.advance_swap(),
1109 start_byte: active_state.start_byte,
1110 start_char: active_state.start_char,
1111 });
1112 }
1113 }
1114 }
1115
1116 if self.can_insert(&state) {
1117 next_active.push(ActiveState {
1118 state: state.advance_insertion(),
1119 start_byte: active_state.start_byte,
1120 start_char: active_state.start_char,
1121 });
1122 }
1123 }
1124
1125 seen_set.clear();
1126 self.epsilon_closure(&mut next_active, &mut seen_set);
1127
1128 deduped.clear();
1130 for active_state in next_active {
1131 if self.max_possible_similarity(&active_state.state) < threshold {
1133 continue;
1134 }
1135
1136 let key = (
1137 active_state.state.pos,
1138 active_state.start_byte,
1139 active_state.state.skip_next,
1140 );
1141 deduped
1142 .entry(key)
1143 .and_modify(|existing| {
1144 if active_state.state.total_edits() < existing.state.total_edits() {
1145 *existing = active_state.clone();
1146 }
1147 })
1148 .or_insert(active_state);
1149 }
1150
1151 active.clear();
1152 active.extend(deduped.values().cloned());
1153
1154 self.beam_prune(&mut active);
1156
1157 let end_byte = text_chars.get(char_idx + 1).map_or(text.len(), |(b, _)| *b);
1159
1160 let mut best_match: Option<DamLevMatch> = None;
1162 for active_state in &active {
1163 if active_state.state.pos == self.pattern_chars.len()
1164 && !active_state.state.skip_next
1165 {
1166 let sim = self.calc_similarity(&active_state.state);
1167 if sim >= threshold {
1168 let m = DamLevMatch {
1169 start: active_state.start_byte,
1170 end: end_byte,
1171 insertions: active_state.state.insertions,
1172 deletions: active_state.state.deletions,
1173 substitutions: active_state.state.substitutions,
1174 swaps: active_state.state.swaps,
1175 similarity: sim,
1176 };
1177 if best_match.as_ref().is_none_or(|best| sim > best.similarity) {
1178 best_match = Some(m);
1179 }
1180 }
1181 }
1182 }
1183 if best_match.is_some() {
1184 return best_match;
1185 }
1186
1187 let max_window = self.pattern_chars.len() + self.limits.max_edits as usize;
1189 active.retain(|a| {
1190 (a.state.pos < self.pattern_chars.len() || a.state.skip_next)
1191 && (char_idx + 1 - a.start_char) <= max_window
1192 });
1193 }
1194
1195 for active_state in &active {
1197 let state = active_state.state;
1198 if state.skip_next {
1199 continue;
1200 }
1201 let remaining = self.pattern_chars.len() - state.pos;
1202
1203 if remaining as u8 <= self.limits.max_edits - state.total_edits() {
1204 let dels_needed = remaining as u8;
1205 let total_dels = state.deletions + dels_needed;
1206
1207 if self
1208 .limits
1209 .max_deletions
1210 .is_none_or(|max| total_dels <= max)
1211 {
1212 let final_state = State {
1213 pos: self.pattern_chars.len(),
1214 deletions: total_dels,
1215 ..state
1216 };
1217
1218 let sim = self.calc_similarity(&final_state);
1219 if sim >= threshold {
1220 return Some(DamLevMatch {
1221 start: active_state.start_byte,
1222 end: text.len(),
1223 insertions: final_state.insertions,
1224 deletions: final_state.deletions,
1225 substitutions: final_state.substitutions,
1226 swaps: final_state.swaps,
1227 similarity: sim,
1228 });
1229 }
1230 }
1231 }
1232 }
1233
1234 None
1235 }
1236}
1237
1238#[cfg(test)]
1239mod tests {
1240 use super::*;
1241
1242 #[test]
1243 fn test_exact_match() {
1244 let nfa = DamLevNfa::new("hello", EditLimits::new(0), false);
1245 let matches = nfa.find_all("hello world", 0.8);
1246
1247 assert_eq!(matches.len(), 1);
1248 assert_eq!(matches[0].start, 0);
1249 assert_eq!(matches[0].end, 5);
1250 assert_eq!(matches[0].total_edits(), 0);
1251 }
1252
1253 #[test]
1254 fn test_one_substitution() {
1255 let nfa = DamLevNfa::new("hello", EditLimits::new(1), false);
1256 let matches = nfa.find_all("hallo world", 0.5);
1257
1258 assert!(
1259 matches
1260 .iter()
1261 .any(|m| m.start == 0 && m.end == 5 && m.substitutions == 1)
1262 );
1263 }
1264
1265 #[test]
1266 fn test_one_insertion() {
1267 let nfa = DamLevNfa::new("hello", EditLimits::new(1), false);
1268 let matches = nfa.find_all("heello world", 0.5);
1269
1270 assert!(matches.iter().any(|m| m.start == 0 && m.insertions == 1));
1271 }
1272
1273 #[test]
1274 fn test_one_deletion() {
1275 let nfa = DamLevNfa::new("hello", EditLimits::new(1), false);
1276 let matches = nfa.find_all("helo world", 0.5);
1277
1278 assert!(matches.iter().any(|m| m.start == 0 && m.deletions == 1));
1279 }
1280
1281 #[test]
1282 fn test_case_insensitive() {
1283 let nfa = DamLevNfa::new("hello", EditLimits::new(0), true);
1284 let matches = nfa.find_all("HELLO world", 0.8);
1285
1286 assert_eq!(matches.len(), 1);
1287 assert_eq!(matches[0].start, 0);
1288 assert_eq!(matches[0].end, 5);
1289 }
1290
1291 #[test]
1292 fn test_multiple_matches() {
1293 let nfa = DamLevNfa::new("ab", EditLimits::new(0), false);
1294 let matches = nfa.find_all("ab ab ab", 0.8);
1295
1296 assert_eq!(matches.len(), 3);
1297 }
1298
1299 #[test]
1300 fn test_no_match() {
1301 let nfa = DamLevNfa::new("xyz", EditLimits::new(1), false);
1302 let matches = nfa.find_all("hello world", 0.8);
1303
1304 assert!(matches.is_empty());
1305 }
1306
1307 #[test]
1310 fn test_transposition_simple() {
1311 let nfa = DamLevNfa::new("ab", EditLimits::new(1), false);
1313 let matches = nfa.find_all("ba", 0.0);
1314
1315 assert!(!matches.is_empty(), "Should find match for transposition");
1316 let swap_match = matches.iter().find(|m| m.swaps == 1);
1318 assert!(
1319 swap_match.is_some(),
1320 "Should find match with 1 swap, got: {matches:?}"
1321 );
1322 let m = swap_match.unwrap();
1323 assert_eq!(m.substitutions, 0, "Should have 0 substitutions");
1324 assert_eq!(m.total_edits(), 1, "Total edits should be 1");
1325 }
1326
1327 #[test]
1328 fn test_transposition_in_word() {
1329 let nfa = DamLevNfa::new("the", EditLimits::new(1), false);
1331 let matches = nfa.find_all("teh", 0.0);
1332
1333 assert!(!matches.is_empty(), "Should find match for 'teh' -> 'the'");
1334 let m = matches.iter().find(|m| m.swaps == 1);
1335 assert!(m.is_some(), "Should find match with 1 swap");
1336 }
1337
1338 #[test]
1339 fn test_transposition_longer_word() {
1340 let nfa = DamLevNfa::new("receive", EditLimits::new(1), false);
1342 let matches = nfa.find_all("recieve", 0.0);
1343
1344 assert!(
1345 !matches.is_empty(),
1346 "Should find match for 'recieve' -> 'receive'"
1347 );
1348 let swap_match = matches.iter().find(|m| m.swaps == 1);
1350 assert!(
1351 swap_match.is_some(),
1352 "Should find match with 1 swap, got: {matches:?}"
1353 );
1354 }
1355
1356 #[test]
1357 fn test_transposition_with_other_edits() {
1358 let nfa = DamLevNfa::new("abcd", EditLimits::new(2), false);
1360 let matches = nfa.find_all("badc", 0.0);
1361
1362 assert!(!matches.is_empty(), "Should find match with 2 swaps");
1363 let m = matches.iter().find(|m| m.total_edits() == 2);
1365 assert!(m.is_some(), "Should find match with 2 total edits");
1366 }
1367
1368 #[test]
1369 fn test_transposition_not_same_char() {
1370 let nfa = DamLevNfa::new("aa", EditLimits::new(1), false);
1373 let matches = nfa.find_all("aa", 0.0);
1374
1375 assert!(!matches.is_empty());
1377 let exact = matches.iter().find(|m| m.total_edits() == 0);
1378 assert!(exact.is_some(), "Should find exact match");
1379 }
1380
1381 #[test]
1382 fn test_transposition_vs_substitution() {
1383 let nfa = DamLevNfa::new("ab", EditLimits::new(2), false);
1385 let matches = nfa.find_all("ba", 0.0);
1386
1387 let swap_match = matches
1389 .iter()
1390 .find(|m| m.swaps == 1 && m.total_edits() == 1);
1391 assert!(
1392 swap_match.is_some(),
1393 "Should find match with 1 swap, got: {matches:?}"
1394 );
1395
1396 let best_sim = matches
1398 .iter()
1399 .map(|m| m.similarity)
1400 .max_by(|a, b| a.partial_cmp(b).unwrap());
1401 let swap_sim = swap_match.unwrap().similarity;
1402 assert!(
1403 swap_sim >= best_sim.unwrap() - 0.01,
1404 "Swap match should have high similarity"
1405 );
1406 }
1407
1408 #[test]
1409 fn test_transposition_case_insensitive() {
1410 let nfa = DamLevNfa::new("AB", EditLimits::new(1), true);
1412 let matches = nfa.find_all("ba", 0.0);
1413
1414 assert!(
1415 !matches.is_empty(),
1416 "Should find case-insensitive transposition"
1417 );
1418 let m = matches.iter().find(|m| m.swaps == 1);
1419 assert!(m.is_some(), "Should find match with 1 swap");
1420 }
1421}
1422
1423#[test]
1424fn test_find_with_candidates() {
1425 let nfa = DamLevNfa::new("quik", EditLimits::new(1), false);
1426
1427 let text = "The quick brown fox";
1429 let all_positions: FxHashSet<usize> = text.char_indices().map(|(i, _)| i).collect();
1430 let matches = nfa.find_all_with_candidates(text, 0.8, &all_positions);
1431 println!("All positions candidates: {matches:?}");
1432
1433 let limited: FxHashSet<usize> = vec![3, 4].into_iter().collect();
1435 let matches2 = nfa.find_all_with_candidates(text, 0.8, &limited);
1436 println!("Limited candidates (3,4): {matches2:?}");
1437
1438 let matches3 = nfa.find_all(text, 0.8);
1440 println!("find_all: {matches3:?}");
1441
1442 assert!(!matches.is_empty(), "Should find match with all positions");
1443 assert!(
1444 !matches2.is_empty(),
1445 "Should find match with limited positions"
1446 );
1447}