Skip to main content

fuzzy_regex/engine/
damlev.rs

1//! `DamLev` automaton for efficient fuzzy string matching.
2//!
3//! This implements a `DamLev` NFA that can find all approximate matches
4//! of a pattern in a text with bounded edit distance in O(N × m × k) time.
5
6#![allow(
7    clippy::needless_range_loop,
8    clippy::items_after_statements,
9    clippy::similar_names,
10    clippy::too_many_lines
11)]
12
13use super::hash::{FxHashMap, FxHashSet};
14
15/// Reusable buffers for NFA search to avoid allocations.
16/// Create once and pass to search methods for reuse.
17#[derive(Default, Debug)]
18pub struct SearchBuffers {
19    /// Active states during search
20    active: Vec<ActiveState>,
21    /// Next iteration's active states
22    next_active: Vec<ActiveState>,
23    /// Seen states for epsilon closure
24    seen_set: FxHashSet<(State, usize)>,
25    /// Deduplication map
26    deduped: FxHashMap<(usize, usize, bool), ActiveState>,
27    /// Match results
28    matches: FxHashMap<(usize, usize), DamLevMatch>,
29}
30
31impl SearchBuffers {
32    /// Create new buffers with default capacity.
33    #[must_use]
34    pub fn new() -> Self {
35        SearchBuffers {
36            active: Vec::with_capacity(32),
37            next_active: Vec::with_capacity(32),
38            seen_set: FxHashSet::default(),
39            deduped: FxHashMap::default(),
40            matches: FxHashMap::default(),
41        }
42    }
43
44    /// Clear all buffers for reuse.
45    pub fn clear(&mut self) {
46        self.active.clear();
47        self.next_active.clear();
48        self.seen_set.clear();
49        self.deduped.clear();
50        self.matches.clear();
51    }
52}
53
54/// Edit operation limits.
55#[derive(Debug, Clone, Default)]
56pub struct EditLimits {
57    /// Maximum total number of edit operations allowed.
58    pub max_edits: u8,
59    /// Maximum number of insertion edits allowed (None = unlimited up to `max_edits`).
60    pub max_insertions: Option<u8>,
61    /// Maximum number of deletion edits allowed (None = unlimited up to `max_edits`).
62    pub max_deletions: Option<u8>,
63    /// Maximum number of substitution edits allowed (None = unlimited up to `max_edits`).
64    pub max_substitutions: Option<u8>,
65    /// Maximum number of transposition edits allowed (None = unlimited up to `max_edits`).
66    pub max_swaps: Option<u8>,
67}
68
69impl EditLimits {
70    /// Create edit limits with only a maximum total edit count.
71    #[must_use]
72    pub fn new(max_edits: u8) -> Self {
73        EditLimits {
74            max_edits,
75            max_insertions: None,
76            max_deletions: None,
77            max_substitutions: None,
78            max_swaps: None,
79        }
80    }
81
82    /// Create edit limits with specific limits for each operation type.
83    #[must_use]
84    pub fn with_limits(
85        max_edits: u8,
86        max_insertions: Option<u8>,
87        max_deletions: Option<u8>,
88        max_substitutions: Option<u8>,
89        max_swaps: Option<u8>,
90    ) -> Self {
91        EditLimits {
92            max_edits,
93            max_insertions,
94            max_deletions,
95            max_substitutions,
96            max_swaps,
97        }
98    }
99}
100
101/// A match found by the `DamLev` automaton.
102#[derive(Debug, Clone)]
103pub struct DamLevMatch {
104    /// Start position of the match (byte offset, inclusive).
105    pub start: usize,
106    /// End position of the match (byte offset, exclusive).
107    pub end: usize,
108    /// Number of insertion edits in this match.
109    pub insertions: u8,
110    /// Number of deletion edits in this match.
111    pub deletions: u8,
112    /// Number of substitution edits in this match.
113    pub substitutions: u8,
114    /// Number of transposition edits in this match.
115    pub swaps: u8,
116    /// Similarity score (0.0 to 1.0).
117    pub similarity: f32,
118}
119
120impl DamLevMatch {
121    /// Returns the total number of edit operations in this match.
122    #[must_use]
123    pub fn total_edits(&self) -> u8 {
124        self.insertions
125            .saturating_add(self.deletions)
126            .saturating_add(self.substitutions)
127            .saturating_add(self.swaps)
128    }
129}
130
131/// State in the `DamLev` NFA.
132#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
133struct State {
134    /// Position in pattern (`0..=pattern_len`).
135    pos: usize,
136    /// Number of insertions used.
137    insertions: u8,
138    /// Number of deletions used.
139    deletions: u8,
140    /// Number of substitutions used.
141    substitutions: u8,
142    /// Number of swaps (transpositions) used.
143    swaps: u8,
144    /// If true, skip processing on next character (used for transposition which consumes 2 chars).
145    skip_next: bool,
146}
147
148impl State {
149    fn new(pos: usize) -> Self {
150        State {
151            pos,
152            insertions: 0,
153            deletions: 0,
154            substitutions: 0,
155            swaps: 0,
156            skip_next: false,
157        }
158    }
159
160    fn total_edits(&self) -> u8 {
161        self.insertions
162            .saturating_add(self.deletions)
163            .saturating_add(self.substitutions)
164            .saturating_add(self.swaps)
165    }
166
167    fn advance_match(&self) -> Self {
168        State {
169            pos: self.pos + 1,
170            skip_next: false,
171            ..*self
172        }
173    }
174
175    fn advance_substitution(&self) -> Self {
176        State {
177            pos: self.pos + 1,
178            substitutions: self.substitutions + 1,
179            skip_next: false,
180            ..*self
181        }
182    }
183
184    fn advance_deletion(&self) -> Self {
185        State {
186            pos: self.pos + 1,
187            deletions: self.deletions + 1,
188            skip_next: false,
189            ..*self
190        }
191    }
192
193    fn advance_insertion(&self) -> Self {
194        State {
195            insertions: self.insertions + 1,
196            skip_next: false,
197            ..*self
198        }
199    }
200
201    /// Advance by transposition (swap two adjacent characters).
202    /// Consumes 2 pattern chars and 2 text chars for 1 edit.
203    fn advance_swap(&self) -> Self {
204        State {
205            pos: self.pos + 2,
206            swaps: self.swaps + 1,
207            skip_next: true, // Skip next text char since transposition consumes 2
208            ..*self
209        }
210    }
211}
212
213/// Active state tracking match start position.
214#[derive(Debug, Clone)]
215struct ActiveState {
216    state: State,
217    start_byte: usize,
218    start_char: usize,
219}
220
221/// `DamLev` NFA for fuzzy pattern matching.
222#[derive(Debug)]
223pub struct DamLevNfa {
224    pattern: String,
225    pattern_chars: Vec<char>,
226    limits: EditLimits,
227    case_insensitive: bool,
228    /// Beam width for state pruning - limits state explosion.
229    /// Default: `pattern_len` * 4 (adapts to pattern size).
230    beam_width: usize,
231}
232
233impl DamLevNfa {
234    /// Create a new `DamLev` NFA for the given pattern.
235    #[must_use]
236    pub fn new(pattern: &str, limits: EditLimits, case_insensitive: bool) -> Self {
237        let pattern_chars: Vec<char> = if case_insensitive {
238            pattern.to_lowercase().chars().collect()
239        } else {
240            pattern.chars().collect()
241        };
242
243        // Beam width scales with pattern length and max_edits
244        // Larger patterns need more states, but we cap it to prevent explosion
245        let beam_width =
246            ((pattern_chars.len() + 1) * (limits.max_edits as usize + 1) * 2).clamp(32, 256);
247
248        DamLevNfa {
249            pattern: pattern.to_string(),
250            pattern_chars,
251            limits,
252            case_insensitive,
253            beam_width,
254        }
255    }
256
257    /// Returns the original pattern string.
258    #[must_use]
259    pub fn pattern(&self) -> &str {
260        &self.pattern
261    }
262
263    /// Check if we can do an insertion.
264    fn can_insert(&self, state: &State) -> bool {
265        if state.total_edits() >= self.limits.max_edits {
266            return false;
267        }
268        self.limits
269            .max_insertions
270            .is_none_or(|max| state.insertions < max)
271    }
272
273    /// Check if we can do a deletion.
274    fn can_delete(&self, state: &State) -> bool {
275        if state.total_edits() >= self.limits.max_edits {
276            return false;
277        }
278        self.limits
279            .max_deletions
280            .is_none_or(|max| state.deletions < max)
281    }
282
283    /// Check if we can do a substitution.
284    fn can_substitute(&self, state: &State) -> bool {
285        if state.total_edits() >= self.limits.max_edits {
286            return false;
287        }
288        self.limits
289            .max_substitutions
290            .is_none_or(|max| state.substitutions < max)
291    }
292
293    /// Check if we can do a swap (transposition).
294    fn can_swap(&self, state: &State) -> bool {
295        if state.total_edits() >= self.limits.max_edits {
296            return false;
297        }
298        self.limits.max_swaps.is_none_or(|max| state.swaps < max)
299    }
300
301    /// Add epsilon closure (deletion transitions) to a set of states.
302    /// Takes a reusable `HashSet` to avoid allocation on every call.
303    /// The `HashSet` is used internally for deduplication and will be modified.
304    fn epsilon_closure(&self, states: &mut Vec<ActiveState>, seen: &mut FxHashSet<(State, usize)>) {
305        self.epsilon_closure_from(states, 0, seen);
306    }
307
308    /// Add epsilon closure starting from a specific index in the states vector.
309    /// The `HashSet` should be pre-populated with existing states for deduplication.
310    fn epsilon_closure_from(
311        &self,
312        states: &mut Vec<ActiveState>,
313        start_idx: usize,
314        seen: &mut FxHashSet<(State, usize)>,
315    ) {
316        let mut i = start_idx;
317        while i < states.len() {
318            let active = &states[i];
319            let state = active.state;
320
321            // Try deletion: skip pattern character without consuming text
322            if state.pos < self.pattern_chars.len() && self.can_delete(&state) {
323                let new_state = state.advance_deletion();
324                let key = (new_state, active.start_byte);
325
326                if !seen.contains(&key) {
327                    seen.insert(key);
328                    states.push(ActiveState {
329                        state: new_state,
330                        start_byte: active.start_byte,
331                        start_char: active.start_char,
332                    });
333                }
334            }
335
336            i += 1;
337        }
338    }
339
340    /// Calculate similarity score using normalized edit distance.
341    fn calc_similarity(&self, state: &State) -> f32 {
342        let pattern_len = self.pattern_chars.len() as f32;
343        if pattern_len == 0.0 {
344            return 1.0;
345        }
346
347        let edit_distance = f32::from(state.total_edits());
348        // Matched text length = pattern_len + insertions - deletions
349        let matched_len = pattern_len + f32::from(state.insertions) - f32::from(state.deletions);
350        let max_len = pattern_len.max(matched_len).max(1.0);
351
352        // Normalized DamLev similarity
353        (1.0 - edit_distance / max_len).max(0.0)
354    }
355
356    /// Calculate the maximum possible similarity a state can achieve.
357    /// Used for early pruning - if max possible < threshold, prune the state.
358    ///
359    /// This is conservative: we only prune if the state CANNOT reach the threshold.
360    /// The actual similarity is: (1.0 - edits / max(`pattern_len`, `matched_len`))
361    /// where `matched_len` = `pattern_len` + insertions - deletions.
362    ///
363    /// For early pruning, we use the most optimistic denominator (largest possible),
364    /// which gives the highest possible similarity.
365    #[inline]
366    fn max_possible_similarity(&self, state: &State) -> f32 {
367        let pattern_len = self.pattern_chars.len() as f32;
368        if pattern_len == 0.0 {
369            return 1.0;
370        }
371
372        let min_edits = f32::from(state.total_edits());
373
374        // Current matched length estimate (can grow with more insertions)
375        let current_matched_len =
376            pattern_len + f32::from(state.insertions) - f32::from(state.deletions);
377
378        // The max possible denominator is max(pattern_len, current_matched_len + remaining_edits)
379        // We're conservative: use the largest plausible denominator
380        let remaining_edits = f32::from(self.limits.max_edits - state.total_edits());
381        let max_denominator = pattern_len.max(current_matched_len + remaining_edits);
382
383        // Best case: no more edits needed, largest denominator
384        (1.0 - min_edits / max_denominator).max(0.0)
385    }
386
387    /// Apply beam pruning to active states - keep only the best states.
388    /// Sorts by `total_edits` (lower is better) and truncates to `beam_width`.
389    #[inline]
390    fn beam_prune(&self, states: &mut Vec<ActiveState>) {
391        if states.len() > self.beam_width * 2 {
392            // Sort by total_edits (ascending) - states with fewer edits are better
393            states.sort_by_key(|s| s.state.total_edits());
394            states.truncate(self.beam_width);
395        }
396    }
397
398    /// Find the first match in the text.
399    ///
400    /// Returns the earliest match found, or None if no match exists.
401    #[must_use]
402    pub fn find_first(&self, text: &str, threshold: f32) -> Option<DamLevMatch> {
403        // For NFA fallback, we use find_all and take the first result
404        // This is not optimal but NFA is only used as a fallback for >64 char patterns
405        self.find_all(text, threshold)
406            .into_iter()
407            .min_by_key(|m| m.start)
408    }
409
410    /// Find all matches in the text.
411    ///
412    /// Returns matches organized by (`start_byte`, `end_byte`) with the best match for each span.
413    #[must_use]
414    pub fn find_all(&self, text: &str, threshold: f32) -> Vec<DamLevMatch> {
415        if self.pattern_chars.is_empty() {
416            // Empty pattern matches everywhere
417            return vec![DamLevMatch {
418                start: 0,
419                end: 0,
420                insertions: 0,
421                deletions: 0,
422                substitutions: 0,
423                swaps: 0,
424                similarity: 1.0,
425            }];
426        }
427
428        let text_chars: Vec<(usize, char)> = text.char_indices().collect();
429        let mut matches: FxHashMap<(usize, usize), DamLevMatch> = FxHashMap::default();
430
431        // Reusable FxHashSet to avoid allocation per character
432        let mut seen_set: FxHashSet<(State, usize)> = FxHashSet::default();
433
434        // Reusable FxHashMap for deduplication
435        let mut deduped: FxHashMap<(usize, usize, bool), ActiveState> = FxHashMap::default();
436
437        // Active states: each contains (State, start_byte_pos, start_char_pos)
438        let mut active: Vec<ActiveState> = Vec::new();
439
440        // Process text character by character
441        for (char_idx, &(byte_pos, text_char)) in text_chars.iter().enumerate() {
442            let text_char = if self.case_insensitive {
443                text_char.to_lowercase().next().unwrap_or(text_char)
444            } else {
445                text_char
446            };
447
448            // Get next text char for transposition detection
449            let next_text_char = text_chars.get(char_idx + 1).map(|&(_, c)| {
450                if self.case_insensitive {
451                    c.to_lowercase().next().unwrap_or(c)
452                } else {
453                    c
454                }
455            });
456
457            // Start a new potential match at this position
458            let initial = ActiveState {
459                state: State::new(0),
460                start_byte: byte_pos,
461                start_char: char_idx,
462            };
463            active.push(initial);
464
465            // Add epsilon closure for new state - run directly on active
466            {
467                // Track where new states start
468                let start_idx = active.len() - 1;
469                // Populate seen_set with existing states
470                seen_set.clear();
471                for a in &active {
472                    seen_set.insert((a.state, a.start_byte));
473                }
474                // Run epsilon closure starting from the new state
475                self.epsilon_closure_from(&mut active, start_idx, &mut seen_set);
476            }
477
478            // Compute next states
479            let mut next_active: Vec<ActiveState> = Vec::new();
480
481            for active_state in &active {
482                let state = active_state.state;
483
484                // If this state is marked skip_next (from a transposition), just clear the flag
485                if state.skip_next {
486                    let mut continued = state;
487                    continued.skip_next = false;
488                    next_active.push(ActiveState {
489                        state: continued,
490                        start_byte: active_state.start_byte,
491                        start_char: active_state.start_char,
492                    });
493                    continue;
494                }
495
496                if state.pos < self.pattern_chars.len() {
497                    let pattern_char = self.pattern_chars[state.pos];
498
499                    // Match transition
500                    if text_char == pattern_char {
501                        let new_state = state.advance_match();
502                        next_active.push(ActiveState {
503                            state: new_state,
504                            start_byte: active_state.start_byte,
505                            start_char: active_state.start_char,
506                        });
507                    }
508
509                    // Substitution transition
510                    if text_char != pattern_char && self.can_substitute(&state) {
511                        let new_state = state.advance_substitution();
512                        next_active.push(ActiveState {
513                            state: new_state,
514                            start_byte: active_state.start_byte,
515                            start_char: active_state.start_char,
516                        });
517                    }
518
519                    // Transposition transition: pattern[pos:pos+2] matches text[idx:idx+2] in reverse
520                    // e.g., pattern "ab" matches text "ba"
521                    if let Some(next_char) = next_text_char
522                        && state.pos + 1 < self.pattern_chars.len()
523                        && self.can_swap(&state)
524                    {
525                        let next_pattern_char = self.pattern_chars[state.pos + 1];
526                        // Check if pattern[pos]=next_text and pattern[pos+1]=current_text (swapped)
527                        if pattern_char == next_char
528                            && next_pattern_char == text_char
529                            && pattern_char != next_pattern_char
530                        {
531                            let new_state = state.advance_swap();
532                            next_active.push(ActiveState {
533                                state: new_state,
534                                start_byte: active_state.start_byte,
535                                start_char: active_state.start_char,
536                            });
537                        }
538                    }
539                }
540
541                // Insertion transition (consume text char, stay at same pattern pos)
542                if self.can_insert(&state) {
543                    let new_state = state.advance_insertion();
544                    next_active.push(ActiveState {
545                        state: new_state,
546                        start_byte: active_state.start_byte,
547                        start_char: active_state.start_char,
548                    });
549                }
550            }
551
552            // Add epsilon closure (deletions)
553            seen_set.clear();
554            self.epsilon_closure(&mut next_active, &mut seen_set);
555
556            // Deduplicate: keep best state for each (pos, start_byte, skip_next)
557            deduped.clear();
558            for active_state in next_active {
559                // Early pruning: skip states that can't reach threshold
560                if self.max_possible_similarity(&active_state.state) < threshold {
561                    continue;
562                }
563
564                let key = (
565                    active_state.state.pos,
566                    active_state.start_byte,
567                    active_state.state.skip_next,
568                );
569                deduped
570                    .entry(key)
571                    .and_modify(|existing| {
572                        // Keep state with fewer edits
573                        if active_state.state.total_edits() < existing.state.total_edits() {
574                            *existing = active_state.clone();
575                        }
576                    })
577                    .or_insert(active_state);
578            }
579
580            active.clear();
581            active.extend(deduped.values().cloned());
582
583            // Beam pruning: if too many states, keep only the best ones
584            self.beam_prune(&mut active);
585
586            // Check for accepting states (reached end of pattern)
587            let end_byte = text_chars.get(char_idx + 1).map_or(text.len(), |(b, _)| *b);
588
589            for active_state in &active {
590                if active_state.state.pos == self.pattern_chars.len()
591                    && !active_state.state.skip_next
592                {
593                    let sim = self.calc_similarity(&active_state.state);
594                    if sim >= threshold {
595                        let key = (active_state.start_byte, end_byte);
596                        let m = DamLevMatch {
597                            start: active_state.start_byte,
598                            end: end_byte,
599                            insertions: active_state.state.insertions,
600                            deletions: active_state.state.deletions,
601                            substitutions: active_state.state.substitutions,
602                            swaps: active_state.state.swaps,
603                            similarity: sim,
604                        };
605
606                        matches
607                            .entry(key)
608                            .and_modify(|existing| {
609                                if m.similarity > existing.similarity {
610                                    *existing = m.clone();
611                                }
612                            })
613                            .or_insert(m);
614                    }
615                }
616            }
617
618            // Remove states that reached the end of pattern (they've been recorded)
619            // Also prune states that have fallen too far behind (can't possibly match)
620            let max_window = self.pattern_chars.len() + self.limits.max_edits as usize;
621            active.retain(|a| {
622                (a.state.pos < self.pattern_chars.len() || a.state.skip_next)
623                    && (char_idx + 1 - a.start_char) <= max_window
624            });
625        }
626
627        // Handle remaining states that might match with deletions at the end
628        for active_state in &active {
629            let state = active_state.state;
630            if state.skip_next {
631                continue; // Can't complete if waiting for next char
632            }
633            let remaining = (self.pattern_chars.len() - state.pos) as u8;
634
635            // Can we delete all remaining pattern chars?
636            if remaining <= self.limits.max_edits - state.total_edits() {
637                let dels_needed = remaining;
638                let total_dels = state.deletions + dels_needed;
639
640                if self
641                    .limits
642                    .max_deletions
643                    .is_none_or(|max| total_dels <= max)
644                {
645                    let final_state = State {
646                        pos: self.pattern_chars.len(),
647                        deletions: total_dels,
648                        ..state
649                    };
650
651                    let sim = self.calc_similarity(&final_state);
652                    if sim >= threshold {
653                        let key = (active_state.start_byte, text.len());
654                        let m = DamLevMatch {
655                            start: active_state.start_byte,
656                            end: text.len(),
657                            insertions: final_state.insertions,
658                            deletions: final_state.deletions,
659                            substitutions: final_state.substitutions,
660                            swaps: final_state.swaps,
661                            similarity: sim,
662                        };
663
664                        matches
665                            .entry(key)
666                            .and_modify(|existing| {
667                                if m.similarity > existing.similarity {
668                                    *existing = m.clone();
669                                }
670                            })
671                            .or_insert(m);
672                    }
673                }
674            }
675        }
676
677        matches.into_values().collect()
678    }
679
680    /// Find up to `n` non-overlapping matches.
681    ///
682    /// Note: This is a simple implementation that finds all matches and takes the first `n`.
683    /// For large texts with many matches, consider using the Bitap-based matcher instead.
684    #[must_use]
685    pub fn find_n(&self, text: &str, threshold: f32, n: usize) -> Vec<DamLevMatch> {
686        if n == 0 {
687            return Vec::new();
688        }
689
690        let mut all_matches = self.find_all(text, threshold);
691
692        // Sort by start position
693        all_matches.sort_by_key(|m| m.start);
694
695        // Take non-overlapping matches up to limit
696        let mut result = Vec::with_capacity(n.min(all_matches.len()));
697        let mut last_end = 0;
698
699        for m in all_matches {
700            if m.start >= last_end {
701                last_end = m.end;
702                result.push(m);
703                if result.len() >= n {
704                    break;
705                }
706            }
707        }
708
709        result
710    }
711
712    /// Find all matches, but only start new potential matches at candidate positions.
713    ///
714    /// This is an optimization: instead of starting a new match at every character,
715    /// we only start at positions identified by a prefilter.
716    #[must_use]
717    pub fn find_all_with_candidates(
718        &self,
719        text: &str,
720        threshold: f32,
721        candidates: &FxHashSet<usize>,
722    ) -> Vec<DamLevMatch> {
723        let mut buffers = SearchBuffers::new();
724        self.find_all_with_candidates_buffered(text, threshold, candidates, &mut buffers)
725    }
726
727    /// Find all matches using pre-allocated buffers to avoid allocations.
728    pub fn find_all_with_candidates_buffered(
729        &self,
730        text: &str,
731        threshold: f32,
732        candidates: &FxHashSet<usize>,
733        buffers: &mut SearchBuffers,
734    ) -> Vec<DamLevMatch> {
735        // Clear buffers for reuse
736        buffers.clear();
737
738        if self.pattern_chars.is_empty() {
739            return vec![DamLevMatch {
740                start: 0,
741                end: 0,
742                insertions: 0,
743                deletions: 0,
744                substitutions: 0,
745                swaps: 0,
746                similarity: 1.0,
747            }];
748        }
749
750        // Stream through chars without collecting - use peekable for lookahead
751        let mut char_iter = text.char_indices().peekable();
752
753        let mut char_idx = 0usize;
754        while let Some((byte_pos, raw_char)) = char_iter.next() {
755            let text_char = if self.case_insensitive {
756                raw_char.to_lowercase().next().unwrap_or(raw_char)
757            } else {
758                raw_char
759            };
760
761            // Peek next char for transposition detection and end_byte calculation
762            let next_info = char_iter.peek().map(|&(next_byte, next_char)| {
763                let c = if self.case_insensitive {
764                    next_char.to_lowercase().next().unwrap_or(next_char)
765                } else {
766                    next_char
767                };
768                (next_byte, c)
769            });
770            let next_text_char = next_info.map(|(_, c)| c);
771            let end_byte = next_info.map_or(text.len(), |(b, _)| b);
772
773            // Only start a new potential match if this is a candidate position
774            if candidates.contains(&byte_pos) {
775                let initial = ActiveState {
776                    state: State::new(0),
777                    start_byte: byte_pos,
778                    start_char: char_idx,
779                };
780                buffers.active.push(initial);
781
782                // Add epsilon closure for new state - run directly on active
783                {
784                    let start_idx = buffers.active.len() - 1;
785                    buffers.seen_set.clear();
786                    for a in &buffers.active {
787                        buffers.seen_set.insert((a.state, a.start_byte));
788                    }
789                    self.epsilon_closure_from(
790                        &mut buffers.active,
791                        start_idx,
792                        &mut buffers.seen_set,
793                    );
794                }
795            }
796
797            // Process active states - reuse next_active buffer
798            buffers.next_active.clear();
799
800            for active_state in &buffers.active {
801                let state = active_state.state;
802
803                // If this state is marked skip_next (from a transposition), just clear the flag
804                if state.skip_next {
805                    let mut continued = state;
806                    continued.skip_next = false;
807                    buffers.next_active.push(ActiveState {
808                        state: continued,
809                        start_byte: active_state.start_byte,
810                        start_char: active_state.start_char,
811                    });
812                    continue;
813                }
814
815                if state.pos < self.pattern_chars.len() {
816                    let pattern_char = self.pattern_chars[state.pos];
817
818                    if text_char == pattern_char {
819                        buffers.next_active.push(ActiveState {
820                            state: state.advance_match(),
821                            start_byte: active_state.start_byte,
822                            start_char: active_state.start_char,
823                        });
824                    }
825
826                    if text_char != pattern_char && self.can_substitute(&state) {
827                        buffers.next_active.push(ActiveState {
828                            state: state.advance_substitution(),
829                            start_byte: active_state.start_byte,
830                            start_char: active_state.start_char,
831                        });
832                    }
833
834                    // Transposition transition
835                    if let Some(next_char) = next_text_char
836                        && state.pos + 1 < self.pattern_chars.len()
837                        && self.can_swap(&state)
838                    {
839                        let next_pattern_char = self.pattern_chars[state.pos + 1];
840                        if pattern_char == next_char
841                            && next_pattern_char == text_char
842                            && pattern_char != next_pattern_char
843                        {
844                            buffers.next_active.push(ActiveState {
845                                state: state.advance_swap(),
846                                start_byte: active_state.start_byte,
847                                start_char: active_state.start_char,
848                            });
849                        }
850                    }
851                }
852
853                if self.can_insert(&state) {
854                    buffers.next_active.push(ActiveState {
855                        state: state.advance_insertion(),
856                        start_byte: active_state.start_byte,
857                        start_char: active_state.start_char,
858                    });
859                }
860            }
861
862            buffers.seen_set.clear();
863            self.epsilon_closure(&mut buffers.next_active, &mut buffers.seen_set);
864
865            // Deduplicate with early pruning
866            buffers.deduped.clear();
867            for active_state in buffers.next_active.drain(..) {
868                // Early pruning: skip states that can't reach threshold
869                if self.max_possible_similarity(&active_state.state) < threshold {
870                    continue;
871                }
872
873                let key = (
874                    active_state.state.pos,
875                    active_state.start_byte,
876                    active_state.state.skip_next,
877                );
878                buffers
879                    .deduped
880                    .entry(key)
881                    .and_modify(|existing| {
882                        if active_state.state.total_edits() < existing.state.total_edits() {
883                            *existing = active_state.clone();
884                        }
885                    })
886                    .or_insert(active_state);
887            }
888
889            buffers.active.clear();
890            buffers.active.extend(buffers.deduped.values().cloned());
891
892            // Beam pruning: limit state explosion
893            self.beam_prune(&mut buffers.active);
894
895            // Check for accepting states (end_byte already computed above from peek)
896            for active_state in &buffers.active {
897                if active_state.state.pos == self.pattern_chars.len()
898                    && !active_state.state.skip_next
899                {
900                    let sim = self.calc_similarity(&active_state.state);
901                    if sim >= threshold {
902                        let key = (active_state.start_byte, end_byte);
903                        let m = DamLevMatch {
904                            start: active_state.start_byte,
905                            end: end_byte,
906                            insertions: active_state.state.insertions,
907                            deletions: active_state.state.deletions,
908                            substitutions: active_state.state.substitutions,
909                            swaps: active_state.state.swaps,
910                            similarity: sim,
911                        };
912
913                        buffers
914                            .matches
915                            .entry(key)
916                            .and_modify(|existing| {
917                                if m.similarity > existing.similarity {
918                                    *existing = m.clone();
919                                }
920                            })
921                            .or_insert(m);
922                    }
923                }
924            }
925
926            // Prune states
927            let max_window = self.pattern_chars.len() + self.limits.max_edits as usize;
928            buffers.active.retain(|a| {
929                (a.state.pos < self.pattern_chars.len() || a.state.skip_next)
930                    && (char_idx + 1 - a.start_char) <= max_window
931            });
932
933            char_idx += 1;
934        }
935
936        // Handle remaining states at end of text
937        for active_state in &buffers.active {
938            let state = active_state.state;
939            if state.skip_next {
940                continue;
941            }
942            let remaining = self.pattern_chars.len() - state.pos;
943
944            if remaining as u8 <= self.limits.max_edits - state.total_edits() {
945                let dels_needed = remaining as u8;
946                let total_dels = state.deletions + dels_needed;
947
948                if self
949                    .limits
950                    .max_deletions
951                    .is_none_or(|max| total_dels <= max)
952                {
953                    let final_state = State {
954                        pos: self.pattern_chars.len(),
955                        deletions: total_dels,
956                        ..state
957                    };
958
959                    let sim = self.calc_similarity(&final_state);
960                    if sim >= threshold {
961                        let key = (active_state.start_byte, text.len());
962                        let m = DamLevMatch {
963                            start: active_state.start_byte,
964                            end: text.len(),
965                            insertions: final_state.insertions,
966                            deletions: final_state.deletions,
967                            substitutions: final_state.substitutions,
968                            swaps: final_state.swaps,
969                            similarity: sim,
970                        };
971
972                        buffers
973                            .matches
974                            .entry(key)
975                            .and_modify(|existing| {
976                                if m.similarity > existing.similarity {
977                                    *existing = m.clone();
978                                }
979                            })
980                            .or_insert(m);
981                    }
982                }
983            }
984        }
985
986        buffers.matches.drain().map(|(_, v)| v).collect()
987    }
988
989    /// Find the first match, stopping as soon as one is found.
990    ///
991    /// This is much faster than `find_all` when we only need the first match,
992    /// especially when the match is found early in the text.
993    #[must_use]
994    pub fn find_first_with_candidates(
995        &self,
996        text: &str,
997        threshold: f32,
998        candidates: &FxHashSet<usize>,
999    ) -> Option<DamLevMatch> {
1000        if self.pattern_chars.is_empty() {
1001            return Some(DamLevMatch {
1002                start: 0,
1003                end: 0,
1004                insertions: 0,
1005                deletions: 0,
1006                substitutions: 0,
1007                swaps: 0,
1008                similarity: 1.0,
1009            });
1010        }
1011
1012        let text_chars: Vec<(usize, char)> = text.char_indices().collect();
1013        let mut active: Vec<ActiveState> = Vec::new();
1014
1015        // Reusable FxHashSet to avoid allocation per character
1016        let mut seen_set: FxHashSet<(State, usize)> = FxHashSet::default();
1017
1018        // Reusable FxHashMap for deduplication
1019        let mut deduped: FxHashMap<(usize, usize, bool), ActiveState> = FxHashMap::default();
1020
1021        for (char_idx, &(byte_pos, text_char)) in text_chars.iter().enumerate() {
1022            let text_char = if self.case_insensitive {
1023                text_char.to_lowercase().next().unwrap_or(text_char)
1024            } else {
1025                text_char
1026            };
1027
1028            // Get next text char for transposition detection
1029            let next_text_char = text_chars.get(char_idx + 1).map(|&(_, c)| {
1030                if self.case_insensitive {
1031                    c.to_lowercase().next().unwrap_or(c)
1032                } else {
1033                    c
1034                }
1035            });
1036
1037            // Only start a new potential match if this is a candidate position
1038            if candidates.contains(&byte_pos) {
1039                let initial = ActiveState {
1040                    state: State::new(0),
1041                    start_byte: byte_pos,
1042                    start_char: char_idx,
1043                };
1044                active.push(initial);
1045
1046                // Add epsilon closure for new state - in-place from last element
1047                let start_idx = active.len() - 1;
1048                seen_set.clear();
1049                for a in &active {
1050                    seen_set.insert((a.state, a.start_byte));
1051                }
1052                self.epsilon_closure_from(&mut active, start_idx, &mut seen_set);
1053            }
1054
1055            // If no active states, continue to next char
1056            if active.is_empty() {
1057                continue;
1058            }
1059
1060            // Process active states
1061            let mut next_active: Vec<ActiveState> = Vec::new();
1062
1063            for active_state in &active {
1064                let state = active_state.state;
1065
1066                // If this state is marked skip_next (from a transposition), just clear the flag
1067                if state.skip_next {
1068                    let mut continued = state;
1069                    continued.skip_next = false;
1070                    next_active.push(ActiveState {
1071                        state: continued,
1072                        start_byte: active_state.start_byte,
1073                        start_char: active_state.start_char,
1074                    });
1075                    continue;
1076                }
1077
1078                if state.pos < self.pattern_chars.len() {
1079                    let pattern_char = self.pattern_chars[state.pos];
1080
1081                    if text_char == pattern_char {
1082                        next_active.push(ActiveState {
1083                            state: state.advance_match(),
1084                            start_byte: active_state.start_byte,
1085                            start_char: active_state.start_char,
1086                        });
1087                    }
1088
1089                    if text_char != pattern_char && self.can_substitute(&state) {
1090                        next_active.push(ActiveState {
1091                            state: state.advance_substitution(),
1092                            start_byte: active_state.start_byte,
1093                            start_char: active_state.start_char,
1094                        });
1095                    }
1096
1097                    // Transposition transition
1098                    if let Some(next_char) = next_text_char
1099                        && state.pos + 1 < self.pattern_chars.len()
1100                        && self.can_swap(&state)
1101                    {
1102                        let next_pattern_char = self.pattern_chars[state.pos + 1];
1103                        if pattern_char == next_char
1104                            && next_pattern_char == text_char
1105                            && pattern_char != next_pattern_char
1106                        {
1107                            next_active.push(ActiveState {
1108                                state: state.advance_swap(),
1109                                start_byte: active_state.start_byte,
1110                                start_char: active_state.start_char,
1111                            });
1112                        }
1113                    }
1114                }
1115
1116                if self.can_insert(&state) {
1117                    next_active.push(ActiveState {
1118                        state: state.advance_insertion(),
1119                        start_byte: active_state.start_byte,
1120                        start_char: active_state.start_char,
1121                    });
1122                }
1123            }
1124
1125            seen_set.clear();
1126            self.epsilon_closure(&mut next_active, &mut seen_set);
1127
1128            // Deduplicate (keep best state) with early pruning
1129            deduped.clear();
1130            for active_state in next_active {
1131                // Early pruning: skip states that can't reach threshold
1132                if self.max_possible_similarity(&active_state.state) < threshold {
1133                    continue;
1134                }
1135
1136                let key = (
1137                    active_state.state.pos,
1138                    active_state.start_byte,
1139                    active_state.state.skip_next,
1140                );
1141                deduped
1142                    .entry(key)
1143                    .and_modify(|existing| {
1144                        if active_state.state.total_edits() < existing.state.total_edits() {
1145                            *existing = active_state.clone();
1146                        }
1147                    })
1148                    .or_insert(active_state);
1149            }
1150
1151            active.clear();
1152            active.extend(deduped.values().cloned());
1153
1154            // Beam pruning: limit state explosion
1155            self.beam_prune(&mut active);
1156
1157            // Check for accepting states - return best match at this position
1158            let end_byte = text_chars.get(char_idx + 1).map_or(text.len(), |(b, _)| *b);
1159
1160            // Find the best accepting state (highest similarity = lowest edits)
1161            let mut best_match: Option<DamLevMatch> = None;
1162            for active_state in &active {
1163                if active_state.state.pos == self.pattern_chars.len()
1164                    && !active_state.state.skip_next
1165                {
1166                    let sim = self.calc_similarity(&active_state.state);
1167                    if sim >= threshold {
1168                        let m = DamLevMatch {
1169                            start: active_state.start_byte,
1170                            end: end_byte,
1171                            insertions: active_state.state.insertions,
1172                            deletions: active_state.state.deletions,
1173                            substitutions: active_state.state.substitutions,
1174                            swaps: active_state.state.swaps,
1175                            similarity: sim,
1176                        };
1177                        if best_match.as_ref().is_none_or(|best| sim > best.similarity) {
1178                            best_match = Some(m);
1179                        }
1180                    }
1181                }
1182            }
1183            if best_match.is_some() {
1184                return best_match;
1185            }
1186
1187            // Prune states
1188            let max_window = self.pattern_chars.len() + self.limits.max_edits as usize;
1189            active.retain(|a| {
1190                (a.state.pos < self.pattern_chars.len() || a.state.skip_next)
1191                    && (char_idx + 1 - a.start_char) <= max_window
1192            });
1193        }
1194
1195        // Check remaining states at end of text
1196        for active_state in &active {
1197            let state = active_state.state;
1198            if state.skip_next {
1199                continue;
1200            }
1201            let remaining = self.pattern_chars.len() - state.pos;
1202
1203            if remaining as u8 <= self.limits.max_edits - state.total_edits() {
1204                let dels_needed = remaining as u8;
1205                let total_dels = state.deletions + dels_needed;
1206
1207                if self
1208                    .limits
1209                    .max_deletions
1210                    .is_none_or(|max| total_dels <= max)
1211                {
1212                    let final_state = State {
1213                        pos: self.pattern_chars.len(),
1214                        deletions: total_dels,
1215                        ..state
1216                    };
1217
1218                    let sim = self.calc_similarity(&final_state);
1219                    if sim >= threshold {
1220                        return Some(DamLevMatch {
1221                            start: active_state.start_byte,
1222                            end: text.len(),
1223                            insertions: final_state.insertions,
1224                            deletions: final_state.deletions,
1225                            substitutions: final_state.substitutions,
1226                            swaps: final_state.swaps,
1227                            similarity: sim,
1228                        });
1229                    }
1230                }
1231            }
1232        }
1233
1234        None
1235    }
1236}
1237
1238#[cfg(test)]
1239mod tests {
1240    use super::*;
1241
1242    #[test]
1243    fn test_exact_match() {
1244        let nfa = DamLevNfa::new("hello", EditLimits::new(0), false);
1245        let matches = nfa.find_all("hello world", 0.8);
1246
1247        assert_eq!(matches.len(), 1);
1248        assert_eq!(matches[0].start, 0);
1249        assert_eq!(matches[0].end, 5);
1250        assert_eq!(matches[0].total_edits(), 0);
1251    }
1252
1253    #[test]
1254    fn test_one_substitution() {
1255        let nfa = DamLevNfa::new("hello", EditLimits::new(1), false);
1256        let matches = nfa.find_all("hallo world", 0.5);
1257
1258        assert!(
1259            matches
1260                .iter()
1261                .any(|m| m.start == 0 && m.end == 5 && m.substitutions == 1)
1262        );
1263    }
1264
1265    #[test]
1266    fn test_one_insertion() {
1267        let nfa = DamLevNfa::new("hello", EditLimits::new(1), false);
1268        let matches = nfa.find_all("heello world", 0.5);
1269
1270        assert!(matches.iter().any(|m| m.start == 0 && m.insertions == 1));
1271    }
1272
1273    #[test]
1274    fn test_one_deletion() {
1275        let nfa = DamLevNfa::new("hello", EditLimits::new(1), false);
1276        let matches = nfa.find_all("helo world", 0.5);
1277
1278        assert!(matches.iter().any(|m| m.start == 0 && m.deletions == 1));
1279    }
1280
1281    #[test]
1282    fn test_case_insensitive() {
1283        let nfa = DamLevNfa::new("hello", EditLimits::new(0), true);
1284        let matches = nfa.find_all("HELLO world", 0.8);
1285
1286        assert_eq!(matches.len(), 1);
1287        assert_eq!(matches[0].start, 0);
1288        assert_eq!(matches[0].end, 5);
1289    }
1290
1291    #[test]
1292    fn test_multiple_matches() {
1293        let nfa = DamLevNfa::new("ab", EditLimits::new(0), false);
1294        let matches = nfa.find_all("ab ab ab", 0.8);
1295
1296        assert_eq!(matches.len(), 3);
1297    }
1298
1299    #[test]
1300    fn test_no_match() {
1301        let nfa = DamLevNfa::new("xyz", EditLimits::new(1), false);
1302        let matches = nfa.find_all("hello world", 0.8);
1303
1304        assert!(matches.is_empty());
1305    }
1306
1307    // --- Transposition tests ---
1308
1309    #[test]
1310    fn test_transposition_simple() {
1311        // Pattern "ab" should match "ba" with 1 swap
1312        let nfa = DamLevNfa::new("ab", EditLimits::new(1), false);
1313        let matches = nfa.find_all("ba", 0.0);
1314
1315        assert!(!matches.is_empty(), "Should find match for transposition");
1316        // Find the swap match (there might be other matches like deletion-only)
1317        let swap_match = matches.iter().find(|m| m.swaps == 1);
1318        assert!(
1319            swap_match.is_some(),
1320            "Should find match with 1 swap, got: {matches:?}"
1321        );
1322        let m = swap_match.unwrap();
1323        assert_eq!(m.substitutions, 0, "Should have 0 substitutions");
1324        assert_eq!(m.total_edits(), 1, "Total edits should be 1");
1325    }
1326
1327    #[test]
1328    fn test_transposition_in_word() {
1329        // Pattern "teh" should match "the" with 1 swap (common typo)
1330        let nfa = DamLevNfa::new("the", EditLimits::new(1), false);
1331        let matches = nfa.find_all("teh", 0.0);
1332
1333        assert!(!matches.is_empty(), "Should find match for 'teh' -> 'the'");
1334        let m = matches.iter().find(|m| m.swaps == 1);
1335        assert!(m.is_some(), "Should find match with 1 swap");
1336    }
1337
1338    #[test]
1339    fn test_transposition_longer_word() {
1340        // Pattern "receive" should match "recieve" with 1 swap (common typo)
1341        let nfa = DamLevNfa::new("receive", EditLimits::new(1), false);
1342        let matches = nfa.find_all("recieve", 0.0);
1343
1344        assert!(
1345            !matches.is_empty(),
1346            "Should find match for 'recieve' -> 'receive'"
1347        );
1348        // Check that we found a match with 1 swap (i and e swapped)
1349        let swap_match = matches.iter().find(|m| m.swaps == 1);
1350        assert!(
1351            swap_match.is_some(),
1352            "Should find match with 1 swap, got: {matches:?}"
1353        );
1354    }
1355
1356    #[test]
1357    fn test_transposition_with_other_edits() {
1358        // "abcd" matching "badc" - two transpositions
1359        let nfa = DamLevNfa::new("abcd", EditLimits::new(2), false);
1360        let matches = nfa.find_all("badc", 0.0);
1361
1362        assert!(!matches.is_empty(), "Should find match with 2 swaps");
1363        // Should find a match with 2 swaps
1364        let m = matches.iter().find(|m| m.total_edits() == 2);
1365        assert!(m.is_some(), "Should find match with 2 total edits");
1366    }
1367
1368    #[test]
1369    fn test_transposition_not_same_char() {
1370        // Transposition should only work for different adjacent characters
1371        // "aa" cannot be swapped to "aa" (same characters)
1372        let nfa = DamLevNfa::new("aa", EditLimits::new(1), false);
1373        let matches = nfa.find_all("aa", 0.0);
1374
1375        // Should find exact match with 0 edits
1376        assert!(!matches.is_empty());
1377        let exact = matches.iter().find(|m| m.total_edits() == 0);
1378        assert!(exact.is_some(), "Should find exact match");
1379    }
1380
1381    #[test]
1382    fn test_transposition_vs_substitution() {
1383        // With swaps enabled, "ab" -> "ba" should find a swap match with 1 edit
1384        let nfa = DamLevNfa::new("ab", EditLimits::new(2), false);
1385        let matches = nfa.find_all("ba", 0.0);
1386
1387        // Should find a swap match (1 edit) among the results
1388        let swap_match = matches
1389            .iter()
1390            .find(|m| m.swaps == 1 && m.total_edits() == 1);
1391        assert!(
1392            swap_match.is_some(),
1393            "Should find match with 1 swap, got: {matches:?}"
1394        );
1395
1396        // The swap match should have better similarity than substitution matches
1397        let best_sim = matches
1398            .iter()
1399            .map(|m| m.similarity)
1400            .max_by(|a, b| a.partial_cmp(b).unwrap());
1401        let swap_sim = swap_match.unwrap().similarity;
1402        assert!(
1403            swap_sim >= best_sim.unwrap() - 0.01,
1404            "Swap match should have high similarity"
1405        );
1406    }
1407
1408    #[test]
1409    fn test_transposition_case_insensitive() {
1410        // Case insensitive transposition
1411        let nfa = DamLevNfa::new("AB", EditLimits::new(1), true);
1412        let matches = nfa.find_all("ba", 0.0);
1413
1414        assert!(
1415            !matches.is_empty(),
1416            "Should find case-insensitive transposition"
1417        );
1418        let m = matches.iter().find(|m| m.swaps == 1);
1419        assert!(m.is_some(), "Should find match with 1 swap");
1420    }
1421}
1422
1423#[test]
1424fn test_find_with_candidates() {
1425    let nfa = DamLevNfa::new("quik", EditLimits::new(1), false);
1426
1427    // Test 1: All positions as candidates (like find_all)
1428    let text = "The quick brown fox";
1429    let all_positions: FxHashSet<usize> = text.char_indices().map(|(i, _)| i).collect();
1430    let matches = nfa.find_all_with_candidates(text, 0.8, &all_positions);
1431    println!("All positions candidates: {matches:?}");
1432
1433    // Test 2: Only positions 3, 4 as candidates
1434    let limited: FxHashSet<usize> = vec![3, 4].into_iter().collect();
1435    let matches2 = nfa.find_all_with_candidates(text, 0.8, &limited);
1436    println!("Limited candidates (3,4): {matches2:?}");
1437
1438    // Test 3: Compare with find_all
1439    let matches3 = nfa.find_all(text, 0.8);
1440    println!("find_all: {matches3:?}");
1441
1442    assert!(!matches.is_empty(), "Should find match with all positions");
1443    assert!(
1444        !matches2.is_empty(),
1445        "Should find match with limited positions"
1446    );
1447}