Skip to main content

fuzzy_regex/engine/
bitap.rs

1//! Bitap algorithm for fast fuzzy string matching.
2//!
3//! The Bitap algorithm (also known as shift-or or shift-and) uses bitwise
4//! operations to perform fuzzy matching very efficiently for short patterns
5//! (up to 64 characters).
6//!
7//! Time complexity: O(n × k) where n = text length, k = max edits
8//! Each step involves only a few bitwise operations.
9
10#![allow(
11    clippy::needless_range_loop,
12    clippy::items_after_statements,
13    clippy::too_many_lines,
14    clippy::inline_always
15)]
16
17use super::damlev::{DamLevMatch, EditLimits};
18use super::hash::FxHashMap;
19
20/// Fast UTF-8 character decoder - avoids `str::from_utf8` + `chars().next()` overhead.
21/// Returns (char, `byte_length`).
22///
23/// # Safety
24/// Assumes input is valid UTF-8. Invalid sequences return replacement char.
25#[inline(always)]
26fn decode_utf8_char_fast(bytes: &[u8], pos: usize) -> (char, usize) {
27    let b0 = bytes[pos];
28
29    if b0 < 128 {
30        // ASCII: single byte
31        (b0 as char, 1)
32    } else if b0 < 224 {
33        // 2-byte UTF-8 (Latin Extended, Cyrillic, etc.)
34        if pos + 1 < bytes.len() {
35            let b1 = bytes[pos + 1];
36            let codepoint = ((u32::from(b0) & 0x1F) << 6) | (u32::from(b1) & 0x3F);
37            // SAFETY: Valid 2-byte UTF-8 always produces valid codepoint in 0x80-0x7FF range
38            (unsafe { char::from_u32_unchecked(codepoint) }, 2)
39        } else {
40            ('\u{FFFD}', 1)
41        }
42    } else if b0 < 240 {
43        // 3-byte UTF-8 (CJK, etc.)
44        if pos + 2 < bytes.len() {
45            let b1 = bytes[pos + 1];
46            let b2 = bytes[pos + 2];
47            let codepoint = ((u32::from(b0) & 0x0F) << 12)
48                | ((u32::from(b1) & 0x3F) << 6)
49                | (u32::from(b2) & 0x3F);
50            // SAFETY: Valid 3-byte UTF-8 produces valid codepoint (excluding surrogates handled by validation)
51            (unsafe { char::from_u32_unchecked(codepoint) }, 3)
52        } else {
53            ('\u{FFFD}', 1)
54        }
55    } else {
56        // 4-byte UTF-8 (Emoji, etc.)
57        if pos + 3 < bytes.len() {
58            let b1 = bytes[pos + 1];
59            let b2 = bytes[pos + 2];
60            let b3 = bytes[pos + 3];
61            let codepoint = ((u32::from(b0) & 0x07) << 18)
62                | ((u32::from(b1) & 0x3F) << 12)
63                | ((u32::from(b2) & 0x3F) << 6)
64                | (u32::from(b3) & 0x3F);
65            // SAFETY: Valid 4-byte UTF-8 always produces valid codepoint
66            (unsafe { char::from_u32_unchecked(codepoint) }, 4)
67        } else {
68            ('\u{FFFD}', 1)
69        }
70    }
71}
72
73/// Maximum pattern length supported by Bitap (using u64 bitmasks).
74pub const MAX_PATTERN_LEN: usize = 64;
75
76/// Bitap matcher for fuzzy string matching.
77#[derive(Debug)]
78pub struct BitapMatcher {
79    pattern: String,
80    pattern_chars: Vec<char>,
81    pattern_len: usize,
82    limits: EditLimits,
83    case_insensitive: bool,
84    /// Character masks: for each character, a bitmask where bit i is 0
85    /// if pattern[i] == character.
86    char_masks: FxHashMap<char, u64>,
87    /// ASCII byte masks for O(1) lookup (all 1s = no match).
88    byte_masks: [u64; 128],
89    /// Boyer-Moore style skip table: how far to skip when a byte is NOT in pattern.
90    /// For bytes in pattern: 0 (can't skip). For bytes not in pattern: `pattern_len` - `max_edits`.
91    skip_table: [u8; 256],
92    /// Whether the pattern is pure ASCII.
93    is_ascii: bool,
94    /// Mask with 1 in the position of the last pattern character.
95    accept_mask: u64,
96    /// Unicode block masks for O(1) lookup of non-ASCII characters.
97    /// If all pattern chars are in the same 256-codepoint block, we use this instead of `HashMap`.
98    /// `block_base` is the start codepoint (e.g., 0x0400 for Cyrillic).
99    unicode_block_base: u32,
100    unicode_block_masks: Option<Box<[u64; 256]>>,
101}
102
103impl BitapMatcher {
104    /// Create a new Bitap matcher.
105    ///
106    /// Returns None if the pattern is too long (> 64 chars).
107    pub fn new(pattern: &str, limits: EditLimits, case_insensitive: bool) -> Option<Self> {
108        let pattern_chars: Vec<char> = if case_insensitive {
109            pattern.to_lowercase().chars().collect()
110        } else {
111            pattern.chars().collect()
112        };
113
114        if pattern_chars.len() > MAX_PATTERN_LEN || pattern_chars.is_empty() {
115            return None;
116        }
117
118        let pattern_len = pattern_chars.len();
119        let is_ascii = pattern_chars.iter().all(char::is_ascii);
120
121        // Build character masks
122        // For each character in the alphabet, create a bitmask where bit i is 0
123        // if pattern[i] matches the character, 1 otherwise.
124        // We use the "shift-or" variant where 0 means match.
125        let mut char_masks: FxHashMap<char, u64> = FxHashMap::default();
126        let mut byte_masks = [!0u64; 128]; // All 1s = no match
127
128        for (i, &ch) in pattern_chars.iter().enumerate() {
129            // Set bit i to 0 for this character (start with all 1s, clear bit i)
130            let mask = char_masks.entry(ch).or_insert(!0u64);
131            *mask &= !(1u64 << i);
132
133            // Also update byte_masks for ASCII characters
134            if ch.is_ascii() {
135                let byte = ch as u8;
136                byte_masks[byte as usize] &= !(1u64 << i);
137                // Handle case insensitivity for ASCII
138                if case_insensitive {
139                    if byte.is_ascii_lowercase() {
140                        byte_masks[byte.to_ascii_uppercase() as usize] &= !(1u64 << i);
141                    } else if byte.is_ascii_uppercase() {
142                        byte_masks[byte.to_ascii_lowercase() as usize] &= !(1u64 << i);
143                    }
144                }
145            }
146        }
147
148        // Accept mask: 1 in position (pattern_len - 1)
149        let accept_mask = 1u64 << (pattern_len - 1);
150
151        // Build Boyer-Moore style skip table
152        // If a byte is not in the pattern, we can skip ahead when we see it
153        let max_edits = limits.max_edits as usize;
154        let skip_distance = pattern_len.saturating_sub(max_edits).max(1) as u8;
155        let mut skip_table = [skip_distance; 256];
156
157        // Bytes that ARE in the pattern can't be skipped
158        for &ch in &pattern_chars {
159            if ch.is_ascii() {
160                skip_table[ch as usize] = 0;
161                // Also mark case variants
162                if case_insensitive {
163                    if ch.is_ascii_lowercase() {
164                        skip_table[ch.to_ascii_uppercase() as usize] = 0;
165                    } else if ch.is_ascii_uppercase() {
166                        skip_table[ch.to_ascii_lowercase() as usize] = 0;
167                    }
168                }
169            }
170        }
171
172        // Build Unicode block lookup table for non-ASCII patterns.
173        // If all pattern chars fall within a single 256-codepoint block, we can use O(1) array lookup.
174        let (unicode_block_base, unicode_block_masks) =
175            Self::build_unicode_block_masks(&pattern_chars, &char_masks);
176
177        Some(BitapMatcher {
178            pattern: pattern.to_string(),
179            pattern_chars,
180            pattern_len,
181            limits,
182            case_insensitive,
183            char_masks,
184            byte_masks,
185            skip_table,
186            is_ascii,
187            accept_mask,
188            unicode_block_base,
189            unicode_block_masks,
190        })
191    }
192
193    /// Returns the original pattern string.
194    #[must_use]
195    pub fn pattern(&self) -> &str {
196        &self.pattern
197    }
198
199    /// Returns the pattern as a slice of characters.
200    #[must_use]
201    pub fn pattern_chars(&self) -> &[char] {
202        &self.pattern_chars
203    }
204
205    /// Build Unicode block lookup table for O(1) character mask access.
206    /// Returns (`block_base`, `Some(masks)`) if all non-ASCII chars are in a single 256-codepoint block.
207    fn build_unicode_block_masks(
208        pattern_chars: &[char],
209        char_masks: &FxHashMap<char, u64>,
210    ) -> (u32, Option<Box<[u64; 256]>>) {
211        // Find non-ASCII characters
212        let non_ascii: Vec<char> = pattern_chars
213            .iter()
214            .filter(|c| !c.is_ascii())
215            .copied()
216            .collect();
217
218        if non_ascii.is_empty() {
219            return (0, None);
220        }
221
222        // Check if all non-ASCII chars are in the same 256-codepoint block
223        let first_cp = non_ascii[0] as u32;
224        let block_base = first_cp & !0xFF; // Round down to block start (e.g., 0x0400 for Cyrillic)
225
226        let all_in_block = non_ascii.iter().all(|&ch| {
227            let cp = ch as u32;
228            (cp & !0xFF) == block_base
229        });
230
231        if !all_in_block {
232            return (0, None);
233        }
234
235        // Build the lookup table
236        let mut masks = Box::new([!0u64; 256]);
237        for (&ch, &mask) in char_masks {
238            let cp = ch as u32;
239            if (cp & !0xFF) == block_base {
240                let idx = (cp & 0xFF) as usize;
241                masks[idx] = mask;
242            }
243        }
244
245        (block_base, Some(masks))
246    }
247
248    /// Get character mask for a character (all 1s if not in pattern).
249    #[inline(always)]
250    fn get_mask(&self, ch: char) -> u64 {
251        let cp = ch as u32;
252
253        // Fast path: check Unicode block lookup table
254        if let Some(ref masks) = self.unicode_block_masks
255            && (cp & !0xFF) == self.unicode_block_base
256        {
257            return masks[(cp & 0xFF) as usize];
258        }
259
260        // Fallback to HashMap
261        *self.char_masks.get(&ch).unwrap_or(&!0u64)
262    }
263
264    /// Get mask directly from 2-byte UTF-8 sequence (avoids char decode).
265    /// Returns (mask, 2) if successful, or falls back to `decode_utf8_char_fast`.
266    #[inline(always)]
267    fn get_mask_2byte(&self, b0: u8, b1: u8) -> u64 {
268        if let Some(ref masks) = self.unicode_block_masks {
269            // Compute codepoint index directly from UTF-8 bytes
270            // For 2-byte UTF-8: codepoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F)
271            // Block check: (codepoint & !0xFF) == block_base
272            // Index: codepoint & 0xFF
273            let codepoint_low6 = (u32::from(b0) & 0x1F) << 6;
274            let codepoint = codepoint_low6 | (u32::from(b1) & 0x3F);
275            if (codepoint & !0xFF) == self.unicode_block_base {
276                return masks[(codepoint & 0xFF) as usize];
277            }
278        }
279        // Fallback: decode to char and lookup
280        let codepoint = ((u32::from(b0) & 0x1F) << 6) | (u32::from(b1) & 0x3F);
281        let ch = unsafe { char::from_u32_unchecked(codepoint) };
282        *self.char_masks.get(&ch).unwrap_or(&!0u64)
283    }
284
285    /// Get Boyer-Moore skip distance for a byte.
286    /// Returns 0 if byte is in pattern, otherwise returns skip distance.
287    #[inline(always)]
288    #[must_use]
289    pub fn get_skip(&self, byte: u8) -> usize {
290        self.skip_table[byte as usize] as usize
291    }
292
293    /// Find the next position worth checking using Boyer-Moore skipping.
294    /// Scans from `start` looking for a byte that's in the pattern.
295    /// Returns the position of the first pattern-relevant byte, or `text.len()` if none.
296    #[inline]
297    #[must_use]
298    pub fn find_next_candidate(&self, text: &[u8], start: usize) -> usize {
299        let mut pos = start;
300        while pos < text.len() {
301            let skip = self.skip_table[text[pos] as usize];
302            if skip == 0 {
303                return pos;
304            }
305            pos += skip as usize;
306        }
307        text.len()
308    }
309
310    /// Calculate similarity score.
311    fn calc_similarity(&self, edits: u8, insertions: u8, deletions: u8) -> f32 {
312        let pattern_len = self.pattern_len as f32;
313        if pattern_len == 0.0 {
314            return 1.0;
315        }
316
317        let edit_distance = f32::from(edits);
318        let matched_len = pattern_len + f32::from(insertions) - f32::from(deletions);
319        let max_len = pattern_len.max(matched_len).max(1.0);
320
321        (1.0 - edit_distance / max_len).max(0.0)
322    }
323
324    /// Myers' bit-vector algorithm for fast O(n) edit distance computation.
325    /// Returns the edit distance between pattern and text.
326    ///
327    /// This is much faster than full DP (O(n) vs O(m×n)) but doesn't give
328    /// breakdown of edit types. Use for fast verification.
329    #[inline]
330    fn compute_edit_distance_myers(&self, text_chars: &[char]) -> u8 {
331        let m = self.pattern_len;
332        let n = text_chars.len();
333
334        if m == 0 {
335            return n as u8;
336        }
337        if n == 0 {
338            return m as u8;
339        }
340
341        // Myers' algorithm using our precomputed masks
342        // Note: Our masks have bit=0 for match, which is the inverse of typical Myers
343        // We adapt by inverting the eq mask
344        let mut pv = !0u64; // positive vertical delta (all 1s)
345        let mut mv = 0u64; // negative vertical delta (all 0s)
346        let mut score = m as u8;
347
348        let mask = 1u64 << (m - 1);
349
350        for &text_char in text_chars {
351            // Get pattern equality mask (bit i is 1 if pattern[i] matches text_char)
352            // Our stored masks have 0 for match, so we invert
353            let eq = if text_char.is_ascii() {
354                !self.byte_masks[text_char as usize]
355            } else {
356                !self.get_mask(text_char)
357            };
358
359            let xv = eq | mv;
360            let xh = ((eq & pv).wrapping_add(pv)) ^ pv | eq;
361
362            let ph = mv | !(xh | pv);
363            let mh = pv & xh;
364
365            // Update score
366            if (ph & mask) != 0 {
367                score = score.saturating_add(1);
368            }
369            if (mh & mask) != 0 {
370                score = score.saturating_sub(1);
371            }
372
373            // Shift for next column
374            let ph_shift = (ph << 1) | 1;
375            let mh_shift = mh << 1;
376
377            pv = mh_shift | !(xv | ph_shift);
378            mv = ph_shift & xv;
379        }
380
381        score
382    }
383
384    /// Fast edit breakdown using Myers for distance + length heuristic for breakdown.
385    /// Returns (insertions, deletions, substitutions, swaps).
386    ///
387    /// This approximates the breakdown based on:
388    /// - Total distance from Myers (exact)
389    /// - Length difference (insertions - deletions = `text_len` - `pattern_len`)
390    ///
391    /// Transpositions are counted as substitutions in this fast version.
392    #[inline]
393    #[allow(dead_code)]
394    fn compute_edit_breakdown_fast(&self, text_chars: &[char]) -> (u8, u8, u8, u8) {
395        let m = self.pattern_len;
396        let n = text_chars.len();
397
398        if m == 0 {
399            return (n as u8, 0, 0, 0);
400        }
401        if n == 0 {
402            return (0, m as u8, 0, 0);
403        }
404
405        let distance = self.compute_edit_distance_myers(text_chars);
406
407        // Heuristic: length difference determines insertions vs deletions
408        let len_diff = n as i32 - m as i32;
409
410        if len_diff >= 0 {
411            // Text is longer or equal: extra chars are insertions
412            let insertions = (len_diff as u8).min(distance);
413            let other_edits = distance.saturating_sub(insertions);
414            (insertions, 0, other_edits, 0)
415        } else {
416            // Text is shorter: missing chars are deletions
417            let deletions = ((-len_diff) as u8).min(distance);
418            let other_edits = distance.saturating_sub(deletions);
419            (0, deletions, other_edits, 0)
420        }
421    }
422
423    /// Find all matches in text using Bitap algorithm with k errors.
424    #[must_use]
425    pub fn find_all(&self, text: &str, threshold: f32) -> Vec<DamLevMatch> {
426        let max_edits = self.limits.max_edits as usize;
427        let text_chars: Vec<(usize, char)> = text.char_indices().collect();
428
429        if text_chars.is_empty() {
430            return vec![];
431        }
432
433        let mut matches: FxHashMap<(usize, usize), DamLevMatch> = FxHashMap::default();
434
435        // State vectors: R[d] tracks matching state with exactly d errors
436        // Bit i is 0 if we've matched pattern[0..=i] with d errors
437        // Use two buffers and swap to avoid allocation per character
438        let mut r: Vec<u64> = vec![!0u64; max_edits + 1];
439        let mut old_r: Vec<u64> = vec![!0u64; max_edits + 1];
440
441        // Initialize: we can delete up to k characters from the start of pattern
442        // R[d] starts with first d bits as 0 (matched d chars via deletion)
443        // Left shift advances pattern position (bit i → bit i+1)
444        for d in 1..=max_edits {
445            r[d] = r[d - 1] << 1;
446        }
447
448        for (char_idx, &(_, text_char)) in text_chars.iter().enumerate() {
449            let text_char = if self.case_insensitive {
450                text_char.to_lowercase().next().unwrap_or(text_char)
451            } else {
452                text_char
453            };
454
455            let char_mask = self.get_mask(text_char);
456
457            // Swap buffers: old_r gets previous r, r will be updated (no allocation!)
458            std::mem::swap(&mut r, &mut old_r);
459
460            // Update R[0] (exact matching) - use old_r since we swapped
461            r[0] = (old_r[0] << 1) | char_mask;
462
463            // Update R[d] for d > 0 (fuzzy matching)
464            for d in 1..=max_edits {
465                // Can insert from R[d-1]: consume text char without advancing pattern
466                let insert = old_r[d - 1];
467
468                // Can delete from R[d-1]: advance pattern without consuming text
469                // Uses r[d-1] (already updated) with << 1 to advance pattern position
470                let delete = r[d - 1] << 1;
471
472                // Can substitute from R[d-1]: consume both and treat as match
473                let substitute = old_r[d - 1] << 1;
474
475                // Regular match with d errors
476                let match_d = (old_r[d] << 1) | char_mask;
477
478                r[d] = match_d & insert & delete & substitute;
479            }
480
481            // Check for matches (bit pattern_len-1 is 0)
482            let end_byte = text_chars.get(char_idx + 1).map_or(text.len(), |(b, _)| *b);
483
484            for d in 0..=max_edits {
485                if (r[d] & self.accept_mask) == 0 {
486                    // Found a match with d edits
487                    // Estimate start position (approximate)
488                    let min_start_char = char_idx.saturating_sub(self.pattern_len + d);
489                    let max_start_char =
490                        char_idx.saturating_sub(self.pattern_len.saturating_sub(d + 1));
491
492                    for start_char in min_start_char..=max_start_char.min(char_idx) {
493                        let start_byte = text_chars.get(start_char).map_or(0, |(b, _)| *b);
494
495                        // Compute exact edit breakdown using DP
496                        let (insertions, deletions, substitutions, swaps) = self
497                            .compute_exact_edit_breakdown(&text.as_bytes()[start_byte..end_byte]);
498
499                        // Use actual edit count from DP, verify it matches Bitap state
500                        let total_edits = insertions + deletions + substitutions + swaps;
501                        if total_edits as usize > d {
502                            continue; // More edits than this state allows
503                        }
504                        let sim = self.calc_similarity(total_edits, insertions, deletions);
505                        if sim >= threshold {
506                            let key = (start_byte, end_byte);
507                            let m = DamLevMatch {
508                                start: start_byte,
509                                end: end_byte,
510                                insertions,
511                                deletions,
512                                substitutions,
513                                swaps,
514                                similarity: sim,
515                            };
516
517                            matches
518                                .entry(key)
519                                .and_modify(|existing| {
520                                    if m.similarity > existing.similarity {
521                                        *existing = m.clone();
522                                    }
523                                })
524                                .or_insert(m);
525                        }
526                    }
527                }
528            }
529        }
530
531        // Handle text shorter than pattern: positions reached during the last iteration
532        // need extra propagation (via deletion) to reach the accept position.
533        if text_chars.len() < self.pattern_len {
534            let chars_short = self.pattern_len - text_chars.len();
535            for _ in 0..chars_short.min(max_edits) {
536                std::mem::swap(&mut r, &mut old_r);
537
538                // Apply deletion propagation: advance pattern position without consuming text.
539                // From d-1 errors at position p, we can delete pattern[p] to reach d errors at p+1.
540                r[0] = old_r[0]; // Can't advance without consuming text or adding error
541                for d in 1..=max_edits {
542                    // Deletion: skip pattern char without consuming text
543                    let delete = old_r[d - 1] << 1;
544                    // Keep existing state if already matched
545                    r[d] = old_r[d] & delete;
546                }
547
548                // Check for matches after propagation
549                for d in 0..=max_edits {
550                    if (r[d] & self.accept_mask) == 0 {
551                        let end_byte = text.len();
552                        let min_start_char = text_chars
553                            .len()
554                            .saturating_sub(self.pattern_len.saturating_sub(d + 1));
555
556                        for start_char in 0..=min_start_char.min(text_chars.len().saturating_sub(1))
557                        {
558                            let start_byte = text_chars.get(start_char).map_or(0, |(b, _)| *b);
559
560                            let (insertions, deletions, substitutions, swaps) = self
561                                .compute_exact_edit_breakdown(
562                                    &text.as_bytes()[start_byte..end_byte],
563                                );
564
565                            let total_edits = insertions + deletions + substitutions + swaps;
566                            if total_edits as usize <= d {
567                                let sim = self.calc_similarity(total_edits, insertions, deletions);
568                                if sim >= threshold {
569                                    let key = (start_byte, end_byte);
570                                    let m = DamLevMatch {
571                                        start: start_byte,
572                                        end: end_byte,
573                                        insertions,
574                                        deletions,
575                                        substitutions,
576                                        swaps,
577                                        similarity: sim,
578                                    };
579
580                                    matches
581                                        .entry(key)
582                                        .and_modify(|existing| {
583                                            if m.similarity > existing.similarity {
584                                                *existing = m.clone();
585                                            }
586                                        })
587                                        .or_insert(m);
588                                }
589                            }
590                        }
591                    }
592                }
593            }
594        }
595
596        matches.into_values().collect()
597    }
598
599    /// Find all non-overlapping matches, preferring best (highest similarity) matches.
600    ///
601    /// This method finds all overlapping candidates, sorts by similarity, then
602    /// greedily selects non-overlapping matches starting from highest similarity.
603    /// This ensures we prefer "Lorem" (sim=1.0) over "ore" (sim=0.6).
604    ///
605    /// Matches must be at least `pattern_len - max_edits` characters long to be
606    /// considered valid. This prevents overly short fuzzy matches.
607    ///
608    /// When `require_first_char` is true, matches must start with the same first
609    /// character as the pattern (case-insensitive). This filters out spurious
610    /// matches like "bore" when searching for "Lorem".
611    #[must_use]
612    pub fn find_best_non_overlapping(
613        &self,
614        text: &str,
615        threshold: f32,
616        require_first_char: bool,
617    ) -> Vec<DamLevMatch> {
618        // Get all overlapping matches
619        let mut all_matches = self.find_all(text, threshold);
620
621        if all_matches.is_empty() {
622            return vec![];
623        }
624
625        // Filter: minimum match length = pattern_len - max_edits
626        let min_match_len = self
627            .pattern_len
628            .saturating_sub(self.limits.max_edits as usize);
629        all_matches.retain(|m| m.end - m.start >= min_match_len);
630
631        // Filter: require first character to match pattern's first char
632        // Respects case_insensitive setting - if case-sensitive, require exact first char match
633        if require_first_char && !self.pattern_chars.is_empty() {
634            let pattern_first = self.pattern_chars[0];
635            let text_bytes = text.as_bytes();
636            all_matches.retain(|m| {
637                if m.start >= text_bytes.len() {
638                    return false;
639                }
640                // Decode the first character of the match
641                let (first_char, _) = decode_utf8_char_fast(text_bytes, m.start);
642                if self.case_insensitive {
643                    first_char.eq_ignore_ascii_case(&pattern_first)
644                } else {
645                    first_char == pattern_first
646                }
647            });
648        }
649
650        if all_matches.is_empty() {
651            return vec![];
652        }
653
654        // Sort by similarity descending, then by start position ascending
655        all_matches.sort_by(|a, b| match b.similarity.partial_cmp(&a.similarity) {
656            Some(std::cmp::Ordering::Equal) | None => a.start.cmp(&b.start),
657            Some(ord) => ord,
658        });
659
660        // Greedily select non-overlapping matches
661        let mut result = Vec::new();
662        let mut occupied = vec![false; text.len() + 1];
663
664        for m in all_matches {
665            // Check if this match overlaps with any already selected
666            let overlaps = (m.start..m.end).any(|i| occupied[i]);
667            if !overlaps {
668                // Mark this range as occupied
669                for i in m.start..m.end {
670                    occupied[i] = true;
671                }
672                result.push(m);
673            }
674        }
675
676        // Sort result by start position for consistent ordering
677        result.sort_by_key(|m| m.start);
678        result
679    }
680
681    /// Fast find of non-overlapping matches optimized for iteration (greedy leftmost).
682    ///
683    /// This is faster than `find_all()` followed by filtering because:
684    /// 1. It skips ahead after each match (no overlapping work)
685    /// 2. It only verifies the most likely start position per match
686    /// 3. For exact matches (d=0), it trusts Bitap without DP
687    ///
688    /// When `require_first_char` is true, matches must start with the same first
689    /// character as the pattern (case-sensitive unless `case_insensitive` mode).
690    ///
691    /// Note: This uses greedy-leftmost strategy. For best-match selection
692    /// (preferring higher similarity), use `find_best_non_overlapping` instead.
693    #[must_use]
694    pub fn find_all_non_overlapping(
695        &self,
696        text: &str,
697        threshold: f32,
698        require_first_char: bool,
699    ) -> Vec<DamLevMatch> {
700        self.find_non_overlapping_impl(text, threshold, require_first_char, 0)
701    }
702
703    /// Find the first match using the same algorithm as `find_all_non_overlapping`.
704    /// Returns as soon as a match is found, avoiding scanning the rest of the text.
705    #[must_use]
706    pub fn find_first_non_overlapping(&self, text: &str, threshold: f32) -> Option<DamLevMatch> {
707        // Try ASCII fast path if both pattern and text are ASCII
708        if self.is_ascii && text.is_ascii() {
709            if let Some(m) = self.find_first_ascii_fast(text.as_bytes(), threshold) {
710                return Some(m);
711            }
712            // If fast path returns None due to max_edits > 4, fall through to generic path
713            if self.limits.max_edits <= 4 {
714                return None;
715            }
716        }
717
718        let matches = self.find_non_overlapping_impl(text, threshold, false, 1);
719        matches.into_iter().next()
720    }
721
722    /// Find up to `n` non-overlapping matches.
723    /// Stops searching after finding `n` matches for efficiency.
724    #[must_use]
725    pub fn find_n_non_overlapping(
726        &self,
727        text: &str,
728        threshold: f32,
729        require_first_char: bool,
730        n: usize,
731    ) -> Vec<DamLevMatch> {
732        self.find_non_overlapping_impl(text, threshold, require_first_char, n)
733    }
734
735    /// Implementation of non-overlapping match search with optional limit.
736    /// `limit` of 0 means unlimited matches, otherwise stops after finding `limit` matches.
737    fn find_non_overlapping_impl(
738        &self,
739        text: &str,
740        threshold: f32,
741        require_first_char: bool,
742        limit: usize,
743    ) -> Vec<DamLevMatch> {
744        let max_edits = self.limits.max_edits as usize;
745        let text_bytes = text.as_bytes();
746        let text_len = text_bytes.len();
747
748        // Handle empty text: matches if pattern can be fully deleted
749        if text_len == 0 {
750            if self.pattern_len <= max_edits {
751                let deletions = self.pattern_len as u8;
752                let sim = self.calc_similarity(deletions, 0, deletions);
753                if sim >= threshold {
754                    return vec![DamLevMatch {
755                        start: 0,
756                        end: 0,
757                        insertions: 0,
758                        deletions,
759                        substitutions: 0,
760                        swaps: 0,
761                        similarity: sim,
762                    }];
763                }
764            }
765            return vec![];
766        }
767
768        // Precompute first-char check if needed
769        let first_char_check: Option<char> = if require_first_char && !self.pattern_chars.is_empty()
770        {
771            Some(self.pattern_chars[0])
772        } else {
773            None
774        };
775
776        let mut matches = Vec::new();
777        let mut last_end = 0usize;
778
779        // State vectors (3 buffers for transposition support)
780        let mut r: Vec<u64> = vec![!0u64; max_edits + 1];
781        let mut old_r: Vec<u64> = vec![!0u64; max_edits + 1];
782        let mut old_old_r: Vec<u64> = vec![!0u64; max_edits + 1];
783
784        // Initialize deletion states
785        for d in 1..=max_edits {
786            r[d] = r[d - 1] << 1;
787        }
788
789        // Track pending fuzzy match (wait for potential better exact match)
790        let mut pending_match: Option<(usize, DamLevMatch)> = None; // (edit_level, match)
791        let mut chars_since_pending = 0usize;
792
793        // Track previous character mask for transposition
794        let mut prev_mask: u64 = !0;
795
796        // Circular buffer to track byte positions of recent characters
797        // Used to correctly compute start position for matches with multi-byte UTF-8
798        let history_size = self.pattern_len + max_edits + 1;
799        let mut byte_history: Vec<usize> = vec![0; history_size];
800        let mut history_idx = 0usize;
801
802        let mut byte_pos = 0;
803
804        while byte_pos < text_len {
805            // Decode current character
806            let (text_char, char_len) = decode_utf8_char_fast(text_bytes, byte_pos);
807            let text_char = if self.case_insensitive {
808                text_char.to_lowercase().next().unwrap_or(text_char)
809            } else {
810                text_char
811            };
812
813            // Record byte position in circular buffer for correct UTF-8 start computation
814            byte_history[history_idx] = byte_pos;
815            history_idx = (history_idx + 1) % history_size;
816
817            let char_mask = self.get_mask(text_char);
818
819            // Rotate buffers: old_old_r <- old_r <- r
820            std::mem::swap(&mut old_old_r, &mut old_r);
821            std::mem::swap(&mut old_r, &mut r);
822
823            // Update R[0] (exact matching)
824            r[0] = (old_r[0] << 1) | char_mask;
825
826            // Update R[d] for d > 0 (fuzzy matching with transposition)
827            for d in 1..=max_edits {
828                let insert = old_r[d - 1];
829                let delete = r[d - 1] << 1;
830                let substitute = old_r[d - 1] << 1;
831                let match_d = (old_r[d] << 1) | char_mask;
832                let mut new_r = match_d & insert & delete & substitute;
833
834                // Transposition: check if we can swap adjacent chars
835                // trans_valid_mask: bit j is 0 if pattern[j]=curr AND pattern[j+1]=prev
836                let trans_valid_mask = char_mask | (prev_mask >> 1);
837                // From matched position k, we can reach k+2 via transposition at k+1
838                let trans = ((old_old_r[d - 1] << 1) | trans_valid_mask) << 1;
839                new_r &= trans;
840
841                r[d] = new_r;
842            }
843
844            // Update prev_mask for next iteration
845            prev_mask = char_mask;
846
847            let end_byte = byte_pos + char_len;
848
849            // Check for match at each error level (prefer lower error levels)
850            'error_levels: for d in 0..=max_edits {
851                if (r[d] & self.accept_mask) == 0 {
852                    // Found a potential match with d edits
853                    // For fuzzy matches, the match length could vary:
854                    // - With deletions: match is shorter than pattern
855                    // - With insertions: match is longer than pattern
856                    let min_match_len = self.pattern_len.saturating_sub(d);
857                    let max_match_len = self.pattern_len + d;
858
859                    // Track best candidate at this error level
860                    let mut best_at_level: Option<DamLevMatch> = None;
861
862                    // Try all possible match lengths to find the best one
863                    // We check all lengths and pick: earliest start, then longest match
864                    for try_len in min_match_len..=max_match_len {
865                        // Compute start_byte by going back try_len characters (not bytes)
866                        // Use the circular buffer to handle multi-byte UTF-8 correctly
867                        let start_byte = if try_len <= history_size && try_len > 0 {
868                            // Look up the byte position from the circular buffer
869                            // history_idx points to the next slot, so most recent is (history_idx - 1)
870                            // We need to go back (try_len - 1) more slots from there
871                            let idx = (history_idx + history_size - try_len) % history_size;
872                            byte_history[idx]
873                        } else if try_len == 0 {
874                            end_byte
875                        } else {
876                            // try_len > history_size: shouldn't happen normally, fall back to byte math
877                            // This could be inaccurate for multi-byte chars
878                            end_byte.saturating_sub(try_len)
879                        };
880
881                        if start_byte >= end_byte {
882                            continue;
883                        }
884
885                        // Skip empty matches at end of text (can happen when try_len=0)
886                        if start_byte >= text_len {
887                            continue;
888                        }
889
890                        // Skip if this match overlaps with previous confirmed match
891                        if start_byte < last_end {
892                            continue;
893                        }
894
895                        // Check first-char filter if required
896                        if let Some(pattern_first) = first_char_check {
897                            let (match_first, _) = decode_utf8_char_fast(text_bytes, start_byte);
898                            let matches_first = if self.case_insensitive {
899                                match_first.eq_ignore_ascii_case(&pattern_first)
900                            } else {
901                                match_first == pattern_first
902                            };
903                            if !matches_first {
904                                continue;
905                            }
906                        }
907
908                        // For exact match (d=0), accept immediately
909                        if d == 0 {
910                            let sim = 1.0f32;
911                            if sim >= threshold {
912                                // Clear any pending fuzzy match (this exact match is better)
913                                pending_match = None;
914                                chars_since_pending = 0;
915
916                                matches.push(DamLevMatch {
917                                    start: start_byte,
918                                    end: end_byte,
919                                    insertions: 0,
920                                    deletions: 0,
921                                    substitutions: 0,
922                                    swaps: 0,
923                                    similarity: sim,
924                                });
925
926                                // Early exit: return immediately if limit reached
927                                if limit > 0 && matches.len() >= limit {
928                                    return matches;
929                                }
930
931                                last_end = end_byte;
932
933                                // Reset state for next non-overlapping match
934                                r.fill(!0u64);
935                                old_r.fill(!0u64);
936                                old_old_r.fill(!0u64);
937                                for dd in 1..=max_edits {
938                                    r[dd] = r[dd - 1] << 1;
939                                }
940                                prev_mask = !0;
941                                break 'error_levels; // Found exact match, move on
942                            }
943                        } else {
944                            // For fuzzy match, verify with DP
945                            let matched_text = &text_bytes[start_byte..end_byte];
946                            let (insertions, deletions, substitutions, swaps) =
947                                self.compute_exact_edit_breakdown(matched_text);
948
949                            let total_edits = insertions + deletions + substitutions + swaps;
950                            if total_edits as usize <= max_edits {
951                                let sim = self.calc_similarity(total_edits, insertions, deletions);
952                                if sim >= threshold {
953                                    let candidate = DamLevMatch {
954                                        start: start_byte,
955                                        end: end_byte,
956                                        insertions,
957                                        deletions,
958                                        substitutions,
959                                        swaps,
960                                        similarity: sim,
961                                    };
962
963                                    // Check if this candidate is better than best at this level
964                                    // Prefer: earlier start, then longer match
965                                    let dominated = best_at_level.as_ref().is_some_and(|best| {
966                                        let best_len = best.end - best.start;
967                                        let cand_len = candidate.end - candidate.start;
968                                        best.start < candidate.start
969                                            || (best.start == candidate.start
970                                                && best_len >= cand_len)
971                                    });
972
973                                    if !dominated {
974                                        best_at_level = Some(candidate);
975                                    }
976                                }
977                            }
978                        }
979                    }
980
981                    // If we found a valid match at this error level, update pending
982                    if let Some(candidate) = best_at_level {
983                        // Check if this is better than existing pending match
984                        let dominated = pending_match.as_ref().is_some_and(|(pd, pm)| {
985                            let pm_len = pm.end - pm.start;
986                            let cand_len = candidate.end - candidate.start;
987                            *pd < d
988                                || (*pd == d && pm.start < candidate.start)
989                                || (*pd == d && pm.start == candidate.start && pm_len >= cand_len)
990                        });
991
992                        if !dominated {
993                            // Always reset counter when setting a new pending match
994                            // This ensures the new match gets its full waiting period
995                            chars_since_pending = 0;
996                            pending_match = Some((d, candidate));
997                        }
998                        break 'error_levels; // Found valid fuzzy match at this level
999                    }
1000                }
1001            }
1002
1003            // Check if we should commit the pending fuzzy match
1004            if let Some((d, ref m)) = pending_match {
1005                chars_since_pending += 1;
1006                let match_len = m.end - m.start;
1007
1008                // Determine when to commit:
1009                // - Exact matches (d=0): commit immediately
1010                // - Fuzzy matches (d>0): wait at least 1 char to let potential exact matches appear
1011                //   This handles cases like "mhussei" (fuzzy at d=2) vs "hussein" (exact at d=0)
1012                //   where the exact match ends one character later
1013                let commit_threshold = if d == 0 {
1014                    1 // Exact match: commit on first check
1015                } else if match_len >= self.pattern_len {
1016                    2 // Full-length fuzzy match: wait 1 char for potential exact match
1017                } else {
1018                    max_edits + 1 // Short match: wait longer
1019                };
1020
1021                if chars_since_pending >= commit_threshold {
1022                    let (_, m) = pending_match.take().unwrap();
1023                    last_end = m.end;
1024                    matches.push(m);
1025
1026                    // Early exit: return immediately if limit reached
1027                    if limit > 0 && matches.len() >= limit {
1028                        return matches;
1029                    }
1030
1031                    // Reset state
1032                    r.fill(!0u64);
1033                    old_r.fill(!0u64);
1034                    old_old_r.fill(!0u64);
1035                    for dd in 1..=max_edits {
1036                        r[dd] = r[dd - 1] << 1;
1037                    }
1038                    prev_mask = !0;
1039                    chars_since_pending = 0;
1040                }
1041            }
1042
1043            byte_pos = end_byte;
1044        }
1045
1046        // Commit any remaining pending match
1047        if let Some((_, m)) = pending_match {
1048            matches.push(m);
1049        }
1050
1051        matches
1052    }
1053
1054    /// Ultra-fast ASCII-only path for finding first match.
1055    /// Only called when both pattern and text are pure ASCII.
1056    /// Uses stack arrays instead of Vec, direct byte lookup instead of `HashMap`.
1057    ///
1058    /// Returns Some((start, end, edits)) on match, None if no match found.
1059    #[inline]
1060    fn find_first_ascii_fast(&self, text: &[u8], threshold: f32) -> Option<DamLevMatch> {
1061        debug_assert!(self.is_ascii);
1062
1063        let max_edits = self.limits.max_edits as usize;
1064        let text_len = text.len();
1065
1066        // Handle empty text
1067        if text_len == 0 {
1068            if self.pattern_len <= max_edits {
1069                let deletions = self.pattern_len as u8;
1070                let sim = self.calc_similarity(deletions, 0, deletions);
1071                if sim >= threshold {
1072                    return Some(DamLevMatch {
1073                        start: 0,
1074                        end: 0,
1075                        insertions: 0,
1076                        deletions,
1077                        substitutions: 0,
1078                        swaps: 0,
1079                        similarity: sim,
1080                    });
1081                }
1082            }
1083            return None;
1084        }
1085
1086        // Use fixed-size arrays for state vectors (up to 4 edits supported in fast path)
1087        // For more edits, fall back to Vec-based implementation
1088        if max_edits > 4 {
1089            return None; // Signal caller to use generic path
1090        }
1091
1092        // State vectors (3 buffers for transposition support) - stack allocated
1093        let mut r: [u64; 5] = [!0u64; 5];
1094        let mut old_r: [u64; 5] = [!0u64; 5];
1095        let mut old_old_r: [u64; 5] = [!0u64; 5];
1096
1097        // Initialize deletion states
1098        for d in 1..=max_edits {
1099            r[d] = r[d - 1] << 1;
1100        }
1101
1102        // Track pending fuzzy match
1103        let mut pending_match: Option<(usize, DamLevMatch)> = None;
1104        let mut chars_since_pending = 0usize;
1105
1106        // Track previous character mask for transposition
1107        let mut prev_mask: u64 = !0;
1108
1109        // Circular buffer for start position tracking (pattern_len + max_edits + 1)
1110        // Since ASCII: 1 byte = 1 char, we can use byte positions directly
1111        let history_size = self.pattern_len + max_edits + 1;
1112        // Use fixed array - max pattern is 64, max edits is 4, so max history is 69
1113        let mut byte_history: [usize; 72] = [0; 72];
1114        let mut history_idx = 0usize;
1115
1116        let mut byte_pos = 0;
1117
1118        // Pre-fetch for case insensitivity - use a static lookup table
1119        let to_lower: fn(u8) -> u8 = if self.case_insensitive {
1120            |b| b.to_ascii_lowercase()
1121        } else {
1122            |b| b
1123        };
1124
1125        while byte_pos < text_len {
1126            let byte = to_lower(text[byte_pos]);
1127
1128            // Record byte position
1129            byte_history[history_idx % history_size] = byte_pos;
1130            history_idx += 1;
1131
1132            // Direct byte mask lookup - no HashMap, no UTF-8 decode
1133            let char_mask = if byte < 128 {
1134                self.byte_masks[byte as usize]
1135            } else {
1136                !0u64 // Non-ASCII byte: no match (shouldn't happen in ASCII path)
1137            };
1138
1139            // Rotate buffers
1140            let tmp = old_old_r;
1141            old_old_r = old_r;
1142            old_r = r;
1143            r = tmp;
1144
1145            // Update R[0] (exact matching)
1146            r[0] = (old_r[0] << 1) | char_mask;
1147
1148            // Update R[d] for d > 0 (fuzzy matching with transposition)
1149            for d in 1..=max_edits {
1150                let insert = old_r[d - 1];
1151                let delete = r[d - 1] << 1;
1152                let substitute = old_r[d - 1] << 1;
1153                let match_d = (old_r[d] << 1) | char_mask;
1154                let mut new_r = match_d & insert & delete & substitute;
1155
1156                // Transposition
1157                let trans_valid_mask = char_mask | (prev_mask >> 1);
1158                let trans = ((old_old_r[d - 1] << 1) | trans_valid_mask) << 1;
1159                new_r &= trans;
1160
1161                r[d] = new_r;
1162            }
1163
1164            prev_mask = char_mask;
1165            let end_byte = byte_pos + 1; // ASCII: 1 byte per char
1166
1167            // Check for match at each error level
1168            'error_levels: for d in 0..=max_edits {
1169                if (r[d] & self.accept_mask) == 0 {
1170                    let min_match_len = self.pattern_len.saturating_sub(d);
1171                    let max_match_len = self.pattern_len + d;
1172
1173                    let mut best_at_level: Option<DamLevMatch> = None;
1174
1175                    for try_len in min_match_len..=max_match_len {
1176                        let start_byte =
1177                            if try_len <= history_size && try_len > 0 && history_idx >= try_len {
1178                                byte_history[(history_idx - try_len) % history_size]
1179                            } else if try_len == 0 {
1180                                end_byte
1181                            } else {
1182                                end_byte.saturating_sub(try_len)
1183                            };
1184
1185                        if start_byte >= end_byte || start_byte >= text_len {
1186                            continue;
1187                        }
1188
1189                        // For exact match (d=0), return immediately
1190                        if d == 0 {
1191                            let sim = 1.0f32;
1192                            if sim >= threshold {
1193                                return Some(DamLevMatch {
1194                                    start: start_byte,
1195                                    end: end_byte,
1196                                    insertions: 0,
1197                                    deletions: 0,
1198                                    substitutions: 0,
1199                                    swaps: 0,
1200                                    similarity: sim,
1201                                });
1202                            }
1203                        } else {
1204                            // Fuzzy match - verify with DP
1205                            let matched_text = &text[start_byte..end_byte];
1206                            let (insertions, deletions, substitutions, swaps) =
1207                                self.compute_exact_edit_breakdown(matched_text);
1208
1209                            let total_edits = insertions + deletions + substitutions + swaps;
1210                            if total_edits as usize <= max_edits {
1211                                let sim = self.calc_similarity(total_edits, insertions, deletions);
1212                                if sim >= threshold {
1213                                    let candidate = DamLevMatch {
1214                                        start: start_byte,
1215                                        end: end_byte,
1216                                        insertions,
1217                                        deletions,
1218                                        substitutions,
1219                                        swaps,
1220                                        similarity: sim,
1221                                    };
1222
1223                                    let dominated = best_at_level.as_ref().is_some_and(|best| {
1224                                        let best_len = best.end - best.start;
1225                                        let cand_len = candidate.end - candidate.start;
1226                                        best.start < candidate.start
1227                                            || (best.start == candidate.start
1228                                                && best_len >= cand_len)
1229                                    });
1230
1231                                    if !dominated {
1232                                        best_at_level = Some(candidate);
1233                                    }
1234                                }
1235                            }
1236                        }
1237                    }
1238
1239                    if let Some(candidate) = best_at_level {
1240                        let dominated = pending_match.as_ref().is_some_and(|(pd, pm)| {
1241                            let pm_len = pm.end - pm.start;
1242                            let cand_len = candidate.end - candidate.start;
1243                            *pd < d
1244                                || (*pd == d && pm.start < candidate.start)
1245                                || (*pd == d && pm.start == candidate.start && pm_len >= cand_len)
1246                        });
1247
1248                        if !dominated {
1249                            chars_since_pending = 0;
1250                            pending_match = Some((d, candidate));
1251                        }
1252                        break 'error_levels;
1253                    }
1254                }
1255            }
1256
1257            // Check if we should commit pending match
1258            if let Some((d, ref m)) = pending_match {
1259                chars_since_pending += 1;
1260                let match_len = m.end - m.start;
1261
1262                let commit_threshold = if d == 0 {
1263                    1
1264                } else if match_len >= self.pattern_len {
1265                    2
1266                } else {
1267                    max_edits + 1
1268                };
1269
1270                if chars_since_pending >= commit_threshold {
1271                    let (_, m) = pending_match.take().unwrap();
1272                    return Some(m);
1273                }
1274            }
1275
1276            byte_pos += 1;
1277        }
1278
1279        // Return any pending match
1280        pending_match.map(|(_, m)| m)
1281    }
1282
1283    /// Compute exact Damerau-Levenshtein edit breakdown using dynamic programming.
1284    /// Returns (insertions, deletions, substitutions, swaps).
1285    ///
1286    /// Optimized version using:
1287    /// - Myers' bit-vector algorithm for fast early positive confirmation
1288    /// - 3-row rotation instead of full O(m×n) table (for transposition support)
1289    /// - Stack allocation for small patterns (no heap allocation in common case)
1290    fn compute_exact_edit_breakdown(&self, matched_text: &[u8]) -> (u8, u8, u8, u8) {
1291        let pattern = &self.pattern_chars;
1292        let m = pattern.len();
1293
1294        // Parse text as UTF-8
1295        let Ok(text_str) = std::str::from_utf8(matched_text) else {
1296            return (0, m as u8, 0, 0);
1297        };
1298
1299        if m == 0 {
1300            let n = text_str.chars().count();
1301            return (n as u8, 0, 0, 0);
1302        }
1303        if text_str.is_empty() {
1304            return (0, m as u8, 0, 0);
1305        }
1306
1307        // For small text, use fully stack-allocated version (common case)
1308        // Stack limit chosen to cover pattern_len <= 64 + typical edits
1309        const STACK_LIMIT: usize = 72;
1310
1311        // For ASCII text, byte length == char count (fast path)
1312        let is_ascii = text_str.is_ascii();
1313        let n = if is_ascii {
1314            text_str.len()
1315        } else {
1316            text_str.chars().count()
1317        };
1318
1319        if n < STACK_LIMIT {
1320            // Fast early rejection using Myers
1321            // Since Myers doesn't support transpositions (counts as 2 subs instead of 1),
1322            // we can only reject if Myers distance > max_edits + potential_transpositions
1323            // Conservative: reject if Myers distance > max_edits + max_possible_transpositions
1324            let max_possible_trans = (m.min(n) / 2) as u8;
1325
1326            // Build text chars for Myers check
1327            let mut text_chars_buf: [char; STACK_LIMIT] = ['\0'; STACK_LIMIT];
1328            for (idx, c) in text_str.chars().take(STACK_LIMIT).enumerate() {
1329                text_chars_buf[idx] = if self.case_insensitive {
1330                    c.to_ascii_lowercase()
1331                } else {
1332                    c
1333                };
1334            }
1335            let text_chars = &text_chars_buf[..n];
1336
1337            let myers_dist = self.compute_edit_distance_myers(text_chars);
1338
1339            // If Myers distance is low enough that no transpositions could make it invalid,
1340            // we still need full DP for exact breakdown
1341            // If Myers distance is very high, reject early
1342            // Use saturating_add to avoid overflow when max_edits is u8::MAX (unlimited)
1343            if myers_dist > self.limits.max_edits.saturating_add(max_possible_trans) {
1344                // Definitely too many edits - return high value for rejection
1345                return (myers_dist, 0, 0, 0);
1346            }
1347
1348            self.compute_edit_breakdown_small::<STACK_LIMIT>(pattern, text_str, m, n)
1349        } else {
1350            self.compute_edit_breakdown_large(pattern, text_str, m, n)
1351        }
1352    }
1353
1354    /// Optimized DP for small text (stack allocated, 3-row rotation).
1355    #[inline]
1356    fn compute_edit_breakdown_small<const N: usize>(
1357        &self,
1358        pattern: &[char],
1359        text_str: &str,
1360        m: usize,
1361        n: usize,
1362    ) -> (u8, u8, u8, u8) {
1363        debug_assert!(n < N);
1364
1365        // 3 rows for rotation: prev_prev (i-2), prev (i-1), curr (i)
1366        // Each row has n+1 elements
1367        type Cell = (u8, u8, u8, u8, u8); // (dist, ins, del, sub, swap)
1368        let mut prev_prev: [Cell; N] = [(0, 0, 0, 0, 0); N];
1369        let mut prev: [Cell; N] = [(0, 0, 0, 0, 0); N];
1370        let mut curr: [Cell; N] = [(0, 0, 0, 0, 0); N];
1371
1372        // Stack-allocated text chars buffer (avoids heap allocation)
1373        let mut text_chars_buf: [char; N] = ['\0'; N];
1374        for (idx, c) in text_str.chars().take(N).enumerate() {
1375            text_chars_buf[idx] = if self.case_insensitive {
1376                c.to_ascii_lowercase()
1377            } else {
1378                c
1379            };
1380        }
1381        let text_chars = &text_chars_buf[..n];
1382
1383        // Initialize row 0 (base case: insert j chars from text)
1384        // This goes into prev since the loop starts at i=1 and uses prev for i-1
1385        for j in 0..=n {
1386            prev[j] = (j as u8, j as u8, 0, 0, 0);
1387        }
1388
1389        let mut prev_pattern_char = '\0';
1390
1391        for i in 1..=m {
1392            let pattern_char = if self.case_insensitive {
1393                pattern[i - 1].to_ascii_lowercase()
1394            } else {
1395                pattern[i - 1]
1396            };
1397
1398            // Base case for column 0: delete i chars from pattern
1399            curr[0] = (i as u8, 0, i as u8, 0, 0);
1400
1401            for j in 1..=n {
1402                let text_char = text_chars[j - 1];
1403
1404                if pattern_char == text_char {
1405                    // Match - no edit needed
1406                    curr[j] = prev[j - 1];
1407                } else {
1408                    // Try substitution (from prev[j-1])
1409                    let (sub_d, sub_i, sub_del, sub_s, sub_sw) = prev[j - 1];
1410                    let mut best = (sub_d + 1, sub_i, sub_del, sub_s + 1, sub_sw);
1411
1412                    // Try insertion (from curr[j-1])
1413                    let (ins_d, ins_i, ins_del, ins_s, ins_sw) = curr[j - 1];
1414                    if ins_d + 1 < best.0 {
1415                        best = (ins_d + 1, ins_i + 1, ins_del, ins_s, ins_sw);
1416                    }
1417
1418                    // Try deletion (from prev[j])
1419                    let (del_d, del_i, del_del, del_s, del_sw) = prev[j];
1420                    if del_d + 1 < best.0 {
1421                        best = (del_d + 1, del_i, del_del + 1, del_s, del_sw);
1422                    }
1423
1424                    // Try transposition (from prev_prev[j-2])
1425                    if i > 1 && j > 1 {
1426                        let prev_text_char = text_chars[j - 2];
1427                        if pattern_char == prev_text_char && prev_pattern_char == text_char {
1428                            let (tr_d, tr_i, tr_del, tr_s, tr_sw) = prev_prev[j - 2];
1429                            if tr_d + 1 < best.0 {
1430                                best = (tr_d + 1, tr_i, tr_del, tr_s, tr_sw + 1);
1431                            }
1432                        }
1433                    }
1434
1435                    curr[j] = best;
1436                }
1437            }
1438
1439            // Rotate rows: prev_prev <- prev <- curr
1440            std::mem::swap(&mut prev_prev, &mut prev);
1441            std::mem::swap(&mut prev, &mut curr);
1442            prev_pattern_char = pattern_char;
1443        }
1444
1445        // Result is in prev[n] (after final rotation)
1446        let (_, ins, del, sub, sw) = prev[n];
1447        (ins, del, sub, sw)
1448    }
1449
1450    /// Fallback DP for large text (heap allocated).
1451    fn compute_edit_breakdown_large(
1452        &self,
1453        pattern: &[char],
1454        text_str: &str,
1455        m: usize,
1456        n: usize,
1457    ) -> (u8, u8, u8, u8) {
1458        type Cell = (u8, u8, u8, u8, u8);
1459
1460        // 3 rows for rotation
1461        let mut prev_prev: Vec<Cell> = vec![(0, 0, 0, 0, 0); n + 1];
1462        let mut prev: Vec<Cell> = vec![(0, 0, 0, 0, 0); n + 1];
1463        let mut curr: Vec<Cell> = vec![(0, 0, 0, 0, 0); n + 1];
1464
1465        // Initialize row 0
1466        for j in 0..=n {
1467            prev_prev[j] = (j as u8, j as u8, 0, 0, 0);
1468        }
1469
1470        let text_chars: Vec<char> = if self.case_insensitive {
1471            text_str.chars().map(|c| c.to_ascii_lowercase()).collect()
1472        } else {
1473            text_str.chars().collect()
1474        };
1475
1476        let mut prev_pattern_char = '\0';
1477
1478        for i in 1..=m {
1479            let pattern_char = if self.case_insensitive {
1480                pattern[i - 1].to_ascii_lowercase()
1481            } else {
1482                pattern[i - 1]
1483            };
1484
1485            curr[0] = (i as u8, 0, i as u8, 0, 0);
1486
1487            for j in 1..=n {
1488                let text_char = text_chars[j - 1];
1489
1490                if pattern_char == text_char {
1491                    curr[j] = prev[j - 1];
1492                } else {
1493                    let (sub_d, sub_i, sub_del, sub_s, sub_sw) = prev[j - 1];
1494                    let mut best = (sub_d + 1, sub_i, sub_del, sub_s + 1, sub_sw);
1495
1496                    let (ins_d, ins_i, ins_del, ins_s, ins_sw) = curr[j - 1];
1497                    if ins_d + 1 < best.0 {
1498                        best = (ins_d + 1, ins_i + 1, ins_del, ins_s, ins_sw);
1499                    }
1500
1501                    let (del_d, del_i, del_del, del_s, del_sw) = prev[j];
1502                    if del_d + 1 < best.0 {
1503                        best = (del_d + 1, del_i, del_del + 1, del_s, del_sw);
1504                    }
1505
1506                    if i > 1 && j > 1 {
1507                        let prev_text_char = text_chars[j - 2];
1508                        if pattern_char == prev_text_char && prev_pattern_char == text_char {
1509                            let (tr_d, tr_i, tr_del, tr_s, tr_sw) = prev_prev[j - 2];
1510                            if tr_d + 1 < best.0 {
1511                                best = (tr_d + 1, tr_i, tr_del, tr_s, tr_sw + 1);
1512                            }
1513                        }
1514                    }
1515
1516                    curr[j] = best;
1517                }
1518            }
1519
1520            std::mem::swap(&mut prev_prev, &mut prev);
1521            std::mem::swap(&mut prev, &mut curr);
1522            prev_pattern_char = pattern_char;
1523        }
1524
1525        let (_, ins, del, sub, sw) = prev[n];
1526        (ins, del, sub, sw)
1527    }
1528
1529    /// Find the first match in the text.
1530    ///
1531    /// This delegates to `find_all_non_overlapping` and returns the first result.
1532    /// While not optimal for all cases, this ensures correct behavior for edge cases
1533    /// like transpositions and complex fuzzy matches.
1534    #[must_use]
1535    pub fn find_first(&self, text: &str, threshold: f32) -> Option<DamLevMatch> {
1536        // Delegate to the well-tested find_all_non_overlapping
1537        // require_first_char=false allows matches where first char is edited
1538        let matches = self.find_all_non_overlapping(text, threshold, false);
1539        matches.into_iter().min_by_key(|m| m.start)
1540    }
1541
1542    /// Find first match starting from candidate positions only.
1543    #[must_use]
1544    pub fn find_first_with_candidates(
1545        &self,
1546        text: &str,
1547        threshold: f32,
1548        candidates: &super::hash::FxHashSet<usize>,
1549    ) -> Option<DamLevMatch> {
1550        let max_edits = self.limits.max_edits as usize;
1551        let text_chars: Vec<(usize, char)> = text.char_indices().collect();
1552
1553        if text_chars.is_empty() || candidates.is_empty() {
1554            return None;
1555        }
1556
1557        // For each candidate position, run a localized Bitap search
1558        let mut sorted_candidates: Vec<usize> = candidates.iter().copied().collect();
1559        sorted_candidates.sort_unstable();
1560
1561        // Pre-allocate state buffers outside the loop (reused across candidates)
1562        let mut r: Vec<u64> = vec![!0u64; max_edits + 1];
1563        let mut old_r: Vec<u64> = vec![!0u64; max_edits + 1];
1564
1565        for &start_byte in &sorted_candidates {
1566            // Find the character index for this byte position using binary search (O(log N))
1567            let start_char = text_chars
1568                .binary_search_by_key(&start_byte, |(b, _)| *b)
1569                .unwrap_or(0);
1570
1571            // Reset state for this candidate
1572            r.fill(!0u64);
1573
1574            // Initialize deletion states - left shift advances pattern position
1575            for d in 1..=max_edits {
1576                r[d] = r[d - 1] << 1;
1577            }
1578
1579            let max_window = self.pattern_len + max_edits;
1580
1581            // Track best match within this window
1582            let mut best_match: Option<(usize, DamLevMatch)> = None;
1583
1584            for (rel_idx, &(_, text_char)) in text_chars[start_char..]
1585                .iter()
1586                .enumerate()
1587                .take(max_window + 1)
1588            {
1589                let text_char = if self.case_insensitive {
1590                    text_char.to_lowercase().next().unwrap_or(text_char)
1591                } else {
1592                    text_char
1593                };
1594
1595                let char_mask = self.get_mask(text_char);
1596
1597                // Swap buffers: old_r gets previous r (no allocation!)
1598                std::mem::swap(&mut r, &mut old_r);
1599
1600                r[0] = (old_r[0] << 1) | char_mask;
1601
1602                for d in 1..=max_edits {
1603                    let insert = old_r[d - 1];
1604                    let delete = r[d - 1] << 1; // left shift advances pattern position
1605                    let substitute = old_r[d - 1] << 1;
1606                    let match_d = (old_r[d] << 1) | char_mask;
1607
1608                    r[d] = match_d & insert & delete & substitute;
1609                }
1610
1611                // Check for match
1612                let abs_idx = start_char + rel_idx;
1613                let end_byte = text_chars.get(abs_idx + 1).map_or(text.len(), |(b, _)| *b);
1614
1615                for d in 0..=max_edits {
1616                    if (r[d] & self.accept_mask) == 0 {
1617                        // Compute exact edit breakdown using DP
1618                        let (insertions, deletions, substitutions, swaps) = self
1619                            .compute_exact_edit_breakdown(&text.as_bytes()[start_byte..end_byte]);
1620
1621                        let sim = self.calc_similarity(d as u8, insertions, deletions);
1622                        if sim >= threshold {
1623                            let candidate = DamLevMatch {
1624                                start: start_byte,
1625                                end: end_byte,
1626                                insertions,
1627                                deletions,
1628                                substitutions,
1629                                swaps,
1630                                similarity: sim,
1631                            };
1632
1633                            // Update best if this has fewer edits
1634                            let dominated =
1635                                best_match.as_ref().is_some_and(|(best_d, _)| *best_d <= d);
1636                            if !dominated {
1637                                best_match = Some((d, candidate));
1638                            }
1639
1640                            // If exact match found, return immediately
1641                            if d == 0 {
1642                                return best_match.map(|(_, m)| m);
1643                            }
1644                        }
1645                    }
1646                }
1647            }
1648
1649            // Return best match from this candidate window if found
1650            if let Some((_, m)) = best_match {
1651                return Some(m);
1652            }
1653        }
1654
1655        None
1656    }
1657
1658    /// Ultra-fast search starting from a specific byte position.
1659    ///
1660    /// This method is optimized for the greedy-first hot path:
1661    /// - No allocations (uses stack arrays for small k)
1662    /// - Direct byte iteration
1663    /// - Early termination on first match
1664    /// - SIMD acceleration when available (`AVX2` on `x86_64`)
1665    #[inline]
1666    #[must_use]
1667    pub fn find_at_byte_position(
1668        &self,
1669        text: &[u8],
1670        start_pos: usize,
1671        threshold: f32,
1672    ) -> Option<DamLevMatch> {
1673        let max_edits = self.limits.max_edits as usize;
1674
1675        // Handle empty/exhausted text: pattern can still match via pure deletions
1676        if start_pos >= text.len() {
1677            // If pattern length <= max_edits, we can delete the entire pattern
1678            if self.pattern_len <= max_edits {
1679                let deletions = self.pattern_len as u8;
1680                let sim = self.calc_similarity(deletions, 0, deletions);
1681                if sim >= threshold {
1682                    return Some(DamLevMatch {
1683                        start: start_pos,
1684                        end: start_pos,
1685                        insertions: 0,
1686                        deletions,
1687                        substitutions: 0,
1688                        swaps: 0,
1689                        similarity: sim,
1690                    });
1691                }
1692            }
1693            return None;
1694        }
1695
1696        // SIMD fast path: NEON on aarch64 for ASCII patterns with k <= 1
1697        #[cfg(all(feature = "simd", target_arch = "aarch64"))]
1698        {
1699            if self.is_ascii && max_edits <= 1 {
1700                // SAFETY: NEON is mandatory on aarch64
1701                return unsafe {
1702                    self.find_at_byte_position_neon(text, start_pos, threshold, max_edits)
1703                };
1704            }
1705        }
1706
1707        // SIMD fast path: AVX2 on x86_64 for ASCII patterns with k <= 3
1708        #[cfg(all(feature = "simd", target_arch = "x86_64"))]
1709        {
1710            if self.is_ascii && max_edits <= 3 && simd_avx2::is_available() {
1711                // SAFETY: We've verified AVX2 is available via runtime detection
1712                return unsafe {
1713                    self.find_at_byte_position_avx2(text, start_pos, threshold, max_edits)
1714                };
1715            }
1716        }
1717
1718        // Use ASCII fast path when pattern is ASCII
1719        // This avoids UTF-8 decoding and uses direct byte array lookup
1720        if self.is_ascii && max_edits <= 4 {
1721            return self.find_at_byte_position_ascii::<5>(text, start_pos, threshold);
1722        }
1723
1724        // Use stack array for small k (common case), fall back to vec for large k
1725        if max_edits <= 4 {
1726            self.find_at_byte_position_small_k::<5>(text, start_pos, threshold)
1727        } else {
1728            self.find_at_byte_position_large_k(text, start_pos, threshold)
1729        }
1730    }
1731
1732    /// NEON-accelerated search for ASCII patterns with k <= 1.
1733    ///
1734    /// # Safety
1735    /// Safe on all aarch64 targets (NEON is mandatory).
1736    #[cfg(all(feature = "simd", target_arch = "aarch64"))]
1737    #[inline]
1738    unsafe fn find_at_byte_position_neon(
1739        &self,
1740        text: &[u8],
1741        start_pos: usize,
1742        threshold: f32,
1743        max_edits: usize,
1744    ) -> Option<DamLevMatch> {
1745        debug_assert!(max_edits <= 1);
1746        debug_assert!(self.is_ascii);
1747
1748        let max_window = self.pattern_len + max_edits;
1749        let end_limit = (start_pos + max_window + 1).min(text.len());
1750        let search_len = end_limit - start_pos;
1751
1752        if search_len == 0 {
1753            return None;
1754        }
1755
1756        // State arrays: r = current, old_r = previous, old_old_r = 2 iterations ago (for transposition)
1757        let mut r = [!0u64; 4];
1758        let mut old_r = [!0u64; 4];
1759
1760        // Initialize deletion states - left shift advances pattern position
1761        for d in 1..=max_edits {
1762            r[d] = r[d - 1] << 1;
1763        }
1764
1765        // SAFETY: start_pos < text.len() verified by caller, search_len bounds checked above
1766        let text_ptr = unsafe { text.as_ptr().add(start_pos) };
1767        let byte_masks_ptr = self.byte_masks.as_ptr();
1768        let accept_mask = self.accept_mask;
1769
1770        let mut prev_mask: u64 = !0u64;
1771        // old_old_r is 2 iterations ago - on first iteration, it equals old_r's initial state
1772        let mut old_old_r = old_r;
1773
1774        for i in 0..search_len {
1775            // SAFETY: i < search_len which is bounded by text.len() - start_pos
1776            let byte = unsafe { *text_ptr.add(i) };
1777            let mask_idx = (byte & 0x7F) as usize;
1778            // SAFETY: mask_idx is always < 128 due to & 0x7F, and byte_masks has 128 elements
1779            let char_mask = unsafe { *byte_masks_ptr.add(mask_idx) };
1780
1781            // Rotate state history before update
1782            std::mem::swap(&mut old_old_r, &mut old_r);
1783            old_r = r;
1784
1785            // Use NEON state update
1786            // SAFETY: NEON is mandatory on aarch64
1787            unsafe {
1788                simd_neon::update_states_with_trans_k1_neon(
1789                    &mut r, &old_r, &old_old_r, char_mask, prev_mask,
1790                );
1791            }
1792
1793            let char_count = i + 1;
1794
1795            for d in 0..=max_edits {
1796                if (r[d] & accept_mask) == 0 {
1797                    let end_byte = start_pos + char_count;
1798                    let (insertions, deletions, substitutions, swaps) =
1799                        self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
1800
1801                    let sim = self.calc_similarity(d as u8, insertions, deletions);
1802                    if sim >= threshold {
1803                        return Some(DamLevMatch {
1804                            start: start_pos,
1805                            end: end_byte,
1806                            insertions,
1807                            deletions,
1808                            substitutions,
1809                            swaps,
1810                            similarity: sim,
1811                        });
1812                    }
1813                }
1814            }
1815
1816            prev_mask = char_mask;
1817        }
1818
1819        // Handle text shorter than pattern: positions reached during the last iteration
1820        // need extra match propagation to reach the accept position.
1821        let end_byte = start_pos + search_len;
1822        let chars_short = self.pattern_len.saturating_sub(search_len);
1823        if chars_short > 0 && prev_mask != !0u64 {
1824            for _ in 0..chars_short.min(max_edits) {
1825                old_r = r;
1826
1827                // Apply match propagation with last char's mask
1828                for d in 1..=max_edits {
1829                    let match_d = (old_r[d] << 1) | prev_mask;
1830                    r[d] &= match_d;
1831                }
1832
1833                // Check for accept
1834                for d in 0..=max_edits {
1835                    if (r[d] & accept_mask) == 0 {
1836                        let (insertions, deletions, substitutions, swaps) =
1837                            self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
1838
1839                        let total = insertions + deletions + substitutions + swaps;
1840                        if total as usize <= d {
1841                            let sim = self.calc_similarity(total, insertions, deletions);
1842                            if sim >= threshold {
1843                                return Some(DamLevMatch {
1844                                    start: start_pos,
1845                                    end: end_byte,
1846                                    insertions,
1847                                    deletions,
1848                                    substitutions,
1849                                    swaps,
1850                                    similarity: sim,
1851                                });
1852                            }
1853                        }
1854                    }
1855                }
1856            }
1857        }
1858
1859        None
1860    }
1861
1862    /// AVX2-accelerated search for ASCII patterns with k <= 3.
1863    ///
1864    /// # Safety
1865    /// Caller must ensure AVX2 is available (check with `simd_avx2::is_available()`).
1866    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
1867    #[target_feature(enable = "avx2")]
1868    #[inline]
1869    unsafe fn find_at_byte_position_avx2(
1870        &self,
1871        text: &[u8],
1872        start_pos: usize,
1873        threshold: f32,
1874        max_edits: usize,
1875    ) -> Option<DamLevMatch> {
1876        debug_assert!(max_edits <= 3);
1877        debug_assert!(self.is_ascii);
1878
1879        let max_window = self.pattern_len + max_edits;
1880        let end_limit = (start_pos + max_window + 1).min(text.len());
1881        let search_len = end_limit - start_pos;
1882
1883        if search_len == 0 {
1884            return None;
1885        }
1886
1887        // State arrays (4 elements for k <= 3)
1888        let mut r = [!0u64; 4];
1889        let mut old_r = [!0u64; 4];
1890        #[allow(unused_assignments)]
1891        let mut old_old_r = [!0u64; 4];
1892
1893        // Initialize deletion states - left shift advances pattern position
1894        for d in 1..=max_edits {
1895            r[d] = r[d - 1] << 1;
1896        }
1897
1898        // SAFETY: start_pos < text.len() verified by caller, search_len bounds checked above
1899        let text_ptr = unsafe { text.as_ptr().add(start_pos) };
1900        let byte_masks_ptr = self.byte_masks.as_ptr();
1901        let accept_mask = self.accept_mask;
1902
1903        let mut prev_mask: u64 = !0u64;
1904
1905        for i in 0..search_len {
1906            // SAFETY: i < search_len which is bounded by text.len() - start_pos
1907            let byte = unsafe { *text_ptr.add(i) };
1908
1909            // Direct array lookup for ASCII
1910            let mask_idx = (byte & 0x7F) as usize;
1911            // SAFETY: mask_idx is always < 128 due to & 0x7F, and byte_masks has 128 elements
1912            let char_mask = unsafe { *byte_masks_ptr.add(mask_idx) };
1913
1914            // Save old states
1915            old_old_r = old_r;
1916            old_r = r;
1917
1918            // Use SIMD state update with transposition
1919            // SAFETY: AVX2 availability verified by caller via simd_avx2::is_available()
1920            unsafe {
1921                simd_avx2::update_states_with_trans_avx2(
1922                    &mut r, &old_r, &old_old_r, char_mask, prev_mask, max_edits,
1923                );
1924            }
1925
1926            let char_count = i + 1;
1927
1928            // Check for match (prefer fewer edits)
1929            for d in 0..=max_edits {
1930                if (r[d] & accept_mask) == 0 {
1931                    let end_byte = start_pos + char_count;
1932
1933                    // Compute exact edit breakdown using DP
1934                    let (insertions, deletions, substitutions, swaps) =
1935                        self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
1936
1937                    let sim = self.calc_similarity(d as u8, insertions, deletions);
1938                    if sim >= threshold {
1939                        return Some(DamLevMatch {
1940                            start: start_pos,
1941                            end: end_byte,
1942                            insertions,
1943                            deletions,
1944                            substitutions,
1945                            swaps,
1946                            similarity: sim,
1947                        });
1948                    }
1949                }
1950            }
1951
1952            prev_mask = char_mask;
1953        }
1954
1955        // Handle text shorter than pattern: positions reached during the last iteration
1956        // need extra match propagation to reach the accept position.
1957        let end_byte = start_pos + search_len;
1958        let chars_short = self.pattern_len.saturating_sub(search_len);
1959        if chars_short > 0 && prev_mask != !0u64 {
1960            for _ in 0..chars_short.min(max_edits) {
1961                old_r = r;
1962
1963                // Apply match propagation with last char's mask
1964                for d in 1..=max_edits {
1965                    let match_d = (old_r[d] << 1) | prev_mask;
1966                    r[d] &= match_d;
1967                }
1968
1969                // Check for accept
1970                for d in 0..=max_edits {
1971                    if (r[d] & accept_mask) == 0 {
1972                        let (insertions, deletions, substitutions, swaps) =
1973                            self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
1974
1975                        let total = insertions + deletions + substitutions + swaps;
1976                        if total as usize <= d {
1977                            let sim = self.calc_similarity(total, insertions, deletions);
1978                            if sim >= threshold {
1979                                return Some(DamLevMatch {
1980                                    start: start_pos,
1981                                    end: end_byte,
1982                                    insertions,
1983                                    deletions,
1984                                    substitutions,
1985                                    swaps,
1986                                    similarity: sim,
1987                                });
1988                            }
1989                        }
1990                    }
1991                }
1992            }
1993        }
1994
1995        None
1996    }
1997
1998    /// Search multiple positions in parallel using SIMD.
1999    ///
2000    /// Processes up to 4 candidate positions simultaneously, returning the first match.
2001    /// This avoids the cascade dependency issue by parallelizing across positions
2002    /// rather than across error levels.
2003    ///
2004    /// Returns `Some((position_index, match))` if a match is found.
2005    #[cfg(all(feature = "simd", target_arch = "aarch64"))]
2006    #[inline]
2007    #[must_use]
2008    pub fn find_at_positions_parallel(
2009        &self,
2010        text: &[u8],
2011        positions: &[usize],
2012        threshold: f32,
2013    ) -> Option<(usize, DamLevMatch)> {
2014        if positions.is_empty() || !self.is_ascii {
2015            return None;
2016        }
2017
2018        let max_edits = self.limits.max_edits as usize;
2019
2020        // NEON processes 2 positions at a time for k=0
2021        if max_edits == 0 {
2022            // Process pairs of positions
2023            let mut i = 0;
2024            while i + 1 < positions.len() {
2025                let pos_pair = [positions[i], positions[i + 1]];
2026                if let Some((idx, m)) =
2027                    unsafe { self.find_at_2_positions_neon_k0(text, pos_pair, threshold) }
2028                {
2029                    return Some((i + idx, m));
2030                }
2031                i += 2;
2032            }
2033            // Handle remaining position
2034            if i < positions.len()
2035                && let Some(m) = self.find_at_byte_position(text, positions[i], threshold)
2036            {
2037                return Some((i, m));
2038            }
2039            return None;
2040        }
2041
2042        // For k >= 1, fall back to sequential (NEON k=1 is already optimized)
2043        for (i, &pos) in positions.iter().enumerate() {
2044            if let Some(m) = self.find_at_byte_position(text, pos, threshold) {
2045                return Some((i, m));
2046            }
2047        }
2048        None
2049    }
2050
2051    /// Search multiple positions in parallel using AVX2.
2052    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
2053    #[inline]
2054    #[must_use]
2055    pub fn find_at_positions_parallel(
2056        &self,
2057        text: &[u8],
2058        positions: &[usize],
2059        threshold: f32,
2060    ) -> Option<(usize, DamLevMatch)> {
2061        if positions.is_empty() || !self.is_ascii {
2062            return None;
2063        }
2064
2065        let max_edits = self.limits.max_edits as usize;
2066
2067        // AVX2 processes 4 positions at a time for k=0
2068        if max_edits == 0 && simd_avx2::is_available() {
2069            let mut i = 0;
2070            while i + 3 < positions.len() {
2071                let pos_quad = [
2072                    positions[i],
2073                    positions[i + 1],
2074                    positions[i + 2],
2075                    positions[i + 3],
2076                ];
2077                if let Some((idx, m)) =
2078                    unsafe { self.find_at_4_positions_avx2_k0(text, pos_quad, threshold) }
2079                {
2080                    return Some((i + idx, m));
2081                }
2082                i += 4;
2083            }
2084            // Handle remaining positions
2085            while i < positions.len() {
2086                if let Some(m) = self.find_at_byte_position(text, positions[i], threshold) {
2087                    return Some((i, m));
2088                }
2089                i += 1;
2090            }
2091            return None;
2092        }
2093
2094        // Fall back to sequential for k >= 1 or no AVX2
2095        for (i, &pos) in positions.iter().enumerate() {
2096            if let Some(m) = self.find_at_byte_position(text, pos, threshold) {
2097                return Some((i, m));
2098            }
2099        }
2100        None
2101    }
2102
2103    /// Fallback for non-SIMD builds
2104    #[cfg(not(any(
2105        all(feature = "simd", target_arch = "aarch64"),
2106        all(feature = "simd", target_arch = "x86_64")
2107    )))]
2108    #[inline]
2109    pub fn find_at_positions_parallel(
2110        &self,
2111        text: &[u8],
2112        positions: &[usize],
2113        threshold: f32,
2114    ) -> Option<(usize, DamLevMatch)> {
2115        for (i, &pos) in positions.iter().enumerate() {
2116            if let Some(m) = self.find_at_byte_position(text, pos, threshold) {
2117                return Some((i, m));
2118            }
2119        }
2120        None
2121    }
2122
2123    /// NEON: Search 2 positions in parallel for k=0 (exact match).
2124    ///
2125    /// Uses 128-bit NEON to process 2 independent exact-match searches.
2126    #[cfg(all(feature = "simd", target_arch = "aarch64"))]
2127    #[inline]
2128    unsafe fn find_at_2_positions_neon_k0(
2129        &self,
2130        text: &[u8],
2131        positions: [usize; 2],
2132        threshold: f32,
2133    ) -> Option<(usize, DamLevMatch)> {
2134        #[allow(clippy::wildcard_imports)]
2135        use std::arch::aarch64::*;
2136
2137        let max_window = self.pattern_len;
2138        let accept_mask = self.accept_mask;
2139        let byte_masks = &self.byte_masks;
2140
2141        // Calculate search lengths for each position
2142        let end0 = (positions[0] + max_window + 1).min(text.len());
2143        let end1 = (positions[1] + max_window + 1).min(text.len());
2144        let len0 = end0.saturating_sub(positions[0]);
2145        let len1 = end1.saturating_sub(positions[1]);
2146        let max_len = len0.max(len1);
2147
2148        if max_len == 0 {
2149            return None;
2150        }
2151
2152        // Initialize states: all 1s means no match yet
2153        // r[0] = position 0 state, r[1] = position 1 state
2154        let mut r = unsafe { vdupq_n_u64(!0u64) };
2155        let accept_vec = unsafe { vdupq_n_u64(accept_mask) };
2156
2157        for i in 0..max_len {
2158            // Get char masks for both positions (scalar loads, then combine)
2159            let mask0 = if i < len0 {
2160                let byte = text[positions[0] + i];
2161                byte_masks[(byte & 0x7F) as usize]
2162            } else {
2163                !0u64 // No match possible
2164            };
2165
2166            let mask1 = if i < len1 {
2167                let byte = text[positions[1] + i];
2168                byte_masks[(byte & 0x7F) as usize]
2169            } else {
2170                !0u64
2171            };
2172
2173            // Combine masks into NEON vector
2174            let char_masks = unsafe { vcombine_u64(vcreate_u64(mask0), vcreate_u64(mask1)) };
2175
2176            // State update: r = (r << 1) | char_mask
2177            let shifted = unsafe { vshlq_n_u64(r, 1) };
2178            r = unsafe { vorrq_u64(shifted, char_masks) };
2179
2180            // Check for matches: (r & accept_mask) == 0
2181            let masked = unsafe { vandq_u64(r, accept_vec) };
2182
2183            // Extract and check each lane
2184            let lane0 = unsafe { vgetq_lane_u64(masked, 0) };
2185            let lane1 = unsafe { vgetq_lane_u64(masked, 1) };
2186
2187            if lane0 == 0 && i < len0 {
2188                let end_byte = positions[0] + i + 1;
2189                let (insertions, deletions, substitutions, swaps) =
2190                    self.compute_exact_edit_breakdown(&text[positions[0]..end_byte]);
2191                let sim = self.calc_similarity(0, insertions, deletions);
2192                if sim >= threshold {
2193                    return Some((
2194                        0,
2195                        DamLevMatch {
2196                            start: positions[0],
2197                            end: end_byte,
2198                            insertions,
2199                            deletions,
2200                            substitutions,
2201                            swaps,
2202                            similarity: sim,
2203                        },
2204                    ));
2205                }
2206            }
2207
2208            if lane1 == 0 && i < len1 {
2209                let end_byte = positions[1] + i + 1;
2210                let (insertions, deletions, substitutions, swaps) =
2211                    self.compute_exact_edit_breakdown(&text[positions[1]..end_byte]);
2212                let sim = self.calc_similarity(0, insertions, deletions);
2213                if sim >= threshold {
2214                    return Some((
2215                        1,
2216                        DamLevMatch {
2217                            start: positions[1],
2218                            end: end_byte,
2219                            insertions,
2220                            deletions,
2221                            substitutions,
2222                            swaps,
2223                            similarity: sim,
2224                        },
2225                    ));
2226                }
2227            }
2228        }
2229
2230        None
2231    }
2232
2233    /// AVX2: Search 4 positions in parallel for k=0 (exact match).
2234    ///
2235    /// Uses 256-bit AVX2 to process 4 independent exact-match searches.
2236    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
2237    #[target_feature(enable = "avx2")]
2238    #[inline]
2239    unsafe fn find_at_4_positions_avx2_k0(
2240        &self,
2241        text: &[u8],
2242        positions: [usize; 4],
2243        threshold: f32,
2244    ) -> Option<(usize, DamLevMatch)> {
2245        #[allow(clippy::wildcard_imports)]
2246        use std::arch::x86_64::*;
2247
2248        let max_window = self.pattern_len;
2249        let accept_mask = self.accept_mask;
2250        let byte_masks = &self.byte_masks;
2251
2252        // Calculate search lengths for each position
2253        let ends: [usize; 4] = [
2254            (positions[0] + max_window + 1).min(text.len()),
2255            (positions[1] + max_window + 1).min(text.len()),
2256            (positions[2] + max_window + 1).min(text.len()),
2257            (positions[3] + max_window + 1).min(text.len()),
2258        ];
2259        let lens: [usize; 4] = [
2260            ends[0].saturating_sub(positions[0]),
2261            ends[1].saturating_sub(positions[1]),
2262            ends[2].saturating_sub(positions[2]),
2263            ends[3].saturating_sub(positions[3]),
2264        ];
2265        let max_len = lens[0].max(lens[1]).max(lens[2]).max(lens[3]);
2266
2267        if max_len == 0 {
2268            return None;
2269        }
2270
2271        // Initialize states: all 1s
2272        let mut r = _mm256_set1_epi64x(!0i64);
2273        let accept_vec = _mm256_set1_epi64x(accept_mask as i64);
2274
2275        for i in 0..max_len {
2276            // Get char masks for all 4 positions
2277            let masks: [u64; 4] = [
2278                if i < lens[0] {
2279                    byte_masks[(text[positions[0] + i] & 0x7F) as usize]
2280                } else {
2281                    !0u64
2282                },
2283                if i < lens[1] {
2284                    byte_masks[(text[positions[1] + i] & 0x7F) as usize]
2285                } else {
2286                    !0u64
2287                },
2288                if i < lens[2] {
2289                    byte_masks[(text[positions[2] + i] & 0x7F) as usize]
2290                } else {
2291                    !0u64
2292                },
2293                if i < lens[3] {
2294                    byte_masks[(text[positions[3] + i] & 0x7F) as usize]
2295                } else {
2296                    !0u64
2297                },
2298            ];
2299
2300            let char_masks = _mm256_set_epi64x(
2301                masks[3] as i64,
2302                masks[2] as i64,
2303                masks[1] as i64,
2304                masks[0] as i64,
2305            );
2306
2307            // State update: r = (r << 1) | char_mask
2308            let shifted = _mm256_slli_epi64(r, 1);
2309            r = _mm256_or_si256(shifted, char_masks);
2310
2311            // Check for matches
2312            let masked = _mm256_and_si256(r, accept_vec);
2313
2314            // Extract and check each lane (use movemask for efficiency)
2315            let zero = _mm256_setzero_si256();
2316            let cmp = _mm256_cmpeq_epi64(masked, zero);
2317            let match_mask = _mm256_movemask_epi8(cmp);
2318
2319            // Check each position (lanes are in order: 0, 1, 2, 3)
2320            // Each lane is 8 bytes, so bits 0-7 = lane 0, 8-15 = lane 1, etc.
2321            if match_mask != 0 {
2322                for (lane, &len) in lens.iter().enumerate() {
2323                    if i < len {
2324                        let lane_mask = 0xFF << (lane * 8);
2325                        if (match_mask & lane_mask) == lane_mask {
2326                            let end_byte = positions[lane] + i + 1;
2327                            let (insertions, deletions, substitutions, swaps) =
2328                                self.compute_exact_edit_breakdown(&text[positions[lane]..end_byte]);
2329                            let sim = self.calc_similarity(0, insertions, deletions);
2330                            if sim >= threshold {
2331                                return Some((
2332                                    lane,
2333                                    DamLevMatch {
2334                                        start: positions[lane],
2335                                        end: end_byte,
2336                                        insertions,
2337                                        deletions,
2338                                        substitutions,
2339                                        swaps,
2340                                        similarity: sim,
2341                                    },
2342                                ));
2343                            }
2344                        }
2345                    }
2346                }
2347            }
2348        }
2349
2350        None
2351    }
2352
2353    /// ASCII-optimized search - no UTF-8 decoding, direct byte mask lookup.
2354    /// Uses unsafe to eliminate bounds checks in the hot loop.
2355    #[inline(always)]
2356    fn find_at_byte_position_ascii<const K: usize>(
2357        &self,
2358        text: &[u8],
2359        start_pos: usize,
2360        threshold: f32,
2361    ) -> Option<DamLevMatch> {
2362        let max_edits = self.limits.max_edits as usize;
2363        debug_assert!(max_edits < K);
2364
2365        let max_window = self.pattern_len + max_edits;
2366        let end_limit = (start_pos + max_window + 1).min(text.len());
2367        let search_len = end_limit - start_pos;
2368
2369        if search_len == 0 {
2370            return None;
2371        }
2372
2373        // SAFETY: We've bounds-checked above, and byte_masks has 128 elements
2374        // which covers all ASCII bytes (0-127). Non-ASCII bytes are handled
2375        // by returning !0u64 (no match).
2376        unsafe {
2377            self.find_at_byte_position_ascii_unchecked::<K>(
2378                text, start_pos, search_len, threshold, max_edits,
2379            )
2380        }
2381    }
2382
2383    /// Inner loop with no bounds checks - SAFETY: caller must ensure bounds are valid
2384    #[inline(always)]
2385    unsafe fn find_at_byte_position_ascii_unchecked<const K: usize>(
2386        &self,
2387        text: &[u8],
2388        start_pos: usize,
2389        search_len: usize,
2390        threshold: f32,
2391        max_edits: usize,
2392    ) -> Option<DamLevMatch> {
2393        // SAFETY: caller guarantees all bounds are valid
2394        unsafe {
2395            // Stack-allocated state vectors
2396            let mut r = [!0u64; K];
2397            let mut old_r = [!0u64; K];
2398            let mut old_old_r = [!0u64; K]; // State from 2 iterations ago for transposition
2399
2400            // Initialize deletion states - left shift advances pattern position
2401            for d in 1..=max_edits {
2402                *r.get_unchecked_mut(d) = *r.get_unchecked(d - 1) << 1;
2403            }
2404
2405            let text_ptr = text.as_ptr().add(start_pos);
2406            let byte_masks_ptr = self.byte_masks.as_ptr();
2407            let accept_mask = self.accept_mask;
2408            let _ = self.pattern_len;
2409
2410            let mut prev_byte: Option<u8> = None;
2411
2412            for i in 0..search_len {
2413                let byte = *text_ptr.add(i);
2414
2415                // Direct array lookup - mask non-ASCII to 0 index (which has !0u64)
2416                let mask_idx = (byte & 0x7F) as usize;
2417                let char_mask = *byte_masks_ptr.add(mask_idx);
2418
2419                // Save states from 2 iterations ago
2420                for d in 0..=max_edits {
2421                    *old_old_r.get_unchecked_mut(d) = *old_r.get_unchecked(d);
2422                    *old_r.get_unchecked_mut(d) = *r.get_unchecked(d);
2423                }
2424
2425                // Update state 0 (exact match)
2426                *r.get_unchecked_mut(0) = (*r.get_unchecked(0) << 1) | char_mask;
2427
2428                // Update fuzzy states
2429                for d in 1..=max_edits {
2430                    let insert = *old_r.get_unchecked(d - 1);
2431                    let delete = *r.get_unchecked(d - 1) << 1; // left shift advances pattern position
2432                    let substitute = *old_r.get_unchecked(d - 1) << 1;
2433                    let match_d = (*old_r.get_unchecked(d) << 1) | char_mask;
2434                    let mut new_r = match_d & insert & delete & substitute;
2435
2436                    // Transposition: if we have a previous character, check for swaps
2437                    // Transposition at position j means pattern[j]=curr AND pattern[j+1]=prev
2438                    if let Some(prev_b) = prev_byte {
2439                        let prev_mask_idx = (prev_b & 0x7F) as usize;
2440                        let prev_mask = *byte_masks_ptr.add(prev_mask_idx);
2441                        // trans_valid_mask: bit j is 0 if pattern[j]=curr AND pattern[j+1]=prev
2442                        let trans_valid_mask = char_mask | (prev_mask >> 1);
2443                        // From matched position k (bit k=0), we can reach k+2 via transposition at k+1
2444                        // Shift old_old_r left first to align: bit k becomes bit k+1
2445                        // This also makes bit 0 = 0, allowing transposition at position 0
2446                        let trans =
2447                            ((*old_old_r.get_unchecked(d - 1) << 1) | trans_valid_mask) << 1;
2448                        new_r &= trans;
2449                    }
2450
2451                    *r.get_unchecked_mut(d) = new_r;
2452                }
2453
2454                let char_count = i + 1;
2455
2456                // Check for match (prefer fewer edits)
2457                for d in 0..=max_edits {
2458                    if (*r.get_unchecked(d) & accept_mask) == 0 {
2459                        let end_byte = start_pos + char_count;
2460                        // Compute exact edit breakdown using DP
2461                        let (insertions, deletions, substitutions, swaps) =
2462                            self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
2463
2464                        let sim = self.calc_similarity(d as u8, insertions, deletions);
2465                        if sim >= threshold {
2466                            return Some(DamLevMatch {
2467                                start: start_pos,
2468                                end: end_byte,
2469                                insertions,
2470                                deletions,
2471                                substitutions,
2472                                swaps,
2473                                similarity: sim,
2474                            });
2475                        }
2476                    }
2477                }
2478
2479                prev_byte = Some(byte);
2480            }
2481
2482            // Handle text shorter than pattern: positions reached during the last iteration
2483            // need extra match propagation to reach the accept position.
2484            // Re-process the last char_mask to allow match propagation.
2485            let end_byte = start_pos + search_len;
2486            if let Some(last_byte) = prev_byte {
2487                let last_mask = *byte_masks_ptr.add((last_byte & 0x7F) as usize);
2488                let chars_short = self.pattern_len.saturating_sub(search_len);
2489
2490                for _ in 0..chars_short.min(max_edits) {
2491                    for d in 0..=max_edits {
2492                        *old_r.get_unchecked_mut(d) = *r.get_unchecked(d);
2493                    }
2494
2495                    // Apply match propagation: from position p with d errors,
2496                    // if pattern[p+1] matches last_char, reach position p+1 with d errors
2497                    for d in 1..=max_edits {
2498                        let match_d = (*old_r.get_unchecked(d) << 1) | last_mask;
2499                        *r.get_unchecked_mut(d) &= match_d;
2500                    }
2501
2502                    // Check for accept after each propagation
2503                    for d in 0..=max_edits {
2504                        if (*r.get_unchecked(d) & accept_mask) == 0 {
2505                            let (insertions, deletions, substitutions, swaps) =
2506                                self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
2507
2508                            let total = insertions + deletions + substitutions + swaps;
2509                            if total as usize <= d {
2510                                let sim = self.calc_similarity(total, insertions, deletions);
2511                                if sim >= threshold {
2512                                    return Some(DamLevMatch {
2513                                        start: start_pos,
2514                                        end: end_byte,
2515                                        insertions,
2516                                        deletions,
2517                                        substitutions,
2518                                        swaps,
2519                                        similarity: sim,
2520                                    });
2521                                }
2522                            }
2523                        }
2524                    }
2525                }
2526            }
2527
2528            None
2529        }
2530    }
2531
2532    #[inline]
2533    fn find_at_byte_position_small_k<const K: usize>(
2534        &self,
2535        text: &[u8],
2536        start_pos: usize,
2537        threshold: f32,
2538    ) -> Option<DamLevMatch> {
2539        let max_edits = self.limits.max_edits as usize;
2540        debug_assert!(max_edits < K);
2541
2542        // Stack-allocated state vectors
2543        let mut r = [!0u64; K];
2544        let mut old_r = [!0u64; K];
2545        let mut old_old_r = [!0u64; K]; // State from 2 iterations ago for transposition
2546
2547        // Initialize deletion states - left shift advances pattern position
2548        for d in 1..=max_edits {
2549            r[d] = r[d - 1] << 1;
2550        }
2551
2552        // max_window is in characters, but we iterate bytes.
2553        // For UTF-8, multiply by max char size (4) to ensure we process enough bytes.
2554        let max_window_chars = self.pattern_len + max_edits;
2555        let max_window_bytes = if self.is_ascii {
2556            max_window_chars + 1
2557        } else {
2558            max_window_chars * 4 + 1
2559        };
2560        let end_limit = (start_pos + max_window_bytes).min(text.len());
2561
2562        // Cache previous mask to avoid redundant lookups in transposition check
2563        let mut prev_mask: Option<u64> = None;
2564        let case_insensitive = self.case_insensitive;
2565
2566        // Iterate bytes, handling UTF-8
2567        let mut pos = start_pos;
2568        let mut char_count = 0usize;
2569        while pos < end_limit && char_count <= max_window_chars {
2570            let byte = text[pos];
2571
2572            // Get character mask and length with fast paths
2573            let (char_mask, char_len) = if byte < 128 {
2574                // ASCII fast path
2575                let lookup_byte = if case_insensitive {
2576                    byte.to_ascii_lowercase()
2577                } else {
2578                    byte
2579                };
2580                (self.byte_masks[lookup_byte as usize], 1)
2581            } else if byte < 224 && pos + 1 < text.len() {
2582                // 2-byte UTF-8 fast path (Cyrillic, etc.)
2583                let b1 = text[pos + 1];
2584                if case_insensitive {
2585                    let codepoint = ((u32::from(byte) & 0x1F) << 6) | (u32::from(b1) & 0x3F);
2586                    let ch = unsafe { char::from_u32_unchecked(codepoint) };
2587                    let ch_lower = ch.to_lowercase().next().unwrap_or(ch);
2588                    (self.get_mask(ch_lower), 2)
2589                } else {
2590                    (self.get_mask_2byte(byte, b1), 2)
2591                }
2592            } else {
2593                // 3/4-byte UTF-8 or incomplete
2594                let (ch, len) = decode_utf8_char_fast(text, pos);
2595                let ch = if case_insensitive {
2596                    ch.to_lowercase().next().unwrap_or(ch)
2597                } else {
2598                    ch
2599                };
2600                (self.get_mask(ch), len)
2601            };
2602
2603            // Save old states
2604            old_old_r[..=max_edits].copy_from_slice(&old_r[..=max_edits]);
2605            old_r[..=max_edits].copy_from_slice(&r[..=max_edits]);
2606
2607            // Update states
2608            r[0] = (r[0] << 1) | char_mask;
2609
2610            for d in 1..=max_edits {
2611                let insert = old_r[d - 1];
2612                let delete = r[d - 1] << 1; // left shift advances pattern position
2613                let substitute = old_r[d - 1] << 1;
2614                let match_d = (old_r[d] << 1) | char_mask;
2615                let mut new_r = match_d & insert & delete & substitute;
2616
2617                // Transposition: if we have a previous mask, check for swaps
2618                // (use cached prev_mask instead of recomputing)
2619                if let Some(pm) = prev_mask {
2620                    // trans_valid_mask: bit j is 0 if pattern[j]=curr AND pattern[j+1]=prev
2621                    let trans_valid_mask = char_mask | (pm >> 1);
2622                    // From matched position k, we can reach k+2 via transposition at k+1
2623                    let trans = ((old_old_r[d - 1] << 1) | trans_valid_mask) << 1;
2624                    new_r &= trans;
2625                }
2626
2627                r[d] = new_r;
2628            }
2629
2630            let end_byte = pos + char_len;
2631
2632            // Check for match (prefer fewer edits)
2633            for d in 0..=max_edits {
2634                if (r[d] & self.accept_mask) == 0 {
2635                    // Compute exact edit breakdown using DP
2636                    let (insertions, deletions, substitutions, swaps) =
2637                        self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
2638
2639                    let sim = self.calc_similarity(d as u8, insertions, deletions);
2640                    if sim >= threshold {
2641                        return Some(DamLevMatch {
2642                            start: start_pos,
2643                            end: end_byte,
2644                            insertions,
2645                            deletions,
2646                            substitutions,
2647                            swaps,
2648                            similarity: sim,
2649                        });
2650                    }
2651                }
2652            }
2653
2654            prev_mask = Some(char_mask);
2655            pos += char_len;
2656            char_count += 1;
2657        }
2658
2659        // Handle text shorter than pattern: positions reached during the last iteration
2660        // need extra match propagation to reach the accept position.
2661        // Re-process the last char_mask to allow match propagation.
2662        let end_byte = pos;
2663        if let Some(last_mask) = prev_mask {
2664            let chars_short = self.pattern_len.saturating_sub(char_count);
2665            for _ in 0..chars_short.min(max_edits) {
2666                old_r = r;
2667                // Note: old_old_r is intentionally not updated - transpositions don't apply
2668                // when we're just propagating matches without processing new characters
2669
2670                // Apply match propagation: from position p with d errors,
2671                // if pattern[p+1] matches last_char, reach position p+1 with d errors
2672                // Note: transpositions don't apply here since we're not processing new characters
2673                for d in 1..=max_edits {
2674                    let match_d = (old_r[d] << 1) | last_mask;
2675                    r[d] &= match_d;
2676                }
2677
2678                // Check for accept after each propagation
2679                for d in 0..=max_edits {
2680                    if (r[d] & self.accept_mask) == 0 {
2681                        let (insertions, deletions, substitutions, swaps) =
2682                            self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
2683
2684                        let total = insertions + deletions + substitutions + swaps;
2685                        if total as usize <= d {
2686                            let sim = self.calc_similarity(total, insertions, deletions);
2687                            if sim >= threshold {
2688                                return Some(DamLevMatch {
2689                                    start: start_pos,
2690                                    end: end_byte,
2691                                    insertions,
2692                                    deletions,
2693                                    substitutions,
2694                                    swaps,
2695                                    similarity: sim,
2696                                });
2697                            }
2698                        }
2699                    }
2700                }
2701            }
2702        }
2703
2704        None
2705    }
2706
2707    fn find_at_byte_position_large_k(
2708        &self,
2709        text: &[u8],
2710        start_pos: usize,
2711        threshold: f32,
2712    ) -> Option<DamLevMatch> {
2713        let max_edits = self.limits.max_edits as usize;
2714
2715        let mut r = vec![!0u64; max_edits + 1];
2716        let mut old_r = vec![!0u64; max_edits + 1];
2717        let mut old_old_r = vec![!0u64; max_edits + 1]; // State from 2 iterations ago for transposition
2718
2719        // Initialize deletion states - left shift advances pattern position
2720        for d in 1..=max_edits {
2721            r[d] = r[d - 1] << 1;
2722        }
2723
2724        // max_window is in characters, but we iterate bytes.
2725        // For UTF-8, multiply by max char size (4) to ensure we process enough bytes.
2726        let max_window_chars = self.pattern_len + max_edits;
2727        let max_window_bytes = if self.is_ascii {
2728            max_window_chars + 1
2729        } else {
2730            max_window_chars * 4 + 1
2731        };
2732        let end_limit = (start_pos + max_window_bytes).min(text.len());
2733
2734        let mut pos = start_pos;
2735        let mut prev_mask: Option<u64> = None;
2736        let case_insensitive = self.case_insensitive;
2737        let mut char_count = 0usize;
2738
2739        while pos < end_limit && char_count <= max_window_chars {
2740            let byte = text[pos];
2741
2742            // Get character mask and length with fast paths
2743            let (char_mask, char_len) = if byte < 128 {
2744                let lookup_byte = if case_insensitive {
2745                    byte.to_ascii_lowercase()
2746                } else {
2747                    byte
2748                };
2749                (self.byte_masks[lookup_byte as usize], 1)
2750            } else if byte < 224 && pos + 1 < text.len() {
2751                // 2-byte UTF-8 fast path
2752                let b1 = text[pos + 1];
2753                if case_insensitive {
2754                    let codepoint = ((u32::from(byte) & 0x1F) << 6) | (u32::from(b1) & 0x3F);
2755                    let ch = unsafe { char::from_u32_unchecked(codepoint) };
2756                    let ch_lower = ch.to_lowercase().next().unwrap_or(ch);
2757                    (self.get_mask(ch_lower), 2)
2758                } else {
2759                    (self.get_mask_2byte(byte, b1), 2)
2760                }
2761            } else {
2762                let (ch, len) = decode_utf8_char_fast(text, pos);
2763                let ch = if case_insensitive {
2764                    ch.to_lowercase().next().unwrap_or(ch)
2765                } else {
2766                    ch
2767                };
2768                (self.get_mask(ch), len)
2769            };
2770
2771            old_old_r.copy_from_slice(&old_r);
2772            old_r.copy_from_slice(&r);
2773
2774            r[0] = (r[0] << 1) | char_mask;
2775
2776            for d in 1..=max_edits {
2777                let insert = old_r[d - 1];
2778                let delete = r[d - 1] << 1; // left shift advances pattern position
2779                let substitute = old_r[d - 1] << 1;
2780                let match_d = (old_r[d] << 1) | char_mask;
2781                let mut new_r = match_d & insert & delete & substitute;
2782
2783                // Transposition: use cached prev_mask instead of recomputing
2784                if let Some(pm) = prev_mask {
2785                    let trans_valid_mask = char_mask | (pm >> 1);
2786                    // From matched position k, we can reach k+2 via transposition at k+1
2787                    let trans = ((old_old_r[d - 1] << 1) | trans_valid_mask) << 1;
2788                    new_r &= trans;
2789                }
2790
2791                r[d] = new_r;
2792            }
2793
2794            let end_byte = pos + char_len;
2795
2796            for d in 0..=max_edits {
2797                if (r[d] & self.accept_mask) == 0 {
2798                    // Compute exact edit breakdown using DP
2799                    let (insertions, deletions, substitutions, swaps) =
2800                        self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
2801
2802                    let sim = self.calc_similarity(d as u8, insertions, deletions);
2803                    if sim >= threshold {
2804                        return Some(DamLevMatch {
2805                            start: start_pos,
2806                            end: end_byte,
2807                            insertions,
2808                            deletions,
2809                            substitutions,
2810                            swaps,
2811                            similarity: sim,
2812                        });
2813                    }
2814                }
2815            }
2816
2817            prev_mask = Some(char_mask);
2818            pos += char_len;
2819            char_count += 1;
2820        }
2821
2822        // Handle text shorter than pattern: positions reached during the last iteration
2823        // need extra match propagation to reach the accept position.
2824        let end_byte = pos;
2825        if let Some(last_mask) = prev_mask {
2826            let chars_short = self.pattern_len.saturating_sub(char_count);
2827            for _ in 0..chars_short.min(max_edits) {
2828                old_r.copy_from_slice(&r);
2829
2830                // Apply match propagation: from position p with d errors,
2831                // if pattern[p+1] matches last_char, reach position p+1 with d errors
2832                for d in 1..=max_edits {
2833                    let match_d = (old_r[d] << 1) | last_mask;
2834                    r[d] &= match_d;
2835                }
2836
2837                // Check for accept after each propagation
2838                for d in 0..=max_edits {
2839                    if (r[d] & self.accept_mask) == 0 {
2840                        let (insertions, deletions, substitutions, swaps) =
2841                            self.compute_exact_edit_breakdown(&text[start_pos..end_byte]);
2842
2843                        let total = insertions + deletions + substitutions + swaps;
2844                        if total as usize <= d {
2845                            let sim = self.calc_similarity(total, insertions, deletions);
2846                            if sim >= threshold {
2847                                return Some(DamLevMatch {
2848                                    start: start_pos,
2849                                    end: end_byte,
2850                                    insertions,
2851                                    deletions,
2852                                    substitutions,
2853                                    swaps,
2854                                    similarity: sim,
2855                                });
2856                            }
2857                        }
2858                    }
2859                }
2860            }
2861        }
2862
2863        None
2864    }
2865
2866    /// Streaming search: scan entire text in one pass, return first match.
2867    /// This is O(n * k) where n = text length, k = max edits.
2868    /// Much faster for long texts than repeated `find_at_byte_position` calls.
2869    #[inline]
2870    #[must_use]
2871    pub fn find_first_streaming(&self, text: &[u8], threshold: f32) -> Option<DamLevMatch> {
2872        let max_edits = self.limits.max_edits as usize;
2873
2874        // Use const generics for common cases - already highly optimized
2875        match max_edits {
2876            0 => self.find_first_streaming_k::<1>(text, threshold, 0),
2877            1 => self.find_first_streaming_k::<2>(text, threshold, 1),
2878            2 => self.find_first_streaming_k::<3>(text, threshold, 2),
2879            3 => self.find_first_streaming_k::<4>(text, threshold, 3),
2880            4 => self.find_first_streaming_k::<5>(text, threshold, 4),
2881            _ => self.find_first_streaming_large_k(text, threshold),
2882        }
2883    }
2884
2885    /// Streaming search with const-size state arrays for performance.
2886    /// Uses three rotating buffers to support transposition detection.
2887    /// Continues processing after finding fuzzy matches to prefer exact matches.
2888    #[inline]
2889    fn find_first_streaming_k<const K: usize>(
2890        &self,
2891        text: &[u8],
2892        threshold: f32,
2893        max_edits: usize,
2894    ) -> Option<DamLevMatch> {
2895        debug_assert!(max_edits < K);
2896
2897        // Handle empty text: pattern can still match via pure deletions
2898        if text.is_empty() && self.pattern_len <= max_edits {
2899            let deletions = self.pattern_len as u8;
2900            let sim = self.calc_similarity(deletions, 0, deletions);
2901            if sim >= threshold {
2902                return Some(DamLevMatch {
2903                    start: 0,
2904                    end: 0,
2905                    insertions: 0,
2906                    deletions,
2907                    substitutions: 0,
2908                    swaps: 0,
2909                    similarity: sim,
2910                });
2911            }
2912            return None;
2913        }
2914
2915        // Use three state arrays for rotation (need old_old for transposition)
2916        let mut r0 = [!0u64; K];
2917        let mut r1 = [!0u64; K];
2918        let mut r2 = [!0u64; K];
2919
2920        // Initialize: can delete up to max_edits chars from pattern start
2921        for d in 1..=max_edits {
2922            r0[d] = r0[d - 1] << 1; // left shift advances pattern position
2923        }
2924
2925        // Track byte positions where each error level's current match started
2926        let mut start_bytes = [0usize; K];
2927
2928        let byte_masks = &self.byte_masks;
2929        let accept_mask = self.accept_mask;
2930        let case_insensitive = self.case_insensitive;
2931
2932        let mut pos = 0usize;
2933        let mut rotation = 0usize; // 0, 1, 2 rotation for three buffers
2934        let mut prev_mask: u64 = !0u64; // Previous character mask for transposition
2935
2936        // Track best match found so far (prefer fewer edits)
2937        // After finding a fuzzy match, continue for max_edits more chars to find better matches
2938        let mut best_match: Option<(usize, DamLevMatch)> = None; // (edit_level, match)
2939        let mut chars_since_first_match = 0usize;
2940
2941        while pos < text.len() {
2942            let byte = text[pos];
2943
2944            // Get character mask and length (ASCII fast path)
2945            let (char_mask, char_len) = if byte < 128 {
2946                let lookup_byte = if case_insensitive {
2947                    byte.to_ascii_lowercase()
2948                } else {
2949                    byte
2950                };
2951                (byte_masks[lookup_byte as usize], 1)
2952            } else if byte < 224 && pos + 1 < text.len() {
2953                // 2-byte UTF-8 fast path (Cyrillic, Latin Extended, etc.)
2954                let b1 = text[pos + 1];
2955                if case_insensitive {
2956                    // Need full decode for case conversion
2957                    let codepoint = ((u32::from(byte) & 0x1F) << 6) | (u32::from(b1) & 0x3F);
2958                    let ch = unsafe { char::from_u32_unchecked(codepoint) };
2959                    let ch_lower = ch.to_lowercase().next().unwrap_or(ch);
2960                    (self.get_mask(ch_lower), 2)
2961                } else {
2962                    (self.get_mask_2byte(byte, b1), 2)
2963                }
2964            } else {
2965                // 3/4-byte UTF-8 or incomplete sequence
2966                let (ch, len) = decode_utf8_char_fast(text, pos);
2967                let ch = if case_insensitive {
2968                    ch.to_lowercase().next().unwrap_or(ch)
2969                } else {
2970                    ch
2971                };
2972                (self.get_mask(ch), len)
2973            };
2974
2975            // Three-way rotation: old_old -> old -> new
2976            let (old_old_r, old_r, new_r) = match rotation {
2977                0 => (&r2, &r0, &mut r1),
2978                1 => (&r0, &r1, &mut r2),
2979                _ => (&r1, &r2, &mut r0),
2980            };
2981
2982            // Update R[0] (exact matching)
2983            new_r[0] = (old_r[0] << 1) | char_mask;
2984
2985            // Update start position for d=0 if no partial match
2986            if new_r[0] == !0u64 {
2987                start_bytes[0] = pos + char_len;
2988            }
2989
2990            // Update R[d] for d > 0 (fuzzy matching)
2991            for d in 1..=max_edits {
2992                let insert = old_r[d - 1]; // consume text char without advancing pattern
2993                let delete = new_r[d - 1] << 1; // left shift advances pattern position
2994                let substitute = old_r[d - 1] << 1; // replace pattern char
2995                let match_d = (old_r[d] << 1) | char_mask;
2996
2997                let mut new_val = match_d & insert & delete & substitute;
2998
2999                // Transposition: check if we can swap adjacent chars
3000                // trans_valid_mask: bit j is 0 if pattern[j]=curr AND pattern[j+1]=prev
3001                let trans_valid_mask = char_mask | (prev_mask >> 1);
3002                // From matched position k, we can reach k+2 via transposition at k+1
3003                let trans = ((old_old_r[d - 1] << 1) | trans_valid_mask) << 1;
3004                new_val &= trans;
3005
3006                new_r[d] = new_val;
3007
3008                // Update start position if no partial match
3009                if new_r[d] == !0u64 {
3010                    start_bytes[d] = pos + char_len;
3011                }
3012            }
3013
3014            // Check for matches (prefer fewer edits)
3015            let end_byte = pos + char_len;
3016
3017            for d in 0..=max_edits {
3018                if (new_r[d] & accept_mask) == 0 {
3019                    // Streaming found a potential match ending here.
3020                    // Use tracked start position for this error level (fast path)
3021                    let tracked_start = start_bytes[d];
3022
3023                    // Fast path: if tracked start gives exact pattern length match, use it directly
3024                    if end_byte >= tracked_start {
3025                        let match_len = end_byte - tracked_start;
3026
3027                        // Ultra-fast path for exact matches (d=0, length matches exactly)
3028                        // Skip DP computation entirely - we know it's 0 edits
3029                        if d == 0 && match_len == self.pattern_len {
3030                            return Some(DamLevMatch {
3031                                start: tracked_start,
3032                                end: end_byte,
3033                                insertions: 0,
3034                                deletions: 0,
3035                                substitutions: 0,
3036                                swaps: 0,
3037                                similarity: 1.0,
3038                            });
3039                        }
3040
3041                        // Check if this is likely the best match (close to pattern length)
3042                        if match_len >= self.pattern_len.saturating_sub(d)
3043                            && match_len <= self.pattern_len + d
3044                        {
3045                            let (insertions, deletions, substitutions, swaps) =
3046                                self.compute_exact_edit_breakdown(&text[tracked_start..end_byte]);
3047                            let total = insertions + deletions + substitutions + swaps;
3048
3049                            if total as usize <= d {
3050                                let sim = self.calc_similarity(total, insertions, deletions);
3051                                if sim >= threshold {
3052                                    // For exact length matches with d=0, return immediately
3053                                    if d == 0 {
3054                                        return Some(DamLevMatch {
3055                                            start: tracked_start,
3056                                            end: end_byte,
3057                                            insertions,
3058                                            deletions,
3059                                            substitutions,
3060                                            swaps,
3061                                            similarity: sim,
3062                                        });
3063                                    }
3064
3065                                    // For fuzzy matches, track as candidate and continue
3066                                    let candidate = DamLevMatch {
3067                                        start: tracked_start,
3068                                        end: end_byte,
3069                                        insertions,
3070                                        deletions,
3071                                        substitutions,
3072                                        swaps,
3073                                        similarity: sim,
3074                                    };
3075
3076                                    // Prefer: fewer edits, then closer to pattern length
3077                                    let len_diff =
3078                                        (match_len as i32 - self.pattern_len as i32).abs();
3079                                    if best_match.as_ref().is_none_or(|(best_d, b)| {
3080                                        let b_len = b.end - b.start;
3081                                        let b_len_diff =
3082                                            (b_len as i32 - self.pattern_len as i32).abs();
3083                                        d < *best_d
3084                                            || (d == *best_d && total < b.total_edits())
3085                                            || (d == *best_d
3086                                                && total == b.total_edits()
3087                                                && len_diff < b_len_diff)
3088                                    }) {
3089                                        if best_match.is_none() {
3090                                            chars_since_first_match = 0;
3091                                        }
3092                                        best_match = Some((d, candidate));
3093                                    }
3094                                }
3095                            }
3096                        }
3097                    }
3098
3099                    // For fuzzy matches not caught by fast path, search all possible start positions
3100                    if d > 0 {
3101                        let search_start = end_byte.saturating_sub(self.pattern_len + d);
3102
3103                        for try_start in search_start..end_byte {
3104                            // Skip if not at a valid UTF-8 char boundary
3105                            if try_start > 0 && text[try_start] >= 0x80 && text[try_start] < 0xC0 {
3106                                continue;
3107                            }
3108
3109                            // Compute exact edit breakdown using DP
3110                            let (insertions, deletions, substitutions, swaps) =
3111                                self.compute_exact_edit_breakdown(&text[try_start..end_byte]);
3112
3113                            let total = insertions + deletions + substitutions + swaps;
3114                            if total as usize <= d {
3115                                let sim = self.calc_similarity(total, insertions, deletions);
3116                                if sim >= threshold {
3117                                    let candidate = DamLevMatch {
3118                                        start: try_start,
3119                                        end: end_byte,
3120                                        insertions,
3121                                        deletions,
3122                                        substitutions,
3123                                        swaps,
3124                                        similarity: sim,
3125                                    };
3126                                    // Prefer: fewer edits, then closer to pattern length
3127                                    let match_len = end_byte - try_start;
3128                                    let len_diff =
3129                                        (match_len as i32 - self.pattern_len as i32).abs();
3130                                    if best_match.as_ref().is_none_or(|(best_d, b)| {
3131                                        let b_len = b.end - b.start;
3132                                        let b_len_diff =
3133                                            (b_len as i32 - self.pattern_len as i32).abs();
3134                                        d < *best_d
3135                                            || (d == *best_d && total < b.total_edits())
3136                                            || (d == *best_d
3137                                                && total == b.total_edits()
3138                                                && len_diff < b_len_diff)
3139                                    }) {
3140                                        if best_match.is_none() {
3141                                            chars_since_first_match = 0;
3142                                        }
3143                                        best_match = Some((d, candidate));
3144                                    }
3145                                }
3146                            }
3147                        }
3148                    }
3149                }
3150            }
3151
3152            // After finding a fuzzy match, check if we need to continue looking for better matches.
3153            // Only continue if the match is "suspicious" (shorter than pattern, indicating possible
3154            // early accept due to deletions). If match_length >= pattern_length, return immediately.
3155            if let Some((_, ref m)) = best_match {
3156                let match_len = m.end - m.start;
3157                if match_len >= self.pattern_len {
3158                    // Match is at least pattern length - can't be early accept due to deletions
3159                    return best_match.map(|(_, m)| m);
3160                }
3161                // Short match - might be early accept, continue for a few more chars
3162                chars_since_first_match += 1;
3163                if chars_since_first_match > max_edits {
3164                    return best_match.map(|(_, m)| m);
3165                }
3166            }
3167
3168            prev_mask = char_mask;
3169            pos += char_len;
3170            rotation = (rotation + 1) % 3;
3171        }
3172
3173        best_match.map(|(_, m)| m)
3174    }
3175
3176    /// Streaming search for large k values (uses heap allocation with three-buffer rotation).
3177    /// Supports transposition detection. Continues processing after fuzzy matches to prefer exact matches.
3178    fn find_first_streaming_large_k(&self, text: &[u8], threshold: f32) -> Option<DamLevMatch> {
3179        let max_edits = self.limits.max_edits as usize;
3180
3181        // Handle empty text: pattern can still match via pure deletions
3182        if text.is_empty() && self.pattern_len <= max_edits {
3183            let deletions = self.pattern_len as u8;
3184            let sim = self.calc_similarity(deletions, 0, deletions);
3185            if sim >= threshold {
3186                return Some(DamLevMatch {
3187                    start: 0,
3188                    end: 0,
3189                    insertions: 0,
3190                    deletions,
3191                    substitutions: 0,
3192                    swaps: 0,
3193                    similarity: sim,
3194                });
3195            }
3196            return None;
3197        }
3198
3199        // Use three buffers for rotation (need old_old for transposition)
3200        let mut r0 = vec![!0u64; max_edits + 1];
3201        let mut r1 = vec![!0u64; max_edits + 1];
3202        let mut r2 = vec![!0u64; max_edits + 1];
3203        let mut start_bytes = vec![0usize; max_edits + 1];
3204
3205        // Initialize: can delete up to max_edits chars from pattern start
3206        for d in 1..=max_edits {
3207            r0[d] = r0[d - 1] << 1; // left shift advances pattern position
3208        }
3209
3210        let byte_masks = &self.byte_masks;
3211        let accept_mask = self.accept_mask;
3212        let case_insensitive = self.case_insensitive;
3213
3214        let mut pos = 0usize;
3215        let mut rotation = 0usize; // 0, 1, 2 rotation for three buffers
3216        let mut prev_mask: u64 = !0u64; // Previous character mask for transposition
3217
3218        // Track best match found so far (prefer fewer edits)
3219        // After finding a fuzzy match, continue for max_edits more chars to find better matches
3220        let mut best_match: Option<(usize, DamLevMatch)> = None; // (edit_level, match)
3221        let mut chars_since_first_match = 0usize;
3222
3223        while pos < text.len() {
3224            let byte = text[pos];
3225
3226            let (char_mask, char_len) = if byte < 128 {
3227                let lookup_byte = if case_insensitive {
3228                    byte.to_ascii_lowercase()
3229                } else {
3230                    byte
3231                };
3232                (byte_masks[lookup_byte as usize], 1)
3233            } else if byte < 224 && pos + 1 < text.len() {
3234                // 2-byte UTF-8 fast path (Cyrillic, Latin Extended, etc.)
3235                let b1 = text[pos + 1];
3236                if case_insensitive {
3237                    let codepoint = ((u32::from(byte) & 0x1F) << 6) | (u32::from(b1) & 0x3F);
3238                    let ch = unsafe { char::from_u32_unchecked(codepoint) };
3239                    let ch_lower = ch.to_lowercase().next().unwrap_or(ch);
3240                    (self.get_mask(ch_lower), 2)
3241                } else {
3242                    (self.get_mask_2byte(byte, b1), 2)
3243                }
3244            } else {
3245                // 3/4-byte UTF-8 or incomplete sequence
3246                let (ch, len) = decode_utf8_char_fast(text, pos);
3247                let ch = if case_insensitive {
3248                    ch.to_lowercase().next().unwrap_or(ch)
3249                } else {
3250                    ch
3251                };
3252                (self.get_mask(ch), len)
3253            };
3254
3255            // Three-way rotation: old_old -> old -> new
3256            let (old_old_r, old_r, new_r) = match rotation {
3257                0 => (&r2, &r0, &mut r1),
3258                1 => (&r0, &r1, &mut r2),
3259                _ => (&r1, &r2, &mut r0),
3260            };
3261
3262            // Update R[0] (exact matching)
3263            new_r[0] = (old_r[0] << 1) | char_mask;
3264            if new_r[0] == !0u64 {
3265                start_bytes[0] = pos + char_len;
3266            }
3267
3268            // Update R[d] for d > 0 (fuzzy matching)
3269            for d in 1..=max_edits {
3270                let insert = old_r[d - 1]; // consume text char without advancing pattern
3271                let delete = new_r[d - 1] << 1; // left shift advances pattern position
3272                let substitute = old_r[d - 1] << 1; // replace pattern char
3273                let match_d = (old_r[d] << 1) | char_mask;
3274
3275                let mut new_val = match_d & insert & delete & substitute;
3276
3277                // Transposition: check if we can swap adjacent chars
3278                // trans_valid_mask: bit j is 0 if pattern[j]=curr AND pattern[j+1]=prev
3279                let trans_valid_mask = char_mask | (prev_mask >> 1);
3280                // From matched position k, we can reach k+2 via transposition at k+1
3281                let trans = ((old_old_r[d - 1] << 1) | trans_valid_mask) << 1;
3282                new_val &= trans;
3283
3284                new_r[d] = new_val;
3285
3286                if new_r[d] == !0u64 {
3287                    start_bytes[d] = pos + char_len;
3288                }
3289            }
3290
3291            let end_byte = pos + char_len;
3292
3293            for d in 0..=max_edits {
3294                if (new_r[d] & accept_mask) == 0 {
3295                    // Streaming found a potential match ending here.
3296                    // For exact match (d=0), return immediately
3297                    if d == 0 {
3298                        let tracked_start = start_bytes[0];
3299                        if end_byte >= tracked_start {
3300                            let match_len = end_byte - tracked_start;
3301                            if match_len == self.pattern_len {
3302                                return Some(DamLevMatch {
3303                                    start: tracked_start,
3304                                    end: end_byte,
3305                                    insertions: 0,
3306                                    deletions: 0,
3307                                    substitutions: 0,
3308                                    swaps: 0,
3309                                    similarity: 1.0,
3310                                });
3311                            }
3312                        }
3313                    }
3314
3315                    // Search all possible start positions
3316                    let search_start = end_byte.saturating_sub(self.pattern_len + d);
3317
3318                    for try_start in search_start..end_byte {
3319                        // Skip if not at a valid UTF-8 char boundary
3320                        if try_start > 0 && text[try_start] >= 0x80 && text[try_start] < 0xC0 {
3321                            continue;
3322                        }
3323
3324                        // Compute exact edit breakdown using DP
3325                        let (insertions, deletions, substitutions, swaps) =
3326                            self.compute_exact_edit_breakdown(&text[try_start..end_byte]);
3327
3328                        let total = insertions + deletions + substitutions + swaps;
3329                        if total as usize <= d {
3330                            let sim = self.calc_similarity(total, insertions, deletions);
3331                            if sim >= threshold {
3332                                // For exact match (d=0, total=0), return immediately
3333                                if d == 0 && total == 0 {
3334                                    return Some(DamLevMatch {
3335                                        start: try_start,
3336                                        end: end_byte,
3337                                        insertions,
3338                                        deletions,
3339                                        substitutions,
3340                                        swaps,
3341                                        similarity: sim,
3342                                    });
3343                                }
3344
3345                                let candidate = DamLevMatch {
3346                                    start: try_start,
3347                                    end: end_byte,
3348                                    insertions,
3349                                    deletions,
3350                                    substitutions,
3351                                    swaps,
3352                                    similarity: sim,
3353                                };
3354                                // Prefer: fewer edits, then closer to pattern length
3355                                let match_len = end_byte - try_start;
3356                                let len_diff = (match_len as i32 - self.pattern_len as i32).abs();
3357                                if best_match.as_ref().is_none_or(|(best_d, b)| {
3358                                    let b_len = b.end - b.start;
3359                                    let b_len_diff = (b_len as i32 - self.pattern_len as i32).abs();
3360                                    d < *best_d
3361                                        || (d == *best_d && total < b.total_edits())
3362                                        || (d == *best_d
3363                                            && total == b.total_edits()
3364                                            && len_diff < b_len_diff)
3365                                }) {
3366                                    if best_match.is_none() {
3367                                        chars_since_first_match = 0;
3368                                    }
3369                                    best_match = Some((d, candidate));
3370                                }
3371                            }
3372                        }
3373                    }
3374                }
3375            }
3376
3377            // After finding a fuzzy match, check if we need to continue looking for better matches.
3378            // Only continue if the match is "suspicious" (shorter than pattern, indicating possible
3379            // early accept due to deletions). If match_length >= pattern_length, return immediately.
3380            if let Some((_, ref m)) = best_match {
3381                let match_len = m.end - m.start;
3382                if match_len >= self.pattern_len {
3383                    // Match is at least pattern length - can't be early accept due to deletions
3384                    return best_match.map(|(_, m)| m);
3385                }
3386                // Short match - might be early accept, continue for a few more chars
3387                chars_since_first_match += 1;
3388                if chars_since_first_match > max_edits {
3389                    return best_match.map(|(_, m)| m);
3390                }
3391            }
3392
3393            prev_mask = char_mask;
3394            pos += char_len;
3395            rotation = (rotation + 1) % 3;
3396        }
3397
3398        best_match.map(|(_, m)| m)
3399    }
3400}
3401
3402// SIMD-accelerated Bitap for ARM with NEON
3403#[cfg(all(feature = "simd", target_arch = "aarch64"))]
3404mod simd_neon {
3405    #[allow(clippy::wildcard_imports)]
3406    use std::arch::aarch64::*;
3407
3408    /// NEON state update with transposition for k <= 1.
3409    #[inline]
3410    pub unsafe fn update_states_with_trans_k1_neon(
3411        r: &mut [u64; 4],
3412        old_r: &[u64; 4],
3413        old_old_r: &[u64; 4],
3414        char_mask: u64,
3415        prev_mask: u64,
3416    ) {
3417        unsafe {
3418            let old_vec = vld1q_u64(old_r.as_ptr());
3419            let old_old_vec = vld1q_u64(old_old_r.as_ptr());
3420            let mask = vdupq_n_u64(char_mask);
3421
3422            // match_d = (old_r[d] << 1) | char_mask
3423            let match_d = vorrq_u64(vshlq_n_u64(old_vec, 1), mask);
3424
3425            // insert: [!0, old_r[0]]
3426            let all_ones = vdupq_n_u64(!0u64);
3427            let insert = vextq_u64(all_ones, old_vec, 1);
3428
3429            // substitute = insert << 1
3430            let subst = vshlq_n_u64(insert, 1);
3431
3432            // Transposition
3433            let trans_valid = char_mask | (prev_mask >> 1);
3434            let trans_valid_vec = vdupq_n_u64(trans_valid);
3435            let old_old_dm1 = vextq_u64(all_ones, old_old_vec, 1);
3436            let trans_inner = vorrq_u64(vshlq_n_u64(old_old_dm1, 1), trans_valid_vec);
3437            let trans = vshlq_n_u64(trans_inner, 1);
3438
3439            // partial = match_d & insert & substitute & trans
3440            let partial = vandq_u64(vandq_u64(vandq_u64(match_d, insert), subst), trans);
3441
3442            vst1q_u64(r.as_mut_ptr(), partial);
3443            r[1] &= r[0] << 1; // left shift advances pattern position
3444        }
3445    }
3446}
3447
3448// SIMD-accelerated Bitap for x86_64 with AVX2
3449#[cfg(all(feature = "simd", target_arch = "x86_64"))]
3450mod simd_avx2 {
3451    #[cfg(target_arch = "x86_64")]
3452    #[allow(clippy::wildcard_imports)]
3453    use std::arch::x86_64::*;
3454
3455    /// Check if AVX2 is available at runtime.
3456    #[inline]
3457    pub fn is_available() -> bool {
3458        is_x86_feature_detected!("avx2")
3459    }
3460
3461    /// SIMD-accelerated state update for Bitap with k <= 3.
3462    ///
3463    /// Computes:
3464    /// - `r[0]` = (`old_r[0]` << 1) | `char_mask`
3465    /// - `r[d]` = ((`old_r[d]` << 1) | `char_mask`) & `old_r[d-1]` & (`r[d-1]` << 1) & (`old_r[d-1]` << 1)
3466    ///
3467    /// The cascade dependency (r[d] depends on r[d-1]) is handled sequentially after
3468    /// computing the independent terms in parallel.
3469    ///
3470    /// # Safety
3471    /// Requires AVX2 support. Caller must verify with `is_available()`.
3472    #[target_feature(enable = "avx2")]
3473    #[inline]
3474    #[allow(dead_code)]
3475    pub unsafe fn update_states_avx2(
3476        r: &mut [u64; 4],
3477        old_r: &[u64; 4],
3478        char_mask: u64,
3479        max_edits: usize,
3480    ) {
3481        debug_assert!(max_edits <= 3);
3482
3483        unsafe {
3484            // Load old states into 256-bit register
3485            let old_vec = _mm256_loadu_si256(old_r.as_ptr().cast::<__m256i>());
3486            let mask = _mm256_set1_epi64x(char_mask as i64);
3487
3488            // match_d = (old_r[d] << 1) | char_mask (parallel for all d)
3489            let match_d = _mm256_or_si256(_mm256_slli_epi64(old_vec, 1), mask);
3490
3491            // insert = old_r[d-1]: shift lanes right, filling lane 0 with !0
3492            // Use permute to shift: [old_r[0], old_r[1], old_r[2], old_r[3]] -> [!0, old_r[0], old_r[1], old_r[2]]
3493            let all_ones = _mm256_set1_epi64x(!0i64);
3494            // _mm256_permute4x64_epi64 with control 0b10_01_00_11 = [3,0,1,2] but we need [X,0,1,2]
3495            // Instead, use blend: shift and insert !0 at position 0
3496            let shifted = _mm256_permute4x64_epi64(old_vec, 0b10_01_00_00); // [0,0,1,2]
3497            let insert = _mm256_blend_epi32(shifted, all_ones, 0b0000_0011); // lane 0 = !0
3498
3499            // substitute = old_r[d-1] << 1
3500            let subst = _mm256_slli_epi64(insert, 1);
3501
3502            // Combine independent terms: match_d & insert & substitute
3503            let partial = _mm256_and_si256(_mm256_and_si256(match_d, insert), subst);
3504
3505            // Store partial results
3506            _mm256_storeu_si256(r.as_mut_ptr().cast::<__m256i>(), partial);
3507
3508            // Apply delete cascade: r[d] &= r[d-1] << 1
3509            // Left shift advances pattern position, must be sequential due to dependency
3510            if max_edits >= 1 {
3511                r[1] &= r[0] << 1;
3512            }
3513            if max_edits >= 2 {
3514                r[2] &= r[1] << 1;
3515            }
3516            if max_edits >= 3 {
3517                r[3] &= r[2] << 1;
3518            }
3519        }
3520    }
3521
3522    /// SIMD-accelerated state update with transposition support.
3523    ///
3524    /// # Safety
3525    /// Requires AVX2 support. Caller must verify with `is_available()`.
3526    #[target_feature(enable = "avx2")]
3527    #[inline]
3528    pub unsafe fn update_states_with_trans_avx2(
3529        r: &mut [u64; 4],
3530        old_r: &[u64; 4],
3531        old_old_r: &[u64; 4],
3532        char_mask: u64,
3533        prev_mask: u64,
3534        max_edits: usize,
3535    ) {
3536        debug_assert!(max_edits <= 3);
3537
3538        unsafe {
3539            // Load old states
3540            let old_vec = _mm256_loadu_si256(old_r.as_ptr().cast::<__m256i>());
3541            let old_old_vec = _mm256_loadu_si256(old_old_r.as_ptr().cast::<__m256i>());
3542            let mask = _mm256_set1_epi64x(char_mask as i64);
3543
3544            // match_d = (old_r[d] << 1) | char_mask
3545            let match_d = _mm256_or_si256(_mm256_slli_epi64(old_vec, 1), mask);
3546
3547            // Shift for d-1 access
3548            let all_ones = _mm256_set1_epi64x(!0i64);
3549            let shifted_old = _mm256_permute4x64_epi64(old_vec, 0b10_01_00_00);
3550            let insert = _mm256_blend_epi32(shifted_old, all_ones, 0b0000_0011);
3551
3552            // substitute = old_r[d-1] << 1
3553            let subst = _mm256_slli_epi64(insert, 1);
3554
3555            // Transposition term
3556            // trans_valid_mask: bit j is 0 if pattern[j]=curr AND pattern[j+1]=prev
3557            let trans_valid = char_mask | (prev_mask >> 1);
3558            let trans_valid_vec = _mm256_set1_epi64x(trans_valid as i64);
3559
3560            // Shift old_old for d-1 access
3561            let shifted_old_old = _mm256_permute4x64_epi64(old_old_vec, 0b10_01_00_00);
3562            let old_old_dm1 = _mm256_blend_epi32(shifted_old_old, all_ones, 0b0000_0011);
3563
3564            // trans = ((old_old_r[d-1] << 1) | trans_valid_mask) << 1
3565            let trans_inner = _mm256_or_si256(_mm256_slli_epi64(old_old_dm1, 1), trans_valid_vec);
3566            let trans = _mm256_slli_epi64(trans_inner, 1);
3567
3568            // Combine: match_d & insert & substitute & trans
3569            let partial = _mm256_and_si256(
3570                _mm256_and_si256(_mm256_and_si256(match_d, insert), subst),
3571                trans,
3572            );
3573
3574            // Store partial results
3575            _mm256_storeu_si256(r.as_mut_ptr().cast::<__m256i>(), partial);
3576
3577            // Apply delete cascade - left shift advances pattern position
3578            if max_edits >= 1 {
3579                r[1] &= r[0] << 1;
3580            }
3581            if max_edits >= 2 {
3582                r[2] &= r[1] << 1;
3583            }
3584            if max_edits >= 3 {
3585                r[3] &= r[2] << 1;
3586            }
3587        }
3588    }
3589}
3590
3591#[cfg(test)]
3592mod tests {
3593    use super::*;
3594
3595    #[test]
3596    fn test_exact_match() {
3597        let matcher = BitapMatcher::new("hello", EditLimits::new(0), false).unwrap();
3598        let matches = matcher.find_all("hello world", 0.8);
3599
3600        assert!(!matches.is_empty());
3601        assert!(matches.iter().any(|m| m.start == 0 && m.total_edits() == 0));
3602    }
3603
3604    #[test]
3605    fn test_one_substitution() {
3606        let matcher = BitapMatcher::new("hello", EditLimits::new(1), false).unwrap();
3607        let matches = matcher.find_all("hallo world", 0.5);
3608
3609        assert!(!matches.is_empty());
3610        assert!(matches.iter().any(|m| m.start == 0));
3611    }
3612
3613    #[test]
3614    fn test_find_first() {
3615        let matcher = BitapMatcher::new("quick", EditLimits::new(1), false).unwrap();
3616        let result = matcher.find_first("The quick brown fox", 0.8);
3617
3618        assert!(result.is_some());
3619        let m = result.unwrap();
3620        assert_eq!(m.start, 4);
3621    }
3622
3623    #[test]
3624    fn test_case_insensitive() {
3625        let matcher = BitapMatcher::new("hello", EditLimits::new(0), true).unwrap();
3626        let matches = matcher.find_all("HELLO world", 0.8);
3627
3628        assert!(!matches.is_empty());
3629    }
3630
3631    #[test]
3632    fn test_pattern_too_long() {
3633        let long_pattern = "a".repeat(65);
3634        let result = BitapMatcher::new(&long_pattern, EditLimits::new(1), false);
3635        assert!(result.is_none());
3636    }
3637
3638    #[test]
3639    fn test_transposition_match() {
3640        // With transposition support, "ba" should match "ab" with 1 edit (swap)
3641        // not 2 edits (2 substitutions)
3642        let matcher = BitapMatcher::new("ab", EditLimits::new(1), false).unwrap();
3643        let result = matcher.find_at_byte_position(b"ba", 0, 0.5);
3644
3645        assert!(result.is_some(), "Should find transposition match");
3646        let m = result.unwrap();
3647        assert_eq!(m.total_edits(), 1, "Transposition should count as 1 edit");
3648    }
3649
3650    #[test]
3651    fn test_transposition_in_word() {
3652        // "teh" should match "the" with 1 transposition
3653        let matcher = BitapMatcher::new("the", EditLimits::new(1), false).unwrap();
3654        let result = matcher.find_at_byte_position(b"teh quick brown", 0, 0.5);
3655
3656        assert!(
3657            result.is_some(),
3658            "Should find transposition match for 'teh'"
3659        );
3660        let m = result.unwrap();
3661        assert_eq!(m.total_edits(), 1, "Should be 1 edit for transposition");
3662    }
3663
3664    #[test]
3665    fn test_transposition_vs_two_substitutions() {
3666        // Without transposition, matching "ab" against "ba" would need 2 substitutions
3667        // With transposition, it's just 1 edit
3668        // Test that with max_edits=1, we CAN find "ba" because transposition counts as 1
3669        let matcher = BitapMatcher::new("ab", EditLimits::new(1), false).unwrap();
3670        let result = matcher.find_at_byte_position(b"ba", 0, 0.0);
3671        assert!(
3672            result.is_some(),
3673            "Transposition should allow match with 1 edit"
3674        );
3675    }
3676}
3677
3678/// Match result with pattern index for multi-pattern search.
3679#[derive(Debug, Clone)]
3680pub struct MultiPatternMatch {
3681    /// Index of the pattern that matched.
3682    pub pattern_index: usize,
3683    /// The match details.
3684    pub match_result: DamLevMatch,
3685}
3686
3687/// Multi-pattern Bitap matcher for searching multiple patterns in a single text pass.
3688///
3689/// This is more efficient than running N separate Bitap searches because:
3690/// 1. Text is scanned only once
3691/// 2. Character decoding is done once per character
3692/// 3. Better cache locality
3693#[derive(Debug)]
3694pub struct MultiBitapMatcher {
3695    /// Individual matchers for each pattern.
3696    matchers: Vec<BitapMatcher>,
3697    /// Case insensitive matching.
3698    case_insensitive: bool,
3699}
3700
3701impl MultiBitapMatcher {
3702    /// Create a new multi-pattern matcher.
3703    ///
3704    /// Returns None if any pattern is too long or empty.
3705    #[must_use]
3706    pub fn new(patterns: &[&str], limits: &EditLimits, case_insensitive: bool) -> Option<Self> {
3707        if patterns.is_empty() {
3708            return None;
3709        }
3710
3711        let matchers: Option<Vec<BitapMatcher>> = patterns
3712            .iter()
3713            .map(|p| BitapMatcher::new(p, limits.clone(), case_insensitive))
3714            .collect();
3715
3716        Some(MultiBitapMatcher {
3717            matchers: matchers?,
3718            case_insensitive,
3719        })
3720    }
3721
3722    /// Find all matches for all patterns in a single text pass.
3723    ///
3724    /// Returns matches with their pattern indices.
3725    #[must_use]
3726    pub fn find_all(&self, text: &str, threshold: f32) -> Vec<MultiPatternMatch> {
3727        let text_chars: Vec<(usize, char)> = text.char_indices().collect();
3728
3729        if text_chars.is_empty() || self.matchers.is_empty() {
3730            return vec![];
3731        }
3732
3733        // Find max_edits across all patterns
3734        let max_edits = self
3735            .matchers
3736            .iter()
3737            .map(|m| m.limits.max_edits as usize)
3738            .max()
3739            .unwrap_or(0);
3740
3741        // State vectors for each pattern: r[pattern][edit_level]
3742        let mut r: Vec<Vec<u64>> = self
3743            .matchers
3744            .iter()
3745            .map(|_| vec![!0u64; max_edits + 1])
3746            .collect();
3747        let mut old_r: Vec<Vec<u64>> = self
3748            .matchers
3749            .iter()
3750            .map(|_| vec![!0u64; max_edits + 1])
3751            .collect();
3752
3753        // Initialize deletion states for each pattern
3754        for (p_idx, matcher) in self.matchers.iter().enumerate() {
3755            let p_max = matcher.limits.max_edits as usize;
3756            for d in 1..=p_max {
3757                r[p_idx][d] = r[p_idx][d - 1] << 1;
3758            }
3759        }
3760
3761        // Use FxHashMap for deduplication
3762        let mut matches: FxHashMap<(usize, usize, usize), MultiPatternMatch> = FxHashMap::default();
3763
3764        // Process each character once
3765        for (char_idx, &(_, text_char)) in text_chars.iter().enumerate() {
3766            let text_char = if self.case_insensitive {
3767                text_char.to_lowercase().next().unwrap_or(text_char)
3768            } else {
3769                text_char
3770            };
3771
3772            // Update all pattern states
3773            for (p_idx, matcher) in self.matchers.iter().enumerate() {
3774                let p_max = matcher.limits.max_edits as usize;
3775                let char_mask = matcher.get_mask(text_char);
3776
3777                // Swap buffers for this pattern
3778                std::mem::swap(&mut r[p_idx], &mut old_r[p_idx]);
3779
3780                // Update R[0] (exact matching)
3781                r[p_idx][0] = (old_r[p_idx][0] << 1) | char_mask;
3782
3783                // Update R[d] for d > 0 (fuzzy matching)
3784                for d in 1..=p_max {
3785                    let insert = old_r[p_idx][d - 1];
3786                    let delete = r[p_idx][d - 1] << 1;
3787                    let substitute = old_r[p_idx][d - 1] << 1;
3788                    let match_d = (old_r[p_idx][d] << 1) | char_mask;
3789                    r[p_idx][d] = match_d & insert & delete & substitute;
3790                }
3791
3792                // Check for matches
3793                let end_byte = text_chars.get(char_idx + 1).map_or(text.len(), |(b, _)| *b);
3794
3795                for d in 0..=p_max {
3796                    if (r[p_idx][d] & matcher.accept_mask) == 0 {
3797                        // Found a match with d edits for pattern p_idx
3798                        let min_start_char = char_idx.saturating_sub(matcher.pattern_len + d);
3799                        let max_start_char =
3800                            char_idx.saturating_sub(matcher.pattern_len.saturating_sub(d + 1));
3801
3802                        for start_char in min_start_char..=max_start_char.min(char_idx) {
3803                            let start_byte = text_chars.get(start_char).map_or(0, |(b, _)| *b);
3804
3805                            let (insertions, deletions, substitutions, swaps) = matcher
3806                                .compute_exact_edit_breakdown(
3807                                    &text.as_bytes()[start_byte..end_byte],
3808                                );
3809
3810                            let total_edits = insertions + deletions + substitutions + swaps;
3811                            if total_edits as usize > d {
3812                                continue;
3813                            }
3814
3815                            let sim = matcher.calc_similarity(total_edits, insertions, deletions);
3816                            if sim >= threshold {
3817                                let key = (start_byte, end_byte, p_idx);
3818                                let m = MultiPatternMatch {
3819                                    pattern_index: p_idx,
3820                                    match_result: DamLevMatch {
3821                                        start: start_byte,
3822                                        end: end_byte,
3823                                        insertions,
3824                                        deletions,
3825                                        substitutions,
3826                                        swaps,
3827                                        similarity: sim,
3828                                    },
3829                                };
3830
3831                                matches
3832                                    .entry(key)
3833                                    .and_modify(|existing| {
3834                                        if m.match_result.similarity
3835                                            > existing.match_result.similarity
3836                                        {
3837                                            *existing = m.clone();
3838                                        }
3839                                    })
3840                                    .or_insert(m);
3841                            }
3842                        }
3843                    }
3844                }
3845            }
3846        }
3847
3848        matches.into_values().collect()
3849    }
3850
3851    /// Get the number of patterns.
3852    #[must_use]
3853    pub fn pattern_count(&self) -> usize {
3854        self.matchers.len()
3855    }
3856
3857    /// Get a pattern by index.
3858    #[must_use]
3859    pub fn pattern(&self, index: usize) -> Option<&str> {
3860        self.matchers.get(index).map(BitapMatcher::pattern)
3861    }
3862}