fuzzy_regex/engine/
prefilter.rs

1//! Prefilter for fast candidate position detection.
2//!
3//! Uses literal prefixes to quickly skip positions that cannot possibly match,
4//! dramatically improving performance for unanchored searches.
5
6#![allow(
7    clippy::match_same_arms,
8    clippy::too_many_lines,
9    clippy::items_after_statements
10)]
11
12use memchr::{memchr, memchr2, memchr3, memmem};
13
14/// A prefilter that can quickly find candidate start positions.
15#[derive(Debug, Clone)]
16pub enum Prefilter {
17    /// No prefiltering - try every position.
18    None,
19    /// Search for a single byte.
20    SingleByte {
21        /// The byte to search for.
22        byte: u8,
23        /// Maximum distance from found byte where match could start.
24        max_offset: usize,
25    },
26    /// Search for two possible bytes (for case-insensitive or fuzzy).
27    TwoBytes {
28        /// First byte to search for.
29        byte1: u8,
30        /// Second byte to search for.
31        byte2: u8,
32        /// Maximum distance from found byte where match could start.
33        max_offset: usize,
34    },
35    /// Search for three possible bytes.
36    ThreeBytes {
37        /// First byte to search for.
38        byte1: u8,
39        /// Second byte to search for.
40        byte2: u8,
41        /// Third byte to search for.
42        byte3: u8,
43        /// Maximum distance from found byte where match could start.
44        max_offset: usize,
45    },
46    /// Search for 4+ possible bytes (uses linear scan).
47    MultiBytes {
48        /// Collection of bytes to search for.
49        bytes: Vec<u8>,
50        /// Maximum distance from found byte where match could start.
51        max_offset: usize,
52    },
53    /// Search for an exact literal substring.
54    Literal {
55        /// The byte sequence to search for.
56        needle: Vec<u8>,
57    },
58    /// Search for a literal prefix with offset (for fuzzy matching).
59    LiteralWithOffset {
60        /// The byte sequence to search for.
61        needle: Vec<u8>,
62        /// Maximum distance from found literal where match could start.
63        max_offset: usize,
64    },
65    /// Fast search for exactly 2-byte literal (uses memchr + check).
66    /// Much faster than memmem for 2-byte needles.
67    TwoByteLiteral {
68        /// First byte of the literal.
69        byte1: u8,
70        /// Second byte of the literal.
71        byte2: u8,
72        /// Maximum distance from found literal where match could start.
73        max_offset: usize,
74    },
75    /// Search for a literal with fuzzy tolerance.
76    FuzzyLiteral {
77        /// First byte of the literal (or common variant).
78        first_byte: u8,
79        /// Alternative first bytes (for substitutions).
80        alt_bytes: Vec<u8>,
81        /// Maximum edit distance.
82        max_edits: usize,
83    },
84    /// Pigeonhole-based prefilter: for k edits, at least one of (k+1) pieces must match exactly.
85    /// Much more selective than single-byte prefiltering for longer patterns.
86    PigeonholePieces {
87        /// The pattern pieces (at least one must match exactly).
88        pieces: Vec<Vec<u8>>,
89        /// Offset of each piece within the original pattern.
90        offsets: Vec<usize>,
91        /// Maximum edit distance.
92        max_edits: usize,
93    },
94}
95
96impl Prefilter {
97    /// Create a prefilter for an exact literal.
98    #[must_use]
99    pub fn exact(literal: &str) -> Self {
100        if literal.is_empty() {
101            return Prefilter::None;
102        }
103
104        let bytes = literal.as_bytes();
105
106        // For ASCII-only short literals, search for first byte
107        // This is safe because ASCII characters are single-byte
108        if bytes.len() <= 3 && bytes[0] < 128 {
109            return Prefilter::SingleByte {
110                byte: bytes[0],
111                max_offset: 0,
112            };
113        }
114
115        // For non-ASCII or longer literals, use substring search
116        // This ensures we match the full first character, not just its first byte
117        Prefilter::Literal {
118            needle: bytes.to_vec(),
119        }
120    }
121
122    /// Create a prefilter for a fuzzy literal.
123    ///
124    /// Strategy: Even with fuzzy matching, certain bytes from the pattern must
125    /// appear in any valid match. We search for bytes from the first positions
126    /// that could possibly match.
127    #[must_use]
128    pub fn fuzzy(literal: &str, max_edits: u8) -> Self {
129        if literal.is_empty() {
130            return Prefilter::None;
131        }
132
133        let bytes = literal.as_bytes();
134        let max_edits_usize = max_edits as usize;
135
136        // If edits >= pattern length, any text could match - no useful prefilter
137        if max_edits_usize >= bytes.len() {
138            return Prefilter::None;
139        }
140
141        // For non-ASCII patterns (UTF-8), use substring search on the first character.
142        // Single-byte prefilter is ineffective because UTF-8 leading bytes (208, 209 for
143        // Cyrillic, 228-233 for CJK) appear in almost every character of that script.
144        //
145        // Only use this for exact matches (e<=0) or when pattern is long enough that
146        // the first character must appear. For fuzzy matches, the first char could be
147        // deleted, so we'd need to search for both first and second chars.
148        if bytes[0] >= 128 {
149            let first_char_len = Self::utf8_char_len(bytes[0]);
150            let char_count = bytes.iter().filter(|&&b| (b & 0xC0) != 0x80).count(); // Count UTF-8 start bytes
151
152            // Use substring search only when:
153            // 1. First char is multi-byte, AND
154            // 2. Either exact match (e<=0), or pattern has enough chars that first must appear
155            if first_char_len > 1 && first_char_len <= bytes.len() {
156                let first_char_must_appear =
157                    max_edits_usize == 0 || char_count > max_edits_usize + 1;
158
159                if first_char_must_appear {
160                    // For 2-byte UTF-8 characters (Cyrillic, etc.), use fast TwoByteLiteral
161                    // which is much faster than memmem::Finder for short needles
162                    if first_char_len == 2 {
163                        return Prefilter::TwoByteLiteral {
164                            byte1: bytes[0],
165                            byte2: bytes[1],
166                            max_offset: max_edits_usize,
167                        };
168                    }
169                    // For 3+ byte characters, use memmem
170                    let needle = bytes[..first_char_len].to_vec();
171                    return Prefilter::LiteralWithOffset {
172                        needle,
173                        max_offset: max_edits_usize,
174                    };
175                }
176            }
177        }
178
179        // For ASCII patterns, collect unique bytes from the first (max_edits + 1) positions.
180        let search_depth = (max_edits_usize + 1).min(bytes.len());
181        let mut search_bytes: Vec<u8> = Vec::with_capacity(search_depth * 2);
182
183        for &b in bytes.iter().take(search_depth) {
184            if !search_bytes.contains(&b) {
185                search_bytes.push(b);
186                // Also add case variant
187                if b.is_ascii_lowercase() {
188                    let upper = b.to_ascii_uppercase();
189                    if !search_bytes.contains(&upper) {
190                        search_bytes.push(upper);
191                    }
192                } else if b.is_ascii_uppercase() {
193                    let lower = b.to_ascii_lowercase();
194                    if !search_bytes.contains(&lower) {
195                        search_bytes.push(lower);
196                    }
197                }
198            }
199        }
200
201        // The max_offset accounts for:
202        // - Insertions before the pattern (up to max_edits)
203        // - The byte we find might be at position (max_edits) in pattern
204        let max_offset = max_edits_usize;
205
206        match search_bytes.len() {
207            0 => Prefilter::None,
208            1 => Prefilter::SingleByte {
209                byte: search_bytes[0],
210                max_offset,
211            },
212            2 => Prefilter::TwoBytes {
213                byte1: search_bytes[0],
214                byte2: search_bytes[1],
215                max_offset,
216            },
217            3 => Prefilter::ThreeBytes {
218                byte1: search_bytes[0],
219                byte2: search_bytes[1],
220                byte3: search_bytes[2],
221                max_offset,
222            },
223            _ => Prefilter::MultiBytes {
224                bytes: search_bytes,
225                max_offset,
226            },
227        }
228    }
229
230    /// Get the byte length of a UTF-8 character from its leading byte.
231    #[inline]
232    fn utf8_char_len(leading_byte: u8) -> usize {
233        if leading_byte < 128 {
234            1 // ASCII
235        } else if leading_byte < 224 {
236            2 // 2-byte (Cyrillic, Latin Extended, etc.)
237        } else if leading_byte < 240 {
238            3 // 3-byte (CJK, etc.)
239        } else {
240            4 // 4-byte (Emoji, rare scripts)
241        }
242    }
243
244    /// Create a prefilter for a fuzzy literal with rarity-based byte selection.
245    ///
246    /// For patterns with small alphabets (like DNA), select the rarest bytes
247    /// to minimize false positives. Falls back to streaming if all bytes are common.
248    #[must_use]
249    pub fn fuzzy_rare(literal: &str, max_edits: u8, text_sample: Option<&[u8]>) -> Self {
250        if literal.is_empty() {
251            return Prefilter::None;
252        }
253
254        let bytes = literal.as_bytes();
255        let max_edits_usize = max_edits as usize;
256
257        if max_edits_usize >= bytes.len() {
258            return Prefilter::None;
259        }
260
261        // If we have a text sample, find the rarest byte in the pattern
262        if let Some(sample) = text_sample {
263            // Count byte frequencies in sample
264            let mut freq = [0usize; 256];
265            for &b in sample {
266                freq[b as usize] += 1;
267            }
268
269            // Find the rarest byte in pattern (within first max_edits+1 positions)
270            let search_depth = (max_edits_usize + 1).min(bytes.len());
271            let mut rarest_byte = bytes[0];
272            let mut rarest_freq = freq[bytes[0] as usize];
273
274            for &b in bytes.iter().take(search_depth) {
275                if freq[b as usize] < rarest_freq {
276                    rarest_freq = freq[b as usize];
277                    rarest_byte = b;
278                }
279            }
280
281            // If the rarest byte appears in >25% of positions, prefilter won't help much
282            if rarest_freq * 4 > sample.len() {
283                return Prefilter::None; // Fall back to streaming
284            }
285
286            return Prefilter::SingleByte {
287                byte: rarest_byte,
288                max_offset: max_edits_usize,
289            };
290        }
291
292        // No sample - use standard fuzzy prefilter
293        Self::fuzzy(literal, max_edits)
294    }
295
296    /// Create a pigeonhole-based prefilter for fuzzy matching.
297    ///
298    /// For a pattern of length m with at most k edits, we split the pattern into
299    /// (k+1) non-overlapping pieces. By the pigeonhole principle, at least one
300    /// piece must match exactly in any valid fuzzy match.
301    ///
302    /// This is much more selective than single-byte prefiltering for longer patterns.
303    /// For example, `"hello"` with k=1 → pieces `["hel", "lo"]`, both 2-3 chars long.
304    /// Finding `"hel"` or `"lo"` is much rarer than finding 'h' or 'e'.
305    #[must_use]
306    pub fn pigeonhole(literal: &str, max_edits: u8) -> Self {
307        let bytes = literal.as_bytes();
308        let m = bytes.len();
309        let k = max_edits as usize;
310
311        // Need at least k+1 bytes to form k+1 pieces of at least 1 byte each
312        if m < k + 1 || k == 0 {
313            return Self::fuzzy(literal, max_edits);
314        }
315
316        // Split pattern into k+1 pieces
317        let num_pieces = k + 1;
318        let base_piece_len = m / num_pieces;
319        let extra = m % num_pieces;
320
321        let mut pieces = Vec::with_capacity(num_pieces);
322        let mut offsets = Vec::with_capacity(num_pieces);
323        let mut pos = 0;
324
325        for i in 0..num_pieces {
326            // Distribute extra bytes among first `extra` pieces
327            let piece_len = base_piece_len + usize::from(i < extra);
328            offsets.push(pos);
329            pieces.push(bytes[pos..pos + piece_len].to_vec());
330            pos += piece_len;
331        }
332
333        // Only use pigeonhole if pieces are long enough to be selective
334        // Single-byte pieces are no better than regular fuzzy prefilter
335        let min_piece_len = pieces.iter().map(Vec::len).min().unwrap_or(0);
336        if min_piece_len < 2 {
337            return Self::fuzzy(literal, max_edits);
338        }
339
340        Prefilter::PigeonholePieces {
341            pieces,
342            offsets,
343            max_edits: k,
344        }
345    }
346
347    /// Create a prefilter for case-insensitive matching.
348    #[must_use]
349    pub fn case_insensitive(literal: &str) -> Self {
350        if literal.is_empty() {
351            return Prefilter::None;
352        }
353
354        let first = literal.as_bytes()[0];
355
356        if first.is_ascii_alphabetic() {
357            Prefilter::TwoBytes {
358                byte1: first.to_ascii_lowercase(),
359                byte2: first.to_ascii_uppercase(),
360                max_offset: 0,
361            }
362        } else {
363            Prefilter::SingleByte {
364                byte: first,
365                max_offset: 0,
366            }
367        }
368    }
369
370    /// Create a prefilter for multiple fuzzy literals.
371    ///
372    /// Collects first bytes from all patterns and uses memchr to find candidates.
373    /// Each entry is (literal, `max_edits`).
374    #[must_use]
375    pub fn multi_fuzzy(literals: &[(&str, u8)], case_insensitive: bool) -> Self {
376        if literals.is_empty() {
377            return Prefilter::None;
378        }
379
380        // Collect unique first bytes from all patterns
381        let mut search_bytes: Vec<u8> = Vec::new();
382        let mut max_offset: usize = 0;
383
384        for (lit, max_edits) in literals {
385            if lit.is_empty() {
386                continue;
387            }
388
389            let bytes = lit.as_bytes();
390            let max_edits_usize = *max_edits as usize;
391
392            // If any pattern has edits >= length, can't use prefilter
393            if max_edits_usize >= bytes.len() {
394                return Prefilter::None;
395            }
396
397            // Update max_offset (accounts for insertions at start)
398            max_offset = max_offset.max(max_edits_usize);
399
400            // Collect bytes from first (max_edits + 1) positions
401            let search_depth = (max_edits_usize + 1).min(bytes.len());
402            for &b in bytes.iter().take(search_depth) {
403                if !search_bytes.contains(&b) {
404                    search_bytes.push(b);
405                }
406                // Also add case variant
407                if case_insensitive && b.is_ascii_alphabetic() {
408                    let variant = if b.is_ascii_lowercase() {
409                        b.to_ascii_uppercase()
410                    } else {
411                        b.to_ascii_lowercase()
412                    };
413                    if !search_bytes.contains(&variant) {
414                        search_bytes.push(variant);
415                    }
416                }
417            }
418        }
419
420        match search_bytes.len() {
421            0 => Prefilter::None,
422            1 => Prefilter::SingleByte {
423                byte: search_bytes[0],
424                max_offset,
425            },
426            2 => Prefilter::TwoBytes {
427                byte1: search_bytes[0],
428                byte2: search_bytes[1],
429                max_offset,
430            },
431            3 => Prefilter::ThreeBytes {
432                byte1: search_bytes[0],
433                byte2: search_bytes[1],
434                byte3: search_bytes[2],
435                max_offset,
436            },
437            _ => Prefilter::MultiBytes {
438                bytes: search_bytes,
439                max_offset,
440            },
441        }
442    }
443
444    /// Find candidate positions in the text.
445    /// Returns an iterator over byte positions where a match might start.
446    #[must_use]
447    pub fn find_candidates<'a>(&'a self, text: &'a [u8]) -> CandidateIter<'a> {
448        CandidateIter {
449            prefilter: self,
450            text,
451            pos: 0,
452            finder: None,
453        }
454    }
455
456    /// Check if this prefilter does anything useful.
457    #[must_use]
458    pub fn is_active(&self) -> bool {
459        !matches!(self, Prefilter::None)
460    }
461
462    /// Get the `max_offset` for this prefilter (how far back a match could start).
463    #[must_use]
464    pub fn max_offset(&self) -> usize {
465        match self {
466            Prefilter::None | Prefilter::Literal { .. } => 0,
467            Prefilter::SingleByte { max_offset, .. }
468            | Prefilter::TwoBytes { max_offset, .. }
469            | Prefilter::ThreeBytes { max_offset, .. }
470            | Prefilter::MultiBytes { max_offset, .. }
471            | Prefilter::LiteralWithOffset { max_offset, .. }
472            | Prefilter::TwoByteLiteral { max_offset, .. } => *max_offset,
473            Prefilter::FuzzyLiteral { max_edits, .. }
474            | Prefilter::PigeonholePieces { max_edits, .. } => *max_edits,
475        }
476    }
477
478    /// Check if this prefilter is "selective" enough to be effective.
479    /// A prefilter searching for too many different bytes will hit too many positions.
480    /// Returns the number of unique bytes being searched for (lower is better).
481    #[must_use]
482    pub fn selectivity(&self) -> usize {
483        match self {
484            Prefilter::None => usize::MAX,
485            Prefilter::SingleByte { .. } => 1,
486            Prefilter::TwoBytes { .. }
487            | Prefilter::ThreeBytes { .. }
488            | Prefilter::TwoByteLiteral { .. } => 2,
489            Prefilter::MultiBytes { bytes, .. } => bytes.len(),
490            Prefilter::Literal { needle } | Prefilter::LiteralWithOffset { needle, .. } => {
491                needle.len().min(4)
492            }
493            Prefilter::FuzzyLiteral { alt_bytes, .. } => 1 + alt_bytes.len(),
494            Prefilter::PigeonholePieces { pieces, .. } => {
495                pieces.iter().map(Vec::len).min().unwrap_or(1)
496            }
497        }
498    }
499
500    /// Check if this prefilter is likely to be effective for multi-pattern search.
501    /// Returns false if the prefilter searches for too many different bytes.
502    #[must_use]
503    pub fn is_selective(&self) -> bool {
504        // More than 6 different bytes starts to become ineffective
505        // (each common byte appears ~5-10% of the time in English text)
506        // With 6 bytes, we hit ~30-40% of positions which is still reasonable.
507        // With 8+ bytes, we hit ~50%+ which makes prefiltering ineffective.
508        self.selectivity() <= 6
509    }
510}
511
512/// Iterator over candidate start positions.
513pub struct CandidateIter<'a> {
514    /// Reference to the prefilter configuration.
515    prefilter: &'a Prefilter,
516    /// The text being searched.
517    text: &'a [u8],
518    /// Current search position in the text.
519    pos: usize,
520    /// Lazily initialized substring finder for literal searches.
521    finder: Option<memmem::Finder<'a>>,
522}
523
524impl Iterator for CandidateIter<'_> {
525    type Item = usize;
526
527    #[allow(clippy::too_many_lines)]
528    fn next(&mut self) -> Option<usize> {
529        if self.pos >= self.text.len() {
530            return None;
531        }
532
533        match self.prefilter {
534            Prefilter::None => {
535                // Return every position
536                let result = self.pos;
537                self.pos += 1;
538                // Skip to next char boundary
539                while self.pos < self.text.len() && !is_char_boundary(self.text, self.pos) {
540                    self.pos += 1;
541                }
542                Some(result)
543            }
544
545            Prefilter::SingleByte { byte, max_offset } => {
546                if let Some(idx) = memchr(*byte, &self.text[self.pos..]) {
547                    let found = self.pos + idx;
548                    self.pos = found + 1;
549
550                    // Return position adjusted by max_offset
551                    // Match could start up to max_offset before the found byte
552                    Some(found.saturating_sub(*max_offset))
553                } else {
554                    self.pos = self.text.len();
555                    None
556                }
557            }
558
559            Prefilter::TwoBytes {
560                byte1,
561                byte2,
562                max_offset,
563            } => {
564                if let Some(idx) = memchr2(*byte1, *byte2, &self.text[self.pos..]) {
565                    let found = self.pos + idx;
566                    self.pos = found + 1;
567                    Some(found.saturating_sub(*max_offset))
568                } else {
569                    self.pos = self.text.len();
570                    None
571                }
572            }
573
574            Prefilter::ThreeBytes {
575                byte1,
576                byte2,
577                byte3,
578                max_offset,
579            } => {
580                if let Some(idx) = memchr3(*byte1, *byte2, *byte3, &self.text[self.pos..]) {
581                    let found = self.pos + idx;
582                    self.pos = found + 1;
583                    Some(found.saturating_sub(*max_offset))
584                } else {
585                    self.pos = self.text.len();
586                    None
587                }
588            }
589
590            Prefilter::MultiBytes { bytes, max_offset } => {
591                // Linear scan for any of the bytes
592                let remaining = &self.text[self.pos..];
593                if let Some(idx) = remaining.iter().position(|b| bytes.contains(b)) {
594                    let found = self.pos + idx;
595                    self.pos = found + 1;
596                    Some(found.saturating_sub(*max_offset))
597                } else {
598                    self.pos = self.text.len();
599                    None
600                }
601            }
602
603            Prefilter::Literal { needle } => {
604                // Initialize finder lazily
605                let finder = self
606                    .finder
607                    .get_or_insert_with(|| memmem::Finder::new(needle));
608
609                if let Some(idx) = finder.find(&self.text[self.pos..]) {
610                    let found = self.pos + idx;
611                    self.pos = found + 1;
612                    Some(found)
613                } else {
614                    self.pos = self.text.len();
615                    None
616                }
617            }
618
619            Prefilter::LiteralWithOffset { needle, max_offset } => {
620                // Initialize finder lazily - memmem uses SIMD for fast search
621                let finder = self
622                    .finder
623                    .get_or_insert_with(|| memmem::Finder::new(needle));
624
625                if let Some(idx) = finder.find(&self.text[self.pos..]) {
626                    let found = self.pos + idx;
627                    self.pos = found + 1;
628                    Some(found.saturating_sub(*max_offset))
629                } else {
630                    self.pos = self.text.len();
631                    None
632                }
633            }
634
635            Prefilter::TwoByteLiteral {
636                byte1,
637                byte2,
638                max_offset,
639            } => {
640                // For short remaining text, use memchr + second byte check (low overhead)
641                // For long remaining text, use memmem (SIMD-efficient for many false positives)
642                let remaining = &self.text[self.pos..];
643
644                // Threshold tuned empirically: memmem SIMD setup cost ~10-15ns
645                // breaks even around 50-100 bytes of scanning
646                const MEMMEM_THRESHOLD: usize = 64;
647
648                if remaining.len() < MEMMEM_THRESHOLD {
649                    // Short text: use memchr + check (avoid memmem setup)
650                    let mut search_pos = 0;
651                    while search_pos < remaining.len() {
652                        if let Some(idx) = memchr(*byte1, &remaining[search_pos..]) {
653                            let abs_idx = search_pos + idx;
654                            if abs_idx + 1 < remaining.len() && remaining[abs_idx + 1] == *byte2 {
655                                let found = self.pos + abs_idx;
656                                self.pos = found + 1;
657                                return Some(found.saturating_sub(*max_offset));
658                            }
659                            search_pos = abs_idx + 1;
660                        } else {
661                            break;
662                        }
663                    }
664                    self.pos = self.text.len();
665                    None
666                } else {
667                    // Long text: use memmem for SIMD-optimized search
668                    let needle = [*byte1, *byte2];
669                    let finder = memmem::Finder::new(&needle);
670                    if let Some(idx) = finder.find(remaining) {
671                        let found = self.pos + idx;
672                        self.pos = found + 1;
673                        Some(found.saturating_sub(*max_offset))
674                    } else {
675                        self.pos = self.text.len();
676                        None
677                    }
678                }
679            }
680
681            Prefilter::FuzzyLiteral {
682                first_byte,
683                alt_bytes,
684                max_edits,
685            } => {
686                // Search for first byte or any alternative
687                let remaining = &self.text[self.pos..];
688
689                let idx = if alt_bytes.is_empty() {
690                    memchr(*first_byte, remaining)
691                } else if alt_bytes.len() == 1 {
692                    memchr2(*first_byte, alt_bytes[0], remaining)
693                } else if alt_bytes.len() == 2 {
694                    memchr3(*first_byte, alt_bytes[0], alt_bytes[1], remaining)
695                } else {
696                    // Fallback to simple search
697                    remaining
698                        .iter()
699                        .position(|&b| b == *first_byte || alt_bytes.contains(&b))
700                };
701
702                if let Some(idx) = idx {
703                    let found = self.pos + idx;
704                    self.pos = found + 1;
705                    Some(found.saturating_sub(*max_edits))
706                } else {
707                    self.pos = self.text.len();
708                    None
709                }
710            }
711
712            Prefilter::PigeonholePieces {
713                pieces,
714                offsets,
715                max_edits,
716            } => {
717                // Search for any piece match and return adjusted position
718                // Strategy: search for first bytes of each piece, verify full match
719                let remaining = &self.text[self.pos..];
720
721                // Collect first bytes of all pieces for fast initial scan
722                let first_bytes: Vec<u8> = pieces.iter().map(|p| p[0]).collect();
723
724                // Find the next occurrence of any first byte
725                let mut search_offset = 0;
726                while search_offset < remaining.len() {
727                    // Find next position with any of the first bytes
728                    let byte_match = match first_bytes.len() {
729                        1 => memchr(first_bytes[0], &remaining[search_offset..]),
730                        2 => memchr2(first_bytes[0], first_bytes[1], &remaining[search_offset..]),
731                        3 => memchr3(
732                            first_bytes[0],
733                            first_bytes[1],
734                            first_bytes[2],
735                            &remaining[search_offset..],
736                        ),
737                        _ => remaining[search_offset..]
738                            .iter()
739                            .position(|b| first_bytes.contains(b)),
740                    };
741
742                    match byte_match {
743                        Some(idx) => {
744                            let abs_idx = search_offset + idx;
745                            let text_pos = self.pos + abs_idx;
746
747                            // Check which piece(s) match at this position
748                            for (piece_idx, piece) in pieces.iter().enumerate() {
749                                if remaining[abs_idx..].starts_with(piece) {
750                                    // Found a piece match!
751                                    // The pattern could start at text_pos - offset - max_edits
752                                    let piece_offset = offsets[piece_idx];
753                                    let candidate_start = text_pos
754                                        .saturating_sub(piece_offset)
755                                        .saturating_sub(*max_edits);
756
757                                    self.pos = text_pos + 1;
758                                    return Some(candidate_start);
759                                }
760                            }
761
762                            // First byte matched but no piece matched - continue searching
763                            search_offset = abs_idx + 1;
764                        }
765                        None => {
766                            // No more first bytes found
767                            break;
768                        }
769                    }
770                }
771
772                self.pos = self.text.len();
773                None
774            }
775        }
776    }
777}
778
779/// Check if a byte position is a UTF-8 char boundary.
780#[inline]
781fn is_char_boundary(text: &[u8], pos: usize) -> bool {
782    if pos >= text.len() {
783        return true;
784    }
785    // UTF-8 continuation bytes start with 10xxxxxx
786    (text[pos] & 0b1100_0000) != 0b1000_0000
787}
788
789/// Extract a prefilter from pattern information.
790#[must_use]
791pub fn create_prefilter(
792    first_literal: Option<&str>,
793    max_edits: Option<u8>,
794    case_insensitive: bool,
795) -> Prefilter {
796    match first_literal {
797        None | Some("") => Prefilter::None,
798        Some(lit) => {
799            if case_insensitive {
800                Prefilter::case_insensitive(lit)
801            } else if let Some(edits) = max_edits {
802                if edits > 0 {
803                    Prefilter::fuzzy(lit, edits)
804                } else {
805                    Prefilter::exact(lit)
806                }
807            } else {
808                Prefilter::exact(lit)
809            }
810        }
811    }
812}
813
814#[cfg(test)]
815mod tests {
816    use super::*;
817
818    #[test]
819    fn test_exact_prefilter() {
820        let pf = Prefilter::exact("hello");
821        let text = b"say hello world hello";
822        let candidates: Vec<_> = pf.find_candidates(text).collect();
823        assert_eq!(candidates, vec![4, 16]);
824    }
825
826    #[test]
827    fn test_single_byte_prefilter() {
828        let pf = Prefilter::SingleByte {
829            byte: b'h',
830            max_offset: 0,
831        };
832        let text = b"say hello world";
833        let candidates: Vec<_> = pf.find_candidates(text).collect();
834        assert_eq!(candidates, vec![4]);
835    }
836
837    #[test]
838    fn test_fuzzy_prefilter() {
839        // Fuzzy prefilter searches for bytes from first (max_edits+1) positions
840        // "hello" with 2 edits: searches for 'h', 'H', 'e' (first 3 unique bytes with case variants)
841        let pf = Prefilter::fuzzy("hello", 2);
842        let text = b"say hello world";
843        let candidates: Vec<_> = pf.find_candidates(text).collect();
844        // 'h' at position 4 → returns 4-2=2
845        // 'e' at position 5 → returns 5-2=3
846        // 'e' at position 12 → returns 12-2=10
847        // 'l' and 'o' are not in search set (only first 3 bytes used)
848        assert!(candidates.contains(&2), "Should find 'h' at 4, offset to 2");
849        assert!(candidates.contains(&3), "Should find 'e' at 5, offset to 3");
850
851        // For exact match (0 edits), prefilter works normally
852        let pf_exact = Prefilter::fuzzy("hello", 0);
853        let candidates_exact: Vec<_> = pf_exact.find_candidates(text).collect();
854        assert_eq!(candidates_exact, vec![4]); // Just 'h' at position 4
855
856        // For high edit distance >= pattern length, no prefilter
857        let pf_high = Prefilter::fuzzy("hi", 2);
858        assert!(matches!(pf_high, Prefilter::None));
859    }
860
861    #[test]
862    fn test_case_insensitive_prefilter() {
863        let pf = Prefilter::case_insensitive("Hello");
864        let text = b"say HELLO and hello";
865        let candidates: Vec<_> = pf.find_candidates(text).collect();
866        assert_eq!(candidates, vec![4, 14]); // H at 4, h at 14
867    }
868
869    #[test]
870    fn test_pigeonhole_prefilter() {
871        // "hello" with 1 edit → pieces ["hel", "lo"] (at least one must match exactly)
872        let pf = Prefilter::pigeonhole("hello", 1);
873
874        // Should be PigeonholePieces variant
875        assert!(matches!(pf, Prefilter::PigeonholePieces { .. }));
876
877        // Text containing "hel" → should find candidate
878        let text1 = b"say hel world";
879        let candidates1: Vec<_> = pf.find_candidates(text1).collect();
880        assert!(!candidates1.is_empty(), "Should find 'hel' piece");
881
882        // Text containing "lo" → should find candidate
883        let text2 = b"say lo world";
884        let candidates2: Vec<_> = pf.find_candidates(text2).collect();
885        assert!(!candidates2.is_empty(), "Should find 'lo' piece");
886
887        // Text containing "hello" → should find candidate at correct position
888        let text3 = b"say hello world";
889        let candidates3: Vec<_> = pf.find_candidates(text3).collect();
890        assert!(!candidates3.is_empty(), "Should find 'hello'");
891        // "hel" is at position 4, offset 0 in pattern, max_edits=1 → candidate at 4-0-1=3
892        assert!(
893            candidates3.contains(&3),
894            "Candidate should include position 3"
895        );
896
897        // Text without any pieces → no candidates
898        let text4 = b"say xyz world";
899        let candidates4: Vec<_> = pf.find_candidates(text4).collect();
900        assert!(
901            candidates4.is_empty(),
902            "Should not find candidates in text without pieces"
903        );
904
905        // Short patterns fall back to regular fuzzy prefilter
906        let pf_short = Prefilter::pigeonhole("hi", 1);
907        assert!(!matches!(pf_short, Prefilter::PigeonholePieces { .. }));
908    }
909
910    #[test]
911    fn test_pigeonhole_pieces_split() {
912        // Test piece splitting for various pattern/edit combinations
913
914        // "abcdefgh" (8 chars) with 1 edit → 2 pieces of 4 chars each
915        let pf1 = Prefilter::pigeonhole("abcdefgh", 1);
916        if let Prefilter::PigeonholePieces {
917            pieces, offsets, ..
918        } = &pf1
919        {
920            assert_eq!(pieces.len(), 2);
921            assert_eq!(pieces[0], b"abcd");
922            assert_eq!(pieces[1], b"efgh");
923            assert_eq!(offsets, &[0, 4]);
924        } else {
925            panic!("Expected PigeonholePieces");
926        }
927
928        // "abcdefgh" with 3 edits → 4 pieces of 2 chars each
929        let pf2 = Prefilter::pigeonhole("abcdefgh", 3);
930        if let Prefilter::PigeonholePieces {
931            pieces, offsets, ..
932        } = &pf2
933        {
934            assert_eq!(pieces.len(), 4);
935            assert_eq!(pieces[0], b"ab");
936            assert_eq!(pieces[1], b"cd");
937            assert_eq!(pieces[2], b"ef");
938            assert_eq!(pieces[3], b"gh");
939            assert_eq!(offsets, &[0, 2, 4, 6]);
940        } else {
941            panic!("Expected PigeonholePieces");
942        }
943
944        // "abcde" (5 chars) with 2 edits → 3 pieces [2, 2, 1] - but min piece < 2, falls back
945        let pf3 = Prefilter::pigeonhole("abcde", 2);
946        assert!(
947            !matches!(pf3, Prefilter::PigeonholePieces { .. }),
948            "Should fall back when pieces too short"
949        );
950    }
951}
fuzzy_regex/engine/prefilter.rs

fuzzy_regex/engine/
prefilter.rs