fuzzy_regex/engine/prefilter.rs
1//! Prefilter for fast candidate position detection.
2//!
3//! Uses literal prefixes to quickly skip positions that cannot possibly match,
4//! dramatically improving performance for unanchored searches.
5
6#![allow(
7 clippy::match_same_arms,
8 clippy::too_many_lines,
9 clippy::items_after_statements
10)]
11
12use memchr::{memchr, memchr2, memchr3, memmem};
13
14/// A prefilter that can quickly find candidate start positions.
15#[derive(Debug, Clone)]
16pub enum Prefilter {
17 /// No prefiltering - try every position.
18 None,
19 /// Search for a single byte.
20 SingleByte {
21 /// The byte to search for.
22 byte: u8,
23 /// Maximum distance from found byte where match could start.
24 max_offset: usize,
25 },
26 /// Search for two possible bytes (for case-insensitive or fuzzy).
27 TwoBytes {
28 /// First byte to search for.
29 byte1: u8,
30 /// Second byte to search for.
31 byte2: u8,
32 /// Maximum distance from found byte where match could start.
33 max_offset: usize,
34 },
35 /// Search for three possible bytes.
36 ThreeBytes {
37 /// First byte to search for.
38 byte1: u8,
39 /// Second byte to search for.
40 byte2: u8,
41 /// Third byte to search for.
42 byte3: u8,
43 /// Maximum distance from found byte where match could start.
44 max_offset: usize,
45 },
46 /// Search for 4+ possible bytes (uses linear scan).
47 MultiBytes {
48 /// Collection of bytes to search for.
49 bytes: Vec<u8>,
50 /// Maximum distance from found byte where match could start.
51 max_offset: usize,
52 },
53 /// Search for an exact literal substring.
54 Literal {
55 /// The byte sequence to search for.
56 needle: Vec<u8>,
57 },
58 /// Search for a literal prefix with offset (for fuzzy matching).
59 LiteralWithOffset {
60 /// The byte sequence to search for.
61 needle: Vec<u8>,
62 /// Maximum distance from found literal where match could start.
63 max_offset: usize,
64 },
65 /// Fast search for exactly 2-byte literal (uses memchr + check).
66 /// Much faster than memmem for 2-byte needles.
67 TwoByteLiteral {
68 /// First byte of the literal.
69 byte1: u8,
70 /// Second byte of the literal.
71 byte2: u8,
72 /// Maximum distance from found literal where match could start.
73 max_offset: usize,
74 },
75 /// Search for a literal with fuzzy tolerance.
76 FuzzyLiteral {
77 /// First byte of the literal (or common variant).
78 first_byte: u8,
79 /// Alternative first bytes (for substitutions).
80 alt_bytes: Vec<u8>,
81 /// Maximum edit distance.
82 max_edits: usize,
83 },
84 /// Pigeonhole-based prefilter: for k edits, at least one of (k+1) pieces must match exactly.
85 /// Much more selective than single-byte prefiltering for longer patterns.
86 PigeonholePieces {
87 /// The pattern pieces (at least one must match exactly).
88 pieces: Vec<Vec<u8>>,
89 /// Offset of each piece within the original pattern.
90 offsets: Vec<usize>,
91 /// Maximum edit distance.
92 max_edits: usize,
93 },
94}
95
96impl Prefilter {
97 /// Create a prefilter for an exact literal.
98 #[must_use]
99 pub fn exact(literal: &str) -> Self {
100 if literal.is_empty() {
101 return Prefilter::None;
102 }
103
104 let bytes = literal.as_bytes();
105
106 // For ASCII-only short literals, search for first byte
107 // This is safe because ASCII characters are single-byte
108 if bytes.len() <= 3 && bytes[0] < 128 {
109 return Prefilter::SingleByte {
110 byte: bytes[0],
111 max_offset: 0,
112 };
113 }
114
115 // For non-ASCII or longer literals, use substring search
116 // This ensures we match the full first character, not just its first byte
117 Prefilter::Literal {
118 needle: bytes.to_vec(),
119 }
120 }
121
122 /// Create a prefilter for a fuzzy literal.
123 ///
124 /// Strategy: Even with fuzzy matching, certain bytes from the pattern must
125 /// appear in any valid match. We search for bytes from the first positions
126 /// that could possibly match.
127 #[must_use]
128 pub fn fuzzy(literal: &str, max_edits: u8) -> Self {
129 if literal.is_empty() {
130 return Prefilter::None;
131 }
132
133 let bytes = literal.as_bytes();
134 let max_edits_usize = max_edits as usize;
135
136 // If edits >= pattern length, any text could match - no useful prefilter
137 if max_edits_usize >= bytes.len() {
138 return Prefilter::None;
139 }
140
141 // For non-ASCII patterns (UTF-8), use substring search on the first character.
142 // Single-byte prefilter is ineffective because UTF-8 leading bytes (208, 209 for
143 // Cyrillic, 228-233 for CJK) appear in almost every character of that script.
144 //
145 // Only use this for exact matches (e<=0) or when pattern is long enough that
146 // the first character must appear. For fuzzy matches, the first char could be
147 // deleted, so we'd need to search for both first and second chars.
148 if bytes[0] >= 128 {
149 let first_char_len = Self::utf8_char_len(bytes[0]);
150 let char_count = bytes.iter().filter(|&&b| (b & 0xC0) != 0x80).count(); // Count UTF-8 start bytes
151
152 // Use substring search only when:
153 // 1. First char is multi-byte, AND
154 // 2. Either exact match (e<=0), or pattern has enough chars that first must appear
155 if first_char_len > 1 && first_char_len <= bytes.len() {
156 let first_char_must_appear =
157 max_edits_usize == 0 || char_count > max_edits_usize + 1;
158
159 if first_char_must_appear {
160 // For 2-byte UTF-8 characters (Cyrillic, etc.), use fast TwoByteLiteral
161 // which is much faster than memmem::Finder for short needles
162 if first_char_len == 2 {
163 return Prefilter::TwoByteLiteral {
164 byte1: bytes[0],
165 byte2: bytes[1],
166 max_offset: max_edits_usize,
167 };
168 }
169 // For 3+ byte characters, use memmem
170 let needle = bytes[..first_char_len].to_vec();
171 return Prefilter::LiteralWithOffset {
172 needle,
173 max_offset: max_edits_usize,
174 };
175 }
176 }
177 }
178
179 // For ASCII patterns, collect unique bytes from the first (max_edits + 1) positions.
180 let search_depth = (max_edits_usize + 1).min(bytes.len());
181 let mut search_bytes: Vec<u8> = Vec::with_capacity(search_depth * 2);
182
183 for &b in bytes.iter().take(search_depth) {
184 if !search_bytes.contains(&b) {
185 search_bytes.push(b);
186 // Also add case variant
187 if b.is_ascii_lowercase() {
188 let upper = b.to_ascii_uppercase();
189 if !search_bytes.contains(&upper) {
190 search_bytes.push(upper);
191 }
192 } else if b.is_ascii_uppercase() {
193 let lower = b.to_ascii_lowercase();
194 if !search_bytes.contains(&lower) {
195 search_bytes.push(lower);
196 }
197 }
198 }
199 }
200
201 // The max_offset accounts for:
202 // - Insertions before the pattern (up to max_edits)
203 // - The byte we find might be at position (max_edits) in pattern
204 let max_offset = max_edits_usize;
205
206 match search_bytes.len() {
207 0 => Prefilter::None,
208 1 => Prefilter::SingleByte {
209 byte: search_bytes[0],
210 max_offset,
211 },
212 2 => Prefilter::TwoBytes {
213 byte1: search_bytes[0],
214 byte2: search_bytes[1],
215 max_offset,
216 },
217 3 => Prefilter::ThreeBytes {
218 byte1: search_bytes[0],
219 byte2: search_bytes[1],
220 byte3: search_bytes[2],
221 max_offset,
222 },
223 _ => Prefilter::MultiBytes {
224 bytes: search_bytes,
225 max_offset,
226 },
227 }
228 }
229
230 /// Get the byte length of a UTF-8 character from its leading byte.
231 #[inline]
232 fn utf8_char_len(leading_byte: u8) -> usize {
233 if leading_byte < 128 {
234 1 // ASCII
235 } else if leading_byte < 224 {
236 2 // 2-byte (Cyrillic, Latin Extended, etc.)
237 } else if leading_byte < 240 {
238 3 // 3-byte (CJK, etc.)
239 } else {
240 4 // 4-byte (Emoji, rare scripts)
241 }
242 }
243
244 /// Create a prefilter for a fuzzy literal with rarity-based byte selection.
245 ///
246 /// For patterns with small alphabets (like DNA), select the rarest bytes
247 /// to minimize false positives. Falls back to streaming if all bytes are common.
248 #[must_use]
249 pub fn fuzzy_rare(literal: &str, max_edits: u8, text_sample: Option<&[u8]>) -> Self {
250 if literal.is_empty() {
251 return Prefilter::None;
252 }
253
254 let bytes = literal.as_bytes();
255 let max_edits_usize = max_edits as usize;
256
257 if max_edits_usize >= bytes.len() {
258 return Prefilter::None;
259 }
260
261 // If we have a text sample, find the rarest byte in the pattern
262 if let Some(sample) = text_sample {
263 // Count byte frequencies in sample
264 let mut freq = [0usize; 256];
265 for &b in sample {
266 freq[b as usize] += 1;
267 }
268
269 // Find the rarest byte in pattern (within first max_edits+1 positions)
270 let search_depth = (max_edits_usize + 1).min(bytes.len());
271 let mut rarest_byte = bytes[0];
272 let mut rarest_freq = freq[bytes[0] as usize];
273
274 for &b in bytes.iter().take(search_depth) {
275 if freq[b as usize] < rarest_freq {
276 rarest_freq = freq[b as usize];
277 rarest_byte = b;
278 }
279 }
280
281 // If the rarest byte appears in >25% of positions, prefilter won't help much
282 if rarest_freq * 4 > sample.len() {
283 return Prefilter::None; // Fall back to streaming
284 }
285
286 return Prefilter::SingleByte {
287 byte: rarest_byte,
288 max_offset: max_edits_usize,
289 };
290 }
291
292 // No sample - use standard fuzzy prefilter
293 Self::fuzzy(literal, max_edits)
294 }
295
296 /// Create a pigeonhole-based prefilter for fuzzy matching.
297 ///
298 /// For a pattern of length m with at most k edits, we split the pattern into
299 /// (k+1) non-overlapping pieces. By the pigeonhole principle, at least one
300 /// piece must match exactly in any valid fuzzy match.
301 ///
302 /// This is much more selective than single-byte prefiltering for longer patterns.
303 /// For example, `"hello"` with k=1 → pieces `["hel", "lo"]`, both 2-3 chars long.
304 /// Finding `"hel"` or `"lo"` is much rarer than finding 'h' or 'e'.
305 #[must_use]
306 pub fn pigeonhole(literal: &str, max_edits: u8) -> Self {
307 let bytes = literal.as_bytes();
308 let m = bytes.len();
309 let k = max_edits as usize;
310
311 // Need at least k+1 bytes to form k+1 pieces of at least 1 byte each
312 if m < k + 1 || k == 0 {
313 return Self::fuzzy(literal, max_edits);
314 }
315
316 // Split pattern into k+1 pieces
317 let num_pieces = k + 1;
318 let base_piece_len = m / num_pieces;
319 let extra = m % num_pieces;
320
321 let mut pieces = Vec::with_capacity(num_pieces);
322 let mut offsets = Vec::with_capacity(num_pieces);
323 let mut pos = 0;
324
325 for i in 0..num_pieces {
326 // Distribute extra bytes among first `extra` pieces
327 let piece_len = base_piece_len + usize::from(i < extra);
328 offsets.push(pos);
329 pieces.push(bytes[pos..pos + piece_len].to_vec());
330 pos += piece_len;
331 }
332
333 // Only use pigeonhole if pieces are long enough to be selective
334 // Single-byte pieces are no better than regular fuzzy prefilter
335 let min_piece_len = pieces.iter().map(Vec::len).min().unwrap_or(0);
336 if min_piece_len < 2 {
337 return Self::fuzzy(literal, max_edits);
338 }
339
340 Prefilter::PigeonholePieces {
341 pieces,
342 offsets,
343 max_edits: k,
344 }
345 }
346
347 /// Create a prefilter for case-insensitive matching.
348 #[must_use]
349 pub fn case_insensitive(literal: &str) -> Self {
350 if literal.is_empty() {
351 return Prefilter::None;
352 }
353
354 let first = literal.as_bytes()[0];
355
356 if first.is_ascii_alphabetic() {
357 Prefilter::TwoBytes {
358 byte1: first.to_ascii_lowercase(),
359 byte2: first.to_ascii_uppercase(),
360 max_offset: 0,
361 }
362 } else {
363 Prefilter::SingleByte {
364 byte: first,
365 max_offset: 0,
366 }
367 }
368 }
369
370 /// Create a prefilter for multiple fuzzy literals.
371 ///
372 /// Collects first bytes from all patterns and uses memchr to find candidates.
373 /// Each entry is (literal, `max_edits`).
374 #[must_use]
375 pub fn multi_fuzzy(literals: &[(&str, u8)], case_insensitive: bool) -> Self {
376 if literals.is_empty() {
377 return Prefilter::None;
378 }
379
380 // Collect unique first bytes from all patterns
381 let mut search_bytes: Vec<u8> = Vec::new();
382 let mut max_offset: usize = 0;
383
384 for (lit, max_edits) in literals {
385 if lit.is_empty() {
386 continue;
387 }
388
389 let bytes = lit.as_bytes();
390 let max_edits_usize = *max_edits as usize;
391
392 // If any pattern has edits >= length, can't use prefilter
393 if max_edits_usize >= bytes.len() {
394 return Prefilter::None;
395 }
396
397 // Update max_offset (accounts for insertions at start)
398 max_offset = max_offset.max(max_edits_usize);
399
400 // Collect bytes from first (max_edits + 1) positions
401 let search_depth = (max_edits_usize + 1).min(bytes.len());
402 for &b in bytes.iter().take(search_depth) {
403 if !search_bytes.contains(&b) {
404 search_bytes.push(b);
405 }
406 // Also add case variant
407 if case_insensitive && b.is_ascii_alphabetic() {
408 let variant = if b.is_ascii_lowercase() {
409 b.to_ascii_uppercase()
410 } else {
411 b.to_ascii_lowercase()
412 };
413 if !search_bytes.contains(&variant) {
414 search_bytes.push(variant);
415 }
416 }
417 }
418 }
419
420 match search_bytes.len() {
421 0 => Prefilter::None,
422 1 => Prefilter::SingleByte {
423 byte: search_bytes[0],
424 max_offset,
425 },
426 2 => Prefilter::TwoBytes {
427 byte1: search_bytes[0],
428 byte2: search_bytes[1],
429 max_offset,
430 },
431 3 => Prefilter::ThreeBytes {
432 byte1: search_bytes[0],
433 byte2: search_bytes[1],
434 byte3: search_bytes[2],
435 max_offset,
436 },
437 _ => Prefilter::MultiBytes {
438 bytes: search_bytes,
439 max_offset,
440 },
441 }
442 }
443
444 /// Find candidate positions in the text.
445 /// Returns an iterator over byte positions where a match might start.
446 #[must_use]
447 pub fn find_candidates<'a>(&'a self, text: &'a [u8]) -> CandidateIter<'a> {
448 CandidateIter {
449 prefilter: self,
450 text,
451 pos: 0,
452 finder: None,
453 }
454 }
455
456 /// Check if this prefilter does anything useful.
457 #[must_use]
458 pub fn is_active(&self) -> bool {
459 !matches!(self, Prefilter::None)
460 }
461
462 /// Get the `max_offset` for this prefilter (how far back a match could start).
463 #[must_use]
464 pub fn max_offset(&self) -> usize {
465 match self {
466 Prefilter::None | Prefilter::Literal { .. } => 0,
467 Prefilter::SingleByte { max_offset, .. }
468 | Prefilter::TwoBytes { max_offset, .. }
469 | Prefilter::ThreeBytes { max_offset, .. }
470 | Prefilter::MultiBytes { max_offset, .. }
471 | Prefilter::LiteralWithOffset { max_offset, .. }
472 | Prefilter::TwoByteLiteral { max_offset, .. } => *max_offset,
473 Prefilter::FuzzyLiteral { max_edits, .. }
474 | Prefilter::PigeonholePieces { max_edits, .. } => *max_edits,
475 }
476 }
477
478 /// Check if this prefilter is "selective" enough to be effective.
479 /// A prefilter searching for too many different bytes will hit too many positions.
480 /// Returns the number of unique bytes being searched for (lower is better).
481 #[must_use]
482 pub fn selectivity(&self) -> usize {
483 match self {
484 Prefilter::None => usize::MAX,
485 Prefilter::SingleByte { .. } => 1,
486 Prefilter::TwoBytes { .. }
487 | Prefilter::ThreeBytes { .. }
488 | Prefilter::TwoByteLiteral { .. } => 2,
489 Prefilter::MultiBytes { bytes, .. } => bytes.len(),
490 Prefilter::Literal { needle } | Prefilter::LiteralWithOffset { needle, .. } => {
491 needle.len().min(4)
492 }
493 Prefilter::FuzzyLiteral { alt_bytes, .. } => 1 + alt_bytes.len(),
494 Prefilter::PigeonholePieces { pieces, .. } => {
495 pieces.iter().map(Vec::len).min().unwrap_or(1)
496 }
497 }
498 }
499
500 /// Check if this prefilter is likely to be effective for multi-pattern search.
501 /// Returns false if the prefilter searches for too many different bytes.
502 #[must_use]
503 pub fn is_selective(&self) -> bool {
504 // More than 6 different bytes starts to become ineffective
505 // (each common byte appears ~5-10% of the time in English text)
506 // With 6 bytes, we hit ~30-40% of positions which is still reasonable.
507 // With 8+ bytes, we hit ~50%+ which makes prefiltering ineffective.
508 self.selectivity() <= 6
509 }
510}
511
512/// Iterator over candidate start positions.
513pub struct CandidateIter<'a> {
514 /// Reference to the prefilter configuration.
515 prefilter: &'a Prefilter,
516 /// The text being searched.
517 text: &'a [u8],
518 /// Current search position in the text.
519 pos: usize,
520 /// Lazily initialized substring finder for literal searches.
521 finder: Option<memmem::Finder<'a>>,
522}
523
524impl Iterator for CandidateIter<'_> {
525 type Item = usize;
526
527 #[allow(clippy::too_many_lines)]
528 fn next(&mut self) -> Option<usize> {
529 if self.pos >= self.text.len() {
530 return None;
531 }
532
533 match self.prefilter {
534 Prefilter::None => {
535 // Return every position
536 let result = self.pos;
537 self.pos += 1;
538 // Skip to next char boundary
539 while self.pos < self.text.len() && !is_char_boundary(self.text, self.pos) {
540 self.pos += 1;
541 }
542 Some(result)
543 }
544
545 Prefilter::SingleByte { byte, max_offset } => {
546 if let Some(idx) = memchr(*byte, &self.text[self.pos..]) {
547 let found = self.pos + idx;
548 self.pos = found + 1;
549
550 // Return position adjusted by max_offset
551 // Match could start up to max_offset before the found byte
552 Some(found.saturating_sub(*max_offset))
553 } else {
554 self.pos = self.text.len();
555 None
556 }
557 }
558
559 Prefilter::TwoBytes {
560 byte1,
561 byte2,
562 max_offset,
563 } => {
564 if let Some(idx) = memchr2(*byte1, *byte2, &self.text[self.pos..]) {
565 let found = self.pos + idx;
566 self.pos = found + 1;
567 Some(found.saturating_sub(*max_offset))
568 } else {
569 self.pos = self.text.len();
570 None
571 }
572 }
573
574 Prefilter::ThreeBytes {
575 byte1,
576 byte2,
577 byte3,
578 max_offset,
579 } => {
580 if let Some(idx) = memchr3(*byte1, *byte2, *byte3, &self.text[self.pos..]) {
581 let found = self.pos + idx;
582 self.pos = found + 1;
583 Some(found.saturating_sub(*max_offset))
584 } else {
585 self.pos = self.text.len();
586 None
587 }
588 }
589
590 Prefilter::MultiBytes { bytes, max_offset } => {
591 // Linear scan for any of the bytes
592 let remaining = &self.text[self.pos..];
593 if let Some(idx) = remaining.iter().position(|b| bytes.contains(b)) {
594 let found = self.pos + idx;
595 self.pos = found + 1;
596 Some(found.saturating_sub(*max_offset))
597 } else {
598 self.pos = self.text.len();
599 None
600 }
601 }
602
603 Prefilter::Literal { needle } => {
604 // Initialize finder lazily
605 let finder = self
606 .finder
607 .get_or_insert_with(|| memmem::Finder::new(needle));
608
609 if let Some(idx) = finder.find(&self.text[self.pos..]) {
610 let found = self.pos + idx;
611 self.pos = found + 1;
612 Some(found)
613 } else {
614 self.pos = self.text.len();
615 None
616 }
617 }
618
619 Prefilter::LiteralWithOffset { needle, max_offset } => {
620 // Initialize finder lazily - memmem uses SIMD for fast search
621 let finder = self
622 .finder
623 .get_or_insert_with(|| memmem::Finder::new(needle));
624
625 if let Some(idx) = finder.find(&self.text[self.pos..]) {
626 let found = self.pos + idx;
627 self.pos = found + 1;
628 Some(found.saturating_sub(*max_offset))
629 } else {
630 self.pos = self.text.len();
631 None
632 }
633 }
634
635 Prefilter::TwoByteLiteral {
636 byte1,
637 byte2,
638 max_offset,
639 } => {
640 // For short remaining text, use memchr + second byte check (low overhead)
641 // For long remaining text, use memmem (SIMD-efficient for many false positives)
642 let remaining = &self.text[self.pos..];
643
644 // Threshold tuned empirically: memmem SIMD setup cost ~10-15ns
645 // breaks even around 50-100 bytes of scanning
646 const MEMMEM_THRESHOLD: usize = 64;
647
648 if remaining.len() < MEMMEM_THRESHOLD {
649 // Short text: use memchr + check (avoid memmem setup)
650 let mut search_pos = 0;
651 while search_pos < remaining.len() {
652 if let Some(idx) = memchr(*byte1, &remaining[search_pos..]) {
653 let abs_idx = search_pos + idx;
654 if abs_idx + 1 < remaining.len() && remaining[abs_idx + 1] == *byte2 {
655 let found = self.pos + abs_idx;
656 self.pos = found + 1;
657 return Some(found.saturating_sub(*max_offset));
658 }
659 search_pos = abs_idx + 1;
660 } else {
661 break;
662 }
663 }
664 self.pos = self.text.len();
665 None
666 } else {
667 // Long text: use memmem for SIMD-optimized search
668 let needle = [*byte1, *byte2];
669 let finder = memmem::Finder::new(&needle);
670 if let Some(idx) = finder.find(remaining) {
671 let found = self.pos + idx;
672 self.pos = found + 1;
673 Some(found.saturating_sub(*max_offset))
674 } else {
675 self.pos = self.text.len();
676 None
677 }
678 }
679 }
680
681 Prefilter::FuzzyLiteral {
682 first_byte,
683 alt_bytes,
684 max_edits,
685 } => {
686 // Search for first byte or any alternative
687 let remaining = &self.text[self.pos..];
688
689 let idx = if alt_bytes.is_empty() {
690 memchr(*first_byte, remaining)
691 } else if alt_bytes.len() == 1 {
692 memchr2(*first_byte, alt_bytes[0], remaining)
693 } else if alt_bytes.len() == 2 {
694 memchr3(*first_byte, alt_bytes[0], alt_bytes[1], remaining)
695 } else {
696 // Fallback to simple search
697 remaining
698 .iter()
699 .position(|&b| b == *first_byte || alt_bytes.contains(&b))
700 };
701
702 if let Some(idx) = idx {
703 let found = self.pos + idx;
704 self.pos = found + 1;
705 Some(found.saturating_sub(*max_edits))
706 } else {
707 self.pos = self.text.len();
708 None
709 }
710 }
711
712 Prefilter::PigeonholePieces {
713 pieces,
714 offsets,
715 max_edits,
716 } => {
717 // Search for any piece match and return adjusted position
718 // Strategy: search for first bytes of each piece, verify full match
719 let remaining = &self.text[self.pos..];
720
721 // Collect first bytes of all pieces for fast initial scan
722 let first_bytes: Vec<u8> = pieces.iter().map(|p| p[0]).collect();
723
724 // Find the next occurrence of any first byte
725 let mut search_offset = 0;
726 while search_offset < remaining.len() {
727 // Find next position with any of the first bytes
728 let byte_match = match first_bytes.len() {
729 1 => memchr(first_bytes[0], &remaining[search_offset..]),
730 2 => memchr2(first_bytes[0], first_bytes[1], &remaining[search_offset..]),
731 3 => memchr3(
732 first_bytes[0],
733 first_bytes[1],
734 first_bytes[2],
735 &remaining[search_offset..],
736 ),
737 _ => remaining[search_offset..]
738 .iter()
739 .position(|b| first_bytes.contains(b)),
740 };
741
742 match byte_match {
743 Some(idx) => {
744 let abs_idx = search_offset + idx;
745 let text_pos = self.pos + abs_idx;
746
747 // Check which piece(s) match at this position
748 for (piece_idx, piece) in pieces.iter().enumerate() {
749 if remaining[abs_idx..].starts_with(piece) {
750 // Found a piece match!
751 // The pattern could start at text_pos - offset - max_edits
752 let piece_offset = offsets[piece_idx];
753 let candidate_start = text_pos
754 .saturating_sub(piece_offset)
755 .saturating_sub(*max_edits);
756
757 self.pos = text_pos + 1;
758 return Some(candidate_start);
759 }
760 }
761
762 // First byte matched but no piece matched - continue searching
763 search_offset = abs_idx + 1;
764 }
765 None => {
766 // No more first bytes found
767 break;
768 }
769 }
770 }
771
772 self.pos = self.text.len();
773 None
774 }
775 }
776 }
777}
778
779/// Check if a byte position is a UTF-8 char boundary.
780#[inline]
781fn is_char_boundary(text: &[u8], pos: usize) -> bool {
782 if pos >= text.len() {
783 return true;
784 }
785 // UTF-8 continuation bytes start with 10xxxxxx
786 (text[pos] & 0b1100_0000) != 0b1000_0000
787}
788
789/// Extract a prefilter from pattern information.
790#[must_use]
791pub fn create_prefilter(
792 first_literal: Option<&str>,
793 max_edits: Option<u8>,
794 case_insensitive: bool,
795) -> Prefilter {
796 match first_literal {
797 None | Some("") => Prefilter::None,
798 Some(lit) => {
799 if case_insensitive {
800 Prefilter::case_insensitive(lit)
801 } else if let Some(edits) = max_edits {
802 if edits > 0 {
803 Prefilter::fuzzy(lit, edits)
804 } else {
805 Prefilter::exact(lit)
806 }
807 } else {
808 Prefilter::exact(lit)
809 }
810 }
811 }
812}
813
814#[cfg(test)]
815mod tests {
816 use super::*;
817
818 #[test]
819 fn test_exact_prefilter() {
820 let pf = Prefilter::exact("hello");
821 let text = b"say hello world hello";
822 let candidates: Vec<_> = pf.find_candidates(text).collect();
823 assert_eq!(candidates, vec![4, 16]);
824 }
825
826 #[test]
827 fn test_single_byte_prefilter() {
828 let pf = Prefilter::SingleByte {
829 byte: b'h',
830 max_offset: 0,
831 };
832 let text = b"say hello world";
833 let candidates: Vec<_> = pf.find_candidates(text).collect();
834 assert_eq!(candidates, vec![4]);
835 }
836
837 #[test]
838 fn test_fuzzy_prefilter() {
839 // Fuzzy prefilter searches for bytes from first (max_edits+1) positions
840 // "hello" with 2 edits: searches for 'h', 'H', 'e' (first 3 unique bytes with case variants)
841 let pf = Prefilter::fuzzy("hello", 2);
842 let text = b"say hello world";
843 let candidates: Vec<_> = pf.find_candidates(text).collect();
844 // 'h' at position 4 → returns 4-2=2
845 // 'e' at position 5 → returns 5-2=3
846 // 'e' at position 12 → returns 12-2=10
847 // 'l' and 'o' are not in search set (only first 3 bytes used)
848 assert!(candidates.contains(&2), "Should find 'h' at 4, offset to 2");
849 assert!(candidates.contains(&3), "Should find 'e' at 5, offset to 3");
850
851 // For exact match (0 edits), prefilter works normally
852 let pf_exact = Prefilter::fuzzy("hello", 0);
853 let candidates_exact: Vec<_> = pf_exact.find_candidates(text).collect();
854 assert_eq!(candidates_exact, vec![4]); // Just 'h' at position 4
855
856 // For high edit distance >= pattern length, no prefilter
857 let pf_high = Prefilter::fuzzy("hi", 2);
858 assert!(matches!(pf_high, Prefilter::None));
859 }
860
861 #[test]
862 fn test_case_insensitive_prefilter() {
863 let pf = Prefilter::case_insensitive("Hello");
864 let text = b"say HELLO and hello";
865 let candidates: Vec<_> = pf.find_candidates(text).collect();
866 assert_eq!(candidates, vec![4, 14]); // H at 4, h at 14
867 }
868
869 #[test]
870 fn test_pigeonhole_prefilter() {
871 // "hello" with 1 edit → pieces ["hel", "lo"] (at least one must match exactly)
872 let pf = Prefilter::pigeonhole("hello", 1);
873
874 // Should be PigeonholePieces variant
875 assert!(matches!(pf, Prefilter::PigeonholePieces { .. }));
876
877 // Text containing "hel" → should find candidate
878 let text1 = b"say hel world";
879 let candidates1: Vec<_> = pf.find_candidates(text1).collect();
880 assert!(!candidates1.is_empty(), "Should find 'hel' piece");
881
882 // Text containing "lo" → should find candidate
883 let text2 = b"say lo world";
884 let candidates2: Vec<_> = pf.find_candidates(text2).collect();
885 assert!(!candidates2.is_empty(), "Should find 'lo' piece");
886
887 // Text containing "hello" → should find candidate at correct position
888 let text3 = b"say hello world";
889 let candidates3: Vec<_> = pf.find_candidates(text3).collect();
890 assert!(!candidates3.is_empty(), "Should find 'hello'");
891 // "hel" is at position 4, offset 0 in pattern, max_edits=1 → candidate at 4-0-1=3
892 assert!(
893 candidates3.contains(&3),
894 "Candidate should include position 3"
895 );
896
897 // Text without any pieces → no candidates
898 let text4 = b"say xyz world";
899 let candidates4: Vec<_> = pf.find_candidates(text4).collect();
900 assert!(
901 candidates4.is_empty(),
902 "Should not find candidates in text without pieces"
903 );
904
905 // Short patterns fall back to regular fuzzy prefilter
906 let pf_short = Prefilter::pigeonhole("hi", 1);
907 assert!(!matches!(pf_short, Prefilter::PigeonholePieces { .. }));
908 }
909
910 #[test]
911 fn test_pigeonhole_pieces_split() {
912 // Test piece splitting for various pattern/edit combinations
913
914 // "abcdefgh" (8 chars) with 1 edit → 2 pieces of 4 chars each
915 let pf1 = Prefilter::pigeonhole("abcdefgh", 1);
916 if let Prefilter::PigeonholePieces {
917 pieces, offsets, ..
918 } = &pf1
919 {
920 assert_eq!(pieces.len(), 2);
921 assert_eq!(pieces[0], b"abcd");
922 assert_eq!(pieces[1], b"efgh");
923 assert_eq!(offsets, &[0, 4]);
924 } else {
925 panic!("Expected PigeonholePieces");
926 }
927
928 // "abcdefgh" with 3 edits → 4 pieces of 2 chars each
929 let pf2 = Prefilter::pigeonhole("abcdefgh", 3);
930 if let Prefilter::PigeonholePieces {
931 pieces, offsets, ..
932 } = &pf2
933 {
934 assert_eq!(pieces.len(), 4);
935 assert_eq!(pieces[0], b"ab");
936 assert_eq!(pieces[1], b"cd");
937 assert_eq!(pieces[2], b"ef");
938 assert_eq!(pieces[3], b"gh");
939 assert_eq!(offsets, &[0, 2, 4, 6]);
940 } else {
941 panic!("Expected PigeonholePieces");
942 }
943
944 // "abcde" (5 chars) with 2 edits → 3 pieces [2, 2, 1] - but min piece < 2, falls back
945 let pf3 = Prefilter::pigeonhole("abcde", 2);
946 assert!(
947 !matches!(pf3, Prefilter::PigeonholePieces { .. }),
948 "Should fall back when pieces too short"
949 );
950 }
951}