flash_fuzzy_core/
bitap.rs

1//! Bitap (Shift-Or) algorithm for fuzzy string matching
2//!
3//! Implements Wu-Manber extension for approximate matching with errors.
4
5use crate::bloom::{to_lower, BloomFilter};
6use crate::types::SearchMatch;
7use crate::MAX_PATTERN_LEN;
8
9/// Bitap searcher with pre-computed pattern masks
10pub struct BitapSearcher {
11    /// Character bitmasks (256 ASCII chars)
12    char_masks: [u32; 256],
13    /// Pattern length
14    pattern_len: usize,
15    /// Bloom filter for the pattern
16    pattern_bloom: BloomFilter,
17}
18
19impl BitapSearcher {
20    /// Create a new Bitap searcher from a pattern
21    pub fn new(pattern: &[u8]) -> Self {
22        let len = pattern.len().min(MAX_PATTERN_LEN);
23        let mut char_masks = [0u32; 256];
24        let mut bloom_bits = 0u64;
25
26        for (i, &ch) in pattern.iter().take(len).enumerate() {
27            let lower = to_lower(ch);
28            let bit = 1u32 << i;
29
30            // Set bit for lowercase
31            char_masks[lower as usize] |= bit;
32
33            // Also set for original case if different
34            if lower != ch {
35                char_masks[ch as usize] |= bit;
36            }
37
38            // Build bloom filter
39            let bloom_idx = (lower & 0x3F) as u64;
40            bloom_bits |= 1u64 << bloom_idx;
41        }
42
43        Self {
44            char_masks,
45            pattern_len: len,
46            pattern_bloom: BloomFilter(bloom_bits),
47        }
48    }
49
50    /// Get the pattern length
51    #[inline]
52    pub fn pattern_len(&self) -> usize {
53        self.pattern_len
54    }
55
56    /// Get the pattern's bloom filter
57    #[inline]
58    pub fn bloom(&self) -> BloomFilter {
59        self.pattern_bloom
60    }
61
62    /// Search for pattern in text with up to max_errors
63    /// Returns the best match found (lowest error count)
64    pub fn search(&self, text: &[u8], max_errors: u32) -> Option<SearchMatch> {
65        if self.pattern_len == 0 || text.is_empty() {
66            return None;
67        }
68
69        let pattern_len = self.pattern_len as u32;
70
71        // Adaptive max_errors based on pattern length
72        let effective_max_errors = if pattern_len <= 3 {
73            0 // Exact match only for very short patterns
74        } else if pattern_len <= 5 {
75            max_errors.min(1) // Max 1 error for short patterns
76        } else {
77            max_errors
78        };
79
80        // Initialize R array (1 = matched position)
81        let mut r = [0u32; MAX_PATTERN_LEN + 1];
82
83        let pattern_mask = (1u32 << self.pattern_len) - 1;
84        let match_bit = 1u32 << (self.pattern_len - 1);
85
86        let mut best_errors = effective_max_errors + 1;
87        let mut best_pos = 0usize;
88
89        for (pos, &ch) in text.iter().enumerate() {
90            let lower = to_lower(ch);
91            let char_mask = self.char_masks[lower as usize];
92
93            // Save old values for error propagation
94            let mut old_r = r[0];
95
96            // Exact match: shift left, seed new match at pos 0, filter by char
97            r[0] = ((r[0] << 1) | 1) & char_mask;
98
99            // Error levels
100            for k in 1..=(effective_max_errors as usize) {
101                let new_r = r[k];
102
103                // Error transitions:
104                // - Exact match at this level
105                // - Substitution: was at pos i with k-1 errors, now at i+1 with k
106                // - Deletion: was at pos i with k-1 errors, stay at i with k
107                // - Insertion: was at pos i with k errors, now at i+1 with k
108                r[k] = (((r[k] << 1) | 1) & char_mask) |  // exact match
109                       (old_r << 1) |                      // substitution
110                       old_r |                              // deletion
111                       (r[k - 1] << 1);                     // insertion
112
113                old_r = new_r;
114            }
115
116            // Mask to pattern length
117            for i in 0..=(effective_max_errors as usize) {
118                r[i] &= pattern_mask;
119            }
120
121            // Check for matches
122            for k in 0..=effective_max_errors {
123                if (r[k as usize] & match_bit) != 0 {
124                    if k < best_errors {
125                        best_errors = k;
126                        best_pos = pos + 1;
127                    }
128                    break;
129                }
130            }
131        }
132
133        if best_errors <= effective_max_errors {
134            Some(SearchMatch {
135                errors: best_errors,
136                end_pos: best_pos,
137            })
138        } else {
139            None
140        }
141    }
142}
143
144/// Compute score from match result
145///
146/// Score formula:
147/// - Base: 1000 - (errors * 250)
148/// - Position bonus: +50 for start, +25 for near start
149pub fn compute_score(errors: u32, pattern_len: u32, end_pos: usize) -> u16 {
150    let base = 1000u32.saturating_sub(errors * 250);
151
152    let start_pos = end_pos.saturating_sub(pattern_len as usize);
153
154    let pos_bonus = if start_pos == 0 {
155        50
156    } else if start_pos < 10 {
157        25
158    } else {
159        0
160    };
161
162    let score = base + pos_bonus;
163    if score > 1000 { 1000 } else { score as u16 }
164}
165
166#[cfg(test)]
167mod tests {
168    use super::*;
169
170    #[test]
171    fn test_exact_match() {
172        let searcher = BitapSearcher::new(b"hello");
173        let result = searcher.search(b"hello world", 0);
174
175        assert!(result.is_some());
176        let m = result.unwrap();
177        assert_eq!(m.errors, 0);
178        assert_eq!(m.end_pos, 5);
179    }
180
181    #[test]
182    fn test_fuzzy_match() {
183        let searcher = BitapSearcher::new(b"hello");
184        let result = searcher.search(b"hallo world", 2);
185
186        assert!(result.is_some());
187        let m = result.unwrap();
188        assert_eq!(m.errors, 1); // 'e' -> 'a' substitution
189    }
190
191    #[test]
192    fn test_no_match() {
193        let searcher = BitapSearcher::new(b"xyz");
194        let result = searcher.search(b"hello world", 0);
195
196        assert!(result.is_none());
197    }
198
199    #[test]
200    fn test_case_insensitive() {
201        let searcher = BitapSearcher::new(b"HELLO");
202        let result = searcher.search(b"hello world", 0);
203
204        assert!(result.is_some());
205        assert_eq!(result.unwrap().errors, 0);
206    }
207
208    #[test]
209    fn test_score_computation() {
210        // Exact match at start
211        assert_eq!(compute_score(0, 5, 5), 1000);
212
213        // 1 error
214        assert_eq!(compute_score(1, 5, 5), 800); // 750 + 50
215
216        // Match not at start
217        assert_eq!(compute_score(0, 5, 15), 1000); // 1000 + 0
218    }
219}