Skip to main content

fuzzy_regex/engine/
fuzzy_nfa.rs

1//! Optimized NFA-based fuzzy matching.
2//!
3//! Similar to mrab-regex's fuzzy guards - uses bit-packed state encoding
4//! for efficient fuzzy matching without bit-vector algorithms.
5
6#![allow(clippy::needless_range_loop)]
7
8use crate::engine::damlev::{DamLevMatch, EditLimits};
9
10/// Optimized NFA-based fuzzy matcher using bit-packed states.
11pub struct FuzzyNfa {
12    pattern: Vec<char>,
13    pattern_len: usize,
14    edit_limits: EditLimits,
15    case_insensitive: bool,
16    first_char: char,
17}
18
19impl FuzzyNfa {
20    /// Create a new optimized fuzzy NFA matcher.
21    #[must_use]
22    pub fn new(pattern: &str, edit_limits: EditLimits, case_insensitive: bool) -> Option<Self> {
23        let pattern: Vec<char> = if case_insensitive {
24            pattern.to_lowercase().chars().collect()
25        } else {
26            pattern.chars().collect()
27        };
28        let pattern_len = pattern.len();
29
30        if pattern_len == 0 || pattern_len > 63 {
31            return None;
32        }
33
34        let first_char = pattern[0];
35
36        Some(FuzzyNfa {
37            pattern,
38            pattern_len,
39            edit_limits,
40            case_insensitive,
41            first_char,
42        })
43    }
44
45    /// Find first match using optimized NFA.
46    #[inline]
47    #[must_use]
48    pub fn find_first(&self, text: &str, threshold: f32) -> Option<DamLevMatch> {
49        let max_edits = self.edit_limits.max_edits as usize;
50
51        if self.pattern_len == 0 {
52            return Some(DamLevMatch {
53                start: 0,
54                end: 0,
55                insertions: 0,
56                deletions: 0,
57                substitutions: 0,
58                swaps: 0,
59                similarity: 1.0,
60            });
61        }
62
63        let text_chars: Vec<char> = if self.case_insensitive {
64            text.chars()
65                .map(|c| c.to_lowercase().next().unwrap_or(c))
66                .collect()
67        } else {
68            text.chars().collect()
69        };
70        let text_len = text_chars.len();
71
72        let mut char_to_byte: Vec<usize> = vec![0; text_len + 1];
73        let mut byte_pos = 0;
74        for (i, c) in text.char_indices() {
75            char_to_byte[i] = byte_pos;
76            byte_pos += c.len_utf8();
77        }
78        char_to_byte[text_len] = byte_pos;
79
80        if text_len == 0 {
81            if self.pattern_len <= max_edits {
82                let sim = 1.0 - (self.pattern_len as f32 / (self.pattern_len + max_edits) as f32);
83                return Some(DamLevMatch {
84                    start: 0,
85                    end: 0,
86                    insertions: 0,
87                    deletions: self.pattern_len as u8,
88                    substitutions: 0,
89                    swaps: 0,
90                    similarity: sim,
91                });
92            }
93            return None;
94        }
95
96        let m = self.pattern_len;
97
98        let mut states: [(usize, usize, usize, u8, u8, u8); 128] = [(0, 0, 0, 0, 0, 0); 128];
99        let mut state_count = 0;
100        let mut seen: u128 = 0;
101
102        let encode_key =
103            |pat_pos: usize, edits: usize| -> u128 { ((pat_pos as u128) << 6) | (edits as u128) };
104
105        let mut pos = 0;
106        while pos < text_len {
107            let text_char = text_chars[pos];
108            let mut new_states: [(usize, usize, usize, u8, u8, u8); 128] =
109                [(0, 0, 0, 0, 0, 0); 128];
110            let mut new_state_count = 0;
111            let mut new_seen: u128 = 0;
112
113            let initial_key = encode_key(0, 0);
114            if seen & (1u128 << initial_key) == 0 {
115                states[0] = (0, 0, pos, 0, 0, 0);
116                state_count = 1;
117            }
118
119            for i in 0..state_count {
120                let (pat_pos, edits, start_pos, ins, del, sub) = states[i];
121
122                if pat_pos >= m || edits > max_edits {
123                    continue;
124                }
125
126                let pat_char = self.pattern[pat_pos];
127
128                if text_char == pat_char {
129                    let key = encode_key(pat_pos + 1, edits);
130                    if new_seen & (1u128 << key) == 0 {
131                        new_states[new_state_count] =
132                            (pat_pos + 1, edits, start_pos, ins, del, sub);
133                        new_state_count += 1;
134                        new_seen |= 1u128 << key;
135                    }
136                }
137
138                if edits < max_edits && text_char != pat_char {
139                    let key = encode_key(pat_pos + 1, edits + 1);
140                    if new_seen & (1u128 << key) == 0 {
141                        new_states[new_state_count] =
142                            (pat_pos + 1, edits + 1, start_pos, ins, del, sub + 1);
143                        new_state_count += 1;
144                        new_seen |= 1u128 << key;
145                    }
146                }
147
148                if edits < max_edits {
149                    let key = encode_key(pat_pos, edits + 1);
150                    if new_seen & (1u128 << key) == 0 {
151                        new_states[new_state_count] =
152                            (pat_pos, edits + 1, start_pos, ins + 1, del, sub);
153                        new_state_count += 1;
154                        new_seen |= 1u128 << key;
155                    }
156                }
157
158                if pat_pos + 1 < m && edits < max_edits {
159                    let key = encode_key(pat_pos + 1, edits + 1);
160                    if new_seen & (1u128 << key) == 0 {
161                        new_states[new_state_count] =
162                            (pat_pos + 1, edits + 1, start_pos, ins, del + 1, sub);
163                        new_state_count += 1;
164                        new_seen |= 1u128 << key;
165                    }
166                }
167            }
168
169            for i in 0..new_state_count {
170                let (pat_pos, edits, start_pos, ins, del, sub) = new_states[i];
171                if pat_pos >= m {
172                    let sim = 1.0 - (edits as f32 / (m + max_edits) as f32);
173                    if sim >= threshold {
174                        let match_end = pos + 1;
175                        let byte_start = char_to_byte[start_pos];
176                        let byte_end = char_to_byte[match_end];
177                        return Some(DamLevMatch {
178                            start: byte_start,
179                            end: byte_end,
180                            insertions: ins,
181                            deletions: del,
182                            substitutions: sub,
183                            swaps: 0,
184                            similarity: sim,
185                        });
186                    }
187                }
188            }
189
190            states = new_states;
191            state_count = new_state_count;
192            seen = new_seen;
193
194            if state_count == 0 {
195                pos += 1;
196                while pos < text_len && text_chars[pos] != self.first_char {
197                    pos += 1;
198                }
199            } else {
200                pos += 1;
201            }
202        }
203
204        for i in 0..state_count {
205            let (pat_pos, edits, start_pos, ins, del, sub) = states[i];
206            if pat_pos >= m {
207                let sim = 1.0 - (edits as f32 / (m + max_edits) as f32);
208                if sim >= threshold {
209                    let byte_start = char_to_byte[start_pos];
210                    let byte_end = char_to_byte[text_len];
211                    return Some(DamLevMatch {
212                        start: byte_start,
213                        end: byte_end,
214                        insertions: ins,
215                        deletions: del,
216                        substitutions: sub,
217                        swaps: 0,
218                        similarity: sim,
219                    });
220                }
221            }
222
223            let remaining = m - pat_pos;
224            let new_edits = edits + remaining;
225            if new_edits <= max_edits {
226                let new_del = del + remaining as u8;
227                let sim = 1.0 - (new_edits as f32 / (m + max_edits) as f32);
228                if sim >= threshold {
229                    let byte_start = char_to_byte[start_pos];
230                    let byte_end = char_to_byte[text_len];
231                    return Some(DamLevMatch {
232                        start: byte_start,
233                        end: byte_end,
234                        insertions: ins,
235                        deletions: new_del,
236                        substitutions: sub,
237                        swaps: 0,
238                        similarity: sim,
239                    });
240                }
241            }
242        }
243
244        None
245    }
246}