fuzzy_aho_corasick/
lib.rs

1#![warn(clippy::pedantic)]
2#![allow(clippy::too_many_lines, clippy::cast_precision_loss)]
3mod builder;
4mod replacer;
5mod segment;
6mod structs;
7#[cfg(test)]
8mod tests;
9
10pub use replacer::FuzzyReplacer;
11
12pub use builder::FuzzyAhoCorasickBuilder;
13use std::borrow::Cow;
14use std::collections::BTreeMap;
15use unicode_segmentation::UnicodeSegmentation;
16pub type PatternIndex = usize;
17pub use structs::*;
18
19#[allow(unused_macros)]
20#[cfg(test)]
21macro_rules! trace {
22    ($($arg:tt)*) => { println!($($arg)*); };
23}
24#[allow(unused_macros)]
25#[cfg(not(test))]
26macro_rules! trace {
27    ($($arg:tt)*) => {};
28}
29
30/// Fuzzy Aho—Corasick engine
31impl FuzzyAhoCorasick {
32    #[inline]
33    fn get_node_limits(&self, node: usize) -> Option<&FuzzyLimits> {
34        self.nodes[node]
35            .pattern_index
36            .and_then(|i| self.patterns.get(i).and_then(|p| p.limits.as_ref()))
37    }
38    #[inline]
39    fn within_limits_ins_del_ahead(
40        &self,
41        limits: Option<&FuzzyLimits>,
42        edits: NumEdits,
43        insertions: NumEdits,
44        deletions: NumEdits,
45    ) -> (bool, bool) {
46        if let Some(max) = limits.or(self.limits.as_ref()) {
47            let edits_ok = max.edits.is_none_or(|max| edits < max);
48            (
49                edits_ok && max.insertions.is_none_or(|max| insertions < max),
50                edits_ok && max.deletions.is_none_or(|max| deletions < max),
51            )
52        } else {
53            (false, false)
54        }
55    }
56
57    #[inline]
58    fn within_limits_swap_ahead(
59        &self,
60        limits: Option<&FuzzyLimits>,
61        edits: NumEdits,
62        swaps: NumEdits,
63    ) -> bool {
64        if let Some(max) = limits.or(self.limits.as_ref()) {
65            let result =
66                max.edits.is_none_or(|max| edits < max) && max.swaps.is_none_or(|max| swaps < max);
67            /*println!(
68                "within_limits_swap_ahead() -- max: {max:?} edits: {edits:?} swaps: {swaps:?}\
69                \nresult = {result:?}\n"
70            );*/
71            result
72        } else {
73            false
74        }
75    }
76
77    #[inline]
78    fn within_limits_subst(
79        &self,
80        limits: Option<&FuzzyLimits>,
81        edits: NumEdits,
82        substitutions: NumEdits,
83    ) -> bool {
84        if let Some(max) = limits.or(self.limits.as_ref()) {
85            let result = max.edits.is_none_or(|max| edits <= max)
86                && max.substitutions.is_none_or(|max| substitutions <= max);
87            /*println!(
88                "within_limits_subst_ahead() -- max: {max:?} edits: {edits:?} substitutions: {substitutions:?}\
89                \nresult = {result:?}\n"
90            );*/
91            result
92        } else {
93            edits == 0 && substitutions == 0
94        }
95    }
96
97    #[inline]
98    fn within_limits(
99        &self,
100        limits: Option<&FuzzyLimits>,
101        edits: NumEdits,
102        insertions: NumEdits,
103        deletions: NumEdits,
104        substitutions: NumEdits,
105        swaps: NumEdits,
106    ) -> bool {
107        if let Some(max) = limits.or(self.limits.as_ref()) {
108            let result = max.edits.is_none_or(|max| edits <= max)
109                && max.insertions.is_none_or(|max| insertions <= max)
110                && max.deletions.is_none_or(|max| deletions <= max)
111                && max.substitutions.is_none_or(|max| substitutions <= max)
112                && max.swaps.is_none_or(|max| swaps <= max);
113            /*println!(
114                "within_limits() -- max: {max:?} edits: {edits:?} insertions: {insertions:?} deletions: {deletions:?} substitutions: {substitutions:?} swaps: {swaps:?}\
115                \nresult = {result:?}\n"
116            );*/
117            result
118        } else {
119            edits == 0 && insertions == 0 && deletions == 0 && substitutions == 0 && swaps == 0
120        }
121    }
122
123    #[inline]
124    #[allow(clippy::too_many_arguments)]
125    fn scalar_output_handling(
126        &self,
127        output: &[usize],
128        penalties: f32,
129        edits: usize,
130        insertions: usize,
131        deletions: usize,
132        substitutions: usize,
133        swaps: usize,
134        matched_start: usize,
135        matched_end: usize,
136        grapheme_idx: &[(usize, &str)],
137        text: &str,
138        best: &mut BTreeMap<(usize, usize, usize), FuzzyMatch>,
139        similarity_threshold: f32,
140        #[cfg(debug_assertions)] notes: &Vec<String>,
141    ) {
142        for &pat_idx in output {
143            if !self.within_limits(
144                self.patterns[pat_idx].limits.as_ref(),
145                edits,
146                insertions,
147                deletions,
148                substitutions,
149                swaps,
150            ) {
151                continue;
152            }
153            let start_byte = grapheme_idx.get(matched_start).map_or(0, |&(b, _)| b);
154            let end_byte = grapheme_idx
155                .get(matched_end)
156                .map_or_else(|| text.len(), |&(b, _)| b);
157            let key = (start_byte, end_byte, pat_idx);
158
159            let total = self.patterns[pat_idx].grapheme_len as f32;
160
161            let similarity = (total - penalties) / total * self.patterns[pat_idx].weight;
162
163            if similarity < similarity_threshold {
164                continue;
165            }
166
167            best.entry(key)
168                .and_modify(|entry| {
169                    if similarity > entry.similarity {
170                        *entry = FuzzyMatch {
171                            insertions,
172                            deletions,
173                            substitutions,
174                            edits,
175                            swaps,
176                            pattern_index: pat_idx,
177                            start: start_byte,
178                            end: end_byte,
179                            pattern: self.patterns[pat_idx].pattern.clone(),
180                            similarity,
181                            text: text[start_byte..end_byte].to_string(),
182                            #[cfg(debug_assertions)]
183                            notes: notes.to_owned(),
184                        };
185                    }
186                })
187                .or_insert_with(|| FuzzyMatch {
188                    insertions,
189                    deletions,
190                    substitutions,
191                    edits,
192                    swaps,
193                    pattern_index: pat_idx,
194                    start: start_byte,
195                    end: end_byte,
196                    pattern: self.patterns[pat_idx].pattern.clone(),
197                    similarity,
198                    text: text[start_byte..end_byte].to_string(),
199                    #[cfg(debug_assertions)]
200                    notes: notes.clone(),
201                });
202        }
203    }
204
205    #[inline]
206    #[must_use]
207    pub fn search(&self, haystack: &str, similarity_threshold: f32) -> Vec<FuzzyMatch> {
208        let grapheme_idx: Vec<(usize, &str)> = haystack.grapheme_indices(true).collect();
209        if grapheme_idx.is_empty() {
210            return vec![];
211        }
212        let text_chars: Vec<Cow<str>> = grapheme_idx
213            .iter()
214            .map(|(_, g)| {
215                if self.case_insensitive {
216                    Cow::Owned(g.to_lowercase())
217                } else {
218                    Cow::Borrowed(*g)
219                }
220            })
221            .collect();
222
223        let mut best: BTreeMap<(usize, usize, usize), FuzzyMatch> = BTreeMap::new();
224
225        let mut queue: Vec<State> = Vec::with_capacity(64);
226
227        trace!(
228            "=== fuzzy_search on {haystack:?} (similarity_threshold {similarity_threshold:.2}) ===",
229        );
230        for start in 0..text_chars.len() {
231            trace!(
232                "=== new window at grapheme #{start} ({:?}) ===",
233                text_chars[start]
234            );
235
236            queue.clear();
237            queue.push(State {
238                node: 0,
239                j: start,
240                matched_start: start,
241                matched_end: start,
242                penalties: 0.,
243                edits: 0,
244                insertions: 0,
245                deletions: 0,
246                substitutions: 0,
247                swaps: 0,
248                #[cfg(debug_assertions)]
249                notes: vec![],
250            });
251
252            let mut q_idx = 0;
253            while q_idx < queue.len() {
254                let State {
255                    node,
256                    j,
257                    matched_start,
258                    matched_end,
259                    penalties,
260                    edits,
261                    insertions,
262                    deletions,
263                    substitutions,
264                    swaps,
265                    ..
266                } = queue[q_idx];
267                #[cfg(debug_assertions)]
268                let mut notes = queue[q_idx].notes.clone();
269                q_idx += 1;
270
271                /*trace!(
272                    "visit: node={} j={} span=({}->{}) score={:.3} edits={}",
273                    node, j, matched_start, matched_end, score, edits
274                );*/
275
276                let Node {
277                    output,
278                    transitions,
279                    ..
280                } = &self.nodes[node];
281
282                if !output.is_empty() {
283                    self.scalar_output_handling(
284                        output,
285                        penalties,
286                        edits,
287                        insertions,
288                        deletions,
289                        substitutions,
290                        swaps,
291                        matched_start,
292                        matched_end,
293                        &grapheme_idx,
294                        haystack,
295                        &mut best,
296                        similarity_threshold,
297                        #[cfg(debug_assertions)]
298                        &notes,
299                    );
300                }
301
302                if j == text_chars.len() {
303                    continue;
304                }
305
306                let current_grapheme = text_chars[j].as_ref();
307                let current_grapheme_first_char = current_grapheme.chars().next().unwrap_or('\0');
308
309                // 1)  Same or similar symbol
310                for (edge_g, &next_node) in transitions {
311                    #[cfg(debug_assertions)]
312                    let mut notes = notes.clone();
313                    let g_ch = edge_g.chars().next().unwrap_or('\0');
314                    let (next_penalties, next_edits, next_subs) = if edge_g == current_grapheme {
315                        trace!(
316                            "  match   {:>8} ─{:>3}→ node={}  sim=1.00",
317                            edge_g, "ok", next_node
318                        );
319                        (penalties, edits, substitutions)
320                    } else {
321                        let sim = *self
322                            .similarity
323                            .get(&(g_ch, current_grapheme_first_char))
324                            .unwrap_or(&0.);
325                        let penalty = self.penalties.substitution * (1. - sim);
326
327                        if !self.within_limits_subst(
328                            self.get_node_limits(node),
329                            edits,
330                            substitutions,
331                        ) {
332                            continue;
333                        }
334
335                        trace!(
336                            "  subst {:?} ─{:>3}→ {current_grapheme:?} node={:?}  base_penalty={:.2} sim={:.2} penalty={:.2} edits+1={:?}",
337                            edge_g,
338                            "sub",
339                            next_node,
340                            self.penalties.substitution,
341                            sim,
342                            penalty,
343                            edits + 1
344                        );
345                        #[cfg(debug_assertions)]
346                        notes.push(format!(
347                            "sub {edge_g:?} -> {current_grapheme:?} (sim={sim:?}, penalty={penalty:?}) (sub+1={:?}, edits+1={:?})",
348                            substitutions + 1,
349                            edits + 1,
350                        ));
351                        (penalties + penalty, edits + 1, substitutions + 1)
352                    };
353
354                    queue.push(State {
355                        node: next_node,
356                        j: j + 1,
357                        matched_start: if matched_end == matched_start {
358                            j
359                        } else {
360                            matched_start
361                        },
362                        matched_end: j + 1,
363                        penalties: next_penalties,
364                        edits: next_edits,
365                        insertions,
366                        deletions,
367                        substitutions: next_subs,
368                        swaps,
369                        #[cfg(debug_assertions)]
370                        notes,
371                    });
372                }
373
374                // Swap (transposition of two neighboring graphemes)
375                if j + 1 < text_chars.len() {
376                    let a = &text_chars[j];
377                    let b = &text_chars[j + 1];
378                    // check if the node has B-transition and then A-transition
379                    if let Some(&node) = transitions
380                        .get(b.as_ref())
381                        .and_then(|&x| self.nodes[x].transitions.get(a.as_ref()))
382                    {
383                        // Checking swap
384                        // Correct option
385                        if self.within_limits_swap_ahead(self.get_node_limits(node), edits, swaps) {
386                            #[cfg(debug_assertions)]
387                            notes.push(format!(
388                                "swap a:{a:?} b:{b:?} (swaps+1={:?}, edits+1={:?})",
389                                swaps + 1,
390                                edits + 1,
391                            ));
392                            queue.push(State {
393                                node,
394                                j: j + 2,
395                                matched_start,
396                                matched_end: j + 2,
397                                penalties: penalties + self.penalties.swap,
398                                edits: edits + 1,
399                                insertions,
400                                deletions,
401                                substitutions,
402                                swaps: swaps + 1,
403                                #[cfg(debug_assertions)]
404                                notes: notes.clone(),
405                            });
406                        }
407                    }
408                }
409
410                // 3)  Insertion / Deletion
411                let (ins_ex, del_ex) = self.within_limits_ins_del_ahead(
412                    self.get_node_limits(node),
413                    edits,
414                    insertions,
415                    deletions,
416                );
417                if ins_ex || del_ex {
418                    trace!(
419                        "  insert  (skip {:?})  penalty={:.2}",
420                        text_chars[j], self.penalties.insertion
421                    );
422                    if ins_ex && (matched_start != matched_end || matched_start != j) {
423                        #[cfg(debug_assertions)]
424                        notes.push(format!(
425                            "ins {:?} (sub+1={:?}, edits+1={:?})",
426                            text_chars[j],
427                            substitutions + 1,
428                            edits + 1,
429                        ));
430                        queue.push(State {
431                            node,
432                            j: j + 1,
433                            matched_start,
434                            matched_end,
435                            penalties: penalties + self.penalties.insertion,
436                            edits: edits + 1,
437                            insertions: insertions + 1,
438                            deletions,
439                            substitutions,
440                            swaps,
441                            #[cfg(debug_assertions)]
442                            notes: notes.clone(),
443                        });
444                    }
445                    if del_ex {
446                        for (edge_g, &next_node) in transitions {
447                            trace!(
448                                "  delete to node={next_node} penalty={:.2}",
449                                self.penalties.deletion
450                            );
451                            #[cfg(debug_assertions)]
452                            notes.push(format!("del {:?} (del+1={:?})", edge_g, deletions + 1,));
453                            queue.push(State {
454                                node: next_node,
455                                j,
456                                matched_start,
457                                matched_end,
458                                penalties: penalties + self.penalties.deletion,
459                                edits: edits + 1,
460                                insertions,
461                                deletions: deletions + 1,
462                                substitutions,
463                                swaps,
464                                #[cfg(debug_assertions)]
465                                notes: notes.clone(),
466                            });
467                        }
468                    }
469                }
470            }
471        }
472
473        best.into_values().collect()
474    }
475
476    /// Search without overlapping matches (the engine will greedily choose the
477    /// longest non‑overlapping matches from left to right).
478    #[must_use]
479    pub fn search_non_overlapping(
480        &self,
481        haystack: &str,
482        similarity_threshold: f32,
483    ) -> Vec<FuzzyMatch> {
484        let mut matches = self.search(haystack, similarity_threshold);
485        #[cfg(test)]
486        trace!("raw matches: {:?}", matches);
487        matches.sort_by(|left, right| {
488            right
489                .similarity
490                .total_cmp(&left.similarity)
491                .then_with(|| (right.end - right.start).cmp(&(left.end - left.start)))
492                .then_with(|| left.start.cmp(&right.start))
493        });
494        let mut chosen = Vec::new();
495        let mut occupied_intervals: BTreeMap<usize, usize> = BTreeMap::new();
496        for matched in matches {
497            if occupied_intervals
498                .range(..=matched.start)
499                .next_back()
500                .is_none_or(|(_, &end)| end <= matched.start)
501                && occupied_intervals
502                    .range(matched.start..)
503                    .next()
504                    .is_none_or(|(&start, _)| start >= matched.end)
505            {
506                occupied_intervals.insert(matched.start, matched.end);
507                chosen.push(matched);
508            }
509        }
510
511        chosen.sort_by_key(|m| m.start);
512        chosen
513    }
514
515    /// Performs a **fuzzy** find‑and‑replace using a list of `(pattern →
516    /// replacement)` pairs.  Replacements are applied left‑to‑right, the longest
517    /// non‑overlapping match wins.
518    #[must_use]
519    pub fn replace<'a, F>(&self, text: &str, callback: F, threshold: f32) -> String
520    where
521        F: Fn(&FuzzyMatch) -> Option<&'a str>,
522    {
523        let mut result = String::new();
524        let mut last = 0;
525        for matched in self.search_non_overlapping(text, threshold) {
526            if matched.start >= last {
527                result.push_str(&text[last..matched.start]);
528                last = matched.end;
529                result.push_str(callback(&matched).unwrap_or(&matched.text));
530            }
531        }
532        result.push_str(&text[last..]);
533        result
534    }
535}