1#![warn(clippy::pedantic)]
2#![allow(clippy::too_many_lines, clippy::cast_precision_loss)]
3mod builder;
4mod replacer;
5mod segment;
6mod structs;
7#[cfg(test)]
8mod tests;
9
10pub use replacer::FuzzyReplacer;
11
12pub use builder::FuzzyAhoCorasickBuilder;
13use std::borrow::Cow;
14use std::collections::BTreeMap;
15use unicode_segmentation::UnicodeSegmentation;
16pub type PatternIndex = usize;
17pub use structs::*;
18
19#[allow(unused_macros)]
20#[cfg(test)]
21macro_rules! trace {
22 ($($arg:tt)*) => { println!($($arg)*); };
23}
24#[allow(unused_macros)]
25#[cfg(not(test))]
26macro_rules! trace {
27 ($($arg:tt)*) => {};
28}
29
30impl FuzzyAhoCorasick {
32 #[inline]
33 fn get_node_limits(&self, node: usize) -> Option<&FuzzyLimits> {
34 self.nodes[node]
35 .pattern_index
36 .and_then(|i| self.patterns.get(i).and_then(|p| p.limits.as_ref()))
37 }
38 #[inline]
39 fn within_limits_ins_del_ahead(
40 &self,
41 limits: Option<&FuzzyLimits>,
42 edits: NumEdits,
43 insertions: NumEdits,
44 deletions: NumEdits,
45 ) -> (bool, bool) {
46 if let Some(max) = limits.or(self.limits.as_ref()) {
47 let edits_ok = max.edits.is_none_or(|max| edits < max);
48 (
49 edits_ok && max.insertions.is_none_or(|max| insertions < max),
50 edits_ok && max.deletions.is_none_or(|max| deletions < max),
51 )
52 } else {
53 (false, false)
54 }
55 }
56
57 #[inline]
58 fn within_limits_swap_ahead(
59 &self,
60 limits: Option<&FuzzyLimits>,
61 edits: NumEdits,
62 swaps: NumEdits,
63 ) -> bool {
64 if let Some(max) = limits.or(self.limits.as_ref()) {
65 let result =
66 max.edits.is_none_or(|max| edits < max) && max.swaps.is_none_or(|max| swaps < max);
67 result
72 } else {
73 false
74 }
75 }
76
77 #[inline]
78 fn within_limits_subst(
79 &self,
80 limits: Option<&FuzzyLimits>,
81 edits: NumEdits,
82 substitutions: NumEdits,
83 ) -> bool {
84 if let Some(max) = limits.or(self.limits.as_ref()) {
85 let result = max.edits.is_none_or(|max| edits <= max)
86 && max.substitutions.is_none_or(|max| substitutions <= max);
87 result
92 } else {
93 edits == 0 && substitutions == 0
94 }
95 }
96
97 #[inline]
98 fn within_limits(
99 &self,
100 limits: Option<&FuzzyLimits>,
101 edits: NumEdits,
102 insertions: NumEdits,
103 deletions: NumEdits,
104 substitutions: NumEdits,
105 swaps: NumEdits,
106 ) -> bool {
107 if let Some(max) = limits.or(self.limits.as_ref()) {
108 let result = max.edits.is_none_or(|max| edits <= max)
109 && max.insertions.is_none_or(|max| insertions <= max)
110 && max.deletions.is_none_or(|max| deletions <= max)
111 && max.substitutions.is_none_or(|max| substitutions <= max)
112 && max.swaps.is_none_or(|max| swaps <= max);
113 result
118 } else {
119 edits == 0 && insertions == 0 && deletions == 0 && substitutions == 0 && swaps == 0
120 }
121 }
122
123 #[inline]
124 #[allow(clippy::too_many_arguments)]
125 fn scalar_output_handling(
126 &self,
127 output: &[usize],
128 penalties: f32,
129 edits: usize,
130 insertions: usize,
131 deletions: usize,
132 substitutions: usize,
133 swaps: usize,
134 matched_start: usize,
135 matched_end: usize,
136 grapheme_idx: &[(usize, &str)],
137 text: &str,
138 best: &mut BTreeMap<(usize, usize, usize), FuzzyMatch>,
139 similarity_threshold: f32,
140 #[cfg(debug_assertions)] notes: &Vec<String>,
141 ) {
142 for &pat_idx in output {
143 if !self.within_limits(
144 self.patterns[pat_idx].limits.as_ref(),
145 edits,
146 insertions,
147 deletions,
148 substitutions,
149 swaps,
150 ) {
151 continue;
152 }
153 let start_byte = grapheme_idx.get(matched_start).map_or(0, |&(b, _)| b);
154 let end_byte = grapheme_idx
155 .get(matched_end)
156 .map_or_else(|| text.len(), |&(b, _)| b);
157 let key = (start_byte, end_byte, pat_idx);
158
159 let total = self.patterns[pat_idx].grapheme_len as f32;
160
161 let similarity = (total - penalties) / total * self.patterns[pat_idx].weight;
162
163 if similarity < similarity_threshold {
164 continue;
165 }
166
167 best.entry(key)
168 .and_modify(|entry| {
169 if similarity > entry.similarity {
170 *entry = FuzzyMatch {
171 insertions,
172 deletions,
173 substitutions,
174 edits,
175 swaps,
176 pattern_index: pat_idx,
177 start: start_byte,
178 end: end_byte,
179 pattern: self.patterns[pat_idx].pattern.clone(),
180 similarity,
181 text: text[start_byte..end_byte].to_string(),
182 #[cfg(debug_assertions)]
183 notes: notes.to_owned(),
184 };
185 }
186 })
187 .or_insert_with(|| FuzzyMatch {
188 insertions,
189 deletions,
190 substitutions,
191 edits,
192 swaps,
193 pattern_index: pat_idx,
194 start: start_byte,
195 end: end_byte,
196 pattern: self.patterns[pat_idx].pattern.clone(),
197 similarity,
198 text: text[start_byte..end_byte].to_string(),
199 #[cfg(debug_assertions)]
200 notes: notes.clone(),
201 });
202 }
203 }
204
205 #[inline]
206 #[must_use]
207 pub fn search(&self, haystack: &str, similarity_threshold: f32) -> Vec<FuzzyMatch> {
208 let grapheme_idx: Vec<(usize, &str)> = haystack.grapheme_indices(true).collect();
209 if grapheme_idx.is_empty() {
210 return vec![];
211 }
212 let text_chars: Vec<Cow<str>> = grapheme_idx
213 .iter()
214 .map(|(_, g)| {
215 if self.case_insensitive {
216 Cow::Owned(g.to_lowercase())
217 } else {
218 Cow::Borrowed(*g)
219 }
220 })
221 .collect();
222
223 let mut best: BTreeMap<(usize, usize, usize), FuzzyMatch> = BTreeMap::new();
224
225 let mut queue: Vec<State> = Vec::with_capacity(64);
226
227 trace!(
228 "=== fuzzy_search on {haystack:?} (similarity_threshold {similarity_threshold:.2}) ===",
229 );
230 for start in 0..text_chars.len() {
231 trace!(
232 "=== new window at grapheme #{start} ({:?}) ===",
233 text_chars[start]
234 );
235
236 queue.clear();
237 queue.push(State {
238 node: 0,
239 j: start,
240 matched_start: start,
241 matched_end: start,
242 penalties: 0.,
243 edits: 0,
244 insertions: 0,
245 deletions: 0,
246 substitutions: 0,
247 swaps: 0,
248 #[cfg(debug_assertions)]
249 notes: vec![],
250 });
251
252 let mut q_idx = 0;
253 while q_idx < queue.len() {
254 let State {
255 node,
256 j,
257 matched_start,
258 matched_end,
259 penalties,
260 edits,
261 insertions,
262 deletions,
263 substitutions,
264 swaps,
265 ..
266 } = queue[q_idx];
267 #[cfg(debug_assertions)]
268 let mut notes = queue[q_idx].notes.clone();
269 q_idx += 1;
270
271 let Node {
277 output,
278 transitions,
279 ..
280 } = &self.nodes[node];
281
282 if !output.is_empty() {
283 self.scalar_output_handling(
284 output,
285 penalties,
286 edits,
287 insertions,
288 deletions,
289 substitutions,
290 swaps,
291 matched_start,
292 matched_end,
293 &grapheme_idx,
294 haystack,
295 &mut best,
296 similarity_threshold,
297 #[cfg(debug_assertions)]
298 ¬es,
299 );
300 }
301
302 if j == text_chars.len() {
303 continue;
304 }
305
306 let current_grapheme = text_chars[j].as_ref();
307 let current_grapheme_first_char = current_grapheme.chars().next().unwrap_or('\0');
308
309 for (edge_g, &next_node) in transitions {
311 #[cfg(debug_assertions)]
312 let mut notes = notes.clone();
313 let g_ch = edge_g.chars().next().unwrap_or('\0');
314 let (next_penalties, next_edits, next_subs) = if edge_g == current_grapheme {
315 trace!(
316 " match {:>8} ─{:>3}→ node={} sim=1.00",
317 edge_g, "ok", next_node
318 );
319 (penalties, edits, substitutions)
320 } else {
321 let sim = *self
322 .similarity
323 .get(&(g_ch, current_grapheme_first_char))
324 .unwrap_or(&0.);
325 let penalty = self.penalties.substitution * (1. - sim);
326
327 if !self.within_limits_subst(
328 self.get_node_limits(node),
329 edits,
330 substitutions,
331 ) {
332 continue;
333 }
334
335 trace!(
336 " subst {:?} ─{:>3}→ {current_grapheme:?} node={:?} base_penalty={:.2} sim={:.2} penalty={:.2} edits+1={:?}",
337 edge_g,
338 "sub",
339 next_node,
340 self.penalties.substitution,
341 sim,
342 penalty,
343 edits + 1
344 );
345 #[cfg(debug_assertions)]
346 notes.push(format!(
347 "sub {edge_g:?} -> {current_grapheme:?} (sim={sim:?}, penalty={penalty:?}) (sub+1={:?}, edits+1={:?})",
348 substitutions + 1,
349 edits + 1,
350 ));
351 (penalties + penalty, edits + 1, substitutions + 1)
352 };
353
354 queue.push(State {
355 node: next_node,
356 j: j + 1,
357 matched_start: if matched_end == matched_start {
358 j
359 } else {
360 matched_start
361 },
362 matched_end: j + 1,
363 penalties: next_penalties,
364 edits: next_edits,
365 insertions,
366 deletions,
367 substitutions: next_subs,
368 swaps,
369 #[cfg(debug_assertions)]
370 notes,
371 });
372 }
373
374 if j + 1 < text_chars.len() {
376 let a = &text_chars[j];
377 let b = &text_chars[j + 1];
378 if let Some(&node) = transitions
380 .get(b.as_ref())
381 .and_then(|&x| self.nodes[x].transitions.get(a.as_ref()))
382 {
383 if self.within_limits_swap_ahead(self.get_node_limits(node), edits, swaps) {
386 #[cfg(debug_assertions)]
387 notes.push(format!(
388 "swap a:{a:?} b:{b:?} (swaps+1={:?}, edits+1={:?})",
389 swaps + 1,
390 edits + 1,
391 ));
392 queue.push(State {
393 node,
394 j: j + 2,
395 matched_start,
396 matched_end: j + 2,
397 penalties: penalties + self.penalties.swap,
398 edits: edits + 1,
399 insertions,
400 deletions,
401 substitutions,
402 swaps: swaps + 1,
403 #[cfg(debug_assertions)]
404 notes: notes.clone(),
405 });
406 }
407 }
408 }
409
410 let (ins_ex, del_ex) = self.within_limits_ins_del_ahead(
412 self.get_node_limits(node),
413 edits,
414 insertions,
415 deletions,
416 );
417 if ins_ex || del_ex {
418 trace!(
419 " insert (skip {:?}) penalty={:.2}",
420 text_chars[j], self.penalties.insertion
421 );
422 if ins_ex && (matched_start != matched_end || matched_start != j) {
423 #[cfg(debug_assertions)]
424 notes.push(format!(
425 "ins {:?} (sub+1={:?}, edits+1={:?})",
426 text_chars[j],
427 substitutions + 1,
428 edits + 1,
429 ));
430 queue.push(State {
431 node,
432 j: j + 1,
433 matched_start,
434 matched_end,
435 penalties: penalties + self.penalties.insertion,
436 edits: edits + 1,
437 insertions: insertions + 1,
438 deletions,
439 substitutions,
440 swaps,
441 #[cfg(debug_assertions)]
442 notes: notes.clone(),
443 });
444 }
445 if del_ex {
446 for (edge_g, &next_node) in transitions {
447 trace!(
448 " delete to node={next_node} penalty={:.2}",
449 self.penalties.deletion
450 );
451 #[cfg(debug_assertions)]
452 notes.push(format!("del {:?} (del+1={:?})", edge_g, deletions + 1,));
453 queue.push(State {
454 node: next_node,
455 j,
456 matched_start,
457 matched_end,
458 penalties: penalties + self.penalties.deletion,
459 edits: edits + 1,
460 insertions,
461 deletions: deletions + 1,
462 substitutions,
463 swaps,
464 #[cfg(debug_assertions)]
465 notes: notes.clone(),
466 });
467 }
468 }
469 }
470 }
471 }
472
473 best.into_values().collect()
474 }
475
476 #[must_use]
479 pub fn search_non_overlapping(
480 &self,
481 haystack: &str,
482 similarity_threshold: f32,
483 ) -> Vec<FuzzyMatch> {
484 let mut matches = self.search(haystack, similarity_threshold);
485 #[cfg(test)]
486 trace!("raw matches: {:?}", matches);
487 matches.sort_by(|left, right| {
488 right
489 .similarity
490 .total_cmp(&left.similarity)
491 .then_with(|| (right.end - right.start).cmp(&(left.end - left.start)))
492 .then_with(|| left.start.cmp(&right.start))
493 });
494 let mut chosen = Vec::new();
495 let mut occupied_intervals: BTreeMap<usize, usize> = BTreeMap::new();
496 for matched in matches {
497 if occupied_intervals
498 .range(..=matched.start)
499 .next_back()
500 .is_none_or(|(_, &end)| end <= matched.start)
501 && occupied_intervals
502 .range(matched.start..)
503 .next()
504 .is_none_or(|(&start, _)| start >= matched.end)
505 {
506 occupied_intervals.insert(matched.start, matched.end);
507 chosen.push(matched);
508 }
509 }
510
511 chosen.sort_by_key(|m| m.start);
512 chosen
513 }
514
515 #[must_use]
519 pub fn replace<'a, F>(&self, text: &str, callback: F, threshold: f32) -> String
520 where
521 F: Fn(&FuzzyMatch) -> Option<&'a str>,
522 {
523 let mut result = String::new();
524 let mut last = 0;
525 for matched in self.search_non_overlapping(text, threshold) {
526 if matched.start >= last {
527 result.push_str(&text[last..matched.start]);
528 last = matched.end;
529 result.push_str(callback(&matched).unwrap_or(&matched.text));
530 }
531 }
532 result.push_str(&text[last..]);
533 result
534 }
535}