Skip to main content

fuzzy_regex/compat/
matches.rs

1// Suppress pedantic lints for compatibility layer
2#![allow(clippy::cast_precision_loss)]
3#![allow(clippy::needless_continue)]
4#![allow(clippy::must_use_candidate)]
5#![allow(clippy::len_without_is_empty)]
6
7//! Match types compatible with fuzzy-aho-corasick.
8
9use super::pattern::Pattern;
10use crate::types::NumEdits;
11use std::borrow::Cow;
12use std::collections::{BTreeMap, BTreeSet};
13
14/// A single fuzzy match result.
15#[derive(Debug, Clone, PartialEq)]
16pub struct FuzzyMatch<'a> {
17    /// Number of insertions.
18    pub insertions: NumEdits,
19    /// Number of deletions.
20    pub deletions: NumEdits,
21    /// Number of substitutions.
22    pub substitutions: NumEdits,
23    /// Number of swaps (transpositions).
24    pub swaps: NumEdits,
25    /// Total number of edits.
26    pub edits: NumEdits,
27    /// Index of the matched pattern.
28    pub pattern_index: usize,
29    /// Reference to the matched pattern.
30    pub pattern: &'a Pattern,
31    /// Start byte offset in the haystack.
32    pub start: usize,
33    /// End byte offset in the haystack.
34    pub end: usize,
35    /// Similarity score (0.0 to 1.0).
36    pub similarity: f32,
37    /// The matched text slice.
38    pub text: &'a str,
39}
40
41/// An unmatched segment between fuzzy matches.
42#[derive(Debug, Clone, PartialEq)]
43pub struct UnmatchedSegment<'a> {
44    /// Start byte offset.
45    pub start: usize,
46    /// End byte offset.
47    pub end: usize,
48    /// The unmatched text slice.
49    pub text: &'a str,
50}
51
52/// A segment of text - either matched or unmatched.
53#[derive(Debug, Clone, PartialEq)]
54pub enum Segment<'a> {
55    /// A fuzzy match.
56    Matched(FuzzyMatch<'a>),
57    /// An unmatched gap.
58    Unmatched(UnmatchedSegment<'a>),
59}
60
61impl<'a> Segment<'a> {
62    /// Get the matched segment if this is a match.
63    #[must_use]
64    pub fn matched(&'a self) -> Option<&'a FuzzyMatch<'a>> {
65        if let Segment::Matched(m) = self {
66            Some(m)
67        } else {
68            None
69        }
70    }
71
72    /// Get the unmatched segment if this is unmatched.
73    #[must_use]
74    pub fn unmatched(&'a self) -> Option<&'a UnmatchedSegment<'a>> {
75        if let Segment::Unmatched(u) = self {
76            Some(u)
77        } else {
78            None
79        }
80    }
81
82    /// Get the length of this segment in bytes.
83    #[must_use]
84    pub fn len(&self) -> usize {
85        match self {
86            Segment::Matched(m) => m.text.len(),
87            Segment::Unmatched(u) => u.text.len(),
88        }
89    }
90
91    /// Check if this segment is empty.
92    #[must_use]
93    pub fn is_empty(&self) -> bool {
94        self.len() == 0
95    }
96
97    /// Get the text content of this segment.
98    #[must_use]
99    pub fn as_str(&self) -> &str {
100        match self {
101            Segment::Matched(m) => m.text,
102            Segment::Unmatched(u) => u.text,
103        }
104    }
105}
106
107/// Unique ID for pattern deduplication.
108#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)]
109pub enum UniqueId {
110    /// Automatic ID based on pattern index.
111    Automatic(usize),
112    /// Custom user-provided ID.
113    Custom(usize),
114}
115
116/// A collection of fuzzy matches with sorting and filtering methods.
117#[derive(Debug)]
118pub struct FuzzyMatches<'a> {
119    pub(crate) haystack: &'a str,
120    /// The raw list of matches.
121    pub inner: Vec<FuzzyMatch<'a>>,
122}
123
124impl<'a> FuzzyMatches<'a> {
125    /// Default sorting: similarity (desc), pattern length (desc), text length (desc), start (asc).
126    pub fn default_sort(&mut self) {
127        self.inner.sort_by(|a, b| {
128            b.similarity
129                .total_cmp(&a.similarity)
130                .then_with(|| b.pattern.len().cmp(&a.pattern.len()))
131                .then_with(|| b.text.len().cmp(&a.text.len()))
132                .then_with(|| a.start.cmp(&b.start))
133        });
134    }
135
136    /// Greedy sorting: pattern length (desc), similarity (desc), start (asc).
137    pub fn greedy_sort(&mut self) {
138        self.inner.sort_by(|a, b| {
139            b.pattern
140                .len()
141                .cmp(&a.pattern.len())
142                .then_with(|| b.similarity.total_cmp(&a.similarity))
143                .then_with(|| a.start.cmp(&b.start))
144        });
145    }
146
147    /// Coverage-weighted sorting using similarity^2 * `pattern_len`.
148    pub fn coverage_weighted_sort(&mut self) {
149        self.inner.sort_by(|a, b| {
150            let score_a = a.similarity * a.similarity * a.pattern.len() as f32;
151            let score_b = b.similarity * b.similarity * b.pattern.len() as f32;
152            score_b
153                .total_cmp(&score_a)
154                .then_with(|| b.similarity.total_cmp(&a.similarity))
155                .then_with(|| a.start.cmp(&b.start))
156        });
157    }
158
159    /// Retain only non-overlapping matches.
160    pub fn non_overlapping(&mut self) {
161        let mut occupied: BTreeMap<usize, usize> = BTreeMap::new();
162        self.inner.retain(|m| {
163            let overlaps_before = occupied
164                .range(..=m.start)
165                .next_back()
166                .is_some_and(|(_, &end)| end > m.start);
167            let overlaps_after = occupied
168                .range(m.start..)
169                .next()
170                .is_some_and(|(&start, _)| start < m.end);
171
172            if !overlaps_before && !overlaps_after {
173                occupied.insert(m.start, m.end);
174                true
175            } else {
176                false
177            }
178        });
179        self.inner.sort_by_key(|m| m.start);
180    }
181
182    /// Retain only non-overlapping matches with unique patterns.
183    pub fn non_overlapping_unique(&mut self) {
184        let mut used_patterns: BTreeSet<UniqueId> = BTreeSet::new();
185        let mut occupied: BTreeMap<usize, usize> = BTreeMap::new();
186
187        self.inner.retain(|m| {
188            let unique_id = m
189                .pattern
190                .custom_unique_id
191                .map_or(UniqueId::Automatic(m.pattern_index), UniqueId::Custom);
192
193            if used_patterns.contains(&unique_id) {
194                return false;
195            }
196
197            let overlaps_before = occupied
198                .range(..=m.start)
199                .next_back()
200                .is_some_and(|(_, &end)| end > m.start);
201            let overlaps_after = occupied
202                .range(m.start..)
203                .next()
204                .is_some_and(|(&start, _)| start < m.end);
205
206            if !overlaps_before && !overlaps_after {
207                used_patterns.insert(unique_id);
208                occupied.insert(m.start, m.end);
209                true
210            } else {
211                false
212            }
213        });
214        self.inner.sort_by_key(|m| m.start);
215    }
216
217    /// Replace matches using a callback function.
218    #[must_use]
219    pub fn replace<F, S>(&self, callback: F) -> String
220    where
221        F: Fn(&FuzzyMatch<'a>) -> Option<S>,
222        S: Into<Cow<'a, str>>,
223    {
224        let mut result = String::new();
225        let mut last = 0;
226
227        for m in &self.inner {
228            if m.start >= last {
229                result.push_str(&self.haystack[last..m.start]);
230                last = m.end;
231
232                match callback(m) {
233                    Some(repl) => result.push_str(&repl.into()),
234                    None => result.push_str(m.text),
235                }
236            }
237        }
238        result.push_str(&self.haystack[last..]);
239        result
240    }
241
242    /// Strip matched prefix and leading whitespace.
243    #[must_use]
244    pub fn strip_prefix(self) -> String {
245        let mut result = String::new();
246        let mut skipping = true;
247
248        for segment in self.segment_iter() {
249            match segment {
250                Segment::Matched(_) if skipping => {}
251                Segment::Matched(m) => result.push_str(m.text),
252                Segment::Unmatched(u) if skipping => {
253                    if u.text.trim().is_empty() {
254                        continue;
255                    }
256                    skipping = false;
257                    result.push_str(u.text.trim_start());
258                }
259                Segment::Unmatched(u) => result.push_str(u.text),
260            }
261        }
262        result
263    }
264
265    /// Strip matched suffix and trailing whitespace.
266    #[must_use]
267    pub fn strip_postfix(self) -> String {
268        let segments: Vec<_> = self.segment_iter().collect();
269        let mut keep = 0;
270
271        for (i, seg) in segments.iter().enumerate() {
272            if let Segment::Unmatched(u) = seg
273                && !u.text.trim().is_empty()
274            {
275                keep = i + 1;
276            }
277        }
278
279        let mut result = String::new();
280        for (i, seg) in segments.into_iter().take(keep).enumerate() {
281            let is_last = i + 1 == keep;
282            match seg {
283                Segment::Matched(m) => result.push_str(m.text),
284                Segment::Unmatched(u) if is_last => result.push_str(u.text.trim_end()),
285                Segment::Unmatched(u) => result.push_str(u.text),
286            }
287        }
288        result
289    }
290
291    /// Split the haystack by matches, returning unmatched parts.
292    pub fn split(self) -> impl Iterator<Item = &'a str> + 'a {
293        let mut segments = self.segment_iter();
294        std::iter::from_fn(move || {
295            for seg in segments.by_ref() {
296                if let Segment::Unmatched(u) = seg {
297                    return Some(u.text);
298                }
299            }
300            None
301        })
302    }
303
304    /// Iterate over matches.
305    pub fn iter(&self) -> impl Iterator<Item = &FuzzyMatch<'a>> {
306        self.inner.iter()
307    }
308
309    /// Iterate mutably over matches.
310    pub fn iter_mut(&mut self) -> impl Iterator<Item = &mut FuzzyMatch<'a>> {
311        self.inner.iter_mut()
312    }
313
314    /// Get mutable access to the inner vector.
315    pub fn inner_mut(&mut self) -> &mut Vec<FuzzyMatch<'a>> {
316        &mut self.inner
317    }
318
319    /// Get the number of matches.
320    #[must_use]
321    pub fn len(&self) -> usize {
322        self.inner.len()
323    }
324
325    /// Check if there are no matches.
326    #[must_use]
327    pub fn is_empty(&self) -> bool {
328        self.inner.is_empty()
329    }
330
331    /// Retain matches that satisfy a predicate.
332    pub fn retain<F>(&mut self, pred: F) -> &mut Self
333    where
334        F: Fn(&FuzzyMatch<'a>) -> bool,
335    {
336        self.inner.retain(pred);
337        self
338    }
339
340    /// Filter matches by a predicate, returning a new `FuzzyMatches`.
341    #[must_use]
342    pub fn filter<F>(&self, pred: F) -> FuzzyMatches<'a>
343    where
344        F: Fn(&FuzzyMatch<'a>) -> bool,
345    {
346        FuzzyMatches {
347            haystack: self.haystack,
348            inner: self.inner.iter().filter(|m| pred(m)).cloned().collect(),
349        }
350    }
351
352    /// Get the byte spans of all matches.
353    #[must_use]
354    pub fn matched_spans(&self) -> Vec<(usize, usize)> {
355        self.inner.iter().map(|m| (m.start, m.end)).collect()
356    }
357
358    /// Get the matched text strings.
359    #[must_use]
360    pub fn matched_strings(&self) -> Vec<&'a str> {
361        self.inner.iter().map(|m| m.text).collect()
362    }
363
364    /// Iterate over segments (matched and unmatched).
365    pub fn segment_iter(self) -> impl Iterator<Item = Segment<'a>> {
366        let mut segments = Vec::new();
367        let mut last = 0;
368
369        for m in self.inner {
370            if m.start >= last {
371                if m.start > last {
372                    segments.push(Segment::Unmatched(UnmatchedSegment {
373                        start: last,
374                        end: m.start,
375                        text: &self.haystack[last..m.start],
376                    }));
377                }
378                last = m.end;
379                segments.push(Segment::Matched(m));
380            }
381        }
382
383        if last < self.haystack.len() {
384            segments.push(Segment::Unmatched(UnmatchedSegment {
385                start: last,
386                end: self.haystack.len(),
387                text: &self.haystack[last..],
388            }));
389        }
390
391        segments.into_iter()
392    }
393
394    /// Reconstruct text with smart spacing around matches.
395    #[must_use]
396    pub fn segment_text(self) -> String {
397        const SPACE: [char; 2] = [' ', '\t'];
398        const NO_LEADING_SPACE: [char; 9] = [',', '.', '?', '!', ';', ':', '—', '-', '…'];
399
400        let mut result = String::new();
401        let mut prev_matched = false;
402
403        for segment in self.segment_iter() {
404            match segment {
405                Segment::Matched(m) => {
406                    if prev_matched || (!result.is_empty() && !result.ends_with(SPACE)) {
407                        result.push(' ');
408                    }
409                    prev_matched = true;
410                    result.push_str(m.text);
411                }
412                Segment::Unmatched(u) => {
413                    if prev_matched && !u.text.starts_with(NO_LEADING_SPACE) {
414                        result.push(' ');
415                    }
416                    prev_matched = false;
417                    result.push_str(u.text);
418                }
419            }
420        }
421        result
422    }
423}
424
425// Iterator implementations
426impl<'a, 'b> IntoIterator for &'b FuzzyMatches<'a> {
427    type Item = &'b FuzzyMatch<'a>;
428    type IntoIter = std::slice::Iter<'b, FuzzyMatch<'a>>;
429
430    fn into_iter(self) -> Self::IntoIter {
431        self.inner.iter()
432    }
433}
434
435impl<'a, 'b> IntoIterator for &'b mut FuzzyMatches<'a> {
436    type Item = &'b mut FuzzyMatch<'a>;
437    type IntoIter = std::slice::IterMut<'b, FuzzyMatch<'a>>;
438
439    fn into_iter(self) -> Self::IntoIter {
440        self.inner.iter_mut()
441    }
442}
443
444impl<'a> IntoIterator for FuzzyMatches<'a> {
445    type Item = FuzzyMatch<'a>;
446    type IntoIter = std::vec::IntoIter<FuzzyMatch<'a>>;
447
448    fn into_iter(self) -> Self::IntoIter {
449        self.inner.into_iter()
450    }
451}
452
453impl<'a> std::ops::Deref for FuzzyMatches<'a> {
454    type Target = [FuzzyMatch<'a>];
455
456    fn deref(&self) -> &Self::Target {
457        &self.inner
458    }
459}
460
461impl std::ops::DerefMut for FuzzyMatches<'_> {
462    fn deref_mut(&mut self) -> &mut Self::Target {
463        &mut self.inner
464    }
465}