broot/pattern/
fuzzy_pattern.rs

1//! a fuzzy pattern matcher for filename filtering / sorting.
2//! It's not meant for file contents but for small strings (less than 1000 chars)
3//!  such as file names.
4
5use {
6    super::NameMatch,
7    secular,
8    smallvec::{
9        SmallVec,
10        smallvec,
11    },
12    std::fmt::{
13        self,
14        Write,
15    },
16};
17
18type CandChars = SmallVec<[char; 32]>;
19
20// weights used in match score computing
21const BONUS_MATCH: i32 = 50_000;
22const BONUS_EXACT: i32 = 1_000;
23const BONUS_START: i32 = 10;
24const BONUS_START_WORD: i32 = 5;
25const BONUS_CANDIDATE_LENGTH: i32 = -1; // per char
26const BONUS_MATCH_LENGTH: i32 = -10; // per char of length of the match
27const BONUS_NB_HOLES: i32 = -30; // there's also a max on that number
28const BONUS_SINGLED_CHAR: i32 = -15; // when there's a char, neither first not last, isolated
29
30/// A pattern for fuzzy matching
31#[derive(Debug, Clone)]
32pub struct FuzzyPattern {
33    chars: Box<[char]>, // secularized characters
34    max_nb_holes: usize,
35}
36
37impl fmt::Display for FuzzyPattern {
38    fn fmt(
39        &self,
40        f: &mut fmt::Formatter<'_>,
41    ) -> fmt::Result {
42        for &c in self.chars.iter() {
43            f.write_char(c)?
44        }
45        Ok(())
46    }
47}
48
49enum MatchSearchResult {
50    Perfect(NameMatch), // no need to test other positions
51    Some(NameMatch),
52    None,
53}
54
55fn is_word_separator(c: char) -> bool {
56    matches!(c, '_' | ' ' | '-')
57}
58
59impl FuzzyPattern {
60    /// build a pattern which will later be usable for fuzzy search.
61    /// A pattern should be reused
62    pub fn from(pat: &str) -> Self {
63        let chars = secular::normalized_lower_lay_string(pat)
64            .chars()
65            .collect::<Vec<char>>()
66            .into_boxed_slice();
67        let max_nb_holes = match chars.len() {
68            1 => 0,
69            2 => 1,
70            3 => 2,
71            4 => 2,
72            5 => 2,
73            6 => 3,
74            7 => 3,
75            8 => 4,
76            _ => chars.len() * 4 / 7,
77        };
78        FuzzyPattern {
79            chars,
80            max_nb_holes,
81        }
82    }
83
84    /// an "empty" pattern is one which accepts everything because
85    /// it has no discriminant
86    pub fn is_empty(&self) -> bool {
87        self.chars.is_empty()
88    }
89
90    fn tight_match_from_index(
91        &self,
92        cand_chars: &CandChars,
93        start_idx: usize, // start index in candidate, in chars
94    ) -> MatchSearchResult {
95        let mut pos = smallvec![0; self.chars.len()]; // positions of matching chars in candidate
96        let mut cand_idx = start_idx;
97        let mut pat_idx = 0; // index both in self.chars and pos
98        let mut in_hole = false;
99        loop {
100            if cand_chars[cand_idx] == self.chars[pat_idx] {
101                pos[pat_idx] = cand_idx;
102                if in_hole {
103                    // We're no more in a hole.
104                    // Let's look if we can bring back the chars before the hole
105                    let mut rev_idx = 1;
106                    loop {
107                        if pat_idx < rev_idx {
108                            break;
109                        }
110                        if cand_chars[cand_idx - rev_idx] == self.chars[pat_idx - rev_idx] {
111                            // we move the pos forward
112                            pos[pat_idx - rev_idx] = cand_idx - rev_idx;
113                        } else {
114                            break;
115                        }
116                        rev_idx += 1;
117                    }
118                    in_hole = false;
119                }
120                pat_idx += 1;
121                if pat_idx == self.chars.len() {
122                    break; // match, finished
123                }
124            } else {
125                // there's a hole
126                if cand_chars.len() - cand_idx <= self.chars.len() - pat_idx {
127                    return MatchSearchResult::None;
128                }
129                in_hole = true;
130            }
131            cand_idx += 1;
132        }
133        let mut nb_holes = 0;
134        let mut nb_singled_chars = 0;
135        for idx in 1..pos.len() {
136            if pos[idx] > 1 + pos[idx - 1] {
137                nb_holes += 1;
138                if idx > 1 && pos[idx - 1] > 1 + pos[idx - 2] {
139                    // we improve a simple case: the one of a singleton which was created
140                    // by pushing forward a char
141                    if cand_chars[pos[idx - 2] + 1] == cand_chars[pos[idx - 1]] {
142                        // in some cases we're really removing another singletons but
143                        // let's forget this
144                        pos[idx - 1] = pos[idx - 2] + 1;
145                        nb_holes -= 1;
146                    } else {
147                        nb_singled_chars += 1;
148                    }
149                }
150            }
151        }
152        if nb_holes > self.max_nb_holes {
153            return MatchSearchResult::None;
154        }
155        let match_len = 1 + cand_idx - pos[0];
156        let mut score = BONUS_MATCH;
157        score += BONUS_CANDIDATE_LENGTH * (cand_chars.len() as i32);
158        score += BONUS_SINGLED_CHAR * nb_singled_chars;
159        score += BONUS_NB_HOLES * (nb_holes as i32);
160        score += match_len as i32 * BONUS_MATCH_LENGTH;
161        if pos[0] == 0 {
162            score += BONUS_START + BONUS_START_WORD;
163            if cand_chars.len() == self.chars.len() {
164                score += BONUS_EXACT;
165                return MatchSearchResult::Perfect(NameMatch { score, pos });
166            }
167        } else {
168            let previous = cand_chars[pos[0] - 1];
169            if is_word_separator(previous) {
170                score += BONUS_START_WORD;
171                if cand_chars.len() - pos[0] == self.chars.len() {
172                    return MatchSearchResult::Perfect(NameMatch { score, pos });
173                }
174            }
175        }
176        MatchSearchResult::Some(NameMatch { score, pos })
177    }
178
179    /// return a match if the pattern can be found in the candidate string.
180    /// The algorithm tries to return the best one. For example if you search
181    /// "abc" in "ababca-abc", the returned match would be at the end.
182    pub fn find(
183        &self,
184        candidate: &str,
185    ) -> Option<NameMatch> {
186        if candidate.len() < self.chars.len() {
187            return None;
188        }
189        let mut cand_chars: CandChars = SmallVec::with_capacity(candidate.len());
190        cand_chars.extend(candidate.chars().map(secular::lower_lay_char));
191        if cand_chars.len() < self.chars.len() {
192            return None;
193        }
194        let mut best_score = 0;
195        let mut best_match: Option<NameMatch> = None;
196        let n = cand_chars.len() + 1 - self.chars.len();
197        for start_idx in 0..n {
198            if cand_chars[start_idx] == self.chars[0] {
199                match self.tight_match_from_index(&cand_chars, start_idx) {
200                    MatchSearchResult::Perfect(m) => {
201                        return Some(m);
202                    }
203                    MatchSearchResult::Some(m) => {
204                        if m.score > best_score {
205                            best_score = m.score;
206                            best_match = Some(m);
207                        }
208                        // we could make start_idx jump to pos[0] here
209                        // but it doesn't improve the perfs (it's rare
210                        // anyway to have pos[0] much greater than the
211                        // start of the search)
212                    }
213                    _ => {}
214                }
215            }
216        }
217        best_match
218    }
219
220    /// compute the score of the best match
221    pub fn score_of(
222        &self,
223        candidate: &str,
224    ) -> Option<i32> {
225        self.find(candidate).map(|nm| nm.score)
226    }
227}
228
229#[cfg(test)]
230mod fuzzy_pattern_tests {
231
232    use {
233        super::*,
234        crate::pattern::Pos,
235    };
236
237    /// check position of the match of the pattern in name
238    fn check_pos(
239        pattern: &str,
240        name: &str,
241        pos: &str,
242    ) {
243        let pat = FuzzyPattern::from(pattern);
244        let match_pos = pat.find(name).unwrap().pos;
245        let target_pos: Pos = pos
246            .chars()
247            .enumerate()
248            .filter(|(_, c)| *c == '^')
249            .map(|(i, _)| i)
250            .collect();
251        assert_eq!(match_pos, target_pos);
252    }
253
254    /// test the quality of match length and groups number minimization
255    #[test]
256    fn check_match_pos() {
257        check_pos("ba", "babababaaa", "^^        ");
258        check_pos("baa", "babababaaa", "      ^^^ ");
259        check_pos("bbabaa", "babababaaa", "  ^ ^^^^^ ");
260        check_pos("aoml", "bacon.coml", " ^     ^^^");
261        check_pos("broot", "ab br bro oxt ", "      ^^^ ^ ^ ");
262        check_pos("broot", "b_broooooot_broot", "            ^^^^^");
263        check_pos(
264            "buityp",
265            "builder-styles-less-typing.d.ts",
266            "^^^                 ^^^        ",
267        );
268        check_pos(
269            "broot",
270            "ветер br x o vent ootot",
271            "      ^^          ^^^  ",
272        );
273        check_pos(
274            "broot",
275            "brbroorrbbbbbrrooorobrototooooot.txt",
276            "                    ^^^ ^^          ",
277        );
278        check_pos("besh", "benches/shared", "^^      ^^    ");
279    }
280
281    /// check that the scores of all names are strictly decreasing
282    /// (pattern is first tested against itself).
283    /// We verify this property with both computation functions.
284    fn check_ordering_for(
285        pattern: &str,
286        names: &[&str],
287    ) {
288        let fp = FuzzyPattern::from(pattern);
289        let mut last_score = fp.find(pattern).map(|m| m.score);
290        let mut last_name = pattern;
291        for name in names {
292            let score = fp.find(name).map(|m| m.score);
293            assert!(
294                score < last_score,
295                "score({name:?}) should be lower than score({last_name:?}) (using find)"
296            );
297            last_name = name;
298            last_score = score;
299        }
300    }
301
302    #[test]
303    fn check_orderings() {
304        check_ordering_for(
305            "broot",
306            &[
307                "a broot",
308                "abbroot",
309                "abcbroot",
310                " abdbroot",
311                "1234broot1",
312                "12345brrrroooottt",
313                "12345brrr roooottt",
314                "brot",
315            ],
316        );
317        check_ordering_for(
318            "Abc",
319            &["abCd", "aBdc", " abdc", " abdbccccc", " a b c", "nothing"],
320        );
321        check_ordering_for(
322            "réveil",
323            &[
324                "Réveillon",
325                "Réveillons",
326                " réveils",
327                "πréveil",
328                "déréveil",
329                "Rêve-t-il ?",
330                " rêves",
331            ],
332        );
333    }
334
335    /// check that we don't fail to find strings differing by diacritics
336    /// or unicode normalization.
337    ///
338    /// Note that we can't go past the limits of unicode normalization
339    /// and for example I don't know how to make 'B́' one char (help welcome).
340    /// This should normally not cause any problem for the user as searching
341    /// `"ab"` in `"aB́"` will still match.
342    #[test]
343    fn check_equivalences() {
344        fn check_equivalences_in(arr: &[&str]) {
345            for pattern in arr.iter() {
346                let fp = FuzzyPattern::from(pattern);
347                for name in arr.iter() {
348                    println!("looking for pattern {pattern:?} in name {name:?}");
349                    assert!(fp.find(name).unwrap().score > 0);
350                }
351            }
352        }
353        check_equivalences_in(&["aB", "ab", "àb", "âB"]);
354        let c12 = "Comunicações";
355        assert_eq!(c12.len(), 14);
356        assert_eq!(c12.chars().count(), 12);
357        let c14 = "Comunicações";
358        assert_eq!(c14.len(), 16);
359        assert_eq!(c14.chars().count(), 14);
360        check_equivalences_in(&["comunicacoes", c12, c14]);
361        check_equivalences_in(&["у", "У"]);
362    }
363    /// check that there's no problem with ignoring case on cyrillic.
364    /// This problem arises when secular was compiled without the "bmp" feature.
365    /// See https://github.com/Canop/broot/issues/746
366    #[test]
367    fn issue_746() {
368        let fp = FuzzyPattern::from("устр");
369        assert!(fp.find("Устройства").is_some());
370    }
371}