edit/
fuzzy.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Fuzzy search algorithm based on the one used in VS Code (`/src/vs/base/common/fuzzyScorer.ts`).
5//! Other algorithms exist, such as Sublime Text's, or the one used in `fzf`,
6//! but I figured that this one is what lots of people may be familiar with.
7
8use std::vec;
9
10use crate::arena::{Arena, scratch_arena};
11use crate::icu;
12
13const NO_MATCH: i32 = 0;
14
15pub fn score_fuzzy<'a>(
16    arena: &'a Arena,
17    haystack: &str,
18    needle: &str,
19    allow_non_contiguous_matches: bool,
20) -> (i32, Vec<usize, &'a Arena>) {
21    if haystack.is_empty() || needle.is_empty() {
22        // return early if target or query are empty
23        return (NO_MATCH, Vec::new_in(arena));
24    }
25
26    let scratch = scratch_arena(Some(arena));
27    let target = map_chars(&scratch, haystack);
28    let query = map_chars(&scratch, needle);
29
30    if target.len() < query.len() {
31        // impossible for query to be contained in target
32        return (NO_MATCH, Vec::new_in(arena));
33    }
34
35    let target_lower = icu::fold_case(&scratch, haystack);
36    let query_lower = icu::fold_case(&scratch, needle);
37    let target_lower = map_chars(&scratch, &target_lower);
38    let query_lower = map_chars(&scratch, &query_lower);
39
40    let area = query.len() * target.len();
41    let mut scores = vec::from_elem_in(0, area, &*scratch);
42    let mut matches = vec::from_elem_in(0, area, &*scratch);
43
44    //
45    // Build Scorer Matrix:
46    //
47    // The matrix is composed of query q and target t. For each index we score
48    // q[i] with t[i] and compare that with the previous score. If the score is
49    // equal or larger, we keep the match. In addition to the score, we also keep
50    // the length of the consecutive matches to use as boost for the score.
51    //
52    //      t   a   r   g   e   t
53    //  q
54    //  u
55    //  e
56    //  r
57    //  y
58    //
59    for query_index in 0..query.len() {
60        let query_index_offset = query_index * target.len();
61        let query_index_previous_offset =
62            if query_index > 0 { (query_index - 1) * target.len() } else { 0 };
63
64        for target_index in 0..target.len() {
65            let current_index = query_index_offset + target_index;
66            let diag_index = if query_index > 0 && target_index > 0 {
67                query_index_previous_offset + target_index - 1
68            } else {
69                0
70            };
71            let left_score = if target_index > 0 { scores[current_index - 1] } else { 0 };
72            let diag_score =
73                if query_index > 0 && target_index > 0 { scores[diag_index] } else { 0 };
74            let matches_sequence_len =
75                if query_index > 0 && target_index > 0 { matches[diag_index] } else { 0 };
76
77            // If we are not matching on the first query character anymore, we only produce a
78            // score if we had a score previously for the last query index (by looking at the diagScore).
79            // This makes sure that the query always matches in sequence on the target. For example
80            // given a target of "ede" and a query of "de", we would otherwise produce a wrong high score
81            // for query[1] ("e") matching on target[0] ("e") because of the "beginning of word" boost.
82            let score = if diag_score == 0 && query_index != 0 {
83                0
84            } else {
85                compute_char_score(
86                    query[query_index],
87                    query_lower[query_index],
88                    if target_index != 0 { Some(target[target_index - 1]) } else { None },
89                    target[target_index],
90                    target_lower[target_index],
91                    matches_sequence_len,
92                )
93            };
94
95            // We have a score and its equal or larger than the left score
96            // Match: sequence continues growing from previous diag value
97            // Score: increases by diag score value
98            let is_valid_score = score != 0 && diag_score + score >= left_score;
99            if is_valid_score
100                && (
101                    // We don't need to check if it's contiguous if we allow non-contiguous matches
102                    allow_non_contiguous_matches ||
103                        // We must be looking for a contiguous match.
104                        // Looking at an index above 0 in the query means we must have already
105                        // found out this is contiguous otherwise there wouldn't have been a score
106                        query_index > 0 ||
107                        // lastly check if the query is completely contiguous at this index in the target
108                        target_lower[target_index..].starts_with(&query_lower)
109                )
110            {
111                matches[current_index] = matches_sequence_len + 1;
112                scores[current_index] = diag_score + score;
113            } else {
114                // We either have no score or the score is lower than the left score
115                // Match: reset to 0
116                // Score: pick up from left hand side
117                matches[current_index] = NO_MATCH;
118                scores[current_index] = left_score;
119            }
120        }
121    }
122
123    // Restore Positions (starting from bottom right of matrix)
124    let mut positions = Vec::new_in(arena);
125
126    if !query.is_empty() && !target.is_empty() {
127        let mut query_index = query.len() - 1;
128        let mut target_index = target.len() - 1;
129
130        loop {
131            let current_index = query_index * target.len() + target_index;
132            if matches[current_index] == NO_MATCH {
133                if target_index == 0 {
134                    break;
135                }
136                target_index -= 1; // go left
137            } else {
138                positions.push(target_index);
139
140                // go up and left
141                if query_index == 0 || target_index == 0 {
142                    break;
143                }
144                query_index -= 1;
145                target_index -= 1;
146            }
147        }
148
149        positions.reverse();
150    }
151
152    (scores[area - 1], positions)
153}
154
155fn compute_char_score(
156    query: char,
157    query_lower: char,
158    target_prev: Option<char>,
159    target_curr: char,
160    target_curr_lower: char,
161    matches_sequence_len: i32,
162) -> i32 {
163    let mut score = 0;
164
165    if !consider_as_equal(query_lower, target_curr_lower) {
166        return score; // no match of characters
167    }
168
169    // Character match bonus
170    score += 1;
171
172    // Consecutive match bonus
173    if matches_sequence_len > 0 {
174        score += matches_sequence_len * 5;
175    }
176
177    // Same case bonus
178    if query == target_curr {
179        score += 1;
180    }
181
182    if let Some(target_prev) = target_prev {
183        // After separator bonus
184        let separator_bonus = score_separator_at_pos(target_prev);
185        if separator_bonus > 0 {
186            score += separator_bonus;
187        }
188        // Inside word upper case bonus (camel case). We only give this bonus if we're not in a contiguous sequence.
189        // For example:
190        // NPE => NullPointerException = boost
191        // HTTP => HTTP = not boost
192        else if target_curr != target_curr_lower && matches_sequence_len == 0 {
193            score += 2;
194        }
195    } else {
196        // Start of word bonus
197        score += 8;
198    }
199
200    score
201}
202
203fn consider_as_equal(a: char, b: char) -> bool {
204    // Special case path separators: ignore platform differences
205    a == b || (a == '/' && b == '\\') || (a == '\\' && b == '/')
206}
207
208fn score_separator_at_pos(ch: char) -> i32 {
209    match ch {
210        '/' | '\\' => 5,                               // prefer path separators...
211        '_' | '-' | '.' | ' ' | '\'' | '"' | ':' => 4, // ...over other separators
212        _ => 0,
213    }
214}
215
216fn map_chars<'a>(arena: &'a Arena, s: &str) -> Vec<char, &'a Arena> {
217    let mut chars = Vec::with_capacity_in(s.len(), arena);
218    chars.extend(s.chars());
219    chars.shrink_to_fit();
220    chars
221}