Skip to main content

termgrid_core/
search.rs

1//! Generic search helpers.
2//!
3//! This module intentionally provides small, deterministic building blocks that
4//! higher-level UIs can compose into experiences such as an fzf-style picker.
5
6use unicode_normalization::UnicodeNormalization;
7use unicode_segmentation::UnicodeSegmentation;
8
9/// Deterministic folding for matching.
10///
11/// We apply NFKC normalization and then a Unicode-aware lowercase transform.
12///
13/// Notes:
14/// - This is not a full Unicode case fold in the strict sense, but it is
15///   stable and deterministic without requiring additional case-fold tables.
16/// - Lowercasing can expand in some scripts, which is fine for our unit-based
17///   matching.
18fn nfkc_lower(s: &str) -> String {
19    // `nfkc()` yields an iterator of chars. We then apply a small, deterministic
20    // “casefold-ish” transform suitable for search keys:
21    // - Unicode NFKC normalization (handles compatibility forms like ligatures)
22    // - Unicode lowercase
23    // - Special-case German sharp s (ß/ẞ) → "ss" to allow multi-unit folds
24    //
25    // Note: This is not a complete implementation of Unicode CaseFolding. We
26    // implement the multi-unit fold we rely on for search UX while keeping the
27    // mapping back to original grapheme indices stable.
28    let mut out = String::new();
29    for c in s.nfkc() {
30        match c {
31            'ß' | 'ẞ' => out.push_str("ss"),
32            _ => out.extend(c.to_lowercase()),
33        }
34    }
35    out
36}
37
38/// Fold a string into comparable units.
39///
40/// We fold using NFKC + Unicode-aware lowercase, then split the folded output into grapheme
41/// clusters. This permits multi-codepoint folds (e.g. ß → ss) while retaining a
42/// stable mapping back to original grapheme indices (for candidate strings).
43fn fold_query_units(query: &str) -> Vec<String> {
44    let mut out: Vec<String> = Vec::new();
45    for g in query.graphemes(true) {
46        // Fast path for single-byte ASCII.
47        let b = g.as_bytes();
48        if b.len() == 1 && b[0].is_ascii() {
49            out.push(((b[0] as char).to_ascii_lowercase()).to_string());
50            continue;
51        }
52
53        let folded: String = nfkc_lower(g);
54        for fg in folded.graphemes(true) {
55            if !fg.is_empty() {
56                out.push(fg.to_string());
57            }
58        }
59    }
60    out
61}
62
63fn fold_candidate_units(candidate: &str) -> (Vec<&str>, Vec<(String, usize)>) {
64    let cgs: Vec<&str> = candidate.graphemes(true).collect();
65    let mut units: Vec<(String, usize)> = Vec::new();
66    for (i, cg) in cgs.iter().enumerate() {
67        let b = cg.as_bytes();
68        if b.len() == 1 && b[0].is_ascii() {
69            units.push((((b[0] as char).to_ascii_lowercase()).to_string(), i));
70            continue;
71        }
72
73        let folded: String = nfkc_lower(cg);
74        for fg in folded.graphemes(true) {
75            if !fg.is_empty() {
76                units.push((fg.to_string(), i));
77            }
78        }
79    }
80    (cgs, units)
81}
82
83/// Returns grapheme indices in `candidate` that match `query` in order.
84///
85/// Matching is case-insensitive by Unicode NFKC + lowercase folding.
86///
87/// Case folding may expand a single grapheme into multiple folded units (for
88/// example, ß → ss). Returned positions are still grapheme indices in the
89/// original `candidate` and collapse multi-unit folds to a single position,
90/// which is suitable for highlighting.
91///
92/// - If `query` is empty, returns `Some(vec![])`.
93/// - If all graphemes in `query` can be matched in order, returns their
94///   positions as grapheme indices in `candidate`.
95/// - Otherwise returns `None`.
96///
97/// This function performs no scoring. For a scored matcher that also returns
98/// match positions suitable for highlighting, use
99/// [`fuzzy_match_positions_graphemes`].
100pub fn match_positions_graphemes(query: &str, candidate: &str) -> Option<Vec<usize>> {
101    let qu: Vec<String> = fold_query_units(query);
102    if qu.is_empty() {
103        return Some(Vec::new());
104    }
105
106    let (_cgs, cu) = fold_candidate_units(candidate);
107    let mut q_i: usize = 0;
108    let mut raw_positions: Vec<usize> = Vec::new();
109
110    for (unit, orig_i) in cu {
111        if q_i >= qu.len() {
112            break;
113        }
114        if unit == qu[q_i] {
115            raw_positions.push(orig_i);
116            q_i += 1;
117            if q_i == qu.len() {
118                break;
119            }
120        }
121    }
122
123    if q_i != qu.len() {
124        return None;
125    }
126
127    // Collapse multi-unit matches that map to the same original candidate
128    // grapheme (e.g. ß -> ss) into a single highlight position.
129    let mut positions: Vec<usize> = Vec::new();
130    for p in raw_positions {
131        if positions.last().copied() != Some(p) {
132            positions.push(p);
133        }
134    }
135    Some(positions)
136}
137
138/// Returns `(positions, score)` for a case-insensitive fuzzy match.
139///
140/// "Fuzzy" here means ordered subsequence matching over grapheme clusters,
141/// plus a deterministic score to help sort candidates.
142///
143/// - `positions` are grapheme indices in `candidate` where query graphemes were
144///   matched.
145/// - `score` is higher for:
146///   - consecutive matches
147///   - matches at the start of the candidate
148///   - matches at word boundaries
149///   - shorter candidates (mild bonus)
150///
151/// Returns `None` if `query` cannot be matched in order.
152///
153/// Scoring model (intentionally simple and stable):
154/// - Base: +10 per matched grapheme
155/// - Consecutive bonus: +20 for each match that immediately follows the prior
156///   match (by grapheme index)
157/// - Start-of-string bonus: +30 if the first match is at index 0
158/// - Word-boundary bonus: +15 for each match whose position is a word boundary
159///   (start of string, or preceded by a non-word grapheme)
160/// - Gap penalty: -1 per non-matching grapheme between successive matches
161/// - Length penalty: -1 per grapheme in `candidate` (mild; favors shorter)
162///
163/// Word characters are ASCII letters/digits and underscore. Boundary is detected
164/// between a non-word grapheme and a word grapheme.
165///
166/// ## Versioning
167///
168/// This function is an alias for the stable scorer
169/// [`fuzzy_match_positions_graphemes_v1`].
170pub fn fuzzy_match_positions_graphemes(query: &str, candidate: &str) -> Option<(Vec<usize>, i64)> {
171    fuzzy_match_positions_graphemes_v1(query, candidate)
172}
173
174/// Latest version of the scored fuzzy matcher.
175///
176/// This function may change its scoring model in a future release. If you need
177/// a pinned scoring model, call a versioned function such as
178/// [`fuzzy_match_positions_graphemes_v1`].
179pub fn fuzzy_match_positions_graphemes_latest(
180    query: &str,
181    candidate: &str,
182) -> Option<(Vec<usize>, i64)> {
183    fuzzy_match_positions_graphemes_v1(query, candidate)
184}
185
186/// Version 1 of the scored fuzzy matcher.
187///
188/// This function's scoring model is intended to be stable across patch/minor
189/// releases. If a future scoring model is introduced, it will be exposed as a
190/// new versioned function (for example, `..._v2`).
191pub fn fuzzy_match_positions_graphemes_v1(
192    query: &str,
193    candidate: &str,
194) -> Option<(Vec<usize>, i64)> {
195    let qu: Vec<String> = fold_query_units(query);
196    if qu.is_empty() {
197        return Some((Vec::new(), 0));
198    }
199
200    let (cgs, cu) = fold_candidate_units(candidate);
201    let mut q_i: usize = 0;
202    let mut raw_positions: Vec<usize> = Vec::new();
203
204    for (unit, orig_i) in cu {
205        if q_i >= qu.len() {
206            break;
207        }
208        if unit == qu[q_i] {
209            raw_positions.push(orig_i);
210            q_i += 1;
211            if q_i == qu.len() {
212                break;
213            }
214        }
215    }
216
217    if q_i != qu.len() {
218        return None;
219    }
220
221    let mut positions: Vec<usize> = Vec::new();
222    for p in raw_positions {
223        if positions.last().copied() != Some(p) {
224            positions.push(p);
225        }
226    }
227
228    let score = score_match_positions(&cgs, &positions);
229    Some((positions, score))
230}
231
232fn is_ascii_word_grapheme(g: &str) -> bool {
233    // Treat a single ASCII byte as "word" if it is [A-Za-z0-9_].
234    // Multi-grapheme clusters (including emoji) are treated as non-word.
235    let b = g.as_bytes();
236    if b.len() != 1 {
237        return false;
238    }
239    matches!(b[0], b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
240}
241
242fn is_word_boundary(cgs: &[&str], pos: usize) -> bool {
243    if pos == 0 {
244        return true;
245    }
246    let cur_word = is_ascii_word_grapheme(cgs[pos]);
247    if !cur_word {
248        return false;
249    }
250    let prev_word = is_ascii_word_grapheme(cgs[pos - 1]);
251    !prev_word
252}
253
254fn score_match_positions(cgs: &[&str], positions: &[usize]) -> i64 {
255    let mut score: i64 = 0;
256
257    // Mild length penalty favors shorter candidates.
258    score -= cgs.len() as i64;
259
260    // Base per-match scoring.
261    score += 10 * positions.len() as i64;
262
263    // Start-of-string bonus.
264    if positions.first().copied() == Some(0) {
265        score += 30;
266    }
267
268    // Boundary and consecutive bonuses, plus gap penalty.
269    let mut prev: Option<usize> = None;
270    for &p in positions {
271        if is_word_boundary(cgs, p) {
272            score += 15;
273        }
274        if let Some(pp) = prev {
275            if p == pp + 1 {
276                score += 20;
277            } else if p > pp + 1 {
278                score -= (p - (pp + 1)) as i64;
279            }
280        }
281        prev = Some(p);
282    }
283
284    score
285}