termgrid_core/search.rs
1//! Generic search helpers.
2//!
3//! This module intentionally provides small, deterministic building blocks that
4//! higher-level UIs can compose into experiences such as an fzf-style picker.
5
6use unicode_normalization::UnicodeNormalization;
7use unicode_segmentation::UnicodeSegmentation;
8
9/// Deterministic folding for matching.
10///
11/// We apply NFKC normalization and then a Unicode-aware lowercase transform.
12///
13/// Notes:
14/// - This is not a full Unicode case fold in the strict sense, but it is
15/// stable and deterministic without requiring additional case-fold tables.
16/// - Lowercasing can expand in some scripts, which is fine for our unit-based
17/// matching.
18fn nfkc_lower(s: &str) -> String {
19 // `nfkc()` yields an iterator of chars. We then apply a small, deterministic
20 // “casefold-ish” transform suitable for search keys:
21 // - Unicode NFKC normalization (handles compatibility forms like ligatures)
22 // - Unicode lowercase
23 // - Special-case German sharp s (ß/ẞ) → "ss" to allow multi-unit folds
24 //
25 // Note: This is not a complete implementation of Unicode CaseFolding. We
26 // implement the multi-unit fold we rely on for search UX while keeping the
27 // mapping back to original grapheme indices stable.
28 let mut out = String::new();
29 for c in s.nfkc() {
30 match c {
31 'ß' | 'ẞ' => out.push_str("ss"),
32 _ => out.extend(c.to_lowercase()),
33 }
34 }
35 out
36}
37
38/// Fold a string into comparable units.
39///
40/// We fold using NFKC + Unicode-aware lowercase, then split the folded output into grapheme
41/// clusters. This permits multi-codepoint folds (e.g. ß → ss) while retaining a
42/// stable mapping back to original grapheme indices (for candidate strings).
43fn fold_query_units(query: &str) -> Vec<String> {
44 let mut out: Vec<String> = Vec::new();
45 for g in query.graphemes(true) {
46 // Fast path for single-byte ASCII.
47 let b = g.as_bytes();
48 if b.len() == 1 && b[0].is_ascii() {
49 out.push(((b[0] as char).to_ascii_lowercase()).to_string());
50 continue;
51 }
52
53 let folded: String = nfkc_lower(g);
54 for fg in folded.graphemes(true) {
55 if !fg.is_empty() {
56 out.push(fg.to_string());
57 }
58 }
59 }
60 out
61}
62
63fn fold_candidate_units(candidate: &str) -> (Vec<&str>, Vec<(String, usize)>) {
64 let cgs: Vec<&str> = candidate.graphemes(true).collect();
65 let mut units: Vec<(String, usize)> = Vec::new();
66 for (i, cg) in cgs.iter().enumerate() {
67 let b = cg.as_bytes();
68 if b.len() == 1 && b[0].is_ascii() {
69 units.push((((b[0] as char).to_ascii_lowercase()).to_string(), i));
70 continue;
71 }
72
73 let folded: String = nfkc_lower(cg);
74 for fg in folded.graphemes(true) {
75 if !fg.is_empty() {
76 units.push((fg.to_string(), i));
77 }
78 }
79 }
80 (cgs, units)
81}
82
83/// Returns grapheme indices in `candidate` that match `query` in order.
84///
85/// Matching is case-insensitive by Unicode NFKC + lowercase folding.
86///
87/// Case folding may expand a single grapheme into multiple folded units (for
88/// example, ß → ss). Returned positions are still grapheme indices in the
89/// original `candidate` and collapse multi-unit folds to a single position,
90/// which is suitable for highlighting.
91///
92/// - If `query` is empty, returns `Some(vec![])`.
93/// - If all graphemes in `query` can be matched in order, returns their
94/// positions as grapheme indices in `candidate`.
95/// - Otherwise returns `None`.
96///
97/// This function performs no scoring. For a scored matcher that also returns
98/// match positions suitable for highlighting, use
99/// [`fuzzy_match_positions_graphemes`].
100pub fn match_positions_graphemes(query: &str, candidate: &str) -> Option<Vec<usize>> {
101 let qu: Vec<String> = fold_query_units(query);
102 if qu.is_empty() {
103 return Some(Vec::new());
104 }
105
106 let (_cgs, cu) = fold_candidate_units(candidate);
107 let mut q_i: usize = 0;
108 let mut raw_positions: Vec<usize> = Vec::new();
109
110 for (unit, orig_i) in cu {
111 if q_i >= qu.len() {
112 break;
113 }
114 if unit == qu[q_i] {
115 raw_positions.push(orig_i);
116 q_i += 1;
117 if q_i == qu.len() {
118 break;
119 }
120 }
121 }
122
123 if q_i != qu.len() {
124 return None;
125 }
126
127 // Collapse multi-unit matches that map to the same original candidate
128 // grapheme (e.g. ß -> ss) into a single highlight position.
129 let mut positions: Vec<usize> = Vec::new();
130 for p in raw_positions {
131 if positions.last().copied() != Some(p) {
132 positions.push(p);
133 }
134 }
135 Some(positions)
136}
137
138/// Returns `(positions, score)` for a case-insensitive fuzzy match.
139///
140/// "Fuzzy" here means ordered subsequence matching over grapheme clusters,
141/// plus a deterministic score to help sort candidates.
142///
143/// - `positions` are grapheme indices in `candidate` where query graphemes were
144/// matched.
145/// - `score` is higher for:
146/// - consecutive matches
147/// - matches at the start of the candidate
148/// - matches at word boundaries
149/// - shorter candidates (mild bonus)
150///
151/// Returns `None` if `query` cannot be matched in order.
152///
153/// Scoring model (intentionally simple and stable):
154/// - Base: +10 per matched grapheme
155/// - Consecutive bonus: +20 for each match that immediately follows the prior
156/// match (by grapheme index)
157/// - Start-of-string bonus: +30 if the first match is at index 0
158/// - Word-boundary bonus: +15 for each match whose position is a word boundary
159/// (start of string, or preceded by a non-word grapheme)
160/// - Gap penalty: -1 per non-matching grapheme between successive matches
161/// - Length penalty: -1 per grapheme in `candidate` (mild; favors shorter)
162///
163/// Word characters are ASCII letters/digits and underscore. Boundary is detected
164/// between a non-word grapheme and a word grapheme.
165///
166/// ## Versioning
167///
168/// This function is an alias for the stable scorer
169/// [`fuzzy_match_positions_graphemes_v1`].
170pub fn fuzzy_match_positions_graphemes(query: &str, candidate: &str) -> Option<(Vec<usize>, i64)> {
171 fuzzy_match_positions_graphemes_v1(query, candidate)
172}
173
174/// Latest version of the scored fuzzy matcher.
175///
176/// This function may change its scoring model in a future release. If you need
177/// a pinned scoring model, call a versioned function such as
178/// [`fuzzy_match_positions_graphemes_v1`].
179pub fn fuzzy_match_positions_graphemes_latest(
180 query: &str,
181 candidate: &str,
182) -> Option<(Vec<usize>, i64)> {
183 fuzzy_match_positions_graphemes_v1(query, candidate)
184}
185
186/// Version 1 of the scored fuzzy matcher.
187///
188/// This function's scoring model is intended to be stable across patch/minor
189/// releases. If a future scoring model is introduced, it will be exposed as a
190/// new versioned function (for example, `..._v2`).
191pub fn fuzzy_match_positions_graphemes_v1(
192 query: &str,
193 candidate: &str,
194) -> Option<(Vec<usize>, i64)> {
195 let qu: Vec<String> = fold_query_units(query);
196 if qu.is_empty() {
197 return Some((Vec::new(), 0));
198 }
199
200 let (cgs, cu) = fold_candidate_units(candidate);
201 let mut q_i: usize = 0;
202 let mut raw_positions: Vec<usize> = Vec::new();
203
204 for (unit, orig_i) in cu {
205 if q_i >= qu.len() {
206 break;
207 }
208 if unit == qu[q_i] {
209 raw_positions.push(orig_i);
210 q_i += 1;
211 if q_i == qu.len() {
212 break;
213 }
214 }
215 }
216
217 if q_i != qu.len() {
218 return None;
219 }
220
221 let mut positions: Vec<usize> = Vec::new();
222 for p in raw_positions {
223 if positions.last().copied() != Some(p) {
224 positions.push(p);
225 }
226 }
227
228 let score = score_match_positions(&cgs, &positions);
229 Some((positions, score))
230}
231
232fn is_ascii_word_grapheme(g: &str) -> bool {
233 // Treat a single ASCII byte as "word" if it is [A-Za-z0-9_].
234 // Multi-grapheme clusters (including emoji) are treated as non-word.
235 let b = g.as_bytes();
236 if b.len() != 1 {
237 return false;
238 }
239 matches!(b[0], b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
240}
241
242fn is_word_boundary(cgs: &[&str], pos: usize) -> bool {
243 if pos == 0 {
244 return true;
245 }
246 let cur_word = is_ascii_word_grapheme(cgs[pos]);
247 if !cur_word {
248 return false;
249 }
250 let prev_word = is_ascii_word_grapheme(cgs[pos - 1]);
251 !prev_word
252}
253
254fn score_match_positions(cgs: &[&str], positions: &[usize]) -> i64 {
255 let mut score: i64 = 0;
256
257 // Mild length penalty favors shorter candidates.
258 score -= cgs.len() as i64;
259
260 // Base per-match scoring.
261 score += 10 * positions.len() as i64;
262
263 // Start-of-string bonus.
264 if positions.first().copied() == Some(0) {
265 score += 30;
266 }
267
268 // Boundary and consecutive bonuses, plus gap penalty.
269 let mut prev: Option<usize> = None;
270 for &p in positions {
271 if is_word_boundary(cgs, p) {
272 score += 15;
273 }
274 if let Some(pp) = prev {
275 if p == pp + 1 {
276 score += 20;
277 } else if p > pp + 1 {
278 score -= (p - (pp + 1)) as i64;
279 }
280 }
281 prev = Some(p);
282 }
283
284 score
285}