Skip to main content

ftui_text/
search.rs

1#![forbid(unsafe_code)]
2
3//! Unicode-aware text search utilities.
4//!
5//! Feature-gated behind the `normalization` feature flag for normalization-aware
6//! and case-folded search. Basic exact search is always available.
7//!
8//! # Example
9//! ```
10//! use ftui_text::search::{SearchResult, search_exact};
11//!
12//! let results = search_exact("hello world hello", "hello");
13//! assert_eq!(results.len(), 2);
14//! assert_eq!(results[0].range, 0..5);
15//! assert_eq!(results[1].range, 12..17);
16//! ```
17
18/// A single search match with its byte range in the source text.
19#[derive(Debug, Clone, PartialEq, Eq)]
20pub struct SearchResult {
21    /// Byte offset range of the match in the source string.
22    pub range: std::ops::Range<usize>,
23}
24
25impl SearchResult {
26    /// Create a new search result.
27    #[must_use]
28    pub fn new(start: usize, end: usize) -> Self {
29        Self { range: start..end }
30    }
31
32    /// Extract the matched text from the source.
33    #[must_use]
34    pub fn text<'a>(&self, source: &'a str) -> &'a str {
35        &source[self.range.clone()]
36    }
37}
38
39/// Find all exact substring matches (byte-level, no normalization).
40///
41/// Returns non-overlapping matches from left to right.
42#[must_use]
43pub fn search_exact(haystack: &str, needle: &str) -> Vec<SearchResult> {
44    if needle.is_empty() {
45        return Vec::new();
46    }
47    let mut results = Vec::new();
48    let mut start = 0;
49    while let Some(pos) = haystack[start..].find(needle) {
50        let abs_pos = start + pos;
51        results.push(SearchResult::new(abs_pos, abs_pos + needle.len()));
52        start = abs_pos + needle.len();
53    }
54    results
55}
56
57/// Find all exact substring matches, allowing overlapping results.
58///
59/// Advances by one byte after each match start.
60#[must_use]
61pub fn search_exact_overlapping(haystack: &str, needle: &str) -> Vec<SearchResult> {
62    if needle.is_empty() {
63        return Vec::new();
64    }
65    let mut results = Vec::new();
66    let mut start = 0;
67    while let Some(pos) = haystack[start..].find(needle) {
68        let abs_pos = start + pos;
69        results.push(SearchResult::new(abs_pos, abs_pos + needle.len()));
70        // Advance by one char (not byte) to handle multi-byte chars correctly
71        start = abs_pos + 1;
72        // Ensure we're at a char boundary
73        while start < haystack.len() && !haystack.is_char_boundary(start) {
74            start += 1;
75        }
76    }
77    results
78}
79
80/// Case-insensitive search using simple ASCII lowering.
81///
82/// For full Unicode case folding, use [`search_case_insensitive`] with the
83/// `normalization` feature enabled.
84#[must_use]
85pub fn search_ascii_case_insensitive(haystack: &str, needle: &str) -> Vec<SearchResult> {
86    if needle.is_empty() {
87        return Vec::new();
88    }
89    let haystack_lower = haystack.to_ascii_lowercase();
90    let needle_lower = needle.to_ascii_lowercase();
91    let mut results = Vec::new();
92    let mut start = 0;
93    while let Some(pos) = haystack_lower[start..].find(&needle_lower) {
94        let abs_pos = start + pos;
95        results.push(SearchResult::new(abs_pos, abs_pos + needle.len()));
96        start = abs_pos + needle.len();
97    }
98    results
99}
100
101/// Case-insensitive search using full Unicode case folding.
102///
103/// Uses NFKC normalization + lowercase for both haystack and needle,
104/// then maps result positions back to the original string.
105#[cfg(feature = "normalization")]
106#[must_use]
107pub fn search_case_insensitive(haystack: &str, needle: &str) -> Vec<SearchResult> {
108    if needle.is_empty() {
109        return Vec::new();
110    }
111    let needle_norm = crate::normalization::normalize_for_search(needle);
112    if needle_norm.is_empty() {
113        return Vec::new();
114    }
115
116    use unicode_segmentation::UnicodeSegmentation;
117
118    // Build mapping using grapheme clusters for correct normalization boundaries.
119    // Track both start and end byte offsets for each normalized byte so
120    // matches that land inside a grapheme expansion still map to a full
121    // grapheme range in the original string.
122    let mut norm_start_map: Vec<usize> = Vec::new();
123    let mut norm_end_map: Vec<usize> = Vec::new();
124    let mut normalized = String::new();
125
126    for (orig_byte, grapheme) in haystack.grapheme_indices(true) {
127        let chunk = crate::normalization::normalize_for_search(grapheme);
128        if chunk.is_empty() {
129            continue;
130        }
131        let orig_end = orig_byte + grapheme.len();
132        for _ in chunk.bytes() {
133            norm_start_map.push(orig_byte);
134            norm_end_map.push(orig_end);
135        }
136        normalized.push_str(&chunk);
137    }
138    if normalized.is_empty() {
139        return Vec::new();
140    }
141
142    let mut results = Vec::new();
143    let mut start = 0;
144    while let Some(pos) = normalized[start..].find(&needle_norm) {
145        let norm_start = start + pos;
146        let norm_end = norm_start + needle_norm.len();
147
148        let orig_start = norm_start_map
149            .get(norm_start)
150            .copied()
151            .unwrap_or(haystack.len());
152        let orig_end = if norm_end == 0 {
153            orig_start
154        } else {
155            norm_end_map
156                .get(norm_end - 1)
157                .copied()
158                .unwrap_or(haystack.len())
159        };
160
161        // Avoid duplicate ranges when a single grapheme expands into multiple
162        // normalized bytes (e.g., fullwidth "A" -> "a" under NFKC).
163        if results
164            .last()
165            .is_some_and(|r: &SearchResult| r.range.start == orig_start && r.range.end == orig_end)
166        {
167            start = norm_end;
168            continue;
169        }
170
171        results.push(SearchResult::new(orig_start, orig_end));
172        start = norm_end;
173    }
174    results
175}
176
177/// Normalization-aware search: treats composed and decomposed forms as equal.
178///
179/// Normalizes both haystack and needle to the given form before searching,
180/// then maps results back to original string positions using grapheme clusters.
181#[cfg(feature = "normalization")]
182#[must_use]
183pub fn search_normalized(
184    haystack: &str,
185    needle: &str,
186    form: crate::normalization::NormForm,
187) -> Vec<SearchResult> {
188    use crate::normalization::normalize;
189    use unicode_segmentation::UnicodeSegmentation;
190
191    if needle.is_empty() {
192        return Vec::new();
193    }
194    let needle_norm = normalize(needle, form);
195    if needle_norm.is_empty() {
196        return Vec::new();
197    }
198
199    // Normalize per grapheme cluster to maintain position tracking.
200    // Grapheme clusters are the correct unit because normalization
201    // operates within grapheme boundaries.
202    let mut norm_start_map: Vec<usize> = Vec::new();
203    let mut norm_end_map: Vec<usize> = Vec::new();
204    let mut normalized = String::new();
205
206    for (orig_byte, grapheme) in haystack.grapheme_indices(true) {
207        let chunk = normalize(grapheme, form);
208        if chunk.is_empty() {
209            continue;
210        }
211        let orig_end = orig_byte + grapheme.len();
212        for _ in chunk.bytes() {
213            norm_start_map.push(orig_byte);
214            norm_end_map.push(orig_end);
215        }
216        normalized.push_str(&chunk);
217    }
218    if normalized.is_empty() {
219        return Vec::new();
220    }
221
222    let mut results = Vec::new();
223    let mut start = 0;
224    while let Some(pos) = normalized[start..].find(&needle_norm) {
225        let norm_start = start + pos;
226        let norm_end = norm_start + needle_norm.len();
227
228        let orig_start = norm_start_map
229            .get(norm_start)
230            .copied()
231            .unwrap_or(haystack.len());
232        let orig_end = if norm_end == 0 {
233            orig_start
234        } else {
235            norm_end_map
236                .get(norm_end - 1)
237                .copied()
238                .unwrap_or(haystack.len())
239        };
240
241        if results
242            .last()
243            .is_some_and(|r: &SearchResult| r.range.start == orig_start && r.range.end == orig_end)
244        {
245            start = norm_end;
246            continue;
247        }
248
249        results.push(SearchResult::new(orig_start, orig_end));
250        start = norm_end;
251    }
252    results
253}
254
255#[cfg(test)]
256mod tests {
257    use super::*;
258
259    // ==========================================================
260    // Exact search
261    // ==========================================================
262
263    #[test]
264    fn exact_basic() {
265        let results = search_exact("hello world hello", "hello");
266        assert_eq!(results.len(), 2);
267        assert_eq!(results[0].range, 0..5);
268        assert_eq!(results[1].range, 12..17);
269    }
270
271    #[test]
272    fn exact_no_match() {
273        let results = search_exact("hello world", "xyz");
274        assert!(results.is_empty());
275    }
276
277    #[test]
278    fn exact_empty_needle() {
279        let results = search_exact("hello", "");
280        assert!(results.is_empty());
281    }
282
283    #[test]
284    fn exact_empty_haystack() {
285        let results = search_exact("", "hello");
286        assert!(results.is_empty());
287    }
288
289    #[test]
290    fn exact_needle_equals_haystack() {
291        let results = search_exact("hello", "hello");
292        assert_eq!(results.len(), 1);
293        assert_eq!(results[0].range, 0..5);
294    }
295
296    #[test]
297    fn exact_needle_longer() {
298        let results = search_exact("hi", "hello");
299        assert!(results.is_empty());
300    }
301
302    #[test]
303    fn exact_adjacent_matches() {
304        let results = search_exact("aaa", "a");
305        assert_eq!(results.len(), 3);
306    }
307
308    #[test]
309    fn exact_text_extraction() {
310        let haystack = "foo bar baz";
311        let results = search_exact(haystack, "bar");
312        assert_eq!(results[0].text(haystack), "bar");
313    }
314
315    #[test]
316    fn exact_unicode() {
317        let results = search_exact("café résumé café", "café");
318        assert_eq!(results.len(), 2);
319    }
320
321    #[test]
322    fn exact_cjk() {
323        let results = search_exact("你好世界你好", "你好");
324        assert_eq!(results.len(), 2);
325    }
326
327    // ==========================================================
328    // Overlapping search
329    // ==========================================================
330
331    #[test]
332    fn overlapping_basic() {
333        let results = search_exact_overlapping("aaa", "aa");
334        assert_eq!(results.len(), 2);
335        assert_eq!(results[0].range, 0..2);
336        assert_eq!(results[1].range, 1..3);
337    }
338
339    #[test]
340    fn overlapping_no_overlap() {
341        let results = search_exact_overlapping("abcabc", "abc");
342        assert_eq!(results.len(), 2);
343    }
344
345    #[test]
346    fn overlapping_empty_needle() {
347        let results = search_exact_overlapping("abc", "");
348        assert!(results.is_empty());
349    }
350
351    // ==========================================================
352    // ASCII case-insensitive search
353    // ==========================================================
354
355    #[test]
356    fn ascii_ci_basic() {
357        let results = search_ascii_case_insensitive("Hello World HELLO", "hello");
358        assert_eq!(results.len(), 2);
359    }
360
361    #[test]
362    fn ascii_ci_mixed_case() {
363        let results = search_ascii_case_insensitive("FoO BaR fOo", "foo");
364        assert_eq!(results.len(), 2);
365    }
366
367    #[test]
368    fn ascii_ci_no_match() {
369        let results = search_ascii_case_insensitive("hello", "xyz");
370        assert!(results.is_empty());
371    }
372
373    // ==========================================================
374    // Valid range property tests
375    // ==========================================================
376
377    #[test]
378    fn results_have_valid_ranges() {
379        let test_cases = [
380            ("hello world", "o"),
381            ("aaaa", "aa"),
382            ("", "x"),
383            ("x", ""),
384            ("café", "é"),
385            ("🌍 world 🌍", "🌍"),
386        ];
387        for (haystack, needle) in test_cases {
388            let results = search_exact(haystack, needle);
389            for r in &results {
390                assert!(
391                    r.range.start <= r.range.end,
392                    "Invalid range for '{needle}' in '{haystack}'"
393                );
394                assert!(
395                    r.range.end <= haystack.len(),
396                    "Out of bounds for '{needle}' in '{haystack}'"
397                );
398                assert!(
399                    haystack.is_char_boundary(r.range.start),
400                    "Not char boundary at start"
401                );
402                assert!(
403                    haystack.is_char_boundary(r.range.end),
404                    "Not char boundary at end"
405                );
406            }
407        }
408    }
409
410    #[test]
411    fn emoji_search() {
412        let results = search_exact("hello 🌍 world 🌍 end", "🌍");
413        assert_eq!(results.len(), 2);
414        for r in &results {
415            assert_eq!(&"hello 🌍 world 🌍 end"[r.range.clone()], "🌍");
416        }
417    }
418}
419
420#[cfg(all(test, feature = "normalization"))]
421mod normalization_tests {
422    use super::*;
423
424    #[test]
425    fn case_insensitive_unicode() {
426        // Case-insensitive search finds "Strasse" (literal match in haystack)
427        // Note: ß does NOT fold to ss with to_lowercase(); this tests the literal match
428        let results = search_case_insensitive("Straße Strasse", "strasse");
429        assert!(
430            !results.is_empty(),
431            "Should find literal case-insensitive match"
432        );
433    }
434
435    #[test]
436    fn case_insensitive_expansion_range_maps_to_grapheme() {
437        // Test that grapheme boundaries are preserved in results
438        // Note: ß does NOT case-fold to ss (that would require Unicode case folding)
439        let haystack = "STRAßE";
440        let results = search_case_insensitive(haystack, "straße");
441        assert_eq!(results.len(), 1);
442        let result = &results[0];
443        assert_eq!(result.text(haystack), "STRAßE");
444        assert!(haystack.is_char_boundary(result.range.start));
445        assert!(haystack.is_char_boundary(result.range.end));
446    }
447
448    #[test]
449    fn case_insensitive_accented() {
450        let results = search_case_insensitive("CAFÉ café Café", "café");
451        // All three should match
452        assert_eq!(results.len(), 3);
453    }
454
455    #[test]
456    fn case_insensitive_empty() {
457        let results = search_case_insensitive("hello", "");
458        assert!(results.is_empty());
459    }
460
461    #[test]
462    fn case_insensitive_fullwidth() {
463        // Fullwidth "HELLO" should match "hello" under NFKC normalization
464        let results = search_case_insensitive("\u{FF28}\u{FF25}\u{FF2C}\u{FF2C}\u{FF2F}", "hello");
465        assert!(!results.is_empty(), "Fullwidth should match via NFKC");
466    }
467
468    #[test]
469    fn normalized_composed_vs_decomposed() {
470        use crate::normalization::NormForm;
471        // Search for composed é in text with decomposed e+combining acute
472        let haystack = "caf\u{0065}\u{0301}"; // e + combining acute
473        let needle = "caf\u{00E9}"; // precomposed é
474        let results = search_normalized(haystack, needle, NormForm::Nfc);
475        assert_eq!(results.len(), 1, "Should find NFC-equivalent match");
476    }
477
478    #[test]
479    fn normalized_no_false_positive() {
480        use crate::normalization::NormForm;
481        let results = search_normalized("hello", "world", NormForm::Nfc);
482        assert!(results.is_empty());
483    }
484
485    #[test]
486    fn normalized_result_ranges_valid() {
487        use crate::normalization::NormForm;
488        let haystack = "café résumé café";
489        let needle = "café";
490        let results = search_normalized(haystack, needle, NormForm::Nfc);
491        for r in &results {
492            assert!(r.range.start <= r.range.end);
493            assert!(r.range.end <= haystack.len());
494            assert!(haystack.is_char_boundary(r.range.start));
495            assert!(haystack.is_char_boundary(r.range.end));
496        }
497    }
498
499    #[test]
500    fn case_insensitive_result_ranges_valid() {
501        let haystack = "Hello WORLD hello";
502        let results = search_case_insensitive(haystack, "hello");
503        for r in &results {
504            assert!(r.range.start <= r.range.end);
505            assert!(r.range.end <= haystack.len());
506            assert!(haystack.is_char_boundary(r.range.start));
507            assert!(haystack.is_char_boundary(r.range.end));
508        }
509    }
510}