// ftui_text/search.rs
#![forbid(unsafe_code)]

//! Unicode-aware text search utilities.
//!
//! Feature-gated behind the `normalization` feature flag for normalization-aware
//! and case-folded search. Basic exact search is always available.
//!
//! # Policy-Aware Search
//!
//! When the `normalization` feature is enabled, [`search_with_policy`] provides a
//! unified entry point that combines configurable normalization form, case folding,
//! and width measurement into a single [`SearchPolicy`]. Results include display
//! column offsets computed under the chosen [`WidthMode`].
//!
//! # Example
//! ```
//! use ftui_text::search::{SearchResult, search_exact};
//!
//! let results = search_exact("hello world hello", "hello");
//! assert_eq!(results.len(), 2);
//! assert_eq!(results[0].range, 0..5);
//! assert_eq!(results[1].range, 12..17);
//! ```
// Per-scalar width measurement; `width` vs `width_cjk` back the two
// `WidthMode` variants below.
use unicode_width::UnicodeWidthChar;
/// Unicode character width measurement mode for search results.
///
/// Controls how display column offsets are computed — in particular for East
/// Asian Ambiguous characters (box drawing, arrows, some symbols) whose width
/// differs between CJK and Western terminals.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum WidthMode {
    /// Standard Unicode width: EA Ambiguous characters are single-width (1 cell).
    #[default]
    Standard,

    /// CJK-aware width: EA Ambiguous characters are double-width (2 cells).
    CjkAmbiguousWide,
}
41
42impl WidthMode {
43    /// Compute the terminal display width of a single Unicode scalar.
44    #[inline]
45    #[must_use]
46    pub fn char_width(self, ch: char) -> usize {
47        let w = match self {
48            Self::Standard => UnicodeWidthChar::width(ch).unwrap_or(0),
49            Self::CjkAmbiguousWide => UnicodeWidthChar::width_cjk(ch).unwrap_or(0),
50        };
51        w.min(2)
52    }
53
54    /// Compute the display width of a string by summing per-character widths.
55    #[must_use]
56    pub fn str_width(self, s: &str) -> usize {
57        s.chars().map(|ch| self.char_width(ch)).sum()
58    }
59}
60
61/// Compute the display column at a byte offset in a string.
62///
63/// Returns the sum of character widths for all characters before the given byte
64/// offset, using the specified width mode.
65///
66/// # Panics
67/// Panics if `byte_offset` is not at a char boundary in `s`.
68#[must_use]
69pub fn display_col_at(s: &str, byte_offset: usize, mode: WidthMode) -> usize {
70    debug_assert!(s.is_char_boundary(byte_offset));
71    s[..byte_offset].chars().map(|ch| mode.char_width(ch)).sum()
72}
73
/// A single search match with its byte range in the source text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SearchResult {
    /// Byte offset range of the match in the source string.
    pub range: std::ops::Range<usize>,
}

impl SearchResult {
    /// Build a result covering bytes `start..end`.
    #[must_use]
    pub fn new(start: usize, end: usize) -> Self {
        let range = start..end;
        Self { range }
    }

    /// Borrow the matched slice out of `source`.
    ///
    /// # Panics
    /// Panics if the range is out of bounds for `source` or does not land on
    /// char boundaries.
    #[must_use]
    pub fn text<'a>(&self, source: &'a str) -> &'a str {
        &source[self.range.start..self.range.end]
    }
}
94
95/// Find all exact substring matches (byte-level, no normalization).
96///
97/// Returns non-overlapping matches from left to right.
98#[must_use]
99pub fn search_exact(haystack: &str, needle: &str) -> Vec<SearchResult> {
100    if needle.is_empty() {
101        return Vec::new();
102    }
103    let mut results = Vec::new();
104    let mut start = 0;
105    while let Some(pos) = haystack[start..].find(needle) {
106        let abs_pos = start + pos;
107        results.push(SearchResult::new(abs_pos, abs_pos + needle.len()));
108        start = abs_pos + needle.len();
109    }
110    results
111}
112
113/// Find all exact substring matches, allowing overlapping results.
114///
115/// Advances by one byte after each match start.
116#[must_use]
117pub fn search_exact_overlapping(haystack: &str, needle: &str) -> Vec<SearchResult> {
118    if needle.is_empty() {
119        return Vec::new();
120    }
121    let mut results = Vec::new();
122    let mut start = 0;
123    while let Some(pos) = haystack[start..].find(needle) {
124        let abs_pos = start + pos;
125        results.push(SearchResult::new(abs_pos, abs_pos + needle.len()));
126        // Advance by one char (not byte) to handle multi-byte chars correctly
127        start = abs_pos + 1;
128        // Ensure we're at a char boundary
129        while start < haystack.len() && !haystack.is_char_boundary(start) {
130            start += 1;
131        }
132    }
133    results
134}
135
136/// Case-insensitive search using simple ASCII lowering.
137///
138/// For full Unicode case folding, use [`search_case_insensitive`] with the
139/// `normalization` feature enabled.
140#[must_use]
141pub fn search_ascii_case_insensitive(haystack: &str, needle: &str) -> Vec<SearchResult> {
142    if needle.is_empty() {
143        return Vec::new();
144    }
145    let haystack_lower = haystack.to_ascii_lowercase();
146    let needle_lower = needle.to_ascii_lowercase();
147    let mut results = Vec::new();
148    let mut start = 0;
149    while let Some(pos) = haystack_lower[start..].find(&needle_lower) {
150        let abs_pos = start + pos;
151        results.push(SearchResult::new(abs_pos, abs_pos + needle.len()));
152        start = abs_pos + needle.len();
153    }
154    results
155}
156
/// Case-insensitive search using full Unicode case folding.
///
/// Normalizes both haystack and needle via
/// `crate::normalization::normalize_for_search` (NFKC + lowercase), then maps
/// match positions in the normalized text back to byte ranges in the original
/// string.
///
/// Matching is done per grapheme cluster, so a match that lands inside a
/// grapheme's normalized expansion still maps to the full grapheme's range in
/// the original haystack.
#[cfg(feature = "normalization")]
#[must_use]
pub fn search_case_insensitive(haystack: &str, needle: &str) -> Vec<SearchResult> {
    if needle.is_empty() {
        return Vec::new();
    }
    let needle_norm = crate::normalization::normalize_for_search(needle);
    if needle_norm.is_empty() {
        return Vec::new();
    }

    use unicode_segmentation::UnicodeSegmentation;

    // Build a byte-level mapping from normalized offsets back to original
    // offsets. For every byte a grapheme contributes to `normalized`, record
    // the start and end of the grapheme it came from, so any slice of
    // normalized bytes maps to a whole-grapheme range in the original string.
    let mut norm_start_map: Vec<usize> = Vec::new();
    let mut norm_end_map: Vec<usize> = Vec::new();
    let mut normalized = String::new();

    for (orig_byte, grapheme) in haystack.grapheme_indices(true) {
        let chunk = crate::normalization::normalize_for_search(grapheme);
        if chunk.is_empty() {
            continue;
        }
        let orig_end = orig_byte + grapheme.len();
        for _ in chunk.bytes() {
            norm_start_map.push(orig_byte);
            norm_end_map.push(orig_end);
        }
        normalized.push_str(&chunk);
    }
    if normalized.is_empty() {
        return Vec::new();
    }

    let mut results = Vec::new();
    let mut start = 0;
    while let Some(pos) = normalized[start..].find(&needle_norm) {
        let norm_start = start + pos;
        // `needle_norm` is non-empty (checked above), so `norm_end >= 1` and
        // the `norm_end - 1` below cannot underflow.
        let norm_end = norm_start + needle_norm.len();

        let orig_start = norm_start_map
            .get(norm_start)
            .copied()
            .unwrap_or(haystack.len());
        let orig_end = norm_end_map
            .get(norm_end - 1)
            .copied()
            .unwrap_or(haystack.len());

        // Avoid duplicate ranges when a single grapheme expands into multiple
        // normalized bytes (e.g., fullwidth "Ａ" -> "a" under NFKC).
        if results
            .last()
            .is_some_and(|r: &SearchResult| r.range.start == orig_start && r.range.end == orig_end)
        {
            start = norm_end;
            continue;
        }

        results.push(SearchResult::new(orig_start, orig_end));
        start = norm_end;
    }
    results
}
232
/// Normalization-aware search: treats composed and decomposed forms as equal.
///
/// Normalizes both haystack and needle to the given `form` before searching,
/// then maps match positions back to byte ranges in the original string using
/// grapheme clusters.
#[cfg(feature = "normalization")]
#[must_use]
pub fn search_normalized(
    haystack: &str,
    needle: &str,
    form: crate::normalization::NormForm,
) -> Vec<SearchResult> {
    use crate::normalization::normalize;
    use unicode_segmentation::UnicodeSegmentation;

    if needle.is_empty() {
        return Vec::new();
    }
    let needle_norm = normalize(needle, form);
    if needle_norm.is_empty() {
        return Vec::new();
    }

    // Normalize per grapheme cluster so normalized byte offsets can be mapped
    // back to original offsets; normalization operates within grapheme
    // boundaries, so graphemes are the correct unit. For each byte a grapheme
    // contributes to `normalized`, record the grapheme's original start and
    // end offsets.
    let mut norm_start_map: Vec<usize> = Vec::new();
    let mut norm_end_map: Vec<usize> = Vec::new();
    let mut normalized = String::new();

    for (orig_byte, grapheme) in haystack.grapheme_indices(true) {
        let chunk = normalize(grapheme, form);
        if chunk.is_empty() {
            continue;
        }
        let orig_end = orig_byte + grapheme.len();
        for _ in chunk.bytes() {
            norm_start_map.push(orig_byte);
            norm_end_map.push(orig_end);
        }
        normalized.push_str(&chunk);
    }
    if normalized.is_empty() {
        return Vec::new();
    }

    let mut results = Vec::new();
    let mut start = 0;
    while let Some(pos) = normalized[start..].find(&needle_norm) {
        let norm_start = start + pos;
        // `needle_norm` is non-empty (checked above), so `norm_end >= 1` and
        // the `norm_end - 1` below cannot underflow.
        let norm_end = norm_start + needle_norm.len();

        let orig_start = norm_start_map
            .get(norm_start)
            .copied()
            .unwrap_or(haystack.len());
        let orig_end = norm_end_map
            .get(norm_end - 1)
            .copied()
            .unwrap_or(haystack.len());

        // Skip duplicate ranges produced when one grapheme expands into
        // several normalized bytes.
        if results
            .last()
            .is_some_and(|r: &SearchResult| r.range.start == orig_start && r.range.end == orig_end)
        {
            start = norm_end;
            continue;
        }

        results.push(SearchResult::new(orig_start, orig_end));
        start = norm_end;
    }
    results
}
310
311// =============================================================================
312// Policy-aware search
313// =============================================================================
314
/// Search policy controlling normalization, case folding, and width measurement.
///
/// Bundles the three knobs a terminal search feature needs:
/// 1. Which Unicode normalization form to apply before matching.
/// 2. Whether to fold case (Unicode-aware lowercase after normalization).
/// 3. Which width mode to use for computing display column offsets in results.
///
/// # Presets
///
/// | Preset | Norm | Case | Width |
/// |--------|------|------|-------|
/// | [`STANDARD`](Self::STANDARD) | NFKC | insensitive | Standard |
/// | [`CJK`](Self::CJK) | NFKC | insensitive | CjkAmbiguousWide |
/// | [`EXACT_NFC`](Self::EXACT_NFC) | NFC | sensitive | Standard |
#[cfg(feature = "normalization")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SearchPolicy {
    /// Normalization form applied to both haystack and needle before matching.
    pub norm_form: crate::normalization::NormForm,
    /// If `true`, apply Unicode lowercasing (`str::to_lowercase`) after
    /// normalization. NOTE(review): this is simple lowercasing, not full
    /// Unicode case folding — e.g. `ß` does not match `ss`.
    pub case_insensitive: bool,
    /// Width mode for computing [`PolicySearchResult::col_start`] / [`PolicySearchResult::col_end`].
    pub width_mode: WidthMode,
}
339
#[cfg(feature = "normalization")]
impl SearchPolicy {
    /// Default Western terminal preset: NFKC + case-insensitive + standard
    /// width (EA Ambiguous characters occupy one cell).
    pub const STANDARD: Self = Self {
        norm_form: crate::normalization::NormForm::Nfkc,
        case_insensitive: true,
        width_mode: WidthMode::Standard,
    };

    /// CJK terminal preset: NFKC + case-insensitive + CJK ambiguous-wide
    /// (EA Ambiguous characters occupy two cells).
    pub const CJK: Self = Self {
        norm_form: crate::normalization::NormForm::Nfkc,
        case_insensitive: true,
        width_mode: WidthMode::CjkAmbiguousWide,
    };

    /// Exact NFC matching: NFC + case-sensitive + standard width.
    pub const EXACT_NFC: Self = Self {
        norm_form: crate::normalization::NormForm::Nfc,
        case_insensitive: false,
        width_mode: WidthMode::Standard,
    };
}
363
/// A search result enriched with display-column offsets.
///
/// Extends [`SearchResult`] with terminal column positions computed under the
/// [`WidthMode`] specified by the [`SearchPolicy`].
#[cfg(feature = "normalization")]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PolicySearchResult {
    /// Byte offset range of the match in the source string.
    pub range: std::ops::Range<usize>,
    /// Display column (0-indexed) where the match starts.
    pub col_start: usize,
    /// Display column (0-indexed) where the match ends (exclusive).
    pub col_end: usize,
}

#[cfg(feature = "normalization")]
impl PolicySearchResult {
    /// Borrow the matched slice out of `source`.
    #[must_use]
    pub fn text<'a>(&self, source: &'a str) -> &'a str {
        &source[self.range.start..self.range.end]
    }

    /// Number of terminal cells the match occupies (`col_end - col_start`).
    #[must_use]
    pub fn display_width(&self) -> usize {
        self.col_end - self.col_start
    }
}
393
/// Search with explicit policy controlling normalization, case folding, and
/// width measurement.
///
/// This is the unified entry point for policy-aware search. It:
/// 1. Normalizes both haystack and needle to [`SearchPolicy::norm_form`].
/// 2. Optionally lowercases both (when [`SearchPolicy::case_insensitive`]).
/// 3. Finds all non-overlapping matches, mapping positions back to the
///    original haystack via grapheme clusters.
/// 4. Computes display column offsets using [`SearchPolicy::width_mode`].
///
/// # Example
/// ```
/// use ftui_text::search::{SearchPolicy, search_with_policy};
///
/// let results = search_with_policy("Hello World", "hello", &SearchPolicy::STANDARD);
/// assert_eq!(results.len(), 1);
/// assert_eq!(results[0].col_start, 0);
/// assert_eq!(results[0].col_end, 5);
/// ```
#[cfg(feature = "normalization")]
#[must_use]
pub fn search_with_policy(
    haystack: &str,
    needle: &str,
    policy: &SearchPolicy,
) -> Vec<PolicySearchResult> {
    use crate::normalization::normalize;
    use unicode_segmentation::UnicodeSegmentation;

    if needle.is_empty() {
        return Vec::new();
    }

    // One transform, applied to the needle and to every haystack grapheme:
    // normalization followed by optional Unicode lowercasing.
    let transform = |s: &str| {
        let normed = normalize(s, policy.norm_form);
        if policy.case_insensitive {
            normed.to_lowercase()
        } else {
            normed
        }
    };

    let needle_norm = transform(needle);
    if needle_norm.is_empty() {
        return Vec::new();
    }

    // Normalize haystack per grapheme cluster, recording for every normalized
    // byte the start/end offsets of the grapheme it came from, so matches map
    // back to whole-grapheme ranges in the original string.
    let mut norm_start_map: Vec<usize> = Vec::new();
    let mut norm_end_map: Vec<usize> = Vec::new();
    let mut normalized = String::new();

    for (orig_byte, grapheme) in haystack.grapheme_indices(true) {
        let chunk = transform(grapheme);
        if chunk.is_empty() {
            continue;
        }
        let orig_end = orig_byte + grapheme.len();
        for _ in chunk.bytes() {
            norm_start_map.push(orig_byte);
            norm_end_map.push(orig_end);
        }
        normalized.push_str(&chunk);
    }
    if normalized.is_empty() {
        return Vec::new();
    }

    // Find matches in the normalized text and enrich with display columns.
    let mut results = Vec::new();
    let mut start = 0;
    while let Some(pos) = normalized[start..].find(&needle_norm) {
        let norm_start = start + pos;
        // `needle_norm` is non-empty (checked above), so `norm_end >= 1` and
        // the `norm_end - 1` below cannot underflow.
        let norm_end = norm_start + needle_norm.len();

        let orig_start = norm_start_map
            .get(norm_start)
            .copied()
            .unwrap_or(haystack.len());
        let orig_end = norm_end_map
            .get(norm_end - 1)
            .copied()
            .unwrap_or(haystack.len());

        // Deduplicate when a grapheme expansion produces duplicate ranges.
        if results.last().is_some_and(|r: &PolicySearchResult| {
            r.range.start == orig_start && r.range.end == orig_end
        }) {
            start = norm_end;
            continue;
        }

        let col_start = display_col_at(haystack, orig_start, policy.width_mode);
        let col_end = display_col_at(haystack, orig_end, policy.width_mode);

        results.push(PolicySearchResult {
            range: orig_start..orig_end,
            col_start,
            col_end,
        });
        start = norm_end;
    }
    results
}
501
// Unit tests for the always-available (non-feature-gated) search API.
#[cfg(test)]
mod tests {
    use super::*;

    // ==========================================================
    // Exact search
    // ==========================================================

    #[test]
    fn exact_basic() {
        let results = search_exact("hello world hello", "hello");
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].range, 0..5);
        assert_eq!(results[1].range, 12..17);
    }

    #[test]
    fn exact_no_match() {
        let results = search_exact("hello world", "xyz");
        assert!(results.is_empty());
    }

    #[test]
    fn exact_empty_needle() {
        let results = search_exact("hello", "");
        assert!(results.is_empty());
    }

    #[test]
    fn exact_empty_haystack() {
        let results = search_exact("", "hello");
        assert!(results.is_empty());
    }

    #[test]
    fn exact_needle_equals_haystack() {
        let results = search_exact("hello", "hello");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].range, 0..5);
    }

    #[test]
    fn exact_needle_longer() {
        let results = search_exact("hi", "hello");
        assert!(results.is_empty());
    }

    #[test]
    fn exact_adjacent_matches() {
        // Non-overlapping semantics: back-to-back matches are all reported.
        let results = search_exact("aaa", "a");
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn exact_text_extraction() {
        let haystack = "foo bar baz";
        let results = search_exact(haystack, "bar");
        assert_eq!(results[0].text(haystack), "bar");
    }

    #[test]
    fn exact_unicode() {
        // Multi-byte (accented) characters: byte offsets must stay consistent.
        let results = search_exact("café résumé café", "café");
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn exact_cjk() {
        let results = search_exact("你好世界你好", "你好");
        assert_eq!(results.len(), 2);
    }

    // ==========================================================
    // Overlapping search
    // ==========================================================

    #[test]
    fn overlapping_basic() {
        let results = search_exact_overlapping("aaa", "aa");
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].range, 0..2);
        assert_eq!(results[1].range, 1..3);
    }

    #[test]
    fn overlapping_no_overlap() {
        let results = search_exact_overlapping("abcabc", "abc");
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn overlapping_empty_needle() {
        let results = search_exact_overlapping("abc", "");
        assert!(results.is_empty());
    }

    // ==========================================================
    // ASCII case-insensitive search
    // ==========================================================

    #[test]
    fn ascii_ci_basic() {
        let results = search_ascii_case_insensitive("Hello World HELLO", "hello");
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn ascii_ci_mixed_case() {
        let results = search_ascii_case_insensitive("FoO BaR fOo", "foo");
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn ascii_ci_no_match() {
        let results = search_ascii_case_insensitive("hello", "xyz");
        assert!(results.is_empty());
    }

    // ==========================================================
    // Valid range property tests
    // ==========================================================

    #[test]
    fn results_have_valid_ranges() {
        // Property check: every reported range is well-formed, in bounds,
        // and lands on char boundaries of the haystack.
        let test_cases = [
            ("hello world", "o"),
            ("aaaa", "aa"),
            ("", "x"),
            ("x", ""),
            ("café", "é"),
            ("🌍 world 🌍", "🌍"),
        ];
        for (haystack, needle) in test_cases {
            let results = search_exact(haystack, needle);
            for r in &results {
                assert!(
                    r.range.start <= r.range.end,
                    "Invalid range for '{needle}' in '{haystack}'"
                );
                assert!(
                    r.range.end <= haystack.len(),
                    "Out of bounds for '{needle}' in '{haystack}'"
                );
                assert!(
                    haystack.is_char_boundary(r.range.start),
                    "Not char boundary at start"
                );
                assert!(
                    haystack.is_char_boundary(r.range.end),
                    "Not char boundary at end"
                );
            }
        }
    }

    #[test]
    fn emoji_search() {
        // 4-byte UTF-8 scalars (emoji) must round-trip through byte ranges.
        let results = search_exact("hello 🌍 world 🌍 end", "🌍");
        assert_eq!(results.len(), 2);
        for r in &results {
            assert_eq!(&"hello 🌍 world 🌍 end"[r.range.clone()], "🌍");
        }
    }
}
666
// Tests for the `normalization`-gated search API (case folding and
// normalization-form matching).
#[cfg(all(test, feature = "normalization"))]
mod normalization_tests {
    use super::*;

    #[test]
    fn case_insensitive_unicode() {
        // Case-insensitive search finds "Strasse" (literal match in haystack)
        // Note: ß does NOT fold to ss with to_lowercase(); this tests the literal match
        let results = search_case_insensitive("Straße Strasse", "strasse");
        assert!(
            !results.is_empty(),
            "Should find literal case-insensitive match"
        );
    }

    #[test]
    fn case_insensitive_expansion_range_maps_to_grapheme() {
        // Test that grapheme boundaries are preserved in results
        // Note: ß does NOT case-fold to ss (that would require Unicode case folding)
        let haystack = "STRAßE";
        let results = search_case_insensitive(haystack, "straße");
        assert_eq!(results.len(), 1);
        let result = &results[0];
        assert_eq!(result.text(haystack), "STRAßE");
        assert!(haystack.is_char_boundary(result.range.start));
        assert!(haystack.is_char_boundary(result.range.end));
    }

    #[test]
    fn case_insensitive_accented() {
        let results = search_case_insensitive("CAFÉ café Café", "café");
        // All three should match
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn case_insensitive_empty() {
        let results = search_case_insensitive("hello", "");
        assert!(results.is_empty());
    }

    #[test]
    fn case_insensitive_fullwidth() {
        // Fullwidth "HELLO" should match "hello" under NFKC normalization
        let results = search_case_insensitive("\u{FF28}\u{FF25}\u{FF2C}\u{FF2C}\u{FF2F}", "hello");
        assert!(!results.is_empty(), "Fullwidth should match via NFKC");
    }

    #[test]
    fn normalized_composed_vs_decomposed() {
        use crate::normalization::NormForm;
        // Search for composed é in text with decomposed e+combining acute
        let haystack = "caf\u{0065}\u{0301}"; // e + combining acute
        let needle = "caf\u{00E9}"; // precomposed é
        let results = search_normalized(haystack, needle, NormForm::Nfc);
        assert_eq!(results.len(), 1, "Should find NFC-equivalent match");
    }

    #[test]
    fn normalized_no_false_positive() {
        use crate::normalization::NormForm;
        let results = search_normalized("hello", "world", NormForm::Nfc);
        assert!(results.is_empty());
    }

    #[test]
    fn normalized_result_ranges_valid() {
        use crate::normalization::NormForm;
        // Ranges reported against the ORIGINAL haystack must be in bounds and
        // on char boundaries even after normalization round-trips.
        let haystack = "café résumé café";
        let needle = "café";
        let results = search_normalized(haystack, needle, NormForm::Nfc);
        for r in &results {
            assert!(r.range.start <= r.range.end);
            assert!(r.range.end <= haystack.len());
            assert!(haystack.is_char_boundary(r.range.start));
            assert!(haystack.is_char_boundary(r.range.end));
        }
    }

    #[test]
    fn case_insensitive_result_ranges_valid() {
        let haystack = "Hello WORLD hello";
        let results = search_case_insensitive(haystack, "hello");
        for r in &results {
            assert!(r.range.start <= r.range.end);
            assert!(r.range.end <= haystack.len());
            assert!(haystack.is_char_boundary(r.range.start));
            assert!(haystack.is_char_boundary(r.range.end));
        }
    }
}
758
759// =============================================================================
760// Policy-aware search tests
761// =============================================================================
762
763#[cfg(all(test, feature = "normalization"))]
764mod policy_tests {
765    use super::*;
766    use crate::normalization::NormForm;
767
768    // โ”€โ”€ WidthMode basic tests โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
769
770    #[test]
771    fn width_mode_ascii_is_one() {
772        for ch in ['a', 'Z', '0', ' ', '~'] {
773            assert_eq!(WidthMode::Standard.char_width(ch), 1);
774            assert_eq!(WidthMode::CjkAmbiguousWide.char_width(ch), 1);
775        }
776    }
777
778    #[test]
779    fn width_mode_cjk_ideograph_is_two() {
780        for ch in ['ไธญ', 'ๅ›ฝ', 'ๅญ—'] {
781            assert_eq!(WidthMode::Standard.char_width(ch), 2);
782            assert_eq!(WidthMode::CjkAmbiguousWide.char_width(ch), 2);
783        }
784    }
785
786    #[test]
787    fn width_mode_ea_ambiguous_differs() {
788        // Box drawing: EA Ambiguous
789        for ch in ['โ”€', 'โ”‚', 'โ”Œ'] {
790            assert_eq!(WidthMode::Standard.char_width(ch), 1, "Standard: {ch:?}");
791            assert_eq!(
792                WidthMode::CjkAmbiguousWide.char_width(ch),
793                2,
794                "CjkWide: {ch:?}"
795            );
796        }
797        // Arrows: EA Ambiguous
798        for ch in ['โ†’', 'โ†', 'โ†‘', 'โ†“'] {
799            assert_eq!(WidthMode::Standard.char_width(ch), 1);
800            assert_eq!(WidthMode::CjkAmbiguousWide.char_width(ch), 2);
801        }
802        // Misc symbols: EA Ambiguous
803        for ch in ['ยฐ', 'ร—', 'ยฎ'] {
804            assert_eq!(WidthMode::Standard.char_width(ch), 1);
805            assert_eq!(WidthMode::CjkAmbiguousWide.char_width(ch), 2);
806        }
807    }
808
809    #[test]
810    fn width_mode_combining_marks_zero() {
811        for ch in ['\u{0300}', '\u{0301}', '\u{0302}'] {
812            assert_eq!(WidthMode::Standard.char_width(ch), 0);
813            assert_eq!(WidthMode::CjkAmbiguousWide.char_width(ch), 0);
814        }
815    }
816
817    #[test]
818    fn width_mode_str_width() {
819        assert_eq!(WidthMode::Standard.str_width("hello"), 5);
820        assert_eq!(WidthMode::Standard.str_width("ไธญๅ›ฝ"), 4);
821        assert_eq!(WidthMode::CjkAmbiguousWide.str_width("hello"), 5);
822        // Arrow + space + CJK: 2+1+2 = 5 in CJK mode
823        assert_eq!(WidthMode::CjkAmbiguousWide.str_width("โ†’ ไธญ"), 5);
824        assert_eq!(WidthMode::Standard.str_width("โ†’ ไธญ"), 4); // 1+1+2
825    }
826
827    #[test]
828    fn width_mode_default_is_standard() {
829        assert_eq!(WidthMode::default(), WidthMode::Standard);
830    }
831
832    // โ”€โ”€ display_col_at tests โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
833
834    #[test]
835    fn display_col_at_ascii() {
836        let s = "hello world";
837        assert_eq!(display_col_at(s, 0, WidthMode::Standard), 0);
838        assert_eq!(display_col_at(s, 5, WidthMode::Standard), 5);
839        assert_eq!(display_col_at(s, 11, WidthMode::Standard), 11);
840    }
841
842    #[test]
843    fn display_col_at_cjk() {
844        let s = "ไฝ ๅฅฝworld";
845        // "ไฝ " = 3 bytes, "ๅฅฝ" = 3 bytes, each 2 cells wide
846        assert_eq!(display_col_at(s, 0, WidthMode::Standard), 0);
847        assert_eq!(display_col_at(s, 3, WidthMode::Standard), 2); // after ไฝ 
848        assert_eq!(display_col_at(s, 6, WidthMode::Standard), 4); // after ๅฅฝ
849        assert_eq!(display_col_at(s, 11, WidthMode::Standard), 9); // after world
850    }
851
852    #[test]
853    fn display_col_at_ea_ambiguous_differs() {
854        let s = "โ”€โ†’text";
855        // โ”€ is 3 bytes, โ†’ is 3 bytes
856        let after_box = 3; // byte offset after โ”€
857        let after_arrow = 6; // byte offset after โ†’
858        assert_eq!(display_col_at(s, after_box, WidthMode::Standard), 1);
859        assert_eq!(display_col_at(s, after_box, WidthMode::CjkAmbiguousWide), 2);
860        assert_eq!(display_col_at(s, after_arrow, WidthMode::Standard), 2);
861        assert_eq!(
862            display_col_at(s, after_arrow, WidthMode::CjkAmbiguousWide),
863            4
864        );
865    }
866
867    #[test]
868    fn display_col_at_combining_marks() {
869        // "e" + combining acute: the combining mark has 0 width
870        let s = "e\u{0301}x";
871        // e = 1 byte, combining = 2 bytes, x = 1 byte
872        assert_eq!(display_col_at(s, 0, WidthMode::Standard), 0);
873        assert_eq!(display_col_at(s, 1, WidthMode::Standard), 1); // after e
874        assert_eq!(display_col_at(s, 3, WidthMode::Standard), 1); // after combining
875        assert_eq!(display_col_at(s, 4, WidthMode::Standard), 2); // after x
876    }
877
878    // โ”€โ”€ SearchPolicy preset tests โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
879
880    #[test]
881    fn policy_standard_preset() {
882        let p = SearchPolicy::STANDARD;
883        assert_eq!(p.norm_form, NormForm::Nfkc);
884        assert!(p.case_insensitive);
885        assert_eq!(p.width_mode, WidthMode::Standard);
886    }
887
888    #[test]
889    fn policy_cjk_preset() {
890        let p = SearchPolicy::CJK;
891        assert_eq!(p.norm_form, NormForm::Nfkc);
892        assert!(p.case_insensitive);
893        assert_eq!(p.width_mode, WidthMode::CjkAmbiguousWide);
894    }
895
896    #[test]
897    fn policy_exact_nfc_preset() {
898        let p = SearchPolicy::EXACT_NFC;
899        assert_eq!(p.norm_form, NormForm::Nfc);
900        assert!(!p.case_insensitive);
901        assert_eq!(p.width_mode, WidthMode::Standard);
902    }
903
904    // โ”€โ”€ search_with_policy: basic matching โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
905
906    #[test]
907    fn policy_search_basic_ascii() {
908        let results = search_with_policy("hello world", "hello", &SearchPolicy::STANDARD);
909        assert_eq!(results.len(), 1);
910        assert_eq!(results[0].range, 0..5);
911        assert_eq!(results[0].col_start, 0);
912        assert_eq!(results[0].col_end, 5);
913    }
914
915    #[test]
916    fn policy_search_case_insensitive() {
917        let results = search_with_policy("Hello WORLD hello", "hello", &SearchPolicy::STANDARD);
918        assert_eq!(results.len(), 2);
919        assert_eq!(results[0].text("Hello WORLD hello"), "Hello");
920        assert_eq!(results[1].text("Hello WORLD hello"), "hello");
921    }
922
923    #[test]
924    fn policy_search_case_sensitive() {
925        let results = search_with_policy("Hello hello", "hello", &SearchPolicy::EXACT_NFC);
926        assert_eq!(results.len(), 1);
927        assert_eq!(results[0].range.start, 6);
928    }
929
930    #[test]
931    fn policy_search_empty_needle() {
932        let results = search_with_policy("hello", "", &SearchPolicy::STANDARD);
933        assert!(results.is_empty());
934    }
935
936    #[test]
937    fn policy_search_empty_haystack() {
938        let results = search_with_policy("", "hello", &SearchPolicy::STANDARD);
939        assert!(results.is_empty());
940    }
941
942    #[test]
943    fn policy_search_no_match() {
944        let results = search_with_policy("hello", "world", &SearchPolicy::STANDARD);
945        assert!(results.is_empty());
946    }
947
948    // โ”€โ”€ search_with_policy: normalization alignment โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
949
950    #[test]
951    fn policy_search_composed_vs_decomposed() {
952        // Composed รฉ in haystack, decomposed e+acute in needle
953        let haystack = "caf\u{00E9}";
954        let needle = "caf\u{0065}\u{0301}";
955        let results = search_with_policy(haystack, needle, &SearchPolicy::EXACT_NFC);
956        assert_eq!(
957            results.len(),
958            1,
959            "NFC should equate composed and decomposed"
960        );
961    }
962
963    #[test]
964    fn policy_search_fullwidth_nfkc() {
965        // Fullwidth "HELLO" matches "hello" under NFKC + case-insensitive
966        let haystack = "\u{FF28}\u{FF25}\u{FF2C}\u{FF2C}\u{FF2F}";
967        let results = search_with_policy(haystack, "hello", &SearchPolicy::STANDARD);
968        assert!(!results.is_empty(), "Fullwidth should match via NFKC");
969    }
970
971    #[test]
972    fn policy_search_nfc_does_not_match_compatibility() {
973        // fi ligature (U+FB01) should NOT match "fi" under NFC (only NFKC decomposes it)
974        let haystack = "\u{FB01}le";
975        let results = search_with_policy(haystack, "file", &SearchPolicy::EXACT_NFC);
976        assert!(
977            results.is_empty(),
978            "NFC should not decompose compatibility chars"
979        );
980    }
981
982    #[test]
983    fn policy_search_nfkc_matches_compatibility() {
984        // fi ligature should match "file" under NFKC
985        let haystack = "\u{FB01}le";
986        let results = search_with_policy(haystack, "file", &SearchPolicy::STANDARD);
987        assert!(!results.is_empty(), "NFKC should decompose fi ligature");
988    }
989
990    // โ”€โ”€ search_with_policy: column offsets with CJK text โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
991
992    #[test]
993    fn policy_search_cjk_column_offsets() {
994        // "ไฝ ๅฅฝworldไฝ ๅฅฝ" โ€” search for "world"
995        let haystack = "ไฝ ๅฅฝworldไฝ ๅฅฝ";
996        let results = search_with_policy(haystack, "world", &SearchPolicy::STANDARD);
997        assert_eq!(results.len(), 1);
998        // "ไฝ " = 2 cols, "ๅฅฝ" = 2 cols โ†’ world starts at col 4
999        assert_eq!(results[0].col_start, 4);
1000        assert_eq!(results[0].col_end, 9); // 4 + 5 = 9
1001    }
1002
1003    #[test]
1004    fn policy_search_cjk_in_cjk() {
1005        // Search for CJK in CJK
1006        let haystack = "ไฝ ๅฅฝไธ–็•Œไฝ ๅฅฝ";
1007        let results = search_with_policy(haystack, "ไธ–็•Œ", &SearchPolicy::STANDARD);
1008        assert_eq!(results.len(), 1);
1009        assert_eq!(results[0].col_start, 4); // ไฝ (2) + ๅฅฝ(2) = 4
1010        assert_eq!(results[0].col_end, 8); // 4 + ไธ–(2) + ็•Œ(2) = 8
1011    }
1012
1013    // โ”€โ”€ search_with_policy: EA Ambiguous differs by width mode โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1014
1015    #[test]
1016    fn policy_search_ea_ambiguous_column_divergence() {
1017        // "โ†’hello" โ€” arrow is EA Ambiguous
1018        let haystack = "โ†’hello";
1019        let standard = search_with_policy(haystack, "hello", &SearchPolicy::STANDARD);
1020        let cjk = search_with_policy(haystack, "hello", &SearchPolicy::CJK);
1021
1022        assert_eq!(standard.len(), 1);
1023        assert_eq!(cjk.len(), 1);
1024        // Same byte range
1025        assert_eq!(standard[0].range, cjk[0].range);
1026        // Different column offsets
1027        assert_eq!(standard[0].col_start, 1); // โ†’ = 1 col in Standard
1028        assert_eq!(cjk[0].col_start, 2); // โ†’ = 2 cols in CJK
1029        assert_eq!(standard[0].col_end, 6);
1030        assert_eq!(cjk[0].col_end, 7);
1031    }
1032
1033    #[test]
1034    fn policy_search_box_drawing_column_divergence() {
1035        // "โ”€โ”€text" โ€” box drawing chars are EA Ambiguous
1036        let haystack = "โ”€โ”€text";
1037        let standard = search_with_policy(haystack, "text", &SearchPolicy::STANDARD);
1038        let cjk = search_with_policy(haystack, "text", &SearchPolicy::CJK);
1039
1040        assert_eq!(standard[0].col_start, 2); // 2 ร— 1 = 2
1041        assert_eq!(cjk[0].col_start, 4); // 2 ร— 2 = 4
1042    }
1043
1044    // โ”€โ”€ search_with_policy: combining marks and column offsets โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1045
1046    #[test]
1047    fn policy_search_combining_mark_offsets() {
1048        // "cafรฉ" with composed รฉ โ€” search for "fรฉ"
1049        let haystack = "cafรฉ";
1050        let results = search_with_policy(haystack, "fรฉ", &SearchPolicy::STANDARD);
1051        assert_eq!(results.len(), 1);
1052        // c=1, a=1 โ†’ fรฉ starts at col 2
1053        assert_eq!(results[0].col_start, 2);
1054        // f=1, รฉ=1 โ†’ ends at col 4
1055        assert_eq!(results[0].col_end, 4);
1056    }
1057
1058    #[test]
1059    fn policy_search_decomposed_combining_offsets() {
1060        // "cafe\u{0301}" โ€” decomposed รฉ
1061        let haystack = "cafe\u{0301}";
1062        let needle = "f\u{00E9}"; // composed fรฉ
1063        let results = search_with_policy(haystack, needle, &SearchPolicy::EXACT_NFC);
1064        assert_eq!(results.len(), 1);
1065        // c=1, a=1 โ†’ starts at col 2
1066        assert_eq!(results[0].col_start, 2);
1067        // f=1, e=1, combining=0 โ†’ ends at col 4
1068        assert_eq!(results[0].col_end, 4);
1069    }
1070
1071    // โ”€โ”€ search_with_policy: display_width on PolicySearchResult โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1072
1073    #[test]
1074    fn policy_result_display_width() {
1075        let haystack = "ไฝ ๅฅฝhello";
1076        let results = search_with_policy(haystack, "hello", &SearchPolicy::STANDARD);
1077        assert_eq!(results[0].display_width(), 5);
1078    }
1079
1080    #[test]
1081    fn policy_result_display_width_cjk_match() {
1082        let haystack = "abcไฝ ๅฅฝdef";
1083        let results = search_with_policy(haystack, "ไฝ ๅฅฝ", &SearchPolicy::STANDARD);
1084        assert_eq!(results[0].display_width(), 4); // 2 + 2
1085    }
1086
1087    // โ”€โ”€ search_with_policy: text extraction โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1088
1089    #[test]
1090    fn policy_result_text_extraction() {
1091        let haystack = "Hello World";
1092        let results = search_with_policy(haystack, "world", &SearchPolicy::STANDARD);
1093        assert_eq!(results[0].text(haystack), "World");
1094    }
1095
1096    // โ”€โ”€ search_with_policy: multiple matches โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1097
1098    #[test]
1099    fn policy_search_multiple_matches() {
1100        let haystack = "foo bar foo baz foo";
1101        let results = search_with_policy(haystack, "foo", &SearchPolicy::STANDARD);
1102        assert_eq!(results.len(), 3);
1103        assert_eq!(results[0].col_start, 0);
1104        assert_eq!(results[1].col_start, 8);
1105        assert_eq!(results[2].col_start, 16);
1106    }
1107
1108    // โ”€โ”€ search_with_policy: range validity invariant โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1109
1110    #[test]
1111    fn policy_search_ranges_always_valid() {
1112        let test_cases = [
1113            ("hello world", "o", SearchPolicy::STANDARD),
1114            ("CAFร‰ cafรฉ", "cafรฉ", SearchPolicy::STANDARD),
1115            ("ไฝ ๅฅฝไธ–็•Œ", "ไธ–", SearchPolicy::CJK),
1116            ("โ”€โ†’text", "text", SearchPolicy::CJK),
1117            ("\u{FB01}le", "file", SearchPolicy::STANDARD),
1118            ("e\u{0301}", "\u{00E9}", SearchPolicy::EXACT_NFC),
1119        ];
1120        for (haystack, needle, policy) in &test_cases {
1121            let results = search_with_policy(haystack, needle, policy);
1122            for r in &results {
1123                assert!(
1124                    r.range.start <= r.range.end,
1125                    "Invalid range for '{needle}' in '{haystack}'"
1126                );
1127                assert!(
1128                    r.range.end <= haystack.len(),
1129                    "Out of bounds for '{needle}' in '{haystack}'"
1130                );
1131                assert!(
1132                    haystack.is_char_boundary(r.range.start),
1133                    "Not char boundary at start"
1134                );
1135                assert!(
1136                    haystack.is_char_boundary(r.range.end),
1137                    "Not char boundary at end"
1138                );
1139                assert!(
1140                    r.col_start <= r.col_end,
1141                    "col_start > col_end for '{needle}' in '{haystack}'"
1142                );
1143            }
1144        }
1145    }
1146
1147    // โ”€โ”€ search_with_policy: column monotonicity โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1148
1149    #[test]
1150    fn policy_search_columns_monotonically_increasing() {
1151        let haystack = "aa bb aa cc aa";
1152        let results = search_with_policy(haystack, "aa", &SearchPolicy::STANDARD);
1153        assert_eq!(results.len(), 3);
1154        for w in results.windows(2) {
1155            assert!(
1156                w[0].col_end <= w[1].col_start,
1157                "Non-overlapping matches should have monotonically increasing columns"
1158            );
1159        }
1160    }
1161
1162    // โ”€โ”€ search_with_policy: custom policy construction โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1163
1164    #[test]
1165    fn policy_custom_nfd_case_sensitive() {
1166        let policy = SearchPolicy {
1167            norm_form: NormForm::Nfd,
1168            case_insensitive: false,
1169            width_mode: WidthMode::Standard,
1170        };
1171        // NFD decomposes รฉ โ†’ e+combining acute
1172        let haystack = "\u{00E9}"; // composed รฉ
1173        let needle = "e\u{0301}"; // decomposed
1174        let results = search_with_policy(haystack, needle, &policy);
1175        assert_eq!(results.len(), 1, "NFD should match decomposed forms");
1176    }
1177
1178    #[test]
1179    fn policy_custom_nfkd_case_insensitive() {
1180        let policy = SearchPolicy {
1181            norm_form: NormForm::Nfkd,
1182            case_insensitive: true,
1183            width_mode: WidthMode::CjkAmbiguousWide,
1184        };
1185        // fi ligature should match "FI" under NFKD + case-insensitive
1186        let haystack = "\u{FB01}";
1187        let results = search_with_policy(haystack, "FI", &policy);
1188        assert!(!results.is_empty(), "NFKD + CI should match fi ligature");
1189    }
1190
1191    // โ”€โ”€ search_with_policy: consistency with existing functions โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1192
1193    #[test]
1194    fn policy_search_agrees_with_search_case_insensitive() {
1195        let test_cases = [
1196            ("Hello World HELLO", "hello"),
1197            ("CAFร‰ cafรฉ Cafรฉ", "cafรฉ"),
1198            ("\u{FF28}\u{FF25}\u{FF2C}\u{FF2C}\u{FF2F}", "hello"),
1199        ];
1200        for (haystack, needle) in &test_cases {
1201            let old = search_case_insensitive(haystack, needle);
1202            let new = search_with_policy(haystack, needle, &SearchPolicy::STANDARD);
1203            assert_eq!(
1204                old.len(),
1205                new.len(),
1206                "Match count mismatch for '{needle}' in '{haystack}'"
1207            );
1208            for (o, n) in old.iter().zip(new.iter()) {
1209                assert_eq!(
1210                    o.range, n.range,
1211                    "Byte range mismatch for '{needle}' in '{haystack}'"
1212                );
1213            }
1214        }
1215    }
1216
1217    #[test]
1218    fn policy_search_agrees_with_search_normalized() {
1219        let haystack = "caf\u{0065}\u{0301} rรฉsumรฉ";
1220        let needle = "caf\u{00E9}";
1221        let old = search_normalized(haystack, needle, NormForm::Nfc);
1222        let new = search_with_policy(haystack, needle, &SearchPolicy::EXACT_NFC);
1223        assert_eq!(old.len(), new.len());
1224        for (o, n) in old.iter().zip(new.iter()) {
1225            assert_eq!(o.range, n.range);
1226        }
1227    }
1228}