Skip to main content

simd_normalizer/
casefold.rs

//! Unicode simple case folding (CaseFolding.txt, status C+S).
//!
//! Provides character-level and string-level case folding for case-insensitive
//! matching. Supports both standard folding and a Turkish/Azerbaijani locale mode.
5
6use alloc::borrow::Cow;
7use alloc::string::String;
8
9use crate::tables;
10
/// Selects which set of case-folding rules is applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CaseFoldMode {
    /// Locale-independent Unicode case folding (CaseFolding.txt, statuses C and S).
    Standard,
    /// Case folding for the Turkish and Azerbaijani locales.
    ///
    /// Two capital letters are treated differently from the standard rules:
    /// - U+0049 (I) folds to U+0131 (ı) rather than U+0069 (i).
    /// - U+0130 (İ) folds to U+0069 (i) rather than following the standard mapping.
    Turkish,
}
23
24/// Fold a single character using simple case folding.
25///
26/// Returns the folded character, or the input character unchanged if no
27/// folding applies.
28#[inline]
29pub fn casefold_char(c: char, mode: CaseFoldMode) -> char {
30    // Turkish exceptions override the standard mapping.
31    if mode == CaseFoldMode::Turkish
32        && let Some(folded) = tables::turkish_casefold(c)
33    {
34        return folded;
35    }
36    tables::lookup_casefold(c).unwrap_or(c)
37}
38
39/// Fold a string using simple case folding.
40///
41/// Returns `Cow::Borrowed` if the string is already fully case-folded
42/// (no characters changed).
43pub fn casefold<'a>(input: &'a str, mode: CaseFoldMode) -> Cow<'a, str> {
44    if input.is_empty() {
45        return Cow::Borrowed(input);
46    }
47
48    // Quick scan: find first character that would change.
49    let mut scan_iter = input.char_indices();
50    let first_change = loop {
51        match scan_iter.next() {
52            None => return Cow::Borrowed(input),
53            Some((idx, ch)) => {
54                let folded = casefold_char(ch, mode);
55                if folded != ch {
56                    break idx;
57                }
58            },
59        }
60    };
61
62    // Build the output: copy unchanged prefix, then fold the rest.
63    let mut out = String::with_capacity(input.len());
64    out.push_str(&input[..first_change]);
65
66    for ch in input[first_change..].chars() {
67        out.push(casefold_char(ch, mode));
68    }
69
70    Cow::Owned(out)
71}
72
#[cfg(test)]
mod tests {
    use super::*;

    /// Fold a single character with the standard rules.
    fn std_fold(c: char) -> char {
        casefold_char(c, CaseFoldMode::Standard)
    }

    /// Fold a single character with the Turkish rules.
    fn tr_fold(c: char) -> char {
        casefold_char(c, CaseFoldMode::Turkish)
    }

    /// Report whether a fold result reused the input instead of allocating.
    fn is_borrowed(result: &Cow<'_, str>) -> bool {
        matches!(result, Cow::Borrowed(_))
    }

    // ---- Character-level tests ----

    #[test]
    fn fold_ascii_uppercase() {
        for (upper, lower) in [('A', 'a'), ('Z', 'z')] {
            assert_eq!(std_fold(upper), lower);
        }
    }

    #[test]
    fn fold_ascii_lowercase_unchanged() {
        for c in ['a', 'z'] {
            assert_eq!(std_fold(c), c);
        }
    }

    #[test]
    fn fold_digit_unchanged() {
        for c in ['0', '9'] {
            assert_eq!(std_fold(c), c);
        }
    }

    #[test]
    fn fold_latin_extended() {
        let cases = [
            ('\u{00C0}', '\u{00E0}'), // À → à
            ('\u{00D6}', '\u{00F6}'), // Ö → ö
        ];
        for (input, expected) in cases {
            assert_eq!(std_fold(input), expected);
        }
    }

    #[test]
    fn fold_greek() {
        let cases = [
            ('\u{0391}', '\u{03B1}'), // Α → α
            ('\u{03A3}', '\u{03C3}'), // Σ → σ
        ];
        for (input, expected) in cases {
            assert_eq!(std_fold(input), expected);
        }
    }

    #[test]
    fn fold_cyrillic() {
        // А (U+0410) → а (U+0430)
        assert_eq!(std_fold('\u{0410}'), '\u{0430}');
    }

    #[test]
    fn fold_micro_sign() {
        // MICRO SIGN µ (U+00B5) → GREEK SMALL LETTER MU μ (U+03BC)
        assert_eq!(std_fold('\u{00B5}'), '\u{03BC}');
    }

    #[test]
    fn fold_sharp_s() {
        // LATIN CAPITAL LETTER SHARP S ẞ (U+1E9E) → ß (U+00DF)
        assert_eq!(std_fold('\u{1E9E}'), '\u{00DF}');
    }

    // ---- Turkish mode ----

    #[test]
    fn fold_turkish_dotless_i() {
        // The same capital I folds differently depending on the mode:
        // standard gives i, Turkish gives dotless ı (U+0131).
        assert_eq!(std_fold('I'), 'i');
        assert_eq!(tr_fold('I'), '\u{0131}');
    }

    #[test]
    fn fold_turkish_dotted_capital_i() {
        // Turkish mode: İ (U+0130) → i
        assert_eq!(tr_fold('\u{0130}'), 'i');
    }

    #[test]
    fn fold_turkish_other_chars_unchanged() {
        // Letters other than the I variants fold exactly as in standard mode.
        for input in ['A', 'a'] {
            assert_eq!(tr_fold(input), 'a');
        }
    }

    // ---- String-level tests ----

    #[test]
    fn fold_string_ascii() {
        let folded = casefold("Hello World", CaseFoldMode::Standard);
        assert_eq!(folded, "hello world");
    }

    #[test]
    fn fold_string_already_folded() {
        let folded = casefold("hello world", CaseFoldMode::Standard);
        assert!(is_borrowed(&folded));
        assert_eq!(folded, "hello world");
    }

    #[test]
    fn fold_string_empty() {
        let folded = casefold("", CaseFoldMode::Standard);
        assert!(is_borrowed(&folded));
    }

    #[test]
    fn fold_string_mixed() {
        let folded = casefold("Ströme", CaseFoldMode::Standard);
        assert_eq!(folded, "ströme");
    }

    #[test]
    fn fold_string_turkish() {
        // The leading I becomes dotless ı in Turkish mode.
        let folded = casefold("Istanbul", CaseFoldMode::Turkish);
        assert_eq!(folded, "\u{0131}stanbul");
    }

    #[test]
    fn fold_string_all_ascii_lowercase() {
        // Nothing changes, so the input must be returned borrowed.
        let input = "abcdefghijklmnopqrstuvwxyz0123456789";
        let folded = casefold(input, CaseFoldMode::Standard);
        assert!(is_borrowed(&folded));
    }
}