ib_unicode/case/
mod.rs

1/*!
2## Case folding
3> Case folding, i.e. mapping strings to a canonical form for string comparison, typically results in lowercase characters; however, characters in the Cherokee script resolve to uppercase characters. Case folding isn't context-, language-, or locale-sensitive; however, you can specify whether to use mappings for languages like Turkish.
4
Currently, only simple [case folding](https://www.unicode.org/Public/16.0.0/ucd/CaseFolding.txt) is supported. Simple case folding does not handle the special cases where a letter folds to multiple characters; for example, `Maße` cannot match `MASSE`.
6
7The API is [`CharCaseExt::to_simple_fold_case()`] and [`StrCaseExt::to_simple_fold_case()`], for example:
8```
9use ib_unicode::case::StrCaseExt;
10
11assert_eq!("βίος".to_simple_fold_case(), "βίοσ");
12assert_eq!("Βίοσ".to_simple_fold_case(), "βίοσ");
13assert_eq!("ΒΊΟΣ".to_simple_fold_case(), "βίοσ");
14```
15
16- Unicode version: 16.0.0.
17- Performance: The default implementation uses the same algorithm as the `unicase` crate, which is compact but a bit slow, especially on miss paths. You can enable the `perf-case-fold` feature to use a faster algorithm.
18
19Simple case folding is also used by the [`regex`](https://docs.rs/regex/) crate.
20
21## Mono lowercase
22The "mono lowercase" mentioned in this module refers to the single-char lowercase mapping of a Unicode character. This is different from Unicode's [simple case folding](#case-folding) in that it always results in lowercase characters, and does not normalize different lower cases of a character to the same one (e.g. `σ` and `ς` are kept).
23
24<!-- except that some full/special case foldings are also added but only kept the first character (currently only `İ`). -->
25
26For example:
27```
28use ib_unicode::case::StrCaseExt;
29
30assert_eq!("βίος".to_mono_lowercase(), "βίος");
31assert_eq!("Βίοσ".to_mono_lowercase(), "βίοσ");
32assert_eq!("ΒΊΟΣ".to_mono_lowercase(), "βίοσ");
33```
34
35- Unicode version: 16.0.0.
36- Compared to [`char::to_lowercase()`]/[`str::to_lowercase()`] in `std`: the same, except that `İ` is mapped to `i` instead of `i\u{307}`.
37  - `Σ` always maps to `σ` instead of conditionally `ς`, unlike in `str::to_lowercase()`. This may be changed if the need arises.
38  - [`to_mono_lowercase()`](CharCaseExt::to_mono_lowercase) is also much faster if `perf-case-map` feature is enabled.
- Compared to simple case folding: Besides normalization, the covered characters are basically the same, except that simple case folding does not cover `İ` but does cover the following characters:
40  - ΐ, ΐ
41  - ΰ, ΰ
42  - ſt, st
43*/
44
45use crate::Sealed;
46
47#[cfg(feature = "case-fold")]
48mod fold;
49#[cfg(feature = "perf-case-map")]
50mod map;
51
/// Case-mapping helpers for a single [`char`].
///
/// The trait has the crate's `Sealed` trait as a supertrait.
pub trait CharCaseExt: Sealed {
    /// Returns the single-char ("mono") lowercase mapping of this character.
    ///
    /// The only multi-char lowercase mapping is 'İ' -> "i\u{307}"; the trailing '\u{307}' is simply dropped.
    ///
    /// See [mono lowercase](super::case#mono-lowercase) for details.
    fn to_mono_lowercase(self) -> char;

    /// A convenience method for feature-gated case folding.
    ///
    /// If the `case-fold` feature is enabled, it uses simple case folding; otherwise it uses `to_ascii_lowercase()`.
    fn to_simple_or_ascii_fold_case(self) -> char;

    /// Returns the simple case folding of this character.
    ///
    /// Only available when the `case-fold` feature is enabled.
    ///
    /// See [case folding](super::case#case-folding) for details.
    #[cfg(feature = "case-fold")]
    fn to_simple_fold_case(self) -> char;

    /// Simple case folding that always goes through the `fold::unicase` implementation
    /// (the one [`to_simple_fold_case()`](Self::to_simple_fold_case) uses when `perf-case-fold` is disabled).
    ///
    /// Only available when the `bench` feature is enabled.
    ///
    /// See [case folding](super::case#case-folding) for details.
    #[cfg(feature = "bench")]
    fn to_simple_fold_case_unicase(self) -> char;

    /// Simple case folding that always goes through the `fold::map` implementation
    /// (the one [`to_simple_fold_case()`](Self::to_simple_fold_case) uses when `perf-case-fold` is enabled).
    ///
    /// Only available when the `bench` feature is enabled.
    ///
    /// See [case folding](super::case#case-folding) for details.
    #[cfg(feature = "bench")]
    fn to_simple_fold_case_map(self) -> char;
}
74
75impl CharCaseExt for char {
76    fn to_mono_lowercase(self) -> char {
77        #[cfg(not(feature = "perf-case-map"))]
78        return self.to_lowercase().next().unwrap();
79
80        // Optimize away the binary search
81        // Reduce total match time by ~37%
82        #[cfg(feature = "perf-case-map")]
83        map::to_mono_lowercase(self)
84    }
85
86    fn to_simple_or_ascii_fold_case(self) -> char {
87        #[cfg(not(feature = "case-fold"))]
88        return self.to_ascii_lowercase();
89        #[cfg(feature = "case-fold")]
90        self.to_simple_fold_case()
91    }
92
93    #[cfg(feature = "case-fold")]
94    fn to_simple_fold_case(self) -> char {
95        #[cfg(not(feature = "perf-case-fold"))]
96        return fold::unicase::fold(self);
97        #[cfg(feature = "perf-case-fold")]
98        fold::map::fold(self)
99    }
100
101    #[cfg(feature = "bench")]
102    fn to_simple_fold_case_unicase(self) -> char {
103        fold::unicase::fold(self)
104    }
105
106    #[cfg(feature = "bench")]
107    fn to_simple_fold_case_map(self) -> char {
108        fold::map::fold(self)
109    }
110}
111
/// Case-mapping helpers for string slices.
///
/// The `str` implementation applies the same-named [`CharCaseExt`] method to every `char`
/// and collects the results into a new `String`.
pub trait StrCaseExt: Sealed {
    /// Returns a new `String` with every character replaced by its mono lowercase mapping.
    ///
    /// See [mono lowercase](super::case#mono-lowercase) for details.
    fn to_mono_lowercase(&self) -> String;

    /// A convenience method for feature-gated case folding.
    ///
    /// If the `case-fold` feature is enabled, it uses simple case folding; otherwise it uses `to_ascii_lowercase()`.
    fn to_simple_or_ascii_fold_case(&self) -> String;

    /// Returns a new `String` with every character replaced by its simple case folding.
    ///
    /// Only available when the `case-fold` feature is enabled.
    ///
    /// See [case folding](super::case#case-folding) for details.
    #[cfg(feature = "case-fold")]
    fn to_simple_fold_case(&self) -> String;
}
124
125impl StrCaseExt for str {
126    fn to_mono_lowercase(&self) -> String {
127        self.chars().map(|c| c.to_mono_lowercase()).collect()
128    }
129
130    fn to_simple_or_ascii_fold_case(&self) -> String {
131        self.chars()
132            .map(|c| c.to_simple_or_ascii_fold_case())
133            .collect()
134    }
135
136    #[cfg(feature = "case-fold")]
137    fn to_simple_fold_case(&self) -> String {
138        self.chars().map(|c| c.to_simple_fold_case()).collect()
139    }
140}
141
// Everything in this module reads `map::tests::LOWERCASE_TABLE`, and `mod map` is only
// compiled with the `perf-case-map` feature, so the module is gated on that feature too.
// Without the extra gate, `cargo test` fails to compile when the feature is disabled.
#[cfg(all(test, feature = "perf-case-map"))]
mod tests {
    use std::collections::HashSet;

    use super::*;

    /// Collects every char that takes part in the mono lowercase mapping, as either a
    /// source or a target: the ASCII letters plus both sides of each `LOWERCASE_TABLE` entry.
    fn mono_set() -> HashSet<char> {
        let mut chars = HashSet::new();
        for upper in 'A'..='Z' {
            chars.insert(upper);
            chars.insert(upper.to_ascii_lowercase());
        }
        for (source, target) in map::tests::LOWERCASE_TABLE {
            chars.insert(*source);
            // A table value that is not a valid `char` is counted as 'i'.
            chars.insert(char::from_u32(*target).unwrap_or('i'));
        }
        chars
    }

    /// Prints size statistics of the mono lowercase mapping (no assertions).
    #[test]
    fn mono() {
        let mono = mono_set();
        println!("{} chars", mono.len());
        println!("{} upper chars", 26 + map::tests::LOWERCASE_TABLE.len());
    }
}
168
/// Exploratory statistics comparing the mono lowercase table with the simple case folding data.
/// The tests only print their findings; they contain no assertions.
///
/// The included data file is generated with:
///
/// ucd-generate case-folding-simple ucd-16.0.0 --chars --all-pairs > case-folding-simple-chars-all-pairs.rs
//
// Gating notes:
// - `test`: the module only contains `#[test]` functions and their helpers.
// - `perf-case-map`: the tests read `map::tests::LOWERCASE_TABLE`, and `mod map` only exists
//   with that feature.
#[cfg(all(
    test,
    not(feature = "doc"),
    feature = "_test_data",
    feature = "perf-case-map"
))]
mod tests_data {
    use std::collections::HashSet;

    // `map` is declared in the parent module and must be imported explicitly here.
    use super::map;

    include!("../../data/case-folding-simple-chars-all-pairs.rs");

    /// Converts a raw `LOWERCASE_TABLE` value to a char; a value that is not a valid
    /// `char` is counted as 'i'.
    fn lowercase_target(raw: u32) -> char {
        char::from_u32(raw).unwrap_or('i')
    }

    /// Collects every char that takes part in the mono lowercase mapping: the ASCII
    /// letters plus both sides of each `LOWERCASE_TABLE` entry.
    ///
    /// Defined locally because the equivalent helper in the sibling `tests` module is
    /// private to that module.
    fn mono_set() -> HashSet<char> {
        let mut chars = HashSet::new();
        for upper in 'A'..='Z' {
            chars.insert(upper);
            chars.insert(upper.to_ascii_lowercase());
        }
        for (source, target) in map::tests::LOWERCASE_TABLE {
            chars.insert(*source);
            chars.insert(lowercase_target(*target));
        }
        chars
    }

    /// Collects every char that appears in `CASE_FOLDING_SIMPLE`, as either a key or
    /// one of its mapped chars.
    fn regex_set() -> HashSet<char> {
        let mut chars = HashSet::new();
        for (c, maps) in CASE_FOLDING_SIMPLE {
            chars.insert(*c);
            chars.extend(maps.iter().copied());
        }
        chars
    }

    /// Prints how many chars the simple case folding data covers.
    #[test]
    fn regex() {
        let regex = regex_set();
        println!("{} chars", regex.len());
    }

    /// Prints the chars covered by the mono lowercase table but not by simple case folding.
    #[test]
    fn mono_sub_regex() {
        let regex = regex_set();

        let mut chars = HashSet::new();
        for (source, target) in map::tests::LOWERCASE_TABLE {
            if !regex.contains(source) {
                chars.insert(*source);
            }
            let target = lowercase_target(*target);
            if !regex.contains(&target) {
                chars.insert(target);
            }
        }
        println!("{} chars", chars.len());
        println!("{:?}", chars);
    }

    /// Prints the chars covered by simple case folding but not by the mono lowercase
    /// table, split by whether the entry maps to more than one char.
    #[test]
    fn regex_sub_mono() {
        let mono = mono_set();

        let mut chars = HashSet::new();
        let mut multicase = HashSet::new();
        for (c, maps) in CASE_FOLDING_SIMPLE {
            // Entries with several mapped chars are reported separately.
            let set = if maps.len() > 1 {
                &mut multicase
            } else {
                &mut chars
            };
            if !mono.contains(c) {
                set.insert(*c);
            }
            for c in maps.iter() {
                if !mono.contains(c) {
                    set.insert(*c);
                }
            }
        }
        println!("{} chars", chars.len());
        println!("{} multicase chars", multicase.len());
        println!("{:?}", chars);
        println!("{:?}", multicase);
    }
}
235        println!("{:?}", multicase);
236    }
237}