ib_unicode/case/mod.rs
1/*!
2## Case folding
3> Case folding, i.e. mapping strings to a canonical form for string comparison, typically results in lowercase characters; however, characters in the Cherokee script resolve to uppercase characters. Case folding isn't context-, language-, or locale-sensitive; however, you can specify whether to use mappings for languages like Turkish.
4
Currently, only simple [case folding](https://www.unicode.org/Public/16.0.0/ucd/CaseFolding.txt) is supported. Simple case folding does not handle the special cases where one character folds to multiple characters; for example, `Maße` cannot match `MASSE`.
6
7The API is [`CharCaseExt::to_simple_fold_case()`] and [`StrCaseExt::to_simple_fold_case()`], for example:
8```
9use ib_unicode::case::StrCaseExt;
10
11assert_eq!("βίος".to_simple_fold_case(), "βίοσ");
12assert_eq!("Βίοσ".to_simple_fold_case(), "βίοσ");
13assert_eq!("ΒΊΟΣ".to_simple_fold_case(), "βίοσ");
14```
15
16- Unicode version: 16.0.0.
17- Performance: The default implementation uses the same algorithm as the `unicase` crate, which is compact but a bit slow, especially on miss paths. You can enable the `perf-case-fold` feature to use a faster algorithm.
18
19Simple case folding is also used by the [`regex`](https://docs.rs/regex/) crate.
20
21## Mono lowercase
22The "mono lowercase" mentioned in this module refers to the single-char lowercase mapping of a Unicode character. This is different from Unicode's [simple case folding](#case-folding) in that it always results in lowercase characters, and does not normalize different lower cases of a character to the same one (e.g. `σ` and `ς` are kept).
23
24<!-- except that some full/special case foldings are also added but only kept the first character (currently only `İ`). -->
25
26For example:
27```
28use ib_unicode::case::StrCaseExt;
29
30assert_eq!("βίος".to_mono_lowercase(), "βίος");
31assert_eq!("Βίοσ".to_mono_lowercase(), "βίοσ");
32assert_eq!("ΒΊΟΣ".to_mono_lowercase(), "βίοσ");
33```
34
35- Unicode version: 16.0.0.
36- Compared to [`char::to_lowercase()`]/[`str::to_lowercase()`] in `std`: the same, except that `İ` is mapped to `i` instead of `i\u{307}`.
37 - `Σ` always maps to `σ` instead of conditionally `ς`, unlike in `str::to_lowercase()`. This may be changed if the need arises.
38 - [`to_mono_lowercase()`](CharCaseExt::to_mono_lowercase) is also much faster if `perf-case-map` feature is enabled.
- Compared to simple case folding: Besides normalization, the covered characters are basically the same, except that simple case folding has no mapping for `İ` but does cover the following characters:
40 - ΐ, ΐ
41 - ΰ, ΰ
42 - ſt, st
43*/
44
45use crate::Sealed;
46
47#[cfg(feature = "case-fold")]
48mod fold;
49#[cfg(feature = "perf-case-map")]
50mod map;
51
52pub trait CharCaseExt: Sealed {
53 /// The only multi-char lowercase mapping is 'İ' -> "i\u{307}", we just ignore the '\u{307}'.
54 ///
55 /// See [mono lowercase](super::case#mono-lowercase) for details.
56 fn to_mono_lowercase(self) -> char;
57
58 /// A convenient method for feature-gated case folding.
59 /// If `case-fold` feature is enabled, it uses simple case folding; otherwise it uses `to_ascii_lowercase()`.
60 fn to_simple_or_ascii_fold_case(self) -> char;
61
62 /// See [case folding](super::case#case-folding) for details.
63 #[cfg(feature = "case-fold")]
64 fn to_simple_fold_case(self) -> char;
65
66 /// See [case folding](super::case#case-folding) for details.
67 #[cfg(feature = "bench")]
68 fn to_simple_fold_case_unicase(self) -> char;
69
70 /// See [case folding](super::case#case-folding) for details.
71 #[cfg(feature = "bench")]
72 fn to_simple_fold_case_map(self) -> char;
73}
74
75impl CharCaseExt for char {
76 fn to_mono_lowercase(self) -> char {
77 #[cfg(not(feature = "perf-case-map"))]
78 return self.to_lowercase().next().unwrap();
79
80 // Optimize away the binary search
81 // Reduce total match time by ~37%
82 #[cfg(feature = "perf-case-map")]
83 map::to_mono_lowercase(self)
84 }
85
86 fn to_simple_or_ascii_fold_case(self) -> char {
87 #[cfg(not(feature = "case-fold"))]
88 return self.to_ascii_lowercase();
89 #[cfg(feature = "case-fold")]
90 self.to_simple_fold_case()
91 }
92
93 #[cfg(feature = "case-fold")]
94 fn to_simple_fold_case(self) -> char {
95 #[cfg(not(feature = "perf-case-fold"))]
96 return fold::unicase::fold(self);
97 #[cfg(feature = "perf-case-fold")]
98 fold::map::fold(self)
99 }
100
101 #[cfg(feature = "bench")]
102 fn to_simple_fold_case_unicase(self) -> char {
103 fold::unicase::fold(self)
104 }
105
106 #[cfg(feature = "bench")]
107 fn to_simple_fold_case_map(self) -> char {
108 fold::map::fold(self)
109 }
110}
111
112pub trait StrCaseExt: Sealed {
113 /// See [mono lowercase](super::case#mono-lowercase) for details.
114 fn to_mono_lowercase(&self) -> String;
115
116 /// A convenient method for feature-gated case folding.
117 /// If `case-fold` feature is enabled, it uses simple case folding; otherwise it uses `to_ascii_lowercase()`.
118 fn to_simple_or_ascii_fold_case(&self) -> String;
119
120 /// See [case folding](super::case#case-folding) for details.
121 #[cfg(feature = "case-fold")]
122 fn to_simple_fold_case(&self) -> String;
123}
124
125impl StrCaseExt for str {
126 fn to_mono_lowercase(&self) -> String {
127 self.chars().map(|c| c.to_mono_lowercase()).collect()
128 }
129
130 fn to_simple_or_ascii_fold_case(&self) -> String {
131 self.chars()
132 .map(|c| c.to_simple_or_ascii_fold_case())
133 .collect()
134 }
135
136 #[cfg(feature = "case-fold")]
137 fn to_simple_fold_case(&self) -> String {
138 self.chars().map(|c| c.to_simple_fold_case()).collect()
139 }
140}
141
// Gated on `perf-case-map` as well: these tests read `map::tests::LOWERCASE_TABLE`, and the
// `map` module is only compiled when that feature is enabled.
#[cfg(all(test, feature = "perf-case-map"))]
mod tests {
    use std::collections::HashSet;

    use super::*;

    /// Collects every character that takes part in the mono lowercase mapping: ASCII letters of
    /// both cases, plus both sides of every `map::tests::LOWERCASE_TABLE` entry.
    ///
    /// `pub(super)` so that the sibling `tests_data` module can reuse it.
    pub(super) fn mono_set() -> HashSet<char> {
        let mut chars = HashSet::new();
        for c in 'A'..='Z' {
            chars.insert(c);
            chars.insert(c.to_ascii_lowercase());
        }
        for (c, lower) in map::tests::LOWERCASE_TABLE {
            chars.insert(*c);
            // Table values that are not a valid scalar value are counted as 'i'.
            // NOTE(review): presumably the only such entry is the multi-char mapping of 'İ' —
            // confirm against the table.
            chars.insert(char::from_u32(*lower).unwrap_or('i'));
        }
        chars
    }

    /// Prints the size of the mono lowercase character set (exploratory, no assertions).
    #[test]
    fn mono() {
        let mono = mono_set();
        println!("{} chars", mono.len());
        println!("{} upper chars", 26 + map::tests::LOWERCASE_TABLE.len());
    }
}

/// Exploratory comparison between the mono lowercase set and the simple case folding data.
///
/// The data file is generated with:
///
/// ucd-generate case-folding-simple ucd-16.0.0 --chars --all-pairs > case-folding-simple-chars-all-pairs.rs
// Gated on `test` (the module only holds tests and their helpers) and on `perf-case-map`
// (it reads `map::tests::LOWERCASE_TABLE` and reuses `tests::mono_set`).
#[cfg(all(
    test,
    not(feature = "doc"),
    feature = "_test_data",
    feature = "perf-case-map"
))]
mod tests_data {
    use std::collections::HashSet;

    use super::{map, tests::mono_set};

    include!("../../data/case-folding-simple-chars-all-pairs.rs");

    /// Collects every character that appears in `CASE_FOLDING_SIMPLE`, as a key or as a mapping.
    fn regex_set() -> HashSet<char> {
        let mut chars = HashSet::new();
        for (c, maps) in CASE_FOLDING_SIMPLE {
            chars.insert(*c);
            for c in maps.iter() {
                chars.insert(*c);
            }
        }
        chars
    }

    /// Prints the size of the simple case folding character set.
    #[test]
    fn regex() {
        let regex = regex_set();
        println!("{} chars", regex.len());
    }

    /// Prints the characters covered by mono lowercase but absent from simple case folding.
    #[test]
    fn mono_sub_regex() {
        let regex = regex_set();

        let mut chars = HashSet::new();
        for (c, lower) in map::tests::LOWERCASE_TABLE {
            if !regex.contains(c) {
                chars.insert(*c);
            }
            // Same fallback as in `mono_set()`: invalid scalar values count as 'i'.
            let lower = char::from_u32(*lower).unwrap_or('i');
            if !regex.contains(&lower) {
                chars.insert(lower);
            }
        }
        println!("{} chars", chars.len());
        println!("{:?}", chars);
    }

    /// Prints the characters covered by simple case folding but absent from mono lowercase,
    /// split by whether the folding entry lists more than one mapping.
    #[test]
    fn regex_sub_mono() {
        let mono = mono_set();

        let mut chars = HashSet::new();
        let mut multicase = HashSet::new();
        for (c, maps) in CASE_FOLDING_SIMPLE {
            // Entries with several mappings are reported separately.
            let set = if maps.len() > 1 {
                &mut multicase
            } else {
                &mut chars
            };
            if !mono.contains(c) {
                set.insert(*c);
            }
            for c in maps.iter() {
                if !mono.contains(c) {
                    set.insert(*c);
                }
            }
        }
        println!("{} chars", chars.len());
        println!("{} multicase chars", multicase.len());
        println!("{:?}", chars);
        println!("{:?}", multicase);
    }
}
237}