rust_locale/
ctype.rs

1use libc::{c_char, wchar_t};
2
3use errno::errno;
4
5mod c {
6    #[allow(non_camel_case_types)]
7    type wint_t = i64;
8
9    #[link(name = "rustlocale", kind = "static")]
10    extern "C" {
11        pub fn utf8towc(
12            wc_buf: *mut libc::wchar_t,
13            multibytes: *const libc::c_char,
14            byte_length: libc::size_t,
15        ) -> u8;
16        pub fn wctoutf8(utf8_bytes: *mut libc::c_char, wc: libc::wchar_t) -> libc::ssize_t;
17        pub fn iswspace_native(ch: wint_t) -> i8;
18        pub fn iswblank_native(ch: wint_t) -> libc::c_int;
19        pub fn towupper_native(ch: wint_t) -> wint_t;
20        pub fn towlower_native(ch: wint_t) -> wint_t;
21    }
22}
23
/// Locale-sensitive character classification and one-to-one case conversion.
///
/// The results of every method depend on the locale that is active when the
/// method is called; the examples below select it through the `LC_ALL`
/// environment variable.
pub trait CType {
    /// Tells whether `self` counts as whitespace.
    ///
    /// The following characters are always whitespace:
    ///
    /// - space (0x20), form feed (0x0c), line feed (0x0a), carriage return (0x0d),
    ///   horizontal tab (0x09) and vertical tab (0x0b)
    ///
    /// In addition, any character the current locale classifies as whitespace
    /// is reported as such.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_locale::CType;
    ///
    /// assert!(' '.is_space());
    /// assert!(!'a'.is_space());
    /// std::env::set_var("LC_ALL", "POSIX");
    /// assert!(!'\u{2003}'.is_space());
    /// std::env::set_var("LC_ALL", "en_US");
    /// assert!('\u{2003}'.is_space());
    /// ```
    fn is_space(&self) -> bool;

    /// Tells whether the current locale classifies `self` as a blank character,
    /// i.e. a whitespace character used to separate words within a sentence.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_locale::CType;
    ///
    /// assert!(' '.is_blank());
    /// assert!(!'\n'.is_blank());
    /// std::env::set_var("LC_ALL", "POSIX");
    /// assert!(!'\u{3000}'.is_blank());
    /// std::env::set_var("LC_ALL", "en_US");
    /// assert!('\u{3000}'.is_blank());
    /// ```
    fn is_blank(&self) -> bool;

    /// Returns the uppercase counterpart of `self` according to the current locale.
    ///
    /// When the current locale lists no uppercase form, `self` is returned unchanged.
    ///
    /// The conversion is strictly one character to one character. The uppercase
    /// form of 'ß', for instance, is (with some exceptions) the two-character
    /// string "SS", which this method cannot produce.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_locale::CType;
    ///
    /// assert_eq!(CType::to_uppercase(&'a'), 'A');
    /// assert_eq!(CType::to_uppercase(&'1'), '1');
    /// std::env::set_var("LC_ALL", "POSIX");
    /// assert_eq!(CType::to_uppercase(&'\u{017F}'), '\u{017F}');
    /// std::env::set_var("LC_ALL", "en_US");
    /// assert_eq!(CType::to_uppercase(&'\u{017F}'), 'S');
    /// ```
    fn to_uppercase(&self) -> Self;

    /// Returns the lowercase counterpart of `self` according to the current locale.
    ///
    /// When the current locale lists no lowercase form, `self` is returned unchanged.
    ///
    /// The conversion is strictly one character to one character. The Greek
    /// capital 'Σ', for instance, has two lowercase forms — 'σ' and 'ς' —
    /// chosen by its position in a word, so this method cannot be relied on to
    /// pick the correct one.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_locale::CType;
    ///
    /// assert_eq!(CType::to_lowercase(&'A'), 'a');
    /// assert_eq!(CType::to_lowercase(&'1'), '1');
    /// std::env::set_var("LC_ALL", "POSIX");
    /// assert_eq!(CType::to_lowercase(&'\u{0190}'), '\u{0190}');
    /// std::env::set_var("LC_ALL", "en_US");
    /// assert_eq!(CType::to_lowercase(&'\u{0190}'), '\u{025b}');
    /// ```
    fn to_lowercase(&self) -> Self;
}
104
105impl CType for char {
106    fn is_space(&self) -> bool {
107        let buf = utf8_bytes(self);
108        if buf.len() == 1 {
109            unsafe { libc::isspace(buf[0].into()) != 0 }
110        } else {
111            let wc = utf8towc(&buf);
112            isspace(wc)
113        }
114    }
115
116    fn is_blank(&self) -> bool {
117        let buf = utf8_bytes(self);
118        if buf.len() == 1 {
119            unsafe { libc::isblank(buf[0].into()) != 0 }
120        } else {
121            let wc = utf8towc(&buf);
122            isblank(wc)
123        }
124    }
125
126    fn to_uppercase(&self) -> char {
127        let bytes = utf8_bytes(self);
128        let wc = utf8towc(&bytes);
129        let upper = toupper(wc);
130        wctochar(upper)
131    }
132
133    fn to_lowercase(&self) -> char {
134        let bytes = utf8_bytes(self);
135        let wc = utf8towc(&bytes);
136        let lower = tolower(wc);
137        wctochar(lower)
138    }
139}
140
/// Returns the UTF-8 encoding of `c` as an owned byte vector.
///
/// The vector holds exactly `c.len_utf8()` bytes (between one and four).
fn utf8_bytes(c: &char) -> Vec<u8> {
    // Four bytes is the longest UTF-8 sequence a `char` can need.
    let mut scratch = [0u8; 4];
    c.encode_utf8(&mut scratch).as_bytes().to_vec()
}
147
148fn utf8towc(utf8_bytes: &Vec<u8>) -> wchar_t {
149    let mut wc = 0;
150    match unsafe {
151        c::utf8towc(
152            &mut wc as *mut wchar_t,
153            utf8_bytes.as_ptr() as *const c_char,
154            utf8_bytes.len(),
155        )
156    } {
157        s if s == 0 => wc,
158        s => panic!("utf8towc failed. status={}, error={}", s, errno()),
159    }
160}
161
162fn wctochar(wc: wchar_t) -> char {
163    let mut buf = [0; 4];
164    match unsafe { c::wctoutf8(buf.as_mut_ptr(), wc) } {
165        length if length > 0 => {
166            let length = length as usize;
167            String::from_utf8(buf[..length].iter().map(|c| *c as u8).collect())
168                .unwrap()
169                .chars()
170                .next()
171                .unwrap()
172        }
173        status => panic!("wctochar failed. status={}, error={}", status, errno()),
174    }
175}
176
177fn isspace(wc: wchar_t) -> bool {
178    match unsafe { c::iswspace_native(wc.into()) } {
179        s if s >= 0 => s != 0,
180        _ => panic!("iswspace_native failed. error={}", errno()),
181    }
182}
183
184fn isblank(wc: wchar_t) -> bool {
185    unsafe { c::iswblank_native(wc.into()) != 0 }
186}
187
188fn toupper(wc: wchar_t) -> wchar_t {
189    unsafe { c::towupper_native(wc.into()) as wchar_t }
190}
191
192fn tolower(wc: wchar_t) -> wchar_t {
193    unsafe { c::towlower_native(wc.into()) as wchar_t }
194}
195
#[cfg(test)]
mod tests {
    use super::*;

    use std::sync::{Mutex, MutexGuard};

    // `LC_ALL` is process-wide state, while the default test harness runs
    // tests on several threads at once. Every test that switches the locale
    // holds this lock for its whole body so that no other test can change the
    // variable between a `set_var` call and the assertions that rely on it.
    static LOCALE_ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Acquires the locale lock, ignoring poisoning: a failed assertion in one
    /// test must not cascade into spurious failures in the others.
    fn lock_locale_env() -> MutexGuard<'static, ()> {
        LOCALE_ENV_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    #[test]
    fn is_always_space() {
        assert!(' '.is_space());
        assert!('\x0c'.is_space());
        assert!('\n'.is_space());
        assert!('\r'.is_space());
        assert!('\t'.is_space());
        assert!('\x0b'.is_space());
    }

    #[test]
    fn is_space_i18n() {
        let _guard = lock_locale_env();
        std::env::set_var("LC_ALL", "POSIX");
        assert!(!'\u{1680}'.is_space());
        assert!(!'\u{2000}'.is_space());
        assert!(!'\u{2006}'.is_space());
        assert!(!'\u{2008}'.is_space());
        assert!(!'\u{200A}'.is_space());
        assert!(!'\u{2028}'.is_space());
        assert!(!'\u{2029}'.is_space());
        assert!(!'\u{205F}'.is_space());
        assert!(!'\u{3000}'.is_space());
        std::env::set_var("LC_ALL", "en_US");
        assert!('\u{1680}'.is_space());
        assert!('\u{2000}'.is_space());
        assert!('\u{2006}'.is_space());
        assert!('\u{2008}'.is_space());
        assert!('\u{200A}'.is_space());
        assert!('\u{2028}'.is_space());
        assert!('\u{2029}'.is_space());
        assert!('\u{205F}'.is_space());
        assert!('\u{3000}'.is_space());
    }

    #[test]
    #[ignore]
    fn is_space_special() {
        let _guard = lock_locale_env();
        std::env::set_var("LC_ALL", "en_US");
        assert!(!'\u{1361}'.is_space());
        std::env::set_var("LC_ALL", "am_ET");
        assert!('\u{1361}'.is_space());
    }

    #[test]
    fn is_blank() {
        let _guard = lock_locale_env();
        std::env::set_var("LC_ALL", "POSIX");
        assert!(' '.is_blank());
        assert!('\t'.is_blank());
        assert!(!'\n'.is_blank());
        assert!(!'\u{3000}'.is_blank());
        std::env::set_var("LC_ALL", "en_US");
        assert!('\u{3000}'.is_blank());
        assert!(!'\u{2028}'.is_blank());
    }

    #[test]
    fn to_uppercase() {
        let _guard = lock_locale_env();
        assert_eq!(CType::to_uppercase(&'a'), 'A');
        assert_eq!(CType::to_uppercase(&'1'), '1');
        std::env::set_var("LC_ALL", "POSIX");
        assert_eq!(CType::to_uppercase(&'\u{017F}'), '\u{017F}');
        std::env::set_var("LC_ALL", "en_US");
        assert_eq!(CType::to_uppercase(&'\u{017F}'), 'S');
    }

    #[test]
    #[ignore]
    fn to_uppercase_special() {
        let _guard = lock_locale_env();
        std::env::set_var("LC_ALL", "en_US");
        assert_eq!(CType::to_uppercase(&'i'), 'I');
        std::env::set_var("LC_ALL", "tr_TR");
        assert_eq!(CType::to_uppercase(&'i'), '\u{0130}');
    }

    #[test]
    fn to_lowercase() {
        let _guard = lock_locale_env();
        assert_eq!(CType::to_lowercase(&'A'), 'a');
        assert_eq!(CType::to_lowercase(&'1'), '1');
        std::env::set_var("LC_ALL", "POSIX");
        assert_eq!(CType::to_lowercase(&'\u{0190}'), '\u{0190}');
        std::env::set_var("LC_ALL", "en_US");
        assert_eq!(CType::to_lowercase(&'\u{0190}'), '\u{025b}');
    }

    #[test]
    #[ignore]
    fn to_lowercase_special() {
        let _guard = lock_locale_env();
        std::env::set_var("LC_ALL", "en_US");
        assert_eq!(CType::to_lowercase(&'I'), 'i');
        std::env::set_var("LC_ALL", "tr_TR");
        assert_eq!(CType::to_lowercase(&'I'), '\u{0131}');
    }
}