case_conv/
lib.rs

1#![doc = include_str!("../README.md")]
2#![feature(unicode_internals)]
3use core::unicode::conversions;
4use std::mem;
5
/// Returns `true` if any byte of the machine word `v` has its high bit set,
/// i.e. the word contains at least one non-ASCII byte.
#[inline]
fn contains_nonascii(v: usize) -> bool {
    // One `0x80` per byte of `usize`. The mask is derived from the pointer
    // width instead of being spelled as a 64-bit literal, because such a
    // literal does not fit in `usize` on 32-bit targets and is rejected there.
    const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; mem::size_of::<usize>()]);
    (NONASCII_MASK & v) != 0
}
11
12#[inline]
13/// SAFETY: N*size_of::<usize>() bytes must be valid of b
14unsafe fn is_ascii_funsafe<const N: usize>(b: *const u8) -> bool {
15    // check that the bytes are not ascii (going by chunks of usize)
16    let mut count = 0;
17    for j in 0..N {
18        let chunk = b.cast::<usize>().add(j).read_unaligned();
19        count += contains_nonascii(chunk) as usize;
20    }
21    count == 0
22}
23
24#[inline]
25unsafe fn convert_while_ascii(
26    b: &[u8],
27    out: &mut [mem::MaybeUninit<u8>],
28    f: fn(&u8) -> u8,
29) -> usize {
30    debug_assert!(out.len() >= b.len());
31
32    const USIZE_SIZE: usize = mem::size_of::<usize>();
33    const MAGIC_UNROLL: usize = 16;
34
35    let mut i = 0;
36    while i + USIZE_SIZE * MAGIC_UNROLL <= b.len() {
37        let c = b.get_unchecked(i..);
38        let o = out.get_unchecked_mut(i..);
39
40        if !is_ascii_funsafe::<MAGIC_UNROLL>(c.as_ptr()) {
41            return i;
42        }
43
44        // perform the case conversions on USIZE_SIZE * MAGIC_UNROLL bytes (gets heavily autovec'd)
45        for j in 0..USIZE_SIZE * MAGIC_UNROLL {
46            let out = o.get_unchecked_mut(j);
47            out.write(f(c.get_unchecked(j)));
48        }
49
50        i += USIZE_SIZE * MAGIC_UNROLL;
51    }
52    i
53}
54
55/// Returns the lowercase equivalent of this string slice, as a new [`String`].
56pub fn to_lowercase(s: &str) -> String {
57    let mut out = Vec::<u8>::with_capacity(s.len());
58    let b = s.as_bytes();
59
60    unsafe {
61        let n = convert_while_ascii(b, out.spare_capacity_mut(), u8::to_ascii_lowercase);
62        out.set_len(n);
63    }
64
65    // Safety: we know this is a valid char boundary since
66    // 1. Our iterator guarantees that this is a valid byte
67    // 2. From our loop we know this is the start of a utf8 scalar point
68    let rest = unsafe { s.get_unchecked(out.len()..) };
69
70    // Safety: We have written only valid ASCII to our vec
71    let mut to = unsafe { String::from_utf8_unchecked(out) };
72
73    for (i, c) in rest.char_indices() {
74        if c == 'Σ' {
75            // Σ maps to σ, except at the end of a word where it maps to ς.
76            // This is the only conditional (contextual) but language-independent mapping
77            // in `SpecialCasing.txt`,
78            // so hard-code it rather than have a generic "condition" mechanism.
79            // See https://github.com/rust-lang/rust/issues/26035
80            map_uppercase_sigma(s, i, &mut to)
81        } else {
82            match conversions::to_lower(c) {
83                [a, '\0', _] => to.push(a),
84                [a, b, '\0'] => {
85                    to.push(a);
86                    to.push(b);
87                }
88                [a, b, c] => {
89                    to.push(a);
90                    to.push(b);
91                    to.push(c);
92                }
93            }
94        }
95    }
96
97    fn map_uppercase_sigma(from: &str, i: usize, out: &mut String) {
98        // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
99        // for the definition of `Final_Sigma`.
100        debug_assert!('Σ'.len_utf8() == 2);
101        let is_word_final = case_ignoreable_then_cased(from[..i].chars().rev())
102            && !case_ignoreable_then_cased(from[i + 2..].chars());
103        out.push_str(if is_word_final { "ς" } else { "σ" });
104    }
105
106    fn case_ignoreable_then_cased<I: Iterator<Item = char>>(mut iter: I) -> bool {
107        use core::unicode::{Case_Ignorable, Cased};
108        match iter.find(|&c| !Case_Ignorable(c)) {
109            Some(c) => Cased(c),
110            None => false,
111        }
112    }
113
114    to
115}
116
117/// Returns the uppercase equivalent of this string slice, as a new [`String`].
118pub fn to_uppercase(s: &str) -> String {
119    let mut out = Vec::<u8>::with_capacity(s.len());
120    let b = s.as_bytes();
121
122    unsafe {
123        let n = convert_while_ascii(b, out.spare_capacity_mut(), u8::to_ascii_uppercase);
124        out.set_len(n);
125    }
126
127    // Safety: we know this is a valid char boundary since
128    // 1. Our iterator guarantees that this is a valid byte
129    // 2. From our loop we know this is the start of a utf8 scalar point
130    let rest = unsafe { s.get_unchecked(out.len()..) };
131
132    // Safety: We have written only valid ASCII to our vec
133    let mut to = unsafe { String::from_utf8_unchecked(out) };
134
135    for c in rest.chars() {
136        match conversions::to_upper(c) {
137            [a, '\0', _] => to.push(a),
138            [a, b, '\0'] => {
139                to.push(a);
140                to.push(b);
141            }
142            [a, b, c] => {
143                to.push(a);
144                to.push(b);
145                to.push(c);
146            }
147        }
148    }
149    to
150}
151
152pub fn is_ascii(b: &[u8]) -> bool {
153    const USIZE_SIZE: usize = mem::size_of::<usize>();
154    const MAGIC_UNROLL: usize = 16;
155
156    if b.len() < USIZE_SIZE {
157        return b.iter().all(u8::is_ascii);
158    }
159    unsafe {
160        let mut i = 0;
161
162        // on 16 usize chunks
163        while i + USIZE_SIZE * MAGIC_UNROLL <= b.len() {
164            if !is_ascii_funsafe::<MAGIC_UNROLL>(b.as_ptr().add(i)) {
165                return false;
166            }
167            i += USIZE_SIZE * MAGIC_UNROLL;
168        }
169
170        // on usize chunks
171        while i + USIZE_SIZE < b.len() {
172            if !is_ascii_funsafe::<1>(b.as_ptr().add(i)) {
173                return false;
174            }
175            i += USIZE_SIZE;
176        }
177
178        // final chunk
179        let i = b.len() - USIZE_SIZE;
180        is_ascii_funsafe::<1>(b.as_ptr().add(i))
181    }
182}
183
184#[cfg(test)]
185mod tests {
186    use super::*;
187
188    #[test]
189    fn lowercase() {
190        assert_eq!(to_lowercase(""), "");
191        assert_eq!(to_lowercase("AÉǅaé "), "aéǆaé ");
192
193        // https://github.com/rust-lang/rust/issues/26035
194        assert_eq!(to_lowercase("ΑΣ"), "ας");
195        assert_eq!(to_lowercase("Α'Σ"), "α'ς");
196        assert_eq!(to_lowercase("Α''Σ"), "α''ς");
197
198        assert_eq!(to_lowercase("ΑΣ Α"), "ας α");
199        assert_eq!(to_lowercase("Α'Σ Α"), "α'ς α");
200        assert_eq!(to_lowercase("Α''Σ Α"), "α''ς α");
201
202        assert_eq!(to_lowercase("ΑΣ' Α"), "ας' α");
203        assert_eq!(to_lowercase("ΑΣ'' Α"), "ας'' α");
204
205        assert_eq!(to_lowercase("Α'Σ' Α"), "α'ς' α");
206        assert_eq!(to_lowercase("Α''Σ'' Α"), "α''ς'' α");
207
208        assert_eq!(to_lowercase("Α Σ"), "α σ");
209        assert_eq!(to_lowercase("Α 'Σ"), "α 'σ");
210        assert_eq!(to_lowercase("Α ''Σ"), "α ''σ");
211
212        assert_eq!(to_lowercase("Σ"), "σ");
213        assert_eq!(to_lowercase("'Σ"), "'σ");
214        assert_eq!(to_lowercase("''Σ"), "''σ");
215
216        assert_eq!(to_lowercase("ΑΣΑ"), "ασα");
217        assert_eq!(to_lowercase("ΑΣ'Α"), "ασ'α");
218        assert_eq!(to_lowercase("ΑΣ''Α"), "ασ''α");
219    }
220
221    #[test]
222    fn long() {
223        let mut upper = str::repeat("A", 128);
224        let mut lower = str::repeat("a", 128);
225
226        assert_eq!(to_lowercase(&upper), lower);
227        assert_eq!(to_uppercase(&lower), upper);
228
229        upper.push('Σ');
230        lower.push('σ');
231
232        assert_eq!(to_lowercase(&upper), lower);
233        assert_eq!(to_uppercase(&lower), upper);
234    }
235
236    #[test]
237    fn case_conv_long() {
238        let upper = str::repeat("A", 512);
239        let lower = str::repeat("a", 512);
240
241        assert_eq!(to_lowercase(&upper), lower);
242        assert_eq!(to_uppercase(&lower), upper);
243    }
244
245    #[test]
246    fn case_conv_long_unicode() {
247        let upper = str::repeat("É", 512);
248        let lower = str::repeat("é", 512);
249
250        assert_eq!(to_lowercase(&upper), lower);
251        assert_eq!(to_uppercase(&lower), upper);
252    }
253
254    #[test]
255    fn uppercase() {
256        assert_eq!(to_uppercase(""), "");
257        assert_eq!(to_uppercase("aéǅßﬁᾀ"), "AÉǄSSFIἈΙ");
258    }
259}
case_conv/lib.rs

case_conv/
lib.rs