enrede/encoding/
utf.rs

1use crate::encoding::sealed::Sealed;
2use crate::encoding::{Encoding, NullTerminable, ValidateError};
3use crate::str::Str;
4use arrayvec::ArrayVec;
5#[cfg(feature = "rand")]
6use rand::{distr::Distribution, Rng};
7
/// The [UTF-8](https://en.wikipedia.org/wiki/UTF-8) encoding.
///
/// This is a zero-sized marker type: it holds no data and is only used to
/// select the UTF-8 behaviour at the type level.
#[derive(Default)]
#[non_exhaustive]
pub struct Utf8;
12
13impl Sealed for Utf8 {}
14
15impl Encoding for Utf8 {
16    const REPLACEMENT: char = '\u{FFFD}';
17    const MAX_LEN: usize = 4;
18    type Bytes = ArrayVec<u8, 4>;
19
20    fn shorthand() -> &'static str {
21        "utf8"
22    }
23
24    fn validate(bytes: &[u8]) -> Result<(), ValidateError> {
25        core::str::from_utf8(bytes)
26            .map(|_| ())
27            .map_err(|e| ValidateError {
28                valid_up_to: e.valid_up_to(),
29                error_len: e.error_len().map(|e| e as u8),
30            })
31    }
32
33    fn encode_char(c: char) -> Option<Self::Bytes> {
34        let mut out = [0; 4];
35        let res = c.encode_utf8(&mut out);
36        let mut out = ArrayVec::new();
37        out.extend(res.as_bytes().iter().copied());
38        Some(out)
39    }
40
41    fn decode_char(str: &Str<Self>) -> (char, &Str<Self>) {
42        let c = str.as_std().chars().next().unwrap();
43        (c, &str[c.len_utf8()..])
44    }
45
46    fn char_bound(str: &Str<Self>, idx: usize) -> bool {
47        str.as_std().is_char_boundary(idx)
48    }
49
50    fn char_len(c: char) -> usize {
51        c.len_utf8()
52    }
53}
54
55impl NullTerminable for Utf8 {}
56
/// Random character generation for UTF-8 (only with the `rand` feature).
///
/// `encode_char` for this encoding accepts every `char`, so sampling simply
/// defers to `rand`'s own `char` generation.
#[cfg(feature = "rand")]
impl Distribution<char> for Utf8 {
    fn sample<R>(&self, rng: &mut R) -> char
    where
        R: Rng + ?Sized,
    {
        let sampled: char = rng.random();
        sampled
    }
}
63
64/// The [UTF-16](https://en.wikipedia.org/wiki/UTF-16) encoding
65pub type Utf16 = Utf16LE;
66
/// Classification of a single UTF-16 code unit.
#[derive(Eq, PartialEq)]
enum Kind {
    /// A code unit outside the surrogate range; it encodes a character on its own.
    Char,
    /// A high (leading) surrogate, `0xD800..=0xDBFF`.
    High,
    /// A low (trailing) surrogate, `0xDC00..=0xDFFF`.
    Low,
}

impl Kind {
    /// Classifies the code unit `c`.
    fn of(c: u16) -> Kind {
        match c {
            0xD800..=0xDBFF => Self::High,
            0xDC00..=0xDFFF => Self::Low,
            // Everything below 0xD800 or from 0xE000 upwards.
            _ => Self::Char,
        }
    }
}
84
// Generates a UTF-16 encoding type for one byte order.
//
// Arguments, in order:
// - `$name`: name of the generated marker struct.
// - `$shorthand`: string returned by `Encoding::shorthand`.
// - `$method_from`: `u16` constructor used to read a code unit from two bytes
//   (`from_be_bytes` / `from_le_bytes`).
// - `$method_to`: `u16` method used to write a code unit as two bytes
//   (`to_be_bytes` / `to_le_bytes`).
// - `$idx_add`: offset, within a code unit, of its most significant byte
//   (0 for big-endian, 1 for little-endian).
// - `$docname`: display name used in the generated rustdoc.
macro_rules! utf16_impl {
    (
        $name:ident,
        $shorthand:literal,
        $method_from:ident,
        $method_to:ident,
        $idx_add:literal,
        $docname:literal,
    ) => {
        #[doc = "The ["]
        #[doc = $docname]
        #[doc = "](https://en.wikipedia.org/wiki/UTF-16#Byte-order_encoding_schemes) encoding"]
        #[non_exhaustive]
        #[derive(Default)]
        pub struct $name;

        impl Sealed for $name {}

        impl Encoding for $name {
            const REPLACEMENT: char = '\u{FFFD}';
            const MAX_LEN: usize = 4;
            type Bytes = ArrayVec<u8, 4>;

            fn shorthand() -> &'static str {
                $shorthand
            }

            /// Checks that `bytes` is well-formed UTF-16 in this byte order.
            ///
            /// On failure, `valid_up_to` is the byte offset of the first
            /// offending code unit. `error_len` is `Some(n)` for an unpaired
            /// surrogate found inside the input, and `None` when the input
            /// simply ends too early (a trailing high surrogate, or a stray
            /// final byte that does not complete a code unit).
            fn validate(bytes: &[u8]) -> Result<(), ValidateError> {
                let chunks = bytes.chunks_exact(2);
                let has_stray_byte = !chunks.remainder().is_empty();

                // `get_unchecked` is the same speed
                // `try_fold` variant is significantly slower
                let mut pending_high = false;
                for (idx, chunk) in chunks.enumerate() {
                    let unit = u16::$method_from([chunk[0], chunk[1]]);

                    match (pending_high, Kind::of(unit)) {
                        (false, Kind::Char) => {}
                        (false, Kind::High) => pending_high = true,
                        (true, Kind::Low) => pending_high = false,
                        // Low surrogate without a preceding high surrogate.
                        (false, Kind::Low) => {
                            return Err(ValidateError {
                                valid_up_to: idx * 2,
                                error_len: Some(2),
                            });
                        }
                        // High surrogate followed by an ordinary code unit:
                        // only the surrogate (the previous unit) is invalid.
                        (true, Kind::Char) => {
                            return Err(ValidateError {
                                valid_up_to: (idx - 1) * 2,
                                error_len: Some(2),
                            });
                        }
                        // High surrogate followed by another high surrogate:
                        // both units are reported as one invalid sequence.
                        (true, Kind::High) => {
                            return Err(ValidateError {
                                valid_up_to: (idx - 1) * 2,
                                error_len: Some(4),
                            });
                        }
                    }
                }

                if pending_high {
                    // The dangling high surrogate is the last *complete* code
                    // unit. Compute its start from the unit count so that a
                    // stray trailing byte does not shift the offset.
                    // `pending_high` implies at least one complete unit, so
                    // the subtraction cannot underflow.
                    return Err(ValidateError {
                        valid_up_to: (bytes.len() / 2 - 1) * 2,
                        error_len: None,
                    });
                }

                if has_stray_byte {
                    return Err(ValidateError {
                        valid_up_to: bytes.len() - 1,
                        error_len: None,
                    });
                }

                Ok(())
            }

            /// Encodes `c` as one or two code units (two or four bytes).
            /// This never returns `None`.
            fn encode_char(c: char) -> Option<Self::Bytes> {
                let mut units = [0u16; 2];
                let mut out = ArrayVec::new();
                for unit in c.encode_utf16(&mut units).iter() {
                    out.extend(unit.$method_to());
                }
                Some(out)
            }

            /// Splits the first character off `str`.
            ///
            /// # Panics
            ///
            /// Panics if `str` is empty.
            fn decode_char(str: &Str<Self>) -> (char, &Str<Self>) {
                let bytes = str.as_bytes();
                let first = u16::$method_from([bytes[0], bytes[1]]);
                if Kind::of(first) == Kind::Char {
                    // SAFETY: We just confirmed `first` is not in the surrogate range, and is thus
                    //         a valid `char`.
                    let c = unsafe { char::from_u32_unchecked(u32::from(first)) };
                    (c, &str[2..])
                } else {
                    let second = u16::$method_from([bytes[2], bytes[3]]);

                    // Standard surrogate-pair arithmetic: ten bits from each half,
                    // offset by 0x10000.
                    let high = (u32::from(first) - 0xD800) * 0x400;
                    let low = u32::from(second) - 0xDC00;
                    // SAFETY: Str is valid UTF-16, as such, all surrogate pairs will produce a
                    //         valid `char`
                    let c = unsafe { char::from_u32_unchecked(high + low + 0x10000) };
                    (c, &str[4..])
                }
            }

            /// Reports whether byte offset `idx` lies on a character boundary.
            ///
            /// An offset is a boundary when it is even and the code unit
            /// starting there is not a low surrogate. The end of the string
            /// counts as a boundary; offsets past the end do not.
            fn char_bound(str: &Str<Self>, idx: usize) -> bool {
                if idx % 2 != 0 {
                    return false;
                }
                let bytes = str.as_bytes();
                // `$idx_add` selects the most significant byte of the code
                // unit, which is enough to recognise a low surrogate
                // (0xDC00..=0xDFFF).
                match bytes.get(idx + $idx_add) {
                    Some(high_byte) => !(0xDC..0xE0).contains(high_byte),
                    None => idx == bytes.len(),
                }
            }

            /// Number of bytes `c` occupies when encoded as UTF-16.
            fn char_len(c: char) -> usize {
                c.len_utf16()
            }
        }

        #[cfg(feature = "rand")]
        impl Distribution<char> for $name {
            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
                rng.random::<char>()
            }
        }
    };
}
209
210utf16_impl!(
211    Utf16BE,
212    "utf16be",
213    from_be_bytes,
214    to_be_bytes,
215    0,
216    "UTF-16BE",
217);
218
219utf16_impl!(
220    Utf16LE,
221    "utf16le",
222    from_le_bytes,
223    to_le_bytes,
224    1,
225    "UTF-16LE",
226);
227
// Generates a UTF-32 encoding type for one byte order.
//
// Arguments, in order: the marker struct name, the string returned by
// `Encoding::shorthand`, the `u32` constructor used to read four bytes, the
// `u32` method used to write four bytes, and the display name used in the
// generated rustdoc.
macro_rules! utf32_impl {
    (
        $name:ident,
        $shorthand:literal,
        $method_from:ident,
        $method_to:ident,
        $docname:literal,
    ) => {
        #[doc = "The ["]
        #[doc = $docname]
        #[doc = "](https://en.wikipedia.org/wiki/UTF-32) encoding"]
        #[non_exhaustive]
        #[derive(Default)]
        pub struct $name;

        impl Sealed for $name {}

        impl Encoding for $name {
            const REPLACEMENT: char = '\u{FFFD}';
            const MAX_LEN: usize = 4;
            type Bytes = [u8; 4];

            fn shorthand() -> &'static str {
                $shorthand
            }

            /// Checks that `bytes` is a sequence of four-byte Unicode scalar
            /// values in this byte order.
            ///
            /// A surrogate or out-of-range value is reported with
            /// `error_len: Some(4)`; an incomplete final unit is reported with
            /// `error_len: None`.
            fn validate(bytes: &[u8]) -> Result<(), ValidateError> {
                let units = bytes.chunks_exact(4);
                let leftover = units.remainder().len();

                for (idx, unit) in units.enumerate() {
                    let scalar = u32::$method_from([unit[0], unit[1], unit[2], unit[3]]);
                    // `char::from_u32` rejects exactly the surrogate range and
                    // everything above 0x10FFFF.
                    if char::from_u32(scalar).is_none() {
                        return Err(ValidateError {
                            valid_up_to: idx * 4,
                            error_len: Some(4),
                        });
                    }
                }

                if leftover != 0 {
                    return Err(ValidateError {
                        valid_up_to: bytes.len() - leftover,
                        error_len: None,
                    });
                }

                Ok(())
            }

            /// Encodes `c` as its four-byte scalar value. This never returns `None`.
            fn encode_char(c: char) -> Option<Self::Bytes> {
                let scalar = u32::from(c);
                Some(scalar.$method_to())
            }

            /// Splits the first character off `str`.
            ///
            /// # Panics
            ///
            /// Panics if `str` holds fewer than four bytes.
            fn decode_char(str: &Str<Self>) -> (char, &Str<Self>) {
                let raw = str.as_bytes();
                let scalar = u32::$method_from([raw[0], raw[1], raw[2], raw[3]]);
                // SAFETY: Str<Utf32> is guaranteed to contain valid `char` values
                let decoded = unsafe { char::from_u32_unchecked(scalar) };
                (decoded, &str[4..])
            }

            /// Every character is exactly four bytes, so boundaries are the
            /// multiples of four. The string contents are not inspected.
            fn char_bound(_: &Str<Self>, idx: usize) -> bool {
                idx % 4 == 0
            }

            /// Always four: UTF-32 is a fixed-width encoding.
            fn char_len(_: char) -> usize {
                4
            }
        }

        #[cfg(feature = "rand")]
        impl Distribution<char> for $name {
            fn sample<R>(&self, rng: &mut R) -> char
            where
                R: Rng + ?Sized,
            {
                rng.random::<char>()
            }
        }
    };
}
304
305utf32_impl!(Utf32BE, "utf32be", from_be_bytes, to_be_bytes, "UTF-32BE",);
306utf32_impl!(Utf32LE, "utf32le", from_le_bytes, to_le_bytes, "UTF-32LE",);
307
308/// The [UTF-32](https://en.wikipedia.org/wiki/UTF-32) encoding
309#[cfg(target_endian = "little")]
310pub type Utf32 = Utf32LE;
311
312/// The [UTF-32](https://en.wikipedia.org/wiki/UTF-32) encoding
313#[cfg(target_endian = "big")]
314pub type Utf32 = Utf32BE;
315
#[cfg(test)]
mod tests {
    extern crate alloc;

    use super::*;
    use alloc::vec::Vec;

    /// Builds the `Err` value `validate` is expected to return for malformed input.
    fn invalid(valid_up_to: usize, error_len: Option<u8>) -> Result<(), ValidateError> {
        Err(ValidateError {
            valid_up_to,
            error_len,
        })
    }

    /// Encodes `text` as UTF-32LE bytes.
    fn utf32le(text: &str) -> Vec<u8> {
        text.chars()
            .flat_map(|c| (c as u32).to_le_bytes())
            .collect()
    }

    /// Encodes `text` as UTF-32BE bytes.
    fn utf32be(text: &str) -> Vec<u8> {
        text.chars()
            .flat_map(|c| (c as u32).to_be_bytes())
            .collect()
    }

    // The byte strings below contain `\0` immediately followed by an ASCII
    // digit, which clippy reads as a possible octal escape.
    #[allow(clippy::octal_escapes)]
    #[test]
    fn test_validate_utf16_le() {
        assert_eq!(Utf16LE::validate(b"a\0b\0c\01\02\03\0"), Ok(()));
        assert_eq!(
            Utf16LE::validate(b"A\0 \0y\0e\0e\0:\0 \0\x01\xD8\x37\xDC"),
            Ok(())
        );

        // Dangling high surrogate followed by an ordinary character.
        assert_eq!(Utf16LE::validate(b"\x01\xD8a\0"), invalid(0, Some(2)));
        // Dangling high surrogate followed by another high surrogate.
        assert_eq!(
            Utf16LE::validate(b" \0\x01\xD8\x01\xD8"),
            invalid(2, Some(4))
        );
        // Dangling high surrogate as the entire input.
        assert_eq!(Utf16LE::validate(b"\x01\xD8"), invalid(0, None));
        // Dangling high surrogate at the end, after valid characters.
        assert_eq!(Utf16LE::validate(b"a\0b\0\x01\xD8"), invalid(4, None));
    }

    #[test]
    fn test_encode_utf16_le() {
        let mut ascii = ArrayVec::new();
        ascii.push(b'A');
        ascii.push(0);
        assert_eq!(Utf16LE::encode_char('A'), Some(ascii));

        let pair = ArrayVec::from([0x01, 0xD8, 0x37, 0xDC]);
        assert_eq!(Utf16LE::encode_char('𐐷'), Some(pair));
    }

    #[test]
    fn test_decode_utf16_le() {
        // SAFETY: This test data is guaranteed valid
        let mut rest: &Str<Utf16LE> =
            unsafe { Str::from_bytes_unchecked(b"A\0\x01\xD8\x37\xDCb\0") };
        for expected in ['A', '𐐷', 'b'] {
            let (c, tail) = Utf16LE::decode_char(rest);
            assert_eq!(c, expected);
            rest = tail;
        }
    }

    #[test]
    fn test_char_boundary_utf16le() {
        // SAFETY: The bytes are well-formed UTF-16LE ('A', a surrogate pair, 'b').
        let str = unsafe { Str::from_bytes_unchecked(b"A\0\x01\xD8\x37\xDCb\0") };
        for (idx, expected) in [(2, true), (4, false), (6, true)] {
            assert_eq!(Utf16LE::char_bound(str, idx), expected, "index {idx}");
        }

        // The second code unit has 0xDF as its *low* byte; only the high byte
        // may decide whether a unit is a low surrogate.
        // SAFETY: Every code unit below is outside the surrogate range.
        let str =
            unsafe { Str::from_bytes_unchecked(&[174, 95, 223, 142, 99, 107, 209, 158, 212, 154]) };
        for (idx, expected) in [(1, false), (2, true), (3, false), (4, true)] {
            assert_eq!(Utf16LE::char_bound(str, idx), expected, "index {idx}");
        }
    }

    // See `test_validate_utf16_le` for why the lint is silenced.
    #[allow(clippy::octal_escapes)]
    #[test]
    fn test_validate_utf16_be() {
        assert_eq!(Utf16BE::validate(b"\0a\0b\0c\01\02\03"), Ok(()));
        assert_eq!(
            Utf16BE::validate(b"\0A\0 \0y\0e\0e\0:\0 \xD8\x01\xDC\x37"),
            Ok(())
        );

        // Dangling high surrogate followed by an ordinary character.
        assert_eq!(Utf16BE::validate(b"\xD8\x01\0a"), invalid(0, Some(2)));
        // Dangling high surrogate followed by another high surrogate.
        assert_eq!(
            Utf16BE::validate(b"\0 \xD8\x01\xD8\x01"),
            invalid(2, Some(4))
        );
        // Dangling high surrogate as the entire input.
        assert_eq!(Utf16BE::validate(b"\xD8\x01"), invalid(0, None));
        // Dangling high surrogate at the end, after valid characters.
        assert_eq!(Utf16BE::validate(b"\0a\0b\xD8\x01"), invalid(4, None));
    }

    #[test]
    fn test_encode_utf16_be() {
        let mut ascii = ArrayVec::new();
        ascii.push(0);
        ascii.push(b'A');
        assert_eq!(Utf16BE::encode_char('A'), Some(ascii));

        let pair = ArrayVec::from([0xD8, 0x01, 0xDC, 0x37]);
        assert_eq!(Utf16BE::encode_char('𐐷'), Some(pair));
    }

    #[test]
    fn test_decode_utf16_be() {
        // SAFETY: This test data is guaranteed valid
        let mut rest: &Str<Utf16BE> =
            unsafe { Str::from_bytes_unchecked(b"\0A\xD8\x01\xDC\x37\0b") };
        for expected in ['A', '𐐷', 'b'] {
            let (c, tail) = Utf16BE::decode_char(rest);
            assert_eq!(c, expected);
            rest = tail;
        }
    }

    #[test]
    fn test_char_boundary_utf16be() {
        // SAFETY: The bytes are well-formed UTF-16BE ('A', a surrogate pair, 'b').
        let str = unsafe { Str::from_bytes_unchecked(b"\0A\xD8\x01\xDC\x37\0b") };
        for (idx, expected) in [(2, true), (4, false), (6, true)] {
            assert_eq!(Utf16BE::char_bound(str, idx), expected, "index {idx}");
        }

        // The second code unit has 0xDF as its *low* byte; only the high byte
        // may decide whether a unit is a low surrogate.
        // SAFETY: Every code unit below is outside the surrogate range.
        let str =
            unsafe { Str::from_bytes_unchecked(&[95, 174, 142, 223, 107, 99, 158, 209, 154, 212]) };
        for (idx, expected) in [(1, false), (2, true), (3, false), (4, true)] {
            assert_eq!(Utf16BE::char_bound(str, idx), expected, "index {idx}");
        }
    }

    #[test]
    fn test_validate_utf32_le() {
        assert_eq!(Utf32LE::validate(&utf32le("abc123")), Ok(()));
        assert_eq!(Utf32LE::validate(&utf32le("A yee: 𐐷")), Ok(()));

        // Invalid (surrogate) between two valid characters.
        let surrogate = [
            0x61, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00,
        ];
        assert_eq!(Utf32LE::validate(&surrogate), invalid(4, Some(4)));

        // Invalid (first value past the Unicode range).
        assert_eq!(
            Utf32LE::validate(&[0x00, 0x00, 0x11, 0x00]),
            invalid(0, Some(4))
        );
    }

    #[test]
    fn test_encode_utf32_le() {
        assert_eq!(Utf32LE::encode_char('A'), Some([b'A', 0, 0, 0]));
        assert_eq!(Utf32LE::encode_char('𐐷'), Some([0x37, 0x04, 0x01, 0x00]));
    }

    #[test]
    fn test_decode_utf32_le() {
        let bytes = utf32le("A𐐷b");
        let mut rest: &Str<Utf32LE> = Str::from_bytes(&bytes).unwrap();
        for expected in ['A', '𐐷', 'b'] {
            let (c, tail) = Utf32LE::decode_char(rest);
            assert_eq!(c, expected);
            rest = tail;
        }
    }

    #[test]
    fn test_validate_utf32_be() {
        assert_eq!(Utf32BE::validate(&utf32be("abc123")), Ok(()));
        assert_eq!(Utf32BE::validate(&utf32be("A yee: 𐐷")), Ok(()));

        // Invalid (surrogate) between two valid characters.
        let surrogate = [
            0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x62,
        ];
        assert_eq!(Utf32BE::validate(&surrogate), invalid(4, Some(4)));

        // Invalid (first value past the Unicode range).
        assert_eq!(
            Utf32BE::validate(&[0x00, 0x11, 0x00, 0x00]),
            invalid(0, Some(4))
        );
    }

    #[test]
    fn test_encode_utf32_be() {
        assert_eq!(Utf32BE::encode_char('A'), Some([0, 0, 0, b'A']));
        assert_eq!(Utf32BE::encode_char('𐐷'), Some([0x00, 0x01, 0x04, 0x37]));
    }

    #[test]
    fn test_decode_utf32_be() {
        let bytes = utf32be("A𐐷b");
        let mut rest: &Str<Utf32BE> = Str::from_bytes(&bytes).unwrap();
        for expected in ['A', '𐐷', 'b'] {
            let (c, tail) = Utf32BE::decode_char(rest);
            assert_eq!(c, expected);
            rest = tail;
        }
    }
}