cesu8/
lib.rs

1//! A library for converting between CESU-8 and UTF-8.
2//!
3//! > Unicode code points from the [Basic Multilingual Plane][bmp] (BMP), i.e. a
4//! > code point in the range U+0000 to U+FFFF is encoded in the same way as
5//! > UTF-8.
6//!
7//! [bmp]: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
8//!
//! If [`encode`] or [`decode`] only encounters data that is both valid CESU-8
//! and valid UTF-8, the `cesu8` crate takes advantage of this by returning a
//! [clone-on-write smart pointer][cow] ([`Cow`]) that borrows the input. This
//! means that no unnecessary work is done and no memory is allocated.
13//!
14//! [cow]: https://en.wikipedia.org/wiki/Copy-on-write
15//!
16//! # Examples
17//!
18//! Basic usage:
19//!
20//! ```rust
21//! # extern crate alloc;
22//! use alloc::borrow::Cow;
23//!
24//! # fn main() -> Result<(), cesu8::Error> {
25//! let str = "Hello, world!";
26//! assert_eq!(cesu8::encode(str), Cow::Borrowed(str.as_bytes()));
27//! assert_eq!(cesu8::decode(str.as_bytes())?, Cow::Borrowed(str));
28//! # Ok(())
29//! # }
30//! ```
31//!
32//! When data needs to be encoded or decoded, it functions as one might expect:
33//!
34//! ```
35//! # extern crate alloc;
36//! # use alloc::borrow::Cow;
37//! # fn main() -> Result<(), cesu8::Error> {
38//! let str = "\u{10400}";
39//! let cesu8_data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80];
40//! assert_eq!(cesu8::decode(cesu8_data)?, Cow::<str>::Owned(str.to_string()));
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! # Features
46//!
//! - `std` implements [`std::error::Error`] on [`Error`]. By default this
//!   feature is enabled.
49
50#![cfg_attr(not(feature = "std"), no_std)]
51#![cfg_attr(doc_cfg, feature(doc_cfg))]
52#![deny(clippy::pedantic)]
53#![allow(clippy::cast_lossless, clippy::cast_possible_truncation)]
54
55extern crate alloc;
56
57use alloc::{borrow::Cow, str::from_utf8, string::String, vec::Vec};
58use core::fmt;
59
60/// Converts a slice of bytes to a string slice.
61///
62/// First, if the slice of bytes is already valid UTF-8, this function is
63/// functionally no different than [`std::str::from_utf8`](std::str::from_utf8);
64/// this means that `decode` does not need to perform any further operations
65/// and doesn't need to allocate additional memory.
66///
67/// If the slice of bytes is not valid UTF-8, `decode` works on the assumption
68/// that the slice of bytes, if not valid UTF-8, is valid CESU-8. It will then
69/// decode the bytes given to it and return the newly constructed string slice.
70///
71/// # Errors
72///
73/// Returns [`cesu8::Error`](Error) if the input is invalid CESU-8 data.
74///
75/// # Examples
76///
77/// Basic usage:
78///
79/// ```
80/// # extern crate alloc;
81/// use alloc::borrow::Cow;
82///
83/// # fn main() -> Result<(), cesu8::Error> {
84/// let str = "Hello, world!";
85/// // Since 'str' is valid UTF-8 and CESU-8 data, 'cesu8::decode' can decode
86/// // the string slice without allocating memory.
87/// assert_eq!(cesu8::decode(str.as_bytes())?, Cow::Borrowed(str));
88///
89/// let str = "\u{10400}";
90/// let cesu8_data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80];
91/// // 'cesu8_data' is a byte slice containing a 6-byte surrogate pair which
92/// // becomes a 4-byte UTF-8 character.
93/// assert_eq!(cesu8::decode(cesu8_data)?, Cow::<str>::Owned(str.to_string()));
94/// # Ok(())
95/// # }
96/// ```
97#[inline]
98pub fn decode(bytes: &[u8]) -> Result<Cow<str>, Error> {
99    from_utf8(bytes)
100        .map(Cow::Borrowed)
101        .or_else(|_| decode_cesu8(bytes).map(Cow::Owned))
102}
103
104#[inline(never)]
105#[cold]
106#[allow(clippy::unnested_or_patterns)] // this hurts readability otherwise
107fn decode_cesu8(bytes: &[u8]) -> Result<String, Error> {
108    let mut decoded = Vec::with_capacity(bytes.len());
109    let mut iter = bytes.iter();
110
111    macro_rules! err {
112        () => {{
113            return Err(Error);
114        }};
115    }
116
117    macro_rules! next {
118        () => {
119            match iter.next() {
120                Some(&byte) => byte,
121                None => err!(),
122            }
123        };
124    }
125
126    macro_rules! next_continuation {
127        () => {{
128            let byte = next!();
129            if is_continuation_byte(byte) {
130                byte
131            } else {
132                err!();
133            }
134        }};
135    }
136
137    while let Some(&first) = iter.next() {
138        if first <= MAX_ASCII_CODE_POINT {
139            decoded.push(first);
140        } else {
141            let width = match utf8_char_width(first) {
142                Some(v) => v,
143                None => err!(),
144            };
145            let second = next_continuation!();
146            match width {
147                2 => decoded.extend_from_slice(&[first, second]),
148                3 => {
149                    let third = next_continuation!();
150                    match (first, second) {
151                        (0xE0, 0xA0..=0xBF)
152                        | (0xE1..=0xEC, 0x80..=0xBF)
153                        | (0xED, 0x80..=0x9F)
154                        | (0xEE..=0xEF, 0x80..=0xBF) => {
155                            decoded.extend_from_slice(&[first, second, third]);
156                        }
157                        (0xED, 0xA0..=0xAF) => {
158                            let fourth = next!();
159                            if fourth != 0xED {
160                                err!();
161                            }
162                            let fifth = next_continuation!();
163                            if !(0xB0..=0xBF).contains(&fifth) {
164                                err!();
165                            }
166                            let sixth = next_continuation!();
167                            decoded.extend_from_slice(&decode_surrogate_pair(
168                                second, third, fifth, sixth,
169                            ));
170                        }
171                        _ => err!(),
172                    }
173                }
174                _ => err!(),
175            }
176        }
177    }
178
179    debug_assert!(from_utf8(&decoded).is_ok());
180    Ok(unsafe { String::from_utf8_unchecked(decoded) })
181}
182
183#[inline]
184fn decode_surrogate_pair(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
185    let surrogate1 = decode_surrogate(second, third);
186    let surrogate2 = decode_surrogate(fifth, sixth);
187    let code_point = 0x10000 + ((surrogate1 - 0xD800) << 10 | (surrogate2 - 0xDC00));
188    decode_code_point(code_point)
189}
190
/// Rebuilds a UTF-16 surrogate from the second and third byte of its
/// three-byte CESU-8 encoding.
///
/// The leading byte of every surrogate is `0xED`, which contributes the fixed
/// top nibble `0xD`; each of the two remaining bytes contributes its low six
/// bits.
#[inline]
fn decode_surrogate(second: u8, third: u8) -> u32 {
    const PAYLOAD_MASK: u32 = 0b0011_1111;
    let upper = (u32::from(second) & PAYLOAD_MASK) << 6;
    let lower = u32::from(third) & PAYLOAD_MASK;
    0xD000 | upper | lower
}
196
197#[inline]
198fn decode_code_point(code_point: u32) -> [u8; 4] {
199    const STRT_TAG: u8 = 0b1111_0000;
200    [
201        STRT_TAG | ((code_point & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
202        CONT_TAG | ((code_point & 0b0_0011_1111_0000_0000_0000) >> 12) as u8,
203        CONT_TAG | ((code_point & 0b0_0000_0000_1111_1100_0000) >> 6) as u8,
204        CONT_TAG | ((code_point & 0b0_0000_0000_0000_0011_1111) as u8),
205    ]
206}
207
208/// Converts a string slice to CESU-8 bytes.
209///
210/// If the string slice's representation in CESU-8 would be identical to its
211/// present UTF-8 representation, this function is functionally no different
212/// than [`(&str).as_bytes()`](str::as_bytes); this means that `encode` does
213/// not need to perform any further operations and doesn't need to allocate any
214/// additional memory.
215///
216/// If the string slice's representation in UTF-8 is not equivalent in CESU-8,
217/// `encode` encodes the string slice to its CESU-8 representation as a slice
218/// of bytes.
219///
220/// # Examples
221///
222/// Basic usage:
223///
224/// ```
225/// # extern crate alloc;
226/// use alloc::borrow::Cow;
227///
228/// let str = "Hello, world!";
229/// // Since 'str' is valid UTF-8 and CESU-8 data, 'to_cesu8' can encode
230/// // data without allocating memory.
231/// assert_eq!(cesu8::encode(str), Cow::Borrowed(str.as_bytes()));
232///
233/// let utf8_data = "\u{10401}";
234/// let cesu8_data = vec![0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
235/// // 'utf8_data' is a 4-byte UTF-8 representation, which becomes a 6-byte
236/// // CESU-8 representation.
237/// assert_eq!(cesu8::encode(utf8_data), Cow::<[u8]>::Owned(cesu8_data));
238/// ```
239#[must_use]
240#[inline]
241pub fn encode(str: &str) -> Cow<[u8]> {
242    if is_valid(str) {
243        Cow::Borrowed(str.as_bytes())
244    } else {
245        Cow::Owned(encode_cesu8(str))
246    }
247}
248
249#[must_use]
250#[inline(never)]
251#[cold]
252fn encode_cesu8(str: &str) -> Vec<u8> {
253    let bytes = str.as_bytes();
254    let capacity = len(str);
255    let mut encoded = Vec::with_capacity(capacity);
256    let mut index = 0;
257
258    while index < bytes.len() {
259        let byte = bytes[index];
260        if byte <= MAX_ASCII_CODE_POINT {
261            encoded.push(byte);
262            index += 1;
263        } else {
264            let width = utf8_char_width(byte).unwrap();
265            let slice_range = index..index + width;
266            if width <= CESU8_MAX_CHAR_WIDTH {
267                encoded.extend(&bytes[slice_range]);
268            } else {
269                let str = &str[slice_range];
270                let code_point = str.chars().next().unwrap() as u32;
271                let surrogate_pair = to_surrogate_pair(code_point);
272                let encoded_pair = encode_surrogate_pair(surrogate_pair);
273                encoded.extend(&encoded_pair);
274            }
275            index += width;
276        }
277    }
278
279    encoded
280}
281
282#[inline]
283fn encode_surrogate_pair(surrogate_pair: [u16; 2]) -> [u8; 6] {
284    let [b1, b2, b3] = encode_surrogate(surrogate_pair[0]);
285    let [b4, b5, b6] = encode_surrogate(surrogate_pair[1]);
286    [b1, b2, b3, b4, b5, b6]
287}
288
289#[inline]
290fn encode_surrogate(surrogate: u16) -> [u8; 3] {
291    const STRT_TAG: u8 = 0b1110_0000;
292    [
293        STRT_TAG | ((surrogate & 0b1111_0000_0000_0000) >> 12) as u8,
294        CONT_TAG | ((surrogate & 0b0000_1111_1100_0000) >> 6) as u8,
295        CONT_TAG | ((surrogate & 0b0000_0000_0011_1111) as u8),
296    ]
297}
298
/// Splits a supplementary code point (U+10000 and above) into its UTF-16
/// surrogate pair, returned as `[high, low]`.
///
/// The code point is first reduced by `0x10000`; the upper bits of that
/// offset go into the high surrogate and the lower ten bits into the low one.
#[inline]
fn to_surrogate_pair(code_point: u32) -> [u16; 2] {
    const HIGH_BASE: u16 = 0xD800;
    const LOW_BASE: u16 = 0xDC00;
    const LOW_MASK: u32 = 0x3FF;

    let offset = code_point - 0x10000;
    [
        HIGH_BASE | (offset >> 10) as u16,
        LOW_BASE | (offset & LOW_MASK) as u16,
    ]
}
306
307/// Returns how many bytes in CESU-8 are required to encode a string slice.
308///
309/// # Examples
310///
311/// Basic usage:
312///
313/// ```
314/// // Any codepoint below or equal to U+FFFF is the same length as it is in
315/// // UTF-8.
316/// assert_eq!(cesu8::len("\u{FFFF}"), 3);
317///
318/// // Any codepoint above U+FFFF is stored as a surrogate pair.
319/// assert_eq!(cesu8::len("\u{10000}"), 6);
320/// ```
321#[must_use]
322pub fn len(str: &str) -> usize {
323    let bytes = str.as_bytes();
324    let mut len = 0;
325    let mut index = 0;
326    while index < bytes.len() {
327        let byte = bytes[index];
328        if byte <= MAX_ASCII_CODE_POINT {
329            len += 1;
330            index += 1;
331        } else {
332            // SAFETY: Valid UTF-8 will never yield a `None` value:
333            let width = unsafe { utf8_char_width(byte).unwrap_unchecked() };
334            len += if width <= CESU8_MAX_CHAR_WIDTH {
335                width
336            } else {
337                6
338            };
339            index += width;
340        }
341    }
342    len
343}
344
345/// Returns `true` if a string slice contains UTF-8 data that is also valid
346/// CESU-8.
347///
348/// This is primarily used in testing if a string slice needs to be explicitly
349/// encoded using [`encode`]. If `is_valid()` returns `false`, it implies that
350/// [`&str.as_bytes()`](str::as_bytes) is directly  equivalent to the string
351/// slice's CESU-8 representation.
352///
353/// # Examples
354///
355/// Basic usage:
356///
357/// ```
358/// // Any code point below or equal to U+FFFF encoded in UTF-8 IS valid CESU-8.
359/// assert!(cesu8::is_valid("Hello, world!"));
360/// assert!(cesu8::is_valid("\u{FFFF}"));
361///
362/// // Any code point above U+FFFF encoded in UTF-8 IS NOT valid CESU-8.
363/// assert!(!cesu8::is_valid("\u{10000}"));
364/// ```
365#[must_use]
366pub fn is_valid(str: &str) -> bool {
367    for byte in str.bytes() {
368        if is_continuation_byte(byte) {
369            continue;
370        }
371        if let Some(width) = utf8_char_width(byte) {
372            if width > CESU8_MAX_CHAR_WIDTH {
373                return false;
374            }
375        } else {
376            return false;
377        }
378    }
379    true
380}
381
/// Widest UTF-8 sequence, in bytes, that is stored unchanged in CESU-8.
///
/// Sequences wider than this are the ones the encoder rewrites as surrogate
/// pairs.
const CESU8_MAX_CHAR_WIDTH: usize = 3;
383
384#[inline]
385fn is_continuation_byte(byte: u8) -> bool {
386    const TAG_MASK: u8 = 0b1100_0000;
387    byte & TAG_MASK == CONT_TAG
388}
389
/// Tag carried in the two most significant bits of a continuation byte
/// (`0b10xx_xxxx`); the remaining six bits hold payload.
const CONT_TAG: u8 = 0b1000_0000;
391
392fn utf8_char_width(byte: u8) -> Option<usize> {
393    match byte {
394        0x00..=MAX_ASCII_CODE_POINT => Some(1),
395        0xC2..=0xDF => Some(2),
396        0xE0..=0xEF => Some(3),
397        0xF0..=0xF4 => Some(4),
398        _ => None,
399    }
400}
401
/// Largest byte value that is a complete single-byte (ASCII) sequence; such
/// bytes are identical in UTF-8 and CESU-8.
const MAX_ASCII_CODE_POINT: u8 = 0x7F;
403
/// The error returned by [`decode`] when the input is invalid CESU-8 data.
///
/// This is a unit type: it reports only that decoding failed and carries no
/// further detail about the failure.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Error;
410
411impl fmt::Display for Error {
412    #[inline]
413    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
414        f.write_str("invalid CESU-8 data")
415    }
416}
417
// Implemented only when the `std` feature is enabled, because the crate is
// built as `no_std` otherwise. The `doc(cfg(..))` attribute is applied only
// when building with `--cfg doc_cfg`, so rustdoc can show the feature
// requirement.
#[cfg(feature = "std")]
#[cfg_attr(doc_cfg, doc(cfg(feature = "std")))]
impl std::error::Error for Error {}