cesu8/lib.rs
1//! A library for converting between CESU-8 and UTF-8.
2//!
//! > A Unicode code point from the [Basic Multilingual Plane][bmp] (BMP), i.e.
//! > a code point in the range U+0000 to U+FFFF, is encoded in the same way as
//! > in UTF-8.
6//!
7//! [bmp]: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
8//!
//! If [`encode`] or [`decode`] only encounters data that is both valid CESU-8
//! and valid UTF-8, the `cesu8` crate leverages this using a
//! [clone-on-write smart pointer][cow] ([`Cow`]). This means that no
//! unnecessary work is performed and no memory is needlessly allocated.
13//!
14//! [cow]: https://en.wikipedia.org/wiki/Copy-on-write
15//!
16//! # Examples
17//!
18//! Basic usage:
19//!
20//! ```rust
21//! # extern crate alloc;
22//! use alloc::borrow::Cow;
23//!
24//! # fn main() -> Result<(), cesu8::Error> {
25//! let str = "Hello, world!";
26//! assert_eq!(cesu8::encode(str), Cow::Borrowed(str.as_bytes()));
27//! assert_eq!(cesu8::decode(str.as_bytes())?, Cow::Borrowed(str));
28//! # Ok(())
29//! # }
30//! ```
31//!
32//! When data needs to be encoded or decoded, it functions as one might expect:
33//!
34//! ```
35//! # extern crate alloc;
36//! # use alloc::borrow::Cow;
37//! # fn main() -> Result<(), cesu8::Error> {
38//! let str = "\u{10400}";
39//! let cesu8_data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80];
40//! assert_eq!(cesu8::decode(cesu8_data)?, Cow::<str>::Owned(str.to_string()));
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! # Features
46//!
47//! - `std` implements [`std::error::Error`] on [`Error`]. By default this
48//! feature is enabled.
49
50#![cfg_attr(not(feature = "std"), no_std)]
51#![cfg_attr(doc_cfg, feature(doc_cfg))]
52#![deny(clippy::pedantic)]
53#![allow(clippy::cast_lossless, clippy::cast_possible_truncation)]
54
55extern crate alloc;
56
57use alloc::{borrow::Cow, str::from_utf8, string::String, vec::Vec};
58use core::fmt;
59
60/// Converts a slice of bytes to a string slice.
61///
62/// First, if the slice of bytes is already valid UTF-8, this function is
63/// functionally no different than [`std::str::from_utf8`](std::str::from_utf8);
64/// this means that `decode` does not need to perform any further operations
65/// and doesn't need to allocate additional memory.
66///
67/// If the slice of bytes is not valid UTF-8, `decode` works on the assumption
68/// that the slice of bytes, if not valid UTF-8, is valid CESU-8. It will then
69/// decode the bytes given to it and return the newly constructed string slice.
70///
71/// # Errors
72///
73/// Returns [`cesu8::Error`](Error) if the input is invalid CESU-8 data.
74///
75/// # Examples
76///
77/// Basic usage:
78///
79/// ```
80/// # extern crate alloc;
81/// use alloc::borrow::Cow;
82///
83/// # fn main() -> Result<(), cesu8::Error> {
84/// let str = "Hello, world!";
85/// // Since 'str' is valid UTF-8 and CESU-8 data, 'cesu8::decode' can decode
86/// // the string slice without allocating memory.
87/// assert_eq!(cesu8::decode(str.as_bytes())?, Cow::Borrowed(str));
88///
89/// let str = "\u{10400}";
90/// let cesu8_data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80];
91/// // 'cesu8_data' is a byte slice containing a 6-byte surrogate pair which
92/// // becomes a 4-byte UTF-8 character.
93/// assert_eq!(cesu8::decode(cesu8_data)?, Cow::<str>::Owned(str.to_string()));
94/// # Ok(())
95/// # }
96/// ```
97#[inline]
98pub fn decode(bytes: &[u8]) -> Result<Cow<str>, Error> {
99 from_utf8(bytes)
100 .map(Cow::Borrowed)
101 .or_else(|_| decode_cesu8(bytes).map(Cow::Owned))
102}
103
/// Slow path of [`decode`]: decodes `bytes` as CESU-8 into a freshly allocated
/// [`String`].
///
/// Sequences of one to three bytes are validated and copied through
/// unchanged. A six-byte surrogate pair is re-encoded as the four-byte UTF-8
/// sequence of the code point it represents.
///
/// # Errors
///
/// Returns [`Error`] if the input is truncated, has an invalid lead byte, a
/// missing continuation byte, an overlong three-byte form, a four-byte
/// sequence, or an unpaired surrogate.
#[inline(never)]
#[cold]
#[allow(clippy::unnested_or_patterns)] // this hurts readability otherwise
fn decode_cesu8(bytes: &[u8]) -> Result<String, Error> {
    // The decoded output is never longer than the input: pass-through
    // sequences keep their length and six-byte pairs shrink to four bytes.
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut iter = bytes.iter();

    // Bails out of `decode_cesu8` with the crate's only error value.
    macro_rules! err {
        () => {{
            return Err(Error);
        }};
    }

    // Yields the next input byte, or fails if the input ends mid-sequence.
    macro_rules! next {
        () => {
            match iter.next() {
                Some(&byte) => byte,
                None => err!(),
            }
        };
    }

    // Yields the next input byte, failing unless it is a continuation byte.
    macro_rules! next_continuation {
        () => {{
            let byte = next!();
            if is_continuation_byte(byte) {
                byte
            } else {
                err!();
            }
        }};
    }

    while let Some(&first) = iter.next() {
        if first <= MAX_ASCII_CODE_POINT {
            // ASCII is identical in both encodings.
            decoded.push(first);
        } else {
            // Rejects bytes that cannot start a sequence (stray continuation
            // bytes and lead bytes `utf8_char_width` does not recognise).
            let width = match utf8_char_width(first) {
                Some(v) => v,
                None => err!(),
            };
            // Every multi-byte sequence has at least one continuation byte.
            let second = next_continuation!();
            match width {
                2 => decoded.extend_from_slice(&[first, second]),
                3 => {
                    let third = next_continuation!();
                    match (first, second) {
                        // Three-byte forms that are valid in UTF-8 as well;
                        // the ranges exclude overlong forms and surrogates.
                        (0xE0, 0xA0..=0xBF)
                        | (0xE1..=0xEC, 0x80..=0xBF)
                        | (0xED, 0x80..=0x9F)
                        | (0xEE..=0xEF, 0x80..=0xBF) => {
                            decoded.extend_from_slice(&[first, second, third]);
                        }
                        // High surrogate (U+D800..=U+DBFF): it must be
                        // followed immediately by an encoded low surrogate.
                        (0xED, 0xA0..=0xAF) => {
                            let fourth = next!();
                            if fourth != 0xED {
                                err!();
                            }
                            let fifth = next_continuation!();
                            // Low surrogates (U+DC00..=U+DFFF) have a second
                            // byte in 0xB0..=0xBF.
                            if !(0xB0..=0xBF).contains(&fifth) {
                                err!();
                            }
                            let sixth = next_continuation!();
                            decoded.extend_from_slice(&decode_surrogate_pair(
                                second, third, fifth, sixth,
                            ));
                        }
                        // Overlong forms and low surrogates without a
                        // preceding high surrogate.
                        _ => err!(),
                    }
                }
                // Four-byte sequences are not accepted on this path.
                _ => err!(),
            }
        }
    }

    debug_assert!(from_utf8(&decoded).is_ok());
    // SAFETY: only ASCII bytes, range-checked two- and three-byte sequences,
    // and four-byte sequences built by `decode_surrogate_pair` from a
    // validated surrogate pair were pushed, so `decoded` is valid UTF-8.
    Ok(unsafe { String::from_utf8_unchecked(decoded) })
}
182
183#[inline]
184fn decode_surrogate_pair(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
185 let surrogate1 = decode_surrogate(second, third);
186 let surrogate2 = decode_surrogate(fifth, sixth);
187 let code_point = 0x10000 + ((surrogate1 - 0xD800) << 10 | (surrogate2 - 0xDC00));
188 decode_code_point(code_point)
189}
190
/// Rebuilds a UTF-16 surrogate from the two continuation bytes of its
/// three-byte encoding.
///
/// The lead byte of an encoded surrogate is always `0xED`, which contributes
/// the fixed `0xD000` prefix. Each continuation byte supplies six payload bits.
#[inline]
fn decode_surrogate(second: u8, third: u8) -> u32 {
    const PAYLOAD_MASK: u32 = 0b0011_1111;
    let upper = (u32::from(second) & PAYLOAD_MASK) << 6;
    let lower = u32::from(third) & PAYLOAD_MASK;
    0xD000 | upper | lower
}
196
197#[inline]
198fn decode_code_point(code_point: u32) -> [u8; 4] {
199 const STRT_TAG: u8 = 0b1111_0000;
200 [
201 STRT_TAG | ((code_point & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
202 CONT_TAG | ((code_point & 0b0_0011_1111_0000_0000_0000) >> 12) as u8,
203 CONT_TAG | ((code_point & 0b0_0000_0000_1111_1100_0000) >> 6) as u8,
204 CONT_TAG | ((code_point & 0b0_0000_0000_0000_0011_1111) as u8),
205 ]
206}
207
208/// Converts a string slice to CESU-8 bytes.
209///
210/// If the string slice's representation in CESU-8 would be identical to its
211/// present UTF-8 representation, this function is functionally no different
212/// than [`(&str).as_bytes()`](str::as_bytes); this means that `encode` does
213/// not need to perform any further operations and doesn't need to allocate any
214/// additional memory.
215///
216/// If the string slice's representation in UTF-8 is not equivalent in CESU-8,
217/// `encode` encodes the string slice to its CESU-8 representation as a slice
218/// of bytes.
219///
220/// # Examples
221///
222/// Basic usage:
223///
224/// ```
225/// # extern crate alloc;
226/// use alloc::borrow::Cow;
227///
228/// let str = "Hello, world!";
229/// // Since 'str' is valid UTF-8 and CESU-8 data, 'to_cesu8' can encode
230/// // data without allocating memory.
231/// assert_eq!(cesu8::encode(str), Cow::Borrowed(str.as_bytes()));
232///
233/// let utf8_data = "\u{10401}";
234/// let cesu8_data = vec![0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
235/// // 'utf8_data' is a 4-byte UTF-8 representation, which becomes a 6-byte
236/// // CESU-8 representation.
237/// assert_eq!(cesu8::encode(utf8_data), Cow::<[u8]>::Owned(cesu8_data));
238/// ```
239#[must_use]
240#[inline]
241pub fn encode(str: &str) -> Cow<[u8]> {
242 if is_valid(str) {
243 Cow::Borrowed(str.as_bytes())
244 } else {
245 Cow::Owned(encode_cesu8(str))
246 }
247}
248
249#[must_use]
250#[inline(never)]
251#[cold]
252fn encode_cesu8(str: &str) -> Vec<u8> {
253 let bytes = str.as_bytes();
254 let capacity = len(str);
255 let mut encoded = Vec::with_capacity(capacity);
256 let mut index = 0;
257
258 while index < bytes.len() {
259 let byte = bytes[index];
260 if byte <= MAX_ASCII_CODE_POINT {
261 encoded.push(byte);
262 index += 1;
263 } else {
264 let width = utf8_char_width(byte).unwrap();
265 let slice_range = index..index + width;
266 if width <= CESU8_MAX_CHAR_WIDTH {
267 encoded.extend(&bytes[slice_range]);
268 } else {
269 let str = &str[slice_range];
270 let code_point = str.chars().next().unwrap() as u32;
271 let surrogate_pair = to_surrogate_pair(code_point);
272 let encoded_pair = encode_surrogate_pair(surrogate_pair);
273 encoded.extend(&encoded_pair);
274 }
275 index += width;
276 }
277 }
278
279 encoded
280}
281
282#[inline]
283fn encode_surrogate_pair(surrogate_pair: [u16; 2]) -> [u8; 6] {
284 let [b1, b2, b3] = encode_surrogate(surrogate_pair[0]);
285 let [b4, b5, b6] = encode_surrogate(surrogate_pair[1]);
286 [b1, b2, b3, b4, b5, b6]
287}
288
289#[inline]
290fn encode_surrogate(surrogate: u16) -> [u8; 3] {
291 const STRT_TAG: u8 = 0b1110_0000;
292 [
293 STRT_TAG | ((surrogate & 0b1111_0000_0000_0000) >> 12) as u8,
294 CONT_TAG | ((surrogate & 0b0000_1111_1100_0000) >> 6) as u8,
295 CONT_TAG | ((surrogate & 0b0000_0000_0011_1111) as u8),
296 ]
297}
298
/// Splits a supplementary code point (above U+FFFF) into its UTF-16
/// `[high, low]` surrogate pair.
///
/// After subtracting `0x10000`, the upper ten bits select the high surrogate
/// and the lower ten bits select the low surrogate.
#[inline]
fn to_surrogate_pair(code_point: u32) -> [u16; 2] {
    const TEN_BITS: u32 = 0x3FF;
    let offset = code_point - 0x10000;
    let high = 0xD800 | (offset >> 10) as u16;
    let low = 0xDC00 | (offset & TEN_BITS) as u16;
    [high, low]
}
306
/// Returns how many bytes in CESU-8 are required to encode a string slice.
///
/// Every code point up to U+FFFF takes the same number of bytes as in UTF-8.
/// Every code point above U+FFFF takes six bytes (a surrogate pair) instead
/// of four.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// // Any codepoint below or equal to U+FFFF is the same length as it is in
/// // UTF-8.
/// assert_eq!(cesu8::len("\u{FFFF}"), 3);
///
/// // Any codepoint above U+FFFF is stored as a surrogate pair.
/// assert_eq!(cesu8::len("\u{10000}"), 6);
/// ```
#[must_use]
pub fn len(str: &str) -> usize {
    // Extra bytes a surrogate pair (2 x 3 bytes) needs compared to the
    // four-byte UTF-8 form of the same code point.
    const EXTRA_BYTES_PER_PAIR: usize = 2;

    // Counting characters relies only on `str`'s own UTF-8 guarantee, so no
    // `unsafe` or manual byte-width bookkeeping is required.
    let supplementary = str.chars().filter(|&ch| ch > '\u{FFFF}').count();
    str.len() + EXTRA_BYTES_PER_PAIR * supplementary
}
344
/// Returns `true` if a string slice contains UTF-8 data that is also valid
/// CESU-8.
///
/// This is primarily used to test whether a string slice needs to be
/// explicitly encoded using [`encode`]. If `is_valid()` returns `true`,
/// [`&str.as_bytes()`](str::as_bytes) is directly equivalent to the string
/// slice's CESU-8 representation. If it returns `false`, the string slice
/// contains at least one code point above U+FFFF, which CESU-8 stores as a
/// surrogate pair.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// // Any code point below or equal to U+FFFF encoded in UTF-8 IS valid CESU-8.
/// assert!(cesu8::is_valid("Hello, world!"));
/// assert!(cesu8::is_valid("\u{FFFF}"));
///
/// // Any code point above U+FFFF encoded in UTF-8 IS NOT valid CESU-8.
/// assert!(!cesu8::is_valid("\u{10000}"));
/// ```
#[must_use]
pub fn is_valid(str: &str) -> bool {
    // A `&str` is always well-formed UTF-8, so the only thing that can make it
    // differ from CESU-8 is a character wider than three bytes.
    str.chars()
        .all(|ch| ch.len_utf8() <= CESU8_MAX_CHAR_WIDTH)
}

/// Widest UTF-8 sequence, in bytes, that is encoded identically in CESU-8.
const CESU8_MAX_CHAR_WIDTH: usize = 3;
383
/// Returns `true` if `byte` has the `10xx_xxxx` bit pattern of a UTF-8
/// continuation byte.
#[inline]
fn is_continuation_byte(byte: u8) -> bool {
    // Continuation bytes span from the bare tag up to the tag with all six
    // payload bits set.
    matches!(byte, CONT_TAG..=0b1011_1111)
}

/// Tag bits (`10`) that mark a continuation byte.
const CONT_TAG: u8 = 0b1000_0000;
391
/// Returns the length in bytes of the UTF-8 sequence introduced by `byte`, or
/// `None` if `byte` cannot start a sequence (continuation bytes, `0xC0`,
/// `0xC1` and anything above `0xF4`).
fn utf8_char_width(byte: u8) -> Option<usize> {
    let width = if byte <= MAX_ASCII_CODE_POINT {
        1
    } else if (0xC2..=0xDF).contains(&byte) {
        2
    } else if (0xE0..=0xEF).contains(&byte) {
        3
    } else if (0xF0..=0xF4).contains(&byte) {
        4
    } else {
        return None;
    };
    Some(width)
}

/// Largest byte value that encodes a single ASCII character.
const MAX_ASCII_CODE_POINT: u8 = 0x7F;
403
/// The error returned by [`decode`] when the input is invalid CESU-8 data.
///
/// This type carries no details: it only signals that decoding failed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Error;

impl fmt::Display for Error {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        const MESSAGE: &str = "invalid CESU-8 data";
        f.write_str(MESSAGE)
    }
}
417
// Only compiled with the `std` feature: lets `Error` be used wherever the
// standard library's error trait is required. The impl is empty, so the
// trait's default methods apply.
#[cfg(feature = "std")]
#[cfg_attr(doc_cfg, doc(cfg(feature = "std")))]
impl std::error::Error for Error {}