mutf8/
lib.rs

1//! A library for converting between MUTF-8 and UTF-8.
2//!
3//! MUTF-8 is the same as CESU-8 except for its handling of embedded null
4//! characters. This library builds on top of the `residua-cesu8` crate found
5//! [here][residua-cesu8].
6//!
7//! [residua-cesu8]: https://github.com/residua/cesu8
8//!
9//! # Examples
10//!
11//! Basic usage
12//!
13//! ```
14//! # extern crate alloc;
15//! use alloc::borrow::Cow;
16//!
17//! let str = "Hello, world!";
//! // Non-null characters from the Basic Multilingual Plane are encoded the
//! // same way in UTF-8 and MUTF-8:
19//! assert_eq!(mutf8::encode(str), Cow::Borrowed(str.as_bytes()));
20//! assert_eq!(mutf8::decode(str.as_bytes()), Ok(Cow::Borrowed(str)));
21//!
22//! let str = "\u{10401}";
23//! let mutf8_data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
24//! // 'mutf8_data' is a byte slice containing a 6-byte surrogate pair which
25//! // becomes a 4-byte UTF-8 character.
26//! assert_eq!(mutf8::decode(mutf8_data), Ok(Cow::Owned(str.to_string())));
27//!
28//! let str = "\0";
29//! let mutf8_data = vec![0xC0, 0x80];
30//! // 'str' is a null character which becomes a two-byte MUTF-8 representation.
31//! assert_eq!(mutf8::encode(str), Cow::<[u8]>::Owned(mutf8_data));
32//! ```
33//! # Features
34//!
35//! - `std` implements `std::error::Error` on `Error`. By default, this feature
36//!   is enabled.
37
38#![cfg_attr(not(feature = "std"), no_std)]
39#![cfg_attr(doc_cfg, feature(doc_cfg))]
40#![deny(clippy::pedantic)]
41
42extern crate alloc;
43
44use alloc::{borrow::Cow, str::from_utf8, string::String, vec::Vec};
45use core::fmt;
46
47/// Converts a slice of bytes to a string slice.
48///
49/// First, if the slice of bytes is already valid UTF-8, this function is
50/// functionally no different than [`std::str::from_utf8`]; this means that
51/// `decode()` does not need to perform any further operations and doesn't need
52/// to allocate additional memory.
53///
54/// If the slice of bytes is not valid UTF-8, `decode()` works on the assumption
55/// that the slice of bytes, if not valid UTF-8, is valid MUTF-8. It will then
56/// decode the bytes given to it and return the newly constructed string slice.
57///
58/// If the slice of bytes is found not to be valid MUTF-8 data, `decode()`
59/// returns `Err(Error)` to signify that an error has occurred.
60///
61/// # Errors
62///
63/// Returns [`Error`] if the input is invalid MUTF-8 data.
64///
65/// # Examples
66///
67/// ```
68/// # extern crate alloc;
69/// use alloc::borrow::Cow;
70///
71/// let str = "Hello, world!";
72/// // Since 'str' contains valid UTF-8 and MUTF-8 data, 'from_mutf8' can
73/// // decode the string slice without allocating memory.
74/// assert_eq!(mutf8::decode(str.as_bytes()), Ok(Cow::Borrowed(str)));
75///
76/// let str = "\u{10401}";
77/// let mutf8_data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
78/// // 'mutf8_data' is a byte slice containing a 6-byte surrogate pair which
79/// // becomes the 4-byte UTF-8 character 'str'.
80/// assert_eq!(mutf8::decode(mutf8_data), Ok(Cow::Owned(str.to_string())));
81///
82/// let str = "\0";
83/// let mutf8_data = &[0xC0, 0x80];
84/// // 'mutf8_data' is a byte slice containing MUTF-8 data containing a null
85/// // code point which becomes a null character.
86/// assert_eq!(mutf8::decode(mutf8_data), Ok(Cow::Owned(str.to_string())));
87/// ```
88#[inline]
89pub fn decode(bytes: &[u8]) -> Result<Cow<str>, Error> {
90    from_utf8(bytes)
91        .map(Cow::Borrowed)
92        .or_else(|_| decode_mutf8(bytes).map(Cow::Owned))
93}
94
95#[inline(never)]
96#[cold]
97fn decode_mutf8(bytes: &[u8]) -> Result<String, Error> {
98    macro_rules! err {
99        () => {{
100            return Err(Error);
101        }};
102    }
103
104    let mut decoded = Vec::with_capacity(bytes.len());
105    let mut iter = bytes.iter();
106
107    while let Some(&byte) = iter.next() {
108        let value = if byte == NULL_PAIR[0] {
109            match iter.next() {
110                Some(&byte) => {
111                    if byte != NULL_PAIR[1] {
112                        err!()
113                    }
114                }
115                _ => err!(),
116            }
117            NULL_CODE_POINT
118        } else {
119            byte
120        };
121        decoded.push(value);
122    }
123
124    cesu8::decode(&decoded)
125        .map(Cow::into_owned)
126        .map_err(From::from)
127}
128
129/// Converts a string slice to MUTF-8 bytes.
130///
131/// If the string slice's representation in MUTF-8 would be identical to its
132/// present UTF-8 representation, this function is functionally no different
133/// than `(&str).as_bytes()`; this means that `encode()` does not need to
134/// perform any further operations and doesn't need to allocate any additional
135/// memory.
136///
137/// If the string slice's representation in UTF-8 is not equivalent in MUTF-8,
138/// `encode()` encodes the string slice to its MUTF-8 representation as a slice
139/// of bytes.
140///
141/// # Examples
142///
143/// ```
144/// # extern crate alloc;
145/// use alloc::borrow::Cow;
146///
147/// let str = "Hello, world!";
148/// // Since 'str' contains valid UTF-8 and MUTF-8 data, 'to_mutf8' can
149/// // encode data without allocating memory.
150/// assert_eq!(mutf8::encode(str), Cow::Borrowed(str.as_bytes()));
151///
152/// let str = "\u{10401}";
153/// let mutf8_data = vec![0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
154/// // 'str' is a 4-byte UTF-8 character, which becomes the 6-byte MUTF-8
155/// // surrogate pair 'mutf8_data'.
156/// assert_eq!(mutf8::encode(str), Cow::<[u8]>::Owned(mutf8_data));
157///
158/// let str = "\0";
159/// let mutf8_data = vec![0xC0, 0x80];
160/// // 'str' is a null character which becomes a two byte representation in
161/// // MUTF-8.
162/// assert_eq!(mutf8::encode(str), Cow::<[u8]>::Owned(mutf8_data));
163/// ```
164#[must_use]
165#[inline]
166pub fn encode(s: &str) -> Cow<[u8]> {
167    if is_valid(s) {
168        Cow::Borrowed(s.as_bytes())
169    } else {
170        Cow::Owned(encode_mutf8(s))
171    }
172}
173
174#[must_use]
175#[inline(never)]
176#[cold]
177fn encode_mutf8(s: &str) -> Vec<u8> {
178    let mut encoded = Vec::with_capacity(len(s));
179
180    for &byte in cesu8::encode(s).iter() {
181        if byte == NULL_CODE_POINT {
182            encoded.extend_from_slice(&NULL_PAIR);
183        } else {
184            encoded.push(byte);
185        }
186    }
187
188    encoded
189}
190
/// The two bytes (`0xC0`, `0x80`) that stand for the null code point (`0x00`)
/// in MUTF-8: the `110xxxxx 10xxxxxx` bit pattern with every payload bit zero.
const NULL_PAIR: [u8; 2] = [0b1100_0000, 0b1000_0000];
193
194/// Given a string slice, this function returns how many bytes in MUTF-8 are
195/// required to encode the string slice.
196#[must_use]
197pub fn len(s: &str) -> usize {
198    let mut len = cesu8::len(s);
199    s.as_bytes().iter().for_each(|&b| {
200        if b == NULL_CODE_POINT {
201            len += 1;
202        }
203    });
204    len
205}
206
207/// Returns `true` if a string slice contains UTF-8 data that is also valid
208/// MUTF-8. This is mainly used in testing if a string slice needs to be
209/// explicitly encoded using [`encode`].
210///
211/// If `is_valid()` returns `false`, it implies that
212/// [`&str.as_bytes()`](str::as_bytes) is directly equivalent to the string
213/// slice's MUTF-8 representation.
214///
215/// # Examples
216///
217/// Basic usage:
218///
219/// ```
220/// // Code points below U+10400 encoded in UTF-8 IS valid MUTF-8.
221/// assert!(mutf8::is_valid("Hello, world!"));
222///
223/// // Any code point above U+10400 encoded in UTF-8 IS NOT valid MUTF-8.
224/// assert!(!mutf8::is_valid("\u{10400}"));
225///
226/// // The use of a null character IS NOT valid MUTF-8.
227/// assert!(!mutf8::is_valid("\0"));
228/// ```
229#[must_use]
230#[inline]
231pub fn is_valid(s: &str) -> bool {
232    !s.contains(NULL_CHAR) && cesu8::is_valid(s)
233}
234
/// The null character, used when scanning string slices for nulls.
const NULL_CHAR: char = '\u{0}';
/// The null code point as a single byte, used when scanning byte data.
const NULL_CODE_POINT: u8 = b'\0';
237
/// The error returned by [`decode`] when its input is not valid MUTF-8 data.
///
/// This type carries no detail: it only signals that decoding failed, not
/// where or why.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Error;
244
245impl fmt::Display for Error {
246    #[inline]
247    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
248        f.write_str("invalid MUTF-8 data")
249    }
250}
251
252impl From<cesu8::Error> for Error {
253    #[inline]
254    fn from(_: cesu8::Error) -> Self {
255        Error
256    }
257}
258
// `std::error::Error` is only implemented when the `std` feature is enabled;
// the `doc(cfg(..))` attribute is applied only when `doc_cfg` is set.
#[cfg_attr(doc_cfg, doc(cfg(feature = "std")))]
#[cfg(feature = "std")]
impl std::error::Error for Error {}