cesu8str/
lib.rs

1#![feature(portable_simd)]
2#![feature(const_ptr_read)]
3#![feature(const_trait_impl)]
4#![feature(const_deref)]
5#![feature(assert_matches)]
6#![feature(round_char_boundary)]
7
8#![allow(clippy::let_unit_value)]
9#![allow(clippy::unit_arg)]
10// #![warn(missing_docs)]
11// Copyright 2012-2022 The Rust Project Developers and Eric Kidd and Christopher Moore.  See the
12// COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
13//
14// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
15// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
16// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
17// option. This file may not be copied, modified, or distributed except
18// according to those terms.
19
20//! A simple library implementing the [CESU-8 compatibility encoding
21//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html).  This is a
22//! non-standard variant of UTF-8 that is used internally by some systems
23//! that need to represent UTF-16 data as 8-bit characters.  Yes, this is
24//! ugly.
25//!
26//! Use of this encoding is discouraged by the Unicode Consortium.  It's OK
27//! for working with existing internal APIs, but it should not be used for
28//! transmitting or storing data.
29//!
30//! ```
31//! use std::borrow::Cow;
32//! use cesu8str::Cesu8Str;
33//!
34//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
35//! const TEST_STRING: &str = "aé日";
36//! const TEST_UTF8: &[u8] = TEST_STRING.as_bytes();
37//! assert_eq!(TEST_UTF8, Cesu8Str::from_utf8(TEST_STRING).as_bytes());
38//! let cesu_from_bytes = Cesu8Str::try_from_bytes(TEST_UTF8).unwrap();
39//! assert_eq!(TEST_UTF8, cesu_from_bytes.as_bytes());
40//!
41//! // This string is CESU-8 data containing a 6-byte surrogate pair,
42//! // which decodes to a 4-byte UTF-8 string.
43//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
44//! assert_eq!("\u{10401}", Cesu8Str::try_from_bytes(data).unwrap().to_str());
45//! ```
46//!
47//! ### A note about security
48//!
//! While this library tries its best to check for malformed input and
//! fail on it, this is a legacy data format that should only be used for
//! interacting with legacy libraries. CESU-8 is intended as an
//! internal-only format; malformed data should be assumed to be either
//! improperly encoded (a bug) or the work of an attacker.
54//!
55//! ### Java and U+0000, and other variants
56//!
57//! Java uses the CESU-8 encoding as described above, but with one
58//! difference: The null character U+0000 is represented as an overlong
59//! UTF-8 sequence `C0 80`. This is supported by the `Cesu8Str::from_cesu8(bytes, Variant::Java)` and
60//! `java_variant_str.as_bytes()` methods.
61//!
62//! ### Surrogate pairs and UTF-8
63//!
//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
//! points in the range from U+10000 to U+10FFFF.  Each half of a pair (a
//! "surrogate") is a 16-bit number in the range 0xD800 to 0xDFFF.
67//!
68//! * 0xD800 to 0xDBFF: First half of surrogate pair.  When encoded as
69//!   CESU-8, these become **1110**1101 **10**100000 **10**000000 to
70//!   **1110**1101 **10**101111 **10**111111.
71//!
72//! * 0xDC00 to 0xDFFF: Second half of surrogate pair.  These become
73//!   **1110**1101 **10**110000 **10**000000 to
74//!   **1110**1101 **10**111111 **10**111111.
75//!
76//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
77//! code point to UTF-16 conversion process:
78//!
79//! > Consider the encoding of U+10437 (𐐷):
80//! >
81//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
82//! >   0011 0111.
83//! > * Split this into the high 10-bit value and the low 10-bit value:
84//! >   0000000001 and 0000110111.
85//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
86//! >   0x0001 = 0xD801.
87//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
88//! >   0x0037 = 0xDC37.
89
90#![warn(missing_docs)]
91
92mod decoding;
93mod encoding;
94mod legacy_api;
95mod string;
96mod string_impls;
97#[rustfmt::skip]
98mod unicode;
99
100/// # Next-generation strings
101/// 
102/// | `encode_nul` | `nul_terminated` | `mode` | `Output` |
103/// | - | - | - |- |
104/// | `false` | `false` | `borrowed` | `Cesu8Str` |
105/// | `false` | `false` | `owned` | `Cesu8String` |
106/// | `false` | `true` | `borrowed` | n/a |
107/// | `false` | `true` | `owned` | n/a |
108/// | `true` | `false` | `borrowed` | `Mutf8Str` |
109/// | `true` | `false` | `owned` | `Mutf8String` |
110/// | `true` | `true` | `borrowed` | `Mutf8CStr` |
111/// | `true` | `true` | `owned` | `Mutf8CString` |
112/// 
113mod ngstr;
114
/// A prelude including most relevant structs for the crate, including re-exports of some stdlib helpers
/// such as `Cow`, `CStr`, `CString`, and `Deref`.
///
/// Everything exposed here is a glob re-export of `crate::ngstr::prelude`.
pub mod prelude {
    pub use crate::ngstr::prelude::*;
}
120
121pub use ngstr::{
122    NGCesu8CError,
123    cesu8str::Cesu8Str,
124    cesu8string::Cesu8String,
125    mutf8str::Mutf8Str,
126    mutf8string::Mutf8String,
127    mutf8cstr::Mutf8CStr,
128    mutf8cstring::Mutf8CString,
129    mutf8cstr::FromStrWithNulError,
130    mutf8cstring::FromMutf8BytesWithNulError,
131
132    TryFromUtf8Error,
133    FromBytesWithNulError,
134};
135
136pub use crate::decoding::Cesu8Error;
137pub use crate::legacy_api::*;
138#[allow(deprecated)]
139pub use crate::string::Cesu8Str as LegacyCesu8Str;
140pub(crate) use crate::string::default_cesu8_capacity;
141
142#[cfg(test)]
143mod tests;
144
/// Which variant of the encoding are we working with?
///
/// The two variants differ only in how the nul character (U+0000) is
/// represented; see [`Variant::encodes_nul`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Variant {
    /// Regular CESU-8, with '\0' represented as itself.
    Standard,

    /// Java's "Modified UTF-8": like CESU-8, except that '\0' is written as
    /// the overlong two-byte sequence `C0 80` rather than as a single zero
    /// byte.
    Java,
}
156
157impl Variant {
158    /// Returns true if this Variant of CESU-8 converts nul-bytes to a `&[0xC0, 0x80]` sequence.
159    ///
160    /// This should only be true for the Java variant of CESU-8, also known as Modified UTF-8.
161    pub const fn encodes_nul(&self) -> bool {
162        match self {
163            Variant::Standard => false,
164            Variant::Java => true,
165        }
166    }
167}
168
169// Currently using a const generic bool for specializing functions for each variant
170// once const_generics is stabalized, we can use the variant directly
171// Creations of Cesu8Str should use this impl, then this impl can be removed once
172// const_generics is stabalized and we can adjust things properly
173#[doc(hidden)]
174impl From<bool> for Variant {
175    fn from(b: bool) -> Variant {
176        match b {
177            false => Variant::Standard,
178            true => Variant::Java,
179        }
180    }
181}
182
/// Views `by` as a `&str` borrowing from the same bytes.
///
/// When `debug_assertions` or the custom `validate_release` cfg is enabled,
/// the bytes are validated first.
/// In every other build the validation is skipped entirely.
///
/// # Panics
///
/// In validating builds, panics with `expect_msg` if `by` is not valid UTF-8.
/// Because of `#[track_caller]`, the panic is attributed to the caller.
#[inline]
#[track_caller]
pub(crate) fn from_utf8_slice<'s>(by: &'s [u8], expect_msg: &'_ str) -> &'s str {
    let validate = cfg!(any(debug_assertions, validate_release));
    if !validate {
        // SAFETY: nothing is checked on this path; it relies on every caller
        // passing bytes that are already valid UTF-8.
        // NOTE(review): this function is not `unsafe fn`, so that contract is
        // only enforced by convention — confirm at the call sites.
        return unsafe { std::str::from_utf8_unchecked(by) };
    }

    std::str::from_utf8(by).expect(expect_msg)
}
192
/// Converts the byte vector `by` into a `String`, taking ownership of it.
///
/// When `debug_assertions` or the custom `validate_release` cfg is enabled,
/// the bytes are validated first.
/// In every other build the validation is skipped entirely.
///
/// # Panics
///
/// In validating builds, panics with `expect_msg` if `by` is not valid UTF-8.
/// Because of `#[track_caller]`, the panic is attributed to the caller.
#[inline]
#[track_caller]
pub(crate) fn from_utf8_vec(by: Vec<u8>, expect_msg: &str) -> String {
    let validate = cfg!(any(debug_assertions, validate_release));
    if !validate {
        // SAFETY: nothing is checked on this path; it relies on every caller
        // passing bytes that are already valid UTF-8.
        // NOTE(review): this function is not `unsafe fn`, so that contract is
        // only enforced by convention — confirm at the call sites.
        return unsafe { String::from_utf8_unchecked(by) };
    }

    String::from_utf8(by).expect(expect_msg)
}