cesu8_str/
lib.rs

1//! A library implementing the [CESU-8 compatibility encoding scheme](https://www.unicode.org/reports/tr26/tr26-4.html).
2//! This is a non-standard variant of UTF-8 that is used internally by some
3//! systems that need to represent UTF-16 data as 8-bit characters.
4//!
//! The use of this encoding is discouraged by the Unicode Consortium. It's OK
//! for working with existing APIs, but it should not be used for data
//! transmission or storage.
8//!
9//! ### Java and U+0000
10//!
11//! Java uses the CESU-8 encoding as described above, but with one difference:
12//! the null character U+0000 is represented as an overlong UTF-8 sequence `C0
13//! 80`. This is supported by [`JavaStr`] and [`JavaString`].
14//!
15//! [`JavaStr`]: java::JavaStr
16//! [`JavaString`]: java::JavaString
17//!
18//! ### Surrogate pairs and UTF-8
19//!
20//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code points
21//! in the range from U+10000 to U+10FFFF. These are 16-bit numbers in the range
22//! 0xD800 to 0xDFFF.
23//!
//! CESU-8 encodes these surrogate pairs as a 6-byte sequence consisting of two
//! sets of three bytes.
26//!
27//! # Crate features
28//!
//! **Alloc** - Enables all allocation related features. This will allow usage
//! of `Cesu8String` and `JavaString`, which offer a similar API to the
//! standard library's `String`.
32#![no_std]
33
34#[cfg(feature = "alloc")]
35extern crate alloc;
36
37pub mod cesu8;
38pub mod java;
39
40mod index;
41mod internal;
42
43use core::num::NonZeroU8;
44
45#[cfg(feature = "alloc")]
46use alloc::borrow::Cow;
47#[cfg(feature = "alloc")]
48use alloc::string::String;
49
/// The error reported when a sequence of [`u8`] cannot be interpreted as a
/// CESU-8 (or Java CESU-8) string.
///
/// The `from_slice` functions of both [`Cesu8Str`] and [`JavaStr`] report
/// failures through this type.
///
/// [`Cesu8Str`]: cesu8::Cesu8Str
/// [`JavaStr`]: java::JavaStr
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EncodingError {
    // Length of the offending sequence; `None` means the input ended early.
    error_len: Option<NonZeroU8>,
    // Byte offset at which the first undecodable sequence begins.
    valid_up_to: usize,
}

impl EncodingError {
    /// Returns the byte index up to which the input was verified as valid
    /// CESU-8 or Java CESU-8.
    ///
    /// Every byte before this index belongs to a successfully validated
    /// character, so `from_slice` of either [`Cesu8Str`] or [`JavaStr`] on
    /// `&input[..index]` would return `Ok(_)`.
    ///
    /// [`Cesu8Str`]: cesu8::Cesu8Str
    /// [`JavaStr`]: java::JavaStr
    #[inline]
    #[must_use]
    pub fn valid_up_to(&self) -> usize {
        let Self { valid_up_to, .. } = *self;
        valid_up_to
    }

    /// Gives further detail about why validation failed:
    /// * `None`: the input ended in the middle of a character. The index
    ///   returned by `self.valid_up_to()` is 1 to 5 bytes from the end of the
    ///   input. When a byte stream (such as a file or network socket) is
    ///   decoded incrementally, this may simply be a valid character whose
    ///   byte sequence spans multiple chunks.
    /// * `Some(len)`: an unexpected byte was encountered. `len` is the length
    ///   of the invalid byte sequence that starts at the index given by
    ///   `valid_up_to()`.
    #[inline]
    #[must_use]
    pub fn error_len(&self) -> Option<NonZeroU8> {
        let Self { error_len, .. } = *self;
        error_len
    }
}
94
95impl core::fmt::Display for EncodingError {
96    #[inline]
97    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
98        if let Some(len) = self.error_len {
99            write!(
100                f,
101                "invalid cesu-8 sequence of {} bytes from index {}",
102                len, self.valid_up_to
103            )
104        } else {
105            write!(
106                f,
107                "incomplete utf-8 byte sequence from index {}",
108                self.valid_up_to
109            )
110        }
111    }
112}
113
/// A possible error value when converting a byte vector into a `Cesu8String`
/// or a `JavaString`.
///
/// This type is the error type for the [`from_cesu8`] and [`from_java_cesu8`]
/// methods on [`Cesu8String`] and [`JavaString`]. It keeps ownership of the
/// rejected input so that no reallocation is needed: the [`into_bytes`]
/// method hands back the byte vector that was used in the conversion attempt.
///
/// [`from_cesu8`]: cesu8::Cesu8String::from_cesu8
/// [`from_java_cesu8`]: java::JavaString::from_java_cesu8
/// [`Cesu8String`]: cesu8::Cesu8String
/// [`JavaString`]: java::JavaString
/// [`into_bytes`]: FromVecError::into_bytes
///
/// The [`EncodingError`] type represents an error that may occur when
/// converting a slice of [`u8`]s to either a [`&Cesu8Str`] or a [`&JavaStr`].
/// In this sense it is the borrowed analogue of `FromVecError`, and one can be
/// obtained from a `FromVecError` through the [`encoding_error`] method.
///
/// [`&Cesu8Str`]: cesu8::Cesu8Str
/// [`&JavaStr`]: java::JavaStr
/// [`encoding_error`]: FromVecError::encoding_error
#[cfg(feature = "alloc")]
#[derive(Debug, PartialEq, Eq)]
pub struct FromVecError {
    // The input that failed validation, kept so the caller can recover it.
    bytes: alloc::vec::Vec<u8>,
    // Where and why validation failed.
    error: EncodingError,
}

#[cfg(feature = "alloc")]
impl FromVecError {
    /// Borrows the bytes whose conversion to either a `Cesu8String` or a
    /// `JavaString` was attempted.
    #[inline]
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        self.bytes.as_slice()
    }

    /// Returns the bytes whose conversion to either a `Cesu8String` or a
    /// `JavaString` was attempted.
    ///
    /// The error is consumed and the stored vector is moved out, so no copy
    /// of the bytes is made.
    #[inline]
    #[must_use]
    pub fn into_bytes(self) -> alloc::vec::Vec<u8> {
        let Self { bytes, .. } = self;
        bytes
    }

    /// Fetches the [`EncodingError`] describing the conversion failure.
    ///
    /// [`EncodingError`] is the error produced when validating a slice of
    /// [`u8`]s; see its documentation for details on the information it
    /// carries.
    #[inline]
    #[must_use]
    pub const fn encoding_error(&self) -> EncodingError {
        self.error
    }
}
184
/// Displays the same message as the wrapped [`EncodingError`].
// `FromVecError` only exists when the `alloc` feature is enabled, so this impl
// must carry the same gate; otherwise the crate does not build without `alloc`.
#[cfg(feature = "alloc")]
impl core::fmt::Display for FromVecError {
    #[inline]
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Display::fmt(&self.error, f)
    }
}
191
/// Converts bytes in CESU-8 format into UTF-8 format.
///
/// The input is scanned for the only sequences that differ between the two
/// encodings: 6-byte surrogate pairs and, when `JAVA` is `true`, the overlong
/// NUL encoding `C0 80`. If none are found the input bytes are reinterpreted
/// and returned as `Cow::Borrowed`; otherwise a new `String` is built and
/// returned as `Cow::Owned`.
///
/// The unchecked operations below rely on `str` holding valid (Java) CESU-8 —
/// presumably guaranteed by `InternalStr`'s constructors, which are not
/// visible from this function.
#[cfg(feature = "alloc")]
#[inline]
fn from_cesu8<const JAVA: bool>(str: &internal::InternalStr) -> Cow<'_, str> {
    let mut index = 0;
    // Start of the not-yet-copied run of bytes that are identical in UTF-8.
    let mut last_index = 0;
    // Lazily allocated output; stays `None` while the input is plain UTF-8.
    let mut string = None;

    let v = str.as_bytes();
    while let Some(&byte) = v.get(index) {
        // 0xED is the lead byte of every surrogate half (and of U+D000..U+D7FF).
        if byte == 0b1110_1101 {
            // SAFETY: in valid CESU-8 a 0xED lead byte is always followed by
            // at least two more bytes.
            let second = unsafe { *v.get(index + 1).unwrap_unchecked() };
            if second & 0b1111_0000 == 0b1010_0000 {
                // High surrogate: the next six bytes encode one supplementary
                // character.
                let string = string.get_or_insert_with(String::new);
                // SAFETY: the copied run consists of whole characters that are
                // encoded identically in UTF-8.
                unsafe { string.as_mut_vec().extend_from_slice(&v[last_index..index]) };

                let mut iter = v[index..].iter();
                // SAFETY: `iter` starts at a character boundary of valid
                // CESU-8 and is non-empty.
                let code_point = unsafe { next_code_point(&mut iter).unwrap_unchecked() };

                // SAFETY: a decoded surrogate pair is a scalar value in
                // U+10000..=U+10FFFF.
                string.push(unsafe { char::from_u32_unchecked(code_point) });

                index += 6;
                last_index = index;
            } else {
                // Ordinary three-byte character; already valid UTF-8.
                index += 3;
            }
        } else if JAVA && byte == 0xC0 && matches!(v.get(index + 1), Some(0x80)) {
            // Java's overlong NUL: replace `C0 80` with a single 0x00.
            let string = string.get_or_insert_with(String::new);
            // SAFETY: see the surrogate branch above.
            unsafe { string.as_mut_vec().extend_from_slice(&v[last_index..index]) };

            string.push('\0');

            index += 2;
            last_index = index;
        } else {
            // Always make progress. Previously a 0xC0 byte that was not
            // followed by 0x80 left `index` untouched and looped forever.
            index += 1;
        }
    }

    if let Some(mut string) = string {
        // Flush the trailing run of unchanged bytes.
        // SAFETY: see the surrogate branch above.
        unsafe { string.as_mut_vec().extend_from_slice(&v[last_index..]) };
        Cow::Owned(string)
    } else {
        // SAFETY: no surrogate pair or overlong NUL was found, so the bytes
        // are already valid UTF-8.
        Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(v) })
    }
}
242
/// Converts bytes in UTF-8 format into CESU-8 format.
///
/// Only 4-byte UTF-8 characters and, when `JAVA` is `true`, NUL bytes need to
/// be re-encoded. If the input contains neither, its bytes are reused and
/// returned as `Cow::Borrowed`; otherwise an owned string is built.
#[cfg(feature = "alloc")]
#[inline]
fn from_utf8<const JAVA: bool>(str: &str) -> Cow<'_, internal::InternalStr> {
    let src = str.as_bytes();
    // Position of the byte currently being inspected.
    let mut cursor = 0;
    // Everything before this offset has already been copied to `out`.
    let mut flushed = 0;
    // Allocated on the first character that needs re-encoding.
    let mut out = None;

    while cursor < src.len() {
        let lead = src[cursor];

        // Number of input bytes consumed by a character that was re-encoded.
        let consumed = if lead >> 3 == 0b1_1110 {
            // Lead byte of a 4-byte UTF-8 character (supplementary plane).
            let buf =
                out.get_or_insert_with(|| internal::InternalString::with_capacity(cursor + 6));

            // SAFETY: `cursor` sits on the lead byte of a character inside a
            // valid `&str`, so the tail is valid UTF-8 and non-empty.
            let ch = unsafe {
                core::str::from_utf8_unchecked(&src[cursor..])
                    .chars()
                    .next()
                    .unwrap_unchecked()
            };

            let mut scratch = [0u8; 6];
            let encoded = encode_cesu8_raw::<JAVA>(ch as u32, &mut scratch);

            unsafe {
                let vec = buf.as_mut_vec();
                vec.extend_from_slice(&src[flushed..cursor]);
                // Append the character as a CESU-8 surrogate pair.
                vec.extend_from_slice(encoded);
            }

            4
        } else if JAVA && lead == 0 {
            let buf =
                out.get_or_insert_with(|| internal::InternalString::with_capacity(cursor + 2));

            unsafe {
                let vec = buf.as_mut_vec();
                vec.extend_from_slice(&src[flushed..cursor]);
                // Java CESU-8 spells NUL as the overlong pair `C0 80`.
                vec.extend_from_slice(&[0xC0, 0x80]);
            }

            1
        } else {
            // Encoded identically in both formats; copied later in bulk.
            cursor += 1;
            continue;
        };

        cursor += consumed;
        flushed = cursor;
    }

    match out {
        Some(mut owned) => {
            // Copy whatever unchanged bytes follow the last rewritten character.
            unsafe { owned.as_mut_vec().extend_from_slice(&src[flushed..cursor]) };
            Cow::Owned(owned)
        }
        None => Cow::Borrowed(unsafe { internal::InternalStr::from_unchecked(src) }),
    }
}
297
298/// Checks whether a slice of bytes contains valid CESU-8 data. When passed
299/// `check_java`, additionally ensure that the string conforms to the Java
300/// String specification.
301#[inline]
302const fn validate_cesu8_internal<const CHECK_JAVA: bool>(v: &[u8]) -> Result<(), EncodingError> {
303    const OVERLONG: [u32; 4] = [0x00, 0x80, 0x800, 0x10000];
304
305    let mut index = 0;
306    let len = v.len();
307
308    while index < len {
309        macro_rules! err {
310            ($error_len:expr) => {
311                return Err(EncodingError {
312                    error_len: NonZeroU8::new($error_len),
313                    valid_up_to: index,
314                })
315            };
316        }
317
318        // Check if the character is multi-byte.
319        let first = v[index];
320        let (len, code_point) = if first < 128 {
321            // 1-byte characters - always ascii
322
323            (1, first as u32)
324        } else if first & 0b1110_0000 == 0b1100_0000 {
325            // 2-byte characters
326            if index + 1 >= len {
327                err!(0);
328            }
329            let second = v[index + 1];
330            if second & 0b1100_0000 != 0b1000_0000 {
331                err!(2);
332            }
333
334            (2, ((first as u32 & 0x1F) << 6) | (second as u32 & 0x3F))
335        } else if first & 0b1111_0000 == 0b1110_0000 {
336            // 3-byte characters
337            if index + 2 >= len {
338                err!(0);
339            }
340
341            let second = v[index + 1];
342            let third = v[index + 2];
343            // This is safe, even though the three-byte encoding seems like it supports
344            // values overlapping this range. This is because any value that would end up in
345            // this range and yet be encoded in three-bytes is an unpaired supplementary
346            // character, which is not a valid Unicode character.
347            if !(first == 0b1110_1101 && second & 0b1111_0000 == 0b1010_0000) {
348                // No surrogate pair
349                if second & 0b1100_0000 != 0b1000_0000 {
350                    err!(2);
351                }
352                if third & 0b1100_0000 != 0b1000_0000 {
353                    err!(3);
354                }
355
356                (
357                    3,
358                    ((first as u32 & 0x0F) << 12)
359                        | ((second as u32 & 0x3F) << 6)
360                        | (third as u32 & 0x3F),
361                )
362            } else {
363                // Surrogate pair
364                if index + 5 >= len {
365                    err!(0);
366                }
367                let fourth = v[index + 3];
368                let fifth = v[index + 4];
369                let sixth = v[index + 5];
370
371                if second & 0b1111_0000 != 0b1010_0000 {
372                    err!(2);
373                }
374                if third & 0b1100_0000 != 0b1000_0000 {
375                    err!(3);
376                }
377
378                if fourth != 0b1110_1101 {
379                    err!(4);
380                }
381                if fifth & 0b1111_0000 != 0b1011_0000 {
382                    err!(5);
383                }
384                if sixth & 0b1100_0000 != 0b1000_0000 {
385                    err!(6);
386                }
387
388                (
389                    6,
390                    0x10000
391                        + (((second as u32 & 0x0F) << 16)
392                            | ((third as u32 & 0x3F) << 10)
393                            | ((fifth as u32 & 0x0F) << 6)
394                            | (sixth as u32 & 0x3F)),
395                )
396            }
397        } else {
398            err!(1);
399        };
400
401        if code_point > 0x10FFFF {
402            err!(len as u8);
403        }
404
405        let idx = if len != 6 { len - 1 } else { 3 };
406
407        // Check for overlong encoding, and if validating Java CESU-8, exclude
408        let overlong = if CHECK_JAVA && code_point == 0x00 {
409            len != 2
410        } else {
411            code_point < OVERLONG[idx]
412        };
413
414        let surrogate = (code_point >> 11) == 0x1B;
415        if overlong || surrogate {
416            err!(len as u8);
417        }
418
419        index += len;
420    }
421
422    Ok(())
423}
424
/// Decodes the next code point from a forward byte iterator that yields a
/// CESU-8-like encoding.
///
/// Works for both standard CESU-8 and Java CESU-8: the decoder does not care
/// which value a sequence encodes, and Java CESU-8 differs only in how the NUL
/// character is spelled.
///
/// Returns `None` only when the iterator is already exhausted.
///
/// # Safety
///
/// The byte iterator passed in must provide CESU-8.
#[allow(clippy::cast_lossless)]
#[inline]
unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
    let lead = *bytes.next()?;

    let code_point = match lead {
        // One byte.
        0x00..=0x7F => lead as u32,
        // Two bytes: `110x_xxxx 10xx_xxxx`.
        _ if lead >> 5 == 0b110 => {
            let b1 = *bytes.next().unwrap_unchecked();
            ((lead as u32 & 0x1F) << 6) | (b1 as u32 & 0x3F)
        }
        // Three bytes, or a six-byte surrogate pair.
        _ => {
            let b1 = *bytes.next().unwrap_unchecked();
            let b2 = *bytes.next().unwrap_unchecked();

            // `ED A0..AF` can only open a surrogate pair: the same value in a
            // lone three-byte sequence would be an unpaired surrogate, which
            // valid input never contains.
            let starts_pair = lead == 0b1110_1101 && b1 >> 4 == 0b1010;

            if starts_pair {
                // The low surrogate's lead byte carries no payload bits.
                let _ = *bytes.next().unwrap_unchecked();
                let b4 = *bytes.next().unwrap_unchecked();
                let b5 = *bytes.next().unwrap_unchecked();

                0x10000
                    + (((b1 as u32 & 0x0F) << 16)
                        | ((b2 as u32 & 0x3F) << 10)
                        | ((b4 as u32 & 0x0F) << 6)
                        | (b5 as u32 & 0x3F))
            } else {
                ((lead as u32 & 0x0F) << 12) | ((b1 as u32 & 0x3F) << 6) | (b2 as u32 & 0x3F)
            }
        }
    };

    Some(code_point)
}
478
/// Decodes the code point that ends at the back of a byte iterator yielding a
/// CESU-8-like encoding, consuming its bytes from the back.
///
/// Works for both standard CESU-8 and Java CESU-8: the decoder does not care
/// which value a sequence encodes, and Java CESU-8 differs only in how the NUL
/// character is spelled.
///
/// Returns `None` only when the iterator is already exhausted.
///
/// # Safety
///
/// The byte iterator passed in must provide CESU-8.
#[allow(clippy::cast_lossless)]
#[inline]
unsafe fn next_code_point_reverse<'a, I: DoubleEndedIterator<Item = &'a u8>>(
    bytes: &mut I,
) -> Option<u32> {
    let last = *bytes.next_back()?;
    if last & 0b1000_0000 == 0 {
        // One byte.
        return Some(last as u32);
    }

    // `last` is a continuation byte; it always supplies the low six bits.
    let low_bits = last as u32 & 0x3F;

    let prev = *bytes.next_back().unwrap_unchecked();
    if prev >> 5 == 0b110 {
        // `prev` is the lead byte of a two-byte character.
        return Some(((prev as u32 & 0x1F) << 6) | low_bits);
    }

    let lead = *bytes.next_back().unwrap_unchecked();

    // `ED B0..BF xx` is a low surrogate, i.e. the tail of a six-byte pair.
    let ends_pair = lead == 0b1110_1101 && prev >> 4 == 0b1011;
    if !ends_pair {
        // Ordinary three-byte character.
        return Some(((lead as u32 & 0x0F) << 12) | ((prev as u32 & 0x3F) << 6) | low_bits);
    }

    // Walk back over the high surrogate: `ED A0..AF xx`.
    let high_low = *bytes.next_back().unwrap_unchecked();
    let high_mid = *bytes.next_back().unwrap_unchecked();
    // The high surrogate's lead byte carries no payload bits.
    let _ = *bytes.next_back().unwrap_unchecked();

    Some(
        0x10000
            + (((high_mid as u32 & 0x0F) << 16)
                | ((high_low as u32 & 0x3F) << 10)
                | ((prev as u32 & 0x0F) << 6)
                | low_bits),
    )
}
531
/// Returns how many bytes the code point `code` occupies when encoded as
/// CESU-8.
///
/// With `JAVA` set, U+0000 counts as two bytes because Java CESU-8 writes it
/// as `C0 80`. Everything at or above U+10000 is a six-byte surrogate pair.
#[inline]
#[must_use]
pub(crate) const fn len_cesu8<const JAVA: bool>(code: u32) -> usize {
    match code {
        0 if JAVA => 2,
        0..=0x7F => 1,
        0x80..=0x7FF => 2,
        0x800..=0xFFFF => 3,
        _ => 6,
    }
}
546
547/// Encodes a raw u32 value as CESU-8 into the provided byte buffer, then
548/// returns the subslice of the buffer that contains the encoded character.
549#[inline]
550pub(crate) fn encode_cesu8_raw<const JAVA: bool>(code: u32, dst: &mut [u8]) -> &mut [u8] {
551    let len = len_cesu8::<JAVA>(code);
552    match (len, &mut dst[..]) {
553        (1, [a, ..]) => *a = code as u8,
554        (2, [a, b, ..]) => {
555            *a = 0b1100_0000 | (code >> 6 & 0x1F) as u8;
556            *b = 0b1000_0000 | (code & 0x3F) as u8;
557        }
558        (3, [a, b, c, ..]) => {
559            *a = 0b1110_0000 | (code >> 12 & 0x0F) as u8;
560            *b = 0b1000_0000 | (code >> 6 & 0x3F) as u8;
561            *c = 0b1000_0000 | (code & 0x3F) as u8;
562        }
563        (6, [a, b, c, d, e, f, ..]) => {
564            *a = 0b1110_1101;
565            *b = 0b1010_0000 | ((code - 0x1_0000) >> 16 & 0x0F) as u8;
566            *c = 0b1000_0000 | (code >> 10 & 0x3F) as u8;
567            *d = 0b1110_1101;
568            *e = 0b1011_0000 | (code >> 6 & 0x0F) as u8;
569            *f = 0b1000_0000 | (code & 0x3F) as u8;
570        }
571        _ => panic!(
572            "encode_cesu8: need {len} bytes to encode U+{code:X}, but the buffer has {}",
573            dst.len()
574        ),
575    };
576    &mut dst[..len]
577}
578
/// Calculates the number of bytes required to encode `str` in CESU-8.
///
/// Every 4-byte UTF-8 character grows to 6 bytes. When `JAVA` is `true`, each
/// NUL byte grows to 2 bytes. All other bytes are counted one for one.
pub(crate) const fn required_len<const JAVA: bool>(str: &str) -> usize {
    let bytes = str.as_bytes();
    let mut total = 0;
    let mut pos = 0;

    while pos < bytes.len() {
        // (input bytes consumed, output bytes produced)
        let (consumed, produced) = match bytes[pos] {
            // Lead byte of a 4-byte UTF-8 character: `1111_0xxx`.
            0xF0..=0xF7 => (4, 6),
            0 if JAVA => (1, 2),
            _ => (1, 1),
        };
        total += produced;
        pos += consumed;
    }

    total
}
601
/// Creates a buffer of CESU-8 encoded bytes from `str`.
///
/// 4-byte UTF-8 characters are re-encoded as 6-byte surrogate pairs. When
/// `JAVA` is `true`, NUL is written as `C0 80`. All other bytes are copied
/// unchanged.
///
/// `N` is the size of the output buffer. Any bytes beyond the encoded data
/// remain zero.
///
/// # Panics
///
/// Panics (or fails const evaluation) with an out-of-bounds index if `N` is
/// smaller than the encoded length of `str`.
pub(crate) const fn create_array<const JAVA: bool, const N: usize>(str: &str) -> [u8; N] {
    let mut buf = [0; N];

    // `i` is the read position in the UTF-8 input, `j` the write position in
    // the CESU-8 output. They diverge as soon as a character changes length.
    let mut j = 0;
    let mut i = 0;
    let v = str.as_bytes();
    while i < v.len() {
        let first = v[i];
        if first & 0b1111_1000 == 0b1111_0000 {
            // Decode the 4-byte UTF-8 sequence. The payload bits already form
            // the full code point, so no offset is added here.
            let code = ((v[i] as u32 & 0b0000_0111) << 18)
                | ((v[i + 1] as u32 & 0b0011_1111) << 12)
                | ((v[i + 2] as u32 & 0b0011_1111) << 6)
                | (v[i + 3] as u32 & 0b0011_1111);

            // Emit the surrogate pair at the output cursor `j`.
            buf[j] = 0b1110_1101;
            buf[j + 1] = 0b1010_0000 | ((code - 0x1_0000) >> 16 & 0x0F) as u8;
            buf[j + 2] = 0b1000_0000 | (code >> 10 & 0x3F) as u8;
            buf[j + 3] = 0b1110_1101;
            buf[j + 4] = 0b1011_0000 | (code >> 6 & 0x0F) as u8;
            buf[j + 5] = 0b1000_0000 | (code & 0x3F) as u8;
            j += 6;
            i += 4;
        } else if JAVA && first == 0 {
            // Java CESU-8 spells NUL as the overlong pair `C0 80`.
            buf[j] = 0xC0;
            buf[j + 1] = 0x80;
            j += 2;
            i += 1;
        } else {
            buf[j] = v[i];
            j += 1;
            i += 1;
        }
    }

    buf
}