cesu8_str/lib.rs
1//! A library implementing the [CESU-8 compatibility encoding scheme](https://www.unicode.org/reports/tr26/tr26-4.html).
2//! This is a non-standard variant of UTF-8 that is used internally by some
3//! systems that need to represent UTF-16 data as 8-bit characters.
4//!
//! The use of this encoding is discouraged by the Unicode Consortium. It's OK
//! for working with existing APIs, but it should not be used for data
//! transmission or storage.
8//!
9//! ### Java and U+0000
10//!
11//! Java uses the CESU-8 encoding as described above, but with one difference:
12//! the null character U+0000 is represented as an overlong UTF-8 sequence `C0
13//! 80`. This is supported by [`JavaStr`] and [`JavaString`].
14//!
15//! [`JavaStr`]: java::JavaStr
16//! [`JavaString`]: java::JavaString
17//!
18//! ### Surrogate pairs and UTF-8
19//!
20//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code points
21//! in the range from U+10000 to U+10FFFF. These are 16-bit numbers in the range
22//! 0xD800 to 0xDFFF.
23//!
//! CESU-8 encodes these surrogate pairs as a 6-byte sequence consisting of two
//! sets of three bytes.
26//!
27//! # Crate features
28//!
//! **Alloc** - Enables all allocation-related features. This will allow usage
//! of `Cesu8String` and `JavaString`, which offer a similar API to the
//! standard library's `String`.
32#![no_std]
33
34#[cfg(feature = "alloc")]
35extern crate alloc;
36
37pub mod cesu8;
38pub mod java;
39
40mod index;
41mod internal;
42
43use core::num::NonZeroU8;
44
45#[cfg(feature = "alloc")]
46use alloc::borrow::Cow;
47#[cfg(feature = "alloc")]
48use alloc::string::String;
49
/// The error produced when a sequence of [`u8`] cannot be interpreted as a
/// CESU-8 or Java CESU-8 string.
///
/// The `from_slice` functions of both [`Cesu8Str`] and [`JavaStr`] report
/// failures with this type.
///
/// [`Cesu8Str`]: cesu8::Cesu8Str
/// [`JavaStr`]: java::JavaStr
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EncodingError {
    /// Length of the offending byte sequence, or `None` when the input ended
    /// before the sequence was complete.
    error_len: Option<NonZeroU8>,
    /// Index of the first byte that is not part of the valid prefix.
    valid_up_to: usize,
}

impl EncodingError {
    /// Returns the index in the given string up to which valid CESU-8 or Java
    /// CESU-8 was verified.
    ///
    /// It is the largest index such that `from_slice(&input[..index])` of
    /// either [`Cesu8Str`] or [`JavaStr`] would return `Ok(_)`.
    ///
    /// [`Cesu8Str`]: cesu8::Cesu8Str
    /// [`JavaStr`]: java::JavaStr
    #[inline]
    #[must_use]
    pub const fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Provides more information about the failure:
    /// * `None`: the end of the input was reached unexpectedly.
    ///   `self.valid_up_to()` is then within a few bytes of the end of the
    ///   input (a CESU-8 sequence is at most 6 bytes long). If a byte stream
    ///   (such as a file or network socket) is being decoded incrementally,
    ///   this could be a valid character whose CESU-8 byte sequence spans
    ///   multiple chunks.
    /// * `Some(len)`: an unexpected byte was encountered. The length provided
    ///   is that of the invalid byte sequence that starts at the index given
    ///   by `valid_up_to()`.
    #[inline]
    #[must_use]
    pub const fn error_len(&self) -> Option<NonZeroU8> {
        self.error_len
    }
}

impl core::fmt::Display for EncodingError {
    /// Writes a one-line, human-readable description of the failure: either
    /// the length and position of the invalid sequence, or the position at
    /// which the input was cut short.
    #[inline]
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self.error_len {
            Some(len) => write!(
                f,
                "invalid cesu-8 sequence of {} bytes from index {}",
                len, self.valid_up_to
            ),
            None => write!(
                f,
                "incomplete utf-8 byte sequence from index {}",
                self.valid_up_to
            ),
        }
    }
}
113
/// A possible error value when converting a CESU-8 byte vector into a
/// `Cesu8String` or a `JavaString`.
///
/// This type is the error type for [`from_cesu8`] on [`Cesu8String`] and
/// [`from_java_cesu8`] on [`JavaString`]. It is designed in such a way to
/// carefully avoid reallocations: the [`into_bytes`] method will give back the
/// byte vector that was used in the conversion attempt.
///
/// [`from_cesu8`]: cesu8::Cesu8String::from_cesu8
/// [`from_java_cesu8`]: java::JavaString::from_java_cesu8
/// [`Cesu8String`]: cesu8::Cesu8String
/// [`JavaString`]: java::JavaString
/// [`into_bytes`]: FromVecError::into_bytes
///
/// The [`EncodingError`] type represents an error that may occur when
/// converting a slice of [`u8`]s to either a [`&Cesu8Str`] or a [`&JavaStr`].
/// In this sense, it's an analogue to `FromVecError`, and you can get one
/// from a `FromVecError` through the [`encoding_error`] method.
///
/// [`&Cesu8Str`]: cesu8::Cesu8Str
/// [`&JavaStr`]: java::JavaStr
/// [`encoding_error`]: FromVecError::encoding_error
#[cfg(feature = "alloc")]
#[derive(Debug, PartialEq, Eq)]
pub struct FromVecError {
    /// The byte vector whose conversion was attempted, handed back unchanged.
    bytes: alloc::vec::Vec<u8>,
    /// Where and why validation of `bytes` failed.
    error: EncodingError,
}

#[cfg(feature = "alloc")]
impl FromVecError {
    /// Returns a slice of the [`u8`]s that were attempted to be converted to
    /// either a `Cesu8String` or a `JavaString`.
    #[inline]
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        &self.bytes
    }

    /// Returns the bytes that were attempted to be converted to either a
    /// `Cesu8String` or a `JavaString`.
    ///
    /// This method is carefully constructed to avoid allocation. It will
    /// consume the error, moving out the bytes, so that a copy of the bytes
    /// does not need to be made.
    #[inline]
    #[must_use]
    pub fn into_bytes(self) -> alloc::vec::Vec<u8> {
        self.bytes
    }

    /// Fetches an [`EncodingError`] to get more details about the conversion
    /// failure.
    ///
    /// The [`EncodingError`] type represents an error that may occur when
    /// converting a slice of [`u8`]s to a borrowed CESU-8 or Java CESU-8
    /// string; `FromVecError` is its counterpart for the owned
    /// [`Cesu8String`] and [`JavaString`] conversions. See the documentation
    /// of [`EncodingError`] for more details on using it.
    ///
    /// [`Cesu8String`]: cesu8::Cesu8String
    /// [`JavaString`]: java::JavaString
    #[inline]
    #[must_use]
    pub const fn encoding_error(&self) -> EncodingError {
        self.error
    }
}

// `FromVecError` only exists when the `alloc` feature is enabled, so every
// impl for it has to carry the same gate or the crate stops compiling without
// that feature.
#[cfg(feature = "alloc")]
impl core::fmt::Display for FromVecError {
    /// Formats the error exactly like the wrapped [`EncodingError`].
    #[inline]
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Display::fmt(&self.error, f)
    }
}
191
/// Converts a string in CESU-8 format (Java CESU-8 when `JAVA` is `true`) into
/// UTF-8.
///
/// Only two kinds of sequences are rewritten:
/// * a six-byte surrogate pair becomes the single UTF-8 encoded supplementary
///   character it stands for, and
/// * when `JAVA` is `true`, the two-byte sequence `C0 80` becomes a NUL byte.
///
/// Every other byte is copied verbatim. When the input contains neither kind
/// of sequence nothing is allocated and the input is returned as
/// [`Cow::Borrowed`].
#[cfg(feature = "alloc")]
#[inline]
fn from_cesu8<const JAVA: bool>(str: &internal::InternalStr) -> Cow<'_, str> {
    let bytes = str.as_bytes();
    // Position of the byte currently being inspected.
    let mut cursor = 0;
    // Everything before this position has already been written to `decoded`.
    let mut copied_up_to = 0;
    // Allocated lazily, the first time a sequence has to be rewritten.
    let mut decoded: Option<String> = None;

    while let Some(&byte) = bytes.get(cursor) {
        // `0xED` is the only lead byte that can start a surrogate half.
        if byte == 0b1110_1101 {
            // SAFETY: relies on `str` holding well-formed CESU-8 (presumably
            // guaranteed by `InternalStr`; its constructors are not visible
            // here), in which a three-byte lead is always followed by its
            // continuation bytes.
            let second = unsafe { *bytes.get(cursor + 1).unwrap_unchecked() };
            if second & 0b1111_0000 == 0b1010_0000 {
                // Each rewritten sequence gets shorter (6 -> 4 bytes, 2 -> 1
                // byte), so the output never exceeds the input length.
                let out = decoded.get_or_insert_with(|| String::with_capacity(bytes.len()));
                // SAFETY: the pending run contains no surrogate pair and no
                // `C0 80`, so (for well-formed input) it is already UTF-8.
                unsafe { out.as_mut_vec().extend_from_slice(&bytes[copied_up_to..cursor]) };

                let mut rest = bytes[cursor..].iter();
                // SAFETY: `rest` starts at a high surrogate, so for well-formed
                // input it yields a complete six-byte pair.
                let code_point = unsafe { next_code_point(&mut rest).unwrap_unchecked() };
                // SAFETY: a decoded surrogate pair lies in
                // `0x10000..=0x10FFFF`, which is a valid `char`.
                out.push(unsafe { char::from_u32_unchecked(code_point) });

                cursor += 6;
                copied_up_to = cursor;
            } else {
                // Ordinary three-byte character: identical in UTF-8.
                cursor += 3;
            }
        } else if JAVA && byte == 0xC0 && bytes.get(cursor + 1) == Some(&0x80) {
            let out = decoded.get_or_insert_with(|| String::with_capacity(bytes.len()));
            // SAFETY: see the surrogate branch above.
            unsafe { out.as_mut_vec().extend_from_slice(&bytes[copied_up_to..cursor]) };

            out.push('\0');

            cursor += 2;
            copied_up_to = cursor;
        } else {
            // Also reached for a `0xC0` that is not followed by `0x80`, so the
            // scan always makes progress instead of spinning on that byte.
            cursor += 1;
        }
    }

    match decoded {
        Some(mut out) => {
            // SAFETY: the remaining tail contains no rewritten sequence, so
            // (for well-formed input) it is already UTF-8.
            unsafe { out.as_mut_vec().extend_from_slice(&bytes[copied_up_to..]) };
            Cow::Owned(out)
        }
        // SAFETY: without surrogate pairs or `C0 80`, well-formed CESU-8 is
        // byte-for-byte valid UTF-8.
        None => Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(bytes) }),
    }
}
242
/// Re-encodes a UTF-8 string as CESU-8 (Java CESU-8 when `JAVA` is `true`).
///
/// Four-byte UTF-8 characters are replaced by their six-byte surrogate-pair
/// form and, when `JAVA` is `true`, NUL bytes are replaced by `C0 80`. All
/// other bytes are copied unchanged. If nothing needs replacing, the input is
/// returned as [`Cow::Borrowed`] without allocating.
#[cfg(feature = "alloc")]
#[inline]
fn from_utf8<const JAVA: bool>(str: &str) -> Cow<'_, internal::InternalStr> {
    let utf8 = str.as_bytes();
    // Byte currently being looked at.
    let mut cursor = 0;
    // Start of the run that has not been written to `converted` yet.
    let mut flushed = 0;
    // Output buffer, created on the first byte sequence that must change.
    let mut converted = None;

    while let Some(&lead) = utf8.get(cursor) {
        match lead {
            // Lead byte of a four-byte sequence, i.e. a supplementary character.
            0xF0..=0xF7 => {
                let out = converted
                    .get_or_insert_with(|| internal::InternalString::with_capacity(cursor + 6));

                // SAFETY: `cursor` sits on a lead byte of `str`, which is valid
                // UTF-8, so the tail is valid UTF-8 and non-empty. The bytes
                // appended are an unchanged UTF-8 run followed by one encoded
                // character.
                unsafe {
                    let ch = core::str::from_utf8_unchecked(&utf8[cursor..])
                        .chars()
                        .next()
                        .unwrap_unchecked();

                    let raw = out.as_mut_vec();
                    raw.extend_from_slice(&utf8[flushed..cursor]);
                    // The character itself, as a surrogate pair.
                    raw.extend_from_slice(encode_cesu8_raw::<JAVA>(ch as u32, &mut [0; 6]));
                }

                cursor += 4;
                flushed = cursor;
            }
            // Java CESU-8 never stores a raw NUL byte.
            0 if JAVA => {
                let out = converted
                    .get_or_insert_with(|| internal::InternalString::with_capacity(cursor + 2));

                // SAFETY: appends an unchanged run followed by the two-byte
                // Java encoding of NUL.
                unsafe {
                    let raw = out.as_mut_vec();
                    raw.extend_from_slice(&utf8[flushed..cursor]);
                    raw.extend_from_slice(&[0xC0, 0x80]);
                }

                cursor += 1;
                flushed = cursor;
            }
            _ => cursor += 1,
        }
    }

    match converted {
        Some(mut out) => {
            // SAFETY: the tail contains nothing that needs re-encoding.
            unsafe { out.as_mut_vec().extend_from_slice(&utf8[flushed..cursor]) };
            Cow::Owned(out)
        }
        // SAFETY: no four-byte sequence (and, for Java, no NUL) was found, so
        // the UTF-8 bytes are used as-is.
        None => Cow::Borrowed(unsafe { internal::InternalStr::from_unchecked(utf8) }),
    }
}
297
298/// Checks whether a slice of bytes contains valid CESU-8 data. When passed
299/// `check_java`, additionally ensure that the string conforms to the Java
300/// String specification.
301#[inline]
302const fn validate_cesu8_internal<const CHECK_JAVA: bool>(v: &[u8]) -> Result<(), EncodingError> {
303 const OVERLONG: [u32; 4] = [0x00, 0x80, 0x800, 0x10000];
304
305 let mut index = 0;
306 let len = v.len();
307
308 while index < len {
309 macro_rules! err {
310 ($error_len:expr) => {
311 return Err(EncodingError {
312 error_len: NonZeroU8::new($error_len),
313 valid_up_to: index,
314 })
315 };
316 }
317
318 // Check if the character is multi-byte.
319 let first = v[index];
320 let (len, code_point) = if first < 128 {
321 // 1-byte characters - always ascii
322
323 (1, first as u32)
324 } else if first & 0b1110_0000 == 0b1100_0000 {
325 // 2-byte characters
326 if index + 1 >= len {
327 err!(0);
328 }
329 let second = v[index + 1];
330 if second & 0b1100_0000 != 0b1000_0000 {
331 err!(2);
332 }
333
334 (2, ((first as u32 & 0x1F) << 6) | (second as u32 & 0x3F))
335 } else if first & 0b1111_0000 == 0b1110_0000 {
336 // 3-byte characters
337 if index + 2 >= len {
338 err!(0);
339 }
340
341 let second = v[index + 1];
342 let third = v[index + 2];
343 // This is safe, even though the three-byte encoding seems like it supports
344 // values overlapping this range. This is because any value that would end up in
345 // this range and yet be encoded in three-bytes is an unpaired supplementary
346 // character, which is not a valid Unicode character.
347 if !(first == 0b1110_1101 && second & 0b1111_0000 == 0b1010_0000) {
348 // No surrogate pair
349 if second & 0b1100_0000 != 0b1000_0000 {
350 err!(2);
351 }
352 if third & 0b1100_0000 != 0b1000_0000 {
353 err!(3);
354 }
355
356 (
357 3,
358 ((first as u32 & 0x0F) << 12)
359 | ((second as u32 & 0x3F) << 6)
360 | (third as u32 & 0x3F),
361 )
362 } else {
363 // Surrogate pair
364 if index + 5 >= len {
365 err!(0);
366 }
367 let fourth = v[index + 3];
368 let fifth = v[index + 4];
369 let sixth = v[index + 5];
370
371 if second & 0b1111_0000 != 0b1010_0000 {
372 err!(2);
373 }
374 if third & 0b1100_0000 != 0b1000_0000 {
375 err!(3);
376 }
377
378 if fourth != 0b1110_1101 {
379 err!(4);
380 }
381 if fifth & 0b1111_0000 != 0b1011_0000 {
382 err!(5);
383 }
384 if sixth & 0b1100_0000 != 0b1000_0000 {
385 err!(6);
386 }
387
388 (
389 6,
390 0x10000
391 + (((second as u32 & 0x0F) << 16)
392 | ((third as u32 & 0x3F) << 10)
393 | ((fifth as u32 & 0x0F) << 6)
394 | (sixth as u32 & 0x3F)),
395 )
396 }
397 } else {
398 err!(1);
399 };
400
401 if code_point > 0x10FFFF {
402 err!(len as u8);
403 }
404
405 let idx = if len != 6 { len - 1 } else { 3 };
406
407 // Check for overlong encoding, and if validating Java CESU-8, exclude
408 let overlong = if CHECK_JAVA && code_point == 0x00 {
409 len != 2
410 } else {
411 code_point < OVERLONG[idx]
412 };
413
414 let surrogate = (code_point >> 11) == 0x1B;
415 if overlong || surrogate {
416 err!(len as u8);
417 }
418
419 index += len;
420 }
421
422 Ok(())
423}
424
/// Decodes one code point from the front of a byte iterator that yields
/// CESU-8-like data, advancing the iterator past the bytes it consumed.
///
/// Returns `None` only when the iterator is already exhausted.
///
/// The same routine serves standard CESU-8 and Java CESU-8: it never inspects
/// which value a sequence encodes, and Java CESU-8 differs only in how the NUL
/// character is written.
///
/// # Safety
///
/// The byte iterator passed in must provide CESU-8.
#[allow(clippy::cast_lossless)]
#[inline]
unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
    let lead = *bytes.next()?;

    // Fetches the following byte of the current sequence, widened to `u32`.
    let mut cont = || -> u32 {
        // SAFETY: the caller guarantees well-formed CESU-8, so every lead byte
        // is followed by all of its trailing bytes.
        let byte = unsafe { *bytes.next().unwrap_unchecked() };
        byte as u32
    };

    let code_point = match lead {
        // One byte: ASCII.
        0x00..=0x7F => lead as u32,
        // Two bytes.
        0xC0..=0xDF => ((lead as u32 & 0x1F) << 6) | (cont() & 0x3F),
        // Three bytes, or six when they form a surrogate pair.
        _ => {
            let second = cont();
            let third = cont();

            // `ED A0..AF` opens a high surrogate. In valid CESU-8 that can
            // only be the first half of a pair, never a character by itself.
            if lead == 0b1110_1101 && second & 0b1111_0000 == 0b1010_0000 {
                // The lead byte of the low surrogate carries no payload.
                let _fourth = cont();
                let fifth = cont();
                let sixth = cont();

                0x10000
                    + (((second & 0x0F) << 16)
                        | ((third & 0x3F) << 10)
                        | ((fifth & 0x0F) << 6)
                        | (sixth & 0x3F))
            } else {
                ((lead as u32 & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F)
            }
        }
    };

    Some(code_point)
}
478
/// Decodes one code point from the back of a byte iterator that yields
/// CESU-8-like data, shrinking the iterator by the bytes it consumed.
///
/// Returns `None` only when the iterator is already exhausted.
///
/// The same routine serves standard CESU-8 and Java CESU-8: it never inspects
/// which value a sequence encodes, and Java CESU-8 differs only in how the NUL
/// character is written.
///
/// # Safety
///
/// The byte iterator passed in must provide CESU-8.
#[allow(clippy::cast_lossless)]
#[inline]
unsafe fn next_code_point_reverse<'a, I: DoubleEndedIterator<Item = &'a u8>>(
    bytes: &mut I,
) -> Option<u32> {
    let last = *bytes.next_back()? as u32;
    if last < 0x80 {
        // One byte: ASCII.
        return Some(last);
    }

    // Steps one byte further towards the front, widened to `u32`.
    let mut prev = || -> u32 {
        // SAFETY: the caller guarantees well-formed CESU-8, so a trailing byte
        // is always preceded by the rest of its sequence.
        let byte = unsafe { *bytes.next_back().unwrap_unchecked() };
        byte as u32
    };

    let before_last = prev();
    if before_last & 0b1110_0000 == 0b1100_0000 {
        // Two bytes: `before_last` is the lead.
        return Some(((before_last & 0x1F) << 6) | (last & 0x3F));
    }

    let lead = prev();
    // `ED B0..BF xx` is a low surrogate, which in valid CESU-8 is always the
    // second half of a six-byte pair.
    let ends_pair = lead == 0b1110_1101 && before_last & 0b1111_0000 == 0b1011_0000;
    if !ends_pair {
        // Plain three-byte character.
        return Some(((lead & 0x0F) << 12) | ((before_last & 0x3F) << 6) | (last & 0x3F));
    }

    // Walk back over the high surrogate: its two payload bytes, then its lead
    // byte, which carries no payload.
    let high_third = prev();
    let high_second = prev();
    let _high_lead = prev();

    Some(
        0x10000
            + (((high_second & 0x0F) << 16)
                | ((high_third & 0x3F) << 10)
                | ((before_last & 0x0F) << 6)
                | (last & 0x3F)),
    )
}
531
/// Returns how many bytes the code point `code` occupies once encoded as
/// CESU-8.
///
/// With `JAVA` set, U+0000 counts as two bytes, because Java CESU-8 writes it
/// as `C0 80`. Everything from `0x10000` upwards counts as six bytes (a
/// surrogate pair).
#[inline]
#[must_use]
pub(crate) const fn len_cesu8<const JAVA: bool>(code: u32) -> usize {
    match code {
        // Java's overlong NUL.
        0 if JAVA => 2,
        0..=0x7F => 1,
        0x80..=0x7FF => 2,
        0x800..=0xFFFF => 3,
        // Supplementary characters are stored as two three-byte surrogates.
        _ => 6,
    }
}
546
547/// Encodes a raw u32 value as CESU-8 into the provided byte buffer, then
548/// returns the subslice of the buffer that contains the encoded character.
549#[inline]
550pub(crate) fn encode_cesu8_raw<const JAVA: bool>(code: u32, dst: &mut [u8]) -> &mut [u8] {
551 let len = len_cesu8::<JAVA>(code);
552 match (len, &mut dst[..]) {
553 (1, [a, ..]) => *a = code as u8,
554 (2, [a, b, ..]) => {
555 *a = 0b1100_0000 | (code >> 6 & 0x1F) as u8;
556 *b = 0b1000_0000 | (code & 0x3F) as u8;
557 }
558 (3, [a, b, c, ..]) => {
559 *a = 0b1110_0000 | (code >> 12 & 0x0F) as u8;
560 *b = 0b1000_0000 | (code >> 6 & 0x3F) as u8;
561 *c = 0b1000_0000 | (code & 0x3F) as u8;
562 }
563 (6, [a, b, c, d, e, f, ..]) => {
564 *a = 0b1110_1101;
565 *b = 0b1010_0000 | ((code - 0x1_0000) >> 16 & 0x0F) as u8;
566 *c = 0b1000_0000 | (code >> 10 & 0x3F) as u8;
567 *d = 0b1110_1101;
568 *e = 0b1011_0000 | (code >> 6 & 0x0F) as u8;
569 *f = 0b1000_0000 | (code & 0x3F) as u8;
570 }
571 _ => panic!(
572 "encode_cesu8: need {len} bytes to encode U+{code:X}, but the buffer has {}",
573 dst.len()
574 ),
575 };
576 &mut dst[..len]
577}
578
/// Computes how many bytes `str` occupies once encoded as CESU-8.
///
/// Every four-byte UTF-8 character grows to six bytes and, when `JAVA` is
/// `true`, every NUL byte grows to two; all other bytes count once.
pub(crate) const fn required_len<const JAVA: bool>(str: &str) -> usize {
    let bytes = str.as_bytes();
    let mut total = 0;
    let mut pos = 0;

    while pos < bytes.len() {
        // (input bytes consumed, output bytes produced)
        let (consumed, produced) = match bytes[pos] {
            // Lead byte of a four-byte sequence -> surrogate pair.
            0xF0..=0xF7 => (4, 6),
            // Java stores NUL as `C0 80`.
            0 if JAVA => (1, 2),
            _ => (1, 1),
        };
        pos += consumed;
        total += produced;
    }

    total
}
601
/// Creates an array holding `str` encoded as CESU-8 (Java CESU-8 when `JAVA`
/// is `true`).
///
/// Four-byte UTF-8 characters are written as six-byte surrogate pairs and,
/// when `JAVA` is `true`, NUL bytes are written as `C0 80`; every other byte
/// is copied unchanged.
///
/// `N` is expected to be the encoded length of `str`, i.e. the value
/// `required_len::<JAVA>(str)` computes. If `N` is larger, the unused tail
/// stays zero-filled.
///
/// # Panics
///
/// Panics (at compile time when evaluated in a const context) if `N` is too
/// small to hold the encoded bytes.
pub(crate) const fn create_array<const JAVA: bool, const N: usize>(str: &str) -> [u8; N] {
    let mut buf = [0; N];

    let src = str.as_bytes();
    // Read position in `src` and write position in `buf`. They drift apart as
    // soon as one character changes length, so they are tracked separately.
    let mut read = 0;
    let mut write = 0;

    while read < src.len() {
        let first = src[read];
        if first & 0b1111_1000 == 0b1111_0000 {
            // Decode the four-byte UTF-8 sequence into its code point.
            let code = ((first as u32 & 0b0000_0111) << 18)
                | ((src[read + 1] as u32 & 0b0011_1111) << 12)
                | ((src[read + 2] as u32 & 0b0011_1111) << 6)
                | (src[read + 3] as u32 & 0b0011_1111);

            // High surrogate: `ED A0..AF xx`. Only bits 16 and above are
            // affected by the 0x10000 offset.
            buf[write] = 0b1110_1101;
            buf[write + 1] = 0b1010_0000 | ((code - 0x1_0000) >> 16 & 0x0F) as u8;
            buf[write + 2] = 0b1000_0000 | (code >> 10 & 0x3F) as u8;
            // Low surrogate: `ED B0..BF xx`.
            buf[write + 3] = 0b1110_1101;
            buf[write + 4] = 0b1011_0000 | (code >> 6 & 0x0F) as u8;
            buf[write + 5] = 0b1000_0000 | (code & 0x3F) as u8;

            write += 6;
            read += 4;
        } else if JAVA && first == 0 {
            // Java CESU-8 stores NUL as an overlong two-byte sequence.
            buf[write] = 0xC0;
            buf[write + 1] = 0x80;

            write += 2;
            read += 1;
        } else {
            buf[write] = first;

            write += 1;
            read += 1;
        }
    }

    buf
}