#![no_std]
#[cfg(feature = "alloc")]
extern crate alloc;
pub mod cesu8;
pub mod java;
mod index;
mod internal;
use core::num::NonZeroU8;
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
#[cfg(feature = "alloc")]
use alloc::string::String;
/// Describes where and how a byte sequence failed CESU-8 validation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EncodingError {
    /// Length reported for the offending sequence; `None` means the input
    /// ended before the sequence was complete.
    error_len: Option<NonZeroU8>,
    /// Index of the first byte of the offending sequence; every byte before
    /// it was accepted.
    valid_up_to: usize,
}

impl EncodingError {
    /// Returns the index up to which the input was valid.
    ///
    /// `const` so that errors produced by the `const` validator can also be
    /// inspected in constant contexts.
    #[inline]
    #[must_use]
    pub const fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Returns the length reported for the invalid sequence, or `None` when
    /// the input ended in the middle of a sequence.
    #[inline]
    #[must_use]
    pub const fn error_len(&self) -> Option<NonZeroU8> {
        self.error_len
    }
}
impl core::fmt::Display for EncodingError {
    /// Writes a one-line description of the error: an "invalid sequence"
    /// message when a length is recorded, an "incomplete sequence" message
    /// otherwise.
    #[inline]
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let start = self.valid_up_to;
        match self.error_len {
            Some(bad_len) => write!(
                f,
                "invalid cesu-8 sequence of {} bytes from index {}",
                bad_len, start
            ),
            None => write!(f, "incomplete utf-8 byte sequence from index {}", start),
        }
    }
}
/// Pairs an owned byte buffer with the [`EncodingError`] found in it.
///
/// NOTE(review): presumably returned by the `Vec<u8>` → string conversions in
/// the `cesu8`/`java` modules; confirm against those modules.
#[cfg(feature = "alloc")]
#[derive(Debug, PartialEq, Eq)]
pub struct FromVecError {
    /// The bytes associated with the error, kept so the caller can get them back.
    bytes: alloc::vec::Vec<u8>,
    /// The validation failure recorded for `bytes`.
    error: EncodingError,
}

#[cfg(feature = "alloc")]
impl FromVecError {
    /// Borrows the bytes stored in this error.
    #[inline]
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        self.bytes.as_slice()
    }

    /// Consumes the error and hands back the stored byte vector.
    #[inline]
    #[must_use]
    pub fn into_bytes(self) -> alloc::vec::Vec<u8> {
        let Self { bytes, .. } = self;
        bytes
    }

    /// Returns a copy of the stored [`EncodingError`].
    #[inline]
    #[must_use]
    pub const fn encoding_error(&self) -> EncodingError {
        let Self { error, .. } = self;
        *error
    }
}
// `FromVecError` only exists when the `alloc` feature is enabled, so this impl
// must carry the same gate; without it the crate does not compile when the
// feature is off.
#[cfg(feature = "alloc")]
impl core::fmt::Display for FromVecError {
    /// Formats the error by delegating to the wrapped [`EncodingError`].
    #[inline]
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Display::fmt(&self.error, f)
    }
}
/// Converts a CESU-8 string (Java-flavoured when `JAVA` is `true`) into UTF-8.
///
/// The input is scanned once. While no 6-byte surrogate pair (and, for
/// `JAVA`, no `C0 80` encoded NUL) is found nothing is allocated and the input
/// bytes are returned as a borrowed `str`. On the first such sequence a
/// `String` is started; untouched runs are copied verbatim, each surrogate
/// pair is replaced by the decoded `char`, and each `C0 80` by `'\0'`.
///
/// The `unsafe` operations below rely on `str` holding validated CESU-8
/// (assumed to be an invariant of `internal::InternalStr`; that type is not
/// visible from this function).
#[cfg(feature = "alloc")]
#[inline]
fn from_cesu8<const JAVA: bool>(str: &internal::InternalStr) -> Cow<'_, str> {
    let mut index = 0;
    let mut last_index = 0;
    let mut string = None;
    let v = str.as_bytes();
    while let Some(&byte) = v.get(index) {
        if byte == 0b1110_1101 {
            // Lead byte 0xED starts either an ordinary 3-byte sequence or the
            // first half of a 6-byte surrogate pair.
            // SAFETY: in validated input a 0xED lead byte is always followed by
            // at least two more bytes (assumed invariant, see above).
            let second = unsafe { *v.get(index + 1).unwrap_unchecked() };
            if second & 0b1111_0000 == 0b1010_0000 {
                let out = string.get_or_insert_with(String::new);
                // SAFETY: the skipped-over run contains no surrogate pairs (and
                // no `C0 80` in Java mode), so it is plain UTF-8 provided the
                // input was valid.
                unsafe { out.as_mut_vec().extend_from_slice(&v[last_index..index]) };
                let mut iter = v[index..].iter();
                // SAFETY: the iterator starts on the lead byte of a complete
                // surrogate pair, so a code point is always produced.
                let code_point = unsafe { next_code_point(&mut iter).unwrap_unchecked() };
                // SAFETY: a decoded surrogate pair is a supplementary-plane
                // scalar value.
                out.push(unsafe { char::from_u32_unchecked(code_point) });
                index += 6;
                last_index = index;
            } else {
                index += 3;
            }
        } else if JAVA && byte == 0xC0 && v.get(index + 1) == Some(&0x80) {
            // Java encodes U+0000 as the two bytes C0 80. The look-ahead is part
            // of the branch condition so that a 0xC0 not followed by 0x80 falls
            // through to the generic arm and still advances `index`; previously
            // that case left `index` unchanged and the loop never terminated.
            let out = string.get_or_insert_with(String::new);
            // SAFETY: same reasoning as for the run copied in the surrogate arm.
            unsafe { out.as_mut_vec().extend_from_slice(&v[last_index..index]) };
            out.push('\0');
            index += 2;
            last_index = index;
        } else {
            index += 1;
        }
    }
    match string {
        Some(mut out) => {
            // SAFETY: the trailing run is plain UTF-8, as above.
            unsafe { out.as_mut_vec().extend_from_slice(&v[last_index..index]) };
            Cow::Owned(out)
        }
        // SAFETY: no CESU-8-specific sequence was found, so the bytes are
        // already UTF-8 provided the input was valid.
        None => Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(v) }),
    }
}
/// Converts a UTF-8 string into CESU-8 (Java-flavoured when `JAVA` is `true`).
///
/// Input that contains no 4-byte sequence (and, for `JAVA`, no NUL byte) is
/// returned borrowed. Otherwise an owned string is built in which each 4-byte
/// character is re-encoded through `encode_cesu8_raw` and, for `JAVA`, each
/// NUL byte becomes `C0 80`; all other bytes are copied unchanged.
#[cfg(feature = "alloc")]
#[inline]
fn from_utf8<const JAVA: bool>(str: &str) -> Cow<'_, internal::InternalStr> {
    let v = str.as_bytes();
    let mut converted = None;
    // Start of the run of input bytes not yet copied into `converted`.
    let mut pending_from = 0;
    let mut pos = 0;
    while pos < v.len() {
        let lead = v[pos];
        let is_four_byte = lead & 0b1111_1000 == 0b1111_0000;
        if is_four_byte {
            let out = converted
                .get_or_insert_with(|| internal::InternalString::with_capacity(pos + 6));
            unsafe {
                // `pos` sits on a lead byte of the (valid UTF-8) input, so a
                // character can always be decoded from here.
                let ch = core::str::from_utf8_unchecked(&v[pos..])
                    .chars()
                    .next()
                    .unwrap_unchecked();
                let raw = out.as_mut_vec();
                raw.extend_from_slice(&v[pending_from..pos]);
                raw.extend_from_slice(encode_cesu8_raw::<JAVA>(ch as u32, &mut [0; 6]));
            }
            pos += 4;
            pending_from = pos;
        } else if JAVA && lead == 0 {
            let out = converted
                .get_or_insert_with(|| internal::InternalString::with_capacity(pos + 2));
            unsafe {
                let raw = out.as_mut_vec();
                raw.extend_from_slice(&v[pending_from..pos]);
                raw.extend_from_slice(&[0xC0, 0x80]);
            }
            pos += 1;
            pending_from = pos;
        } else {
            pos += 1;
        }
    }
    match converted {
        Some(mut out) => {
            // Copy whatever follows the last re-encoded character.
            unsafe { out.as_mut_vec().extend_from_slice(&v[pending_from..pos]) };
            Cow::Owned(out)
        }
        None => Cow::Borrowed(unsafe { internal::InternalStr::from_unchecked(v) }),
    }
}
/// Checks that `v` is well-formed CESU-8, or Java-style CESU-8 when
/// `CHECK_JAVA` is `true`.
///
/// The slice is walked one encoded code point at a time. Accepted forms are
/// 1-, 2- and 3-byte sequences plus 6-byte surrogate pairs (`ED Ax xx ED Bx xx`).
/// Overlong encodings and lone surrogates are rejected. With `CHECK_JAVA`, the
/// code point zero is accepted only in its 2-byte form.
///
/// # Errors
///
/// Returns an [`EncodingError`] whose `valid_up_to` is the index of the first
/// byte of the offending sequence. Its `error_len` is `None` when the slice
/// ends before the sequence is complete, otherwise the value passed to `err!`.
#[inline]
const fn validate_cesu8_internal<const CHECK_JAVA: bool>(v: &[u8]) -> Result<(), EncodingError> {
// Smallest code point that needs 1, 2, 3 and 6 bytes respectively, looked up
// through `idx` below; a decoded value under its entry is an overlong form.
const OVERLONG: [u32; 4] = [0x00, 0x80, 0x800, 0x10000];
let mut index = 0;
let len = v.len();
while index < len {
// Returns an error anchored at the start of the current sequence.
// `NonZeroU8::new(0)` is `None`, so `err!(0)` reports an incomplete sequence.
macro_rules! err {
($error_len:expr) => {
return Err(EncodingError {
error_len: NonZeroU8::new($error_len),
valid_up_to: index,
})
};
}
let first = v[index];
// Decode one sequence into (encoded length, code point). Note that `len`
// is shadowed here: inside the branches below it is still the slice length,
// after this statement it is the length of the current sequence.
let (len, code_point) = if first < 128 {
(1, first as u32)
} else if first & 0b1110_0000 == 0b1100_0000 {
// 2-byte sequence: 110xxxxx 10xxxxxx.
if index + 1 >= len {
err!(0);
}
let second = v[index + 1];
if second & 0b1100_0000 != 0b1000_0000 {
err!(2);
}
(2, ((first as u32 & 0x1F) << 6) | (second as u32 & 0x3F))
} else if first & 0b1111_0000 == 0b1110_0000 {
// 3-byte lead: either an ordinary 3-byte sequence or the first half of
// a 6-byte surrogate pair.
if index + 2 >= len {
err!(0);
}
let second = v[index + 1];
let third = v[index + 2];
if !(first == 0b1110_1101 && second & 0b1111_0000 == 0b1010_0000) {
// Ordinary 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
if second & 0b1100_0000 != 0b1000_0000 {
err!(2);
}
if third & 0b1100_0000 != 0b1000_0000 {
err!(3);
}
(
3,
((first as u32 & 0x0F) << 12)
| ((second as u32 & 0x3F) << 6)
| (third as u32 & 0x3F),
)
} else {
// `ED Ax` opens a high surrogate; a low surrogate (`ED Bx xx`) must
// follow immediately, six bytes in total.
if index + 5 >= len {
err!(0);
}
let fourth = v[index + 3];
let fifth = v[index + 4];
let sixth = v[index + 5];
// This check cannot fail: the enclosing `else` is only entered when
// `second` already matches this pattern.
if second & 0b1111_0000 != 0b1010_0000 {
err!(2);
}
if third & 0b1100_0000 != 0b1000_0000 {
err!(3);
}
if fourth != 0b1110_1101 {
err!(4);
}
if fifth & 0b1111_0000 != 0b1011_0000 {
err!(5);
}
if sixth & 0b1100_0000 != 0b1000_0000 {
err!(6);
}
// Combine the 10 payload bits of each surrogate and add the
// supplementary-plane offset.
(
6,
0x10000
+ (((second as u32 & 0x0F) << 16)
| ((third as u32 & 0x3F) << 10)
| ((fifth as u32 & 0x0F) << 6)
| (sixth as u32 & 0x3F)),
)
}
} else {
// Any other lead byte (a stray continuation byte or 0xF0 and above)
// is rejected; 4-byte UTF-8 sequences are not accepted here.
err!(1);
};
if code_point > 0x10FFFF {
err!(len as u8);
}
// Map the sequence length (1, 2, 3 or 6) to its slot in `OVERLONG`.
let idx = if len != 6 { len - 1 } else { 3 };
// With `CHECK_JAVA`, code point zero is only acceptable as a 2-byte
// sequence; every other value uses the regular overlong test.
let overlong = if CHECK_JAVA && code_point == 0x00 {
len != 2
} else {
code_point < OVERLONG[idx]
};
// True exactly for 0xD800..=0xDFFF, i.e. a surrogate that is not part of
// a 6-byte pair.
let surrogate = (code_point >> 11) == 0x1B;
if overlong || surrogate {
err!(len as u8);
}
index += len;
}
Ok(())
}
/// Decodes the next code point from a forward iterator over CESU-8 bytes.
///
/// Returns `None` only when the iterator is already exhausted. A 6-byte
/// surrogate pair is consumed as a whole and yields one supplementary-plane
/// code point.
///
/// # Safety
///
/// Once the lead byte has been read, all remaining bytes of the sequence are
/// fetched with `unwrap_unchecked`, so the iterator must yield well-formed,
/// complete CESU-8 sequences.
#[allow(clippy::cast_lossless)]
#[inline]
unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
    let lead = *bytes.next()? as u32;
    if lead < 0x80 {
        return Some(lead);
    }

    let b1 = *bytes.next().unwrap_unchecked() as u32;
    if lead & 0xE0 == 0xC0 {
        // 2-byte sequence.
        return Some(((lead & 0x1F) << 6) | (b1 & 0x3F));
    }

    let b2 = *bytes.next().unwrap_unchecked() as u32;
    let opens_surrogate_pair = lead == 0xED && b1 & 0xF0 == 0xA0;
    if !opens_surrogate_pair {
        // Ordinary 3-byte sequence.
        return Some(((lead & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
    }

    // Second half of the pair: skip its 0xED lead byte, then read two payload bytes.
    let _low_lead = *bytes.next().unwrap_unchecked();
    let b4 = *bytes.next().unwrap_unchecked() as u32;
    let b5 = *bytes.next().unwrap_unchecked() as u32;
    let payload = ((b1 & 0x0F) << 16) | ((b2 & 0x3F) << 10) | ((b4 & 0x0F) << 6) | (b5 & 0x3F);
    Some(0x10000 + payload)
}
/// Decodes the last code point from a double-ended iterator over CESU-8 bytes,
/// consuming it from the back.
///
/// Returns `None` only when the iterator is already exhausted. A 6-byte
/// surrogate pair is consumed as a whole and yields one supplementary-plane
/// code point.
///
/// # Safety
///
/// Once the final byte has been read, all earlier bytes of the sequence are
/// fetched with `unwrap_unchecked`, so the iterator must yield well-formed,
/// complete CESU-8 sequences.
#[allow(clippy::cast_lossless)]
#[inline]
unsafe fn next_code_point_reverse<'a, I: DoubleEndedIterator<Item = &'a u8>>(
    bytes: &mut I,
) -> Option<u32> {
    let last = *bytes.next_back()? as u32;
    if last < 0x80 {
        return Some(last);
    }

    let prev = *bytes.next_back().unwrap_unchecked() as u32;
    if prev & 0xE0 == 0xC0 {
        // `prev` is the lead byte of a 2-byte sequence.
        return Some(((prev & 0x1F) << 6) | (last & 0x3F));
    }

    let lead = *bytes.next_back().unwrap_unchecked() as u32;
    let closes_surrogate_pair = prev & 0xF0 == 0xB0 && lead == 0xED;
    if !closes_surrogate_pair {
        // Ordinary 3-byte sequence.
        return Some(((lead & 0x0F) << 12) | ((prev & 0x3F) << 6) | (last & 0x3F));
    }

    // The three bytes read so far were the second half of a pair; continue
    // backwards through the first half (payload, payload, 0xED lead).
    let high_third = *bytes.next_back().unwrap_unchecked() as u32;
    let high_second = *bytes.next_back().unwrap_unchecked() as u32;
    let _high_lead = *bytes.next_back().unwrap_unchecked();
    let payload = ((high_second & 0x0F) << 16)
        | ((high_third & 0x3F) << 10)
        | ((prev & 0x0F) << 6)
        | (last & 0x3F);
    Some(0x10000 + payload)
}
/// Returns how many bytes `code` occupies when encoded as CESU-8.
///
/// The result is 1, 2, 3 or 6. With `JAVA` set, code point zero takes the
/// 2-byte form instead of a single byte.
#[inline]
#[must_use]
pub(crate) const fn len_cesu8<const JAVA: bool>(code: u32) -> usize {
    match code {
        0 if JAVA => 2,
        0..=0x7F => 1,
        0x80..=0x7FF => 2,
        0x800..=0xFFFF => 3,
        _ => 6,
    }
}
/// Encodes `code` as CESU-8 into the front of `dst` and returns the written
/// prefix.
///
/// The number of bytes written is `len_cesu8::<JAVA>(code)`. Code points of
/// `0x10000` and above are written as a 6-byte surrogate pair.
///
/// # Panics
///
/// Panics when `dst` is too short to hold the encoded form.
#[inline]
pub(crate) fn encode_cesu8_raw<const JAVA: bool>(code: u32, dst: &mut [u8]) -> &mut [u8] {
    let len = len_cesu8::<JAVA>(code);
    let encodable = matches!(len, 1 | 2 | 3 | 6) && dst.len() >= len;
    if !encodable {
        panic!(
            "encode_cesu8: need {len} bytes to encode U+{code:X}, but the buffer has {}",
            dst.len()
        );
    }

    let out = &mut dst[..len];
    match len {
        1 => out.copy_from_slice(&[code as u8]),
        2 => out.copy_from_slice(&[
            0b1100_0000 | (code >> 6 & 0x1F) as u8,
            0b1000_0000 | (code & 0x3F) as u8,
        ]),
        3 => out.copy_from_slice(&[
            0b1110_0000 | (code >> 12 & 0x0F) as u8,
            0b1000_0000 | (code >> 6 & 0x3F) as u8,
            0b1000_0000 | (code & 0x3F) as u8,
        ]),
        6 => out.copy_from_slice(&[
            // High surrogate: ED Ax xx.
            0b1110_1101,
            0b1010_0000 | ((code - 0x1_0000) >> 16 & 0x0F) as u8,
            0b1000_0000 | (code >> 10 & 0x3F) as u8,
            // Low surrogate: ED Bx xx.
            0b1110_1101,
            0b1011_0000 | (code >> 6 & 0x0F) as u8,
            0b1000_0000 | (code & 0x3F) as u8,
        ]),
        // Excluded by the `encodable` check above.
        _ => unreachable!(),
    }
    out
}
/// Computes how many bytes `str` occupies once converted to CESU-8.
///
/// Each 4-byte UTF-8 sequence counts as 6 bytes and, with `JAVA` set, each NUL
/// byte counts as 2; every other byte counts as itself.
pub(crate) const fn required_len<const JAVA: bool>(str: &str) -> usize {
    let bytes = str.as_bytes();
    let mut total = 0;
    let mut pos = 0;
    while pos < bytes.len() {
        // (input bytes consumed, output bytes produced) for this step.
        let (consumed, produced) = match bytes[pos] {
            0xF0..=0xF7 => (4, 6),
            0 if JAVA => (1, 2),
            _ => (1, 1),
        };
        pos += consumed;
        total += produced;
    }
    total
}
/// Converts `str` to CESU-8 (Java-flavoured when `JAVA` is `true`) into a
/// fixed-size array, usable in constant contexts.
///
/// Each 4-byte UTF-8 sequence is rewritten as a 6-byte surrogate pair and,
/// with `JAVA` set, each NUL byte as `C0 80`; all other bytes are copied.
/// `N` is presumably meant to be `required_len::<JAVA>(str)` (confirm at the
/// call sites); when `N` is larger, the unused tail stays zero.
///
/// # Panics
///
/// Panics (or fails constant evaluation) when `N` is smaller than the encoded
/// length, because the output index runs past the array.
pub(crate) const fn create_array<const JAVA: bool, const N: usize>(str: &str) -> [u8; N] {
    let mut buf = [0; N];
    // `i` walks the UTF-8 input, `j` the CESU-8 output; they drift apart as
    // soon as a sequence changes length, so every write must go through `j`.
    let mut j = 0;
    let mut i = 0;
    let v = str.as_bytes();
    while i < v.len() {
        let first = v[i];
        if first & 0b1111_1000 == 0b1111_0000 {
            // Plain 4-byte UTF-8 decode: the payload bits already form the
            // full code point, so no offset is added here.
            let code = ((v[i] as u32 & 0b0000_0111) << 18)
                | ((v[i + 1] as u32 & 0b0011_1111) << 12)
                | ((v[i + 2] as u32 & 0b0011_1111) << 6)
                | (v[i + 3] as u32 & 0b0011_1111);
            // Surrogates carry the 20-bit offset into the supplementary planes.
            let offset = code - 0x1_0000;
            buf[j] = 0b1110_1101;
            buf[j + 1] = 0b1010_0000 | (offset >> 16 & 0x0F) as u8;
            buf[j + 2] = 0b1000_0000 | (offset >> 10 & 0x3F) as u8;
            buf[j + 3] = 0b1110_1101;
            buf[j + 4] = 0b1011_0000 | (offset >> 6 & 0x0F) as u8;
            buf[j + 5] = 0b1000_0000 | (offset & 0x3F) as u8;
            j += 6;
            i += 4;
        } else if JAVA && first == 0 {
            buf[j] = 0xC0;
            buf[j + 1] = 0x80;
            j += 2;
            i += 1;
        } else {
            buf[j] = v[i];
            j += 1;
            i += 1;
        }
    }
    buf
}