use std::borrow::Cow;
use std::io;
use std::str::Utf8Error;
use crate::decoding::TAG_CONT_U8;
use crate::string::Cesu8Str;
use crate::unicode::utf8_char_width;
use crate::Cesu8Error;
use crate::Variant;
/// Tries to reinterpret UTF-8 `text` as CESU-8 without re-encoding it.
///
/// The bytes are scanned one character at a time. The scan fails at the first
/// character that cannot be used verbatim:
///
/// * a NUL byte, when `ENCODE_NUL` is `true` (reported with a length of 1);
/// * a character whose UTF-8 width is 4 bytes (reported with a length of 4).
///
/// In both cases the error carries the byte index of the offending character.
/// When nothing is rejected, the input's storage is moved into the returned
/// `Cesu8Str` (borrowed stays borrowed, owned stays owned).
///
/// # Panics
///
/// Panics if `utf8_char_width` reports a width of 0 for a non-ASCII lead byte.
pub(crate) fn utf8_as_cesu8_spec<const ENCODE_NUL: bool>(
    text: Cow<'_, str>,
) -> Result<Cesu8Str<'_>, Cesu8Error> {
    // Switch to a byte view while keeping the borrowed/owned state of the input.
    let text_bytes = match text {
        Cow::Owned(owned) => Cow::Owned(owned.into_bytes()),
        Cow::Borrowed(borrowed) => Cow::Borrowed(borrowed.as_bytes()),
    };

    let mut pos = 0;
    while let Some(&lead) = text_bytes.get(pos) {
        if lead.is_ascii() {
            // NUL is the only ASCII byte that may need re-encoding.
            if ENCODE_NUL && lead == b'\0' {
                return Err(Cesu8Error::new(pos, Some(1), Ok(())));
            }
            pos += 1;
            continue;
        }

        match utf8_char_width(lead) {
            // 4-byte characters have to be rewritten, so the text is not usable as-is.
            4 => return Err(Cesu8Error::new(pos, Some(4), Ok(()))),
            width => {
                assert_ne!(width, 0, "utf8 char length was 0, this is illegal in well-formed utf8 strings (byte {:x?}, bytes[{}] from {:x?})", lead, pos, text_bytes);
                pos += width;
            }
        }
    }

    Ok(Cesu8Str {
        variant: ENCODE_NUL.into(),
        utf8_error: Ok(()),
        bytes: text_bytes,
    })
}
/// Runtime-`variant` front end for `utf8_as_cesu8_spec`.
///
/// `Variant::Java` selects the instantiation that rejects NUL bytes
/// (`ENCODE_NUL = true`); `Variant::Standard` selects the one that accepts them.
#[inline]
pub(crate) fn utf8_as_cesu8(
    text: Cow<'_, str>,
    variant: Variant,
) -> Result<Cesu8Str<'_>, Cesu8Error> {
    match variant {
        Variant::Java => utf8_as_cesu8_spec::<true>(text),
        Variant::Standard => utf8_as_cesu8_spec::<false>(text),
    }
}
/// Writes `text` to `encoded`, re-encoding the sequences that cannot be used
/// verbatim.
///
/// * The first `assume_good` bytes of `text` are copied without inspection.
/// * When `ENCODE_NUL` is `true`, every NUL byte is written as `[0xC0, 0x80]`.
/// * Every character whose UTF-8 width is 4 is written as the 6 bytes returned
///   by `enc_surrogates`.
/// * Everything else is copied through unchanged, batched into runs.
///
/// # Returns
///
/// * `Err(_)` if a write to `encoded` fails.
/// * `Ok(Ok(()))` if nothing had to be re-encoded.
/// * `Ok(Err(e))` otherwise, where `e.valid_up_to()` is the number of bytes
///   written to `encoded` before the first re-encoded sequence and the error
///   length is 1.
///
/// # Safety
///
/// `assume_good` must be `0`, or the `valid_up_to()` that
/// `utf8_as_cesu8_spec::<ENCODE_NUL>` reports for this same `text`. This is only
/// verified in debug builds. NOTE(review): the scan starts at `assume_good` and
/// feeds `from_utf8_unchecked`, so it presumably must fall on a char boundary —
/// confirm against callers.
///
/// # Panics
///
/// Panics if `assume_good > text.len()`. In debug builds it also panics when a
/// non-zero `assume_good` does not match the check described above.
pub(crate) unsafe fn utf8_to_cesu8_spec<W: io::Write, const ENCODE_NUL: bool>(
text: &str,
assume_good: usize,
encoded: &mut W,
) -> io::Result<Result<(), Utf8Error>> {
// Debug-only validation of the caller's claim. `unwrap_err` means a non-zero
// `assume_good` is only accepted for text that fails the as-is check.
if assume_good != 0 {
debug_assert_eq!(
utf8_as_cesu8_spec::<ENCODE_NUL>(Cow::Borrowed(text))
.unwrap_err()
.valid_up_to(),
assume_good,
"tried to assume invalid CESU-8 as good"
);
debug_assert!(
assume_good <= text.len(),
"tried to assume_good a chunk larger than the source"
);
}
// The actual encoder lives in a nested safe fn, so the single unchecked
// operation below needs its own explicit `unsafe` block.
#[inline(always)]
fn utf8_to_cesu8_prealloc_internal<W: io::Write, const ENCODE_NUL: bool>(
text: &str,
assume_good: usize,
encoded: &mut W,
) -> io::Result<Result<(), Utf8Error>> {
let bytes = text.as_bytes();
// Copy the pre-validated prefix through untouched.
encoded.write_all(&bytes[..assume_good])?;
// `i`: index in `bytes` of the first byte not yet written out.
let mut i = assume_good;
// `utf8_seg`: length of the pending pass-through run that starts at `i`.
let mut utf8_seg = 0;
// `utf8_err`: records the first re-encoded position, if any.
let mut utf8_err = Ok(());
// `written`: number of bytes emitted to `encoded` so far.
let mut written = assume_good;
// Emits `$cesu8_slice`, then advances the output counter by the slice length
// and the input cursor by `$text_len`.
macro_rules! write_cesu8 {
($cesu8_slice: expr, $text_len: expr) => {
let sl: &[u8] = $cesu8_slice;
encoded.write_all(sl)?;
written += sl.len();
i += $text_len;
};
}
// Flushes the pending pass-through run. If `$errlen` is `Some(len)`, it then
// records an error at the current output offset, unless one is already
// recorded. The flush must happen first so `written` is up to date.
macro_rules! push_utf8 {
($errlen: expr) => {
if utf8_seg > 0 {
write_cesu8!(&bytes[i..i + utf8_seg], utf8_seg);
utf8_seg = 0;
}
if let Some(err) = $errlen {
if utf8_err.is_ok() {
utf8_err = Err(utf8err_new(written, err));
}
}
};
}
// Look at the lead byte just past the pending run.
while let Some(&b) = bytes.get(i + utf8_seg) {
if ENCODE_NUL && b == b'\0' {
push_utf8!(Some(Some(1)));
// One input byte becomes the two-byte sequence 0xC0 0x80.
write_cesu8!(&[0xC0, 0x80], 1);
} else if b.is_ascii() {
utf8_seg += 1;
} else {
match utf8_char_width(b) {
4 => {
push_utf8!(Some(Some(1)));
// SAFETY: the run was just flushed, so `b` is `bytes[i]` and is the lead
// byte of a 4-byte character; `text` is a `&str`, so `bytes[i..i + 4]` is
// that complete character (relies on the caller's `assume_good` contract).
let s = unsafe { std::str::from_utf8_unchecked(&bytes[i..i + 4]) };
let c = s.chars().next().unwrap() as u32;
// Four input bytes become six output bytes.
write_cesu8!(&enc_surrogates(c), 4);
}
w => {
// 2- and 3-byte characters pass through unchanged.
utf8_seg += w;
}
}
}
}
// Flush whatever run is still pending at end of input.
push_utf8!(None);
debug_assert_eq!(i, text.len(), "did not fully consume the input text bytes");
debug_assert_eq!(
utf8_seg, 0,
"did not fully consume the current utf8 segment"
);
Ok(utf8_err)
}
utf8_to_cesu8_prealloc_internal::<W, ENCODE_NUL>(text, assume_good, encoded)
}
/// Safe entry point: encodes all of `text` into the `encoded` vector.
///
/// Returns `Ok(())` when no sequence had to be re-encoded. Otherwise returns
/// the `Utf8Error` produced by `utf8_to_cesu8`.
///
/// # Panics
///
/// Panics if the underlying write reports an `io::Error`.
#[inline]
pub(crate) fn utf8_to_cesu8_safe(
    text: &str,
    encoded: &mut Vec<u8>,
    variant: Variant,
) -> Result<(), Utf8Error> {
    // SAFETY: `assume_good` is 0, so no part of `text` is assumed to be
    // pre-validated; presumably that satisfies the callee's contract.
    let outcome = unsafe { utf8_to_cesu8(text, 0, encoded, variant) };
    outcome.expect(
        "io::Error occured within Vec's io::Write implementation. This should not happen.",
    )
}
/// Runtime-`variant` front end for `utf8_to_cesu8_spec`.
///
/// `Variant::Java` selects the instantiation with `ENCODE_NUL = true`;
/// `Variant::Standard` selects the one with `ENCODE_NUL = false`. All arguments
/// are forwarded unchanged.
///
/// # Safety
///
/// Same contract as `utf8_to_cesu8_spec`, to which `assume_good` is forwarded.
#[inline]
pub(crate) unsafe fn utf8_to_cesu8<W: io::Write>(
    text: &str,
    assume_good: usize,
    encoded: &mut W,
    variant: Variant,
) -> io::Result<Result<(), Utf8Error>> {
    match variant {
        Variant::Java => utf8_to_cesu8_spec::<W, true>(text, assume_good, encoded),
        Variant::Standard => utf8_to_cesu8_spec::<W, false>(text, assume_good, encoded),
    }
}
/// Encodes `ch` as two surrogate code units of three bytes each.
///
/// `0x10000` is subtracted from the value. The remaining bits above bit 9 are
/// tagged with `0xD800` and the low 10 bits with `0xDC00`. Each unit is then
/// passed to `enc_surrogate`, and the two results are concatenated, first unit
/// first.
///
/// The subtraction assumes `ch` is at least `0x10000`; smaller values underflow.
#[inline]
pub(crate) fn enc_surrogates<C: Into<u32>>(ch: C) -> [u8; 6] {
    let offset: u32 = ch.into() - 0x10000;
    let lead_unit = 0xD800 | (offset >> 10) as u16;
    let trail_unit = 0xDC00 | (offset & 0x3FF) as u16;

    let [h0, h1, h2] = enc_surrogate(lead_unit);
    let [l0, l1, l2] = enc_surrogate(trail_unit);
    [h0, h1, h2, l0, l1, l2]
}
/// Packs one 16-bit surrogate code unit into three bytes.
///
/// The top 4 bits go into a `0b1110_0000`-tagged lead byte. The middle 6 and
/// low 6 bits each go into a byte tagged with `TAG_CONT_U8`.
///
/// # Panics
///
/// When built with `debug_assertions` or the `validate_release` cfg, panics if
/// `surrogate` is outside `0xD800..=0xDFFF`.
#[inline]
fn enc_surrogate(surrogate: u16) -> [u8; 3] {
    if cfg!(debug_assertions) || cfg!(validate_release) {
        assert!(
            (0xD800..=0xDFFF).contains(&surrogate),
            "trying to encode invalid surrogate pair"
        );
    }

    let top4 = (surrogate >> 12) as u8;
    let mid6 = ((surrogate >> 6) & 0b11_1111) as u8;
    let low6 = (surrogate & 0b11_1111) as u8;

    [0b1110_0000 | top4, TAG_CONT_U8 | mid6, TAG_CONT_U8 | low6]
}
/// Builds a `std::str::Utf8Error` with the given `valid_up_to` and error length.
///
/// `Utf8Error` has no public constructor, so the value is produced by
/// transmuting a local struct that is assumed to have the same fields and
/// layout as the standard library type. `err_len` is what
/// `Utf8Error::error_len` will report, widened to `usize`.
///
/// # Panics
///
/// In debug builds, panics if the local struct and `Utf8Error` differ in
/// alignment or size. A size mismatch is also rejected at compile time by
/// `transmute` itself.
#[inline]
pub(crate) fn utf8err_new(valid_up_to: usize, err_len: Option<u8>) -> Utf8Error {
    // Stand-in for `Utf8Error`; the fields are only ever read through the
    // transmuted value, hence the lint allowance.
    #[allow(dead_code)]
    struct CustomUtf8Error {
        valid_up_to: usize,
        err_len: Option<u8>,
    }

    let err = CustomUtf8Error {
        valid_up_to,
        err_len,
    };

    debug_assert_eq!(
        std::mem::align_of::<CustomUtf8Error>(),
        std::mem::align_of::<Utf8Error>(),
        "std::str::Utf8Error has unexpectedly changed alignment"
    );
    // This message previously said "alignment" although the check is on size.
    debug_assert_eq!(
        std::mem::size_of::<CustomUtf8Error>(),
        std::mem::size_of::<Utf8Error>(),
        "std::str::Utf8Error has unexpectedly changed size"
    );

    // SAFETY: relies on `CustomUtf8Error` having the same field types, order
    // and layout as `std::str::Utf8Error`. Neither type is `repr(C)`, so this
    // is an assumption about the compiler and the standard library, not a
    // guarantee; the assertions above catch gross layout drift.
    unsafe { std::mem::transmute(err) }
}
/// Returns a copy of `err` whose `valid_up_to` is shifted forward by `incby`.
///
/// The error length is carried over unchanged, narrowed to `u8`.
#[inline]
pub(crate) fn utf8err_inc(err: &Utf8Error, incby: usize) -> Utf8Error {
    let shifted = incby + err.valid_up_to();
    let len = err.error_len().map(|n| n as u8);
    utf8err_new(shifted, len)
}