use core::{char, mem, slice};
use super::{CodePoint, IllFormedUtf16CodeUnits, Wtf8, Wtf8Buf};
/// Tag bits OR-ed into every continuation byte (`10xx_xxxx`).
const TAG_CONT: u8 = 0x80;
/// Tag bits OR-ed into the lead byte of a two-byte sequence (`110x_xxxx`).
const TAG_TWO_B: u8 = 0xc0;
/// Tag bits OR-ed into the lead byte of a three-byte sequence (`1110_xxxx`).
const TAG_THREE_B: u8 = 0xe0;
/// Tag bits OR-ed into the lead byte of a four-byte sequence (`1111_0xxx`).
const TAG_FOUR_B: u8 = 0xf0;
/// Exclusive upper bound of the code points encoded in one byte (7 payload bits).
const MAX_ONE_B: u32 = 1 << 7;
/// Exclusive upper bound of the code points encoded in two bytes (11 payload bits).
const MAX_TWO_B: u32 = 1 << 11;
/// Exclusive upper bound of the code points encoded in three bytes (16 payload bits).
const MAX_THREE_B: u32 = 1 << 16;
/// Writes the UTF-8-style byte sequence for `code` to the front of `dst`.
///
/// Returns the number of bytes written (1 to 4). Returns `None`, without
/// touching `dst`, when `dst` is shorter than the sequence `code` requires.
/// No range check is performed on `code`; values that do not fit the
/// four-byte form are truncated by the masks below.
#[inline]
fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
    // Length of the sequence, decided purely by the magnitude of `code`.
    let width = if code < MAX_ONE_B {
        1
    } else if code < MAX_TWO_B {
        2
    } else if code < MAX_THREE_B {
        3
    } else {
        4
    };

    // Bail out before writing anything if the buffer is too small.
    let out = dst.get_mut(..width)?;

    // Six payload bits starting at `shift`, tagged as a continuation byte.
    let cont = |shift: u32| (code >> shift & 0x3f) as u8 | TAG_CONT;

    match width {
        1 => out[0] = code as u8,
        2 => {
            out[0] = (code >> 6 & 0x1f) as u8 | TAG_TWO_B;
            out[1] = cont(0);
        }
        3 => {
            out[0] = (code >> 12 & 0x0f) as u8 | TAG_THREE_B;
            out[1] = cont(6);
            out[2] = cont(0);
        }
        _ => {
            out[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
            out[1] = cont(12);
            out[2] = cont(6);
            out[3] = cont(0);
        }
    }
    Some(width)
}
/// Writes the UTF-16 encoding of `ch` to the front of `dst`.
///
/// Values up to `0xFFFF` are stored as a single unit; anything larger is
/// split into a lead/trail surrogate pair. Returns the number of units
/// written, or `None` (leaving `dst` untouched) when `dst` is too short.
#[inline]
fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
    if ch <= 0xffff {
        // Fits in one unit; only an empty buffer can fail here.
        let unit = dst.first_mut()?;
        *unit = ch as u16;
        return Some(1);
    }

    // Supplementary value: needs room for both surrogates.
    let pair = dst.get_mut(..2)?;
    ch -= 0x1_0000;
    pair[0] = 0xd800 | ((ch >> 10) as u16);
    pair[1] = 0xdc00 | ((ch as u16) & 0x3ff);
    Some(2)
}
/// Reads one code point from `bytes`, advancing the iterator past it.
///
/// Returns `None` only when the iterator is already exhausted. No validation
/// is performed: the lead byte alone decides how many further bytes are
/// consumed, and any byte missing at the end of input is treated as `0`.
#[inline]
pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
    let lead = *bytes.next()?;
    if lead < 128 {
        // Single-byte (ASCII) fast path.
        return Some(lead as u32);
    }

    // Multi-byte sequence, laid out as [[[lead second] third] fourth].
    let init = utf8_first_byte(lead, 2);
    let second = unwrap_or_0(bytes.next());
    if lead < 0xe0 {
        // Two-byte sequence.
        return Some(utf8_acc_cont_byte(init, second));
    }

    // Bit 4 of a 0xE0..=0xEF lead is clear, so `init` is still usable as
    // the four payload bits of a three-byte lead.
    let third = unwrap_or_0(bytes.next());
    let second_third = utf8_acc_cont_byte((second & CONT_MASK) as u32, third);
    if lead < 0xf0 {
        // Three-byte sequence.
        return Some(init << 12 | second_third);
    }

    // Four-byte sequence: only the low three bits of the lead carry payload.
    let fourth = unwrap_or_0(bytes.next());
    Some((init & 7) << 18 | utf8_acc_cont_byte(second_third, fourth))
}
/// Extracts the payload bits of a lead byte.
///
/// The mask is `0x7f >> width`, so `width` is the number of bits to drop
/// below the top bit; `width == 2` keeps the low five bits.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7f >> width;
    u32::from(byte & payload_mask)
}
/// Shifts the accumulator `ch` left by six bits and appends the six payload
/// bits of the continuation byte `byte`.
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = u32::from(byte & CONT_MASK);
    (ch << 6) | payload
}
/// Returns the referenced byte, or `0` when there is none.
#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
    opt.copied().unwrap_or(0)
}
/// Mask selecting the six payload bits of a continuation byte.
const CONT_MASK: u8 = 0x3f;
/// Appends the encoded bytes of `code_point` to the end of `string`.
///
/// The code point is first encoded into a four-byte stack buffer (the
/// longest sequence `encode_utf8_raw` can produce) and the bytes actually
/// used are then copied onto `string.bytes`. This stays entirely in safe
/// code: no slice reference is ever formed over the vector's uninitialized
/// spare capacity.
#[inline]
pub fn push_code_point(string: &mut Wtf8Buf, code_point: CodePoint) {
    let mut buf = [0u8; 4];
    // With a four-byte buffer the encoder cannot run out of room; should it
    // ever report failure, nothing is appended (as before).
    if let Some(used) = encode_utf8_raw(code_point.to_u32(), &mut buf) {
        string.bytes.extend_from_slice(&buf[..used]);
    }
}
/// Reports whether `index` is a position where a code point may start or end.
///
/// The end of the slice (`index == slice.len()`) always counts as a
/// boundary. An index past the end never does. Any other index is a
/// boundary exactly when the byte there is not a continuation byte
/// (`10xx_xxxx`).
#[inline]
pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
    index == slice.len()
        || slice
            .bytes
            .get(index)
            .map_or(false, |&b| b & 0xc0 != 0x80)
}
/// Returns the sub-slice `begin..end` of `s` without any bounds checking.
///
/// # Safety
///
/// The caller must guarantee that `begin <= end` and that `end` does not
/// exceed the length of `s.bytes`; otherwise the pointer arithmetic and the
/// resulting slice are out of bounds. Nothing here verifies that the two
/// indices fall on code point boundaries; presumably callers check that
/// first (confirm against call sites).
#[inline]
pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
    let start = s.bytes.as_ptr().add(begin);
    let len = end - begin;
    let raw = slice::from_raw_parts(start, len);
    // Reinterpret the byte slice as `Wtf8`; this relies on `&Wtf8` having
    // the same representation as a byte-slice reference.
    mem::transmute(raw)
}
/// Panics with a message describing a failed slicing attempt on `s`.
///
/// # Panics
///
/// Always. If `begin > end` the assertion fires first; otherwise the panic
/// reports that the indices do not lie on a character boundary.
#[inline(never)]
pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
    assert!(begin <= end);
    panic!(
        "index {} and/or {} in {:?} do not lie on character boundary",
        begin, end, s
    );
}
pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits) -> Option<u16> {
if iter.extra != 0 {
let tmp = iter.extra;
iter.extra = 0;
return Some(tmp);
}
let mut buf = [0u16; 2];
iter.code_points.next().map(|code_point| {
let n = encode_utf16_raw(code_point.to_u32(), &mut buf).unwrap_or(0);
if n == 2 {
iter.extra = buf[1];
}
buf[0]
})
}
/// Iterator adapter that decodes a stream of UTF-16 code units.
///
/// Each item is `Ok(char)` for a decoded scalar value or `Err(unit)` for a
/// surrogate that could not be paired. Created by [`decode_utf16`].
pub struct DecodeUtf16<I>
where
    I: Iterator<Item = u16>,
{
    /// Source of code units.
    iter: I,
    /// A unit that was read while looking for a trail surrogate but turned
    /// out not to be one; it is decoded first on the next call to `next`.
    buf: Option<u16>,
}

/// Wraps `iterable` in a [`DecodeUtf16`] iterator.
#[inline]
pub fn decode_utf16<I: IntoIterator<Item = u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
    DecodeUtf16 {
        iter: iterable.into_iter(),
        buf: None,
    }
}

impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
    type Item = Result<char, u16>;

    fn next(&mut self) -> Option<Result<char, u16>> {
        // Prefer the unit left over from a failed pairing attempt.
        let u = match self.buf.take() {
            Some(buf) => buf,
            None => self.iter.next()?,
        };

        if !(0xd800..=0xdfff).contains(&u) {
            // SAFETY: `u` is not a surrogate, so it is a valid scalar value.
            Some(Ok(unsafe { char::from_u32_unchecked(u as u32) }))
        } else if u >= 0xdc00 {
            // A trail surrogate with no preceding lead surrogate.
            Some(Err(u))
        } else {
            // `u` is a lead surrogate; it needs a trail surrogate next.
            let u2 = match self.iter.next() {
                Some(u2) => u2,
                // End of input: the lead surrogate stays unpaired.
                None => return Some(Err(u)),
            };
            if !(0xdc00..=0xdfff).contains(&u2) {
                // Not a trail surrogate: report `u` and keep `u2` so that it
                // is decoded on the next call.
                self.buf = Some(u2);
                return Some(Err(u));
            }

            let c = (((u - 0xd800) as u32) << 10 | (u2 - 0xdc00) as u32) + 0x1_0000;
            // SAFETY: a lead/trail pair always combines to a value in
            // 0x1_0000..=0x10_FFFF, which is a valid scalar value.
            Some(Ok(unsafe { char::from_u32_unchecked(c) }))
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.iter.size_hint();

        // Contribution of the buffered unit, which the inner iterator's hint
        // knows nothing about. `buf` never holds a trail surrogate (see
        // `next`), so it is either a non-surrogate or a lead surrogate.
        let (low_buf, high_buf) = match self.buf {
            // Nothing buffered.
            None => (0, 0),
            // A non-surrogate always decodes to exactly one item.
            Some(u) if !(0xd800..=0xdfff).contains(&u) => (1, 1),
            // A lead surrogate with nothing left to pair with: one error.
            Some(_) if high == Some(0) => (1, 1),
            // A lead surrogate that may pair with the next unit (adding no
            // item of its own) or fail to pair (adding one error).
            Some(_) => (0, 1),
        };

        // The inner units may all pair up (two units per item) or all stand
        // alone (one unit per item). With an odd lower bound at least one
        // unit must remain unpaired, hence the rounding up.
        let low = low / 2 + low % 2 + low_buf;
        let high = high.and_then(|h| h.checked_add(high_buf));
        (low, high)
    }
}
/// Reports whether `byte` has the continuation-byte form `10xx_xxxx`.
#[inline]
fn is_continuation_byte(byte: u8) -> bool {
    matches!(byte, 0x80..=0xbf)
}
/// Reports whether the six bytes starting at `pos` are a three-byte encoded
/// lead surrogate (`ED A0..=AF xx`) immediately followed by a three-byte
/// encoded trail surrogate (`ED B0..=BF xx`).
///
/// Returns `false` when fewer than six bytes are available from `pos`.
#[inline]
fn is_surrogate_pair(bytes: &[u8], pos: usize) -> bool {
    if pos + 5 >= bytes.len() {
        return false;
    }
    match &bytes[pos..pos + 6] {
        [0xed, 0xa0..=0xaf, lead_tail, 0xed, 0xb0..=0xbf, trail_tail] => {
            is_continuation_byte(*lead_tail) && is_continuation_byte(*trail_tail)
        }
        _ => false,
    }
}
/// Checks whether `bytes` is well-formed according to the rules below.
///
/// The input is scanned sequence by sequence and rejected on the first
/// violation:
///
/// * a lead byte in `0x80..=0xC1` or `0xF8..=0xFF`;
/// * a sequence that is truncated or whose trailing bytes are not
///   continuation bytes;
/// * an overlong three- or four-byte form (`E0` followed by `< A0`, `F0`
///   followed by `< 90`);
/// * a four-byte sequence past the encodable maximum (`F4` followed by
///   `>= 90`, or a lead byte above `F4`);
/// * an encoded lead surrogate directly followed by an encoded trail
///   surrogate (lone surrogates are accepted).
///
/// Empty input is valid.
pub fn validate_wtf8(bytes: &[u8]) -> bool {
    let len = bytes.len();
    let mut pos = 0;

    while pos < len {
        let lead = bytes[pos];

        // Each arm either rejects the input or yields the sequence length.
        let width = match lead {
            0x00..=0x7f => 1,
            0x80..=0xdf => {
                // 0x80..=0xBF are stray continuation bytes and 0xC0/0xC1
                // could only start overlong forms.
                let well_formed = lead >= 0xc2
                    && pos + 1 < len
                    && is_continuation_byte(bytes[pos + 1]);
                if !well_formed {
                    return false;
                }
                2
            }
            0xe0..=0xef => {
                let tail_ok = pos + 2 < len
                    && is_continuation_byte(bytes[pos + 1])
                    && is_continuation_byte(bytes[pos + 2]);
                if !tail_ok {
                    return false;
                }
                let overlong = lead == 0xe0 && bytes[pos + 1] < 0xa0;
                if overlong || is_surrogate_pair(bytes, pos) {
                    return false;
                }
                3
            }
            0xf0..=0xf7 => {
                let tail_ok = pos + 3 < len
                    && is_continuation_byte(bytes[pos + 1])
                    && is_continuation_byte(bytes[pos + 2])
                    && is_continuation_byte(bytes[pos + 3]);
                if !tail_ok {
                    return false;
                }
                let second = bytes[pos + 1];
                let overlong = lead == 0xf0 && second < 0x90;
                let too_large = lead > 0xf4 || (lead == 0xf4 && second >= 0x90);
                if overlong || too_large {
                    return false;
                }
                4
            }
            0xf8..=0xff => return false,
        };

        pos += width;
    }

    true
}