use core::mem;
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
struct Utf8Error {
valid_up_to: usize,
error_len: Option<u8>,
}
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
#[inline]
fn contains_nonascii(x: usize) -> bool {
(x & NONASCII_MASK) != 0
}
#[inline]
fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut index = 0;
let len = v.len();
let usize_bytes = mem::size_of::<usize>();
let ascii_block_size = 2 * usize_bytes;
let blocks_end = if len >= ascii_block_size {
len - ascii_block_size + 1
} else {
0
};
while index < len {
let old_offset = index;
macro_rules! err {
($error_len: expr) => {
return Err(Utf8Error {
valid_up_to: old_offset,
error_len: $error_len,
});
};
}
macro_rules! next {
() => {{
index += 1;
if index >= len {
err!(None)
}
v[index]
}};
}
let first = v[index];
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
match w {
2 => {
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(1))
}
}
3 => {
match (first, next!()) {
(0xE0, 0xA0..=0xBF)
| (0xE1..=0xEC, 0x80..=0xBF)
| (0xED, 0x80..=0x9F)
| (0xEE..=0xEF, 0x80..=0xBF) => {}
_ => err!(Some(1)),
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
}
4 => {
match (first, next!()) {
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
_ => err!(Some(1)),
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(3))
}
}
_ => err!(Some(1)),
}
index += 1;
} else {
let ptr = v.as_ptr();
let align = unsafe {
ptr.add(index).align_offset(usize_bytes)
};
if align == 0 {
while index < blocks_end {
unsafe {
let block = ptr.add(index) as *const usize;
let zu = contains_nonascii(*block);
let zv = contains_nonascii(*block.offset(1));
if zu | zv {
break;
}
}
index += ascii_block_size;
}
while index < len && v[index] < 128 {
index += 1;
}
} else {
index += 1;
}
}
}
Ok(())
}
#[cfg_attr(rustfmt, rustfmt_skip)]
static UTF8_CHAR_WIDTH: [u8; 256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, ];
const CONT_MASK: u8 = 0b0011_1111;
const TAG_CONT_U8: u8 = 0b1000_0000;
pub fn is_utf8(bytes: &[u8]) -> bool {
run_utf8_validation(bytes).is_ok()
}