/// Reports whether `byte` is acceptable as the first byte of a UTF-8 sequence.
///
/// Returns `true` for ASCII bytes (`0x00..=0x7F`) and for bytes in `0xC2..=0xF4`; every other
/// value (`0x80..=0xC1` and `0xF5..=0xFF`) yields `false`.
#[allow(dead_code)]
pub(super) fn is_utf8_start(byte: u8) -> bool {
    match byte {
        0x00..=0x7f | 0xc2..=0xf4 => true,
        _ => false,
    }
}
/// Decodes the UTF-8 sequence at the start of `bytes`.
///
/// On success returns the decoded `char` together with the number of bytes it occupies
/// (as reported by `scan_utf8_length`). Bytes after that first sequence are ignored.
///
/// # Errors
///
/// Forwards the `Err` value of `scan_utf8_length` unchanged when `bytes` does not begin with a
/// complete, well-formed sequence.
///
/// # Panics
///
/// Panics if `scan_utf8_length` reports a length larger than `bytes.len()`.
#[allow(dead_code)]
pub(super) fn read_utf8(bytes: &[u8]) -> core::result::Result<(char, usize), usize> {
    // Payload bits of a continuation byte (`10xx_xxxx`).
    const CONTINUATION_MASK: u8 = 0b0011_1111;
    // Payload bits of a single-byte (ASCII) sequence (`0xxx_xxxx`).
    const ASCII_MASK: u8 = 0b0111_1111;

    let length = scan_utf8_length(bytes)?;
    assert!(
        length <= bytes.len(),
        "buffer does not contain sufficient bytes despite UTF-8 validation"
    );

    // The lead byte of an N-byte sequence (N >= 2) carries `7 - N` payload bits, which
    // `0x7f >> N` isolates. That formula must not be used for N == 1: it would give `0x3f`
    // and strip bit 6 from ASCII bytes (e.g. turning b'A' into U+0001).
    let lead_mask = if length == 1 {
        ASCII_MASK
    } else {
        ASCII_MASK >> length
    };

    let mut codepoint = u32::from(bytes[0] & lead_mask);
    for &byte in bytes[..length].iter().skip(1) {
        codepoint = (codepoint << 6) | u32::from(byte & CONTINUATION_MASK);
    }

    // SAFETY: `scan_utf8_length` accepted these `length` bytes. Its second-byte range checks
    // reject overlong forms, the surrogate range (lead 0xED is limited to 0x80..=0x9F) and
    // values above U+10FFFF (lead 0xF4 is limited to 0x80..=0x8F), so `codepoint` is a
    // Unicode scalar value.
    Ok((unsafe { char::from_u32_unchecked(codepoint) }, length))
}
/// Validates the UTF-8 sequence at the start of `bytes` and reports how many bytes it spans.
///
/// Only the first sequence is examined; anything after it is ignored.
///
/// # Errors
///
/// * `Err(0)` when `bytes` is empty.
/// * `Err(1)` when the first byte is not a usable lead byte (its `UTF8_CHAR_WIDTH` entry is
///   not 2, 3 or 4).
/// * `Err(i)` with `i >= 1` when byte `i` of the sequence is either missing (the input ends
///   there) or not permitted at that position.
pub(super) fn scan_utf8_length(bytes: &[u8]) -> core::result::Result<usize, usize> {
    // Continuation bytes are exactly `0x80..=0xBF`, i.e. the bit pattern `10xx_xxxx`.
    fn is_continuation(byte: u8) -> bool {
        (byte & 0xC0) == 0x80
    }

    let lead = match bytes.first() {
        None => return Err(0),
        Some(&byte) if byte < 0x80 => return Ok(1),
        Some(&byte) => byte,
    };

    // The table only covers 0x80..=0xFF, hence the offset.
    let width = usize::from(UTF8_CHAR_WIDTH[usize::from(lead - 0x80)]);
    if !(2..=4).contains(&width) {
        return Err(1);
    }

    for position in 1..width {
        let acceptable = match bytes.get(position) {
            // Input ended before the sequence was complete.
            None => false,
            // The second byte has lead-dependent bounds for 3- and 4-byte sequences.
            Some(&byte) if position == 1 => match width {
                2 => is_continuation(byte),
                3 => match (lead, byte) {
                    (0xE0, 0xA0..=0xBF)
                    | (0xE1..=0xEC, 0x80..=0xBF)
                    | (0xED, 0x80..=0x9F)
                    | (0xEE..=0xEF, 0x80..=0xBF) => true,
                    _ => false,
                },
                _ => match (lead, byte) {
                    (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => true,
                    _ => false,
                },
            },
            // Third and fourth bytes only need to be continuation bytes.
            Some(&byte) => is_continuation(byte),
        };
        if !acceptable {
            return Err(position);
        }
    }

    Ok(width)
}
/// Sequence length selected by each byte in `0x80..=0xFF` when it appears as a lead byte.
///
/// The table is indexed by `byte - 0x80`. An entry of `0` marks a byte that cannot start a
/// sequence; the remaining entries are 2, 3 or 4.
#[rustfmt::skip]
const UTF8_CHAR_WIDTH: &[u8; 128] = &[
    // 0x80..=0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x90..=0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0xA0..=0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0xB0..=0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0xC0..=0xCF (0xC0 and 0xC1 are never valid)
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xD0..=0xDF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xE0..=0xEF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0xF0..=0xFF (0xF5 and above are never valid)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
// Alternative, table-driven scanner. `#[cfg(any())]` is never satisfied (an `any` with no
// predicates is false), so this module is excluded from every build.
// NOTE(review): the class/transition tables look like a variant of Bjoern Hoehrmann's UTF-8
// DFA with the accept and reject rows swapped — confirm against the original before enabling.
#[cfg(any())]
mod algo2 {
/// Runs the bytes at the start of `bytes` through the state machine described by `CLASSES`
/// and `TRANSITIONS`.
///
/// Returns `Ok(n)` with the number of bytes consumed as soon as the machine is back in
/// `UTF8_ACCEPT`. Returns `Err(max(1, index))` when the byte at `index` drives the machine
/// into `UTF8_REJECT`, and `Err(bytes.len())` when the input ends before either state is
/// reached (which is `Err(0)` for empty input).
pub(super) fn scan_utf8(bytes: &[u8]) -> std::result::Result<usize, usize> {
let (mut index, mut state) = (0, UTF8_ACCEPT);
while index < bytes.len() {
// `state` is already a row offset into `TRANSITIONS`; the byte's class selects the column.
state = TRANSITIONS[(state + CLASSES[bytes[index] as usize]) as usize];
if state == UTF8_ACCEPT {
return Ok(index + 1);
} else if state == UTF8_REJECT {
// A rejected first byte is still reported as `Err(1)`, never `Err(0)`.
return Err(1.max(index));
}
index += 1;
}
// Ran out of input in the middle of a sequence (or `bytes` was empty).
Err(index)
}
// Maps every byte value to a character class in `0..=11`; the class is the column index
// into `TRANSITIONS`. Each source row below covers 32 consecutive byte values.
#[rustfmt::skip]
static CLASSES: [u8; 256] = [
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
// States are stored as row offsets into `TRANSITIONS` (multiples of 12, one column per class).
// Offset of the start/accept row.
const UTF8_ACCEPT: u8 = 12;
// Offset of the reject row; every entry in that row is 0, so the state is absorbing.
const UTF8_REJECT: u8 = 0;
// Next-state table, indexed by `state + class`: 9 rows of 12 columns, each entry being the
// row offset of the next state.
#[rustfmt::skip]
static TRANSITIONS: [u8; 108] = [
// state 0 (UTF8_REJECT)
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// state 12 (UTF8_ACCEPT)
12, 0,24,36,60,96,84, 0, 0, 0,48,72,
// state 24
0,12, 0, 0, 0, 0, 0,12, 0,12, 0, 0,
// state 36
0,24, 0, 0, 0, 0, 0,24, 0,24, 0, 0,
// state 48
0, 0, 0, 0, 0, 0, 0,24, 0, 0, 0, 0,
// state 60
0,24, 0, 0, 0, 0, 0, 0, 0,24, 0, 0,
// state 72
0, 0, 0, 0, 0, 0, 0,36, 0,36, 0, 0,
// state 84
0,36, 0, 0, 0, 0, 0,36, 0,36, 0, 0,
// state 96
0,36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
}