/// Maximum number of bytes a single character can occupy when encoded as UTF-8.
pub(crate) const MAX_BYTES_PER_CHAR: usize = 4;
/// Returns whether `b` is a complete 1-byte UTF-8 sequence, that is, its
/// highest bit is not set (`0x00..=0x7F`).
pub(crate) fn is_1byte(b: u8) -> bool {
    matches!(b, 0x00..=0x7F)
}
/// Mask selecting the marker bits of the first byte of a 2-byte UTF-8 sequence.
const _2BYTE_MASK: u8 = 0b1110_0000;
/// Mask selecting the code point bits of the first byte of a 2-byte UTF-8 sequence.
const _2BYTE_MASK_VAL: u8 = !_2BYTE_MASK;

/// Returns whether `b` has the bit pattern `110x_xxxx`, which marks the first
/// byte of a 2-byte UTF-8 sequence.
///
/// Only the marker bits are inspected; whether the encoded code point is valid
/// (e.g. not overlong) has to be checked separately, see `is_valid_2bytes`.
pub(crate) fn is_2byte_start(b: u8) -> bool {
    matches!(b & _2BYTE_MASK, 0b1100_0000)
}
/// Mask selecting the marker bits of the first byte of a 3-byte UTF-8 sequence.
const _3BYTE_MASK: u8 = 0b1111_0000;
/// Mask selecting the code point bits of the first byte of a 3-byte UTF-8 sequence.
const _3BYTE_MASK_VAL: u8 = !_3BYTE_MASK;

/// Returns whether `b` has the bit pattern `1110_xxxx`, which marks the first
/// byte of a 3-byte UTF-8 sequence.
///
/// Only the marker bits are inspected; whether the encoded code point is valid
/// has to be checked separately, see `is_valid_3bytes`.
pub(crate) fn is_3byte_start(b: u8) -> bool {
    matches!(b & _3BYTE_MASK, 0b1110_0000)
}
/// Mask selecting the marker bits of the first byte of a 4-byte UTF-8 sequence.
const _4BYTE_MASK: u8 = 0b1111_1000;
/// Mask selecting the code point bits of the first byte of a 4-byte UTF-8 sequence.
const _4BYTE_MASK_VAL: u8 = !_4BYTE_MASK;

/// Returns whether `b` has the bit pattern `1111_0xxx`, which marks the first
/// byte of a 4-byte UTF-8 sequence.
///
/// Only the marker bits are inspected; whether the encoded code point is valid
/// (e.g. not above U+10FFFF) has to be checked separately, see `is_valid_4bytes`.
pub(crate) fn is_4byte_start(b: u8) -> bool {
    matches!(b & _4BYTE_MASK, 0b1111_0000)
}
/// Mask selecting the marker bits of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0b1100_0000;
/// Mask selecting the code point bits of a UTF-8 continuation byte.
const CONT_MASK_VAL: u8 = !CONT_MASK;

/// Returns whether `b` has the bit pattern `10xx_xxxx`, which marks a
/// continuation byte, i.e. a non-first byte of a multi-byte UTF-8 sequence.
pub(crate) fn is_continuation(b: u8) -> bool {
    matches!(b & CONT_MASK, 0b1000_0000)
}
/// Returns whether the 2-byte UTF-8 sequence `b0 b1` encodes a valid code point.
///
/// The caller must make sure that `b0` is a 2-byte start and `b1` is a
/// continuation byte; this is only verified when debug assertions are enabled.
///
/// The result is `false` for overlong encodings, i.e. when the decoded code
/// point is smaller than `0x80`.
pub(crate) fn is_valid_2bytes(b0: u8, b1: u8) -> bool {
    debug_assert!(is_2byte_start(b0) && is_continuation(b1));

    let high_bits = u32::from(b0 & _2BYTE_MASK_VAL);
    let low_bits = u32::from(b1 & CONT_MASK_VAL);
    // Each continuation byte contributes 6 bits to the code point
    let code_point = (high_bits << 6) | low_bits;

    // Smaller code points would be overlong encodings
    code_point >= 0x80
}
/// Returns whether the 3-byte UTF-8 sequence `b0 b1 b2` encodes a valid code point.
///
/// The caller must make sure that `b0` is a 3-byte start and that `b1` and `b2`
/// are continuation bytes; this is only verified when debug assertions are enabled.
///
/// The result is `false` for overlong encodings (decoded code point smaller
/// than `0x800`) and for code points in the range `0xD800..=0xDFFF`.
pub(crate) fn is_valid_3bytes(b0: u8, b1: u8, b2: u8) -> bool {
    debug_assert!(is_3byte_start(b0) && is_continuation(b1) && is_continuation(b2));

    // Start with the code point bits of the first byte, then append the 6 code
    // point bits of every continuation byte
    let code_point = [b1, b2]
        .iter()
        .fold(u32::from(b0 & _3BYTE_MASK_VAL), |acc, &cont| {
            (acc << 6) | u32::from(cont & CONT_MASK_VAL)
        });

    if code_point < 0x800 {
        // Overlong encoding
        return false;
    }
    !(0xD800..=0xDFFF).contains(&code_point)
}
/// Returns whether the 4-byte UTF-8 sequence `b0 b1 b2 b3` encodes a valid code point.
///
/// The caller must make sure that `b0` is a 4-byte start and that `b1`, `b2`
/// and `b3` are continuation bytes; this is only verified when debug assertions
/// are enabled.
///
/// The result is `true` only if the decoded code point is in the range
/// `0x10000..=0x10FFFF`; smaller values are overlong encodings and larger
/// values are rejected as well.
pub(crate) fn is_valid_4bytes(b0: u8, b1: u8, b2: u8, b3: u8) -> bool {
    debug_assert!(
        is_4byte_start(b0) && is_continuation(b1) && is_continuation(b2) && is_continuation(b3)
    );

    // Start with the code point bits of the first byte, then append the 6 code
    // point bits of every continuation byte
    let code_point = [b1, b2, b3]
        .iter()
        .fold(u32::from(b0 & _4BYTE_MASK_VAL), |acc, &cont| {
            (acc << 6) | u32::from(cont & CONT_MASK_VAL)
        });

    (0x10000..=0x10FFFF).contains(&code_point)
}
/// Verifies that `bytes` is valid UTF-8, but only when debug assertions are enabled.
///
/// # Panics
/// Panics if debug assertions are enabled and `bytes` is not valid UTF-8.
/// Without debug assertions this function does nothing.
fn debug_assert_valid_utf8(bytes: &[u8]) {
    if !cfg!(debug_assertions) {
        return;
    }

    if let Err(e) = std::str::from_utf8(bytes) {
        panic!(
            "Unexpected: Invalid UTF-8 bytes detected, report this to the Struson maintainers: {e:?}; bytes: {bytes:02X?}"
        );
    }
}
/// Converts `bytes`, which the caller expects to be valid UTF-8, to a `&str`.
///
/// Despite the name the bytes are currently still validated by
/// `std::str::from_utf8`.
///
/// # Panics
/// Panics if `bytes` is not valid UTF-8. With debug assertions enabled the
/// panic is raised by `debug_assert_valid_utf8`, otherwise by unwrapping the
/// conversion result.
pub(crate) fn to_str_unchecked(bytes: &[u8]) -> &str {
    debug_assert_valid_utf8(bytes);

    let converted = std::str::from_utf8(bytes);
    converted.unwrap()
}
/// Converts `bytes`, which the caller expects to be valid UTF-8, to a `String`.
///
/// Despite the name the bytes are currently still validated by
/// `String::from_utf8`.
///
/// # Panics
/// Panics if `bytes` is not valid UTF-8. With debug assertions enabled the
/// panic is raised by `debug_assert_valid_utf8`, otherwise by unwrapping the
/// conversion result.
pub(crate) fn to_string_unchecked(bytes: Vec<u8>) -> String {
    debug_assert_valid_utf8(bytes.as_slice());

    let converted = String::from_utf8(bytes);
    converted.unwrap()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::panic::UnwindSafe;

    /// Calls `f`, which is expected to panic, and returns the panic message.
    ///
    /// Panics itself if `f` does not panic, or if the panic value is not a `String`.
    #[must_use]
    fn assert_panics<R>(f: impl FnOnce() -> R + UnwindSafe) -> String {
        let panic_value = match std::panic::catch_unwind(f) {
            Err(panic_value) => panic_value,
            Ok(_) => panic!("Expression should have panicked"),
        };

        match panic_value.downcast::<String>() {
            Ok(message) => *message,
            Err(panic_value) => {
                panic!("Panic value should have been a String, but is: {panic_value:?}")
            }
        }
    }

    /// Asserts that `message` is the panic message expected for the invalid
    /// UTF-8 bytes `C1 BF`. The debug representation of the UTF-8 error in the
    /// middle of the message is not checked.
    #[cfg(debug_assertions)]
    fn assert_invalid_utf8_message(message: &str) {
        assert!(
            message.starts_with(
                "Unexpected: Invalid UTF-8 bytes detected, report this to the Struson maintainers: "
            ),
            "Unexpected prefix for message: {message}"
        );
        assert!(
            message.ends_with("; bytes: [C1, BF]"),
            "Unexpected suffix for message: {message}"
        );
    }

    // Detailed panic message is only created when debug assertions are enabled
    #[cfg(debug_assertions)]
    #[test]
    fn to_str_unchecked_invalid() {
        let message = assert_panics(|| to_str_unchecked(b"\xC1\xBF"));
        assert_invalid_utf8_message(&message);
    }

    // Detailed panic message is only created when debug assertions are enabled
    #[cfg(debug_assertions)]
    #[test]
    fn to_string_unchecked_invalid() {
        let message = assert_panics(|| to_string_unchecked(b"\xC1\xBF".to_vec()));
        assert_invalid_utf8_message(&message);
    }
}