#![allow(unsafe_code)]
use core::mem;
use crate::printable::is_printable;
/// The UTF-8 encoding of a single `char`, stored inline in a four-byte buffer.
pub struct CharEncodeUtf8 {
/// Encoded bytes; only the first `len` bytes are meaningful, the rest stay zero.
buf: [u8; 4],
/// Number of bytes of `buf` that are in use.
len: u8,
}
impl CharEncodeUtf8 {
    /// Encodes `ch` as UTF-8.
    ///
    /// The encoded length is `ch.len_utf8()`; bytes of the internal buffer
    /// beyond that length are left as zero.
    pub const fn new(ch: char) -> Self {
        // Marker bits for continuation bytes and for the lead byte of a
        // two-, three- and four-byte sequence.
        const CONT_TAG: u8 = 0b1000_0000;
        const LEAD_TAG_2: u8 = 0b1100_0000;
        const LEAD_TAG_3: u8 = 0b1110_0000;
        const LEAD_TAG_4: u8 = 0b1111_0000;

        // Continuation byte carrying the six bits of `cp` that start at bit `shift`.
        const fn trailing(cp: u32, shift: u32) -> u8 {
            ((cp >> shift) & 0x3F) as u8 | CONT_TAG
        }

        let cp = ch as u32;
        let width = ch.len_utf8();
        let buf = match width {
            1 => [cp as u8, 0, 0, 0],
            2 => [((cp >> 6) & 0x1F) as u8 | LEAD_TAG_2, trailing(cp, 0), 0, 0],
            3 => [
                ((cp >> 12) & 0x0F) as u8 | LEAD_TAG_3,
                trailing(cp, 6),
                trailing(cp, 0),
                0,
            ],
            4 => [
                ((cp >> 18) & 0x07) as u8 | LEAD_TAG_4,
                trailing(cp, 12),
                trailing(cp, 6),
                trailing(cp, 0),
            ],
            // `len_utf8` only reports 1..=4; any other width leaves the buffer zeroed.
            _ => [0; 4],
        };

        Self {
            buf,
            len: width as u8,
        }
    }

    /// Returns the encoded bytes (the first `len` bytes of the buffer).
    pub const fn as_bytes(&self) -> &[u8] {
        let end = self.len as usize;
        crate::bytes::subslice(&self.buf, 0..end)
    }

    /// Returns the encoded bytes as a `&str` (test builds only).
    #[cfg(test)]
    pub fn as_str(&self) -> &str {
        let encoded = &self.buf[..self.len as usize];
        // SAFETY: the buffer is filled by `new`, which writes the UTF-8
        // encoding of a `char` and records its exact length.
        unsafe { core::str::from_utf8_unchecked(encoded) }
    }
}
#[test]
fn test_char_encode_utf8() {
    // Compares our encoder with `char::encode_utf8` for a single character.
    fn check(ch: char) {
        let encoded = CharEncodeUtf8::new(ch);
        let mut scratch = [0; 4];
        let expected: &str = ch.encode_utf8(&mut scratch);
        assert_eq!(encoded.as_str(), expected);
    }

    // One-byte, three-byte and four-byte (maximum code point) cases.
    check('\0');
    check('我');
    check('\u{10ffff}');
}
/// The `\u{…}` escape of a single `char`, stored inline in a ten-byte buffer.
pub struct CharEscapeUnicode {
/// Escape text; only the first `len` bytes are meaningful, the rest stay zero.
buf: [u8; 10],
/// Number of bytes of `buf` that are in use.
len: u8,
}
impl CharEscapeUnicode {
    /// Formats `code` as `\u{…}` using as few hex digits as possible (at least one).
    ///
    /// Digits are produced by `crate::ascii::num_to_hex_digit`.
    ///
    /// # Safety
    ///
    /// `code` must fit in six hex digits (`code <= 0xFF_FFFF`); the ten-byte
    /// buffer has no room for more and larger values index past its end.
    /// Every `char` satisfies this.
    const unsafe fn from_code_point(code: u32) -> Self {
        // Count the hex digits needed; zero still prints as a single `0`.
        let mut digits: usize = 1;
        let mut rest = code >> 4;
        while rest != 0 {
            digits += 1;
            rest >>= 4;
        }

        let mut buf = [b'\\', b'u', b'{', 0, 0, 0, 0, 0, 0, 0];

        // Emit the digits from most to least significant, right after `\u{`.
        let mut written = 0;
        while written < digits {
            let shift = 4 * (digits - 1 - written) as u32;
            let nibble = ((code >> shift) & 0x0f) as u8;
            buf[3 + written] = crate::ascii::num_to_hex_digit(nibble);
            written += 1;
        }
        buf[3 + digits] = b'}';

        Self {
            buf,
            // Three bytes of prefix, the digits, and the closing brace.
            len: (digits + 4) as u8,
        }
    }

    /// Builds the `\u{…}` escape for `ch`.
    pub const fn new(ch: char) -> Self {
        let code = ch as u32;
        // SAFETY: a `char` is at most 0x10FFFF, which needs at most six hex digits.
        unsafe { Self::from_code_point(code) }
    }

    /// Returns the escape text as a `&str` (test builds only).
    #[cfg(test)]
    pub fn as_str(&self) -> &str {
        let text = &self.buf[..self.len as usize];
        // SAFETY: the buffer holds `\u{`, `}` and the bytes returned by
        // `num_to_hex_digit`, which are expected to be ASCII hex digits.
        unsafe { core::str::from_utf8_unchecked(text) }
    }
}
#[test]
fn test_char_escape_unicode() {
    // Compares our escape with `char::escape_unicode` for a single character.
    fn check(ch: char) {
        let escaped = CharEscapeUnicode::new(ch);
        let expected = ch.escape_unicode().to_string();
        assert_eq!(escaped.as_str(), expected);
    }

    // Shortest escape, a four-digit escape, and the longest (six-digit) escape.
    check('\0');
    check('我');
    check('\u{10ffff}');
}
/// The debug-style escape of a single `char`, stored inline in a ten-byte buffer.
///
/// Depending on the character this is a two-byte backslash escape, the
/// character's own UTF-8 bytes, or a `\u{…}` escape.
pub struct CharEscapeDebug {
/// Escape text; only the first `len` bytes are meaningful.
buf: [u8; 10],
/// Number of bytes of `buf` that are in use.
len: u8,
}
/// Options controlling which quote characters `CharEscapeDebug::new` escapes.
pub struct CharEscapeDebugArgs {
/// When `true`, `'` is written as `\'`.
pub escape_single_quote: bool,
/// When `true`, `"` is written as `\"`.
pub escape_double_quote: bool,
}
impl CharEscapeDebugArgs {
/// Escapes both single and double quotes (only compiled for tests).
#[cfg(test)]
pub const ESCAPE_ALL: Self = Self {
escape_single_quote: true,
escape_double_quote: true,
};
}
impl CharEscapeDebug {
    /// Builds the debug-style escape of `ch`.
    ///
    /// * NUL, tab, CR, LF and backslash become two-byte backslash escapes.
    /// * `"` and `'` do too, but only when the matching flag in `args` is set.
    /// * Any other character for which `is_printable` returns `true` is
    ///   emitted as its own UTF-8 bytes.
    /// * Everything else becomes a `\u{…}` escape.
    pub const fn new(ch: char, args: CharEscapeDebugArgs) -> Self {
        let short_escape = match ch {
            '\0' => b'0',
            '\t' => b't',
            '\r' => b'r',
            '\n' => b'n',
            '\\' => b'\\',
            '"' if args.escape_double_quote => b'"',
            '\'' if args.escape_single_quote => b'\'',
            _ => {
                // No short form applies: emit the character itself or `\u{…}`.
                return if is_printable(ch) {
                    Self::printable(ch)
                } else {
                    Self::unicode(ch)
                };
            }
        };
        Self::backslash_ascii(short_escape)
    }

    /// Stores `ch` verbatim as its UTF-8 encoding.
    const fn printable(ch: char) -> Self {
        let utf8 = CharEncodeUtf8::new(ch);
        let mut buf = [0; 10];
        // Copy all four encoder bytes; unused ones are zero and excluded by `len`.
        let mut i = 0;
        while i < 4 {
            buf[i] = utf8.buf[i];
            i += 1;
        }
        Self { buf, len: utf8.len }
    }

    /// Stores the two-byte escape `\` followed by `ch`.
    const fn backslash_ascii(ch: u8) -> Self {
        let mut buf = [0; 10];
        buf[0] = b'\\';
        buf[1] = ch;
        Self { buf, len: 2 }
    }

    /// Stores the `\u{…}` escape of `ch`.
    const fn unicode(ch: char) -> Self {
        let escape = CharEscapeUnicode::new(ch);
        Self {
            buf: escape.buf,
            len: escape.len,
        }
    }

    /// Returns the escape text as bytes (the first `len` bytes of the buffer).
    pub const fn as_bytes(&self) -> &[u8] {
        let end = self.len as usize;
        crate::bytes::subslice(&self.buf, 0..end)
    }

    /// Returns the escape text as a `&str` (test builds only).
    #[cfg(test)]
    pub fn as_str(&self) -> &str {
        let text = &self.buf[..self.len as usize];
        // SAFETY: every constructor stores either an ASCII escape sequence or
        // the UTF-8 encoding of a `char`, with `len` covering exactly that text.
        unsafe { core::str::from_utf8_unchecked(text) }
    }
}
#[test]
fn test_char_escape_debug() {
    // Compares our escape with `char::escape_debug` for a single character.
    fn check(ch: char) {
        let escaped = CharEscapeDebug::new(ch, CharEscapeDebugArgs::ESCAPE_ALL);
        let expected = ch.escape_debug().to_string();
        assert_eq!(escaped.as_str(), expected);
    }

    // Every ASCII character, then the maximum code point.
    ('\0'..='\u{7f}').for_each(check);
    check('\u{10ffff}');
}
/// Decodes the first character of `bytes`.
///
/// Returns `None` when `bytes` is empty, otherwise the decoded character and
/// the number of bytes it occupied.
///
/// No validation is performed: the sequence length is chosen from the first
/// byte alone, continuation bytes are not checked, and any byte missing at the
/// end of the slice is treated as `0` (the reported count then stops at the
/// end of the slice).
///
/// NOTE(review): the decoded value is handed to the `unsafe`
/// `crate::str::char_from_u32`; presumably callers must pass bytes that start
/// with well-formed UTF-8 — confirm against that helper's contract.
pub const fn next_char(bytes: &[u8]) -> Option<(char, usize)> {
    // Decodes the leading code point and the number of bytes consumed.
    const fn next_code_point(bytes: &[u8]) -> Option<(u32, usize)> {
        // Value bits of a continuation byte.
        const CONT_MASK: u8 = 0b0011_1111;

        // Value bits of the byte at `idx`, or 0 when `idx` is past the end.
        const fn payload_at(bytes: &[u8], idx: usize) -> u32 {
            if idx < bytes.len() {
                (bytes[idx] & CONT_MASK) as u32
            } else {
                0
            }
        }

        let available = bytes.len();
        if available == 0 {
            return None;
        }

        let lead = bytes[0];
        if lead < 0x80 {
            // ASCII: the byte is the code point.
            return Some((lead as u32, 1));
        }

        // Sequence length implied by the lead byte.
        let width = if lead >= 0xF0 {
            4
        } else if lead >= 0xE0 {
            3
        } else {
            2
        };
        // Never report more bytes than the slice actually holds.
        let consumed = if available < width { available } else { width };

        // Low five bits of the lead byte; the four-byte form keeps only three.
        let head = (lead & 0x1F) as u32;
        let second = payload_at(bytes, 1);
        let code = match width {
            2 => head << 6 | second,
            3 => head << 12 | second << 6 | payload_at(bytes, 2),
            _ => {
                (head & 0x07) << 18
                    | second << 12
                    | payload_at(bytes, 2) << 6
                    | payload_at(bytes, 3)
            }
        };
        Some((code, consumed))
    }

    match next_code_point(bytes) {
        Some((code, consumed)) => {
            // SAFETY: NOTE(review) — relies on `code` being a valid scalar
            // value, which holds when `bytes` starts with well-formed UTF-8.
            let ch = unsafe { crate::str::char_from_u32(code) };
            Some((ch, consumed))
        }
        None => None,
    }
}
/// Counts the characters in `s` by decoding them one at a time with `next_char`.
pub const fn str_count_chars(s: &str) -> usize {
    let mut rest = s.as_bytes();
    let mut total = 0;
    loop {
        match next_char(rest) {
            Some((_, width)) => {
                rest = crate::bytes::advance(rest, width);
                total += 1;
            }
            None => return total,
        }
    }
}
/// Collects the characters of `s` into an array of exactly `N` elements.
///
/// `N` must equal the number of characters in `s` (see `str_count_chars`):
/// more characters than `N` index past the array, and fewer trip the final
/// `constfn_assert!`.
pub const fn str_chars<const N: usize>(s: &str) -> [char; N] {
    let mut rest = s.as_bytes();
    let mut out: [char; N] = ['\0'; N];
    let mut filled = 0;
    loop {
        let (ch, width) = match next_char(rest) {
            Some(decoded) => decoded,
            None => break,
        };
        rest = crate::bytes::advance(rest, width);
        out[filled] = ch;
        filled += 1;
    }
    constfn_assert!(filled == N);
    out
}
#[test]
fn test_str_chars() {
    // Both helpers are evaluated at compile time here, which also exercises
    // their `const fn` status.
    const INPUT: &str = "唐可可";
    const COUNT: usize = str_count_chars(INPUT);
    const CHARS: [char; COUNT] = str_chars::<COUNT>(INPUT);

    let expected: Vec<char> = INPUT.chars().collect();
    assert_eq!(CHARS, expected.as_slice());
}
/// Describes where and how `run_utf8_validation` rejected its input.
pub struct Utf8Error {
/// Index of the first byte of the sequence that failed validation; every
/// sequence before this index was accepted.
pub valid_up_to: usize,
/// `None` when the input ended in the middle of a sequence; otherwise
/// `Some(n)` with `n` in `1..=3`, as reported at the failing check.
pub error_len: Option<u8>,
}
/// Builds the table behind `UTF8_CHAR_WIDTH` from the lead-byte ranges.
const fn build_utf8_char_width_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut byte = 0;
    while byte < 256 {
        table[byte] = match byte {
            // ASCII.
            0x00..=0x7F => 1,
            // Lead bytes of two-, three- and four-byte sequences.
            0xC2..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF4 => 4,
            // Continuation bytes (0x80..=0xBF) and bytes that never start a
            // sequence (0xC0, 0xC1, 0xF5..=0xFF).
            _ => 0,
        };
        byte += 1;
    }
    table
}

/// Sequence length implied by each possible first byte; `0` marks bytes that
/// cannot start a sequence.
const UTF8_CHAR_WIDTH: &[u8; 256] = &build_utf8_char_width_table();
/// Returns the sequence length that `UTF8_CHAR_WIDTH` records for first byte
/// `b`, or `0` when `b` cannot start a sequence.
#[must_use]
#[inline]
pub const fn utf8_char_width(b: u8) -> usize {
    let slot = b as usize;
    UTF8_CHAR_WIDTH[slot] as usize
}
/// A `usize` whose every byte is `0x80`, i.e. the high bit of each byte lane.
///
/// Built from the pointer width so one definition covers all targets.
const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; core::mem::size_of::<usize>()]);
/// Returns `true` if any byte of the word `x` has its high bit set.
#[inline]
const fn contains_nonascii(x: usize) -> bool {
    let high_bits = x & NONASCII_MASK;
    high_bits != 0
}
pub const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut index = 0;
let len = v.len();
let usize_bytes = mem::size_of::<usize>();
let ascii_block_size = 2 * usize_bytes;
let blocks_end = if len >= ascii_block_size {
len - ascii_block_size + 1
} else {
0
};
let align = usize::MAX;
while index < len {
let old_offset = index;
macro_rules! err {
($error_len: expr) => {
return Err(Utf8Error {
valid_up_to: old_offset,
error_len: $error_len,
})
};
}
macro_rules! next {
() => {{
index += 1;
if index >= len {
err!(None)
}
v[index]
}};
}
let first = v[index];
if first >= 128 {
let w = utf8_char_width(first);
match w {
2 => {
if next!() as i8 >= -64 {
err!(Some(1))
}
}
3 => {
match (first, next!()) {
(0xE0, 0xA0..=0xBF)
| (0xE1..=0xEC, 0x80..=0xBF)
| (0xED, 0x80..=0x9F)
| (0xEE..=0xEF, 0x80..=0xBF) => {}
_ => err!(Some(1)),
}
if next!() as i8 >= -64 {
err!(Some(2))
}
}
4 => {
match (first, next!()) {
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
_ => err!(Some(1)),
}
if next!() as i8 >= -64 {
err!(Some(2))
}
if next!() as i8 >= -64 {
err!(Some(3))
}
}
_ => err!(Some(1)),
}
index += 1;
} else {
if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
let ptr = v.as_ptr();
while index < blocks_end {
unsafe {
let block = ptr.add(index) as *const usize;
let zu = contains_nonascii(*block);
let zv = contains_nonascii(*block.offset(1));
if zu || zv {
break;
}
}
index += ascii_block_size;
}
while index < len && v[index] < 128 {
index += 1;
}
} else {
index += 1;
}
}
}
Ok(())
}