use super::Wtf8;
/// Size of a `usize` in bytes, i.e. how many input bytes one word covers.
const USIZE_SIZE: usize = core::mem::size_of::<usize>();
/// Number of `usize` words grouped together by the unrolled inner loop of
/// `do_count_chars`; `USIZE_SIZE * UNROLL_INNER` is also the byte length below
/// which `count_chars` skips the word-at-a-time path.
const UNROLL_INNER: usize = 4;
/// Counts the bytes of `s` that are not continuation bytes (one per encoded
/// code point).
///
/// Inputs shorter than `USIZE_SIZE * UNROLL_INNER` bytes are counted one byte
/// at a time; anything longer is handed to the word-at-a-time
/// `do_count_chars`.
#[inline]
pub(super) fn count_chars(s: &Wtf8) -> usize {
    // Minimum byte length for which the word-at-a-time path is used.
    const WORDWISE_MIN_LEN: usize = USIZE_SIZE * UNROLL_INNER;

    if s.len() >= WORDWISE_MIN_LEN {
        return do_count_chars(s);
    }
    char_count_general_case(s.as_bytes())
}
/// Counts the non-continuation bytes of `s` one `usize` word at a time.
///
/// The byte slice is split with `align_to::<usize>()`: the unaligned `head`
/// and `tail` are counted bytewise, while the aligned `body` is processed in
/// chunks of `CHUNK_SIZE` words whose per-byte counters are folded into the
/// running total once per chunk.
fn do_count_chars(s: &Wtf8) -> usize {
// Number of `usize` words accumulated into `counts` before the per-byte
// counters are summed into `total`.
const CHUNK_SIZE: usize = 192;
// Each word adds at most 1 to every byte lane of `counts`, so keeping a chunk
// below 256 words prevents a lane from overflowing into its neighbour.
const _: () = assert!(CHUNK_SIZE < 256);
// A full-size chunk must split evenly into `UNROLL_INNER`-word groups, so only
// the final (short) chunk can leave a remainder; the `break` below relies on
// this.
const _: () = assert!(CHUNK_SIZE.is_multiple_of(UNROLL_INNER));
// SAFETY: every bit pattern is a valid `usize`, and `align_to` only places
// correctly aligned, whole words in `body`.
let (head, body, tail) = unsafe { s.as_bytes().align_to::<usize>() };
// `align_to` makes no promise about how much data lands in `body`, so fall
// back to the bytewise count when there is no aligned body or when `head` /
// `tail` are longer than one word.
if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) {
return char_count_general_case(s.as_bytes());
}
// The unaligned edges are counted byte by byte.
let mut total = char_count_general_case(head) + char_count_general_case(tail);
// Process `body` in `CHUNK_SIZE`-word chunks so `sum_bytes_in_usize` runs once
// per chunk rather than once per word.
for chunk in body.chunks(CHUNK_SIZE) {
// Packed per-byte counters: byte lane `i` holds how many words in this chunk
// had a non-continuation byte at position `i`.
let mut counts = 0;
let (unrolled_chunks, remainder) = chunk.as_chunks::<UNROLL_INNER>();
for unrolled in unrolled_chunks {
for &word in unrolled {
// Cannot carry between lanes: at most `CHUNK_SIZE` (< 256) additions of 0 or 1
// per lane.
counts += contains_non_continuation_byte(word);
}
}
total += sum_bytes_in_usize(counts);
if !remainder.is_empty() {
// Leftover words that did not fill a whole `UNROLL_INNER` group; this can only
// happen in the last chunk.
let mut counts = 0;
for &word in remainder {
counts += contains_non_continuation_byte(word);
}
total += sum_bytes_in_usize(counts);
break;
}
}
total
}
/// Flags, per byte lane of `w`, whether that byte is *not* a continuation
/// byte (`0b10xx_xxxx`).
///
/// The result has `0x01` in every lane whose input byte is a non-continuation
/// byte and `0x00` in every other lane.
#[inline]
fn contains_non_continuation_byte(w: usize) -> usize {
    const LOW_BIT_PER_BYTE: usize = usize_repeat_u8(0x01);

    // Bit 0 of each lane is set when bit 7 of the input byte is clear.
    let top_bit_clear = !w >> 7;
    // Bit 0 of each lane is set when bit 6 of the input byte is set.
    let second_bit_set = w >> 6;

    // A byte is a continuation byte only when bit 7 is set and bit 6 is clear,
    // so either condition marks a non-continuation byte. The mask drops the
    // stray bits that the shifts pulled in from neighbouring positions.
    (top_bit_clear | second_bit_set) & LOW_BIT_PER_BYTE
}
/// Adds up the individual bytes of `values` and returns their sum.
///
/// Bytes are first added pairwise into 16-bit lanes; a multiplication by a
/// word with `1` in every 16-bit lane then accumulates all lanes into the top
/// 16 bits, which are shifted down to produce the result.
#[inline]
fn sum_bytes_in_usize(values: usize) -> usize {
    const ONE_PER_U16: usize = usize_repeat_u16(0x0001);
    const LOW_BYTE_PER_U16: usize = usize_repeat_u16(0x00ff);
    // Shift that moves the most significant 16-bit lane down to bit 0.
    const TOP_LANE_SHIFT: usize = (USIZE_SIZE - 2) * 8;

    let even_bytes = values & LOW_BYTE_PER_U16;
    let odd_bytes = (values >> 8) & LOW_BYTE_PER_U16;
    let pair_sum: usize = even_bytes + odd_bytes;

    pair_sum.wrapping_mul(ONE_PER_U16) >> TOP_LANE_SHIFT
}
/// Counts, one byte at a time, the bytes of `s` for which
/// `utf8_is_cont_byte` is false.
///
/// Used for short inputs and for the unaligned edges of longer ones.
fn char_count_general_case(s: &[u8]) -> usize {
    let mut non_continuation = 0;
    for &byte in s {
        if !super::core_str::utf8_is_cont_byte(byte) {
            non_continuation += 1;
        }
    }
    non_continuation
}
/// Builds a `usize` in which every byte equals `x`.
const fn usize_repeat_u8(x: u8) -> usize {
    // `usize::MAX / 0xFF` is `0x01` in every byte; scaling it by `x` (at most
    // `0xFF`) fills each byte lane with `x` and cannot overflow.
    const ONE_PER_BYTE: usize = usize::MAX / 0xFF;
    ONE_PER_BYTE * (x as usize)
}
/// Builds a `usize` in which every 16-bit lane equals `x`.
const fn usize_repeat_u16(x: u16) -> usize {
    // `usize::MAX / 0xFFFF` is `0x0001` in every 16-bit lane; scaling it by `x`
    // (at most `0xFFFF`) fills each lane with `x` and cannot overflow.
    const ONE_PER_U16: usize = usize::MAX / 0xFFFF;
    ONE_PER_U16 * (x as usize)
}
/// Branch-hint placeholder: returns `x` unchanged.
///
/// NOTE(review): presumably stands in for an optimizer hint marking `x` as
/// rarely true; as written it only labels the call site and has no other
/// effect.
const fn unlikely(x: bool) -> bool {
x
}