#![allow(unsafe_code)]
#![allow(clippy::cast_ptr_alignment)]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::__m128i;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_mm_cmpgt_epi8;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_mm_loadu_si128;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_mm_movemask_epi8;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_mm_set1_epi8;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_mm_xor_si128;
/// Result of scanning a byte slice for a leading run of single-byte varints.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VarintScan {
    /// Number of leading bytes that each encode a complete (single-byte) varint.
    pub single_byte_count: usize,
}

/// Largest byte value that counts as a single-byte varint; any byte above
/// this terminates the scan. (Value taken from the scalar comparison — a
/// 251..=255 lead byte presumably starts a multi-byte encoding; confirm
/// against the matching encoder.)
const SINGLE_BYTE_MAX: u8 = 250;

/// Counts how many leading bytes of `bytes` are single-byte varints,
/// i.e. bytes `<= 250`; the count stops at the first byte `> 250`.
///
/// Scans 16-byte chunks with SSE2 on x86_64 and NEON on aarch64, then a
/// scalar loop handles the tail and pinpoints the exact stop position
/// inside a chunk flagged by the SIMD pass. On other architectures the
/// scalar loop does all the work.
#[inline(always)]
pub fn scan_single_byte_varints(bytes: &[u8]) -> VarintScan {
    let mut count = 0;
    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: SSE2 is part of the x86_64 baseline, so these intrinsics
        // are always available. `_mm_loadu_si128` permits unaligned
        // pointers, and the loop condition guarantees 16 readable bytes at
        // `bytes.as_ptr().add(count)`.
        unsafe {
            // SSE2 has no unsigned byte compare. XOR-ing with 0x80 maps the
            // unsigned range 0..=255 order-preservingly onto the signed
            // range -128..=127, so `b > 250` becomes
            // `(b ^ 0x80) as i8 > 122` (122 == 250 ^ 0x80).
            let bias = _mm_set1_epi8(-128);
            let limit = _mm_set1_epi8((SINGLE_BYTE_MAX ^ 0x80) as i8);
            while count + 16 <= bytes.len() {
                let chunk = _mm_loadu_si128(bytes.as_ptr().add(count).cast::<__m128i>());
                let over = _mm_cmpgt_epi8(_mm_xor_si128(chunk, bias), limit);
                if _mm_movemask_epi8(over) != 0 {
                    // A byte > SINGLE_BYTE_MAX lies in this chunk; let the
                    // scalar tail loop locate its exact offset.
                    break;
                }
                count += 16;
            }
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: NEON is mandatory on aarch64, and the loop condition
        // guarantees 16 readable bytes at `bytes.as_ptr().add(count)`.
        unsafe {
            let limit = vdupq_n_u8(SINGLE_BYTE_MAX);
            while count + 16 <= bytes.len() {
                let chunk = vld1q_u8(bytes.as_ptr().add(count));
                // Any nonzero lane means a byte > SINGLE_BYTE_MAX is in
                // this chunk; fall through to the scalar loop to find it.
                if vmaxvq_u8(vcgtq_u8(chunk, limit)) != 0 {
                    break;
                }
                count += 16;
            }
        }
    }
    // Scalar pass: consumes the final <16-byte tail and, after a SIMD
    // `break`, walks the flagged chunk to the first byte > SINGLE_BYTE_MAX.
    while count < bytes.len() && bytes[count] <= SINGLE_BYTE_MAX {
        count += 1;
    }
    VarintScan {
        single_byte_count: count,
    }
}