#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::cmp::min;
use crate::num::{SignedVarIntTarget, VarIntTarget};
use crate::VarIntDecodeError;
mod lookup;
/// Decodes a single varint from the beginning of `bytes`.
///
/// Returns the decoded value and the number of bytes it occupied, or an error
/// if the slice is empty, the encoding would overflow `T`, or the varint runs
/// past the end of the slice.
#[inline]
pub fn decode<T: VarIntTarget>(bytes: &[u8]) -> Result<(T, usize), VarIntDecodeError> {
    let result = if bytes.len() >= 16 {
        // Fast path: the slice is long enough for the raw decoder's
        // unchecked 16-byte read.
        unsafe { decode_unsafe(bytes.as_ptr()) }
    } else if !bytes.is_empty() {
        // Short input: stage it in a zero-filled 16-byte buffer so the raw
        // decoder never reads outside the caller's slice. A zero pad byte has
        // its continuation bit clear, so it acts as a terminator.
        let mut data = [0u8; 16];
        let len = min(16, bytes.len());
        data[..len].copy_from_slice(&bytes[..len]);
        unsafe { decode_unsafe(data.as_ptr()) }
    } else {
        return Err(VarIntDecodeError::NotEnoughBytes);
    };

    // Overflow check. Operator precedence groups this as:
    //   (have-max-len bytes AND last byte too large AND used exactly max len)
    //   OR (used more than max len).
    // The get_unchecked is in bounds because the first condition has already
    // established bytes.len() >= T::MAX_VARINT_BYTES.
    if bytes.len() >= T::MAX_VARINT_BYTES as usize
        && unsafe { *bytes.get_unchecked((T::MAX_VARINT_BYTES - 1) as usize) } > T::MAX_LAST_VARINT_BYTE
        && result.1 == T::MAX_VARINT_BYTES as usize
        || result.1 > T::MAX_VARINT_BYTES as usize
    {
        Err(VarIntDecodeError::Overflow)
    } else if result.1 > bytes.len() {
        // The decoder consumed zero padding past the end of the real input.
        Err(VarIntDecodeError::NotEnoughBytes)
    } else {
        Ok(result)
    }
}
/// Reports the encoded length, in bytes, of the varint at the start of
/// `bytes`, without decoding its value.
///
/// Returns `NotEnoughBytes` only for an empty slice; length detection itself
/// does not validate that the varint fits within the slice.
#[inline]
pub fn decode_len<T: VarIntTarget>(bytes: &[u8]) -> Result<usize, VarIntDecodeError> {
    // An empty slice cannot hold even one varint byte.
    if bytes.is_empty() {
        return Err(VarIntDecodeError::NotEnoughBytes);
    }

    let len = if bytes.len() >= 16 {
        // The slice covers the raw reader's full 16-byte window; read in place.
        unsafe { decode_len_unsafe::<T>(bytes.as_ptr()) }
    } else {
        // Stage short input in a zero-filled 16-byte buffer so the raw reader
        // never touches memory outside the caller's slice.
        let mut padded = [0u8; 16];
        padded[..bytes.len()].copy_from_slice(bytes);
        unsafe { decode_len_unsafe::<T>(padded.as_ptr()) }
    };

    Ok(len)
}
/// Decodes a zigzag-encoded signed varint from the beginning of `bytes`.
///
/// Returns the signed value and the number of bytes consumed, propagating any
/// error from the underlying unsigned decode.
#[inline]
pub fn decode_zigzag<T: SignedVarIntTarget>(bytes: &[u8]) -> Result<(T, usize), VarIntDecodeError> {
    // Decode as the unsigned twin type, then undo the zigzag transform.
    let (raw, len) = decode::<T::Unsigned>(bytes)?;
    Ok((raw.unzigzag(), len))
}
/// Computes the encoded length of the varint starting at `bytes`, without
/// decoding its value.
///
/// # Safety
/// `bytes` must be valid for an unaligned read of at least 16 bytes
/// (8 bytes suffice when `T::MAX_VARINT_BYTES <= 5`).
#[inline]
pub unsafe fn decode_len_unsafe<T: VarIntTarget>(bytes: *const u8) -> usize {
    if T::MAX_VARINT_BYTES <= 5 {
        // Small targets: the whole varint fits in a single u64 load.
        let b = bytes.cast::<u64>().read_unaligned();
        // Sets bit 7 of every byte whose continuation (MSB) bit is clear,
        // i.e. every possible terminator position.
        let msbs = !b & !0x7f7f7f7f7f7f7f7f;
        // Bit index of the first terminator, converted to a byte count.
        let len = msbs.trailing_zeros() + 1;
        (len / 8) as usize
    } else {
        // Large targets: scan two u64 halves. `.add(1)` on a u64 pointer
        // advances by 8 bytes.
        let b0 = bytes.cast::<u64>().read_unaligned();
        let b1 = bytes.cast::<u64>().add(1).read_unaligned();
        let msbs0 = !b0 & !0x7f7f7f7f7f7f7f7f;
        let msbs1 = !b1 & !0x7f7f7f7f7f7f7f7f;
        let len0 = msbs0.trailing_zeros() + 1;
        let len1 = msbs1.trailing_zeros() + 1;
        // No terminator in the first half means the varint continues into the
        // second half.
        let len = if msbs0 == 0 { len1 + 64 } else { len0 };
        len as usize / 8
    }
}
/// Decodes a single varint starting at `bytes`, returning the value and its
/// encoded length in bytes.
///
/// # Safety
/// `bytes` must be valid for an unaligned read of at least 16 bytes
/// (8 bytes suffice when `T::MAX_VARINT_BYTES <= 5`).
#[inline]
pub unsafe fn decode_unsafe<T: VarIntTarget>(bytes: *const u8) -> (T, usize) {
    if T::MAX_VARINT_BYTES <= 5 {
        // Small targets: the whole varint fits in a single u64 load.
        let b = bytes.cast::<u64>().read_unaligned();
        // Sets bit 7 of every byte whose continuation (MSB) bit is clear,
        // i.e. every possible terminator position.
        let msbs = !b & !0x7f7f7f7f7f7f7f7f;
        let len = msbs.trailing_zeros() + 1;
        // Keep only the bytes up to and including the first terminator:
        // `msbs ^ (msbs - 1)` masks all bits up to the lowest set bit.
        let varint_part = b & (msbs ^ msbs.wrapping_sub(1));
        let num = T::scalar_to_num(varint_part);
        (num, (len / 8) as usize)
    } else {
        // Large targets (up to 10 encoded bytes): process two u64 halves.
        // `.add(1)` on a u64 pointer advances by 8 bytes.
        let b0 = bytes.cast::<u64>().read_unaligned();
        let b1 = bytes.cast::<u64>().add(1).read_unaligned();
        let msbs0 = !b0 & !0x7f7f7f7f7f7f7f7f;
        let msbs1 = !b1 & !0x7f7f7f7f7f7f7f7f;
        let len0 = msbs0.trailing_zeros() + 1;
        let len1 = msbs1.trailing_zeros() + 1;
        let varint_part0 = b0 & (msbs0 ^ msbs0.wrapping_sub(1));
        // The second half contributes only if the first half had no
        // terminator; the multiply zeroes it out otherwise (branch-free).
        let varint_part1 = (b1 & (msbs1 ^ msbs1.wrapping_sub(1))) * ((msbs0 == 0) as u64);
        let num = T::vector_to_num(core::mem::transmute::<[u64; 2], [u8; 16]>([
            varint_part0,
            varint_part1,
        ]));
        let len = if msbs0 == 0 { len1 + 64 } else { len0 } / 8;
        (num, len as usize)
    }
}
/// Decodes two adjacent varints, with target types `T` and `U`, from a single
/// 16-byte load. Returns both values and their encoded lengths.
///
/// # Panics
/// Panics if `T` and `U` could jointly require more than 16 encoded bytes.
///
/// # Safety
/// `bytes` must be valid for an unaligned 16-byte read.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_two_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    if T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES > 16 {
        panic!(
            "exceeded length limit: cannot decode {} and {}, total length {} exceeds 16 bytes",
            core::any::type_name::<T>(),
            core::any::type_name::<U>(),
            T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES
        );
    }

    if T::MAX_VARINT_BYTES <= 5 && U::MAX_VARINT_BYTES <= 5 {
        // Both targets are u32-sized or smaller: take the lookup-table path.
        return decode_two_u32_unsafe(bytes);
    }

    let b = _mm_loadu_si128(bytes as *const __m128i);

    // One bit per byte, set where the byte's continuation (MSB) bit is set.
    // Each varint's length is the distance to its first terminator byte.
    let bitmask = _mm_movemask_epi8(b) as u32;
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1;
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;

    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    // Isolate the first varint by masking off everything past its end.
    let first_len_vec = _mm_set1_epi8(first_len as i8);
    let first_mask = _mm_cmplt_epi8(ascend, first_len_vec);
    let first = _mm_and_si128(b, first_mask);

    // Shuffle the second varint's bytes down to offset 0, then mask to length.
    let second_shuf = _mm_add_epi8(ascend, first_len_vec);
    let second_shuffled = _mm_shuffle_epi8(b, second_shuf);
    let second_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(second_len as i8));
    let second = _mm_and_si128(second_shuffled, second_mask);

    let first_num;
    let second_num;

    // "Turbo": decode both lanes with SIMD bit manipulation, unless a very
    // fast PDEP instruction makes the scalar path preferable.
    let should_turbo = T::MAX_VARINT_BYTES <= 8
        && U::MAX_VARINT_BYTES <= 8
        && cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        // Pack the two varints into the low and high 64-bit lanes.
        let comb = _mm_or_si128(first, _mm_bslli_si128(second, 8));
        let x = if T::MAX_VARINT_BYTES <= 2 && U::MAX_VARINT_BYTES <= 2 {
            dual_u8_stage2(comb)
        } else if T::MAX_VARINT_BYTES <= 3 && U::MAX_VARINT_BYTES <= 3 {
            dual_u16_stage2(comb)
        } else {
            dual_u32_stage2(comb)
        };

        // The decoded values sit in the low 32 bits of each 64-bit lane.
        let x: [u32; 4] = core::mem::transmute(x);
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[2]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
    }

    (first_num, second_num, first_len as u8, second_len as u8)
}
/// Decodes two adjacent varints whose targets are at most u32-sized (up to 5
/// encoded bytes each) using precomputed lookup tables keyed on the
/// continuation bits.
///
/// # Safety
/// `bytes` must be valid for an unaligned 16-byte read.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
unsafe fn decode_two_u32_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    let b = _mm_loadu_si128(bytes as *const __m128i);

    // Continuation bits of the first 10 bytes — the most two 5-byte varints
    // can span.
    let bitmask = _mm_movemask_epi8(b) as u32 & 0b1111111111;

    // First table yields both varint lengths plus an index into the shuffle
    // table.
    let (lookup, first_len, second_len) =
        *lookup::LOOKUP_DOUBLE_STEP1.get_unchecked(bitmask as usize);
    let shuf = *lookup::LOOKUP_DOUBLE_VEC.get_unchecked(lookup as usize);
    // One shuffle aligns the first varint at byte 0 and the second at byte 8.
    let comb = _mm_shuffle_epi8(b, shuf);

    let first_num;
    let second_num;

    // SIMD bit-manipulation decode, unless a very fast PDEP is available.
    let should_turbo = cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        let x = if T::MAX_VARINT_BYTES <= 2 && U::MAX_VARINT_BYTES <= 2 {
            dual_u8_stage2(comb)
        } else if T::MAX_VARINT_BYTES <= 3 && U::MAX_VARINT_BYTES <= 3 {
            dual_u16_stage2(comb)
        } else {
            dual_u32_stage2(comb)
        };

        // The decoded values sit in the low 32 bits of each 64-bit lane.
        let x: [u32; 4] = core::mem::transmute(x);
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[2]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(comb));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 8,
        )));
    }

    (first_num, second_num, first_len, second_len)
}
/// Second decode stage for two u8-sized varints processed in parallel.
///
/// `comb` holds the raw bytes of the first varint in the low 64-bit lane and
/// the second in the high lane. Each lane is collapsed to its decoded value:
/// the low 7 bits come straight from byte 0, and the single payload bit of
/// byte 1 is shifted down into bit 7.
#[inline(always)]
unsafe fn dual_u8_stage2(comb: __m128i) -> __m128i {
    let low_seven = _mm_and_si128(comb, _mm_set1_epi64x(0x7f));
    let top_bit = _mm_srli_epi64(_mm_and_si128(comb, _mm_set1_epi64x(0x100)), 1);
    _mm_or_si128(low_seven, top_bit)
}
/// Second decode stage for two u16-sized varints processed in parallel.
///
/// `comb` holds the raw bytes of one varint per 64-bit lane. Each lane is
/// collapsed by stripping the continuation bits: 7 payload bits from byte 0,
/// 7 from byte 1, and at most 2 from byte 2 (7 + 7 + 2 = 16).
#[inline(always)]
unsafe fn dual_u16_stage2(comb: __m128i) -> __m128i {
    // Byte 0 contributes its low 7 bits unshifted.
    let part0 = _mm_and_si128(comb, _mm_set1_epi64x(0x7f));
    // Byte 1's payload shifts down past byte 0's continuation bit.
    let part1 = _mm_srli_epi64(_mm_and_si128(comb, _mm_set1_epi64x(0x7f00)), 1);
    // Byte 2's two payload bits shift down past two continuation bits.
    let part2 = _mm_srli_epi64(_mm_and_si128(comb, _mm_set1_epi64x(0x0003_0000)), 2);
    _mm_or_si128(_mm_or_si128(part0, part2), part1)
}
/// Second decode stage for two u32-sized varints processed in parallel.
///
/// `comb` holds the raw bytes of one varint per 64-bit lane. Each lane is
/// collapsed by stripping the continuation bits: byte k contributes its
/// payload bits shifted down by k, with byte 4 supplying at most 4 bits
/// (7 * 4 + 4 = 32).
#[inline(always)]
unsafe fn dual_u32_stage2(comb: __m128i) -> __m128i {
    let p0 = _mm_and_si128(comb, _mm_set1_epi64x(0x7f));
    let p1 = _mm_srli_epi64(_mm_and_si128(comb, _mm_set1_epi64x(0x7f00)), 1);
    let p2 = _mm_srli_epi64(_mm_and_si128(comb, _mm_set1_epi64x(0x007f_0000)), 2);
    let p3 = _mm_srli_epi64(_mm_and_si128(comb, _mm_set1_epi64x(0x7f00_0000)), 3);
    let p4 = _mm_srli_epi64(_mm_and_si128(comb, _mm_set1_epi64x(0x000f_0000_0000)), 4);
    _mm_or_si128(_mm_or_si128(p0, p4), _mm_or_si128(_mm_or_si128(p3, p2), p1))
}
/// Decodes two adjacent varints of up to 10 encoded bytes each (u64-sized
/// targets) from a single 32-byte AVX2 load, returning both values and their
/// encoded lengths.
///
/// # Safety
/// `bytes` must be valid for an unaligned 32-byte read.
#[inline]
#[cfg(any(target_feature = "avx2", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "avx2")))]
pub unsafe fn decode_two_wide_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    let b = _mm256_loadu_si256(bytes as *const __m256i);

    // One continuation bit per byte across all 32 bytes; each length is the
    // distance to the first terminator byte.
    let bitmask = _mm256_movemask_epi8(b) as u32;
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1;
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;

    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    // The first varint (at most 10 bytes) lives entirely in the low 128-bit
    // half; mask that half down to first_len bytes.
    let first_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(first_len as i8));
    let first = _mm_and_si128(_mm256_extracti128_si256(b, 0), first_mask);

    // vpshufb only shuffles within each 128-bit lane, so the cross-lane
    // rotation that brings the second varint to offset 0 is built from two
    // per-lane shuffles offset by first_len and first_len - 16...
    let shuf_gen = _mm256_setr_epi8(
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
        12, 13, 14, 15,
    );
    let shuf_add = _mm256_set_m128i(
        _mm_set1_epi8(-(16i8 - first_len as i8)),
        _mm_set1_epi8(first_len as i8),
    );
    let shuf_added = _mm256_add_epi8(shuf_gen, shuf_add);
    // ...whose out-of-range indices (> 15) are forced negative here, which
    // vpshufb turns into zero bytes...
    let shuf = _mm256_or_si256(
        shuf_added,
        _mm256_cmpgt_epi8(shuf_added, _mm256_set1_epi8(15)),
    );
    let shuffled = _mm256_shuffle_epi8(b, shuf);
    // ...and the two shuffled halves are OR-combined into the rotated result.
    let second_shifted = _mm_or_si128(
        _mm256_extracti128_si256(shuffled, 0),
        _mm256_extracti128_si256(shuffled, 1),
    );
    let second_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(second_len as i8));
    let second = _mm_and_si128(second_shifted, second_mask);

    let first_num;
    let second_num;

    // This path always uses the SIMD bit-manipulation decode.
    let should_turbo = true;
    if should_turbo {
        // Interleave so each 64-bit lane of comb_lo holds bytes 0..8 of one
        // varint, and each lane of comb_hi holds its bytes 8..16.
        let comb_lo = _mm_unpacklo_epi64(first, second);
        // Bytes 0..8: byte k contributes 7 payload bits shifted down by k.
        let x_lo = _mm_or_si128(
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb_lo, _mm_set1_epi64x(0x000000000000007f)),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x7f00000000000000)),
                        7,
                    ),
                ),
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x007f000000000000)),
                        6,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x00007f0000000000)),
                        5,
                    ),
                ),
            ),
            _mm_or_si128(
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x0000007f00000000)),
                        4,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x000000007f000000)),
                        3,
                    ),
                ),
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x00000000007f0000)),
                        2,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x0000000000007f00)),
                        1,
                    ),
                ),
            ),
        );

        let comb_hi = _mm_unpackhi_epi64(first, second);
        // Bytes 8 and 9 supply the top 8 bits (bits 56..64) of a 10-byte
        // varint: byte 8's 7 payload bits shift up to bit 56, and byte 9's
        // single payload bit lands in bit 63.
        let x_hi = _mm_or_si128(
            _mm_slli_epi64(
                _mm_and_si128(comb_hi, _mm_set1_epi64x(0x0000000000000100)),
                55,
            ),
            _mm_slli_epi64(
                _mm_and_si128(comb_hi, _mm_set1_epi64x(0x000000000000007f)),
                56,
            ),
        );
        let x = _mm_or_si128(x_lo, x_hi);

        first_num = T::cast_u64(_mm_extract_epi64(x, 0) as u64);
        second_num = U::cast_u64(_mm_extract_epi64(x, 1) as u64);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
    }

    (first_num, second_num, first_len as u8, second_len as u8)
}
/// Decodes four adjacent varints from a single 16-byte load, returning the
/// four values, their encoded lengths, and an "invalid input" flag.
///
/// # Panics
/// Panics if the four target types could jointly require more than 16 encoded
/// bytes.
///
/// # Safety
/// `bytes` must be valid for an unaligned 16-byte read.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_four_unsafe<
    T: VarIntTarget,
    U: VarIntTarget,
    V: VarIntTarget,
    W: VarIntTarget,
>(
    bytes: *const u8,
) -> (T, U, V, W, u8, u8, u8, u8, bool) {
    if T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES + V::MAX_VARINT_BYTES + W::MAX_VARINT_BYTES > 16 {
        panic!(
            "exceeded length limit: cannot decode {}, {}, {}, and {}, total length {} exceeds 16 bytes",
            core::any::type_name::<T>(),
            core::any::type_name::<U>(),
            core::any::type_name::<V>(),
            core::any::type_name::<W>(),
            T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES + V::MAX_VARINT_BYTES + W::MAX_VARINT_BYTES
        );
    }

    if T::MAX_VARINT_BYTES <= 3
        && U::MAX_VARINT_BYTES <= 3
        && V::MAX_VARINT_BYTES <= 3
        && W::MAX_VARINT_BYTES <= 3
    {
        // All four targets are u16-sized or smaller: take the lookup-table
        // path.
        return decode_four_u16_unsafe(bytes);
    }

    let b = _mm_loadu_si128(bytes as *const __m128i);

    // One continuation bit per byte; each length is the distance to the next
    // terminator byte.
    let bitmask = _mm_movemask_epi8(b) as u32;
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1;
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;
    let bm_not_3 = bm_not_2 >> second_len;
    let third_len = bm_not_3.trailing_zeros() + 1;
    let bm_not_4 = bm_not_3 >> third_len;
    let fourth_len = bm_not_4.trailing_zeros() + 1;

    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    // Peel off each varint in turn: mask the front of the vector to the
    // current varint's length, then shuffle the remainder down to offset 0.
    let first_len_vec = _mm_set1_epi8(first_len as i8);
    let first_mask = _mm_cmplt_epi8(ascend, first_len_vec);
    let first = _mm_and_si128(b, first_mask);

    let second_shuf = _mm_add_epi8(ascend, first_len_vec);
    let second_shuffled = _mm_shuffle_epi8(b, second_shuf);
    let second_len_vec = _mm_set1_epi8(second_len as i8);
    let second_mask = _mm_cmplt_epi8(ascend, second_len_vec);
    let second = _mm_and_si128(second_shuffled, second_mask);

    let third_shuf = _mm_add_epi8(ascend, second_len_vec);
    let third_shuffled = _mm_shuffle_epi8(second_shuffled, third_shuf);
    let third_len_vec = _mm_set1_epi8(third_len as i8);
    let third_mask = _mm_cmplt_epi8(ascend, third_len_vec);
    let third = _mm_and_si128(third_shuffled, third_mask);

    let fourth_shuf = _mm_add_epi8(ascend, third_len_vec);
    let fourth_shuffled = _mm_shuffle_epi8(third_shuffled, fourth_shuf);
    let fourth_len_vec = _mm_set1_epi8(fourth_len as i8);
    let fourth_mask = _mm_cmplt_epi8(ascend, fourth_len_vec);
    let fourth = _mm_and_si128(fourth_shuffled, fourth_mask);

    let first_num;
    let second_num;
    let third_num;
    let fourth_num;

    // SIMD bit-manipulation decode of all four lanes at once, unless a very
    // fast PDEP instruction makes the scalar path preferable.
    let should_turbo = T::MAX_VARINT_BYTES <= 4
        && U::MAX_VARINT_BYTES <= 4
        && V::MAX_VARINT_BYTES <= 4
        && W::MAX_VARINT_BYTES <= 4
        && cfg!(not(all(
            target_feature = "bmi2",
            very_fast_pdep
        )));
    if should_turbo {
        // Pack one varint into each 32-bit lane.
        let comb = _mm_or_si128(
            _mm_or_si128(first, _mm_bslli_si128(second, 4)),
            _mm_or_si128(_mm_bslli_si128(third, 8), _mm_bslli_si128(fourth, 12)),
        );

        let x = if T::MAX_VARINT_BYTES <= 2
            && U::MAX_VARINT_BYTES <= 2
            && V::MAX_VARINT_BYTES <= 2
            && W::MAX_VARINT_BYTES <= 2
        {
            // u8 targets: 7 payload bits from byte 0 plus one bit from byte 1.
            _mm_or_si128(
                _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00000100)), 1),
            )
        } else {
            // u16 targets: 7 + 7 + 2 payload bits from bytes 0..3.
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                    _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00030000)), 2),
                ),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00007f00)), 1),
            )
        };

        let x: [u32; 4] = core::mem::transmute(x);
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[1]);
        third_num = V::cast_u32(x[2]);
        fourth_num = W::cast_u32(x[3]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
        third_num = V::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(third));
        fourth_num = W::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(fourth));
    }

    (
        first_num,
        second_num,
        third_num,
        fourth_num,
        first_len as u8,
        second_len as u8,
        third_len as u8,
        fourth_len as u8,
        // This path performs no table-based validity check, so the invalid
        // flag is always false here.
        false,
    )
}
/// Decodes four adjacent varints of u16-sized-or-smaller targets (at most 3
/// encoded bytes each) using precomputed lookup tables keyed on the
/// continuation bits.
///
/// The trailing bool reports whether the lookup table flagged the byte
/// pattern as invalid.
///
/// # Safety
/// `bytes` must be valid for an unaligned 16-byte read.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
unsafe fn decode_four_u16_unsafe<
    T: VarIntTarget,
    U: VarIntTarget,
    V: VarIntTarget,
    W: VarIntTarget,
>(
    bytes: *const u8,
) -> (T, U, V, W, u8, u8, u8, u8, bool) {
    let b = _mm_loadu_si128(bytes as *const __m128i);
    let bitmask = _mm_movemask_epi8(b) as u32;
    // Keyed on the continuation bits of the first 12 bytes — the most four
    // 3-byte varints can span.
    let lookup = *lookup::LOOKUP_QUAD_STEP1.get_unchecked((bitmask & 0b111111111111) as usize);
    // Packed lookup value: low 8 bits index the shuffle table, the four
    // lengths occupy 4 bits each starting at bit 8, and bit 31 flags invalid
    // input.
    let shuf = *lookup::LOOKUP_QUAD_VEC.get_unchecked((lookup & 0b11111111) as usize);
    let first_len = (lookup >> 8) & 0b1111;
    let second_len = (lookup >> 12) & 0b1111;
    let third_len = (lookup >> 16) & 0b1111;
    let fourth_len = (lookup >> 20) & 0b1111;
    // One shuffle aligns each varint to its own 32-bit lane.
    let comb = _mm_shuffle_epi8(b, shuf);
    let invalid = lookup >> 31;

    let first_num;
    let second_num;
    let third_num;
    let fourth_num;

    // SIMD bit-manipulation decode, unless a very fast PDEP is available.
    let should_turbo = cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        let x = if T::MAX_VARINT_BYTES <= 2
            && U::MAX_VARINT_BYTES <= 2
            && V::MAX_VARINT_BYTES <= 2
            && W::MAX_VARINT_BYTES <= 2
        {
            // u8 targets: 7 payload bits from byte 0 plus one bit from byte 1.
            _mm_or_si128(
                _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00000100)), 1),
            )
        } else {
            // u16 targets: 7 + 7 + 2 payload bits from bytes 0..3.
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                    _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00030000)), 2),
                ),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00007f00)), 1),
            )
        };

        let x: [u32; 4] = core::mem::transmute(x);
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[1]);
        third_num = V::cast_u32(x[2]);
        fourth_num = W::cast_u32(x[3]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(comb));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 4,
        )));
        third_num = V::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 8,
        )));
        fourth_num = W::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 12,
        )));
    }

    (
        first_num,
        second_num,
        third_num,
        fourth_num,
        first_len as u8,
        second_len as u8,
        third_len as u8,
        fourth_len as u8,
        invalid != 0,
    )
}
/// Decodes eight consecutive u8-target varints (1 or 2 encoded bytes each)
/// from a 16-byte load, returning the eight values and the total number of
/// bytes consumed.
///
/// # Safety
/// `bytes` must be valid for an unaligned 16-byte read.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_eight_u8_unsafe(bytes: *const u8) -> ([u8; 8], u8) {
    let b = _mm_loadu_si128(bytes as *const __m128i);
    let ones = _mm_set1_epi8(1);
    // lens[i] will hold the continuation flag of varint i's first byte:
    // 1 if that varint is 2 bytes long, 0 if it is 1 byte.
    let mut lens = _mm_setzero_si128();
    // Per-byte continuation flags: 1 where the byte's MSB is set (signed < 0).
    let mut shift = _mm_and_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), ones);
    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let asc_one = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    // One-byte window selecting the output slot written by the current round.
    let mut window_small = _mm_setr_epi8(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    let broadcast_mask = _mm_setzero_si128();

    // Eight unrolled rounds. Each round broadcasts the flag of the current
    // varint's first byte, records it in `lens` at the current slot, and
    // advances `shift` by (flag + 1) bytes — that varint's encoded length.
    // Round 1
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);
    // Round 2
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);
    // Round 3
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);
    // Round 4
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);
    // Round 5
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);
    // Round 6
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);
    // Round 7
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);
    // Round 8 — the final flag only needs recording, not advancing.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));

    // Prefix-sum of (1 - flag): byte i counts the 1-byte varints among the
    // first i + 1 varints.
    let lens_invert = _mm_sub_epi8(ones, lens);
    let mut cumul_lens = _mm_add_epi8(lens_invert, _mm_bslli_si128(lens_invert, 1));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 2));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 4));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 8));
    let cumul_lens_2: [u8; 16] = core::mem::transmute(cumul_lens);
    // Total bytes consumed = 16 - (number of 1-byte varints), since each of
    // the eight varints takes 2 bytes minus one per 1-byte varint.
    let last_len = 8 - cumul_lens_2[7] + 8;

    // Spread each varint's length (flag + 1) into the odd byte of its 16-bit
    // output slot; -1 shuffle indices yield zeros in the even bytes.
    let second = _mm_shuffle_epi8(
        _mm_add_epi8(lens, ones),
        _mm_setr_epi8(-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7),
    );
    // Where a varint is only 1 byte (length == 1), force its second source
    // index negative so the gather below zeroes that byte.
    let shuf_pt1 = _mm_or_si128(ascend, _mm_cmpeq_epi8(second, ones));
    // Subtract the running count of preceding 1-byte varints from each slot's
    // indices to get the true source offsets within `b`.
    let x_shuf = _mm_shuffle_epi8(
        _mm_bslli_si128(cumul_lens, 1),
        _mm_setr_epi8(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7),
    );
    let shuf = _mm_sub_epi8(shuf_pt1, x_shuf);
    // Gather: each 16-bit lane of `comb` now holds one varint's raw bytes.
    let comb = _mm_shuffle_epi8(b, shuf);

    // Per 16-bit lane: low 7 bits from byte 0, plus byte 1's single payload
    // bit shifted down past the continuation bit.
    let x = _mm_or_si128(
        _mm_and_si128(comb, _mm_set1_epi16(0x0000007f)),
        _mm_srli_epi16(_mm_and_si128(comb, _mm_set1_epi16(0x00000100)), 1),
    );
    // Pack the eight decoded values (the even bytes) into the low 8 bytes.
    let shuf = _mm_shuffle_epi8(
        x,
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1),
    );
    let lower: [u64; 2] = core::mem::transmute(shuf);
    let nums = lower[0].to_ne_bytes();

    (nums, last_len)
}