#![allow(clippy::match_same_arms, reason = "XXX")]
#![allow(clippy::similar_names, reason = "XXX")]
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::mem::MaybeUninit;
use core::{hint, ptr, slice};
use crate::backend::generic::{decode_generic_unchecked, encode_generic_unchecked};
use crate::error::InvalidInput;
use crate::util::{lut16, lut32, lut64};
/// Encodes `src` into `dst` (two output bytes per input byte), choosing an SSSE3 strategy
/// from the input length.
///
/// * fewer than 16 bytes: generic encoder
/// * exactly 16 bytes: one 128-bit pass
/// * 17 to 32 bytes: two overlapping 128-bit passes
/// * more than 32 bytes: batched 128-bit loop followed by a generic tail
///
/// `UPPER` is forwarded unchanged to every callee (presumably it selects upper-case digits;
/// the lookup tables are defined outside this file view).
///
/// # Safety
///
/// The CPU must support SSSE3, and `dst` must provide at least `src.len()` elements; the
/// callees verify the latter only through `debug_assert!`.
#[target_feature(enable = "ssse3")]
pub(crate) unsafe fn encode_ssse3_unchecked<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    let len = src.len();
    if len < 16 {
        encode_generic_unchecked::<UPPER>(src, dst);
    } else if len == 16 {
        encode_ssse3_unchecked_v128_exact::<UPPER>(src, dst);
    } else if len <= 32 {
        encode_ssse3_unchecked_v128_overlapped::<UPPER>(src, dst);
    } else {
        encode_ssse3_unchecked_v128_with_trailing::<UPPER>(src, dst);
    }
}
/// Encodes exactly one 128-bit vector (16 input bytes) into 32 output bytes.
///
/// Each input byte is split into its high and low nibble, the nibbles are interleaved
/// (high first) and every nibble is replaced by the byte found at that index in the
/// 16-byte table returned by `lut16::<UPPER>()`.
///
/// # Safety
///
/// The CPU must support SSSE3, `src` must be 16 bytes long and `dst` must provide at least
/// 16 elements (both are only checked by `debug_assert!`).
#[target_feature(enable = "ssse3")]
unsafe fn encode_ssse3_unchecked_v128_exact<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    const BATCH_ELEMS_V128_X1: usize = size_of::<__m128i>();
    debug_assert_eq!(src.len(), BATCH_ELEMS_V128_X1);
    debug_assert!(src.len() <= dst.len());

    let nibble_mask = _mm_set1_epi8(0x0F);
    let table = _mm_loadu_si128(lut16::<UPPER>().as_ptr().cast());

    let bytes = _mm_loadu_si128(src.as_ptr().cast::<__m128i>());
    // The 16-bit shift drags bits across byte boundaries; the mask removes them again.
    let high = _mm_and_si128(_mm_srli_epi16::<4>(bytes), nibble_mask);
    let low = _mm_and_si128(bytes, nibble_mask);

    // Interleaving yields high/low nibble pairs in input order: first the pairs for input
    // bytes 0..8, then those for bytes 8..16.
    let front = _mm_shuffle_epi8(table, _mm_unpacklo_epi8(high, low));
    let back = _mm_shuffle_epi8(table, _mm_unpackhi_epi8(high, low));

    let cells = dst.as_mut_ptr().cast::<__m128i>();
    _mm_storeu_si128(cells, front);
    _mm_storeu_si128(cells.add(1), back);
}
/// Encodes 16 to 32 input bytes with two 128-bit passes that may overlap.
///
/// The first pass covers the first 16 input bytes and the second pass covers the last 16.
/// When the input is shorter than 32 bytes the middle region is simply encoded twice with
/// identical results, which avoids a scalar tail.
///
/// # Safety
///
/// The CPU must support SSSE3, `src.len()` must lie in `16..=32` and `dst` must provide at
/// least `src.len()` elements (all only checked by `debug_assert!`).
#[target_feature(enable = "ssse3")]
unsafe fn encode_ssse3_unchecked_v128_overlapped<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    const BATCH_ELEMS_V128_X1: usize = size_of::<__m128i>();
    const BATCH_ELEMS_V128_X2: usize = 2 * size_of::<__m128i>();
    debug_assert!(src.len() >= BATCH_ELEMS_V128_X1);
    debug_assert!(src.len() <= BATCH_ELEMS_V128_X2);
    debug_assert!(src.len() <= dst.len());

    let nibble_mask = _mm_set1_epi8(0x0F);
    let table = _mm_loadu_si128(lut16::<UPPER>().as_ptr().cast());

    // Encodes the 16 bytes at `from` into the 16 two-byte cells starting at `to`.
    let encode_at = |from: *const u8, to: *mut [MaybeUninit<u8>; 2]| {
        let bytes = _mm_loadu_si128(from.cast::<__m128i>());
        let high = _mm_and_si128(_mm_srli_epi16::<4>(bytes), nibble_mask);
        let low = _mm_and_si128(bytes, nibble_mask);
        let front = _mm_shuffle_epi8(table, _mm_unpacklo_epi8(high, low));
        let back = _mm_shuffle_epi8(table, _mm_unpackhi_epi8(high, low));
        let cells = to.cast::<__m128i>();
        _mm_storeu_si128(cells, front);
        _mm_storeu_si128(cells.add(1), back);
    };

    let len = src.len();
    let input = src.as_ptr();
    let output = dst.as_mut_ptr();

    // Head: the first vector's worth of input.
    encode_at(input, output);
    // Tail: the last vector's worth of input, addressed backwards from the end.
    encode_at(
        input.add(len).sub(BATCH_ELEMS_V128_X1),
        output.add(len).sub(BATCH_ELEMS_V128_X1),
    );
}
/// Encodes an arbitrarily long input: whole 16-byte vectors go through the SSSE3 kernel and
/// whatever is left (fewer than 16 bytes, possibly none) is handed to
/// `encode_generic_unchecked`.
///
/// # Safety
///
/// The CPU must support SSSE3, `src` must hold at least 16 bytes and `dst` must provide at
/// least `src.len()` elements (only checked by `debug_assert!`).
#[target_feature(enable = "ssse3")]
unsafe fn encode_ssse3_unchecked_v128_with_trailing<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    const BATCH_ELEMS_V128_X1: usize = size_of::<__m128i>();
    debug_assert!(src.len() >= BATCH_ELEMS_V128_X1);
    debug_assert!(src.len() <= dst.len());

    let nibble_mask = _mm_set1_epi8(0x0F);
    let table = _mm_loadu_si128(lut16::<UPPER>().as_ptr().cast());

    // One input vector expands to two output vectors.
    let encode_vector = |from: *const __m128i, to: *mut __m128i| {
        let bytes = _mm_loadu_si128(from);
        let high = _mm_and_si128(_mm_srli_epi16::<4>(bytes), nibble_mask);
        let low = _mm_and_si128(bytes, nibble_mask);
        let front = _mm_shuffle_epi8(table, _mm_unpacklo_epi8(high, low));
        let back = _mm_shuffle_epi8(table, _mm_unpackhi_epi8(high, low));
        _mm_storeu_si128(to, front);
        _mm_storeu_si128(to.add(1), back);
    };

    let full_vectors = src.len() / BATCH_ELEMS_V128_X1;
    let leftover = src.len() % BATCH_ELEMS_V128_X1;

    let mut from = src.as_ptr().cast::<__m128i>();
    let mut to = dst.as_mut_ptr().cast::<__m128i>();
    for _ in 0..full_vectors {
        encode_vector(from, to);
        from = from.add(1);
        to = to.add(2);
    }

    // Both cursors now sit right behind the vectorised region; the rest is the scalar tail.
    let tail_src = slice::from_raw_parts(from.cast::<u8>(), leftover);
    let tail_dst = slice::from_raw_parts_mut(to.cast::<[MaybeUninit<u8>; 2]>(), leftover);
    encode_generic_unchecked::<UPPER>(tail_src, tail_dst);
}
/// Encodes `src` into `dst` (two output bytes per input byte), choosing the widest suitable
/// kernel available with AVX2 from the input length.
///
/// * fewer than 16 bytes: generic encoder
/// * exactly 16 bytes: one 128-bit pass
/// * 17 to 32 bytes: two overlapping 128-bit passes
/// * 33 to 64 bytes: two overlapping 256-bit passes
/// * more than 64 bytes: batched 256-bit loop followed by a generic tail
///
/// # Safety
///
/// The CPU must support AVX2, and `dst` must provide at least `src.len()` elements; the
/// callees verify the latter only through `debug_assert!`.
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn encode_avx2_unchecked<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    let len = src.len();
    if len < 16 {
        encode_generic_unchecked::<UPPER>(src, dst);
    } else if len == 16 {
        encode_ssse3_unchecked_v128_exact::<UPPER>(src, dst);
    } else if len <= 32 {
        encode_ssse3_unchecked_v128_overlapped::<UPPER>(src, dst);
    } else if len <= 64 {
        encode_avx2_unchecked_v256_overlapped::<UPPER>(src, dst);
    } else {
        encode_avx2_unchecked_v256_with_trailing::<UPPER>(src, dst);
    }
}
/// Encodes 32 to 64 input bytes with two 256-bit passes that may overlap.
///
/// The first pass covers the first 32 input bytes and the second pass the last 32, so any
/// overlap in the middle is written twice with identical data.
///
/// The byte interleave works within each 128-bit lane, so a cross-lane permute is needed to
/// put the output back into input order before the table lookup. The lookup itself is also
/// per lane, so `lut32::<UPPER>()` is expected to carry the table in both halves.
///
/// # Safety
///
/// The CPU must support AVX2, `src.len()` must lie in `32..=64` and `dst` must provide at
/// least `src.len()` elements (all only checked by `debug_assert!`).
#[target_feature(enable = "avx2")]
unsafe fn encode_avx2_unchecked_v256_overlapped<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    const BATCH_ELEMS_V256_X1: usize = size_of::<__m256i>();
    const BATCH_ELEMS_V256_X2: usize = 2 * size_of::<__m256i>();
    debug_assert!(src.len() >= BATCH_ELEMS_V256_X1);
    debug_assert!(src.len() <= BATCH_ELEMS_V256_X2);
    debug_assert!(src.len() <= dst.len());

    let nibble_mask = _mm256_set1_epi8(0x0F);
    let table = _mm256_loadu_si256(lut32::<UPPER>().as_ptr().cast());

    // Encodes the 32 bytes at `from` into the 32 two-byte cells starting at `to`.
    let encode_at = |from: *const u8, to: *mut [MaybeUninit<u8>; 2]| {
        let bytes = _mm256_loadu_si256(from.cast::<__m256i>());
        let high = _mm256_and_si256(_mm256_srli_epi16::<4>(bytes), nibble_mask);
        let low = _mm256_and_si256(bytes, nibble_mask);
        // Lane-local interleave: quarters (A, C) and (B, D) of the final output.
        let quarters_ac = _mm256_unpacklo_epi8(high, low);
        let quarters_bd = _mm256_unpackhi_epi8(high, low);
        // Regroup the lanes into sequential order: (A, B) then (C, D).
        let quarters_ab = _mm256_permute2x128_si256::<0x20>(quarters_ac, quarters_bd);
        let quarters_cd = _mm256_permute2x128_si256::<0x31>(quarters_ac, quarters_bd);
        let cells = to.cast::<__m256i>();
        _mm256_storeu_si256(cells, _mm256_shuffle_epi8(table, quarters_ab));
        _mm256_storeu_si256(cells.add(1), _mm256_shuffle_epi8(table, quarters_cd));
    };

    let len = src.len();
    let input = src.as_ptr();
    let output = dst.as_mut_ptr();

    encode_at(input, output);
    encode_at(
        input.add(len).sub(BATCH_ELEMS_V256_X1),
        output.add(len).sub(BATCH_ELEMS_V256_X1),
    );
}
/// Encodes an arbitrarily long input: whole 32-byte vectors go through the AVX2 kernel and
/// whatever is left (fewer than 32 bytes, possibly none) is handed to
/// `encode_generic_unchecked`.
///
/// # Safety
///
/// The CPU must support AVX2, `src` must hold at least 32 bytes and `dst` must provide at
/// least `src.len()` elements (only checked by `debug_assert!`).
#[target_feature(enable = "avx2")]
unsafe fn encode_avx2_unchecked_v256_with_trailing<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    const BATCH_ELEMS_V256_X1: usize = size_of::<__m256i>();
    debug_assert!(src.len() >= BATCH_ELEMS_V256_X1);
    debug_assert!(src.len() <= dst.len());

    let nibble_mask = _mm256_set1_epi8(0x0F);
    let table = _mm256_loadu_si256(lut32::<UPPER>().as_ptr().cast());

    // One input vector expands to two output vectors.
    let encode_vector = |from: *const __m256i, to: *mut __m256i| {
        let bytes = _mm256_loadu_si256(from);
        let high = _mm256_and_si256(_mm256_srli_epi16::<4>(bytes), nibble_mask);
        let low = _mm256_and_si256(bytes, nibble_mask);
        // Lane-local interleave, then a cross-lane regroup into sequential order.
        let quarters_ac = _mm256_unpacklo_epi8(high, low);
        let quarters_bd = _mm256_unpackhi_epi8(high, low);
        let quarters_ab = _mm256_permute2x128_si256::<0x20>(quarters_ac, quarters_bd);
        let quarters_cd = _mm256_permute2x128_si256::<0x31>(quarters_ac, quarters_bd);
        _mm256_storeu_si256(to, _mm256_shuffle_epi8(table, quarters_ab));
        _mm256_storeu_si256(to.add(1), _mm256_shuffle_epi8(table, quarters_cd));
    };

    let full_vectors = src.len() / BATCH_ELEMS_V256_X1;
    let leftover = src.len() % BATCH_ELEMS_V256_X1;

    let mut from = src.as_ptr().cast::<__m256i>();
    let mut to = dst.as_mut_ptr().cast::<__m256i>();
    for _ in 0..full_vectors {
        encode_vector(from, to);
        from = from.add(1);
        to = to.add(2);
    }

    // Both cursors now sit right behind the vectorised region; the rest is the scalar tail.
    let tail_src = slice::from_raw_parts(from.cast::<u8>(), leftover);
    let tail_dst = slice::from_raw_parts_mut(to.cast::<[MaybeUninit<u8>; 2]>(), leftover);
    encode_generic_unchecked::<UPPER>(tail_src, tail_dst);
}
/// Encodes `src` into `dst` (two output bytes per input byte), choosing the widest suitable
/// kernel available with AVX-512 from the input length.
///
/// * fewer than 16 bytes: generic encoder
/// * exactly 16 bytes: one 128-bit pass
/// * 17 to 32 bytes: two overlapping 128-bit passes
/// * 33 to 64 bytes: two overlapping 256-bit passes
/// * 65 to 128 bytes: two overlapping 512-bit passes
/// * more than 128 bytes: batched 512-bit loop followed by a generic tail
///
/// # Safety
///
/// The CPU must support AVX-512F and AVX-512BW, and `dst` must provide at least `src.len()`
/// elements; the callees verify the latter only through `debug_assert!`.
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn encode_avx512_unchecked<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    let len = src.len();
    if len < 16 {
        encode_generic_unchecked::<UPPER>(src, dst);
    } else if len == 16 {
        encode_ssse3_unchecked_v128_exact::<UPPER>(src, dst);
    } else if len <= 32 {
        encode_ssse3_unchecked_v128_overlapped::<UPPER>(src, dst);
    } else if len <= 64 {
        encode_avx2_unchecked_v256_overlapped::<UPPER>(src, dst);
    } else if len <= 128 {
        encode_avx512_unchecked_v512_overlapped::<UPPER>(src, dst);
    } else {
        encode_avx512_unchecked_v512_with_trailing::<UPPER>(src, dst);
    }
}
/// Encodes 64 to 128 input bytes with two 512-bit passes that may overlap.
///
/// The first pass covers the first 64 input bytes and the second pass the last 64, so any
/// overlap in the middle is written twice with identical data.
///
/// The byte interleave works within each 128-bit lane, so the eight resulting 128-bit
/// pieces (named A..H in output order) are regrouped with two 64-bit-granular two-source
/// permutes before the per-lane table lookup.
///
/// # Safety
///
/// The CPU must support AVX-512F and AVX-512BW, `src.len()` must lie in `64..=128` and `dst`
/// must provide at least `src.len()` elements (all only checked by `debug_assert!`).
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn encode_avx512_unchecked_v512_overlapped<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    const BATCH_ELEMS_V512_X1: usize = size_of::<__m512i>();
    const BATCH_ELEMS_V512_X2: usize = 2 * size_of::<__m512i>();
    debug_assert!(src.len() >= BATCH_ELEMS_V512_X1);
    debug_assert!(src.len() <= BATCH_ELEMS_V512_X2);
    debug_assert!(src.len() <= dst.len());

    let nibble_mask = _mm512_set1_epi8(0x0F);
    // Indices 0..=7 pick 64-bit words from the first operand, 8..=15 from the second
    // (arguments are listed from the highest element down to the lowest).
    let pick_abcd = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
    let pick_efgh = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
    let table = _mm512_loadu_si512(lut64::<UPPER>().as_ptr().cast());

    // Encodes the 64 bytes at `from` into the 64 two-byte cells starting at `to`.
    let encode_at = |from: *const u8, to: *mut [MaybeUninit<u8>; 2]| {
        let bytes = _mm512_loadu_si512(from.cast::<__m512i>());
        let high = _mm512_and_si512(_mm512_srli_epi16::<4>(bytes), nibble_mask);
        let low = _mm512_and_si512(bytes, nibble_mask);
        // Lane-local interleave: pieces (A, C, E, G) and (B, D, F, H).
        let pieces_aceg = _mm512_unpacklo_epi8(high, low);
        let pieces_bdfh = _mm512_unpackhi_epi8(high, low);
        let pieces_abcd = _mm512_permutex2var_epi64(pieces_aceg, pick_abcd, pieces_bdfh);
        let pieces_efgh = _mm512_permutex2var_epi64(pieces_aceg, pick_efgh, pieces_bdfh);
        let cells = to.cast::<__m512i>();
        _mm512_storeu_si512(cells, _mm512_shuffle_epi8(table, pieces_abcd));
        _mm512_storeu_si512(cells.add(1), _mm512_shuffle_epi8(table, pieces_efgh));
    };

    let len = src.len();
    let input = src.as_ptr();
    let output = dst.as_mut_ptr();

    encode_at(input, output);
    encode_at(
        input.add(len).sub(BATCH_ELEMS_V512_X1),
        output.add(len).sub(BATCH_ELEMS_V512_X1),
    );
}
/// Encodes an arbitrarily long input: whole 64-byte vectors go through the AVX-512 kernel
/// and whatever is left (fewer than 64 bytes, possibly none) is handed to
/// `encode_generic_unchecked`.
///
/// # Safety
///
/// The CPU must support AVX-512F and AVX-512BW, `src` must hold at least 64 bytes and `dst`
/// must provide at least `src.len()` elements (only checked by `debug_assert!`).
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn encode_avx512_unchecked_v512_with_trailing<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    const BATCH_ELEMS_V512_X1: usize = size_of::<__m512i>();
    debug_assert!(src.len() >= BATCH_ELEMS_V512_X1);
    debug_assert!(src.len() <= dst.len());

    let nibble_mask = _mm512_set1_epi8(0x0F);
    // Indices 0..=7 pick 64-bit words from the first operand, 8..=15 from the second
    // (arguments are listed from the highest element down to the lowest).
    let pick_abcd = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
    let pick_efgh = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
    let table = _mm512_loadu_si512(lut64::<UPPER>().as_ptr().cast());

    // One input vector expands to two output vectors.
    let encode_vector = |from: *const __m512i, to: *mut __m512i| {
        let bytes = _mm512_loadu_si512(from);
        let high = _mm512_and_si512(_mm512_srli_epi16::<4>(bytes), nibble_mask);
        let low = _mm512_and_si512(bytes, nibble_mask);
        // Lane-local interleave, then regroup the 128-bit pieces into sequential order.
        let pieces_aceg = _mm512_unpacklo_epi8(high, low);
        let pieces_bdfh = _mm512_unpackhi_epi8(high, low);
        let pieces_abcd = _mm512_permutex2var_epi64(pieces_aceg, pick_abcd, pieces_bdfh);
        let pieces_efgh = _mm512_permutex2var_epi64(pieces_aceg, pick_efgh, pieces_bdfh);
        _mm512_storeu_si512(to, _mm512_shuffle_epi8(table, pieces_abcd));
        _mm512_storeu_si512(to.add(1), _mm512_shuffle_epi8(table, pieces_efgh));
    };

    let full_vectors = src.len() / BATCH_ELEMS_V512_X1;
    let leftover = src.len() % BATCH_ELEMS_V512_X1;

    let src_vectors = src.as_ptr().cast::<__m512i>();
    let dst_vectors = dst.as_mut_ptr().cast::<__m512i>();
    let mut index = 0;
    while index < full_vectors {
        encode_vector(src_vectors.add(index), dst_vectors.add(index * 2));
        index += 1;
    }

    let encoded = full_vectors * BATCH_ELEMS_V512_X1;
    let tail_src = slice::from_raw_parts(src.as_ptr().add(encoded), leftover);
    let tail_dst = slice::from_raw_parts_mut(dst.as_mut_ptr().add(encoded), leftover);
    encode_generic_unchecked::<UPPER>(tail_src, tail_dst);
}
/// Decodes the character pairs in `src` into bytes at `dst`, choosing an SSSE3 strategy from
/// the number of pairs.
///
/// * fewer than 8 pairs: `decode_generic_unchecked::<false>`
/// * exactly 8 pairs: one 128-bit pass
/// * 9 to 15 pairs: two overlapping 128-bit passes
/// * 16 pairs or more: batched loop over two 128-bit vectors at a time, with its own tail
///   handling
///
/// # Errors
///
/// Propagates whatever the selected kernel returns; the SSSE3 kernels report `InvalidInput`
/// when a character is not a hexadecimal digit.
///
/// # Safety
///
/// The CPU must support SSSE3, `src` must be valid for reads of all of its pairs and `dst`
/// must be valid for writes of one byte per pair.
#[target_feature(enable = "ssse3")]
#[inline]
pub(crate) unsafe fn decode_ssse3_unchecked(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    let pairs = src.len();
    if pairs < 8 {
        decode_generic_unchecked::<false>(src, dst)
    } else if pairs == 8 {
        decode_ssse3_unchecked_v128_exact(src, dst)
    } else if pairs < 16 {
        decode_ssse3_unchecked_v128_overlapped(src, dst)
    } else {
        decode_ssse3_unchecked_v128x2_with_trailing(src, dst)
    }
}
/// Decodes exactly 8 character pairs (16 input bytes) into 8 output bytes.
///
/// Every character is converted to a nibble value by two branch-free candidates, one for
/// `'0'..='9'` and one for `'A'..='F'` / `'a'..='f'`. Each candidate yields a value above 15
/// for characters outside its range, so the unsigned minimum of the two is the nibble for
/// valid characters and stays above 15 for anything else.
///
/// # Errors
///
/// Returns `InvalidInput` (before anything is written) if any character is not a
/// hexadecimal digit.
///
/// # Safety
///
/// The CPU must support SSSE3, `src` must be valid for reads of 16 bytes and `dst` must be
/// valid for writes of 8 bytes. `src.len()` must be 8 (only checked by `debug_assert_eq!`).
#[target_feature(enable = "ssse3")]
unsafe fn decode_ssse3_unchecked_v128_exact(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_MM128_X1: usize = size_of::<__m128i>() / 2;
    debug_assert_eq!(src.len(), BATCH_MM128_X1);

    let text = _mm_loadu_si128(src.cast::<__m128i>());

    // Digit candidate: the bias moves '0'..='9' to 0xF6..=0xFF, the saturating subtraction
    // and the wrapping subtraction of 0xF0 then leave 0..=9; every other byte ends up >= 16.
    let biased = _mm_add_epi8(text, _mm_set1_epi8((0xFF_u8 - b'9').cast_signed()));
    let from_digit = _mm_sub_epi8(
        _mm_subs_epu8(biased, _mm_set1_epi8(0x06)),
        _mm_set1_epi8(0xF0_u8.cast_signed()),
    );

    // Letter candidate: clearing bit 0x20 folds lower case onto upper case, then 'A' maps to
    // 10; the saturating addition keeps wrapped-around values from landing in 0..=15.
    let folded = _mm_and_si128(text, _mm_set1_epi8(0xDF_u8.cast_signed()));
    let from_letter = _mm_adds_epu8(
        _mm_sub_epi8(folded, _mm_set1_epi8(b'A'.cast_signed())),
        _mm_set1_epi8(0x0A),
    );

    let nibbles = _mm_min_epu8(from_digit, from_letter);

    // Adding 112 with saturation sets the top bit exactly for values above 15.
    let flagged = _mm_adds_epu8(nibbles, _mm_set1_epi8(127 - 15));
    if _mm_movemask_epi8(flagged) != 0 {
        return Err(InvalidInput);
    }

    // Per pair: first nibble * 16 + second nibble, as 16-bit lanes, then narrowed to bytes.
    let joined = _mm_maddubs_epi16(nibbles, _mm_set1_epi16(0x0110));
    let packed = _mm_packus_epi16(joined, _mm_setzero_si128());
    _mm_storel_epi64(dst.cast::<__m128i>(), packed);
    Ok(())
}
/// Decodes 8 to 16 character pairs with two 128-bit passes that may overlap.
///
/// The head pass covers the first 8 pairs and the tail pass the last 8; the overlapping
/// middle is decoded and written twice with identical results.
///
/// # Errors
///
/// Returns `InvalidInput` (before anything is written) if any character is not a
/// hexadecimal digit.
///
/// # Safety
///
/// The CPU must support SSSE3, `src` must be valid for reads of all of its pairs and `dst`
/// must be valid for writes of `src.len()` bytes. `src.len()` must lie in `8..=16` (only
/// checked by `debug_assert!`).
#[target_feature(enable = "ssse3")]
unsafe fn decode_ssse3_unchecked_v128_overlapped(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_MM128_X1: usize = size_of::<__m128i>() / 2;
    const BATCH_MM256_X1: usize = size_of::<__m256i>() / 2;
    debug_assert!(src.len() >= BATCH_MM128_X1);
    debug_assert!(src.len() <= BATCH_MM256_X1);

    let digit_bias = _mm_set1_epi8((0xFF_u8 - b'9').cast_signed());
    let six = _mm_set1_epi8(0x06);
    let digit_rebase = _mm_set1_epi8(0xF0_u8.cast_signed());
    let case_fold = _mm_set1_epi8(0xDF_u8.cast_signed());
    let letter_a = _mm_set1_epi8(b'A'.cast_signed());
    let ten = _mm_set1_epi8(0x0A);
    // Saturating addition of 112 sets the top bit exactly for values above 15.
    let range_probe = _mm_set1_epi8(127 - 15);

    // Maps characters to nibble values; anything that is not a hex digit becomes > 15.
    macro_rules! to_nibbles {
        ($text:expr) => {{
            let text = $text;
            let digit =
                _mm_sub_epi8(_mm_subs_epu8(_mm_add_epi8(text, digit_bias), six), digit_rebase);
            let letter =
                _mm_adds_epu8(_mm_sub_epi8(_mm_and_si128(text, case_fold), letter_a), ten);
            _mm_min_epu8(digit, letter)
        }};
    }

    let pairs = src.len();
    let head_text = _mm_loadu_si128(src.cast::<__m128i>());
    let tail_text = _mm_loadu_si128(
        src.cast::<[u8; 2]>()
            .add(pairs)
            .sub(BATCH_MM128_X1)
            .cast::<__m128i>(),
    );

    let head = to_nibbles!(head_text);
    let tail = to_nibbles!(tail_text);

    let flagged = _mm_or_si128(
        _mm_adds_epu8(head, range_probe),
        _mm_adds_epu8(tail, range_probe),
    );
    if _mm_movemask_epi8(flagged) != 0 {
        return Err(InvalidInput);
    }

    // Per pair: first nibble * 16 + second nibble, then narrowed to 8 bytes per pass.
    let weights = _mm_set1_epi16(0x0110);
    let zero = _mm_setzero_si128();
    let head_bytes = _mm_packus_epi16(_mm_maddubs_epi16(head, weights), zero);
    let tail_bytes = _mm_packus_epi16(_mm_maddubs_epi16(tail, weights), zero);

    _mm_storel_epi64(dst.cast::<__m128i>(), head_bytes);
    _mm_storel_epi64(
        dst.cast::<MaybeUninit<u8>>()
            .add(pairs)
            .sub(BATCH_MM128_X1)
            .cast::<__m128i>(),
        tail_bytes,
    );
    Ok(())
}
/// Decodes 16 or more character pairs: full batches of 16 pairs (two 128-bit loads, one
/// 128-bit store) run in a loop, and the remaining 0 to 15 pairs are dispatched to the
/// matching smaller kernel.
///
/// Validation is deferred: the loop keeps decoding and storing even after it has seen an
/// invalid character, and the accumulated flag is inspected once after the last full batch.
///
/// # Errors
///
/// Returns `InvalidInput` if any full batch contained a character that is not a hexadecimal
/// digit (the tail is not examined in that case); otherwise returns the result of the tail
/// kernel.
///
/// # Safety
///
/// The CPU must support SSSE3, `src` must be valid for reads of all of its pairs and `dst`
/// must be valid for writes of `src.len()` bytes. `src.len()` must be at least 16 (only
/// checked by `debug_assert!`).
#[target_feature(enable = "ssse3")]
unsafe fn decode_ssse3_unchecked_v128x2_with_trailing(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_ELEMS_V128_X2: usize = size_of::<__m128i>() / 2 * 2;
    debug_assert!(src.len() >= BATCH_ELEMS_V128_X2);
    debug_assert!(src.len() <= dst.len());

    let digit_bias = _mm_set1_epi8((0xFF_u8 - b'9').cast_signed());
    let six = _mm_set1_epi8(0x06);
    let digit_rebase = _mm_set1_epi8(0xF0_u8.cast_signed());
    let case_fold = _mm_set1_epi8(0xDF_u8.cast_signed());
    let letter_a = _mm_set1_epi8(b'A'.cast_signed());
    let ten = _mm_set1_epi8(0x0A);
    // Saturating addition of 112 sets the top bit exactly for values above 15.
    let range_probe = _mm_set1_epi8(127 - 15);
    let weights = _mm_set1_epi16(0x0110);

    // Maps characters to nibble values; anything that is not a hex digit becomes > 15.
    macro_rules! to_nibbles {
        ($text:expr) => {{
            let text = $text;
            let digit =
                _mm_sub_epi8(_mm_subs_epu8(_mm_add_epi8(text, digit_bias), six), digit_rebase);
            let letter =
                _mm_adds_epu8(_mm_sub_epi8(_mm_and_si128(text, case_fold), letter_a), ten);
            _mm_min_epu8(digit, letter)
        }};
    }

    let full_batches = src.len() / BATCH_ELEMS_V128_X2;
    let leftover = src.len() % BATCH_ELEMS_V128_X2;
    let src_vectors = src.cast::<__m128i>();
    let dst_vectors = dst.cast::<__m128i>();

    let mut invalid = 0;
    for batch in 0..full_batches {
        let first = to_nibbles!(_mm_loadu_si128(src_vectors.add(batch * 2)));
        let second = to_nibbles!(_mm_loadu_si128(src_vectors.add(batch * 2 + 1)));

        invalid |= _mm_movemask_epi8(_mm_or_si128(
            _mm_adds_epu8(first, range_probe),
            _mm_adds_epu8(second, range_probe),
        ));

        // Per pair: first nibble * 16 + second nibble; both halves narrow into one vector.
        let bytes = _mm_packus_epi16(
            _mm_maddubs_epi16(first, weights),
            _mm_maddubs_epi16(second, weights),
        );
        _mm_storeu_si128(dst_vectors.add(batch), bytes);
    }

    if invalid != 0 {
        return Err(InvalidInput);
    }

    let decoded = full_batches * BATCH_ELEMS_V128_X2;
    let tail_src = ptr::slice_from_raw_parts(src.cast::<[u8; 2]>().add(decoded), leftover);
    let tail_dst =
        ptr::slice_from_raw_parts_mut(dst.cast::<MaybeUninit<u8>>().add(decoded), leftover);

    match leftover {
        0..8 => decode_generic_unchecked::<false>(tail_src, tail_dst),
        8 => decode_ssse3_unchecked_v128_exact(tail_src, tail_dst),
        9..16 => decode_ssse3_unchecked_v128_overlapped(tail_src, tail_dst),
        // The leftover is a remainder modulo 16, so it can never reach 16.
        16.. => hint::unreachable_unchecked(),
    }
}
/// Decodes the character pairs in `src` into bytes at `dst`, choosing the widest suitable
/// kernel available with AVX2 from the number of pairs.
///
/// * fewer than 8 pairs: `decode_generic_unchecked::<false>`
/// * exactly 8 pairs: one 128-bit pass
/// * 9 to 15 pairs: two overlapping 128-bit passes
/// * exactly 16 pairs: one 256-bit pass
/// * 17 to 31 pairs: two overlapping 256-bit passes
/// * 32 pairs or more: batched loop over two 256-bit vectors at a time, with its own tail
///   handling
///
/// # Errors
///
/// Propagates whatever the selected kernel returns; the SIMD kernels report `InvalidInput`
/// when a character is not a hexadecimal digit.
///
/// # Safety
///
/// The CPU must support AVX2, `src` must be valid for reads of all of its pairs and `dst`
/// must be valid for writes of one byte per pair.
#[target_feature(enable = "avx2")]
#[inline]
pub(crate) unsafe fn decode_avx2_unchecked(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    let pairs = src.len();
    if pairs < 8 {
        decode_generic_unchecked::<false>(src, dst)
    } else if pairs == 8 {
        decode_ssse3_unchecked_v128_exact(src, dst)
    } else if pairs < 16 {
        decode_ssse3_unchecked_v128_overlapped(src, dst)
    } else if pairs == 16 {
        decode_avx2_unchecked_v256_exact(src, dst)
    } else if pairs < 32 {
        decode_avx2_unchecked_v256_overlapped(src, dst)
    } else {
        decode_avx2_unchecked_v256x2_with_trailing(src, dst)
    }
}
/// Decodes exactly 16 character pairs (32 input bytes) into 16 output bytes.
///
/// Characters are mapped to nibble values with a digit candidate and a letter candidate
/// whose unsigned minimum is above 15 for anything that is not a hexadecimal digit.
///
/// # Errors
///
/// Returns `InvalidInput` (before anything is written) if any character is not a
/// hexadecimal digit.
///
/// # Safety
///
/// The CPU must support AVX2, `src` must be valid for reads of 32 bytes and `dst` must be
/// valid for writes of 16 bytes. `src.len()` must be 16 (only checked by
/// `debug_assert_eq!`).
#[target_feature(enable = "avx2")]
unsafe fn decode_avx2_unchecked_v256_exact(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_MM256_X1: usize = size_of::<__m256i>() / 2;
    debug_assert_eq!(src.len(), BATCH_MM256_X1);

    let text = _mm256_loadu_si256(src.cast::<__m256i>());

    // Digit candidate: '0'..='9' become 0..=9, every other byte ends up >= 16.
    let biased = _mm256_add_epi8(text, _mm256_set1_epi8((0xFF_u8 - b'9').cast_signed()));
    let from_digit = _mm256_sub_epi8(
        _mm256_subs_epu8(biased, _mm256_set1_epi8(0x06)),
        _mm256_set1_epi8(0xF0_u8.cast_signed()),
    );

    // Letter candidate: bit 0x20 is cleared to fold case, then 'A' maps to 10.
    let folded = _mm256_and_si256(text, _mm256_set1_epi8(0xDF_u8.cast_signed()));
    let from_letter = _mm256_adds_epu8(
        _mm256_sub_epi8(folded, _mm256_set1_epi8(b'A'.cast_signed())),
        _mm256_set1_epi8(0x0A),
    );

    let nibbles = _mm256_min_epu8(from_digit, from_letter);

    // Adding 112 with saturation sets the top bit exactly for values above 15.
    let flagged = _mm256_adds_epu8(nibbles, _mm256_set1_epi8(127 - 15));
    if _mm256_movemask_epi8(flagged) != 0 {
        return Err(InvalidInput);
    }

    // Per pair: first nibble * 16 + second nibble. The narrowing pack works per 128-bit
    // lane, so the 64-bit permute gathers both lanes' results into the low 128 bits.
    let joined = _mm256_maddubs_epi16(nibbles, _mm256_set1_epi16(0x0110));
    let packed = _mm256_packus_epi16(joined, _mm256_setzero_si256());
    let gathered = _mm256_permute4x64_epi64::<0b11_01_10_00>(packed);
    _mm_storeu_si128(dst.cast::<__m128i>(), _mm256_castsi256_si128(gathered));
    Ok(())
}
/// Decodes 16 to 32 character pairs with two 256-bit passes that may overlap.
///
/// The head pass covers the first 16 pairs and the tail pass the last 16; the overlapping
/// middle is decoded and written twice with identical results.
///
/// # Errors
///
/// Returns `InvalidInput` (before anything is written) if any character is not a
/// hexadecimal digit.
///
/// # Safety
///
/// The CPU must support AVX2, `src` must be valid for reads of all of its pairs and `dst`
/// must be valid for writes of `src.len()` bytes. `src.len()` must lie in `16..=32` (only
/// checked by `debug_assert!`).
#[target_feature(enable = "avx2")]
unsafe fn decode_avx2_unchecked_v256_overlapped(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_MM256_X1: usize = size_of::<__m256i>() / 2;
    const BATCH_MM512_X1: usize = size_of::<__m512i>() / 2;
    debug_assert!(src.len() >= BATCH_MM256_X1);
    debug_assert!(src.len() <= BATCH_MM512_X1);

    let digit_bias = _mm256_set1_epi8((0xFF_u8 - b'9').cast_signed());
    let six = _mm256_set1_epi8(0x06);
    let digit_rebase = _mm256_set1_epi8(0xF0_u8.cast_signed());
    let case_fold = _mm256_set1_epi8(0xDF_u8.cast_signed());
    let letter_a = _mm256_set1_epi8(b'A'.cast_signed());
    let ten = _mm256_set1_epi8(0x0A);
    // Saturating addition of 112 sets the top bit exactly for values above 15.
    let range_probe = _mm256_set1_epi8(127 - 15);

    // Maps characters to nibble values; anything that is not a hex digit becomes > 15.
    macro_rules! to_nibbles {
        ($text:expr) => {{
            let text = $text;
            let digit = _mm256_sub_epi8(
                _mm256_subs_epu8(_mm256_add_epi8(text, digit_bias), six),
                digit_rebase,
            );
            let letter = _mm256_adds_epu8(
                _mm256_sub_epi8(_mm256_and_si256(text, case_fold), letter_a),
                ten,
            );
            _mm256_min_epu8(digit, letter)
        }};
    }

    let pairs = src.len();
    let head_text = _mm256_loadu_si256(src.cast::<__m256i>());
    let tail_text = _mm256_loadu_si256(
        src.cast::<[u8; 2]>()
            .add(pairs)
            .sub(BATCH_MM256_X1)
            .cast::<__m256i>(),
    );

    let head = to_nibbles!(head_text);
    let tail = to_nibbles!(tail_text);

    let flagged = _mm256_or_si256(
        _mm256_adds_epu8(head, range_probe),
        _mm256_adds_epu8(tail, range_probe),
    );
    if _mm256_movemask_epi8(flagged) != 0 {
        return Err(InvalidInput);
    }

    let weights = _mm256_set1_epi16(0x0110);
    // Joins each pair into a byte, narrows per lane, then gathers both lanes' results into
    // the low 128 bits.
    macro_rules! to_bytes {
        ($nibbles:expr) => {{
            let packed = _mm256_packus_epi16(
                _mm256_maddubs_epi16($nibbles, weights),
                _mm256_setzero_si256(),
            );
            _mm256_castsi256_si128(_mm256_permute4x64_epi64::<0b11_01_10_00>(packed))
        }};
    }
    let head_bytes = to_bytes!(head);
    let tail_bytes = to_bytes!(tail);

    _mm_storeu_si128(dst.cast::<__m128i>(), head_bytes);
    _mm_storeu_si128(
        dst.cast::<MaybeUninit<u8>>()
            .add(pairs)
            .sub(BATCH_MM256_X1)
            .cast::<__m128i>(),
        tail_bytes,
    );
    Ok(())
}
/// Decodes 32 or more character pairs: full batches of 32 pairs (two 256-bit loads, one
/// 256-bit store) run in a loop, and the remaining 0 to 31 pairs are dispatched to the
/// matching smaller kernel.
///
/// Validation is deferred: the loop keeps decoding and storing even after it has seen an
/// invalid character, and the accumulated flag is inspected once after the last full batch.
///
/// # Errors
///
/// Returns `InvalidInput` if any full batch contained a character that is not a hexadecimal
/// digit (the tail is not examined in that case); otherwise returns the result of the tail
/// kernel.
///
/// # Safety
///
/// The CPU must support AVX2, `src` must be valid for reads of all of its pairs and `dst`
/// must be valid for writes of `src.len()` bytes. `src.len()` must be at least 32 (only
/// checked by `debug_assert!`).
#[target_feature(enable = "avx2")]
unsafe fn decode_avx2_unchecked_v256x2_with_trailing(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_ELEMS_V256_X2: usize = size_of::<__m256i>() / 2 * 2;
    debug_assert!(src.len() >= BATCH_ELEMS_V256_X2);

    let digit_bias = _mm256_set1_epi8((0xFF_u8 - b'9').cast_signed());
    let six = _mm256_set1_epi8(0x06);
    let digit_rebase = _mm256_set1_epi8(0xF0_u8.cast_signed());
    let case_fold = _mm256_set1_epi8(0xDF_u8.cast_signed());
    let letter_a = _mm256_set1_epi8(b'A'.cast_signed());
    let ten = _mm256_set1_epi8(0x0A);
    // Saturating addition of 112 sets the top bit exactly for values above 15.
    let range_probe = _mm256_set1_epi8(127 - 15);
    let weights = _mm256_set1_epi16(0x0110);

    // Maps characters to nibble values; anything that is not a hex digit becomes > 15.
    macro_rules! to_nibbles {
        ($text:expr) => {{
            let text = $text;
            let digit = _mm256_sub_epi8(
                _mm256_subs_epu8(_mm256_add_epi8(text, digit_bias), six),
                digit_rebase,
            );
            let letter = _mm256_adds_epu8(
                _mm256_sub_epi8(_mm256_and_si256(text, case_fold), letter_a),
                ten,
            );
            _mm256_min_epu8(digit, letter)
        }};
    }

    let full_batches = src.len() / BATCH_ELEMS_V256_X2;
    let leftover = src.len() % BATCH_ELEMS_V256_X2;
    let src_vectors = src.cast::<__m256i>();
    let dst_vectors = dst.cast::<__m256i>();

    let mut invalid = 0;
    for batch in 0..full_batches {
        let first = to_nibbles!(_mm256_loadu_si256(src_vectors.add(batch * 2)));
        let second = to_nibbles!(_mm256_loadu_si256(src_vectors.add(batch * 2 + 1)));

        invalid |= _mm256_movemask_epi8(_mm256_or_si256(
            _mm256_adds_epu8(first, range_probe),
            _mm256_adds_epu8(second, range_probe),
        ));

        // The narrowing pack interleaves the two sources per 128-bit lane; the 64-bit
        // permute restores sequential order.
        let packed = _mm256_packus_epi16(
            _mm256_maddubs_epi16(first, weights),
            _mm256_maddubs_epi16(second, weights),
        );
        _mm256_storeu_si256(
            dst_vectors.add(batch),
            _mm256_permute4x64_epi64::<0b11_01_10_00>(packed),
        );
    }

    if invalid != 0 {
        return Err(InvalidInput);
    }

    let decoded = full_batches * BATCH_ELEMS_V256_X2;
    let tail_src = ptr::slice_from_raw_parts(src.cast::<[u8; 2]>().add(decoded), leftover);
    let tail_dst =
        ptr::slice_from_raw_parts_mut(dst.cast::<MaybeUninit<u8>>().add(decoded), leftover);

    match leftover {
        0..8 => decode_generic_unchecked::<false>(tail_src, tail_dst),
        8 => decode_ssse3_unchecked_v128_exact(tail_src, tail_dst),
        9..16 => decode_ssse3_unchecked_v128_overlapped(tail_src, tail_dst),
        16 => decode_avx2_unchecked_v256_exact(tail_src, tail_dst),
        17..32 => decode_avx2_unchecked_v256_overlapped(tail_src, tail_dst),
        // The leftover is a remainder modulo 32, so it can never reach 32.
        32.. => hint::unreachable_unchecked(),
    }
}
/// Decodes the character pairs in `src` into bytes at `dst`, choosing the widest suitable
/// AVX-512 kernel from the number of pairs.
///
/// * fewer than 8 pairs: `decode_generic_unchecked::<false>`
/// * exactly 8 / 16 / 32 pairs: one 128- / 256- / 512-bit pass
/// * 9 to 15, 17 to 31, 33 to 63 pairs: two overlapping passes of the respective width
/// * 64 pairs or more: batched loop over two 512-bit vectors at a time, with its own tail
///   handling
///
/// # Errors
///
/// Propagates whatever the selected kernel returns; the SIMD kernels report `InvalidInput`
/// when a character is not a hexadecimal digit.
///
/// # Safety
///
/// The CPU must support AVX-512F, AVX-512BW and AVX-512VL, `src` must be valid for reads of
/// all of its pairs and `dst` must be valid for writes of one byte per pair.
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
#[inline]
pub(crate) unsafe fn decode_avx512_unchecked(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    let pairs = src.len();
    if pairs < 8 {
        decode_generic_unchecked::<false>(src, dst)
    } else if pairs == 8 {
        decode_avx512_unchecked_v128_exact(src, dst)
    } else if pairs < 16 {
        decode_avx512_unchecked_v128_overlapped(src, dst)
    } else if pairs == 16 {
        decode_avx512_unchecked_v256_exact(src, dst)
    } else if pairs < 32 {
        decode_avx512_unchecked_v256_overlapped(src, dst)
    } else if pairs == 32 {
        decode_avx512_unchecked_v512_exact(src, dst)
    } else if pairs < 64 {
        decode_avx512_unchecked_v512_overlapped(src, dst)
    } else {
        decode_avx512_unchecked_v512x2_with_trailing(src, dst)
    }
}
/// Decodes exactly 8 character pairs (16 input bytes) into 8 output bytes, using AVX-512
/// mask compares for validation and a truncating narrow for the final bytes.
///
/// Characters are mapped to nibble values with a digit candidate and a letter candidate
/// whose unsigned minimum is above 15 for anything that is not a hexadecimal digit.
///
/// # Errors
///
/// Returns `InvalidInput` (before anything is written) if any character is not a
/// hexadecimal digit.
///
/// # Safety
///
/// The CPU must support AVX-512F, AVX-512BW and AVX-512VL, `src` must be valid for reads of
/// 16 bytes and `dst` must be valid for writes of 8 bytes. `src.len()` must be 8 (only
/// checked by `debug_assert_eq!`).
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
unsafe fn decode_avx512_unchecked_v128_exact(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_MM128_X1: usize = size_of::<__m128i>() / 2;
    debug_assert_eq!(src.len(), BATCH_MM128_X1);

    let text = _mm_loadu_si128(src.cast::<__m128i>());

    // Digit candidate: '0'..='9' become 0..=9, every other byte ends up >= 16.
    let biased = _mm_add_epi8(text, _mm_set1_epi8((0xFF_u8 - b'9').cast_signed()));
    let from_digit = _mm_sub_epi8(
        _mm_subs_epu8(biased, _mm_set1_epi8(0x06)),
        _mm_set1_epi8(0xF0_u8.cast_signed()),
    );

    // Letter candidate: bit 0x20 is cleared to fold case, then 'A' maps to 10.
    let folded = _mm_and_si128(text, _mm_set1_epi8(0xDF_u8.cast_signed()));
    let from_letter = _mm_adds_epu8(
        _mm_sub_epi8(folded, _mm_set1_epi8(b'A'.cast_signed())),
        _mm_set1_epi8(0x0A),
    );

    let nibbles = _mm_min_epu8(from_digit, from_letter);

    let out_of_range = _mm_cmpgt_epu8_mask(nibbles, _mm_set1_epi8(0x0F));
    if out_of_range != 0 {
        return Err(InvalidInput);
    }

    // Per pair: first nibble * 16 + second nibble, then truncate each 16-bit lane to a byte.
    let joined = _mm_maddubs_epi16(nibbles, _mm_set1_epi16(0x0110));
    _mm_storel_epi64(dst.cast::<__m128i>(), _mm_cvtepi16_epi8(joined));
    Ok(())
}
/// Decodes 8 to 16 character pairs with two 128-bit passes that may overlap, using AVX-512
/// mask compares for validation.
///
/// The head pass covers the first 8 pairs and the tail pass the last 8; the overlapping
/// middle is decoded and written twice with identical results.
///
/// # Errors
///
/// Returns `InvalidInput` (before anything is written) if any character is not a
/// hexadecimal digit.
///
/// # Safety
///
/// The CPU must support AVX-512F, AVX-512BW and AVX-512VL, `src` must be valid for reads of
/// all of its pairs and `dst` must be valid for writes of `src.len()` bytes. `src.len()`
/// must lie in `8..=16` (only checked by `debug_assert!`).
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
unsafe fn decode_avx512_unchecked_v128_overlapped(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_MM128_X1: usize = size_of::<__m128i>() / 2;
    const BATCH_MM256_X1: usize = size_of::<__m256i>() / 2;
    debug_assert!(src.len() >= BATCH_MM128_X1);
    debug_assert!(src.len() <= BATCH_MM256_X1);

    let digit_bias = _mm_set1_epi8((0xFF_u8 - b'9').cast_signed());
    let six = _mm_set1_epi8(0x06);
    let digit_rebase = _mm_set1_epi8(0xF0_u8.cast_signed());
    let case_fold = _mm_set1_epi8(0xDF_u8.cast_signed());
    let letter_a = _mm_set1_epi8(b'A'.cast_signed());
    let ten = _mm_set1_epi8(0x0A);
    let max_nibble = _mm_set1_epi8(0x0F);

    // Maps characters to nibble values; anything that is not a hex digit becomes > 15.
    macro_rules! to_nibbles {
        ($text:expr) => {{
            let text = $text;
            let digit =
                _mm_sub_epi8(_mm_subs_epu8(_mm_add_epi8(text, digit_bias), six), digit_rebase);
            let letter =
                _mm_adds_epu8(_mm_sub_epi8(_mm_and_si128(text, case_fold), letter_a), ten);
            _mm_min_epu8(digit, letter)
        }};
    }

    let pairs = src.len();
    let head_text = _mm_loadu_si128(src.cast::<__m128i>());
    let tail_text = _mm_loadu_si128(
        src.cast::<[u8; 2]>()
            .add(pairs)
            .sub(BATCH_MM128_X1)
            .cast::<__m128i>(),
    );

    let head = to_nibbles!(head_text);
    let tail = to_nibbles!(tail_text);

    // A value above 15 has a bit set in its upper nibble, which survives the OR, so one
    // compare covers both passes.
    let out_of_range = _mm_cmpgt_epu8_mask(_mm_or_si128(head, tail), max_nibble);
    if out_of_range != 0 {
        return Err(InvalidInput);
    }

    // Per pair: first nibble * 16 + second nibble, then truncate each 16-bit lane to a byte.
    let weights = _mm_set1_epi16(0x0110);
    let head_bytes = _mm_cvtepi16_epi8(_mm_maddubs_epi16(head, weights));
    let tail_bytes = _mm_cvtepi16_epi8(_mm_maddubs_epi16(tail, weights));

    _mm_storel_epi64(dst.cast::<__m128i>(), head_bytes);
    _mm_storel_epi64(
        dst.cast::<MaybeUninit<u8>>()
            .add(pairs)
            .sub(BATCH_MM128_X1)
            .cast::<__m128i>(),
        tail_bytes,
    );
    Ok(())
}
/// Decodes exactly 16 character pairs (32 input bytes) into 16 output bytes, using AVX-512
/// mask compares for validation and a truncating narrow for the final bytes.
///
/// Characters are mapped to nibble values with a digit candidate and a letter candidate
/// whose unsigned minimum is above 15 for anything that is not a hexadecimal digit.
///
/// # Errors
///
/// Returns `InvalidInput` (before anything is written) if any character is not a
/// hexadecimal digit.
///
/// # Safety
///
/// The CPU must support AVX-512F, AVX-512BW and AVX-512VL, `src` must be valid for reads of
/// 32 bytes and `dst` must be valid for writes of 16 bytes. `src.len()` must be 16 (only
/// checked by `debug_assert_eq!`).
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
unsafe fn decode_avx512_unchecked_v256_exact(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_MM256_X1: usize = size_of::<__m256i>() / 2;
    debug_assert_eq!(src.len(), BATCH_MM256_X1);

    let text = _mm256_loadu_si256(src.cast::<__m256i>());

    // Digit candidate: '0'..='9' become 0..=9, every other byte ends up >= 16.
    let biased = _mm256_add_epi8(text, _mm256_set1_epi8((0xFF_u8 - b'9').cast_signed()));
    let from_digit = _mm256_sub_epi8(
        _mm256_subs_epu8(biased, _mm256_set1_epi8(0x06)),
        _mm256_set1_epi8(0xF0_u8.cast_signed()),
    );

    // Letter candidate: bit 0x20 is cleared to fold case, then 'A' maps to 10.
    let folded = _mm256_and_si256(text, _mm256_set1_epi8(0xDF_u8.cast_signed()));
    let from_letter = _mm256_adds_epu8(
        _mm256_sub_epi8(folded, _mm256_set1_epi8(b'A'.cast_signed())),
        _mm256_set1_epi8(0x0A),
    );

    let nibbles = _mm256_min_epu8(from_digit, from_letter);

    let out_of_range = _mm256_cmpgt_epu8_mask(nibbles, _mm256_set1_epi8(0x0F));
    if out_of_range != 0 {
        return Err(InvalidInput);
    }

    // Per pair: first nibble * 16 + second nibble, then truncate each 16-bit lane to a byte.
    let joined = _mm256_maddubs_epi16(nibbles, _mm256_set1_epi16(0x0110));
    _mm_storeu_si128(dst.cast::<__m128i>(), _mm256_cvtepi16_epi8(joined));
    Ok(())
}
/// Decodes 16 to 32 character pairs with two 256-bit passes that may overlap, using AVX-512
/// mask compares for validation.
///
/// The head pass covers the first 16 pairs and the tail pass the last 16; the overlapping
/// middle is decoded and written twice with identical results.
///
/// # Errors
///
/// Returns `InvalidInput` (before anything is written) if any character is not a
/// hexadecimal digit.
///
/// # Safety
///
/// The CPU must support AVX-512F, AVX-512BW and AVX-512VL, `src` must be valid for reads of
/// all of its pairs and `dst` must be valid for writes of `src.len()` bytes. `src.len()`
/// must lie in `16..=32` (only checked by `debug_assert!`).
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
unsafe fn decode_avx512_unchecked_v256_overlapped(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_MM256_X1: usize = size_of::<__m256i>() / 2;
    const BATCH_MM512_X1: usize = size_of::<__m512i>() / 2;
    debug_assert!(src.len() >= BATCH_MM256_X1);
    debug_assert!(src.len() <= BATCH_MM512_X1);

    let digit_bias = _mm256_set1_epi8((0xFF_u8 - b'9').cast_signed());
    let six = _mm256_set1_epi8(0x06);
    let digit_rebase = _mm256_set1_epi8(0xF0_u8.cast_signed());
    let case_fold = _mm256_set1_epi8(0xDF_u8.cast_signed());
    let letter_a = _mm256_set1_epi8(b'A'.cast_signed());
    let ten = _mm256_set1_epi8(0x0A);
    let max_nibble = _mm256_set1_epi8(0x0F);

    // Maps characters to nibble values; anything that is not a hex digit becomes > 15.
    macro_rules! to_nibbles {
        ($text:expr) => {{
            let text = $text;
            let digit = _mm256_sub_epi8(
                _mm256_subs_epu8(_mm256_add_epi8(text, digit_bias), six),
                digit_rebase,
            );
            let letter = _mm256_adds_epu8(
                _mm256_sub_epi8(_mm256_and_si256(text, case_fold), letter_a),
                ten,
            );
            _mm256_min_epu8(digit, letter)
        }};
    }

    let pairs = src.len();
    let head_text = _mm256_loadu_si256(src.cast::<__m256i>());
    let tail_text = _mm256_loadu_si256(
        src.cast::<[u8; 2]>()
            .add(pairs)
            .sub(BATCH_MM256_X1)
            .cast::<__m256i>(),
    );

    let head = to_nibbles!(head_text);
    let tail = to_nibbles!(tail_text);

    // A value above 15 has a bit set in its upper nibble, which survives the OR, so one
    // compare covers both passes.
    let out_of_range = _mm256_cmpgt_epu8_mask(_mm256_or_si256(head, tail), max_nibble);
    if out_of_range != 0 {
        return Err(InvalidInput);
    }

    // Per pair: first nibble * 16 + second nibble, then truncate each 16-bit lane to a byte.
    let weights = _mm256_set1_epi16(0x0110);
    let head_bytes = _mm256_cvtepi16_epi8(_mm256_maddubs_epi16(head, weights));
    let tail_bytes = _mm256_cvtepi16_epi8(_mm256_maddubs_epi16(tail, weights));

    _mm_storeu_si128(dst.cast::<__m128i>(), head_bytes);
    _mm_storeu_si128(
        dst.cast::<MaybeUninit<u8>>()
            .add(pairs)
            .sub(BATCH_MM256_X1)
            .cast::<__m128i>(),
        tail_bytes,
    );
    Ok(())
}
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
unsafe fn decode_avx512_unchecked_v512_exact(
src: *const [[u8; 2]],
dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
#[allow(clippy::identity_op, reason = "XXX")]
const BATCH_MM512_X1: usize = size_of::<__m512i>() / 2 * 1;
debug_assert_eq!(src.len(), BATCH_MM512_X1);
let n_c6 = _mm512_set1_epi8((0xFF_u8 - b'9').cast_signed());
let n_06 = _mm512_set1_epi8(0x06);
let n_f0 = _mm512_set1_epi8(0xF0_u8.cast_signed());
let n_df = _mm512_set1_epi8(0xDF_u8.cast_signed());
let u_a = _mm512_set1_epi8(b'A'.cast_signed());
let n_0a = _mm512_set1_epi8(0x0A);
let n_0f = _mm512_set1_epi8(0x0F);
let chunk0 = _mm512_loadu_si512(src.cast::<__m512i>());
macro_rules! decode512 {
($chunk:expr) => {{
let d = _mm512_sub_epi8(_mm512_subs_epu8(_mm512_add_epi8($chunk, n_c6), n_06), n_f0);
let a = _mm512_adds_epu8(_mm512_sub_epi8(_mm512_and_si512($chunk, n_df), u_a), n_0a);
_mm512_min_epu8(d, a)
}};
}
let n0 = decode512!(chunk0);
if (_mm512_cmpgt_epu8_mask(n0, n_0f)) != 0 {
return Err(InvalidInput);
}
let weights = _mm512_set1_epi16(0x0110);
let b0 = _mm512_cvtepi16_epi8(_mm512_maddubs_epi16(n0, weights));
_mm256_storeu_si256(dst.cast::<__m256i>(), b0);
Ok(())
}
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
unsafe fn decode_avx512_unchecked_v512_overlapped(
src: *const [[u8; 2]],
dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
#[allow(clippy::identity_op, reason = "XXX")]
const BATCH_MM512_X1: usize = size_of::<__m512i>() / 2 * 1;
const BATCH_MM512_X2: usize = size_of::<__m512i>() / 2 * 2;
debug_assert!(src.len() >= BATCH_MM512_X1);
debug_assert!(src.len() <= BATCH_MM512_X2);
let n_c6 = _mm512_set1_epi8((0xFF_u8 - b'9').cast_signed());
let n_06 = _mm512_set1_epi8(0x06);
let n_f0 = _mm512_set1_epi8(0xF0_u8.cast_signed());
let n_df = _mm512_set1_epi8(0xDF_u8.cast_signed());
let u_a = _mm512_set1_epi8(b'A'.cast_signed());
let n_0a = _mm512_set1_epi8(0x0A);
let n_0f = _mm512_set1_epi8(0x0F);
let chunk0 = _mm512_loadu_si512(src.cast::<__m512i>());
let chunk1 = _mm512_loadu_si512(
src.cast::<[u8; 2]>()
.add(src.len())
.cast::<__m512i>()
.sub(1),
);
macro_rules! decode512 {
($chunk:expr) => {{
let d = _mm512_sub_epi8(_mm512_subs_epu8(_mm512_add_epi8($chunk, n_c6), n_06), n_f0);
let a = _mm512_adds_epu8(_mm512_sub_epi8(_mm512_and_si512($chunk, n_df), u_a), n_0a);
_mm512_min_epu8(d, a)
}};
}
let n0 = decode512!(chunk0);
let n1 = decode512!(chunk1);
if (_mm512_cmpgt_epu8_mask(_mm512_or_si512(n0, n1), n_0f)) != 0 {
return Err(InvalidInput);
}
let weights = _mm512_set1_epi16(0x0110);
let b0 = _mm512_cvtepi16_epi8(_mm512_maddubs_epi16(n0, weights));
let b1 = _mm512_cvtepi16_epi8(_mm512_maddubs_epi16(n1, weights));
_mm256_storeu_si256(dst.cast::<__m256i>(), b0);
_mm256_storeu_si256(
dst.cast::<MaybeUninit<u8>>()
.add(src.len())
.cast::<__m256i>()
.sub(1),
b1,
);
Ok(())
}
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
unsafe fn decode_avx512_unchecked_v512x2_with_trailing(
src: *const [[u8; 2]],
dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
const BATCH_ELEMS_V512_X2: usize = size_of::<__m512i>() / 2 * 2;
debug_assert!(src.len() >= BATCH_ELEMS_V512_X2);
let n_c6 = _mm512_set1_epi8((0xFF_u8 - b'9').cast_signed());
let n_06 = _mm512_set1_epi8(0x06);
let n_f0 = _mm512_set1_epi8(0xF0_u8.cast_signed());
let n_df = _mm512_set1_epi8(0xDF_u8.cast_signed());
let u_a = _mm512_set1_epi8(b'A'.cast_signed());
let n_0a = _mm512_set1_epi8(0x0A);
let n_0f = _mm512_set1_epi8(0x0F);
let weights = _mm512_set1_epi16(0x0110);
let batches = src.len() / BATCH_ELEMS_V512_X2;
let remainder = src.len() % BATCH_ELEMS_V512_X2;
let mut i = 0;
let mut invalid = 0;
while i < batches {
let chunk0 = _mm512_loadu_si512(src.cast::<__m512i>().add(i * 2));
let chunk1 = _mm512_loadu_si512(src.cast::<__m512i>().add(i * 2 + 1));
macro_rules! decode512 {
($chunk:expr) => {{
let d =
_mm512_sub_epi8(_mm512_subs_epu8(_mm512_add_epi8($chunk, n_c6), n_06), n_f0);
let a =
_mm512_adds_epu8(_mm512_sub_epi8(_mm512_and_si512($chunk, n_df), u_a), n_0a);
_mm512_min_epu8(d, a)
}};
}
let n0 = decode512!(chunk0);
let n1 = decode512!(chunk1);
invalid |= _mm512_cmpgt_epu8_mask(_mm512_or_si512(n0, n1), n_0f);
let b0 = _mm512_cvtepi16_epi8(_mm512_maddubs_epi16(n0, weights));
let b1 = _mm512_cvtepi16_epi8(_mm512_maddubs_epi16(n1, weights));
_mm256_storeu_si256(dst.cast::<__m256i>().add(i * 2), b0);
_mm256_storeu_si256(dst.cast::<__m256i>().add(i * 2 + 1), b1);
i += 1;
}
if invalid != 0 {
return Err(InvalidInput);
}
let src = ptr::slice_from_raw_parts(
src.cast::<[u8; 2]>().add(batches * BATCH_ELEMS_V512_X2),
remainder,
);
let dst = ptr::slice_from_raw_parts_mut(
dst.cast::<MaybeUninit<u8>>()
.add(batches * BATCH_ELEMS_V512_X2),
remainder,
);
match remainder {
8 => decode_avx512_unchecked_v128_exact(src, dst),
16 => decode_avx512_unchecked_v256_exact(src, dst),
32 => decode_avx512_unchecked_v512_exact(src, dst),
..8 => decode_generic_unchecked::<false>(src, dst),
9..16 => decode_avx512_unchecked_v128_overlapped(src, dst),
17..32 => decode_avx512_unchecked_v256_overlapped(src, dst),
33..64 => decode_avx512_unchecked_v512_overlapped(src, dst),
64.. => hint::unreachable_unchecked(),
}
}
// Smoke tests for the SIMD backends defined in this file.
//
// Every test is marked `ignore` unless the instruction set it exercises is
// enabled at compile time (`target_feature = "..."`), so the suite still builds
// for targets that lack those features. The actual checks live in the shared
// helpers imported from `crate::backend::tests`.
#[cfg(test)]
mod smoking {
use super::*;
use crate::backend::tests::{
check_decode_validation_any_backend, check_encode_decode_any_backend,
};
// SSSE3 encoder, in both const-generic variants, paired first with the generic
// decoder and then with the SSSE3 decoder.
// NOTE(review): the const argument passed to the helper always matches the
// encoder's `UPPER` argument; presumably it selects the expected letter case.
#[test]
#[cfg_attr(not(target_feature = "ssse3"), ignore)]
fn test_encode_decode_ssse3() {
check_encode_decode_any_backend::<true>(
encode_ssse3_unchecked::<true>,
decode_generic_unchecked::<false>,
);
check_encode_decode_any_backend::<false>(
encode_ssse3_unchecked::<false>,
decode_generic_unchecked::<false>,
);
check_encode_decode_any_backend::<true>(
encode_ssse3_unchecked::<true>,
decode_ssse3_unchecked,
);
check_encode_decode_any_backend::<false>(
encode_ssse3_unchecked::<false>,
decode_ssse3_unchecked,
);
}
// AVX2 encoder, in both const-generic variants, paired first with the generic
// decoder and then with the AVX2 decoder.
#[test]
#[cfg_attr(not(target_feature = "avx2"), ignore)]
fn test_encode_decode_avx2() {
check_encode_decode_any_backend::<true>(
encode_avx2_unchecked::<true>,
decode_generic_unchecked::<false>,
);
check_encode_decode_any_backend::<false>(
encode_avx2_unchecked::<false>,
decode_generic_unchecked::<false>,
);
check_encode_decode_any_backend::<true>(
encode_avx2_unchecked::<true>,
decode_avx2_unchecked,
);
check_encode_decode_any_backend::<false>(
encode_avx2_unchecked::<false>,
decode_avx2_unchecked,
);
}
// AVX-512 encoder, in both const-generic variants, paired first with the
// generic decoder and then with the AVX-512 decoder. Runs only when all three
// of `avx512f`, `avx512bw` and `avx512vl` are enabled.
#[test]
#[cfg_attr(
not(all(
target_feature = "avx512f",
target_feature = "avx512bw",
target_feature = "avx512vl"
)),
ignore
)]
fn test_encode_decode_avx512() {
check_encode_decode_any_backend::<true>(
encode_avx512_unchecked::<true>,
decode_generic_unchecked::<false>,
);
check_encode_decode_any_backend::<false>(
encode_avx512_unchecked::<false>,
decode_generic_unchecked::<false>,
);
check_encode_decode_any_backend::<true>(
encode_avx512_unchecked::<true>,
decode_avx512_unchecked,
);
check_encode_decode_any_backend::<false>(
encode_avx512_unchecked::<false>,
decode_avx512_unchecked,
);
}
// Validation check for the SSSE3 decoder. Unlike the round-trip tests above,
// the validation tests are additionally ignored when running under Miri.
#[test]
#[cfg_attr(any(miri, not(target_feature = "ssse3")), ignore)]
fn test_decode_validation_ssse3() {
check_decode_validation_any_backend(decode_ssse3_unchecked);
}
// Validation check for the AVX2 decoder; ignored under Miri or without `avx2`.
#[test]
#[cfg_attr(any(miri, not(target_feature = "avx2")), ignore)]
fn test_decode_validation_avx2() {
check_decode_validation_any_backend(decode_avx2_unchecked);
}
// Validation check for the AVX-512 decoder; ignored under Miri or when any of
// `avx512f`, `avx512bw`, `avx512vl` is not enabled.
#[test]
#[cfg_attr(
any(
miri,
not(all(
target_feature = "avx512f",
target_feature = "avx512bw",
target_feature = "avx512vl"
))
),
ignore
)]
fn test_decode_validation_avx512() {
check_decode_validation_any_backend(decode_avx512_unchecked);
}
}