#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::avx2::*;
/// Translates 32 ASCII bytes into base64 6-bit values and classifies every byte.
///
/// Both the standard (`+`, `/`) and the URL-safe (`-`, `_`) alphabets are accepted.
///
/// Returns `(values, invalid_mask, valid_mask)`:
///
/// * `values`: the per-byte decoded value. Only bytes whose bit is set in
///   `valid_mask` carry a meaningful value; the other bytes are garbage.
/// * `invalid_mask`: bit `i` is set when byte `i` of `input` is neither an
///   alphabet character nor one of the tolerated whitespace bytes
///   (tab, LF, FF, CR, space). Every byte >= 0x80 is flagged.
/// * `valid_mask`: bit `i` is set when byte `i` of `input` is an alphabet
///   character (whitespace is not included).
///
/// # Safety
///
/// Must only be executed on a CPU supporting the target features enabled below.
#[target_feature(enable = "avx2,bmi1,sse4.2,popcnt")]
unsafe fn decode_avx2(input: __m256i) -> (__m256i, u32, u32) {
// Step 0: split every input byte into its high and low nibble. The 16-bit
// shift drags bits in from the neighbouring byte, hence the 0x0f mask.
let higher_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), _mm256_set1_epi8(0x0f));
let lower_nibble = _mm256_and_si256(input, _mm256_set1_epi8(0x0f));
// Step 1: find bytes that are neither alphabet characters nor whitespace.
//
// `row_lut` is a 16x8 bit table indexed by the LOW nibble; bit `h` of entry
// `l` is set when the byte `(h << 4) | l` is acceptable. For example entry 0
// (0b1010_1100) accepts ' ' (0x20), '0' (0x30), 'P' (0x50) and 'p' (0x70).
// NOTE(review): `dup_mm_setr_epu8` comes from `crate::avx2`; presumably it
// replicates the 16-byte table into both 128-bit lanes — confirm.
#[rustfmt::skip]
let row_lut = dup_mm_setr_epu8([
0b1010_1100, 0b1111_1000, 0b1111_1000, 0b1111_1000,
0b1111_1000, 0b1111_1000, 0b1111_1000, 0b1111_1000,
0b1111_1000, 0b1111_1001, 0b1111_0001, 0b0101_0100,
0b0101_0001, 0b0101_0101, 0b0101_0000, 0b0111_0100,
]);
// `column_lut` maps the HIGH nibble `h` to the single-bit mask `1 << h`.
// High nibbles 8..=15 map to 0, so non-ASCII bytes never match.
#[rustfmt::skip]
let column_lut = dup_mm_setr_epu8([
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0, 0, 0, 0, 0, 0, 0, 0,
]);
// Fetch the table row for each byte and test the bit picked by its column.
let row = _mm256_shuffle_epi8(row_lut, lower_nibble);
let column = _mm256_shuffle_epi8(column_lut, higher_nibble);
let valid = _mm256_and_si256(row, column);
// A zero result means the bit was clear: the byte is not acceptable.
let non_match = _mm256_cmpeq_epi8(valid, _mm256_setzero_si256());
// Collapse the per-byte 0x00/0xff flags into one bit per byte.
let invalid_mask = _mm256_movemask_epi8(non_match);
// Step 2: digits and letters. The offset added to the ASCII code depends only
// on the high nibble:
//   0x3_ ('0'..='9'): +4  -> 52..=61
//   0x4_/0x5_ ('A'..='Z'): -65 -> 0..=25
//   0x6_/0x7_ ('a'..='z'): -71 -> 26..=51
#[rustfmt::skip]
let shift_lut = dup_mm_setr_epi8([
0, 0, 0,
4,
-65, -65,
-71, -71,
0, 0, 0, 0, 0, 0, 0, 0,
]);
let shift = _mm256_shuffle_epi8(shift_lut, higher_nibble);
let shifted = _mm256_add_epi8(input, shift);
// Step 3: special characters '+', '-', '/' (high nibble 2) and '_' (high
// nibble 5). `spcrange_lut` flags exactly those two high nibbles.
#[rustfmt::skip]
let spcrange_lut = dup_mm_setr_epu8([
0, 0, 0xff, 0, 0, 0xff, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
]);
// Indexed by `low nibble - high nibble`: '+' -> 9, '_' -> 10, '-' -> 11,
// '/' -> 13. The values are stored complemented (`!62`, `!63`) so that their
// top bit is set and they can double as the blend selector below, while all
// other entries stay 0.
#[rustfmt::skip]
let spcchar_lut = dup_mm_setr_epu8([
0, 0, 0, 0, 0, 0, 0, 0,
0, !62, !63, !62, 0, !63, 0, 0,
]);
let sel_range = _mm256_shuffle_epi8(spcrange_lut, higher_nibble);
// A negative difference has its top bit set, which makes the shuffle return 0.
let lo_sub_hi = _mm256_sub_epi8(lower_nibble, higher_nibble);
let specials = _mm256_shuffle_epi8(spcchar_lut, lo_sub_hi);
// Non-zero (top bit set) only for bytes in a special range that hit a
// non-zero `spcchar_lut` entry.
let sel_spec = _mm256_and_si256(sel_range, specials);
// Where selected, replace the shifted value with the un-complemented special
// value (62 or 63).
// NOTE(review): `_mm256_not_si256` is a `crate::avx2` helper, presumably a
// bitwise NOT — confirm.
let result = _mm256_blendv_epi8(shifted, _mm256_not_si256(specials), sel_spec);
// Step 4: build the mask of alphabet (non-whitespace) bytes. The set is given
// as seven inclusive (low, high) range pairs; the trailing `0, 0` is padding
// that lies beyond the explicit length of 14 passed to `_mm_cmpestrm`.
#[rustfmt::skip]
let valid_nonws_set = _mm_setr_epi8(
b'A' as _, b'Z' as _,
b'a' as _, b'z' as _,
b'0' as _, b'9' as _,
b'+' as _, b'+' as _,
b'/' as _, b'/' as _,
b'-' as _, b'-' as _,
b'_' as _, b'_' as _,
0, 0,
);
// The string-compare instruction works on 128 bits, so handle each half of
// the input separately.
let lane0 = _mm256_extracti128_si256(input, 0);
let lane1 = _mm256_extracti128_si256(input, 1);
// Unsigned bytes, range comparison, result delivered as a bit mask.
const CMP_FLAGS: i32 = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_BIT_MASK;
let mask0 = _mm_cmpestrm(valid_nonws_set, 14, lane0, 16, CMP_FLAGS);
let mask1 = _mm_cmpestrm(valid_nonws_set, 14, lane1, 16, CMP_FLAGS);
// Each half yields a 16-bit mask in the low word of the result register.
let first = _mm_extract_epi16(mask0, 0) as u16;
let second = _mm_extract_epi16(mask1, 0) as u16;
// Low lane occupies bits 0..16, high lane bits 16..32.
let valid_mask = first as u32 + ((second as u32) << 16);
(result, invalid_mask as _, valid_mask as _)
}
/// Decodes one 32-byte block of ASCII base64 text in place.
///
/// The block is run through `decode_avx2`; the decoded values of all alphabet
/// characters are then written, in their original order, to the front of
/// `block`. Bytes past the reported `out_length` are left untouched.
///
/// The returned `BlockResult` holds:
///
/// * `out_length`: how many alphabet (non-whitespace) characters the block held,
///   i.e. how many values were written to the front of `block`;
/// * `first_invalid`: position of the first byte flagged invalid by
///   `decode_avx2`, or `None` when no byte was flagged.
///
/// # Safety
///
/// Must only be executed on a CPU supporting the target features enabled below.
#[target_feature(enable = "avx2,bmi1,sse4.2,popcnt")]
unsafe fn decode_block(block: &mut <Avx2 as super::Decoder>::Block) -> super::BlockResult {
    let (values, invalid_mask, valid_mask) = decode_avx2(array_as_m256i(*block));
    let values = m256i_as_array(values);

    // The lowest set bit marks the first offending byte; no set bit means the
    // whole block was acceptable.
    let first_invalid = if invalid_mask == 0 {
        None
    } else {
        Some(invalid_mask.trailing_zeros() as _)
    };

    // Keep only the values whose bit is set in `valid_mask`, preserving order,
    // and lay them out contiguously from the start of the block.
    let kept = values
        .iter()
        .enumerate()
        .filter(|&(position, _)| ((valid_mask >> position) & 1) == 1)
        .map(|(_, &value)| value);
    for (slot, value) in block.iter_mut().zip(kept) {
        *slot = value;
    }

    super::BlockResult {
        out_length: valid_mask.count_ones() as _,
        first_invalid,
    }
}
/// Packs the 32 values in `input` into 24 bytes at the start of `output`.
///
/// NOTE(review): each input byte is presumably a 6-bit value (0..=63) as
/// produced by the decoder — confirm against the caller.
///
/// `output` is written with two overlapping 16-byte stores (at offsets 0 and
/// 12), so bytes `0..28` are touched; the last four carry no data.
///
/// # Panics
///
/// Panics if `output.len()` differs from `<Avx2 as super::Packer>::OUT_BUF_LEN`.
///
/// # Safety
///
/// Must only be executed on a CPU supporting the target features enabled below.
#[target_feature(enable = "avx2,bmi1,sse4.2,popcnt")]
unsafe fn pack_block(input: &<Avx2 as super::Packer>::Input, output: &mut [u8]) {
    // The raw stores below rely on this exact length.
    assert_eq!(output.len(), <Avx2 as super::Packer>::OUT_BUF_LEN);

    // Per 16-bit lane the multiplier bytes are (0x40, 0x01): first * 64 + second.
    let pair_weights = _mm256_set1_epi16(0x0140);
    // Per 32-bit lane the multiplier words are (0x1000, 0x0001): first * 4096 + second.
    let quad_weights = _mm256_set1_epi32(0x00011000);
    // Emits the three low bytes of every 32-bit lane in reversed order and
    // zero-fills the last four bytes (an index with the top bit set yields 0).
    // NOTE(review): `dup_mm_setr_epu8` is a `crate::avx2` helper, presumably
    // replicating the pattern into both 128-bit lanes — confirm.
    #[rustfmt::skip]
    let byte_order = dup_mm_setr_epu8([
        2, 1, 0,
        6, 5, 4,
        10, 9, 8,
        14, 13, 12,
        0xff, 0xff, 0xff, 0xff,
    ]);

    let sextets = array_as_m256i(*input);
    // 32 bytes -> 16 combined values, one per 16-bit lane.
    let pairs = _mm256_maddubs_epi16(sextets, pair_weights);
    // 16 values -> 8 combined values, one per 32-bit lane.
    let quads = _mm256_madd_epi16(pairs, quad_weights);
    // 8 values -> 12 contiguous bytes at the start of each 128-bit lane.
    let bytes = _mm256_shuffle_epi8(quads, byte_order);

    let dst = output.as_mut_ptr();
    // Low lane lands at bytes 0..16 (12 data bytes followed by 4 zero bytes).
    _mm_storeu_si128(dst as _, _mm256_extracti128_si256(bytes, 0));
    // High lane lands at bytes 12..28, overwriting the low lane's zero tail.
    _mm_storeu_si128(dst.add(12) as _, _mm256_extracti128_si256(bytes, 1));
}
/// Zero-sized handle selecting the AVX2 code paths defined in this file.
///
/// The only field is private, so code outside this module cannot build a value
/// with a struct literal and has to go through a constructor such as
/// `Avx2::new`.
#[derive(Copy, Clone)]
pub(super) struct Avx2 {
// Carries no data; exists solely to keep the struct literal private.
_private: (),
}
impl Avx2 {
    /// Creates the handle for the AVX2 implementation.
    ///
    /// # Safety
    ///
    /// This function is compiled with the `avx2`, `bmi1`, `sse4.2` and `popcnt`
    /// target features enabled, so it must only be called on a CPU that
    /// supports all of them.
    #[target_feature(enable = "avx2,bmi1,sse4.2,popcnt")]
    pub(super) unsafe fn new() -> Self {
        Self { _private: () }
    }
}
/// `super::Decoder` implementation that forwards to the AVX2 routines in this file.
impl super::Decoder for Avx2 {
/// One block is 32 bytes, i.e. one 256-bit vector.
type Block = [u8; 32];
/// Decodes `block` in place by delegating to the free function `decode_block`.
#[inline]
fn decode_block(self, block: &mut Self::Block) -> super::BlockResult {
// SAFETY: NOTE(review): relies on `self` having been obtained from the unsafe
// `Avx2::new`, whose caller is presumably responsible for verifying CPU
// support for the required target features — confirm.
unsafe { decode_block(block) }
}
/// Returns a block filled with ASCII spaces. `decode_avx2` in this file treats
/// a space as tolerated whitespace: it is neither flagged invalid nor counted
/// as an alphabet character.
#[inline(always)]
fn zero_block() -> Self::Block {
[b' '; 32]
}
}
/// `super::Packer` implementation that forwards to the AVX2 routine in this file.
impl super::Packer for Avx2 {
/// 32 input bytes, i.e. one 256-bit vector.
type Input = [u8; 32];
/// `pack_block` in this file issues two 16-byte stores at offsets 0 and 12, so
/// it touches bytes `0..28` of the output buffer.
const OUT_BUF_LEN: usize = 28;
/// Packs `input` into `output` by delegating to the free function `pack_block`,
/// which asserts that `output.len()` equals `OUT_BUF_LEN`.
fn pack_block(self, input: &Self::Input, output: &mut [u8]) {
// SAFETY: NOTE(review): relies on `self` having been obtained from the unsafe
// `Avx2::new`, whose caller is presumably responsible for verifying CPU
// support for the required target features — confirm.
unsafe { pack_block(input, output) }
}
}