use core::arch::x86_64::*;
use super::super::decode_full_group_into::decode_full_group_into;
use crate::base32::config::Base32DecodeConfig;
use crate::base32::error::Base32Error;
/// Decodes full 32-character base32 chunks from `src` into `dst` using AVX2.
///
/// `src` is consumed 32 bytes (four 8-character groups) at a time. A chunk
/// is decoded entirely with SIMD when every byte is ASCII and the decode
/// table accepts it; otherwise the chunk is retried one 8-character group at
/// a time through the scalar [`decode_full_group_into`] path, which reports
/// the exact offending position. Returns the total number of bytes written.
///
/// # Errors
///
/// Propagates any [`Base32Error`] produced by the scalar fallback for a
/// chunk containing an invalid character.
///
/// # Safety
///
/// The caller must guarantee that AVX2 is available on the running CPU, and
/// that `dst` is large enough for the decoded output (5 bytes per 8 input
/// characters): the fast path stores 20 bytes per chunk via raw pointers
/// without bounds checks.
#[target_feature(enable = "avx2")]
#[inline]
#[allow(unsafe_op_in_unsafe_fn)]
pub(crate) unsafe fn avx2_decode_full_groups_into(
    config: &Base32DecodeConfig,
    dst: &mut [u8],
    src: &[u8],
) -> Result<usize, Base32Error> {
    debug_assert_eq!(src.len() % 32, 0);
    // The decode table is viewed as eight 16-byte rows, one per value of an
    // ASCII byte's upper nibble (0x00, 0x10, ..., 0x70). Each row is
    // broadcast to both 128-bit lanes so `_mm256_shuffle_epi8`, which
    // indexes within each lane, can look up the low nibble.
    let table_ptr = config.decode_table.as_ptr() as *const __m128i;
    let tbl0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(table_ptr));
    let tbl1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(table_ptr.add(1)));
    let tbl2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(table_ptr.add(2)));
    let tbl3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(table_ptr.add(3)));
    let tbl4 = _mm256_broadcastsi128_si256(_mm_loadu_si128(table_ptr.add(4)));
    let tbl5 = _mm256_broadcastsi128_si256(_mm_loadu_si128(table_ptr.add(5)));
    let tbl6 = _mm256_broadcastsi128_si256(_mm_loadu_si128(table_ptr.add(6)));
    let tbl7 = _mm256_broadcastsi128_si256(_mm_loadu_si128(table_ptr.add(7)));
    // Shuffle mask emitting the five significant bytes of each 64-bit lane
    // in big-endian order: output bytes 0..=4 take lane bytes 4,3,2,1,0 and
    // bytes 5..=9 take lane bytes 12,11,10,9,8; the rest are zeroed
    // (10 meaningful output bytes per 128-bit lane).
    let pack_shuf_128 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4);
    let pack_shuf = _mm256_broadcastsi128_si256(pack_shuf_128);
    let mut src_offset = 0usize;
    let mut dst_offset = 0usize;
    while src_offset + 32 <= src.len() {
        let input = _mm256_loadu_si256(src.as_ptr().add(src_offset) as *const __m256i);
        // `Some(values)` holds the chunk's 32 decoded symbol values; `None`
        // means the chunk contains a byte the SIMD path cannot handle and
        // must be decoded by the scalar fallback instead.
        let decoded = if _mm256_movemask_epi8(input) != 0 {
            // A set high bit marks a non-ASCII byte; the nibble-split table
            // lookup below only covers 0x00..=0x7F.
            None
        } else {
            // Select the table row matching each byte's upper nibble, look
            // up its low nibble in that row, and merge: exactly one `sel`
            // mask is all-ones per byte, so the OR tree keeps one result.
            let upper = _mm256_and_si256(input, _mm256_set1_epi8(0x70u8 as i8));
            let low = _mm256_and_si256(input, _mm256_set1_epi8(0x0F));
            let sel0 = _mm256_cmpeq_epi8(upper, _mm256_setzero_si256());
            let sel1 = _mm256_cmpeq_epi8(upper, _mm256_set1_epi8(0x10u8 as i8));
            let sel2 = _mm256_cmpeq_epi8(upper, _mm256_set1_epi8(0x20u8 as i8));
            let sel3 = _mm256_cmpeq_epi8(upper, _mm256_set1_epi8(0x30u8 as i8));
            let sel4 = _mm256_cmpeq_epi8(upper, _mm256_set1_epi8(0x40u8 as i8));
            let sel5 = _mm256_cmpeq_epi8(upper, _mm256_set1_epi8(0x50u8 as i8));
            let sel6 = _mm256_cmpeq_epi8(upper, _mm256_set1_epi8(0x60u8 as i8));
            let sel7 = _mm256_cmpeq_epi8(upper, _mm256_set1_epi8(0x70u8 as i8));
            let r0 = _mm256_and_si256(_mm256_shuffle_epi8(tbl0, low), sel0);
            let r1 = _mm256_and_si256(_mm256_shuffle_epi8(tbl1, low), sel1);
            let r2 = _mm256_and_si256(_mm256_shuffle_epi8(tbl2, low), sel2);
            let r3 = _mm256_and_si256(_mm256_shuffle_epi8(tbl3, low), sel3);
            let r4 = _mm256_and_si256(_mm256_shuffle_epi8(tbl4, low), sel4);
            let r5 = _mm256_and_si256(_mm256_shuffle_epi8(tbl5, low), sel5);
            let r6 = _mm256_and_si256(_mm256_shuffle_epi8(tbl6, low), sel6);
            let r7 = _mm256_and_si256(_mm256_shuffle_epi8(tbl7, low), sel7);
            let candidate = _mm256_or_si256(
                _mm256_or_si256(_mm256_or_si256(r0, r1), _mm256_or_si256(r2, r3)),
                _mm256_or_si256(_mm256_or_si256(r4, r5), _mm256_or_si256(r6, r7)),
            );
            // Any 0xFF lookup result sends the whole chunk to the scalar
            // path, which re-validates it and reports the exact position.
            if _mm256_movemask_epi8(_mm256_cmpeq_epi8(candidate, _mm256_set1_epi8(-1))) != 0 {
                None
            } else {
                Some(candidate)
            }
        };
        match decoded {
            None => {
                // Scalar fallback: decode the four 8-character groups one at
                // a time, accumulating the bytes each group writes.
                let mut written = 0usize;
                for group_offset in (0..32usize).step_by(8) {
                    written += decode_full_group_into(
                        config,
                        &src[src_offset + group_offset..src_offset + group_offset + 8],
                        src_offset + group_offset,
                        dst,
                        dst_offset + written,
                    )?;
                }
                dst_offset += written;
                src_offset += 32;
            }
            Some(decoded) => {
                // Pack 32 symbol values (5 bits each, per the validated
                // table lookup) into 20 output bytes.
                // maddubs: each byte pair (c0, c1) -> c0 * 32 + c1, one
                // 10-bit value per 16-bit lane (c0 is the earlier input
                // character and carries the more significant bits).
                let t0 = _mm256_maddubs_epi16(decoded, _mm256_set1_epi16(0x0120));
                // madd: each pair of 10-bit values -> (p0 << 10) | p1, one
                // 20-bit value per 32-bit lane.
                let t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00010400u32 as i32));
                // Merge the two 20-bit halves of each 64-bit lane into its
                // low 40 bits; bits above 40 are garbage that the pack
                // shuffle never selects.
                let combined = _mm256_or_si256(_mm256_slli_epi64(t1, 20), _mm256_srli_epi64(t1, 32));
                let packed = _mm256_shuffle_epi8(combined, pack_shuf);
                // Each 128-bit lane now holds 10 output bytes; store them as
                // an unaligned 8-byte write plus a 2-byte tail, 20 bytes in
                // total per chunk.
                debug_assert!(dst.len() >= dst_offset + 20);
                let out_ptr = dst.as_mut_ptr().add(dst_offset);
                let lo = _mm256_castsi256_si128(packed);
                _mm_storel_epi64(out_ptr as *mut __m128i, lo);
                let lo_shifted = _mm_srli_si128(lo, 8);
                let lo_word = _mm_cvtsi128_si32(lo_shifted) as u16;
                core::ptr::write_unaligned(out_ptr.add(8) as *mut u16, lo_word);
                let hi = _mm256_extracti128_si256(packed, 1);
                _mm_storel_epi64(out_ptr.add(10) as *mut __m128i, hi);
                let hi_shifted = _mm_srli_si128(hi, 8);
                let hi_word = _mm_cvtsi128_si32(hi_shifted) as u16;
                core::ptr::write_unaligned(out_ptr.add(18) as *mut u16, hi_word);
                src_offset += 32;
                dst_offset += 20;
            }
        }
    }
    Ok(dst_offset)
}