use core::arch::x86_64::*;
use crate::base32::config::Base32EncodeConfig;
use crate::base32::error::Base32Error;
/// Base32-encodes every complete 40-byte block of `src` into `dst` using AVX-512.
///
/// Each loop iteration consumes 40 input bytes (four 10-byte lanes) and emits
/// 64 output characters taken from the 32-entry `config.alphabet`. Input bytes
/// after the last complete 40-byte block are not touched by this function.
///
/// Returns `Ok(n)` where `n` is the number of bytes written to `dst`
/// (always a multiple of 64).
///
/// # Errors
///
/// This routine currently never returns `Err`; the `Result` is part of the
/// signature only.
///
/// # Safety
///
/// * The CPU must support `avx512f` and `avx512bw`.
/// * `config.alphabet` must expose at least 32 readable bytes (two 16-byte
///   table loads are issued from its pointer).
/// * `dst` must have room for `src.len() / 40 * 64` bytes; the stores are not
///   bounds-checked in release builds.
#[target_feature(enable = "avx512f,avx512bw")]
#[inline]
// The whole body is intrinsic calls; they rely on the enclosing `unsafe fn`.
#[allow(unsafe_op_in_unsafe_fn)]
pub(crate) unsafe fn avx512_encode_full_groups_into(
    config: &Base32EncodeConfig,
    dst: &mut [u8],
    src: &[u8],
) -> Result<usize, Base32Error> {
    debug_assert_eq!(src.len() % 5, 0);
    // The 64-byte stores below are raw pointer writes; catch short buffers early
    // in debug builds.
    debug_assert!(dst.len() >= src.len() / 40 * 64);

    // The alphabet is split into two 16-byte halves, each replicated to all lanes.
    let alphabet_ptr = config.alphabet.as_ptr();
    let lo_table = _mm512_broadcast_i32x4(_mm_loadu_si128(alphabet_ptr as *const __m128i));
    let hi_table = _mm512_broadcast_i32x4(_mm_loadu_si128(alphabet_ptr.add(16) as *const __m128i));

    // Builds eight big-endian 16-bit windows per lane: (b0,b1) (b1,b2) (b2,b3)
    // (b3,b4) for each of the two 5-byte groups. Only lane bytes 0..=9 are read.
    let spread_shuf =
        _mm512_broadcast_i32x4(_mm_set_epi8(8, 9, 7, 8, 6, 7, 5, 6, 3, 4, 2, 3, 1, 2, 0, 1));
    // `mulhi(w, 2^k)` is `w >> (16 - k)`; these pick the even-numbered symbols.
    let mul_first = _mm512_broadcast_i32x4(_mm_set_epi16(2048, 512, 128, 32, 2048, 512, 128, 32));
    // Same trick for the odd-numbered symbols 1, 3 and 5 of each group.
    let mul_second =
        _mm512_broadcast_i32x4(_mm_set_epi16(0, 16384, 4096, 1024, 0, 16384, 4096, 1024));
    // Symbol 7 is simply the low 5 bits of the last window of each group.
    let mask_c7 = _mm512_broadcast_i32x4(_mm_set_epi16(0x001F, 0, 0, 0, 0x001F, 0, 0, 0));
    let mask_5bit = _mm512_set1_epi16(0x001F);
    // After packing, even symbols sit in bytes 0..8 and odd ones in 8..16;
    // this shuffle restores output order.
    let interleave_shuf = _mm512_broadcast_i32x4(_mm_set_epi8(
        15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0,
    ));
    // Index biases for the two-table lookup (loop invariant).
    let lo_bias = _mm512_set1_epi8(0x70u8 as i8);
    let hi_bias = _mm512_set1_epi8(16);

    let mut src_offset = 0usize;
    let mut dst_offset = 0usize;
    while src_offset + 40 <= src.len() {
        let block = src.as_ptr().add(src_offset);
        // Lanes 0..=2 read 16 bytes ending at +16, +26 and +36, all inside the
        // 40-byte block.
        let l0 = _mm_loadu_si128(block as *const __m128i);
        let l1 = _mm_loadu_si128(block.add(10) as *const __m128i);
        let l2 = _mm_loadu_si128(block.add(20) as *const __m128i);
        // A 16-byte load at +30 would run 6 bytes past the block (and past the
        // end of `src` on the final block). Load the 16 bytes that end exactly
        // at +40 instead and shift the 10 payload bytes down to positions 0..=9.
        let l3 = _mm_srli_si128(_mm_loadu_si128(block.add(24) as *const __m128i), 6);

        let input = _mm512_broadcast_i32x4(l0);
        let input = _mm512_inserti32x4(input, l1, 1);
        let input = _mm512_inserti32x4(input, l2, 2);
        let input = _mm512_inserti32x4(input, l3, 3);

        let windows = _mm512_shuffle_epi8(input, spread_shuf);
        let first_idx = _mm512_and_si512(_mm512_mulhi_epu16(windows, mul_first), mask_5bit);
        let second_a = _mm512_and_si512(_mm512_mulhi_epu16(windows, mul_second), mask_5bit);
        let c7_vals = _mm512_and_si512(windows, mask_c7);
        let second_idx = _mm512_or_si512(second_a, c7_vals);
        let packed =
            _mm512_shuffle_epi8(_mm512_packus_epi16(first_idx, second_idx), interleave_shuf);

        // `pshufb` yields zero when the index byte has its top bit set:
        //   * values 0..=15  -> lo index 0x70..=0x7F (valid), hi index wraps to 0xF0.. (zero)
        //   * values 16..=31 -> lo index 0x80..=0x8F (zero),  hi index 0..=15 (valid)
        // so OR-ing both lookups selects the right alphabet half per byte.
        let lo_idx = _mm512_add_epi8(packed, lo_bias);
        let hi_idx = _mm512_sub_epi8(packed, hi_bias);
        let chars = _mm512_or_si512(
            _mm512_shuffle_epi8(lo_table, lo_idx),
            _mm512_shuffle_epi8(hi_table, hi_idx),
        );
        _mm512_storeu_si512(dst.as_mut_ptr().add(dst_offset) as *mut __m512i, chars);

        src_offset += 40;
        dst_offset += 64;
    }
    Ok(dst_offset)
}