use core::arch::x86_64::*;
use crate::base32::config::Base32EncodeConfig;
use crate::base32::error::Base32Error;
/// Encodes whole 10-byte blocks of `src` (two 5-byte Base32 groups each) into
/// 16 alphabet characters per block, written consecutively to `dst`.
///
/// Exactly `src.len() / 10` blocks are processed. When `src.len()` is an odd
/// multiple of 5, the trailing 5-byte group is NOT encoded here and is left
/// for the caller.
///
/// Returns the number of bytes written to `dst`, i.e. `16 * (src.len() / 10)`.
///
/// # Errors
///
/// This implementation never produces an `Err`; the `Result` return type is
/// kept for interface compatibility.
///
/// # Safety
///
/// * The executing CPU must support SSSE3.
/// * `config.alphabet` must expose at least 32 readable bytes: the table is
///   fetched with two unaligned 16-byte loads at offsets 0 and 16.
/// * `dst.len()` must be at least `16 * (src.len() / 10)`. Stores into `dst`
///   are not bounds-checked (the requirement is only asserted in debug builds).
#[target_feature(enable = "ssse3")]
#[inline]
// The body is a dense sequence of intrinsics and raw-pointer operations;
// wrapping each one in its own `unsafe` block would only add noise.
#[allow(unsafe_op_in_unsafe_fn)]
pub(crate) unsafe fn ssse3_encode_full_groups_into(
    config: &Base32EncodeConfig,
    dst: &mut [u8],
    src: &[u8],
) -> Result<usize, Base32Error> {
    /// Input bytes consumed per iteration (two 5-byte groups).
    const BLOCK_IN: usize = 10;
    /// Output characters produced per iteration.
    const BLOCK_OUT: usize = 16;
    /// Width of one SSE register load, in bytes.
    const LOAD_WIDTH: usize = 16;

    debug_assert_eq!(src.len() % 5, 0);
    debug_assert!(
        dst.len() >= src.len() / BLOCK_IN * BLOCK_OUT,
        "dst is too small for the encoded output"
    );

    // The 32-entry alphabet is split into two 16-byte shuffle tables.
    let alphabet_ptr = config.alphabet.as_ptr();
    let lo_table = _mm_loadu_si128(alphabet_ptr as *const __m128i);
    let hi_table = _mm_loadu_si128(alphabet_ptr.add(16) as *const __m128i);

    // Builds eight big-endian 16-bit windows (b0b1, b1b2, b2b3, b3b4 for each
    // 5-byte group). Only input bytes 0..=9 are ever selected.
    let spread_shuf = _mm_set_epi8(8, 9, 7, 8, 6, 7, 5, 6, 3, 4, 2, 3, 1, 2, 0, 1);
    // `mulhi_epu16` by 2^k is a right shift by (16 - k); these select the
    // even-numbered symbols c0, c2, c4, c6 (shifts 11, 9, 7, 5).
    let mul_first = _mm_set_epi16(2048, 512, 128, 32, 2048, 512, 128, 32);
    // Odd-numbered symbols c1, c3, c5 (shifts 6, 4, 2); the c7 lane is zeroed
    // here and filled in from the low 5 bits of the b3b4 window instead.
    let mul_second = _mm_set_epi16(0, 16384, 4096, 1024, 0, 16384, 4096, 1024);
    let mask_c7 = _mm_set_epi16(0x001F, 0, 0, 0, 0x001F, 0, 0, 0);
    let mask_5bit = _mm_set1_epi16(0x001F);
    // After packing, bytes are [even symbols | odd symbols]; interleave them
    // back into c0, c1, c2, ... order.
    let interleave_shuf = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
    // `pshufb` yields 0 for any index byte with bit 7 set. Adding 0x70 sets
    // bit 7 for indices 16..=31 (disabling them in the low table); subtracting
    // 16 sets bit 7 for indices 0..=15 (disabling them in the high table).
    let lo_bias = _mm_set1_epi8(0x70u8 as i8);
    let hi_bias = _mm_set1_epi8(16);

    let mut src_offset = 0usize;
    let mut dst_offset = 0usize;
    while src.len() - src_offset >= BLOCK_IN {
        let input = if src.len() - src_offset >= LOAD_WIDTH {
            // A full 16-byte load stays inside `src`.
            _mm_loadu_si128(src.as_ptr().add(src_offset) as *const __m128i)
        } else {
            // Fewer than 16 bytes remain: a direct load would read past the
            // end of `src`. Stage the 10 bytes we need in a zeroed buffer;
            // the padding bytes are never selected by `spread_shuf`.
            let mut staged = [0u8; LOAD_WIDTH];
            staged[..BLOCK_IN].copy_from_slice(&src[src_offset..src_offset + BLOCK_IN]);
            _mm_loadu_si128(staged.as_ptr() as *const __m128i)
        };

        let windows = _mm_shuffle_epi8(input, spread_shuf);
        let first_idx = _mm_and_si128(_mm_mulhi_epu16(windows, mul_first), mask_5bit);
        let second_a = _mm_and_si128(_mm_mulhi_epu16(windows, mul_second), mask_5bit);
        let c7_vals = _mm_and_si128(windows, mask_c7);
        let second_idx = _mm_or_si128(second_a, c7_vals);
        let packed = _mm_shuffle_epi8(_mm_packus_epi16(first_idx, second_idx), interleave_shuf);

        // Exactly one of the two lookups is non-zero per lane, so OR merges them.
        let lo_idx = _mm_add_epi8(packed, lo_bias);
        let hi_idx = _mm_sub_epi8(packed, hi_bias);
        let chars = _mm_or_si128(
            _mm_shuffle_epi8(lo_table, lo_idx),
            _mm_shuffle_epi8(hi_table, hi_idx),
        );

        _mm_storeu_si128(dst.as_mut_ptr().add(dst_offset) as *mut __m128i, chars);
        src_offset += BLOCK_IN;
        dst_offset += BLOCK_OUT;
    }
    Ok(dst_offset)
}