use crate::encode_default;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
/// Number of input bytes consumed per vector iteration: the size in bytes of
/// one `__m128i` register.
const CHUNK_SIZE: usize = core::mem::size_of::<__m128i>();
// Declares the `cpuid_ssse3` module; `cpuid_ssse3::get()` is queried by
// `_encode` to decide at runtime whether the SSE2 + SSSE3 code path may be used.
cpufeatures::new!(cpuid_ssse3, "sse2", "ssse3");
/// Hex-encodes `input` into `output`, mapping every nibble through the
/// 16-entry lookup `table` (high nibble first, then low nibble).
///
/// Inputs shorter than one [`CHUNK_SIZE`] chunk, and CPUs for which the
/// runtime `cpuid_ssse3` check fails, are handed to `encode_default` unchanged.
/// Otherwise the input is processed 16 bytes at a time with SSSE3 byte
/// shuffles, and any trailing bytes are finished by `encode_default`.
///
/// # Safety
///
/// The vector path writes through raw pointers and unchecked slice indexing.
/// `output` must be able to hold two bytes for every input byte consumed by
/// the vector loop, i.e. at least `2 * (input.len() - input.len() % 16)`
/// bytes. The trailing bytes are written by `encode_default` into the rest of
/// `output`, so that function's own requirements apply to the remainder.
pub(super) unsafe fn _encode(input: &[u8], output: &mut [u8], table: &[u8; 16]) {
    /// Vector loop, compiled with SSSE3 enabled so that the intrinsics can be
    /// inlined even when the crate as a whole is built without `+ssse3`.
    ///
    /// # Safety
    ///
    /// The CPU must support SSSE3, and `output` must satisfy the size
    /// requirement documented on the enclosing function.
    #[target_feature(enable = "ssse3")]
    unsafe fn encode_ssse3(input: &[u8], output: &mut [u8], table: &[u8; 16]) {
        // The whole table fits in one register and acts as the shuffle source.
        let hex_table = _mm_loadu_si128(table.as_ptr().cast());
        let mask_lo = _mm_set1_epi8(0x0F);
        // 0xF0 does not fit in `i8`; only the bit pattern matters here.
        #[allow(clippy::cast_possible_wrap)]
        let mask_hi = _mm_set1_epi8(0xF0u8 as i8);

        let input_chunks = input.chunks_exact(CHUNK_SIZE);
        let input_remainder = input_chunks.remainder();

        // Catch undersized buffers in debug builds before any unchecked write.
        debug_assert!(
            output.len() / 2 >= input.len() - input_remainder.len(),
            "output buffer too small for the vectorized hex encoding"
        );

        let mut i = 0;
        for input_chunk in input_chunks {
            let input_bytes = _mm_loadu_si128(input_chunk.as_ptr().cast());

            // Split every byte into its two nibbles. There is no per-byte
            // shift, so the high nibbles are masked first: the 32-bit shift
            // then cannot drag bits from a neighbouring byte into the result.
            let mut lo = _mm_and_si128(input_bytes, mask_lo);
            let mut hi = _mm_srli_epi32::<4>(_mm_and_si128(input_bytes, mask_hi));

            // Each nibble (0..=15) selects the matching byte of the table.
            lo = _mm_shuffle_epi8(hex_table, lo);
            hi = _mm_shuffle_epi8(hex_table, hi);

            // Interleave as hi0, lo0, hi1, lo1, ...: 16 input bytes become
            // two 16-byte output vectors.
            let hex_lo = _mm_unpacklo_epi8(hi, lo);
            let hex_hi = _mm_unpackhi_epi8(hi, lo);

            // SAFETY (caller contract): `output` has room for both stores;
            // `i` stays within the buffer, so the additions cannot overflow.
            let ptr = output.as_mut_ptr().add(i);
            i = i.checked_add(CHUNK_SIZE).unwrap_unchecked();
            _mm_storeu_si128(ptr.cast(), hex_lo);
            let ptr = output.as_mut_ptr().add(i);
            i = i.checked_add(CHUNK_SIZE).unwrap_unchecked();
            _mm_storeu_si128(ptr.cast(), hex_hi);
        }

        // Fewer than `CHUNK_SIZE` bytes may be left; finish them with the
        // fallback encoder, writing right after the vectorized output.
        if !input_remainder.is_empty() {
            encode_default(input_remainder, output.get_unchecked_mut(i..), table);
        }
    }

    if input.len() < CHUNK_SIZE || !cpuid_ssse3::get() {
        return encode_default(input, output, table);
    }

    // SAFETY: SSSE3 support was verified at runtime just above; the buffer
    // size requirement is forwarded from this function's own contract.
    encode_ssse3(input, output, table);
}