// fashex 0.0.12
//
// Hexadecimal string encoding and decoding with best-effort SIMD acceleration.
// Documentation
//! Optimized implementations based on unstable [`core::simd`] APIs.

#![allow(clippy::similar_names, reason = "XXX")]

use core::mem::MaybeUninit;
use core::simd::prelude::*;
use core::{ptr, slice};

use crate::backend::generic::{decode_generic_unchecked, encode_generic_unchecked};
use crate::error::InvalidInput;
use crate::util::lut16;

// NOTE(review): the `dead_code` allowance presumably exists because the listed
// architectures use dedicated backends instead of this portable one — confirm
// and replace the placeholder `reason`.
#[cfg_attr(
    any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "aarch64",
        target_arch = "arm64ec",
        target_arch = "loongarch32",
        target_arch = "loongarch64",
        target_arch = "wasm32"
    ),
    allow(dead_code, reason = "XXX")
)]
/// Hex-encodes `src` into `dst` with 128-bit portable SIMD vectors.
///
/// Input is consumed 16 bytes at a time; every input byte produces one
/// two-byte entry of `dst` (high-nibble digit first, then low-nibble digit).
/// Whatever does not fill a whole vector, or an input shorter than one vector,
/// is handed to [`encode_generic_unchecked`].
///
/// `UPPER` is forwarded to `lut16` and to the generic encoder (presumably it
/// selects the upper-case digit alphabet — confirm against `lut16`).
///
/// ## Safety
///
/// We assume that:
///
/// 1. `src.len() <= dst.len()`.
pub(crate) unsafe fn encode_simd128_unchecked<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    #[allow(clippy::identity_op, reason = "XXX")]
    /// Input bytes consumed per vector step (one `u8x16`); each step emits
    /// twice as many output bytes.
    const BATCH_ELEMS_V128_X1: usize = size_of::<u8x16>() * 1;

    debug_assert!(src.len() <= dst.len());

    // Not even one full vector: the scalar path handles everything.
    if src.len() < BATCH_ELEMS_V128_X1 {
        encode_generic_unchecked::<UPPER>(src, dst);
        return;
    }

    let low_nibble = u8x16::splat(0b_0000_1111);
    let digits = u8x16::from_slice(lut16::<UPPER>());

    let blocks = src.chunks_exact(BATCH_ELEMS_V128_X1);
    let tail = blocks.remainder();
    // Number of input bytes (== output entries) covered by the vector loop.
    let vectorized = src.len() - tail.len();

    // Take the output base pointer once; `dst` itself is not touched again.
    let dst_base = dst.as_mut_ptr();

    for (step, block) in blocks.enumerate() {
        let bytes = u8x16::from_slice(block);

        // Map each nibble to its digit through a 16-entry table lookup.
        let hi_digits = digits.swizzle_dyn(bytes >> 4);
        let lo_digits = digits.swizzle_dyn(bytes & low_nibble);

        // Interleave so every input byte yields `[hi, lo]` in output order.
        let (first, second) = u8x16::interleave(hi_digits, lo_digits);

        // SAFETY: `step * BATCH_ELEMS_V128_X1 + BATCH_ELEMS_V128_X1 <= src.len()`
        // and the caller guarantees `src.len() <= dst.len()`, so the 32 bytes
        // written below lie inside `dst`. Only byte-wise copies are performed,
        // so no alignment beyond that of `u8` is required.
        let out = dst_base.add(step * BATCH_ELEMS_V128_X1).cast::<u8>();
        ptr::copy_nonoverlapping(first.as_array().as_ptr(), out, first.len());
        ptr::copy_nonoverlapping(
            second.as_array().as_ptr(),
            out.add(first.len()),
            second.len(),
        );
    }

    // SAFETY: `vectorized + tail.len() == src.len() <= dst.len()`, so the
    // output sub-slice is in bounds and disjoint from what the loop wrote.
    encode_generic_unchecked::<UPPER>(
        tail,
        slice::from_raw_parts_mut(dst_base.add(vectorized), tail.len()),
    );
}

#[cfg_attr(
    any(
        target_arch = "x86",
        target_arch = "x86_64",
        target_arch = "aarch64",
        target_arch = "arm64ec",
        target_arch = "loongarch32",
        target_arch = "loongarch64",
        target_arch = "wasm32"
    ),
    allow(dead_code, reason = "XXX")
)]
#[allow(clippy::cast_possible_wrap, reason = "XXX")]
/// ## Safety
///
/// We assume that:
///
/// 1. `src.len() <= dst.len()`.
pub(crate) unsafe fn decode_simd128_unchecked(
    src: &[[u8; 2]],
    dst: &mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    /// Process 2 * 8 * 2 bytes of input, and produce 2 * 8 bytes of output.
    const BATCH_ELEMS_V128_X2: usize = size_of::<i8x16>() / 2 * 2;

    if src.len() >= BATCH_ELEMS_V128_X2 {
        let n_c6 = i8x16::splat((0xFF_u8 - b'9') as i8);
        let n_06 = i8x16::splat(0x06_i8);
        let n_f0 = i8x16::splat(0xF0_u8 as i8);

        let n_df = i8x16::splat(0xDF_u8 as i8);
        let u_a = i8x16::splat(b'A' as i8);
        let n_0a = i8x16::splat(0x0A_i8);

        let n_0f = u8x16::splat(15);

        let batches = src.len() / BATCH_ELEMS_V128_X2;
        let remainder = src.len() % BATCH_ELEMS_V128_X2;

        let mut invalid = false;

        let mut decode_v128x2 = |src: *const u8, dst: *mut MaybeUninit<u8>| {
            let chunk0 =
                u8x16::from_slice(slice::from_raw_parts(src, size_of::<u8x16>())).cast::<i8>();
            let chunk1 = u8x16::from_slice(slice::from_raw_parts(
                src.add(size_of::<u8x16>()),
                size_of::<u8x16>(),
            ))
            .cast::<i8>();

            let n0 = {
                let d = (chunk0 + n_c6)
                    .cast::<u8>()
                    .saturating_sub(n_06.cast::<u8>())
                    .cast::<i8>()
                    - n_f0;

                let a = ((chunk0 & n_df) - u_a)
                    .cast::<u8>()
                    .saturating_add(n_0a.cast::<u8>());

                d.cast::<u8>().simd_min(a)
            };
            let n1 = {
                let d = (chunk1 + n_c6)
                    .cast::<u8>()
                    .saturating_sub(n_06.cast::<u8>())
                    .cast::<i8>()
                    - n_f0;

                let a = ((chunk1 & n_df) - u_a)
                    .cast::<u8>()
                    .saturating_add(n_0a.cast::<u8>());

                d.cast::<u8>().simd_min(a)
            };

            invalid |= (n0 | n1).simd_gt(n_0f).any();

            let b = {
                let (hi, lo) = Simd::deinterleave(n0, n1);
                (hi << 4) | lo
            };

            ptr::copy_nonoverlapping(b.as_array().as_ptr(), dst.cast::<u8>(), b.len());
        };

        for i in 0..batches {
            decode_v128x2(
                src.as_ptr().add(i * BATCH_ELEMS_V128_X2).cast::<u8>(),
                dst.as_mut_ptr()
                    .add(i * BATCH_ELEMS_V128_X2)
                    .cast::<MaybeUninit<u8>>(),
            );
        }

        if invalid {
            return Err(InvalidInput);
        }

        let src = ptr::slice_from_raw_parts(
            src.as_ptr()
                .cast::<[u8; 2]>()
                .add(batches * BATCH_ELEMS_V128_X2),
            remainder,
        );

        let dst = ptr::slice_from_raw_parts_mut(
            dst.as_mut_ptr()
                .cast::<MaybeUninit<u8>>()
                .add(batches * BATCH_ELEMS_V128_X2),
            remainder,
        );

        decode_generic_unchecked::<false>(src, dst)
    } else {
        decode_generic_unchecked::<false>(src, dst)
    }
}

#[cfg(test)]
mod smoking {
    use super::*;
    use crate::backend::tests::{
        check_decode_validation_any_backend, check_encode_decode_any_backend,
    };

    /// Raw-pointer adapter around [`decode_simd128_unchecked`], giving it the
    /// same parameter types the generic decoder is called with.
    fn decode_simd128_from_raw(
        src: *const [[u8; 2]],
        dst: *mut [MaybeUninit<u8>],
    ) -> Result<(), InvalidInput> {
        // SAFETY: relies on the shared test helpers passing valid, properly
        // sized buffers — see `crate::backend::tests`.
        let (input, output) = unsafe { (&*src, &mut *dst) };

        unsafe { decode_simd128_unchecked(input, output) }
    }

    #[test]
    fn test_encode_decode_simd128() {
        // SIMD encoder paired with the generic decoder, both `UPPER` settings.
        check_encode_decode_any_backend::<true>(
            encode_simd128_unchecked::<true>,
            decode_generic_unchecked::<false>,
        );
        check_encode_decode_any_backend::<false>(
            encode_simd128_unchecked::<false>,
            decode_generic_unchecked::<false>,
        );

        // SIMD encoder paired with the SIMD decoder, both `UPPER` settings.
        check_encode_decode_any_backend::<true>(
            encode_simd128_unchecked::<true>,
            decode_simd128_from_raw,
        );
        check_encode_decode_any_backend::<false>(
            encode_simd128_unchecked::<false>,
            decode_simd128_from_raw,
        );
    }

    #[test]
    fn test_decode_validation_simd128() {
        check_decode_validation_any_backend(decode_simd128_from_raw);
    }
}