fashex 0.0.7

Hexadecimal string encoding and decoding with best-effort SIMD acceleration.
Documentation
//! Optimized implementations for `aarch64`.

#![allow(clippy::similar_names, reason = "XXX")]

use core::arch::aarch64::*;
use core::mem::MaybeUninit;

use crate::backend::generic::{decode_generic_unchecked, encode_generic_unchecked};
use crate::error::InvalidInput;
use crate::util::lut16;

/// Hex-encodes `src` into `dst` using NEON, writing one `[hi, lo]` character
/// pair per input byte.
///
/// Full 16-byte batches are handled with vector instructions; the remaining
/// tail (fewer than 16 bytes) is forwarded to `encode_generic_unchecked`.
/// `UPPER` is passed through to `lut16` to select the character table.
///
/// # Safety
///
/// The caller must guarantee that:
///
/// 1. The CPU supports `neon`.
/// 2. `src.len() == dst.len()`.
#[target_feature(enable = "neon")]
pub(crate) unsafe fn encode_neon_unchecked<const UPPER: bool>(
    mut src: &[u8],
    mut dst: &mut [[MaybeUninit<u8>; 2]],
) {
    /// Input bytes consumed per vector iteration; each iteration emits twice
    /// as many output bytes.
    const BATCH: usize = size_of::<uint8x16_t>();

    if src.len() >= BATCH {
        // Loop-invariant vectors: the low-nibble mask and the 16-entry
        // character table.
        let low_nibble_mask = vdupq_n_u8(0x0F);
        let table = vld1q_u8(lut16::<UPPER>().as_ptr());

        // The guard above guarantees at least one full batch, so the loop
        // condition is checked at the bottom.
        loop {
            let input: uint8x16_t = vld1q_u8(src.as_ptr());

            // Split every byte into its two nibbles and map each nibble to
            // its character through the table.
            let hi_chars = vqtbl1q_u8(table, vshrq_n_u8(input, 4));
            let lo_chars = vqtbl1q_u8(table, vandq_u8(input, low_nibble_mask));

            // Interleave to [hi[0], lo[0], hi[1], lo[1], ...] and store both
            // resulting vectors back to back.
            let interleaved = vzipq_u8(hi_chars, lo_chars);
            vst1q_u8_x2(dst.as_mut_ptr().cast(), interleaved);

            src = &src[BATCH..];
            dst = dst.get_unchecked_mut(BATCH..);

            if src.len() < BATCH {
                break;
            }
        }
    }

    // Scalar fallback for whatever did not fill a whole batch.
    encode_generic_unchecked::<UPPER>(src, dst);
}

/// Decodes hex character pairs from `src` into bytes in `dst` using NEON.
///
/// Sixteen pairs are processed per main-loop iteration, followed by at most
/// one eight-pair step; anything still left is forwarded to
/// `decode_generic_unchecked`. Returns `Err(InvalidInput)` as soon as a
/// vector step sees a byte that is not an ASCII hex digit; otherwise the
/// result of the generic decoder on the tail is returned.
///
/// # Safety
///
/// The caller must guarantee that:
///
/// 1. The CPU supports `neon`.
/// 2. `src.len() == dst.len()`.
#[target_feature(enable = "neon")]
pub(crate) unsafe fn decode_neon_unchecked(
    mut src: &[[u8; 2]],
    mut dst: &mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    /// Pairs per main iteration: 16 * 2 input bytes in, 16 output bytes out.
    const BATCH: usize = size_of::<uint8x16_t>();

    /// Pairs in the single trailing step: 8 * 2 input bytes in, 8 bytes out.
    const TRAILING_BATCH: usize = BATCH / 2;

    if src.len() >= TRAILING_BATCH {
        // Constants for the digit path.
        let digit_bias = vdupq_n_u8(0xFF_u8 - b'9');
        let digit_gap = vdupq_n_u8(0x06);
        let digit_base = vdupq_n_u8(0xF0);

        // Constants for the letter path.
        let case_fold = vdupq_n_u8(0xDF);
        let letter_a = vdupq_n_u8(b'A');
        let ten = vdupq_n_u8(0x0A);

        // Maps 16 ASCII bytes to nibble values; every lane that is not a hex
        // digit ends up greater than 15.
        macro_rules! nibbles {
            ($ascii:ident) => {{
                // '0'..'9' become 0..9, everything else lands above 15.
                let biased = vaddq_u8($ascii, digit_bias);
                let as_digit = vsubq_u8(vqsubq_u8(biased, digit_gap), digit_base);
                // 'A'..'F' / 'a'..'f' become 10..15, everything else lands
                // above 15.
                let folded = vandq_u8($ascii, case_fold);
                let as_letter = vqaddq_u8(vsubq_u8(folded, letter_a), ten);
                // A valid lane keeps its nibble; an invalid lane stays > 15.
                vminq_u8(as_digit, as_letter)
            }};
        }

        while src.len() >= BATCH {
            let uint8x16x2_t(first, second) = vld1q_u8_x2(src.as_ptr().cast::<u8>());

            let first_nibbles = nibbles!(first);
            let second_nibbles = nibbles!(second);

            // Any lane above 15 means an invalid character was seen.
            if vmaxvq_u8(first_nibbles) > 0x0F || vmaxvq_u8(second_nibbles) > 0x0F {
                return Err(InvalidInput);
            }

            // De-interleave into high and low nibbles, then pack each pair
            // into one byte.
            let uint8x16x2_t(high, low) = vuzpq_u8(first_nibbles, second_nibbles);
            let packed = vorrq_u8(vshlq_n_u8(high, 4), low);

            vst1q_u8(dst.as_mut_ptr().cast::<u8>(), packed);

            src = &src[BATCH..];
            dst = dst.get_unchecked_mut(BATCH..);
        }

        if src.len() >= TRAILING_BATCH {
            let half = vld1q_u8(src.as_ptr().cast::<u8>());

            let half_nibbles = nibbles!(half);

            // Any lane above 15 means an invalid character was seen.
            if vmaxvq_u8(half_nibbles) > 0x0F {
                return Err(InvalidInput);
            }

            // Only the low halves of the de-interleaved vectors are used:
            // they hold the eight high and eight low nibbles of this step.
            let uint8x16x2_t(high, low) = vuzpq_u8(half_nibbles, half_nibbles);
            let packed = vorr_u8(vshl_n_u8(vget_low_u8(high), 4), vget_low_u8(low));

            vst1_u8(dst.as_mut_ptr().cast::<u8>(), packed);

            src = &src[TRAILING_BATCH..];
            dst = dst.get_unchecked_mut(TRAILING_BATCH..);
        }
    }

    // Scalar fallback for whatever is left.
    decode_generic_unchecked::<false>(src, dst)
}

/// Smoke tests for the NEON encoder and decoder: round trips over inputs that
/// straddle the batch boundaries, plus exhaustive single-byte validation.
#[cfg(test)]
mod smoking {
    use alloc::string::String;
    use alloc::vec;
    use alloc::vec::Vec;
    use core::mem::MaybeUninit;
    use core::slice;

    use super::*;
    use crate::util::{HEX_CHARS_LOWER, HEX_CHARS_UPPER};

    /// Round-trips one input through an encoder and one or more decoders.
    ///
    /// The input is encoded in lower and upper case, both outputs are compared
    /// against reference strings built from `HEX_CHARS_LOWER` /
    /// `HEX_CHARS_UPPER`, and then every listed decoder must turn both
    /// encodings back into the original input.
    macro_rules! test {
        (
            Encode = $encode_f:ident;
            Decode = $($decode_f:ident),*;
            Case = $i:expr
        ) => {{
            let input = $i;

            // Reference encodings, computed one nibble at a time.
            let expected_lower = input
                .iter()
                .flat_map(|b| [
                    HEX_CHARS_LOWER[(*b >> 4) as usize] as char,
                    HEX_CHARS_LOWER[(*b & 0b1111) as usize] as char,
                ])
                .collect::<String>();
            let expected_upper = input
                .iter()
                .flat_map(|b| [
                    HEX_CHARS_UPPER[(*b >> 4) as usize] as char,
                    HEX_CHARS_UPPER[(*b & 0b1111) as usize] as char,
                ])
                .collect::<String>();

            // Exactly `input.len()` pairs, as the encoder's contract requires.
            let mut output_lower = vec![[MaybeUninit::<u8>::uninit(); 2]; input.len()];
            let mut output_upper = vec![[MaybeUninit::<u8>::uninit(); 2]; input.len()];

            // SAFETY: source and destination lengths are equal; NEON support
            // is assumed on the host running these tests.
            unsafe {
                $encode_f::<false>(input, &mut output_lower);
                $encode_f::<true>(input, &mut output_upper);
            }

            // SAFETY: relies on the encoder having written every output pair;
            // `[MaybeUninit<u8>; 2]` has the same layout as `[u8; 2]`.
            let output_lower = unsafe {
                slice::from_raw_parts(
                    output_lower.as_ptr().cast::<[u8; 2]>(),
                    output_lower.len(),
                )
            };
            // SAFETY: same reasoning as for `output_lower`.
            let output_upper = unsafe {
                slice::from_raw_parts(
                    output_upper.as_ptr().cast::<[u8; 2]>(),
                    output_upper.len(),
                )
            };

            assert_eq!(
                output_lower.as_flattened(),
                expected_lower.as_bytes(),
                "Encode error, expect \"{expected_lower}\", got \"{}\" ({:?})",
                str::from_utf8(output_lower.as_flattened()).unwrap_or("<invalid utf-8>"),
                output_lower.as_flattened()
            );
            assert_eq!(
                output_upper.as_flattened(),
                expected_upper.as_bytes(),
                "Encode error, expect \"{expected_upper}\", got \"{}\" ({:?})",
                str::from_utf8(output_upper.as_flattened()).unwrap_or("<invalid utf-8>"),
                output_upper.as_flattened()
            );

            $({
                let mut decoded_lower = vec![MaybeUninit::<u8>::uninit(); input.len()];
                let mut decoded_upper = vec![MaybeUninit::<u8>::uninit(); input.len()];

                // SAFETY: source and destination lengths are equal, and the
                // buffers are only read after the decoder reported success.
                unsafe {
                    $decode_f(output_lower, &mut decoded_lower).unwrap();
                    $decode_f(output_upper, &mut decoded_upper).unwrap();

                    assert_eq!(
                        decoded_lower.assume_init_ref(),
                        input,
                        "Decode error for {}, expect {:?}, got {:?}",
                        stringify!($decode_f),
                        input,
                        decoded_lower.assume_init_ref()
                    );
                    assert_eq!(
                        decoded_upper.assume_init_ref(),
                        input,
                        "Decode error for {}, expect {:?}, got {:?}",
                        stringify!($decode_f),
                        input,
                        decoded_upper.assume_init_ref()
                    );
                }
            })*
        }};
    }

    /// Fixed 513-byte sample; the tests encode and decode prefixes of it.
    const CASE: &[u8; 513] = &[
        0xBD, 0xE8, 0xAC, 0xA5, 0x82, 0x41, 0x8A, 0x10, 0x66, 0x56, 0xE4, 0xF3, 0x06, 0x13, 0xD0,
        0x06, 0x3E, 0x19, 0x4B, 0x7E, 0xE1, 0xAB, 0x24, 0x03, 0x29, 0xD0, 0x8B, 0x91, 0x06, 0x56,
        0xF4, 0x44, 0x4E, 0x7B, 0x00, 0x76, 0xFB, 0xA3, 0xB4, 0x4F, 0x9E, 0x4E, 0x3E, 0x20, 0x89,
        0x29, 0x17, 0x47, 0x4D, 0x59, 0xF7, 0x9E, 0xAE, 0x0A, 0xB4, 0x16, 0xEB, 0x2B, 0x0D, 0xA2,
        0x35, 0x99, 0x1D, 0x94, 0xA0, 0x23, 0xFF, 0x60, 0x0F, 0x67, 0xDB, 0xB5, 0xEF, 0x89, 0xC2,
        0x3C, 0x2C, 0x24, 0x0E, 0x04, 0x05, 0x35, 0x31, 0xAA, 0x88, 0xB4, 0x04, 0x82, 0x21, 0x8B,
        0x24, 0x88, 0x3F, 0x19, 0x94, 0x36, 0xDB, 0x52, 0x9E, 0x89, 0x7D, 0x53, 0x6D, 0x8D, 0xDF,
        0xF7, 0xFD, 0x2A, 0x8F, 0x4B, 0x20, 0xAB, 0xAC, 0xA4, 0x4B, 0xBB, 0x5C, 0x10, 0x0D, 0x7B,
        0xEF, 0x3A, 0x03, 0xF7, 0x4D, 0x15, 0x10, 0x8C, 0xB1, 0x0A, 0x86, 0x6A, 0x19, 0x6F, 0x25,
        0xA6, 0xE3, 0x4B, 0xA8, 0x9D, 0x78, 0xC7, 0x19, 0x19, 0x09, 0x05, 0x08, 0x9A, 0xA1, 0x67,
        0x48, 0xF7, 0x9E, 0x3C, 0xFA, 0xD3, 0xFD, 0x5E, 0x1A, 0x09, 0xD8, 0x85, 0x7F, 0xA5, 0x73,
        0x34, 0xBF, 0x93, 0xCC, 0xF4, 0x8D, 0x8A, 0x62, 0xBD, 0xD5, 0x67, 0x39, 0x0D, 0xB7, 0x41,
        0x94, 0x7D, 0xB5, 0xB3, 0x5B, 0x95, 0x1F, 0x43, 0xE4, 0x77, 0x40, 0x41, 0x9E, 0x26, 0x34,
        0x73, 0x0D, 0x93, 0x0C, 0xE9, 0xB7, 0x3C, 0x97, 0x3D, 0xA4, 0xBC, 0xAA, 0xDA, 0xA9, 0xFB,
        0x78, 0xD8, 0xE4, 0xB4, 0xE8, 0x88, 0x29, 0x9B, 0xE4, 0x5B, 0xF4, 0x56, 0xC4, 0x0D, 0x50,
        0x05, 0x0F, 0x84, 0x51, 0xD4, 0x96, 0x3E, 0xC5, 0x4F, 0xCD, 0xEF, 0x2B, 0x0F, 0x78, 0x1D,
        0xE6, 0x4A, 0x90, 0xC6, 0xD8, 0xF7, 0x88, 0x0D, 0x58, 0x2C, 0xE7, 0x37, 0x4A, 0x94, 0x5F,
        0x56, 0x68, 0x84, 0xEE, 0xD2, 0xD6, 0x8A, 0xC9, 0x8A, 0x90, 0x70, 0xF7, 0x51, 0xC9, 0xD1,
        0x86, 0x5A, 0xB2, 0xD5, 0x91, 0xDB, 0xDF, 0x36, 0xF1, 0xD3, 0x69, 0xB1, 0x7D, 0x39, 0x0E,
        0xCC, 0x86, 0xEF, 0xBD, 0xBD, 0x13, 0x52, 0x2A, 0xFC, 0x72, 0x78, 0x14, 0x28, 0xDD, 0xD5,
        0xEE, 0xF8, 0x72, 0x0F, 0x26, 0x76, 0xC6, 0x5E, 0x1B, 0x50, 0x30, 0xDB, 0x93, 0xD9, 0x20,
        0xA3, 0x07, 0x4D, 0x85, 0x50, 0x40, 0x28, 0x1E, 0x40, 0x4B, 0x96, 0xD6, 0x8C, 0xAF, 0x8E,
        0xD4, 0xD7, 0x81, 0x31, 0x97, 0x47, 0x2A, 0x95, 0xC3, 0x03, 0xA2, 0x40, 0xC9, 0x55, 0xBF,
        0x64, 0x1A, 0xAB, 0x81, 0xA1, 0x6B, 0x6A, 0x56, 0x81, 0xDD, 0xD2, 0x68, 0x1D, 0xB7, 0xDB,
        0xD6, 0x9E, 0xDA, 0x84, 0xFC, 0x5B, 0xE0, 0x34, 0xAD, 0x61, 0x5E, 0xD1, 0xF5, 0x74, 0x79,
        0xE9, 0xED, 0xB5, 0x31, 0x3C, 0x7F, 0xB1, 0x44, 0xE0, 0x23, 0xAE, 0xBD, 0x9E, 0x13, 0x8A,
        0x9D, 0xAF, 0x48, 0x75, 0x06, 0x16, 0x58, 0x4A, 0x8B, 0xD3, 0xB7, 0x06, 0x14, 0xB5, 0x92,
        0xE2, 0xA1, 0x9F, 0xCF, 0x42, 0x3E, 0x99, 0x24, 0xE4, 0x65, 0x93, 0x84, 0x83, 0x66, 0x26,
        0x28, 0xEA, 0x3F, 0x05, 0x4E, 0xAC, 0x7C, 0x96, 0xF2, 0x50, 0x22, 0xF3, 0xCD, 0x90, 0x81,
        0x73, 0xBD, 0x3D, 0xCA, 0xD1, 0x2F, 0xC2, 0x3F, 0x20, 0xF0, 0x1C, 0x41, 0x9D, 0x9A, 0x85,
        0x1A, 0xC4, 0xB1, 0xE3, 0xBA, 0x52, 0xE7, 0xE3, 0x22, 0x72, 0x98, 0x76, 0xEC, 0x0B, 0xC4,
        0x07, 0xA3, 0x05, 0x01, 0xC0, 0x40, 0xA7, 0x0E, 0x8A, 0x0F, 0xDE, 0x5F, 0x65, 0xA3, 0x89,
        0x34, 0x3B, 0xFD, 0x9F, 0xE4, 0xB1, 0x6C, 0x1B, 0x40, 0xE6, 0xC2, 0x58, 0xE3, 0x62, 0xFC,
        0xB0, 0x22, 0x02, 0xD2, 0xE2, 0xF6, 0xFD, 0x4D, 0x64, 0xF5, 0x17, 0x07, 0x04, 0x34, 0x50,
        0x04, 0xEF, 0xAB,
    ];

    /// Round-trips prefixes of `CASE` whose lengths sit just below, exactly
    /// at, and just above the 8-pair trailing step and multiples of the
    /// 16-element batch, so the vector loops and the generic tail all run.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_neon() {
        for len in [
            0, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129,
        ] {
            test! {
                Encode = encode_neon_unchecked;
                Decode = decode_neon_unchecked;
                Case = &CASE[..len]
            }
        }
    }

    /// For each length, plants every possible byte value into an otherwise
    /// valid input and checks that decoding succeeds exactly when that byte
    /// is an ASCII hex digit.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_validation() {
        for l in [
            15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 257, 511, 512, 513,
        ] {
            // Allocate exactly `l` output slots. `Vec::with_capacity(l)` may
            // reserve more than `l`, and its spare capacity would then be
            // longer than the source, breaking the decoder's
            // `src.len() == dst.len()` contract.
            let mut dst = vec![MaybeUninit::<u8>::uninit(); l];
            let mut bytes: Vec<u8> = vec![b'a'; l * 2];

            for c in 0u8..=255 {
                // Only this position changes between iterations; every other
                // byte stays a valid hex digit.
                bytes[l] = c;

                // SAFETY: `bytes.len()` is `l * 2`, a multiple of the chunk
                // size 2.
                let pairs = unsafe { bytes.as_chunks_unchecked::<2>() };

                // SAFETY: `pairs.len() == dst.len() == l`; NEON support is
                // assumed on the host running these tests.
                let result = unsafe { decode_neon_unchecked(pairs, &mut dst) };

                assert_eq!(
                    result.is_ok(),
                    c.is_ascii_hexdigit(),
                    "neon validation failed for byte {c} (l={l})"
                );
            }
        }
    }
}