#![allow(clippy::match_same_arms, reason = "XXX")]
#![allow(clippy::similar_names, reason = "XXX")]
use core::arch::aarch64::*;
use core::mem::MaybeUninit;
use core::{hint, ptr, slice};
use crate::backend::generic::{decode_generic_unchecked, encode_generic_unchecked};
use crate::error::InvalidInput;
use crate::util::lut16;
/// Hex-encodes `src` into `dst`, picking a kernel based on the input length.
///
/// * fewer than 16 bytes: generic (non-NEON) encoder,
/// * exactly 16 bytes: single-vector kernel,
/// * 17 to 32 bytes: two overlapping single-vector batches,
/// * more than 32 bytes: vector loop followed by a generic tail.
///
/// # Safety
///
/// The `neon` target feature must be available on the running CPU. The
/// kernels only `debug_assert!` that `dst.len() >= src.len()`, so the caller
/// has to guarantee it.
#[target_feature(enable = "neon")]
pub(crate) unsafe fn encode_neon_unchecked<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    // Number of source bytes consumed by one 128-bit batch.
    const V128_BYTES: usize = size_of::<uint8x16_t>();

    let len = src.len();
    if len > 2 * V128_BYTES {
        encode_neon_unchecked_v128_with_trailing::<UPPER>(src, dst);
    } else if len > V128_BYTES {
        encode_neon_unchecked_v128_overlapped::<UPPER>(src, dst);
    } else if len == V128_BYTES {
        encode_neon_unchecked_v128_exact::<UPPER>(src, dst);
    } else {
        encode_generic_unchecked::<UPPER>(src, dst);
    }
}
/// Hex-encodes exactly 16 source bytes into 32 output bytes with one NEON batch.
///
/// # Safety
///
/// - The `neon` target feature must be available on the running CPU.
/// - `src.len()` must be exactly 16; this is only checked by `debug_assert!`.
/// - `dst` must hold at least `src.len()` elements.
#[target_feature(enable = "neon")]
unsafe fn encode_neon_unchecked_v128_exact<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    #[allow(clippy::identity_op, reason = "XXX")]
    const BATCH_ELEMS_V128_X1: usize = size_of::<uint8x16_t>() * 1;

    debug_assert_eq!(src.len(), BATCH_ELEMS_V128_X1);
    debug_assert!(src.len() <= dst.len());

    let low_nibble_mask = vdupq_n_u8(0x0F);
    // 16 bytes are loaded from the table returned by `lut16`; presumably the
    // hex alphabet selected by `UPPER` (TODO confirm against `crate::util`).
    let digits = vld1q_u8(lut16::<UPPER>().as_ptr());

    let input: uint8x16_t = vld1q_u8(src.as_ptr());

    // Translate the high and low nibble of every byte through the table.
    let hi_digits = vqtbl1q_u8(digits, vshrq_n_u8(input, 4));
    let lo_digits = vqtbl1q_u8(digits, vandq_u8(input, low_nibble_mask));

    // Interleave so that every source byte yields `[hi digit, lo digit]`,
    // then store both resulting vectors (32 bytes) contiguously.
    vst1q_u8_x2(dst.as_mut_ptr().cast(), vzipq_u8(hi_digits, lo_digits));
}
/// Hex-encodes 16 to 32 source bytes with two 128-bit batches that may overlap.
///
/// The first batch covers the first 16 bytes of `src`, the second covers the
/// last 16. For inputs shorter than 32 bytes the two windows overlap, so the
/// shared region is encoded and stored twice with identical results.
///
/// # Safety
///
/// - The `neon` target feature must be available on the running CPU.
/// - `src.len()` must be in `16..=32`; this is only checked by `debug_assert!`.
/// - `dst` must hold at least `src.len()` elements.
#[target_feature(enable = "neon")]
unsafe fn encode_neon_unchecked_v128_overlapped<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    #[allow(
        clippy::identity_op,
        reason = "the explicit `* 1` mirrors the `* 2` of the two-vector constant"
    )]
    const BATCH_ELEMS_V128_X1: usize = size_of::<uint8x16_t>() * 1;
    // No lint suppression needed here: `* 2` is not an identity operation.
    const BATCH_ELEMS_V128_X2: usize = size_of::<uint8x16_t>() * 2;

    debug_assert!(src.len() >= BATCH_ELEMS_V128_X1);
    debug_assert!(src.len() <= BATCH_ELEMS_V128_X2);
    debug_assert!(src.len() <= dst.len());

    let mask = vdupq_n_u8(0b_0000_1111);
    // 16 bytes are loaded from the table returned by `lut16`; presumably the
    // hex alphabet selected by `UPPER` (TODO confirm against `crate::util`).
    let lut = vld1q_u8(lut16::<UPPER>().as_ptr());

    // Encodes the 16 bytes at `src` into the 32 bytes at `dst`.
    let encode_v128 = |src: *const u8, dst: *mut u8| {
        let chunk = vld1q_u8(src);
        let hi = vqtbl1q_u8(lut, vshrq_n_u8(chunk, 4));
        let lo = vqtbl1q_u8(lut, vandq_u8(chunk, mask));
        // Interleave to `[hi digit, lo digit]` per source byte.
        vst1q_u8_x2(dst, vzipq_u8(hi, lo));
    };

    // Index of the first source byte of the trailing 16-byte window. Each
    // `dst` element is the 2-byte output of one source byte, so the same
    // element offset addresses the matching output position.
    let tail = src.len() - BATCH_ELEMS_V128_X1;

    encode_v128(src.as_ptr(), dst.as_mut_ptr().cast());
    encode_v128(src.as_ptr().add(tail), dst.as_mut_ptr().add(tail).cast());
}
/// Hex-encodes `src` in 16-byte NEON batches and hands the leftover bytes
/// (fewer than 16) to the generic encoder.
///
/// # Safety
///
/// - The `neon` target feature must be available on the running CPU.
/// - `src.len()` must be at least 16; this is only checked by `debug_assert!`.
/// - `dst` must hold at least `src.len()` elements.
#[target_feature(enable = "neon")]
unsafe fn encode_neon_unchecked_v128_with_trailing<const UPPER: bool>(
    src: &[u8],
    dst: &mut [[MaybeUninit<u8>; 2]],
) {
    #[allow(clippy::identity_op, reason = "XXX")]
    const BATCH_ELEMS_V128_X1: usize = size_of::<uint8x16_t>() * 1;

    debug_assert!(src.len() >= BATCH_ELEMS_V128_X1);
    debug_assert!(src.len() <= dst.len());

    let nibble_mask = vdupq_n_u8(0x0F);
    // 16 bytes are loaded from the table returned by `lut16`; presumably the
    // hex alphabet selected by `UPPER` (TODO confirm against `crate::util`).
    let digits = vld1q_u8(lut16::<UPPER>().as_ptr());

    let src_base = src.as_ptr();
    let dst_base = dst.as_mut_ptr();

    let full_batches = src.len() / BATCH_ELEMS_V128_X1;
    let vectorized = full_batches * BATCH_ELEMS_V128_X1;
    let tail_len = src.len() - vectorized;

    for batch in 0..full_batches {
        // One `dst` element is the 2-byte output of one source byte, so the
        // same element offset is valid for both pointers.
        let offset = batch * BATCH_ELEMS_V128_X1;

        let bytes: uint8x16_t = vld1q_u8(src_base.add(offset));
        let hi = vqtbl1q_u8(digits, vshrq_n_u8(bytes, 4));
        let lo = vqtbl1q_u8(digits, vandq_u8(bytes, nibble_mask));
        vst1q_u8_x2(dst_base.add(offset).cast(), vzipq_u8(hi, lo));
    }

    // Whatever did not fill a whole batch goes through the generic encoder.
    encode_generic_unchecked::<UPPER>(
        slice::from_raw_parts(src_base.add(vectorized), tail_len),
        slice::from_raw_parts_mut(dst_base.add(vectorized), tail_len),
    );
}
/// Decodes hex digit pairs from `src` into bytes at `dst`, picking a kernel
/// based on the number of pairs.
///
/// * fewer than 8 pairs: generic (non-NEON) decoder,
/// * exactly 8 pairs: single-vector kernel,
/// * 9 to 15 pairs: two overlapping single-vector batches,
/// * exactly 16 pairs: double-vector kernel,
/// * more than 16 pairs: double-vector loop followed by a tail kernel.
///
/// # Errors
///
/// Propagates the `InvalidInput` error of the selected kernel.
///
/// # Safety
///
/// The `neon` target feature must be available on the running CPU. `src`
/// must be readable for its whole length and `dst` writable for at least
/// `src.len()` bytes; the kernels only `debug_assert!` the length relation.
#[target_feature(enable = "neon")]
#[inline]
pub(crate) unsafe fn decode_neon_unchecked(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    // Digit pairs held by one and by two 128-bit vectors respectively.
    const PAIRS_V128_X1: usize = size_of::<uint8x16_t>() / 2;
    const PAIRS_V128_X2: usize = size_of::<uint8x16_t>();

    let pairs = src.len();
    if pairs > PAIRS_V128_X2 {
        decode_neon_unchecked_v128x2_with_trailing(src, dst)
    } else if pairs == PAIRS_V128_X2 {
        decode_neon_unchecked_v128x2_exact(src, dst)
    } else if pairs > PAIRS_V128_X1 {
        decode_neon_unchecked_v128_overlapped(src, dst)
    } else if pairs == PAIRS_V128_X1 {
        decode_neon_unchecked_v128_exact(src, dst)
    } else {
        decode_generic_unchecked::<false>(src, dst)
    }
}
/// Decodes exactly 8 hex digit pairs (16 input bytes) into 8 output bytes.
///
/// # Errors
///
/// Returns `InvalidInput` when any input byte is not a hex digit. The output
/// is stored before validation, so `dst` may have been written even on error.
///
/// # Safety
///
/// - The `neon` target feature must be available on the running CPU.
/// - `src` must be readable for 16 bytes and `src.len()` must be 8 (only
///   checked by `debug_assert!`).
/// - `dst` must be writable for at least 8 bytes.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn decode_neon_unchecked_v128_exact(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    #[allow(clippy::identity_op, reason = "XXX")]
    const BATCH_ELEMS_V128_X1: usize = size_of::<uint8x16_t>() / 2 * 1;

    debug_assert_eq!(src.len(), BATCH_ELEMS_V128_X1);
    debug_assert!(src.len() <= dst.len());

    let n_c6 = vdupq_n_u8(0xFF_u8 - b'9');
    let n_06 = vdupq_n_u8(0x06);
    let n_f0 = vdupq_n_u8(0xF0);
    let n_df = vdupq_n_u8(0xDF);
    let u_a = vdupq_n_u8(b'A');
    let n_0a = vdupq_n_u8(0x0A);

    let ascii = vld1q_u8(src.cast::<u8>());

    // `'0'..='9'` becomes 0..=9; every other byte lands above 0x0F.
    let as_decimal = vsubq_u8(vqsubq_u8(vaddq_u8(ascii, n_c6), n_06), n_f0);
    // Clearing bit 5 folds lowercase onto uppercase; `'A'..='F'` becomes
    // 10..=15 and every other byte lands above 0x0F.
    let as_alpha = vqaddq_u8(vsubq_u8(vandq_u8(ascii, n_df), u_a), n_0a);
    // A valid digit is in range in exactly one of the two candidates.
    let nibbles = vminq_u8(as_decimal, as_alpha);

    // Largest lane value; anything above 0x0F marks an invalid input byte.
    let worst = vmaxvq_u8(nibbles);

    // Split into even lanes (high nibbles) and odd lanes (low nibbles) and
    // merge the low halves into 8 output bytes.
    let uint8x16x2_t(hi, lo) = vuzpq_u8(nibbles, nibbles);
    let packed = vorr_u8(vshl_n_u8(vget_low_u8(hi), 4), vget_low_u8(lo));
    vst1_u8(dst.cast::<u8>(), packed);

    if worst > 0x0F {
        Err(InvalidInput)
    } else {
        Ok(())
    }
}
/// Decodes 8 to 16 hex digit pairs with two 128-bit batches that may overlap.
///
/// The first batch covers the first 8 pairs, the second covers the last 8.
/// For inputs shorter than 16 pairs the two windows overlap, so the shared
/// region is decoded and stored twice with identical results.
///
/// # Errors
///
/// Returns `InvalidInput` when any input byte is not a hex digit. Output is
/// stored before validation, so `dst` may have been written even on error.
///
/// # Safety
///
/// - The `neon` target feature must be available on the running CPU.
/// - `src.len()` must be in `8..=16` (only checked by `debug_assert!`) and
///   `src` must be readable for its whole length.
/// - `dst` must be writable for at least `src.len()` bytes.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn decode_neon_unchecked_v128_overlapped(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    #[allow(
        clippy::identity_op,
        reason = "the explicit `* 1` mirrors the `* 2` of the two-vector constant"
    )]
    const BATCH_ELEMS_V128_X1: usize = size_of::<uint8x16_t>() / 2 * 1;
    // No lint suppression needed here: `* 2` is not an identity operation.
    const BATCH_ELEMS_V128_X2: usize = size_of::<uint8x16_t>() / 2 * 2;

    debug_assert!(src.len() >= BATCH_ELEMS_V128_X1);
    debug_assert!(src.len() <= BATCH_ELEMS_V128_X2);
    debug_assert!(src.len() <= dst.len());

    let n_c6 = vdupq_n_u8(0xFF_u8 - b'9');
    let n_06 = vdupq_n_u8(0x06);
    let n_f0 = vdupq_n_u8(0xF0);
    let n_df = vdupq_n_u8(0xDF);
    let u_a = vdupq_n_u8(b'A');
    let n_0a = vdupq_n_u8(0x0A);

    // Accumulates the largest nibble value seen; > 0x0F means invalid input.
    let mut invalid = 0;

    // Decodes the 16 ASCII bytes at `src` into the 8 bytes at `dst`.
    let mut decode_v128 = |src: *const u8, dst: *mut u8| {
        let chunk = vld1q_u8(src);
        let n = {
            // `'0'..='9'` becomes 0..=9; every other byte lands above 0x0F.
            let d = vsubq_u8(vqsubq_u8(vaddq_u8(chunk, n_c6), n_06), n_f0);
            // Clearing bit 5 folds lowercase onto uppercase; `'A'..='F'`
            // becomes 10..=15 and every other byte lands above 0x0F.
            let a = vqaddq_u8(vsubq_u8(vandq_u8(chunk, n_df), u_a), n_0a);
            vminq_u8(d, a)
        };
        invalid |= vmaxvq_u8(n);
        let b = {
            // Even lanes hold high nibbles, odd lanes hold low nibbles.
            let uint8x16x2_t(hi, lo) = vuzpq_u8(n, n);
            vorr_u8(vshl_n_u8(vget_low_u8(hi), 4), vget_low_u8(lo))
        };
        vst1_u8(dst, b);
    };

    // Index of the first digit pair (and output byte) of the trailing window.
    let tail = src.len() - BATCH_ELEMS_V128_X1;

    decode_v128(src.cast::<u8>(), dst.cast::<u8>());
    decode_v128(
        src.cast::<[u8; 2]>().add(tail).cast(),
        dst.cast::<MaybeUninit<u8>>().add(tail).cast(),
    );

    if invalid > 0x0F {
        return Err(InvalidInput);
    }
    Ok(())
}
/// Decodes exactly 16 hex digit pairs (32 input bytes) into 16 output bytes.
///
/// # Errors
///
/// Returns `InvalidInput` when any input byte is not a hex digit. The output
/// is stored before validation, so `dst` may have been written even on error.
///
/// # Safety
///
/// - The `neon` target feature must be available on the running CPU.
/// - `src` must be readable for 32 bytes and `src.len()` must be 16 (only
///   checked by `debug_assert!`).
/// - `dst` must be writable for at least 16 bytes.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn decode_neon_unchecked_v128x2_exact(
    src: *const [[u8; 2]],
    dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
    const BATCH_ELEMS_V128_X2: usize = size_of::<uint8x16_t>() / 2 * 2;

    debug_assert_eq!(src.len(), BATCH_ELEMS_V128_X2);
    debug_assert!(src.len() <= dst.len());

    let n_c6 = vdupq_n_u8(0xFF_u8 - b'9');
    let n_06 = vdupq_n_u8(0x06);
    let n_f0 = vdupq_n_u8(0xF0);
    let n_df = vdupq_n_u8(0xDF);
    let u_a = vdupq_n_u8(b'A');
    let n_0a = vdupq_n_u8(0x0A);

    // Maps 16 ASCII bytes to nibble values; non-hex bytes end up above 0x0F.
    let to_nibbles = |ascii: uint8x16_t| -> uint8x16_t {
        let as_decimal = vsubq_u8(vqsubq_u8(vaddq_u8(ascii, n_c6), n_06), n_f0);
        let as_alpha = vqaddq_u8(vsubq_u8(vandq_u8(ascii, n_df), u_a), n_0a);
        vminq_u8(as_decimal, as_alpha)
    };

    let uint8x16x2_t(first, second) = vld1q_u8_x2(src.cast::<u8>());
    let n0 = to_nibbles(first);
    let n1 = to_nibbles(second);

    // Any lane above 0x0F in either vector marks an invalid input byte.
    let worst = vmaxvq_u8(vorrq_u8(n0, n1));

    // Even lanes hold high nibbles, odd lanes hold low nibbles.
    let uint8x16x2_t(hi, lo) = vuzpq_u8(n0, n1);
    vst1q_u8(dst.cast::<u8>(), vorrq_u8(vshlq_n_u8(hi, 4), lo));

    if worst > 0x0F {
        Err(InvalidInput)
    } else {
        Ok(())
    }
}
/// Decodes hex digit pairs in batches of 16 pairs (32 input bytes -> 16
/// output bytes) and then dispatches the remaining 0..=15 pairs to a smaller
/// kernel.
///
/// # Errors
///
/// Returns `InvalidInput` when any input byte is not a hex digit. The batch
/// loop is validated once after it finishes; on failure the function returns
/// before the tail is decoded. Batch output is stored before validation, so
/// `dst` may have been written even on error.
///
/// # Safety
///
/// - The `neon` target feature must be available on the running CPU.
/// - `src.len()` must be at least 16 (only checked by `debug_assert!`) and
///   `src` must be readable for its whole length.
/// - `dst` must be writable for at least `src.len()` bytes.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn decode_neon_unchecked_v128x2_with_trailing(
src: *const [[u8; 2]],
dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
// Digit pairs consumed (and output bytes produced) per double-vector batch.
const BATCH_ELEMS_V128_X2: usize = size_of::<uint8x16_t>() / 2 * 2;
debug_assert!(src.len() >= BATCH_ELEMS_V128_X2);
debug_assert!(src.len() <= dst.len());
let n_c6 = vdupq_n_u8(0xFF_u8 - b'9');
let n_06 = vdupq_n_u8(0x06);
let n_f0 = vdupq_n_u8(0xF0);
let n_df = vdupq_n_u8(0xDF);
let u_a = vdupq_n_u8(b'A');
let n_0a = vdupq_n_u8(0x0A);
let batches = src.len() / BATCH_ELEMS_V128_X2;
let remainder = src.len() % BATCH_ELEMS_V128_X2;
// Accumulates the largest nibble value seen; > 0x0F means invalid input.
let mut invalid = 0;
// Decodes the 32 ASCII bytes at `src` into the 16 bytes at `dst`.
let mut decode_v128x2 = |src: *const u8, dst: *mut u8| {
let uint8x16x2_t(chunk0, chunk1) = vld1q_u8_x2(src);
let n0 = {
// `'0'..='9'` becomes 0..=9; every other byte lands above 0x0F.
let d = vsubq_u8(vqsubq_u8(vaddq_u8(chunk0, n_c6), n_06), n_f0);
// Clearing bit 5 folds lowercase onto uppercase; `'A'..='F'` becomes
// 10..=15 and every other byte lands above 0x0F.
let a = vqaddq_u8(vsubq_u8(vandq_u8(chunk0, n_df), u_a), n_0a);
vminq_u8(d, a)
};
let n1 = {
let d = vsubq_u8(vqsubq_u8(vaddq_u8(chunk1, n_c6), n_06), n_f0);
let a = vqaddq_u8(vsubq_u8(vandq_u8(chunk1, n_df), u_a), n_0a);
vminq_u8(d, a)
};
invalid |= vmaxvq_u8(vorrq_u8(n0, n1));
let b01 = {
// Even lanes hold high nibbles, odd lanes hold low nibbles.
let uint8x16x2_t(hi, lo) = vuzpq_u8(n0, n1);
vorrq_u8(vshlq_n_u8(hi, 4), lo)
};
vst1q_u8(dst, b01);
};
// Batch `i` reads 32 bytes at `src + 32 * i` and writes 16 bytes at
// `dst + 16 * i`.
for i in 0..batches {
decode_v128x2(
src.cast::<uint8x16x2_t>().add(i).cast(),
dst.cast::<uint8x16_t>().add(i).cast(),
);
}
// Validate all batches at once; bail out before touching the tail.
if invalid > 0x0F {
return Err(InvalidInput);
}
// Shadow `src`/`dst` with raw slices covering only the undecoded tail.
let src = ptr::slice_from_raw_parts(
src.cast::<[u8; 2]>().add(batches * BATCH_ELEMS_V128_X2),
remainder,
);
let dst = ptr::slice_from_raw_parts_mut(
dst.cast::<MaybeUninit<u8>>()
.add(batches * BATCH_ELEMS_V128_X2),
remainder,
);
// `remainder` is `len % 16`, so the tail always has fewer than 16 pairs;
// the `16` and `17..` arms only exist to make the match exhaustive.
match src.len() {
8 => decode_neon_unchecked_v128_exact(src, dst),
16 => hint::unreachable_unchecked(),
0..8 => decode_generic_unchecked::<false>(src, dst),
9..16 => decode_neon_unchecked_v128_overlapped(src, dst),
17.. => hint::unreachable_unchecked(),
}
}
// Smoke tests that run the NEON kernels through the shared backend checks.
#[cfg(test)]
mod smoking {
use super::*;
use crate::backend::tests::{
check_decode_validation_any_backend, check_encode_decode_any_backend,
};
// Safe wrapper around the `unsafe` NEON decoder so that it can be handed to
// the shared check helpers (presumably they expect a safe function; verify
// against `crate::backend::tests`).
// NOTE(review): this dereferences raw pointers and calls a
// `#[target_feature]` function from safe code; it relies on the tests below
// being ignored when NEON is not enabled and on the helpers passing valid
// pointers.
fn decode_neon_unchecked_test(
src: *const [[u8; 2]],
dst: *mut [MaybeUninit<u8>],
) -> Result<(), InvalidInput> {
unsafe { decode_neon_unchecked(&*src, &mut *dst) }
}
// Runs the shared encode/decode check with the NEON encoder in both cases
// (`true` / `false`), paired first with the generic decoder and then with
// the NEON decoder. Ignored under Miri and when NEON is not enabled at
// compile time.
#[test]
#[cfg_attr(any(miri, not(target_feature = "neon")), ignore)]
fn test_encode_decode_neon() {
check_encode_decode_any_backend::<true>(
encode_neon_unchecked::<true>,
decode_generic_unchecked::<false>,
);
check_encode_decode_any_backend::<false>(
encode_neon_unchecked::<false>,
decode_generic_unchecked::<false>,
);
check_encode_decode_any_backend::<true>(
encode_neon_unchecked::<true>,
decode_neon_unchecked_test,
);
check_encode_decode_any_backend::<false>(
encode_neon_unchecked::<false>,
decode_neon_unchecked_test,
);
}
// Runs the shared decode-validation check against the NEON decoder. Ignored
// under Miri and when NEON is not enabled at compile time.
#[test]
#[cfg_attr(any(miri, not(target_feature = "neon")), ignore)]
fn test_decode_validation_neon() {
check_decode_validation_any_backend(decode_neon_unchecked_test);
}
}