use crate::encode::scalar;
use core::arch::aarch64::*;
use core::mem::MaybeUninit;
/// Output alphabet, indexed by 6-bit value: `a`-`z`, `A`-`Z`, `0`-`9`,
/// then `(` and `)`.
///
/// Exactly 64 bytes long so it can be loaded as four 16-byte vectors.
static LUT: [u8; 64] = *b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789()";
/// Splits sixteen 3-byte groups into sixteen groups of four 6-bit values.
///
/// `v.0`, `v.1` and `v.2` hold the first, second and third byte of each
/// group. Treating the three bytes as a 24-bit value with the first byte in
/// the least-significant position, the returned lanes are bits `0..6`,
/// `6..12`, `12..18` and `18..24` respectively. Every output byte is in the
/// range `0..=63`.
#[inline]
#[target_feature(enable = "neon")]
fn reshuffle(v: uint8x16x3_t) -> uint8x16x4_t {
    let low_six = vdupq_n_u8(0x3F);

    // Low six bits of the first byte.
    let first = vandq_u8(v.0, low_six);
    // Top two bits of the first byte, with the second byte shifted in above
    // them; the mask drops the bits that overflow past bit 5.
    let second = vandq_u8(vsliq_n_u8(vshrq_n_u8(v.0, 6), v.1, 2), low_six);
    // Top four bits of the second byte, with the third byte shifted in above.
    let third = vandq_u8(vsliq_n_u8(vshrq_n_u8(v.1, 4), v.2, 4), low_six);
    // Top six bits of the third byte; the shift already clears bits 6 and 7.
    let fourth = vshrq_n_u8(v.2, 2);

    uint8x16x4_t(first, second, third, fourth)
}
/// Encodes `input` into `output` using NEON, without checking that `output`
/// is large enough.
///
/// Each full 48-byte block of `input` is expanded to 64 alphabet bytes taken
/// from `LUT`. Whatever is left (fewer than 48 bytes) is handed to
/// `scalar::encode_into_unchecked` together with the unwritten remainder of
/// `output`.
///
/// Returns 64 times the number of full blocks, plus the value returned by
/// the scalar encoder for the tail.
///
/// # Safety
///
/// * The CPU must support NEON.
/// * `output` must have room for 64 bytes per full 48-byte block of `input`;
///   these stores go through a raw pointer and are not bounds-checked.
/// * The remaining part of `output` must satisfy whatever
///   `scalar::encode_into_unchecked` requires for the tail bytes.
#[target_feature(enable = "neon")]
pub unsafe fn encode_into_unchecked(input: &[u8], output: &mut [MaybeUninit<u8>]) -> usize {
    let capacity = output.len();
    let dst_base = output.as_mut_ptr();
    // The whole 64-byte alphabet as four vectors, for the 4-register table lookup.
    let table = unsafe { vld1q_u8_x4(LUT.as_ptr()) };

    let mut blocks = input.chunks_exact(48);
    let mut produced = 0;
    for block in blocks.by_ref() {
        // De-interleaving load: lane i of the three vectors holds bytes
        // 3i, 3i + 1 and 3i + 2 of the block.
        let sextets = reshuffle(unsafe { vld3q_u8(block.as_ptr()) });
        let encoded = uint8x16x4_t(
            vqtbl4q_u8(table, sextets.0),
            vqtbl4q_u8(table, sextets.1),
            vqtbl4q_u8(table, sextets.2),
            vqtbl4q_u8(table, sextets.3),
        );
        // Interleaving store: puts the four characters of each group back
        // next to each other, 64 bytes in total.
        unsafe { vst4q_u8(dst_base.add(produced).cast(), encoded) };
        produced += 64;
    }

    // Fewer than 48 input bytes remain; the scalar encoder finishes them.
    let tail = blocks.remainder();
    produced
        + unsafe {
            scalar::encode_into_unchecked(
                tail,
                core::slice::from_raw_parts_mut(dst_base.add(produced), capacity - produced),
            )
        }
}