#![allow(unsafe_code)]
#[cfg(any(
target_arch = "x86",
target_arch = "x86_64",
target_arch = "aarch64",
all(target_arch = "arm", target_feature = "neon")
))]
use super::{Alphabet, encode_base64_value};
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::{uint8x16_t, vdupq_n_u8, vst1q_u8};
#[cfg(all(target_arch = "arm", target_feature = "neon"))]
use core::arch::arm::{uint8x16_t, vdupq_n_u8, vst1q_u8};
#[cfg(target_arch = "x86")]
use core::arch::x86::{
__m128i, __m256i, __m512i, _mm_setzero_si128, _mm_storeu_si128, _mm256_setzero_si256,
_mm256_storeu_si256, _mm512_setzero_si512, _mm512_storeu_si512,
};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
__m128i, __m256i, __m512i, _mm_setzero_si128, _mm_storeu_si128, _mm256_setzero_si256,
_mm256_storeu_si256, _mm512_setzero_si512, _mm512_storeu_si512,
};
/// The backend actually used for encoding.
///
/// Only the scalar implementation is dispatchable today; the SIMD kernels
/// below are inactive prototypes, so this enum has a single variant.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum ActiveBackend {
    /// Portable scalar implementation; always available on every target.
    Scalar,
}
/// The best SIMD (or scalar) implementation detected for the current target.
///
/// Variants are cfg-gated so that only candidates which can exist on the
/// compilation target are representable at all.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum Candidate {
    /// Always-available portable fallback.
    Scalar,
    /// AVX-512 with VBMI (requires avx512f + avx512bw + avx512vl + avx512vbmi).
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    Avx512Vbmi,
    /// AVX2 256-bit path.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    Avx2,
    /// SSSE3 + SSE4.1 128-bit path.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    Ssse3Sse41,
    /// ARM NEON 128-bit path.
    #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
    Neon,
    /// WebAssembly SIMD128 path.
    #[cfg(target_arch = "wasm32")]
    WasmSimd128,
}
/// Reports the backend used for encoding.
///
/// Detection is still exercised (so the probing code stays covered), but no
/// SIMD candidate is dispatchable yet: the scalar backend is always returned.
#[must_use]
pub(crate) fn active_backend() -> ActiveBackend {
    // Discard the detected candidate explicitly; dispatch is not wired up yet.
    let _ = detected_candidate();
    ActiveBackend::Scalar
}
/// Returns the strongest implementation candidate available on this target.
///
/// Within each architecture family, checks run strongest-first, so the first
/// available candidate wins. Falls back to [`Candidate::Scalar`] when no SIMD
/// path is available (or on architectures with no SIMD prototype at all).
#[must_use]
pub(crate) fn detected_candidate() -> Candidate {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        // Strongest first: AVX-512 VBMI > AVX2 > SSSE3+SSE4.1.
        if avx512_vbmi_base64_available() {
            return Candidate::Avx512Vbmi;
        }
        if avx2_available() {
            return Candidate::Avx2;
        }
        if ssse3_sse41_available() {
            return Candidate::Ssse3Sse41;
        }
    }
    #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
    {
        if neon_available() {
            return Candidate::Neon;
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        if wasm_simd128_available() {
            return Candidate::WasmSimd128;
        }
    }
    Candidate::Scalar
}
/// Runtime detection of the full AVX-512 feature set the VBMI encode
/// prototype needs (std builds probe the CPU at runtime).
///
/// Short-circuits in the same order as the original conjunction.
#[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
fn avx512_vbmi_base64_available() -> bool {
    if !std::is_x86_feature_detected!("avx512f") {
        return false;
    }
    if !std::is_x86_feature_detected!("avx512bw") {
        return false;
    }
    if !std::is_x86_feature_detected!("avx512vl") {
        return false;
    }
    std::is_x86_feature_detected!("avx512vbmi")
}
/// Compile-time detection of the AVX-512 VBMI feature set for no_std builds,
/// where runtime CPUID probing is unavailable: only statically-enabled
/// target features count.
#[cfg(all(not(feature = "std"), any(target_arch = "x86", target_arch = "x86_64")))]
fn avx512_vbmi_base64_available() -> bool {
    cfg!(all(
        target_feature = "avx512f",
        target_feature = "avx512bw",
        target_feature = "avx512vl",
        target_feature = "avx512vbmi"
    ))
}
/// Runtime AVX2 detection (std builds probe the CPU at runtime).
#[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
fn avx2_available() -> bool {
    std::is_x86_feature_detected!("avx2")
}
/// Compile-time AVX2 detection for no_std builds, where runtime probing is
/// unavailable: only statically-enabled target features count.
#[cfg(all(not(feature = "std"), any(target_arch = "x86", target_arch = "x86_64")))]
fn avx2_available() -> bool {
    cfg!(target_feature = "avx2")
}
/// Runtime detection of the SSSE3 + SSE4.1 pair required by the 128-bit
/// encode prototype (std builds probe the CPU at runtime).
#[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
fn ssse3_sse41_available() -> bool {
    if !std::is_x86_feature_detected!("ssse3") {
        return false;
    }
    std::is_x86_feature_detected!("sse4.1")
}
/// Compile-time detection of the SSSE3 + SSE4.1 pair for no_std builds;
/// only statically-enabled target features count.
#[cfg(all(not(feature = "std"), any(target_arch = "x86", target_arch = "x86_64")))]
fn ssse3_sse41_available() -> bool {
    cfg!(all(target_feature = "ssse3", target_feature = "sse4.1"))
}
/// Compile-time NEON detection: aarch64 mandates NEON in its baseline ISA,
/// while 32-bit ARM needs the `neon` target feature enabled statically.
#[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
fn neon_available() -> bool {
    cfg!(any(target_arch = "aarch64", target_feature = "neon"))
}
/// Compile-time WASM SIMD128 detection; WebAssembly has no runtime feature
/// probing, so only a statically-enabled `simd128` target feature counts.
#[cfg(target_arch = "wasm32")]
fn wasm_simd128_available() -> bool {
    cfg!(target_feature = "simd128")
}
/// Encodes 48 input bytes into 64 base64 symbols using alphabet `A`.
///
/// Prototype: the only AVX-512 instruction exercised so far is the initial
/// zeroing store; the actual encoding is still the scalar 3-bytes-to-4-symbols
/// transform.
///
/// # Safety
///
/// The caller must ensure the CPU supports avx512f, avx512bw, avx512vl and
/// avx512vbmi (per the `#[target_feature]` attribute).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(dead_code, reason = "inactive prototype is not dispatchable yet")]
#[expect(
    clippy::cast_ptr_alignment,
    reason = "_mm512_storeu_si512 accepts unaligned pointers"
)]
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx512vbmi")]
pub(super) unsafe fn encode_48_bytes_avx512<A>(input: &[u8; 48], output: &mut [u8; 64])
where
    A: Alphabet,
{
    // Pre-zero the destination with one 64-byte store; every byte is then
    // overwritten by the loop below (16 groups x 4 symbols = 64 bytes).
    let zeros = _mm512_setzero_si512();
    // SAFETY: `output` is exactly 64 bytes — one __m512i — and the
    // unaligned-store intrinsic imposes no alignment requirement.
    unsafe {
        _mm512_storeu_si512(output.as_mut_ptr().cast::<__m512i>(), zeros);
    }
    // Scalar base64: each 3-byte group maps to 4 six-bit symbols.
    for (src, dst) in input.chunks_exact(3).zip(output.chunks_exact_mut(4)) {
        let (b0, b1, b2) = (src[0], src[1], src[2]);
        dst[0] = encode_base64_value::<A>(b0 >> 2);
        dst[1] = encode_base64_value::<A>(((b0 & 0x03) << 4) | (b1 >> 4));
        dst[2] = encode_base64_value::<A>(((b1 & 0x0F) << 2) | (b2 >> 6));
        dst[3] = encode_base64_value::<A>(b2 & 0x3F);
    }
}
/// Encodes 24 input bytes into 32 base64 symbols using alphabet `A`.
///
/// Prototype: the only AVX2 instruction exercised so far is the initial
/// zeroing store; the actual encoding is still the scalar 3-bytes-to-4-symbols
/// transform.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2 (per `#[target_feature]`).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(dead_code, reason = "inactive prototype is not dispatchable yet")]
#[expect(
    clippy::cast_ptr_alignment,
    reason = "_mm256_storeu_si256 accepts unaligned pointers"
)]
#[target_feature(enable = "avx2")]
pub(super) unsafe fn encode_24_bytes_avx2<A>(input: &[u8; 24], output: &mut [u8; 32])
where
    A: Alphabet,
{
    // Pre-zero the destination with one 32-byte store; every byte is then
    // overwritten by the loop below (8 groups x 4 symbols = 32 bytes).
    let zeros = _mm256_setzero_si256();
    // SAFETY: `output` is exactly 32 bytes — one __m256i — and the
    // unaligned-store intrinsic imposes no alignment requirement.
    unsafe {
        _mm256_storeu_si256(output.as_mut_ptr().cast::<__m256i>(), zeros);
    }
    // Scalar base64: each 3-byte group maps to 4 six-bit symbols.
    for (src, dst) in input.chunks_exact(3).zip(output.chunks_exact_mut(4)) {
        let (b0, b1, b2) = (src[0], src[1], src[2]);
        dst[0] = encode_base64_value::<A>(b0 >> 2);
        dst[1] = encode_base64_value::<A>(((b0 & 0x03) << 4) | (b1 >> 4));
        dst[2] = encode_base64_value::<A>(((b1 & 0x0F) << 2) | (b2 >> 6));
        dst[3] = encode_base64_value::<A>(b2 & 0x3F);
    }
}
/// Encodes 12 input bytes into 16 base64 symbols using alphabet `A`.
///
/// Prototype: the only SSE instruction exercised so far is the initial
/// zeroing store; the actual encoding is still the scalar 3-bytes-to-4-symbols
/// transform.
///
/// # Safety
///
/// The caller must ensure the CPU supports SSSE3 and SSE4.1 (per
/// `#[target_feature]`).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(dead_code, reason = "inactive prototype is not dispatchable yet")]
#[expect(
    clippy::cast_ptr_alignment,
    reason = "_mm_storeu_si128 accepts unaligned pointers"
)]
#[target_feature(enable = "ssse3,sse4.1")]
pub(super) unsafe fn encode_12_bytes_ssse3_sse41<A>(input: &[u8; 12], output: &mut [u8; 16])
where
    A: Alphabet,
{
    // Pre-zero the destination with one 16-byte store; every byte is then
    // overwritten by the loop below (4 groups x 4 symbols = 16 bytes).
    let zeros = _mm_setzero_si128();
    // SAFETY: `output` is exactly 16 bytes — one __m128i — and the
    // unaligned-store intrinsic imposes no alignment requirement.
    unsafe {
        _mm_storeu_si128(output.as_mut_ptr().cast::<__m128i>(), zeros);
    }
    // Scalar base64: each 3-byte group maps to 4 six-bit symbols.
    for (src, dst) in input.chunks_exact(3).zip(output.chunks_exact_mut(4)) {
        let (b0, b1, b2) = (src[0], src[1], src[2]);
        dst[0] = encode_base64_value::<A>(b0 >> 2);
        dst[1] = encode_base64_value::<A>(((b0 & 0x03) << 4) | (b1 >> 4));
        dst[2] = encode_base64_value::<A>(((b1 & 0x0F) << 2) | (b2 >> 6));
        dst[3] = encode_base64_value::<A>(b2 & 0x3F);
    }
}
/// Encodes 12 input bytes into 16 base64 symbols using alphabet `A`.
///
/// Prototype: the only NEON instructions exercised so far are the initial
/// zeroing store; the actual encoding is still the scalar 3-bytes-to-4-symbols
/// transform.
///
/// # Safety
///
/// Callable only where the cfg gate holds (aarch64, or 32-bit ARM compiled
/// with NEON); no additional caller invariants are required by the body.
#[cfg(any(
    target_arch = "aarch64",
    all(target_arch = "arm", target_feature = "neon")
))]
#[allow(dead_code, reason = "inactive prototype is not dispatchable yet")]
pub(super) unsafe fn encode_12_bytes_neon<A>(input: &[u8; 12], output: &mut [u8; 16])
where
    A: Alphabet,
{
    // SAFETY: `output` is exactly 16 bytes — one uint8x16_t — and vst1q_u8
    // imposes no alignment requirement. Every byte is then overwritten by
    // the loop below (4 groups x 4 symbols = 16 bytes).
    unsafe {
        let zeros: uint8x16_t = vdupq_n_u8(0);
        vst1q_u8(output.as_mut_ptr(), zeros);
    }
    // Scalar base64: each 3-byte group maps to 4 six-bit symbols.
    for (src, dst) in input.chunks_exact(3).zip(output.chunks_exact_mut(4)) {
        let (b0, b1, b2) = (src[0], src[1], src[2]);
        dst[0] = encode_base64_value::<A>(b0 >> 2);
        dst[1] = encode_base64_value::<A>(((b0 & 0x03) << 4) | (b1 >> 4));
        dst[2] = encode_base64_value::<A>(((b1 & 0x0F) << 2) | (b2 >> 6));
        dst[3] = encode_base64_value::<A>(b2 & 0x3F);
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Engine, Standard, UrlSafe};

    /// Fills `output` with a deterministic byte pattern derived from `seed`,
    /// so each test iteration exercises different input bytes without
    /// randomness.
    fn fill_pattern(output: &mut [u8], seed: usize) {
        for (index, byte) in output.iter_mut().enumerate() {
            // `% 256` keeps `value` in u8 range, so the conversion cannot fail.
            let value = (index * 73 + seed * 19) % 256;
            *byte = u8::try_from(value).unwrap();
        }
    }

    /// The AVX-512 prototype must agree byte-for-byte with the scalar engine
    /// for both alphabets.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[test]
    fn avx512_encode_prototype_matches_scalar_when_available() {
        // Gate on feature availability (equivalent to checking the detected
        // candidate, and consistent with the other x86 tests below).
        if !avx512_vbmi_base64_available() {
            return;
        }
        let mut input = [0; 48];
        for seed in 0..64 {
            fill_pattern(&mut input, seed);
            let mut avx512_standard = [0x55; 64];
            let mut scalar_standard = [0xaa; 64];
            // SAFETY: gated above on runtime availability of every feature
            // named in the function's #[target_feature] attribute.
            unsafe {
                encode_48_bytes_avx512::<Standard>(&input, &mut avx512_standard);
            }
            let scalar_len = Engine::<Standard, true>::new()
                .encode_slice(&input, &mut scalar_standard)
                .unwrap();
            assert_eq!(scalar_len, avx512_standard.len());
            assert_eq!(avx512_standard, scalar_standard);
            let mut avx512_url_safe = [0x55; 64];
            let mut scalar_url_safe = [0xaa; 64];
            // SAFETY: same availability gate as above.
            unsafe {
                encode_48_bytes_avx512::<UrlSafe>(&input, &mut avx512_url_safe);
            }
            let scalar_len = Engine::<UrlSafe, true>::new()
                .encode_slice(&input, &mut scalar_url_safe)
                .unwrap();
            assert_eq!(scalar_len, avx512_url_safe.len());
            assert_eq!(avx512_url_safe, scalar_url_safe);
        }
    }

    /// The AVX2 prototype must agree byte-for-byte with the scalar engine
    /// for both alphabets.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[test]
    fn avx2_encode_prototype_matches_scalar_when_available() {
        // Bug fix: this previously gated on
        // `detected_candidate() != Candidate::Avx2`, which silently skipped
        // the test on AVX-512-VBMI machines (where the candidate is
        // Avx512Vbmi) even though AVX2 is available and testable there.
        // Gate on availability instead, like the SSSE3 test does.
        if !avx2_available() {
            return;
        }
        let mut input = [0; 24];
        for seed in 0..64 {
            fill_pattern(&mut input, seed);
            let mut avx2_standard = [0x55; 32];
            let mut scalar_standard = [0xaa; 32];
            // SAFETY: gated above on runtime AVX2 availability, matching the
            // function's #[target_feature] attribute.
            unsafe {
                encode_24_bytes_avx2::<Standard>(&input, &mut avx2_standard);
            }
            let scalar_len = Engine::<Standard, true>::new()
                .encode_slice(&input, &mut scalar_standard)
                .unwrap();
            assert_eq!(scalar_len, avx2_standard.len());
            assert_eq!(avx2_standard, scalar_standard);
            let mut avx2_url_safe = [0x55; 32];
            let mut scalar_url_safe = [0xaa; 32];
            // SAFETY: same availability gate as above.
            unsafe {
                encode_24_bytes_avx2::<UrlSafe>(&input, &mut avx2_url_safe);
            }
            let scalar_len = Engine::<UrlSafe, true>::new()
                .encode_slice(&input, &mut scalar_url_safe)
                .unwrap();
            assert_eq!(scalar_len, avx2_url_safe.len());
            assert_eq!(avx2_url_safe, scalar_url_safe);
        }
    }

    /// The SSSE3/SSE4.1 prototype must agree byte-for-byte with the scalar
    /// engine for both alphabets.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[test]
    fn ssse3_sse41_encode_prototype_matches_scalar_when_available() {
        // Gate on availability, not on the detected candidate, so this path
        // is still tested on machines where AVX2/AVX-512 win detection.
        if !ssse3_sse41_available() {
            return;
        }
        let mut input = [0; 12];
        for seed in 0..64 {
            fill_pattern(&mut input, seed);
            let mut ssse3_standard = [0x55; 16];
            let mut scalar_standard = [0xaa; 16];
            // SAFETY: gated above on runtime SSSE3+SSE4.1 availability,
            // matching the function's #[target_feature] attribute.
            unsafe {
                encode_12_bytes_ssse3_sse41::<Standard>(&input, &mut ssse3_standard);
            }
            let scalar_len = Engine::<Standard, true>::new()
                .encode_slice(&input, &mut scalar_standard)
                .unwrap();
            assert_eq!(scalar_len, ssse3_standard.len());
            assert_eq!(ssse3_standard, scalar_standard);
            let mut ssse3_url_safe = [0x55; 16];
            let mut scalar_url_safe = [0xaa; 16];
            // SAFETY: same availability gate as above.
            unsafe {
                encode_12_bytes_ssse3_sse41::<UrlSafe>(&input, &mut ssse3_url_safe);
            }
            let scalar_len = Engine::<UrlSafe, true>::new()
                .encode_slice(&input, &mut scalar_url_safe)
                .unwrap();
            assert_eq!(scalar_len, ssse3_url_safe.len());
            assert_eq!(ssse3_url_safe, scalar_url_safe);
        }
    }

    /// The NEON prototype must agree byte-for-byte with the scalar engine
    /// for both alphabets.
    #[cfg(any(
        target_arch = "aarch64",
        all(target_arch = "arm", target_feature = "neon")
    ))]
    #[test]
    fn neon_encode_prototype_matches_scalar_when_available() {
        // Gate on availability (equivalent to the candidate check on ARM).
        if !neon_available() {
            return;
        }
        let mut input = [0; 12];
        for seed in 0..64 {
            fill_pattern(&mut input, seed);
            let mut neon_standard = [0x55; 16];
            let mut scalar_standard = [0xaa; 16];
            // SAFETY: gated above on NEON availability; the function declares
            // no further caller invariants.
            unsafe {
                encode_12_bytes_neon::<Standard>(&input, &mut neon_standard);
            }
            let scalar_len = Engine::<Standard, true>::new()
                .encode_slice(&input, &mut scalar_standard)
                .unwrap();
            assert_eq!(scalar_len, neon_standard.len());
            assert_eq!(neon_standard, scalar_standard);
            let mut neon_url_safe = [0x55; 16];
            let mut scalar_url_safe = [0xaa; 16];
            // SAFETY: same availability gate as above.
            unsafe {
                encode_12_bytes_neon::<UrlSafe>(&input, &mut neon_url_safe);
            }
            let scalar_len = Engine::<UrlSafe, true>::new()
                .encode_slice(&input, &mut scalar_url_safe)
                .unwrap();
            assert_eq!(scalar_len, neon_url_safe.len());
            assert_eq!(neon_url_safe, scalar_url_safe);
        }
    }
}