#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
// Fallback stub used when the `simd` feature is disabled.
// NOTE(review): this unconditionally reports AVX2 as available. If any
// caller gates the `unsafe` AVX2 entry points below on this value, running
// on a CPU without AVX2 would be undefined behavior — confirm this path is
// only compiled in configurations where the intrinsics are never dispatched.
#[cfg(not(feature = "simd"))]
fn is_avx2_available() -> bool { true }
#[cfg(feature = "simd")]
use crate::simd::dispatch::is_avx2_available;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
/// AVX2 implementation of one clock-mix round over the 16-word message:
/// an XOR/rotate diffusion pass followed by an S-box substitution pass.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 (e.g. by checking
/// `is_avx2_available()` first); calling this without AVX2 is undefined
/// behavior.
pub unsafe fn clock_mix_avx2_impl(message: &mut [u64; 16]) {
unsafe { clock_mix_xor_rotate_avx2(message) };
unsafe { clock_mix_sbox_avx2(message) };
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
/// XOR/rotate diffusion pass: each message word is XORed with its
/// successor word (wrapping around at index 15) rotated left by the
/// per-word amount in `ROTATION_SCHEDULE`.
///
/// # Safety
/// The caller must guarantee the CPU supports AVX2.
unsafe fn clock_mix_xor_rotate_avx2(message: &mut [u64; 16]) {
use crate::constants::ROTATION_SCHEDULE;
// Gather each word's successor: rotated[i] = message[(i + 1) % 16].
let mut rotated = [0u64; 16];
for i in 0..16 {
rotated[i] = message[(i + 1) % 16];
}
// Load the 16 successor words as four 4-lane vectors. Unaligned loads:
// a [u64; 16] on the stack has no 32-byte alignment guarantee.
let rot_ptr = rotated.as_ptr() as *const __m256i;
let mut rot0 = unsafe { _mm256_loadu_si256(rot_ptr) };
let mut rot1 = unsafe { _mm256_loadu_si256(rot_ptr.add(1)) };
let mut rot2 = unsafe { _mm256_loadu_si256(rot_ptr.add(2)) };
let mut rot3 = unsafe { _mm256_loadu_si256(rot_ptr.add(3)) };
// `_mm256_set_epi64x` takes its arguments high lane first, so
// ROTATION_SCHEDULE[0] lands in lane 0, matching rotated[0].
let rot_sched0 = _mm256_set_epi64x(
ROTATION_SCHEDULE[3] as i64,
ROTATION_SCHEDULE[2] as i64,
ROTATION_SCHEDULE[1] as i64,
ROTATION_SCHEDULE[0] as i64,
);
let rot_sched1 = _mm256_set_epi64x(
ROTATION_SCHEDULE[7] as i64,
ROTATION_SCHEDULE[6] as i64,
ROTATION_SCHEDULE[5] as i64,
ROTATION_SCHEDULE[4] as i64,
);
let rot_sched2 = _mm256_set_epi64x(
ROTATION_SCHEDULE[11] as i64,
ROTATION_SCHEDULE[10] as i64,
ROTATION_SCHEDULE[9] as i64,
ROTATION_SCHEDULE[8] as i64,
);
let rot_sched3 = _mm256_set_epi64x(
ROTATION_SCHEDULE[15] as i64,
ROTATION_SCHEDULE[14] as i64,
ROTATION_SCHEDULE[13] as i64,
ROTATION_SCHEDULE[12] as i64,
);
// Rotate every lane left by its schedule entry.
rot0 = unsafe { avx2_rotate_left_epi64(rot0, rot_sched0) };
rot1 = unsafe { avx2_rotate_left_epi64(rot1, rot_sched1) };
rot2 = unsafe { avx2_rotate_left_epi64(rot2, rot_sched2) };
rot3 = unsafe { avx2_rotate_left_epi64(rot3, rot_sched3) };
// XOR the rotated successors into the original message words.
let msg_ptr = message.as_ptr() as *const __m256i;
let mut msg0 = unsafe { _mm256_loadu_si256(msg_ptr) };
let mut msg1 = unsafe { _mm256_loadu_si256(msg_ptr.add(1)) };
let mut msg2 = unsafe { _mm256_loadu_si256(msg_ptr.add(2)) };
let mut msg3 = unsafe { _mm256_loadu_si256(msg_ptr.add(3)) };
msg0 = _mm256_xor_si256(msg0, rot0);
msg1 = _mm256_xor_si256(msg1, rot1);
msg2 = _mm256_xor_si256(msg2, rot2);
msg3 = _mm256_xor_si256(msg3, rot3);
// Store results back; `.add(4)` etc. advance in u64 units (4 per vector).
unsafe { _mm256_storeu_si256(message.as_mut_ptr() as *mut __m256i, msg0) };
unsafe { _mm256_storeu_si256(message.as_mut_ptr().add(4) as *mut __m256i, msg1) };
unsafe { _mm256_storeu_si256(message.as_mut_ptr().add(8) as *mut __m256i, msg2) };
unsafe { _mm256_storeu_si256(message.as_mut_ptr().add(12) as *mut __m256i, msg3) };
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
#[inline]
/// Per-lane rotate-left of four u64 lanes by the per-lane counts in `n`,
/// computed as `(x << n) | (x >> (64 - n))`.
///
/// For `n == 0` the right-shift count is 64. Unlike a scalar `>> 64`
/// (which would be UB/unspecified), the AVX2 variable shifts
/// (`_mm256_srlv_epi64`/`_mm256_sllv_epi64`) are defined to produce 0 for
/// counts greater than 63, so the result is correctly `x | 0 == x`.
///
/// # Safety
/// The caller must guarantee the CPU supports AVX2.
unsafe fn avx2_rotate_left_epi64(x: __m256i, n: __m256i) -> __m256i {
let sixty_four = _mm256_set1_epi64x(64);
let right_shift = _mm256_sub_epi64(sixty_four, n);
let left_shifted = _mm256_sllv_epi64(x, n);
let right_shifted = _mm256_srlv_epi64(x, right_shift);
_mm256_or_si256(left_shifted, right_shifted)
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
/// S-box substitution pass: the low byte of each message word selects an
/// entry of `SBOX`, and that (zero-extended) entry is added lane-wise
/// (wrapping, via the SIMD 64-bit add) to the word.
///
/// # Safety
/// The caller must guarantee the CPU supports AVX2. Indices are masked to
/// 0..=255, so `SBOX` is assumed to hold at least 256 entries —
/// NOTE(review): confirm against `crate::constants::SBOX`'s declared length.
unsafe fn clock_mix_sbox_avx2(message: &mut [u64; 16]) {
use crate::constants::SBOX;
let msg_ptr = message.as_ptr() as *const __m256i;
let mut msg0 = unsafe { _mm256_loadu_si256(msg_ptr) };
let mut msg1 = unsafe { _mm256_loadu_si256(msg_ptr.add(1)) };
let mut msg2 = unsafe { _mm256_loadu_si256(msg_ptr.add(2)) };
let mut msg3 = unsafe { _mm256_loadu_si256(msg_ptr.add(3)) };
// Keep only the low byte of each lane: the S-box index, always <= 255.
let mask_8bit = _mm256_set1_epi64x(0xFF);
let indices0 = _mm256_and_si256(msg0, mask_8bit);
let indices1 = _mm256_and_si256(msg1, mask_8bit);
let indices2 = _mm256_and_si256(msg2, mask_8bit);
let indices3 = _mm256_and_si256(msg3, mask_8bit);
let sbox_ptr = SBOX.as_ptr();
let sbox_vals0 = unsafe { avx2_gather_sbox(indices0, sbox_ptr) };
let sbox_vals1 = unsafe { avx2_gather_sbox(indices1, sbox_ptr) };
let sbox_vals2 = unsafe { avx2_gather_sbox(indices2, sbox_ptr) };
let sbox_vals3 = unsafe { avx2_gather_sbox(indices3, sbox_ptr) };
// Wrapping per-lane 64-bit addition of the substituted values.
msg0 = _mm256_add_epi64(msg0, sbox_vals0);
msg1 = _mm256_add_epi64(msg1, sbox_vals1);
msg2 = _mm256_add_epi64(msg2, sbox_vals2);
msg3 = _mm256_add_epi64(msg3, sbox_vals3);
unsafe { _mm256_storeu_si256(message.as_mut_ptr() as *mut __m256i, msg0) };
unsafe { _mm256_storeu_si256(message.as_mut_ptr().add(4) as *mut __m256i, msg1) };
unsafe { _mm256_storeu_si256(message.as_mut_ptr().add(8) as *mut __m256i, msg2) };
unsafe { _mm256_storeu_si256(message.as_mut_ptr().add(12) as *mut __m256i, msg3) };
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
#[inline]
/// Looks up the table entry for each of the four 64-bit lane indices and
/// returns the four zero-extended bytes as a new 4-lane vector.
///
/// The lane indices are spilled to a stack array instead of using
/// `_mm256_extract_epi64`: that intrinsic is provided only by
/// `core::arch::x86_64`, so the extract-based version cannot compile for
/// the `target_arch = "x86"` configuration this function is also built for.
///
/// # Safety
/// The caller must guarantee AVX2 is available and that every lane of
/// `indices` is a valid offset into the table behind `sbox_ptr` (the
/// callers mask each lane to 0..=255 before calling).
unsafe fn avx2_gather_sbox(indices: __m256i, sbox_ptr: *const u8) -> __m256i {
    // Spill the lanes: lane 0 lands in slot 0, lane 3 in slot 3.
    let mut lanes = [0u64; 4];
    unsafe { _mm256_storeu_si256(lanes.as_mut_ptr() as *mut __m256i, indices) };
    // SAFETY: caller guarantees each lane index is in bounds for the table.
    let sbox0 = unsafe { *sbox_ptr.add(lanes[0] as usize) } as i64;
    let sbox1 = unsafe { *sbox_ptr.add(lanes[1] as usize) } as i64;
    let sbox2 = unsafe { *sbox_ptr.add(lanes[2] as usize) } as i64;
    let sbox3 = unsafe { *sbox_ptr.add(lanes[3] as usize) } as i64;
    // `_mm256_set_epi64x` takes the high lane first: (lane3, .., lane0).
    _mm256_set_epi64x(sbox3, sbox2, sbox1, sbox0)
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
/// AVX2 entry point for the 8-word state permutation.
///
/// NOTE(review): despite the name and `#[target_feature]` attribute, this
/// currently just delegates to the scalar `clock_permute` — no AVX2
/// intrinsics are used. Presumably the attribute keeps the calling
/// convention consistent with the other AVX2 entry points; confirm whether
/// a vectorized permute was intended here.
///
/// # Safety
/// The caller must guarantee the CPU supports AVX2.
pub unsafe fn clock_permute_avx2(state: &mut [u64; 8]) {
crate::clockpermute::clock_permute(state);
}
#[cfg(test)]
mod tests {
    extern crate alloc;
    // `use super::*` resolves `is_avx2_available` under both feature
    // configurations: the dispatcher import when `simd` is enabled, or the
    // local stub when it is not. A direct
    // `use crate::simd::dispatch::is_avx2_available;` here would only
    // compile with the `simd` feature enabled.
    use super::*;
    use alloc::vec::Vec;

    /// SIMD and scalar clock-mix must agree on heap-backed (`Vec`) buffers.
    #[test]
    fn test_avx2_implementation_edge_cases() {
        if !is_avx2_available() {
            return;
        }
        let mut scalar_buf = Vec::from([0u64; 16]);
        for i in 0..16 {
            scalar_buf[i] = (i as u64).wrapping_mul(0x1111111111111111);
        }
        let mut simd_buf = scalar_buf.clone();
        let scalar_words: &mut [u64; 16] = scalar_buf.as_mut_slice().try_into().unwrap();
        let simd_words: &mut [u64; 16] = simd_buf.as_mut_slice().try_into().unwrap();
        crate::simd::scalar::scalar_clock_mix(scalar_words);
        unsafe { clock_mix_avx2_impl(simd_words) };
        assert_eq!(scalar_buf, simd_buf);
    }

    /// AVX2 mix changes the state and matches the scalar reference.
    #[test]
    fn test_avx2_target_feature_safety() {
        if !is_avx2_available() {
            return;
        }
        let mut data = [0x123456789ABCDEF0u64; 16];
        let original = data;
        unsafe { clock_mix_avx2_impl(&mut data) };
        assert_ne!(data, original);
        let mut scalar_data = original;
        crate::simd::scalar::scalar_clock_mix(&mut scalar_data);
        assert_eq!(data, scalar_data);
    }

    /// Degenerate inputs (all-zero, all-ones, alternating) must still mix.
    #[test]
    fn test_avx2_edge_cases() {
        if !is_avx2_available() {
            return;
        }
        let mut zeros = [0u64; 16];
        let original_zeros = zeros;
        unsafe { clock_mix_avx2_impl(&mut zeros) };
        assert_ne!(zeros, original_zeros);
        let mut ones = [u64::MAX; 16];
        let original_ones = ones;
        unsafe { clock_mix_avx2_impl(&mut ones) };
        assert_ne!(ones, original_ones);
        let mut alternating = [0u64; 16];
        for i in 0..16 {
            alternating[i] = if i % 2 == 0 { 0 } else { u64::MAX };
        }
        let original_alternating = alternating;
        unsafe { clock_mix_avx2_impl(&mut alternating) };
        assert_ne!(alternating, original_alternating);
    }

    /// Single-bit inputs: the mixed output differs and matches scalar.
    #[test]
    fn test_avx2_boundary_values() {
        if !is_avx2_available() {
            return;
        }
        let mut data = [0u64; 16];
        for i in 0..16 {
            data[i] = 1u64 << (i % 64);
        }
        let original = data;
        unsafe { clock_mix_avx2_impl(&mut data) };
        assert_ne!(data, original);
        let mut scalar_data = original;
        crate::simd::scalar::scalar_clock_mix(&mut scalar_data);
        assert_eq!(data, scalar_data);
    }

    /// AVX2 permute agrees with the scalar `clock_permute` on several states.
    #[test]
    fn test_avx2_clock_permute_edge_cases() {
        if !is_avx2_available() {
            return;
        }
        let test_states = [
            [u64::MAX; 8],
            [1, 2, 3, 4, 5, 6, 7, 8],
            [8, 7, 6, 5, 4, 3, 2, 1],
        ];
        for mut state in test_states {
            let original = state;
            unsafe { clock_permute_avx2(&mut state) };
            assert_ne!(state, original);
            let mut scalar_state = original;
            crate::clockpermute::clock_permute(&mut scalar_state);
            assert_eq!(state, scalar_state);
        }
    }

    /// The S-box pass alone must change boundary byte values 0, 128, 255.
    #[test]
    fn test_avx2_sbox_edge_cases() {
        if !is_avx2_available() {
            return;
        }
        let test_values = [
            [0u64; 16],
            [255u64; 16],
            [128u64; 16],
        ];
        for mut data in test_values {
            let original = data;
            unsafe { clock_mix_sbox_avx2(&mut data) };
            assert_ne!(data, original);
        }
    }
}