// clock-hash 1.0.0

//! AVX-512-accelerated implementations for ClockHash operations
//!
//! This module provides AVX-512 SIMD implementations of ClockMix operations
//! for maximum performance on x86_64 and x86 architectures with AVX-512 support.

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;

/// Runs one full ClockMix pass over `message` using AVX-512 instructions.
///
/// The pass is made of two in-place stages, executed in this order:
///
/// 1. `clock_mix_xor_rotate_avx512` - XOR every word with its rotated neighbor.
/// 2. `clock_mix_sbox_avx512` - add an S-box value selected by each word's low byte.
///
/// All 16 words are processed as two 512-bit halves.
///
/// # Arguments
///
/// * `message` - The 16-word state, updated in place.
///
/// # Safety
///
/// The caller must have verified (for example through runtime feature
/// detection) that the executing CPU supports AVX-512F and AVX-512BW.
/// Calling this function on a CPU without those features is undefined
/// behavior. The helpers use unaligned loads and stores, so `message`
/// needs no alignment beyond that of `u64`.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx512f,avx512bw")]
pub unsafe fn clock_mix_avx512_impl(message: &mut [u64; 16]) {
    // SAFETY: both helpers require exactly the CPU features this function
    // itself requires, and the caller has promised those are available.
    unsafe {
        // Stage 1: neighbor rotate + XOR.
        clock_mix_xor_rotate_avx512(message);
        // Stage 2: S-box lookup + wrapping add.
        clock_mix_sbox_avx512(message);
    }
}

/// AVX-512 implementation of ClockMix XOR-with-rotated-neighbor step
///
/// Uses AVX-512 SIMD operations to handle all 16 u64 values in parallel.
/// Processes the circular rotation and variable rotations using 512-bit registers.
///
/// # Arguments
///
/// * `message` - Mutable reference to 16 u64 words to process in-place
///
/// # Safety
///
/// This function is unsafe because it uses AVX-512 SIMD instructions.
/// The caller must ensure AVX-512F and AVX-512BW are available.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn clock_mix_xor_rotate_avx512(message: &mut [u64; 16]) {
    use crate::constants::ROTATION_SCHEDULE;

    // Load all 16 u64 values into two 512-bit registers
    let msg_ptr = message.as_ptr() as *const __m512i;
    let mut msg0 = unsafe { _mm512_loadu_si512(msg_ptr) }; // message[0..7]
    let mut msg1 = unsafe { _mm512_loadu_si512(msg_ptr.add(1)) }; // message[8..15]

    // Create rotated versions for the circular dependency: rot[i] = message[(i+1) % 16]
    // For AVX-512, we need to handle the cross-register rotation carefully

    // Extract the last element from msg0 and first element from msg1 for rotation
    let last_of_msg0 = _mm512_extracti64x4_epi64(msg0, 1); // Extract upper 256 bits (elements 4-7)
    let _last_element = _mm256_extract_epi64(last_of_msg0, 3) as u64; // Extract element 7

    let first_of_msg1 = _mm512_castsi512_si256(msg1); // Extract lower 256 bits (elements 8-11)
    let _first_element = _mm256_extract_epi64(first_of_msg1, 0) as u64; // Extract element 8

    // Create rotated registers: rot0 contains elements [1,2,3,4,5,6,7,8]
    // rot1 contains elements [9,10,11,12,13,14,15,0]
    let rot0 = _mm512_alignr_epi64(msg0, msg1, 1); // Align right by 1 element
    let rot1_temp = _mm512_alignr_epi64(msg1, msg0, 1);

    // Fix the wrap-around elements
    // For rot0[7] (which should be message[8]), we need to insert message[8]
    let msg1_lower = _mm512_castsi512_si256(msg1);
    let element_8 = _mm256_extract_epi64(msg1_lower, 0);
    let _element_8_broadcast = _mm512_set1_epi64(element_8 as i64);

    // Insert element_8 into rot0 at position 7
    let rot0 = _mm512_mask_set1_epi64(rot0, 0x80, element_8 as i64);

    // For rot1[7] (which should be message[0]), we need to insert message[0]
    let msg0_lower = _mm512_castsi512_si256(msg0);
    let element_0 = _mm256_extract_epi64(msg0_lower, 0);
    let rot1 = _mm512_mask_set1_epi64(rot1_temp, 0x80, element_0 as i64);

    // Now apply the variable rotations from ROTATION_SCHEDULE
    // Load rotation schedule into AVX-512 registers
    let rot_sched0 = _mm512_set_epi64(
        ROTATION_SCHEDULE[7] as i64,
        ROTATION_SCHEDULE[6] as i64,
        ROTATION_SCHEDULE[5] as i64,
        ROTATION_SCHEDULE[4] as i64,
        ROTATION_SCHEDULE[3] as i64,
        ROTATION_SCHEDULE[2] as i64,
        ROTATION_SCHEDULE[1] as i64,
        ROTATION_SCHEDULE[0] as i64,
    );
    let rot_sched1 = _mm512_set_epi64(
        ROTATION_SCHEDULE[15] as i64,
        ROTATION_SCHEDULE[14] as i64,
        ROTATION_SCHEDULE[13] as i64,
        ROTATION_SCHEDULE[12] as i64,
        ROTATION_SCHEDULE[11] as i64,
        ROTATION_SCHEDULE[10] as i64,
        ROTATION_SCHEDULE[9] as i64,
        ROTATION_SCHEDULE[8] as i64,
    );

    // Apply variable rotations to the rotated values
    let rot0_rotated = unsafe { avx512_rotate_left_epi64(rot0, rot_sched0) };
    let rot1_rotated = unsafe { avx512_rotate_left_epi64(rot1, rot_sched1) };

    // XOR with the rotated and rotated values
    msg0 = _mm512_xor_si512(msg0, rot0_rotated);
    msg1 = _mm512_xor_si512(msg1, rot1_rotated);

    // Store results back to message array
    unsafe { _mm512_storeu_si512(message.as_mut_ptr() as *mut __m512i, msg0) };
    unsafe { _mm512_storeu_si512(message.as_mut_ptr().add(8) as *mut __m512i, msg1) };
}

/// Lane-wise variable left rotate of eight 64-bit elements.
///
/// Each 64-bit lane of `x` is rotated left by the bit count held in the
/// corresponding lane of `n`, using the native AVX-512F variable rotate
/// (`_mm512_rolv_epi64`, instruction `VPROLVQ`). This replaces the
/// shift/shift/or emulation `(x << n) | (x >> (64 - n))` with a single
/// instruction.
///
/// The rotate count is taken modulo 64, so per lane the result equals
/// `u64::rotate_left(x, n as u32)`; in particular a count of 0 or 64 returns
/// the lane unchanged.
///
/// # Arguments
///
/// * `x` - Eight packed 64-bit values to rotate.
/// * `n` - Eight packed 64-bit rotate counts, one per lane.
///
/// # Safety
///
/// This function is unsafe because it uses AVX-512 SIMD instructions.
/// The caller must ensure AVX-512F and AVX-512BW are available.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx512f,avx512bw")]
#[inline]
unsafe fn avx512_rotate_left_epi64(x: __m512i, n: __m512i) -> __m512i {
    _mm512_rolv_epi64(x, n)
}

/// S-box substitution step of ClockMix, vectorized with AVX-512.
///
/// The 16-word state is handled as two independent groups of eight words.
/// For each group the low 8 bits of every word are used as an index that is
/// handed to `avx512_gather_sbox`, and the value it returns for that lane is
/// added to the word with wrapping 64-bit addition. The state is updated in
/// place.
///
/// # Arguments
///
/// * `message` - The 16-word state, updated in place.
///
/// # Safety
///
/// This function is unsafe because it uses AVX-512 SIMD instructions.
/// The caller must ensure AVX-512F and AVX-512BW are available.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn clock_mix_sbox_avx512(message: &mut [u64; 16]) {
    use crate::constants::SBOX;

    // Base address of the table in the pointer type the gather helper expects.
    let table = SBOX.as_ptr() as *const u64;
    // Selects the low byte of every 64-bit lane.
    let low_byte = _mm512_set1_epi64(0xFF);

    // The operation is purely lane-wise, so each 8-word half can be loaded,
    // transformed and written back independently of the other.
    for half in message.chunks_exact_mut(8) {
        // SAFETY: `half` is exactly eight u64 words (64 bytes) and the load
        // is an unaligned one.
        let words = unsafe { _mm512_loadu_si512(half.as_ptr() as *const __m512i) };

        let lookup_index = _mm512_and_si512(words, low_byte);
        // SAFETY: the helper needs the same CPU features as this function and
        // receives the base pointer of `SBOX`.
        let substituted = unsafe { avx512_gather_sbox(lookup_index, table) };

        // 64-bit lane addition wraps on overflow.
        let mixed = _mm512_add_epi64(words, substituted);

        // SAFETY: same 64-byte region as the load, uniquely borrowed.
        unsafe { _mm512_storeu_si512(half.as_mut_ptr() as *mut __m512i, mixed) };
    }
}

/// AVX-512 gather operation for S-box lookups.
///
/// Looks up eight single-byte table entries and returns each one
/// zero-extended to 64 bits, i.e. lane `k` of the result is
/// `table[indices[k] & 0xFF] as u64`, where `table` is the byte table that
/// starts at `sbox_ptr`.
///
/// The hardware gather can only load 32-bit (or wider) elements, so a byte
/// lookup is performed as follows:
///
/// 1. gather the 4-byte group that contains the wanted byte, addressed at
///    `index & 0xFC` (so the load never extends past byte 255 of the table),
/// 2. shift the group right by `8 * (index & 3)` bits to move the wanted byte
///    to the bottom (x86 is little-endian),
/// 3. mask the lane down to 8 bits.
///
/// Returning the raw 32-bit gather result instead would add the three
/// following table bytes into bits 8..32 of every lane, and a 4-byte load at
/// indices 253..=255 would read past the end of a 256-byte table.
///
/// # Arguments
///
/// * `indices` - Eight packed 64-bit indices; only the low 8 bits of each
///   lane are used.
/// * `sbox_ptr` - Base address of the table. It is typed `*const u64` but the
///   table is addressed in bytes (gather scale 1).
///
/// # Safety
///
/// This function is unsafe because it uses AVX-512 SIMD instructions and raw pointer operations.
/// The caller must ensure AVX-512F and AVX-512BW are available and that
/// `sbox_ptr` points to at least 256 readable bytes.
///
/// NOTE(review): assumes `SBOX` is a table of at least 256 one-byte entries —
/// confirm against `crate::constants::SBOX`.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx512f,avx512bw")]
#[inline]
unsafe fn avx512_gather_sbox(indices: __m512i, sbox_ptr: *const u64) -> __m512i {
    // Byte offset of the 4-byte group holding each requested entry (0..=252).
    let group_offset = _mm512_and_si512(indices, _mm512_set1_epi64(0xFC));
    // Position of the requested byte inside its group, converted to bits.
    let bit_shift = _mm512_slli_epi64::<3>(_mm512_and_si512(indices, _mm512_set1_epi64(0x03)));

    // `group_offset` holds eight 64-bit lanes whose upper halves are zero.
    // The 32-bit-index gather sees that as sixteen offsets
    // `[o0, 0, o1, 0, ...]`, so the low half of every 64-bit result lane is
    // the wanted group and the high half is the group at offset 0.
    // SAFETY: every offset is <= 252, so each 4-byte read stays within the
    // first 256 bytes behind `sbox_ptr`, which the caller guarantees are
    // readable. Gathers have no alignment requirement.
    let groups = unsafe { _mm512_i32gather_epi32::<1>(group_offset, sbox_ptr as *const i32) };

    // The shift is at most 24 bits, so after masking only bits that came from
    // the low (wanted) 32-bit half of each lane can remain.
    _mm512_and_si512(
        _mm512_srlv_epi64(groups, bit_shift),
        _mm512_set1_epi64(0xFF),
    )
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::simd::dispatch::is_avx512_available;

    /// Runs the AVX-512 ClockMix on a fixed input and compares it with the
    /// scalar implementation. Does nothing on machines without AVX-512.
    ///
    /// NOTE(review): all 16 input words are identical, so this test cannot
    /// detect a mix-up in which neighbor word each lane is combined with.
    #[test]
    fn test_avx512_target_feature_safety() {
        if is_avx512_available() {
            let input = [0xFEDCBA9876543210u64; 16];

            // Must execute without faulting and must change the state.
            let mut simd_state = input;
            // SAFETY: AVX-512 availability was checked just above.
            unsafe { clock_mix_avx512_impl(&mut simd_state) };
            assert_ne!(simd_state, input);

            // The scalar path is the reference result.
            let mut reference_state = input;
            crate::simd::scalar::scalar_clock_mix(&mut reference_state);
            assert_eq!(simd_state, reference_state);
        }
        // Without AVX-512 there is nothing to check (no output is printed,
        // for no_std compatibility).
    }

    /// Checks that the feature-detection helpers are mutually consistent:
    /// AVX-512 is never reported without AVX2.
    #[test]
    fn test_avx512_feature_logic() {
        let has_avx512 = is_avx512_available();
        let has_avx2 = crate::simd::dispatch::is_avx2_available();

        assert!(!has_avx512 || has_avx2, "AVX-512 requires AVX2");
    }
}