#[cfg(any(feature = "std", feature = "alloc"))]
extern crate alloc;
#[cfg(feature = "simd")]
use alloc::vec::Vec;
use core::mem::size_of;
#[cfg(feature = "simd")]
use core::simd::{
u64x2,
u64x4,
u64x8,
};
use crate::{
LaneSize,
PLEN,
keccak_p,
};
/// Tuning knobs for the SIMD code paths in this module.
///
/// Within this file only `bounds_check` and `side_channel_protection` are
/// consulted (by the `*_secure` methods of `AdvancedLaneSize`); `max_width`
/// and `cache_optimized` are carried along but not read here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SimdConfig {
    /// Lane-count setting: 2, 4 or 8 in the presets below.
    /// NOTE(review): not read anywhere in this file — confirm its consumer.
    pub max_width: usize,
    /// When set, the `*_secure` entry points reject an out-of-range round
    /// count or an input shorter than one SIMD vector before doing any work.
    pub bounds_check: bool,
    /// NOTE(review): not read anywhere in this file — confirm its consumer.
    pub cache_optimized: bool,
    /// When set, the `*_secure` entry points run the `SimdSecurityValidator`
    /// hooks on the state (and on the input, for absorbing).
    pub side_channel_protection: bool,
}

impl Default for SimdConfig {
    /// Width 4 with every flag enabled.
    fn default() -> Self {
        Self {
            max_width: 4,
            bounds_check: true,
            cache_optimized: true,
            side_channel_protection: true,
        }
    }
}

impl SimdConfig {
    /// Same flags as [`Default`], but with the width reduced to 2.
    pub fn security_optimized() -> Self {
        Self {
            max_width: 2,
            ..Self::default()
        }
    }

    /// Width 8 with bounds checking and side-channel protection switched off.
    pub fn performance_optimized() -> Self {
        Self {
            max_width: 8,
            bounds_check: false,
            cache_optimized: true,
            side_channel_protection: false,
        }
    }
}
/// Namespace for the hooks that the `*_secure` SIMD entry points call before
/// touching a state or its input.
#[cfg(feature = "simd")]
pub struct SimdSecurityValidator;

#[cfg(feature = "simd")]
impl SimdSecurityValidator {
    /// Hook for validating a lane array before it is permuted.
    ///
    /// As written this is a placeholder: it walks every lane without
    /// inspecting it and always returns `Ok(())`.
    pub fn validate_simd_state<T: LaneSize>(state: &[T; PLEN]) -> Result<(), &'static str> {
        state.iter().for_each(|_lane| {});
        Ok(())
    }

    /// Hook for sanitising input before it is absorbed.
    ///
    /// As written this returns an owned, byte-for-byte copy of `data`; no
    /// bytes are altered or removed.
    pub fn sanitize_input(data: &[u8]) -> Vec<u8> {
        data.to_vec()
    }
}
/// Lane types that carry several independent Keccak states, one per SIMD
/// lane, and can permute / absorb into all of them at once.
#[cfg(feature = "simd")]
pub trait AdvancedLaneSize: LaneSize {
    /// Number of lanes in the vector type (2, 4 and 8 for the
    /// implementations in this file).
    const SIMD_WIDTH: usize;

    /// Runs [`parallel_keccak_p`](Self::parallel_keccak_p) after the checks
    /// selected by `config`.
    ///
    /// # Errors
    ///
    /// With `config.bounds_check` set, returns the error produced by
    /// [`validate_bounds`](Self::validate_bounds). With
    /// `config.side_channel_protection` set, propagates any error from
    /// `SimdSecurityValidator::validate_simd_state`. When `bounds_check` is
    /// off, an out-of-range `round_count` is not reported here; what happens
    /// then is up to the `parallel_keccak_p` implementation.
    fn parallel_keccak_p_secure(
        states: &mut [Self; PLEN],
        round_count: usize,
        config: &SimdConfig,
    ) -> Result<(), &'static str> {
        if config.bounds_check {
            Self::validate_bounds(states, round_count)?;
        }
        if config.side_channel_protection {
            SimdSecurityValidator::validate_simd_state(states)?;
        }
        Self::parallel_keccak_p(states, round_count);
        Ok(())
    }

    /// Applies `round_count` rounds of the permutation to every lane of
    /// `states` in place.
    fn parallel_keccak_p(states: &mut [Self; PLEN], round_count: usize);

    /// Checks that `round_count` lies in `1..=Self::KECCAK_F_ROUND_COUNT`.
    ///
    /// # Errors
    ///
    /// Returns a static message when `round_count` is zero or larger than
    /// `Self::KECCAK_F_ROUND_COUNT`. The state itself is not inspected.
    fn validate_bounds(_states: &[Self; PLEN], round_count: usize) -> Result<(), &'static str> {
        if round_count == 0 {
            return Err("Round count cannot be zero");
        }
        if round_count > Self::KECCAK_F_ROUND_COUNT {
            return Err("Round count exceeds maximum allowed");
        }
        Ok(())
    }

    /// Runs [`fast_parallel_absorb`](Self::fast_parallel_absorb) after the
    /// checks selected by `config`, returning the number of bytes consumed.
    ///
    /// # Errors
    ///
    /// With `config.bounds_check` set, fails when `data` is shorter than one
    /// SIMD vector (`size_of::<Self>()` bytes). With
    /// `config.side_channel_protection` set, propagates any error from
    /// `SimdSecurityValidator::validate_simd_state`.
    fn fast_parallel_absorb_secure(
        state: &mut [Self; PLEN],
        data: &[u8],
        config: &SimdConfig,
    ) -> Result<usize, &'static str> {
        if config.bounds_check && data.len() < size_of::<Self>() {
            return Err("Input data too small for SIMD processing");
        }
        if config.side_channel_protection {
            SimdSecurityValidator::validate_simd_state(state)?;
            let sanitized = SimdSecurityValidator::sanitize_input(data);
            return Ok(Self::fast_parallel_absorb(state, &sanitized));
        }
        // No sanitising requested: absorb straight from the caller's slice
        // instead of first copying it to the heap.
        Ok(Self::fast_parallel_absorb(state, data))
    }

    /// Absorbs as many whole SIMD vectors from `data` as fit and returns the
    /// number of bytes consumed.
    fn fast_parallel_absorb(state: &mut [Self; PLEN], data: &[u8]) -> usize;
}
#[cfg(feature = "simd")]
impl AdvancedLaneSize for u64x2 {
    const SIMD_WIDTH: usize = 2;

    /// Permutes both lanes of `states` with the last `round_count` rounds.
    ///
    /// A `round_count` of zero or above `Self::KECCAK_F_ROUND_COUNT` leaves
    /// `states` untouched and returns silently; use
    /// `parallel_keccak_p_secure` to get an error instead.
    ///
    /// The work is delegated to the crate-level `keccak_p`, the same routine
    /// `fast_parallel_absorb` below already runs on this lane type. The
    /// earlier hand-written loop applied only the theta, chi and iota steps
    /// and skipped rho and pi, so it did not compute Keccak-p.
    fn parallel_keccak_p(states: &mut [Self; PLEN], round_count: usize) {
        if round_count == 0 || round_count > Self::KECCAK_F_ROUND_COUNT {
            return;
        }
        keccak_p(states, round_count);
    }

    /// Absorbs `data` in 16-byte blocks: each block is read as two
    /// little-endian `u64` words (one per lane), XORed into `state[0]`, and
    /// followed by a 24-round `keccak_p`.
    ///
    /// Returns the number of bytes consumed, always a multiple of 16; a
    /// trailing partial block is left for the caller.
    fn fast_parallel_absorb(state: &mut [Self; PLEN], data: &[u8]) -> usize {
        const WORD_BYTES: usize = 8;
        let block_bytes = size_of::<Self>();
        let mut absorbed = 0;
        for block in data.chunks_exact(block_bytes) {
            let mut words = [0u64; 2];
            for (word, bytes) in words.iter_mut().zip(block.chunks_exact(WORD_BYTES)) {
                let mut le = [0u8; WORD_BYTES];
                le.copy_from_slice(bytes);
                *word = u64::from_le_bytes(le);
            }
            state[0] ^= u64x2::from_array(words);
            keccak_p(state, 24);
            absorbed += block_bytes;
        }
        absorbed
    }
}
#[cfg(feature = "simd")]
impl AdvancedLaneSize for u64x4 {
    const SIMD_WIDTH: usize = 4;

    /// Permutes all four lanes of `states` with the last `round_count`
    /// rounds.
    ///
    /// A `round_count` of zero or above `Self::KECCAK_F_ROUND_COUNT` leaves
    /// `states` untouched and returns silently; use
    /// `parallel_keccak_p_secure` to get an error instead.
    ///
    /// The work is delegated to the crate-level `keccak_p`, the same routine
    /// `fast_parallel_absorb` below already runs on this lane type. The
    /// earlier hand-written loop applied only the theta, chi and iota steps
    /// and skipped rho and pi, so it did not compute Keccak-p.
    fn parallel_keccak_p(states: &mut [Self; PLEN], round_count: usize) {
        if round_count == 0 || round_count > Self::KECCAK_F_ROUND_COUNT {
            return;
        }
        keccak_p(states, round_count);
    }

    /// Absorbs `data` in 32-byte blocks: each block is read as four
    /// little-endian `u64` words (one per lane), XORed into `state[0]`, and
    /// followed by a 24-round `keccak_p`.
    ///
    /// Returns the number of bytes consumed, always a multiple of 32; a
    /// trailing partial block is left for the caller.
    fn fast_parallel_absorb(state: &mut [Self; PLEN], data: &[u8]) -> usize {
        const WORD_BYTES: usize = 8;
        let block_bytes = size_of::<Self>();
        let mut absorbed = 0;
        for block in data.chunks_exact(block_bytes) {
            let mut words = [0u64; 4];
            for (word, bytes) in words.iter_mut().zip(block.chunks_exact(WORD_BYTES)) {
                let mut le = [0u8; WORD_BYTES];
                le.copy_from_slice(bytes);
                *word = u64::from_le_bytes(le);
            }
            state[0] ^= u64x4::from_array(words);
            keccak_p(state, 24);
            absorbed += block_bytes;
        }
        absorbed
    }
}
#[cfg(feature = "simd")]
impl AdvancedLaneSize for u64x8 {
    const SIMD_WIDTH: usize = 8;

    /// Permutes all eight lanes of `states` with the last `round_count`
    /// rounds.
    ///
    /// A `round_count` of zero or above `Self::KECCAK_F_ROUND_COUNT` leaves
    /// `states` untouched and returns silently; use
    /// `parallel_keccak_p_secure` to get an error instead.
    ///
    /// The work is delegated to the crate-level `keccak_p`, the same routine
    /// `fast_parallel_absorb` below already runs on this lane type. The
    /// earlier hand-written loop applied only the theta, chi and iota steps
    /// and skipped rho and pi, so it did not compute Keccak-p.
    fn parallel_keccak_p(states: &mut [Self; PLEN], round_count: usize) {
        if round_count == 0 || round_count > Self::KECCAK_F_ROUND_COUNT {
            return;
        }
        keccak_p(states, round_count);
    }

    /// Absorbs `data` in 64-byte blocks: each block is read as eight
    /// little-endian `u64` words (one per lane), XORed into `state[0]`, and
    /// followed by a 24-round `keccak_p`.
    ///
    /// Returns the number of bytes consumed, always a multiple of 64; a
    /// trailing partial block is left for the caller.
    fn fast_parallel_absorb(state: &mut [Self; PLEN], data: &[u8]) -> usize {
        const WORD_BYTES: usize = 8;
        let block_bytes = size_of::<Self>();
        let mut absorbed = 0;
        for block in data.chunks_exact(block_bytes) {
            let mut words = [0u64; 8];
            for (word, bytes) in words.iter_mut().zip(block.chunks_exact(WORD_BYTES)) {
                let mut le = [0u8; WORD_BYTES];
                le.copy_from_slice(bytes);
                *word = u64::from_le_bytes(le);
            }
            state[0] ^= u64x8::from_array(words);
            keccak_p(state, 24);
            absorbed += block_bytes;
        }
        absorbed
    }
}
/// Helpers that permute several independent 25-word states in one pass by
/// packing word `i` of every state into the lanes of a single SIMD vector.
#[cfg(feature = "simd")]
pub mod parallel {
    use super::*;

    /// Runs `u64x2::parallel_keccak_p` with 24 rounds over two states at
    /// once and writes the results back into `states`.
    pub fn p1600_parallel_2x(states: &mut [[u64; 25]; 2]) {
        // Gather: lane k of vector i holds word i of state k.
        let mut lanes = [u64x2::splat(0); 25];
        for (i, lane) in lanes.iter_mut().enumerate() {
            *lane = u64x2::from_array([states[0][i], states[1][i]]);
        }

        u64x2::parallel_keccak_p(&mut lanes, 24);

        // Scatter the permuted words back to their owning state.
        for (i, lane) in lanes.iter().enumerate() {
            let [first, second] = lane.to_array();
            states[0][i] = first;
            states[1][i] = second;
        }
    }

    /// Runs `u64x4::parallel_keccak_p` with 24 rounds over four states at
    /// once and writes the results back into `states`.
    pub fn p1600_parallel_4x(states: &mut [[u64; 25]; 4]) {
        let mut lanes = [u64x4::splat(0); 25];
        for (i, lane) in lanes.iter_mut().enumerate() {
            let mut words = [0u64; 4];
            for (word, single) in words.iter_mut().zip(states.iter()) {
                *word = single[i];
            }
            *lane = u64x4::from_array(words);
        }

        u64x4::parallel_keccak_p(&mut lanes, 24);

        for (i, lane) in lanes.iter().enumerate() {
            let words = lane.to_array();
            for (single, &word) in states.iter_mut().zip(words.iter()) {
                single[i] = word;
            }
        }
    }

    /// Runs `u64x8::parallel_keccak_p` with 24 rounds over eight states at
    /// once and writes the results back into `states`.
    pub fn p1600_parallel_8x(states: &mut [[u64; 25]; 8]) {
        let mut lanes = [u64x8::splat(0); 25];
        for (i, lane) in lanes.iter_mut().enumerate() {
            let mut words = [0u64; 8];
            for (word, single) in words.iter_mut().zip(states.iter()) {
                *word = single[i];
            }
            *lane = u64x8::from_array(words);
        }

        u64x8::parallel_keccak_p(&mut lanes, 24);

        for (i, lane) in lanes.iter().enumerate() {
            let words = lane.to_array();
            for (single, &word) in states.iter_mut().zip(words.iter()) {
                single[i] = word;
            }
        }
    }
}
/// Absorbs `data` into `state` eight bytes at a time and returns the number
/// of bytes consumed.
///
/// Each 8-byte group is read as a little-endian `u64`, XORed into
/// `state[0]`, and followed by `crate::p1600(state, 24)`. The return value is
/// the largest multiple of 8 not exceeding `data.len()`; trailing bytes are
/// left for the caller.
///
/// `parallelism` is accepted for API compatibility but no longer selects a
/// different code path. The former 2/4/8-wide branches broadcast the state
/// into every lane, XORed a *different* input word into each lane, and then
/// copied back lane 0 only. The words routed to the other lanes were counted
/// as consumed yet never influenced the returned state, and the result
/// depended on `parallelism`. Absorbing into a single state is sequential
/// (every block depends on the permuted result of the previous one), so all
/// widths now use the sequential path that was previously the fallback.
#[cfg(feature = "simd")]
pub fn fast_loop_absorb_advanced(state: &mut [u64; 25], data: &[u8], parallelism: usize) -> usize {
    // Kept in the signature for existing callers; see the doc comment.
    let _ = parallelism;

    const WORD_BYTES: usize = 8;
    let mut consumed = 0;
    for chunk in data.chunks_exact(WORD_BYTES) {
        let mut le = [0u8; WORD_BYTES];
        le.copy_from_slice(chunk);
        state[0] ^= u64::from_le_bytes(le);
        crate::p1600(state, 24);
        consumed += WORD_BYTES;
    }
    consumed
}
#[cfg(test)]
#[allow(clippy::unreadable_literal)] // seed words are written as raw hex
mod tests {
    use super::*;

    /// Smoke test: permuting two seeded states must change word 0 of each.
    #[test]
    #[cfg(all(feature = "std", feature = "simd"))]
    fn test_parallel_2x_consistency() {
        const SEED_A: u64 = 0x1234567890ABCDEF;
        const SEED_B: u64 = 0xFEDCBA0987654321;

        let mut states = [[0u64; 25]; 2];
        states[0][0] = SEED_A;
        states[1][0] = SEED_B;

        parallel::p1600_parallel_2x(&mut states);

        assert_ne!(states[0][0], SEED_A);
        assert_ne!(states[1][0], SEED_B);
    }

    /// Smoke test: absorbing a message with `parallelism = 4` consumes some
    /// bytes and leaves word 0 of the state non-zero.
    #[test]
    #[cfg(all(feature = "std", feature = "simd"))]
    fn test_fast_loop_absorb() {
        let message = b"Hello, World! This is a test message for advanced SIMD processing.";
        let mut state = [0u64; 25];

        let consumed = fast_loop_absorb_advanced(&mut state, message, 4);

        assert!(consumed > 0);
        assert_ne!(state[0], 0);
    }
}