use core::arch::x86_64::*;
/// A 768-byte, 32-byte-aligned table that can be viewed in two ways.
///
/// Both fields overlay the same storage: `u` exposes 96 individual `u64`
/// words, while `vecs` exposes the same bytes as 24 `__m256i` vectors
/// (four consecutive `u64` words per vector).
#[repr(C, align(32))]
union RC_Data {
    u: [u64; 96],
    vecs: [__m256i; 24],
}
/// Scratch overlay that lets one 256-bit vector be edited as four `u64` lanes.
///
/// Writing through `u` and reading back through `vec` (or the reverse)
/// reinterprets the same 32 bytes; no conversion takes place.
#[repr(C, align(32))]
union Temp {
    pub u: [u64; 4],
    pub vec: __m256i,
}
/// Left-rotation amounts applied along the rho/pi lane walk in `f1600_x4`.
///
/// Entry `t` is the `(t + 1)`-th triangular number reduced modulo 64
/// (1, 3, 6, 10, ...), built at compile time as a running sum.
const RHO: [u32; 24] = {
    let mut offsets = [0u32; 24];
    let mut slot = 0usize;
    let mut step = 1u32;
    let mut total = 0u32;
    while slot < 24 {
        // Running sum 1 + 2 + ... + step yields the next triangular number.
        total += step;
        offsets[slot] = total % 64;
        step += 1;
        slot += 1;
    }
    offsets
};
/// Order in which `f1600_x4` visits lanes during its combined rho/pi walk.
///
/// Each entry is a flat lane index `5 * row + col`. The walk starts from
/// position `(col, row) = (1, 0)` and repeatedly moves to
/// `(row, (2 * col + 3 * row) % 5)`; the 24 positions reached are recorded
/// in order at compile time.
const PI: [usize; 24] = {
    let mut order = [0usize; 24];
    let mut col = 1usize;
    let mut row = 0usize;
    let mut step = 0usize;
    while step < 24 {
        let next_row = (2 * col + 3 * row) % 5;
        col = row;
        row = next_row;
        order[step] = 5 * row + col;
        step += 1;
    }
    order
};
/// Per-round constants, each repeated four times so that one `__m256i`
/// (read through `RC_Data::vecs`) carries the same constant in all four lanes.
const RC_X4: RC_Data = {
    // One 64-bit constant per round, in round order.
    const ROUND_CONSTANTS: [u64; 24] = [
        0x0000000000000001,
        0x0000000000008082,
        0x800000000000808a,
        0x8000000080008000,
        0x000000000000808b,
        0x0000000080000001,
        0x8000000080008081,
        0x8000000000008009,
        0x000000000000008a,
        0x0000000000000088,
        0x0000000080008009,
        0x000000008000000a,
        0x000000008000808b,
        0x800000000000008b,
        0x8000000000008089,
        0x8000000000008003,
        0x8000000000008002,
        0x8000000000000080,
        0x000000000000800a,
        0x800000008000000a,
        0x8000000080008081,
        0x8000000000008080,
        0x0000000080000001,
        0x8000000080008008,
    ];

    // Broadcast: words 4k .. 4k+3 all hold the constant for round k.
    let mut words = [0u64; 96];
    let mut slot = 0usize;
    while slot < 96 {
        words[slot] = ROUND_CONSTANTS[slot / 4];
        slot += 1;
    }
    RC_Data { u: words }
};
/// Pastes `$body` five times in a row, each copy in its own scope.
///
/// Inside copy number `k` (0 through 4, in that order) `$var` is a
/// `const usize` equal to `k`, so the body can use it as a compile-time index.
/// Must be invoked in statement position.
macro_rules! unroll5 {
    // Internal rule: emit one scoped copy of the body per listed index.
    (@each $var:ident, $body:block, $($idx:expr),+) => {
        $( { const $var: usize = $idx; $body; } )+
    };
    ($var:ident, $body:block) => {
        unroll5!(@each $var, $body, 0, 1, 2, 3, 4);
    };
}
/// Pastes `$body` twenty-four times in a row, each copy in its own scope.
///
/// Inside copy number `k` (0 through 23, in that order) `$var` is a
/// `const usize` equal to `k`, so the body can use it as a compile-time index.
/// Must be invoked in statement position.
macro_rules! unroll24 {
    // Internal rule: emit one scoped copy of the body per listed index.
    (@each $var:ident, $body:block, $($idx:expr),+) => {
        $( { const $var: usize = $idx; $body; } )+
    };
    ($var:ident, $body:block) => {
        unroll24!(
            @each $var, $body,
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23
        );
    };
}
/// Runs the 24-round Keccak-f[1600] permutation on four interleaved states at once.
///
/// `a[5 * y + x]` holds lane `(x, y)` of all four states, one state per 64-bit
/// element of the vector. Only the first 25 entries of `a` are read or written.
///
/// # Panics
///
/// Panics if `a` contains fewer than 25 vectors. The check happens before any
/// entry is modified.
///
/// NOTE(review): the body executes AVX2 instructions and nothing here verifies
/// that the running CPU supports them. Callers must guarantee AVX2 support;
/// consider gating call sites on a runtime feature check.
pub fn f1600_x4(a: &mut [__m256i]) {
    assert!(
        a.len() >= 25,
        "f1600_x4 needs at least 25 lanes, got {}",
        a.len()
    );
    // SAFETY: `f1600_x4_avx2` only requires AVX2 support from the CPU. That is an
    // existing, unchecked precondition of this function (see the note above).
    unsafe { f1600_x4_avx2(a) }
}

/// AVX2-enabled body of [`f1600_x4`].
///
/// Compiled with the `avx2` target feature so the vector intrinsics can be
/// inlined even when the crate as a whole is built without AVX2.
///
/// # Safety
///
/// The CPU must support AVX2. `a` must hold at least 25 vectors (a shorter
/// slice panics on the re-slice below rather than causing undefined behaviour).
#[target_feature(enable = "avx2")]
// `unroll5!`/`unroll24!` declare lower-case `const` indices, and the last write to
// `carried` in the rho/pi chain is intentionally never read.
#[allow(unused_assignments, non_upper_case_globals)]
unsafe fn f1600_x4_avx2(a: &mut [__m256i]) {
    // Fix the length once so every constant index below is known to be in range.
    let a = &mut a[..25];

    for round in 0..24 {
        // Theta, part 1: XOR the five lanes of each column into its parity.
        let mut parity = [_mm256_setzero_si256(); 5];
        unroll5!(x, {
            unroll5!(y, {
                parity[x] = _mm256_xor_si256(parity[x], a[5 * y + x]);
            });
        });

        // Theta, part 2: fold parity[x - 1] ^ rotl(parity[x + 1], 1) into column x.
        // The correction is the same for every lane in the column, so build it once.
        unroll5!(x, {
            let mut rotated = Temp { vec: parity[(x + 1) % 5] };
            for lane in 0..4 {
                rotated.u[lane] = rotated.u[lane].rotate_left(1);
            }
            let delta = _mm256_xor_si256(parity[(x + 4) % 5], rotated.vec);
            unroll5!(y, {
                a[5 * y + x] = _mm256_xor_si256(a[5 * y + x], delta);
            });
        });

        // Rho + pi: walk the lanes in `PI` order; each visited lane receives the
        // previously carried lane rotated left by the matching `RHO` amount.
        let mut carried = a[1];
        unroll24!(step, {
            let displaced = a[PI[step]];
            let mut moving = Temp { vec: carried };
            for lane in 0..4 {
                moving.u[lane] = moving.u[lane].rotate_left(RHO[step]);
            }
            a[PI[step]] = moving.vec;
            carried = displaced;
        });

        // Chi: within each row of five lanes, a[x] ^= !a[x + 1] & a[x + 2],
        // computed from a snapshot of the row taken before any lane is rewritten.
        unroll5!(plane, {
            let base = 5 * plane;
            let row = [a[base], a[base + 1], a[base + 2], a[base + 3], a[base + 4]];
            unroll5!(x, {
                // `_mm256_andnot_si256(p, q)` computes `!p & q`.
                let masked = _mm256_andnot_si256(row[(x + 1) % 5], row[(x + 2) % 5]);
                a[base + x] = _mm256_xor_si256(row[x], masked);
            });
        });

        // Iota: mix this round's constant into lane (0, 0) of every state.
        a[0] = _mm256_xor_si256(a[0], RC_X4.vecs[round]);
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of 64-bit lanes in one permutation state.
    const PLEN: usize = 25;

    /// Four interleaved states, viewable as vectors or as a flat run of `u64` words.
    #[repr(C)]
    pub union Data {
        pub lanes: [__m256i; PLEN],
        pub u: [u64; PLEN * 4],
    }

    /// Repeats every lane of `vec` four times in a row, giving the word layout
    /// of four identical interleaved states.
    fn expand(vec: [u64; PLEN]) -> [u64; 100] {
        let mut out = [0u64; 100];
        for (group, &lane) in out.chunks_mut(4).zip(vec.iter()) {
            for word in group {
                *word = lane;
            }
        }
        out
    }

    /// Permutes the all-zero state twice and compares against fixed reference
    /// values after each pass; all four interleaved states must agree.
    #[test]
    fn known_vectors() {
        // Expected lanes after one permutation of the all-zero state.
        let first_pass: [u64; 25] = [
            0xF1258F7940E1DDE7,
            0x84D5CCF933C0478A,
            0xD598261EA65AA9EE,
            0xBD1547306F80494D,
            0x8B284E056253D057,
            0xFF97A42D7F8E6FD4,
            0x90FEE5A0A44647C4,
            0x8C5BDA0CD6192E76,
            0xAD30A6F71B19059C,
            0x30935AB7D08FFC64,
            0xEB5AA93F2317D635,
            0xA9A6E6260D712103,
            0x81A57C16DBCF555F,
            0x43B831CD0347C826,
            0x01F22F1A11A5569F,
            0x05E5635A21D9AE61,
            0x64BEFEF28CC970F2,
            0x613670957BC46611,
            0xB87C5A554FD00ECB,
            0x8C3EE88A1CCF32C8,
            0x940C7922AE3A2614,
            0x1841F924A2C509E4,
            0x16F53526E70465C2,
            0x75F644E97F30A13B,
            0xEAF1FF7B5CECA249,
        ];
        // Expected lanes after a second permutation applied to the result above.
        let second_pass: [u64; 25] = [
            0x2D5C954DF96ECB3C,
            0x6A332CD07057B56D,
            0x093D8D1270D76B6C,
            0x8A20D9B25569D094,
            0x4F9C4F99E5E7F156,
            0xF957B9A2DA65FB38,
            0x85773DAE1275AF0D,
            0xFAF4F247C3D810F7,
            0x1F1B9EE6F79A8759,
            0xE4FECC0FEE98B425,
            0x68CE61B6B9CE68A1,
            0xDEEA66C4BA8F974F,
            0x33C43D836EAFB1F5,
            0xE00654042719DBD9,
            0x7CF8A9F009831265,
            0xFD5449A6BF174743,
            0x97DDAD33D8994B40,
            0x48EAD5FC5D0BE774,
            0xE3B8C8EE55B7B03C,
            0x91A0226E649E42E9,
            0x900E3129E7BADD7B,
            0x202A9EC5FAA3CCE8,
            0x5B3402464E1C3DB6,
            0x609F4E62A44C1059,
            0x20D06CD26A8FBF5C,
        ];

        let expected_first = expand(first_pass);
        let expected_second = expand(second_pass);

        let mut data = Data { u: [0u64; 100] };
        // Union field access is unsafe; both views cover the same 800 bytes.
        unsafe {
            f1600_x4(&mut data.lanes);
            assert_eq!(&data.u, &expected_first);
            f1600_x4(&mut data.lanes);
            assert_eq!(data.u, expected_second);
        }
    }
}