#![allow(unsafe_code)]
use core::arch::x86_64::*;
/// Loads the `i`-th 16-byte round key from the flat key schedule `rk`.
///
/// The key is read from byte offset `i * 16` with an unaligned load, so `rk`
/// needs no particular alignment.
///
/// # Panics
///
/// Panics if `rk` does not hold 16 bytes starting at byte offset `i * 16`.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn rk(rk: &[u8], i: usize) -> __m128i {
    // Bounds-checked slicing: a key schedule that is too short now panics
    // instead of being read past its end through raw pointer arithmetic.
    let key = &rk[i * 16..][..16];
    // SAFETY: `key` is exactly 16 readable bytes and `_mm_loadu_si128` has no
    // alignment requirement.
    unsafe { _mm_loadu_si128(key.as_ptr() as *const __m128i) }
}
/// Runs one AES encryption round (`_mm_aesenc_si128`) over `state` using the
/// round key `rk` and returns the resulting 16-byte state.
///
/// Both inputs are taken by value; nothing is modified in place.
///
/// # Safety
///
/// The executing CPU must support the AES-NI and SSE2 instruction sets.
#[target_feature(enable = "aes,sse2")]
pub(super) unsafe fn aes_round(state: [u8; 16], rk: [u8; 16]) -> [u8; 16] {
    let mut result = [0u8; 16];
    // SAFETY: every pointer refers to a 16-byte array, and the unaligned
    // load/store intrinsics place no alignment requirement on it.
    unsafe {
        let input = _mm_loadu_si128(state.as_ptr().cast());
        let round_key = _mm_loadu_si128(rk.as_ptr().cast());
        _mm_storeu_si128(
            result.as_mut_ptr().cast(),
            _mm_aesenc_si128(input, round_key),
        );
    }
    result
}
/// Encrypts one 16-byte `block` in place.
///
/// The block is xor-ed with round key 0, run through `_mm_aesenc_si128` with
/// round keys `1..nr`, and finished with `_mm_aesenclast_si128` using round
/// key `nr`.
///
/// # Safety
///
/// * The executing CPU must support the AES-NI and SSE2 instruction sets.
/// * `round_keys` must hold at least `(nr + 1) * 16` bytes: round keys are
///   read at indices `0..=nr`, 16 bytes each.
#[target_feature(enable = "aes,sse2")]
pub(super) unsafe fn encrypt_block(round_keys: &[u8], nr: usize, block: &mut [u8; 16]) {
    // One pointer serves both the initial load and the final store.
    let data = block.as_mut_ptr().cast::<__m128i>();
    // SAFETY: `data` points at the 16 bytes of `block`; the unaligned
    // load/store intrinsics have no alignment requirement. The key-schedule
    // length requirement is part of this function's contract.
    unsafe {
        let mut state = _mm_xor_si128(_mm_loadu_si128(data), rk(round_keys, 0));
        let mut round = 1;
        while round < nr {
            state = _mm_aesenc_si128(state, rk(round_keys, round));
            round += 1;
        }
        _mm_storeu_si128(data, _mm_aesenclast_si128(state, rk(round_keys, nr)));
    }
}
/// Decrypts one 16-byte `block` in place.
///
/// The block is xor-ed with round key `nr`, run through `_mm_aesdec_si128`
/// for rounds `nr - 1` down to `1`, and finished with `_mm_aesdeclast_si128`
/// using round key 0. The keys for the middle rounds are passed through
/// `_mm_aesimc_si128` on every call; nothing is cached.
///
/// # Safety
///
/// * The executing CPU must support the AES-NI and SSE2 instruction sets.
/// * `round_keys` must hold at least `(nr + 1) * 16` bytes: round keys are
///   read at indices `0..=nr`, 16 bytes each.
#[target_feature(enable = "aes,sse2")]
pub(super) unsafe fn decrypt_block(round_keys: &[u8], nr: usize, block: &mut [u8; 16]) {
    // One pointer serves both the initial load and the final store.
    let data = block.as_mut_ptr().cast::<__m128i>();
    // SAFETY: `data` points at the 16 bytes of `block`; the unaligned
    // load/store intrinsics have no alignment requirement. The key-schedule
    // length requirement is part of this function's contract.
    unsafe {
        let mut state = _mm_xor_si128(_mm_loadu_si128(data), rk(round_keys, nr));
        // Walk the middle rounds from `nr - 1` down to `1`.
        let mut round = nr;
        while round > 1 {
            round -= 1;
            let key = _mm_aesimc_si128(rk(round_keys, round));
            state = _mm_aesdec_si128(state, key);
        }
        _mm_storeu_si128(data, _mm_aesdeclast_si128(state, rk(round_keys, 0)));
    }
}
/// Encrypts every whole 16-byte block of `blocks` in place, each block
/// independently with the same round keys.
///
/// Round keys `0..=nr` are loaded once into a local 15-entry cache. The
/// buffer is then processed eight blocks (128 bytes) at a time, with the
/// remaining whole blocks handled one by one. Trailing bytes that do not
/// form a whole 16-byte block are left untouched.
///
/// # Panics
///
/// Panics when a block is processed and `nr > 14`, because the final round
/// key is looked up at index `nr` of the 15-entry cache.
///
/// # Safety
///
/// * The executing CPU must support the AES-NI and SSE2 instruction sets.
/// * `round_keys` must hold at least `(nr + 1) * 16` bytes: round keys are
///   read at indices `0..=nr`, 16 bytes each.
#[target_feature(enable = "aes,sse2")]
pub(super) unsafe fn encrypt_blocks(round_keys: &[u8], nr: usize, blocks: &mut [u8]) {
    const BLOCK_LEN: usize = 16;
    const LANES: usize = 8;
    // SAFETY: every load/store below targets 16 bytes inside a chunk handed
    // out by `chunks_exact_mut`, and the unaligned intrinsics need no
    // alignment. The key-schedule length is part of this function's contract.
    unsafe {
        // Load the schedule once so the per-block loops only touch the cache.
        let mut ks = [_mm_setzero_si128(); 15];
        for (slot, idx) in ks.iter_mut().zip(0..nr + 1) {
            *slot = rk(round_keys, idx);
        }

        let mut batches = blocks.chunks_exact_mut(BLOCK_LEN * LANES);
        for batch in batches.by_ref() {
            // View the 128-byte batch as eight consecutive 16-byte lanes.
            let lanes = batch.as_mut_ptr().cast::<__m128i>();
            let mut state = [_mm_setzero_si128(); LANES];
            for (j, lane) in state.iter_mut().enumerate() {
                *lane = _mm_xor_si128(_mm_loadu_si128(lanes.add(j)), ks[0]);
            }
            // Middle rounds use cached keys 1..nr, one key across all lanes.
            for &key in ks.iter().skip(1).take(nr.saturating_sub(1)) {
                for lane in state.iter_mut() {
                    *lane = _mm_aesenc_si128(*lane, key);
                }
            }
            // Looked up before any store so an oversized `nr` panics first.
            let last_key = ks[nr];
            for (j, &lane) in state.iter().enumerate() {
                _mm_storeu_si128(lanes.add(j), _mm_aesenclast_si128(lane, last_key));
            }
        }

        // Fewer than eight blocks remain: process them one at a time.
        for block in batches.into_remainder().chunks_exact_mut(BLOCK_LEN) {
            let data = block.as_mut_ptr().cast::<__m128i>();
            let mut state = _mm_xor_si128(_mm_loadu_si128(data), ks[0]);
            for &key in ks.iter().skip(1).take(nr.saturating_sub(1)) {
                state = _mm_aesenc_si128(state, key);
            }
            _mm_storeu_si128(data, _mm_aesenclast_si128(state, ks[nr]));
        }
    }
}
/// Decrypts every whole 16-byte block of `blocks` in place, each block
/// independently with the same round keys.
///
/// Round keys 0 and `nr` are cached as-is. Round keys `1..nr` are passed
/// through `_mm_aesimc_si128` once up front, so the per-block loops can feed
/// them straight to `_mm_aesdec_si128`. The buffer is then processed eight
/// blocks (128 bytes) at a time, with the remaining whole blocks handled one
/// by one. Trailing bytes that do not form a whole 16-byte block are left
/// untouched.
///
/// # Panics
///
/// Panics if `nr > 14`, because round key `nr` is stored at index `nr` of a
/// 15-entry cache.
///
/// # Safety
///
/// * The executing CPU must support the AES-NI and SSE2 instruction sets.
/// * `round_keys` must hold at least `(nr + 1) * 16` bytes: round keys are
///   read at indices `0..=nr`, 16 bytes each.
#[target_feature(enable = "aes,sse2")]
pub(super) unsafe fn decrypt_blocks(round_keys: &[u8], nr: usize, blocks: &mut [u8]) {
    const BLOCK_LEN: usize = 16;
    const LANES: usize = 8;
    // SAFETY: every load/store below targets 16 bytes inside a chunk handed
    // out by `chunks_exact_mut`, and the unaligned intrinsics need no
    // alignment. The key-schedule length is part of this function's contract.
    unsafe {
        let mut ks = [_mm_setzero_si128(); 15];
        ks[0] = rk(round_keys, 0);
        ks[nr] = rk(round_keys, nr);
        // Pre-transform the middle keys once instead of once per block.
        for (slot, idx) in ks.iter_mut().zip(0..nr).skip(1) {
            *slot = _mm_aesimc_si128(rk(round_keys, idx));
        }

        // `ks[nr]` was just written, so `nr` is a valid index from here on.
        let (first_key, last_key) = (ks[nr], ks[0]);
        let leading = &ks[..nr];

        let mut batches = blocks.chunks_exact_mut(BLOCK_LEN * LANES);
        for batch in batches.by_ref() {
            // View the 128-byte batch as eight consecutive 16-byte lanes.
            let lanes = batch.as_mut_ptr().cast::<__m128i>();
            let mut state = [_mm_setzero_si128(); LANES];
            for (j, lane) in state.iter_mut().enumerate() {
                *lane = _mm_xor_si128(_mm_loadu_si128(lanes.add(j)), first_key);
            }
            // Middle rounds run from key `nr - 1` down to key `1`.
            for &key in leading.iter().skip(1).rev() {
                for lane in state.iter_mut() {
                    *lane = _mm_aesdec_si128(*lane, key);
                }
            }
            for (j, &lane) in state.iter().enumerate() {
                _mm_storeu_si128(lanes.add(j), _mm_aesdeclast_si128(lane, last_key));
            }
        }

        // Fewer than eight blocks remain: process them one at a time.
        for block in batches.into_remainder().chunks_exact_mut(BLOCK_LEN) {
            let data = block.as_mut_ptr().cast::<__m128i>();
            let mut state = _mm_xor_si128(_mm_loadu_si128(data), first_key);
            for &key in leading.iter().skip(1).rev() {
                state = _mm_aesdec_si128(state, key);
            }
            _mm_storeu_si128(data, _mm_aesdeclast_si128(state, last_key));
        }
    }
}