#![cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
__m256i, _mm256_add_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_or_si256,
_mm256_set1_epi8, _mm256_setzero_si256, _mm256_slli_epi16, _mm256_srli_epi16, _mm256_sub_epi8,
_mm256_xor_si256,
};
use super::scalar::{A_ROWS, AFFINE_B, SM4_GF_POLY};
/// Multiplies the 32 byte lanes of `a` and `b` pairwise as GF(2^8) elements.
///
/// Uses the bit-serial shift-and-add method: for each of the 8 bits of `b`
/// (least significant first) the current multiple of `a` is conditionally
/// XORed into the accumulator, then `a` is doubled and reduced with the low
/// byte of `SM4_GF_POLY` whenever its top bit was set.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[target_feature(enable = "avx2")]
#[allow(unsafe_op_in_unsafe_fn)]
pub(super) unsafe fn gf_mul(mut a: __m256i, mut b: __m256i) -> __m256i {
    let zero = _mm256_setzero_si256();
    let lsb = _mm256_set1_epi8(1);
    let reduction = _mm256_set1_epi8(SM4_GF_POLY as i8);
    let low_seven = _mm256_set1_epi8(0x7F);

    let mut acc = zero;
    for _ in 0..8 {
        // 0xFF in every lane whose current low bit of `b` is set, else 0x00.
        let select = _mm256_sub_epi8(zero, _mm256_and_si256(b, lsb));
        acc = _mm256_xor_si256(acc, _mm256_and_si256(a, select));

        // Signed compare against zero flags lanes whose top bit is set, i.e.
        // lanes that overflow when doubled and therefore need reduction.
        let overflow = _mm256_cmpgt_epi8(zero, a);
        let doubled = _mm256_add_epi8(a, a);
        a = _mm256_xor_si256(doubled, _mm256_and_si256(reduction, overflow));

        // There is no 8-bit logical shift; shift 16-bit lanes and mask off the
        // bit that leaked in from the neighbouring byte.
        b = _mm256_and_si256(_mm256_srli_epi16(b, 1), low_seven);
    }
    acc
}
/// Raises every byte lane of `x` to the 254th power using [`gf_mul`].
///
/// In GF(2^8) this is the multiplicative inverse; a zero lane stays zero.
/// The exponent is assembled as 128 + 64 + 32 + 16 + 8 + 4 + 2 from a chain
/// of repeated squarings.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[target_feature(enable = "avx2")]
#[allow(unsafe_op_in_unsafe_fn)]
pub(super) unsafe fn gf_inv(x: __m256i) -> __m256i {
    // powers[k] holds x^(2^k) for k = 0..=7.
    let mut powers = [x; 8];
    for k in 1..8 {
        powers[k] = gf_mul(powers[k - 1], powers[k - 1]);
    }

    // Start from x^128 and fold in x^64 down to x^2 (x^1 is deliberately
    // left out, giving exponent 254 rather than 255).
    let mut acc = powers[7];
    for k in (1..7).rev() {
        acc = gf_mul(acc, powers[k]);
    }
    acc
}
#[target_feature(enable = "avx2")]
#[allow(unsafe_op_in_unsafe_fn)]
pub(super) unsafe fn affine_a(x: __m256i) -> __m256i {
let row0 = _mm256_set1_epi8(A_ROWS[0] as i8);
let row1 = _mm256_set1_epi8(A_ROWS[1] as i8);
let row2 = _mm256_set1_epi8(A_ROWS[2] as i8);
let row3 = _mm256_set1_epi8(A_ROWS[3] as i8);
let row4 = _mm256_set1_epi8(A_ROWS[4] as i8);
let row5 = _mm256_set1_epi8(A_ROWS[5] as i8);
let row6 = _mm256_set1_epi8(A_ROWS[6] as i8);
let row7 = _mm256_set1_epi8(A_ROWS[7] as i8);
let mut out = _mm256_setzero_si256();
out = _mm256_or_si256(out, _mm256_slli_epi16(parity(_mm256_and_si256(row0, x)), 7));
out = _mm256_or_si256(out, _mm256_slli_epi16(parity(_mm256_and_si256(row1, x)), 6));
out = _mm256_or_si256(out, _mm256_slli_epi16(parity(_mm256_and_si256(row2, x)), 5));
out = _mm256_or_si256(out, _mm256_slli_epi16(parity(_mm256_and_si256(row3, x)), 4));
out = _mm256_or_si256(out, _mm256_slli_epi16(parity(_mm256_and_si256(row4, x)), 3));
out = _mm256_or_si256(out, _mm256_slli_epi16(parity(_mm256_and_si256(row5, x)), 2));
out = _mm256_or_si256(out, _mm256_slli_epi16(parity(_mm256_and_si256(row6, x)), 1));
out = _mm256_or_si256(out, parity(_mm256_and_si256(row7, x)));
out
}
/// Computes the bit parity of every byte lane of `x`.
///
/// Each output byte is `1` when the matching input byte has an odd number of
/// set bits and `0` otherwise.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[target_feature(enable = "avx2")]
#[allow(unsafe_op_in_unsafe_fn)]
pub(super) unsafe fn parity(x: __m256i) -> __m256i {
    // XOR-fold with right shifts of 1, 2 and 4: afterwards bit 0 of each byte
    // is the XOR of that byte's eight bits. The shifts work on 16-bit lanes,
    // so higher bits of the low byte pick up bits from its neighbour, but
    // bit 0 never does and everything else is masked away at the end.
    let mut folded = x;
    folded = _mm256_xor_si256(folded, _mm256_srli_epi16(folded, 1));
    folded = _mm256_xor_si256(folded, _mm256_srli_epi16(folded, 2));
    folded = _mm256_xor_si256(folded, _mm256_srli_epi16(folded, 4));

    let lsb = _mm256_set1_epi8(1);
    _mm256_and_si256(folded, lsb)
}
/// Evaluates `A(inv(A(x) ^ B)) ^ B` on every byte lane of `x`, where `A` is
/// [`affine_a`], `inv` is [`gf_inv`] and `B` is the `AFFINE_B` constant.
///
/// NOTE(review): judging by the constant names this is presumably the SM4
/// S-box; confirm against the scalar reference implementation.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[target_feature(enable = "avx2")]
#[allow(unsafe_op_in_unsafe_fn)]
pub(super) unsafe fn sbox_round(x: __m256i) -> __m256i {
    let offset = _mm256_set1_epi8(AFFINE_B as i8);

    // Input affine map, field inversion, then the same affine map again.
    let mapped = _mm256_xor_si256(affine_a(x), offset);
    let inverted = gf_inv(mapped);
    let remapped = affine_a(inverted);

    _mm256_xor_si256(remapped, offset)
}