#![allow(unsafe_op_in_unsafe_fn)]
use super::ExpandedKey;
use crate::{Block, ParBlocks, field_element::FieldElement};
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
// Reduction constant: low half of the field polynomial in the reflected
// representation used by this backend (fed to CLMUL in `compute_d` and
// `reduce_rf`). NOTE(review): 0xC2… matches the standard GHASH reduction
// constant, but confirm against the field definition in `FieldElement`.
const P1: u64 = 0xC200000000000000;
// Runtime CPU-feature detection token gating this backend.
// NOTE(review): this probes "vpclmulqdq", yet the functions below only
// enable "avx" + "pclmulqdq" — confirm the stricter check is intentional
// (e.g. shared with a wide-vector code path outside this chunk).
cpufeatures::new!(clmul, "vpclmulqdq");
pub(crate) use clmul::InitToken;
// Raw byte form of a 128-bit field element.
type ByteArray = [u8; 16];
impl FieldElement {
#[target_feature(enable = "sse2")]
#[inline]
unsafe fn from_m128i(reg: __m128i) -> Self {
let mut out = ByteArray::default();
_mm_storeu_si128(out.as_mut_ptr().cast(), reg);
out.into()
}
#[target_feature(enable = "sse2")]
#[inline]
unsafe fn to_m128i(self) -> __m128i {
load_bytes(&self.into())
}
}
/// Read 16 bytes into an SSE register with an unaligned load
/// (`&[u8; 16]` carries no 16-byte alignment guarantee).
#[target_feature(enable = "sse2")]
#[inline]
unsafe fn load_bytes(bytes: &ByteArray) -> __m128i {
    let src = bytes.as_ptr().cast::<__m128i>();
    _mm_loadu_si128(src)
}
/// Absorb one block into the running accumulator: XOR the block in, then
/// multiply by the key power H¹ (with its precomputed companion `d1`).
#[target_feature(enable = "avx", enable = "pclmulqdq")]
#[inline]
pub(super) unsafe fn proc_block(
    key: &ExpandedKey,
    acc: FieldElement,
    block: &Block,
) -> FieldElement {
    let folded = _mm_xor_si128(load_bytes(&block.0), acc.to_m128i());
    let product = gf128_mul_rf(folded, key.h1.to_m128i(), key.d1.to_m128i());
    FieldElement::from_m128i(product)
}
/// Absorb four blocks at once. The accumulator is folded into the first
/// block, and each block is multiplied by a descending key power
/// (H⁴, H³, H², H¹) so one shared reduction covers all four products.
#[target_feature(enable = "avx", enable = "pclmulqdq")]
#[inline]
pub(super) unsafe fn proc_par_blocks(
    key: &ExpandedKey,
    acc: FieldElement,
    par_blocks: &ParBlocks,
) -> FieldElement {
    let b0 = _mm_xor_si128(load_bytes(&par_blocks[0].0), acc.to_m128i());
    let b1 = load_bytes(&par_blocks[1].0);
    let b2 = load_bytes(&par_blocks[2].0);
    let b3 = load_bytes(&par_blocks[3].0);
    let (r0, f0) = rf_mul_unreduced(b0, key.h4.to_m128i(), key.d4.to_m128i());
    let (r1, f1) = rf_mul_unreduced(b1, key.h3.to_m128i(), key.d3.to_m128i());
    let (r2, f2) = rf_mul_unreduced(b2, key.h2.to_m128i(), key.d2.to_m128i());
    let (r3, f3) = rf_mul_unreduced(b3, key.h1.to_m128i(), key.d1.to_m128i());
    // XOR is addition in GF(2^128), so the unreduced partial products
    // combine freely before the single reduction below.
    let mut r = _mm_xor_si128(r0, r1);
    r = _mm_xor_si128(r, _mm_xor_si128(r2, r3));
    let mut f = _mm_xor_si128(f0, f1);
    f = _mm_xor_si128(f, _mm_xor_si128(f2, f3));
    FieldElement::from_m128i(reduce_rf(r, f))
}
#[target_feature(enable = "avx", enable = "pclmulqdq")]
/// Expand the raw 16-byte hash key into the powers H¹..H⁴, each paired
/// with its [`compute_d`] companion, for use by [`proc_block`] and the
/// 4-way [`proc_par_blocks`] path.
pub(super) unsafe fn expand_key(h: &[u8; 16]) -> ExpandedKey {
let h1 = load_bytes(h);
let d1 = compute_d(h1);
// H² = H · H
let h2 = gf128_mul_rf(h1, h1, d1);
let d2 = compute_d(h2);
// H³ = H² · H (d1 is the companion of the second operand, h1)
let h3 = gf128_mul_rf(h2, h1, d1);
let d3 = compute_d(h3);
// H⁴ = H² · H²
let h4 = gf128_mul_rf(h2, h2, d2);
let d4 = compute_d(h4);
ExpandedKey {
h1: FieldElement::from_m128i(h1),
d1: FieldElement::from_m128i(d1),
h2: FieldElement::from_m128i(h2),
d2: FieldElement::from_m128i(d2),
h3: FieldElement::from_m128i(h3),
d3: FieldElement::from_m128i(d3),
h4: FieldElement::from_m128i(h4),
d4: FieldElement::from_m128i(d4),
}
}
#[target_feature(enable = "avx", enable = "pclmulqdq")]
#[inline]
/// Precompute the companion value for a key power `h`:
/// `d = swap64(h) ⊕ clmul(h.lo, P1)`.
///
/// NOTE(review): `d` appears to bake a partial reduction by `P1` into the
/// key schedule so [`rf_mul_unreduced`] can consume (`h`, `d`) pairs —
/// inferred from its use there; confirm against the backend design notes.
unsafe fn compute_d(h: __m128i) -> __m128i {
#[allow(clippy::cast_possible_wrap)]
// P1 goes in the high qword so the 0x10 selector below picks it up.
let p = _mm_set_epi64x(P1 as i64, 0);
// 0x4e swaps the two 64-bit halves of `h`.
let h_swap = _mm_shuffle_epi32(h, 0x4e);
// imm 0x10: clmul(h.lo, p.hi) = clmul(h.lo, P1).
let t = _mm_clmulepi64_si128(h, p, 0x10);
_mm_xor_si128(h_swap, t)
}
/// Carry-less multiply of `m` against the key pair (`h`, `d`) without the
/// final reduction. Returns the two 128-bit halves `(r, f)` that
/// [`reduce_rf`] later folds into a single field element.
#[target_feature(enable = "avx", enable = "pclmulqdq")]
#[inline]
unsafe fn rf_mul_unreduced(m: __m128i, h: __m128i, d: __m128i) -> (__m128i, __m128i) {
    // r = clmul(m.lo, d.hi) ⊕ clmul(m.hi, h.hi)
    let r_lo = _mm_clmulepi64_si128(m, d, 0x10);
    let r_hi = _mm_clmulepi64_si128(m, h, 0x11);
    // f = clmul(m.lo, d.lo) ⊕ clmul(m.hi, h.lo)
    let f_lo = _mm_clmulepi64_si128(m, d, 0x00);
    let f_hi = _mm_clmulepi64_si128(m, h, 0x01);
    (_mm_xor_si128(r_lo, r_hi), _mm_xor_si128(f_lo, f_hi))
}
#[target_feature(enable = "avx", enable = "pclmulqdq")]
#[inline]
/// Fold the unreduced product halves (`r`, `f`) from [`rf_mul_unreduced`]
/// into one 128-bit field element:
/// `result = r ⊕ (f >> 64) ⊕ (f << 64) ⊕ clmul(f.lo, P1)`.
///
/// NOTE(review): this is the reduction of `f` modulo the field polynomial
/// represented by `P1`, merged into `r`; the identity depends on the exact
/// representation fixed by `compute_d` — verified only structurally here.
unsafe fn reduce_rf(r: __m128i, f: __m128i) -> __m128i {
#[allow(clippy::cast_possible_wrap)]
// P1 in the low qword so the 0x00 selector below picks it up.
let p1 = _mm_set_epi64x(0, P1 as i64);
// f shifted right by 64 bits (byte-wise shift of 8).
let f1 = _mm_srli_si128(f, 8);
// f shifted left by 64 bits.
let f0_shifted = _mm_slli_si128(f, 8);
// imm 0x00: clmul(f.lo, p1.lo) = clmul(f.lo, P1).
let p1_f0 = _mm_clmulepi64_si128(f, p1, 0x00);
let result = _mm_xor_si128(r, f1);
let result = _mm_xor_si128(result, f0_shifted);
_mm_xor_si128(result, p1_f0)
}
/// Full field multiply of `m` by the key pair (`h`, `d`): one unreduced
/// double-width carry-less multiply followed by a single reduction pass.
#[target_feature(enable = "avx", enable = "pclmulqdq")]
#[inline]
unsafe fn gf128_mul_rf(m: __m128i, h: __m128i, d: __m128i) -> __m128i {
    let (high_part, low_part) = rf_mul_unreduced(m, h, d);
    reduce_rf(high_part, low_part)
}