#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
/// Incremental POLYVAL-style universal hash computed with the x86 carry-less
/// multiplication intrinsic (`_mm_clmulepi64_si128`).
///
/// Input is absorbed in 16-byte blocks: each block is XORed into the running
/// accumulator, which is then multiplied by the key and reduced back to
/// 128 bits.
///
/// NOTE(review): `_mm_clmulepi64_si128` requires the `pclmulqdq` CPU feature.
/// No feature detection is visible in this file, so the build configuration
/// or the caller is presumably responsible for guaranteeing it -- confirm.
#[derive(Clone)]
pub struct Polyval {
    /// Hash key, loaded verbatim (unaligned load) from the 16 key bytes.
    key: __m128i,
    /// Running accumulator; starts at all-zero.
    h: __m128i,
}

impl Polyval {
    /// Length of the key in bytes.
    pub const KEY_LEN: usize = 16;
    /// Length of one input block in bytes.
    pub const BLOCK_LEN: usize = 16;
    /// Length of the produced tag in bytes.
    pub const TAG_LEN: usize = 16;

    /// Creates a hasher keyed with `k` and a zeroed accumulator.
    ///
    /// # Panics
    ///
    /// Panics if `k.len() != Self::KEY_LEN`.
    pub fn new(k: &[u8]) -> Self {
        assert_eq!(k.len(), Self::KEY_LEN);
        // SAFETY: `k` is exactly 16 bytes long (asserted above) and
        // `_mm_loadu_si128` has no alignment requirement, so the load reads
        // only initialized memory owned by `k`.
        unsafe {
            let key = _mm_loadu_si128(k.as_ptr() as *const __m128i);
            let h = _mm_setzero_si128();
            Self { key, h }
        }
    }

    /// Absorbs one full block: `h = reduce((h ^ block) * key)`.
    ///
    /// Taking the block as a fixed-size array makes the 16-byte unaligned
    /// load below valid by construction instead of by caller convention.
    #[inline]
    fn gf_mul(&mut self, block: &[u8; Self::BLOCK_LEN]) {
        // SAFETY: `block` points at exactly 16 readable bytes and
        // `_mm_loadu_si128` accepts unaligned pointers. The remaining
        // intrinsics operate on register values only.
        unsafe {
            let x = _mm_loadu_si128(block.as_ptr() as *const __m128i);
            // Reduction constant: high qword 0xc200_0000_0000_0000, low qword 1.
            let poly = _mm_setr_epi32(0x1, 0, 0, 0xc2000000u32 as i32);
            let acc = _mm_xor_si128(self.h, x);

            // Four 64x64 carry-less partial products of `acc * key`.
            let lo = _mm_clmulepi64_si128(acc, self.key, 0x00);
            let hi = _mm_clmulepi64_si128(acc, self.key, 0x11);
            let mid = _mm_xor_si128(
                _mm_clmulepi64_si128(acc, self.key, 0x10),
                _mm_clmulepi64_si128(acc, self.key, 0x01),
            );

            // Fold the middle terms into the low and high 128-bit halves of
            // the 256-bit product.
            let lo = _mm_xor_si128(_mm_slli_si128(mid, 8), lo);
            let hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

            // Two identical folding steps reduce the low half; each one
            // multiplies the low qword by the high qword of `poly` and swaps
            // the qwords (shuffle immediate 78 == 0b01_00_11_10).
            let fold = _mm_clmulepi64_si128(lo, poly, 0x10);
            let lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), fold);
            let fold = _mm_clmulepi64_si128(lo, poly, 0x10);
            let lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), fold);

            self.h = _mm_xor_si128(hi, lo);
        }
    }

    /// Absorbs `m` into the accumulator, 16 bytes at a time.
    ///
    /// If `m.len()` is not a multiple of [`Self::BLOCK_LEN`], the trailing
    /// bytes are zero-padded on the right to a full block before being
    /// absorbed. The padding is applied per call, so splitting a message
    /// across several `update` calls yields the same result as a single call
    /// only when every split point falls on a block boundary.
    pub fn update(&mut self, m: &[u8]) {
        let mut blocks = m.chunks_exact(Self::BLOCK_LEN);
        for chunk in &mut blocks {
            let mut block = [0u8; Self::BLOCK_LEN];
            block.copy_from_slice(chunk);
            self.gf_mul(&block);
        }

        let tail = blocks.remainder();
        if !tail.is_empty() {
            let mut block = [0u8; Self::BLOCK_LEN];
            block[..tail.len()].copy_from_slice(tail);
            self.gf_mul(&block);
        }
    }

    /// Consumes the hasher and returns the accumulator as 16 bytes.
    pub fn finalize(self) -> [u8; Self::TAG_LEN] {
        let mut tag = [0u8; Self::TAG_LEN];
        // SAFETY: `tag` is a 16-byte writable buffer and `_mm_storeu_si128`
        // has no alignment requirement.
        unsafe {
            _mm_storeu_si128(tag.as_mut_ptr() as *mut __m128i, self.h);
        }
        tag
    }
}