include!(concat!(env!("OUT_DIR"), "/tables.rs"));
#[inline(always)]
pub fn add(a: u8, b: u8) -> u8 {
a ^ b
}
#[inline(always)]
pub fn mul(a: u8, b: u8) -> u8 {
MUL_TABLE[a as usize][b as usize]
}
#[inline(always)]
pub fn div(a: u8, b: u8) -> u8 {
if a == 0 {
return 0;
}
assert!(b != 0, "Division by zero in GF(2^8)");
let log_a = LOG_TABLE[a as usize] as isize;
let log_b = LOG_TABLE[b as usize] as isize;
let mut log_result = log_a - log_b;
if log_result < 0 {
log_result += 255;
}
EXP_TABLE[log_result as usize]
}
#[inline(always)]
pub fn exp(a: u8, n: usize) -> u8 {
if n == 0 {
return 1;
}
if a == 0 {
return 0;
}
let log_a = LOG_TABLE[a as usize] as usize;
let log_result = log_a * (n % 255) % 255;
EXP_TABLE[log_result]
}
#[inline]
pub fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) {
assert_eq!(input.len(), out.len());
if input.is_empty() || c == 0 {
out.iter_mut().for_each(|o| *o = 0);
return;
}
if c == 1 {
out.copy_from_slice(input);
return;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
unsafe {
mul_slice_gfni_avx2(c, input, out);
}
return;
}
if is_x86_feature_detected!("avx2") {
unsafe {
mul_slice_avx2(c, input, out);
}
return;
}
if is_x86_feature_detected!("gfni") {
unsafe {
mul_slice_gfni_sse(c, input, out);
}
return;
}
if is_x86_feature_detected!("ssse3") {
unsafe {
mul_slice_ssse3(c, input, out);
}
return;
}
}
mul_slice_scalar(c, input, out);
}
#[inline]
pub fn mul_slice_xor(c: u8, input: &[u8], out: &mut [u8]) {
assert_eq!(input.len(), out.len());
if input.is_empty() || c == 0 {
return;
}
if c == 1 {
for (o, i) in out.iter_mut().zip(input.iter()) {
*o ^= *i;
}
return;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
unsafe {
mul_slice_xor_gfni_avx2(c, input, out);
}
return;
}
if is_x86_feature_detected!("avx2") {
unsafe {
mul_slice_xor_avx2(c, input, out);
}
return;
}
if is_x86_feature_detected!("gfni") {
unsafe {
mul_slice_xor_gfni_sse(c, input, out);
}
return;
}
if is_x86_feature_detected!("ssse3") {
unsafe {
mul_slice_xor_ssse3(c, input, out);
}
return;
}
}
mul_slice_xor_scalar(c, input, out);
}
pub type MulSliceFn = fn(u8, &[u8], &mut [u8]);
pub fn detect_mul_slice() -> (MulSliceFn, MulSliceFn) {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
return (
wrap_mul_slice_gfni_avx2 as MulSliceFn,
wrap_mul_slice_xor_gfni_avx2 as MulSliceFn,
);
}
if is_x86_feature_detected!("avx2") {
return (
wrap_mul_slice_avx2 as MulSliceFn,
wrap_mul_slice_xor_avx2 as MulSliceFn,
);
}
if is_x86_feature_detected!("gfni") {
return (
wrap_mul_slice_gfni_sse as MulSliceFn,
wrap_mul_slice_xor_gfni_sse as MulSliceFn,
);
}
if is_x86_feature_detected!("ssse3") {
return (
wrap_mul_slice_ssse3 as MulSliceFn,
wrap_mul_slice_xor_ssse3 as MulSliceFn,
);
}
}
(
mul_slice_scalar as MulSliceFn,
mul_slice_xor_scalar as MulSliceFn,
)
}
#[cfg(target_arch = "x86_64")]
fn wrap_mul_slice_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
unsafe { mul_slice_gfni_avx2(c, input, out) }
}
#[cfg(target_arch = "x86_64")]
fn wrap_mul_slice_xor_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
unsafe { mul_slice_xor_gfni_avx2(c, input, out) }
}
#[cfg(target_arch = "x86_64")]
fn wrap_mul_slice_avx2(c: u8, input: &[u8], out: &mut [u8]) {
unsafe { mul_slice_avx2(c, input, out) }
}
#[cfg(target_arch = "x86_64")]
fn wrap_mul_slice_xor_avx2(c: u8, input: &[u8], out: &mut [u8]) {
unsafe { mul_slice_xor_avx2(c, input, out) }
}
#[cfg(target_arch = "x86_64")]
fn wrap_mul_slice_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
unsafe { mul_slice_gfni_sse(c, input, out) }
}
#[cfg(target_arch = "x86_64")]
fn wrap_mul_slice_xor_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
unsafe { mul_slice_xor_gfni_sse(c, input, out) }
}
#[cfg(target_arch = "x86_64")]
fn wrap_mul_slice_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
unsafe { mul_slice_ssse3(c, input, out) }
}
#[cfg(target_arch = "x86_64")]
fn wrap_mul_slice_xor_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
unsafe { mul_slice_xor_ssse3(c, input, out) }
}
fn mul_slice_scalar(c: u8, input: &[u8], out: &mut [u8]) {
let mt = &MUL_TABLE[c as usize];
for (o, &i) in out.iter_mut().zip(input.iter()) {
*o = mt[i as usize];
}
}
fn mul_slice_xor_scalar(c: u8, input: &[u8], out: &mut [u8]) {
let mt = &MUL_TABLE[c as usize];
for (o, &i) in out.iter_mut().zip(input.iter()) {
*o ^= mt[i as usize];
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "gfni,avx2")]
unsafe fn mul_slice_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
use core::arch::x86_64::*;
let matrix = GFNI_TABLE[c as usize] as i64;
let mat_vec = _mm256_set1_epi64x(matrix);
let len = input.len();
let mut i = 0;
while i + 32 <= len {
let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
let result = _mm256_gf2p8affine_epi64_epi8(data, mat_vec, 0);
_mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
i += 32;
}
let mt = &MUL_TABLE[c as usize];
while i < len {
*out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "gfni,avx2")]
unsafe fn mul_slice_xor_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
use core::arch::x86_64::*;
let matrix = GFNI_TABLE[c as usize] as i64;
let mat_vec = _mm256_set1_epi64x(matrix);
let len = input.len();
let mut i = 0;
while i + 32 <= len {
let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
let existing = _mm256_loadu_si256(out.as_ptr().add(i) as *const _);
let mul_result = _mm256_gf2p8affine_epi64_epi8(data, mat_vec, 0);
let result = _mm256_xor_si256(mul_result, existing);
_mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
i += 32;
}
let mt = &MUL_TABLE[c as usize];
while i < len {
*out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "gfni")]
unsafe fn mul_slice_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
use core::arch::x86_64::*;
let matrix = GFNI_TABLE[c as usize] as i64;
let mat_vec = _mm_set1_epi64x(matrix);
let len = input.len();
let mut i = 0;
while i + 16 <= len {
let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
let result = _mm_gf2p8affine_epi64_epi8(data, mat_vec, 0);
_mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
i += 16;
}
let mt = &MUL_TABLE[c as usize];
while i < len {
*out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "gfni")]
unsafe fn mul_slice_xor_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
use core::arch::x86_64::*;
let matrix = GFNI_TABLE[c as usize] as i64;
let mat_vec = _mm_set1_epi64x(matrix);
let len = input.len();
let mut i = 0;
while i + 16 <= len {
let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
let existing = _mm_loadu_si128(out.as_ptr().add(i) as *const _);
let mul_result = _mm_gf2p8affine_epi64_epi8(data, mat_vec, 0);
let result = _mm_xor_si128(mul_result, existing);
_mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
i += 16;
}
let mt = &MUL_TABLE[c as usize];
while i < len {
*out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn mul_slice_avx2(c: u8, input: &[u8], out: &mut [u8]) {
use core::arch::x86_64::*;
let low = &MUL_TABLE_LOW[c as usize];
let high = &MUL_TABLE_HIGH[c as usize];
let low_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(low.as_ptr() as *const _));
let high_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(high.as_ptr() as *const _));
let mask = _mm256_set1_epi8(0x0F);
let len = input.len();
let mut i = 0;
while i + 32 <= len {
let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
let lo_nibble = _mm256_and_si256(data, mask);
let hi_nibble = _mm256_and_si256(_mm256_srli_epi64(data, 4), mask);
let lo_result = _mm256_shuffle_epi8(low_vec, lo_nibble);
let hi_result = _mm256_shuffle_epi8(high_vec, hi_nibble);
let result = _mm256_xor_si256(lo_result, hi_result);
_mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
i += 32;
}
let mt = &MUL_TABLE[c as usize];
while i < len {
*out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn mul_slice_xor_avx2(c: u8, input: &[u8], out: &mut [u8]) {
use core::arch::x86_64::*;
let low = &MUL_TABLE_LOW[c as usize];
let high = &MUL_TABLE_HIGH[c as usize];
let low_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(low.as_ptr() as *const _));
let high_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(high.as_ptr() as *const _));
let mask = _mm256_set1_epi8(0x0F);
let len = input.len();
let mut i = 0;
while i + 32 <= len {
let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
let existing = _mm256_loadu_si256(out.as_ptr().add(i) as *const _);
let lo_nibble = _mm256_and_si256(data, mask);
let hi_nibble = _mm256_and_si256(_mm256_srli_epi64(data, 4), mask);
let lo_result = _mm256_shuffle_epi8(low_vec, lo_nibble);
let hi_result = _mm256_shuffle_epi8(high_vec, hi_nibble);
let result = _mm256_xor_si256(_mm256_xor_si256(lo_result, hi_result), existing);
_mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
i += 32;
}
let mt = &MUL_TABLE[c as usize];
while i < len {
*out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn mul_slice_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
use core::arch::x86_64::*;
let low = &MUL_TABLE_LOW[c as usize];
let high = &MUL_TABLE_HIGH[c as usize];
let low_vec = _mm_loadu_si128(low.as_ptr() as *const _);
let high_vec = _mm_loadu_si128(high.as_ptr() as *const _);
let mask = _mm_set1_epi8(0x0F);
let len = input.len();
let mut i = 0;
while i + 16 <= len {
let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
let lo_nibble = _mm_and_si128(data, mask);
let hi_nibble = _mm_and_si128(_mm_srli_epi64(data, 4), mask);
let lo_result = _mm_shuffle_epi8(low_vec, lo_nibble);
let hi_result = _mm_shuffle_epi8(high_vec, hi_nibble);
let result = _mm_xor_si128(lo_result, hi_result);
_mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
i += 16;
}
let mt = &MUL_TABLE[c as usize];
while i < len {
*out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn mul_slice_xor_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
use core::arch::x86_64::*;
let low = &MUL_TABLE_LOW[c as usize];
let high = &MUL_TABLE_HIGH[c as usize];
let low_vec = _mm_loadu_si128(low.as_ptr() as *const _);
let high_vec = _mm_loadu_si128(high.as_ptr() as *const _);
let mask = _mm_set1_epi8(0x0F);
let len = input.len();
let mut i = 0;
while i + 16 <= len {
let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
let existing = _mm_loadu_si128(out.as_ptr().add(i) as *const _);
let lo_nibble = _mm_and_si128(data, mask);
let hi_nibble = _mm_and_si128(_mm_srli_epi64(data, 4), mask);
let lo_result = _mm_shuffle_epi8(low_vec, lo_nibble);
let hi_result = _mm_shuffle_epi8(high_vec, hi_nibble);
let result = _mm_xor_si128(_mm_xor_si128(lo_result, hi_result), existing);
_mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
i += 16;
}
let mt = &MUL_TABLE[c as usize];
while i < len {
*out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
i += 1;
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_gfni_table() {
for c in 0u16..256 {
let matrix = GFNI_TABLE[c as usize];
for b in 0u16..256 {
let expected = MUL_TABLE[c as usize][b as usize];
let x = b as u8;
let mut result: u8 = 0;
for i in 0..8u32 {
let row_byte = ((matrix >> ((7 - i) * 8)) & 0xFF) as u8;
let dot = (row_byte & x).count_ones() % 2;
result |= (dot as u8) << i;
}
assert_eq!(
result, expected,
"GFNI table mismatch: c={c}, b={b}, got={result}, expected={expected}"
);
}
}
}
#[test]
fn test_add() {
assert_eq!(add(0, 0), 0);
assert_eq!(add(1, 0), 1);
assert_eq!(add(0, 1), 1);
assert_eq!(add(1, 1), 0);
assert_eq!(add(0xFF, 0xFF), 0);
assert_eq!(add(0xAA, 0x55), 0xFF);
}
#[test]
fn test_mul() {
assert_eq!(mul(0, 0), 0);
assert_eq!(mul(1, 0), 0);
assert_eq!(mul(0, 1), 0);
assert_eq!(mul(1, 1), 1);
for a in 0u8..=255 {
assert_eq!(mul(a, 1), a);
assert_eq!(mul(1, a), a);
}
for a in 0u8..=255 {
assert_eq!(mul(a, 0), 0);
}
}
#[test]
fn test_div() {
for a in 0u8..=255 {
assert_eq!(div(a, 1), a);
}
for a in 1u8..=255 {
assert_eq!(div(a, a), 1);
}
for a in 1u8..=255 {
for b in 1u8..=255 {
assert_eq!(div(mul(a, b), b), a);
}
}
}
#[test]
fn test_exp() {
assert_eq!(exp(0, 0), 1);
assert_eq!(exp(1, 0), 1);
assert_eq!(exp(5, 0), 1);
assert_eq!(exp(0, 1), 0);
assert_eq!(exp(0, 100), 0);
for a in 0u8..=255 {
assert_eq!(exp(a, 1), a);
}
for a in 0u8..=255 {
assert_eq!(exp(a, 2), mul(a, a));
}
}
#[test]
fn test_mul_slice_basic() {
let input = [1u8, 2, 3, 4, 5, 6, 7, 8];
let mut out = [0u8; 8];
mul_slice(3, &input, &mut out);
for i in 0..input.len() {
assert_eq!(out[i], mul(3, input[i]));
}
}
#[test]
fn test_mul_slice_xor_basic() {
let input = [1u8, 2, 3, 4, 5, 6, 7, 8];
let mut out = [10u8; 8];
let original = out;
mul_slice_xor(3, &input, &mut out);
for i in 0..input.len() {
assert_eq!(out[i], original[i] ^ mul(3, input[i]));
}
}
#[test]
fn test_mul_slice_large() {
let input: Vec<u8> = (0..256).map(|i| i as u8).collect();
let mut out = vec![0u8; 256];
let mut expected = vec![0u8; 256];
for c in [2u8, 7, 42, 128, 255] {
mul_slice_scalar(c, &input, &mut expected);
mul_slice(c, &input, &mut out);
assert_eq!(out, expected, "mul_slice mismatch for c={c}");
}
}
#[test]
fn test_mul_slice_xor_large() {
let input: Vec<u8> = (0..256).map(|i| i as u8).collect();
for c in [2u8, 7, 42, 128, 255] {
let mut out_expected = vec![0xABu8; 256];
let mut out_simd = out_expected.clone();
mul_slice_xor_scalar(c, &input, &mut out_expected);
mul_slice_xor(c, &input, &mut out_simd);
assert_eq!(out_simd, out_expected, "mul_slice_xor mismatch for c={c}");
}
}
#[test]
fn test_mul_slice_unaligned_sizes() {
for size in [1, 7, 15, 16, 17, 31, 32, 33, 63, 64, 65, 100] {
let input: Vec<u8> = (0..size).map(|i| i as u8).collect();
let mut out = vec![0u8; size];
let mut expected = vec![0u8; size];
mul_slice_scalar(42, &input, &mut expected);
mul_slice(42, &input, &mut out);
assert_eq!(out, expected, "mul_slice mismatch for size={size}");
}
}
}