// hekate_math/towers/block8.rs
// SPDX-License-Identifier: Apache-2.0
// This file is part of the hekate-math project.
// Copyright (C) 2026 Andrei Kochergin <zeek@tuta.com>
// Copyright (C) 2026 Oumuamua Labs. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! BLOCK 8 (GF(2^8))
use core::ops::{Add, AddAssign, BitXor, Mul, MulAssign, Sub, SubAssign};

use serde::{Deserialize, Serialize};
use zeroize::Zeroize;

use crate::constants::FLAT_TO_TOWER_BIT_MASKS_8;
use crate::towers::bit::Bit;
use crate::{
    CanonicalDeserialize, CanonicalSerialize, Flat, HardwareField, PackableField, PackedFlat,
    TowerField, constants,
};
28
/// Cache-line-aligned wrapper for the basis-conversion tables used by
/// the constant-time (non-`table-math`) code path.
///
/// The 64-byte alignment keeps the whole 8-byte table within a single
/// cache line, so scanning every entry cannot leak the index through
/// cache-line granularity. (NOTE(review): assumes 64-byte cache lines —
/// typical, but confirm for the deployment target.)
#[cfg(not(feature = "table-math"))]
#[repr(align(64))]
struct CtConvertBasisU8<const N: usize>([u8; N]);
32
/// Tower -> flat (hardware) basis-change matrix, one byte per tower bit.
#[cfg(not(feature = "table-math"))]
static TOWER_TO_FLAT_BASIS_8: CtConvertBasisU8<8> =
    CtConvertBasisU8(constants::RAW_TOWER_TO_FLAT_8);

/// Flat (hardware) -> tower basis-change matrix, one byte per flat bit.
#[cfg(not(feature = "table-math"))]
static FLAT_TO_TOWER_BASIS_8: CtConvertBasisU8<8> =
    CtConvertBasisU8(constants::RAW_FLAT_TO_TOWER_8);
40
// ============================================================
// Precomputed Lookup Tables for GF(2^8) arithmetic.
// Polynomial: x^8 + x^4 + x^3 + x + 1 (0x11B) [AES Standard]
// Generator: 3 (x + 1)
//
// Both tables are built at compile time by `const fn` below.
// NOTE(review): lookups indexed by field values are data-dependent,
// so the `table-math` feature is presumably NOT constant-time —
// confirm against the project's threat model.
// ============================================================

/// Exponentiation Table: g^i
/// Maps index i -> value inside the field.
/// Range: [0..255].
/// Note that EXP_TABLE[0] == 1 and EXP_TABLE[255] == 1.
#[cfg(feature = "table-math")]
const EXP_TABLE: [u8; 256] = generate_exp_table();

/// Logarithm Table: log_g(x)
/// Maps value x -> power i such that g^i = x.
/// Range: LOG_TABLE[1..=255] contain values 0..254.
/// LOG_TABLE[0] is 0 (undefined).
#[cfg(feature = "table-math")]
const LOG_TABLE: [u8; 256] = generate_log_table();
60
/// Field element GF(2^8).
///
/// The wrapped byte is the element's coordinate vector over GF(2);
/// every byte value is a valid field element.
#[derive(Copy, Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, Zeroize)]
#[repr(transparent)]
pub struct Block8(pub u8);

impl Block8 {
    /// Wraps a raw byte as a field element (no validation needed).
    pub const fn new(val: u8) -> Self {
        Self(val)
    }
}
71
impl TowerField for Block8 {
    const BITS: usize = 8;
    const ZERO: Self = Block8(0);
    const ONE: Self = Block8(1);

    // Multiplication constant used when this level is extended to the
    // next tower level. NOTE(review): 0x20 must agree with the basis
    // constants in `crate::constants` — confirm against the 16-bit level.
    const EXTENSION_TAU: Self = Block8(0x20);

    /// Multiplicative inverse; by convention `invert(0) == 0`.
    fn invert(&self) -> Self {
        #[cfg(feature = "table-math")]
        {
            if self.0 == 0 {
                return Self::ZERO;
            }

            // a^-1 = g^(255 - log(a)).
            // EXP_TABLE[255] == 1, which makes the log(a) == 0
            // case (a == 1) index safely.
            let i = LOG_TABLE[self.0 as usize] as usize;
            Block8(EXP_TABLE[255 - i])
        }

        #[cfg(not(feature = "table-math"))]
        {
            // Fermat's Little Theorem:
            // a^-1 = a^254 in GF(2^8)
            // Constant-time, no branching.
            let x = *self;
            let x2 = x * x;
            let x4 = x2 * x2;
            let x8 = x4 * x4;
            let x16 = x8 * x8;
            let x32 = x16 * x16;
            let x64 = x32 * x32;
            let x128 = x64 * x64;

            // 254 = 128 + 64 + 32 + 16 + 8 + 4 + 2
            x128 * x64 * x32 * x16 * x8 * x4 * x2
        }
    }

    /// Derives an element from 32 uniform random bytes.
    /// Only the first byte is consumed; the rest are ignored.
    fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
        Self(bytes[0])
    }
}
113
/// Add (XOR)
///
/// GF(2^8) has characteristic 2, so addition is carry-less:
/// plain bitwise XOR of the coordinate bytes.
impl Add for Block8 {
    type Output = Self;

    fn add(self, rhs: Self) -> Self::Output {
        Self(self.0.bitxor(rhs.0))
    }
}
122
123impl Sub for Block8 {
124    type Output = Self;
125
126    fn sub(self, rhs: Self) -> Self::Output {
127        self.add(rhs)
128    }
129}
130
/// Mul (Galois Field Multiplication)
///
/// Three implementations, selected at compile time:
/// - `table-math`: log/exp table lookup (fast, data-dependent indexing);
/// - aarch64: NEON carry-less multiply with the same AES polynomial;
/// - portable: constant-time shift-and-add.
impl Mul for Block8 {
    type Output = Self;

    fn mul(self, rhs: Self) -> Self::Output {
        #[cfg(feature = "table-math")]
        {
            // Handle zero explicitly (log(0) is undefined)
            if self.0 == 0 || rhs.0 == 0 {
                return Self::ZERO;
            }

            // Lookup Logarithms
            // Math:
            // a * b = g^(log(a) + log(b))
            let i = LOG_TABLE[self.0 as usize] as usize;
            let j = LOG_TABLE[rhs.0 as usize] as usize;

            // Add exponents modulo 255
            // Since max(i) = 254, max(i+j) = 508.
            // Check if sum >= 255 and subtract.
            let k = i + j;
            let idx = if k >= 255 { k - 255 } else { k };

            // Lookup Exponent result
            Self(EXP_TABLE[idx])
        }

        #[cfg(not(feature = "table-math"))]
        {
            #[cfg(target_arch = "aarch64")]
            {
                // Hardware carry-less multiply + reduction with the
                // same polynomial as the software path below.
                neon::mul_8(self, rhs)
            }

            #[cfg(not(target_arch = "aarch64"))]
            {
                let mut a = self.0;
                let mut b = rhs.0;
                let mut res = 0u8;

                // Constant-time shift-and-add
                // over GF(2^8) with poly 0x11B.
                for _ in 0..8 {
                    // Branch-free conditional accumulate: mask is 0xFF
                    // when the low bit of b is set, 0x00 otherwise.
                    let bit = b & 1;
                    let mask = 0u8.wrapping_sub(bit);
                    res ^= a & mask;

                    // a *= x, folding the degree-8 overflow back in
                    // with the low byte of the polynomial (0x1B).
                    let high_bit = a >> 7;
                    let overflow_mask = 0u8.wrapping_sub(high_bit);
                    a = (a << 1) ^ (0x1B & overflow_mask);

                    b >>= 1;
                }

                Self(res)
            }
        }
    }
}
191
192impl AddAssign for Block8 {
193    fn add_assign(&mut self, rhs: Self) {
194        *self = *self + rhs;
195    }
196}
197
198impl SubAssign for Block8 {
199    fn sub_assign(&mut self, rhs: Self) {
200        *self = *self - rhs;
201    }
202}
203
204impl MulAssign for Block8 {
205    fn mul_assign(&mut self, rhs: Self) {
206        *self = *self * rhs;
207    }
208}
209
210impl CanonicalSerialize for Block8 {
211    #[inline]
212    fn serialized_size(&self) -> usize {
213        1
214    }
215
216    #[inline]
217    fn serialize(&self, writer: &mut [u8]) -> Result<(), ()> {
218        if writer.is_empty() {
219            return Err(());
220        }
221
222        writer[0] = self.0;
223
224        Ok(())
225    }
226}
227
228impl CanonicalDeserialize for Block8 {
229    fn deserialize(bytes: &[u8]) -> Result<Self, ()> {
230        if bytes.is_empty() {
231            return Err(());
232        }
233
234        Ok(Self(bytes[0]))
235    }
236}
237
238impl From<u8> for Block8 {
239    #[inline]
240    fn from(val: u8) -> Self {
241        Self::new(val)
242    }
243}
244
245impl From<u32> for Block8 {
246    #[inline]
247    fn from(val: u32) -> Self {
248        Self(val as u8)
249    }
250}
251
252impl From<u64> for Block8 {
253    #[inline]
254    fn from(val: u64) -> Self {
255        Self(val as u8)
256    }
257}
258
259impl From<u128> for Block8 {
260    #[inline]
261    fn from(val: u128) -> Self {
262        Self(val as u8)
263    }
264}
265
266// ========================================
267// FIELD LIFTING
268// ========================================
269
270impl From<Bit> for Block8 {
271    #[inline(always)]
272    fn from(val: Bit) -> Self {
273        Self(val.0)
274    }
275}
276
// ===================================
// PACKED BLOCK 8 (Width = 16)
// ===================================

// 128 bits / 8 = 16 elements
pub const PACKED_WIDTH_8: usize = 16;

/// SIMD-friendly bundle of 16 `Block8` lanes.
/// The 16-byte alignment lets the whole value live in one
/// 128-bit vector register.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
#[repr(C, align(16))]
pub struct PackedBlock8(pub [Block8; PACKED_WIDTH_8]);

impl PackedBlock8 {
    /// All-zero vector (lane-wise additive identity).
    #[inline(always)]
    pub fn zero() -> Self {
        Self([Block8::ZERO; PACKED_WIDTH_8])
    }
}
294
295impl PackableField for Block8 {
296    type Packed = PackedBlock8;
297
298    const WIDTH: usize = PACKED_WIDTH_8;
299
300    #[inline(always)]
301    fn pack(chunk: &[Self]) -> Self::Packed {
302        assert!(
303            chunk.len() >= PACKED_WIDTH_8,
304            "PackableField::pack: input slice too short",
305        );
306
307        let mut arr = [Self::ZERO; PACKED_WIDTH_8];
308        arr.copy_from_slice(&chunk[..PACKED_WIDTH_8]);
309
310        PackedBlock8(arr)
311    }
312
313    #[inline(always)]
314    fn unpack(packed: Self::Packed, output: &mut [Self]) {
315        assert!(
316            output.len() >= PACKED_WIDTH_8,
317            "PackableField::unpack: output slice too short",
318        );
319
320        output[..PACKED_WIDTH_8].copy_from_slice(&packed.0);
321    }
322}
323
324impl Add for PackedBlock8 {
325    type Output = Self;
326
327    #[inline(always)]
328    fn add(self, rhs: Self) -> Self {
329        let mut res = [Block8::ZERO; PACKED_WIDTH_8];
330        for ((out, l), r) in res.iter_mut().zip(self.0.iter()).zip(rhs.0.iter()) {
331            *out = *l + *r;
332        }
333
334        Self(res)
335    }
336}
337
338impl AddAssign for PackedBlock8 {
339    #[inline(always)]
340    fn add_assign(&mut self, rhs: Self) {
341        for (l, r) in self.0.iter_mut().zip(rhs.0.iter()) {
342            *l += *r;
343        }
344    }
345}
346
347impl Sub for PackedBlock8 {
348    type Output = Self;
349
350    #[inline(always)]
351    fn sub(self, rhs: Self) -> Self {
352        self.add(rhs)
353    }
354}
355
356impl SubAssign for PackedBlock8 {
357    #[inline(always)]
358    fn sub_assign(&mut self, rhs: Self) {
359        self.add_assign(rhs);
360    }
361}
362
impl Mul for PackedBlock8 {
    type Output = Self;

    /// Lane-wise field multiplication.
    #[inline(always)]
    fn mul(self, rhs: Self) -> Self {
        #[cfg(target_arch = "aarch64")]
        {
            // NOTE(review): this path routes each lane through the flat
            // (hardware) basis via `mul_iso_8`, while scalar `Mul` on
            // aarch64 calls `neon::mul_8` directly — presumably
            // equivalent because the basis change is a field
            // isomorphism; `packed_mul_consistency` exercises this.
            let mut res = [Block8::ZERO; PACKED_WIDTH_8];
            for ((out, l), r) in res.iter_mut().zip(self.0.iter()).zip(rhs.0.iter()) {
                *out = mul_iso_8(*l, *r);
            }

            Self(res)
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            // Portable fallback: scalar multiply per lane.
            let mut res = [Block8::ZERO; PACKED_WIDTH_8];
            for ((out, l), r) in res.iter_mut().zip(self.0.iter()).zip(rhs.0.iter()) {
                *out = *l * *r;
            }

            Self(res)
        }
    }
}

impl MulAssign for PackedBlock8 {
    #[inline(always)]
    fn mul_assign(&mut self, rhs: Self) {
        *self = *self * rhs;
    }
}
396
397impl Mul<Block8> for PackedBlock8 {
398    type Output = Self;
399
400    #[inline(always)]
401    fn mul(self, rhs: Block8) -> Self {
402        let mut res = [Block8::ZERO; PACKED_WIDTH_8];
403        for (out, v) in res.iter_mut().zip(self.0.iter()) {
404            *out = *v * rhs;
405        }
406
407        Self(res)
408    }
409}
410
// ===================================
// Hardware Field
// ===================================

impl HardwareField for Block8 {
    /// Change of basis: tower representation -> flat (hardware) basis.
    #[inline(always)]
    fn to_hardware(self) -> Flat<Self> {
        #[cfg(feature = "table-math")]
        {
            Flat::from_raw(apply_matrix_8(self, &constants::TOWER_TO_FLAT_8))
        }

        #[cfg(not(feature = "table-math"))]
        {
            // Constant-time bit-by-bit matrix application.
            Flat::from_raw(Block8(map_ct_8(self.0, &TOWER_TO_FLAT_BASIS_8.0)))
        }
    }

    /// Inverse change of basis: flat (hardware) -> tower representation.
    #[inline(always)]
    fn from_hardware(value: Flat<Self>) -> Self {
        let value = value.into_raw();
        #[cfg(feature = "table-math")]
        {
            apply_matrix_8(value, &constants::FLAT_TO_TOWER_8)
        }

        #[cfg(not(feature = "table-math"))]
        {
            Block8(map_ct_8(value.0, &FLAT_TO_TOWER_BASIS_8.0))
        }
    }

    /// Addition is basis-independent (XOR in either basis).
    #[inline(always)]
    fn add_hardware(lhs: Flat<Self>, rhs: Flat<Self>) -> Flat<Self> {
        Flat::from_raw(lhs.into_raw() + rhs.into_raw())
    }

    /// Packed addition; a single vector XOR on aarch64.
    #[inline(always)]
    fn add_hardware_packed(lhs: PackedFlat<Self>, rhs: PackedFlat<Self>) -> PackedFlat<Self> {
        let lhs = lhs.into_raw();
        let rhs = rhs.into_raw();
        #[cfg(target_arch = "aarch64")]
        {
            PackedFlat::from_raw(neon::add_packed_8(lhs, rhs))
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            PackedFlat::from_raw(lhs + rhs)
        }
    }

    /// Multiplication of two flat-basis values.
    #[inline(always)]
    fn mul_hardware(lhs: Flat<Self>, rhs: Flat<Self>) -> Flat<Self> {
        let lhs = lhs.into_raw();
        let rhs = rhs.into_raw();
        #[cfg(target_arch = "aarch64")]
        {
            Flat::from_raw(neon::mul_8(lhs, rhs))
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            // Portable fallback: round-trip through the tower basis,
            // where scalar `Mul` is implemented.
            let a_tower = Self::from_hardware(Flat::from_raw(lhs));
            let b_tower = Self::from_hardware(Flat::from_raw(rhs));

            (a_tower * b_tower).to_hardware()
        }
    }

    /// Packed flat-basis multiplication.
    #[inline(always)]
    fn mul_hardware_packed(lhs: PackedFlat<Self>, rhs: PackedFlat<Self>) -> PackedFlat<Self> {
        let lhs = lhs.into_raw();
        let rhs = rhs.into_raw();

        #[cfg(target_arch = "aarch64")]
        {
            PackedFlat::from_raw(neon::mul_flat_packed_8(lhs, rhs))
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            // Portable fallback: unpack, multiply lane-by-lane, repack.
            let mut l = [Self::ZERO; <Self as PackableField>::WIDTH];
            let mut r = [Self::ZERO; <Self as PackableField>::WIDTH];
            let mut res = [Self::ZERO; <Self as PackableField>::WIDTH];

            Self::unpack(lhs, &mut l);
            Self::unpack(rhs, &mut r);

            for i in 0..<Self as PackableField>::WIDTH {
                res[i] = Self::mul_hardware(Flat::from_raw(l[i]), Flat::from_raw(r[i])).into_raw();
            }

            PackedFlat::from_raw(Self::pack(&res))
        }
    }

    /// Broadcasts `rhs` to all 16 lanes, then multiplies packed.
    #[inline(always)]
    fn mul_hardware_scalar_packed(lhs: PackedFlat<Self>, rhs: Flat<Self>) -> PackedFlat<Self> {
        let broadcasted = PackedBlock8([rhs.into_raw(); PACKED_WIDTH_8]);
        Self::mul_hardware_packed(lhs, PackedFlat::from_raw(broadcasted))
    }

    /// Extracts tower-basis bit `bit_idx` directly from a flat value:
    /// the parity of the flat bits selected by the precomputed mask.
    #[inline(always)]
    fn tower_bit_from_hardware(value: Flat<Self>, bit_idx: usize) -> u8 {
        let mask = FLAT_TO_TOWER_BIT_MASKS_8[bit_idx];

        // Parity of (x & mask) without popcount
        let mut v = value.into_raw().0 & mask;
        v ^= v >> 4;
        v ^= v >> 2;
        v ^= v >> 1;

        v & 1
    }
}
527
// ===========================================
// UTILS
// ===========================================

/// Multiplies two tower-basis values by mapping them into the flat
/// (hardware) basis, using the NEON carry-less multiplier there, and
/// mapping the product back. Correct because the basis change is a
/// field isomorphism.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn mul_iso_8(a: Block8, b: Block8) -> Block8 {
    let a_f = a.to_hardware();
    let b_f = b.to_hardware();
    let c_f = Flat::from_raw(neon::mul_8(a_f.into_raw(), b_f.into_raw()));

    c_f.to_tower()
}
541
/// Applies a precomputed 256-entry substitution table to `val`.
///
/// `val.0 as usize` is always in `0..=255` and the table has exactly
/// 256 entries, so the index can never be out of bounds: the compiler
/// statically elides the bounds check. The previous
/// `unsafe { get_unchecked }` bought nothing and is removed.
#[cfg(feature = "table-math")]
#[inline(always)]
fn apply_matrix_8(val: Block8, table: &[u8; 256]) -> Block8 {
    Block8(table[val.0 as usize])
}
548
/// Constant-time matrix-vector product over GF(2).
///
/// XORs together `basis[i]` for every set bit `i` of `x`, selecting
/// rows with a branch-free mask so no secret-dependent branches or
/// table indices occur.
#[cfg(not(feature = "table-math"))]
#[inline(always)]
fn map_ct_8(x: u8, basis: &[u8; 8]) -> u8 {
    basis.iter().enumerate().fold(0u8, |acc, (i, &row)| {
        // mask is 0xFF when bit i of x is set, 0x00 otherwise.
        let mask = 0u8.wrapping_sub((x >> i) & 1);
        acc ^ (row & mask)
    })
}
564
/// Builds the exponentiation table at compile time:
/// `table[i] = g^i` with generator g = 3 (x + 1) over the AES
/// polynomial 0x11B.
///
/// All 256 indices are filled. The generator has order 255, so the
/// value wraps: `table[0] == 1` and `table[255] == 1`, which keeps the
/// `EXP_TABLE[255 - i]` access in inversion in bounds and correct.
#[cfg(feature = "table-math")]
const fn generate_exp_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut power: u8 = 1;

    let mut i = 0;
    while i < 256 {
        table[i] = power;

        // power *= 3: power * (x + 1) = (power << 1) ^ power, with a
        // branch-free fold of 0x1B when the shift overflows degree 7.
        let fold = 0x1B & 0u8.wrapping_sub(power >> 7);
        power ^= (power << 1) ^ fold;

        i += 1;
    }

    table
}
599
/// Builds the logarithm table at compile time: `table[g^i] = i` for
/// i in 0..=254, the inverse of the exp table.
///
/// Only 255 steps are taken so that `table[1]` keeps the canonical
/// `log(1) == 0` instead of being overwritten by 255. `table[0]`
/// remains 0: log(0) is undefined, and callers check for zero first.
#[cfg(feature = "table-math")]
const fn generate_log_table() -> [u8; 256] {
    let mut table = [0u8; 256];

    let mut power: u8 = 1;
    let mut i = 0;

    while i < 255 {
        table[power as usize] = i as u8;

        // Same branch-free generator step as the exp table.
        let fold = 0x1B & 0u8.wrapping_sub(power >> 7);
        power ^= (power << 1) ^ fold;

        i += 1;
    }

    table
}
634
// ===========================================
// 8-BIT SIMD INSTRUCTIONS
// ===========================================

#[cfg(target_arch = "aarch64")]
mod neon {
    use super::*;
    use core::arch::aarch64::*;
    use core::mem::transmute;

    /// Lane-wise XOR of two packed vectors (field addition).
    #[inline(always)]
    pub fn add_packed_8(lhs: PackedBlock8, rhs: PackedBlock8) -> PackedBlock8 {
        // SAFETY: Block8 is #[repr(transparent)] over u8 and
        // PackedBlock8 is #[repr(C, align(16))], so [Block8; 16] and
        // uint8x16_t have identical size and alignment; the
        // transmutes only reinterpret bytes.
        unsafe {
            let res = veorq_u8(
                transmute::<[Block8; 16], uint8x16_t>(lhs.0),
                transmute::<[Block8; 16], uint8x16_t>(rhs.0),
            );
            transmute(res)
        }
    }

    /// Scalar GF(2^8) multiply using the NEON polynomial multiplier:
    /// 16-bit carry-less product, then two folds of the high byte with
    /// the reduction byte `constants::POLY_8`.
    #[inline(always)]
    pub fn mul_8(a: Block8, b: Block8) -> Block8 {
        // SAFETY: only NEON intrinsics over plain-old-data vectors;
        // the transmutes reinterpret u8/u16 vectors as polynomial
        // vectors of the same layout.
        unsafe {
            // Load 8-bit scalars
            // into NEON vectors.
            let a_poly = transmute::<uint8x8_t, poly8x8_t>(vdup_n_u8(a.0));
            let b_poly = transmute::<uint8x8_t, poly8x8_t>(vdup_n_u8(b.0));

            // Multiply:
            // 8-bit x 8-bit -> 16-bit
            let prod = vmull_p8(a_poly, b_poly);

            // Extract the 16-bit result
            let prod_u16 = vgetq_lane_u16(transmute::<poly16x8_t, uint16x8_t>(prod), 0);

            let l = (prod_u16 & 0xFF) as u8;
            let h = (prod_u16 >> 8) as u8;

            // P(x) = x^8 + 0x1B
            let r_val = constants::POLY_8; // u8

            // Fold high bits (h * 0x1B)
            let h_poly = transmute::<uint8x8_t, poly8x8_t>(vdup_n_u8(h));
            let r_poly = transmute::<uint8x8_t, poly8x8_t>(vdup_n_u8(r_val));
            let h_red = vmull_p8(h_poly, r_poly);

            let h_red_u16 = vgetq_lane_u16(transmute::<poly16x8_t, uint16x8_t>(h_red), 0);

            let folded = (h_red_u16 & 0xFF) as u8;
            let carry = (h_red_u16 >> 8) as u8;

            let mut res = l ^ folded;

            // Unconditional carry reduction:
            // If carry is 0, c_poly is 0,
            // c_red is 0, and XOR does nothing.
            // (Also keeps this path branch-free.)
            let c_poly = transmute::<uint8x8_t, poly8x8_t>(vdup_n_u8(carry));
            let c_red = vmull_p8(c_poly, r_poly);
            let c_red_u16 = vgetq_lane_u16(transmute::<poly16x8_t, uint16x8_t>(c_red), 0);

            res ^= (c_red_u16 & 0xFF) as u8;

            Block8(res)
        }
    }

    /// Vectorized multiplication for Block8 (16 elements at once).
    /// Uses vmull_p8 for multiplication and vqtbl1q_u8 for reduction.
    ///
    /// The two 16-entry tables map a high-byte nibble to its reduced
    /// contribution: tbl_lo for the low nibble, tbl_hi for the high
    /// nibble. NOTE(review): table contents presumably equal
    /// (n * 0x1B mod 0x11B) and ((n << 4) * 0x1B mod 0x11B) fully
    /// reduced — spot-checked for small n, confirm the rest.
    #[inline(always)]
    pub fn mul_flat_packed_8(lhs: PackedBlock8, rhs: PackedBlock8) -> PackedBlock8 {
        // SAFETY: same layout argument as `add_packed_8`; all pointers
        // passed to vld1q_u8 come from live 16-byte stack arrays.
        unsafe {
            let a: uint8x16_t = transmute(lhs.0);
            let b: uint8x16_t = transmute(rhs.0);

            // Split into low/high 64-bit halves
            let a_lo = vget_low_u8(a);
            let a_hi = vget_high_u8(a);
            let b_lo = vget_low_u8(b);
            let b_hi = vget_high_u8(b);

            // Multiply 8x8 -> 16 bits
            // (poly16x8_t, which is 128-bit wide)
            let res_lo = vmull_p8(
                transmute::<uint8x8_t, poly8x8_t>(a_lo),
                transmute::<uint8x8_t, poly8x8_t>(b_lo),
            );
            let res_hi = vmull_p8(
                transmute::<uint8x8_t, poly8x8_t>(a_hi),
                transmute::<uint8x8_t, poly8x8_t>(b_hi),
            );

            // Reduction using Table Lookup
            // Load the tables once.
            let tbl_lo = vld1q_u8(
                [
                    0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4,
                    0xaf, 0x82, 0x99,
                ]
                .as_ptr(),
            );

            let tbl_hi = vld1q_u8(
                [
                    0x00, 0xab, 0x4d, 0xe6, 0x9a, 0x31, 0xd7, 0x7c, 0x2f, 0x84, 0x62, 0xc9, 0xb5,
                    0x1e, 0xf8, 0x53,
                ]
                .as_ptr(),
            );

            // Helper to reduce a 128-bit vector
            // of 16-bit polys down to a 64-bit
            // vector of 8-bit results.
            let reduce_tbl = |val_poly: poly16x8_t| -> uint8x8_t {
                let val: uint16x8_t = transmute(val_poly);

                // vmovn_u16 narrows 128-bit (u16x8) to 64-bit (u8x8)
                let data = vmovn_u16(val);
                let carry_u16 = vshrq_n_u16(val, 8);
                let carry = vmovn_u16(carry_u16);

                // Operations on 64-bit vectors
                let mask_lo = vdup_n_u8(0x0F);
                let h_lo = vand_u8(carry, mask_lo);
                let h_hi = vshr_n_u8(carry, 4);

                // Lookup:
                // Table is 128-bit (q),
                // Index is 64-bit.
                // Result is 64-bit.
                let r_lo = vqtbl1_u8(tbl_lo, h_lo);
                let r_hi = vqtbl1_u8(tbl_hi, h_hi);

                // XOR everything together
                veor_u8(data, veor_u8(r_lo, r_hi))
            };

            let final_lo = reduce_tbl(res_lo);
            let final_hi = reduce_tbl(res_hi);

            // Combine two 64-bit results
            // back into one 128-bit vector.
            let res = vcombine_u8(final_lo, final_hi);

            PackedBlock8(transmute::<uint8x16_t, [Block8; 16]>(res))
        }
    }
}
783
#[cfg(test)]
mod tests {
    use super::*;
    // NOTE(review): `rand::{RngExt, rng}` — presumably the rand 0.9
    // API (`rng()` / `random::<T>()`) or a project re-export; confirm.
    use rand::{RngExt, rng};

    // ==================================
    // BASIC
    // ==================================

    #[test]
    fn tower_constants() {
        // Check that tau is propagated correctly
        // For Block8 we set 0x20
        assert_eq!(Block8::EXTENSION_TAU, Block8(0x20));
    }

    #[test]
    fn add_truth() {
        let zero = Block8::ZERO;
        let one = Block8::ONE;

        assert_eq!(zero + zero, zero);
        assert_eq!(zero + one, one);
        assert_eq!(one + zero, one);
        assert_eq!(one + one, zero);
    }

    #[test]
    fn mul_truth() {
        let zero = Block8::ZERO;
        let one = Block8::ONE;

        assert_eq!(zero * zero, zero);
        assert_eq!(zero * one, zero);
        assert_eq!(one * one, one);
    }

    #[test]
    fn add() {
        // 5 ^ 3 = 6
        // 101 ^ 011 = 110
        assert_eq!(Block8(5) + Block8(3), Block8(6));
    }

    #[test]
    fn mul_simple() {
        // Check for prime numbers (without overflow)
        // x^1 * x^1 = x^2 (2 * 2 = 4)
        assert_eq!(Block8(2) * Block8(2), Block8(4));
    }

    #[test]
    fn mul_overflow() {
        // Reduction verification (AES test vectors)
        // Example from the AES specification:
        // 0x57 * 0x83 = 0xC1
        assert_eq!(Block8(0x57) * Block8(0x83), Block8(0xC1));
    }

    #[test]
    fn security_zeroize() {
        let mut secret_val = Block8::from(0xFF_u32);
        assert_ne!(secret_val, Block8::ZERO);

        secret_val.zeroize();

        assert_eq!(secret_val, Block8::ZERO);
        assert_eq!(secret_val.0, 0, "Block8 memory leak detected");
    }

    #[test]
    fn inversion_exhaustive() {
        // Iterate over all possible field elements (0..255)
        for i in 0u8..=255 {
            let val = Block8(i);

            if val == Block8::ZERO {
                // Case 1:
                // Zero inversion safety check
                assert_eq!(val.invert(), Block8::ZERO, "invert(0) must return 0");
            } else {
                // Case 2:
                // Algebraic correctness a * a^-1 = 1
                let inv = val.invert();
                let product = val * inv;

                assert_eq!(
                    product,
                    Block8::ONE,
                    "Inversion identity failed: a * a^-1 != 1"
                );
            }
        }
    }

    // ==================================
    // HARDWARE
    // ==================================

    #[test]
    fn isomorphism_roundtrip() {
        let mut rng = rng();
        for _ in 0..1000 {
            let val = Block8::from(rng.random::<u8>());

            // Roundtrip:
            // Tower -> Flat -> Tower must be identity
            assert_eq!(
                val.to_hardware().to_tower(),
                val,
                "Block8 isomorphism roundtrip failed"
            );
        }
    }

    #[test]
    fn parity_masks_match_from_hardware() {
        // Exhaustive for Block8:
        // 256 values * 8 bits.
        for x in 0u16..=255 {
            let x_flat = x as u8;
            let tower = Block8::from_hardware(Flat::from_raw(Block8(x_flat))).0;

            for (k, &mask) in FLAT_TO_TOWER_BIT_MASKS_8.iter().enumerate() {
                // Each tower bit must equal the parity of the masked
                // flat bits — both via direct computation and the API.
                let parity = ((x_flat & mask).count_ones() & 1) as u8;
                let bit = (tower >> k) & 1;
                assert_eq!(
                    parity, bit,
                    "Block8 mask mismatch at x={x_flat:#04x}, k={k}"
                );

                let via_api = Flat::from_raw(Block8(x_flat)).tower_bit(k);
                assert_eq!(via_api, bit, "Block8 tower_bit_from_hardware mismatch");
            }
        }
    }

    #[test]
    fn flat_mul_homomorphism() {
        let mut rng = rng();
        for _ in 0..1000 {
            let a = Block8::from(rng.random::<u8>());
            let b = Block8::from(rng.random::<u8>());

            let expected_flat = (a * b).to_hardware();
            let actual_flat = a.to_hardware() * b.to_hardware();

            // Check if multiplication in Flat basis matches Tower
            assert_eq!(
                actual_flat, expected_flat,
                "Block8 flat multiplication mismatch"
            );
        }
    }

    #[test]
    fn packed_consistency() {
        let mut rng = rng();
        for _ in 0..100 {
            let mut a_vals = [Block8::ZERO; 16];
            let mut b_vals = [Block8::ZERO; 16];

            for i in 0..16 {
                a_vals[i] = Block8::from(rng.random::<u8>());
                b_vals[i] = Block8::from(rng.random::<u8>());
            }

            let a_flat_vals = a_vals.map(|x| x.to_hardware());
            let b_flat_vals = b_vals.map(|x| x.to_hardware());
            let a_packed = Flat::<Block8>::pack(&a_flat_vals);
            let b_packed = Flat::<Block8>::pack(&b_flat_vals);

            // Test SIMD Add (XOR)
            let add_res = Block8::add_hardware_packed(a_packed, b_packed);

            let mut add_out = [Block8::ZERO.to_hardware(); 16];
            Flat::<Block8>::unpack(add_res, &mut add_out);

            for i in 0..16 {
                assert_eq!(
                    add_out[i],
                    (a_vals[i] + b_vals[i]).to_hardware(),
                    "Block8 packed add mismatch"
                );
            }

            // Test SIMD Mul (Flat basis)
            let mul_res = Block8::mul_hardware_packed(a_packed, b_packed);

            let mut mul_out = [Block8::ZERO.to_hardware(); 16];
            Flat::<Block8>::unpack(mul_res, &mut mul_out);

            for i in 0..16 {
                assert_eq!(
                    mul_out[i],
                    (a_vals[i] * b_vals[i]).to_hardware(),
                    "Block8 packed mul mismatch"
                );
            }
        }
    }

    // ==================================
    // PACKED
    // ==================================

    #[test]
    fn pack_unpack_roundtrip() {
        let mut rng = rng();
        let mut data = [Block8::ZERO; PACKED_WIDTH_8];

        for v in data.iter_mut() {
            *v = Block8(rng.random());
        }

        let packed = Block8::pack(&data);
        let mut unpacked = [Block8::ZERO; PACKED_WIDTH_8];
        Block8::unpack(packed, &mut unpacked);

        assert_eq!(data, unpacked, "Block8 pack/unpack roundtrip failed");
    }

    #[test]
    fn packed_add_consistency() {
        let mut rng = rng();
        let mut a_vals = [Block8::ZERO; PACKED_WIDTH_8];
        let mut b_vals = [Block8::ZERO; PACKED_WIDTH_8];

        for i in 0..PACKED_WIDTH_8 {
            a_vals[i] = Block8(rng.random());
            b_vals[i] = Block8(rng.random());
        }

        let a_packed = Block8::pack(&a_vals);
        let b_packed = Block8::pack(&b_vals);
        let res_packed = a_packed + b_packed;

        let mut res_unpacked = [Block8::ZERO; PACKED_WIDTH_8];
        Block8::unpack(res_packed, &mut res_unpacked);

        for i in 0..PACKED_WIDTH_8 {
            assert_eq!(
                res_unpacked[i],
                a_vals[i] + b_vals[i],
                "Block8 packed add mismatch at index {}",
                i
            );
        }
    }

    #[test]
    fn packed_mul_consistency() {
        let mut rng = rng();

        // Cross-checks the SIMD/iso packed path against the scalar
        // multiplication, lane by lane.
        for _ in 0..1000 {
            let mut a_arr = [Block8::ZERO; PACKED_WIDTH_8];
            let mut b_arr = [Block8::ZERO; PACKED_WIDTH_8];

            for i in 0..PACKED_WIDTH_8 {
                let val_a: u8 = rng.random();
                let val_b: u8 = rng.random();
                a_arr[i] = Block8(val_a);
                b_arr[i] = Block8(val_b);
            }

            let a_packed = PackedBlock8(a_arr);
            let b_packed = PackedBlock8(b_arr);
            let c_packed = a_packed * b_packed;

            let mut c_expected = [Block8::ZERO; PACKED_WIDTH_8];
            for i in 0..PACKED_WIDTH_8 {
                c_expected[i] = a_arr[i] * b_arr[i];
            }

            assert_eq!(c_packed.0, c_expected, "SIMD Block8 mismatch!");
        }
    }
}
1061}