// hekate_math/towers/block128.rs
1// SPDX-License-Identifier: Apache-2.0
2// This file is part of the hekate-math project.
3// Copyright (C) 2026 Andrei Kochergin <zeek@tuta.com>
4// Copyright (C) 2026 Oumuamua Labs. All rights reserved.
5//
6// Licensed under the Apache License, Version 2.0 (the "License");
7// you may not use this file except in compliance with the License.
8// You may obtain a copy of the License at
9//
10//     http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! BLOCK 128 (GF(2^128))
19use crate::towers::bit::Bit;
20use crate::towers::block8::Block8;
21use crate::towers::block16::Block16;
22use crate::towers::block32::Block32;
23use crate::towers::block64::Block64;
24use crate::{
25    CanonicalDeserialize, CanonicalSerialize, Flat, FlatPromote, HardwareField, PackableField,
26    PackedFlat, TowerField, constants,
27};
28use core::ops::{Add, AddAssign, BitXor, BitXorAssign, Mul, MulAssign, Sub, SubAssign};
29use serde::{Deserialize, Serialize};
30use zeroize::Zeroize;
31
/// Cache-line-aligned (64-byte) wrapper for the constant-time basis-change
/// tables, so the full-table scan in `map_ct_128_split` walks whole cache
/// lines predictably regardless of the secret input.
#[cfg(not(feature = "table-math"))]
#[repr(align(64))]
struct CtConvertBasisU128<const N: usize>([u128; N]);

/// Per-bit images of the tower basis in the "flat" (hardware) basis:
/// entry i is the flat representation of tower basis bit i.
#[cfg(not(feature = "table-math"))]
static TOWER_TO_FLAT_BASIS_128: CtConvertBasisU128<128> =
    CtConvertBasisU128(constants::RAW_TOWER_TO_FLAT_128);

/// Inverse map: per-bit images of the flat basis in the tower basis.
#[cfg(not(feature = "table-math"))]
static FLAT_TO_TOWER_BASIS_128: CtConvertBasisU128<128> =
    CtConvertBasisU128(constants::RAW_FLAT_TO_TOWER_128);
43
/// An element of the top tower level GF(2^128), stored as a single 128-bit
/// word (bit i is the coefficient of the i-th tower basis element).
#[derive(Copy, Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, Zeroize)]
#[repr(transparent)]
pub struct Block128(pub u128);
47
48impl Block128 {
49    // 0x2000_0000_0000_0000 << 64
50    const TAU: Self = Block128(0x2000_0000_0000_0000_0000_0000_0000_0000);
51
52    pub fn new(lo: Block64, hi: Block64) -> Self {
53        Self((hi.0 as u128) << 64 | (lo.0 as u128))
54    }
55
56    #[inline(always)]
57    pub fn split(self) -> (Block64, Block64) {
58        (Block64(self.0 as u64), Block64((self.0 >> 64) as u64))
59    }
60}
61
impl TowerField for Block128 {
    const BITS: usize = 128;
    const ZERO: Self = Block128(0);
    const ONE: Self = Block128(1);

    // τ for the *next* extension step (128 → 256), re-using this level's TAU.
    const EXTENSION_TAU: Self = Self::TAU;

    /// Inversion via the quadratic-subfield trick: write a = l + h·x over
    /// GF(2^64) (presumably with x² = x + τ at this level — confirm against
    /// the tower construction used by `Mul`). The norm
    /// N = τ·h² + h·l + l² lies in GF(2^64), so a single 64-bit inversion
    /// suffices, and a⁻¹ = (h + l)/N + (h/N)·x.
    ///
    /// NOTE(review): for a = 0 the norm is 0 and the result follows whatever
    /// `Block64::invert` does on zero — verify that contract upstream.
    fn invert(&self) -> Self {
        let (l, h) = self.split();
        let h2 = h * h;
        let l2 = l * l;
        let hl = h * l;
        // N = τ·h² + h·l + l² — an element of the GF(2^64) subfield.
        let norm = (h2 * Block64::TAU) + hl + l2;

        let norm_inv = norm.invert();
        let res_hi = h * norm_inv;
        let res_lo = (h + l) * norm_inv;

        Self::new(res_lo, res_hi)
    }

    /// Derives an element from 32 uniform random bytes. Only the first 16
    /// bytes are consumed (little-endian), which keeps the output uniform
    /// over GF(2^128).
    fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
        let mut buf = [0u8; 16];
        buf.copy_from_slice(&bytes[0..16]);

        Self(u128::from_le_bytes(buf))
    }
}
90
91impl Add for Block128 {
92    type Output = Self;
93
94    fn add(self, rhs: Self) -> Self {
95        Self(self.0.bitxor(rhs.0))
96    }
97}
98
99impl Sub for Block128 {
100    type Output = Self;
101
102    fn sub(self, rhs: Self) -> Self {
103        self.add(rhs)
104    }
105}
106
impl Mul for Block128 {
    type Output = Self;

    /// Tower multiplication via one level of Karatsuba over GF(2^64).
    ///
    /// Writing a = a0 + a1·x and b = b0 + b1·x (presumably with x² = x + τ
    /// at this level — confirm against the tower construction), the product
    /// is
    ///     a·b = (v0 + τ·v1) + (v0 + v_sum)·x
    /// where v0 = a0·b0, v1 = a1·b1 and v_sum = (a0 + a1)(b0 + b1); note
    /// v0 + v_sum = v1 + (a0·b1 + a1·b0), so the cross term never has to be
    /// formed with extra multiplications — three 64-bit multiplies total.
    fn mul(self, rhs: Self) -> Self {
        let (a0, a1) = self.split();
        let (b0, b1) = rhs.split();

        let v0 = a0 * b0;
        let v1 = a1 * b1;
        let v_sum = (a0 + a1) * (b0 + b1);

        let c_hi = v0 + v_sum;
        let c_lo = v0 + (v1 * Block64::TAU);

        Self::new(c_lo, c_hi)
    }
}
124
125impl AddAssign for Block128 {
126    fn add_assign(&mut self, rhs: Self) {
127        self.0.bitxor_assign(rhs.0);
128    }
129}
130
131impl SubAssign for Block128 {
132    fn sub_assign(&mut self, rhs: Self) {
133        self.0.bitxor_assign(rhs.0);
134    }
135}
136
impl MulAssign for Block128 {
    // Delegates to `Mul` so both operators share the single Karatsuba path.
    fn mul_assign(&mut self, rhs: Self) {
        *self = *self * rhs;
    }
}
142
143impl CanonicalSerialize for Block128 {
144    fn serialized_size(&self) -> usize {
145        16
146    }
147
148    fn serialize(&self, writer: &mut [u8]) -> Result<(), ()> {
149        if writer.len() < 16 {
150            return Err(());
151        }
152
153        writer.copy_from_slice(&self.0.to_le_bytes());
154
155        Ok(())
156    }
157}
158
159impl CanonicalDeserialize for Block128 {
160    fn deserialize(bytes: &[u8]) -> Result<Self, ()> {
161        if bytes.len() < 16 {
162            return Err(());
163        }
164
165        let mut buf = [0u8; 16];
166        buf.copy_from_slice(&bytes[0..16]);
167
168        Ok(Self(u128::from_le_bytes(buf)))
169    }
170}
171
172impl From<u8> for Block128 {
173    fn from(val: u8) -> Self {
174        Self(val as u128)
175    }
176}
177
178impl From<u32> for Block128 {
179    #[inline]
180    fn from(val: u32) -> Self {
181        Self(val as u128)
182    }
183}
184
185impl From<u64> for Block128 {
186    #[inline]
187    fn from(val: u64) -> Self {
188        Self::from(val as u128)
189    }
190}
191
192impl From<u128> for Block128 {
193    #[inline]
194    fn from(val: u128) -> Self {
195        Self(val)
196    }
197}
198
199// ========================================
200// FIELD LIFTING
201// ========================================
202
203impl From<Bit> for Block128 {
204    #[inline(always)]
205    fn from(val: Bit) -> Self {
206        Self(val.0 as u128)
207    }
208}
209
210impl From<Block8> for Block128 {
211    #[inline(always)]
212    fn from(val: Block8) -> Self {
213        Self(val.0 as u128)
214    }
215}
216
217impl From<Block16> for Block128 {
218    #[inline(always)]
219    fn from(val: Block16) -> Self {
220        Self(val.0 as u128)
221    }
222}
223
224impl From<Block32> for Block128 {
225    #[inline(always)]
226    fn from(val: Block32) -> Self {
227        Self(val.0 as u128)
228    }
229}
230
231impl From<Block64> for Block128 {
232    #[inline(always)]
233    fn from(val: Block64) -> Self {
234        Self(val.0 as u128)
235    }
236}
237
// ===================================
// PACKED BLOCK 128 (Width = 4)
// ===================================

/// Number of `Block128` lanes held by one [`PackedBlock128`].
pub const PACKED_WIDTH_128: usize = 4;

/// A SIMD register containing `PACKED_WIDTH`
/// of Block128 elements. Force 32-byte alignment
/// for AVX2 compatibility.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
#[repr(C, align(32))]
pub struct PackedBlock128(pub [Block128; PACKED_WIDTH_128]);
250
251impl PackedBlock128 {
252    /// Create a zeroed vector.
253    #[inline(always)]
254    pub fn zero() -> Self {
255        Self([Block128::ZERO; PACKED_WIDTH_128])
256    }
257
258    /// Fill vector with the same value (Broadcast).
259    #[inline(always)]
260    pub fn broadcast(val: Block128) -> Self {
261        Self([val; PACKED_WIDTH_128])
262    }
263}
264
265impl PackableField for Block128 {
266    type Packed = PackedBlock128;
267
268    const WIDTH: usize = PACKED_WIDTH_128;
269
270    #[inline(always)]
271    fn pack(chunk: &[Self]) -> Self::Packed {
272        assert!(
273            chunk.len() >= PACKED_WIDTH_128,
274            "PackableField::pack: input slice too short",
275        );
276
277        let mut arr = [Self::ZERO; PACKED_WIDTH_128];
278        arr.copy_from_slice(&chunk[..PACKED_WIDTH_128]);
279
280        PackedBlock128(arr)
281    }
282
283    #[inline(always)]
284    fn unpack(packed: Self::Packed, output: &mut [Self]) {
285        assert!(
286            output.len() >= PACKED_WIDTH_128,
287            "PackableField::unpack: output slice too short",
288        );
289
290        output[..PACKED_WIDTH_128].copy_from_slice(&packed.0);
291    }
292}
293
294// 1. ADDITION (XOR)
295// This is perfectly parallel. Compiler will
296// vectorize this automatically using `vpxor`.
297
298impl Add for PackedBlock128 {
299    type Output = Self;
300
301    #[inline(always)]
302    fn add(self, rhs: Self) -> Self {
303        let mut res = [Block128::ZERO; PACKED_WIDTH_128];
304        for ((out, l), r) in res.iter_mut().zip(self.0.iter()).zip(rhs.0.iter()) {
305            *out = *l + *r;
306        }
307
308        Self(res)
309    }
310}
311
312impl AddAssign for PackedBlock128 {
313    #[inline(always)]
314    fn add_assign(&mut self, rhs: Self) {
315        for (l, r) in self.0.iter_mut().zip(rhs.0.iter()) {
316            *l += *r;
317        }
318    }
319}
320
321// 2. SUBTRACTION (Same as Add for Char 2)
322
323impl Sub for PackedBlock128 {
324    type Output = Self;
325
326    #[inline(always)]
327    fn sub(self, rhs: Self) -> Self {
328        self.add(rhs)
329    }
330}
331
332impl SubAssign for PackedBlock128 {
333    #[inline(always)]
334    fn sub_assign(&mut self, rhs: Self) {
335        self.add_assign(rhs);
336    }
337}
338
// 3. MULTIPLICATION (Hardware Accelerated)

impl Mul for PackedBlock128 {
    type Output = Self;

    /// Lane-wise field multiplication.
    ///
    /// On aarch64 each lane is converted to the flat basis, multiplied with
    /// the PMULL kernel, and converted back to the tower basis; on other
    /// targets each lane uses the scalar tower multiplier.
    #[inline(always)]
    fn mul(self, rhs: Self) -> Self {
        #[cfg(target_arch = "aarch64")]
        {
            let mut res = [Block128::ZERO; PACKED_WIDTH_128];
            for ((out, l), r) in res.iter_mut().zip(self.0.iter()).zip(rhs.0.iter()) {
                let a_flat = l.to_hardware();
                let b_flat = r.to_hardware();
                let c_flat =
                    Flat::from_raw(neon::mul_flat_128(a_flat.into_raw(), b_flat.into_raw()));

                *out = c_flat.to_tower();
            }

            Self(res)
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            let mut res = [Block128::ZERO; PACKED_WIDTH_128];
            for ((out, l), r) in res.iter_mut().zip(self.0.iter()).zip(rhs.0.iter()) {
                *out = *l * *r;
            }

            Self(res)
        }
    }
}
372
373impl MulAssign for PackedBlock128 {
374    #[inline(always)]
375    fn mul_assign(&mut self, rhs: Self) {
376        for (l, r) in self.0.iter_mut().zip(rhs.0.iter()) {
377            *l *= *r;
378        }
379    }
380}
381
382// 4. SCALAR MULTIPLICATION (Vector * Scalar)
383// Used for broadcasting coefficients.
384
385impl Mul<Block128> for PackedBlock128 {
386    type Output = Self;
387
388    #[inline(always)]
389    fn mul(self, rhs: Block128) -> Self {
390        let mut res = [Block128::ZERO; PACKED_WIDTH_128];
391        for (out, v) in res.iter_mut().zip(self.0.iter()) {
392            *out = *v * rhs;
393        }
394
395        Self(res)
396    }
397}
398
399impl MulAssign<Block128> for PackedBlock128 {
400    #[inline(always)]
401    fn mul_assign(&mut self, rhs: Block128) {
402        for v in self.0.iter_mut() {
403            *v *= rhs;
404        }
405    }
406}
407
// ===================================
// Block128 Hardware Field
// ===================================

impl HardwareField for Block128 {
    /// Basis change: tower representation → flat (hardware) representation
    /// used by the carry-less-multiply kernels.
    #[inline(always)]
    fn to_hardware(self) -> Flat<Self> {
        #[cfg(feature = "table-math")]
        {
            // Table path: faster, but table indexing is secret-dependent
            // (NOT constant-time).
            Flat::from_raw(apply_matrix_128(self, &constants::TOWER_TO_FLAT_128))
        }

        #[cfg(not(feature = "table-math"))]
        {
            // Constant-time mask-and-XOR basis change.
            Flat::from_raw(Block128(map_ct_128_split(
                self.0,
                &TOWER_TO_FLAT_BASIS_128.0,
            )))
        }
    }

    /// Inverse basis change: flat (hardware) representation → tower.
    #[inline(always)]
    fn from_hardware(value: Flat<Self>) -> Self {
        let value = value.into_raw();

        #[cfg(feature = "table-math")]
        {
            apply_matrix_128(value, &constants::FLAT_TO_TOWER_128)
        }

        #[cfg(not(feature = "table-math"))]
        {
            Block128(map_ct_128_split(value.0, &FLAT_TO_TOWER_BASIS_128.0))
        }
    }

    /// Addition is XOR in every basis, so flat addition reuses `Add`.
    #[inline(always)]
    fn add_hardware(lhs: Flat<Self>, rhs: Flat<Self>) -> Flat<Self> {
        Flat::from_raw(lhs.into_raw() + rhs.into_raw())
    }

    /// Packed flat addition; uses the NEON XOR kernel on aarch64.
    #[inline(always)]
    fn add_hardware_packed(lhs: PackedFlat<Self>, rhs: PackedFlat<Self>) -> PackedFlat<Self> {
        let lhs = lhs.into_raw();
        let rhs = rhs.into_raw();

        #[cfg(target_arch = "aarch64")]
        {
            PackedFlat::from_raw(neon::add_packed_128(lhs, rhs))
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            PackedFlat::from_raw(lhs + rhs)
        }
    }

    /// Flat multiplication: PMULL kernel on aarch64; elsewhere round-trips
    /// through the tower basis and multiplies there.
    #[inline(always)]
    fn mul_hardware(lhs: Flat<Self>, rhs: Flat<Self>) -> Flat<Self> {
        let lhs = lhs.into_raw();
        let rhs = rhs.into_raw();

        #[cfg(target_arch = "aarch64")]
        {
            Flat::from_raw(neon::mul_flat_128(lhs, rhs))
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            // No carry-less-multiply kernel on this target: convert both
            // operands back to the tower basis, multiply, and re-flatten.
            let a_tower = Self::from_hardware(Flat::from_raw(lhs));
            let b_tower = Self::from_hardware(Flat::from_raw(rhs));

            (a_tower * b_tower).to_hardware()
        }
    }

    /// Packed flat multiplication, lane by lane.
    #[inline(always)]
    fn mul_hardware_packed(lhs: PackedFlat<Self>, rhs: PackedFlat<Self>) -> PackedFlat<Self> {
        let lhs = lhs.into_raw();
        let rhs = rhs.into_raw();

        #[cfg(target_arch = "aarch64")]
        {
            let mut res = [Block128::ZERO; PACKED_WIDTH_128];
            for ((out, l), r) in res.iter_mut().zip(lhs.0.iter()).zip(rhs.0.iter()) {
                *out = neon::mul_flat_128(*l, *r);
            }

            PackedFlat::from_raw(PackedBlock128(res))
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            // Unpack, multiply each lane via the scalar flat path, re-pack.
            let mut l = [Self::ZERO; <Self as PackableField>::WIDTH];
            let mut r = [Self::ZERO; <Self as PackableField>::WIDTH];
            let mut res = [Self::ZERO; <Self as PackableField>::WIDTH];

            Self::unpack(lhs, &mut l);
            Self::unpack(rhs, &mut r);

            for i in 0..<Self as PackableField>::WIDTH {
                res[i] = Self::mul_hardware(Flat::from_raw(l[i]), Flat::from_raw(r[i])).into_raw();
            }

            PackedFlat::from_raw(Self::pack(&res))
        }
    }

    /// Vector × scalar in the flat basis: broadcast the scalar, then run the
    /// packed multiply.
    #[inline(always)]
    fn mul_hardware_scalar_packed(lhs: PackedFlat<Self>, rhs: Flat<Self>) -> PackedFlat<Self> {
        let broadcasted = PackedBlock128::broadcast(rhs.into_raw());
        Self::mul_hardware_packed(lhs, PackedFlat::from_raw(broadcasted))
    }

    /// Reads tower-basis bit `bit_idx` of a flat value: each tower bit is a
    /// fixed linear form of the flat bits, precomputed as a mask.
    ///
    /// NOTE(review): panics if `bit_idx` is out of range for the mask table.
    #[inline(always)]
    fn tower_bit_from_hardware(value: Flat<Self>, bit_idx: usize) -> u8 {
        let mask = constants::FLAT_TO_TOWER_BIT_MASKS_128[bit_idx];

        // Parity of (x & mask) without popcount.
        // Folds 128 bits down to 1
        // using a binary XOR tree.
        let mut v = value.into_raw().0 & mask;
        v ^= v >> 64;
        v ^= v >> 32;
        v ^= v >> 16;
        v ^= v >> 8;
        v ^= v >> 4;
        v ^= v >> 2;
        v ^= v >> 1;

        (v & 1) as u8
    }
}
541
// ========================================
// FIELD LIFTING (FlatPromote)
// ========================================
//
// SECURITY:
// Default implementation is constant-time (CT):
// no secret-dependent memory access.

#[cfg(not(feature = "table-math"))]
impl FlatPromote<Block8> for Block128 {
    /// Constant-time lift of a single flat Block8 into flat Block128 via the
    /// 8-entry lift basis.
    #[inline(always)]
    fn promote_flat(val: Flat<Block8>) -> Flat<Self> {
        let val = val.into_raw();
        Flat::from_raw(Block128(lift_ct::<8>(
            val.0 as u64,
            &constants::LIFT_BASIS_8.0,
        )))
    }

    /// Batch lift. On aarch64, full chunks of 16 elements run through the
    /// NEON nibble-table kernel; the remainder (and all other targets) uses
    /// the scalar constant-time path. Only the first
    /// `min(input.len(), output.len())` elements are written.
    fn promote_flat_batch(input: &[Flat<Block8>], output: &mut [Flat<Self>]) {
        let n = input.len().min(output.len());

        #[cfg(target_arch = "aarch64")]
        {
            let full = n / 16;
            for chunk in 0..full {
                let i = chunk * 16;
                // SAFETY: i + 16 <= n bounds both the 16 one-byte inputs
                // read and the 16 u128 outputs written. Assumes `Flat<_>`
                // is a transparent wrapper over the raw block so the pointer
                // casts are layout-compatible — TODO confirm its #[repr].
                unsafe {
                    neon::promote_batch_8_to_128(
                        input.as_ptr().add(i).cast::<u8>(),
                        output.as_mut_ptr().add(i).cast::<u128>(),
                    );
                }
            }

            let tail = full * 16;
            for i in tail..n {
                output[i] = Self::promote_flat(input[i]);
            }
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            for i in 0..n {
                output[i] = Self::promote_flat(input[i]);
            }
        }
    }
}
591
#[cfg(not(feature = "table-math"))]
impl FlatPromote<Block16> for Block128 {
    /// Constant-time lift of a single flat Block16 into flat Block128 via
    /// the 16-entry lift basis.
    #[inline(always)]
    fn promote_flat(val: Flat<Block16>) -> Flat<Self> {
        Flat::from_raw(Block128(lift_ct::<16>(
            val.into_raw().0 as u64,
            &constants::LIFT_BASIS_16.0,
        )))
    }

    /// Batch lift; NEON kernel for full 16-element chunks on aarch64,
    /// scalar constant-time path otherwise. Only the first
    /// `min(input.len(), output.len())` elements are written.
    fn promote_flat_batch(input: &[Flat<Block16>], output: &mut [Flat<Self>]) {
        let n = input.len().min(output.len());

        #[cfg(target_arch = "aarch64")]
        {
            let full = n / 16;
            for chunk in 0..full {
                let i = chunk * 16;
                // SAFETY: i + 16 <= n bounds the 16 two-byte inputs (32
                // bytes) read and the 16 u128 outputs written. Assumes
                // `Flat<_>` is a transparent wrapper — TODO confirm.
                unsafe {
                    neon::promote_batch_16_to_128(
                        input.as_ptr().add(i).cast::<u8>(),
                        output.as_mut_ptr().add(i).cast::<u128>(),
                    );
                }
            }

            let tail = full * 16;
            for i in tail..n {
                output[i] = Self::promote_flat(input[i]);
            }
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            for i in 0..n {
                output[i] = Self::promote_flat(input[i]);
            }
        }
    }
}
632
#[cfg(not(feature = "table-math"))]
impl FlatPromote<Block32> for Block128 {
    /// Constant-time lift of a single flat Block32 into flat Block128 via
    /// the 32-entry lift basis.
    #[inline(always)]
    fn promote_flat(val: Flat<Block32>) -> Flat<Self> {
        Flat::from_raw(Block128(lift_ct::<32>(
            val.into_raw().0 as u64,
            &constants::LIFT_BASIS_32.0,
        )))
    }

    /// Batch lift; NEON kernel for full 16-element chunks on aarch64,
    /// scalar constant-time path otherwise. Only the first
    /// `min(input.len(), output.len())` elements are written.
    fn promote_flat_batch(input: &[Flat<Block32>], output: &mut [Flat<Self>]) {
        let n = input.len().min(output.len());

        #[cfg(target_arch = "aarch64")]
        {
            let full = n / 16;
            for chunk in 0..full {
                let i = chunk * 16;
                // SAFETY: i + 16 <= n bounds the 16 four-byte inputs (64
                // bytes) read and the 16 u128 outputs written. Assumes
                // `Flat<_>` is a transparent wrapper — TODO confirm.
                unsafe {
                    neon::promote_batch_32_to_128(
                        input.as_ptr().add(i).cast::<u8>(),
                        output.as_mut_ptr().add(i).cast::<u128>(),
                    );
                }
            }

            let tail = full * 16;
            for i in tail..n {
                output[i] = Self::promote_flat(input[i]);
            }
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            for i in 0..n {
                output[i] = Self::promote_flat(input[i]);
            }
        }
    }
}
673
674#[cfg(not(feature = "table-math"))]
675impl FlatPromote<Block64> for Block128 {
676    #[inline(always)]
677    fn promote_flat(val: Flat<Block64>) -> Flat<Self> {
678        Flat::from_raw(Block128(lift_ct::<64>(
679            val.into_raw().0,
680            &constants::LIFT_BASIS_64.0,
681        )))
682    }
683}
684
// Insecure (secret-dependent table indexing) lifting path
#[cfg(feature = "table-math")]
impl FlatPromote<Block8> for Block128 {
    /// Table-lookup lift: the 8-bit value directly indexes a lift table.
    ///
    /// WARNING: the index is secret-dependent, so this path is NOT
    /// constant-time (cache-timing side channel).
    #[inline(always)]
    fn promote_flat(val: Flat<Block8>) -> Flat<Self> {
        let idx = val.into_raw().0 as usize;
        // SAFETY: idx comes from a u8, so idx < 256 — presumably
        // LIFT_TABLE_8_TO_128 has exactly 256 entries; TODO confirm.
        Flat::from_raw(Block128(unsafe {
            *constants::LIFT_TABLE_8_TO_128.get_unchecked(idx)
        }))
    }
}
696
#[cfg(feature = "table-math")]
impl FlatPromote<Block16> for Block128 {
    /// Table-lookup lift: one 256-entry table per input byte, XORed
    /// together (lifting is linear over GF(2)).
    ///
    /// WARNING: indices are secret-dependent — NOT constant-time.
    #[inline(always)]
    fn promote_flat(val: Flat<Block16>) -> Flat<Self> {
        let v = val.into_raw().0;
        // SAFETY: every index is masked with 0xFF, so it is < 256 —
        // presumably each byte table has 256 entries; TODO confirm.
        let res = unsafe {
            *constants::PROMOTE_16_BYTE_0_TO_128.get_unchecked((v & 0xFF) as usize)
                ^ *constants::PROMOTE_16_BYTE_1_TO_128.get_unchecked(((v >> 8) & 0xFF) as usize)
        };

        Flat::from_raw(Block128(res))
    }
}
710
#[cfg(feature = "table-math")]
impl FlatPromote<Block32> for Block128 {
    /// Table-lookup lift: one 256-entry table per input byte, XORed
    /// together (lifting is linear over GF(2)).
    ///
    /// WARNING: indices are secret-dependent — NOT constant-time.
    #[inline(always)]
    fn promote_flat(val: Flat<Block32>) -> Flat<Self> {
        let v = val.into_raw().0;
        // SAFETY: every index is masked with 0xFF, so it is < 256 —
        // presumably each byte table has 256 entries; TODO confirm.
        let res = unsafe {
            *constants::PROMOTE_32_BYTE_0_TO_128.get_unchecked((v & 0xFF) as usize)
                ^ *constants::PROMOTE_32_BYTE_1_TO_128.get_unchecked(((v >> 8) & 0xFF) as usize)
                ^ *constants::PROMOTE_32_BYTE_2_TO_128.get_unchecked(((v >> 16) & 0xFF) as usize)
                ^ *constants::PROMOTE_32_BYTE_3_TO_128.get_unchecked(((v >> 24) & 0xFF) as usize)
        };

        Flat::from_raw(Block128(res))
    }
}
726
#[cfg(feature = "table-math")]
impl FlatPromote<Block64> for Block128 {
    /// Table-lookup lift: one 256-entry table per input byte, XORed
    /// together (lifting is linear over GF(2)).
    ///
    /// WARNING: indices are secret-dependent — NOT constant-time.
    #[inline(always)]
    fn promote_flat(val: Flat<Block64>) -> Flat<Self> {
        let v = val.into_raw().0;
        // SAFETY: every index is masked with 0xFF, so it is < 256 —
        // presumably each byte table has 256 entries; TODO confirm.
        let res = unsafe {
            *constants::PROMOTE_64_BYTE_0_TO_128.get_unchecked((v & 0xFF) as usize)
                ^ *constants::PROMOTE_64_BYTE_1_TO_128.get_unchecked(((v >> 8) & 0xFF) as usize)
                ^ *constants::PROMOTE_64_BYTE_2_TO_128.get_unchecked(((v >> 16) & 0xFF) as usize)
                ^ *constants::PROMOTE_64_BYTE_3_TO_128.get_unchecked(((v >> 24) & 0xFF) as usize)
                ^ *constants::PROMOTE_64_BYTE_4_TO_128.get_unchecked(((v >> 32) & 0xFF) as usize)
                ^ *constants::PROMOTE_64_BYTE_5_TO_128.get_unchecked(((v >> 40) & 0xFF) as usize)
                ^ *constants::PROMOTE_64_BYTE_6_TO_128.get_unchecked(((v >> 48) & 0xFF) as usize)
                ^ *constants::PROMOTE_64_BYTE_7_TO_128.get_unchecked(((v >> 56) & 0xFF) as usize)
        };

        Flat::from_raw(Block128(res))
    }
}
746
747// ===========================================
748// UTILS
749// ===========================================
750
#[cfg(feature = "table-math")]
#[inline(always)]
/// Applies a 128×128 GF(2) matrix to `val` using 8-bit windowed lookups:
/// window i owns table rows [i * 256, (i + 1) * 256), and the images of all
/// 16 input bytes are XORed together.
pub fn apply_matrix_128(val: Block128, table: &[u128; 4096]) -> Block128 {
    let word = val.0;
    let mut acc = 0u128;

    for window in 0..16 {
        let byte = ((word >> (window * 8)) & 0xFF) as usize;
        // SAFETY: window < 16 and byte < 256, so the index is < 4096, which
        // is exactly the table length.
        acc ^= unsafe { *table.get_unchecked(window * 256 + byte) };
    }

    Block128(acc)
}
768
#[cfg(not(feature = "table-math"))]
#[inline(always)]
/// Constant-time 128-bit basis change: XORs `basis[i]` into the result for
/// every set bit i of `x`, selected by an arithmetic mask so no branch or
/// memory access depends on the (possibly secret) input. The accumulator is
/// kept as two u64 halves to ease 64-bit register allocation.
fn map_ct_128_split(x: u128, basis: &[u128; 128]) -> u128 {
    let mut lo = 0u64;
    let mut hi = 0u64;

    for (i, &vector) in basis.iter().enumerate() {
        // All-ones when bit i of x is set, all-zeros otherwise.
        let mask = 0u64.wrapping_sub(((x >> i) & 1) as u64);

        lo ^= (vector as u64) & mask;
        hi ^= ((vector >> 64) as u64) & mask;
    }

    (lo as u128) | ((hi as u128) << 64)
}
789
#[cfg(not(feature = "table-math"))]
#[inline(always)]
/// Constant-time lift: XORs `basis[i]` into the accumulator exactly when bit
/// i of `x` is set, via an arithmetic mask — no branch or memory access
/// depends on the (possibly secret) input.
fn lift_ct<const N: usize>(x: u64, basis: &'static [u128; N]) -> u128 {
    basis.iter().enumerate().fold(0u128, |acc, (i, &vector)| {
        let mask = 0u128.wrapping_sub(u128::from((x >> i) & 1));
        acc ^ (vector & mask)
    })
}
805
806// ===========================================
807// 128-BIT SIMD INSTRUCTIONS
808// ===========================================
809
810#[cfg(target_arch = "aarch64")]
811mod neon {
812    use super::*;
813    use core::arch::aarch64::*;
814    use core::mem::transmute;
815
    /// Lane-wise XOR of two packed vectors (4 × 128 bits) with NEON `veorq`.
    #[inline(always)]
    pub fn add_packed_128(lhs: PackedBlock128, rhs: PackedBlock128) -> PackedBlock128 {
        unsafe {
            // Block128 is packed into 4 elements
            // (512 bits), work with 4 registers.
            // SAFETY: [Block128; 4] and [uint8x16_t; 4] are both 64-byte
            // plain bit-pattern types, so the transmutes merely reinterpret
            // bytes.
            let l: [uint8x16_t; 4] = transmute(lhs.0);
            let r: [uint8x16_t; 4] = transmute(rhs.0);

            let res = [
                veorq_u8(l[0], r[0]),
                veorq_u8(l[1], r[1]),
                veorq_u8(l[2], r[2]),
                veorq_u8(l[3], r[3]),
            ];

            transmute(res)
        }
    }
834
    /// Multiplies two flat-basis GF(2^128) elements with PMULL:
    /// Karatsuba on the 64-bit halves (3 × `vmull_p64`), then reduction of
    /// the 256-bit product modulo x^128 + R(x), R = `constants::POLY_128`.
    #[inline(always)]
    pub fn mul_flat_128(a: Block128, b: Block128) -> Block128 {
        unsafe {
            // Treat inputs as pairs of u64
            let a_vec: uint64x2_t = transmute(a.0);
            let b_vec: uint64x2_t = transmute(b.0);

            let a0 = vgetq_lane_u64(a_vec, 0);
            let a1 = vgetq_lane_u64(a_vec, 1);
            let b0 = vgetq_lane_u64(b_vec, 0);
            let b1 = vgetq_lane_u64(b_vec, 1);

            // Karatsuba Multiplication using PMULL (64x64 -> 128)
            // vmull_p64 takes poly64_t (which is u64)
            let d0 = vmull_p64(a0, b0);
            let d2 = vmull_p64(a1, b1);
            let d1 = vmull_p64(a0 ^ a1, b0 ^ b1);

            // Mid term = D1 ^ D0 ^ D2 (128-bit XOR)
            // Since d0, d1, d2 are poly128_t,
            // cast to uint128-like (uint8x16_t) to XOR
            let d0_v: uint8x16_t = transmute(d0);
            let d1_v: uint8x16_t = transmute(d1);
            let d2_v: uint8x16_t = transmute(d2);

            let mid_v = veorq_u8(d1_v, veorq_u8(d0_v, d2_v));

            // Convert results to u64 parts for reduction
            let d0_u64: uint64x2_t = transmute(d0);
            let mid_u64: uint64x2_t = transmute(mid_v);
            let d2_u64: uint64x2_t = transmute(d2);

            // 256-bit product as four 64-bit limbs [c0..c3], low to high.
            let c0 = vgetq_lane_u64(d0_u64, 0);
            let c1 = vgetq_lane_u64(d0_u64, 1) ^ vgetq_lane_u64(mid_u64, 0);
            let c2 = vgetq_lane_u64(d2_u64, 0) ^ vgetq_lane_u64(mid_u64, 1);
            let c3 = vgetq_lane_u64(d2_u64, 1);

            // Reduction P(x) = x^128 + R(x).
            // R(x) = 0x87 (fits in u64)
            let r_val = constants::POLY_128 as u64;

            // Fold H = [C2, C3]
            // Multiply C2 and C3 by R(x)
            // (c2·x^128 ≡ c2·R, c3·x^192 ≡ c3·R·x^64 mod P)
            let p0 = vmull_p64(c2, r_val);
            let p1 = vmull_p64(c3, r_val);

            let p0_u64: uint64x2_t = transmute(p0);
            let p1_u64: uint64x2_t = transmute(p1);

            let folded_0 = vgetq_lane_u64(p0_u64, 0);
            let folded_1 = vgetq_lane_u64(p0_u64, 1) ^ vgetq_lane_u64(p1_u64, 0);
            let carry = vgetq_lane_u64(p1_u64, 1);

            let final_0 = c0 ^ folded_0;
            let final_1 = c1 ^ folded_1;

            // Second reduction for carry
            // (the fold itself overflowed into bits >= 128)
            let carry_mul = vmull_p64(carry, r_val);

            // Use transmute to convert the opaque
            // poly128_t to something we can read.
            let carry_res_vec: uint64x2_t = transmute(carry_mul);
            let carry_res = vgetq_lane_u64(carry_res_vec, 0);

            let res_lo = final_0 ^ carry_res;
            let res_hi = final_1;

            Block128((res_lo as u128) | ((res_hi as u128) << 64))
        }
    }
905
    /// CT packed promote:
    /// 16 × Block8 → 16 × Block128 via nibble decomposition.
    ///
    /// Each output byte j is a table lookup (`vqtbl1q_u8`) on the low and
    /// high nibbles of all 16 inputs at once; the 16 byte-planes are then
    /// transposed into 16 contiguous u128 values.
    ///
    /// # Safety
    /// `input` must be valid for reading 16 bytes and `output` must be valid
    /// for writing 16 `u128` values; otherwise the loads/stores are UB.
    #[cfg(not(feature = "table-math"))]
    #[inline(always)]
    pub unsafe fn promote_batch_8_to_128(input: *const u8, output: *mut u128) {
        unsafe {
            let vals = vld1q_u8(input);

            let mask_0f = vdupq_n_u8(0x0F);
            let lo_nib = vandq_u8(vals, mask_0f);
            let hi_nib = vshrq_n_u8::<4>(vals);

            let mut out = [vdupq_n_u8(0); 16];

            // Byte-plane $j of the lifted value: XOR of the per-nibble
            // table images (lifting is linear over GF(2)).
            macro_rules! lookup {
                ($j:expr, $lo:ident, $hi:ident, $dst:ident) => {{
                    let t0 = vld1q_u8(constants::NIBBLE_PROMOTE_8_0_TO_128[$j].as_ptr());
                    let t1 = vld1q_u8(constants::NIBBLE_PROMOTE_8_1_TO_128[$j].as_ptr());

                    $dst[$j] = veorq_u8(vqtbl1q_u8(t0, $lo), vqtbl1q_u8(t1, $hi));
                }};
            }

            lookup!(0, lo_nib, hi_nib, out);
            lookup!(1, lo_nib, hi_nib, out);
            lookup!(2, lo_nib, hi_nib, out);
            lookup!(3, lo_nib, hi_nib, out);
            lookup!(4, lo_nib, hi_nib, out);
            lookup!(5, lo_nib, hi_nib, out);
            lookup!(6, lo_nib, hi_nib, out);
            lookup!(7, lo_nib, hi_nib, out);
            lookup!(8, lo_nib, hi_nib, out);
            lookup!(9, lo_nib, hi_nib, out);
            lookup!(10, lo_nib, hi_nib, out);
            lookup!(11, lo_nib, hi_nib, out);
            lookup!(12, lo_nib, hi_nib, out);
            lookup!(13, lo_nib, hi_nib, out);
            lookup!(14, lo_nib, hi_nib, out);
            lookup!(15, lo_nib, hi_nib, out);

            // Byte-planes → 16 contiguous 128-bit elements.
            let elems = transpose_16x16(&out);
            for (i, elem) in elems.iter().enumerate() {
                vst1q_u8(output.add(i).cast::<u8>(), *elem);
            }
        }
    }
952
    /// CT packed promote:
    /// 16 × Block16 → 16 × Block128 via nibble decomposition.
    ///
    /// De-interleaves the 32 input bytes into low/high byte planes, splits
    /// each plane into nibbles (4 nibbles per element), and XORs the four
    /// per-nibble table images per output byte, then transposes.
    ///
    /// # Safety
    /// `input` must be valid for reading 32 bytes and `output` must be valid
    /// for writing 16 `u128` values; otherwise the loads/stores are UB.
    #[cfg(not(feature = "table-math"))]
    #[inline(always)]
    pub unsafe fn promote_batch_16_to_128(input: *const u8, output: *mut u128) {
        unsafe {
            let raw0 = vld1q_u8(input);
            let raw1 = vld1q_u8(input.add(16));

            // Separate the little-endian u16 stream into its low and high
            // byte planes.
            let lo_bytes = vuzp1q_u8(raw0, raw1);
            let hi_bytes = vuzp2q_u8(raw0, raw1);

            let mask_0f = vdupq_n_u8(0x0F);
            let n0 = vandq_u8(lo_bytes, mask_0f);
            let n1 = vshrq_n_u8::<4>(lo_bytes);
            let n2 = vandq_u8(hi_bytes, mask_0f);
            let n3 = vshrq_n_u8::<4>(hi_bytes);

            let mut out = [vdupq_n_u8(0); 16];

            // Byte-plane $j of the lifted value: XOR of the four per-nibble
            // table images (lifting is linear over GF(2)).
            macro_rules! lookup {
                ($j:expr, $n0:ident, $n1:ident, $n2:ident, $n3:ident, $dst:ident) => {{
                    let t0 = vld1q_u8(constants::NIBBLE_PROMOTE_16_0_TO_128[$j].as_ptr());
                    let t1 = vld1q_u8(constants::NIBBLE_PROMOTE_16_1_TO_128[$j].as_ptr());
                    let t2 = vld1q_u8(constants::NIBBLE_PROMOTE_16_2_TO_128[$j].as_ptr());
                    let t3 = vld1q_u8(constants::NIBBLE_PROMOTE_16_3_TO_128[$j].as_ptr());

                    $dst[$j] = veorq_u8(
                        veorq_u8(vqtbl1q_u8(t0, $n0), vqtbl1q_u8(t1, $n1)),
                        veorq_u8(vqtbl1q_u8(t2, $n2), vqtbl1q_u8(t3, $n3)),
                    );
                }};
            }

            lookup!(0, n0, n1, n2, n3, out);
            lookup!(1, n0, n1, n2, n3, out);
            lookup!(2, n0, n1, n2, n3, out);
            lookup!(3, n0, n1, n2, n3, out);
            lookup!(4, n0, n1, n2, n3, out);
            lookup!(5, n0, n1, n2, n3, out);
            lookup!(6, n0, n1, n2, n3, out);
            lookup!(7, n0, n1, n2, n3, out);
            lookup!(8, n0, n1, n2, n3, out);
            lookup!(9, n0, n1, n2, n3, out);
            lookup!(10, n0, n1, n2, n3, out);
            lookup!(11, n0, n1, n2, n3, out);
            lookup!(12, n0, n1, n2, n3, out);
            lookup!(13, n0, n1, n2, n3, out);
            lookup!(14, n0, n1, n2, n3, out);
            lookup!(15, n0, n1, n2, n3, out);

            // Byte-planes → 16 contiguous 128-bit elements.
            let elems = transpose_16x16(&out);
            for (i, elem) in elems.iter().enumerate() {
                vst1q_u8(output.add(i).cast::<u8>(), *elem);
            }
        }
    }
1010
    /// CT packed promote:
    /// 16 × Block32 → 16 × Block128 via nibble decomposition.
    ///
    /// # Safety
    ///
    /// Per the visible pointer arithmetic: `input` must be valid for reading
    /// 64 bytes (16 little-endian u32 values) and `output` must be valid for
    /// writing 16 u128 values (byte alignment suffices — stores go through
    /// `vst1q_u8` on a byte-cast pointer). NEON must be available.
    #[cfg(not(feature = "table-math"))]
    #[inline(always)]
    pub unsafe fn promote_batch_32_to_128(input: *const u8, output: *mut u128) {
        unsafe {
            // Load the 64 input bytes as four 16-byte vectors.
            let raw0 = vld1q_u8(input);
            let raw1 = vld1q_u8(input.add(16));
            let raw2 = vld1q_u8(input.add(32));
            let raw3 = vld1q_u8(input.add(48));

            // Two-level unzip: first separate even/odd byte positions ...
            let a02 = vuzp1q_u8(raw0, raw1);
            let a13 = vuzp2q_u8(raw0, raw1);
            let b02 = vuzp1q_u8(raw2, raw3);
            let b13 = vuzp2q_u8(raw2, raw3);

            // ... then unzip again so `bytek` holds byte k of every
            // little-endian u32 element.
            let byte0 = vuzp1q_u8(a02, b02);
            let byte2 = vuzp2q_u8(a02, b02);
            let byte1 = vuzp1q_u8(a13, b13);
            let byte3 = vuzp2q_u8(a13, b13);

            // Eight nibble lanes per element (n0..n7), low/high nibble of
            // each of the four bytes; every lane is a 0..=15 table index.
            let mask_0f = vdupq_n_u8(0x0F);
            let n0 = vandq_u8(byte0, mask_0f);
            let n1 = vshrq_n_u8::<4>(byte0);
            let n2 = vandq_u8(byte1, mask_0f);
            let n3 = vshrq_n_u8::<4>(byte1);
            let n4 = vandq_u8(byte2, mask_0f);
            let n5 = vshrq_n_u8::<4>(byte2);
            let n6 = vandq_u8(byte3, mask_0f);
            let n7 = vshrq_n_u8::<4>(byte3);

            // Byte-sliced accumulator: out[j] = byte j of all 16 results.
            let mut out = [vdupq_n_u8(0); 16];

            // For output byte $j: table-lookup each nibble lane and XOR the
            // eight partial contributions (promotion is GF(2)-linear).
            // Fixed-size table loads keep this constant-time.
            macro_rules! lookup {
                ($j:expr, $n0:ident, $n1:ident, $n2:ident, $n3:ident,
                 $n4:ident, $n5:ident, $n6:ident, $n7:ident, $dst:ident) => {{
                    let t0 = vld1q_u8(constants::NIBBLE_PROMOTE_32_0_TO_128[$j].as_ptr());
                    let t1 = vld1q_u8(constants::NIBBLE_PROMOTE_32_1_TO_128[$j].as_ptr());
                    let t2 = vld1q_u8(constants::NIBBLE_PROMOTE_32_2_TO_128[$j].as_ptr());
                    let t3 = vld1q_u8(constants::NIBBLE_PROMOTE_32_3_TO_128[$j].as_ptr());
                    let t4 = vld1q_u8(constants::NIBBLE_PROMOTE_32_4_TO_128[$j].as_ptr());
                    let t5 = vld1q_u8(constants::NIBBLE_PROMOTE_32_5_TO_128[$j].as_ptr());
                    let t6 = vld1q_u8(constants::NIBBLE_PROMOTE_32_6_TO_128[$j].as_ptr());
                    let t7 = vld1q_u8(constants::NIBBLE_PROMOTE_32_7_TO_128[$j].as_ptr());

                    $dst[$j] = veorq_u8(
                        veorq_u8(
                            veorq_u8(vqtbl1q_u8(t0, $n0), vqtbl1q_u8(t1, $n1)),
                            veorq_u8(vqtbl1q_u8(t2, $n2), vqtbl1q_u8(t3, $n3)),
                        ),
                        veorq_u8(
                            veorq_u8(vqtbl1q_u8(t4, $n4), vqtbl1q_u8(t5, $n5)),
                            veorq_u8(vqtbl1q_u8(t6, $n6), vqtbl1q_u8(t7, $n7)),
                        ),
                    );
                }};
            }

            // Fully unrolled over the 16 output byte positions.
            lookup!(0, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(1, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(2, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(3, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(4, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(5, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(6, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(7, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(8, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(9, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(10, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(11, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(12, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(13, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(14, n0, n1, n2, n3, n4, n5, n6, n7, out);
            lookup!(15, n0, n1, n2, n3, n4, n5, n6, n7, out);

            // Transpose from byte-sliced to element order and store: after
            // the transpose, register i holds one complete 128-bit result.
            let elems = transpose_16x16(&out);
            for (i, elem) in elems.iter().enumerate() {
                vst1q_u8(output.add(i).cast::<u8>(), *elem);
            }
        }
    }
1092
    /// 16×16 byte matrix transpose via TRN cascade.
    ///
    /// Treats the 16 input registers as the rows of a 16×16 byte matrix and
    /// returns its transpose: output register i holds byte i of each of the
    /// 16 input rows. Used by the promote kernels to turn a byte-sliced
    /// accumulator into element order.
    ///
    /// # Safety
    ///
    /// Pure register arithmetic (no memory access), but it uses NEON
    /// intrinsics, so NEON must be available on the executing CPU —
    /// presumably guaranteed by target gating at module level (not visible
    /// here); confirm.
    #[cfg(not(feature = "table-math"))]
    #[inline(always)]
    unsafe fn transpose_16x16(r: &[uint8x16_t; 16]) -> [uint8x16_t; 16] {
        // Shorthand reinterpret casts
        // between NEON register types.
        // These are same-size bit reinterpretations (no lane shuffling),
        // needed because the TRN intrinsics are typed per element width.
        #[inline(always)]
        const fn u8_to_u16(v: uint8x16_t) -> uint16x8_t {
            unsafe { transmute::<uint8x16_t, uint16x8_t>(v) }
        }

        #[inline(always)]
        const fn u16_to_u32(v: uint16x8_t) -> uint32x4_t {
            unsafe { transmute::<uint16x8_t, uint32x4_t>(v) }
        }

        #[inline(always)]
        const fn u32_to_u64(v: uint32x4_t) -> uint64x2_t {
            unsafe { transmute::<uint32x4_t, uint64x2_t>(v) }
        }

        #[inline(always)]
        const fn u64_to_u8(v: uint64x2_t) -> uint8x16_t {
            unsafe { transmute::<uint64x2_t, uint8x16_t>(v) }
        }

        unsafe {
            // The classic log-depth transpose: four TRN passes at doubling
            // element widths (u8 → u16 → u32 → u64), each pass transposing
            // blocks twice the size of the previous one.

            // Phase 1:
            // TRN u8, transpose 2×2 byte blocks
            let a0 = vtrn1q_u8(r[0], r[1]);
            let a1 = vtrn2q_u8(r[0], r[1]);
            let a2 = vtrn1q_u8(r[2], r[3]);
            let a3 = vtrn2q_u8(r[2], r[3]);
            let a4 = vtrn1q_u8(r[4], r[5]);
            let a5 = vtrn2q_u8(r[4], r[5]);
            let a6 = vtrn1q_u8(r[6], r[7]);
            let a7 = vtrn2q_u8(r[6], r[7]);
            let a8 = vtrn1q_u8(r[8], r[9]);
            let a9 = vtrn2q_u8(r[8], r[9]);
            let a10 = vtrn1q_u8(r[10], r[11]);
            let a11 = vtrn2q_u8(r[10], r[11]);
            let a12 = vtrn1q_u8(r[12], r[13]);
            let a13 = vtrn2q_u8(r[12], r[13]);
            let a14 = vtrn1q_u8(r[14], r[15]);
            let a15 = vtrn2q_u8(r[14], r[15]);

            // Phase 2:
            // TRN u16, transpose 4×4 blocks
            let b0 = vtrn1q_u16(u8_to_u16(a0), u8_to_u16(a2));
            let b2 = vtrn2q_u16(u8_to_u16(a0), u8_to_u16(a2));
            let b1 = vtrn1q_u16(u8_to_u16(a1), u8_to_u16(a3));
            let b3 = vtrn2q_u16(u8_to_u16(a1), u8_to_u16(a3));
            let b4 = vtrn1q_u16(u8_to_u16(a4), u8_to_u16(a6));
            let b6 = vtrn2q_u16(u8_to_u16(a4), u8_to_u16(a6));
            let b5 = vtrn1q_u16(u8_to_u16(a5), u8_to_u16(a7));
            let b7 = vtrn2q_u16(u8_to_u16(a5), u8_to_u16(a7));
            let b8 = vtrn1q_u16(u8_to_u16(a8), u8_to_u16(a10));
            let b10 = vtrn2q_u16(u8_to_u16(a8), u8_to_u16(a10));
            let b9 = vtrn1q_u16(u8_to_u16(a9), u8_to_u16(a11));
            let b11 = vtrn2q_u16(u8_to_u16(a9), u8_to_u16(a11));
            let b12 = vtrn1q_u16(u8_to_u16(a12), u8_to_u16(a14));
            let b14 = vtrn2q_u16(u8_to_u16(a12), u8_to_u16(a14));
            let b13 = vtrn1q_u16(u8_to_u16(a13), u8_to_u16(a15));
            let b15 = vtrn2q_u16(u8_to_u16(a13), u8_to_u16(a15));

            // Phase 3:
            // TRN u32, transpose 8×8 blocks
            let c0 = vtrn1q_u32(u16_to_u32(b0), u16_to_u32(b4));
            let c4 = vtrn2q_u32(u16_to_u32(b0), u16_to_u32(b4));
            let c1 = vtrn1q_u32(u16_to_u32(b1), u16_to_u32(b5));
            let c5 = vtrn2q_u32(u16_to_u32(b1), u16_to_u32(b5));
            let c2 = vtrn1q_u32(u16_to_u32(b2), u16_to_u32(b6));
            let c6 = vtrn2q_u32(u16_to_u32(b2), u16_to_u32(b6));
            let c3 = vtrn1q_u32(u16_to_u32(b3), u16_to_u32(b7));
            let c7 = vtrn2q_u32(u16_to_u32(b3), u16_to_u32(b7));
            let c8 = vtrn1q_u32(u16_to_u32(b8), u16_to_u32(b12));
            let c12 = vtrn2q_u32(u16_to_u32(b8), u16_to_u32(b12));
            let c9 = vtrn1q_u32(u16_to_u32(b9), u16_to_u32(b13));
            let c13 = vtrn2q_u32(u16_to_u32(b9), u16_to_u32(b13));
            let c10 = vtrn1q_u32(u16_to_u32(b10), u16_to_u32(b14));
            let c14 = vtrn2q_u32(u16_to_u32(b10), u16_to_u32(b14));
            let c11 = vtrn1q_u32(u16_to_u32(b11), u16_to_u32(b15));
            let c15 = vtrn2q_u32(u16_to_u32(b11), u16_to_u32(b15));

            // Phase 4:
            // TRN u64, full 16×16 transpose
            [
                u64_to_u8(vtrn1q_u64(u32_to_u64(c0), u32_to_u64(c8))),
                u64_to_u8(vtrn1q_u64(u32_to_u64(c1), u32_to_u64(c9))),
                u64_to_u8(vtrn1q_u64(u32_to_u64(c2), u32_to_u64(c10))),
                u64_to_u8(vtrn1q_u64(u32_to_u64(c3), u32_to_u64(c11))),
                u64_to_u8(vtrn1q_u64(u32_to_u64(c4), u32_to_u64(c12))),
                u64_to_u8(vtrn1q_u64(u32_to_u64(c5), u32_to_u64(c13))),
                u64_to_u8(vtrn1q_u64(u32_to_u64(c6), u32_to_u64(c14))),
                u64_to_u8(vtrn1q_u64(u32_to_u64(c7), u32_to_u64(c15))),
                u64_to_u8(vtrn2q_u64(u32_to_u64(c0), u32_to_u64(c8))),
                u64_to_u8(vtrn2q_u64(u32_to_u64(c1), u32_to_u64(c9))),
                u64_to_u8(vtrn2q_u64(u32_to_u64(c2), u32_to_u64(c10))),
                u64_to_u8(vtrn2q_u64(u32_to_u64(c3), u32_to_u64(c11))),
                u64_to_u8(vtrn2q_u64(u32_to_u64(c4), u32_to_u64(c12))),
                u64_to_u8(vtrn2q_u64(u32_to_u64(c5), u32_to_u64(c13))),
                u64_to_u8(vtrn2q_u64(u32_to_u64(c6), u32_to_u64(c14))),
                u64_to_u8(vtrn2q_u64(u32_to_u64(c7), u32_to_u64(c15))),
            ]
        }
    }
1199}
1200
1201// ==================================
1202// BLOCK 128 TESTS
1203// ==================================
1204
1205#[cfg(test)]
1206mod tests {
1207    use super::*;
1208    use proptest::prelude::*;
1209    use rand::{RngExt, rng};
1210
1211    // ==================================
1212    // BASIC
1213    // ==================================
1214
1215    #[test]
1216    fn tower_constants() {
1217        // Check that tau is propagated correctly
1218        // For Block128, tau must be (0, 1) from Block64.
1219        let tau128 = Block128::EXTENSION_TAU;
1220        let (lo128, hi128) = tau128.split();
1221        assert_eq!(lo128, Block64::ZERO);
1222        assert_eq!(hi128, Block64::TAU);
1223    }
1224
1225    #[test]
1226    fn add_truth() {
1227        let zero = Block128::ZERO;
1228        let one = Block128::ONE;
1229
1230        assert_eq!(zero + zero, zero);
1231        assert_eq!(zero + one, one);
1232        assert_eq!(one + zero, one);
1233        assert_eq!(one + one, zero);
1234    }
1235
1236    #[test]
1237    fn mul_truth() {
1238        let zero = Block128::ZERO;
1239        let one = Block128::ONE;
1240
1241        assert_eq!(zero * zero, zero);
1242        assert_eq!(zero * one, zero);
1243        assert_eq!(one * one, one);
1244    }
1245
1246    #[test]
1247    fn add() {
1248        // 5 ^ 3 = 6
1249        // 101 ^ 011 = 110
1250        assert_eq!(Block128(5) + Block128(3), Block128(6));
1251    }
1252
1253    #[test]
1254    fn mul_simple() {
1255        // Check for prime numbers (without overflow)
1256        // x^1 * x^1 = x^2 (2 * 2 = 4)
1257        assert_eq!(Block128(2) * Block128(2), Block128(4));
1258    }
1259
1260    #[test]
1261    fn mul_overflow() {
1262        // Reduction verification (AES test vectors)
1263        // Example from the AES specification:
1264        // 0x57 * 0x83 = 0xC1
1265        assert_eq!(Block128(0x57) * Block128(0x83), Block128(0xC1));
1266    }
1267
1268    #[test]
1269    fn karatsuba_correctness() {
1270        // Let's check using Block128 as an example
1271        // Let A = X (hi=1, lo=0)
1272        // Let B = X (hi=1, lo=0)
1273        // A * B = X^2
1274        // According to the rule:
1275        // X^2 = X + tau
1276        // Where tau for Block64 = 0x2000_0000_0000_0000.
1277        // So the result should be:
1278        // hi=1 (X), lo=0x20 (tau)
1279
1280        // Construct X manually
1281        let x = Block128::new(Block64::ZERO, Block64::ONE);
1282        let squared = x * x;
1283
1284        // Verify result via splitting
1285        let (res_lo, res_hi) = squared.split();
1286
1287        assert_eq!(res_hi, Block64::ONE, "X^2 should contain X component");
1288        assert_eq!(
1289            res_lo,
1290            Block64(0x2000_0000_0000_0000),
1291            "X^2 should contain tau component (0x2000_0000_0000_0000)"
1292        );
1293    }
1294
1295    #[test]
1296    fn security_zeroize() {
1297        // Setup sensitive data
1298        let mut secret_val = Block128::from(0xDEAD_BEEF_CAFE_BABE_u128);
1299        assert_ne!(secret_val, Block128::ZERO);
1300
1301        // Nuke it
1302        secret_val.zeroize();
1303
1304        // Verify absolute zero
1305        assert_eq!(secret_val, Block128::ZERO, "Memory was not wiped!");
1306
1307        // Check internal bytes just to be sure
1308        assert_eq!(secret_val.0, 0u128, "Underlying memory leak detected");
1309    }
1310
1311    #[test]
1312    fn invert_zero() {
1313        // Ensure strictly that 0 cannot be inverted.
1314        assert_eq!(
1315            Block128::ZERO.invert(),
1316            Block128::ZERO,
1317            "invert(0) must return 0"
1318        );
1319    }
1320
1321    #[test]
1322    fn inversion_random() {
1323        let mut rng = rng();
1324        for _i in 0..1000 {
1325            let val = Block128(rng.random());
1326
1327            if val != Block128::ZERO {
1328                let inv = val.invert();
1329                let identity = val * inv;
1330
1331                assert_eq!(
1332                    identity,
1333                    Block128::ONE,
1334                    "Inversion identity failed: a * a^-1 != 1"
1335                );
1336            }
1337        }
1338    }
1339
1340    #[test]
1341    fn tower_embedding() {
1342        let mut rng = rng();
1343        for _ in 0..100 {
1344            let a = Block64(rng.random());
1345            let b = Block64(rng.random());
1346
1347            // 1. Structure check: Lifting Block64 -> Block128
1348            let a_lifted: Block128 = a.into();
1349            let (lo, hi) = a_lifted.split();
1350
1351            assert_eq!(lo, a, "Embedding structure failed: low part mismatch");
1352            assert_eq!(
1353                hi,
1354                Block64::ZERO,
1355                "Embedding structure failed: high part must be zero"
1356            );
1357
1358            // 2. Addition Homomorphism
1359            let sum_sub = a + b;
1360            let sum_lifted: Block128 = sum_sub.into();
1361            let sum_in_super = Block128::from(a) + Block128::from(b);
1362
1363            assert_eq!(sum_lifted, sum_in_super, "Homomorphism failed: add");
1364
1365            // 3. Multiplication Homomorphism
1366            // If I multiply two small numbers inside the big field,
1367            // the result must be the same as multiplying them in the small field
1368            // and then converting.
1369            let prod_sub = a * b;
1370            let prod_lifted: Block128 = prod_sub.into();
1371            let prod_in_super = Block128::from(a) * Block128::from(b);
1372
1373            assert_eq!(prod_lifted, prod_in_super, "Homomorphism failed: mul");
1374        }
1375    }
1376
1377    // ==================================
1378    // HARDWARE
1379    // ==================================
1380
1381    #[test]
1382    fn isomorphism_roundtrip() {
1383        let mut rng = rng();
1384        for _ in 0..1000 {
1385            let val = Block128(rng.random::<u128>());
1386            assert_eq!(val.to_hardware().to_tower(), val);
1387        }
1388    }
1389
1390    #[test]
1391    fn flat_mul_homomorphism() {
1392        let mut rng = rng();
1393        for _ in 0..1000 {
1394            let a = Block128(rng.random());
1395            let b = Block128(rng.random());
1396
1397            let expected_flat = (a * b).to_hardware();
1398            let actual_flat = a.to_hardware() * b.to_hardware();
1399
1400            assert_eq!(
1401                actual_flat, expected_flat,
1402                "Block128 flat multiplication mismatch: (a*b)^H != a^H * b^H"
1403            );
1404        }
1405    }
1406
1407    #[test]
1408    fn packed_consistency() {
1409        let mut rng = rng();
1410        for _ in 0..100 {
1411            let mut a_vals = [Block128::ZERO; 4];
1412            let mut b_vals = [Block128::ZERO; 4];
1413
1414            for i in 0..4 {
1415                a_vals[i] = Block128(rng.random::<u128>());
1416                b_vals[i] = Block128(rng.random::<u128>());
1417            }
1418
1419            let a_flat_vals = a_vals.map(|x| x.to_hardware());
1420            let b_flat_vals = b_vals.map(|x| x.to_hardware());
1421            let a_packed = Flat::<Block128>::pack(&a_flat_vals);
1422            let b_packed = Flat::<Block128>::pack(&b_flat_vals);
1423
1424            // 1. Test SIMD Add (Check 512-bit / 4-register XOR)
1425            let add_res = Block128::add_hardware_packed(a_packed, b_packed);
1426
1427            let mut add_out = [Block128::ZERO.to_hardware(); 4];
1428            Flat::<Block128>::unpack(add_res, &mut add_out);
1429
1430            for i in 0..4 {
1431                assert_eq!(
1432                    add_out[i],
1433                    (a_vals[i] + b_vals[i]).to_hardware(),
1434                    "Block128 SIMD add mismatch at index {}",
1435                    i
1436                );
1437            }
1438
1439            // 2. Test SIMD Mul (Flat basis)
1440            let mul_res = Block128::mul_hardware_packed(a_packed, b_packed);
1441
1442            let mut mul_out = [Block128::ZERO.to_hardware(); 4];
1443            Flat::<Block128>::unpack(mul_res, &mut mul_out);
1444
1445            for i in 0..4 {
1446                let expected_flat = (a_vals[i] * b_vals[i]).to_hardware();
1447                assert_eq!(
1448                    mul_out[i], expected_flat,
1449                    "Block128 SIMD mul mismatch at index {}",
1450                    i
1451                );
1452            }
1453        }
1454    }
1455
1456    // ==================================
1457    // PACKED
1458    // ==================================
1459
1460    #[test]
1461    fn pack_unpack_roundtrip() {
1462        let mut rng = rng();
1463        let mut data = [Block128::ZERO; PACKED_WIDTH_128];
1464        for v in data.iter_mut() {
1465            *v = Block128(rng.random());
1466        }
1467
1468        let packed = Block128::pack(&data);
1469        let mut unpacked = [Block128::ZERO; PACKED_WIDTH_128];
1470        Block128::unpack(packed, &mut unpacked);
1471        assert_eq!(data, unpacked);
1472    }
1473
1474    #[test]
1475    fn packed_add_consistency() {
1476        let mut rng = rng();
1477        let mut a_vals = [Block128::ZERO; PACKED_WIDTH_128];
1478        let mut b_vals = [Block128::ZERO; PACKED_WIDTH_128];
1479
1480        for i in 0..PACKED_WIDTH_128 {
1481            a_vals[i] = Block128(rng.random());
1482            b_vals[i] = Block128(rng.random());
1483        }
1484
1485        let res_packed = Block128::pack(&a_vals) + Block128::pack(&b_vals);
1486        let mut res_unpacked = [Block128::ZERO; PACKED_WIDTH_128];
1487        Block128::unpack(res_packed, &mut res_unpacked);
1488
1489        for i in 0..PACKED_WIDTH_128 {
1490            assert_eq!(res_unpacked[i], a_vals[i] + b_vals[i]);
1491        }
1492    }
1493
1494    #[test]
1495    fn packed_mul_consistency() {
1496        let mut rng = rng();
1497
1498        for _ in 0..1000 {
1499            // Check 1000 random cases.
1500            // Generate random inputs
1501            let mut a_arr = [Block128::ZERO; PACKED_WIDTH_128];
1502            let mut b_arr = [Block128::ZERO; PACKED_WIDTH_128];
1503
1504            for i in 0..PACKED_WIDTH_128 {
1505                // Generate random u128
1506                let val_a: u128 = rng.random();
1507                let val_b: u128 = rng.random();
1508                a_arr[i] = Block128(val_a);
1509                b_arr[i] = Block128(val_b);
1510            }
1511
1512            let a_packed = PackedBlock128(a_arr);
1513            let b_packed = PackedBlock128(b_arr);
1514
1515            // Perform SIMD multiplication
1516            let c_packed = a_packed * b_packed;
1517
1518            // Verify against Scalar multiplication
1519            let mut c_expected = [Block128::ZERO; PACKED_WIDTH_128];
1520            for i in 0..PACKED_WIDTH_128 {
1521                c_expected[i] = a_arr[i] * b_arr[i];
1522            }
1523
1524            assert_eq!(c_packed.0, c_expected, "SIMD multiplication mismatch!");
1525        }
1526    }
1527
1528    // ==================================
1529    // CT LIFTING BASIS
1530    // ==================================
1531
1532    #[inline(always)]
1533    fn promote_block8_tables(val: Block8) -> Block128 {
1534        // Current (table) lifting: flat/hardware byte -> tower byte -> Block128 flat.
1535        let idx_flat = val.0 as usize;
1536        let tower_byte = unsafe { *constants::FLAT_TO_TOWER_8.get_unchecked(idx_flat) };
1537        let idx_tower = tower_byte as usize;
1538
1539        Block128(unsafe { *constants::TOWER_TO_FLAT_128.get_unchecked(idx_tower) })
1540    }
1541
1542    #[inline(always)]
1543    fn promote_block16_tables(val: Block16) -> Block128 {
1544        let v_flat = val.0;
1545
1546        let mut v_tower = 0u16;
1547        for i in 0..2 {
1548            let byte = ((v_flat >> (i * 8)) & 0xFF) as usize;
1549            let idx = (i * 256) + byte;
1550            v_tower ^= unsafe { *constants::FLAT_TO_TOWER_16.get_unchecked(idx) };
1551        }
1552
1553        let mut res = 0u128;
1554        for i in 0..2 {
1555            let byte = ((v_tower >> (i * 8)) & 0xFF) as usize;
1556            let idx = (i * 256) + byte;
1557            res ^= unsafe { *constants::TOWER_TO_FLAT_128.get_unchecked(idx) };
1558        }
1559
1560        Block128(res)
1561    }
1562
1563    #[inline(always)]
1564    fn promote_block32_tables(val: Block32) -> Block128 {
1565        let v_flat = val.0;
1566
1567        let mut v_tower = 0u32;
1568        for i in 0..4 {
1569            let byte = ((v_flat >> (i * 8)) & 0xFF) as usize;
1570            let idx = (i * 256) + byte;
1571            v_tower ^= unsafe { *constants::FLAT_TO_TOWER_32.get_unchecked(idx) };
1572        }
1573
1574        let mut res = 0u128;
1575        for i in 0..4 {
1576            let byte = ((v_tower >> (i * 8)) & 0xFF) as usize;
1577            let idx = (i * 256) + byte;
1578            res ^= unsafe { *constants::TOWER_TO_FLAT_128.get_unchecked(idx) };
1579        }
1580
1581        Block128(res)
1582    }
1583
1584    #[inline(always)]
1585    fn promote_block64_tables(val: Block64) -> Block128 {
1586        let v_flat = val.0;
1587
1588        let mut v_tower = 0u64;
1589        for i in 0..8 {
1590            let byte = ((v_flat >> (i * 8)) & 0xFF) as usize;
1591            let idx = (i * 256) + byte;
1592            v_tower ^= unsafe { *constants::FLAT_TO_TOWER_64.get_unchecked(idx) };
1593        }
1594
1595        let mut res = 0u128;
1596        for i in 0..8 {
1597            let byte = ((v_tower >> (i * 8)) & 0xFF) as usize;
1598            let idx = (i * 256) + byte;
1599            res ^= unsafe { *constants::TOWER_TO_FLAT_128.get_unchecked(idx) };
1600        }
1601
1602        Block128(res)
1603    }
1604
1605    #[test]
1606    fn lift_from_partial_hardware_matches_tables_block8_exhaustive() {
1607        for x in 0u16..=u8::MAX as u16 {
1608            let v = Block8(x as u8);
1609            let got = Block128::promote_flat(Flat::from_raw(v)).into_raw();
1610            let expected = promote_block8_tables(v);
1611
1612            assert_eq!(got, expected);
1613        }
1614    }
1615
1616    #[test]
1617    fn lift_from_partial_hardware_matches_tables_block16_exhaustive() {
1618        for x in 0..=u16::MAX {
1619            let v = Block16(x);
1620            let got = Block128::promote_flat(Flat::from_raw(v)).into_raw();
1621            let expected = promote_block16_tables(v);
1622
1623            assert_eq!(got, expected);
1624        }
1625    }
1626
1627    #[test]
1628    fn lift_from_partial_hardware_matches_tables_block32_random() {
1629        let mut rng = rng();
1630        for _ in 0..10_000 {
1631            let v = Block32(rng.random::<u32>());
1632            let got = Block128::promote_flat(Flat::from_raw(v)).into_raw();
1633            let expected = promote_block32_tables(v);
1634
1635            assert_eq!(got, expected);
1636        }
1637    }
1638
1639    #[test]
1640    fn lift_from_partial_hardware_matches_tables_block64_random() {
1641        let mut rng = rng();
1642        for _ in 0..10_000 {
1643            let v = Block64(rng.random::<u64>());
1644            let got = Block128::promote_flat(Flat::from_raw(v)).into_raw();
1645            let expected = promote_block64_tables(v);
1646
1647            assert_eq!(got, expected);
1648        }
1649    }
1650
1651    // ==================================
1652    // PROMOTE BATCH + EDGE CASES
1653    // ==================================
1654
1655    #[test]
1656    fn promote_flat_batch_matches_scalar_block8() {
1657        let mut rng = rng();
1658        let input: Vec<Flat<Block8>> = (0..64)
1659            .map(|_| Block8(rng.random::<u8>()).to_hardware())
1660            .collect();
1661
1662        let mut batch_out = vec![Flat::from_raw(Block128::ZERO); 64];
1663        Block128::promote_flat_batch(&input, &mut batch_out);
1664
1665        for (i, &v) in input.iter().enumerate() {
1666            let scalar = Block128::promote_flat(v);
1667            assert_eq!(batch_out[i], scalar, "batch/scalar mismatch at index {}", i);
1668        }
1669    }
1670
1671    #[test]
1672    fn promote_flat_batch_matches_scalar_block16() {
1673        let mut rng = rng();
1674        let input: Vec<Flat<Block16>> = (0..32)
1675            .map(|_| Block16(rng.random::<u16>()).to_hardware())
1676            .collect();
1677
1678        let mut batch_out = vec![Flat::from_raw(Block128::ZERO); 32];
1679        Block128::promote_flat_batch(&input, &mut batch_out);
1680
1681        for (i, &v) in input.iter().enumerate() {
1682            assert_eq!(
1683                batch_out[i],
1684                Block128::promote_flat(v),
1685                "batch/scalar mismatch at index {}",
1686                i
1687            );
1688        }
1689    }
1690
1691    #[test]
1692    fn promote_flat_batch_matches_scalar_block32() {
1693        let mut rng = rng();
1694        let input: Vec<Flat<Block32>> = (0..16)
1695            .map(|_| Block32(rng.random::<u32>()).to_hardware())
1696            .collect();
1697
1698        let mut batch_out = vec![Flat::from_raw(Block128::ZERO); 16];
1699        Block128::promote_flat_batch(&input, &mut batch_out);
1700
1701        for (i, &v) in input.iter().enumerate() {
1702            assert_eq!(
1703                batch_out[i],
1704                Block128::promote_flat(v),
1705                "batch/scalar mismatch at index {}",
1706                i
1707            );
1708        }
1709    }
1710
1711    #[test]
1712    fn promote_flat_batch_matches_scalar_block64() {
1713        let mut rng = rng();
1714        let input: Vec<Flat<Block64>> = (0..8)
1715            .map(|_| Block64(rng.random::<u64>()).to_hardware())
1716            .collect();
1717
1718        let mut batch_out = vec![Flat::from_raw(Block128::ZERO); 8];
1719        Block128::promote_flat_batch(&input, &mut batch_out);
1720
1721        for (i, &v) in input.iter().enumerate() {
1722            assert_eq!(
1723                batch_out[i],
1724                Block128::promote_flat(v),
1725                "batch/scalar mismatch at index {}",
1726                i
1727            );
1728        }
1729    }
1730
1731    #[test]
1732    fn promote_flat_batch_partial_slice() {
1733        let input: Vec<Flat<Block8>> = (0..16).map(|i| Block8(i as u8).to_hardware()).collect();
1734
1735        // Output shorter than input
1736        let mut out_short = vec![Flat::from_raw(Block128::ZERO); 5];
1737        Block128::promote_flat_batch(&input, &mut out_short);
1738
1739        for i in 0..5 {
1740            assert_eq!(out_short[i], Block128::promote_flat(input[i]));
1741        }
1742
1743        // Input shorter than output
1744        let short_input = &input[..3];
1745        let mut out_long = vec![Flat::from_raw(Block128::ZERO); 10];
1746
1747        Block128::promote_flat_batch(short_input, &mut out_long);
1748
1749        for i in 0..3 {
1750            assert_eq!(out_long[i], Block128::promote_flat(short_input[i]));
1751        }
1752
1753        // Elements beyond input length untouched
1754        for val in &out_long[3..10] {
1755            assert_eq!(*val, Flat::from_raw(Block128::ZERO));
1756        }
1757    }
1758
1759    #[test]
1760    fn promote_edge_zero() {
1761        let zero = Flat::from_raw(Block8(0));
1762        let promoted = Block128::promote_flat(zero);
1763
1764        assert_eq!(
1765            promoted,
1766            Flat::from_raw(Block128::ZERO),
1767            "promote(0) must be 0"
1768        );
1769
1770        // Batch:
1771        // all-zero input
1772        let input = vec![zero; 16];
1773        let mut output = vec![Flat::from_raw(Block128(0xDEAD)); 16];
1774
1775        Block128::promote_flat_batch(&input, &mut output);
1776
1777        for o in &output {
1778            assert_eq!(*o, Flat::from_raw(Block128::ZERO));
1779        }
1780    }
1781
1782    #[test]
1783    fn promote_edge_one() {
1784        let one_flat8 = Block8::ONE.to_hardware();
1785        let one_flat128 = Block128::ONE.to_hardware();
1786
1787        assert_eq!(
1788            Block128::promote_flat(one_flat8),
1789            one_flat128,
1790            "promote(1) must equal 1 in target field"
1791        );
1792    }
1793
1794    #[test]
1795    fn promote_edge_max_block8() {
1796        let max = Flat::from_raw(Block8(0xFF));
1797        let promoted = Block128::promote_flat(max);
1798
1799        // Must not be zero
1800        assert_ne!(promoted, Flat::from_raw(Block128::ZERO));
1801
1802        // Roundtrip through tower
1803        // must preserve embedding.
1804        let tower_8 = max.to_tower();
1805        let tower_128 = Block128::from(tower_8);
1806
1807        assert_eq!(promoted.to_tower(), tower_128);
1808    }
1809
1810    #[test]
1811    fn promote_edge_single_bits() {
1812        for bit in 0..8 {
1813            let val = Flat::from_raw(Block8(1u8 << bit));
1814            let promoted = Block128::promote_flat(val);
1815
1816            // Must not be zero
1817            assert_ne!(
1818                promoted,
1819                Flat::from_raw(Block128::ZERO),
1820                "single-bit {} promoted to zero",
1821                bit
1822            );
1823
1824            // Tower roundtrip
1825            let tower_8 = val.to_tower();
1826            let tower_128 = Block128::from(tower_8);
1827
1828            assert_eq!(
1829                promoted.to_tower(),
1830                tower_128,
1831                "tower roundtrip failed for bit {}",
1832                bit
1833            );
1834        }
1835    }
1836
1837    #[test]
1838    fn promote_edge_alternating_packed() {
1839        let input: Vec<Flat<Block8>> = (0..16)
1840            .map(|i| {
1841                if i % 2 == 0 {
1842                    Flat::from_raw(Block8(0x00))
1843                } else {
1844                    Flat::from_raw(Block8(0xFF))
1845                }
1846            })
1847            .collect();
1848
1849        let mut output = vec![Flat::from_raw(Block128::ZERO); 16];
1850        Block128::promote_flat_batch(&input, &mut output);
1851
1852        for (i, &v) in input.iter().enumerate() {
1853            assert_eq!(
1854                output[i],
1855                Block128::promote_flat(v),
1856                "alternating mismatch at {}",
1857                i
1858            );
1859        }
1860    }
1861
1862    #[test]
1863    fn promote_edge_all_same_packed() {
1864        let val = Flat::from_raw(Block8(0x42));
1865        let expected = Block128::promote_flat(val);
1866
1867        let input = vec![val; 16];
1868        let mut output = vec![Flat::from_raw(Block128::ZERO); 16];
1869
1870        Block128::promote_flat_batch(&input, &mut output);
1871
1872        for (i, o) in output.iter().enumerate() {
1873            assert_eq!(*o, expected, "all-same mismatch at {}", i);
1874        }
1875    }
1876
1877    #[test]
1878    fn promote_tower_roundtrip_block8() {
1879        for x in 0u16..=u8::MAX as u16 {
1880            let b8 = Block8(x as u8);
1881            let promoted = Block128::promote_flat(b8.to_hardware());
1882            let tower_128 = promoted.to_tower();
1883            let embedded = Block128::from(b8);
1884
1885            assert_eq!(
1886                tower_128, embedded,
1887                "tower roundtrip failed for Block8({})",
1888                x
1889            );
1890        }
1891    }
1892
1893    #[test]
1894    fn promote_tower_roundtrip_block16() {
1895        let mut rng = rng();
1896        for _ in 0..10_000 {
1897            let v = Block16(rng.random::<u16>());
1898            let promoted = Block128::promote_flat(v.to_hardware());
1899            let tower_128 = promoted.to_tower();
1900            let embedded = Block128::from(v);
1901
1902            assert_eq!(
1903                tower_128, embedded,
1904                "tower roundtrip failed for Block16({})",
1905                v.0
1906            );
1907        }
1908    }
1909
1910    #[test]
1911    fn promote_tower_roundtrip_block32() {
1912        let mut rng = rng();
1913        for _ in 0..10_000 {
1914            let v = Block32(rng.random::<u32>());
1915            let promoted = Block128::promote_flat(v.to_hardware());
1916            let tower_128 = promoted.to_tower();
1917            let embedded = Block128::from(v);
1918
1919            assert_eq!(
1920                tower_128, embedded,
1921                "tower roundtrip failed for Block32({})",
1922                v.0
1923            );
1924        }
1925    }
1926
1927    #[test]
1928    fn promote_tower_roundtrip_block64() {
1929        let mut rng = rng();
1930        for _ in 0..10_000 {
1931            let v = Block64(rng.random::<u64>());
1932            let promoted = Block128::promote_flat(v.to_hardware());
1933            let tower_128 = promoted.to_tower();
1934            let embedded = Block128::from(v);
1935
1936            assert_eq!(
1937                tower_128, embedded,
1938                "tower roundtrip failed for Block64({})",
1939                v.0
1940            );
1941        }
1942    }
1943
1944    #[test]
1945    fn promote_algebraic_homomorphism_add_block8() {
1946        let mut rng = rng();
1947        for _ in 0..1000 {
1948            let a = Block8(rng.random::<u8>());
1949            let b = Block8(rng.random::<u8>());
1950
1951            let promote_a = Block128::promote_flat(a.to_hardware());
1952            let promote_b = Block128::promote_flat(b.to_hardware());
1953            let promote_sum = Block128::promote_flat((a + b).to_hardware());
1954
1955            assert_eq!(
1956                promote_a + promote_b,
1957                promote_sum,
1958                "add homomorphism: promote(a)+promote(b) != promote(a+b)"
1959            );
1960        }
1961    }
1962
1963    #[test]
1964    fn promote_algebraic_homomorphism_mul_block8() {
1965        let mut rng = rng();
1966        for _ in 0..1000 {
1967            let a = Block8(rng.random::<u8>());
1968            let b = Block8(rng.random::<u8>());
1969
1970            let promote_a = Block128::promote_flat(a.to_hardware());
1971            let promote_b = Block128::promote_flat(b.to_hardware());
1972            let promote_prod = Block128::promote_flat((a * b).to_hardware());
1973
1974            // Subfield elements: promote then multiply
1975            // must equal multiply then promote.
1976            assert_eq!(
1977                promote_a * promote_b,
1978                promote_prod,
1979                "mul homomorphism: promote(a)*promote(b) != promote(a*b)"
1980            );
1981        }
1982    }
1983
1984    #[test]
1985    fn promote_algebraic_homomorphism_add_block16() {
1986        let mut rng = rng();
1987        for _ in 0..1000 {
1988            let a = Block16(rng.random::<u16>());
1989            let b = Block16(rng.random::<u16>());
1990
1991            let pa = Block128::promote_flat(a.to_hardware());
1992            let pb = Block128::promote_flat(b.to_hardware());
1993            let p_sum = Block128::promote_flat((a + b).to_hardware());
1994
1995            assert_eq!(pa + pb, p_sum, "Block16 add homomorphism failed");
1996        }
1997    }
1998
1999    #[test]
2000    fn promote_algebraic_homomorphism_mul_block16() {
2001        let mut rng = rng();
2002        for _ in 0..1000 {
2003            let a = Block16(rng.random::<u16>());
2004            let b = Block16(rng.random::<u16>());
2005
2006            let pa = Block128::promote_flat(a.to_hardware());
2007            let pb = Block128::promote_flat(b.to_hardware());
2008            let p_prod = Block128::promote_flat((a * b).to_hardware());
2009
2010            assert_eq!(pa * pb, p_prod, "Block16 mul homomorphism failed");
2011        }
2012    }
2013
2014    #[test]
2015    fn promote_algebraic_homomorphism_add_block32() {
2016        let mut rng = rng();
2017        for _ in 0..1000 {
2018            let a = Block32(rng.random::<u32>());
2019            let b = Block32(rng.random::<u32>());
2020
2021            let pa = Block128::promote_flat(a.to_hardware());
2022            let pb = Block128::promote_flat(b.to_hardware());
2023            let p_sum = Block128::promote_flat((a + b).to_hardware());
2024
2025            assert_eq!(pa + pb, p_sum, "Block32 add homomorphism failed");
2026        }
2027    }
2028
2029    #[test]
2030    fn promote_algebraic_homomorphism_mul_block32() {
2031        let mut rng = rng();
2032        for _ in 0..1000 {
2033            let a = Block32(rng.random::<u32>());
2034            let b = Block32(rng.random::<u32>());
2035
2036            let pa = Block128::promote_flat(a.to_hardware());
2037            let pb = Block128::promote_flat(b.to_hardware());
2038            let p_prod = Block128::promote_flat((a * b).to_hardware());
2039
2040            assert_eq!(pa * pb, p_prod, "Block32 mul homomorphism failed");
2041        }
2042    }
2043
2044    #[test]
2045    fn promote_algebraic_homomorphism_add_block64() {
2046        let mut rng = rng();
2047        for _ in 0..1000 {
2048            let a = Block64(rng.random::<u64>());
2049            let b = Block64(rng.random::<u64>());
2050
2051            let pa = Block128::promote_flat(a.to_hardware());
2052            let pb = Block128::promote_flat(b.to_hardware());
2053            let p_sum = Block128::promote_flat((a + b).to_hardware());
2054
2055            assert_eq!(pa + pb, p_sum, "Block64 add homomorphism failed");
2056        }
2057    }
2058
2059    #[test]
2060    fn promote_algebraic_homomorphism_mul_block64() {
2061        let mut rng = rng();
2062        for _ in 0..1000 {
2063            let a = Block64(rng.random::<u64>());
2064            let b = Block64(rng.random::<u64>());
2065
2066            let pa = Block128::promote_flat(a.to_hardware());
2067            let pb = Block128::promote_flat(b.to_hardware());
2068            let p_prod = Block128::promote_flat((a * b).to_hardware());
2069
2070            assert_eq!(pa * pb, p_prod, "Block64 mul homomorphism failed");
2071        }
2072    }
2073
2074    #[test]
2075    fn promote_generator_preserves_order() {
2076        // Block8 generator is 3 with order 255
2077        let g = Block8(3);
2078        let g_promoted = Block128::promote_flat(g.to_hardware());
2079
2080        // Fermat:
2081        // g^255 = 1 in GF(2^8)
2082        let mut acc8 = Block8::ONE;
2083        for _ in 0..255 {
2084            acc8 *= g;
2085        }
2086
2087        assert_eq!(acc8, Block8::ONE, "Block8 Fermat: g^255 must be 1");
2088
2089        // Promoted element must
2090        // also satisfy g^255 = 1.
2091        let mut acc128 = Flat::from_raw(Block128::ONE);
2092        for _ in 0..255 {
2093            acc128 *= g_promoted;
2094        }
2095
2096        assert_eq!(
2097            acc128,
2098            Flat::from_raw(Block128::ONE),
2099            "promoted element lost multiplicative order"
2100        );
2101    }
2102
2103    proptest! {
2104        #[test]
2105        fn parity_masks_match_from_hardware(x_flat in any::<u128>()) {
2106            let tower = Block128::from_hardware(Flat::from_raw(Block128(x_flat))).0;
2107
2108            for k in 0..128 {
2109                let bit = ((tower >> k) & 1) as u8;
2110                let via_api = Flat::from_raw(Block128(x_flat)).tower_bit(k);
2111
2112                prop_assert_eq!(
2113                    via_api, bit,
2114                    "Block128 tower_bit_from_hardware mismatch at x_flat={:#034x}, bit_idx={}",
2115                    x_flat, k
2116                );
2117            }
2118        }
2119    }
2120}