//! GF(2^128) arithmetic (carry-less multiplication, reduction, and
//! exponentiation) for [`Block`], with a pclmulqdq fast path on x86/x86_64
//! and a portable constant-time scalar fallback.
1use super::Block;
2
/// The irreducible polynomial for gf128 operations.
///
/// These are the low bits of f(x) = x^128 + x^7 + x^2 + x + 1; the x^128
/// term is implicit in the reduction routines (x^128 ≡ x^7 + x^2 + x + 1).
const MOD: u64 = 0b10000111; // 0x87

// Generates a `target_feature_pclmulqdq` module providing cached runtime
// detection (`::get()`) of the pclmulqdq (carry-less multiply) CPU feature.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
cpufeatures::new!(target_feature_pclmulqdq, "pclmulqdq");
8
impl Block {
    /// Carryless multiplication of two Blocks as polynomials over GF(2).
    ///
    /// Returns (low, high) bits.
    ///
    /// Dispatches to the pclmulqdq implementation when the CPU supports it
    /// (cached runtime detection), falling back to the scalar one otherwise.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline]
    pub fn clmul(&self, rhs: &Self) -> (Self, Self) {
        if target_feature_pclmulqdq::get() {
            // SAFETY: pclmulqdq is available
            unsafe {
                let (low, high) = clmul::clmul128(self.into(), rhs.into());
                (low.into(), high.into())
            }
        } else {
            let (low, high) = scalar::clmul128(self.into(), rhs.into());
            (low.into(), high.into())
        }
    }

    /// Carryless multiplication of two Blocks as polynomials over GF(2).
    ///
    /// Returns (low, high) bits.
    ///
    /// Non-x86 fallback: always uses the scalar implementation.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    #[inline]
    pub fn clmul(&self, rhs: &Self) -> (Self, Self) {
        let (low, high) = scalar::clmul128(self.into(), rhs.into());
        (low.into(), high.into())
    }

    /// Multiplication over GF(2^128).
    ///
    /// Uses the irreducible polynomial `x^128 + x^7 + x^2 + x + 1`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline]
    pub fn gf_mul(&self, rhs: &Self) -> Self {
        if target_feature_pclmulqdq::get() {
            // SAFETY: pclmulqdq is available
            unsafe { clmul::gf128_mul(self.into(), rhs.into()).into() }
        } else {
            scalar::gf128_mul(self.into(), rhs.into()).into()
        }
    }

    /// Multiplication over GF(2^128).
    ///
    /// Uses the irreducible polynomial `x^128 + x^7 + x^2 + x + 1`.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    #[inline]
    pub fn gf_mul(&self, rhs: &Self) -> Self {
        scalar::gf128_mul(self.into(), rhs.into()).into()
    }

    /// Reduce polynomial over GF(2) by `x^128 + x^7 + x^2 + x + 1`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline]
    pub fn gf_reduce(low: &Self, high: &Self) -> Self {
        if target_feature_pclmulqdq::get() {
            // SAFETY: pclmulqdq is available
            unsafe { clmul::gf128_reduce(low.into(), high.into()).into() }
        } else {
            scalar::gf128_reduce(low.into(), high.into()).into()
        }
    }

    /// Reduce polynomial over GF(2) by `x^128 + x^7 + x^2 + x + 1`.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    #[inline]
    pub fn gf_reduce(low: &Self, high: &Self) -> Self {
        scalar::gf128_reduce(low.into(), high.into()).into()
    }

    /// Exponentiation over GF(2^128) by square-and-multiply.
    ///
    /// Scans `exp` from the least significant bit, squaring the running
    /// power each step and multiplying it in when the bit is set.
    /// `gf_pow(0)` returns `Block::ONE`.
    #[inline]
    pub fn gf_pow(&self, mut exp: u64) -> Block {
        // s accumulates the product; pow2 holds self^(2^i) at step i.
        let mut s = Block::ONE;
        let mut pow2 = *self;

        while exp != 0 {
            if exp & 1 != 0 {
                s = s.gf_mul(&pow2);
            }
            pow2 = pow2.gf_mul(&pow2);
            exp >>= 1;
        }
        s
    }
}
95
96#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
97mod clmul {
98    #[cfg(target_arch = "x86")]
99    use std::arch::x86::*;
100    #[cfg(target_arch = "x86_64")]
101    use std::arch::x86_64::*;
102
103    use super::MOD;
104
    /// Multiplication over GF(2^128) using pclmulqdq.
    ///
    /// Carry-less multiplies `a` and `b`, then reduces the 256-bit product
    /// by the irreducible polynomial x^128 + x^7 + x^2 + x + 1.
    #[target_feature(enable = "pclmulqdq")]
    #[inline]
    pub fn gf128_mul(a: __m128i, b: __m128i) -> __m128i {
        let (low, high) = clmul128(a, b);
        gf128_reduce(low, high)
    }
111
    /// Carry-less multiply of two 128-bit numbers.
    ///
    /// Return (low, high) bits
    #[target_feature(enable = "pclmulqdq")]
    #[inline]
    pub fn clmul128(a: __m128i, b: __m128i) -> (__m128i, __m128i) {
        // NOTE: I tried using karatsuba but it was slightly slower than the naive
        // multiplication
        // Schoolbook multiplication of the four 64-bit halves. The imm8 of
        // _mm_clmulepi64_si128 selects halves: bit 0 picks a's half, bit 4
        // picks b's half (0 = low, 1 = high).
        let ab_low = _mm_clmulepi64_si128::<0x00>(a, b); // a.lo * b.lo
        let ab_high = _mm_clmulepi64_si128::<0x11>(a, b); // a.hi * b.hi
        let ab_lohi1 = _mm_clmulepi64_si128::<0x01>(a, b); // a.hi * b.lo
        let ab_lohi2 = _mm_clmulepi64_si128::<0x10>(a, b); // a.lo * b.hi
        let ab_mid = _mm_xor_si128(ab_lohi1, ab_lohi2);
        // The middle term straddles bit 128: its low 64 bits land in the top
        // of `low`, its high 64 bits in the bottom of `high` (the si128
        // shifts are byte shifts, 8 bytes = 64 bits).
        let low = _mm_xor_si128(ab_low, _mm_slli_si128::<8>(ab_mid));
        let high = _mm_xor_si128(ab_high, _mm_srli_si128::<8>(ab_mid));
        (low, high)
    }
129
    /// Reduce the 256-bit polynomial `(low, high)` over GF(2) by
    /// x^128 + x^7 + x^2 + x + 1, via carry-less multiplication with `MOD`.
    #[target_feature(enable = "pclmulqdq")]
    #[inline]
    pub fn gf128_reduce(mut low: __m128i, mut high: __m128i) -> __m128i {
        // NOTE: I tried a sse shift based reduction but it was slower than the clmul
        // implementation
        let modulus = [MOD, 0];
        // SAFETY: Ptr to modulus is valid and pclmulqdq implies sse2 is enabled
        let modulus = unsafe { _mm_loadu_si64(modulus.as_ptr().cast()) };

        // First fold: tmp = high[127:64] * MOD. Shifted left by 64 bits, its
        // low half belongs in `low`; its high half overflows past bit 128
        // and is folded again below.
        let tmp = _mm_clmulepi64_si128::<0x01>(high, modulus);
        let tmp_shifted = _mm_slli_si128::<8>(tmp);
        low = _mm_xor_si128(low, tmp_shifted);
        // NOTE(review): only the low half of `high` is read after this point
        // (selector 0x00 below), so this xor into `high` looks redundant —
        // confirm before removing.
        high = _mm_xor_si128(high, tmp_shifted);

        // reduce overflow
        // Second fold: the bits of tmp above position 64 reduce once more as
        // tmp[127:64] * MOD, which fits entirely below bit 128.
        let tmp = _mm_clmulepi64_si128::<0x01>(tmp, modulus);
        low = _mm_xor_si128(low, tmp);

        // Fold the remaining high[63:0]: contributes high[63:0] * MOD.
        let tmp = _mm_clmulepi64_si128::<0x00>(high, modulus);
        _mm_xor_si128(low, tmp)
    }
151
152    #[cfg(all(test, target_feature = "pclmulqdq"))]
153    mod test {
154        use std::{arch::x86_64::__m128i, mem::transmute};
155
156        use crate::block::gf128::clmul::{clmul128, gf128_mul, gf128_reduce};
157
158        #[test]
159        fn test_gf128_mul_zero() {
160            unsafe {
161                let a = transmute(0x19831239123916248127031273012381_u128);
162                let b = transmute(0_u128);
163                let exp = 0_u128;
164                let mul = transmute(gf128_mul(a, b));
165                assert_eq!(exp, mul);
166            }
167        }
168
169        #[test]
170        fn test_gf128_mul_onw() {
171            unsafe {
172                let a = transmute(0x19831239123916248127031273012381_u128);
173                let b = transmute(0x1_u128);
174                let exp = 0x19831239123916248127031273012381_u128;
175                let mul = transmute(gf128_mul(a, b));
176                assert_eq!(exp, mul);
177            }
178        }
179
180        #[test]
181        fn test_gf128_mul() {
182            unsafe {
183                let a = transmute(0x19831239123916248127031273012381_u128);
184                let b = transmute(0xabcdef0123456789abcdef0123456789_u128);
185                let exp = 0x63a033d0ed643e85153c50f4268a7d9_u128;
186                let mul = transmute(gf128_mul(a, b));
187                assert_eq!(exp, mul);
188            }
189        }
190
191        #[test]
192        fn test_clmul128() {
193            unsafe {
194                let a: __m128i = transmute(0x19831239123916248127031273012381_u128);
195                let b: __m128i = transmute(0xabcdef0123456789abcdef0123456789_u128);
196                let (low, high) = clmul128(a, b);
197                let [low, high] = transmute([low, high]);
198                let exp_low: u128 = 0xa5de9b50e6db7b5147e92b99ee261809;
199                let exp_high: u128 = 0xf1d6d37d58114afed2addfedd7c77f7;
200                assert_eq!(exp_low, low);
201                assert_eq!(exp_high, high);
202            }
203        }
204
205        #[test]
206        fn test_gf128_reduce() {
207            unsafe {
208                // test vectors computed using sage
209                let low: __m128i = transmute(0x0123456789abcdef0123456789abcdef_u128);
210                let high: __m128i = transmute(0xabcdef0123456789abcdef0123456789_u128);
211                let exp = 0xb4b548f1c3c23f86b4b548f1c3c21572_u128;
212                let res: u128 = transmute(gf128_reduce(low, high));
213
214                println!("res: {res:b}");
215                println!("exp: {exp:b}");
216                assert_eq!(exp, res);
217            }
218        }
219    }
220
    // Nightly-only microbenchmarks for the pclmulqdq path (libtest bench
    // harness; gated on the custom `is_nightly` cfg).
    #[cfg(all(is_nightly, test, target_feature = "pclmulqdq"))]
    mod benches {
        extern crate test;

        use std::{hint::black_box, mem::transmute};

        // NOTE(review): confirm `RngExt` exists in the rand version in use;
        // rand 0.9 exposes the `random` method via the `Rng` trait.
        use rand::{RngExt, rng};
        use test::Bencher;

        // black_box on inputs and output keeps the optimizer from
        // const-folding the multiplication away.
        #[bench]
        fn bench_gf128_mul(b: &mut Bencher) {
            let [low, high] = unsafe { transmute(rng().random::<[u128; 2]>()) };
            b.iter(|| black_box(unsafe { super::gf128_mul(black_box(low), black_box(high)) }));
        }

        #[bench]
        fn bench_gf128_reduce(b: &mut Bencher) {
            let [low, high] = unsafe { transmute(rng().random::<[u128; 2]>()) };
            b.iter(|| black_box(unsafe { super::gf128_reduce(black_box(low), black_box(high)) }));
        }
    }
242}
243
244// used in tests, but if we're not compiling tests these will otherwise be
245// flagged as unused
246#[allow(dead_code)]
247mod scalar {
    /// Multiplication over GF(2^128): carry-less multiply then reduction by
    /// x^128 + x^7 + x^2 + x + 1. Portable scalar implementation.
    #[inline]
    pub fn gf128_mul(a: u128, b: u128) -> u128 {
        let (low, high) = clmul128(a, b);
        gf128_reduce(low, high)
    }
253
254    /// Carry-less multiply of two 128-bit numbers.
255    ///
256    /// Return (low, high) bits
257    #[inline]
258    pub fn clmul128(a: u128, b: u128) -> (u128, u128) {
259        let (a_low, a_high) = (a as u64, (a >> 64) as u64);
260        let (b_low, b_high) = (b as u64, (b >> 64) as u64);
261
262        // Use karatsuba multiplication
263        let ab_low = clmul64(a_low, b_low);
264        let ab_high = clmul64(a_high, b_high);
265        let ab_mid = clmul64(a_low ^ a_high, b_low ^ b_high) ^ ab_low ^ ab_high;
266        let low = ab_low ^ (ab_mid << 64);
267        let high = ab_high ^ (ab_mid >> 64);
268        (low, high)
269    }
270
    // Adapted from https://github.com/RustCrypto/universal-hashes/blob/802b40974a08bbd2663c63780fc87a23ee931868/polyval/src/backend/soft64.rs#L201C1-L227C2
    // Uses the technique described in https://www.bearssl.org/constanttime.html#ghash-for-gcm
    // but directly outputs the 128 bits without needing the Rev trick.
    // This method is constant time and significantly faster than iterating over the
    // bits of y and xoring shifted x.
    /// Multiplication in GF(2)[X] with “holes”
    /// (sequences of zeroes) to avoid carry spilling.
    ///
    /// When carries do occur, they wind up in a "hole" and are subsequently
    /// masked out of the result.
    #[inline]
    fn clmul64(x: u64, y: u64) -> u128 {
        // generate mask with 4 wide 0-bit holes to account for larger overflow of 64
        // bit input multiplication
        // mask(k) has bits k, k+5, k+10, ... set: every 5th bit starting at k.
        pub const fn mask(offset: u32) -> u128 {
            let mut mask: u128 = 0;
            let mut i = offset;
            while i < 128 {
                mask |= 1 << i;
                i += 5;
            }

            mask
        }
        // Partition the bits of each operand into 5 interleaved residue
        // classes; products of classes then carry into the masked-out holes.
        let x0 = x as u128 & const { mask(0) };
        let x1 = x as u128 & const { mask(1) };
        let x2 = x as u128 & const { mask(2) };
        let x3 = x as u128 & const { mask(3) };
        let x4 = x as u128 & const { mask(4) };
        let y0 = y as u128 & const { mask(0) };
        let y1 = y as u128 & const { mask(1) };
        let y2 = y as u128 & const { mask(2) };
        let y3 = y as u128 & const { mask(3) };
        let y4 = y as u128 & const { mask(4) };

        // z_k collects the partial products whose class indices sum to k mod 5.
        let mut z0 = (x0 * y0) ^ (x1 * y4) ^ (x2 * y3) ^ (x3 * y2) ^ (x4 * y1);
        let mut z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y4) ^ (x3 * y3) ^ (x4 * y2);
        let mut z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y4) ^ (x4 * y3);
        let mut z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0) ^ (x4 * y4);
        let mut z4 = (x0 * y4) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1) ^ (x4 * y0);

        // Mask out the holes where integer-multiplication carries landed.
        z0 &= const { mask(0) };
        z1 &= const { mask(1) };
        z2 &= const { mask(2) };
        z3 &= const { mask(3) };
        z4 &= const { mask(4) };

        // The five masked classes are disjoint, so OR recombines them.
        z0 | z1 | z2 | z3 | z4
    }
320
321    /// Generated by ChatGPT o3-mini and reviewed by me.
322    /// Reduces a 256-bit value (given as two u128 words, `high` and `low`)
323    /// modulo the irreducible polynomial f(x) = x^128 + x^7 + x^2 + x + 1.
324    ///
325    /// That is, it computes:
326    ///      low ^ reduce(high * (x^7 + x^2 + x + 1))
327    /// since x^128 ≡ x^7 + x^2 + x + 1 (mod f(x)).
328    #[inline]
329    pub fn gf128_reduce(low: u128, high: u128) -> u128 {
330        // Helper: performs a left shift on a 128-bit word and returns
331        // a tuple (overflow, lower) where:
332        //    x << shift = (overflow << 128) | lower.
333        #[inline]
334        fn shift_u128(x: u128, shift: u32) -> (u128, u128) {
335            // For 0 < shift < 128.
336            let overflow = x >> (128 - shift);
337            let lower = x << shift;
338            (overflow, lower)
339        }
340
341        // For the reduction, note that:
342        //   x^128 ≡ x^7 + x^2 + x + 1 (mod f(x)).
343        // So the contribution of the high word is:
344        //   (high << 7) ^ (high << 2) ^ (high << 1) ^ high,
345        // but each shift must be computed as a 256–bit quantity.
346        let (ov7, lo7) = shift_u128(high, 7);
347        let (ov2, lo2) = shift_u128(high, 2);
348        let (ov1, lo1) = shift_u128(high, 1);
349        let lo0 = high; // equivalent to shift 0
350
351        // Combine the 128-bit parts of each term.
352        let combined_low = lo7 ^ lo2 ^ lo1 ^ lo0;
353        // Combine the overflow (upper) parts.
354        let combined_overflow = ov7 ^ ov2 ^ ov1;
355
356        // The bits in `combined_overflow` represent extra contributions from bits
357        // at positions ≥ 128. Since they are at most 7 bits wide, we can reduce them
358        // by multiplying with the reduction polynomial (i.e. shifting and XORing):
359        let reduced_overflow = (combined_overflow << 7)
360            ^ (combined_overflow << 2)
361            ^ (combined_overflow << 1)
362            ^ combined_overflow;
363
364        // The full contribution from `high` is then given by the low part
365        // combined with the reduced overflow.
366        let poly_contrib = combined_low ^ reduced_overflow;
367
368        // Finally, reduce the entire 256-bit value by XORing in the contribution.
369        low ^ poly_contrib
370    }
371
    // Unit tests for the scalar implementation; the multi-word vectors were
    // computed with sage (see comments below).
    #[cfg(test)]
    mod tests {
        use super::{clmul128, gf128_mul, gf128_reduce};

        #[test]
        fn test_gf128_mul_zero() {
            let a = 0x19831239123916248127031273012381;
            let b = 0;
            let exp = 0;
            let mul = gf128_mul(a, b);
            assert_eq!(exp, mul);
        }

        #[test]
        fn test_gf128_mul_one() {
            let a = 0x19831239123916248127031273012381;
            let b = 1;
            let exp = 0x19831239123916248127031273012381;
            let mul = gf128_mul(a, b);
            assert_eq!(exp, mul);
        }

        #[test]
        fn test_gf128_mul() {
            let a = 0x19831239123916248127031273012381;
            let b = 0xabcdef0123456789abcdef0123456789;
            let exp = 0x63a033d0ed643e85153c50f4268a7d9;
            let mul = gf128_mul(a, b);
            assert_eq!(exp, mul);
        }

        #[test]
        fn test_gf128_reduce_zero() {
            assert_eq!(gf128_reduce(0, 0), 0);
        }

        #[test]
        fn test_gf128_reduce_low_only() {
            assert_eq!(gf128_reduce(1, 0), 1);
            assert_eq!(gf128_reduce(0x87, 0), 0x87); // Reduction polynomial itself.
            assert_eq!(gf128_reduce(0xFFFFFFFFFFFFFFFF, 0), 0xFFFFFFFFFFFFFFFF);
        }

        #[test]
        fn test_gf128_reduce_high_only() {
            // high << 64
            // high = 1 is the x^128 term, which reduces to 0x87.
            assert_eq!(gf128_reduce(0, 1), 0x87);
            assert_eq!(gf128_reduce(0, 2), 0x87 << 1);
            assert_eq!(gf128_reduce(0, 3), (0x87 << 1) ^ 0x87);

            assert_eq!(gf128_reduce(0, 1 << 63), 0x87 << 63);
        }

        #[test]
        fn test_gf128_reduce_overflow() {
            let high = u128::MAX; // All bits set in high
            let low = u128::MAX; // All bits set in low.
            assert_eq!(gf128_reduce(low, high), 0xffffffffffffffffffffffffffffc071);
        }

        #[test]
        fn tests_gf128_reduce() {
            // test vectors computed using sage
            let low = 0x0123456789abcdef0123456789abcdef;
            let high = 0xabcdef0123456789abcdef0123456789;
            let exp = 0xb4b548f1c3c23f86b4b548f1c3c21572;
            let res = gf128_reduce(low, high);

            println!("res: {res:b}");
            println!("exp: {exp:b}");
            assert_eq!(exp, res);
        }

        #[test]
        fn test_clmul128() {
            let a = 0x19831239123916248127031273012381;
            let b = 0xabcdef0123456789abcdef0123456789;
            let (low, high) = clmul128(a, b);
            let exp_low = 0xa5de9b50e6db7b5147e92b99ee261809;
            let exp_high = 0xf1d6d37d58114afed2addfedd7c77f7;
            assert_eq!(exp_low, low);
            assert_eq!(exp_high, high);
        }
    }
456
457    #[cfg(all(is_nightly, test))]
458    mod benches {
459        extern crate test;
460
461        use criterion::black_box;
462        use rand::{RngExt, rng};
463        use test::Bencher;
464
465        #[bench]
466        fn bench_gf128_mul(b: &mut Bencher) {
467            let [low, high] = rng().random::<[u128; 2]>();
468            b.iter(|| black_box(super::gf128_mul(black_box(low), black_box(high))));
469        }
470
471        #[bench]
472        fn bench_gf128_reduce(b: &mut Bencher) {
473            let [low, high] = rng().random::<[u128; 2]>();
474            b.iter(|| black_box(super::gf128_reduce(black_box(low), black_box(high))));
475        }
476    }
477}
478
/// Test that scalar implementation and clmul implementation produce the same
/// results
#[cfg(all(test, not(miri), target_feature = "pclmulqdq"))]
mod scalar_simd_tests {
    use std::mem::transmute;

    use proptest::prelude::*;

    use super::{clmul, scalar};

    // https://github.com/proptest-rs/proptest/issues/82
    // really sad that this is not standard behavior
    /// Strategy for `u128` that over-samples edge values (0, 1, MAX, MAX-1)
    /// which proptest does not generate by default.
    fn u128_with_edges() -> impl Strategy<Value = u128> {
        prop_oneof![
            // weight -> strategy
            1 => Just(0u128),
            1 => Just(1u128),
            1 => Just(u128::MAX),
            1 => Just(u128::MAX - 1),
            6 => any::<u128>(),
        ]
    }

    proptest! {
        #[test]
        fn test_clmul128(a in u128_with_edges(), b in u128_with_edges()) {
             unsafe {
                let clmul_res = clmul::clmul128(transmute(a), transmute(b));
                let scalar_res = scalar::clmul128(a, b);
                assert_eq!(scalar_res.0, transmute(clmul_res.0));
                // Previously only the low halves were compared; the high
                // halves of the products must agree as well.
                assert_eq!(scalar_res.1, transmute(clmul_res.1));
            }
        }
    }

    proptest! {
        #[test]
        fn test_gf128_reduce(a in u128_with_edges(), b in u128_with_edges()) {
             unsafe {
                let clmul_res = clmul::gf128_reduce(transmute(a), transmute(b));
                let scalar_res = scalar::gf128_reduce(a, b);
                assert_eq!(scalar_res, transmute(clmul_res));
            }
        }
    }

    proptest! {
        #[test]
        fn test_gf128_mul(a in u128_with_edges(), b in u128_with_edges()) {
            unsafe {
                let clmul_res = clmul::gf128_mul(transmute(a), transmute(b));
                let scalar_res = scalar::gf128_mul(a, b);
                assert_eq!(scalar_res, transmute(clmul_res));
            }
        }
    }
}
535
#[cfg(test)]
mod tests {
    use crate::Block;

    /// gf_pow must satisfy b^0 = ONE, b^1 = b, b^2 = b*b, b^3 = b*b*b.
    #[test]
    fn test_gf_pow() {
        let b: Block = 24646523424323_u128.into();
        assert_eq!(Block::ONE, b.gf_pow(0));
        assert_eq!(b, b.gf_pow(1));
        assert_eq!(b.gf_mul(&b), b.gf_pow(2));
        assert_eq!(b.gf_mul(&b.gf_mul(&b)), b.gf_pow(3));
    }
}