rscrypto 0.1.1

Pure Rust cryptography, hardware-accelerated: BLAKE3, SHA-2/3, AES-GCM, ChaCha20-Poly1305, Ed25519, X25519, HMAC, HKDF, Argon2, CRC. no_std, WASM, ten CPU architectures.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
//! PCLMULQDQ/PMULL folding constants for CRC-64.
//!
//! This module generates the constant sets needed by the classic Intel/TiKV
//! carryless-multiply folding algorithm:
//! - Process 128 bytes at a time (8×16B lanes)
//! - Fold lanes down to 16B, then 8B
//! - Finish with Barrett reduction
//!
//! The constants are defined in terms of the **reciprocal polynomial** (TiKV
//! nomenclature):
//!
//! - `POLY = (reflected_poly << 1) | 1`
//! - `NORMAL = bit_reverse(reflected_poly)` (the non-reflected polynomial)
//! - `K_n = bit_reverse(x^n mod (x^W ⊕ NORMAL))` where W is the CRC width
//! - `MU = POLY⁻¹ mod x^W` (the inverse in GF(2)[x]/(x^W); it exists because `POLY` has constant term 1)
//!
//! References:
//! - Intel: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ"
//! - TiKV: `crc64fast` + `crc64fast-nvme`

use crate::checksum::common::tables::{CRC64_NVME_POLY, CRC64_XZ_POLY};

// ─────────────────────────────────────────────────────────────────────────────
// GF(2) Polynomial Arithmetic
// ─────────────────────────────────────────────────────────────────────────────

/// Carryless multiplication of two 64-bit values, returning 128-bit result.
///
/// Software model of the PCLMULQDQ/PMULL instruction: the operands are
/// multiplied as GF(2) polynomials (XOR accumulation, no carries).
/// Result is (high64, low64) where high64 contains bits 127..64.
#[must_use]
const fn clmul64(a: u64, b: u64) -> (u64, u64) {
  let mut high: u64 = 0;
  let mut low: u64 = 0;

  // For every set bit of `a`, XOR a copy of `b` shifted left by that bit
  // position into the 128-bit accumulator.
  let mut bit: u32 = 0;
  while bit < 64 {
    if (a >> bit) & 1 == 1 {
      low ^= b << bit;
      // The top `bit` bits of `b` spill into the high word; the spill shift
      // `64 - bit` is only a legal u64 shift when `bit > 0`.
      if bit > 0 {
        high ^= b >> (64 - bit);
      }
    }
    bit += 1;
  }

  (high, low)
}

/// Reduce a 128-bit value modulo a 65-bit polynomial.
///
/// The polynomial is represented as (poly_hi, poly_lo) where poly_hi is
/// the high bit (always 1 for degree-64 polynomials) and poly_lo is the
/// lower 64 bits.
///
/// For reflected CRCs, the polynomial's implicit x^64 term is at bit 64.
#[must_use]
pub(crate) const fn reduce128(hi: u64, lo: u64, poly: u64) -> u64 {
  // G(x) = x^64 + poly. Whenever bit 64+i of the value is set, XOR in
  // G(x) << i: that cancels bit 64+i and folds `poly << i` into the lower
  // bits. Sweeping i from 63 down to 0 clears the entire high word.
  let mut high = hi;
  let mut low = lo;

  let mut remaining: u32 = 64;
  while remaining > 0 {
    let i = remaining - 1;
    if (high >> i) & 1 == 1 {
      low ^= poly << i;
      if i == 0 {
        // `poly` lands entirely in `low`; only bit 64 itself needs clearing.
        high ^= 1;
      } else {
        // The top `i` bits of `poly`, plus the implicit x^64 term, land in `high`.
        high ^= (poly >> (64 - i)) | (1 << i);
      }
    }
    remaining -= 1;
  }

  low
}

/// Compute x^n mod P for a 64-bit polynomial.
///
/// Uses the square-and-multiply algorithm in GF(2): `base` walks through
/// x^(2^j) mod P while the bits of `n` select which squares get multiplied
/// into `result`. `n == 0` yields the multiplicative identity, 1.
///
/// # Arguments
///
/// * `n` - The exponent (any value, including 0)
/// * `poly` - The polynomial (lower 64 bits of the 65-bit polynomial)
#[must_use]
const fn xpow_mod(n: u32, poly: u64) -> u64 {
  // No special cases needed: with exp == 0 the loop body never runs and the
  // identity is returned; exp == 1 performs exactly one multiply by x.
  let mut result: u64 = 1; // x^0, the multiplicative identity
  let mut base: u64 = 2; // x^1

  let mut exp = n;
  while exp > 0 {
    if exp & 1 != 0 {
      // This exponent bit is set: fold the current square into the result.
      let (hi, lo) = clmul64(result, base);
      result = reduce128(hi, lo, poly);
    }
    // Square the base for the next exponent bit.
    let (hi, lo) = clmul64(base, base);
    base = reduce128(hi, lo, poly);
    exp = exp.strict_shr(1);
  }

  result
}

// ─────────────────────────────────────────────────────────────────────────────
// TiKV/Intel CRC64 folding constants
// ─────────────────────────────────────────────────────────────────────────────

/// Folding constants needed by the TiKV/Intel CRC64 CLMUL algorithm.
///
/// Every field is derived from the reflected generator polynomial at compile
/// time by [`Crc64ClmulConstants::new`].
#[derive(Clone, Copy, Debug)]
pub(crate) struct Crc64ClmulConstants {
  /// Reciprocal polynomial low 64 bits (`POLY` in TiKV code); the x^64 term is implicit.
  pub poly: u64,
  /// Barrett reduction constant (`MU` in TiKV code): `POLY⁻¹ mod x^64`.
  pub mu: u64,
  /// 128-byte folding coefficient (high, low) = (K_1023, K_1087).
  pub fold_128b: (u64, u64),
  /// Tail fold coefficients (distance 112..16 bytes): (K_{d-1}, K_{d+63}).
  pub tail_fold_16b: [(u64, u64); 7],
  /// 16B→8B fold coefficient (`K_127`).
  pub fold_8b: u64,
}

/// Compute TiKV-style reciprocal polynomial from a reflected CRC polynomial.
///
/// TiKV defines:
/// `POLY = (reflected_poly << 1) | 1`.
#[must_use]
const fn reciprocal_poly(reflected_poly: u64) -> u64 {
  // Shift drops the (implicit) x^64 term's mirror and makes room for the
  // constant term, which is always 1 for a CRC generator polynomial.
  let shifted = reflected_poly << 1;
  shifted | 1
}

/// Compute the normal (non-reflected) polynomial from a reflected polynomial.
///
/// For width-64 CRC polynomials, the normal and reflected forms are bit-reverses.
#[must_use]
const fn normal_poly(reflected_poly: u64) -> u64 {
  u64::reverse_bits(reflected_poly)
}

/// Compute folding constant `K_n = bit_reverse(x^n mod (x^64 ⊕ NORMAL))`.
#[must_use]
const fn fold_k(normal_poly: u64, n: u32) -> u64 {
  // The residue is computed in the normal bit order, then reflected to match
  // the reflected-CRC data layout used by the CLMUL kernels.
  let residue = xpow_mod(n, normal_poly);
  residue.reverse_bits()
}

/// Compute a `(high, low)` fold coefficient pair for folding 16 bytes by `shift_bytes`.
///
/// The TiKV/Intel CLMUL folding step uses a pair `(K_{d-1}, K_{d+63})` where
/// `d = 8 * shift_bytes` (in bits).
#[must_use]
pub(crate) const fn fold16_coeff_for_bytes(reflected_poly: u64, shift_bytes: u32) -> (u64, u64) {
  match shift_bytes {
    // A zero shift folds nothing, so no meaningful coefficients exist.
    0 => (0, 0),
    bytes => {
      let normal = normal_poly(reflected_poly);
      // `bytes >= 1` here, so `d >= 8` and `d - 1` cannot underflow.
      let d = bytes * 8;
      (fold_k(normal, d - 1), fold_k(normal, d + 63))
    }
  }
}

impl Crc64ClmulConstants {
  /// Derive the full CLMUL constant set from a reflected generator polynomial.
  #[must_use]
  pub const fn new(reflected_poly: u64) -> Self {
    let reciprocal = reciprocal_poly(reflected_poly);
    let normal = normal_poly(reflected_poly);
    // K_127 does double duty: it is the high half of the 16-byte tail fold
    // pair and also the 16B→8B fold coefficient.
    let k127 = fold_k(normal, 127);

    Self {
      poly: reciprocal,
      mu: compute_tikv_mu(reciprocal),
      fold_128b: (fold_k(normal, 1023), fold_k(normal, 1087)),
      tail_fold_16b: [
        (fold_k(normal, 895), fold_k(normal, 959)), // fold across 112 bytes
        (fold_k(normal, 767), fold_k(normal, 831)), // fold across 96 bytes
        (fold_k(normal, 639), fold_k(normal, 703)), // fold across 80 bytes
        (fold_k(normal, 511), fold_k(normal, 575)), // fold across 64 bytes
        (fold_k(normal, 383), fold_k(normal, 447)), // fold across 48 bytes
        (fold_k(normal, 255), fold_k(normal, 319)), // fold across 32 bytes
        (k127, fold_k(normal, 191)),                // fold across 16 bytes
      ],
      fold_8b: k127,
    }
  }
}

/// Compute TiKV `MU` for Barrett reduction.
///
/// TiKV's CRC64 CLMUL reduction uses `MU = POLY⁻¹ mod x^64` where `POLY` is the
/// reciprocal polynomial (low 64 bits; the x^64 term is implicit). Equivalently:
/// `(MU ⊗ POLY) mod x^64 == 1`.
///
/// Since `POLY` always has constant term 1, the inverse exists in
/// GF(2)[x]/(x^64). We compute the inverse bit-by-bit as a power series.
#[must_use]
const fn compute_tikv_mu(poly: u64) -> u64 {
  // Power-series inversion: q_0 = 1 (because p_0 = 1), and for k >= 1 the
  // convolution coefficient (p · q)_k must vanish, giving
  // q_k = Σ_{j=0..k-1} p_{k-j} · q_j over GF(2).
  let mut inv: u64 = 1;

  let mut k: u32 = 1;
  while k < 64 {
    let mut parity: u64 = 0;

    let mut j: u32 = 0;
    while j < k {
      parity ^= ((poly >> (k - j)) & 1) & ((inv >> j) & 1);
      j += 1;
    }

    inv |= parity << k;
    k += 1;
  }

  inv
}

// ─────────────────────────────────────────────────────────────────────────────
// Pre-computed constant sets for CRC-64.
// Both sets are fully evaluated at compile time from the reflected generator
// polynomials, so no runtime initialization is needed.
pub(crate) const CRC64_XZ_CLMUL: Crc64ClmulConstants = Crc64ClmulConstants::new(CRC64_XZ_POLY);
pub(crate) const CRC64_NVME_CLMUL: Crc64ClmulConstants = Crc64ClmulConstants::new(CRC64_NVME_POLY);

// ─────────────────────────────────────────────────────────────────────────────
// Multi-stream folding constants for CRC-64.
// ─────────────────────────────────────────────────────────────────────────────

/// Multi-stream folding constants for CRC-64 CLMUL kernels.
///
/// These constants support multi-way ILP (instruction-level parallelism)
/// optimizations on x86_64 (PCLMULQDQ/VPCLMULQDQ) and aarch64 (PMULL).
///
/// - `fold_256b`: 2-way striping (both architectures); also used by the
///   4×512-bit VPCLMULQDQ kernel, which processes 256 bytes (2048 bits) per
///   iteration, so its folding distance is likewise 256 bytes
/// - `fold_384b`: 3-way striping (aarch64)
/// - `fold_512b`: 4-way striping (x86_64)
/// - `fold_896b`: 7-way striping (x86_64)
/// - `fold_1024b`: 8-way striping (x86_64, Intel/Linux kernel standard)
/// - `combine_4way`: merge coefficients for 4-way (x86_64)
/// - `combine_7way`: merge coefficients for 7-way (x86_64)
/// - `combine_8way`: merge coefficients for 8-way (x86_64)
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
#[derive(Clone, Copy, Debug)]
#[allow(dead_code)] // Field subsets vary by architecture (x86_64 vs aarch64 stream widths).
pub(crate) struct Crc64StreamConstants {
  /// 2-way fold coefficient (256B = 2×128B).
  pub fold_256b: (u64, u64),
  /// 3-way fold coefficient (384B = 3×128B).
  pub fold_384b: (u64, u64),
  /// 4-way fold coefficient (512B = 4×128B).
  pub fold_512b: (u64, u64),
  /// 7-way fold coefficient (896B = 7×128B).
  pub fold_896b: (u64, u64),
  /// 8-way fold coefficient (1024B = 8×128B).
  pub fold_1024b: (u64, u64),
  /// 4-way combine coefficients: shifts by 384B, 256B, 128B.
  pub combine_4way: [(u64, u64); 3],
  /// 7-way combine coefficients: shifts by 768B, 640B, 512B, 384B, 256B, 128B.
  pub combine_7way: [(u64, u64); 6],
  /// 8-way combine coefficients: shifts by 896B, 768B, 640B, 512B, 384B, 256B, 128B.
  pub combine_8way: [(u64, u64); 7],
}

#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
impl Crc64StreamConstants {
  /// Compute all multi-stream folding constants for a given polynomial.
  #[must_use]
  pub const fn new(reflected_poly: u64) -> Self {
    // Each distinct folding distance is computed once and shared between the
    // per-stream fold coefficients and the combine tables below.
    let by_128 = fold16_coeff_for_bytes(reflected_poly, 128);
    let by_256 = fold16_coeff_for_bytes(reflected_poly, 256);
    let by_384 = fold16_coeff_for_bytes(reflected_poly, 384);
    let by_512 = fold16_coeff_for_bytes(reflected_poly, 512);
    let by_640 = fold16_coeff_for_bytes(reflected_poly, 640);
    let by_768 = fold16_coeff_for_bytes(reflected_poly, 768);
    let by_896 = fold16_coeff_for_bytes(reflected_poly, 896);
    let by_1024 = fold16_coeff_for_bytes(reflected_poly, 1024);

    Self {
      fold_256b: by_256,
      fold_384b: by_384,
      fold_512b: by_512,
      fold_896b: by_896,
      fold_1024b: by_1024,
      // Combine tables list shift distances from widest to narrowest stream gap.
      combine_4way: [by_384, by_256, by_128],
      combine_7way: [by_768, by_640, by_512, by_384, by_256, by_128],
      combine_8way: [by_896, by_768, by_640, by_512, by_384, by_256, by_128],
    }
  }
}

// Pre-computed multi-stream constants for CRC-64 (compile-time evaluated, same
// reflected polynomials as the single-stream CLMUL constant sets above).
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
pub(crate) const CRC64_XZ_STREAM: Crc64StreamConstants = Crc64StreamConstants::new(CRC64_XZ_POLY);
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
pub(crate) const CRC64_NVME_STREAM: Crc64StreamConstants = Crc64StreamConstants::new(CRC64_NVME_POLY);

// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
  // Unit tests for the GF(2) arithmetic primitives plus golden-value checks
  // that pin the generated constants against TiKV's published tables.
  use super::*;

  #[test]
  fn test_clmul64_basic() {
    // 0 * anything = 0
    assert_eq!(clmul64(0, 12345), (0, 0));
    assert_eq!(clmul64(12345, 0), (0, 0));

    // 1 * x = x
    assert_eq!(clmul64(1, 0x1234), (0, 0x1234));
    assert_eq!(clmul64(0x1234, 1), (0, 0x1234));

    // x * x = x^2 (2 * 2 = 4 in GF(2))
    assert_eq!(clmul64(2, 2), (0, 4));

    // (x+1) * (x+1) = x^2 + 1 (no 2x term in GF(2))
    assert_eq!(clmul64(3, 3), (0, 5)); // 3*3 = 5 in GF(2)
  }

  #[test]
  fn test_clmul64_overflow() {
    // High values that overflow into the high 64 bits
    let a = 1u64 << 63;
    let b = 2u64; // x
    // (x^63) * x = x^64, which is bit 64 (in high part)
    let (hi, lo) = clmul64(a, b);
    assert_eq!(hi, 1);
    assert_eq!(lo, 0);
  }

  #[test]
  fn test_xpow_mod_basic() {
    let poly = CRC64_XZ_CLMUL.poly;

    // x^0 = 1 (multiplicative identity)
    assert_eq!(xpow_mod(0, poly), 1);

    // x^1 = 2 (just x in reflected form)
    assert_eq!(xpow_mod(1, poly), 2);

    // x^2 = 4
    assert_eq!(xpow_mod(2, poly), 4);

    // Higher powers should be reduced mod poly
    let x64 = xpow_mod(64, poly);
    // x^64 mod P should equal poly (since P = x^64 + poly)
    assert_eq!(x64, poly);
  }

  #[test]
  fn test_xpow_mod_xz_polynomial() {
    // Verify some known values for CRC-64-XZ
    let poly = CRC64_XZ_POLY;

    // x^64 mod P = poly (by definition of the polynomial)
    assert_eq!(xpow_mod(64, poly), poly);

    // Sanity check: higher powers reduce to non-trivial 64-bit residues
    assert!(xpow_mod(128, poly) != 0);
    assert!(xpow_mod(1024, poly) != 0);
  }

  #[test]
  fn test_fold_constants_generated() {
    // Just verify the constants are generated without panic
    // Constant sets should be distinct
    assert_ne!(CRC64_XZ_CLMUL.poly, CRC64_NVME_CLMUL.poly);
    assert_ne!(CRC64_XZ_CLMUL.mu, CRC64_NVME_CLMUL.mu);
  }

  #[test]
  fn test_barrett_mu() {
    // Sanity check only: the Barrett constant must be non-zero (the inverse
    // always exists since POLY has constant term 1). The exact value is
    // pinned against TiKV in test_xz_constants_match_tikv below.
    assert_ne!(CRC64_XZ_CLMUL.mu, 0);
  }

  #[test]
  fn test_polynomial_property() {
    // Verify that x^64 mod P = P (lower 64 bits)
    // This is the fundamental property of the polynomial
    // x^64 mod (x^64 + poly) = poly
    assert_eq!(xpow_mod(64, CRC64_XZ_CLMUL.poly), CRC64_XZ_CLMUL.poly);
    assert_eq!(xpow_mod(64, CRC64_NVME_CLMUL.poly), CRC64_NVME_CLMUL.poly);
  }

  #[test]
  fn test_reduce128_identity() {
    // Reducing a value less than 2^64 should return itself
    let poly = CRC64_XZ_POLY;
    assert_eq!(reduce128(0, 0x12345678, poly), 0x12345678);
    assert_eq!(reduce128(0, poly - 1, poly), poly - 1);
  }

  #[test]
  fn test_reduce128_single_bit() {
    let poly = CRC64_XZ_POLY;

    // Reducing x^64 (bit 64 set) should give poly
    assert_eq!(reduce128(1, 0, poly), poly);

    // Reducing x^65 (bit 65 set) should give poly * 2 mod P,
    // which must agree with the square-and-multiply path
    let x65 = reduce128(2, 0, poly);
    let expected = xpow_mod(65, poly);
    assert_eq!(x65, expected);
  }

  #[test]
  fn test_constants_symmetry() {
    // `K_127` should be non-zero.
    assert_ne!(CRC64_XZ_CLMUL.fold_8b, 0);
  }

  #[test]
  fn test_xz_constants_match_tikv() {
    // TiKV `crc64fast` constants (v1.1.0).
    assert_eq!(CRC64_XZ_CLMUL.poly, 0x92d8_af2b_af0e_1e85);
    assert_eq!(CRC64_XZ_CLMUL.mu, 0x9c3e_466c_1729_63d5);
    assert_eq!(CRC64_XZ_CLMUL.fold_128b, (0xd7d8_6b2a_f73d_e740, 0x8757_d71d_4fcc_1000));
    assert_eq!(
      CRC64_XZ_CLMUL.tail_fold_16b,
      [
        (0x9478_74de_5950_52cb, 0x9e73_5cb5_9b47_24da), // 112
        (0xe4ce_2cd5_5fea_0037, 0x2fe3_fd29_20ce_82ec), // 96
        (0x0e31_d519_421a_63a5, 0x2e30_2032_12ca_c325), // 80
        (0x081f_6054_a784_2df4, 0x6ae3_efbb_9dd4_41f3), // 64
        (0x69a3_5d91_c373_0254, 0xb5ea_1af9_c013_aca4), // 48
        (0x3be6_53a3_0fe1_af51, 0x6009_5b00_8a9e_fa44), // 32
        (0xdabe_95af_c787_5f40, 0xe05d_d497_ca39_3ae4), // 16
      ]
    );
    assert_eq!(CRC64_XZ_CLMUL.fold_8b, 0xdabe_95af_c787_5f40);
  }

  #[test]
  fn test_nvme_constants_match_tikv() {
    // TiKV `crc64fast-nvme` constants (v1.2.1).
    assert_eq!(CRC64_NVME_CLMUL.poly, 0x34d9_2653_5897_936b);
    assert_eq!(CRC64_NVME_CLMUL.mu, 0x27ec_fa32_9aef_9f77);
    assert_eq!(
      CRC64_NVME_CLMUL.fold_128b,
      (0x5f85_2fb6_1e8d_92dc, 0xa1ca_681e_733f_9c40)
    );
    assert_eq!(
      CRC64_NVME_CLMUL.tail_fold_16b,
      [
        (0x9465_8840_3d4a_dcbc, 0xd083_dd59_4d96_319d), // 112
        (0x34f5_a24e_22d6_6e90, 0x3c25_5f5e_bc41_4423), // 96
        (0x0336_3823_e6e7_91e5, 0x7b0a_b10d_d0f8_09fe), // 80
        (0x6224_2240_ace5_045a, 0x0c32_cdb3_1e18_a84a), // 64
        (0xa3ff_dc1f_e8e8_2a8b, 0xbdd7_ac0e_e1a4_a0f0), // 48
        (0xe1e0_bb9d_45d7_a44c, 0xb0bc_2e58_9204_f500), // 32
        (0x21e9_761e_2526_21ac, 0xeadc_41fd_2ba3_d420), // 16
      ]
    );
    assert_eq!(CRC64_NVME_CLMUL.fold_8b, 0x21e9_761e_2526_21ac);
  }
}