cryptography-rs 0.6.2

Block ciphers, hashes, public-key, and post-quantum primitives implemented directly from their specifications and original papers.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
//! Constant-time helpers, the OWCPA + FO-style KEM core, and the
//! `NtruVariant` trait shared by the NIST PQC NTRU modules
//! ([`crate::public_key::ntru_hps509`], `_hps677`, `_hps821`, `_hrss701`).
//!
//! This module hosts the algorithmic core that the four per-set NTRU
//! PQC modules in this crate share. Each per-set file is the
//! parameter constants plus a single `impl NtruVariant<N, LOGQ>` and
//! a `define_pqc_kem!` invocation; everything else lives here.
//!
//! Reference: round-3 NTRU specification (Chen, Chung, Hülsing, Lange,
//! Lyubashevsky, Saito, Schanck, Schwabe, Stehlé, Whyte, Xagawa,
//! Yamakawa, Zhang; NIST PQC, 2020-10-16).
//!
//! Construction (HPS variants `Hps509Variant` / `Hps677Variant` /
//! `Hps821Variant`; HRSS variant `Hrss701Variant`):
//!
//! - Ring $\mathbb{Z}_q[x] / (x^N - 1)$ with operations projected onto
//!   $\mathbb{Z}_q[x] / \Phi_n(x)$ where
//!   $\Phi_n(x) = (x^N - 1) / (x - 1)$ for the `Sq` and `S3` views.
//! - One-way CPA-secure encryption (OWCPA) under the trapdoor
//!   $(f, g)$ with public key $h = g / f$ in $R_q$, encryption
//!   $c = r \cdot h + \text{lift}(m)$, decryption recovering
//!   $(r, m)$. Variant-specific bits ($g$-update, $\text{lift}$,
//!   message validation) come from [`NtruVariant`] methods.
//! - CCA KEM via the SXY/Sch18 Fujisaki–Okamoto-style transform:
//!   shared key $K = \text{SHA3-256}(r \mathbin\| m)$, with
//!   deterministic implicit rejection
//!   $K = \text{SHA3-256}(\text{prf} \mathbin\| c)$ on any
//!   decapsulation failure. SHA3-256 and AES-256 CTR-DRBG come
//!   from this crate's `hash` and `cprng` modules; no C/FFI
//!   backends are used.
//!
//! Implementation notes shared by all four parameter sets:
//!
//! - inversion in $R_2 = \mathbb{F}_2[x] / (x^N - 1)$ and in
//!   $S_3 = \mathbb{F}_3[x] / \Phi_n(x)$ uses the constant-time gcd
//!   recursion of Bernstein and Yang ("Fast constant-time gcd
//!   computation and modular inversion", TCHES 2019).
//! - the fixed-weight `T_fixed` sampler ([`sample_fixed_type`], HPS
//!   only) tags each candidate coefficient with 30 random bits and
//!   a 2-bit trinary intent, then sorts by tag using Batcher's
//!   merge-exchange sorting network ([`crypto_sort_int32`]; Batcher,
//!   "Sorting networks and their applications", AFIPS 1968).
//! - polynomial arithmetic and packings are implemented in-tree;
//!   the cyclic multiplier ([`crate::public_key::ntru_poly_mul`])
//!   uses Karatsuba over `u16` wrapping arithmetic.
//!
//! Side-channel inventory (the per-set modules link here instead of
//! repeating it):
//!
//! - **Constant-time** (data-independent control flow): the
//!   Bernstein–Yang $R_2$ and $S_3$ inverters, the four-round
//!   Newton/Hensel lift to $R_q$, the Batcher fixed-weight sort
//!   ([`crypto_sort_int32`], used by HPS only), [`cmov`], `mod3`,
//!   `mod3_u8`, the IID-uniform-mod-3 sampler, the SHA3-256 + AES-256
//!   CTR-DRBG implementations from this crate's `hash` and `cprng`
//!   modules, and the polynomial multiplier in
//!   [`crate::public_key::ntru_poly_mul`] (its schoolbook base case
//!   issues exactly one `wrapping_mul` and one `wrapping_add` per
//!   coefficient pair, with no early-skip on zeros, and Karatsuba
//!   inherits that property recursively).
//!
//! - **Caveats**: `u16::wrapping_mul` is only constant-time at the
//!   hardware level on architectures whose integer multiplier is
//!   itself constant-time, which is the case on every CPU this crate
//!   targets (modern AArch64 / x86-64 / RISC-V `MUL`). The four NIST
//!   PQC NTRU modules remain re-exported under [`crate::vt`] because
//!   that namespace is this crate's convention for "public-key
//!   primitives that have not been independently formally vetted as
//!   constant-time across all relevant micro-architectural channels"
//!   — e.g. cache-timing on the `params` accesses, branch-predictor
//!   training on the FO transform — not because of any specific
//!   data-dependent branch in the source.

/// Constant-time conditional copy of `x` into `r`.
///
/// With `b == 1` every byte of `r` is overwritten by the matching byte of
/// `x`; with `b == 0` the buffer `r` keeps its contents. Passing any other
/// selector, or slices of different lengths, is a caller error that is only
/// checked in debug builds.
///
/// The selector is stretched into a byte mask by computing `0 - b` in
/// wrapping `u8` arithmetic (`0xff` for `b = 1`, `0x00` for `b = 0`); each
/// output byte is then `r ^ (mask & (r ^ x))`, so no branch depends on `b`.
pub(crate) fn cmov(r: &mut [u8], x: &[u8], b: u8) {
    debug_assert_eq!(r.len(), x.len());
    debug_assert!(b == 0 || b == 1);
    let select = 0u8.wrapping_sub(b);
    r.iter_mut()
        .zip(x)
        .for_each(|(dst, &src)| *dst ^= select & (*dst ^ src));
}

/// Branch-free compare-exchange on two signed `i32` slots: afterwards `*a`
/// holds the smaller value and `*b` the larger one.
///
/// The difference `b - a` is formed in 64 bits and truncated, then patched
/// with the operands' differing bits so that its sign bit is set exactly
/// when `b < a`, even when the 32-bit subtraction would have overflowed.
/// That sign bit is smeared into a full-width mask which selects whether
/// the XOR-difference of the operands is applied to both slots (a swap) or
/// not. This is the comparator used by [`crypto_sort_int32`].
#[inline(always)]
fn int32_minmax(a: &mut i32, b: &mut i32) {
    let (first, second) = (*a, *b);
    let differing = second ^ first;
    let diff = i64::from(second).wrapping_sub(i64::from(first)) as i32;
    // All-ones when `second < first`, restricted to the bits that differ.
    let swap = ((diff ^ (differing & (diff ^ second))) >> 31) & differing;
    *a = first ^ swap;
    *b = second ^ swap;
}

/// Compare-exchange of `array[lo]` and `array[hi]` (`lo < hi`), leaving the
/// smaller value at `lo`.
#[inline]
fn sort_net_exchange(array: &mut [i32], lo: usize, hi: usize) {
    let (head, tail) = array.split_at_mut(hi);
    int32_minmax(&mut head[lo], &mut tail[0]);
}

/// Runs the comparator chain anchored at slot `j + p`: the value is pulled
/// into a local, compare-exchanged against `array[j + r]` for
/// `r = q, q/2, …, 2p`, and the surviving minimum is written back.
#[inline]
fn sort_net_chain(array: &mut [i32], j: usize, p: usize, q: usize) {
    let mut a = array[j + p];
    let mut r = q;
    while r > p {
        int32_minmax(&mut a, &mut array[j + r]);
        r >>= 1;
    }
    array[j + p] = a;
}

/// Sort `array` ascending using Batcher's merge-exchange network.
///
/// Every comparator is data-independent (only the two slot indices vary),
/// so the resulting sort is constant-time conditional on `array.len()`.
/// Used by the NIST PQC `T_fixed` sampler to permute by 30-bit random
/// tags without revealing the tag values through timing.
///
/// Structure, for each stride `p = top, top/2, …, 1` (`top` is the largest
/// power of two strictly below `array.len()`):
///
/// 1. every slot `j` in an even `p`-block is compare-exchanged with
///    `j + p`;
/// 2. for `q = top, top/2, …, 2p`, each even-block slot `j` that has just
///    come into range (`j + q < n`) gets its chain
///    `array[j + p]` vs. `array[j + q], …, array[j + 2p]` applied.
///
/// The cursor `j` and block start `i` are carried from one `q` pass to the
/// next, so a block left half-processed when `j` hit `n - q` is resumed
/// where it stopped and every slot's chain runs exactly once per stride.
/// The full-block and tail loops therefore advance `j` itself rather than a
/// scratch index: a stale `j` makes later passes re-walk slots that are
/// already settled and apply comparators the network does not contain.
///
/// Reference: Batcher, "Sorting networks and their applications" (AFIPS
/// 1968).
pub(crate) fn crypto_sort_int32(array: &mut [i32]) {
    let n = array.len();
    if n < 2 {
        return;
    }
    let mut top: usize = 1;
    while top < n - top {
        top += top;
    }

    let mut p = top;
    while p >= 1 {
        // Step 1: pair every even-block slot with its partner `p` away.
        let mut i = 0usize;
        while i + 2 * p <= n {
            for j in i..i + p {
                sort_net_exchange(array, j, j + p);
            }
            i += 2 * p;
        }
        // Trailing partial block (empty when `i >= n - p`).
        for j in i..n.saturating_sub(p) {
            sort_net_exchange(array, j, j + p);
        }

        // Step 2: merge chains. `i` is the start of the current even block,
        // `j` the next slot whose chain has not run yet.
        let mut i = 0usize;
        let mut j = 0usize;
        let mut q = top;
        while q > p {
            // Slots at or beyond `limit` would read past the end for `r = q`.
            let limit = n - q;
            let mut block_still_open = false;

            if j != i {
                // Resume the block the previous pass had to abandon.
                loop {
                    if j == limit {
                        block_still_open = true;
                        break;
                    }
                    sort_net_chain(array, j, p, q);
                    j += 1;
                    if j == i + p {
                        i += 2 * p;
                        break;
                    }
                }
            }

            if !block_still_open {
                // Whole even blocks that fit below `limit`.
                while i + p <= limit {
                    for k in i..i + p {
                        sort_net_chain(array, k, p, q);
                    }
                    i += 2 * p;
                }
                // Here `i + p > limit`: handle the leading part of the next
                // block and remember in `j` how far it got.
                j = i;
                while j < limit {
                    sort_net_chain(array, j, p, q);
                    j += 1;
                }
            }

            q >>= 1;
        }

        p >>= 1;
    }
}

/// Mask that is all-ones (`-1`) exactly when `x` and `y` are both negative
/// and `0` in every other case.
///
/// The AND of the operands has its sign bit set only if both sign bits are
/// set; the arithmetic right shift then replicates that bit across the
/// whole word. The Bernstein–Yang inverter loops use the result as their
/// swap selector.
#[inline(always)]
pub(crate) fn both_negative_mask_i16(x: i16, y: i16) -> i16 {
    let shared_bits = x & y;
    shared_bits >> (i16::BITS - 1)
}

/// Branch-free reduction of a full-range `u16` modulo 3.
///
/// Because $2^8$, $2^4$ and $2^2$ are all congruent to 1 modulo 3, adding
/// the high and low halves of a value preserves its residue. Four such
/// folds shrink the input to at most 5, after which one masked
/// "subtract 3 unless that goes negative" step yields the canonical
/// residue in $\{0, 1, 2\}$. All four NIST PQC NTRU parameter sets share
/// this routine.
#[inline]
pub(crate) fn mod3(a: u16) -> u16 {
    let fold8 = (a >> 8) + (a & 0xff); // at most 510
    let fold4 = (fold8 >> 4) + (fold8 & 0xf); // at most 46
    let fold2 = (fold4 >> 2) + (fold4 & 0x3); // at most 14
    let small = (fold2 >> 2) + (fold2 & 0x3); // at most 5
    let lowered = (small as i16) - 3;
    // All-ones when `small < 3`, i.e. when the subtraction must be undone.
    let keep = (lowered >> 15) as u16;
    (keep & small) | (!keep & (lowered as u16))
}

/// Branch-free reduction modulo 3 for small inputs, $a \in [0, 14]$.
///
/// A single fold $(a \gg 2) + (a \mathbin\& 3)$ maps every $a \le 14$
/// into $[0, 5]$ without changing the residue; a masked conditional
/// subtraction of 3 then produces the canonical value. The
/// Bernstein–Yang $\mathbb{F}_3$ inverter in this module passes at most 9
/// (`(a[i] & 3) + 2 * (a[N - 1] & 3)`), so every call site visible here
/// respects the bound. For $a \ge 15$ the result is not a valid residue
/// (e.g. `mod3_u8(15) = 3`); debug builds reject such inputs, and `mod3`
/// should be used for arbitrary `u16` values.
#[inline]
pub(crate) fn mod3_u8(a: u8) -> u8 {
    debug_assert!(a <= 14, "mod3_u8 input out of range: {a}");
    let folded = i16::from((a >> 2) + (a & 3));
    let lowered = folded - 3;
    // `lowered` lies in [-3, 2] for in-range inputs, so shifting by 5 is
    // enough to turn its sign into an all-ones / all-zeros mask.
    let negative = lowered >> 5;
    (lowered ^ (negative & (folded ^ lowered))) as u8
}

/// Fluent wrapper over `Digest::update` for the NIST FO-style KEM
/// transforms, so a hash over several inputs can be written as one
/// expression — `Sha3_256::new().chain(a).chain(b).finalize()` — instead
/// of a run of separate `update` statements.
pub(crate) trait DigestChain: crate::hash::Digest + Sized {
    /// Feeds `data` to the hasher through `update` and returns the hasher
    /// by value so further calls can be chained.
    fn chain(mut self, data: &[u8]) -> Self {
        self.update(data);
        self
    }
}

// Blanket impl: every `Digest` gets `chain` for free.
impl<D: crate::hash::Digest> DigestChain for D {}

// ---- shared polynomial inverters (Bernstein–Yang + Hensel) -----------------

/// Constant-time inversion over $\mathbb{F}_2$: writes to `r` the inverse
/// of `a` taken modulo $\Phi_n(x) = (x^N - 1)/(x - 1)$.
///
/// Only bit 0 of each input coefficient is used, and `a` is first reduced
/// modulo $\Phi_n$ by XOR-ing its top coefficient into all the others. The
/// result has coefficients in $\{0, 1\}$ with `r[N - 1] == 0`.
///
/// The body is the swap-and-shift gcd recursion of Bernstein and Yang
/// ("Fast constant-time gcd computation and modular inversion", TCHES
/// 2019), run for the fixed count of $2(N - 1) - 1$ division steps. `f`
/// and `g` hold the two gcd operands in reversed coefficient order, `v` and
/// `w` the matching cofactors. Swaps and eliminations are applied through
/// masks, so the sequence of operations does not depend on the data.
pub(crate) fn poly_r2_inv<const N: usize>(r: &mut [u16; N], a: &[u16; N]) {
    // f = reversed Phi_n (all ones), g = reversed (a mod Phi_n).
    let mut f = [1u16; N];
    let mut g = [0u16; N];
    let mut v = [0u16; N];
    let mut w = [0u16; N];
    w[0] = 1;

    let top = a[N - 1];
    for (slot, &coeff) in g[..N - 1].iter_mut().rev().zip(a.iter()) {
        *slot = (coeff ^ top) & 1;
    }

    let mut delta: i16 = 1;
    let steps = 2 * (N - 1) - 1;
    for _ in 0..steps {
        // v <- x * v
        v.copy_within(0..N - 1, 1);
        v[0] = 0;

        // Both selectors are derived from the state *before* the swap.
        let eliminate = g[0] & f[0];
        let swap = both_negative_mask_i16(-delta, -(g[0] as i16));
        delta ^= swap & (delta ^ -delta);
        delta += 1;

        let swap_bits = swap as u16;
        for i in 0..N {
            // Conditional swap of (f, g) and (v, w) at this index ...
            let fg = swap_bits & (f[i] ^ g[i]);
            f[i] ^= fg;
            g[i] ^= fg;
            let vw = swap_bits & (v[i] ^ w[i]);
            v[i] ^= vw;
            w[i] ^= vw;
            // ... followed by the conditional elimination step.
            g[i] ^= eliminate & f[i];
            w[i] ^= eliminate & v[i];
        }

        // g <- g / x
        g.copy_within(1..N, 0);
        g[N - 1] = 0;
    }

    // Undo the coefficient reversal while copying the cofactor out.
    for (out, &coeff) in r[..N - 1].iter_mut().zip(v[..N - 1].iter().rev()) {
        *out = coeff;
    }
    r[N - 1] = 0;
}

/// Constant-time inverse of `a` in $S_3 = \mathbb{F}_3[x] / \Phi_n(x)$.
///
/// Only the low two bits of each input coefficient are used, and `a` is
/// reduced modulo $\Phi_n$ up front by adding twice (i.e. subtracting) its
/// top coefficient. The result has coefficients in $\{0, 1, 2\}$ with
/// `r[N - 1] == 0`.
///
/// This is the same Bernstein–Yang division-step recursion as
/// [`poly_r2_inv`], with $2(N - 1) - 1$ fixed steps, but the elimination
/// multiplies by a scalar in $\mathbb{F}_3$ instead of XOR-ing. `mod3_u8`
/// brings every updated coefficient back into $\{0, 1, 2\}$. The closing
/// multiplication by `f[0]` normalises the cofactor by the constant the
/// gcd operand ended on.
pub(crate) fn poly_s3_inv<const N: usize>(r: &mut [u16; N], a: &[u16; N]) {
    // f = reversed Phi_n (all ones), g = reversed (a mod Phi_n).
    let mut f = [1u16; N];
    let mut g = [0u16; N];
    let mut v = [0u16; N];
    let mut w = [0u16; N];
    w[0] = 1;

    let twice_top = 2 * (a[N - 1] & 3);
    for (slot, &coeff) in g[..N - 1].iter_mut().rev().zip(a.iter()) {
        *slot = mod3_u8(((coeff & 3) + twice_top) as u8) as u16;
    }

    let mut delta: i16 = 1;
    let steps = 2 * (N - 1) - 1;
    for _ in 0..steps {
        // v <- x * v
        v.copy_within(0..N - 1, 1);
        v[0] = 0;

        // Elimination scalar and swap selector, both from the pre-swap state.
        let scale = mod3_u8((2 * g[0] * f[0]) as u8) as u16;
        let swap = both_negative_mask_i16(-delta, -(g[0] as i16));
        delta ^= swap & (delta ^ -delta);
        delta += 1;

        let swap_bits = swap as u16;
        for i in 0..N {
            // Conditional swap of (f, g) and (v, w) at this index ...
            let fg = swap_bits & (f[i] ^ g[i]);
            f[i] ^= fg;
            g[i] ^= fg;
            let vw = swap_bits & (v[i] ^ w[i]);
            v[i] ^= vw;
            w[i] ^= vw;
            // ... then g += scale * f and w += scale * v over F_3.
            g[i] = mod3_u8((g[i] + scale * f[i]) as u8) as u16;
            w[i] = mod3_u8((w[i] + scale * v[i]) as u8) as u16;
        }

        // g <- g / x
        g.copy_within(1..N, 0);
        g[N - 1] = 0;
    }

    // Scale by the final constant in f and undo the coefficient reversal.
    let unit = f[0];
    for (out, &coeff) in r[..N - 1].iter_mut().zip(v[..N - 1].iter().rev()) {
        *out = mod3_u8((unit * coeff) as u8) as u16;
    }
    r[N - 1] = 0;
}

/// Lifts an inverse modulo 2 to an inverse in
/// $R_q = \mathbb{Z}_q[x] / (x^N - 1)$.
///
/// `ai` is the starting approximation (an inverse of `a` modulo 2) and `a`
/// the polynomial being inverted; the lifted inverse is written to `r`.
///
/// Each Newton step replaces an approximation $b$ with
/// $b \cdot (2 - a \cdot b)$, which turns $a \cdot b \equiv 1 \pmod{2^k}$
/// into $a \cdot b \equiv 1 \pmod{2^{2k}}$. Four steps take the precision
/// from $2^1$ to $2^{16}$, enough for every $q \le 2^{13}$ in this NTRU
/// family. Arithmetic is wrapping `u16` throughout; reduction modulo $q$ is
/// left to the caller.
pub(crate) fn poly_r2_inv_to_rq_inv<const N: usize>(
    r: &mut [u16; N],
    ai: &[u16; N],
    a: &[u16; N],
) {
    use crate::public_key::ntru_poly_mul::poly_mul_cyclic;

    // Precompute -a so that `2 - a*b` becomes `(-a)*b + 2`.
    let mut neg_a = [0u16; N];
    for (dst, &src) in neg_a.iter_mut().zip(a.iter()) {
        *dst = src.wrapping_neg();
    }
    r.copy_from_slice(ai);

    let mut factor = [0u16; N];
    let mut approx = [0u16; N];

    // Two rounds of two Newton steps each; the approximation ping-pongs
    // between `r` and `approx` so the final value lands in `r`.
    for _ in 0..2 {
        poly_mul_cyclic(&mut factor, r, &neg_a);
        factor[0] = factor[0].wrapping_add(2);
        poly_mul_cyclic(&mut approx, &factor, r);

        poly_mul_cyclic(&mut factor, &approx, &neg_a);
        factor[0] = factor[0].wrapping_add(2);
        poly_mul_cyclic(r, &factor, &approx);
    }
}

// ---- per-set wrapper macro --------------------------------------------------
//
// Each NIST PQC NTRU set ships a typed wrapper around the shared
// `kem_keypair_seeded` / `kem_enc_seeded` / `kem_dec` routines and a fixed
// set of byte-length constants (`PUBLIC_KEY_BYTES`, `PRIVATE_KEY_BYTES`,
// `CIPHERTEXT_BYTES`, `SHARED_SECRET_BYTES`). The wrapper, the
// newtype quartet, the `Debug` impls, the `from_wire_bytes` /
// `to_wire_bytes` / `as_bytes` methods, and the standard generic test
// scaffolding (round-trip, implicit rejection, wire-format round-trip,
// sampled NIST KAT, full NIST KAT) are mechanical — this macro emits
// them so each NIST module is just the algebra plus the parameter
// constants.
//
// Caller-scope identifiers the expansion captures (these must exist
// in the calling module's namespace):
//   - `N` (`const usize`): ring degree for this parameter set.
//   - `LOGQ` (`const usize`): $\log_2 q$ for this parameter set.
//   - `PUBLIC_KEY_BYTES`, `PRIVATE_KEY_BYTES`, `CIPHERTEXT_BYTES`,
//     `SHARED_SECRET_BYTES` (`const usize`): wire-format byte sizes
//     used as `[u8; …]` element counts in the newtype storage.
//   - `SAMPLE_FG_BYTES`, `SAMPLE_RM_BYTES`, `OWCPA_MSGBYTES`
//     (`const usize`): scratch-buffer sizes the macro stack-allocates
//     and threads into the shared kem_*_seeded / kem_dec routines.
// Every NIST PQC per-set file in this crate defines these; a future
// module that uses different naming will hit a confusing macro-side
// resolution error, so this list is the contract.

// Emits, for one parameter set: four byte-array newtypes (public key,
// private key, ciphertext, shared secret), their wire-format accessors and
// `Debug` impls, a unit struct `$type_name` carrying `keygen` / `encaps` /
// `decaps`, and a `tests` module. The constants listed in the comment above
// are resolved in the invoking module, not here.
macro_rules! define_pqc_kem {
    (
        // Name of the unit struct that hosts the KEM entry points.
        namespace = $type_name:ident,
        // Names of the four newtypes to generate.
        public_key = $pk_ty:ident,
        private_key = $sk_ty:ident,
        ciphertext = $ct_ty:ident,
        shared_secret = $ss_ty:ident,
        // Type passed as the first generic argument of the shared
        // `kem_*` routines.
        variant = $variant:ident,
        // Path handed to `include_str!` for the KAT response file.
        kat_path = $kat_path:literal $(,)?
    ) => {
        /// Public key, stored as exactly `PUBLIC_KEY_BYTES` wire-format bytes.
        #[derive(Clone, Eq, PartialEq)]
        pub struct $pk_ty {
            bytes: [u8; PUBLIC_KEY_BYTES],
        }

        /// Private key, stored as exactly `PRIVATE_KEY_BYTES` wire-format bytes.
        // NOTE(review): the derived `PartialEq` is an ordinary array
        // comparison; confirm no caller compares private keys where timing
        // matters.
        #[derive(Clone, Eq, PartialEq)]
        pub struct $sk_ty {
            bytes: [u8; PRIVATE_KEY_BYTES],
        }

        /// Ciphertext, stored as exactly `CIPHERTEXT_BYTES` wire-format bytes.
        #[derive(Clone, Eq, PartialEq)]
        pub struct $ct_ty {
            bytes: [u8; CIPHERTEXT_BYTES],
        }

        /// Shared secret, stored as exactly `SHARED_SECRET_BYTES` bytes.
        // NOTE(review): same remark as for the private key regarding the
        // derived `PartialEq`.
        #[derive(Clone, Eq, PartialEq)]
        pub struct $ss_ty {
            bytes: [u8; SHARED_SECRET_BYTES],
        }

        impl $pk_ty {
            /// Copies `bytes` into a new public key. Returns `None` unless
            /// `bytes.len() == PUBLIC_KEY_BYTES`; the contents are not
            /// inspected here.
            #[must_use]
            pub fn from_wire_bytes(bytes: &[u8]) -> Option<Self> {
                if bytes.len() != PUBLIC_KEY_BYTES { return None; }
                let mut out = [0u8; PUBLIC_KEY_BYTES];
                out.copy_from_slice(bytes);
                Some(Self { bytes: out })
            }

            /// Returns a copy of the stored bytes.
            #[must_use]
            pub fn to_wire_bytes(&self) -> [u8; PUBLIC_KEY_BYTES] { self.bytes }

            /// Borrows the stored bytes.
            #[must_use]
            pub fn as_bytes(&self) -> &[u8; PUBLIC_KEY_BYTES] { &self.bytes }
        }

        impl $sk_ty {
            /// Copies `bytes` into a new private key. Returns `None` unless
            /// `bytes.len() == PRIVATE_KEY_BYTES`; the contents are not
            /// inspected here.
            #[must_use]
            pub fn from_wire_bytes(bytes: &[u8]) -> Option<Self> {
                if bytes.len() != PRIVATE_KEY_BYTES { return None; }
                let mut out = [0u8; PRIVATE_KEY_BYTES];
                out.copy_from_slice(bytes);
                Some(Self { bytes: out })
            }

            /// Returns a copy of the stored bytes.
            #[must_use]
            pub fn to_wire_bytes(&self) -> [u8; PRIVATE_KEY_BYTES] { self.bytes }

            /// Borrows the stored bytes.
            #[must_use]
            pub fn as_bytes(&self) -> &[u8; PRIVATE_KEY_BYTES] { &self.bytes }
        }

        impl $ct_ty {
            /// Copies `bytes` into a new ciphertext. Returns `None` unless
            /// `bytes.len() == CIPHERTEXT_BYTES`; the contents are not
            /// inspected here.
            #[must_use]
            pub fn from_wire_bytes(bytes: &[u8]) -> Option<Self> {
                if bytes.len() != CIPHERTEXT_BYTES { return None; }
                let mut out = [0u8; CIPHERTEXT_BYTES];
                out.copy_from_slice(bytes);
                Some(Self { bytes: out })
            }

            /// Returns a copy of the stored bytes.
            #[must_use]
            pub fn to_wire_bytes(&self) -> [u8; CIPHERTEXT_BYTES] { self.bytes }

            /// Borrows the stored bytes.
            #[must_use]
            pub fn as_bytes(&self) -> &[u8; CIPHERTEXT_BYTES] { &self.bytes }
        }

        // The shared secret has no `from_wire_bytes`: within this macro it is
        // only ever constructed by `encaps` / `decaps`.
        impl $ss_ty {
            /// Returns a copy of the stored bytes.
            #[must_use]
            pub fn to_wire_bytes(&self) -> [u8; SHARED_SECRET_BYTES] { self.bytes }

            /// Borrows the stored bytes.
            #[must_use]
            pub fn as_bytes(&self) -> &[u8; SHARED_SECRET_BYTES] { &self.bytes }
        }

        // `Debug` for the public key and ciphertext prints the type name with
        // an empty field list; the bytes themselves are never formatted.
        impl ::core::fmt::Debug for $pk_ty {
            fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
                f.debug_struct(stringify!($pk_ty)).finish()
            }
        }

        impl ::core::fmt::Debug for $ct_ty {
            fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
                f.debug_struct(stringify!($ct_ty)).finish()
            }
        }

        // `Debug` for the two secret types prints `TypeName(<redacted>)` so
        // that secret bytes cannot end up in logs through `{:?}`.
        impl ::core::fmt::Debug for $sk_ty {
            fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
                f.write_str(concat!(stringify!($sk_ty), "(<redacted>)"))
            }
        }

        impl ::core::fmt::Debug for $ss_ty {
            fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
                f.write_str(concat!(stringify!($ss_ty), "(<redacted>)"))
            }
        }

        /// Namespace type for this parameter set: carries the size constants
        /// and the `keygen` / `encaps` / `decaps` entry points.
        pub struct $type_name;

        impl $type_name {
            /// Wire-format public-key length in bytes for this set.
            pub const PUBLIC_KEY_BYTES: usize = PUBLIC_KEY_BYTES;
            /// Wire-format private-key length in bytes for this set
            /// (includes the implicit-rejection PRF key tail).
            pub const PRIVATE_KEY_BYTES: usize = PRIVATE_KEY_BYTES;
            /// Wire-format ciphertext length in bytes for this set.
            pub const CIPHERTEXT_BYTES: usize = CIPHERTEXT_BYTES;
            /// Shared-secret length in bytes (always 32 for the
            /// round-3 NTRU sets).
            pub const SHARED_SECRET_BYTES: usize = SHARED_SECRET_BYTES;

            /// Generates a key pair by handing `rng` and zero-initialised
            /// output and scratch buffers to the shared `kem_keypair_seeded`
            /// routine, then wraps the filled buffers in the typed newtypes.
            pub fn keygen<R: $crate::Csprng>(rng: &mut R) -> ($pk_ty, $sk_ty) {
                let mut pk = [0u8; PUBLIC_KEY_BYTES];
                let mut sk = [0u8; PRIVATE_KEY_BYTES];
                // Scratch space of `SAMPLE_FG_BYTES` passed through to the
                // shared routine.
                let mut seed_scratch = [0u8; SAMPLE_FG_BYTES];
                $crate::public_key::ntru_pqc_shared::kem_keypair_seeded::<$variant, R, N, LOGQ>(
                    &mut pk,
                    &mut sk,
                    rng,
                    &mut seed_scratch,
                );
                ($pk_ty { bytes: pk }, $sk_ty { bytes: sk })
            }

            /// Encapsulates to `pk`: calls the shared `kem_enc_seeded`
            /// routine with `rng` and returns the ciphertext together with
            /// the shared secret it produced.
            pub fn encaps<R: $crate::Csprng>(
                pk: &$pk_ty,
                rng: &mut R,
            ) -> ($ct_ty, $ss_ty) {
                let mut ct = [0u8; CIPHERTEXT_BYTES];
                let mut ss = [0u8; SHARED_SECRET_BYTES];
                // Two scratch buffers the shared routine works in.
                let mut rm_seed_scratch = [0u8; SAMPLE_RM_BYTES];
                let mut rm_scratch = [0u8; OWCPA_MSGBYTES];
                $crate::public_key::ntru_pqc_shared::kem_enc_seeded::<$variant, R, N, LOGQ>(
                    &mut ct,
                    &mut ss,
                    &pk.bytes,
                    rng,
                    &mut rm_seed_scratch,
                    &mut rm_scratch,
                );
                ($ct_ty { bytes: ct }, $ss_ty { bytes: ss })
            }

            /// Decapsulates `ct` under `sk` through the shared `kem_dec`
            /// routine. Always returns a shared secret; the tests below
            /// check that a corrupted ciphertext yields a different but
            /// repeatable value rather than an error.
            pub fn decaps(sk: &$sk_ty, ct: &$ct_ty) -> $ss_ty {
                let mut ss = [0u8; SHARED_SECRET_BYTES];
                let mut rm_scratch = [0u8; OWCPA_MSGBYTES];
                $crate::public_key::ntru_pqc_shared::kem_dec::<$variant, N, LOGQ>(
                    &mut ss,
                    &ct.bytes,
                    &sk.bytes,
                    &mut rm_scratch,
                );
                $ss_ty { bytes: ss }
            }
        }

        #[cfg(test)]
        mod tests {
            use super::*;
            use $crate::CtrDrbgAes256;

            // Sanity check on the caller-supplied size constants.
            #[test]
            fn parameter_byte_lengths() {
                assert!(PUBLIC_KEY_BYTES > 0);
                assert!(PRIVATE_KEY_BYTES > 0);
                assert!(CIPHERTEXT_BYTES > 0);
                assert_eq!(SHARED_SECRET_BYTES, 32);
            }

            // keygen -> encaps -> decaps must agree on the shared secret.
            #[test]
            fn roundtrip_random() {
                let mut drbg = CtrDrbgAes256::new(&[0x42u8; 48]);
                let (pk, sk) = $type_name::keygen(&mut drbg);
                let (ct, ss_a) = $type_name::encaps(&pk, &mut drbg);
                let ss_b = $type_name::decaps(&sk, &ct);
                assert_eq!(ss_a.as_bytes(), ss_b.as_bytes());
            }

            // Same round trip under four different fixed DRBG seeds.
            #[test]
            fn roundtrip_multiple_seeds() {
                for seed in [0x00u8, 0x55, 0xaa, 0xff] {
                    let mut drbg = CtrDrbgAes256::new(&[seed; 48]);
                    let (pk, sk) = $type_name::keygen(&mut drbg);
                    let (ct, ss_a) = $type_name::encaps(&pk, &mut drbg);
                    let ss_b = $type_name::decaps(&sk, &ct);
                    assert_eq!(
                        ss_a.as_bytes(),
                        ss_b.as_bytes(),
                        "seed byte 0x{seed:02x}"
                    );
                }
            }

            // Flipping one ciphertext bit must change the decapsulated
            // secret, and decapsulating the same bad ciphertext twice must
            // give the same value (deterministic rejection).
            #[test]
            fn implicit_rejection_on_corrupted_ciphertext() {
                let mut drbg = CtrDrbgAes256::new(&[0x99u8; 48]);
                let (pk, sk) = $type_name::keygen(&mut drbg);
                let (ct, ss_a) = $type_name::encaps(&pk, &mut drbg);
                let mut bad = ct.to_wire_bytes();
                bad[0] ^= 0x01;
                let bad_ct = $ct_ty::from_wire_bytes(&bad).unwrap();
                let ss_bad = $type_name::decaps(&sk, &bad_ct);
                assert_ne!(ss_bad.as_bytes(), ss_a.as_bytes());
                let ss_bad2 = $type_name::decaps(&sk, &bad_ct);
                assert_eq!(ss_bad.as_bytes(), ss_bad2.as_bytes());
            }

            // `to_wire_bytes` followed by `from_wire_bytes` must reproduce
            // an equal value for each of the three parseable types.
            #[test]
            fn wire_format_roundtrip() {
                let mut drbg = CtrDrbgAes256::new(&[0x21u8; 48]);
                let (pk, sk) = $type_name::keygen(&mut drbg);
                let (ct, _) = $type_name::encaps(&pk, &mut drbg);
                let pk_bytes = pk.to_wire_bytes();
                let sk_bytes = sk.to_wire_bytes();
                let ct_bytes = ct.to_wire_bytes();
                assert_eq!(pk_bytes.len(), PUBLIC_KEY_BYTES);
                assert_eq!(sk_bytes.len(), PRIVATE_KEY_BYTES);
                assert_eq!(ct_bytes.len(), CIPHERTEXT_BYTES);
                let pk2 = $pk_ty::from_wire_bytes(&pk_bytes).unwrap();
                let sk2 = $sk_ty::from_wire_bytes(&sk_bytes).unwrap();
                let ct2 = $ct_ty::from_wire_bytes(&ct_bytes).unwrap();
                assert_eq!(pk, pk2);
                assert_eq!(sk, sk2);
                assert_eq!(ct, ct2);
            }

            /// Sampled NIST round-3 KAT validation for this parameter set.
            /// See [`nist_kat_full`] for the full 100-vector sweep.
            #[test]
            fn nist_kat_sampled_counts() {
                let rsp = include_str!($kat_path);
                for &count in $crate::public_key::ntru_pqc_shared::KAT_SAMPLED_COUNTS {
                    run_kat_count(rsp, count);
                }
            }

            /// Heavy variant — validates all 100 entries of the .rsp file.
            #[test]
            #[ignore]
            fn nist_kat_full() {
                let rsp = include_str!($kat_path);
                for count in 0..100 {
                    run_kat_count(rsp, count);
                }
            }

            // Replays one KAT entry: seeds the DRBG from the entry, then
            // checks pk/sk from keygen, ct/ss from encaps, and that decaps
            // recovers the same shared secret.
            fn run_kat_count(rsp: &str, count: usize) {
                let entry = $crate::public_key::ntru_pqc_shared::parse_kat_entry(rsp, count)
                    .unwrap_or_else(|| panic!("KAT count={count} missing"));
                assert_eq!(entry.seed.len(), 48, "seed length");
                let mut seed = [0u8; 48];
                seed.copy_from_slice(&entry.seed);
                let mut drbg = CtrDrbgAes256::new(&seed);

                let (pk, sk) = $type_name::keygen(&mut drbg);
                assert_eq!(pk.to_wire_bytes().as_slice(), entry.pk.as_slice(), "pk @ count={count}");
                assert_eq!(sk.to_wire_bytes().as_slice(), entry.sk.as_slice(), "sk @ count={count}");

                // The same DRBG instance is reused, so encaps consumes the
                // stream right after keygen.
                let (ct, ss) = $type_name::encaps(&pk, &mut drbg);
                assert_eq!(ct.to_wire_bytes().as_slice(), entry.ct.as_slice(), "ct @ count={count}");
                assert_eq!(ss.to_wire_bytes().as_slice(), entry.ss.as_slice(), "ss @ count={count}");

                let ss2 = $type_name::decaps(&sk, &ct);
                assert_eq!(ss.as_bytes(), ss2.as_bytes(), "decaps @ count={count}");
            }
        }
    };
}

pub(crate) use define_pqc_kem;

// ---- shared polynomial helpers (N- and LOGQ-parameterised) -----------------

/// Reduce `r` modulo 3 and modulo $\Phi_n$, in place.
///
/// The top coefficient `r[N - 1]` is subtracted from every coefficient
/// (including itself). Since $-1 \equiv 2 \pmod 3$ the subtraction is
/// performed as an addition of `2 * r[N - 1]`, and each sum is then passed
/// through `mod3`.
pub(crate) fn poly_mod_3_phi_n<const N: usize>(r: &mut [u16; N]) {
    // Read the top coefficient once, before the loop overwrites it.
    let twice_top = 2 * r[N - 1];
    r.iter_mut()
        .for_each(|coeff| *coeff = mod3(*coeff + twice_top));
}

/// Reduce `r` modulo $\Phi_n$ over $\mathbb{Z}_q$, in place.
///
/// The top coefficient `r[N - 1]` is subtracted (with `u16` wrap-around)
/// from every coefficient, so the top coefficient itself becomes 0. No
/// masking with `Q_MASK` is applied here; callers that need canonical
/// representatives must mask afterwards.
pub(crate) fn poly_mod_q_phi_n<const N: usize>(r: &mut [u16; N]) {
    // Capture the subtrahend first: the loop also rewrites `r[N - 1]`.
    let top = r[N - 1];
    r.iter_mut()
        .for_each(|coeff| *coeff = coeff.wrapping_sub(top));
}

/// Map trinary coefficients $\{0, 1, 2\}$ to their $\mathbb{Z}_q$
/// representatives $\{0, 1, q - 1\}$, in place.
///
/// `q_mask` is $q - 1$. The mapping is branch-free: bit 1 of the
/// coefficient (set only for the value 2) is stretched into an all-ones
/// word, masked down to `q_mask`, and OR-ed into the coefficient.
pub(crate) fn poly_z3_to_zq<const N: usize>(r: &mut [u16; N], q_mask: u16) {
    for coeff in r.iter_mut() {
        // 0 for inputs 0 and 1, 1 for input 2.
        let is_two = *coeff >> 1;
        // 0x0000 or 0xffff.
        let spread = is_two.wrapping_neg();
        *coeff |= spread & q_mask;
    }
}

/// Map $\mathbb{Z}_q$ coefficients in $\{0, 1, q - 1\}$ back to the
/// trinary representation $\{0, 1, 2\}$, in place.
///
/// Each coefficient is first masked to `LOGQ` bits. XOR-ing it with its own
/// top bit and keeping the low two bits then turns $q - 1$ (all ones) into
/// 2 while leaving 0 and 1 unchanged.
pub(crate) fn poly_trinary_zq_to_z3<const N: usize, const LOGQ: usize>(r: &mut [u16; N]) {
    let q_mask = ((1u32 << LOGQ) - 1) as u16;
    r.iter_mut().for_each(|coeff| {
        let reduced = *coeff & q_mask;
        *coeff = (reduced ^ (reduced >> (LOGQ - 1))) & 3;
    });
}

/// Reduce an $R_q$ coefficient vector `a` to $S_3$ (mod 3, mod $\Phi_n$),
/// writing the result to `r`.
///
/// Each coefficient is masked to `LOGQ` bits. Coefficients with the top
/// bit set (i.e. at least $q / 2$) get `1 << (1 - (LOGQ & 1))` added, which
/// equals $(-q) \bmod 3$ for $q = 2^{\text{LOGQ}}$, so that the later mod-3
/// reduction acts on the centred representative. The final reduction is
/// delegated to [`poly_mod_3_phi_n`].
pub(crate) fn poly_rq_to_s3<const N: usize, const LOGQ: usize>(
    r: &mut [u16; N],
    a: &[u16; N],
) {
    let q_mask = ((1u32 << LOGQ) - 1) as u16;
    for (dst, &src) in r.iter_mut().zip(a.iter()) {
        let reduced = src & q_mask;
        // 1 when the coefficient lies in the upper half of [0, q).
        let is_upper_half = reduced >> (LOGQ - 1);
        let correction = is_upper_half << (1 - (LOGQ & 1));
        *dst = reduced.wrapping_add(correction);
    }
    poly_mod_3_phi_n::<N>(r);
}

/// Inverse in $R_q = \mathbb{Z}_q[x] / (x^N - 1)$: F_2 inverse via
/// Bernstein–Yang, then Hensel-lift to mod $q$.
///
/// Writes the result into `r`; `a` is only read. The work is delegated to
/// `poly_r2_inv`, which fills the stack-local intermediate `ai2`, followed
/// by `poly_r2_inv_to_rq_inv`, which receives both that intermediate and
/// the original `a`.
pub(crate) fn poly_rq_inv<const N: usize>(r: &mut [u16; N], a: &[u16; N]) {
    // Step 1: intermediate inverse of `a`.
    let mut ai2 = [0u16; N];
    poly_r2_inv(&mut ai2, a);
    // Step 2: turn the intermediate into the final result in `r`.
    poly_r2_inv_to_rq_inv(r, &ai2, a);
}

// ---- per-set NTRU variant trait + shared OWCPA core ------------------------
//
// The HPS-509 / HPS-677 / HPS-821 / HRSS-701 modules differ in five
// variant-specific behaviours plus an Sq packer that depends on `LOGQ`:
//
// - the joint $f, g$ sampler (HPS uses iid + fixed-weight; HRSS uses
//   `Sample_iid_plus` for both)
// - the joint $r, m$ sampler (HPS uses iid + fixed-weight; HRSS uses
//   iid for both)
// - the keygen $g$-update step ($g \gets 3g$ for HPS, $g \gets 3(x-1)g$
//   for HRSS)
// - the lift function (trivial $\mathbb{Z}_3 \to \mathbb{Z}_q$ embedding for
//   HPS, $(x-1)$-factor lift for HRSS)
// - the message-space check (weight + balance for HPS; HRSS accepts any
//   $S_3$ element)
// - the `Sq` packer / unpacker (11-bit, 12-bit, 13-bit)
//
// Everything else (Bernstein–Yang inversion, Hensel lift, Sq/S3
// arithmetic, IGF/MGF-style sampling helpers, OWCPA validity checks,
// the FO transform on top of OWCPA) is identical. The `NtruVariant`
// trait names just the variant-specific bits, and the
// [`owcpa_keypair`] / [`owcpa_enc`] / [`owcpa_dec`] free functions
// in this module implement OWCPA on top of it.

/// Description of one NTRU parameter set (HPS-509 / HPS-677 / HPS-821 /
/// HRSS-701).
///
/// `N` (ring degree) and `LOGQ` (log2 of the modulus) are const generics so
/// that array types and associated-constant expressions can be written
/// without `generic_const_exprs`.
///
/// The provided method bodies implement the HPS behaviour. An HPS parameter
/// set therefore only supplies the constants and the `LOGQ`-specific `Sq`
/// packer pair, while HRSS-701 overrides the sampler, g-update, lift and
/// message-check methods.
pub(crate) trait NtruVariant<const N: usize, const LOGQ: usize> {
    /// $q - 1$, used to mask coefficients into $[0, q)$.
    const Q_MASK: u16;
    /// Seed length expected by [`Self::sample_fg`].
    const SAMPLE_FG_BYTES: usize;
    /// Seed length expected by [`Self::sample_rm`].
    const SAMPLE_RM_BYTES: usize;
    /// Length of one packed trinary polynomial.
    const PACK_TRINARY_BYTES: usize;
    /// Length of the packed OWCPA public key.
    const OWCPA_PUBLICKEYBYTES: usize;
    /// Length of the packed OWCPA secret key.
    const OWCPA_SECRETKEYBYTES: usize;
    /// Length of an OWCPA ciphertext.
    const OWCPA_BYTES: usize;
    /// Length of the packed $(r, m)$ pair.
    const OWCPA_MSGBYTES: usize;

    /// Fixed sampling weight used by the HPS defaults below. HRSS-701 must
    /// set this to 0 explicitly. No default is provided on purpose: a
    /// future variant that forgot to set `WEIGHT` would otherwise hand a
    /// silent 0 to the fixed-weight sampler and to [`owcpa_check_m`].
    const WEIGHT: usize;

    /// Sample the key polynomials from `seed`.
    ///
    /// HPS default: the first `N - 1` seed bytes drive `sample_iid` for
    /// $f$; the remainder drives `sample_fixed_type` (with `WEIGHT`) for
    /// $g$. HRSS overrides this to use `sample_iid_plus` for both.
    fn sample_fg(f: &mut [u16; N], g: &mut [u16; N], seed: &[u8]) {
        debug_assert_eq!(seed.len(), Self::SAMPLE_FG_BYTES);
        let iid_len = N - 1;
        let iid_seed = &seed[..iid_len];
        sample_iid::<N>(f, iid_seed);
        let fixed_seed = &seed[iid_len..];
        let mut sort_buf = [0i32; N];
        sample_fixed_type::<N>(g, fixed_seed, Self::WEIGHT, &mut sort_buf);
    }

    /// Sample the encryption randomness and message from `seed`.
    ///
    /// HPS default: the first `N - 1` seed bytes drive `sample_iid` for
    /// $r$; the remainder drives `sample_fixed_type` (with `WEIGHT`) for
    /// $m$. HRSS overrides this to use `sample_iid` for both.
    fn sample_rm(r: &mut [u16; N], m: &mut [u16; N], seed: &[u8]) {
        debug_assert_eq!(seed.len(), Self::SAMPLE_RM_BYTES);
        let iid_len = N - 1;
        let iid_seed = &seed[..iid_len];
        sample_iid::<N>(r, iid_seed);
        let fixed_seed = &seed[iid_len..];
        let mut sort_buf = [0i32; N];
        sample_fixed_type::<N>(m, fixed_seed, Self::WEIGHT, &mut sort_buf);
    }

    /// Keygen step applied to $g$ after it has been embedded in
    /// $\mathbb{Z}_q$.
    ///
    /// HPS default: $g \gets 3 g$ (wrapping `u16` arithmetic). HRSS
    /// overrides this with $g \gets 3 (x - 1) g$.
    fn update_g_after_z3_to_zq(g: &mut [u16; N]) {
        g.iter_mut()
            .for_each(|coeff| *coeff = coeff.wrapping_mul(3));
    }

    /// Lift a trinary polynomial `a` into $R_q$, writing to `r`.
    ///
    /// HPS default: the plain $\{0, 1, 2\} \to \{0, 1, q - 1\}$ embedding
    /// via [`poly_lift_hps`]. HRSS overrides this with the
    /// $(x - 1)$-factor lift.
    fn poly_lift(r: &mut [u16; N], a: &[u16; N]) {
        poly_lift_hps::<N>(r, a, Self::Q_MASK);
    }

    /// Message-space check; returns 0 when `m` is acceptable, 1 otherwise.
    ///
    /// HPS default: the weight-and-balance test of [`owcpa_check_m`]
    /// against `WEIGHT`. HRSS overrides this to return 0, since any $S_3$
    /// element is a valid message there.
    fn check_m(m: &[u16; N]) -> i32 {
        owcpa_check_m::<N>(m, Self::WEIGHT)
    }

    /// Pack the `Sq` polynomial `a` into the byte buffer `r`.
    fn poly_sq_tobytes(r: &mut [u8], a: &[u16; N]);
    /// Unpack the byte buffer `a` into the `Sq` polynomial `r`.
    fn poly_sq_frombytes(r: &mut [u16; N], a: &[u8]);
}

/// OWCPA key pair generation. Writes the canonical public-key wire
/// bytes to `pk` and the canonical OWCPA secret-key bytes (without the
/// FO implicit-rejection PRF tail) to `sk`. Uses `seed` for the joint
/// $(f, g)$ sample.
///
/// Secret-key layout written here, in order: `PACK_TRINARY_BYTES` of
/// packed $f$, `PACK_TRINARY_BYTES` of the packed output of `poly_s3_inv`
/// on $f$, then the `Sq`-packed `invh`.
///
/// In debug builds the lengths of `pk`, `sk` and `seed` are asserted
/// against the variant's `OWCPA_PUBLICKEYBYTES`, `OWCPA_SECRETKEYBYTES`
/// and `SAMPLE_FG_BYTES`.
pub(crate) fn owcpa_keypair<V, const N: usize, const LOGQ: usize>(
    pk: &mut [u8],
    sk: &mut [u8],
    seed: &[u8],
) where
    V: NtruVariant<N, LOGQ>,
{
    debug_assert_eq!(pk.len(), V::OWCPA_PUBLICKEYBYTES);
    debug_assert_eq!(sk.len(), V::OWCPA_SECRETKEYBYTES);
    debug_assert_eq!(seed.len(), V::SAMPLE_FG_BYTES);

    // Sample the trinary key polynomials using the variant's sampler.
    let mut f = [0u16; N];
    let mut g = [0u16; N];
    V::sample_fg(&mut f, &mut g, seed);

    // Pack f and its S3 inverse while f is still in its trinary form;
    // this must happen before f is re-embedded into Z_q below.
    let mut invf_mod3 = [0u16; N];
    poly_s3_inv::<N>(&mut invf_mod3, &f);
    poly_s3_tobytes::<N>(&mut sk[..V::PACK_TRINARY_BYTES], &f);
    poly_s3_tobytes::<N>(
        &mut sk[V::PACK_TRINARY_BYTES..2 * V::PACK_TRINARY_BYTES],
        &invf_mod3,
    );

    // Re-embed {0, 1, 2} as {0, 1, q - 1}, then apply the variant's
    // g-update (g <- 3g in the HPS default).
    poly_z3_to_zq::<N>(&mut f, V::Q_MASK);
    poly_z3_to_zq::<N>(&mut g, V::Q_MASK);
    V::update_g_after_z3_to_zq(&mut g);

    // A single inversion of the product g * f serves both h and invh.
    let mut gf = [0u16; N];
    poly_rq_mul::<N>(&mut gf, &g, &f);

    let mut invgf = [0u16; N];
    poly_rq_inv::<N>(&mut invgf, &gf);

    // invh = invgf * f * f; the last multiplication also reduces mod Phi_n.
    let mut tmp = [0u16; N];
    let mut invh = [0u16; N];
    poly_rq_mul::<N>(&mut tmp, &invgf, &f);
    poly_sq_mul::<N>(&mut invh, &tmp, &f);
    V::poly_sq_tobytes(&mut sk[2 * V::PACK_TRINARY_BYTES..], &invh);

    // h = invgf * g * g, packed as the public key.
    let mut h = [0u16; N];
    poly_rq_mul::<N>(&mut tmp, &invgf, &g);
    poly_rq_mul::<N>(&mut h, &tmp, &g);
    V::poly_sq_tobytes(pk, &h);
}

/// OWCPA encryption: $c = r \cdot h + \text{lift}(m)$ in $R_q$.
///
/// `pk` is unpacked with the variant's `Sq` unpacker and its top
/// coefficient is restored by [`poly_rq_sum_zero_adjust`]. `r` is used
/// as-is in the multiplication, `m` is lifted through `V::poly_lift`, and
/// the sum is packed into `c` with the variant's `Sq` packer.
///
/// In debug builds `c` and `pk` are length-checked against `OWCPA_BYTES`
/// and `OWCPA_PUBLICKEYBYTES`.
pub(crate) fn owcpa_enc<V, const N: usize, const LOGQ: usize>(
    c: &mut [u8],
    r: &[u16; N],
    m: &[u16; N],
    pk: &[u8],
) where
    V: NtruVariant<N, LOGQ>,
{
    debug_assert_eq!(c.len(), V::OWCPA_BYTES);
    debug_assert_eq!(pk.len(), V::OWCPA_PUBLICKEYBYTES);

    // Recover the full public polynomial from its wire form.
    let mut pub_poly = [0u16; N];
    V::poly_sq_frombytes(&mut pub_poly, pk);
    poly_rq_sum_zero_adjust::<N>(&mut pub_poly);

    // r * h
    let mut acc = [0u16; N];
    poly_rq_mul::<N>(&mut acc, r, &pub_poly);

    // + lift(m), coefficient-wise with u16 wrap-around.
    let mut lifted = [0u16; N];
    V::poly_lift(&mut lifted, m);
    for (sum, &term) in acc.iter_mut().zip(lifted.iter()) {
        *sum = sum.wrapping_add(term);
    }

    V::poly_sq_tobytes(c, &acc);
}

/// OWCPA decryption. Recovers $(r, m)$ from `ciphertext` under the
/// trapdoor encoded in `secretkey`, packs them into `rm`, and returns
/// 0 on success and 1 on any consistency failure (invalid ciphertext
/// padding, $m$ outside the valid set, or recovered $r$ outside
/// $\{0, 1, q - 1\}$).
///
/// `rm` receives packed $r$ in its first `PACK_TRINARY_BYTES` bytes and
/// packed $m$ after that. Both halves are written even when a check
/// fails; the failure is reported only through the return value, which is
/// accumulated with `|=` and never causes an early return.
///
/// `secretkey` is read as three consecutive fields: packed $f$, the packed
/// mod-3 inverse of $f$, and the `Sq`-packed `invh`.
pub(crate) fn owcpa_dec<V, const N: usize, const LOGQ: usize>(
    rm: &mut [u8],
    ciphertext: &[u8],
    secretkey: &[u8],
) -> i32
where
    V: NtruVariant<N, LOGQ>,
{
    debug_assert_eq!(rm.len(), V::OWCPA_MSGBYTES);
    debug_assert_eq!(ciphertext.len(), V::OWCPA_BYTES);
    debug_assert_eq!(secretkey.len(), V::OWCPA_SECRETKEYBYTES);

    // Unpack c and restore its top coefficient so the coefficients sum to 0.
    let mut c = [0u16; N];
    V::poly_sq_frombytes(&mut c, ciphertext);
    poly_rq_sum_zero_adjust::<N>(&mut c);

    // f, re-embedded from {0, 1, 2} into {0, 1, q - 1}.
    let mut f = [0u16; N];
    poly_s3_frombytes::<N>(&mut f, &secretkey[..V::PACK_TRINARY_BYTES]);
    poly_z3_to_zq::<N>(&mut f, V::Q_MASK);

    // cf = c * f in R_q, then projected to S3.
    let mut cf = [0u16; N];
    poly_rq_mul::<N>(&mut cf, &c, &f);

    let mut mf = [0u16; N];
    poly_rq_to_s3::<N, LOGQ>(&mut mf, &cf);

    // m = mf * finv3 in S3; packed into the second half of `rm`.
    let mut finv3 = [0u16; N];
    poly_s3_frombytes::<N>(
        &mut finv3,
        &secretkey[V::PACK_TRINARY_BYTES..2 * V::PACK_TRINARY_BYTES],
    );

    let mut m = [0u16; N];
    poly_s3_mul::<N>(&mut m, &mf, &finv3);
    poly_s3_tobytes::<N>(&mut rm[V::PACK_TRINARY_BYTES..], &m);

    // Validity checks are OR-ed together; none of them short-circuits.
    let mut fail = 0i32;
    fail |= owcpa_check_ciphertext::<N, LOGQ>(ciphertext);
    fail |= V::check_m(&m);

    // b = c - lift(m), coefficient-wise with u16 wrap-around.
    let mut liftm = [0u16; N];
    V::poly_lift(&mut liftm, &m);
    let mut b = [0u16; N];
    for i in 0..N {
        b[i] = c[i].wrapping_sub(liftm[i]);
    }

    // r = b * invh, reduced mod Phi_n by `poly_sq_mul`.
    let mut invh = [0u16; N];
    V::poly_sq_frombytes(&mut invh, &secretkey[2 * V::PACK_TRINARY_BYTES..]);
    let mut r = [0u16; N];
    poly_sq_mul::<N>(&mut r, &b, &invh);

    fail |= owcpa_check_r::<N, LOGQ>(&r);

    // Convert r back to {0, 1, 2} and pack it into the first half of `rm`.
    poly_trinary_zq_to_z3::<N, LOGQ>(&mut r);
    poly_s3_tobytes::<N>(&mut rm[..V::PACK_TRINARY_BYTES], &r);

    fail
}

// ---- OWCPA validity checks -------------------------------------------------

/// Verify that the unused high bits of the final ciphertext byte are zero.
/// Returns 0 when they are, 1 otherwise.
///
/// A ciphertext carries `LOGQ * (N - 1)` payload bits. The number of
/// payload bits in the last byte is that product modulo 8; when it is
/// non-zero, the bits above it are padding and are selected by the mask
/// `0xff << live_bits`. When the payload ends on a byte boundary there is
/// no padding and the mask is 0.
///
/// The result is derived from the sign bit of the negated masked value
/// rather than from a comparison.
///
/// # Panics
///
/// Panics if `ciphertext` is empty.
pub(crate) fn owcpa_check_ciphertext<const N: usize, const LOGQ: usize>(
    ciphertext: &[u8],
) -> i32 {
    let live_bits = (LOGQ * (N - 1)) % 8;
    let padding_mask: u8 = match live_bits {
        0 => 0,
        k => 0xffu8 << k,
    };
    let final_byte = *ciphertext.last().expect("non-empty ciphertext");
    let stray = u16::from(final_byte & padding_mask);
    // For 0 < stray < 256 the 16-bit negation has its top bit set.
    let sign = stray.wrapping_neg() >> 15;
    (sign & 1) as i32
}

/// Verify that a recovered $r$ has every coefficient in
/// $\{0, 1, q - 1\}$ (judged on its low `LOGQ` bits) and that
/// `r[N - 1] == 0`. Returns 0 when it does, 1 otherwise.
///
/// For each of the first `N - 1` coefficients `c`:
/// - `(c + 1) & (q - 4)` is zero only when `c` is one of $-1, 0, 1, 2$
///   modulo $q$;
/// - `(c + 2) & 4` is non-zero for `c == 2`, which rules that value out.
///
/// All violations are OR-ed into one accumulator, and the result is taken
/// from the sign bit of its negation instead of a comparison.
pub(crate) fn owcpa_check_r<const N: usize, const LOGQ: usize>(r: &[u16; N]) -> i32 {
    let q16: u16 = if LOGQ < 16 { 1u16 << LOGQ } else { 0 };
    let range_mask = q16.wrapping_sub(4);
    let body = r[..N - 1].iter().fold(0u32, |acc, &c| {
        let out_of_window = u32::from(c.wrapping_add(1) & range_mask);
        let equals_two = u32::from(c.wrapping_add(2) & 4);
        acc | out_of_window | equals_two
    });
    // The top coefficient must be exactly zero.
    let t = body | u32::from(r[N - 1]);
    ((t.wrapping_neg() >> 31) & 1) as i32
}

/// Weight-and-balance check for an HPS message polynomial. Returns 0 when
/// `m` passes, 1 otherwise. HPS-only: HRSS-701 accepts any $S_3$ message.
///
/// With coefficients in $\{0, 1, 2\}$, the sum of the low bits counts the
/// $+1$ entries and the sum of the second bits is twice the number of
/// $-1$ entries (encoded as 2). The check passes when
/// - the number of $+1$ entries equals the number of $-1$ entries, and
/// - twice the number of $-1$ entries equals `weight`.
///
/// Both conditions are folded into one accumulator, and the result is read
/// from the sign bit of its negation instead of a comparison.
pub(crate) fn owcpa_check_m<const N: usize>(m: &[u16; N], weight: usize) -> i32 {
    let (plus_count, minus_twice) = m.iter().fold((0u16, 0u16), |(p, q), &coeff| {
        (p.wrapping_add(coeff & 1), q.wrapping_add(coeff & 2))
    });
    let imbalance = u32::from(plus_count ^ (minus_twice >> 1));
    let weight_gap = u32::from(minus_twice ^ (weight as u16));
    let t = imbalance | weight_gap;
    ((t.wrapping_neg() >> 31) & 1) as i32
}

/// Set `r[N - 1]` so that all `N` coefficients sum to zero (in wrapping
/// `u16` arithmetic, hence also modulo any power-of-two $q \le 2^{16}$).
///
/// Intended for a polynomial whose first `N - 1` coefficients were just
/// unpacked from an `Sq` byte stream. Whatever `r[N - 1]` held on entry is
/// ignored and overwritten with the negated sum of the other coefficients.
pub(crate) fn poly_rq_sum_zero_adjust<const N: usize>(r: &mut [u16; N]) {
    let top = N - 1;
    let negated_sum = r[..top]
        .iter()
        .fold(0u16, |acc, &coeff| acc.wrapping_sub(coeff));
    r[top] = negated_sum;
}

// ---- KEM key generation + encapsulation (FO-style transform) --------------

/// CCA KEM key generation.
///
/// Draws `V::SAMPLE_FG_BYTES` of seed material from `rng` into
/// `seed_scratch`, runs [`owcpa_keypair`] to fill `pk` and the first
/// `V::OWCPA_SECRETKEYBYTES` bytes of `sk`, and then fills the rest of
/// `sk` (the implicit-rejection PRF key) from `rng`. The two `rng` draws
/// happen in that order.
///
/// `seed_scratch` must be exactly `V::SAMPLE_FG_BYTES` long (checked in
/// debug builds); the invoking macro declares it as a per-set stack array
/// so that keygen performs no heap allocation.
pub(crate) fn kem_keypair_seeded<V, R, const N: usize, const LOGQ: usize>(
    pk: &mut [u8],
    sk: &mut [u8],
    rng: &mut R,
    seed_scratch: &mut [u8],
) where
    V: NtruVariant<N, LOGQ>,
    R: crate::Csprng,
{
    debug_assert_eq!(seed_scratch.len(), V::SAMPLE_FG_BYTES);
    rng.fill_bytes(seed_scratch);

    // Split the secret key into its OWCPA part and the PRF-key tail.
    let (owcpa_sk, prf_key) = sk.split_at_mut(V::OWCPA_SECRETKEYBYTES);
    owcpa_keypair::<V, N, LOGQ>(pk, owcpa_sk, seed_scratch);
    rng.fill_bytes(prf_key);
}

/// CCA KEM encapsulation.
///
/// Draws `V::SAMPLE_RM_BYTES` of randomness from `rng`, samples $(r, m)$
/// from it, packs both into `rm_scratch` ($r$ first, then $m$), writes
/// the SHA3-256 digest of that buffer to `k` as the shared secret, and
/// finally OWCPA-encrypts $(r, m)$ against `pk` into `c`.
///
/// `k` must be 32 bytes, `rm_seed_scratch` must be `V::SAMPLE_RM_BYTES`
/// long and `rm_scratch` must be `V::OWCPA_MSGBYTES` long (all checked in
/// debug builds); the invoking macro stack-allocates both scratch
/// buffers.
pub(crate) fn kem_enc_seeded<V, R, const N: usize, const LOGQ: usize>(
    c: &mut [u8],
    k: &mut [u8],
    pk: &[u8],
    rng: &mut R,
    rm_seed_scratch: &mut [u8],
    rm_scratch: &mut [u8],
) where
    V: NtruVariant<N, LOGQ>,
    R: crate::Csprng,
{
    use crate::hash::sha3::Sha3_256;
    debug_assert_eq!(k.len(), 32);
    debug_assert_eq!(rm_seed_scratch.len(), V::SAMPLE_RM_BYTES);
    debug_assert_eq!(rm_scratch.len(), V::OWCPA_MSGBYTES);

    rng.fill_bytes(rm_seed_scratch);

    let (mut blind, mut message) = ([0u16; N], [0u16; N]);
    V::sample_rm(&mut blind, &mut message, rm_seed_scratch);

    // Serialise r || m while both are still in trinary form.
    {
        let (r_bytes, m_bytes) = rm_scratch.split_at_mut(V::PACK_TRINARY_BYTES);
        poly_s3_tobytes::<N>(r_bytes, &blind);
        poly_s3_tobytes::<N>(m_bytes, &message);
    }

    // Shared secret = SHA3-256(r || m).
    let digest = Sha3_256::new().chain(rm_scratch).finalize();
    k.copy_from_slice(&digest);

    // The OWCPA layer expects r embedded in Z_q; m is lifted inside it.
    poly_z3_to_zq::<N>(&mut blind, V::Q_MASK);
    owcpa_enc::<V, N, LOGQ>(c, &blind, &message, pk);
}

// ---- KEM decapsulation (FO-style transform) --------------------------------

/// CCA KEM decapsulation with implicit rejection.
///
/// Runs [`owcpa_dec`] on `c` with the OWCPA part of `sk`, then always
/// computes two SHA3-256 digests: one over the recovered $r \| m$ bytes
/// (the session key) and one over `prf || c`, where `prf` is the tail of
/// `sk` after the OWCPA secret key (the rejection key). The session key
/// is written to `k`, and `cmov` is then called with the rejection key
/// and the OWCPA failure flag.
///
/// `k` must be 32 bytes and `rm_scratch` must be `V::OWCPA_MSGBYTES` long
/// (both checked in debug builds); the invoking macro stack-allocates the
/// scratch buffer.
pub(crate) fn kem_dec<V, const N: usize, const LOGQ: usize>(
    k: &mut [u8],
    c: &[u8],
    sk: &[u8],
    rm_scratch: &mut [u8],
) where
    V: NtruVariant<N, LOGQ>,
{
    use crate::hash::sha3::Sha3_256;
    debug_assert_eq!(k.len(), 32);
    debug_assert_eq!(rm_scratch.len(), V::OWCPA_MSGBYTES);

    // OWCPA secret key first, implicit-rejection PRF key after it.
    let (owcpa_sk, prf_key) = sk.split_at(V::OWCPA_SECRETKEYBYTES);
    let fail = owcpa_dec::<V, N, LOGQ>(rm_scratch, c, owcpa_sk);

    // Candidate session key: SHA3-256(r || m).
    let digest = Sha3_256::new().chain(rm_scratch).finalize();
    k.copy_from_slice(&digest);

    // Rejection key: SHA3-256(prf || c). Computed on every call.
    let reject = Sha3_256::new().chain(prf_key).chain(c).finalize();
    cmov(k, &reject, fail as u8);
}

// ---- IID and fixed-weight samplers -----------------------------------------

/// $\text{Sample\_iid}$ from round-3 NTRU, §3.3.1.
///
/// Coefficient `i` (for `i < N - 1`) is `mod3` applied to input byte `i`;
/// the top coefficient `r[N - 1]` is set to 0. `uniform_bytes` must hold
/// exactly `N - 1` bytes (checked in debug builds).
pub(crate) fn sample_iid<const N: usize>(r: &mut [u16; N], uniform_bytes: &[u8]) {
    debug_assert_eq!(uniform_bytes.len(), N - 1);
    let top = N - 1;
    for (idx, slot) in r[..top].iter_mut().enumerate() {
        *slot = mod3(u16::from(uniform_bytes[idx]));
    }
    r[top] = 0;
}

/// $\text{Sample\_fixed\_type}$ from round-3 NTRU, §3.3.4: pack 30
/// random bits per word from `u` (4 words per 15 bytes), tag the bottom
/// two bits with the intended trinary value (half $+1$ as `01`, half
/// $-1$ as `10`, rest $0$), sort using the constant-time bitonic
/// network, and read off the bottom-two-bit tag of each sorted slot.
/// Used by the default [`NtruVariant::sample_fg`] for `g` and by the
/// default [`NtruVariant::sample_rm`] for `m`.
///
/// - `r`: output polynomial; the first `N - 1` coefficients receive the
///   sorted tags and `r[N - 1]` is set to 0.
/// - `u`: random input, `ceil(30 * (N - 1) / 8)` bytes (checked in debug
///   builds).
/// - `weight`: number of non-zero tags; the first `weight / 2` slots are
///   tagged `1` and the slots up to `weight` are tagged `2` before sorting.
/// - `scratch`: caller-provided sort buffer; only its first `N - 1` slots
///   are used, and they are zeroed here before use.
pub(crate) fn sample_fixed_type<const N: usize>(
    r: &mut [u16; N],
    u: &[u8],
    weight: usize,
    scratch: &mut [i32; N],
) {
    debug_assert_eq!(u.len(), (30 * (N - 1)).div_ceil(8));
    // All NIST round-3 parameter sets have $(N - 1) \equiv 0 \pmod 4$
    // (508, 676, 820), so the input always lands on a block boundary
    // and there is no tail to handle. The assertion below documents the
    // assumption — flip it to a tail branch if a future parameter set
    // breaks the alignment.
    debug_assert_eq!((N - 1) % 4, 0, "sample_fixed_type assumes (N - 1) % 4 == 0");

    // Use the first N - 1 slots of the caller's scratch buffer. Slot
    // `N - 1` exists only because stable Rust can't size an inline
    // array as `[i32; N - 1]` without `generic_const_exprs`.
    let s = &mut scratch[..N - 1];
    for slot in s.iter_mut() {
        *slot = 0;
    }

    // Each 15-byte block yields four words. Every word carries 30 random
    // bits in bits 2..=31; bits 0..=1 stay clear for the tag added below.
    // Bytes 3, 7 and 11 are shared between two neighbouring words.
    let blocks = (N - 1) / 4;
    for i in 0..blocks {
        let base = 15 * i;
        s[4 * i] = ((u[base] as i32) << 2)
            | ((u[base + 1] as i32) << 10)
            | ((u[base + 2] as i32) << 18)
            | ((u[base + 3] as u32 as i32) << 26);
        s[4 * i + 1] = (((u[base + 3] as i32) & 0xc0) >> 4)
            | ((u[base + 4] as i32) << 4)
            | ((u[base + 5] as i32) << 12)
            | ((u[base + 6] as i32) << 20)
            | ((u[base + 7] as u32 as i32) << 28);
        s[4 * i + 2] = (((u[base + 7] as i32) & 0xf0) >> 2)
            | ((u[base + 8] as i32) << 6)
            | ((u[base + 9] as i32) << 14)
            | ((u[base + 10] as i32) << 22)
            | ((u[base + 11] as u32 as i32) << 30);
        s[4 * i + 3] = ((u[base + 11] as i32) & 0xfc)
            | ((u[base + 12] as i32) << 8)
            | ((u[base + 13] as i32) << 16)
            | ((u[base + 14] as u32 as i32) << 24);
    }

    // Tag the first `weight` slots: the first half as +1, the rest as -1.
    for i in 0..weight / 2 {
        s[i] |= 1;
    }
    for i in weight / 2..weight {
        s[i] |= 2;
    }

    // Sorting on the random high bits carries the tags to their final slots.
    crypto_sort_int32(s);

    // Only the two tag bits of each sorted word become coefficients.
    for i in 0..N - 1 {
        r[i] = (s[i] & 3) as u16;
    }
    r[N - 1] = 0;
}

// ---- ring multiplication wrappers ------------------------------------------

/// Cyclic multiplication in $R = \mathbb{Z}[x] / (x^N - 1)$ over `u16`
/// wrapping arithmetic. Thin alias for the shared
/// [`crate::public_key::ntru_poly_mul::poly_mul_cyclic`] entry point.
///
/// Writes the product of `a` and `b` into `r`. This wrapper itself applies
/// no masking or further reduction to the result.
pub(crate) fn poly_rq_mul<const N: usize>(
    r: &mut [u16; N],
    a: &[u16; N],
    b: &[u16; N],
) {
    // Pure forwarding call; all arithmetic lives in `ntru_poly_mul`.
    crate::public_key::ntru_poly_mul::poly_mul_cyclic(r, a, b);
}

/// $R_q$ multiplication followed by mod-$\Phi_n$ projection.
///
/// Writes `a * b` into `r` via [`poly_rq_mul`], then reduces `r` in place
/// with [`poly_mod_q_phi_n`], which leaves `r[N - 1] == 0`.
pub(crate) fn poly_sq_mul<const N: usize>(
    r: &mut [u16; N],
    a: &[u16; N],
    b: &[u16; N],
) {
    // Multiply first, then subtract the top coefficient from every entry.
    poly_rq_mul::<N>(r, a, b);
    poly_mod_q_phi_n::<N>(r);
}

/// $R$ multiplication followed by mod-3, mod-$\Phi_n$ projection.
///
/// Writes `a * b` into `r` via [`poly_rq_mul`], then reduces `r` in place
/// with [`poly_mod_3_phi_n`].
pub(crate) fn poly_s3_mul<const N: usize>(
    r: &mut [u16; N],
    a: &[u16; N],
    b: &[u16; N],
) {
    // Multiply first; the mod-3 / mod-Phi_n reduction happens afterwards.
    poly_rq_mul::<N>(r, a, b);
    poly_mod_3_phi_n::<N>(r);
}

// ---- HPS lift: trivial Z_3 -> Z_q embedding --------------------------------

/// HPS lift: the trivial embedding of a trinary polynomial into $R_q$.
///
/// Copies `a` into `r` and then remaps $\{0, 1, 2\}$ onto
/// $\{0, 1, q - 1\}$ in place via [`poly_z3_to_zq`], with `q_mask` equal
/// to $q - 1$. The HRSS lift (with the `(x - 1)` factor) lives in
/// `ntru_hrss701` because it has no other call site.
pub(crate) fn poly_lift_hps<const N: usize>(r: &mut [u16; N], a: &[u16; N], q_mask: u16) {
    r.copy_from_slice(a);
    poly_z3_to_zq::<N>(r, q_mask);
}

// ---- S_q packing for q = 2^11 (HPS509, HPS677) -----------------------------

/// Pack `a`'s 11-bit coefficients into bytes: 8 coefficients per 11-byte
/// block. Output buffer must be `((N - 1) * 11 + 7) / 8` bytes.
///
/// Only the first `N - 1` coefficients are packed, each masked to 11 bits
/// and written least-significant bits first. After the full blocks, a
/// tail of 4 coefficients is written as 6 bytes and a tail of 2 as 3
/// bytes.
///
/// # Panics
///
/// Hits `unreachable!()` when `(N - 1) % 8` is not 0, 2 or 4.
pub(crate) fn poly_sq_tobytes_logq11<const N: usize>(r: &mut [u8], a: &[u16; N]) {
    const Q_MASK_11: u16 = (1u16 << 11) - 1;
    let pack_deg = N - 1;
    debug_assert_eq!(r.len(), (pack_deg * 11).div_ceil(8));
    // Staging area for the (up to) eight masked coefficients of one block.
    let mut t = [0u16; 8];
    let full = pack_deg / 8;
    for i in 0..full {
        for j in 0..8 {
            t[j] = a[8 * i + j] & Q_MASK_11;
        }
        // 8 x 11 bits = 88 bits = 11 bytes; each byte takes the remaining
        // high bits of one coefficient and the low bits of the next.
        r[11 * i] = (t[0] & 0xff) as u8;
        r[11 * i + 1] = ((t[0] >> 8) | ((t[1] & 0x1f) << 3)) as u8;
        r[11 * i + 2] = ((t[1] >> 5) | ((t[2] & 0x03) << 6)) as u8;
        r[11 * i + 3] = ((t[2] >> 2) & 0xff) as u8;
        r[11 * i + 4] = ((t[2] >> 10) | ((t[3] & 0x7f) << 1)) as u8;
        r[11 * i + 5] = ((t[3] >> 7) | ((t[4] & 0x0f) << 4)) as u8;
        r[11 * i + 6] = ((t[4] >> 4) | ((t[5] & 0x01) << 7)) as u8;
        r[11 * i + 7] = ((t[5] >> 1) & 0xff) as u8;
        r[11 * i + 8] = ((t[5] >> 9) | ((t[6] & 0x3f) << 2)) as u8;
        r[11 * i + 9] = ((t[6] >> 6) | ((t[7] & 0x07) << 5)) as u8;
        r[11 * i + 10] = (t[7] >> 3) as u8;
    }
    // Partial trailing block: load the remaining coefficients and zero the
    // unused staging slots so the padding bits of the last byte are zero.
    let i = full;
    let tail = pack_deg - 8 * i;
    for j in 0..tail {
        t[j] = a[8 * i + j] & Q_MASK_11;
    }
    for j in tail..8 {
        t[j] = 0;
    }
    match pack_deg & 0x07 {
        // 4 coefficients = 44 bits -> 6 bytes (top 4 bits of the last are 0).
        4 => {
            r[11 * i] = (t[0] & 0xff) as u8;
            r[11 * i + 1] = ((t[0] >> 8) | ((t[1] & 0x1f) << 3)) as u8;
            r[11 * i + 2] = ((t[1] >> 5) | ((t[2] & 0x03) << 6)) as u8;
            r[11 * i + 3] = ((t[2] >> 2) & 0xff) as u8;
            r[11 * i + 4] = ((t[2] >> 10) | ((t[3] & 0x7f) << 1)) as u8;
            r[11 * i + 5] = ((t[3] >> 7) | ((t[4] & 0x0f) << 4)) as u8;
        }
        // 2 coefficients = 22 bits -> 3 bytes (top 2 bits of the last are 0).
        2 => {
            r[11 * i] = (t[0] & 0xff) as u8;
            r[11 * i + 1] = ((t[0] >> 8) | ((t[1] & 0x1f) << 3)) as u8;
            r[11 * i + 2] = ((t[1] >> 5) | ((t[2] & 0x03) << 6)) as u8;
        }
        // No tail: the full blocks covered everything.
        0 => {}
        _ => unreachable!(),
    }
}

/// Inverse of [`poly_sq_tobytes_logq11`].
///
/// Unpacks `N - 1` 11-bit coefficients from `a` into `r` and sets
/// `r[N - 1]` to 0. The debug-build length check is `>=`, so `a` may be
/// longer than needed; bytes beyond the packed polynomial are not read.
///
/// # Panics
///
/// Hits `unreachable!()` when `(N - 1) % 8` is not 0, 2 or 4.
pub(crate) fn poly_sq_frombytes_logq11<const N: usize>(r: &mut [u16; N], a: &[u8]) {
    let pack_deg = N - 1;
    debug_assert!(a.len() >= (pack_deg * 11).div_ceil(8));
    let full = pack_deg / 8;
    for i in 0..full {
        // Reassemble eight 11-bit values from one 11-byte block; each value
        // joins the leftover high bits of one byte with the following ones.
        r[8 * i] = (a[11 * i] as u16) | (((a[11 * i + 1] as u16) & 0x07) << 8);
        r[8 * i + 1] =
            ((a[11 * i + 1] as u16) >> 3) | (((a[11 * i + 2] as u16) & 0x3f) << 5);
        r[8 * i + 2] = ((a[11 * i + 2] as u16) >> 6)
            | (((a[11 * i + 3] as u16) & 0xff) << 2)
            | (((a[11 * i + 4] as u16) & 0x01) << 10);
        r[8 * i + 3] =
            ((a[11 * i + 4] as u16) >> 1) | (((a[11 * i + 5] as u16) & 0x0f) << 7);
        r[8 * i + 4] =
            ((a[11 * i + 5] as u16) >> 4) | (((a[11 * i + 6] as u16) & 0x7f) << 4);
        r[8 * i + 5] = ((a[11 * i + 6] as u16) >> 7)
            | (((a[11 * i + 7] as u16) & 0xff) << 1)
            | (((a[11 * i + 8] as u16) & 0x03) << 9);
        r[8 * i + 6] =
            ((a[11 * i + 8] as u16) >> 2) | (((a[11 * i + 9] as u16) & 0x1f) << 6);
        r[8 * i + 7] =
            ((a[11 * i + 9] as u16) >> 5) | (((a[11 * i + 10] as u16) & 0xff) << 3);
    }
    // Partial trailing block, using the same bit layout as the full blocks.
    let i = full;
    match pack_deg & 0x07 {
        // 4 trailing coefficients, read from 6 bytes.
        4 => {
            r[8 * i] = (a[11 * i] as u16) | (((a[11 * i + 1] as u16) & 0x07) << 8);
            r[8 * i + 1] =
                ((a[11 * i + 1] as u16) >> 3) | (((a[11 * i + 2] as u16) & 0x3f) << 5);
            r[8 * i + 2] = ((a[11 * i + 2] as u16) >> 6)
                | (((a[11 * i + 3] as u16) & 0xff) << 2)
                | (((a[11 * i + 4] as u16) & 0x01) << 10);
            r[8 * i + 3] =
                ((a[11 * i + 4] as u16) >> 1) | (((a[11 * i + 5] as u16) & 0x0f) << 7);
        }
        // 2 trailing coefficients, read from 3 bytes.
        2 => {
            r[8 * i] = (a[11 * i] as u16) | (((a[11 * i + 1] as u16) & 0x07) << 8);
            r[8 * i + 1] =
                ((a[11 * i + 1] as u16) >> 3) | (((a[11 * i + 2] as u16) & 0x3f) << 5);
        }
        0 => {}
        _ => unreachable!(),
    }
    // The top coefficient is not part of the wire format.
    r[N - 1] = 0;
}

// ---- S_q packing for q = 2^12 (HPS821) -------------------------------------

/// Pack the first `N - 1` coefficients of `a`, masked to 12 bits, into
/// `r`: every pair of coefficients becomes 3 bytes, least-significant
/// bits first.
///
/// `r` must be `((N - 1) * 12 + 7) / 8` bytes long (checked in debug
/// builds). Only complete pairs are written, so `N - 1` is expected to be
/// even.
pub(crate) fn poly_sq_tobytes_logq12<const N: usize>(r: &mut [u8], a: &[u16; N]) {
    const Q_MASK_12: u16 = (1u16 << 12) - 1;
    let pack_deg = N - 1;
    debug_assert_eq!(r.len(), (pack_deg * 12).div_ceil(8));
    for (pair_idx, pair) in a[..pack_deg].chunks_exact(2).enumerate() {
        let lo = pair[0] & Q_MASK_12;
        let hi = pair[1] & Q_MASK_12;
        let out = 3 * pair_idx;
        // Byte 0: low 8 bits of the first coefficient.
        r[out] = (lo & 0xff) as u8;
        // Byte 1: its high 4 bits plus the low 4 bits of the second.
        r[out + 1] = ((lo >> 8) | ((hi & 0x0f) << 4)) as u8;
        // Byte 2: high 8 bits of the second coefficient.
        r[out + 2] = (hi >> 4) as u8;
    }
}

/// Inverse of [`poly_sq_tobytes_logq12`]: unpack `(N - 1) / 2` pairs of
/// 12-bit coefficients from consecutive 3-byte groups of `a` into `r`,
/// then set `r[N - 1]` to 0.
///
/// The debug-build length check is `>=`, so `a` may be longer than the
/// packed polynomial; bytes beyond it are not read.
pub(crate) fn poly_sq_frombytes_logq12<const N: usize>(r: &mut [u16; N], a: &[u8]) {
    let pack_deg = N - 1;
    debug_assert!(a.len() >= (pack_deg * 12).div_ceil(8));
    for pair_idx in 0..pack_deg / 2 {
        let src = 3 * pair_idx;
        let dst = 2 * pair_idx;
        let b0 = u16::from(a[src]);
        let b1 = u16::from(a[src + 1]);
        // First coefficient: byte 0 plus the low nibble of byte 1.
        r[dst] = b0 | ((b1 & 0x0f) << 8);
        let b2 = u16::from(a[src + 2]);
        // Second coefficient: high nibble of byte 1 plus byte 2.
        r[dst + 1] = (b1 >> 4) | ((b2 & 0xff) << 4);
    }
    r[N - 1] = 0;
}

// ---- S_q packing for q = 2^13 (HRSS701) ------------------------------------

/// Pack `a`'s 13-bit coefficients into bytes: 8 coefficients per 13-byte
/// block. Output buffer must be `((N - 1) * 13 + 7) / 8` bytes.
///
/// Only the first `N - 1` coefficients are packed, each masked to 13 bits
/// and written least-significant bits first. After the full blocks, a
/// tail of 4 coefficients is written as 7 bytes and a tail of 2 as 4
/// bytes.
///
/// # Panics
///
/// Hits `unreachable!()` when `(N - 1) % 8` is not 0, 2 or 4.
pub(crate) fn poly_sq_tobytes_logq13<const N: usize>(r: &mut [u8], a: &[u16; N]) {
    const Q_MASK_13: u16 = (1u16 << 13) - 1;
    let pack_deg = N - 1;
    debug_assert_eq!(r.len(), (pack_deg * 13).div_ceil(8));
    // Staging area for the (up to) eight masked coefficients of one block.
    let mut t = [0u16; 8];
    let full = pack_deg / 8;
    for i in 0..full {
        for j in 0..8 {
            t[j] = a[8 * i + j] & Q_MASK_13;
        }
        // 8 x 13 bits = 104 bits = 13 bytes; each byte takes the remaining
        // high bits of one coefficient and the low bits of the next.
        r[13 * i] = (t[0] & 0xff) as u8;
        r[13 * i + 1] = ((t[0] >> 8) | ((t[1] & 0x07) << 5)) as u8;
        r[13 * i + 2] = ((t[1] >> 3) & 0xff) as u8;
        r[13 * i + 3] = ((t[1] >> 11) | ((t[2] & 0x3f) << 2)) as u8;
        r[13 * i + 4] = ((t[2] >> 6) | ((t[3] & 0x01) << 7)) as u8;
        r[13 * i + 5] = ((t[3] >> 1) & 0xff) as u8;
        r[13 * i + 6] = ((t[3] >> 9) | ((t[4] & 0x0f) << 4)) as u8;
        r[13 * i + 7] = ((t[4] >> 4) & 0xff) as u8;
        r[13 * i + 8] = ((t[4] >> 12) | ((t[5] & 0x7f) << 1)) as u8;
        r[13 * i + 9] = ((t[5] >> 7) | ((t[6] & 0x03) << 6)) as u8;
        r[13 * i + 10] = ((t[6] >> 2) & 0xff) as u8;
        r[13 * i + 11] = ((t[6] >> 10) | ((t[7] & 0x1f) << 3)) as u8;
        r[13 * i + 12] = (t[7] >> 5) as u8;
    }
    // Partial trailing block: load the remaining coefficients and zero the
    // unused staging slots so the padding bits of the last byte are zero.
    let i = full;
    let tail = pack_deg - 8 * i;
    for j in 0..tail {
        t[j] = a[8 * i + j] & Q_MASK_13;
    }
    for j in tail..8 {
        t[j] = 0;
    }
    match pack_deg & 0x07 {
        // 4 coefficients = 52 bits -> 7 bytes (top 4 bits of the last are 0).
        4 => {
            r[13 * i] = (t[0] & 0xff) as u8;
            r[13 * i + 1] = ((t[0] >> 8) | ((t[1] & 0x07) << 5)) as u8;
            r[13 * i + 2] = ((t[1] >> 3) & 0xff) as u8;
            r[13 * i + 3] = ((t[1] >> 11) | ((t[2] & 0x3f) << 2)) as u8;
            r[13 * i + 4] = ((t[2] >> 6) | ((t[3] & 0x01) << 7)) as u8;
            r[13 * i + 5] = ((t[3] >> 1) & 0xff) as u8;
            r[13 * i + 6] = ((t[3] >> 9) | ((t[4] & 0x0f) << 4)) as u8;
        }
        // 2 coefficients = 26 bits -> 4 bytes (top 6 bits of the last are 0).
        2 => {
            r[13 * i] = (t[0] & 0xff) as u8;
            r[13 * i + 1] = ((t[0] >> 8) | ((t[1] & 0x07) << 5)) as u8;
            r[13 * i + 2] = ((t[1] >> 3) & 0xff) as u8;
            r[13 * i + 3] = ((t[1] >> 11) | ((t[2] & 0x3f) << 2)) as u8;
        }
        // No tail: the full blocks covered everything.
        0 => {}
        _ => unreachable!(),
    }
}

/// Inverse of [`poly_sq_tobytes_logq13`].
///
/// Unpacks `N - 1` 13-bit coefficients from `a` into `r` and sets
/// `r[N - 1]` to 0. The debug-build length check is `>=`, so `a` may be
/// longer than needed; bytes beyond the packed polynomial are not read.
///
/// # Panics
///
/// Hits `unreachable!()` when `(N - 1) % 8` is not 0, 2 or 4.
pub(crate) fn poly_sq_frombytes_logq13<const N: usize>(r: &mut [u16; N], a: &[u8]) {
    let pack_deg = N - 1;
    debug_assert!(a.len() >= (pack_deg * 13).div_ceil(8));
    let full = pack_deg / 8;
    for i in 0..full {
        // Reassemble eight 13-bit values from one 13-byte block; a value
        // spans two or three consecutive bytes depending on its bit offset.
        r[8 * i] = (a[13 * i] as u16) | (((a[13 * i + 1] as u16) & 0x1f) << 8);
        r[8 * i + 1] = ((a[13 * i + 1] as u16) >> 5)
            | ((a[13 * i + 2] as u16) << 3)
            | (((a[13 * i + 3] as u16) & 0x03) << 11);
        r[8 * i + 2] =
            ((a[13 * i + 3] as u16) >> 2) | (((a[13 * i + 4] as u16) & 0x7f) << 6);
        r[8 * i + 3] = ((a[13 * i + 4] as u16) >> 7)
            | ((a[13 * i + 5] as u16) << 1)
            | (((a[13 * i + 6] as u16) & 0x0f) << 9);
        r[8 * i + 4] = ((a[13 * i + 6] as u16) >> 4)
            | ((a[13 * i + 7] as u16) << 4)
            | (((a[13 * i + 8] as u16) & 0x01) << 12);
        r[8 * i + 5] =
            ((a[13 * i + 8] as u16) >> 1) | (((a[13 * i + 9] as u16) & 0x3f) << 7);
        r[8 * i + 6] = ((a[13 * i + 9] as u16) >> 6)
            | ((a[13 * i + 10] as u16) << 2)
            | (((a[13 * i + 11] as u16) & 0x07) << 10);
        r[8 * i + 7] =
            ((a[13 * i + 11] as u16) >> 3) | ((a[13 * i + 12] as u16) << 5);
    }
    // Partial trailing block, using the same bit layout as the full blocks.
    let i = full;
    match pack_deg & 0x07 {
        // 4 trailing coefficients, read from 7 bytes.
        4 => {
            r[8 * i] = (a[13 * i] as u16) | (((a[13 * i + 1] as u16) & 0x1f) << 8);
            r[8 * i + 1] = ((a[13 * i + 1] as u16) >> 5)
                | ((a[13 * i + 2] as u16) << 3)
                | (((a[13 * i + 3] as u16) & 0x03) << 11);
            r[8 * i + 2] =
                ((a[13 * i + 3] as u16) >> 2) | (((a[13 * i + 4] as u16) & 0x7f) << 6);
            r[8 * i + 3] = ((a[13 * i + 4] as u16) >> 7)
                | ((a[13 * i + 5] as u16) << 1)
                | (((a[13 * i + 6] as u16) & 0x0f) << 9);
        }
        // 2 trailing coefficients, read from 4 bytes.
        2 => {
            r[8 * i] = (a[13 * i] as u16) | (((a[13 * i + 1] as u16) & 0x1f) << 8);
            r[8 * i + 1] = ((a[13 * i + 1] as u16) >> 5)
                | ((a[13 * i + 2] as u16) << 3)
                | (((a[13 * i + 3] as u16) & 0x03) << 11);
        }
        0 => {}
        _ => unreachable!(),
    }
    // The top coefficient is not part of the wire format.
    r[N - 1] = 0;
}

// ---- S_3 packing: 5 trits per byte in base 3 -------------------------------

/// Pack the first `N - 1` trinary coefficients of `a` (values in
/// $\{0, 1, 2\}$) into `msg`, five trits per byte in base 3.
///
/// Within a byte the lowest-indexed coefficient is the least-significant
/// base-3 digit. A final group of fewer than five coefficients is encoded
/// the same way into one last byte. `msg` must be exactly
/// `((N - 1) + 4) / 5` bytes long (checked in debug builds).
pub(crate) fn poly_s3_tobytes<const N: usize>(msg: &mut [u8], a: &[u16; N]) {
    let pack_deg = N - 1;
    debug_assert_eq!(msg.len(), pack_deg.div_ceil(5));
    for (byte_idx, trits) in a[..pack_deg].chunks(5).enumerate() {
        // Horner evaluation in base 3, starting from the highest digit.
        msg[byte_idx] = trits
            .iter()
            .rev()
            .fold(0u8, |acc, &trit| acc.wrapping_mul(3).wrapping_add(trit as u8));
    }
}

/// Decode bytes produced by [`poly_s3_tobytes`] back into `r`.
///
/// Each byte of `msg` is expanded into up to five coefficients by dividing
/// it by successive powers of three using multiply-and-shift. Coefficient
/// $N - 1$ is then cleared and the result is passed through
/// [`poly_mod_3_phi_n`], which reduces mod 3, mod $\Phi_n$ on the way out.
/// `msg.len()` must equal `(N - 1).div_ceil(5)`; this is only checked in
/// debug builds.
pub(crate) fn poly_s3_frombytes<const N: usize>(r: &mut [u16; N], msg: &[u8]) {
    // `(c * mul) >> shift` equals `c / 3^k` (k = 0..=4) for every byte
    // value `c`; the quotients are stored as-is and reduced afterwards.
    const DIV_POW3: [(u32, u32); 5] = [(1, 0), (171, 9), (57, 9), (19, 9), (203, 14)];

    let pack_deg = N - 1;
    debug_assert_eq!(msg.len(), pack_deg.div_ceil(5));
    let whole = pack_deg / 5;

    let (grouped, leftover) = r[..pack_deg].split_at_mut(5 * whole);
    for (idx, quint) in grouped.chunks_exact_mut(5).enumerate() {
        let byte = u32::from(msg[idx]);
        for (coeff, &(mul, shift)) in quint.iter_mut().zip(DIV_POW3.iter()) {
            *coeff = ((byte * mul) >> shift) as u16;
        }
    }

    // Trailing partial group: peel coefficients off one at a time by
    // repeatedly dividing the last byte by three.
    if !leftover.is_empty() {
        let mut rest = u32::from(msg[whole]);
        for coeff in leftover.iter_mut() {
            *coeff = rest as u16;
            rest = (rest * 171) >> 9;
        }
    }

    r[N - 1] = 0;
    poly_mod_3_phi_n::<N>(r);
}

// ---- shared NIST PQC KAT parsing (test only) -------------------------------

/// One entry of a NIST PQC `.rsp` KAT file: 48-byte seed plus the
/// reference-implementation outputs.
///
/// [`parse_kat_entry`] fills each field with the hex-decoded value of the
/// identically named `key = hex` line of the entry.
#[cfg(test)]
#[derive(Debug)]
pub(crate) struct KatEntry {
    /// Value of the `seed` line.
    pub seed: Vec<u8>,
    /// Value of the `pk` line (presumably the encoded public key).
    pub pk: Vec<u8>,
    /// Value of the `sk` line (presumably the encoded secret key).
    pub sk: Vec<u8>,
    /// Value of the `ct` line (presumably the ciphertext).
    pub ct: Vec<u8>,
    /// Value of the `ss` line (presumably the shared secret).
    pub ss: Vec<u8>,
}

/// Decode an even-length hex string into bytes. Permissive about embedded
/// whitespace so the same routine handles `.rsp`-line hex fields, which
/// don't always end at a fixed column.
///
/// Digits are validated one character at a time, so only `0-9`, `a-f`
/// and `A-F` are accepted. In particular a sign character is rejected
/// rather than being swallowed as part of a two-character pair.
///
/// # Panics
///
/// Panics with `"hex length must be even"` if the number of
/// non-whitespace characters is odd, and with `"valid hex"` if any of
/// them is not a hexadecimal digit.
#[cfg(test)]
pub(crate) fn hex_to_bytes(s: &str) -> Vec<u8> {
    // Work on `char`s rather than byte offsets so that stray non-ASCII
    // input is reported as invalid hex instead of tripping a slicing panic.
    let digits: Vec<char> = s.chars().filter(|c| !c.is_whitespace()).collect();
    assert!(digits.len() % 2 == 0, "hex length must be even");
    digits
        .chunks_exact(2)
        .map(|pair| {
            let hi = pair[0].to_digit(16).expect("valid hex");
            let lo = pair[1].to_digit(16).expect("valid hex");
            // Both nibbles are < 16, so the combined value fits in a byte.
            ((hi << 4) | lo) as u8
        })
        .collect()
}

/// Parse the `count = N` entry out of a NIST PQC `.rsp` KAT file. Returns
/// `None` if the count is absent (e.g. asking for entry 100 from a 100-entry
/// file), or if the entry lacks any of the `seed`, `pk`, `sk`, `ct` or `ss`
/// fields.
///
/// The parser scans line-by-line for the literal `count = N` header
/// (after `str::trim`), then collects every `key = hex` line that
/// follows until either a blank line, the next `count =` header, or
/// end-of-file. Unrecognised keys are ignored *without decoding their
/// value*. This means an extra metadata line in a future `.rsp`
/// (e.g. `mlen = 100`) neither truncates the entry nor trips the hex
/// decoder.
///
/// # Panics
///
/// Panics (via [`hex_to_bytes`]) if the value of one of the five
/// recognised keys is not valid even-length hex.
#[cfg(test)]
pub(crate) fn parse_kat_entry(rsp: &str, count: usize) -> Option<KatEntry> {
    let target = format!("count = {count}");
    let mut lines = rsp.lines();

    // Advance the iterator just past the requested header; bail out with
    // `None` when the file has no such entry.
    lines.by_ref().find(|line| line.trim() == target)?;

    let mut seed = None;
    let mut pk = None;
    let mut sk = None;
    let mut ct = None;
    let mut ss = None;

    for line in lines {
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with("count = ") {
            break;
        }
        let Some((key, value)) = trimmed.split_once(" = ") else {
            continue;
        };
        // Pick the destination first so that values of unknown keys are
        // never fed to the hex decoder.
        let slot: &mut Option<Vec<u8>> = match key.trim() {
            "seed" => &mut seed,
            "pk" => &mut pk,
            "sk" => &mut sk,
            "ct" => &mut ct,
            "ss" => &mut ss,
            _ => continue,
        };
        *slot = Some(hex_to_bytes(value.trim()));
    }

    Some(KatEntry {
        seed: seed?,
        pk: pk?,
        sk: sk?,
        ct: ct?,
        ss: ss?,
    })
}

/// Entry indices sampled across the full `0..100` range of the NIST
/// round-3 KAT files (each contains exactly 100 entries). Chosen to catch
/// first-entry / state-rollover / final-entry bugs without running a
/// full 100-entry sweep on every `cargo test`. The full sweep is
/// `nist_kat_full` per parameter set, behind `#[ignore]`.
///
/// The list includes both endpoints (`0` and `99`) and is kept in
/// ascending order.
#[cfg(test)]
pub(crate) const KAT_SAMPLED_COUNTS: &[usize] = &[0, 1, 7, 23, 42, 67, 83, 99];

#[cfg(test)]
mod tests {
    use super::*;

    /// `cmov` must leave `r` untouched when the selector is 0 and
    /// overwrite it with `x` when the selector is 1.
    #[test]
    fn cmov_copies_when_b_is_one_else_no_change() {
        let mut r = [1u8, 2, 3, 4];
        let x = [9u8, 8, 7, 6];
        cmov(&mut r, &x, 0);
        assert_eq!(r, [1, 2, 3, 4]);
        cmov(&mut r, &x, 1);
        assert_eq!(r, [9, 8, 7, 6]);
    }

    /// Cross-checks `crypto_sort_int32` against the standard-library sort
    /// on empty, single-element, extreme-value, all-equal and mixed inputs.
    #[test]
    fn crypto_sort_int32_matches_std_sort() {
        let inputs: &[&[i32]] = &[
            &[],
            &[0],
            &[3, 1, 2],
            &[i32::MAX, i32::MIN, 0, -1, 1],
            &[7, 7, 7, 7, 7],
            &[5, -3, 8, 0, -7, 2, 6, -1, 9, 4, -2, 1, -5, 3, -6, 7, -8, -4],
        ];
        for &case in inputs {
            let mut a = case.to_vec();
            let mut b = case.to_vec();
            crypto_sort_int32(&mut a);
            b.sort();
            assert_eq!(a, b, "sort mismatch on {case:?}");
        }
    }

    /// Exhaustively compares `mod3` with `%` over every `u16` value.
    #[test]
    fn mod3_matches_naive_reduction() {
        for a in 0u16..=u16::MAX {
            // Name the offending input so a failure is actionable.
            assert_eq!(mod3(a), a % 3, "mod3 mismatch for {a}");
        }
    }

    /// `DigestChain::chain` is a blanket impl over every `Digest`; this
    /// test exercises it on both `Sha3_256` (the FO-transform consumer)
    /// and `Sha256` so the chained-update equivalence is locked in for
    /// the whole `Digest` family rather than just the one in-use site.
    #[test]
    fn digest_chain_matches_concat_then_update() {
        use crate::hash::sha2::Sha256;
        use crate::hash::sha3::Sha3_256;

        // Includes an empty part so a zero-length `chain` call is covered.
        let parts: [&[u8]; 3] = [b"abc", b"defghij", b""];
        let concat: Vec<u8> = parts.concat();
        let [a, b, c] = parts;

        let chained = Sha3_256::new().chain(a).chain(b).chain(c).finalize();
        let oneshot = {
            let mut h = Sha3_256::new();
            h.update(&concat);
            h.finalize()
        };
        assert_eq!(chained.as_slice(), oneshot.as_slice(), "Sha3_256 chain");

        let chained = Sha256::new().chain(a).chain(b).chain(c).finalize();
        let oneshot = {
            let mut h = Sha256::new();
            h.update(&concat);
            h.finalize()
        };
        assert_eq!(chained.as_slice(), oneshot.as_slice(), "Sha256 chain");
    }
}