// turboshake/keccak.rs

/// Base-2 logarithm of the lane bit width of the Keccak-p\[1600, 12\] permutation.
const L: usize = 6;

/// Bit width of each lane of the Keccak-p\[1600, 12\] permutation, i.e. `2^L` = 64.
pub const W: usize = 1 << L;

/// Number of lanes in the Keccak permutation state; each lane is `W` ( = 64 ) bits wide.
pub const LANE_CNT: usize = 25;

/// Number of rounds applied per permutation call, i.e. it's Keccak-p\[1600, 12\].
const ROUNDS: usize = 12;

/// Maximum number of rounds supported by the Keccak-f\[1600\] permutation ( = `12 + 2 * L` ).
const MAX_ROUNDS: usize = 12 + 2 * L;
15
16/// Compile-time computed lane rotation factor table used when applying ρ step mapping function.
17const ROT: [usize; LANE_CNT] = compute_rotation_factors_table();
18
19/// Compile-time computed round constants table used when applying ι step mapping function.
20const RC: [u64; ROUNDS] = compute_round_constants_table();
21
22/// Compile-time evaluable function for generating leftwards circular rotation offset
23/// for lanes of the keccak state array, computed following step 3(a), 3(b) of algorithm 2
24/// in section 3.2.2 of https://dx.doi.org/10.6028/NIST.FIPS.202.
25const fn compute_rotation_factors_table() -> [usize; LANE_CNT] {
26    let mut table = [0usize; LANE_CNT];
27
28    let mut x = 1;
29    let mut y = 0;
30    let mut t = 0;
31    while t <= 23 {
32        table[y * 5 + x] = ((t + 1) * (t + 2) / 2) % W;
33
34        let y_prime = (2 * x + 3 * y) % 5;
35        x = y;
36        y = y_prime;
37
38        t += 1;
39    }
40
41    table
42}
43
/// Compile-time evaluable computation of a single bit of a Keccak-p\[1600, 12\] round constant,
/// using the binary LFSR defined by primitive polynomial x^8 + x^6 + x^5 + x^4 + 1.
///
/// Returns `true` when the generated bit is 1. Only `t % 255` matters, so the output repeats
/// with period 255.
///
/// See algorithm 5 in section 3.2.5 of http://dx.doi.org/10.6028/NIST.FIPS.202.
/// Taken from https://github.com/itzmeanjan/sha3/blob/faef1bd6f/include/keccak.hpp#L53-L91.
const fn rc(t: usize) -> bool {
    // The register is kept in the low 9 bits of a u16, with R[0] at bit 8 and R[8] at bit 0.
    // Steps 3.b - 3.e XOR R[8] into R[0], R[4], R[5] and R[6], i.e. into bits 8, 4, 3 and 2.
    const FEEDBACK: u16 = 0b1_0001_1100;

    // step 1 of algorithm 5
    let steps = t % 255;
    if steps == 0 {
        return true;
    }

    // step 2 of algorithm 5
    //
    // note, leaving bit 8 clear also performs step 3.a ( R = 0 || R ) for the first iteration
    let mut r = 0b1000_0000u16;

    // step 3 of algorithm 5
    let mut i = 0;
    while i < steps {
        // steps 3.b - 3.e: fold R[8] ( = bit 0 ) into the feedback positions
        r ^= (r & 1) * FEEDBACK;

        // step 3.f ( truncate to 8 bits ), which also performs step 3.a for the next iteration
        r >>= 1;

        i += 1;
    }

    // step 4 of algorithm 5: R[0] now sits at bit 7
    ((r >> 7) & 1) == 1
}
82
83/// Compile-time evaluable computation of a 64 -bit round constant, which is XOR-ed into
84/// the very first lane ( = lane(0, 0) ) of Keccak-p\[1600, 12\] permutation state.
85///
86/// Taken from https://github.com/itzmeanjan/sha3/blob/faef1bd6f/include/keccak.hpp#L93C1-L109C2
87const fn compute_round_constant(r_idx: usize) -> u64 {
88    let mut rc_word = 0;
89
90    let mut j = 0;
91    while j < (L + 1) {
92        let boff = (1usize << j) - 1;
93        rc_word |= (rc(j + 7 * r_idx) as u64) << boff;
94
95        j += 1;
96    }
97
98    rc_word
99}
100
101/// Compile-time evaluable computation of all round constants of Keccak-p\[1600, 12\] permutation.
102const fn compute_round_constants_table() -> [u64; ROUNDS] {
103    let mut table = [0u64; ROUNDS];
104
105    let mut r_idx = MAX_ROUNDS - ROUNDS;
106    while r_idx < MAX_ROUNDS {
107        table[r_idx - ROUNDS] = compute_round_constant(r_idx);
108        r_idx += 1;
109    }
110
111    table
112}
113
114/// Keccak-p\[1600, 12\] round function, which applies all five step mapping functions in order, for four consecutive rounds
115/// starting from round index `ridx`, mutating state array, following section 3.3 of https://dx.doi.org/10.6028/NIST.FIPS.202.
116///
117/// Adapted from https://github.com/itzmeanjan/sha3/blob/b6ce9069/include/sha3/internals/keccak.hpp#L140-L583
118#[inline(always)]
119fn roundx4(state: &mut [u64; LANE_CNT], ridx: usize) {
120    let mut c = [0u64; 5];
121    let mut d = [0u64; 5];
122    let mut t;
123
124    // Round ridx + 0
125    for i in (0..LANE_CNT).step_by(5) {
126        c[0] ^= state[i];
127        c[1] ^= state[i + 1];
128        c[2] ^= state[i + 2];
129        c[3] ^= state[i + 3];
130        c[4] ^= state[i + 4];
131    }
132
133    d[0] = c[4] ^ c[1].rotate_left(1);
134    d[1] = c[0] ^ c[2].rotate_left(1);
135    d[2] = c[1] ^ c[3].rotate_left(1);
136    d[3] = c[2] ^ c[4].rotate_left(1);
137    d[4] = c[3] ^ c[0].rotate_left(1);
138
139    c[0] = state[0] ^ d[0];
140    t = state[6] ^ d[1];
141    c[1] = t.rotate_left(ROT[6] as u32);
142    t = state[12] ^ d[2];
143    c[2] = t.rotate_left(ROT[12] as u32);
144    t = state[18] ^ d[3];
145    c[3] = t.rotate_left(ROT[18] as u32);
146    t = state[24] ^ d[4];
147    c[4] = t.rotate_left(ROT[24] as u32);
148
149    state[0] = c[0] ^ (c[2] & !c[1]) ^ RC[ridx];
150    state[6] = c[1] ^ (c[3] & !c[2]);
151    state[12] = c[2] ^ (c[4] & !c[3]);
152    state[18] = c[3] ^ (c[0] & !c[4]);
153    state[24] = c[4] ^ (c[1] & !c[0]);
154
155    t = state[10] ^ d[0];
156    c[2] = t.rotate_left(ROT[10] as u32);
157    t = state[16] ^ d[1];
158    c[3] = t.rotate_left(ROT[16] as u32);
159    t = state[22] ^ d[2];
160    c[4] = t.rotate_left(ROT[22] as u32);
161    t = state[3] ^ d[3];
162    c[0] = t.rotate_left(ROT[3] as u32);
163    t = state[9] ^ d[4];
164    c[1] = t.rotate_left(ROT[9] as u32);
165
166    state[10] = c[0] ^ (c[2] & !c[1]);
167    state[16] = c[1] ^ (c[3] & !c[2]);
168    state[22] = c[2] ^ (c[4] & !c[3]);
169    state[3] = c[3] ^ (c[0] & !c[4]);
170    state[9] = c[4] ^ (c[1] & !c[0]);
171
172    t = state[20] ^ d[0];
173    c[4] = t.rotate_left(ROT[20] as u32);
174    t = state[1] ^ d[1];
175    c[0] = t.rotate_left(ROT[1] as u32);
176    t = state[7] ^ d[2];
177    c[1] = t.rotate_left(ROT[7] as u32);
178    t = state[13] ^ d[3];
179    c[2] = t.rotate_left(ROT[13] as u32);
180    t = state[19] ^ d[4];
181    c[3] = t.rotate_left(ROT[19] as u32);
182
183    state[20] = c[0] ^ (c[2] & !c[1]);
184    state[1] = c[1] ^ (c[3] & !c[2]);
185    state[7] = c[2] ^ (c[4] & !c[3]);
186    state[13] = c[3] ^ (c[0] & !c[4]);
187    state[19] = c[4] ^ (c[1] & !c[0]);
188
189    t = state[5] ^ d[0];
190    c[1] = t.rotate_left(ROT[5] as u32);
191    t = state[11] ^ d[1];
192    c[2] = t.rotate_left(ROT[11] as u32);
193    t = state[17] ^ d[2];
194    c[3] = t.rotate_left(ROT[17] as u32);
195    t = state[23] ^ d[3];
196    c[4] = t.rotate_left(ROT[23] as u32);
197    t = state[4] ^ d[4];
198    c[0] = t.rotate_left(ROT[4] as u32);
199
200    state[5] = c[0] ^ (c[2] & !c[1]);
201    state[11] = c[1] ^ (c[3] & !c[2]);
202    state[17] = c[2] ^ (c[4] & !c[3]);
203    state[23] = c[3] ^ (c[0] & !c[4]);
204    state[4] = c[4] ^ (c[1] & !c[0]);
205
206    t = state[15] ^ d[0];
207    c[3] = t.rotate_left(ROT[15] as u32);
208    t = state[21] ^ d[1];
209    c[4] = t.rotate_left(ROT[21] as u32);
210    t = state[2] ^ d[2];
211    c[0] = t.rotate_left(ROT[2] as u32);
212    t = state[8] ^ d[3];
213    c[1] = t.rotate_left(ROT[8] as u32);
214    t = state[14] ^ d[4];
215    c[2] = t.rotate_left(ROT[14] as u32);
216
217    state[15] = c[0] ^ (c[2] & !c[1]);
218    state[21] = c[1] ^ (c[3] & !c[2]);
219    state[2] = c[2] ^ (c[4] & !c[3]);
220    state[8] = c[3] ^ (c[0] & !c[4]);
221    state[14] = c[4] ^ (c[1] & !c[0]);
222
223    // Round ridx + 1
224    c.fill(0);
225
226    for i in (0..LANE_CNT).step_by(5) {
227        c[0] ^= state[i];
228        c[1] ^= state[i + 1];
229        c[2] ^= state[i + 2];
230        c[3] ^= state[i + 3];
231        c[4] ^= state[i + 4];
232    }
233
234    d[0] = c[4] ^ c[1].rotate_left(1);
235    d[1] = c[0] ^ c[2].rotate_left(1);
236    d[2] = c[1] ^ c[3].rotate_left(1);
237    d[3] = c[2] ^ c[4].rotate_left(1);
238    d[4] = c[3] ^ c[0].rotate_left(1);
239
240    c[0] = state[0] ^ d[0];
241    t = state[16] ^ d[1];
242    c[1] = t.rotate_left(ROT[6] as u32);
243    t = state[7] ^ d[2];
244    c[2] = t.rotate_left(ROT[12] as u32);
245    t = state[23] ^ d[3];
246    c[3] = t.rotate_left(ROT[18] as u32);
247    t = state[14] ^ d[4];
248    c[4] = t.rotate_left(ROT[24] as u32);
249
250    state[0] = c[0] ^ (c[2] & !c[1]) ^ RC[ridx + 1];
251    state[16] = c[1] ^ (c[3] & !c[2]);
252    state[7] = c[2] ^ (c[4] & !c[3]);
253    state[23] = c[3] ^ (c[0] & !c[4]);
254    state[14] = c[4] ^ (c[1] & !c[0]);
255
256    t = state[20] ^ d[0];
257    c[2] = t.rotate_left(ROT[10] as u32);
258    t = state[11] ^ d[1];
259    c[3] = t.rotate_left(ROT[16] as u32);
260    t = state[2] ^ d[2];
261    c[4] = t.rotate_left(ROT[22] as u32);
262    t = state[18] ^ d[3];
263    c[0] = t.rotate_left(ROT[3] as u32);
264    t = state[9] ^ d[4];
265    c[1] = t.rotate_left(ROT[9] as u32);
266
267    state[20] = c[0] ^ (c[2] & !c[1]);
268    state[11] = c[1] ^ (c[3] & !c[2]);
269    state[2] = c[2] ^ (c[4] & !c[3]);
270    state[18] = c[3] ^ (c[0] & !c[4]);
271    state[9] = c[4] ^ (c[1] & !c[0]);
272
273    t = state[15] ^ d[0];
274    c[4] = t.rotate_left(ROT[20] as u32);
275    t = state[6] ^ d[1];
276    c[0] = t.rotate_left(ROT[1] as u32);
277    t = state[22] ^ d[2];
278    c[1] = t.rotate_left(ROT[7] as u32);
279    t = state[13] ^ d[3];
280    c[2] = t.rotate_left(ROT[13] as u32);
281    t = state[4] ^ d[4];
282    c[3] = t.rotate_left(ROT[19] as u32);
283
284    state[15] = c[0] ^ (c[2] & !c[1]);
285    state[6] = c[1] ^ (c[3] & !c[2]);
286    state[22] = c[2] ^ (c[4] & !c[3]);
287    state[13] = c[3] ^ (c[0] & !c[4]);
288    state[4] = c[4] ^ (c[1] & !c[0]);
289
290    t = state[10] ^ d[0];
291    c[1] = t.rotate_left(ROT[5] as u32);
292    t = state[1] ^ d[1];
293    c[2] = t.rotate_left(ROT[11] as u32);
294    t = state[17] ^ d[2];
295    c[3] = t.rotate_left(ROT[17] as u32);
296    t = state[8] ^ d[3];
297    c[4] = t.rotate_left(ROT[23] as u32);
298    t = state[24] ^ d[4];
299    c[0] = t.rotate_left(ROT[4] as u32);
300
301    state[10] = c[0] ^ (c[2] & !c[1]);
302    state[1] = c[1] ^ (c[3] & !c[2]);
303    state[17] = c[2] ^ (c[4] & !c[3]);
304    state[8] = c[3] ^ (c[0] & !c[4]);
305    state[24] = c[4] ^ (c[1] & !c[0]);
306
307    t = state[5] ^ d[0];
308    c[3] = t.rotate_left(ROT[15] as u32);
309    t = state[21] ^ d[1];
310    c[4] = t.rotate_left(ROT[21] as u32);
311    t = state[12] ^ d[2];
312    c[0] = t.rotate_left(ROT[2] as u32);
313    t = state[3] ^ d[3];
314    c[1] = t.rotate_left(ROT[8] as u32);
315    t = state[19] ^ d[4];
316    c[2] = t.rotate_left(ROT[14] as u32);
317
318    state[5] = c[0] ^ (c[2] & !c[1]);
319    state[21] = c[1] ^ (c[3] & !c[2]);
320    state[12] = c[2] ^ (c[4] & !c[3]);
321    state[3] = c[3] ^ (c[0] & !c[4]);
322    state[19] = c[4] ^ (c[1] & !c[0]);
323
324    // Round ridx + 2
325    c.fill(0);
326
327    for i in (0..LANE_CNT).step_by(5) {
328        c[0] ^= state[i];
329        c[1] ^= state[i + 1];
330        c[2] ^= state[i + 2];
331        c[3] ^= state[i + 3];
332        c[4] ^= state[i + 4];
333    }
334
335    d[0] = c[4] ^ c[1].rotate_left(1);
336    d[1] = c[0] ^ c[2].rotate_left(1);
337    d[2] = c[1] ^ c[3].rotate_left(1);
338    d[3] = c[2] ^ c[4].rotate_left(1);
339    d[4] = c[3] ^ c[0].rotate_left(1);
340
341    c[0] = state[0] ^ d[0];
342    t = state[11] ^ d[1];
343    c[1] = t.rotate_left(ROT[6] as u32);
344    t = state[22] ^ d[2];
345    c[2] = t.rotate_left(ROT[12] as u32);
346    t = state[8] ^ d[3];
347    c[3] = t.rotate_left(ROT[18] as u32);
348    t = state[19] ^ d[4];
349    c[4] = t.rotate_left(ROT[24] as u32);
350
351    state[0] = c[0] ^ (c[2] & !c[1]) ^ RC[ridx + 2];
352    state[11] = c[1] ^ (c[3] & !c[2]);
353    state[22] = c[2] ^ (c[4] & !c[3]);
354    state[8] = c[3] ^ (c[0] & !c[4]);
355    state[19] = c[4] ^ (c[1] & !c[0]);
356
357    t = state[15] ^ d[0];
358    c[2] = t.rotate_left(ROT[10] as u32);
359    t = state[1] ^ d[1];
360    c[3] = t.rotate_left(ROT[16] as u32);
361    t = state[12] ^ d[2];
362    c[4] = t.rotate_left(ROT[22] as u32);
363    t = state[23] ^ d[3];
364    c[0] = t.rotate_left(ROT[3] as u32);
365    t = state[9] ^ d[4];
366    c[1] = t.rotate_left(ROT[9] as u32);
367
368    state[15] = c[0] ^ (c[2] & !c[1]);
369    state[1] = c[1] ^ (c[3] & !c[2]);
370    state[12] = c[2] ^ (c[4] & !c[3]);
371    state[23] = c[3] ^ (c[0] & !c[4]);
372    state[9] = c[4] ^ (c[1] & !c[0]);
373
374    t = state[5] ^ d[0];
375    c[4] = t.rotate_left(ROT[20] as u32);
376    t = state[16] ^ d[1];
377    c[0] = t.rotate_left(ROT[1] as u32);
378    t = state[2] ^ d[2];
379    c[1] = t.rotate_left(ROT[7] as u32);
380    t = state[13] ^ d[3];
381    c[2] = t.rotate_left(ROT[13] as u32);
382    t = state[24] ^ d[4];
383    c[3] = t.rotate_left(ROT[19] as u32);
384
385    state[5] = c[0] ^ (c[2] & !c[1]);
386    state[16] = c[1] ^ (c[3] & !c[2]);
387    state[2] = c[2] ^ (c[4] & !c[3]);
388    state[13] = c[3] ^ (c[0] & !c[4]);
389    state[24] = c[4] ^ (c[1] & !c[0]);
390
391    t = state[20] ^ d[0];
392    c[1] = t.rotate_left(ROT[5] as u32);
393    t = state[6] ^ d[1];
394    c[2] = t.rotate_left(ROT[11] as u32);
395    t = state[17] ^ d[2];
396    c[3] = t.rotate_left(ROT[17] as u32);
397    t = state[3] ^ d[3];
398    c[4] = t.rotate_left(ROT[23] as u32);
399    t = state[14] ^ d[4];
400    c[0] = t.rotate_left(ROT[4] as u32);
401
402    state[20] = c[0] ^ (c[2] & !c[1]);
403    state[6] = c[1] ^ (c[3] & !c[2]);
404    state[17] = c[2] ^ (c[4] & !c[3]);
405    state[3] = c[3] ^ (c[0] & !c[4]);
406    state[14] = c[4] ^ (c[1] & !c[0]);
407
408    t = state[10] ^ d[0];
409    c[3] = t.rotate_left(ROT[15] as u32);
410    t = state[21] ^ d[1];
411    c[4] = t.rotate_left(ROT[21] as u32);
412    t = state[7] ^ d[2];
413    c[0] = t.rotate_left(ROT[2] as u32);
414    t = state[18] ^ d[3];
415    c[1] = t.rotate_left(ROT[8] as u32);
416    t = state[4] ^ d[4];
417    c[2] = t.rotate_left(ROT[14] as u32);
418
419    state[10] = c[0] ^ (c[2] & !c[1]);
420    state[21] = c[1] ^ (c[3] & !c[2]);
421    state[7] = c[2] ^ (c[4] & !c[3]);
422    state[18] = c[3] ^ (c[0] & !c[4]);
423    state[4] = c[4] ^ (c[1] & !c[0]);
424
425    // Round ridx + 3
426    c.fill(0);
427
428    for i in (0..LANE_CNT).step_by(5) {
429        c[0] ^= state[i];
430        c[1] ^= state[i + 1];
431        c[2] ^= state[i + 2];
432        c[3] ^= state[i + 3];
433        c[4] ^= state[i + 4];
434    }
435
436    d[0] = c[4] ^ c[1].rotate_left(1);
437    d[1] = c[0] ^ c[2].rotate_left(1);
438    d[2] = c[1] ^ c[3].rotate_left(1);
439    d[3] = c[2] ^ c[4].rotate_left(1);
440    d[4] = c[3] ^ c[0].rotate_left(1);
441
442    c[0] = state[0] ^ d[0];
443    t = state[1] ^ d[1];
444    c[1] = t.rotate_left(ROT[6] as u32);
445    t = state[2] ^ d[2];
446    c[2] = t.rotate_left(ROT[12] as u32);
447    t = state[3] ^ d[3];
448    c[3] = t.rotate_left(ROT[18] as u32);
449    t = state[4] ^ d[4];
450    c[4] = t.rotate_left(ROT[24] as u32);
451
452    state[0] = c[0] ^ (c[2] & !c[1]) ^ RC[ridx + 3];
453    state[1] = c[1] ^ (c[3] & !c[2]);
454    state[2] = c[2] ^ (c[4] & !c[3]);
455    state[3] = c[3] ^ (c[0] & !c[4]);
456    state[4] = c[4] ^ (c[1] & !c[0]);
457
458    t = state[5] ^ d[0];
459    c[2] = t.rotate_left(ROT[10] as u32);
460    t = state[6] ^ d[1];
461    c[3] = t.rotate_left(ROT[16] as u32);
462    t = state[7] ^ d[2];
463    c[4] = t.rotate_left(ROT[22] as u32);
464    t = state[8] ^ d[3];
465    c[0] = t.rotate_left(ROT[3] as u32);
466    t = state[9] ^ d[4];
467    c[1] = t.rotate_left(ROT[9] as u32);
468
469    state[5] = c[0] ^ (c[2] & !c[1]);
470    state[6] = c[1] ^ (c[3] & !c[2]);
471    state[7] = c[2] ^ (c[4] & !c[3]);
472    state[8] = c[3] ^ (c[0] & !c[4]);
473    state[9] = c[4] ^ (c[1] & !c[0]);
474
475    t = state[10] ^ d[0];
476    c[4] = t.rotate_left(ROT[20] as u32);
477    t = state[11] ^ d[1];
478    c[0] = t.rotate_left(ROT[1] as u32);
479    t = state[12] ^ d[2];
480    c[1] = t.rotate_left(ROT[7] as u32);
481    t = state[13] ^ d[3];
482    c[2] = t.rotate_left(ROT[13] as u32);
483    t = state[14] ^ d[4];
484    c[3] = t.rotate_left(ROT[19] as u32);
485
486    state[10] = c[0] ^ (c[2] & !c[1]);
487    state[11] = c[1] ^ (c[3] & !c[2]);
488    state[12] = c[2] ^ (c[4] & !c[3]);
489    state[13] = c[3] ^ (c[0] & !c[4]);
490    state[14] = c[4] ^ (c[1] & !c[0]);
491
492    t = state[15] ^ d[0];
493    c[1] = t.rotate_left(ROT[5] as u32);
494    t = state[16] ^ d[1];
495    c[2] = t.rotate_left(ROT[11] as u32);
496    t = state[17] ^ d[2];
497    c[3] = t.rotate_left(ROT[17] as u32);
498    t = state[18] ^ d[3];
499    c[4] = t.rotate_left(ROT[23] as u32);
500    t = state[19] ^ d[4];
501    c[0] = t.rotate_left(ROT[4] as u32);
502
503    state[15] = c[0] ^ (c[2] & !c[1]);
504    state[16] = c[1] ^ (c[3] & !c[2]);
505    state[17] = c[2] ^ (c[4] & !c[3]);
506    state[18] = c[3] ^ (c[0] & !c[4]);
507    state[19] = c[4] ^ (c[1] & !c[0]);
508
509    t = state[20] ^ d[0];
510    c[3] = t.rotate_left(ROT[15] as u32);
511    t = state[21] ^ d[1];
512    c[4] = t.rotate_left(ROT[21] as u32);
513    t = state[22] ^ d[2];
514    c[0] = t.rotate_left(ROT[2] as u32);
515    t = state[23] ^ d[3];
516    c[1] = t.rotate_left(ROT[8] as u32);
517    t = state[24] ^ d[4];
518    c[2] = t.rotate_left(ROT[14] as u32);
519
520    state[20] = c[0] ^ (c[2] & !c[1]);
521    state[21] = c[1] ^ (c[3] & !c[2]);
522    state[22] = c[2] ^ (c[4] & !c[3]);
523    state[23] = c[3] ^ (c[0] & !c[4]);
524    state[24] = c[4] ^ (c[1] & !c[0]);
525}
526
527/// Keccak-p\[1600, 12\] permutation, applying 12 rounds of permutation
528/// on state of dimension 5 x 5 x 64 ( = 1600 -bits ), following algorithm 7 defined
529/// in section 3.3 of SHA3 specification https://dx.doi.org/10.6028/NIST.FIPS.202
530///
531/// Adapted from https://github.com/itzmeanjan/sha3/blob/b5e897ed/include/keccak.hpp#L253-L493
532#[inline(always)]
533pub fn permute(state: &mut [u64; LANE_CNT]) {
534    const STEP_BY: usize = 4;
535    const { assert!(ROUNDS % STEP_BY == 0) }
536
537    roundx4(state, 0);
538    roundx4(state, 4);
539    roundx4(state, 8);
540}