solana_shake256/
lib.rs

1//! SHAKE256 (FIPS 202) — hand-rolled, `no_std`, zero dependencies, tuned
2//! for the Solana SBF target.
3//!
4//! This is the single source of the Keccak-f[1600] core shared by
5//! [`solana-hawk512`] and [`solana-falcon512`] (the permutation,
6//! `absorb`/`finalize` and SHAKE256 padding were byte-identical in both;
7//! consolidating here keeps two consensus-critical verifiers provably
8//! running the same primitive). The two output styles each verifier needs
9//! are both exposed: the bulk **rate-draining** path
10//! ([`Shake256::rate_lanes`] + [`Shake256::permute`], used by Falcon's
11//! `hash_to_point` rejection sampling) and a fixed-length
12//! [`Shake256::squeeze`] (used by HAWK's `hpub`/`M`/`h`).
13//!
14//! The `keccak_f1600` core uses Bertoni **lane-complementing** (the 6-lane
15//! Keccak-Team set `{1,2,8,12,17,20}`, pre-/post-complemented once per
16//! permute so ~456 NOTs are eliminated across 24 rounds) fused with an
17//! **in-place chi-row + 10 cell-saves** layout (no `B[25]` scratch).
18//!
19//! [`solana-hawk512`]: https://github.com/blueshift-gg/solana-hawk512
20//! [`solana-falcon512`]: https://github.com/blueshift-gg/solana-falcon512
21
22#![no_std]
23
24const RC: [u64; 24] = [
25    0x0000000000000001,
26    0x0000000000008082,
27    0x800000000000808a,
28    0x8000000080008000,
29    0x000000000000808b,
30    0x0000000080000001,
31    0x8000000080008081,
32    0x8000000000008009,
33    0x000000000000008a,
34    0x0000000000000088,
35    0x0000000080008009,
36    0x000000008000000a,
37    0x000000008000808b,
38    0x800000000000008b,
39    0x8000000000008089,
40    0x8000000000008003,
41    0x8000000000008002,
42    0x8000000000000080,
43    0x000000000000800a,
44    0x800000008000000a,
45    0x8000000080008081,
46    0x8000000000008080,
47    0x0000000080000001,
48    0x8000000080008008,
49];
50
51fn keccak_f1600(s: &mut [u64; 25]) {
52    // **Bertoni lane-complementing + chi-row** layout.
53    //
54    // Pre-complement the canonical 6-lane Keccak Team set
55    //     CS = {1, 2, 8, 12, 17, 20}
56    // chosen so that across one full round (theta+rho+pi+chi+iota), the
57    // complementation pattern is invariant. Per-row IN-complemented b's at
58    // post-pi positions (derived from theta+rho+pi propagation):
59    //     row 0: b0, b2, b3   row 1: b0, b2     row 2: b0, b2
60    //     row 3: b1, b3, b4   row 4: b0, b3
61    // Per-row OUT-complemented (must store ~A_logical_new):
62    //     row 0: x=1, x=2     row 1: x=3        row 2: x=2
63    //     row 3: x=2          row 4: x=0
64    // Net ~456 NOTs eliminated per 24-round permute, ~12 added at boundaries.
65
66    // Entry: complement the 6 CS lanes once.
67    s[1] = !s[1];
68    s[2] = !s[2];
69    s[8] = !s[8];
70    s[12] = !s[12];
71    s[17] = !s[17];
72    s[20] = !s[20];
73
74    macro_rules! round {
75        ($rc:expr) => {{
76            // theta — column parities
77            let c0 = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
78            let c1 = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
79            let c2 = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
80            let c3 = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
81            let c4 = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
82
83            let d0 = c4 ^ c1.rotate_left(1);
84            let d1 = c0 ^ c2.rotate_left(1);
85            let d2 = c1 ^ c3.rotate_left(1);
86            let d3 = c2 ^ c4.rotate_left(1);
87            let d4 = c3 ^ c0.rotate_left(1);
88
89            // **In-place chi-row + 10 cell-saves**.
90            // Row 0 outputs to s[0..5]; rows 1..4 read s[3], s[1], s[4], s[2]
91            // from this range — save before overwriting.
92            let s3 = s[3];
93            let s1 = s[1];
94            let s4 = s[4];
95            let s2 = s[2];
96
97            // Row 0 — IN: b0,b2,b3 complemented; OUT-complement: x=1,2.
98            // Iota fused into lane 0.
99            {
100                let b0 = s[0] ^ d0;
101                let b1 = (s[6] ^ d1).rotate_left(44);
102                let b2 = (s[12] ^ d2).rotate_left(43);
103                let b3 = (s[18] ^ d3).rotate_left(21);
104                let b4 = (s[24] ^ d4).rotate_left(14);
105                s[0] = b0 ^ (b1 | b2) ^ $rc;
106                s[1] = b1 ^ ((!b2) | b3);
107                s[2] = b2 ^ (b3 & b4);
108                s[3] = b3 ^ (b4 | b0);
109                s[4] = b4 ^ (b0 & b1);
110            }
111
112            // Row 1 outputs to s[5..10]; rows 2..4 read s[7], s[5], s[8].
113            let s7 = s[7];
114            let s5 = s[5];
115            let s8 = s[8];
116
117            // Row 1 — IN: b0,b2 complemented; OUT-complement: x=3.
118            {
119                let b0 = (s3 ^ d3).rotate_left(28);
120                let b1 = (s[9] ^ d4).rotate_left(20);
121                let b2 = (s[10] ^ d0).rotate_left(3);
122                let b3 = (s[16] ^ d1).rotate_left(45);
123                let b4 = (s[22] ^ d2).rotate_left(61);
124                s[5] = b0 ^ (b1 | b2);
125                s[6] = b1 ^ (b2 & b3);
126                s[7] = (!b2) ^ b4 ^ (b3 & b4);
127                s[8] = b3 ^ (b4 | b0);
128                s[9] = b4 ^ (b0 & b1);
129            }
130
131            // Row 2 outputs to s[10..15]; rows 3..4 read s[11], s[14].
132            let s11 = s[11];
133            let s14 = s[14];
134
135            // Row 2 — IN: b0,b2 complemented; OUT-complement: x=2.
136            {
137                let b0 = (s1 ^ d1).rotate_left(1);
138                let b1 = (s7 ^ d2).rotate_left(6);
139                let b2 = (s[13] ^ d3).rotate_left(25);
140                let b3 = (s[19] ^ d4).rotate_left(8);
141                let b4 = (s[20] ^ d0).rotate_left(18);
142                s[10] = b0 ^ (b1 | b2);
143                s[11] = b1 ^ (b2 & b3);
144                s[12] = b2 ^ b4 ^ (b3 & b4);
145                s[13] = b3 ^ !(b4 | b0);
146                s[14] = b4 ^ (b0 & b1);
147            }
148
149            // Row 3 outputs to s[15..20]; row 4 reads s[15].
150            let s15 = s[15];
151
152            // Row 3 — IN: b1,b3,b4 complemented; OUT-complement: x=2.
153            {
154                let b0 = (s4 ^ d4).rotate_left(27);
155                let b1 = (s5 ^ d0).rotate_left(36);
156                let b2 = (s11 ^ d1).rotate_left(10);
157                let b3 = (s[17] ^ d2).rotate_left(15);
158                let b4 = (s[23] ^ d3).rotate_left(56);
159                s[15] = b0 ^ (b1 & b2);
160                s[16] = b1 ^ (b2 | b3);
161                s[17] = b2 ^ ((!b3) | b4);
162                s[18] = (!b3) ^ (b4 & b0);
163                s[19] = b4 ^ (b0 | b1);
164            }
165
166            // Row 4 — IN: b0,b3 complemented; OUT-complement: x=0.
167            {
168                let b0 = (s2 ^ d2).rotate_left(62);
169                let b1 = (s8 ^ d3).rotate_left(55);
170                let b2 = (s14 ^ d4).rotate_left(39);
171                let b3 = (s15 ^ d0).rotate_left(41);
172                let b4 = (s[21] ^ d1).rotate_left(2);
173                s[20] = b0 ^ b2 ^ (b1 & b2);
174                s[21] = b1 ^ !(b2 | b3);
175                s[22] = b2 ^ (b3 & b4);
176                s[23] = b3 ^ (b4 | b0);
177                s[24] = b4 ^ (b0 & b1);
178            }
179        }};
180    }
181
182    round!(RC[0]);
183    round!(RC[1]);
184    round!(RC[2]);
185    round!(RC[3]);
186    round!(RC[4]);
187    round!(RC[5]);
188    round!(RC[6]);
189    round!(RC[7]);
190    round!(RC[8]);
191    round!(RC[9]);
192    round!(RC[10]);
193    round!(RC[11]);
194    round!(RC[12]);
195    round!(RC[13]);
196    round!(RC[14]);
197    round!(RC[15]);
198    round!(RC[16]);
199    round!(RC[17]);
200    round!(RC[18]);
201    round!(RC[19]);
202    round!(RC[20]);
203    round!(RC[21]);
204    round!(RC[22]);
205    round!(RC[23]);
206
207    // Exit: un-complement the 6 CS lanes so the caller sees the normal
208    // (uncomplemented) state. Cost paid once per permute.
209    s[1] = !s[1];
210    s[2] = !s[2];
211    s[8] = !s[8];
212    s[12] = !s[12];
213    s[17] = !s[17];
214    s[20] = !s[20];
215}
216
217/// SHAKE256 rate in bytes (1600-bit state − 2·256-bit capacity = 1088 bits).
218pub const RATE: usize = 136;
219
220/// Incremental SHAKE256 (FIPS 202). `new` → `absorb`* → `finalize` → then
221/// either drain the rate (`rate_lanes`/`permute`) or `squeeze` a fixed
222/// number of bytes.
223///
224/// Every method is `#[inline]` so the consumer (built `lto`/`opt-level=3`
225/// for SBF) folds the whole thing in exactly as if it were a local module —
226/// the crate boundary has no codegen cost.
227pub struct Shake256 {
228    state: [u64; 25],
229    pos: usize,
230}
231
232impl Default for Shake256 {
233    #[inline]
234    fn default() -> Self {
235        Self::new()
236    }
237}
238
239impl Shake256 {
240    #[inline]
241    pub fn new() -> Self {
242        Self {
243            state: [0; 25],
244            pos: 0,
245        }
246    }
247
248    #[inline(always)]
249    pub fn absorb(&mut self, data: &[u8]) {
250        let mut i = 0;
251        let len = data.len();
252
253        // Phase 1: byte-by-byte until lane-aligned.
254        while i < len && !self.pos.is_multiple_of(8) {
255            let lane = self.pos / 8;
256            let shift = 8 * (self.pos % 8);
257            self.state[lane] ^= (data[i] as u64) << shift;
258            self.pos += 1;
259            if self.pos == RATE {
260                keccak_f1600(&mut self.state);
261                self.pos = 0;
262            }
263            i += 1;
264        }
265
266        // Phase 2: bulk 8-byte chunks XORed straight into a lane. Bytes within
267        // a lane are little-endian per FIPS 202, so `from_le_bytes` is the
268        // correct assembly.
269        while i + 8 <= len {
270            // SAFETY: phase 1 made `self.pos` lane-aligned (multiple of 8),
271            // and `pos < RATE = 136 = 17 * 8`, so `pos / 8 < 17 < 25`.
272            unsafe { core::hint::assert_unchecked(self.pos / 8 < 17) };
273            let chunk_bytes: [u8; 8] = data[i..i + 8].try_into().unwrap();
274            let chunk = u64::from_le_bytes(chunk_bytes);
275            self.state[self.pos / 8] ^= chunk;
276            self.pos += 8;
277            i += 8;
278            if self.pos == RATE {
279                keccak_f1600(&mut self.state);
280                self.pos = 0;
281            }
282        }
283
284        // Phase 3: tail bytes (< 8 left).
285        while i < len {
286            let lane = self.pos / 8;
287            let shift = 8 * (self.pos % 8);
288            self.state[lane] ^= (data[i] as u64) << shift;
289            self.pos += 1;
290            if self.pos == RATE {
291                keccak_f1600(&mut self.state);
292                self.pos = 0;
293            }
294            i += 1;
295        }
296    }
297
298    #[inline(always)]
299    pub fn finalize(&mut self) {
300        let lane = self.pos / 8;
301        let shift = 8 * (self.pos % 8);
302        self.state[lane] ^= 0x1Fu64 << shift;
303        let last = RATE - 1;
304        self.state[last / 8] ^= 0x80u64 << (8 * (last % 8));
305        keccak_f1600(&mut self.state);
306        self.pos = 0;
307    }
308
309    /// First 17 u64 lanes (= the 136-byte rate). Bytes within each lane are
310    /// little-endian per FIPS 202: byte at offset `b` of lane `l` is
311    /// `(rate_lanes()[l] >> (8*b)) & 0xff`. Drain this, then call
312    /// [`permute`](Self::permute) for the next block — the bulk-rate squeeze
313    /// path (e.g. Falcon's `hash_to_point` rejection sampling).
314    #[inline]
315    pub fn rate_lanes(&self) -> &[u64] {
316        &self.state[..17]
317    }
318
319    /// Apply Keccak-f[1600] to refill the rate (used with
320    /// [`rate_lanes`](Self::rate_lanes)).
321    #[inline]
322    pub fn permute(&mut self) {
323        keccak_f1600(&mut self.state);
324    }
325
326    /// Squeeze exactly `LEN` bytes, handling rate-boundary permutes. `LEN`
327    /// is const so the loop bounds and the rate-boundary check fold and the
328    /// bulk lane copy fully unrolls (the fixed-length path, e.g. HAWK's
329    /// `hpub`/`M`/`h`). Bytes within a lane are little-endian per FIPS 202,
330    /// so a lane-aligned run of ≥ 8 bytes is one `to_le_bytes` copy (the
331    /// same bytes as eight `state[lane] >> 8k` reads).
332    #[inline]
333    pub fn squeeze<const LEN: usize>(&mut self, out: &mut [u8; LEN]) {
334        let len = LEN;
335        let mut i = 0;
336
337        // Byte-by-byte until lane-aligned.
338        while i < len && !self.pos.is_multiple_of(8) {
339            let lane = self.pos / 8;
340            let shift = 8 * (self.pos % 8);
341            out[i] = (self.state[lane] >> shift) as u8;
342            self.pos += 1;
343            i += 1;
344            if self.pos == RATE {
345                keccak_f1600(&mut self.state);
346                self.pos = 0;
347            }
348        }
349
350        // Bulk 8-byte lanes (no rate boundary lands mid-lane: RATE = 17·8).
351        while i + 8 <= len && self.pos + 8 <= RATE {
352            // SAFETY: `pos` lane-aligned, `pos + 8 ≤ RATE = 136` ⇒
353            // `pos/8 ≤ 16 < 17 < 25`.
354            unsafe { core::hint::assert_unchecked(self.pos / 8 < 17) };
355            out[i..i + 8].copy_from_slice(&self.state[self.pos / 8].to_le_bytes());
356            self.pos += 8;
357            i += 8;
358            if self.pos == RATE {
359                keccak_f1600(&mut self.state);
360                self.pos = 0;
361            }
362        }
363
364        // Tail bytes (< 8 left, or a partial lane before a rate boundary).
365        while i < len {
366            let lane = self.pos / 8;
367            let shift = 8 * (self.pos % 8);
368            out[i] = (self.state[lane] >> shift) as u8;
369            self.pos += 1;
370            i += 1;
371            if self.pos == RATE {
372                keccak_f1600(&mut self.state);
373                self.pos = 0;
374            }
375        }
376    }
377}
solana_shake256/lib.rs

solana_shake256/
lib.rs