solana_shake256/lib.rs
1//! SHAKE256 (FIPS 202) — hand-rolled, `no_std`, zero dependencies, tuned
2//! for the Solana SBF target.
3//!
4//! This is the single source of the Keccak-f[1600] core shared by
5//! [`solana-hawk512`] and [`solana-falcon512`] (the permutation,
6//! `absorb`/`finalize` and SHAKE256 padding were byte-identical in both;
7//! consolidating here keeps two consensus-critical verifiers provably
8//! running the same primitive). The two output styles each verifier needs
9//! are both exposed: the bulk **rate-draining** path
10//! ([`Shake256::rate_lanes`] + [`Shake256::permute`], used by Falcon's
11//! `hash_to_point` rejection sampling) and a fixed-length
12//! [`Shake256::squeeze`] (used by HAWK's `hpub`/`M`/`h`).
13//!
14//! The `keccak_f1600` core uses Bertoni **lane-complementing** (the 6-lane
15//! Keccak-Team set `{1,2,8,12,17,20}`, pre-/post-complemented once per
16//! permute so ~456 NOTs are eliminated across 24 rounds) fused with an
17//! **in-place chi-row + 10 cell-saves** layout (no `B[25]` scratch).
18//!
19//! [`solana-hawk512`]: https://github.com/blueshift-gg/solana-hawk512
20//! [`solana-falcon512`]: https://github.com/blueshift-gg/solana-falcon512
21
22#![no_std]
23
24const RC: [u64; 24] = [
25 0x0000000000000001,
26 0x0000000000008082,
27 0x800000000000808a,
28 0x8000000080008000,
29 0x000000000000808b,
30 0x0000000080000001,
31 0x8000000080008081,
32 0x8000000000008009,
33 0x000000000000008a,
34 0x0000000000000088,
35 0x0000000080008009,
36 0x000000008000000a,
37 0x000000008000808b,
38 0x800000000000008b,
39 0x8000000000008089,
40 0x8000000000008003,
41 0x8000000000008002,
42 0x8000000000000080,
43 0x000000000000800a,
44 0x800000008000000a,
45 0x8000000080008081,
46 0x8000000000008080,
47 0x0000000080000001,
48 0x8000000080008008,
49];
50
51fn keccak_f1600(s: &mut [u64; 25]) {
52 // **Bertoni lane-complementing + chi-row** layout.
53 //
54 // Pre-complement the canonical 6-lane Keccak Team set
55 // CS = {1, 2, 8, 12, 17, 20}
56 // chosen so that across one full round (theta+rho+pi+chi+iota), the
57 // complementation pattern is invariant. Per-row IN-complemented b's at
58 // post-pi positions (derived from theta+rho+pi propagation):
59 // row 0: b0, b2, b3 row 1: b0, b2 row 2: b0, b2
60 // row 3: b1, b3, b4 row 4: b0, b3
61 // Per-row OUT-complemented (must store ~A_logical_new):
62 // row 0: x=1, x=2 row 1: x=3 row 2: x=2
63 // row 3: x=2 row 4: x=0
64 // Net ~456 NOTs eliminated per 24-round permute, ~12 added at boundaries.
65
66 // Entry: complement the 6 CS lanes once.
67 s[1] = !s[1];
68 s[2] = !s[2];
69 s[8] = !s[8];
70 s[12] = !s[12];
71 s[17] = !s[17];
72 s[20] = !s[20];
73
74 macro_rules! round {
75 ($rc:expr) => {{
76 // theta — column parities
77 let c0 = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
78 let c1 = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
79 let c2 = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
80 let c3 = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
81 let c4 = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
82
83 let d0 = c4 ^ c1.rotate_left(1);
84 let d1 = c0 ^ c2.rotate_left(1);
85 let d2 = c1 ^ c3.rotate_left(1);
86 let d3 = c2 ^ c4.rotate_left(1);
87 let d4 = c3 ^ c0.rotate_left(1);
88
89 // **In-place chi-row + 10 cell-saves**.
90 // Row 0 outputs to s[0..5]; rows 1..4 read s[3], s[1], s[4], s[2]
91 // from this range — save before overwriting.
92 let s3 = s[3];
93 let s1 = s[1];
94 let s4 = s[4];
95 let s2 = s[2];
96
97 // Row 0 — IN: b0,b2,b3 complemented; OUT-complement: x=1,2.
98 // Iota fused into lane 0.
99 {
100 let b0 = s[0] ^ d0;
101 let b1 = (s[6] ^ d1).rotate_left(44);
102 let b2 = (s[12] ^ d2).rotate_left(43);
103 let b3 = (s[18] ^ d3).rotate_left(21);
104 let b4 = (s[24] ^ d4).rotate_left(14);
105 s[0] = b0 ^ (b1 | b2) ^ $rc;
106 s[1] = b1 ^ ((!b2) | b3);
107 s[2] = b2 ^ (b3 & b4);
108 s[3] = b3 ^ (b4 | b0);
109 s[4] = b4 ^ (b0 & b1);
110 }
111
112 // Row 1 outputs to s[5..10]; rows 2..4 read s[7], s[5], s[8].
113 let s7 = s[7];
114 let s5 = s[5];
115 let s8 = s[8];
116
117 // Row 1 — IN: b0,b2 complemented; OUT-complement: x=3.
118 {
119 let b0 = (s3 ^ d3).rotate_left(28);
120 let b1 = (s[9] ^ d4).rotate_left(20);
121 let b2 = (s[10] ^ d0).rotate_left(3);
122 let b3 = (s[16] ^ d1).rotate_left(45);
123 let b4 = (s[22] ^ d2).rotate_left(61);
124 s[5] = b0 ^ (b1 | b2);
125 s[6] = b1 ^ (b2 & b3);
126 s[7] = (!b2) ^ b4 ^ (b3 & b4);
127 s[8] = b3 ^ (b4 | b0);
128 s[9] = b4 ^ (b0 & b1);
129 }
130
131 // Row 2 outputs to s[10..15]; rows 3..4 read s[11], s[14].
132 let s11 = s[11];
133 let s14 = s[14];
134
135 // Row 2 — IN: b0,b2 complemented; OUT-complement: x=2.
136 {
137 let b0 = (s1 ^ d1).rotate_left(1);
138 let b1 = (s7 ^ d2).rotate_left(6);
139 let b2 = (s[13] ^ d3).rotate_left(25);
140 let b3 = (s[19] ^ d4).rotate_left(8);
141 let b4 = (s[20] ^ d0).rotate_left(18);
142 s[10] = b0 ^ (b1 | b2);
143 s[11] = b1 ^ (b2 & b3);
144 s[12] = b2 ^ b4 ^ (b3 & b4);
145 s[13] = b3 ^ !(b4 | b0);
146 s[14] = b4 ^ (b0 & b1);
147 }
148
149 // Row 3 outputs to s[15..20]; row 4 reads s[15].
150 let s15 = s[15];
151
152 // Row 3 — IN: b1,b3,b4 complemented; OUT-complement: x=2.
153 {
154 let b0 = (s4 ^ d4).rotate_left(27);
155 let b1 = (s5 ^ d0).rotate_left(36);
156 let b2 = (s11 ^ d1).rotate_left(10);
157 let b3 = (s[17] ^ d2).rotate_left(15);
158 let b4 = (s[23] ^ d3).rotate_left(56);
159 s[15] = b0 ^ (b1 & b2);
160 s[16] = b1 ^ (b2 | b3);
161 s[17] = b2 ^ ((!b3) | b4);
162 s[18] = (!b3) ^ (b4 & b0);
163 s[19] = b4 ^ (b0 | b1);
164 }
165
166 // Row 4 — IN: b0,b3 complemented; OUT-complement: x=0.
167 {
168 let b0 = (s2 ^ d2).rotate_left(62);
169 let b1 = (s8 ^ d3).rotate_left(55);
170 let b2 = (s14 ^ d4).rotate_left(39);
171 let b3 = (s15 ^ d0).rotate_left(41);
172 let b4 = (s[21] ^ d1).rotate_left(2);
173 s[20] = b0 ^ b2 ^ (b1 & b2);
174 s[21] = b1 ^ !(b2 | b3);
175 s[22] = b2 ^ (b3 & b4);
176 s[23] = b3 ^ (b4 | b0);
177 s[24] = b4 ^ (b0 & b1);
178 }
179 }};
180 }
181
182 round!(RC[0]);
183 round!(RC[1]);
184 round!(RC[2]);
185 round!(RC[3]);
186 round!(RC[4]);
187 round!(RC[5]);
188 round!(RC[6]);
189 round!(RC[7]);
190 round!(RC[8]);
191 round!(RC[9]);
192 round!(RC[10]);
193 round!(RC[11]);
194 round!(RC[12]);
195 round!(RC[13]);
196 round!(RC[14]);
197 round!(RC[15]);
198 round!(RC[16]);
199 round!(RC[17]);
200 round!(RC[18]);
201 round!(RC[19]);
202 round!(RC[20]);
203 round!(RC[21]);
204 round!(RC[22]);
205 round!(RC[23]);
206
207 // Exit: un-complement the 6 CS lanes so the caller sees the normal
208 // (uncomplemented) state. Cost paid once per permute.
209 s[1] = !s[1];
210 s[2] = !s[2];
211 s[8] = !s[8];
212 s[12] = !s[12];
213 s[17] = !s[17];
214 s[20] = !s[20];
215}
216
217/// SHAKE256 rate in bytes (1600-bit state − 2·256-bit capacity = 1088 bits).
218pub const RATE: usize = 136;
219
220/// Incremental SHAKE256 (FIPS 202). `new` → `absorb`* → `finalize` → then
221/// either drain the rate (`rate_lanes`/`permute`) or `squeeze` a fixed
222/// number of bytes.
223///
224/// Every method is `#[inline]` so the consumer (built `lto`/`opt-level=3`
225/// for SBF) folds the whole thing in exactly as if it were a local module —
226/// the crate boundary has no codegen cost.
227pub struct Shake256 {
228 state: [u64; 25],
229 pos: usize,
230}
231
232impl Default for Shake256 {
233 #[inline]
234 fn default() -> Self {
235 Self::new()
236 }
237}
238
239impl Shake256 {
240 #[inline]
241 pub fn new() -> Self {
242 Self {
243 state: [0; 25],
244 pos: 0,
245 }
246 }
247
248 #[inline(always)]
249 pub fn absorb(&mut self, data: &[u8]) {
250 let mut i = 0;
251 let len = data.len();
252
253 // Phase 1: byte-by-byte until lane-aligned.
254 while i < len && !self.pos.is_multiple_of(8) {
255 let lane = self.pos / 8;
256 let shift = 8 * (self.pos % 8);
257 self.state[lane] ^= (data[i] as u64) << shift;
258 self.pos += 1;
259 if self.pos == RATE {
260 keccak_f1600(&mut self.state);
261 self.pos = 0;
262 }
263 i += 1;
264 }
265
266 // Phase 2: bulk 8-byte chunks XORed straight into a lane. Bytes within
267 // a lane are little-endian per FIPS 202, so `from_le_bytes` is the
268 // correct assembly.
269 while i + 8 <= len {
270 // SAFETY: phase 1 made `self.pos` lane-aligned (multiple of 8),
271 // and `pos < RATE = 136 = 17 * 8`, so `pos / 8 < 17 < 25`.
272 unsafe { core::hint::assert_unchecked(self.pos / 8 < 17) };
273 let chunk_bytes: [u8; 8] = data[i..i + 8].try_into().unwrap();
274 let chunk = u64::from_le_bytes(chunk_bytes);
275 self.state[self.pos / 8] ^= chunk;
276 self.pos += 8;
277 i += 8;
278 if self.pos == RATE {
279 keccak_f1600(&mut self.state);
280 self.pos = 0;
281 }
282 }
283
284 // Phase 3: tail bytes (< 8 left).
285 while i < len {
286 let lane = self.pos / 8;
287 let shift = 8 * (self.pos % 8);
288 self.state[lane] ^= (data[i] as u64) << shift;
289 self.pos += 1;
290 if self.pos == RATE {
291 keccak_f1600(&mut self.state);
292 self.pos = 0;
293 }
294 i += 1;
295 }
296 }
297
298 #[inline(always)]
299 pub fn finalize(&mut self) {
300 let lane = self.pos / 8;
301 let shift = 8 * (self.pos % 8);
302 self.state[lane] ^= 0x1Fu64 << shift;
303 let last = RATE - 1;
304 self.state[last / 8] ^= 0x80u64 << (8 * (last % 8));
305 keccak_f1600(&mut self.state);
306 self.pos = 0;
307 }
308
309 /// First 17 u64 lanes (= the 136-byte rate). Bytes within each lane are
310 /// little-endian per FIPS 202: byte at offset `b` of lane `l` is
311 /// `(rate_lanes()[l] >> (8*b)) & 0xff`. Drain this, then call
312 /// [`permute`](Self::permute) for the next block — the bulk-rate squeeze
313 /// path (e.g. Falcon's `hash_to_point` rejection sampling).
314 #[inline]
315 pub fn rate_lanes(&self) -> &[u64] {
316 &self.state[..17]
317 }
318
319 /// Apply Keccak-f[1600] to refill the rate (used with
320 /// [`rate_lanes`](Self::rate_lanes)).
321 #[inline]
322 pub fn permute(&mut self) {
323 keccak_f1600(&mut self.state);
324 }
325
326 /// Squeeze exactly `LEN` bytes, handling rate-boundary permutes. `LEN`
327 /// is const so the loop bounds and the rate-boundary check fold and the
328 /// bulk lane copy fully unrolls (the fixed-length path, e.g. HAWK's
329 /// `hpub`/`M`/`h`). Bytes within a lane are little-endian per FIPS 202,
330 /// so a lane-aligned run of ≥ 8 bytes is one `to_le_bytes` copy (the
331 /// same bytes as eight `state[lane] >> 8k` reads).
332 #[inline]
333 pub fn squeeze<const LEN: usize>(&mut self, out: &mut [u8; LEN]) {
334 let len = LEN;
335 let mut i = 0;
336
337 // Byte-by-byte until lane-aligned.
338 while i < len && !self.pos.is_multiple_of(8) {
339 let lane = self.pos / 8;
340 let shift = 8 * (self.pos % 8);
341 out[i] = (self.state[lane] >> shift) as u8;
342 self.pos += 1;
343 i += 1;
344 if self.pos == RATE {
345 keccak_f1600(&mut self.state);
346 self.pos = 0;
347 }
348 }
349
350 // Bulk 8-byte lanes (no rate boundary lands mid-lane: RATE = 17·8).
351 while i + 8 <= len && self.pos + 8 <= RATE {
352 // SAFETY: `pos` lane-aligned, `pos + 8 ≤ RATE = 136` ⇒
353 // `pos/8 ≤ 16 < 17 < 25`.
354 unsafe { core::hint::assert_unchecked(self.pos / 8 < 17) };
355 out[i..i + 8].copy_from_slice(&self.state[self.pos / 8].to_le_bytes());
356 self.pos += 8;
357 i += 8;
358 if self.pos == RATE {
359 keccak_f1600(&mut self.state);
360 self.pos = 0;
361 }
362 }
363
364 // Tail bytes (< 8 left, or a partial lane before a rate boundary).
365 while i < len {
366 let lane = self.pos / 8;
367 let shift = 8 * (self.pos % 8);
368 out[i] = (self.state[lane] >> shift) as u8;
369 self.pos += 1;
370 i += 1;
371 if self.pos == RATE {
372 keccak_f1600(&mut self.state);
373 self.pos = 0;
374 }
375 }
376 }
377}