dinoxor 0.3.0

Re-implements bitwise operations as abstractions in aarch64 neon registers.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
extern crate quickcheck;

use core::arch::aarch64::*;
use quickcheck::{Arbitrary, Gen};

use crate::dinoxor::dinoxor;

/// Number of 32-bit words in the ChaCha20 state matrix (4 rows of 4 words).
const STATE_LEN: usize = 16;

/// Number of iterations run by the block-function loop.
///
/// Each iteration performs one column step followed by one diagonal step (a "double
/// round"), so 10 iterations correspond to the 20 rounds of ChaCha20.
const NUM_ROUNDS: usize = 10; // 10 double rounds = 20 rounds

/// The 16-word ChaCha20 input state from which keystream blocks are derived.
///
/// The state is assembled from the fixed ChaCha20 constants, a 256-bit key, a 32-bit
/// block counter and a 96-bit nonce.
pub struct ChaCha20State {
    /// Row-major 4x4 matrix of words: constants in `0..4`, key words in `4..12`,
    /// block counter at index `12`, nonce words in `13..16`.
    state: [u32; STATE_LEN],
}

// Newtype wrappers around the fixed-size byte arrays used with the cipher, so that
// each size can receive its own `Arbitrary` implementation.

/// A 32-byte (256-bit) ChaCha20 key.
#[derive(Clone, Debug)]
pub struct Key(pub [u8; 32]);
/// A 12-byte (96-bit) ChaCha20 nonce.
#[derive(Clone, Debug)]
pub struct Nonce(pub [u8; 12]);
/// A 64-byte buffer, the size of one ChaCha20 keystream block.
#[derive(Clone, Debug)]
pub struct Block(pub [u8; 64]);

// Implement Arbitrary for each new type
impl Arbitrary for Key {
    /// Produces a `Key` whose 32 bytes are drawn, in index order, from the generator `g`.
    ///
    /// Each byte comes from its own call to `u8::arbitrary(g)`.
    ///
    /// Parameters:
    /// - `g`: The quickcheck generator supplying the randomness.
    ///
    /// Returns:
    /// - A `Key` wrapping the freshly generated 32-byte array.
    ///
    /// Examples:
    /// ```rust
    /// use quickcheck::Arbitrary;
    /// use quickcheck::Gen;
    /// use dinoxor::chacha20::Key;
    ///
    /// fn example(g: &mut Gen) -> Key {
    ///     Key::arbitrary(g)
    /// }
    /// ```
    fn arbitrary(g: &mut Gen) -> Self {
        let mut bytes = [0u8; 32];
        bytes.iter_mut().for_each(|slot| *slot = u8::arbitrary(g));
        Key(bytes)
    }
}

impl Arbitrary for Nonce {
    /// Produces a `Nonce` whose 12 bytes are drawn, in index order, from the generator `g`.
    ///
    /// Each byte comes from its own call to `u8::arbitrary(g)` (quickcheck's
    /// `Arbitrary` implementation for `u8`).
    ///
    /// Parameters:
    /// - `g`: The quickcheck generator supplying the randomness.
    ///
    /// Returns:
    /// - A `Nonce` wrapping the freshly generated 12-byte array.
    fn arbitrary(g: &mut Gen) -> Self {
        let mut bytes = [0u8; 12];
        bytes.iter_mut().for_each(|slot| *slot = u8::arbitrary(g));
        Nonce(bytes)
    }
}

impl Arbitrary for Block {
    /// Produces a `Block` whose 64 bytes are drawn, in index order, from the generator `g`.
    ///
    /// Each byte comes from its own call to `u8::arbitrary(g)` (quickcheck's
    /// `Arbitrary` implementation for `u8`).
    ///
    /// Parameters:
    /// - `g`: The quickcheck generator supplying the randomness.
    ///
    /// Returns:
    /// - A `Block` wrapping the freshly generated 64-byte array.
    fn arbitrary(g: &mut Gen) -> Self {
        let mut bytes = [0u8; 64];
        bytes.iter_mut().for_each(|slot| *slot = u8::arbitrary(g));
        Block(bytes)
    }
}

impl ChaCha20State {
    /// Creates a new [`ChaCha20State`] instance with the provided key, nonce, and counter.
    ///
    /// The ChaCha20 state is a 4×4 matrix of 32-bit words (16 words total = 512 bits).
    /// It is initialized according to the ChaCha20 block function specification:
    ///
    /// ```text
    ///  +----+----+----+----+
    ///  | C0 | C1 | C2 | C3 |  Constants (128 bits)
    ///  +----+----+----+----+
    ///  | K0 | K1 | K2 | K3 |  Key (first 128 bits)
    ///  +----+----+----+----+
    ///  | K4 | K5 | K6 | K7 |  Key (second 128 bits)
    ///  +----+----+----+----+
    ///  | CT | N0 | N1 | N2 |  Block counter + Nonce (96 bits)
    ///  +----+----+----+----+
    /// ```
    ///
    /// ### Layout details:
    ///
    /// - **Constants (`C0..C3`)**  
    ///   Fixed 32-bit words that serve as an algorithm identifier and prevent
    ///   misuse across different stream ciphers.  
    ///   The chosen constants are the ASCII string `"expand 32-byte k"` split into
    ///   little-endian 32-bit words:
    ///   ```text
    ///   0x61707865  // "expa"
    ///   0x3320646e  // "nd 3"
    ///   0x79622d32  // "2-by"
    ///   0x6b206574  // "te k"
    ///   ```
    ///
    /// - **Key (`K0..K7`)**  
    ///   A 256-bit (32-byte) secret key, split into 8 little-endian `u32`s.
    ///   These words provide the cryptographic strength of ChaCha20.
    ///
    /// - **Counter (`CT`)**  
    ///   A 32-bit block counter.  
    ///   This ensures that each 64-byte keystream block produced by ChaCha20
    ///   is unique under a given key/nonce pair. The counter typically starts at
    ///   zero, but may be set to a different value to allow random access into
    ///   the keystream.
    ///
    /// - **Nonce (`N0..N2`)**  
    ///   A 96-bit (12-byte) unique per-message value, split into 3 little-endian `u32`s.  
    ///   The nonce prevents keystream reuse across messages under the same key,
    ///   ensuring security against replay and ciphertext-only attacks.
    ///
    /// ### Why this layout?
    /// - The constants fix the cipher definition.  
    /// - The key provides secrecy.  
    /// - The counter and nonce ensure each block of output is unique, even if
    ///   the key is reused.  
    /// - The state structure makes the quarter-round function efficient by
    ///   aligning data on 32-bit word boundaries.
    ///
    /// # Parameters
    /// - `key`: A 32-byte slice representing the encryption key.
    /// - `nonce`: A 12-byte slice representing the per-message nonce.
    /// - `counter`: A 32-bit unsigned integer block counter.
    ///
    /// # Returns
    /// - `Self`: A new [`ChaCha20State`] instance with the initialized state matrix.
    ///
    /// # Errors
    /// - None. This function will not fail as long as the key and nonce slices
    ///   have the correct lengths.
    ///
    /// # Examples
    /// ```no_run
    /// use dinoxor::chacha20::ChaCha20State;
    ///
    /// let key = [0u8; 32];
    /// let nonce = [0u8; 12];
    /// let counter = 0;
    ///
    /// let state = ChaCha20State::new(&key, &nonce, counter);
    /// ```
    pub fn new(key: &[u8; 32], nonce: &[u8; 12], counter: u32) -> Self {
        let mut state = [
            0x6170_7865,
            0x3320_646e,
            0x7962_2d32,
            0x6b20_6574, // Constants
            0,
            0,
            0,
            0, // 256-bit key
            0,
            0,
            0,
            0,
            counter,                                                      // Block counter
            u32::from_le_bytes([nonce[0], nonce[1], nonce[2], nonce[3]]), // Nonce
            u32::from_le_bytes([nonce[4], nonce[5], nonce[6], nonce[7]]),
            u32::from_le_bytes([nonce[8], nonce[9], nonce[10], nonce[11]]),
        ];

        // Split the 32-byte key into eight 32-bit words (little-endian)
        // and place them into state[4..12].
        for i in 0..8 {
            state[4 + i] =
                u32::from_le_bytes([key[4 * i], key[4 * i + 1], key[4 * i + 2], key[4 * i + 3]]);
        }

        ChaCha20State { state }
    }

    /// Handle the reset operation for a ChaCha20State.
    ///
    /// This function resets the internal state of a `ChaCha20State` with a given key, nonce,
    /// and counter. It initializes the state using constants, then populates it with the
    /// provided key, nonce, and counter values. The `from_le_bytes` method is used to
    /// convert bytes into u32 values for nonces and key components.
    ///
    /// Parameters:
    /// - `key`: A 32-byte slice representing the encryption key.
    /// - `nonce`: A 12-byte slice representing the counter and nonce value.
    /// - `counter`: The block counter to initialize with.
    ///
    /// Returns:
    /// - No return value. This function performs an in-place reset of the state.
    ///
    /// Errors:
    /// - None expected, as this function does not return an error.
    ///
    /// Notes:
    /// - The function uses four separate calls to `u32::from_le_bytes` for each part
    ///   of the nonce and key, as well as one call for `counter`.
    pub fn reset(&mut self, key: &[u8; 32], nonce: &[u8; 12], counter: u32) {
        self.state = [
            0x6170_7865,
            0x3320_646e,
            0x7962_2d32,
            0x6b20_6574, // Constants
            0,
            0,
            0,
            0, // 256-bit key
            0,
            0,
            0,
            0,
            counter,                                                      // Block counter
            u32::from_le_bytes([nonce[0], nonce[1], nonce[2], nonce[3]]), // Nonce
            u32::from_le_bytes([nonce[4], nonce[5], nonce[6], nonce[7]]),
            u32::from_le_bytes([nonce[8], nonce[9], nonce[10], nonce[11]]),
        ];

        for i in 0..8 {
            self.state[4 + i] =
                u32::from_le_bytes([key[4 * i], key[4 * i + 1], key[4 * i + 2], key[4 * i + 3]]);
        }
    }

    /// XORs `input` with ChaCha20 keystream derived from this state and writes the result
    /// to `output`.
    ///
    /// Encryption and decryption are the same operation. The state stored in `self` is not
    /// modified, so processing the produced bytes again with the same state restores the
    /// original data.
    ///
    /// ## State view (words 0..15)
    /// The state is a 4×4 matrix of `u32`, held as four NEON row vectors:
    ///
    /// ```text
    ///  x[0] = { x00, x01, x02, x03 }
    ///  x[1] = { x04, x05, x06, x07 }
    ///  x[2] = { x08, x09, x10, x11 }
    ///  x[3] = { x12, x13, x14, x15 }
    /// ```
    ///
    /// ## How one 64-byte keystream block is produced
    /// 1. The 16 state words are loaded into the four row vectors.
    /// 2. The loop runs `NUM_ROUNDS` times. Each iteration calls
    ///    `quarter_round(x, 0, 1, 2, 3)` and then `diagonal_round(x)`. Because the quarter
    ///    round works lane-wise on rows, the first call mixes the four columns
    ///    `(x00,x04,x08,x12)`, `(x01,x05,x09,x13)`, `(x02,x06,x10,x14)`, `(x03,x07,x11,x15)`.
    /// 3. Feed-forward: the block's initial rows are added back word-wise (wrapping
    ///    modulo 2³²) to the mixed rows.
    /// 4. The four resulting rows are stored as 64 bytes and XORed with the input.
    ///
    /// ## Inputs of any length
    /// The data is handled in 64-byte chunks. Chunk `k` uses the stored block counter plus
    /// `k` (wrapping at 2³²) in state word 12. A trailing chunk shorter than 64 bytes uses
    /// only the leading bytes of its keystream block. Empty input is a no-op.
    ///
    /// The counter stored in `self` is **not** advanced. Calling `process` again starts
    /// from the same counter and therefore reproduces the same keystream. Call `reset`
    /// with a different counter or nonce before processing unrelated data, because
    /// reusing a `(key, nonce, counter)` tuple for two different messages breaks
    /// confidentiality.
    ///
    /// # Parameters
    /// - `self`: The state holding constants, key, counter and nonce.
    /// - `input`: Bytes to encrypt or decrypt.
    /// - `output`: Destination buffer; must have the same length as `input`.
    ///
    /// # Returns
    /// - Nothing; the result is written into `output`.
    ///
    /// # Panics
    /// - If `input.len() != output.len()`.
    ///
    /// # Safety
    /// The function stays `unsafe` so existing callers keep compiling. All pointer
    /// requirements of the NEON loads and stores are satisfied internally. The caller only
    /// has to ensure the code runs on an AArch64 CPU with NEON/AdvSIMD.
    pub unsafe fn process(&mut self, input: &[u8], output: &mut [u8]) {
        assert_eq!(
            input.len(),
            output.len(),
            "Input and output must be the same length"
        );

        // Per-call copy of the state: only its counter word changes from block to block,
        // and `self.state` itself must stay untouched (see the docs above).
        let mut block_state = self.state;

        for (src, dst) in input.chunks(64).zip(output.chunks_mut(64)) {
            // SAFETY: `block_state` is a `[u32; 16]`; each load reads four consecutive
            // words starting at offsets 0, 4, 8 and 12, all inside the array. The pointer
            // is derived from the whole array, so it is valid for the full 16-byte read.
            let initial = unsafe {
                let base = block_state.as_ptr();
                [
                    vld1q_u32(base),
                    vld1q_u32(base.add(4)),
                    vld1q_u32(base.add(8)),
                    vld1q_u32(base.add(12)),
                ]
            };

            // Mix a working copy of the rows.
            let mut x = initial;
            for _ in 0..NUM_ROUNDS {
                self.quarter_round(&mut x, 0, 1, 2, 3);
                self.diagonal_round(&mut x);
            }

            // Feed-forward and serialisation: add the block's initial rows back and store
            // each resulting row as 16 keystream bytes.
            let mut keystream = [0u8; 64];
            for (row, bytes) in keystream.chunks_exact_mut(16).enumerate() {
                // SAFETY: `bytes` is exactly 16 writable bytes, the size of one vector store.
                unsafe {
                    let sum = vaddq_u32(x[row], initial[row]);
                    vst1q_u8(bytes.as_mut_ptr(), vreinterpretq_u8_u32(sum));
                }
            }

            // `zip` stops at the shorter side, so a partial final chunk consumes only as
            // many keystream bytes as it has data bytes.
            for ((out, &byte), &ks) in dst.iter_mut().zip(src).zip(keystream.iter()) {
                *out = byte ^ ks;
            }

            // The next 64-byte chunk uses the next block counter value.
            block_state[12] = block_state[12].wrapping_add(1);
        }
    }

    /// Runs the ChaCha20 **quarter round** (ARX: Add–Rotate–XOR) on four rows of `x`.
    ///
    /// Every operation is lane-wise on `uint32x4_t`, so one call performs four independent
    /// quarter rounds at once: lane `i` of rows `a`, `b`, `c`, `d` forms one `(a, b, c, d)`
    /// word tuple.
    ///
    /// # Algorithm (per lane)
    /// ```text
    /// a = a + b;  d ^= a;  d = rotl32(d, 16)
    /// c = c + d;  b ^= c;  b = rotl32(b, 12)
    /// a = a + b;  d ^= a;  d = rotl32(d,  8)
    /// c = c + d;  b ^= c;  b = rotl32(b,  7)
    /// ```
    /// `+` is wrapping 32-bit addition and `^` is XOR. The left rotate is composed from
    /// two shifts and an OR: `rotl32(v, n) = (v << n) | (v >> (32 - n))`.
    ///
    /// # Parameters
    /// - `x`: The working state as four row vectors.
    /// - `a`, `b`, `c`, `d`: Indices into `x` selecting the rows that play the roles of the
    ///   four quarter-round words. The block function passes `(0, 1, 2, 3)`.
    ///
    /// # Effects on state
    /// - `x[a]`, `x[b]`, `x[c]` and `x[d]` are updated in place; other rows are untouched.
    ///
    /// # Panics
    /// - If any index is `>= 4` (ordinary array bounds check).
    ///
    /// # Safety
    /// - Uses `core::arch::aarch64` intrinsics; the CPU must support NEON/AdvSIMD.
    fn quarter_round(&self, x: &mut [uint32x4_t; 4], a: usize, b: usize, c: usize, d: usize) {
        // SAFETY: only register-to-register NEON arithmetic on vectors already held in `x`;
        // no memory is accessed through raw pointers.
        unsafe {
            // a += b; d ^= a; d <<<= 16
            x[a] = vaddq_u32(x[a], x[b]);
            let mixed = veorq_u32(x[d], x[a]);
            x[d] = vorrq_u32(vshlq_n_u32::<16>(mixed), vshrq_n_u32::<16>(mixed));

            // c += d; b ^= c; b <<<= 12
            x[c] = vaddq_u32(x[c], x[d]);
            let mixed = veorq_u32(x[b], x[c]);
            x[b] = vorrq_u32(vshlq_n_u32::<12>(mixed), vshrq_n_u32::<20>(mixed));

            // a += b; d ^= a; d <<<= 8
            x[a] = vaddq_u32(x[a], x[b]);
            let mixed = veorq_u32(x[d], x[a]);
            x[d] = vorrq_u32(vshlq_n_u32::<8>(mixed), vshrq_n_u32::<24>(mixed));

            // c += d; b ^= c; b <<<= 7
            x[c] = vaddq_u32(x[c], x[d]);
            let mixed = veorq_u32(x[b], x[c]);
            x[b] = vorrq_u32(vshlq_n_u32::<7>(mixed), vshrq_n_u32::<25>(mixed));
        }
    }

    /// Applies the ChaCha20 **diagonal round** to the working state `x`.
    ///
    /// `x` stores the 4×4 state as four row vectors, and `quarter_round` operates
    /// lane-wise, so by itself it can only mix the four *columns*. To make it act on the
    /// wrapped diagonals, rows 1, 2 and 3 are first rotated left by 1, 2 and 3 lanes:
    ///
    /// ```text
    ///  before               after lane rotation
    ///  x00 x01 x02 x03      x00 x01 x02 x03
    ///  x04 x05 x06 x07      x05 x06 x07 x04
    ///  x08 x09 x10 x11      x10 x11 x08 x09
    ///  x12 x13 x14 x15      x15 x12 x13 x14
    /// ```
    ///
    /// The columns of the rotated matrix are exactly the diagonal tuples
    /// `(x00,x05,x10,x15)`, `(x01,x06,x11,x12)`, `(x02,x07,x08,x13)`, `(x03,x04,x09,x14)`,
    /// so a single `quarter_round(x, 0, 1, 2, 3)` processes all four diagonals. The rows
    /// are then rotated back so every word returns to its original matrix position.
    ///
    /// Swapping whole row registers instead of rotating lanes would not work: the quarter
    /// round would keep operating on the same columns, words in different columns would
    /// never influence each other, and the caller's feed-forward would add the original
    /// rows to the wrong working rows.
    ///
    /// # Parameters
    /// - `x`: The working state as four row vectors.
    ///
    /// # Effects on state
    /// - All four rows are updated in place. Row order and lane order are the same on exit
    ///   as on entry, so the caller can add the original state rows to `x[0]..x[3]`
    ///   directly.
    ///
    /// # Safety
    /// - Uses `core::arch::aarch64` intrinsics; the CPU must support NEON/AdvSIMD.
    fn diagonal_round(&mut self, x: &mut [uint32x4_t; 4]) {
        // SAFETY: `vextq_u32` is a register-only lane extraction; no memory is accessed.
        unsafe {
            // `vextq_u32::<N>(v, v)` rotates the four lanes of `v` left by `N` positions.
            x[1] = vextq_u32::<1>(x[1], x[1]);
            x[2] = vextq_u32::<2>(x[2], x[2]);
            x[3] = vextq_u32::<3>(x[3], x[3]);
        }

        // With the rows rotated, the columns seen by the quarter round are the diagonals.
        self.quarter_round(x, 0, 1, 2, 3);

        // SAFETY: as above, register-only lane extraction.
        unsafe {
            // Undo the rotation: a left rotate by `4 - N` lanes is the inverse of one by `N`.
            x[1] = vextq_u32::<3>(x[1], x[1]);
            x[2] = vextq_u32::<2>(x[2], x[2]);
            x[3] = vextq_u32::<1>(x[3], x[3]);
        }
    }

    /// Same operation as `process`, but combines each data byte with its keystream byte
    /// through [`dinoxor`] instead of the `^` operator.
    ///
    /// For every 64-byte chunk of `input`:
    /// 1. The state words (with the chunk's block counter in word 12) are loaded into four
    ///    NEON row vectors.
    /// 2. `NUM_ROUNDS` iterations of `quarter_round(x, 0, 1, 2, 3)` followed by
    ///    `diagonal_round(x)` are applied.
    /// 3. The chunk's initial rows are added back word-wise (wrapping) and the rows are
    ///    stored as 64 keystream bytes.
    /// 4. Each output byte is `dinoxor(input_byte, keystream_byte)`.
    ///
    /// Chunk `k` uses the stored block counter plus `k` (wrapping at 2³²). A trailing
    /// chunk shorter than 64 bytes uses only the leading bytes of its keystream block, and
    /// empty input is a no-op. The counter stored in `self` is not advanced, so a second
    /// call starts from the same counter.
    ///
    /// # Parameters
    /// - `input`: Bytes to encrypt or decrypt.
    /// - `output`: Destination buffer; must have the same length as `input`.
    ///
    /// # Panics
    /// - If `input.len() != output.len()`.
    ///
    /// # Safety
    /// The function stays `unsafe` so existing callers keep compiling. Pointer requirements
    /// of the NEON loads and stores are satisfied internally. The caller must ensure the
    /// code runs on an AArch64 CPU with NEON/AdvSIMD.
    pub unsafe fn process_with_dinoxor(&mut self, input: &[u8], output: &mut [u8]) {
        assert_eq!(
            input.len(),
            output.len(),
            "Input and output must be the same length"
        );

        // Per-call copy of the state: only its counter word changes from block to block,
        // and `self.state` itself stays untouched.
        let mut block_state = self.state;

        for (src, dst) in input.chunks(64).zip(output.chunks_mut(64)) {
            // SAFETY: `block_state` is a `[u32; 16]`; each load reads four consecutive
            // words starting at offsets 0, 4, 8 and 12, all inside the array. The pointer
            // is derived from the whole array, so it is valid for the full 16-byte read.
            let initial = unsafe {
                let base = block_state.as_ptr();
                [
                    vld1q_u32(base),
                    vld1q_u32(base.add(4)),
                    vld1q_u32(base.add(8)),
                    vld1q_u32(base.add(12)),
                ]
            };

            // Mix a working copy of the rows.
            let mut x = initial;
            for _ in 0..NUM_ROUNDS {
                self.quarter_round(&mut x, 0, 1, 2, 3);
                self.diagonal_round(&mut x);
            }

            // Feed-forward and serialisation into 64 keystream bytes.
            let mut keystream = [0u8; 64];
            for (row, bytes) in keystream.chunks_exact_mut(16).enumerate() {
                // SAFETY: `bytes` is exactly 16 writable bytes, the size of one vector store.
                unsafe {
                    let sum = vaddq_u32(x[row], initial[row]);
                    vst1q_u8(bytes.as_mut_ptr(), vreinterpretq_u8_u32(sum));
                }
            }

            // `zip` stops at the shorter side, so a partial final chunk consumes only as
            // many keystream bytes as it has data bytes.
            for ((out, &byte), &ks) in dst.iter_mut().zip(src).zip(keystream.iter()) {
                *out = dinoxor(byte, ks);
            }

            // The next 64-byte chunk uses the next block counter value.
            block_state[12] = block_state[12].wrapping_add(1);
        }
    }
}