// dinoxor 0.3.0 - Docs.rs

extern crate quickcheck;

use core::arch::aarch64::*;
use quickcheck::{Arbitrary, Gen};

use crate::dinoxor::dinoxor;

/// Number of 32-bit words in the ChaCha20 state (a 4×4 matrix of `u32`).
const STATE_LEN: usize = 16;

/// Number of iterations of the round loop in the block functions.
///
/// Each iteration calls `quarter_round` followed by `diagonal_round`, i.e. it
/// stands for two rounds, so 10 iterations correspond to the 20 rounds of
/// ChaCha20. This constant is therefore a count of *double rounds*, not rounds.
const NUM_ROUNDS: usize = 10; // ChaCha20 uses 20 rounds, each function call here represents 2 rounds

/// Internal state of the ChaCha20 stream cipher implementation.
///
/// The state is built by [`ChaCha20State::new`] (or refreshed by
/// [`ChaCha20State::reset`]) from a key, a nonce and a block counter, and is
/// read by the block functions to derive keystream bytes.
pub struct ChaCha20State {
    // Word layout, as written by `new`/`reset`:
    //   [0..4]   constants
    //   [4..12]  key (eight little-endian words)
    //   [12]     block counter
    //   [13..16] nonce (three little-endian words)
    state: [u32; STATE_LEN],
}

// Newtype structs for different array sizes.
// Wrapping the raw arrays lets this module implement `quickcheck::Arbitrary`
// for each size (see the `impl Arbitrary` blocks below).

/// 32-byte value; same size as the `key` argument of [`ChaCha20State::new`].
#[derive(Clone, Debug)]
pub struct Key(pub [u8; 32]);
/// 12-byte value; same size as the `nonce` argument of [`ChaCha20State::new`].
#[derive(Clone, Debug)]
pub struct Nonce(pub [u8; 12]);
/// 64-byte value; the size of one keystream block.
#[derive(Clone, Debug)]
pub struct Block(pub [u8; 64]);

// Implement Arbitrary for each new type
impl Arbitrary for Key {
    /// Builds a `Key` whose 32 bytes are all drawn from the generator `g`.
    ///
    /// Bytes are produced one at a time, in index order, via `u8::arbitrary(g)`.
    ///
    /// # Parameters
    /// - `g`: the quickcheck generator supplying the random bytes.
    ///
    /// # Returns
    /// A `Key` wrapping the freshly generated 32-byte array.
    ///
    /// # Examples
    /// ```rust
    /// use quickcheck::Gen;
    /// use quickcheck::Arbitrary;
    /// use dinoxor::chacha20::Key;
    ///
    /// fn example(g: &mut Gen) -> Key {
    ///     Key::arbitrary(g)
    /// }
    /// ```
    fn arbitrary(g: &mut Gen) -> Self {
        // `from_fn` invokes the closure once per element, in increasing index order.
        Key(core::array::from_fn(|_| u8::arbitrary(g)))
    }
}

impl Arbitrary for Nonce {
    /// Builds a `Nonce` whose 12 bytes are all drawn from the generator `g`.
    ///
    /// Bytes are produced one at a time, in index order, via `u8::arbitrary(g)`
    /// (quickcheck's `Arbitrary` implementation for `u8`).
    ///
    /// # Parameters
    /// - `g`: the quickcheck generator supplying the random bytes.
    ///
    /// # Returns
    /// A `Nonce` wrapping the freshly generated 12-byte array.
    fn arbitrary(g: &mut Gen) -> Self {
        // `from_fn` invokes the closure once per element, in increasing index order.
        Nonce(core::array::from_fn(|_| u8::arbitrary(g)))
    }
}

impl Arbitrary for Block {
    /// Builds a `Block` whose 64 bytes are all drawn from the generator `g`.
    ///
    /// Bytes are produced one at a time, in index order, via `u8::arbitrary(g)`
    /// (quickcheck's `Arbitrary` implementation for `u8`).
    ///
    /// # Parameters
    /// - `g`: the quickcheck generator supplying the random bytes.
    ///
    /// # Returns
    /// A `Block` wrapping the freshly generated 64-byte array.
    fn arbitrary(g: &mut Gen) -> Self {
        // `from_fn` invokes the closure once per element, in increasing index order.
        Block(core::array::from_fn(|_| u8::arbitrary(g)))
    }
}

impl ChaCha20State {
    /// Creates a new [`ChaCha20State`] instance with the provided key, nonce, and counter.
    ///
    /// The ChaCha20 state is a 4×4 matrix of 32-bit words (16 words total = 512 bits).
    /// It is initialized according to the ChaCha20 block function specification:
    ///
    /// ```text
    ///  +----+----+----+----+
    ///  | C0 | C1 | C2 | C3 |  Constants (128 bits)
    ///  +----+----+----+----+
    ///  | K0 | K1 | K2 | K3 |  Key (first 128 bits)
    ///  +----+----+----+----+
    ///  | K4 | K5 | K6 | K7 |  Key (second 128 bits)
    ///  +----+----+----+----+
    ///  | CT | N0 | N1 | N2 |  Block counter + Nonce (96 bits)
    ///  +----+----+----+----+
    /// ```
    ///
    /// ### Layout details:
    ///
    /// - **Constants (`C0..C3`)**  
    ///   Fixed 32-bit words that serve as an algorithm identifier and prevent
    ///   misuse across different stream ciphers.  
    ///   The chosen constants are the ASCII string `"expand 32-byte k"` split into
    ///   little-endian 32-bit words:
    ///   ```text
    ///   0x61707865  // "expa"
    ///   0x3320646e  // "nd 3"
    ///   0x79622d32  // "2-by"
    ///   0x6b206574  // "te k"
    ///   ```
    ///
    /// - **Key (`K0..K7`)**  
    ///   A 256-bit (32-byte) secret key, split into 8 little-endian `u32`s.
    ///   These words provide the cryptographic strength of ChaCha20.
    ///
    /// - **Counter (`CT`)**  
    ///   A 32-bit block counter.  
    ///   This ensures that each 64-byte keystream block produced by ChaCha20
    ///   is unique under a given key/nonce pair. The counter typically starts at
    ///   zero, but may be set to a different value to allow random access into
    ///   the keystream.
    ///
    /// - **Nonce (`N0..N2`)**  
    ///   A 96-bit (12-byte) unique per-message value, split into 3 little-endian `u32`s.  
    ///   The nonce prevents keystream reuse across messages under the same key,
    ///   ensuring security against replay and ciphertext-only attacks.
    ///
    /// ### Why this layout?
    /// - The constants fix the cipher definition.  
    /// - The key provides secrecy.  
    /// - The counter and nonce ensure each block of output is unique, even if
    ///   the key is reused.  
    /// - The state structure makes the quarter-round function efficient by
    ///   aligning data on 32-bit word boundaries.
    ///
    /// # Parameters
    /// - `key`: A 32-byte slice representing the encryption key.
    /// - `nonce`: A 12-byte slice representing the per-message nonce.
    /// - `counter`: A 32-bit unsigned integer block counter.
    ///
    /// # Returns
    /// - `Self`: A new [`ChaCha20State`] instance with the initialized state matrix.
    ///
    /// # Errors
    /// - None. The key and nonce lengths are enforced at compile time by the
    ///   fixed-size array reference types, so this function cannot fail.
    ///
    /// # Examples
    /// ```no_run
    /// use dinoxor::chacha20::ChaCha20State;
    ///
    /// let key = [0u8; 32];
    /// let nonce = [0u8; 12];
    /// let counter = 0;
    ///
    /// let state = ChaCha20State::new(&key, &nonce, counter);
    /// ```
    pub fn new(key: &[u8; 32], nonce: &[u8; 12], counter: u32) -> Self {
        let mut state = [
            0x6170_7865,
            0x3320_646e,
            0x7962_2d32,
            0x6b20_6574, // Constants
            0,
            0,
            0,
            0, // 256-bit key
            0,
            0,
            0,
            0,
            counter,                                                      // Block counter
            u32::from_le_bytes([nonce[0], nonce[1], nonce[2], nonce[3]]), // Nonce
            u32::from_le_bytes([nonce[4], nonce[5], nonce[6], nonce[7]]),
            u32::from_le_bytes([nonce[8], nonce[9], nonce[10], nonce[11]]),
        ];

        // Split the 32-byte key into eight 32-bit words (little-endian)
        // and place them into state[4..12].
        for i in 0..8 {
            state[4 + i] =
                u32::from_le_bytes([key[4 * i], key[4 * i + 1], key[4 * i + 2], key[4 * i + 3]]);
        }

        ChaCha20State { state }
    }

    /// Handle the reset operation for a ChaCha20State.
    ///
    /// This function resets the internal state of a `ChaCha20State` with a given key, nonce,
    /// and counter. It initializes the state using constants, then populates it with the
    /// provided key, nonce, and counter values. The `from_le_bytes` method is used to
    /// convert bytes into u32 values for nonces and key components.
    ///
    /// Parameters:
    /// - `key`: A 32-byte slice representing the encryption key.
    /// - `nonce`: A 12-byte slice representing the counter and nonce value.
    /// - `counter`: The block counter to initialize with.
    ///
    /// Returns:
    /// - No return value. This function performs an in-place reset of the state.
    ///
    /// Errors:
    /// - None expected, as this function does not return an error.
    ///
    /// Notes:
    /// - The function uses four separate calls to `u32::from_le_bytes` for each part
    ///   of the nonce and key, as well as one call for `counter`.
    pub fn reset(&mut self, key: &[u8; 32], nonce: &[u8; 12], counter: u32) {
        self.state = [
            0x6170_7865,
            0x3320_646e,
            0x7962_2d32,
            0x6b20_6574, // Constants
            0,
            0,
            0,
            0, // 256-bit key
            0,
            0,
            0,
            0,
            counter,                                                      // Block counter
            u32::from_le_bytes([nonce[0], nonce[1], nonce[2], nonce[3]]), // Nonce
            u32::from_le_bytes([nonce[4], nonce[5], nonce[6], nonce[7]]),
            u32::from_le_bytes([nonce[8], nonce[9], nonce[10], nonce[11]]),
        ];

        for i in 0..8 {
            self.state[4 + i] =
                u32::from_le_bytes([key[4 * i], key[4 * i + 1], key[4 * i + 2], key[4 * i + 3]]);
        }
    }

    /// XORs `input` with a single 64-byte keystream block derived from the current state.
    ///
    /// ## Overview
    /// 1. Load the 16 state words as four NEON row vectors.
    /// 2. Run `NUM_ROUNDS` (= 10) loop iterations. Each iteration calls
    ///    `quarter_round` on rows `(0, 1, 2, 3)` (the column round) followed by
    ///    `diagonal_round`. In RFC 8439 terms one iteration is meant to be one
    ///    *double round*, so 10 iterations correspond to the 20 rounds of ChaCha20.
    /// 3. Feed-forward: add the initial state to the mixed state (word-wise,
    ///    modulo 2³²).
    /// 4. Serialize the 16 words little-endian into 64 keystream bytes and XOR
    ///    them with `input`, writing the result to `output`.
    ///
    /// ## State view (words 0..15)
    /// ```text
    ///  row 0: x00  x01  x02  x03     x[0] = {x00,x01,x02,x03}
    ///  row 1: x04  x05  x06  x07     x[1] = {x04,x05,x06,x07}
    ///  row 2: x08  x09  x10  x11     x[2] = {x08,x09,x10,x11}
    ///  row 3: x12  x13  x14  x15     x[3] = {x12,x13,x14,x15}
    /// ```
    ///
    /// ## Quarter round (QR) on `(a, b, c, d)`
    /// ```text
    /// a += b; d ^= a; d <<< 16
    /// c += d; b ^= c; b <<< 12
    /// a += b; d ^= a; d <<<  8
    /// c += d; b ^= c; b <<<  7
    /// ```
    /// (Rotations are left rotates; `+=` is `u32` addition modulo 2³².)
    ///
    /// ## Double round as specified by RFC 8439
    /// ```text
    /// Column round:   QR(x00,x04,x08,x12)  QR(x01,x05,x09,x13)
    ///                 QR(x02,x06,x10,x14)  QR(x03,x07,x11,x15)
    /// Diagonal round: QR(x00,x05,x10,x15)  QR(x01,x06,x11,x12)
    ///                 QR(x02,x07,x08,x13)  QR(x03,x04,x09,x14)
    /// ```
    /// Because each row is one vector, a single lane-wise `quarter_round` call
    /// on rows `(0, 1, 2, 3)` performs all four column QRs at once.
    ///
    /// ## Important usage notes
    /// - **One block per call**: at most the first 64 bytes of `input` are
    ///   processed. If the buffers are longer, the bytes past offset 64 of
    ///   `output` are left untouched. If they are shorter, only that many
    ///   keystream bytes are used.
    /// - **Block counter**: this function does not advance the counter stored
    ///   in the state; calling it twice without [`ChaCha20State::reset`] reuses
    ///   the same keystream. Each block must use a distinct
    ///   `(key, nonce, counter)` tuple, otherwise confidentiality is lost.
    /// - **Endianness**: keystream words are serialized little-endian.
    ///
    /// # Parameters
    /// - `self`: state holding the 16 words (constants, key, counter, nonce).
    /// - `input`: bytes to encrypt/decrypt.
    /// - `output`: destination buffer; must have the same length as `input`.
    ///
    /// # Returns
    /// - Nothing; the result is written into `output`.
    ///
    /// # Panics
    /// Panics if `input.len() != output.len()`.
    ///
    /// # Safety
    /// The function is kept `unsafe` for API compatibility. In this
    /// implementation every slice access is bounds-checked and the NEON
    /// loads/stores only touch the internal state array and a local buffer, so
    /// the remaining requirement on the caller is that the code runs on an
    /// AArch64 CPU with NEON/AdvSIMD.
    pub unsafe fn process(&mut self, input: &[u8], output: &mut [u8]) {
        assert_eq!(
            input.len(),
            output.len(),
            "Input and output must be the same length"
        );

        // Load the four rows. The pointer is derived from the whole array rather
        // than from a reference to a single element, so each 16-byte load stays
        // inside the memory the pointer is allowed to access.
        let base = self.state.as_ptr();
        // SAFETY: `self.state` holds 16 `u32`s; offsets 0, 4, 8 and 12 each leave
        // exactly four in-bounds words for a 128-bit load.
        let initial = unsafe {
            [
                vld1q_u32(base),
                vld1q_u32(base.add(4)),
                vld1q_u32(base.add(8)),
                vld1q_u32(base.add(12)),
            ]
        };
        let mut x = initial;

        // NUM_ROUNDS iterations of (column round, diagonal round).
        for _ in 0..NUM_ROUNDS {
            // Column round: lane k of the four rows is column k of the matrix.
            self.quarter_round(&mut x, 0, 1, 2, 3);
            // Second half of the double round.
            self.diagonal_round(&mut x);
        }

        // Feed-forward: add the initial state to the mixed state, word by word
        // (wrapping). The rounds on their own form an invertible permutation of
        // the state; adding the input back is what prevents an observer of the
        // keystream from running the rounds backwards to recover the key.
        let mut words = [0u32; STATE_LEN];
        for ((row, init), out) in x.iter().zip(initial.iter()).zip(words.chunks_exact_mut(4)) {
            // SAFETY: `out` is exactly four `u32`s, the size of one 128-bit store.
            unsafe { vst1q_u32(out.as_mut_ptr(), vaddq_u32(*row, *init)) };
        }

        // Serialize little-endian and XOR with the input. `zip` stops at the
        // shorter side, so buffers under 64 bytes consume only the keystream
        // they need and bytes beyond 64 are left untouched.
        let keystream = words.iter().flat_map(|word| word.to_le_bytes());
        for ((dst, src), ks) in output.iter_mut().zip(input).zip(keystream) {
            *dst = *src ^ ks;
        }
    }

    /// Performs one ChaCha20 **quarter round** over four selected words of the state (ARX: Add–Rotate–XOR).
    ///
    /// A quarter round (QR) mixes four words `(a, b, c, d)` with 4 ARX substeps.
    /// In this NEON implementation, each word is a **lane-wise** `u32` inside a 128-bit
    /// vector (`uint32x4_t`). All operations occur **per lane**, i.e., on four independent
    /// words in parallel.
    ///
    /// # Algorithm (per lane)
    /// ```text
    /// a = a + b;  d ^= a;  d = rotl32(d, 16)
    /// c = c + d;  b ^= c;  b = rotl32(b, 12)
    /// a = a + b;  d ^= a;  d = rotl32(d,  8)
    /// c = c + d;  b ^= c;  b = rotl32(b,  7)
    /// ```
    /// where `+` is addition modulo 2³² (wrapping), `^` is XOR, and `rotl32(x,n)` is a left
    /// rotate of 32-bit lanes by `n` bits.
    ///
    /// # Parameters
    /// - `x`: The working state as four NEON rows (`x[0]..x[3]`), each `uint32x4_t = [u32;4]`.
    /// - `a`, `b`, `c`, `d`: **Indices** into `x` selecting which rows to treat as the 4 words.
    ///   (When used in the block function, `(0,1,2,3)` gives the column round; the
    ///   diagonal round uses a permuted selection.)
    ///
    /// # Effects on state
    /// - Updates `x[a]`, `x[b]`, `x[c]`, `x[d]` in place, lane-wise, according to the QR.
    /// - Other rows in `x` are unchanged.
    ///
    /// # Rotations with NEON
    /// NEON lacks a direct rotate instruction for 32-bit lanes, so we synthesize it:
    /// `rotl32(v, n) = (v << n) | (v >> (32 - n))`, implemented with `vshlq_n_u32`,
    /// `vshrq_n_u32`, and `vorrq_u32`.
    ///
    /// # Safety
    /// - Uses `core::arch::aarch64` intrinsics inside an internal `unsafe` block; the function itself is a safe `fn`.
    /// - Caller must ensure AArch64 with AdvSIMD (NEON) is available.
    /// - The function is intended for internal use by the ChaCha20 rounds; it assumes
    ///   `x` holds valid state rows and that indices `a,b,c,d` are in `0..4` and distinct
    ///   in the intended call patterns.
    ///
    /// # Notes
    /// - All arithmetic is constant-time with respect to data (no data-dependent branches or lookups).
    /// - This QR is applied four times per double round (once per column / diagonal tuple).
    ///
    /// (See the block function docs for full usage.)
    fn quarter_round(&self, x: &mut [uint32x4_t; 4], a: usize, b: usize, c: usize, d: usize) {
        // Each step below is "add, xor, rotate-left" on whole rows, i.e. on four
        // 32-bit lanes at once. A left rotation by n is built from a left shift
        // by n OR-ed with a right shift by (32 - n).
        //
        // SAFETY: the intrinsics used here operate on register values only (no
        // memory access); they require NEON/AdvSIMD, which this AArch64-only
        // module assumes.
        unsafe {
            // a += b; d ^= a; d <<<= 16
            x[a] = vaddq_u32(x[a], x[b]);
            let d_mixed = veorq_u32(x[d], x[a]);
            x[d] = vorrq_u32(vshlq_n_u32::<16>(d_mixed), vshrq_n_u32::<16>(d_mixed));

            // c += d; b ^= c; b <<<= 12
            x[c] = vaddq_u32(x[c], x[d]);
            let b_mixed = veorq_u32(x[b], x[c]);
            x[b] = vorrq_u32(vshlq_n_u32::<12>(b_mixed), vshrq_n_u32::<20>(b_mixed));

            // a += b; d ^= a; d <<<= 8
            x[a] = vaddq_u32(x[a], x[b]);
            let d_mixed = veorq_u32(x[d], x[a]);
            x[d] = vorrq_u32(vshlq_n_u32::<8>(d_mixed), vshrq_n_u32::<24>(d_mixed));

            // c += d; b ^= c; b <<<= 7
            x[c] = vaddq_u32(x[c], x[d]);
            let b_mixed = veorq_u32(x[b], x[c]);
            x[b] = vorrq_u32(vshlq_n_u32::<7>(b_mixed), vshrq_n_u32::<25>(b_mixed));
        }
    }

    /// Performs the ChaCha20 **diagonal round** over the working state `x` using NEON rows.
    ///
    /// The state is held as four row vectors:
    ///
    /// ```text
    /// x[0] = {x00, x01, x02, x03}
    /// x[1] = {x04, x05, x06, x07}
    /// x[2] = {x08, x09, x10, x11}
    /// x[3] = {x12, x13, x14, x15}
    /// ```
    ///
    /// `quarter_round` works lane-wise, so lane `k` of the four rows must hold
    /// the four words of diagonal `k`. This is achieved by rotating the lanes
    /// *within* rows 1, 2 and 3 to the left by 1, 2 and 3 positions:
    ///
    /// ```text
    /// x[0] = {x00, x01, x02, x03}
    /// x[1] = {x05, x06, x07, x04}
    /// x[2] = {x10, x11, x08, x09}
    /// x[3] = {x15, x12, x13, x14}
    /// ```
    ///
    /// One lane-wise quarter round then computes
    /// `QR(x00,x05,x10,x15)`, `QR(x01,x06,x11,x12)`, `QR(x02,x07,x08,x13)` and
    /// `QR(x03,x04,x09,x14)` in parallel, which are the four diagonal quarter
    /// rounds of RFC 8439. Afterwards the lane rotations are undone so the
    /// rows are back in matrix order for the next column round and for the
    /// final feed-forward addition.
    ///
    /// # Parameters
    /// - `x`: the working state as four rows, `x[0]..x[3]`, each `uint32x4_t`.
    ///
    /// # Effects on state
    /// - Updates `x[0]..x[3]` in place with the result of the diagonal round.
    ///   On return every word is at its original matrix position.
    /// - `self` is not modified.
    ///
    /// # Notes
    /// - Permuting whole row registers is not a substitute for the lane
    ///   rotation: the quarter round never moves data between lanes, so
    ///   without rotating lanes the four matrix columns would never mix.
    fn diagonal_round(&mut self, x: &mut [uint32x4_t; 4]) {
        // Bring diagonal k into lane k: rotate row r left by r lanes.
        // `vextq_u32(v, v, n)` yields lanes {v[n], ..., v[3], v[0], ..., v[n-1]}.
        //
        // SAFETY: `vextq_u32` only permutes lanes of register values (no memory
        // access); it requires NEON/AdvSIMD, which this AArch64-only module assumes.
        unsafe {
            x[1] = vextq_u32(x[1], x[1], 1);
            x[2] = vextq_u32(x[2], x[2], 2);
            x[3] = vextq_u32(x[3], x[3], 3);
        }

        // One lane-wise quarter round now processes all four diagonals.
        self.quarter_round(x, 0, 1, 2, 3);

        // Undo the rotations (left by 4 - r) so the rows are in matrix order again.
        //
        // SAFETY: as above, register-only lane permutations.
        unsafe {
            x[1] = vextq_u32(x[1], x[1], 3);
            x[2] = vextq_u32(x[2], x[2], 2);
            x[3] = vextq_u32(x[3], x[3], 1);
        }
    }

    /// Same as [`Self::process`] but combines bytes with [`dinoxor`] instead of `^`.
    ///
    /// The keystream block is derived exactly as in `process`: load the state
    /// as four NEON rows, run `NUM_ROUNDS` iterations of `quarter_round` +
    /// `diagonal_round`, add the initial state back (feed-forward) and
    /// serialize the 16 words little-endian. Each output byte is then
    /// `dinoxor(input_byte, keystream_byte)`.
    ///
    /// At most the first 64 bytes are processed; bytes of `output` past
    /// offset 64 are left untouched, and shorter buffers use only as much
    /// keystream as they need. The block counter stored in the state is not
    /// advanced.
    ///
    /// # Parameters
    /// - `input`: bytes to encrypt/decrypt.
    /// - `output`: destination buffer; must have the same length as `input`.
    ///
    /// # Panics
    /// Panics if `input.len() != output.len()`.
    ///
    /// # Safety
    /// The function is kept `unsafe` for API compatibility. The NEON
    /// loads/stores in this function only touch the internal state array and a
    /// local buffer, and all slice accesses are bounds-checked. The caller
    /// must run on an AArch64 CPU with NEON/AdvSIMD. Any additional
    /// requirements of `dinoxor` itself are not visible from this function.
    pub unsafe fn process_with_dinoxor(&mut self, input: &[u8], output: &mut [u8]) {
        assert_eq!(
            input.len(),
            output.len(),
            "Input and output must be the same length"
        );

        // Derive the pointer from the whole array (not from a reference to one
        // element) so each 16-byte load stays within what the pointer may access.
        let base = self.state.as_ptr();
        // SAFETY: `self.state` holds 16 `u32`s; offsets 0, 4, 8 and 12 each leave
        // exactly four in-bounds words for a 128-bit load.
        let initial = unsafe {
            [
                vld1q_u32(base),
                vld1q_u32(base.add(4)),
                vld1q_u32(base.add(8)),
                vld1q_u32(base.add(12)),
            ]
        };
        let mut x = initial;

        for _ in 0..NUM_ROUNDS {
            self.quarter_round(&mut x, 0, 1, 2, 3);
            self.diagonal_round(&mut x);
        }

        // Feed-forward: add the initial state to the mixed state (wrapping).
        let mut words = [0u32; STATE_LEN];
        for ((row, init), out) in x.iter().zip(initial.iter()).zip(words.chunks_exact_mut(4)) {
            // SAFETY: `out` is exactly four `u32`s, the size of one 128-bit store.
            unsafe { vst1q_u32(out.as_mut_ptr(), vaddq_u32(*row, *init)) };
        }

        // Serialize little-endian and combine with the input. `zip` stops at
        // the shorter side, so short buffers no longer index out of bounds.
        let keystream = words.iter().flat_map(|word| word.to_le_bytes());
        for ((dst, src), ks) in output.iter_mut().zip(input).zip(keystream) {
            *dst = dinoxor(*src, ks);
        }
    }
}