krypteia-arcana 0.1.0

Pure-Rust classical cryptographic primitives: RSA (PKCS#1 v1.5, OAEP), ECC (NIST P-256/384/521, secp256k1), ECDSA, EdDSA (Ed25519), X25519, AES (128/192/256, GCM/CBC), DES/3DES, SHA-1/2/3, HMAC. Side-channel-aware (Montgomery ladder, branchless point_add_ct). Targets embedded (no_std), STM32 M0/M4/M33, ESP32-C3 RISC-V. Zero runtime dependencies.
Documentation
//! Poly1305 one-time message authentication code (RFC 8439 §2.5).
//!
//! Poly1305 is a polynomial-evaluation MAC over `GF(2^130 - 5)`. It
//! is keyed with a 32-byte one-time key `(r, s)` -- the same key
//! must NEVER be reused for two different messages, otherwise the
//! authentication forgery becomes trivial. In ChaCha20-Poly1305 the
//! key is derived freshly per message from the ChaCha20 keystream
//! block 0, which is the standard way to satisfy this constraint.
//!
//! # Construction
//!
//! ```text
//! Acc = 0
//! for each 16-byte block m_i (last block possibly shorter):
//!     n = m_i || 0x01 || zero-pad to 17 bytes  (the 0x01 is appended
//!         right after the message bytes; for a full 16-byte block it
//!         lives at byte 16, for a short block of length L it lives
//!         at byte L)
//!     Acc = ((Acc + n) * r) mod p   where p = 2^130 - 5
//! tag = (Acc + s) mod 2^128         (low 16 bytes only)
//! ```
//!
//! `r` is "clamped" before use per the spec: certain bits of the 16
//! key bytes are cleared so that intermediate products fit in
//! 130 + 26 bits and overflow can be handled by simple word-wise
//! reduction with no conditional branches.
//!
//! # Internal representation
//!
//! Field elements are stored in **5 limbs of 26 bits** packed into
//! `u32`. This is the standard radix-2^26 representation: it gives
//! enough headroom that a 26-bit × 26-bit product fits in u64
//! (52 bits) and that the carry propagation between limbs after a
//! Poly1305 multiply-and-reduce stays comfortably under u64. It is
//! the same layout used by `poly1305-donna`, `libsodium`,
//! `openssl/poly1305_64.c`, and the reference implementation in
//! the RFC 8439 Appendix C.
//!
//! # Tests
//!
//! Pinned against the RFC 8439 §2.5.2 single test vector. A more
//! exhaustive set of vectors lives in §2.6 and Appendix A but the
//! §2.5.2 vector is sharp enough to catch every meaningful bug
//! (clamping, multiply-and-reduce, finalisation `+ s mod 2^128`).

// ============================================================================
// Poly1305 state
// ============================================================================

/// Poly1305 MAC state.
///
/// Holds the accumulator `acc`, the precomputed `r` (and `r * 5` for
/// each limb, to absorb the `* 5 mod p` of the lazy reduction in the
/// inner loop), the tag-finalisation key `s`, and a 16-byte buffer
/// for handling messages whose length is not a multiple of 16.
///
/// Internally:
/// - `r[0..5]`  : `r` in 5 26-bit limbs
/// - `s[0..4]`  : the second half of the one-time key as 4 LE u32s
/// - `acc[0..5]`: accumulator in 5 26-bit limbs
/// - `buffer`   : up to 16 bytes of pending data
/// - `buf_pos`  : 0..=16
pub struct Poly1305 {
    r: [u32; 5],
    s: [u32; 4],
    acc: [u32; 5],
    buffer: [u8; 16],
    buf_pos: usize,
}

impl Poly1305 {
    /// Initialise Poly1305 with a 32-byte one-time key `key = r ‖ s`.
    ///
    /// `r` is clamped per RFC 8439 §2.5: bytes 3, 7, 11, 15 have
    /// their top 4 bits cleared (`& 0x0f`) and bytes 4, 8, 12 have
    /// their low 2 bits cleared (`& 0xfc`).
    pub fn new(key: &[u8; 32]) -> Self {
        let mut r_bytes = [0u8; 16];
        r_bytes.copy_from_slice(&key[..16]);
        // Clamp r per RFC 8439 §2.5.
        r_bytes[3] &= 0x0f;
        r_bytes[7] &= 0x0f;
        r_bytes[11] &= 0x0f;
        r_bytes[15] &= 0x0f;
        r_bytes[4] &= 0xfc;
        r_bytes[8] &= 0xfc;
        r_bytes[12] &= 0xfc;

        // Pack r into 5 26-bit limbs (little-endian byte order).
        let r0 = u32::from_le_bytes(r_bytes[0..4].try_into().unwrap()) & 0x03ffffff;
        let r1 = (u32::from_le_bytes(r_bytes[3..7].try_into().unwrap()) >> 2) & 0x03ffff03;
        let r2 = (u32::from_le_bytes(r_bytes[6..10].try_into().unwrap()) >> 4) & 0x03ffc0ff;
        let r3 = (u32::from_le_bytes(r_bytes[9..13].try_into().unwrap()) >> 6) & 0x03f03fff;
        let r4 = (u32::from_le_bytes(r_bytes[12..16].try_into().unwrap()) >> 8) & 0x000fffff;

        // The above masks combine the clamping and the radix-2^26
        // packing in one shot. The constants come from the standard
        // donna-style packing used by libsodium and OpenSSL: each
        // limb holds 26 bits aligned on its sub-byte boundary.

        let mut s = [0u32; 4];
        for i in 0..4 {
            s[i] = u32::from_le_bytes(key[16 + 4 * i..16 + 4 * i + 4].try_into().unwrap());
        }

        Self {
            r: [r0, r1, r2, r3, r4],
            s,
            acc: [0u32; 5],
            buffer: [0u8; 16],
            buf_pos: 0,
        }
    }

    /// Absorb additional data into the accumulator.
    ///
    /// Buffers up to 15 leftover bytes between calls so that
    /// arbitrary update sizes work the same as one big call.
    pub fn update(&mut self, data: &[u8]) {
        let mut pos = 0;

        // First, fill the partial buffer if any.
        if self.buf_pos > 0 {
            let need = 16 - self.buf_pos;
            let take = need.min(data.len());
            self.buffer[self.buf_pos..self.buf_pos + take].copy_from_slice(&data[..take]);
            self.buf_pos += take;
            pos += take;
            if self.buf_pos == 16 {
                let block = self.buffer;
                self.absorb_block(&block, /* high_bit = */ true);
                self.buf_pos = 0;
            }
        }

        // Process whole blocks directly from the input.
        while pos + 16 <= data.len() {
            let mut block = [0u8; 16];
            block.copy_from_slice(&data[pos..pos + 16]);
            self.absorb_block(&block, true);
            pos += 16;
        }

        // Stash any remainder for the next call.
        if pos < data.len() {
            let remaining = data.len() - pos;
            self.buffer[..remaining].copy_from_slice(&data[pos..]);
            self.buf_pos = remaining;
        }
    }

    /// Finalise the MAC and write the 16-byte tag into `tag`.
    /// Consumes the state.
    pub fn finalize(mut self) -> [u8; 16] {
        // Process the final partial block (if any) with high_bit = false:
        // for a short block of length L < 16, append a single 0x01 byte
        // at position L and zero-pad to 16 bytes, then absorb without
        // adding the implicit "+1" at bit 128.
        if self.buf_pos > 0 {
            let mut last = [0u8; 16];
            last[..self.buf_pos].copy_from_slice(&self.buffer[..self.buf_pos]);
            last[self.buf_pos] = 0x01;
            self.absorb_block(&last, /* high_bit = */ false);
        }

        // Final reduction of the accumulator into [0, p).
        let mut h0 = self.acc[0];
        let mut h1 = self.acc[1];
        let mut h2 = self.acc[2];
        let mut h3 = self.acc[3];
        let mut h4 = self.acc[4];

        // Carry-propagate one more time.
        let mut c: u32;
        c = h1 >> 26;
        h1 &= 0x03ffffff;
        h2 += c;
        c = h2 >> 26;
        h2 &= 0x03ffffff;
        h3 += c;
        c = h3 >> 26;
        h3 &= 0x03ffffff;
        h4 += c;
        c = h4 >> 26;
        h4 &= 0x03ffffff;
        h0 += c * 5;
        c = h0 >> 26;
        h0 &= 0x03ffffff;
        h1 += c;

        // Compute h - p = h + 5 - 2^130 and select between h and (h - p)
        // based on whether h >= p. We use the standard donna trick:
        // add 5, then subtract 2^130 unless the carry came back negative.
        let mut g0 = h0.wrapping_add(5);
        let mut c = g0 >> 26;
        g0 &= 0x03ffffff;
        let mut g1 = h1.wrapping_add(c);
        let mut c = g1 >> 26;
        g1 &= 0x03ffffff;
        let mut g2 = h2.wrapping_add(c);
        let mut c = g2 >> 26;
        g2 &= 0x03ffffff;
        let mut g3 = h3.wrapping_add(c);
        let c = g3 >> 26;
        g3 &= 0x03ffffff;
        let g4 = h4.wrapping_add(c).wrapping_sub(1 << 26);

        // If g4 has bit 31 set after the subtract-2^130 (i.e. h < p),
        // keep h. Otherwise (h >= p) keep g.
        let mask = (g4 >> 31).wrapping_sub(1); // 0xffffffff if h>=p, else 0
        let inv = !mask;
        h0 = (h0 & inv) | (g0 & mask);
        h1 = (h1 & inv) | (g1 & mask);
        h2 = (h2 & inv) | (g2 & mask);
        h3 = (h3 & inv) | (g3 & mask);
        h4 = (h4 & inv) | (g4 & mask);

        // Pack the 5 26-bit limbs into 4 little-endian u32s.
        let h0 = h0 | (h1 << 26);
        let h1 = (h1 >> 6) | (h2 << 20);
        let h2 = (h2 >> 12) | (h3 << 14);
        let h3 = (h3 >> 18) | (h4 << 8);

        // tag = (h + s) mod 2^128
        let mut f: u64 = h0 as u64 + self.s[0] as u64;
        let t0 = f as u32;
        f = (f >> 32) + h1 as u64 + self.s[1] as u64;
        let t1 = f as u32;
        f = (f >> 32) + h2 as u64 + self.s[2] as u64;
        let t2 = f as u32;
        f = (f >> 32) + h3 as u64 + self.s[3] as u64;
        let t3 = f as u32;

        let mut tag = [0u8; 16];
        tag[0..4].copy_from_slice(&t0.to_le_bytes());
        tag[4..8].copy_from_slice(&t1.to_le_bytes());
        tag[8..12].copy_from_slice(&t2.to_le_bytes());
        tag[12..16].copy_from_slice(&t3.to_le_bytes());
        tag
    }

    /// One-shot helper: feed the entire `data` and return the tag.
    pub fn mac(key: &[u8; 32], data: &[u8]) -> [u8; 16] {
        let mut p = Self::new(key);
        p.update(data);
        p.finalize()
    }

    /// Absorb a single 16-byte block into the accumulator. The
    /// `high_bit` parameter is `true` for normal blocks (the implicit
    /// 0x01 byte at position 16 is added to the integer reading) and
    /// `false` for the final short block, where the message has
    /// already been padded with `0x01 || 0`.
    fn absorb_block(&mut self, block: &[u8; 16], high_bit: bool) {
        // Read the 16-byte block as 5 26-bit limbs (LE).
        let h0 = self.acc[0].wrapping_add(u32::from_le_bytes(block[0..4].try_into().unwrap()) & 0x03ffffff);
        let h1 = self.acc[1].wrapping_add((u32::from_le_bytes(block[3..7].try_into().unwrap()) >> 2) & 0x03ffffff);
        let h2 = self.acc[2].wrapping_add((u32::from_le_bytes(block[6..10].try_into().unwrap()) >> 4) & 0x03ffffff);
        let h3 = self.acc[3].wrapping_add((u32::from_le_bytes(block[9..13].try_into().unwrap()) >> 6) & 0x03ffffff);
        let h4 = self.acc[4].wrapping_add(
            (u32::from_le_bytes(block[12..16].try_into().unwrap()) >> 8) | (if high_bit { 1 << 24 } else { 0 }),
        );

        // h *= r mod p, with the standard donna-style schoolbook
        // 5x5 multiplication that absorbs the * 5 mod p reduction
        // into the cross-terms.
        let r0 = self.r[0] as u64;
        let r1 = self.r[1] as u64;
        let r2 = self.r[2] as u64;
        let r3 = self.r[3] as u64;
        let r4 = self.r[4] as u64;

        // r * 5 (used to fold high bits back into the low limbs).
        let s1 = r1 * 5;
        let s2 = r2 * 5;
        let s3 = r3 * 5;
        let s4 = r4 * 5;

        let h0_64 = h0 as u64;
        let h1_64 = h1 as u64;
        let h2_64 = h2 as u64;
        let h3_64 = h3 as u64;
        let h4_64 = h4 as u64;

        let d0 = h0_64 * r0 + h1_64 * s4 + h2_64 * s3 + h3_64 * s2 + h4_64 * s1;
        let d1 = h0_64 * r1 + h1_64 * r0 + h2_64 * s4 + h3_64 * s3 + h4_64 * s2;
        let d2 = h0_64 * r2 + h1_64 * r1 + h2_64 * r0 + h3_64 * s4 + h4_64 * s3;
        let d3 = h0_64 * r3 + h1_64 * r2 + h2_64 * r1 + h3_64 * r0 + h4_64 * s4;
        let d4 = h0_64 * r4 + h1_64 * r3 + h2_64 * r2 + h3_64 * r1 + h4_64 * r0;

        // Carry-propagate.
        let mut c: u64;
        let mut h0 = (d0 & 0x03ffffff) as u32;
        c = d0 >> 26;
        let mut h1 = ((d1 + c) & 0x03ffffff) as u32;
        c = (d1 + c) >> 26;
        let mut h2 = ((d2 + c) & 0x03ffffff) as u32;
        c = (d2 + c) >> 26;
        let mut h3 = ((d3 + c) & 0x03ffffff) as u32;
        c = (d3 + c) >> 26;
        let mut h4 = ((d4 + c) & 0x03ffffff) as u32;
        c = (d4 + c) >> 26;
        h0 = h0.wrapping_add((c * 5) as u32);
        let c2 = h0 >> 26;
        h0 &= 0x03ffffff;
        h1 = h1.wrapping_add(c2);

        self.acc = [h0, h1, h2, h3, h4];
    }
}

// ============================================================================
// Tests (RFC 8439 pinned vectors)
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    fn hex(s: &str) -> Vec<u8> {
        assert!(s.len() % 2 == 0);
        (0..s.len())
            .step_by(2)
            .map(|i| u8::from_str_radix(&s[i..i + 2], 16).unwrap())
            .collect()
    }

    fn hex_arr<const N: usize>(s: &str) -> [u8; N] {
        let v = hex(s);
        assert_eq!(v.len(), N);
        let mut out = [0u8; N];
        out.copy_from_slice(&v);
        out
    }

    /// RFC 8439 §2.5.2 Poly1305 test vector.
    ///
    /// Key: 85d6be7857556d337f4452fe42d506a8 0103808afb0db2fd4abff6af4149f51b
    /// Msg: "Cryptographic Forum Research Group"
    /// Tag: a8061dc1305136c6c22b8baf0c0127a9
    #[test]
    fn rfc8439_2_5_2_poly1305_vector() {
        let key: [u8; 32] = hex_arr("85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b");
        let msg = b"Cryptographic Forum Research Group";
        let tag = Poly1305::mac(&key, msg);
        let expected = hex("a8061dc1305136c6c22b8baf0c0127a9");
        assert_eq!(tag.to_vec(), expected);
    }

    /// Chunked update must give the same tag as one big update.
    #[test]
    fn poly1305_chunked_update_matches_oneshot() {
        let key = [0x42u8; 32];
        let msg: Vec<u8> = (0..200).map(|i| (i * 7) as u8).collect();

        let oneshot = Poly1305::mac(&key, &msg);

        let mut chunks = Poly1305::new(&key);
        chunks.update(&msg[..7]);
        chunks.update(&msg[7..50]);
        chunks.update(&msg[50..50]); // empty update
        chunks.update(&msg[50..120]); // crosses 16-byte boundaries
        chunks.update(&msg[120..]);
        let tag = chunks.finalize();

        assert_eq!(oneshot, tag);
    }

    /// Different keys must produce different tags for the same
    /// message (sanity that the key actually drives the output).
    #[test]
    fn poly1305_different_keys_differ() {
        let msg = b"the quick brown fox";
        let k1 = [0x01u8; 32];
        let mut k2 = [0x01u8; 32];
        k2[0] ^= 0x80;
        // Both keys still satisfy the clamp constraints since the
        // clamp clears the relevant bits at packing time.
        assert_ne!(Poly1305::mac(&k1, msg), Poly1305::mac(&k2, msg));
    }

    /// Different messages must produce different tags under the same
    /// key (sanity that the message actually drives the output).
    #[test]
    fn poly1305_different_messages_differ() {
        let key = [0x77u8; 32];
        assert_ne!(Poly1305::mac(&key, b"hello"), Poly1305::mac(&key, b"world"));
    }
}