noxtls-crypto 0.2.10

// Copyright (c) 2019-2026, Argenox Technologies LLC
// All rights reserved.
//
// SPDX-License-Identifier: GPL-2.0-only OR LicenseRef-Argenox-Commercial-License
//
// This file is part of the NoxTLS Library.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by the
// Free Software Foundation; version 2 of the License.
//
// Alternatively, this file may be used under the terms of a commercial
// license from Argenox Technologies LLC.
//
// See `noxtls/LICENSE` and `noxtls/LICENSE.md` in this repository for full details.
// CONTACT: info@argenox.com

use crate::drbg::HmacDrbgSha256;
use noxtls_core::{Error, Result};

/// Mask selecting the low 25 bits of a value; used when unpacking the 25-bit limbs.
const MASK25: i64 = (1_i64 << 25) - 1;
/// Mask selecting the low 26 bits of a value; used when unpacking the 26-bit limbs.
const MASK26: i64 = (1_i64 << 26) - 1;
/// Limb-wise representation of `2p`, where `p = 2^255 - 19`.
///
/// Each entry is twice the matching limb of `p` (`0x3ffffed` for limb 0, then
/// alternating `0x1ffffff` / `0x3ffffff`). `FieldElement::sub` and
/// `FieldElement::sub_lazy` add this bias to the minuend before subtracting.
const TWO_P_TIMES_RADIX: [i32; 10] = [
    0x7ffffda, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe,
    0x7fffffe, 0x3fffffe,
];

/// Represents an X25519 private scalar.
///
/// The wrapped bytes are secret key material, so two trait implementations are
/// written by hand instead of derived:
///
/// * `Debug` prints a placeholder rather than the scalar bytes, so the key
///   cannot leak through logs or panic messages.
/// * `PartialEq` inspects all 32 bytes before deciding, instead of stopping at
///   the first mismatch.
#[derive(Clone, Eq)]
pub struct X25519PrivateKey {
    scalar: [u8; 32],
}

impl core::fmt::Debug for X25519PrivateKey {
    /// Formats the key without revealing any scalar byte.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("X25519PrivateKey")
            .field("scalar", &"<redacted>")
            .finish()
    }
}

impl PartialEq for X25519PrivateKey {
    /// Compares two keys by OR-folding the XOR of every byte pair, so the amount
    /// of work does not depend on where the first difference sits.
    fn eq(&self, other: &Self) -> bool {
        let difference = self
            .scalar
            .iter()
            .zip(other.scalar.iter())
            .fold(0_u8, |acc, (lhs, rhs)| acc | (lhs ^ rhs));
        difference == 0
    }
}

/// Represents an X25519 public key (Montgomery u-coordinate).
///
/// The type is `Copy`, and its methods take it by value. Nothing is validated
/// on construction; use [`X25519PublicKey::validate`] before trusting a
/// peer-supplied value.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct X25519PublicKey {
    /// Raw 32-byte encoding of the u-coordinate, exactly as supplied by the caller.
    pub bytes: [u8; 32],
}

impl X25519PrivateKey {
    /// Wraps caller-supplied scalar bytes in a private key.
    ///
    /// # Arguments
    /// * `bytes`: 32-byte private scalar. It is stored unchanged; clamping is
    ///   applied only when the scalar is used.
    ///
    /// # Returns
    /// `X25519PrivateKey` owning the given scalar bytes.
    #[must_use]
    pub fn from_bytes(bytes: [u8; 32]) -> Self {
        let scalar = bytes;
        Self { scalar }
    }

    /// Returns the raw 32-byte private scalar bytes.
    ///
    /// # Arguments
    /// * `self`: Private key whose scalar bytes should be copied.
    ///
    /// # Returns
    /// Raw private scalar octets as stored in this key.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    pub fn to_bytes(&self) -> [u8; 32] {
        self.scalar
    }

    /// Clears private scalar bytes in place.
    ///
    /// # Arguments
    /// * `self` — Private key whose scalar buffer is scrubbed.
    ///
    /// # Returns
    /// `()`; all scalar bytes are reset to zero.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    pub fn clear(&mut self) {
        self.scalar.fill(0);
        // A plain store to memory that is never read again (for example when this
        // runs from `Drop`) is a dead store the optimizer is allowed to remove.
        // Passing the buffer through `black_box` makes the zeroed bytes look
        // observed. This is a best-effort barrier, not a formal guarantee.
        let _ = core::hint::black_box(&mut self.scalar);
    }

    /// Produces the clamped form of the stored scalar.
    ///
    /// # Arguments
    /// * `self`: Private key whose scalar is copied and passed to `clamp_scalar`.
    ///
    /// # Returns
    /// RFC 7748-clamped scalar bytes; the stored scalar itself is not modified.
    #[must_use]
    pub fn clamped_scalar(&self) -> [u8; 32] {
        let raw = self.scalar;
        clamp_scalar(raw)
    }

    /// Derives the public key that matches this private key.
    ///
    /// # Arguments
    /// * `self`: Private key multiplied with the Curve25519 basepoint.
    ///
    /// # Returns
    /// Public key holding the output of [`noxtls_x25519_basepoint`].
    #[must_use]
    pub fn public_key(&self) -> X25519PublicKey {
        let bytes = noxtls_x25519_basepoint(&self.scalar);
        X25519PublicKey { bytes }
    }

    /// Performs ECDH with a peer public key and returns shared secret bytes.
    ///
    /// # Arguments
    /// * `self`: Local private key.
    /// * `peer`: Peer public key bytes.
    ///
    /// # Returns
    /// 32-byte shared secret result from X25519 scalar multiplication.
    #[must_use]
    pub fn diffie_hellman(&self, peer: X25519PublicKey) -> [u8; 32] {
        noxtls_x25519(&self.scalar, &peer.bytes)
    }

    /// Performs checked ECDH and rejects invalid/weak peer keys and zero shared outputs.
    ///
    /// # Arguments
    /// * `self`: Local private key.
    /// * `peer`: Peer public key to validate and use.
    ///
    /// # Returns
    /// Shared secret when peer validation succeeds and output is non-zero.
    ///
    /// # Errors
    ///
    /// Returns the same errors as [`X25519PublicKey::validate`], or [`Error::CryptoFailure`] when the derived shared secret is all-zero.
    pub fn diffie_hellman_checked(&self, peer: X25519PublicKey) -> Result<[u8; 32]> {
        peer.validate()?;
        let shared = self.diffie_hellman(peer);
        if is_all_zero(&shared) {
            return Err(Error::CryptoFailure(
                "noxtls_x25519 shared secret is all-zero",
            ));
        }
        Ok(shared)
    }
}

/// Scrubs the private scalar when a key goes out of scope.
impl Drop for X25519PrivateKey {
    /// Delegates to [`X25519PrivateKey::clear`] so dropping and explicit clearing
    /// share one code path.
    fn drop(&mut self) {
        Self::clear(self);
    }
}

impl X25519PublicKey {
    /// Creates a public key from raw bytes.
    ///
    /// # Arguments
    /// * `bytes`: Raw 32-byte Montgomery u-coordinate.
    ///
    /// # Returns
    /// `X25519PublicKey` wrapping the provided bytes.
    #[must_use]
    pub fn from_bytes(bytes: [u8; 32]) -> Self {
        Self { bytes }
    }

    /// Returns true when raw public-key bytes are all-zero.
    ///
    /// # Arguments
    /// * `self`: Public key bytes to inspect.
    ///
    /// # Returns
    /// `true` when all 32 bytes are zero.
    #[must_use]
    pub fn is_all_zero(self) -> bool {
        is_all_zero(&self.bytes)
    }

    /// Validates peer public key for baseline X25519 safety checks.
    ///
    /// # Arguments
    /// * `self`: Peer public key candidate to validate.
    ///
    /// # Returns
    /// `Ok(())` when key is not one of the rejected low-order encodings.
    ///
    /// # Errors
    ///
    /// Returns [`Error::CryptoFailure`] when the RFC 7748 masked u-coordinate is all-zero or equal to one (low-order points).
    pub fn validate(self) -> Result<()> {
        let masked = self.masked_u_coordinate();
        if is_all_zero(&masked) {
            return Err(Error::CryptoFailure(
                "noxtls_x25519 peer public key is low-order (masked zero)",
            ));
        }
        if is_montgomery_u_one(&masked) {
            return Err(Error::CryptoFailure(
                "noxtls_x25519 peer public key is low-order (u=1)",
            ));
        }
        Ok(())
    }

    /// Returns the peer u-coordinate with RFC 7748 high-bit masking applied.
    ///
    /// # Arguments
    ///
    /// * `self` — Public key bytes to copy and mask.
    ///
    /// # Returns
    ///
    /// 32-byte masked Montgomery u-coordinate.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    fn masked_u_coordinate(self) -> [u8; 32] {
        let mut masked = self.bytes;
        masked[31] &= 0x7f;
        masked
    }
}

/// Computes X25519 scalar multiplication over arbitrary u-coordinate.
///
/// Runs a Montgomery ladder over scalar bits 254 down to 0 in projective
/// `(x : z)` coordinates, then converts to affine form with one field
/// inversion. When the masked input equals the basepoint encoding, the work is
/// delegated to `noxtls_x25519_basepoint_ladder`.
///
/// # Arguments
/// * `scalar`: Private scalar bytes (clamped internally).
/// * `u`: Peer Montgomery u-coordinate bytes; the top bit of byte 31 is ignored.
///
/// # Returns
/// 32-byte X25519 scalar multiplication output.
#[must_use]
pub fn noxtls_x25519(scalar: &[u8; 32], u: &[u8; 32]) -> [u8; 32] {
    let k = clamp_scalar(*scalar);
    // Work on a copy of `u` with the top bit of the last byte cleared.
    let mut u_masked = *u;
    u_masked[31] &= 0x7f;
    // This branch depends only on the public u-coordinate, never on the scalar.
    if is_montgomery_basepoint(&u_masked) {
        return noxtls_x25519_basepoint_ladder(&k);
    }
    let x1 = FieldElement::from_bytes(&u_masked);

    // Ladder state: (x2 : z2) starts at (1 : 0) and (x3 : z3) at (u : 1).
    let mut x2 = FieldElement::one();
    let mut z2 = FieldElement::zero();
    let mut x3 = x1;
    let mut z3 = FieldElement::one();
    // Holds the previous scalar bit so that the "swap back" of one iteration and
    // the "swap" of the next collapse into a single conditional swap.
    let mut swap = 0_u8;

    for t in (0..255).rev() {
        // Bit `t` of the clamped scalar (byte `t / 8`, bit `t & 7`).
        let k_t = (k[t / 8] >> (t & 7)) & 1;
        swap ^= k_t;
        // Mask-based swap; there is no branch on the scalar bit.
        FieldElement::cswap(&mut x2, &mut x3, swap);
        FieldElement::cswap(&mut z2, &mut z3, swap);
        swap = k_t;

        // Combined doubling and differential addition. The `_lazy` results are
        // unreduced and are consumed only by `mul`/`square`/`mul_121665`, all of
        // which reduce their output.
        let a = x2.add_lazy(&z2);
        let aa = a.square();
        let b = x2.sub_lazy(&z2);
        let bb = b.square();
        let e = aa.sub_lazy(&bb);
        let c = x3.add_lazy(&z3);
        let d = x3.sub_lazy(&z3);
        let da = d.mul(&a);
        let cb = c.mul(&b);
        x3 = da.add_lazy(&cb).square();
        // The difference term is scaled by the input u-coordinate.
        z3 = x1.mul(&da.sub_lazy(&cb).square());
        x2 = aa.mul(&bb);
        // z2 = E * (AA + 121665 * E); 121665 is the `a24` constant of RFC 7748.
        z2 = e.mul(&aa.add_lazy(&e.mul_121665()));
    }

    // Undo the swap left pending by the last processed bit.
    FieldElement::cswap(&mut x2, &mut x3, swap);
    FieldElement::cswap(&mut z2, &mut z3, swap);

    // Affine result x2 / z2. `invert` is a plain exponentiation, so z2 == 0
    // yields 0 rather than an error.
    x2.mul(&z2.invert()).to_bytes()
}

/// Montgomery ladder specialised for the basepoint `u = 9`.
///
/// Same structure as the generic ladder in `noxtls_x25519`, except that the
/// multiplication by the input u-coordinate is replaced by `mul_9`.
///
/// # Arguments
/// * `clamped_scalar`: Scalar bytes used as-is; this function does not clamp,
///   so the caller must pass an already clamped scalar.
///
/// # Returns
/// 32-byte encoding of the resulting u-coordinate.
#[must_use]
fn noxtls_x25519_basepoint_ladder(clamped_scalar: &[u8; 32]) -> [u8; 32] {
    // Ladder state: (x2 : z2) starts at (1 : 0) and (x3 : z3) at (9 : 1).
    let mut x2 = FieldElement::one();
    let mut z2 = FieldElement::zero();
    let mut x3 = FieldElement::from_u32(9);
    let mut z3 = FieldElement::one();
    // Previous scalar bit; lets consecutive swaps merge into one per iteration.
    let mut swap = 0_u8;

    for t in (0..255).rev() {
        // Bit `t` of the scalar (byte `t / 8`, bit `t & 7`).
        let k_t = (clamped_scalar[t / 8] >> (t & 7)) & 1;
        swap ^= k_t;
        FieldElement::cswap(&mut x2, &mut x3, swap);
        FieldElement::cswap(&mut z2, &mut z3, swap);
        swap = k_t;

        // Unreduced `_lazy` results are consumed only by operations that reduce.
        let a = x2.add_lazy(&z2);
        let aa = a.square();
        let b = x2.sub_lazy(&z2);
        let bb = b.square();
        let e = aa.sub_lazy(&bb);
        let c = x3.add_lazy(&z3);
        let d = x3.sub_lazy(&z3);
        let da = d.mul(&a);
        let cb = c.mul(&b);
        x3 = da.add_lazy(&cb).square();
        // Multiplying by the basepoint u-coordinate is a small-constant multiply.
        z3 = da.sub_lazy(&cb).square().mul_9();
        x2 = aa.mul(&bb);
        // z2 = E * (AA + 121665 * E).
        z2 = e.mul(&aa.add_lazy(&e.mul_121665()));
    }

    // Undo the swap left pending by the last processed bit.
    FieldElement::cswap(&mut x2, &mut x3, swap);
    FieldElement::cswap(&mut z2, &mut z3, swap);

    // Affine result x2 / z2.
    x2.mul(&z2.invert()).to_bytes()
}

/// Multiplies the standard Curve25519 basepoint by a scalar.
///
/// # Arguments
/// * `scalar`: Private scalar bytes (clamped internally by [`noxtls_x25519`]).
///
/// # Returns
/// 32-byte public key u-coordinate for the Curve25519 basepoint.
#[must_use]
pub fn noxtls_x25519_basepoint(scalar: &[u8; 32]) -> [u8; 32] {
    // Encoding of u = 9: first byte 9, remaining 31 bytes zero.
    const BASEPOINT_U: [u8; 32] = {
        let mut encoded = [0_u8; 32];
        encoded[0] = 9;
        encoded
    };
    noxtls_x25519(scalar, &BASEPOINT_U)
}

/// Computes X25519 shared secret and validates non-zero output.
///
/// # Arguments
/// * `private_key`: Local private key used for key agreement.
/// * `peer_public_key`: Peer public key to validate and use.
///
/// # Returns
/// Shared secret when peer key and output pass safety checks.
///
/// # Errors
///
/// Forwards errors from [`X25519PrivateKey::diffie_hellman_checked`].
pub fn noxtls_x25519_shared_secret(
    private_key: X25519PrivateKey,
    peer_public_key: X25519PublicKey,
) -> Result<[u8; 32]> {
    private_key.diffie_hellman_checked(peer_public_key)
}

/// Generates an X25519 private key from DRBG output.
///
/// # Arguments
/// * `drbg`: DRBG instance used to fill private scalar bytes.
///
/// # Returns
/// X25519 private key containing DRBG-derived scalar bytes.
///
/// # Errors
///
/// Returns DRBG errors from [`HmacDrbgSha256::generate`], or [`Error::InvalidLength`] if the DRBG output is not exactly 32 bytes.
pub fn noxtls_x25519_generate_private_key_auto(
    drbg: &mut HmacDrbgSha256,
) -> Result<X25519PrivateKey> {
    let scalar = drbg.generate(32, b"x25519_private_scalar")?;
    let bytes: [u8; 32] = scalar
        .as_slice()
        .try_into()
        .map_err(|_| Error::InvalidLength("noxtls_x25519 private scalar length mismatch"))?;
    Ok(X25519PrivateKey::from_bytes(bytes))
}

/// Element of the field modulo `p = 2^255 - 19`, stored as ten signed 32-bit limbs.
///
/// Limb widths alternate 26, 25, 26, 25, ... bits starting with limb 0. Limbs
/// may be negative or slightly out of range between operations; the derived
/// `PartialEq` compares limbs, not field values, so two different limb arrays
/// can represent the same element.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct FieldElement([i32; 10]);

impl FieldElement {
    /// Builds the field element zero.
    ///
    /// # Returns
    ///
    /// Field element whose ten limbs are all zero.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    fn zero() -> Self {
        Self([0_i32; 10])
    }

    /// Builds the field element one.
    ///
    /// # Returns
    ///
    /// Field element with limb 0 set to `1` and every other limb zero.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    fn one() -> Self {
        let mut limbs = [0_i32; 10];
        limbs[0] = 1;
        Self(limbs)
    }

    /// Builds a field element from a small integer placed in limb 0.
    ///
    /// # Arguments
    ///
    /// * `value` — Value stored in the lowest limb via an `as i32` cast. No
    ///   carry is performed, so it should fit the 26-bit limb; the only caller in
    ///   this file passes `9`.
    ///
    /// # Returns
    ///
    /// Field element with limb 0 equal to `value` and all other limbs zero.
    #[must_use]
    #[inline(always)]
    fn from_u32(value: u32) -> Self {
        let mut limbs = [0_i32; 10];
        limbs[0] = value as i32;
        Self(limbs)
    }

    /// Unpacks a 32-byte encoding into ten alternating 26/25-bit limbs.
    ///
    /// Each limb is taken from a four-byte window returned by `load4`, shifted
    /// down to the limb's first bit and masked to the limb width. No carry
    /// propagation or modular reduction takes place here.
    ///
    /// # Arguments
    ///
    /// * `input` — Canonical or non-canonical 32-byte field element encoding.
    ///
    /// # Returns
    ///
    /// Field element whose limbs are the masked bit fields of `input`.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    fn from_bytes(input: &[u8; 32]) -> Self {
        // (byte offset of the load window, right shift, limb mask) for limbs 0..=9.
        const LAYOUT: [(usize, u32, i64); 10] = [
            (0, 0, MASK26),
            (3, 2, MASK25),
            (6, 3, MASK26),
            (9, 5, MASK25),
            (12, 6, MASK26),
            (16, 0, MASK25),
            (19, 1, MASK26),
            (22, 3, MASK25),
            (25, 4, MASK26),
            (28, 6, MASK25),
        ];

        let mut limbs = [0_i32; 10];
        for (limb, &(offset, shift, mask)) in limbs.iter_mut().zip(LAYOUT.iter()) {
            *limb = (((load4(input, offset) as i64) >> shift) & mask) as i32;
        }
        Self(limbs)
    }

    /// Encodes a normalized field element into a little-endian 32-byte string.
    ///
    /// The limbs are first brought into canonical range (a multiple of `p` is
    /// removed and carries are propagated), then packed bit-contiguously into
    /// 32 bytes.
    ///
    /// # Arguments
    ///
    /// * `self` — Field element to normalize and encode.
    ///
    /// # Returns
    ///
    /// Canonical 32-byte encoding suitable for wire formats.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    fn to_bytes(self) -> [u8; 32] {
        // Widen to i64 so the carry arithmetic below has headroom.
        let mut h0 = i64::from(self.0[0]);
        let mut h1 = i64::from(self.0[1]);
        let mut h2 = i64::from(self.0[2]);
        let mut h3 = i64::from(self.0[3]);
        let mut h4 = i64::from(self.0[4]);
        let mut h5 = i64::from(self.0[5]);
        let mut h6 = i64::from(self.0[6]);
        let mut h7 = i64::from(self.0[7]);
        let mut h8 = i64::from(self.0[8]);
        let mut h9 = i64::from(self.0[9]);

        // Trial carry chain: ripple a carry through every limb without modifying
        // the limbs. The final `q` is the multiple of p removed below.
        // NOTE(review): this relies on the limbs being bounded as produced by
        // `reduce_coefficients` — confirm for any new caller.
        let mut q = (19 * h9 + (1 << 24)) >> 25;
        q = (h0 + q) >> 26;
        q = (h1 + q) >> 25;
        q = (h2 + q) >> 26;
        q = (h3 + q) >> 25;
        q = (h4 + q) >> 26;
        q = (h5 + q) >> 25;
        q = (h6 + q) >> 26;
        q = (h7 + q) >> 25;
        q = (h8 + q) >> 26;
        q = (h9 + q) >> 25;

        // Adding 19*q here and discarding the carry out of limb 9 further down
        // (worth 2^255 * carry9) together subtract q * (2^255 - 19) = q * p.
        h0 += 19 * q;

        // Exact (floor) carry propagation; afterwards every limb is
        // non-negative and within its 26- or 25-bit width.
        let carry0 = h0 >> 26;
        h1 += carry0;
        h0 -= carry0 << 26;
        let carry1 = h1 >> 25;
        h2 += carry1;
        h1 -= carry1 << 25;
        let carry2 = h2 >> 26;
        h3 += carry2;
        h2 -= carry2 << 26;
        let carry3 = h3 >> 25;
        h4 += carry3;
        h3 -= carry3 << 25;
        let carry4 = h4 >> 26;
        h5 += carry4;
        h4 -= carry4 << 26;
        let carry5 = h5 >> 25;
        h6 += carry5;
        h5 -= carry5 << 25;
        let carry6 = h6 >> 26;
        h7 += carry6;
        h6 -= carry6 << 26;
        let carry7 = h7 >> 25;
        h8 += carry7;
        h7 -= carry7 << 25;
        let carry8 = h8 >> 26;
        h9 += carry8;
        h8 -= carry8 << 26;
        // The carry out of the top limb is dropped on purpose (see above).
        let carry9 = h9 >> 25;
        h9 -= carry9 << 25;

        // Pack the limbs back to back. Bytes that straddle two limbs OR the top
        // bits of one limb with the low bits of the next; `as u8` keeps the low
        // eight bits.
        [
            h0 as u8,
            (h0 >> 8) as u8,
            (h0 >> 16) as u8,
            ((h0 >> 24) | (h1 << 2)) as u8,
            (h1 >> 6) as u8,
            (h1 >> 14) as u8,
            ((h1 >> 22) | (h2 << 3)) as u8,
            (h2 >> 5) as u8,
            (h2 >> 13) as u8,
            ((h2 >> 21) | (h3 << 5)) as u8,
            (h3 >> 3) as u8,
            (h3 >> 11) as u8,
            ((h3 >> 19) | (h4 << 6)) as u8,
            (h4 >> 2) as u8,
            (h4 >> 10) as u8,
            (h4 >> 18) as u8,
            h5 as u8,
            (h5 >> 8) as u8,
            (h5 >> 16) as u8,
            ((h5 >> 24) | (h6 << 1)) as u8,
            (h6 >> 7) as u8,
            (h6 >> 15) as u8,
            ((h6 >> 23) | (h7 << 3)) as u8,
            (h7 >> 5) as u8,
            (h7 >> 13) as u8,
            ((h7 >> 21) | (h8 << 4)) as u8,
            (h8 >> 4) as u8,
            (h8 >> 12) as u8,
            ((h8 >> 20) | (h9 << 6)) as u8,
            (h9 >> 2) as u8,
            (h9 >> 10) as u8,
            (h9 >> 18) as u8,
        ]
    }

    /// Adds two field elements limb by limb, then runs a carry pass.
    ///
    /// # Arguments
    ///
    /// * `self` — Left operand.
    /// * `rhs` — Right operand.
    ///
    /// # Returns
    ///
    /// Sum modulo \\(2^{255} - 19\\) in limb form, after [`Self::carry_reduce`].
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    #[allow(dead_code)]
    fn add(&self, rhs: &Self) -> Self {
        let mut limbs = self.0;
        for (acc, addend) in limbs.iter_mut().zip(rhs.0.iter()) {
            *acc = acc.wrapping_add(*addend);
        }
        Self(limbs).carry_reduce()
    }

    /// Adds two field elements limb by limb and skips the carry pass.
    ///
    /// Meant for Montgomery ladder intermediates that go straight into a
    /// multiplication or squaring, which reduce their own output.
    #[must_use]
    #[inline(always)]
    fn add_lazy(&self, rhs: &Self) -> Self {
        let mut limbs = self.0;
        for (acc, addend) in limbs.iter_mut().zip(rhs.0.iter()) {
            *acc = acc.wrapping_add(*addend);
        }
        Self(limbs)
    }

    /// Computes `self - rhs` modulo \\(p = 2^{255} - 19\\), then runs a carry pass.
    ///
    /// # Arguments
    ///
    /// * `self` — Minuend.
    /// * `rhs` — Subtrahend.
    ///
    /// # Returns
    ///
    /// Difference after adding the matching limb of `TWO_P_TIMES_RADIX` (a
    /// multiple of \\(p\\)) to each limb before subtracting, then reducing with
    /// [`Self::carry_reduce`].
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    #[allow(dead_code)]
    fn sub(&self, rhs: &Self) -> Self {
        let mut limbs = self.0;
        let biased = limbs.iter_mut().zip(TWO_P_TIMES_RADIX.iter());
        for ((acc, bias), subtrahend) in biased.zip(rhs.0.iter()) {
            *acc = acc.wrapping_add(*bias).wrapping_sub(*subtrahend);
        }
        Self(limbs).carry_reduce()
    }

    /// Computes `self - rhs` limb by limb and skips the carry pass.
    ///
    /// Uses the same `TWO_P_TIMES_RADIX` bias as [`Self::sub`], and leaves
    /// carry propagation to the multiply or square that consumes the result.
    #[must_use]
    #[inline(always)]
    fn sub_lazy(&self, rhs: &Self) -> Self {
        let mut limbs = self.0;
        let biased = limbs.iter_mut().zip(TWO_P_TIMES_RADIX.iter());
        for ((acc, bias), subtrahend) in biased.zip(rhs.0.iter()) {
            *acc = acc.wrapping_add(*bias).wrapping_sub(*subtrahend);
        }
        Self(limbs)
    }

    /// Multiplies a field element by the constant `121665` and reduces the result.
    ///
    /// # Arguments
    ///
    /// * `self` — Field element to scale. The multiplier is fixed; this method
    ///   takes no other argument.
    ///
    /// # Returns
    ///
    /// Product after widening each limb to `i64`, scaling it, and reducing with
    /// `reduce_coefficients`.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    #[inline(always)]
    fn mul_121665(&self) -> Self {
        const FACTOR: i64 = 121665;
        let scaled = self.0.map(|limb| i64::from(limb) * FACTOR);
        Self(reduce_coefficients(scaled))
    }

    /// Multiplies a field element by the constant `9` and reduces the result.
    ///
    /// Used by the basepoint ladder in place of a full multiplication by the
    /// basepoint u-coordinate.
    ///
    /// # Returns
    ///
    /// Product after widening each limb to `i64`, scaling it, and reducing with
    /// `reduce_coefficients`.
    #[must_use]
    #[inline(always)]
    fn mul_9(&self) -> Self {
        let scaled = self.0.map(|limb| i64::from(limb) * 9);
        Self(reduce_coefficients(scaled))
    }

    /// Multiplies two field elements using 64-bit products over 25/26-bit limbs.
    ///
    /// Fully unrolled schoolbook multiplication: every output coefficient `hk`
    /// collects the ten products `fi * gj` with `i + j ≡ k (mod 10)`, and the
    /// ten coefficients are then carried by `reduce_coefficients`.
    ///
    /// # Arguments
    ///
    /// * `self` — Left operand.
    /// * `rhs` — Right operand.
    ///
    /// # Returns
    ///
    /// Product after schoolbook multiplication, carry propagation, and reduction.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    #[inline(always)]
    fn mul(&self, rhs: &Self) -> Self {
        // Widen both operands so each limb product fits in an i64.
        let f0 = i64::from(self.0[0]);
        let f1 = i64::from(self.0[1]);
        let f2 = i64::from(self.0[2]);
        let f3 = i64::from(self.0[3]);
        let f4 = i64::from(self.0[4]);
        let f5 = i64::from(self.0[5]);
        let f6 = i64::from(self.0[6]);
        let f7 = i64::from(self.0[7]);
        let f8 = i64::from(self.0[8]);
        let f9 = i64::from(self.0[9]);

        let g0 = i64::from(rhs.0[0]);
        let g1 = i64::from(rhs.0[1]);
        let g2 = i64::from(rhs.0[2]);
        let g3 = i64::from(rhs.0[3]);
        let g4 = i64::from(rhs.0[4]);
        let g5 = i64::from(rhs.0[5]);
        let g6 = i64::from(rhs.0[6]);
        let g7 = i64::from(rhs.0[7]);
        let g8 = i64::from(rhs.0[8]);
        let g9 = i64::from(rhs.0[9]);

        // Products whose limb indices sum to 10 or more wrap around modulo p;
        // since 2^255 ≡ 19 (mod p) they are folded back with a factor of 19.
        let g1_19 = 19 * g1;
        let g2_19 = 19 * g2;
        let g3_19 = 19 * g3;
        let g4_19 = 19 * g4;
        let g5_19 = 19 * g5;
        let g6_19 = 19 * g6;
        let g7_19 = 19 * g7;
        let g8_19 = 19 * g8;
        let g9_19 = 19 * g9;

        // With alternating 26/25-bit limbs, the product of two odd-indexed limbs
        // lands one bit above the start of its target limb, so those terms carry
        // an extra factor of 2.
        let f1_2 = 2 * f1;
        let f3_2 = 2 * f3;
        let f5_2 = 2 * f5;
        let f7_2 = 2 * f7;
        let f9_2 = 2 * f9;

        let h0 = f0 * g0
            + f1_2 * g9_19
            + f2 * g8_19
            + f3_2 * g7_19
            + f4 * g6_19
            + f5_2 * g5_19
            + f6 * g4_19
            + f7_2 * g3_19
            + f8 * g2_19
            + f9_2 * g1_19;
        let h1 = f0 * g1
            + f1 * g0
            + f2 * g9_19
            + f3 * g8_19
            + f4 * g7_19
            + f5 * g6_19
            + f6 * g5_19
            + f7 * g4_19
            + f8 * g3_19
            + f9 * g2_19;
        let h2 = f0 * g2
            + f1_2 * g1
            + f2 * g0
            + f3_2 * g9_19
            + f4 * g8_19
            + f5_2 * g7_19
            + f6 * g6_19
            + f7_2 * g5_19
            + f8 * g4_19
            + f9_2 * g3_19;
        let h3 = f0 * g3
            + f1 * g2
            + f2 * g1
            + f3 * g0
            + f4 * g9_19
            + f5 * g8_19
            + f6 * g7_19
            + f7 * g6_19
            + f8 * g5_19
            + f9 * g4_19;
        let h4 = f0 * g4
            + f1_2 * g3
            + f2 * g2
            + f3_2 * g1
            + f4 * g0
            + f5_2 * g9_19
            + f6 * g8_19
            + f7_2 * g7_19
            + f8 * g6_19
            + f9_2 * g5_19;
        let h5 = f0 * g5
            + f1 * g4
            + f2 * g3
            + f3 * g2
            + f4 * g1
            + f5 * g0
            + f6 * g9_19
            + f7 * g8_19
            + f8 * g7_19
            + f9 * g6_19;
        let h6 = f0 * g6
            + f1_2 * g5
            + f2 * g4
            + f3_2 * g3
            + f4 * g2
            + f5_2 * g1
            + f6 * g0
            + f7_2 * g9_19
            + f8 * g8_19
            + f9_2 * g7_19;
        let h7 = f0 * g7
            + f1 * g6
            + f2 * g5
            + f3 * g4
            + f4 * g3
            + f5 * g2
            + f6 * g1
            + f7 * g0
            + f8 * g9_19
            + f9 * g8_19;
        let h8 = f0 * g8
            + f1_2 * g7
            + f2 * g6
            + f3_2 * g5
            + f4 * g4
            + f5_2 * g3
            + f6 * g2
            + f7_2 * g1
            + f8 * g0
            + f9_2 * g9_19;
        let h9 = f0 * g9
            + f1 * g8
            + f2 * g7
            + f3 * g6
            + f4 * g5
            + f5 * g4
            + f6 * g3
            + f7 * g2
            + f8 * g1
            + f9 * g0;

        // Carry the wide coefficients back into 25/26-bit limbs.
        Self(reduce_coefficients([
            h0, h1, h2, h3, h4, h5, h6, h7, h8, h9,
        ]))
    }

    /// Squares a field element using a dedicated 10-limb formula.
    ///
    /// Same product as `self.mul(self)`, but symmetric cross terms `fi * fj`
    /// and `fj * fi` are merged into one doubled term, which roughly halves the
    /// number of limb multiplications.
    ///
    /// # Arguments
    ///
    /// * `self` — Operand to square.
    ///
    /// # Returns
    ///
    /// \\(self^2\\) modulo \\(p\\).
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    #[inline(always)]
    fn square(&self) -> Self {
        let f0 = i64::from(self.0[0]);
        let f1 = i64::from(self.0[1]);
        let f2 = i64::from(self.0[2]);
        let f3 = i64::from(self.0[3]);
        let f4 = i64::from(self.0[4]);
        let f5 = i64::from(self.0[5]);
        let f6 = i64::from(self.0[6]);
        let f7 = i64::from(self.0[7]);
        let f8 = i64::from(self.0[8]);
        let f9 = i64::from(self.0[9]);

        // Doubled limbs for merged cross terms; the x4 variants additionally
        // include the factor 2 required when two odd-indexed limbs are multiplied.
        let f0_2 = 2 * f0;
        let f1_2 = 2 * f1;
        let f2_2 = 2 * f2;
        let f3_2 = 2 * f3;
        let f4_2 = 2 * f4;
        let f5_2 = 2 * f5;
        let f7_2 = 2 * f7;
        let f1_4 = 4 * f1;
        let f3_4 = 4 * f3;

        // Wrap-around multipliers: 19 comes from 2^255 ≡ 19 (mod p); 38 = 2 * 19
        // also covers the doubling of a merged cross term.
        let f9_38 = 38 * f9;
        let f8_38 = 38 * f8;
        let f7_38 = 38 * f7;
        let f6_38 = 38 * f6;
        let f5_38 = 38 * f5;
        let f8_19 = 19 * f8;
        let f6_19 = 19 * f6;

        let h0 = f0 * f0 + f1_2 * f9_38 + f2 * f8_38 + f3_2 * f7_38 + f4 * f6_38 + f5 * f5_38;
        let h1 = f0_2 * f1 + f2 * f9_38 + f3 * f8_38 + f4 * f7_38 + f5 * f6_38;
        let h2 = f0_2 * f2 + f1_2 * f1 + f3_2 * f9_38 + f4 * f8_38 + f5_2 * f7_38 + f6 * f6_19;
        let h3 = f0_2 * f3 + f1_2 * f2 + f4 * f9_38 + f5 * f8_38 + f6 * f7_38;
        let h4 = f0_2 * f4 + f1_4 * f3 + f2 * f2 + f5_2 * f9_38 + f6 * f8_38 + f7 * f7_38;
        let h5 = f0_2 * f5 + f1_2 * f4 + f2_2 * f3 + f6 * f9_38 + f7 * f8_38;
        let h6 = f0_2 * f6 + f1_4 * f5 + f2_2 * f4 + f3_2 * f3 + f7_2 * f9_38 + f8 * f8_19;
        let h7 = f0_2 * f7 + f1_2 * f6 + f2_2 * f5 + f3_2 * f4 + f8 * f9_38;
        let h8 = f0_2 * f8 + f1_4 * f7 + f2_2 * f6 + f3_4 * f5 + f4 * f4 + f9 * f9_38;
        let h9 = f0_2 * f9 + f1_2 * f8 + f2_2 * f7 + f3_2 * f6 + f4_2 * f5;

        // Carry the wide coefficients back into 25/26-bit limbs.
        Self(reduce_coefficients([
            h0, h1, h2, h3, h4, h5, h6, h7, h8, h9,
        ]))
    }

    /// Computes the multiplicative inverse via exponentiation to \\(p - 2\\).
    ///
    /// Uses a fixed addition chain of squarings and multiplications, so the
    /// sequence of operations does not depend on the value being inverted. A
    /// zero input produces zero, since every step is a multiply or a square.
    ///
    /// # Arguments
    ///
    /// * `self` — Non-zero field element to invert (caller must ensure non-zero in this construction).
    ///
    /// # Returns
    ///
    /// Multiplicative inverse used by the Montgomery ladder output step.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    fn invert(&self) -> Self {
        // Below, `z` denotes `self`; each comment gives the exponent reached.
        let z2 = self.square(); // z^2
        let mut t1 = z2.square(); // z^4
        t1 = t1.square(); // z^8
        t1 = self.mul(&t1); // z^9
        let z11 = z2.mul(&t1); // z^11
        let mut z2_5_0 = z11.square(); // z^22
        z2_5_0 = t1.mul(&z2_5_0); // z^31 = z^(2^5 - 1)

        // A name `z2_N_0` holds z^(2^N - 1).
        let z2_10_0 = pow2k(&z2_5_0, 5).mul(&z2_5_0);
        let z2_20_0 = pow2k(&z2_10_0, 10).mul(&z2_10_0);
        let z2_40_0 = pow2k(&z2_20_0, 20).mul(&z2_20_0);
        let z2_50_0 = pow2k(&z2_40_0, 10).mul(&z2_10_0);
        let z2_100_0 = pow2k(&z2_50_0, 50).mul(&z2_50_0);
        let z2_200_0 = pow2k(&z2_100_0, 100).mul(&z2_100_0);
        let z2_250_0 = pow2k(&z2_200_0, 50).mul(&z2_50_0);

        // (2^250 - 1) * 2^5 + 11 = 2^255 - 21 = p - 2.
        pow2k(&z2_250_0, 5).mul(&z11)
    }

    /// Conditionally exchanges two field elements using mask arithmetic only.
    ///
    /// `choice` is expanded into an all-zero or all-one 32-bit mask. Each limb
    /// pair is then XOR-swapped through that mask, so the same instructions run
    /// whether or not the swap happens.
    ///
    /// # Arguments
    ///
    /// * `a` — First operand; may be swapped with `b`.
    /// * `b` — Second operand; may be swapped with `a`.
    /// * `choice` — `1` swaps limbs, `0` leaves them unchanged. Other values
    ///   produce a partial mask and must not be passed.
    ///
    /// # Returns
    ///
    /// `()`; mutates `a` and `b` in place.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[inline(always)]
    fn cswap(a: &mut Self, b: &mut Self, choice: u8) {
        let mask = 0_i32.wrapping_sub(i32::from(choice));
        for (lhs, rhs) in a.0.iter_mut().zip(b.0.iter_mut()) {
            // `delta` is lhs ^ rhs when swapping and 0 otherwise.
            let delta = mask & (*lhs ^ *rhs);
            *lhs ^= delta;
            *rhs ^= delta;
        }
    }

    /// Runs one carry pass over the limbs, folding the top carry back modulo \\(p\\).
    ///
    /// # Arguments
    ///
    /// * `self` — Possibly unreduced limb array after arithmetic.
    ///
    /// # Returns
    ///
    /// Element produced by widening every limb to `i64` and applying
    /// `reduce_coefficients`.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    #[allow(dead_code)]
    fn carry_reduce(self) -> Self {
        let widened = self.0.map(i64::from);
        Self(reduce_coefficients(widened))
    }

    /// Canonicalizes an element to its unique representative in \\([0, p)\\).
    ///
    /// Carry-reduces the element, serializes it with [`Self::to_bytes`], and
    /// parses the canonical bytes back with [`Self::from_bytes`].
    ///
    /// # Arguments
    ///
    /// * `self` — Field element in any limb representation accepted by
    ///   [`Self::carry_reduce`].
    ///
    /// # Returns
    ///
    /// Field element rebuilt from the canonical byte encoding.
    ///
    /// # Panics
    ///
    /// This function does not panic.
    #[must_use]
    #[allow(dead_code)]
    fn normalize(self) -> Self {
        let reduced = self.carry_reduce();
        let encoded = reduced.to_bytes();
        Self::from_bytes(&encoded)
    }

    /// Reads one bit of the canonical 32-byte encoding of this element.
    ///
    /// The element is serialized with [`Self::to_bytes`] first; the bit is then
    /// taken from byte `bit_idx / 8` at position `bit_idx % 8`.
    ///
    /// # Arguments
    ///
    /// * `self` — Field element to inspect.
    /// * `bit_idx` — Bit index into the 32-byte encoding; meaningful values are
    ///   \\([0, 254]\\).
    ///
    /// # Returns
    ///
    /// `true` when the selected bit is set.
    ///
    /// # Panics
    ///
    /// Panics when `bit_idx >= 256`, because the byte index then falls outside
    /// the 32-byte encoding.
    #[allow(dead_code)]
    #[must_use]
    fn bit(&self, bit_idx: usize) -> bool {
        let encoded = self.to_bytes();
        let byte = encoded[bit_idx / 8];
        (byte >> (bit_idx % 8)) & 1 != 0
    }
}

/// Carries ten wide coefficients back into alternating 26/25-bit limbs.
///
/// Each step uses a rounding carry (half the limb modulus is added before the
/// shift), so the resulting limbs are signed. The carry out of limb 9 is
/// folded into limb 0 with a factor of 19, followed by a final limb 0 → limb 1
/// carry.
///
/// # Arguments
///
/// * `h` — Coefficients of a field element in the 25.5-bit-limb layout.
///
/// # Returns
///
/// The carried limbs, each truncated to `i32` with an `as` cast.
#[inline(always)]
fn reduce_coefficients(mut h: [i64; 10]) -> [i32; 10] {
    // (limb index, limb width in bits), in the order the carries are applied.
    // Two interleaved chains (0→1→2→3→4 and 4→5→6→7→8) are advanced alternately;
    // limb 4 is carried twice because the first chain feeds into it.
    const CHAIN: [(usize, u32); 10] = [
        (0, 26),
        (4, 26),
        (1, 25),
        (5, 25),
        (2, 26),
        (6, 26),
        (3, 25),
        (7, 25),
        (4, 26),
        (8, 26),
    ];

    for &(idx, bits) in CHAIN.iter() {
        let carry = (h[idx] + (1_i64 << (bits - 1))) >> bits;
        h[idx + 1] += carry;
        h[idx] -= carry << bits;
    }

    // 2^255 ≡ 19 (mod p): the carry out of the top limb re-enters at limb 0.
    let wrap = (h[9] + (1 << 24)) >> 25;
    h[0] += wrap * 19;
    h[9] -= wrap << 25;

    // Limb 0 may have grown from the fold above; carry once more into limb 1.
    let last = (h[0] + (1 << 25)) >> 26;
    h[1] += last;
    h[0] -= last << 26;

    h.map(|limb| limb as i32)
}

/// Applies `square` to `base` repeatedly.
///
/// # Arguments
///
/// * `base` — Starting field element; it is copied, not modified.
/// * `squarings` — Number of times `square` is applied.
///
/// # Returns
///
/// The result of `squarings` successive `square` calls; a copy of `base` when
/// `squarings` is zero.
#[inline(always)]
fn pow2k(base: &FieldElement, squarings: usize) -> FieldElement {
    (0..squarings).fold(*base, |acc, _| acc.square())
}

/// Loads three little-endian bytes from `input[offset..offset + 3]`.
///
/// # Arguments
///
/// * `input` — 32-byte buffer to read from.
/// * `offset` — Index of the least-significant byte; `offset + 2` must be below 32.
///
/// # Returns
///
/// The three bytes assembled as a little-endian value in the low 24 bits of a `u32`;
/// the top byte is always zero.
///
/// # Panics
///
/// Panics if any of `offset`, `offset + 1`, or `offset + 2` is out of bounds for `input`.
#[allow(dead_code)]
fn load3(input: &[u8; 32], offset: usize) -> u32 {
    u32::from_le_bytes([input[offset], input[offset + 1], input[offset + 2], 0])
}

/// Loads four little-endian bytes from `input[offset..offset + 4]`.
///
/// # Arguments
///
/// * `input` — 32-byte buffer to read from.
/// * `offset` — Index of the least-significant byte; `offset + 3` must be below 32.
///
/// # Returns
///
/// The four bytes assembled as a little-endian `u32`.
///
/// # Panics
///
/// Panics if any of `offset` through `offset + 3` is out of bounds for `input`.
fn load4(input: &[u8; 32], offset: usize) -> u32 {
    u32::from_le_bytes([
        input[offset],
        input[offset + 1],
        input[offset + 2],
        input[offset + 3],
    ])
}

/// Applies the RFC 7748 clamping masks to a raw Curve25519 scalar.
///
/// The three lowest bits of byte 0 are cleared, the top bit of byte 31 is cleared, and
/// the second-highest bit of byte 31 is set. All other bytes are returned unchanged.
///
/// # Arguments
///
/// * `scalar` — Raw 32-byte scalar before masking.
///
/// # Returns
///
/// The clamped scalar bytes.
///
/// # Panics
///
/// This function does not panic.
fn clamp_scalar(mut scalar: [u8; 32]) -> [u8; 32] {
    scalar[0] &= 0xf8;
    scalar[31] = (scalar[31] & 0x7f) | 0x40;
    scalar
}

/// Reports whether all 32 bytes are zero.
///
/// Every byte is OR-ed into a single accumulator and the accumulator is compared with
/// zero once at the end, so the scan never exits early on a non-zero byte.
///
/// # Arguments
///
/// * `bytes` — Fixed-size buffer to test.
///
/// # Returns
///
/// `true` if all bytes are zero.
///
/// # Panics
///
/// This function does not panic.
fn is_all_zero(bytes: &[u8; 32]) -> bool {
    let folded = bytes.iter().fold(0_u8, |acc, byte| acc | *byte);
    folded == 0
}

/// Reports whether a 32-byte little-endian u-coordinate encodes the integer one.
///
/// # Arguments
///
/// * `bytes` — 32-byte u-coordinate in wire order.
///
/// # Returns
///
/// `true` when byte 0 is `1` and bytes 1 through 31 are all zero.
///
/// # Panics
///
/// This function does not panic.
fn is_montgomery_u_one(bytes: &[u8; 32]) -> bool {
    let [lowest, rest @ ..] = bytes;
    *lowest == 1 && rest.iter().all(|&byte| byte == 0)
}

/// Reports whether a 32-byte little-endian u-coordinate encodes the integer nine.
///
/// # Arguments
///
/// * `bytes` — 32-byte u-coordinate in wire order.
///
/// # Returns
///
/// `true` when byte 0 is `9` and bytes 1 through 31 are all zero.
///
/// # Panics
///
/// This function does not panic.
fn is_montgomery_basepoint(bytes: &[u8; 32]) -> bool {
    let mut basepoint = [0_u8; 32];
    basepoint[0] = 9;
    *bytes == basepoint
}

// Unit tests: cross-check the X25519 entry points against the `x25519_dalek` crate and
// exercise a few internal `FieldElement` identities.
#[cfg(test)]
mod tests {
    use super::{noxtls_x25519, noxtls_x25519_basepoint, FieldElement};
    use x25519_dalek::{PublicKey, StaticSecret};

    /// Builds a fixed, arbitrary field element shared by the field-arithmetic tests.
    fn sample_field_element() -> FieldElement {
        FieldElement::from_bytes(&[
            0x42, 0x7d, 0x13, 0xa9, 0x55, 0x81, 0x2c, 0x73, 0x10, 0xef, 0x91, 0x04, 0xbb, 0x8d,
            0x67, 0xc0, 0x1e, 0x54, 0xf9, 0x22, 0x7a, 0x33, 0x61, 0x0d, 0x8c, 0xfe, 0x49, 0xb5,
            0x28, 0x90, 0xda, 0x11,
        ])
    }

    /// Checks that the dedicated basepoint routine and the generic routine called with
    /// u = 9 both produce the public key `x25519_dalek` derives from the same scalar.
    #[test]
    fn noxtls_x25519_basepoint_matches_dalek() {
        // NOTE(review): these bytes look like the first scalar test vector from
        // RFC 7748 section 5.2 — confirm against the RFC.
        let scalar = [
            0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46,
            0x5e, 0xdd, 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, 0x50, 0x6a, 0x22, 0x44,
            0xba, 0x44, 0x9a, 0xc4,
        ];

        let expected = PublicKey::from(&StaticSecret::from(scalar)).to_bytes();
        assert_eq!(noxtls_x25519_basepoint(&scalar), expected);

        // Same computation through the general entry point, passing u = 9 explicitly.
        let mut basepoint = [0_u8; 32];
        basepoint[0] = 9;
        assert_eq!(noxtls_x25519(&scalar, &basepoint), expected);
    }

    /// Checks that a Diffie-Hellman shared secret computed by `noxtls_x25519` equals the
    /// one `x25519_dalek` computes for the same scalar and peer public key.
    #[test]
    fn noxtls_x25519_shared_secret_matches_dalek() {
        // NOTE(review): `scalar` and `peer_scalar` look like the Alice and Bob private
        // keys from RFC 7748 section 6.1 — confirm against the RFC.
        let scalar = [
            0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2,
            0x66, 0x45, 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, 0xb1, 0x77, 0xfb, 0xa5,
            0x1d, 0xb9, 0x2c, 0x2a,
        ];
        let peer_scalar = [
            0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80,
            0x0e, 0xe6, 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, 0x1c, 0x2f, 0x8b, 0x27,
            0xff, 0x88, 0xe0, 0xeb,
        ];

        // The peer public key and the expected secret both come from the reference crate.
        let peer_public = PublicKey::from(&StaticSecret::from(peer_scalar)).to_bytes();
        let expected = StaticSecret::from(scalar)
            .diffie_hellman(&PublicKey::from(peer_public))
            .to_bytes();

        assert_eq!(noxtls_x25519(&scalar, &peer_public), expected);
    }

    /// Checks that `square` agrees with multiplying an element by itself.
    #[test]
    fn x25519_square_matches_self_mul() {
        let fe = sample_field_element();
        assert_eq!(fe.square(), fe.mul(&fe));
    }

    /// Checks that decoding these bytes and encoding them again returns the same bytes.
    #[test]
    fn x25519_field_decode_encode_roundtrip_for_canonical_bytes() {
        let canonical = [
            0x42, 0x7d, 0x13, 0xa9, 0x55, 0x81, 0x2c, 0x73, 0x10, 0xef, 0x91, 0x04, 0xbb, 0x8d,
            0x67, 0xc0, 0x1e, 0x54, 0xf9, 0x22, 0x7a, 0x33, 0x61, 0x0d, 0x8c, 0xfe, 0x49, 0xb5,
            0x28, 0x90, 0x5a, 0x11,
        ];

        assert_eq!(FieldElement::from_bytes(&canonical).to_bytes(), canonical);
    }

    /// Checks that the lazy add/sub variants agree with the regular add/sub once both
    /// results have been passed through `normalize`.
    #[test]
    fn x25519_lazy_add_sub_match_reduced_forms_after_normalization() {
        let lhs = sample_field_element();
        let rhs = FieldElement::from_bytes(&[
            0x17, 0x2b, 0x9c, 0x04, 0xde, 0x66, 0xa8, 0x31, 0x4f, 0xe2, 0x5d, 0x99, 0x40, 0x13,
            0xc7, 0x6e, 0x88, 0xab, 0x1f, 0x52, 0x03, 0x74, 0xcd, 0x95, 0xb2, 0x49, 0x0e, 0x21,
            0xfa, 0x63, 0x18, 0x05,
        ]);

        assert_eq!(lhs.add_lazy(&rhs).normalize(), lhs.add(&rhs).normalize());
        assert_eq!(lhs.sub_lazy(&rhs).normalize(), lhs.sub(&rhs).normalize());
    }
}