noxtls-crypto 0.2.10

use core::cmp::Ordering;

use crate::hash::noxtls_sha256;
use noxtls_core::{Error, Result};
#[cfg(all(not(feature = "std"), target_has_atomic = "8"))]
use spin::Once;
#[cfg(feature = "std")]
use std::sync::OnceLock;

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct U256([u32; 8]);

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct AffinePoint {
    x: U256,
    y: U256,
    infinity: bool,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct JacobianPoint {
    x: U256,
    y: U256,
    z: U256,
    infinity: bool,
}

const ZERO: U256 = U256([0; 8]);
const ONE: U256 = U256([1, 0, 0, 0, 0, 0, 0, 0]);
const P: U256 = U256([
    0xffff_ffff,
    0xffff_ffff,
    0xffff_ffff,
    0x0000_0000,
    0x0000_0000,
    0x0000_0000,
    0x0000_0001,
    0xffff_ffff,
]);
const N: U256 = U256([
    0xfc63_2551,
    0xf3b9_cac2,
    0xa717_9e84,
    0xbce6_faad,
    0xffff_ffff,
    0xffff_ffff,
    0x0000_0000,
    0xffff_ffff,
]);
const A: U256 = U256([
    0xffff_fffc,
    0xffff_ffff,
    0xffff_ffff,
    0x0000_0000,
    0x0000_0000,
    0x0000_0000,
    0x0000_0001,
    0xffff_ffff,
]);
const B: U256 = U256([
    0x27d2_604b,
    0x3bce_3c3e,
    0xcc53_b0f6,
    0x651d_06b0,
    0x7698_86bc,
    0xb3eb_bd55,
    0xaa3a_93e7,
    0x5ac6_35d8,
]);
const GX: U256 = U256([
    0xd898_c296,
    0xf4a1_3945,
    0x2deb_33a0,
    0x7703_7d81,
    0x63a4_40f2,
    0xf8bc_e6e5,
    0xe12c_4247,
    0x6b17_d1f2,
]);
const GY: U256 = U256([
    0x37bf_51f5,
    0xcbb6_4068,
    0x6b31_5ece,
    0x2bce_3357,
    0x7c0f_9e16,
    0x8ee7_eb4a,
    0xfe1a_7f9b,
    0x4fe3_42e2,
]);

#[cfg(feature = "std")]
static BASEPOINT_TABLE: OnceLock<[AffinePoint; 16]> = OnceLock::new();
#[cfg(all(not(feature = "std"), target_has_atomic = "8"))]
static BASEPOINT_TABLE: Once<[AffinePoint; 16]> = Once::new();

impl U256 {
    fn from_be_bytes(bytes: &[u8; 32]) -> Self {
        let mut limbs = [0_u32; 8];
        for (i, chunk) in bytes.chunks_exact(4).rev().enumerate() {
            limbs[i] = u32::from_be_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
        }
        Self(limbs)
    }

    fn to_be_bytes(self) -> [u8; 32] {
        let mut out = [0_u8; 32];
        for (chunk, limb) in out.chunks_exact_mut(4).zip(self.0.iter().rev()) {
            chunk.copy_from_slice(&limb.to_be_bytes());
        }
        out
    }

    fn cmp(&self, other: &Self) -> Ordering {
        for idx in (0..8).rev() {
            if self.0[idx] != other.0[idx] {
                return self.0[idx].cmp(&other.0[idx]);
            }
        }
        Ordering::Equal
    }

    fn is_zero(&self) -> bool {
        self.0.iter().all(|&limb| limb == 0)
    }

    fn is_even(&self) -> bool {
        (self.0[0] & 1) == 0
    }

    fn bit_len(&self) -> usize {
        for idx in (0..8).rev() {
            let limb = self.0[idx];
            if limb != 0 {
                return idx * 32 + (32 - limb.leading_zeros() as usize);
            }
        }
        0
    }

    fn nibble_le(&self, nibble_idx: usize) -> u8 {
        let bit_idx = nibble_idx * 4;
        let limb_idx = bit_idx / 32;
        let shift = bit_idx % 32;
        let low = u64::from(*self.0.get(limb_idx).unwrap_or(&0));
        let high = u64::from(*self.0.get(limb_idx + 1).unwrap_or(&0));
        (((low >> shift) | (high << (32 - shift))) & 0x0f) as u8
    }

    fn shr1_assign(&mut self) {
        let mut carry = 0_u32;
        for idx in (0..8).rev() {
            let next_carry = self.0[idx] & 1;
            self.0[idx] = (self.0[idx] >> 1) | (carry << 31);
            carry = next_carry;
        }
    }

    fn shr1_with_high_bit_assign(&mut self, high_bit: u32) {
        let mut carry = high_bit & 1;
        for idx in (0..8).rev() {
            let next_carry = self.0[idx] & 1;
            self.0[idx] = (self.0[idx] >> 1) | (carry << 31);
            carry = next_carry;
        }
    }

    fn add_raw(self, other: Self) -> (Self, u32) {
        let mut out = [0_u32; 8];
        let mut carry = 0_u64;
        for (idx, dst) in out.iter_mut().enumerate() {
            let sum = u64::from(self.0[idx]) + u64::from(other.0[idx]) + carry;
            *dst = sum as u32;
            carry = sum >> 32;
        }
        (Self(out), carry as u32)
    }

    fn sub_raw(self, other: Self) -> (Self, u32) {
        let mut out = [0_u32; 8];
        let mut borrow = 0_u64;
        for (idx, dst) in out.iter_mut().enumerate() {
            let lhs = u64::from(self.0[idx]);
            let rhs = u64::from(other.0[idx]) + borrow;
            if lhs < rhs {
                *dst = (lhs + (1_u64 << 32) - rhs) as u32;
                borrow = 1;
            } else {
                *dst = (lhs - rhs) as u32;
                borrow = 0;
            }
        }
        (Self(out), borrow as u32)
    }

    #[allow(dead_code)]
    fn add_assign_raw(&mut self, other: Self) {
        let (value, _) = self.add_raw(other);
        *self = value;
    }

    fn sub_assign_raw(&mut self, other: Self) {
        let (value, _) = self.sub_raw(other);
        *self = value;
    }

    fn add_mod(self, other: Self, modulus: Self) -> Self {
        let (mut sum, carry) = self.add_raw(other);
        if carry != 0 || sum.cmp(&modulus) != Ordering::Less {
            sum = sum.sub_raw(modulus).0;
        }
        sum
    }

    fn sub_mod(self, other: Self, modulus: Self) -> Self {
        let (diff, borrow) = self.sub_raw(other);
        if borrow == 0 {
            diff
        } else {
            diff.add_raw(modulus).0
        }
    }

    fn modulo_small(mut self, modulus: Self) -> Self {
        while self.cmp(&modulus) != Ordering::Less {
            self = self.sub_raw(modulus).0;
        }
        self
    }
}

impl AffinePoint {
    fn infinity() -> Self {
        Self {
            x: ZERO,
            y: ZERO,
            infinity: true,
        }
    }
}

impl JacobianPoint {
    fn infinity() -> Self {
        Self {
            x: ZERO,
            y: ZERO,
            z: ZERO,
            infinity: true,
        }
    }

    fn from_affine(point: AffinePoint) -> Self {
        if point.infinity {
            Self::infinity()
        } else {
            Self {
                x: point.x,
                y: point.y,
                z: ONE,
                infinity: false,
            }
        }
    }

    fn to_affine(self) -> Result<AffinePoint> {
        if self.infinity || self.z.is_zero() {
            return Ok(AffinePoint::infinity());
        }
        let z_inv = mod_inv(self.z, P)?;
        let z_inv2 = mod_mul(z_inv, z_inv, P);
        let z_inv3 = mod_mul(z_inv2, z_inv, P);
        Ok(AffinePoint {
            x: mod_mul(self.x, z_inv2, P),
            y: mod_mul(self.y, z_inv3, P),
            infinity: false,
        })
    }
}

pub(crate) fn scalar_mul_basepoint_bytes(scalar: &[u8; 32]) -> Result<([u8; 32], [u8; 32])> {
    let point = scalar_mul_basepoint(U256::from_be_bytes(scalar)).to_affine()?;
    if point.infinity {
        return Err(Error::CryptoFailure(
            "p256 public key derivation produced infinity",
        ));
    }
    Ok((point.x.to_be_bytes(), point.y.to_be_bytes()))
}

pub(crate) fn ecdh_shared_secret_bytes(
    private_scalar: &[u8; 32],
    public_x: &[u8; 32],
    public_y: &[u8; 32],
) -> Result<[u8; 32]> {
    let public = decode_affine(public_x, public_y)?;
    let shared = scalar_mul(U256::from_be_bytes(private_scalar), public).to_affine()?;
    if shared.infinity {
        return Err(Error::CryptoFailure("p256 shared point is at infinity"));
    }
    let out = shared.x.to_be_bytes();
    if out.iter().all(|&b| b == 0) {
        return Err(Error::CryptoFailure("p256 shared secret is all-zero"));
    }
    Ok(out)
}

pub(crate) fn ecdsa_sign_digest_bytes(
    private_scalar: &[u8; 32],
    digest: &[u8; 32],
) -> Result<([u8; 32], [u8; 32])> {
    let d = U256::from_be_bytes(private_scalar);
    let e = U256::from_be_bytes(digest).modulo_small(N);

    let mut counter = 0_u32;
    loop {
        let k = derive_nonce(private_scalar, digest, counter).modulo_small(N);
        counter = counter.wrapping_add(1);
        if let Some(sig) = ecdsa_sign_core(d, e, k)? {
            return Ok(sig);
        }
        if counter == 0 {
            return Err(Error::CryptoFailure(
                "p256 ecdsa nonce derivation exhausted",
            ));
        }
    }
}

pub(crate) fn ecdsa_sign_digest_with_nonce_bytes(
    private_scalar: &[u8; 32],
    digest: &[u8; 32],
    nonce_candidate: &[u8; 32],
) -> Result<Option<([u8; 32], [u8; 32])>> {
    let d = U256::from_be_bytes(private_scalar);
    let e = U256::from_be_bytes(digest).modulo_small(N);
    let k = U256::from_be_bytes(nonce_candidate).modulo_small(N);
    ecdsa_sign_core(d, e, k)
}

pub(crate) fn ecdsa_verify_digest_bytes(
    public_x: &[u8; 32],
    public_y: &[u8; 32],
    digest: &[u8; 32],
    r: &[u8; 32],
    s: &[u8; 32],
) -> Result<()> {
    let public = decode_affine(public_x, public_y)?;
    let r = U256::from_be_bytes(r);
    let s = U256::from_be_bytes(s);
    if r.is_zero() || s.is_zero() || r.cmp(&N) != Ordering::Less || s.cmp(&N) != Ordering::Less {
        return Err(Error::CryptoFailure(
            "p256 ecdsa signature scalars out of range",
        ));
    }

    let e = U256::from_be_bytes(digest).modulo_small(N);
    let w = mod_inv(s, N)?;
    let u1 = mod_mul(e, w, N);
    let u2 = mod_mul(r, w, N);
    let point = double_scalar_mul_basepoint(u1, u2, public).to_affine()?;
    if point.infinity {
        return Err(Error::CryptoFailure(
            "p256 ecdsa verification produced point at infinity",
        ));
    }
    let v = point.x.modulo_small(N);
    if v == r {
        Ok(())
    } else {
        Err(Error::CryptoFailure("p256 ecdsa verification failed"))
    }
}

pub(crate) fn validate_public_key_bytes(x: &[u8; 32], y: &[u8; 32]) -> bool {
    decode_affine(x, y).is_ok()
}

fn decode_affine(x: &[u8; 32], y: &[u8; 32]) -> Result<AffinePoint> {
    let point = AffinePoint {
        x: U256::from_be_bytes(x),
        y: U256::from_be_bytes(y),
        infinity: false,
    };
    if point.x.cmp(&P) != Ordering::Less || point.y.cmp(&P) != Ordering::Less {
        return Err(Error::CryptoFailure(
            "p256 public point coordinates out of field range",
        ));
    }
    if !is_point_on_curve(point) {
        return Err(Error::CryptoFailure("p256 public point is not on curve"));
    }
    Ok(point)
}

fn ecdsa_sign_core(d: U256, e: U256, k: U256) -> Result<Option<([u8; 32], [u8; 32])>> {
    if k.is_zero() {
        return Ok(None);
    }
    let rp = scalar_mul_basepoint(k).to_affine()?;
    if rp.infinity {
        return Ok(None);
    }
    let r = rp.x.modulo_small(N);
    if r.is_zero() {
        return Ok(None);
    }
    let rd = mod_mul(r, d, N);
    let e_plus_rd = e.add_mod(rd, N);
    let k_inv = mod_inv(k, N)?;
    let s = mod_mul(k_inv, e_plus_rd, N);
    if s.is_zero() {
        return Ok(None);
    }
    Ok(Some((r.to_be_bytes(), s.to_be_bytes())))
}

fn derive_nonce(private_scalar: &[u8; 32], digest: &[u8; 32], counter: u32) -> U256 {
    let mut seed = [0_u8; 68];
    seed[..32].copy_from_slice(private_scalar);
    seed[32..64].copy_from_slice(digest);
    seed[64..].copy_from_slice(&counter.to_be_bytes());
    U256::from_be_bytes(&noxtls_sha256(&seed))
}

#[cfg(feature = "std")]
fn basepoint_table() -> [AffinePoint; 16] {
    *BASEPOINT_TABLE.get_or_init(|| {
        build_affine_window_table(AffinePoint {
            x: GX,
            y: GY,
            infinity: false,
        })
    })
}

#[cfg(all(not(feature = "std"), target_has_atomic = "8"))]
fn basepoint_table() -> [AffinePoint; 16] {
    *BASEPOINT_TABLE.call_once(|| {
        build_affine_window_table(AffinePoint {
            x: GX,
            y: GY,
            infinity: false,
        })
    })
}

#[cfg(all(not(feature = "std"), not(target_has_atomic = "8")))]
fn basepoint_table() -> [AffinePoint; 16] {
    build_affine_window_table(AffinePoint {
        x: GX,
        y: GY,
        infinity: false,
    })
}

fn scalar_mul_basepoint(scalar: U256) -> JacobianPoint {
    let table = basepoint_table();
    scalar_mul_basepoint_from_table(scalar, &table)
}

fn scalar_mul(scalar: U256, point: AffinePoint) -> JacobianPoint {
    if point.infinity {
        return JacobianPoint::infinity();
    }
    let table = build_window_table(JacobianPoint::from_affine(point));
    scalar_mul_from_table(scalar, &table)
}

fn double_scalar_mul_basepoint(
    base_scalar: U256,
    scalar: U256,
    point: AffinePoint,
) -> JacobianPoint {
    if point.infinity {
        return scalar_mul_basepoint(base_scalar);
    }

    let base_table = basepoint_table();
    let point_table = build_window_table(JacobianPoint::from_affine(point));
    let nibble_count = base_scalar.bit_len().max(scalar.bit_len()).div_ceil(4);
    let mut acc = JacobianPoint::infinity();

    for nibble_idx in (0..nibble_count).rev() {
        for _ in 0..4 {
            acc = jacobian_double(acc);
        }

        let base_nibble = usize::from(base_scalar.nibble_le(nibble_idx));
        if base_nibble != 0 {
            acc = jacobian_add_mixed(acc, base_table[base_nibble]);
        }

        let point_nibble = usize::from(scalar.nibble_le(nibble_idx));
        if point_nibble != 0 {
            acc = jacobian_add(acc, point_table[point_nibble]);
        }
    }

    acc
}

fn scalar_mul_basepoint_from_table(scalar: U256, table: &[AffinePoint; 16]) -> JacobianPoint {
    let mut acc = JacobianPoint::infinity();
    let nibble_count = scalar.bit_len().div_ceil(4);
    for nibble_idx in (0..nibble_count).rev() {
        for _ in 0..4 {
            acc = jacobian_double(acc);
        }
        let nibble = usize::from(scalar.nibble_le(nibble_idx));
        if nibble != 0 {
            acc = jacobian_add_mixed(acc, table[nibble]);
        }
    }
    acc
}

fn scalar_mul_from_table(scalar: U256, table: &[JacobianPoint; 16]) -> JacobianPoint {
    let mut acc = JacobianPoint::infinity();
    let nibble_count = scalar.bit_len().div_ceil(4);
    for nibble_idx in (0..nibble_count).rev() {
        for _ in 0..4 {
            acc = jacobian_double(acc);
        }
        let nibble = usize::from(scalar.nibble_le(nibble_idx));
        if nibble != 0 {
            acc = jacobian_add(acc, table[nibble]);
        }
    }
    acc
}

fn build_window_table(base: JacobianPoint) -> [JacobianPoint; 16] {
    let mut table = [JacobianPoint::infinity(); 16];
    table[1] = base;
    for idx in 2..16 {
        table[idx] = jacobian_add(table[idx - 1], base);
    }
    table
}

fn build_affine_window_table(base: AffinePoint) -> [AffinePoint; 16] {
    let mut table = [AffinePoint::infinity(); 16];
    if base.infinity {
        return table;
    }
    table[1] = base;
    let mut acc = JacobianPoint::from_affine(base);
    for item in table.iter_mut().skip(2) {
        acc = jacobian_add_mixed(acc, base);
        *item = acc
            .to_affine()
            .expect("p256 basepoint affine table entry conversion");
    }
    table
}

fn is_point_on_curve(point: AffinePoint) -> bool {
    if point.infinity {
        return false;
    }
    let y_sq = mod_mul(point.y, point.y, P);
    let x_sq = mod_mul(point.x, point.x, P);
    let x_cu = mod_mul(x_sq, point.x, P);
    let ax = mod_mul(A, point.x, P);
    let rhs = x_cu.add_mod(ax, P).add_mod(B, P);
    y_sq == rhs
}

fn jacobian_double(a: JacobianPoint) -> JacobianPoint {
    if a.infinity || a.y.is_zero() {
        return JacobianPoint::infinity();
    }

    let yy = mod_mul(a.y, a.y, P);
    let yyyy = mod_mul(yy, yy, P);
    let x_yy = mod_mul(a.x, yy, P);
    let s = x_yy.add_mod(x_yy, P).add_mod(x_yy, P).add_mod(x_yy, P);

    let zz = mod_mul(a.z, a.z, P);
    let x_minus_zz = a.x.sub_mod(zz, P);
    let x_plus_zz = a.x.add_mod(zz, P);
    let m_term = mod_mul(x_minus_zz, x_plus_zz, P);
    let m = m_term.add_mod(m_term, P).add_mod(m_term, P);

    let m2 = mod_mul(m, m, P);
    let two_s = s.add_mod(s, P);
    let x3 = m2.sub_mod(two_s, P);
    let s_minus_x3 = s.sub_mod(x3, P);
    let m_s_minus_x3 = mod_mul(m, s_minus_x3, P);
    let two_yyyy = yyyy.add_mod(yyyy, P);
    let four_yyyy = two_yyyy.add_mod(two_yyyy, P);
    let eight_yyyy = four_yyyy.add_mod(four_yyyy, P);
    let y3 = m_s_minus_x3.sub_mod(eight_yyyy, P);
    let yz = mod_mul(a.y, a.z, P);
    let z3 = yz.add_mod(yz, P);

    JacobianPoint {
        x: x3,
        y: y3,
        z: z3,
        infinity: false,
    }
}

fn jacobian_add(a: JacobianPoint, b: JacobianPoint) -> JacobianPoint {
    if a.infinity {
        return b;
    }
    if b.infinity {
        return a;
    }

    let z1z1 = mod_mul(a.z, a.z, P);
    let z2z2 = mod_mul(b.z, b.z, P);
    let u1 = mod_mul(a.x, z2z2, P);
    let u2 = mod_mul(b.x, z1z1, P);

    let z1_cubed = mod_mul(z1z1, a.z, P);
    let z2_cubed = mod_mul(z2z2, b.z, P);
    let s1 = mod_mul(a.y, z2_cubed, P);
    let s2 = mod_mul(b.y, z1_cubed, P);

    if u1 == u2 {
        if s1 != s2 {
            return JacobianPoint::infinity();
        }
        return jacobian_double(a);
    }

    let h = u2.sub_mod(u1, P);
    let two_h = h.add_mod(h, P);
    let i = mod_mul(two_h, two_h, P);
    let j = mod_mul(h, i, P);
    let s2_minus_s1 = s2.sub_mod(s1, P);
    let r = s2_minus_s1.add_mod(s2_minus_s1, P);
    let v = mod_mul(u1, i, P);

    let r2 = mod_mul(r, r, P);
    let two_v = v.add_mod(v, P);
    let x3 = r2.sub_mod(j, P).sub_mod(two_v, P);

    let v_minus_x3 = v.sub_mod(x3, P);
    let r_v_minus_x3 = mod_mul(r, v_minus_x3, P);
    let two_s1 = s1.add_mod(s1, P);
    let two_s1_j = mod_mul(two_s1, j, P);
    let y3 = r_v_minus_x3.sub_mod(two_s1_j, P);

    let z1_plus_z2 = a.z.add_mod(b.z, P);
    let z1_plus_z2_sq = mod_mul(z1_plus_z2, z1_plus_z2, P);
    let z_sum = z1_plus_z2_sq.sub_mod(z1z1, P).sub_mod(z2z2, P);
    let z3 = mod_mul(z_sum, h, P);

    JacobianPoint {
        x: x3,
        y: y3,
        z: z3,
        infinity: false,
    }
}

fn jacobian_add_mixed(a: JacobianPoint, b: AffinePoint) -> JacobianPoint {
    if a.infinity {
        return JacobianPoint::from_affine(b);
    }
    if b.infinity {
        return a;
    }

    let z1z1 = mod_mul(a.z, a.z, P);
    let u2 = mod_mul(b.x, z1z1, P);
    let z1_cubed = mod_mul(z1z1, a.z, P);
    let s2 = mod_mul(b.y, z1_cubed, P);

    if a.x == u2 {
        if a.y != s2 {
            return JacobianPoint::infinity();
        }
        return jacobian_double(a);
    }

    let h = u2.sub_mod(a.x, P);
    let hh = mod_mul(h, h, P);
    let i = hh.add_mod(hh, P).add_mod(hh, P).add_mod(hh, P);
    let j = mod_mul(h, i, P);
    let s2_minus_y1 = s2.sub_mod(a.y, P);
    let r = s2_minus_y1.add_mod(s2_minus_y1, P);
    let v = mod_mul(a.x, i, P);

    let r2 = mod_mul(r, r, P);
    let two_v = v.add_mod(v, P);
    let x3 = r2.sub_mod(j, P).sub_mod(two_v, P);

    let v_minus_x3 = v.sub_mod(x3, P);
    let r_v_minus_x3 = mod_mul(r, v_minus_x3, P);
    let two_y1 = a.y.add_mod(a.y, P);
    let two_y1_j = mod_mul(two_y1, j, P);
    let y3 = r_v_minus_x3.sub_mod(two_y1_j, P);

    let z1_plus_h = a.z.add_mod(h, P);
    let z1_plus_h_sq = mod_mul(z1_plus_h, z1_plus_h, P);
    let z3 = z1_plus_h_sq.sub_mod(z1z1, P).sub_mod(hh, P);

    JacobianPoint {
        x: x3,
        y: y3,
        z: z3,
        infinity: false,
    }
}

fn mod_mul(a: U256, b: U256, modulus: U256) -> U256 {
    let product = mul_wide(a, b);
    if modulus == P {
        mod_512_by_p256(product)
    } else {
        mod_512_by_256(product, modulus)
    }
}

fn mod_inv(a: U256, modulus: U256) -> Result<U256> {
    if a.is_zero() {
        return Err(Error::CryptoFailure(
            "p256 modular inverse of zero is undefined",
        ));
    }
    let mut u = a.modulo_small(modulus);
    let mut v = modulus;
    let mut x1 = ONE;
    let mut x2 = ZERO;

    while u != ONE && v != ONE {
        while u.is_even() {
            u.shr1_assign();
            mod_inv_halve_assign(&mut x1, modulus);
        }
        while v.is_even() {
            v.shr1_assign();
            mod_inv_halve_assign(&mut x2, modulus);
        }
        if u.cmp(&v) != Ordering::Less {
            u.sub_assign_raw(v);
            mod_sub_assign(&mut x1, x2, modulus);
        } else {
            v.sub_assign_raw(u);
            mod_sub_assign(&mut x2, x1, modulus);
        }
    }

    if u == ONE {
        Ok(x1.modulo_small(modulus))
    } else if v == ONE {
        Ok(x2.modulo_small(modulus))
    } else {
        Err(Error::CryptoFailure("p256 modular inverse does not exist"))
    }
}

fn mod_inv_halve_assign(value: &mut U256, modulus: U256) {
    if value.is_even() {
        value.shr1_assign();
    } else {
        let (sum, carry) = value.add_raw(modulus);
        *value = sum;
        value.shr1_with_high_bit_assign(carry);
    }
}

fn mod_sub_assign(value: &mut U256, other: U256, modulus: U256) {
    if value.cmp(&other) != Ordering::Less {
        value.sub_assign_raw(other);
    } else {
        let (sum, carry) = value.add_raw(modulus);
        let (diff, borrow) = sum.sub_raw(other);
        debug_assert_eq!(borrow, carry);
        *value = diff;
    }
}

fn mul_wide(a: U256, b: U256) -> [u32; 16] {
    let mut out = [0_u32; 16];
    for (i, &a_limb) in a.0.iter().enumerate() {
        let mut carry = 0_u64;
        for (j, &b_limb) in b.0.iter().enumerate() {
            let idx = i + j;
            let cur = u64::from(out[idx]) + u64::from(a_limb) * u64::from(b_limb) + carry;
            out[idx] = cur as u32;
            carry = cur >> 32;
        }
        let mut idx = i + 8;
        while carry != 0 && idx < out.len() {
            let cur = u64::from(out[idx]) + carry;
            out[idx] = cur as u32;
            carry = cur >> 32;
            idx += 1;
        }
    }
    out
}

fn mod_512_by_256(dividend: [u32; 16], divisor: U256) -> U256 {
    let mut v = divisor.0;
    let mut u = [0_u32; 17];
    u[..16].copy_from_slice(&dividend);

    let norm_shift = v[7].leading_zeros() as usize;
    if norm_shift > 0 {
        limbs_shl_bits(&mut v, norm_shift);
        limbs_shl_bits(&mut u, norm_shift);
    }

    for j in (0..=8).rev() {
        let num = (u64::from(u[j + 8]) << 32) | u64::from(u[j + 7]);
        let den = u64::from(v[7]);
        let mut qhat = (num / den).min(u64::from(u32::MAX));
        let mut rhat = num - (qhat * den);

        loop {
            let lhs = qhat * u64::from(v[6]);
            if (rhat >> 32) != 0 {
                break;
            }
            let rhs = (rhat << 32) + u64::from(u[j + 6]);
            if lhs <= rhs {
                break;
            }
            qhat -= 1;
            rhat += den;
        }

        if qhat != 0 {
            let borrow = limb_mul_sub(&mut u, j, qhat as u32, &v);
            if borrow {
                let carry_out = limb_add_at(&mut u, j, &v);
                if carry_out != 0 && (j + 9) <= 16 {
                    u[j + 9] = u[j + 9].wrapping_add(carry_out);
                }
            }
        }
    }

    while u[8] != 0 || ge_limbs_prefix(&u, &divisor.0, 8) {
        if sub_limbs_borrow_prefix(&mut u, &divisor.0, 8) {
            if u[8] != 0 {
                u[8] -= 1;
            } else {
                break;
            }
        }
    }

    if norm_shift > 0 {
        limbs_shr_bits(&mut u[..8], norm_shift);
    }

    let mut rem = [0_u32; 8];
    rem.copy_from_slice(&u[..8]);
    if ge_limbs_prefix(&rem, &divisor.0, 8) {
        sub_limbs_prefix(&mut rem, &divisor.0, 8);
    }
    U256(rem)
}

fn mod_512_by_p256(dividend: [u32; 16]) -> U256 {
    let mut acc = [0_i128; 16];
    for (idx, &limb) in dividend.iter().enumerate() {
        acc[idx] = i128::from(limb);
    }

    // Fold the top half with 2^256 = 2^224 - 2^192 - 2^96 + 1 (mod p).
    for idx in (8..16).rev() {
        let high = acc[idx];
        if high == 0 {
            continue;
        }
        acc[idx] = 0;
        acc[idx - 8] += high;
        acc[idx - 1] += high;
        acc[idx - 2] -= high;
        acc[idx - 5] -= high;
    }

    let mut carry = normalize_signed_limbs(&mut acc[..8]);
    while carry != 0 {
        acc[0] += carry;
        acc[7] += carry;
        acc[6] -= carry;
        acc[3] -= carry;
        carry = normalize_signed_limbs(&mut acc[..8]);
    }

    let mut reduced = [0_u32; 8];
    for (dst, &src) in reduced.iter_mut().zip(acc[..8].iter()) {
        *dst = src as u32;
    }

    let mut out = U256(reduced);
    while out.cmp(&P) != Ordering::Less {
        out = out.sub_raw(P).0;
    }
    out
}

fn normalize_signed_limbs(limbs: &mut [i128]) -> i128 {
    let radix = 1_i128 << 32;
    let mut carry = 0_i128;
    for limb in limbs {
        let value = *limb + carry;
        *limb = value.rem_euclid(radix);
        carry = value.div_euclid(radix);
    }
    carry
}

fn limbs_shl_bits(limbs: &mut [u32], k: usize) {
    if k == 0 || k >= 32 {
        return;
    }
    let mut carry = 0_u32;
    for limb in limbs {
        let v = *limb;
        *limb = (v << k) | carry;
        carry = v >> (32 - k);
    }
}

fn limbs_shr_bits(limbs: &mut [u32], k: usize) {
    if k == 0 || k >= 32 {
        return;
    }
    let mut carry = 0_u32;
    for idx in (0..limbs.len()).rev() {
        let v = limbs[idx];
        limbs[idx] = (v >> k) | carry;
        carry = v << (32 - k);
    }
}

fn ge_limbs_prefix(a: &[u32], b: &[u32; 8], len: usize) -> bool {
    for idx in (0..len).rev() {
        if a[idx] != b[idx] {
            return a[idx] > b[idx];
        }
    }
    true
}

fn sub_limbs_prefix(a: &mut [u32], b: &[u32; 8], len: usize) {
    let mut borrow = 0_u64;
    for idx in 0..len {
        let av = u64::from(a[idx]);
        let bv = u64::from(b[idx]) + borrow;
        if av < bv {
            a[idx] = (av + (1_u64 << 32) - bv) as u32;
            borrow = 1;
        } else {
            a[idx] = (av - bv) as u32;
            borrow = 0;
        }
    }
}

fn sub_limbs_borrow_prefix(a: &mut [u32], b: &[u32; 8], len: usize) -> bool {
    let mut borrow = 0_u64;
    for idx in 0..len {
        let av = u64::from(a[idx]);
        let bv = u64::from(b[idx]) + borrow;
        if av < bv {
            a[idx] = (av + (1_u64 << 32) - bv) as u32;
            borrow = 1;
        } else {
            a[idx] = (av - bv) as u32;
            borrow = 0;
        }
    }
    borrow != 0
}

fn limb_mul_sub(u: &mut [u32; 17], offset: usize, qhat: u32, v: &[u32; 8]) -> bool {
    let mut borrow = 0_u64;
    let mut carry = 0_u64;
    for idx in 0..8 {
        let prod = u64::from(qhat) * u64::from(v[idx]) + carry;
        carry = prod >> 32;
        let sub = (prod & 0xFFFF_FFFF) + borrow;
        let cur = u64::from(u[offset + idx]);
        if cur < sub {
            u[offset + idx] = (cur + (1_u64 << 32) - sub) as u32;
            borrow = 1;
        } else {
            u[offset + idx] = (cur - sub) as u32;
            borrow = 0;
        }
    }
    let sub_hi = carry + borrow;
    let cur = u64::from(u[offset + 8]);
    if cur < sub_hi {
        u[offset + 8] = (cur + (1_u64 << 32) - sub_hi) as u32;
        true
    } else {
        u[offset + 8] = (cur - sub_hi) as u32;
        false
    }
}

fn limb_add_at(u: &mut [u32; 17], offset: usize, v: &[u32; 8]) -> u32 {
    let mut carry = 0_u64;
    for idx in 0..8 {
        let cur = u64::from(u[offset + idx]) + u64::from(v[idx]) + carry;
        u[offset + idx] = cur as u32;
        carry = cur >> 32;
    }
    let cur = u64::from(u[offset + 8]) + carry;
    u[offset + 8] = cur as u32;
    (cur >> 32) as u32
}

#[cfg(test)]
mod tests {
    use super::*;

    const TWO: U256 = U256([2, 0, 0, 0, 0, 0, 0, 0]);
    const THREE: U256 = U256([3, 0, 0, 0, 0, 0, 0, 0]);

    fn basepoint() -> AffinePoint {
        AffinePoint {
            x: GX,
            y: GY,
            infinity: false,
        }
    }

    #[test]
    fn p256_mod_inv_halve_assign_handles_overflowing_odd_values() {
        let original = P.sub_raw(TWO).0;
        assert!(!original.is_even());

        let mut halved = original;
        mod_inv_halve_assign(&mut halved, P);

        assert_eq!(halved.add_mod(halved, P), original);
    }

    #[test]
    fn p256_mod_sub_assign_handles_overflowing_value_plus_modulus() {
        let original = P.sub_raw(THREE).0;
        let other = P.sub_raw(ONE).0;
        assert_eq!(original.cmp(&other), Ordering::Less);

        let mut reduced = original;
        mod_sub_assign(&mut reduced, other, P);

        assert_eq!(reduced.add_mod(other, P), original);
    }

    #[test]
    fn p256_jacobian_double_matches_add_for_basepoint() {
        let base = JacobianPoint::from_affine(basepoint());
        let doubled = jacobian_double(base).to_affine().expect("double");
        let added = jacobian_add(base, base).to_affine().expect("add");

        assert_eq!(doubled, added);
        assert!(is_point_on_curve(doubled));
    }

    #[test]
    fn p256_jacobian_add_mixed_matches_full_add_for_basepoint() {
        let base_affine = basepoint();
        let base = JacobianPoint::from_affine(base_affine);
        let mixed = jacobian_add_mixed(base, base_affine)
            .to_affine()
            .expect("mixed");
        let full = jacobian_add(base, JacobianPoint::from_affine(base_affine))
            .to_affine()
            .expect("full");

        assert_eq!(mixed, full);
        assert!(is_point_on_curve(mixed));
    }

    #[test]
    fn p256_specialized_field_reduction_matches_generic_reducer() {
        let samples = [
            (GX, GY),
            (GY, GX),
            (P.sub_raw(ONE).0, P.sub_raw(TWO).0),
            (
                U256([
                    0x89ab_cdef,
                    0x0123_4567,
                    0xfedc_ba98,
                    0x7654_3210,
                    1,
                    2,
                    3,
                    4,
                ]),
                GX,
            ),
        ];

        for (lhs, rhs) in samples {
            let product = mul_wide(lhs, rhs);
            let specialized = mod_512_by_p256(product);
            let generic = mod_512_by_256(product, P);
            assert_eq!(specialized, generic);
        }
    }

    #[test]
    fn p256_double_scalar_mul_matches_separate_multiplications() {
        let public = basepoint();
        let u1 = U256([
            0x89ab_cdef,
            0x0123_4567,
            0xfedc_ba98,
            0x7654_3210,
            1,
            2,
            3,
            4,
        ])
        .modulo_small(N);
        let u2 = U256([
            0x1020_3040,
            0x5060_7080,
            0x90ab_cdef,
            0x1357_9bdf,
            5,
            6,
            7,
            8,
        ])
        .modulo_small(N);

        let joint = double_scalar_mul_basepoint(u1, u2, public)
            .to_affine()
            .expect("joint");
        let separate = jacobian_add(scalar_mul_basepoint(u1), scalar_mul(u2, public))
            .to_affine()
            .expect("separate");

        assert_eq!(joint, separate);
        assert!(is_point_on_curve(joint));
    }

    #[test]
    fn p256_basepoint_scalar_mul_matches_variable_scalar_mul() {
        let scalar = U256([
            0x89ab_cdef,
            0x0123_4567,
            0xfedc_ba98,
            0x7654_3210,
            1,
            2,
            3,
            4,
        ])
        .modulo_small(N);

        let fixed = scalar_mul_basepoint(scalar).to_affine().expect("fixed");
        let variable = scalar_mul(scalar, basepoint())
            .to_affine()
            .expect("variable");

        assert_eq!(fixed, variable);
        assert!(is_point_on_curve(fixed));
    }
}