// krypteia-arcana 0.1.0

//! ECDSA signing / verifying primitives (FIPS 186-5) with deterministic
//! nonces (RFC 6979).
//!
//! This module hosts the LIMBS-generic internals that implement the
//! per-curve ECDSA (and ECDH) operations. The user-facing API -- the
//! [`Curve`](super::curves::Curve) trait and the per-curve unit structs
//! ([`P256`](super::curves::P256), [`P384`](super::curves::P384), ...)
//! lives in [`super::curves`].
//!
//! The [`Signature`] type and its DER encoding also live here because
//! they are ECDSA-specific (ECDH has no signature value).

use super::curve::*;
use super::curves::{CryptoRng, PublicKey, SecretKey};
use super::field::*;
use crate::Hasher;

// ============================================================================
// Signature type + DER
// ============================================================================

/// ECDSA signature `(r, s)` as big-endian byte arrays.
///
/// Each component is `felem_bytes` octets long for the curve
/// (32 / 48 / 64 / 66 depending on the curve).
#[derive(Clone, Debug)]
pub struct Signature {
    /// `r` component of the signature, big-endian.
    pub r: Vec<u8>,
    /// `s` component of the signature, big-endian.
    pub s: Vec<u8>,
}

impl Signature {
    /// Encode as ASN.1 DER (RFC 5480 / X.509 standard form):
    ///
    /// ```asn1
    /// ECDSA-Sig-Value ::= SEQUENCE {
    ///     r  INTEGER,
    ///     s  INTEGER
    /// }
    /// ```
    ///
    /// Uses the strict canonical DER encoding:
    /// - Lengths use the shortest form (single byte < 128, `81 xx` up
    ///   to 255, `82 hi lo` above).
    /// - INTEGERs strip leading zero octets, and prepend one `00` octet
    ///   if the high bit of the first octet would otherwise be set
    ///   (which would make the number look negative in two's complement).
    ///
    /// This is the format used by X.509 certificates, TLS, S/MIME, CMS,
    /// JWS `ES256-DER`, and virtually every OpenSSL-derived tool.
    pub fn to_der(&self) -> Vec<u8> {
        let r_der = encode_der_integer(&self.r);
        let s_der = encode_der_integer(&self.s);
        let payload_len = r_der.len() + s_der.len();

        let mut out = Vec::with_capacity(4 + payload_len);
        out.push(0x30); // SEQUENCE
        encode_der_len(&mut out, payload_len);
        out.extend_from_slice(&r_der);
        out.extend_from_slice(&s_der);
        out
    }

    /// Parse an ASN.1 DER encoding (strict). Returns `None` if the input
    /// is not a valid canonical DER encoding of an ECDSA signature.
    ///
    /// Rejected inputs include (for defence against signature malleability
    /// and parser-differential attacks):
    /// - Any input whose length does not exactly match the advertised
    ///   SEQUENCE length.
    /// - Non-minimal length encodings (e.g. `81 10` where `10` would do,
    ///   or `82 00 10` where `10` would do).
    /// - INTEGERs with superfluous leading zero octets, or with the high
    ///   bit set (which would be a negative number).
    /// - r or s equal to zero.
    ///
    /// The returned `Signature` has `r` and `s` stripped of their DER
    /// padding (so they may be shorter than `LIMBS * 8` bytes). This is
    /// fine for `Curve::verify`, which interprets them via `bits2int` and
    /// thus handles variable widths correctly.
    pub fn from_der(der: &[u8]) -> Option<Self> {
        let (seq_tag, rest) = (*der.first()?, &der[1..]);
        if seq_tag != 0x30 {
            return None;
        }
        let (payload_len, rest) = decode_der_len(rest)?;
        if rest.len() != payload_len {
            return None;
        }

        let (r, rest) = decode_der_integer(rest)?;
        let (s, tail) = decode_der_integer(rest)?;
        if !tail.is_empty() {
            return None;
        }

        // Reject r == 0 or s == 0 (the signing algorithm never produces
        // them either).
        if r.iter().all(|&b| b == 0) || s.iter().all(|&b| b == 0) {
            return None;
        }

        Some(Signature { r, s })
    }
}

// ============================================================================
// Tiny ASN.1 DER helpers (SEQUENCE, INTEGER) -- strict canonical encoding
// ============================================================================
//
// We implement only the two primitives ECDSA needs, deliberately. No
// generic ASN.1 parser: the attack surface on full ASN.1 is notorious.

/// Append the DER encoding of `len` to `out`, always choosing the
/// shortest legal form:
///
/// - `0..=0x7f`      -> one octet holding the length itself
/// - `0x80..=0xff`   -> `81 xx`
/// - `0x100..=0xffff`-> `82 hi lo`
///
/// # Panics
///
/// Panics for lengths above 65535. No ECDSA signature on the supported
/// curves comes anywhere near that size, so hitting this indicates a
/// caller bug rather than a recoverable condition.
fn encode_der_len(out: &mut Vec<u8>, len: usize) {
    match len {
        0..=0x7f => out.push(len as u8),
        0x80..=0xff => out.extend_from_slice(&[0x81, len as u8]),
        // `len as u8` keeps only the low octet, which is what we want here.
        0x100..=0xffff => out.extend_from_slice(&[0x82, (len >> 8) as u8, len as u8]),
        _ => panic!("DER length > 65535 is not supported by this encoder"),
    }
}

/// Read one DER length from the front of `bytes`.
///
/// On success returns the decoded length together with the slice that
/// follows the length octets. Returns `None` when the input is
/// truncated, uses a form this decoder does not support (indefinite
/// length, or three or more length octets), or is not the shortest
/// possible encoding of its value (`81 xx` with `xx < 0x80`, or
/// `82 hi lo` with `hi == 0`).
fn decode_der_len(bytes: &[u8]) -> Option<(usize, &[u8])> {
    match bytes {
        // Short form: the octet is the length.
        [short, rest @ ..] if *short < 0x80 => Some((usize::from(*short), rest)),
        // One length octet; values below 0x80 must use the short form.
        [0x81, len, rest @ ..] if *len >= 0x80 => Some((usize::from(*len), rest)),
        // Two length octets; a zero high octet means `81 xx` (or shorter)
        // would have sufficed.
        [0x82, hi, lo, rest @ ..] if *hi != 0 => {
            Some(((usize::from(*hi) << 8) | usize::from(*lo), rest))
        }
        // Everything else: empty or truncated input, non-minimal forms,
        // and longer length forms (legal ASN.1, never needed here).
        _ => None,
    }
}

/// Encode the non-negative big-endian integer `be` as a DER INTEGER
/// (tag `0x02`, length, contents).
///
/// Leading zero octets of `be` are dropped; an all-zero or empty input
/// is encoded as the single content octet `00`. When the first remaining
/// octet has its high bit set, one `00` octet is prepended so the value
/// is not read as negative.
fn encode_der_integer(be: &[u8]) -> Vec<u8> {
    // Minimal magnitude: everything from the first non-zero octet on.
    let magnitude: &[u8] = match be.iter().position(|&octet| octet != 0) {
        Some(start) => &be[start..],
        None => &[0u8],
    };

    // `magnitude` is never empty, so indexing its first octet is safe.
    let sign_pad = magnitude[0] & 0x80 != 0;
    let content_len = magnitude.len() + usize::from(sign_pad);

    let mut der = Vec::with_capacity(2 + content_len);
    der.push(0x02); // INTEGER tag
    encode_der_len(&mut der, content_len);
    if sign_pad {
        der.push(0x00);
    }
    der.extend_from_slice(magnitude);
    der
}

/// Read one DER INTEGER from the front of `bytes`.
///
/// On success returns the integer's magnitude as big-endian bytes (with
/// the sign-padding `00` octet removed, if there was one) and the slice
/// that follows the INTEGER. Returns `None` for anything that is not a
/// canonical, non-negative INTEGER:
/// - wrong tag, bad length encoding, or contents running past the input;
/// - empty contents;
/// - high bit of the first content octet set (a negative number);
/// - a leading `00` octet that was not required to keep the sign positive.
fn decode_der_integer(bytes: &[u8]) -> Option<(Vec<u8>, &[u8])> {
    let (&tag, after_tag) = bytes.split_first()?;
    if tag != 0x02 {
        return None;
    }

    let (len, rest) = decode_der_len(after_tag)?;
    if len == 0 || len > rest.len() {
        return None;
    }
    let (content, tail) = rest.split_at(len);

    let magnitude = match content {
        // Negative number.
        [first, ..] if *first & 0x80 != 0 => return None,
        // `00` followed by an octet that did not need sign protection.
        [0x00, next, ..] if *next & 0x80 == 0 => return None,
        // Legitimate sign padding: drop it.
        [0x00, unpadded @ ..] if !unpadded.is_empty() => unpadded,
        // No padding (this includes the single-octet value zero).
        _ => content,
    };

    Some((magnitude.to_vec(), tail))
}

// ============================================================================
// HMAC (used by RFC 6979)
// ============================================================================

/// Compute HMAC over `data` with `key`, using the hash function `H`:
///
/// `HMAC(K, text) = H((K' ^ opad) || H((K' ^ ipad) || text))`
///
/// where `K'` is `key` zero-padded to `H::BLOCK_LEN` octets (after first
/// being replaced by `H(key)` if it is longer than one block), `ipad` is
/// the octet `0x36` repeated and `opad` is the octet `0x5c` repeated.
fn hmac<H: Hasher>(key: &[u8], data: &[u8]) -> Vec<u8> {
    let block_len = H::BLOCK_LEN;

    // K': the key brought to exactly one block.
    let mut block_key = vec![0u8; block_len];
    if key.len() > block_len {
        let digest = H::hash(key);
        block_key[..digest.len()].copy_from_slice(&digest);
    } else {
        block_key[..key.len()].copy_from_slice(key);
    }

    let ipad: Vec<u8> = block_key.iter().map(|octet| octet ^ 0x36).collect();
    let opad: Vec<u8> = block_key.iter().map(|octet| octet ^ 0x5c).collect();

    // Inner pass: H(ipad || data).
    let mut inner = H::new();
    inner.update(&ipad);
    inner.update(data);
    let inner_digest = inner.finalize();

    // Outer pass: H(opad || inner digest).
    let mut outer = H::new();
    outer.update(&opad);
    outer.update(&inner_digest);
    outer.finalize()
}

/// HMAC over the concatenation of `parts`, using the hash function `H`.
///
/// Gives the same result as calling `hmac` on the parts joined into one
/// buffer, but feeds each part to the inner hash in turn so no joined
/// copy has to be built.
fn hmac_multi<H: Hasher>(key: &[u8], parts: &[&[u8]]) -> Vec<u8> {
    let block_len = H::BLOCK_LEN;

    // Keys longer than one block are replaced by their digest.
    let hashed_key;
    let key_material: &[u8] = if key.len() > block_len {
        hashed_key = H::hash(key);
        &hashed_key
    } else {
        key
    };

    // Start from the bare pad constants, then fold the key into the
    // leading octets. The remaining octets correspond to the zero padding
    // of the key and therefore stay equal to the constants.
    let mut ipad = vec![0x36u8; block_len];
    let mut opad = vec![0x5cu8; block_len];
    let key_len = key_material.len();
    for (pad, &octet) in ipad[..key_len].iter_mut().zip(key_material) {
        *pad ^= octet;
    }
    for (pad, &octet) in opad[..key_len].iter_mut().zip(key_material) {
        *pad ^= octet;
    }

    // Inner pass: H(ipad || part_0 || part_1 || ...).
    let mut inner = H::new();
    inner.update(&ipad);
    for part in parts {
        inner.update(part);
    }
    let inner_digest = inner.finalize();

    // Outer pass: H(opad || inner digest).
    let mut outer = H::new();
    outer.update(&opad);
    outer.update(&inner_digest);
    outer.finalize()
}

// ============================================================================
// RFC 6979 §2.3.2 bits2int
// ============================================================================

/// RFC 6979 §2.3.2 `bits2int`: turn a digest (or any byte string) into an
/// integer no wider than the curve order.
///
/// With `blen = 8 * input.len()` and a curve order of `qlen_bits` bits,
/// the result is the leftmost `min(blen, qlen_bits)` bits of `input`,
/// read as a big-endian non-negative integer.
///
/// How this is done at byte granularity:
///
/// 1. `rlen_bytes = ceil(qlen_bits / 8)` is the byte length of the order.
/// 2. Only the leftmost `min(input.len(), rlen_bytes)` bytes of `input`
///    are decoded. Taking the *leftmost* bytes is the point of this
///    function: decoding an over-long digest directly would not select
///    the bits RFC 6979 asks for.
/// 3. If those bytes hold more than `qlen_bits` bits, the value is
///    shifted right by the surplus, dropping the low bits.
///
/// The surplus in step 3 is always below 8 and is zero whenever
/// `qlen_bits` is a multiple of 8. For P-521 (`qlen_bits = 521`,
/// `rlen_bytes = 66`) it is 7 bits, and only when the input is at least
/// 66 bytes long: a 64-byte SHA-512 digest needs no shift.
fn bits2int<const LIMBS: usize>(input: &[u8], qlen_bits: usize) -> FieldElement<LIMBS> {
    let order_bytes = (qlen_bits + 7) / 8;
    let leftmost = &input[..input.len().min(order_bytes)];

    let mut value = FieldElement::<LIMBS>::from_bytes_be(leftmost);

    // Bits decoded beyond what the order can hold (0 when there are none).
    let surplus_bits = (leftmost.len() * 8).saturating_sub(qlen_bits);
    if surplus_bits > 0 {
        shr_limbs(&mut value, surplus_bits as u32);
    }
    value
}

/// Shift `x` right by `n` bits in place, treating `x.limbs` as one wide
/// integer stored least-significant limb first.
///
/// `n` must satisfy `0 < n < 64` (checked in debug builds only). The bits
/// shifted out of the lowest limb are discarded and zeros enter at the
/// top. [`bits2int`] uses this for the sub-byte truncation needed when
/// the order's bit length is not a multiple of 8 (P-521).
fn shr_limbs<const LIMBS: usize>(x: &mut FieldElement<LIMBS>, n: u32) {
    debug_assert!(n > 0 && n < 64, "shr_limbs supports 1..=63");
    let spill = 64 - n;

    // Walk from the most significant limb downwards, handing each limb's
    // low `n` bits to the limb below it.
    let mut carry_in = 0u64;
    for idx in (0..LIMBS).rev() {
        let limb = x.limbs[idx];
        x.limbs[idx] = (limb >> n) | carry_in;
        carry_in = limb << spill;
    }
}

// ============================================================================
// RFC 6979 deterministic nonce generation
// ============================================================================

/// RFC 6979 §2.3.3 `int2octets(x)`: re-encode the big-endian integer in
/// `x_bytes` as exactly `rlen_bytes` octets.
///
/// - Input shorter than `rlen_bytes` is left-padded with zero octets.
/// - Input longer than `rlen_bytes` keeps only its last `rlen_bytes`
///   octets. The dropped leading octets are not inspected; the callers in
///   this module pass values below `2^qlen`, for which they are zero.
fn int2octets(x_bytes: &[u8], rlen_bytes: usize) -> Vec<u8> {
    let have = x_bytes.len();
    if have >= rlen_bytes {
        // Same width or wider: keep the low-order tail.
        return x_bytes[have - rlen_bytes..].to_vec();
    }

    // Narrower: zero-fill the front, then append the value.
    let mut octets = Vec::with_capacity(rlen_bytes);
    octets.resize(rlen_bytes - have, 0u8);
    octets.extend_from_slice(x_bytes);
    octets
}

/// Derive the deterministic ECDSA nonce `k` of RFC 6979 §3.2, using
/// HMAC over the hash function `H`.
///
/// * `x_bytes`   - the private key, big-endian (any width; it is
///                 normalised with `int2octets`).
/// * `h1`        - the message digest.
/// * `n`         - the curve order as limbs.
/// * `qlen_bits` - the bit length of `n`.
///
/// Returns the first candidate produced by the HMAC generator that is
/// non-zero and passes `scalar_is_valid` against `n`.
///
/// Every byte length used here is `rlen_bytes = ceil(qlen_bits / 8)`,
/// never `LIMBS * 8`. The two differ on P-521 (66 versus 72), and feeding
/// the wider encoding into the HMAC chain would yield a nonce that does
/// not match the RFC 6979 reference.
fn rfc6979_k<H: Hasher, const LIMBS: usize>(
    x_bytes: &[u8],
    h1: &[u8],
    n: &[u64; LIMBS],
    qlen_bits: usize,
) -> FieldElement<LIMBS> {
    let rlen_bytes = (qlen_bits + 7) / 8;

    // int2octets(x): the private key at exactly rlen_bytes.
    let x_octets = int2octets(x_bytes, rlen_bytes);

    // bits2octets(h1) = int2octets(bits2int(h1) mod q).
    let h1_octets = {
        let reduced = reduce_mod_n(&bits2int::<LIMBS>(h1, qlen_bits), n);
        int2octets(&reduced.to_bytes_be(), rlen_bytes)
    };

    // Step a is the digest itself (`h1`).
    // Steps b and c: V = 0x01 repeated, K = 0x00 repeated, hlen octets each.
    let mut v = vec![0x01u8; H::OUTPUT_LEN];
    let mut k = vec![0x00u8; H::OUTPUT_LEN];

    // Steps d-g: two seeding rounds that differ only in the separator.
    //   K = HMAC_K(V || sep || int2octets(x) || bits2octets(h1))
    //   V = HMAC_K(V)
    for &separator in &[0x00u8, 0x01u8] {
        k = hmac_multi::<H>(&k, &[&v, &[separator], &x_octets, &h1_octets]);
        v = hmac::<H>(&k, &v);
    }

    // Step h: draw candidates until one lands in [1, n-1].
    loop {
        // h.1 / h.2: T = HMAC output blocks, concatenated until at least
        // rlen_bytes are available.
        let mut t = Vec::new();
        while t.len() < rlen_bytes {
            v = hmac::<H>(&k, &v);
            t.extend_from_slice(&v);
        }

        // h.3: k = bits2int(T), which also applies the sub-byte shift on
        // curves whose qlen is not a multiple of 8.
        let candidate = bits2int::<LIMBS>(&t[..rlen_bytes], qlen_bits);
        let acceptable = !candidate.is_zero() && scalar_is_valid(&candidate, n);
        if acceptable {
            return candidate;
        }

        // Rejected: K = HMAC_K(V || 0x00), V = HMAC_K(V), then retry.
        k = hmac_multi::<H>(&k, &[&v, &[0x00]]);
        v = hmac::<H>(&k, &v);
    }
}

/// Reduce a value mod n by conditional subtraction.
///
/// Runs exactly two passes. Each pass computes `result - n` limb by limb,
/// starting at index 0 (the least significant limb, since that is where
/// the borrow chain begins), and keeps the difference only if the
/// subtraction did not borrow, i.e. only if `result >= n`. The choice
/// between the difference and the unchanged value is made with bit masks
/// rather than a branch.
///
/// Two passes fully reduce any input below `3n`; a larger input is
/// returned only partially reduced.
fn reduce_mod_n<const LIMBS: usize>(a: &FieldElement<LIMBS>, n: &[u64; LIMBS]) -> FieldElement<LIMBS> {
    let mut result = *a;
    // Up to two subtractions (hash might be up to 2n).
    for _ in 0..2 {
        // Borrow carried from the previous (less significant) limb: 0 or 1.
        let mut borrow: u64 = 0;
        // Candidate value `result - n`, used only if no final borrow occurs.
        let mut sub = [0u64; LIMBS];
        for i in 0..LIMBS {
            // Subtract in 128-bit arithmetic so an underflow shows up as
            // set bits above bit 63.
            let diff = (result.limbs[i] as u128)
                .wrapping_sub(n[i] as u128)
                .wrapping_sub(borrow as u128);
            sub[i] = diff as u64;
            // Bit 64 is set exactly when this limb's subtraction wrapped.
            borrow = ((diff >> 64) as u64) & 1;
        }
        // A final borrow of 0 means `result >= n`: take the difference.
        let mask = 0u64.wrapping_sub(1 - borrow); // all-ones if result >= n
        let inv_mask = !mask;
        for i in 0..LIMBS {
            // Branch-free select between `sub` (mask) and `result` (inv_mask).
            result.limbs[i] = (sub[i] & mask) | (result.limbs[i] & inv_mask);
        }
    }
    result
}

// ============================================================================
// Generic ECDSA implementation
// ============================================================================
//
// keygen / sign / verify are written once, generic over both the curve
// (`CurveParams<LIMBS>`) and the hash function (`H: Hasher`). The per-curve
// wrappers below (P256, Secp256k1, ...) are thin glue that pin the
// LIMBS const, the curve params constructor, and the canonical hash.

/// Parse an SEC1 public key -- **either** uncompressed or compressed --
/// and validate that the encoded point lies on the curve.
///
/// Accepted encodings (where `F = params.felem_bytes` is the **SEC1
/// field element octet length**, not the internal storage width):
///
/// | Tag  | Length    | Form                           |
/// |------|-----------|--------------------------------|
/// | 0x04 | 1 + 2*F   | Uncompressed: `04 \|\| X \|\| Y` |
/// | 0x02 | 1 + F     | Compressed, y is even          |
/// | 0x03 | 1 + F     | Compressed, y is odd           |
///
/// For every curve we ship except P-521, `F == LIMBS*8`. For P-521
/// `F = 66` while `LIMBS*8 = 72`, so the parser accepts 133-byte
/// uncompressed / 67-byte compressed inputs as per SEC1.
///
/// Returns `Some(point)` only if the length, the tag, and the on-curve
/// check all pass; an empty input or any other tag yields `None`. For
/// compressed input, the Y coordinate is recovered via
/// `field_sqrt_p3mod4(x^3 + a*x + b)`: if that value squared does
/// not equal the RHS of the curve equation, X is not a valid
/// x-coordinate and the on-curve safety-net rejects the point.
///
/// **All callers that consume an externally-provided public key must
/// use this function.** Skipping the on-curve check enables invalid-curve
/// attacks. The check is mandatory in ECDH and is the recommended
/// hardening for ECDSA verify (where it defends against attacker-
/// controlled keys fed to a verifier).
///
/// Shared between [`verify_internal`], [`ecdh_internal`], and the
/// `Curve::decompress_pubkey` trait method so that there is exactly
/// one entry point for external public keys.
///
/// NOTE(review): this function does not itself compare the decoded X / Y
/// against `p`. Whether an encoding of a value `>= p` is rejected
/// therefore depends on `from_bytes_be` and `is_on_curve`, which are
/// defined elsewhere -- confirm, since SEC1 public-key validation
/// requires both coordinates to lie in `[0, p-1]`.
pub(super) fn parse_and_validate_pubkey<const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    pk: &PublicKey,
) -> Option<JacobianPoint<LIMBS>> {
    let felem = params.felem_bytes;
    let bytes = pk.bytes.as_slice();

    // Dispatch on the SEC1 tag octet; an empty input returns `None` here.
    let (qx, qy) = match bytes.first().copied()? {
        // Uncompressed: 04 || X || Y
        0x04 => {
            if bytes.len() != 1 + 2 * felem {
                return None;
            }
            // `from_bytes_be` transparently left-pads short inputs into
            // the internal LIMBS*8 storage, so passing the `felem`-wide
            // slice directly is correct for P-521 (66 -> 72 with 6
            // leading zero bytes).
            let qx = FieldElement::<LIMBS>::from_bytes_be(&bytes[1..1 + felem]);
            let qy = FieldElement::<LIMBS>::from_bytes_be(&bytes[1 + felem..1 + 2 * felem]);
            (qx, qy)
        }

        // Compressed: 02 || X (y even) or 03 || X (y odd)
        tag @ (0x02 | 0x03) => {
            if bytes.len() != 1 + felem {
                return None;
            }
            let qx = FieldElement::<LIMBS>::from_bytes_be(&bytes[1..1 + felem]);

            // Compute RHS = x^3 + a*x + b  mod p
            let p = &params.p;
            let x2 = field_sqr(&qx, p);
            let x3 = field_mul(&x2, &qx, p);
            let ax = field_mul(&params.a, &qx, p);
            let rhs = field_add(&field_add(&x3, &ax, p), &params.b, p);

            // Candidate y = sqrt(RHS) under the assumption p ≡ 3 (mod 4).
            // This is correct for all six curves we currently support.
            let mut qy = field_sqrt_p3mod4(&rhs, p);

            // If the computed parity doesn't match the requested parity,
            // use -y = p - y instead. Not constant-time (parity of a public
            // point is public information).
            // The low bit of the tag (02 -> 0, 03 -> 1) is the wanted
            // parity; the low bit of limb 0 is the parity we have.
            let want_odd = (tag & 1) == 1;
            let have_odd = (qy.limbs[0] & 1) == 1;
            if have_odd != want_odd {
                qy = field_neg(&qy, p);
            }

            (qx, qy)
        }

        // Any other tag (including 0x00, the point at infinity) is rejected.
        _ => return None,
    };

    // On-curve check: for uncompressed input this is the main defence
    // against attacker-supplied off-curve keys; for compressed input it
    // also doubles as the non-residue detector (a bogus `y` produced by
    // field_sqrt_p3mod4 on a non-QR X will fail here).
    if !is_on_curve(&qx, &qy, params) {
        return None;
    }

    Some(JacobianPoint::from_affine(qx, qy))
}

/// Encode `fe` as exactly `felem_bytes` big-endian octets (the SEC1
/// §2.3.5 external width).
///
/// `FieldElement::to_bytes_be()` produces the wider internal encoding;
/// this function keeps its last `felem_bytes` octets and drops whatever
/// precedes them. When the two widths agree (every shipped curve except
/// P-521) that is a plain copy; for P-521 it removes the 6 leading
/// octets of the 72-byte internal form, leaving the standard 66.
///
/// **Invariant relied upon, not checked:** the dropped octets are zero.
/// That holds whenever the value is below `p`, which the field
/// arithmetic is expected to guarantee.
///
/// `felem_bytes` must not exceed the internal width; this is asserted in
/// debug builds.
pub(super) fn fe_to_felem_bytes<const LIMBS: usize>(fe: &FieldElement<LIMBS>, felem_bytes: usize) -> Vec<u8> {
    let wide = fe.to_bytes_be();
    let wide_len = wide.len();
    debug_assert!(felem_bytes <= wide_len);

    // Number of leading octets of the internal encoding to discard.
    let skip = wide_len - felem_bytes;
    wide[skip..].to_vec()
}

/// SEC1 point compression: produce `0x02 || X` (y even) or `0x03 || X`
/// (y odd) for the given public key.
///
/// Two input forms are accepted:
/// - **uncompressed** (`0x04 || X || Y`): X is copied out and the tag is
///   chosen from the low bit of Y's last octet;
/// - **compressed** (`0x02/0x03 || X`): the encoding is returned
///   unchanged, so the operation is idempotent.
///
/// In both cases the key must first pass
/// [`parse_and_validate_pubkey`], so a compressed encoding is never
/// emitted for a key that would be refused as input.
///
/// Returns `None` for an empty input, an unknown tag, a wrong length, or
/// a key that fails validation.
pub(super) fn compress_pubkey_internal<const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    pk: &PublicKey,
) -> Option<Vec<u8>> {
    let felem = params.felem_bytes;
    let encoded = pk.bytes.as_slice();

    // The tag determines the only acceptable total length.
    let expected_len = match *encoded.first()? {
        0x04 => 1 + 2 * felem,
        0x02 | 0x03 => 1 + felem,
        _ => return None,
    };
    if encoded.len() != expected_len {
        return None;
    }

    // Never emit an encoding for a key we would not accept ourselves.
    parse_and_validate_pubkey::<LIMBS>(params, pk)?;

    if encoded[0] != 0x04 {
        // Already compressed: hand back a copy.
        return Some(encoded.to_vec());
    }

    // Layout is tag (1) || X (felem) || Y (felem), so Y's final octet,
    // whose low bit is the parity of y, sits at index 2 * felem.
    let y_parity = encoded[2 * felem] & 1;
    let mut compressed = Vec::with_capacity(1 + felem);
    compressed.push(0x02 | y_parity);
    compressed.extend_from_slice(&encoded[1..1 + felem]);
    Some(compressed)
}

/// SEC1 point decompression: turn a `0x02/0x03 || X` encoding into the
/// uncompressed `0x04 || X || Y` form.
///
/// All parsing, Y recovery and on-curve checking is delegated to
/// [`parse_and_validate_pubkey`]; this function only re-serialises the
/// resulting point with `felem_bytes` octets per coordinate.
///
/// Because the parser also understands uncompressed keys, an input that
/// is already uncompressed is validated and re-emitted, which makes this
/// usable as a "normalise to uncompressed" entry point.
///
/// Returns `None` if the input is rejected by the parser or if the point
/// cannot be converted back to affine coordinates.
pub(super) fn decompress_pubkey_internal<const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    compressed: &[u8],
) -> Option<PublicKey> {
    let felem = params.felem_bytes;

    // Wrap the raw bytes so the shared parser can be reused as-is.
    let candidate = PublicKey {
        bytes: compressed.to_vec(),
    };

    // The parser returns a point built by `from_affine`; converting it
    // straight back costs a little work but avoids a second code path.
    let (qx, qy) = parse_and_validate_pubkey::<LIMBS>(params, &candidate)?.to_affine(&params.p)?;

    let mut encoded = Vec::with_capacity(1 + 2 * felem);
    encoded.push(0x04);
    encoded.extend_from_slice(&fe_to_felem_bytes(&qx, felem));
    encoded.extend_from_slice(&fe_to_felem_bytes(&qy, felem));
    Some(PublicKey { bytes: encoded })
}

/// ECDH shared-secret derivation, generic over the curve.
///
/// Multiplies the peer's public point by our secret scalar and returns
/// the X coordinate of the result as **`felem_bytes`** big-endian octets
/// (32 / 48 / 64 on the byte-aligned curves, 66 on P-521 as per SEC1
/// §2.3.5).
///
/// Returns `None` when any of these checks fails:
///
/// 1. `peer_pk` parses and validates via [`parse_and_validate_pubkey`];
/// 2. our secret scalar is non-zero and passes `scalar_is_valid`
///    against the curve order;
/// 3. the product can be converted to affine coordinates, i.e. it is
///    not the point at infinity.
///
/// It lives beside the ECDSA helpers because the LIMBS-generic curve
/// operations are all in this module, which lets the per-curve wrappers
/// expose ECDH and ECDSA through one uniform interface.
pub(super) fn ecdh_internal<const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    sk: &SecretKey,
    peer_pk: &PublicKey,
) -> Option<Vec<u8>> {
    // Check 1: never operate on an unvalidated peer key.
    let peer_point = parse_and_validate_pubkey::<LIMBS>(params, peer_pk)?;

    // Check 2: our own scalar must lie in [1, n-1].
    let d = FieldElement::<LIMBS>::from_bytes_be(&sk.bytes);
    let scalar_in_range = !d.is_zero() && scalar_is_valid(&d, &params.n);
    if !scalar_in_range {
        return None;
    }

    // Shared point d * peer_pk.
    let shared = scalar_mul_point(&d, &peer_point, params);

    // Check 3 and output: `to_affine` yields `None` for the point at
    // infinity; otherwise emit X at the SEC1 width.
    shared
        .to_affine(&params.p)
        .map(|(sx, _sy)| fe_to_felem_bytes(&sx, params.felem_bytes))
}

/// Fill `buf` with random bytes from `rng`, then clear every bit above
/// bit position `qlen_bits`, reading `buf` as one big-endian integer.
///
/// The rejection-sampling loops in this module use it so that a
/// candidate has at most `qlen_bits` significant bits. Without the mask,
/// a curve such as P-521 (521-bit order in a 66-byte buffer) would see
/// almost every candidate rejected.
///
/// Masking rules, with `excess = buf.len() * 8 - qlen_bits`:
/// - `excess <= 0` (byte-aligned order, or a buffer narrower than the
///   order): nothing is masked, this is a plain `fill_bytes`;
/// - otherwise the first `excess / 8` octets are zeroed and the top
///   `excess % 8` bits of the following octet are cleared.
///
/// The callers in this file always pass `buf.len() == ceil(qlen_bits / 8)`,
/// which gives `0 <= excess < 8`. The remaining cases (including an empty
/// buffer) are handled so that an unusual call can neither underflow nor
/// apply a wrong mask.
fn fill_bytes_masked(buf: &mut [u8], qlen_bits: usize, rng: &mut dyn CryptoRng) {
    rng.fill_bytes(buf);

    // Saturating: a buffer narrower than the order has no excess bits.
    let excess_bits = (buf.len() * 8).saturating_sub(qlen_bits);
    let whole_bytes = excess_bits / 8;
    let partial_bits = excess_bits % 8;

    // `whole_bytes <= buf.len()` because `excess_bits <= buf.len() * 8`.
    for octet in &mut buf[..whole_bytes] {
        *octet = 0;
    }

    // A non-zero remainder implies `excess_bits < buf.len() * 8`, hence
    // `whole_bytes < buf.len()` and the index below is in bounds.
    if partial_bits > 0 {
        buf[whole_bytes] &= 0xffu8 >> partial_bits;
    }
}

/// ECDSA key generation, generic over the curve. Same operation as ECDH
/// keygen (a curve key pair is a curve key pair, regardless of how it
/// will be used downstream), so this is exposed to sibling modules under
/// `ecc::`.
///
/// The secret scalar is drawn by rejection sampling: `rlen_bytes =
/// (qlen_bits + 7) / 8` random octets are masked to `qlen_bits` bits (see
/// `fill_bytes_masked`) and accepted once the value is non-zero and
/// passes `scalar_is_valid` against the curve order. Sampling at this
/// width, instead of the `LIMBS * 8`-byte storage width, keeps the
/// acceptance rate practical on P-521 (qlen = 521 versus 576 storage
/// bits).
///
/// Returns `(PublicKey, SecretKey)` where:
/// - `SecretKey.bytes` is the accepted scalar at `rlen_bytes` width,
///   which equals the SEC1 `felem_bytes` width on every shipped curve
///   (asserted in debug builds);
/// - `PublicKey.bytes` is the SEC1 uncompressed encoding
///   `04 || X || Y` with `felem_bytes` octets per coordinate.
///
/// # Panics
///
/// Panics if `d * G` is the point at infinity. For `d` in `[1, n-1]` and
/// a generator of order `n` this cannot happen, so a panic here would
/// indicate broken curve parameters or arithmetic.
pub(super) fn keygen_internal<const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    rng: &mut dyn CryptoRng,
) -> (PublicKey, SecretKey) {
    let g = JacobianPoint::from_affine(params.gx, params.gy);
    let felem = params.felem_bytes;
    let rlen_bytes = (params.qlen_bits + 7) / 8;

    // For every curve we ship, `rlen_bytes == felem_bytes` -- the secret
    // key's external width equals the sampling width. (If a future curve
    // broke that invariant, `sk_bytes` would need to be padded or
    // truncated to `felem_bytes` before being returned.) The condition
    // does not change between attempts, so it is checked once.
    debug_assert_eq!(rlen_bytes, felem);

    // One buffer reused across attempts, as in `sample_random_scalar`;
    // each attempt is expected to overwrite it completely.
    let mut sk_bytes = vec![0u8; rlen_bytes];

    loop {
        // Draw `rlen_bytes` masked to `qlen_bits`.
        fill_bytes_masked(&mut sk_bytes, params.qlen_bits, rng);

        // Decode into a FieldElement<LIMBS> for the range check and the
        // scalar multiplication. `from_bytes_be` left-pads automatically.
        let d = FieldElement::<LIMBS>::from_bytes_be(&sk_bytes);

        // Ensure 1 <= d < n; otherwise draw again.
        if d.is_zero() || !scalar_is_valid(&d, &params.n) {
            continue;
        }

        let q = scalar_mul_point(&d, &g, params);
        let (qx, qy) = q
            .to_affine(&params.p)
            .expect("d in [1, n-1] times the generator is never the point at infinity");

        // SEC1 uncompressed: 04 || X (felem_bytes) || Y (felem_bytes).
        let mut pk_bytes = Vec::with_capacity(1 + 2 * felem);
        pk_bytes.push(0x04);
        pk_bytes.extend_from_slice(&fe_to_felem_bytes(&qx, felem));
        pk_bytes.extend_from_slice(&fe_to_felem_bytes(&qy, felem));

        return (PublicKey { bytes: pk_bytes }, SecretKey { bytes: sk_bytes });
    }
}

/// Build the signature `(r, s)` for message representative `e`, private
/// scalar `d` and a caller-chosen nonce `k`:
///
/// - `r = (k * G).x mod n`
/// - `s = k^{-1} * (e + r * d) mod n`
///
/// Returns `None` when this `k` cannot be used: `k * G` has no affine
/// form, or `r` or `s` comes out as zero. These cases are vanishingly
/// rare, but the algorithm requires that the caller retry with a
/// different nonce.
///
/// Both components are serialised at the SEC1 `felem_bytes` width
/// (32 / 48 / 64 on the byte-aligned curves, 66 on P-521), the fixed
/// width commonly used on the wire and accepted by the DER encoder.
///
/// Used by both the random-nonce and the RFC 6979 signing paths.
fn try_sign_with_k<const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    g: &JacobianPoint<LIMBS>,
    d: &FieldElement<LIMBS>,
    e: &FieldElement<LIMBS>,
    k: &FieldElement<LIMBS>,
) -> Option<Signature> {
    let order = &params.n;

    // x1 = x-coordinate of k * G.
    let (x1, _y1) = scalar_mul_point(k, g, params).to_affine(&params.p)?;

    // r = x1 mod n must be non-zero.
    let r = reduce_mod_n(&x1, order);
    if r.is_zero() {
        return None;
    }

    // s = k^{-1} * (e + r * d) mod n must be non-zero.
    let s = {
        let k_inv = scalar_inv(k, order);
        let e_plus_rd = scalar_add(e, &scalar_mul(&r, d, order), order);
        scalar_mul(&k_inv, &e_plus_rd, order)
    };
    if s.is_zero() {
        return None;
    }

    let width = params.felem_bytes;
    Some(Signature {
        r: fe_to_felem_bytes(&r, width),
        s: fe_to_felem_bytes(&s, width),
    })
}

/// Draw a scalar `k` uniformly from `[1, n-1]` using `rng`.
///
/// This is rejection sampling: every attempt fills
/// `(qlen_bits + 7) / 8` bytes through `fill_bytes_masked` (which is told
/// `qlen_bits` so the top byte can be masked to the curve's qlen), decodes
/// them big-endian, and retries unless the value is non-zero and accepted
/// by `scalar_is_valid`.
///
/// The buffer is deliberately sized from `qlen_bits` and not from the
/// `LIMBS*8`-byte storage width of a field element. On P-521 (qlen = 521,
/// storage = 576 bits) a naive full-width draw would be *accepted* with a
/// probability of only about 2^-55, so the loop would effectively never
/// terminate.
fn sample_random_scalar<const LIMBS: usize>(
    n: &[u64; LIMBS],
    qlen_bits: usize,
    rng: &mut dyn CryptoRng,
) -> FieldElement<LIMBS> {
    // One scratch buffer, overwritten on every attempt.
    let mut scratch = vec![0u8; (qlen_bits + 7) / 8];

    loop {
        fill_bytes_masked(&mut scratch, qlen_bits, rng);
        let k = FieldElement::<LIMBS>::from_bytes_be(&scratch);

        // Out of range (zero or >= n): throw the draw away and try again.
        if k.is_zero() || !scalar_is_valid(&k, n) {
            continue;
        }
        return k;
    }
}

/// ECDSA signing with a **random** per-signature nonce (classical ECDSA /
/// FIPS 186-5).
///
/// `digest` is the already-computed message hash. Which hash function
/// produced it does not matter on this path: the bytes are simply turned
/// into a scalar via `bits2int` and reduced mod `n`. The verifier must be
/// given the same digest bytes.
///
/// `rng` must deliver fresh, unpredictable bytes on every call. If the same
/// `k` is ever used for two signatures under one key, the secret key can be
/// recovered from them.
pub(super) fn sign_random_internal<const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    sk: &SecretKey,
    digest: &[u8],
    rng: &mut dyn CryptoRng,
) -> Signature {
    let secret = FieldElement::<LIMBS>::from_bytes_be(&sk.bytes);
    let z = reduce_mod_n(&bits2int::<LIMBS>(digest, params.qlen_bits), &params.n);
    let base = JacobianPoint::from_affine(params.gx, params.gy);

    // Keep drawing nonces until one produces a usable (r, s); a rejected
    // nonce means r or s came out zero.
    loop {
        let nonce = sample_random_scalar::<LIMBS>(&params.n, params.qlen_bits, rng);
        if let Some(signature) = try_sign_with_k::<LIMBS>(params, &base, &secret, &z, &nonce) {
            break signature;
        }
    }
}

/// ECDSA signing with a **deterministic** nonce derived per RFC 6979.
///
/// `digest` is the already-computed message hash. The type parameter `H`
/// names the hash function that produced it: RFC 6979 derives the nonce
/// with HMAC-`H`, so the choice of `H` changes the nonce and therefore the
/// signature. **The caller MUST pass the same `H` that produced the
/// digest**; otherwise determinism is broken, and signing the same
/// `(sk, msg)` through a different binding yields a different signature.
pub(super) fn sign_rfc6979_internal<H: Hasher, const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    sk: &SecretKey,
    digest: &[u8],
) -> Signature {
    let secret = FieldElement::<LIMBS>::from_bytes_be(&sk.bytes);
    let z = reduce_mod_n(&bits2int::<LIMBS>(digest, params.qlen_bits), &params.n);
    let base = JacobianPoint::from_affine(params.gx, params.gy);

    // The first nonce from RFC 6979 §3.2 is expected to be usable. The loop
    // only covers the vanishingly rare case of r or s being zero. Because
    // `rfc6979_k` is called with identical inputs each time and exposes no
    // retry counter, that case would repeat the same nonce and never
    // terminate; threading a counter into `rfc6979_k` is left as a future
    // change.
    loop {
        let nonce = rfc6979_k::<H, LIMBS>(&sk.bytes, digest, &params.n, params.qlen_bits);
        if let Some(signature) = try_sign_with_k::<LIMBS>(params, &base, &secret, &z, &nonce) {
            break signature;
        }
    }
}

/// Curve-generic ECDSA verification over a precomputed `digest`.
///
/// The verifier does not need to know which hash function produced the
/// digest: the bytes are converted with `bits2int`, reduced mod `n`, and
/// plugged into the algebraic check. It is the caller's job to supply a
/// digest of the same length the signer used.
///
/// Returns `true` only if every step below succeeds:
///
/// - the public key parses as SEC1 uncompressed and `(qx, qy)` lies on the
///   curve (guards against attacker-supplied off-curve keys, see
///   [`parse_and_validate_pubkey`]);
/// - `r` and `s` are no wider than `LIMBS * 8` bytes and both lie in
///   `[1, n-1]`;
/// - `x(u1*G + u2*Q) mod n == r`, with `u1 = e/s` and `u2 = r/s` mod `n`.
///
/// Every curve we ship has prime order (cofactor 1). A point `Q` that is on
/// the curve and is not the point at infinity (excluded by the `0x04` tag)
/// therefore has full order `n`, and no separate subgroup test is needed.
pub(super) fn verify_internal<const LIMBS: usize>(
    params: &CurveParams<LIMBS>,
    pk: &PublicKey,
    digest: &[u8],
    sig: &Signature,
) -> bool {
    let order = &params.n;

    // ECDH derive goes through this same parser, which keeps the public-key
    // validation rules of the two operations from drifting apart.
    let pub_point = match parse_and_validate_pubkey::<LIMBS>(params, pk) {
        Some(point) => point,
        None => return false,
    };

    // Width gate: a component longer than the field-element storage
    // (LIMBS * 8 bytes) -- e.g. a 33-byte DER INTEGER on P-256 -- would be
    // silently truncated by `from_bytes_be`, and the truncated value might
    // happen to verify. Refuse such input up front.
    let max_width = LIMBS * 8;
    if sig.r.len().max(sig.s.len()) > max_width {
        return false;
    }

    let r = FieldElement::<LIMBS>::from_bytes_be(&sig.r);
    let s = FieldElement::<LIMBS>::from_bytes_be(&sig.s);

    // Both components must be non-zero and below the group order.
    let r_in_range = !r.is_zero() && scalar_is_valid(&r, order);
    let s_in_range = !s.is_zero() && scalar_is_valid(&s, order);
    if !(r_in_range && s_in_range) {
        return false;
    }

    let z = reduce_mod_n(&bits2int::<LIMBS>(digest, params.qlen_bits), order);

    // u1 = e * s^{-1}, u2 = r * s^{-1} (mod n).
    let s_inverse = scalar_inv(&s, order);
    let u1 = scalar_mul(&z, &s_inverse, order);
    let u2 = scalar_mul(&r, &s_inverse, order);

    // R' = u1*G + u2*Q; accept iff it has an affine form and x(R') mod n == r.
    let base = JacobianPoint::from_affine(params.gx, params.gy);
    let combined = double_scalar_mul(&u1, &base, &u2, &pub_point, params);
    match combined.to_affine(&params.p) {
        Some((x_coord, _)) => r == reduce_mod_n(&x_coord, order),
        None => false,
    }
}

// The user-facing [`Curve`](super::curves::Curve) trait, the per-curve
// unit structs (`P256`, `P384`, ...), and the dispatch macro are defined
// in [`super::curves`]; they dispatch through the `*_internal` functions
// above.

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;
    use crate::hash::sha256::Sha256;

    /// Decode a hex string (two hex digits per byte, no separators) into
    /// bytes. Panics on odd length or on a non-hex digit; test fixtures only.
    fn hex_to_bytes(hex: &str) -> Vec<u8> {
        assert!(hex.len() % 2 == 0, "hex fixture must have an even number of digits");
        (0..hex.len())
            .step_by(2)
            .map(|i| u8::from_str_radix(&hex[i..i + 2], 16).unwrap())
            .collect()
    }

    /// RFC 6979 test vector for P-256 with SHA-256.
    /// Private key: C9AFA9D845BA75166B5C215767B1D6934E50C3DB36E89B127B8A622B120F6721
    /// Message: "sample"
    /// Expected k: A6E3C57DD01ABE90086538398355DD4C3B17AA873382B0F24D6129493D8AAD60
    /// Expected r: EFD48B2AACB6A8FD1140DD9CD45E81D69D2C877B56AAF991C34D0EA84EAF3716
    /// Expected s: F7CB1C942D657C41D436C7A1B6E29F65F3E900DBB9AFF4064DC4AB2F843ACDA8
    ///
    /// Only the nonce `k` is asserted here; `r` and `s` are listed for
    /// reference.
    #[test]
    fn test_rfc6979_p256_nonce() {
        let sk_bytes = hex_to_bytes("C9AFA9D845BA75166B5C215767B1D6934E50C3DB36E89B127B8A622B120F6721");
        let msg = b"sample";
        let e_hash = Sha256::hash(msg);

        let k = rfc6979_k::<Sha256, 4>(&sk_bytes, &e_hash, &P256_N, 256);
        let k_bytes = k.to_bytes_be();

        let expected_k = hex_to_bytes("A6E3C57DD01ABE90086538398355DD4C3B17AA873382B0F24D6129493D8AAD60");
        assert_eq!(k_bytes, expected_k, "RFC 6979 nonce k mismatch");
    }

    /// Known DER encoding of the RFC 6979 P-256 / SHA-256 "sample" vector.
    /// Pins the DER output format against an external reference value
    /// (computed from the RFC 6979 r and s by hand).
    #[test]
    fn test_der_rfc6979_p256_sample_vector() {
        let r = hex_to_bytes("EFD48B2AACB6A8FD1140DD9CD45E81D69D2C877B56AAF991C34D0EA84EAF3716");
        let s = hex_to_bytes("F7CB1C942D657C41D436C7A1B6E29F65F3E900DBB9AFF4064DC4AB2F843ACDA8");
        let sig = Signature { r, s };

        // Both r and s have MSB set (0xEF, 0xF7), so each INTEGER needs a
        // leading 0x00 padding byte to stay positive. Each INTEGER body
        // is therefore 33 bytes; with the 0x02+0x21 header, each INTEGER
        // is 35 bytes; the SEQUENCE payload is 70 bytes; the SEQUENCE
        // header is 0x30 0x46; total DER length is 72 bytes.
        //
        // The literal is split on TLV boundaries: SEQUENCE header, then
        // `02 21 00 || r`, then `02 21 00 || s`.
        let expected = hex_to_bytes(
            "3046\
             022100EFD48B2AACB6A8FD1140DD9CD45E81D69D2C877B56AAF991C34D0EA84EAF3716\
             022100F7CB1C942D657C41D436C7A1B6E29F65F3E900DBB9AFF4064DC4AB2F843ACDA8",
        );
        assert_eq!(sig.to_der(), expected);

        // Round-trip back.
        let parsed = Signature::from_der(&expected).unwrap();
        assert_eq!(parsed.r, sig.r);
        assert_eq!(parsed.s, sig.s);
    }

    /// from_der rejects malformed / non-canonical encodings.
    ///
    /// Each vector is meant to violate exactly one rule, so a rejection can
    /// be attributed to that rule; the label is printed if a vector is
    /// unexpectedly accepted.
    #[test]
    fn test_der_rejects_malformed() {
        let cases: &[(&str, &[u8])] = &[
            ("empty input", &[]),
            ("wrong top tag (SET instead of SEQUENCE)", &[0x31, 0x00]),
            ("SEQUENCE of nothing (two INTEGERs are required)", &[0x30, 0x00]),
            (
                "advertised length longer than the actual content",
                &[0x30, 0x06, 0x02, 0x01, 0x01],
            ),
            ("truncated after r, no s", &[0x30, 0x03, 0x02, 0x01, 0x01]),
            ("r = 0", &[0x30, 0x06, 0x02, 0x01, 0x00, 0x02, 0x01, 0x01]),
            (
                "non-canonical length (0x81 form for a length < 128)",
                &[0x30, 0x81, 0x06, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01],
            ),
            // The SEQUENCE length must be 0x07 (4-byte r TLV + 3-byte s
            // TLV). With any other value the vector would be rejected for
            // a length mismatch and never reach the minimal-encoding check.
            (
                "non-minimal INTEGER (unneeded leading 00 before 0x01)",
                &[0x30, 0x07, 0x02, 0x02, 0x00, 0x01, 0x02, 0x01, 0x01],
            ),
            (
                "negative INTEGER (high bit set, no leading 00)",
                &[0x30, 0x06, 0x02, 0x01, 0x80, 0x02, 0x01, 0x01],
            ),
            (
                "trailing garbage after the SEQUENCE payload",
                &[0x30, 0x06, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0xAB],
            ),
        ];

        for (label, der) in cases {
            assert!(
                Signature::from_der(der).is_none(),
                "from_der accepted malformed input: {}",
                label
            );
        }
    }

    /// A tiny DER encoding of r=1, s=1 round-trips cleanly.
    #[test]
    fn test_der_small_integers_roundtrip() {
        let sig = Signature {
            r: vec![0x01],
            s: vec![0x01],
        };
        let der = sig.to_der();
        // SEQUENCE { 0x02 01 01, 0x02 01 01 }  -> 0x30 06 02 01 01 02 01 01
        assert_eq!(der, vec![0x30, 0x06, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01]);
        let back = Signature::from_der(&der).unwrap();
        assert_eq!(back.r, vec![0x01]);
        assert_eq!(back.s, vec![0x01]);
    }
}