krypteia-silentops 0.1.0

//! x86_64 constant-time primitives using inline assembly.
//!
//! Why this backend exists
//! -----------------------
//!
//! The generic pure-Rust fallback writes CT primitives as bit-twiddle
//! patterns like `b ^ (mask & (a ^ b))`. LLVM's optimiser recognises
//! that pattern and — even at `opt-level=2` — can transform it into a
//! **pointer CMOV followed by a conditional load**, which reads from
//! an address selected by the secret condition. That leaks via the
//! data-cache side-channel and is flagged by `valgrind --tool=memcheck`
//! when the condition is poisoned. See `doc/infra/ctgrind.md`.
//!
//! Inline asm is **opaque** to LLVM: it cannot introspect or rewrite
//! the body, so each primitive call compiles to exactly the `cmov`
//! (or bit-hack) we wrote, operating on **values in registers** — no
//! address depends on the secret condition.
//!
//! Instructions used
//! -----------------
//!
//! | Primitive              | Instruction(s)                                      |
//! |------------------------|-----------------------------------------------------|
//! | `ct_select_u8/i16/i32` | `test cond, cond` + `cmovne dst, src`               |
//! | `ct_eq`                | byte-wise XOR accumulation + `test` + `sete`        |
//! | `ct_eq_u32`            | pure-Rust bit-twiddle (no inline asm)               |
//! | `ct_copy`              | per-byte `ct_select_u8`                             |
//! | `ct_select_bytes`      | per-byte `ct_select_u8`                             |
//! | `ct_zeroize{,_i16}`    | `write_volatile` + compiler fence (same as generic) |
//!
//! `cmov` is constant-time on every x86_64 core since at least Nehalem
//! (2008): it does not branch, does not change the commit order, and
//! is not subject to branch-predictor side-channels. Memcheck models
//! it as a value-select, so the result's definedness propagates
//! without raising an error — unlike the pointer-CMOV + load pattern
//! the optimiser generates for the pure-Rust fallback.

use core::arch::asm;

/// Branch-free byte select built on `test` + `cmovne`.
///
/// Yields `a` when `condition` is non-zero and `b` when it is zero.
///
/// Both candidates and the condition are handed to the asm block as
/// register operands, and the template contains no jump: the same two
/// instructions run whichever value ends up being chosen. Only the low
/// 8 bits of the output register are returned.
#[inline(always)]
pub fn ct_select_u8(a: u8, b: u8, condition: u8) -> u8 {
    let picked: u64;
    // SAFETY: the template is `test` followed by `cmovne`, both acting on
    // compiler-allocated general-purpose registers only. Nothing is read
    // from or written to memory or the stack, and the output is a function
    // of the three inputs alone, which is what `pure, nomem, nostack`
    // promises. Flags are modified and are not declared as preserved.
    unsafe {
        asm!(
            "test {flag:e}, {flag:e}",
            "cmovne {acc:e}, {alt:e}",
            flag = in(reg) u32::from(condition),
            alt  = in(reg) u32::from(a),
            // `acc` enters holding `b` and is overwritten with `a` only
            // when the flag test reports non-zero.
            acc  = inout(reg) u64::from(b) => picked,
            options(pure, nomem, nostack),
        );
    }
    picked as u8
}

/// Branch-free `i16` select (the variant meant for NTT coefficients).
///
/// Yields `a` when `condition` is non-zero and `b` when it is zero.
///
/// Both candidates are sign-extended before being placed in registers;
/// the result is narrowed back to 16 bits on the way out, so the
/// extension never shows up in the returned value.
#[inline(always)]
pub fn ct_select_i16(a: i16, b: i16, condition: u8) -> i16 {
    let picked: u64;
    // SAFETY: the template is `test` followed by `cmovne`, both acting on
    // compiler-allocated general-purpose registers only. Nothing is read
    // from or written to memory or the stack, and the output is a function
    // of the three inputs alone, which is what `pure, nomem, nostack`
    // promises. Flags are modified and are not declared as preserved.
    unsafe {
        asm!(
            "test {flag:e}, {flag:e}",
            "cmovne {acc:e}, {alt:e}",
            flag = in(reg) u32::from(condition),
            alt  = in(reg) i32::from(a) as u32,
            // `acc` enters holding `b` and is overwritten with `a` only
            // when the flag test reports non-zero.
            acc  = inout(reg) i64::from(b) as u64 => picked,
            options(pure, nomem, nostack),
        );
    }
    picked as u16 as i16
}

/// Branch-free `i32` select (the variant meant for ML-DSA coefficients).
///
/// Yields `a` when `condition` is non-zero and `b` when it is zero.
///
/// The selection is done on the 32-bit views of the operand registers;
/// the output is narrowed to its low 32 bits before being returned.
#[inline(always)]
pub fn ct_select_i32(a: i32, b: i32, condition: u8) -> i32 {
    let picked: u64;
    // SAFETY: the template is `test` followed by `cmovne`, both acting on
    // compiler-allocated general-purpose registers only. Nothing is read
    // from or written to memory or the stack, and the output is a function
    // of the three inputs alone, which is what `pure, nomem, nostack`
    // promises. Flags are modified and are not declared as preserved.
    unsafe {
        asm!(
            "test {flag:e}, {flag:e}",
            "cmovne {acc:e}, {alt:e}",
            flag = in(reg) u32::from(condition),
            alt  = in(reg) a as u32,
            // `acc` enters holding `b` and is overwritten with `a` only
            // when the flag test reports non-zero.
            acc  = inout(reg) i64::from(b) as u64 => picked,
            options(pure, nomem, nostack),
        );
    }
    picked as u32 as i32
}

/// Branch-free equality test for two `u32` values.
///
/// Returns `1` when `a == b` and `0` otherwise.
///
/// Written as plain integer arithmetic with no inline asm: the XOR of
/// the inputs is zero exactly when they match, and OR-ing a value with
/// its two's-complement negation sets the top bit exactly when that
/// value is non-zero. Shifting the top bit down gives a 0/1 "differs"
/// indicator, which is then inverted. No branch and no memory access
/// appear in the source.
#[inline(always)]
pub fn ct_eq_u32(a: u32, b: u32) -> u8 {
    let delta = a ^ b;
    // 1 when the inputs differ, 0 when they are identical.
    let differs = ((delta | delta.wrapping_neg()) >> 31) as u8;
    (differs ^ 1) & 1
}

/// Byte-slice equality without a data-dependent branch.
///
/// Returns `1` when `a` and `b` have the same length and identical
/// contents, `0` otherwise. A length mismatch is reported immediately,
/// so only the *contents* are compared without an early exit.
///
/// The comparison has two stages:
///
/// 1. In Rust, every byte pair is XOR-ed and OR-ed into a single `u8`
///    accumulator, which ends up zero only if all pairs matched.
/// 2. In inline asm, `test`/`sete` maps that accumulator to `1` (zero)
///    or `0` (non-zero), keeping the final mapping out of the
///    optimiser's reach.
#[inline(never)]
pub fn ct_eq(a: &[u8], b: &[u8]) -> u8 {
    if a.len() != b.len() {
        return 0;
    }

    // Lengths are equal here, so `zip` visits every byte of both slices.
    let mut acc = 0u8;
    for (x, y) in a.iter().zip(b.iter()) {
        acc |= x ^ y;
    }

    let equal: u64;
    // SAFETY: the template uses only `test`, `sete` and `movzx` on
    // compiler-allocated general-purpose registers. Nothing is read from
    // or written to memory or the stack, and the output depends on the
    // single input alone, which is what `pure, nomem, nostack` promises.
    // Flags are modified and are not declared as preserved.
    unsafe {
        asm!(
            "test {acc:e}, {acc:e}",
            "sete {flag:l}",
            "movzx {flag:e}, {flag:l}",
            acc  = in(reg) u32::from(acc),
            flag = out(reg) equal,
            options(pure, nomem, nostack),
        );
    }
    equal as u8
}

/// Conditionally overwrite `dst` with `src`.
///
/// When `condition != 0` each covered byte of `dst` takes the matching
/// byte of `src`; when `condition == 0` each covered byte is rewritten
/// with its own current value. Either way every covered position is
/// read from both slices and stored once.
///
/// Only the first `min(dst.len(), src.len())` bytes are covered; any
/// tail of `dst` beyond that is left untouched.
///
/// The choice is delegated byte by byte to `ct_select_u8`, so each
/// position gets its own select.
#[inline(never)]
pub fn ct_copy(dst: &mut [u8], src: &[u8], condition: u8) {
    // `zip` stops at the shorter slice, which gives the min-length bound.
    for (slot, &incoming) in dst.iter_mut().zip(src.iter()) {
        *slot = ct_select_u8(incoming, *slot, condition);
    }
}

/// Slice-wide select: fill `out` from `a` or from `b`.
///
/// When `condition != 0` the covered bytes of `out` are taken from `a`,
/// otherwise from `b`. For every covered position both the `a` byte and
/// the `b` byte are read, so neither control flow nor the set of
/// addresses touched depends on `condition`.
///
/// The covered range is the first
/// `out.len().min(a.len()).min(b.len())` bytes; anything in `out` past
/// that is left untouched.
///
/// The choice is delegated byte by byte to `ct_select_u8`, so each
/// position gets its own select.
#[inline(never)]
pub fn ct_select_bytes(out: &mut [u8], a: &[u8], b: &[u8], condition: u8) {
    // Nested `zip` stops at the shortest of the three slices.
    let lanes = out.iter_mut().zip(a.iter()).zip(b.iter());
    for ((slot, &from_a), &from_b) in lanes {
        *slot = ct_select_u8(from_a, from_b, condition);
    }
}

/// Overwrite every byte of `buf` with zero in a way the optimiser must keep.
///
/// Each byte is cleared with its own volatile store, and a `SeqCst`
/// compiler fence follows the loop so the stores cannot be elided or
/// moved past it by the compiler. An empty slice is a no-op apart from
/// the fence.
#[inline(never)]
pub fn ct_zeroize(buf: &mut [u8]) {
    buf.iter_mut().for_each(|slot| {
        // SAFETY: `slot` comes from an exclusive borrow of a live slice
        // element, so the pointer is valid for writes and properly aligned.
        unsafe { core::ptr::write_volatile(slot as *mut u8, 0) }
    });
    core::sync::atomic::compiler_fence(core::sync::atomic::Ordering::SeqCst);
}

/// Overwrite every element of an `i16` slice with zero in a way the
/// optimiser must keep.
///
/// Each element is cleared with its own volatile store, and a `SeqCst`
/// compiler fence follows the loop so the stores cannot be elided or
/// moved past it by the compiler. An empty slice is a no-op apart from
/// the fence.
#[inline(never)]
pub fn ct_zeroize_i16(buf: &mut [i16]) {
    buf.iter_mut().for_each(|slot| {
        // SAFETY: `slot` comes from an exclusive borrow of a live slice
        // element, so the pointer is valid for writes and properly aligned.
        unsafe { core::ptr::write_volatile(slot as *mut i16, 0) }
    });
    core::sync::atomic::compiler_fence(core::sync::atomic::Ordering::SeqCst);
}