rscrypto 0.1.1

Pure Rust cryptography with hardware acceleration: BLAKE3, SHA-2/3, AES-GCM, ChaCha20-Poly1305, Ed25519, X25519, HMAC, HKDF, Argon2, and CRC. Supports no_std, WASM, and ten CPU architectures.
Documentation
//! Tuned dispatch tables for SHA-512.
//!
//! SHA-512 NI, ARM SHA512 CE, and Zknh have negligible setup cost — use HW
//! accel for all size classes when available.

pub use super::kernels::Sha512KernelId as KernelId;
use crate::platform::Caps;

/// Size-class thresholds shared by every dispatch table in this module.
///
/// The three values separate the `xs`/`s`/`m`/`l` classes of
/// [`DispatchTable`] — presumably input-length cutoffs in bytes; confirm
/// against the dispatcher that consumes `boundaries`.
pub const DEFAULT_BOUNDARIES: [usize; 3] = [64, 256, 4096];

/// Per-size-class kernel selection for SHA-512.
///
/// Inputs are binned into four classes (`xs`, `s`, `m`, `l`) by comparison
/// against the three `boundaries`, and each class names the kernel that
/// handles it. Tables in this module use one kernel for all classes because
/// the hardware backends have negligible setup cost (see module docs).
#[derive(Clone, Copy, Debug)]
pub struct DispatchTable {
  /// Upper bounds separating the `xs`/`s`/`m`/`l` classes — presumably
  /// input lengths in bytes; confirm against the dispatch code.
  pub boundaries: [usize; 3],
  /// Kernel for the extra-small class.
  pub xs: KernelId,
  /// Kernel for the small class.
  pub s: KernelId,
  /// Kernel for the medium class.
  pub m: KernelId,
  /// Kernel for the large class.
  pub l: KernelId,
}

pub static DEFAULT_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::Portable,
  s: KernelId::Portable,
  m: KernelId::Portable,
  l: KernelId::Portable,
};

#[cfg(target_arch = "aarch64")]
pub static AARCH64_SHA512_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::Aarch64Sha512,
  s: KernelId::Aarch64Sha512,
  m: KernelId::Aarch64Sha512,
  l: KernelId::Aarch64Sha512,
};

#[cfg(target_arch = "x86_64")]
pub static X86_SHA512_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::X86Sha512,
  s: KernelId::X86Sha512,
  m: KernelId::X86Sha512,
  l: KernelId::X86Sha512,
};

#[cfg(target_arch = "x86_64")]
pub static X86_AVX512VL_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::X86Avx512vl,
  s: KernelId::X86Avx512vl,
  m: KernelId::X86Avx512vl,
  l: KernelId::X86Avx512vl,
};

#[cfg(target_arch = "x86_64")]
pub static X86_AVX2_DECOUPLED_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::X86Avx2Decoupled,
  s: KernelId::X86Avx2Decoupled,
  m: KernelId::X86Avx2Decoupled,
  l: KernelId::X86Avx2Decoupled,
};

#[cfg(target_arch = "x86_64")]
pub static X86_AVX512VL_DECOUPLED_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::X86Avx512vlDecoupled,
  s: KernelId::X86Avx512vlDecoupled,
  m: KernelId::X86Avx512vlDecoupled,
  l: KernelId::X86Avx512vlDecoupled,
};

#[cfg(target_arch = "riscv64")]
pub static RISCV_ZKNH_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::Riscv64Zknh,
  s: KernelId::Riscv64Zknh,
  m: KernelId::Riscv64Zknh,
  l: KernelId::Riscv64Zknh,
};

#[cfg(target_arch = "wasm32")]
pub static WASM_SIMD128_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::WasmSimd128,
  s: KernelId::WasmSimd128,
  m: KernelId::WasmSimd128,
  l: KernelId::WasmSimd128,
};

#[cfg(target_arch = "s390x")]
pub static S390X_KIMD_TABLE: DispatchTable = DispatchTable {
  boundaries: DEFAULT_BOUNDARIES,
  xs: KernelId::S390xKimd,
  s: KernelId::S390xKimd,
  m: KernelId::S390xKimd,
  l: KernelId::S390xKimd,
};

#[inline]
#[must_use]
pub fn select_runtime_table(#[allow(unused_variables)] caps: Caps) -> &'static DispatchTable {
  #[cfg(target_arch = "x86_64")]
  {
    // Preference: SHA-512 NI, then a vendor-specific AVX pick, then portable.
    //
    // Background (measured: sha512-compress/raw CI 2026-03-23): the stitched
    // AVX2+BMI2 dual-block kernel has the best raw compression throughput on
    // both vendors, but it punts odd block counts to the portable path — and
    // small inputs (0-64 B) are exactly one block. Intel's AVX-512VL kernel
    // handles single blocks natively, so it wins small sizes there and breaks
    // even at scale. Hence:
    //   AMD:   AVX2 decoupled       > AVX-512VL
    //   Intel: AVX-512VL decoupled  > AVX2 decoupled
    use crate::platform::caps::x86;
    if caps.has(x86::SHA512) {
      return &X86_SHA512_TABLE;
    }
    let vendor_pick = if caps.has(x86::AMD) {
      // "Decoupled" = message schedule runs one iteration ahead of the
      // rounds. The stitched form serialises schedule → extract → round each
      // iteration, limiting IPC on wide pipelines; decoupling gives the OOO
      // engine 16 independent scalar rounds to overlap with SIMD schedule
      // latency. Measured: stitched Zen4→Zen5 scaling 1.32x vs sha2 crate
      // 1.71x.
      if caps.has(x86::AVX2) {
        Some(&X86_AVX2_DECOUPLED_TABLE)
      } else if caps.has(x86::AVX512F) && caps.has(x86::AVX512VL) {
        Some(&X86_AVX512VL_TABLE)
      } else {
        None
      }
    } else {
      // Intel ranking: decoupled AVX-512VL > stitched AVX-512VL > AVX2. The
      // decoupled kernel combines a rotation-based schedule (no cross-lane
      // permute), native VPRORQ rotates, and schedule-one-ahead, removing
      // the `_mm256_permute2x128_si256` bottleneck (3-cycle latency on SPR)
      // that held the stitched kernel to 0.95-0.96x vs sha2 at ≥1KiB.
      if caps.has(x86::AVX512F) && caps.has(x86::AVX512VL) {
        Some(&X86_AVX512VL_DECOUPLED_TABLE)
      } else if caps.has(x86::AVX2) {
        Some(&X86_AVX2_DECOUPLED_TABLE)
      } else {
        None
      }
    };
    if let Some(table) = vendor_pick {
      return table;
    }
  }
  // Remaining architectures each expose a single accelerated kernel; take it
  // whenever the capability bit is present.
  #[cfg(target_arch = "aarch64")]
  if caps.has(crate::platform::caps::aarch64::SHA512) {
    return &AARCH64_SHA512_TABLE;
  }
  #[cfg(target_arch = "riscv64")]
  if caps.has(crate::platform::caps::riscv::ZKNH) {
    return &RISCV_ZKNH_TABLE;
  }
  #[cfg(target_arch = "wasm32")]
  if caps.has(crate::platform::caps::wasm::SIMD128) {
    return &WASM_SIMD128_TABLE;
  }
  #[cfg(target_arch = "s390x")]
  if caps.has(crate::platform::caps::s390x::MSA) {
    return &S390X_KIMD_TABLE;
  }
  &DEFAULT_TABLE
}