// singe-cublas 0.1.0-alpha.5

#![allow(deprecated)]

#[allow(unused_imports)]
use crate::error::Status;

use std::fmt::{self, Display, Formatter};

use num_enum::{IntoPrimitive, TryFromPrimitive};
use singe_cublas_sys as sys;

use singe_core::impl_enum_conversion;

/// Selects whether scalar values are read from host memory or device memory.
///
/// If an operation uses several scalar values, all of them must use the same pointer mode.
/// The pointer mode can be set and retrieved using
/// [`Context::set_scalar_pointer_mode`](crate::context::Context::set_scalar_pointer_mode) and
/// [`Context::scalar_pointer_mode`](crate::context::Context::scalar_pointer_mode), respectively.
///
/// Each discriminant is taken from the matching `sys::cublasPointerMode_t` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum PointerMode {
    /// Scalars are read from host memory (`CUBLAS_POINTER_MODE_HOST`).
    Host = sys::cublasPointerMode_t::CUBLAS_POINTER_MODE_HOST as _,
    /// Scalars are read from device memory (`CUBLAS_POINTER_MODE_DEVICE`).
    Device = sys::cublasPointerMode_t::CUBLAS_POINTER_MODE_DEVICE as _,
}

impl_enum_conversion!(sys::cublasPointerMode_t, PointerMode);

/// Controls whether cuBLAS operations that have an alternate implementation using atomics
/// are allowed to use it.
///
/// The atomics mode can be set and queried using
/// [`Context::set_atomics_mode`](crate::context::Context::set_atomics_mode) and
/// [`Context::atomics_mode`](crate::context::Context::atomics_mode), respectively.
///
/// Each discriminant is taken from the matching `sys::cublasAtomicsMode_t` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum AtomicsMode {
    /// The usage of atomics is not allowed (`CUBLAS_ATOMICS_NOT_ALLOWED`).
    NotAllowed = sys::cublasAtomicsMode_t::CUBLAS_ATOMICS_NOT_ALLOWED as _,
    /// The usage of atomics is allowed (`CUBLAS_ATOMICS_ALLOWED`).
    Allowed = sys::cublasAtomicsMode_t::CUBLAS_ATOMICS_ALLOWED as _,
}

impl_enum_conversion!(sys::cublasAtomicsMode_t, AtomicsMode);

/// Selects the compute precision mode used by cuBLAS.
///
/// [`MathMode`] is used with [`Context::set_math_mode`](crate::context::Context::set_math_mode)
/// to choose compute precision modes.
/// Since this setting does not directly control Tensor Core use, [`MathMode::TensorOp`] is deprecated.
///
/// Apart from [`MathMode::TensorOp`], each discriminant is taken from the matching
/// `sys::cublasMath_t` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum MathMode {
    /// Default and highest-performance mode, using compute and intermediate storage
    /// precisions with at least the requested mantissa and exponent bit counts.
    /// Tensor Cores are used whenever possible.
    Default = sys::cublasMath_t::CUBLAS_DEFAULT_MATH as _,
    /// Deprecated mode.
    /// Allows the library to use Tensor Core operations whenever possible.
    /// For single precision GEMM operations, cuBLAS uses the [`ComputeType::F32FastF16`] compute type.
    ///
    /// NOTE(review): the discriminant is hard-coded to `1` instead of being read from
    /// `sys::cublasMath_t`; presumably it equals `CUBLAS_TENSOR_OP_MATH` (the name printed by
    /// this type's `Display` impl) — confirm against the C header.
    #[deprecated]
    TensorOp = 1,
    /// Uses the prescribed precision and standardized arithmetic for all calculation
    /// phases, primarily for numerical robustness studies, testing, and debugging.
    /// May be slower than the other modes.
    Pedantic = sys::cublasMath_t::CUBLAS_PEDANTIC_MATH as _,
    /// Enable acceleration of single-precision operations using TF32 tensor cores.
    /// Input conversions round to nearest even.
    Tf32TensorOp = sys::cublasMath_t::CUBLAS_TF32_TENSOR_OP_MATH as _,
    /// Enable acceleration of single-precision operations using the BF16x9 algorithm.
    /// See [`EmulationStrategy`] for floating-point emulation controls.
    /// For single precision GEMM operations, cuBLAS uses the [`ComputeType::F32EmulatedBf16x9`] compute type.
    Fp32EmulatedBf16x9 = sys::cublasMath_t::CUBLAS_FP32_EMULATED_BF16X9_MATH as _,
    /// Enable acceleration of double-precision operations using fixed-point emulation algorithms.
    /// See [`EmulationStrategy`] for floating-point emulation controls.
    Fp64EmulatedFixedPoint = sys::cublasMath_t::CUBLAS_FP64_EMULATED_FIXEDPOINT_MATH as _,
    /// Forces reductions during matrix multiplications to use the accumulator
    /// type, not the output type, for mixed precision operations whose output
    /// precision is lower than the compute type precision.
    /// Flag that can be combined with other values using bitwise OR.
    ///
    /// NOTE(review): this enum models one value per variant, so an OR-combined mode is not
    /// representable as a single `MathMode` — confirm how combined flags are meant to be
    /// passed and how such values convert back from `sys::cublasMath_t`.
    DisallowReducedPrecisionReduction =
        sys::cublasMath_t::CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION as _,
}

impl_enum_conversion!(sys::cublasMath_t, MathMode);

/// Indicates which part of a dense matrix (lower, upper, or all of it) is filled and used.
///
/// This corresponds to BLAS `L`/`l` (lower) and `U`/`u` (upper) arguments.
///
/// Each discriminant is taken from the matching `sys::cublasFillMode_t` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum FillMode {
    /// The lower part of the matrix is filled (`CUBLAS_FILL_MODE_LOWER`).
    Lower = sys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER as _,
    /// The upper part of the matrix is filled (`CUBLAS_FILL_MODE_UPPER`).
    Upper = sys::cublasFillMode_t::CUBLAS_FILL_MODE_UPPER as _,
    /// The full matrix is filled (`CUBLAS_FILL_MODE_FULL`).
    Full = sys::cublasFillMode_t::CUBLAS_FILL_MODE_FULL as _,
}

impl_enum_conversion!(sys::cublasFillMode_t, FillMode);

/// Indicates whether the main diagonal of a dense matrix is unity.
///
/// Per the cuBLAS documentation for `cublasDiagType_t`, a unit diagonal is assumed to
/// consist of ones and is therefore not read or modified by the function using it.
///
/// This corresponds to BLAS `N`/`n` (non-unit) and `U`/`u` (unit) arguments.
///
/// Each discriminant is taken from the matching `sys::cublasDiagType_t` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum DiagonalType {
    /// The matrix diagonal has non-unit elements (`CUBLAS_DIAG_NON_UNIT`).
    NonUnit = sys::cublasDiagType_t::CUBLAS_DIAG_NON_UNIT as _,
    /// The matrix diagonal has unit elements (`CUBLAS_DIAG_UNIT`).
    Unit = sys::cublasDiagType_t::CUBLAS_DIAG_UNIT as _,
}

impl_enum_conversion!(sys::cublasDiagType_t, DiagonalType);

/// Indicates whether the dense matrix is on the left or the right side of the matrix
/// equation solved by a particular function.
///
/// This corresponds to BLAS `L`/`l` (left) and `R`/`r` (right) arguments.
///
/// Each discriminant is taken from the matching `sys::cublasSideMode_t` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum SideMode {
    /// The matrix is on the left side of the equation (`CUBLAS_SIDE_LEFT`).
    Left = sys::cublasSideMode_t::CUBLAS_SIDE_LEFT as _,
    /// The matrix is on the right side of the equation (`CUBLAS_SIDE_RIGHT`).
    Right = sys::cublasSideMode_t::CUBLAS_SIDE_RIGHT as _,
}

impl_enum_conversion!(sys::cublasSideMode_t, SideMode);

/// Selects the operation to perform with a dense matrix.
///
/// This corresponds to BLAS `N`/`n` (non-transpose), `T`/`t` (transpose), and
/// `C`/`c` (conjugate transpose) arguments.
///
/// Each discriminant is taken from the matching `sys::cublasOperation_t` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum Operation {
    /// Non-transpose operation (`CUBLAS_OP_N`).
    NonTranspose = sys::cublasOperation_t::CUBLAS_OP_N as _,
    /// Transpose operation (`CUBLAS_OP_T`).
    Transpose = sys::cublasOperation_t::CUBLAS_OP_T as _,
    /// Conjugate transpose operation (`CUBLAS_OP_C`).
    ConjugateTranspose = sys::cublasOperation_t::CUBLAS_OP_C as _,
    /// Conjugate operation without transposition (`CUBLAS_OP_CONJG`).
    ///
    /// NOTE(review): the cuBLAS header appears to mark `CUBLAS_OP_CONJG` as a placeholder
    /// that routines do not support — confirm before passing this value to a routine.
    Conjugate = sys::cublasOperation_t::CUBLAS_OP_CONJG as _,
}

impl Operation {
    pub const HERMITIAN: Self = Self::ConjugateTranspose;
}

// NOTE(review): presumably generates the conversions between `sys::cublasOperation_t` and
// `Operation`; the exact impls live in `singe_core::impl_enum_conversion` — confirm there.
impl_enum_conversion!(sys::cublasOperation_t, Operation);

/// Specifies the GEMM algorithm for matrix-matrix multiplication on GPU architectures up to `sm_75`.
/// On `sm_80` and newer GPU architectures, this value has no effect.
///
/// Each discriminant is taken from the matching `sys::cublasGemmAlgo_t` constant.
///
/// NOTE(review): unlike the other enums in this file this one is `#[repr(i32)]`, presumably
/// because `CUBLAS_GEMM_DFALT` is negative in the C header — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(i32)]
pub enum GemmAlgorithm {
    /// Lets cuBLAS apply its heuristics to select the GEMM algorithm (`CUBLAS_GEMM_DFALT`).
    Default = sys::cublasGemmAlgo_t::CUBLAS_GEMM_DFALT as _,
    /// Explicitly selects algorithm 0 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo0 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO0 as _,
    /// Explicitly selects algorithm 1 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo1 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO1 as _,
    /// Explicitly selects algorithm 2 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo2 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO2 as _,
    /// Explicitly selects algorithm 3 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo3 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO3 as _,
    /// Explicitly selects algorithm 4 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo4 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO4 as _,
    /// Explicitly selects algorithm 5 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo5 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO5 as _,
    /// Explicitly selects algorithm 6 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo6 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO6 as _,
    /// Explicitly selects algorithm 7 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo7 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO7 as _,
    /// Explicitly selects algorithm 8 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo8 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO8 as _,
    /// Explicitly selects algorithm 9 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo9 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO9 as _,
    /// Explicitly selects algorithm 10 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo10 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO10 as _,
    /// Explicitly selects algorithm 11 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo11 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO11 as _,
    /// Explicitly selects algorithm 12 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo12 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO12 as _,
    /// Explicitly selects algorithm 13 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo13 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO13 as _,
    /// Explicitly selects algorithm 14 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo14 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO14 as _,
    /// Explicitly selects algorithm 15 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo15 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO15 as _,
    /// Explicitly selects algorithm 16 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo16 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO16 as _,
    /// Explicitly selects algorithm 17 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo17 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO17 as _,
    /// Explicitly selects algorithm 18 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo18 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO18 as _,
    /// Explicitly selects algorithm 19 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo19 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO19 as _,
    /// Explicitly selects algorithm 20 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo20 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO20 as _,
    /// Explicitly selects algorithm 21 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo21 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO21 as _,
    /// Explicitly selects algorithm 22 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo22 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO22 as _,
    /// Explicitly selects algorithm 23 of the numbered algorithms 0 through 23.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    Algo23 = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO23 as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 0 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo0TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO0_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 1 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo1TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO1_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 2 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo2TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO2_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 3 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo3TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO3_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 4 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo4TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO4_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 5 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo5TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO5_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 6 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo6TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO6_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 7 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo7TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO7_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 8 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo8TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO8_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 9 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo9TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO9_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 10 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo10TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO10_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 11 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo11TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO11_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 12 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo12TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO12_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 13 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo13TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO13_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 14 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo14TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO14_TENSOR_OP as _,
    /// This Tensor Core algorithm value is deprecated.
    /// Explicitly selects Tensor Core GEMM algorithm 15 of the numbered algorithms 0 through 15.
    /// Allows reduced-precision [`ComputeType::F32FastF16`] kernels for backward compatibility.
    /// Has no effect on NVIDIA Ampere architecture GPUs and newer.
    #[deprecated]
    Algo15TensorOp = sys::cublasGemmAlgo_t::CUBLAS_GEMM_ALGO15_TENSOR_OP as _,
    /// `EXPERIMENTAL` Benchmarks available algorithms and chooses the optimal one for the
    /// given problem configuration.
    /// The solution is cached in the cuBLAS handle so later calls with the same problem size
    /// use the cached configuration.
    /// To avoid overwriting application data, cuBLAS allocates memory matching the output size.
    /// Benchmarking is not supported during stream capture; [`Status::NotSupported`] is
    /// returned if no configuration was found in the cache for the given problem size.
    Autotune = sys::cublasGemmAlgo_t::CUBLAS_GEMM_AUTOTUNE as _,
}

impl_enum_conversion!(i32, sys::cublasGemmAlgo_t, GemmAlgorithm);

/// Selects the compute precision mode for matrix multiplication.
///
/// [`ComputeType`] is used in [`gemm_ex`](crate::blas::level3::gemm_ex) and
/// [`matmul`](crate::lt::matmul::matmul) (including all batched and strided batched variants)
/// to choose compute precision modes as defined below.
///
/// Each discriminant is taken from the matching `sys::cublasComputeType_t` constant.
/// The `*Pedantic` variants are the pedantic counterparts of the plain modes; see
/// [`MathMode::Pedantic`] for the description of pedantic arithmetic used in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum ComputeType {
    /// 16-bit half-precision floating-point compute (`CUBLAS_COMPUTE_16F`).
    F16 = sys::cublasComputeType_t::CUBLAS_COMPUTE_16F as _,
    /// Pedantic 16-bit half-precision floating-point compute (`CUBLAS_COMPUTE_16F_PEDANTIC`).
    F16Pedantic = sys::cublasComputeType_t::CUBLAS_COMPUTE_16F_PEDANTIC as _,
    /// 32-bit single-precision floating-point compute (`CUBLAS_COMPUTE_32F`).
    F32 = sys::cublasComputeType_t::CUBLAS_COMPUTE_32F as _,
    /// Pedantic 32-bit single-precision floating-point compute (`CUBLAS_COMPUTE_32F_PEDANTIC`).
    F32Pedantic = sys::cublasComputeType_t::CUBLAS_COMPUTE_32F_PEDANTIC as _,
    /// Single-precision compute that allows reduced-precision 16-bit half-precision
    /// kernels (`CUBLAS_COMPUTE_32F_FAST_16F`).
    F32FastF16 = sys::cublasComputeType_t::CUBLAS_COMPUTE_32F_FAST_16F as _,
    /// Single-precision compute that allows reduced-precision bfloat16
    /// kernels (`CUBLAS_COMPUTE_32F_FAST_16BF`).
    F32FastBf16 = sys::cublasComputeType_t::CUBLAS_COMPUTE_32F_FAST_16BF as _,
    /// Single-precision compute that allows reduced-precision TF32
    /// kernels (`CUBLAS_COMPUTE_32F_FAST_TF32`).
    F32FastTf32 = sys::cublasComputeType_t::CUBLAS_COMPUTE_32F_FAST_TF32 as _,
    /// Single-precision compute emulated with the BF16x9 algorithm
    /// (`CUBLAS_COMPUTE_32F_EMULATED_16BFX9`); see [`MathMode::Fp32EmulatedBf16x9`].
    F32EmulatedBf16x9 = sys::cublasComputeType_t::CUBLAS_COMPUTE_32F_EMULATED_16BFX9 as _,
    /// 64-bit double-precision floating-point compute (`CUBLAS_COMPUTE_64F`).
    F64 = sys::cublasComputeType_t::CUBLAS_COMPUTE_64F as _,
    /// Pedantic 64-bit double-precision floating-point compute (`CUBLAS_COMPUTE_64F_PEDANTIC`).
    F64Pedantic = sys::cublasComputeType_t::CUBLAS_COMPUTE_64F_PEDANTIC as _,
    /// Double-precision compute using fixed-point emulation algorithms
    /// (`CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT`); see [`MathMode::Fp64EmulatedFixedPoint`].
    F64EmulatedFixedPoint = sys::cublasComputeType_t::CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT as _,
    /// 32-bit integer compute (`CUBLAS_COMPUTE_32I`).
    I32 = sys::cublasComputeType_t::CUBLAS_COMPUTE_32I as _,
    /// Pedantic 32-bit integer compute (`CUBLAS_COMPUTE_32I_PEDANTIC`).
    I32Pedantic = sys::cublasComputeType_t::CUBLAS_COMPUTE_32I_PEDANTIC as _,
}

impl_enum_conversion!(sys::cublasComputeType_t, ComputeType);

/// Selects how cuBLAS leverages floating point emulation algorithms.
///
/// [`EmulationStrategy`] is used with
/// [`Context::set_emulation_strategy`](crate::context::Context::set_emulation_strategy) to choose
/// how to leverage floating point emulation algorithms.
///
/// Each discriminant is taken from the matching `sys::cublasEmulationStrategy_t` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum EmulationStrategy {
    /// Default emulation strategy; equivalent to [`EmulationStrategy::Performant`]
    /// unless the `CUBLAS_EMULATION_STRATEGY` environment variable is set.
    Default = sys::cublasEmulationStrategy_t::CUBLAS_EMULATION_STRATEGY_DEFAULT as _,
    /// Uses emulation whenever it provides a performance benefit.
    Performant = sys::cublasEmulationStrategy_t::CUBLAS_EMULATION_STRATEGY_PERFORMANT as _,
    /// Uses emulation whenever possible.
    Eager = sys::cublasEmulationStrategy_t::CUBLAS_EMULATION_STRATEGY_EAGER as _,
}

impl_enum_conversion!(sys::cublasEmulationStrategy_t, EmulationStrategy);

impl Display for PointerMode {
    /// Writes the name of the cuBLAS C constant that corresponds to this pointer mode.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let constant_name = match self {
            Self::Host => "CUBLAS_POINTER_MODE_HOST",
            Self::Device => "CUBLAS_POINTER_MODE_DEVICE",
        };
        f.write_str(constant_name)
    }
}

impl Display for AtomicsMode {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        match self {
            Self::NotAllowed => write!(f, "CUBLAS_ATOMICS_NOT_ALLOWED"),
            Self::Allowed => write!(f, "CUBLAS_ATOMICS_ALLOWED"),
        }
    }
}

impl Display for MathMode {
    /// Writes the name of the cuBLAS C constant that corresponds to this math mode.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let constant_name = match self {
            Self::Default => "CUBLAS_DEFAULT_MATH",
            // Deprecated variant; it still needs a printable name.
            Self::TensorOp => "CUBLAS_TENSOR_OP_MATH",
            Self::Pedantic => "CUBLAS_PEDANTIC_MATH",
            Self::Tf32TensorOp => "CUBLAS_TF32_TENSOR_OP_MATH",
            Self::Fp32EmulatedBf16x9 => "CUBLAS_FP32_EMULATED_BF16X9_MATH",
            Self::Fp64EmulatedFixedPoint => "CUBLAS_FP64_EMULATED_FIXEDPOINT_MATH",
            Self::DisallowReducedPrecisionReduction => {
                "CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION"
            }
        };
        f.write_str(constant_name)
    }
}

impl Display for FillMode {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        match self {
            Self::Lower => write!(f, "CUBLAS_FILL_MODE_LOWER"),
            Self::Upper => write!(f, "CUBLAS_FILL_MODE_UPPER"),
            Self::Full => write!(f, "CUBLAS_FILL_MODE_FULL"),
        }
    }
}

impl Display for DiagonalType {
    /// Writes the name of the cuBLAS C constant that corresponds to this diagonal type.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.write_str(match self {
            Self::NonUnit => "CUBLAS_DIAG_NON_UNIT",
            Self::Unit => "CUBLAS_DIAG_UNIT",
        })
    }
}

impl Display for SideMode {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        match self {
            Self::Left => write!(f, "CUBLAS_SIDE_LEFT"),
            Self::Right => write!(f, "CUBLAS_SIDE_RIGHT"),
        }
    }
}

impl Display for Operation {
    /// Writes the name of the cuBLAS C constant that corresponds to this operation.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.write_str(match self {
            Self::NonTranspose => "CUBLAS_OP_N",
            Self::Transpose => "CUBLAS_OP_T",
            Self::ConjugateTranspose => "CUBLAS_OP_C",
            Self::Conjugate => "CUBLAS_OP_CONJG",
        })
    }
}

impl Display for GemmAlgorithm {
    /// Writes the name of the cuBLAS C constant that corresponds to this algorithm.
    ///
    /// This matches the other `Display` impls in this file, which all print the C constant
    /// name rather than the Rust variant name (previously this impl forwarded to `Debug`
    /// and printed e.g. `Algo0`).
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let constant_name = match self {
            // The enum maps this variant to `CUBLAS_GEMM_DFALT`; the documented spelling of
            // that same constant is used here so the output is recognizable.
            Self::Default => "CUBLAS_GEMM_DEFAULT",
            Self::Algo0 => "CUBLAS_GEMM_ALGO0",
            Self::Algo1 => "CUBLAS_GEMM_ALGO1",
            Self::Algo2 => "CUBLAS_GEMM_ALGO2",
            Self::Algo3 => "CUBLAS_GEMM_ALGO3",
            Self::Algo4 => "CUBLAS_GEMM_ALGO4",
            Self::Algo5 => "CUBLAS_GEMM_ALGO5",
            Self::Algo6 => "CUBLAS_GEMM_ALGO6",
            Self::Algo7 => "CUBLAS_GEMM_ALGO7",
            Self::Algo8 => "CUBLAS_GEMM_ALGO8",
            Self::Algo9 => "CUBLAS_GEMM_ALGO9",
            Self::Algo10 => "CUBLAS_GEMM_ALGO10",
            Self::Algo11 => "CUBLAS_GEMM_ALGO11",
            Self::Algo12 => "CUBLAS_GEMM_ALGO12",
            Self::Algo13 => "CUBLAS_GEMM_ALGO13",
            Self::Algo14 => "CUBLAS_GEMM_ALGO14",
            Self::Algo15 => "CUBLAS_GEMM_ALGO15",
            Self::Algo16 => "CUBLAS_GEMM_ALGO16",
            Self::Algo17 => "CUBLAS_GEMM_ALGO17",
            Self::Algo18 => "CUBLAS_GEMM_ALGO18",
            Self::Algo19 => "CUBLAS_GEMM_ALGO19",
            Self::Algo20 => "CUBLAS_GEMM_ALGO20",
            Self::Algo21 => "CUBLAS_GEMM_ALGO21",
            Self::Algo22 => "CUBLAS_GEMM_ALGO22",
            Self::Algo23 => "CUBLAS_GEMM_ALGO23",
            // Deprecated Tensor Core variants; they still need printable names.
            Self::Algo0TensorOp => "CUBLAS_GEMM_ALGO0_TENSOR_OP",
            Self::Algo1TensorOp => "CUBLAS_GEMM_ALGO1_TENSOR_OP",
            Self::Algo2TensorOp => "CUBLAS_GEMM_ALGO2_TENSOR_OP",
            Self::Algo3TensorOp => "CUBLAS_GEMM_ALGO3_TENSOR_OP",
            Self::Algo4TensorOp => "CUBLAS_GEMM_ALGO4_TENSOR_OP",
            Self::Algo5TensorOp => "CUBLAS_GEMM_ALGO5_TENSOR_OP",
            Self::Algo6TensorOp => "CUBLAS_GEMM_ALGO6_TENSOR_OP",
            Self::Algo7TensorOp => "CUBLAS_GEMM_ALGO7_TENSOR_OP",
            Self::Algo8TensorOp => "CUBLAS_GEMM_ALGO8_TENSOR_OP",
            Self::Algo9TensorOp => "CUBLAS_GEMM_ALGO9_TENSOR_OP",
            Self::Algo10TensorOp => "CUBLAS_GEMM_ALGO10_TENSOR_OP",
            Self::Algo11TensorOp => "CUBLAS_GEMM_ALGO11_TENSOR_OP",
            Self::Algo12TensorOp => "CUBLAS_GEMM_ALGO12_TENSOR_OP",
            Self::Algo13TensorOp => "CUBLAS_GEMM_ALGO13_TENSOR_OP",
            Self::Algo14TensorOp => "CUBLAS_GEMM_ALGO14_TENSOR_OP",
            Self::Algo15TensorOp => "CUBLAS_GEMM_ALGO15_TENSOR_OP",
            Self::Autotune => "CUBLAS_GEMM_AUTOTUNE",
        };
        f.write_str(constant_name)
    }
}

impl Display for ComputeType {
    /// Writes the name of the cuBLAS C constant that corresponds to this compute type.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let constant_name = match self {
            Self::F16 => "CUBLAS_COMPUTE_16F",
            Self::F16Pedantic => "CUBLAS_COMPUTE_16F_PEDANTIC",
            Self::F32 => "CUBLAS_COMPUTE_32F",
            Self::F32Pedantic => "CUBLAS_COMPUTE_32F_PEDANTIC",
            Self::F32FastF16 => "CUBLAS_COMPUTE_32F_FAST_16F",
            Self::F32FastBf16 => "CUBLAS_COMPUTE_32F_FAST_16BF",
            Self::F32FastTf32 => "CUBLAS_COMPUTE_32F_FAST_TF32",
            Self::F32EmulatedBf16x9 => "CUBLAS_COMPUTE_32F_EMULATED_16BFX9",
            Self::F64 => "CUBLAS_COMPUTE_64F",
            Self::F64Pedantic => "CUBLAS_COMPUTE_64F_PEDANTIC",
            Self::F64EmulatedFixedPoint => "CUBLAS_COMPUTE_64F_EMULATED_FIXEDPOINT",
            Self::I32 => "CUBLAS_COMPUTE_32I",
            Self::I32Pedantic => "CUBLAS_COMPUTE_32I_PEDANTIC",
        };
        f.write_str(constant_name)
    }
}

impl Display for EmulationStrategy {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        match self {
            Self::Default => write!(f, "CUBLAS_EMULATION_STRATEGY_DEFAULT"),
            Self::Performant => write!(f, "CUBLAS_EMULATION_STRATEGY_PERFORMANT"),
            Self::Eager => write!(f, "CUBLAS_EMULATION_STRATEGY_EAGER"),
        }
    }
}