//! cuSPARSELt-backed 2:4 structured sparse matmul.
//!
//! This module wraps the raw FFI emitted by `build.rs` (see
//! `cusparseLt.h` / [NVIDIA cuSPARSELt SDK](https://docs.nvidia.com/cuda/cusparselt))
//! into safe-ish helpers used by `CudaBackendImpl::sparse_matmul_24_*`.
//!
//! # Why a separate library?
//!
//! cuSPARSELt is a distinct NVIDIA SDK from cuSPARSE — it specialises
//! in dense-by-2:4-structured-sparse matmul on Ampere+ Tensor Cores. It
//! ships its own `libcusparseLt.so` and header `cusparseLt.h`. Activating
//! the `cusparselt` cargo feature opts the workspace into linking this
//! library; the default workspace build does not require it.
//!
//! # PyTorch parity
//!
//! `torch._C._sparse_semi_structured_apply` (used by the
//! [`SparseSemiStructuredTensor`] internal class in core PyTorch and by
//! `nn.utils.parametrize`-based 2:4-pruned linears) routes through
//! cuSPARSELt on Ampere+. ferrotorch mirrors that per `rust-gpu-discipline
//! §3` for `SemiStructuredSparseTensor::sparse_matmul_24` whenever the
//! feature is built **and** `libcusparseLt.so` is available at runtime.
//!
//! # Storage convention
//!
//! cuSPARSELt's "structured" matrix is the 2:4-sparse operand of the
//! matmul (typically the **B** operand in PyTorch's terminology). The
//! ferrotorch `SemiStructuredSparseTensor::sparse_matmul_24(a, b)` API
//! has `b` as the sparse 2:4 weight; that maps directly onto cuSPARSELt's
//! `matB`.
//!
//! cuSPARSELt expects column-major storage internally for the structured
//! operand; the caller must supply leading-dim/order info on each matrix
//! descriptor. We use ROW order on all three matrix descriptors (A,
//! structured B, and C/D) and let cuSPARSELt re-pack the structured
//! operand internally.
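//!
//! As a concrete (illustrative) example: a `[k, n] = [4, 8]` FP16 weight
//! handed in as `matB` is 4 contiguous rows of 8 elements (`ld = n = 8`);
//! the compression step (`cusparseLtSpMMACompress`) then keeps two values
//! out of every group of four and packs them, plus their selection
//! metadata, into cuSPARSELt's internal structured format.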
//!
//! # Compute type
//!
//! For FP16 / BF16 inputs we pick `CUSPARSE_COMPUTE_32F` (FP32 accumulator
//! on Tensor Cores). For FP32 inputs we pick `CUSPARSE_COMPUTE_TF32`,
//! which is the only Tensor-Core-accelerated FP32 mode cuSPARSELt accepts.
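//!
//! # Example
//!
//! A minimal sketch of the intended call flow, assuming the backend already
//! holds row-major device buffers in the layouts described above (buffer and
//! device construction is elided and purely illustrative):
//!
//! ```ignore
//! // One handle per backend, created once and reused for every call.
//! let handle = CusparseLtHandle::new()?;
//!
//! // A: dense [m, k] = [64, 128]; B: decompressed 2:4 weight, [k, n] = [128, 256].
//! let d = gpu_sparse_matmul_24(&handle, &a, &b, 64, 128, 256, CuSpLtDType::F16, &device)?;
//! assert_eq!(d.len(), 64 * 256); // dense [m, n] output, row-major
//! ```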

#![cfg(all(feature = "cuda", feature = "cusparselt"))]
#![allow(non_snake_case, non_camel_case_types, non_upper_case_globals)]
#![allow(dead_code)]

use cudarc::driver::DevicePtr;

use crate::buffer::CudaBuffer;
use crate::device::GpuDevice;
use crate::error::{GpuError, GpuResult};

// ---------------------------------------------------------------------------
// Raw FFI — generated by build.rs via bindgen against cusparseLt.h
// ---------------------------------------------------------------------------

/// Bindgen-generated Rust FFI for cuSPARSELt. `OUT_DIR` is set by cargo
/// during the build script run.
pub mod sys {
    #![allow(clippy::all)]
    #![allow(unused, non_snake_case, non_camel_case_types, non_upper_case_globals)]
    include!(concat!(env!("OUT_DIR"), "/cusparselt_sys.rs"));
}

// ---------------------------------------------------------------------------
// Status helper
// ---------------------------------------------------------------------------

#[inline]
fn check(status: sys::cusparseStatus_t, op: &'static str) -> GpuResult<()> {
    if status == sys::cusparseStatus_t::CUSPARSE_STATUS_SUCCESS {
        Ok(())
    } else {
        Err(GpuError::InvalidState {
            message: format!("{op} returned cuSPARSELt status {status:?}"),
        })
    }
}

// ---------------------------------------------------------------------------
// Handle wrapper
// ---------------------------------------------------------------------------

/// Owning wrapper around `cusparseLtHandle_t`. Mirrors the
/// `CusparseHandle` pattern from `crate::sparse` — destroy on drop, one
/// per backend, stream rebound per call.
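///
/// A minimal sketch of that "one per backend" pattern, assuming a lazily
/// initialised slot (the actual field layout in `CudaBackendImpl` may
/// differ):
///
/// ```ignore
/// use std::sync::OnceLock;
///
/// struct Backend {
///     cusparselt: OnceLock<CusparseLtHandle>,
/// }
///
/// impl Backend {
///     fn lt_handle(&self) -> GpuResult<&CusparseLtHandle> {
///         if self.cusparselt.get().is_none() {
///             // First use on this backend: create the handle; if another
///             // thread won the race, drop ours and reuse theirs.
///             let _ = self.cusparselt.set(CusparseLtHandle::new()?);
///         }
///         Ok(self.cusparselt.get().expect("initialised above"))
///     }
/// }
/// ```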
pub struct CusparseLtHandle {
    inner: sys::cusparseLtHandle_t,
}

// SAFETY: `cusparseLtHandle_t` is an opaque struct value (not a pointer)
// per the SDK; cuSPARSELt requires the handle to be used from one thread
// at a time. `CudaBackendImpl` owns it inside `OnceLock` and serialises
// per-device access.
unsafe impl Send for CusparseLtHandle {}
unsafe impl Sync for CusparseLtHandle {}

impl std::fmt::Debug for CusparseLtHandle {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CusparseLtHandle").finish()
    }
}

impl CusparseLtHandle {
    /// Create and initialise a fresh cuSPARSELt handle on the current
    /// CUDA context.
    pub fn new() -> GpuResult<Self> {
        // SAFETY: `cusparseLtInit` writes a fresh handle into the
        // pointed-to memory. `MaybeUninit::zeroed()` for an opaque
        // struct of trivially-readable bytes is sound — cuSPARSELt
        // overwrites every field on success.
        let mut inner: sys::cusparseLtHandle_t =
            unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
        let status = unsafe { sys::cusparseLtInit(&mut inner as *mut _) };
        check(status, "cusparseLtInit")?;
        Ok(Self { inner })
    }

    /// Pointer-typed access for FFI calls.
    #[inline]
    pub fn raw(&self) -> *const sys::cusparseLtHandle_t {
        &self.inner as *const _
    }

    /// Mutable pointer-typed access for FFI calls that take `*mut`.
    #[inline]
    pub fn raw_mut(&mut self) -> *mut sys::cusparseLtHandle_t {
        &mut self.inner as *mut _
    }
}

impl Drop for CusparseLtHandle {
    fn drop(&mut self) {
        // SAFETY: handle was initialised by `cusparseLtInit` and not
        // destroyed yet; Drop runs at most once.
        unsafe {
            let _ = sys::cusparseLtDestroy(&mut self.inner as *mut _);
        }
    }
}

// ---------------------------------------------------------------------------
// dtype mapping
// ---------------------------------------------------------------------------

/// Value types accepted by cuSPARSELt's structured matmul path. ferrotorch
/// wires up the three dtypes needed for PyTorch parity: FP16, BF16, and
/// FP32 (TF32 compute mode).
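///
/// The mapping implemented by `cuda_dtype` / `compute_type` below:
///
/// | variant | CUDA data type | compute type            |
/// |---------|----------------|-------------------------|
/// | `F16`   | `CUDA_R_16F`   | `CUSPARSE_COMPUTE_32F`  |
/// | `Bf16`  | `CUDA_R_16BF`  | `CUSPARSE_COMPUTE_32F`  |
/// | `F32`   | `CUDA_R_32F`   | `CUSPARSE_COMPUTE_TF32` |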
#[derive(Debug, Clone, Copy)]
pub enum CuSpLtDType {
    F16,
    Bf16,
    F32,
}

impl CuSpLtDType {
    fn cuda_dtype(self) -> sys::cudaDataType_t {
        match self {
            CuSpLtDType::F16 => sys::cudaDataType_t::CUDA_R_16F,
            CuSpLtDType::Bf16 => sys::cudaDataType_t::CUDA_R_16BF,
            CuSpLtDType::F32 => sys::cudaDataType_t::CUDA_R_32F,
        }
    }

    fn compute_type(self) -> sys::cusparseComputeType {
        match self {
            // FP16/BF16 tensor cores accumulate to FP32 by default; this
            // matches PyTorch autocast semantics and the cuSPARSELt
            // documentation's recommended mode.
            CuSpLtDType::F16 | CuSpLtDType::Bf16 => sys::cusparseComputeType::CUSPARSE_COMPUTE_32F,
            // The only Tensor-Core-accelerated FP32 mode cuSPARSELt
            // exposes is TF32. Plain CUSPARSE_COMPUTE_32F on FP32 inputs
            // is rejected by `cusparseLtMatmulDescriptorInit` on Ampere.
            CuSpLtDType::F32 => sys::cusparseComputeType::CUSPARSE_COMPUTE_TF32,
        }
    }

    fn elem_bytes(self) -> usize {
        match self {
            CuSpLtDType::F16 | CuSpLtDType::Bf16 => 2,
            CuSpLtDType::F32 => 4,
        }
    }

    fn alignment(self) -> u32 {
        // cuSPARSELt requires 16-byte alignment for the leading dim of
        // every matrix on Ampere+ (8 elements for FP16/BF16, 4 elements
        // for FP32). Return alignment in **bytes** — that's what the
        // descriptor init takes.
        16
    }
}

// ---------------------------------------------------------------------------
// Structured matmul — generic over (sparse_b, dense_a) flavoured by dtype
// ---------------------------------------------------------------------------

/// Compute `D = A @ B` (cuSPARSELt's general `alpha * A @ B + beta * C`
/// with `alpha = 1`, `beta = 0` hard-wired below), where `A` is dense
/// `[m, k]` row-major and `B` is a `[k, n]` row-major 2:4-sparse weight.
/// Returns a dense `[m, n]` row-major device buffer of element type
/// matching `dtype`.
///
/// `b_dense_decompressed` must be the **decompressed** form of the 2:4
/// matrix (i.e. dense values with the masked positions set to zero). We
/// hand the dense form to cuSPARSELt's `cusparseLtSpMMACompress` which
/// re-packs it into the Tensor-Core-friendly format internally.
///
/// `b_dense_decompressed.len() == k * n` (row-major, contiguous).
/// `a.len() == m * k` (row-major, contiguous).
///
/// The element type is the same for A, B, C, D — set via `dtype`. Mixed-
/// precision compute is selected automatically (FP32 accumulator for
/// FP16/BF16, TF32 mode for FP32).
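///
/// # Example
///
/// A minimal shape-contract sketch, assuming `handle`, `device`, and the
/// device buffers `a` / `b` come from the surrounding backend (their
/// construction is elided here):
///
/// ```ignore
/// let (m, k, n) = (32, 64, 128); // all multiples of 8, so FP16/BF16 is eligible
/// assert_eq!(a.len(), m * k);    // A: [m, k] row-major
/// assert_eq!(b.len(), k * n);    // B (decompressed 2:4 weight): [k, n] row-major
/// let d = gpu_sparse_matmul_24(&handle, &a, &b, m, k, n, CuSpLtDType::Bf16, &device)?;
/// assert_eq!(d.len(), m * n);    // D: [m, n] row-major, same element type as A/B
/// ```
///
/// Shapes that fail the multiple-of-8 (FP16/BF16) or multiple-of-4 (FP32)
/// check return `Err` so the caller can fall back to a dense matmul.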
#[allow(clippy::too_many_arguments)]
pub fn gpu_sparse_matmul_24<T>(
    handle: &CusparseLtHandle,
    a_dense: &CudaBuffer<T>,
    b_dense_decompressed: &CudaBuffer<T>,
    m: usize,
    k: usize,
    n: usize,
    dtype: CuSpLtDType,
    device: &GpuDevice,
) -> GpuResult<CudaBuffer<T>>
where
    T: cudarc::driver::DeviceRepr + Default + Copy + 'static,
{
    if a_dense.len() != m * k {
        return Err(GpuError::ShapeMismatch {
            op: "cusparselt::sparse_matmul_24",
            expected: vec![m, k],
            got: vec![a_dense.len()],
        });
    }
    if b_dense_decompressed.len() != k * n {
        return Err(GpuError::ShapeMismatch {
            op: "cusparselt::sparse_matmul_24",
            expected: vec![k, n],
            got: vec![b_dense_decompressed.len()],
        });
    }
    if m == 0 || n == 0 || k == 0 {
        let stream = device.stream();
        let slice = stream.alloc_zeros::<T>(m * n)?;
        return Ok(CudaBuffer::<T> {
            data: Some(slice),
            len: m * n,
            alloc_len: m * n,
            device_ordinal: device.ordinal(),
            pool_fn: None,
        });
    }

    // cuSPARSELt requires every dimension (and hence the row-major leading
    // dims) to be a multiple of 8 for FP16/BF16, or 4 for FP32. Return
    // `Err` for non-aligned shapes so the dispatch site can decompress and
    // run a dense matmul instead; that's the same fallback PyTorch takes.
    let elem_align: usize = match dtype {
        CuSpLtDType::F16 | CuSpLtDType::Bf16 => 8,
        CuSpLtDType::F32 => 4,
    };
    if k % elem_align != 0 || n % elem_align != 0 || m % elem_align != 0 {
        return Err(GpuError::InvalidState {
            message: format!(
                "cusparselt::sparse_matmul_24: dims (m={m}, k={k}, n={n}) must each be a multiple of {elem_align} for dtype {dtype:?}"
            ),
        });
    }

    let stream = device.stream();
    // SAFETY: cudarc's CudaStream::cu_stream returns a valid CUstream
    // for the lifetime of the Arc<CudaStream>. cusparseLt's
    // cudaStream_t is the same ABI-level pointer.
    let cu_stream = stream.cu_stream() as sys::cudaStream_t;

    let dtype_cuda = dtype.cuda_dtype();
    let compute = dtype.compute_type();
    let align: u32 = dtype.alignment();

    // ---- Descriptors --------------------------------------------------------
    // SAFETY: each descriptor is initialised by the matching cuSPARSELt
    // *Init function before anything reads it; its address is only handed
    // to the FFI after that initialisation, and it is destroyed (or left
    // to fall out of scope as a POD value the SDK does not require
    // explicit freeing for) at the end of the function.
    let mut a_descr: sys::cusparseLtMatDescriptor_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut b_descr: sys::cusparseLtMatDescriptor_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut c_descr: sys::cusparseLtMatDescriptor_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut matmul_descr: sys::cusparseLtMatmulDescriptor_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut alg_sel: sys::cusparseLtMatmulAlgSelection_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut plan: sys::cusparseLtMatmulPlan_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };

    // We allocate the output buffer up front so we can hand the device
    // pointer to the matmul without re-borrowing later.
    let mut out_slice = stream.alloc_zeros::<T>(m * n)?;

    let m_i64 = i64::try_from(m).map_err(|_| GpuError::InvalidState {
        message: format!("cusparselt: m={m} exceeds i64::MAX"),
    })?;
    let n_i64 = i64::try_from(n).map_err(|_| GpuError::InvalidState {
        message: format!("cusparselt: n={n} exceeds i64::MAX"),
    })?;
    let k_i64 = i64::try_from(k).map_err(|_| GpuError::InvalidState {
        message: format!("cusparselt: k={k} exceeds i64::MAX"),
    })?;

    let result = (|| -> GpuResult<CudaBuffer<T>> {
        // A: dense [m, k], row-major, ld = k.
        let status = unsafe {
            sys::cusparseLtDenseDescriptorInit(
                handle.raw(),
                &mut a_descr as *mut _,
                m_i64,
                k_i64,
                k_i64,
                align,
                dtype_cuda,
                sys::cusparseOrder_t::CUSPARSE_ORDER_ROW,
            )
        };
        check(status, "cusparseLtDenseDescriptorInit (A)")?;

        // B: structured 2:4 [k, n], row-major, ld = n.
        let status = unsafe {
            sys::cusparseLtStructuredDescriptorInit(
                handle.raw(),
                &mut b_descr as *mut _,
                k_i64,
                n_i64,
                n_i64,
                align,
                dtype_cuda,
                sys::cusparseOrder_t::CUSPARSE_ORDER_ROW,
                sys::cusparseLtSparsity_t::CUSPARSELT_SPARSITY_50_PERCENT,
            )
        };
        check(status, "cusparseLtStructuredDescriptorInit (B)")?;

        // C/D: dense [m, n], row-major, ld = n. cuSPARSELt's
        // descriptor is shared between C and D.
        let status = unsafe {
            sys::cusparseLtDenseDescriptorInit(
                handle.raw(),
                &mut c_descr as *mut _,
                m_i64,
                n_i64,
                n_i64,
                align,
                dtype_cuda,
                sys::cusparseOrder_t::CUSPARSE_ORDER_ROW,
            )
        };
        check(status, "cusparseLtDenseDescriptorInit (C)")?;

        // Matmul descriptor: opA = N, opB = N (no transpose).
        let status = unsafe {
            sys::cusparseLtMatmulDescriptorInit(
                handle.raw(),
                &mut matmul_descr as *mut _,
                sys::cusparseOperation_t::CUSPARSE_OPERATION_NON_TRANSPOSE,
                sys::cusparseOperation_t::CUSPARSE_OPERATION_NON_TRANSPOSE,
                &a_descr as *const _,
                &b_descr as *const _,
                &c_descr as *const _,
                &c_descr as *const _,
                compute,
            )
        };
        check(status, "cusparseLtMatmulDescriptorInit")?;

        // Algorithm selection — DEFAULT (the SDK picks based on shape).
        let status = unsafe {
            sys::cusparseLtMatmulAlgSelectionInit(
                handle.raw(),
                &mut alg_sel as *mut _,
                &matmul_descr as *const _,
                sys::cusparseLtMatmulAlg_t::CUSPARSELT_MATMUL_ALG_DEFAULT,
            )
        };
        check(status, "cusparseLtMatmulAlgSelectionInit")?;

        // Build the plan.
        let status = unsafe {
            sys::cusparseLtMatmulPlanInit(
                handle.raw(),
                &mut plan as *mut _,
                &matmul_descr as *const _,
                &alg_sel as *const _,
            )
        };
        check(status, "cusparseLtMatmulPlanInit")?;

        // Workspace size for the matmul kernel.
        let mut workspace_size: usize = 0;
        let status = unsafe {
            sys::cusparseLtMatmulGetWorkspace(
                handle.raw(),
                &plan as *const _,
                &mut workspace_size as *mut _,
            )
        };
        check(status, "cusparseLtMatmulGetWorkspace")?;

        // Compressed-buffer sizes for the structured operand.
        let mut compressed_size: usize = 0;
        let mut compressed_buffer_size: usize = 0;
        let status = unsafe {
            sys::cusparseLtSpMMACompressedSize(
                handle.raw(),
                &plan as *const _,
                &mut compressed_size as *mut _,
                &mut compressed_buffer_size as *mut _,
            )
        };
        check(status, "cusparseLtSpMMACompressedSize")?;

        // Allocate the workspace + compressed-output + scratch buffers
        // on the same stream.
        let mut workspace = stream.alloc_zeros::<u8>(workspace_size.max(1))?;
        let mut compressed = stream.alloc_zeros::<u8>(compressed_size.max(1))?;
        let mut compressed_scratch = stream.alloc_zeros::<u8>(compressed_buffer_size.max(1))?;

        // SAFETY: device_ptr* returns a CUdeviceptr that's valid while
        // the underlying CudaSlice is live. The SyncOnDrop guards bind
        // the borrow lifetime to inner scopes so we can move
        // `out_slice` into the returned `CudaBuffer` once those guards
        // drop.
        use cudarc::driver::DevicePtrMut;

        // Compress B (the dense decompressed form) into the
        // Tensor-Core-friendly cuSPARSELt layout. Scope the borrow on
        // `compressed` / `compressed_scratch` so we can re-borrow
        // `compressed` for the matmul below.
        {
            let (b_dense_ptr, _b_dense_sync) = b_dense_decompressed.inner().device_ptr(&stream);
            let (compressed_ptr, _compressed_sync) = compressed.device_ptr_mut(&stream);
            let (compressed_scratch_ptr, _compressed_scratch_sync) =
                compressed_scratch.device_ptr_mut(&stream);

            let status = unsafe {
                sys::cusparseLtSpMMACompress(
                    handle.raw(),
                    &plan as *const _,
                    b_dense_ptr as *const std::ffi::c_void,
                    compressed_ptr as *mut std::ffi::c_void,
                    compressed_scratch_ptr as *mut std::ffi::c_void,
                    cu_stream,
                )
            };
            check(status, "cusparseLtSpMMACompress")?;
        }

        // alpha = 1, beta = 0 — packed as f32 because cuSPARSELt reads
        // them as the *compute* type. For FP16/BF16 inputs with FP32
        // accumulator, the scalar pointer interpretation is FP32. For
        // TF32-mode FP32 inputs, the scalar pointer is also FP32.
        let alpha: f32 = 1.0;
        let beta: f32 = 0.0;

        // Run the matmul in a tight scope so all `_sync` guards drop
        // before we move `out_slice` into the returned `CudaBuffer`.
        {
            let (a_ptr, _a_sync) = a_dense.inner().device_ptr(&stream);
            let (compressed_ptr_ro, _compressed_sync_ro) = compressed.device_ptr_mut(&stream);
            let (out_ptr, _out_sync) = out_slice.device_ptr_mut(&stream);
            let (workspace_ptr, _workspace_sync) = workspace.device_ptr_mut(&stream);

            // The matmul `streams` arg is a list of CUDA streams to
            // multiplex across; we pass our single stream.
            let mut streams: [sys::cudaStream_t; 1] = [cu_stream];

            let status = unsafe {
                sys::cusparseLtMatmul(
                    handle.raw(),
                    &plan as *const _,
                    std::ptr::from_ref::<f32>(&alpha).cast::<std::ffi::c_void>(),
                    a_ptr as *const std::ffi::c_void,
                    compressed_ptr_ro as *const std::ffi::c_void,
                    std::ptr::from_ref::<f32>(&beta).cast::<std::ffi::c_void>(),
                    out_ptr as *const std::ffi::c_void,
                    out_ptr as *mut std::ffi::c_void,
                    workspace_ptr as *mut std::ffi::c_void,
                    streams.as_mut_ptr(),
                    1,
                )
            };
            check(status, "cusparseLtMatmul")?;
        }

        Ok(CudaBuffer::<T> {
            data: Some(out_slice),
            len: m * n,
            alloc_len: m * n,
            device_ordinal: device.ordinal(),
            pool_fn: None,
        })
    })();

    // SAFETY: per the SDK, each *DescriptorDestroy / PlanDestroy tolerates
    // partially-initialised descriptors; the pointers we pass are addresses
    // of stack slots, which are always valid here.
    unsafe {
        let _ = sys::cusparseLtMatmulPlanDestroy(&mut plan as *mut _);
        let _ = sys::cusparseLtMatDescriptorDestroy(&mut c_descr as *mut _);
        let _ = sys::cusparseLtMatDescriptorDestroy(&mut b_descr as *mut _);
        let _ = sys::cusparseLtMatDescriptorDestroy(&mut a_descr as *mut _);
    }

    result
}