//! cuSPARSELt-backed 2:4 structured sparse matmul.
//!
//! This module wraps the raw FFI emitted by `build.rs` (see
//! `cusparseLt.h` / [NVIDIA cuSPARSELt SDK](https://docs.nvidia.com/cuda/cusparselt))
//! into safe-ish helpers used by `CudaBackendImpl::sparse_matmul_24_*`.
//!
//! # Why a separate library?
//!
//! cuSPARSELt is a distinct NVIDIA SDK from cuSPARSE — it specialises
//! in dense-by-2:4-structured-sparse matmul on Ampere+ Tensor Cores. It
//! ships its own `libcusparseLt.so` and header `cusparseLt.h`. Activating
//! the `cusparselt` cargo feature opts the workspace into linking this
//! library; the default workspace build does not require it.
//!
//! # PyTorch parity
//!
//! `torch._C._sparse_semi_structured_apply` (used by the
//! [`SparseSemiStructuredTensor`] internal class in core PyTorch and by
//! `nn.utils.parametrize`-based 2:4-pruned linears) routes through
//! cuSPARSELt on Ampere+. ferrotorch mirrors that per `rust-gpu-discipline
//! §3` for `SemiStructuredSparseTensor::sparse_matmul_24` whenever the
//! feature is built **and** `libcusparseLt.so` is available at runtime.
//!
//! # Storage convention
//!
//! cuSPARSELt's "structured" matrix is the 2:4-sparse operand of the
//! matmul (typically the **B** operand in PyTorch's terminology). The
//! ferrotorch `SemiStructuredSparseTensor::sparse_matmul_24(a, b)` API
//! has `b` as the sparse 2:4 weight; that maps directly onto cuSPARSELt's
//! `matB`.
//!
//! cuSPARSELt expects column-major storage internally for the structured
//! operand; the caller must supply leading-dim/order info on each matrix
//! descriptor. We use ROW order on all three matrix descriptors (A,
//! structured B, and C/D) and let cuSPARSELt re-pack the structured
//! operand internally.
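//!
//! As a concrete (illustrative) example: a `[k, n] = [4, 8]` FP16 weight
//! handed in as `matB` is 4 contiguous rows of 8 elements (`ld = n = 8`);
//! the compression step (`cusparseLtSpMMACompress`) then keeps two values
//! out of every group of four and packs them, plus their selection
//! metadata, into cuSPARSELt's internal structured format.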
//!
//! # Compute type
//!
//! For FP16 / BF16 inputs we pick `CUSPARSE_COMPUTE_32F` (FP32 accumulator
//! on Tensor Cores). For FP32 inputs we pick `CUSPARSE_COMPUTE_TF32`,
//! which is the only Tensor-Core-accelerated FP32 mode cuSPARSELt accepts.
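//!
//! # Example
//!
//! A minimal sketch of the intended call flow, assuming the backend already
//! holds row-major device buffers in the layouts described above (buffer and
//! device construction is elided and purely illustrative):
//!
//! ```ignore
//! // One handle per backend, created once and reused for every call.
//! let handle = CusparseLtHandle::new()?;
//!
//! // A: dense [m, k] = [64, 128]; B: decompressed 2:4 weight, [k, n] = [128, 256].
//! let d = gpu_sparse_matmul_24(&handle, &a, &b, 64, 128, 256, CuSpLtDType::F16, &device)?;
//! assert_eq!(d.len(), 64 * 256); // dense [m, n] output, row-major
//! ```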

#![cfg(all(feature = "cuda", feature = "cusparselt"))]
#![allow(non_snake_case, non_camel_case_types, non_upper_case_globals)]
#![allow(dead_code)]

use cudarc::driver::DevicePtr;

use crate::buffer::CudaBuffer;
use crate::device::GpuDevice;
use crate::error::{GpuError, GpuResult};

// ---------------------------------------------------------------------------
// Raw FFI — generated by build.rs via bindgen against cusparseLt.h
// ---------------------------------------------------------------------------

/// Bindgen-generated Rust FFI for cuSPARSELt. `OUT_DIR` is set by cargo
/// during the build script run.
pub mod sys {
    #![allow(clippy::all)]
    #![allow(unused, non_snake_case, non_camel_case_types, non_upper_case_globals)]
    include!(concat!(env!("OUT_DIR"), "/cusparselt_sys.rs"));
}

// ---------------------------------------------------------------------------
// Status helper
// ---------------------------------------------------------------------------

#[inline]
fn check(status: sys::cusparseStatus_t, op: &'static str) -> GpuResult<()> {
    if status == sys::cusparseStatus_t::CUSPARSE_STATUS_SUCCESS {
        Ok(())
    } else {
        Err(GpuError::InvalidState {
            message: format!("{op} returned cuSPARSELt status {status:?}"),
        })
    }
}

// ---------------------------------------------------------------------------
// Handle wrapper
// ---------------------------------------------------------------------------

/// Owning wrapper around `cusparseLtHandle_t`. Mirrors the
/// `CusparseHandle` pattern from `crate::sparse` — destroy on drop, one
/// per backend, stream rebound per call.
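///
/// A minimal sketch of that "one per backend" pattern, assuming a lazily
/// initialised slot (the actual field layout in `CudaBackendImpl` may
/// differ):
///
/// ```ignore
/// use std::sync::OnceLock;
///
/// struct Backend {
///     cusparselt: OnceLock<CusparseLtHandle>,
/// }
///
/// impl Backend {
///     fn lt_handle(&self) -> GpuResult<&CusparseLtHandle> {
///         if self.cusparselt.get().is_none() {
///             // First use on this backend: create the handle; if another
///             // thread won the race, drop ours and reuse theirs.
///             let _ = self.cusparselt.set(CusparseLtHandle::new()?);
///         }
///         Ok(self.cusparselt.get().expect("initialised above"))
///     }
/// }
/// ```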
pub struct CusparseLtHandle {
    inner: sys::cusparseLtHandle_t,
}

// SAFETY: `cusparseLtHandle_t` is an opaque struct value (not a pointer)
// per the SDK; cuSPARSELt requires the handle to be used from one thread
// at a time. `CudaBackendImpl` owns it inside `OnceLock` and serialises
// per-device access.
unsafe impl Send for CusparseLtHandle {}
unsafe impl Sync for CusparseLtHandle {}

impl std::fmt::Debug for CusparseLtHandle {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CusparseLtHandle").finish()
    }
}

impl CusparseLtHandle {
    /// Create and initialise a fresh cuSPARSELt handle on the current
    /// CUDA context.
    pub fn new() -> GpuResult<Self> {
        // SAFETY: `cusparseLtInit` writes a fresh handle into the
        // pointed-to memory. `MaybeUninit::zeroed()` for an opaque
        // struct of trivially-readable bytes is sound — cuSPARSELt
        // overwrites every field on success.
        let mut inner: sys::cusparseLtHandle_t =
            unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
        let status = unsafe { sys::cusparseLtInit(&mut inner as *mut _) };
        check(status, "cusparseLtInit")?;
        Ok(Self { inner })
    }

    /// Pointer-typed access for FFI calls.
    #[inline]
    pub fn raw(&self) -> *const sys::cusparseLtHandle_t {
        &self.inner as *const _
    }

    /// Mutable pointer-typed access for FFI calls that take `*mut`.
    #[inline]
    pub fn raw_mut(&mut self) -> *mut sys::cusparseLtHandle_t {
        &mut self.inner as *mut _
    }
}

impl Drop for CusparseLtHandle {
    fn drop(&mut self) {
        // SAFETY: handle was initialised by `cusparseLtInit` and not
        // destroyed yet; Drop runs at most once.
        unsafe {
            let _ = sys::cusparseLtDestroy(&mut self.inner as *mut _);
        }
    }
}

// ---------------------------------------------------------------------------
// dtype mapping
// ---------------------------------------------------------------------------

/// Value types accepted by cuSPARSELt's structured matmul path. ferrotorch
/// wires up the three dtypes needed for PyTorch parity: FP16, BF16, and
/// FP32 (TF32 compute mode).
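///
/// The mapping implemented by `cuda_dtype` / `compute_type` below:
///
/// | variant | CUDA data type | compute type            |
/// |---------|----------------|-------------------------|
/// | `F16`   | `CUDA_R_16F`   | `CUSPARSE_COMPUTE_32F`  |
/// | `Bf16`  | `CUDA_R_16BF`  | `CUSPARSE_COMPUTE_32F`  |
/// | `F32`   | `CUDA_R_32F`   | `CUSPARSE_COMPUTE_TF32` |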
#[derive(Debug, Clone, Copy)]
pub enum CuSpLtDType {
    F16,
    Bf16,
    F32,
}

impl CuSpLtDType {
    fn cuda_dtype(self) -> sys::cudaDataType_t {
        match self {
            CuSpLtDType::F16 => sys::cudaDataType_t::CUDA_R_16F,
            CuSpLtDType::Bf16 => sys::cudaDataType_t::CUDA_R_16BF,
            CuSpLtDType::F32 => sys::cudaDataType_t::CUDA_R_32F,
        }
    }

    fn compute_type(self) -> sys::cusparseComputeType {
        match self {
            // FP16/BF16 tensor cores accumulate to FP32 by default; this
            // matches PyTorch autocast semantics and the cuSPARSELt
            // documentation's recommended mode.
            CuSpLtDType::F16 | CuSpLtDType::Bf16 => sys::cusparseComputeType::CUSPARSE_COMPUTE_32F,
            // The only Tensor-Core-accelerated FP32 mode cuSPARSELt
            // exposes is TF32. Plain CUSPARSE_COMPUTE_32F on FP32 inputs
            // is rejected by `cusparseLtMatmulDescriptorInit` on Ampere.
            CuSpLtDType::F32 => sys::cusparseComputeType::CUSPARSE_COMPUTE_TF32,
        }
    }

    fn elem_bytes(self) -> usize {
        match self {
            CuSpLtDType::F16 | CuSpLtDType::Bf16 => 2,
            CuSpLtDType::F32 => 4,
        }
    }

    fn alignment(self) -> u32 {
        // cuSPARSELt requires 16-byte alignment for the leading dim of
        // every matrix on Ampere+ (8 elements for FP16/BF16, 4 elements
        // for FP32). Return alignment in **bytes** — that's what the
        // descriptor init takes.
        16
    }
}

// ---------------------------------------------------------------------------
// Structured matmul — generic over (sparse_b, dense_a) flavoured by dtype
// ---------------------------------------------------------------------------

/// Compute `D = A @ B` (cuSPARSELt's general `alpha * A @ B + beta * C`
/// with `alpha = 1`, `beta = 0` hard-wired below), where `A` is dense
/// `[m, k]` row-major and `B` is a `[k, n]` row-major 2:4-sparse weight.
/// Returns a dense `[m, n]` row-major device buffer of element type
/// matching `dtype`.
///
/// `b_dense_decompressed` must be the **decompressed** form of the 2:4
/// matrix (i.e. dense values with the masked positions set to zero). We
/// hand the dense form to cuSPARSELt's `cusparseLtSpMMACompress` which
/// re-packs it into the Tensor-Core-friendly format internally.
///
/// `b_dense_decompressed.len() == k * n` (row-major, contiguous).
/// `a.len() == m * k` (row-major, contiguous).
///
/// The element type is the same for A, B, C, D — set via `dtype`. Mixed-
/// precision compute is selected automatically (FP32 accumulator for
/// FP16/BF16, TF32 mode for FP32).
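///
/// # Example
///
/// A minimal shape-contract sketch, assuming `handle`, `device`, and the
/// device buffers `a` / `b` come from the surrounding backend (their
/// construction is elided here):
///
/// ```ignore
/// let (m, k, n) = (32, 64, 128); // all multiples of 8, so FP16/BF16 is eligible
/// assert_eq!(a.len(), m * k);    // A: [m, k] row-major
/// assert_eq!(b.len(), k * n);    // B (decompressed 2:4 weight): [k, n] row-major
/// let d = gpu_sparse_matmul_24(&handle, &a, &b, m, k, n, CuSpLtDType::Bf16, &device)?;
/// assert_eq!(d.len(), m * n);    // D: [m, n] row-major, same element type as A/B
/// ```
///
/// Shapes that fail the multiple-of-8 (FP16/BF16) or multiple-of-4 (FP32)
/// check return `Err` so the caller can fall back to a dense matmul.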
#[allow(clippy::too_many_arguments)]
pub fn gpu_sparse_matmul_24<T>(
    handle: &CusparseLtHandle,
    a_dense: &CudaBuffer<T>,
    b_dense_decompressed: &CudaBuffer<T>,
    m: usize,
    k: usize,
    n: usize,
    dtype: CuSpLtDType,
    device: &GpuDevice,
) -> GpuResult<CudaBuffer<T>>
where
    T: cudarc::driver::DeviceRepr + Default + Copy + 'static,
{
    if a_dense.len() != m * k {
        return Err(GpuError::ShapeMismatch {
            op: "cusparselt::sparse_matmul_24",
            expected: vec![m, k],
            got: vec![a_dense.len()],
        });
    }
    if b_dense_decompressed.len() != k * n {
        return Err(GpuError::ShapeMismatch {
            op: "cusparselt::sparse_matmul_24",
            expected: vec![k, n],
            got: vec![b_dense_decompressed.len()],
        });
    }
    if m == 0 || n == 0 || k == 0 {
        let stream = device.stream();
        let slice = stream.alloc_zeros::<T>(m * n)?;
        return Ok(CudaBuffer::<T> {
            data: Some(slice),
            len: m * n,
            alloc_len: m * n,
            device_ordinal: device.ordinal(),
            pool_fn: None,
        });
    }

    // cuSPARSELt requires every dimension (and hence the row-major leading
    // dims) to be a multiple of 8 for FP16/BF16, or 4 for FP32. Return
    // `Err` for non-aligned shapes so the dispatch site can decompress and
    // run a dense matmul instead; that's the same fallback PyTorch takes.
    let elem_align: usize = match dtype {
        CuSpLtDType::F16 | CuSpLtDType::Bf16 => 8,
        CuSpLtDType::F32 => 4,
    };
    if k % elem_align != 0 || n % elem_align != 0 || m % elem_align != 0 {
        return Err(GpuError::InvalidState {
            message: format!(
                "cusparselt::sparse_matmul_24: dims (m={m}, k={k}, n={n}) must each be a multiple of {elem_align} for dtype {dtype:?}"
            ),
        });
    }

    let stream = device.stream();
    // SAFETY: cudarc's CudaStream::cu_stream returns a valid CUstream
    // for the lifetime of the Arc<CudaStream>. cusparseLt's
    // cudaStream_t is the same ABI-level pointer.
    let cu_stream = stream.cu_stream() as sys::cudaStream_t;

    let dtype_cuda = dtype.cuda_dtype();
    let compute = dtype.compute_type();
    let align: u32 = dtype.alignment();

    // ---- Descriptors --------------------------------------------------------
    // SAFETY: each descriptor is initialised by the matching cuSPARSELt
    // *Init function before anything reads it; its address is only handed
    // to the FFI after that initialisation, and it is destroyed (or left
    // to fall out of scope as a POD value the SDK does not require
    // explicit freeing for) at the end of the function.
    let mut a_descr: sys::cusparseLtMatDescriptor_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut b_descr: sys::cusparseLtMatDescriptor_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut c_descr: sys::cusparseLtMatDescriptor_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut matmul_descr: sys::cusparseLtMatmulDescriptor_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut alg_sel: sys::cusparseLtMatmulAlgSelection_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };
    let mut plan: sys::cusparseLtMatmulPlan_t =
        unsafe { std::mem::MaybeUninit::zeroed().assume_init() };

    // We allocate the output buffer up front so we can hand the device
    // pointer to the matmul without re-borrowing later.
    let mut out_slice = stream.alloc_zeros::<T>(m * n)?;

    let m_i64 = i64::try_from(m).map_err(|_| GpuError::InvalidState {
        message: format!("cusparselt: m={m} exceeds i64::MAX"),
    })?;
    let n_i64 = i64::try_from(n).map_err(|_| GpuError::InvalidState {
        message: format!("cusparselt: n={n} exceeds i64::MAX"),
    })?;
    let k_i64 = i64::try_from(k).map_err(|_| GpuError::InvalidState {
        message: format!("cusparselt: k={k} exceeds i64::MAX"),
    })?;

    let result = (|| -> GpuResult<CudaBuffer<T>> {
        // A: dense [m, k], row-major, ld = k.
        let status = unsafe {
            sys::cusparseLtDenseDescriptorInit(
                handle.raw(),
                &mut a_descr as *mut _,
                m_i64,
                k_i64,
                k_i64,
                align,
                dtype_cuda,
                sys::cusparseOrder_t::CUSPARSE_ORDER_ROW,
            )
        };
        check(status, "cusparseLtDenseDescriptorInit (A)")?;

        // B: structured 2:4 [k, n], row-major, ld = n.
        let status = unsafe {
            sys::cusparseLtStructuredDescriptorInit(
                handle.raw(),
                &mut b_descr as *mut _,
                k_i64,
                n_i64,
                n_i64,
                align,
                dtype_cuda,
                sys::cusparseOrder_t::CUSPARSE_ORDER_ROW,
                sys::cusparseLtSparsity_t::CUSPARSELT_SPARSITY_50_PERCENT,
            )
        };
        check(status, "cusparseLtStructuredDescriptorInit (B)")?;

        // C/D: dense [m, n], row-major, ld = n. cuSPARSELt's
        // descriptor is shared between C and D.
        let status = unsafe {
            sys::cusparseLtDenseDescriptorInit(
                handle.raw(),
                &mut c_descr as *mut _,
                m_i64,
                n_i64,
                n_i64,
                align,
                dtype_cuda,
                sys::cusparseOrder_t::CUSPARSE_ORDER_ROW,
            )
        };
        check(status, "cusparseLtDenseDescriptorInit (C)")?;

        // Matmul descriptor: opA = N, opB = N (no transpose).
        let status = unsafe {
            sys::cusparseLtMatmulDescriptorInit(
                handle.raw(),
                &mut matmul_descr as *mut _,
                sys::cusparseOperation_t::CUSPARSE_OPERATION_NON_TRANSPOSE,
                sys::cusparseOperation_t::CUSPARSE_OPERATION_NON_TRANSPOSE,
                &a_descr as *const _,
                &b_descr as *const _,
                &c_descr as *const _,
                &c_descr as *const _,
                compute,
            )
        };
        check(status, "cusparseLtMatmulDescriptorInit")?;

        // Algorithm selection — DEFAULT (the SDK picks based on shape).
        let status = unsafe {
            sys::cusparseLtMatmulAlgSelectionInit(
                handle.raw(),
                &mut alg_sel as *mut _,
                &matmul_descr as *const _,
                sys::cusparseLtMatmulAlg_t::CUSPARSELT_MATMUL_ALG_DEFAULT,
            )
        };
        check(status, "cusparseLtMatmulAlgSelectionInit")?;

        // Build the plan.
        let status = unsafe {
            sys::cusparseLtMatmulPlanInit(
                handle.raw(),
                &mut plan as *mut _,
                &matmul_descr as *const _,
                &alg_sel as *const _,
            )
        };
        check(status, "cusparseLtMatmulPlanInit")?;

        // Workspace size for the matmul kernel.
        let mut workspace_size: usize = 0;
        let status = unsafe {
            sys::cusparseLtMatmulGetWorkspace(
                handle.raw(),
                &plan as *const _,
                &mut workspace_size as *mut _,
            )
        };
        check(status, "cusparseLtMatmulGetWorkspace")?;

        // Compressed-buffer sizes for the structured operand.
        let mut compressed_size: usize = 0;
        let mut compressed_buffer_size: usize = 0;
        let status = unsafe {
            sys::cusparseLtSpMMACompressedSize(
                handle.raw(),
                &plan as *const _,
                &mut compressed_size as *mut _,
                &mut compressed_buffer_size as *mut _,
            )
        };
        check(status, "cusparseLtSpMMACompressedSize")?;

        // Allocate the workspace + compressed-output + scratch buffers
        // on the same stream.
        let mut workspace = stream.alloc_zeros::<u8>(workspace_size.max(1))?;
        let mut compressed = stream.alloc_zeros::<u8>(compressed_size.max(1))?;
        let mut compressed_scratch = stream.alloc_zeros::<u8>(compressed_buffer_size.max(1))?;

        // SAFETY: device_ptr* returns a CUdeviceptr that's valid while
        // the underlying CudaSlice is live. The SyncOnDrop guards bind
        // the borrow lifetime to inner scopes so we can move
        // `out_slice` into the returned `CudaBuffer` once those guards
        // drop.
        use cudarc::driver::DevicePtrMut;

        // Compress B (the dense decompressed form) into the
        // Tensor-Core-friendly cuSPARSELt layout. Scope the borrow on
        // `compressed` / `compressed_scratch` so we can re-borrow
        // `compressed` for the matmul below.
        {
            let (b_dense_ptr, _b_dense_sync) = b_dense_decompressed.inner().device_ptr(&stream);
            let (compressed_ptr, _compressed_sync) = compressed.device_ptr_mut(&stream);
            let (compressed_scratch_ptr, _compressed_scratch_sync) =
                compressed_scratch.device_ptr_mut(&stream);

            let status = unsafe {
                sys::cusparseLtSpMMACompress(
                    handle.raw(),
                    &plan as *const _,
                    b_dense_ptr as *const std::ffi::c_void,
                    compressed_ptr as *mut std::ffi::c_void,
                    compressed_scratch_ptr as *mut std::ffi::c_void,
                    cu_stream,
                )
            };
            check(status, "cusparseLtSpMMACompress")?;
        }

        // alpha = 1, beta = 0 — packed as f32 because cuSPARSELt reads
        // them as the *compute* type. For FP16/BF16 inputs with FP32
        // accumulator, the scalar pointer interpretation is FP32. For
        // TF32-mode FP32 inputs, the scalar pointer is also FP32.
        let alpha: f32 = 1.0;
        let beta: f32 = 0.0;

        // Run the matmul in a tight scope so all `_sync` guards drop
        // before we move `out_slice` into the returned `CudaBuffer`.
        {
            let (a_ptr, _a_sync) = a_dense.inner().device_ptr(&stream);
            let (compressed_ptr_ro, _compressed_sync_ro) = compressed.device_ptr_mut(&stream);
            let (out_ptr, _out_sync) = out_slice.device_ptr_mut(&stream);
            let (workspace_ptr, _workspace_sync) = workspace.device_ptr_mut(&stream);

            // The matmul `streams` arg is a list of CUDA streams to
            // multiplex across; we pass our single stream.
            let mut streams: [sys::cudaStream_t; 1] = [cu_stream];

            let status = unsafe {
                sys::cusparseLtMatmul(
                    handle.raw(),
                    &plan as *const _,
                    std::ptr::from_ref::<f32>(&alpha).cast::<std::ffi::c_void>(),
                    a_ptr as *const std::ffi::c_void,
                    compressed_ptr_ro as *const std::ffi::c_void,
                    std::ptr::from_ref::<f32>(&beta).cast::<std::ffi::c_void>(),
                    out_ptr as *const std::ffi::c_void,
                    out_ptr as *mut std::ffi::c_void,
                    workspace_ptr as *mut std::ffi::c_void,
                    streams.as_mut_ptr(),
                    1,
                )
            };
            check(status, "cusparseLtMatmul")?;
        }

        Ok(CudaBuffer::<T> {
            data: Some(out_slice),
            len: m * n,
            alloc_len: m * n,
            device_ordinal: device.ordinal(),
            pool_fn: None,
        })
    })();

    // SAFETY: per the SDK, each *DescriptorDestroy / PlanDestroy tolerates
    // partially-initialised descriptors; the pointers we pass are addresses
    // of stack slots, which are always valid here.
    unsafe {
        let _ = sys::cusparseLtMatmulPlanDestroy(&mut plan as *mut _);
        let _ = sys::cusparseLtMatDescriptorDestroy(&mut c_descr as *mut _);
        let _ = sys::cusparseLtMatDescriptorDestroy(&mut b_descr as *mut _);
        let _ = sys::cusparseLtMatDescriptorDestroy(&mut a_descr as *mut _);
    }

    result
}