ferrotorch-gpu 0.5.8

CUDA GPU backend for ferrotorch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
//! Host-to-device and device-to-host memory transfers.
//!
//! These functions copy data between CPU (`&[T]` / `Vec<T>`) and GPU
//! ([`CudaBuffer`]) memory via the device's default CUDA stream.

use crate::buffer::CudaBuffer;
use crate::device::GpuDevice;
use crate::error::{GpuError, GpuResult};

/// Upload a host slice into a newly created [`CudaBuffer`] on `device`.
///
/// The copy is issued on the device's stream through `clone_htod`. The
/// returned buffer is not pool-managed (`pool_fn` is `None`), and both its
/// logical and allocated lengths equal `data.len()`.
///
/// # Errors
///
/// Returns [`GpuError::Driver`] if the CUDA memcpy fails.
#[cfg(feature = "cuda")]
pub fn cpu_to_gpu<T>(data: &[T], device: &GpuDevice) -> GpuResult<CudaBuffer<T>>
where
    T: cudarc::driver::DeviceRepr,
{
    let device_slice = device.stream().clone_htod(data)?;
    let elem_count = data.len();

    let buffer = CudaBuffer {
        data: Some(device_slice),
        len: elem_count,
        alloc_len: elem_count,
        device_ordinal: device.ordinal(),
        pool_fn: None,
    };
    Ok(buffer)
}

/// Download the contents of a [`CudaBuffer`] into a host `Vec<T>`.
///
/// The buffer must live on `device`; the ordinals are compared before any
/// copy is attempted. The returned vector holds exactly `buffer.len()`
/// elements.
///
/// # Errors
///
/// Returns [`GpuError::DeviceMismatch`] if the buffer's device ordinal does
/// not match the provided device, or [`GpuError::Driver`] on CUDA errors.
#[cfg(feature = "cuda")]
pub fn gpu_to_cpu<T>(buffer: &CudaBuffer<T>, device: &GpuDevice) -> GpuResult<Vec<T>>
where
    T: cudarc::driver::DeviceRepr,
{
    let (buffer_ordinal, target_ordinal) = (buffer.device_ordinal(), device.ordinal());
    if buffer_ordinal != target_ordinal {
        return Err(GpuError::DeviceMismatch {
            expected: buffer_ordinal,
            got: target_ordinal,
        });
    }

    // The device-to-host copy covers the whole underlying slice. Pooled
    // buffers have a rounded-up allocation, so the copy can contain more
    // elements than the logical length; the excess is cut off here so the
    // caller only receives meaningful data.
    let mut host = device.stream().clone_dtoh(buffer.inner())?;
    host.truncate(buffer.len());
    Ok(host)
}

/// Allocate a zero-initialized [`CudaBuffer<f32>`] of `len` elements on `device`.
///
/// The requested length is first rounded with `crate::pool::round_len`, and
/// the global buffer pool is asked for a cached `CudaSlice<f32>` of that
/// rounded size:
///
/// * **Pool hit** — the cached slice is reused and re-zeroed with
///   `memset_zeros`; no new device allocation is made.
/// * **Pool miss** — a fresh slice of `rounded` (not `len`) elements is
///   allocated with `alloc_zeros`, so that its size matches the key the pool
///   will be queried with later (B12 fix).
///
/// On a hit, the whole rounded allocation is zeroed rather than just the
/// first `len` elements. This is intentional (P10): no stale data from an
/// earlier use may remain in the padding region.
///
/// In both cases the buffer is built with `CudaBuffer::new_pooled`, carrying
/// the logical length `len` and the allocated length `rounded`.
///
/// # Errors
///
/// Propagates the error from `memset_zeros` (pool hit) or `alloc_zeros`
/// (pool miss).
#[cfg(feature = "cuda")]
pub fn alloc_zeros_f32(len: usize, device: &GpuDevice) -> GpuResult<CudaBuffer<f32>> {
    use cudarc::driver::CudaSlice;

    let rounded = crate::pool::round_len(len);
    let ordinal = device.ordinal();

    // The pool key is (device ordinal, rounded length, element size in bytes).
    let slice = match crate::pool::pool_take::<CudaSlice<f32>>(ordinal, rounded, 4) {
        Some(mut cached) => {
            // Reused memory may hold old values: clear all of it.
            device.stream().memset_zeros(&mut cached)?;
            cached
        }
        None => device.stream().alloc_zeros::<f32>(rounded)?,
    };

    Ok(CudaBuffer::<f32>::new_pooled(slice, len, rounded, ordinal))
}

/// Allocate a zeroed `CudaSlice<u16>` holding `len` bf16 elements.
///
/// # Storage convention
///
/// ferrotorch-gpu stores bf16 values as raw `u16` bit patterns (the top
/// 16 bits of an f32). This is the representation consumed by every `*_bf16`
/// PTX kernel in [`crate::bf16`] and by the `gpu_matmul_bf16_bf16` family in
/// [`crate::blas`].
///
/// # Why a raw slice and not a `CudaBuffer`
///
/// [`alloc_zeros_f32`] and [`alloc_zeros_f64`] return a [`CudaBuffer`]; this
/// function deliberately does not. The dispatcher's bf16 paths
/// (`softmax_bf16_f32`, `add_bf16_f32`, etc.) downcast handles directly to
/// `CudaSlice<u16>`, so a `CudaBuffer<T>` wrapper would require changing
/// every existing bf16 dispatcher branch.
///
/// # Pooling
///
/// The CUDA pool is bypassed. Pool entries are keyed by `(ordinal, len,
/// elem_size)` and hand back the original `Box<dyn Any>` (currently typed
/// `CudaSlice<f32>` / `CudaSlice<f64>`). A u16-typed pool path is left as a
/// follow-up: bf16 buffers are short-lived forward-pass intermediates and
/// the pool-miss cost is dominated by the matmul itself.
///
/// # Errors
///
/// Returns [`GpuError::Driver`] if the underlying `cuMemAllocAsync` /
/// `cuMemsetD8Async` calls fail.
#[cfg(feature = "cuda")]
pub fn alloc_zeros_bf16(
    len: usize,
    device: &GpuDevice,
) -> GpuResult<cudarc::driver::CudaSlice<u16>> {
    let zeroed = device.stream().alloc_zeros::<u16>(len)?;
    Ok(zeroed)
}

/// Stand-in for `alloc_zeros_bf16` in builds without the `cuda` feature.
///
/// Both arguments are ignored and the call always fails with
/// [`GpuError::NoCudaFeature`]. The success type is `()` here, whereas the
/// CUDA build returns a cudarc `CudaSlice<u16>`.
#[cfg(not(feature = "cuda"))]
pub fn alloc_zeros_bf16(_len: usize, _device: &GpuDevice) -> GpuResult<()> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}

/// Allocate a zero-initialized [`CudaBuffer<f64>`] of `len` elements on `device`.
///
/// Pool-aware f64 counterpart of [`alloc_zeros_f32`]. The length is rounded
/// with `crate::pool::round_len`. A cached `CudaSlice<f64>` of the rounded
/// size (element size 8) is reused and fully re-zeroed when the pool has
/// one; otherwise a fresh zeroed slice of the rounded size is allocated.
/// The result is wrapped with `CudaBuffer::new_pooled`, carrying the logical
/// length `len` and the allocated length `rounded`.
///
/// # Errors
///
/// Propagates the error from `memset_zeros` (pool hit) or `alloc_zeros`
/// (pool miss).
#[cfg(feature = "cuda")]
pub fn alloc_zeros_f64(len: usize, device: &GpuDevice) -> GpuResult<CudaBuffer<f64>> {
    use cudarc::driver::CudaSlice;

    let rounded = crate::pool::round_len(len);
    let ordinal = device.ordinal();

    // The pool key is (device ordinal, rounded length, element size in bytes).
    let slice = match crate::pool::pool_take::<CudaSlice<f64>>(ordinal, rounded, 8) {
        Some(mut cached) => {
            // Reused memory may hold old values: clear all of it.
            device.stream().memset_zeros(&mut cached)?;
            cached
        }
        None => device.stream().alloc_zeros::<f64>(rounded)?,
    };

    Ok(CudaBuffer::<f64>::new_pooled(slice, len, rounded, ordinal))
}

/// Allocate a zero-initialized [`CudaBuffer<T>`] of `len` elements for any
/// zero-valid element type.
///
/// Kept for backward compatibility and for element types other than
/// f32/f64. The buffer pool is NOT used (it has no support for arbitrary
/// `T`): the allocation is exactly `len` elements and the returned buffer
/// has `pool_fn` set to `None`.
///
/// # Errors
///
/// Propagates the error from the underlying `alloc_zeros` call on the
/// device's stream.
#[cfg(feature = "cuda")]
pub fn alloc_zeros<T>(len: usize, device: &GpuDevice) -> GpuResult<CudaBuffer<T>>
where
    T: cudarc::driver::DeviceRepr + cudarc::driver::ValidAsZeroBits,
{
    let zeroed = device.stream().alloc_zeros::<T>(len)?;

    let buffer = CudaBuffer {
        data: Some(zeroed),
        len,
        alloc_len: len,
        device_ordinal: device.ordinal(),
        pool_fn: None,
    };
    Ok(buffer)
}

/// Copy a host slice to device memory via pinned (page-locked) host memory.
///
/// Allocates a temporary `PinnedHostSlice`, copies `data` into it, then
/// transfers to the GPU using DMA. The pinned allocation is freed after
/// the transfer completes. For large tensors, this is ~2x faster than
/// [`cpu_to_gpu`] which uses pageable memory.
///
/// An empty `data` slice is forwarded to [`cpu_to_gpu`] without touching
/// pinned memory: there is nothing to stage, and it avoids asking the driver
/// for a zero-byte page-locked allocation.
///
/// The returned buffer is not pool-managed (`pool_fn` is `None`), and both
/// its logical and allocated lengths equal `data.len()`.
///
/// # When to use
///
/// Use this in DataLoader's prefetch pipeline when `pin_memory=True`.
/// For small tensors (< 64KB), the overhead of pinned allocation may
/// outweigh the DMA benefit — prefer [`cpu_to_gpu`] instead.
///
/// # Errors
///
/// Propagates any error from the pinned allocation, from obtaining the
/// mutable host view of the pinned slice, or from the host-to-device copy.
#[cfg(feature = "cuda")]
pub fn cpu_to_gpu_pinned<T>(data: &[T], device: &GpuDevice) -> GpuResult<CudaBuffer<T>>
where
    T: cudarc::driver::DeviceRepr + cudarc::driver::ValidAsZeroBits + Copy,
{
    // Nothing to stage for an empty input, so use the plain path.
    // NOTE(review): a zero-byte pinned allocation may be rejected by the
    // driver — confirm against the CUDA driver API docs.
    if data.is_empty() {
        return cpu_to_gpu(data, device);
    }

    let ctx = device.context();
    let stream = device.stream();

    // Allocate pinned host memory and copy data into it.
    // SAFETY:
    // - `CudaContext::alloc_pinned` (cudarc 0.19.4, src/driver/safe/core.rs)
    //   is `unsafe` because the returned `PinnedHostSlice<T>` contains
    //   uninitialized memory after `cuMemAllocHost`/`cuMemHostAlloc`
    //   (CUDA driver API). The caller's obligation is to fully initialize
    //   the slice before any read.
    // - Every element is initialized by the statement immediately below,
    //   `pinned.as_mut_slice()?.copy_from_slice(data)`. `copy_from_slice`
    //   requires source and destination to have equal lengths; `pinned` is
    //   allocated with exactly `data.len()` elements, so the precondition
    //   holds and the whole pinned region is overwritten before the
    //   subsequent `clone_htod` reads it.
    // - The function's bound `T: DeviceRepr + ValidAsZeroBits + Copy`
    //   guarantees the bit layout is suitable for both pinned host memory
    //   and DMA transfer to device.
    // - `ctx` is the context obtained from `device.context()` above; cudarc
    //   upholds the bind-to-thread invariant inside `alloc_pinned`.
    // - Lifetime: `pinned` is owned by this stack frame and is only dropped
    //   (explicitly, below) after `clone_htod` has been called with
    //   `&pinned`, so the pinned allocation outlives every read issued here.
    let mut pinned = unsafe { ctx.alloc_pinned::<T>(data.len()) }?;
    pinned.as_mut_slice()?.copy_from_slice(data);

    // Transfer from pinned host to device (uses DMA, ~2x faster than pageable).
    let slice = stream.clone_htod(&pinned)?;

    // The staging area is no longer needed; release the host memory now.
    drop(pinned);

    Ok(CudaBuffer {
        data: Some(slice),
        len: data.len(),
        alloc_len: data.len(),
        device_ordinal: device.ordinal(),
        pool_fn: None,
    })
}

/// Stand-in for `cpu_to_gpu_pinned` in builds without the `cuda` feature.
///
/// Both arguments are ignored and the call always fails with
/// [`GpuError::NoCudaFeature`].
#[cfg(not(feature = "cuda"))]
pub fn cpu_to_gpu_pinned<T>(_data: &[T], _device: &GpuDevice) -> GpuResult<CudaBuffer<T>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}

// ---------------------------------------------------------------------------
// Stubs when `cuda` feature is disabled
// ---------------------------------------------------------------------------

/// Stand-in for `cpu_to_gpu` in builds without the `cuda` feature.
///
/// Both arguments are ignored and the call always fails with
/// [`GpuError::NoCudaFeature`].
#[cfg(not(feature = "cuda"))]
pub fn cpu_to_gpu<T>(_data: &[T], _device: &GpuDevice) -> GpuResult<CudaBuffer<T>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}

/// Stand-in for `gpu_to_cpu` in builds without the `cuda` feature.
///
/// Both arguments are ignored and the call always fails with
/// [`GpuError::NoCudaFeature`].
#[cfg(not(feature = "cuda"))]
pub fn gpu_to_cpu<T>(_buffer: &CudaBuffer<T>, _device: &GpuDevice) -> GpuResult<Vec<T>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}

/// Stand-in for the generic `alloc_zeros` in builds without the `cuda` feature.
///
/// Both arguments are ignored and the call always fails with
/// [`GpuError::NoCudaFeature`].
#[cfg(not(feature = "cuda"))]
pub fn alloc_zeros<T>(_len: usize, _device: &GpuDevice) -> GpuResult<CudaBuffer<T>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}

/// Stand-in for `alloc_zeros_f32` in builds without the `cuda` feature.
///
/// Both arguments are ignored and the call always fails with
/// [`GpuError::NoCudaFeature`].
#[cfg(not(feature = "cuda"))]
pub fn alloc_zeros_f32(_len: usize, _device: &GpuDevice) -> GpuResult<CudaBuffer<f32>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}

/// Stand-in for `alloc_zeros_f64` in builds without the `cuda` feature.
///
/// Both arguments are ignored and the call always fails with
/// [`GpuError::NoCudaFeature`].
#[cfg(not(feature = "cuda"))]
pub fn alloc_zeros_f64(_len: usize, _device: &GpuDevice) -> GpuResult<CudaBuffer<f64>> {
    let unavailable = GpuError::NoCudaFeature;
    Err(unavailable)
}

// ---------------------------------------------------------------------------
// Tests — require a real CUDA GPU
// ---------------------------------------------------------------------------

#[cfg(all(test, feature = "cuda"))]
mod tests {
    use super::*;

    /// Opens CUDA device 0; every test in this module needs it.
    fn device0() -> GpuDevice {
        GpuDevice::new(0).expect("CUDA device 0")
    }

    /// f32 data survives a host -> device -> host round trip unchanged.
    #[test]
    fn round_trip_f32() {
        let dev = device0();
        let input = vec![1.0_f32, 2.0, 3.0, 4.0, 5.0];

        let on_gpu = cpu_to_gpu(&input, &dev).expect("cpu_to_gpu");
        assert_eq!(on_gpu.len(), 5);
        assert_eq!(on_gpu.device_ordinal(), 0);

        let output = gpu_to_cpu(&on_gpu, &dev).expect("gpu_to_cpu");
        assert_eq!(output, input);
    }

    /// f64 data (including `f64::MAX`) survives a round trip unchanged.
    #[test]
    #[allow(clippy::approx_constant)] // 3.14 is an arbitrary round-trip value, not π.
    fn round_trip_f64() {
        let dev = device0();
        let input = vec![1.0_f64, -2.5, 3.14, 0.0, f64::MAX];

        let on_gpu = cpu_to_gpu(&input, &dev).expect("cpu_to_gpu");
        assert_eq!(on_gpu.len(), 5);

        let output = gpu_to_cpu(&on_gpu, &dev).expect("gpu_to_cpu");
        assert_eq!(output, input);
    }

    /// The pooled f32 allocator returns a pool-managed, all-zero buffer.
    #[test]
    fn alloc_zeros_f32_basic() {
        let dev = device0();
        let zeroed = alloc_zeros_f32(1024, &dev).expect("alloc_zeros_f32");
        assert_eq!(zeroed.len(), 1024);
        assert!(zeroed.pool_fn.is_some());

        let contents = gpu_to_cpu(&zeroed, &dev).expect("gpu_to_cpu");
        assert!(contents.iter().all(|v| *v == 0.0));
    }

    /// A dropped pooled buffer is cached and comes back zeroed on reuse.
    #[test]
    fn pool_reuse_f32() {
        let dev = device0();

        // First allocation; dropping it should hand the memory to the pool.
        let first = alloc_zeros_f32(512, &dev).expect("alloc 1");
        assert!(first.pool_fn.is_some());
        drop(first);

        assert!(crate::pool::cached_bytes(0) > 0);

        // Same size again; this request should be served from the pool.
        let second = alloc_zeros_f32(512, &dev).expect("alloc 2");
        assert!(second.pool_fn.is_some());

        let contents = gpu_to_cpu(&second, &dev).expect("gpu_to_cpu");
        assert!(
            contents.iter().all(|v| *v == 0.0),
            "pooled buffer must be zeroed"
        );
    }

    /// `empty_cache` releases everything the pool holds for the device.
    #[test]
    fn empty_cache_clears_pool() {
        let dev = device0();
        drop(alloc_zeros_f32(256, &dev).expect("alloc"));
        assert!(crate::pool::cached_bytes(0) > 0);

        crate::pool::empty_cache(0);
        assert_eq!(crate::pool::cached_bytes(0), 0);
    }

    /// The generic (non-pooled) allocator returns an all-zero buffer.
    #[test]
    fn alloc_zeros_generic() {
        let dev = device0();
        let zeroed = alloc_zeros::<f32>(1024, &dev).expect("alloc_zeros");
        assert_eq!(zeroed.len(), 1024);

        let contents = gpu_to_cpu(&zeroed, &dev).expect("gpu_to_cpu");
        assert!(contents.iter().all(|v| *v == 0.0));
    }

    /// The bf16 allocator returns a raw `u16` slice of all-zero bit patterns.
    #[test]
    fn alloc_zeros_bf16_basic() {
        let dev = device0();
        let zeroed = alloc_zeros_bf16(1024, &dev).expect("alloc_zeros_bf16");
        assert_eq!(zeroed.len(), 1024);

        let contents = dev
            .stream()
            .clone_dtoh(&zeroed)
            .expect("clone_dtoh bf16 zeros");
        assert!(contents.iter().all(|bits| *bits == 0));
    }

    /// bf16 bit patterns stored as `u16` survive a raw stream round trip.
    #[test]
    fn round_trip_bf16() {
        let dev = device0();
        // bf16 bit patterns for {0.0, 1.0, -1.0, 2.5, -3.5}. bf16 is the top
        // 16 bits of an f32; the encoding is done on the host via half::bf16
        // (round-to-nearest-even) so this test does not depend on the
        // f32->bf16 conversion kernel.
        let samples = [0.0_f32, 1.0, -1.0, 2.5, -3.5];
        let input: Vec<u16> = samples
            .iter()
            .copied()
            .map(|value| half::bf16::from_f32(value).to_bits())
            .collect();

        let on_gpu = dev.stream().clone_htod(&input).expect("clone_htod bf16");
        assert_eq!(on_gpu.len(), input.len());

        let output = dev.stream().clone_dtoh(&on_gpu).expect("clone_dtoh bf16");
        assert_eq!(output, input);
    }

    /// Zero-length transfers succeed in both directions.
    #[test]
    fn empty_transfer() {
        let dev = device0();
        let input: Vec<f32> = Vec::new();

        let on_gpu = cpu_to_gpu(&input, &dev).expect("cpu_to_gpu");
        assert_eq!(on_gpu.len(), 0);
        assert!(on_gpu.is_empty());

        let output = gpu_to_cpu(&on_gpu, &dev).expect("gpu_to_cpu");
        assert!(output.is_empty());
    }

    /// A one-million-element buffer round-trips intact.
    #[test]
    fn large_transfer() {
        const COUNT: usize = 1_000_000;

        let dev = device0();
        let input: Vec<f32> = (0..COUNT).map(|i| i as f32).collect();

        let on_gpu = cpu_to_gpu(&input, &dev).expect("cpu_to_gpu");
        assert_eq!(on_gpu.len(), COUNT);

        let output = gpu_to_cpu(&on_gpu, &dev).expect("gpu_to_cpu");
        assert_eq!(output, input);
    }

    /// Reading a buffer through the wrong device is rejected up front.
    #[test]
    fn device_mismatch_rejected() {
        let dev = device0();
        let input = vec![1.0_f32];
        let mut on_gpu = cpu_to_gpu(&input, &dev).expect("cpu_to_gpu");
        // Forge an ordinal that cannot match device 0.
        on_gpu.device_ordinal = 99;

        match gpu_to_cpu(&on_gpu, &dev).unwrap_err() {
            GpuError::DeviceMismatch {
                expected: 99,
                got: 0,
            } => {}
            other => panic!("unexpected error: {other}"),
        }
    }
}