aprender-gpu 0.31.1

Pure Rust PTX generation for NVIDIA CUDA - no LLVM, no nvcc
Documentation
use super::*;
use std::mem;

#[test]
#[cfg(not(feature = "cuda"))]
fn test_buffer_requires_cuda_feature() {
    // Without the `cuda` feature, GPU allocation paths are compiled out.
    // This test exists solely to verify the module still compiles in that
    // configuration; reaching this body is the success condition, so no
    // runtime assertion is needed. (The previous `assert!(true)` was a
    // no-op that trips clippy's `assertions_on_constants` lint.)
}

#[test]
fn test_size_bytes_calculation() {
    // Pure host-side arithmetic: 1024 f32 elements occupy 1024 * 4 bytes.
    // No CUDA device is required for this check.
    const ELEMENTS: usize = 1024;
    let total_bytes = ELEMENTS * mem::size_of::<f32>();
    assert_eq!(total_bytes, 4096);
}

#[cfg(feature = "cuda")]
mod cuda_tests {
    use super::*;
    use crate::driver::CudaContext;

    /// Acquire a CUDA context on device 0, or bail out of the calling test
    /// when no usable GPU is present (e.g. CI machines without hardware).
    macro_rules! cuda_ctx {
        () => {
            match CudaContext::new(0) {
                Ok(context) => context,
                Err(err) => {
                    eprintln!("Skipping CUDA test: {:?}", err);
                    return;
                }
            }
        };
    }

    #[test]
    fn test_gpu_buffer_new_empty() {
        let context = cuda_ctx!();
        // A zero-length allocation is valid and reports itself as empty.
        let empty_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 0).unwrap();
        assert!(empty_buf.is_empty());
        assert_eq!(empty_buf.len(), 0);
        assert_eq!(empty_buf.size_bytes(), 0);
    }

    #[test]
    fn test_gpu_buffer_new_allocation() {
        let context = cuda_ctx!();
        // 1024 f32 elements -> 4096 bytes of device memory.
        let device_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 1024).unwrap();
        assert!(!device_buf.is_empty());
        assert_eq!(device_buf.len(), 1024);
        assert_eq!(device_buf.size_bytes(), 4096);
        // A successful allocation must hand back a non-null device pointer.
        assert!(device_buf.as_ptr() != 0);
    }

    #[test]
    fn test_gpu_buffer_copy_roundtrip() {
        let context = cuda_ctx!();
        let mut device_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 256).unwrap();

        // Host -> device.
        let uploaded: Vec<f32> = (0..256).map(|i| i as f32).collect();
        device_buf.copy_from_host(&uploaded).unwrap();

        // Device -> host, then compare against the original payload.
        let mut downloaded = vec![0.0f32; 256];
        device_buf.copy_to_host(&mut downloaded).unwrap();
        assert_eq!(uploaded, downloaded);
    }

    #[test]
    fn test_gpu_buffer_copy_from_host_size_mismatch() {
        let context = cuda_ctx!();
        let mut device_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 100).unwrap();

        // A 200-element upload cannot fit into a 100-element buffer.
        let oversized: Vec<f32> = vec![1.0; 200];
        assert!(device_buf.copy_from_host(&oversized).is_err());
    }

    #[test]
    fn test_gpu_buffer_copy_to_host_size_mismatch() {
        let context = cuda_ctx!();
        let device_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 100).unwrap();

        // Downloading into a smaller host buffer is a legal partial copy of
        // the leading 50 elements.
        let mut first_half: Vec<f32> = vec![0.0; 50];
        assert!(device_buf.copy_to_host(&mut first_half).is_ok());

        // A host buffer larger than the device allocation must be rejected.
        let mut oversized: Vec<f32> = vec![0.0; 200];
        assert!(device_buf.copy_to_host(&mut oversized).is_err());
    }

    #[test]
    fn test_gpu_buffer_clone_metadata() {
        let context = cuda_ctx!();
        let owner: GpuBuffer<f32> = GpuBuffer::new(&context, 512).unwrap();

        // The metadata clone is a view aliasing the same device memory.
        let alias = owner.clone_metadata();
        assert_eq!(alias.as_ptr(), owner.as_ptr());
        assert_eq!(alias.len(), owner.len());
        assert!(!alias.is_empty());
    }

    #[test]
    fn test_gpu_buffer_view_empty() {
        let context = cuda_ctx!();
        let owner: GpuBuffer<f32> = GpuBuffer::new(&context, 0).unwrap();

        // A view of an empty buffer is itself empty.
        let alias = owner.clone_metadata();
        assert!(alias.is_empty());
        assert_eq!(alias.len(), 0);
    }

    #[test]
    fn test_gpu_buffer_raw_parts() {
        let context = cuda_ctx!();
        let owner: GpuBuffer<f32> = GpuBuffer::new(&context, 64).unwrap();
        let raw_ptr = owner.as_ptr();
        let raw_len = owner.len();

        // Reassemble a buffer from the raw pointer/length pair.
        let reassembled = unsafe { GpuBuffer::<f32>::from_raw_parts(raw_ptr, raw_len) };
        assert_eq!(reassembled.as_ptr(), raw_ptr);
        assert_eq!(reassembled.len(), raw_len);

        // `owner` still owns the allocation; leak the reassembled handle so
        // its Drop impl cannot free the same device memory twice.
        std::mem::forget(reassembled);
    }

    #[test]
    fn test_gpu_buffer_from_host() {
        let context = cuda_ctx!();
        let payload: Vec<f32> = (0..128).map(|i| i as f32).collect();
        let device_buf = GpuBuffer::from_host(&context, &payload).unwrap();
        assert_eq!(device_buf.len(), 128);

        // Read the data back to confirm the one-shot upload landed intact.
        let mut readback = vec![0.0f32; 128];
        device_buf.copy_to_host(&mut readback).unwrap();
        assert_eq!(payload, readback);
    }

    #[test]
    fn test_gpu_buffer_copy_from_host_at() {
        let context = cuda_ctx!();
        let mut device_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 100).unwrap();

        // Zero the whole buffer first so the patch boundaries are visible.
        let zeroes = vec![0.0f32; 100];
        device_buf.copy_from_host(&zeroes).unwrap();

        // Write a 20-element patch of ones starting at element 50.
        let patch = vec![1.0f32; 20];
        device_buf.copy_from_host_at(&patch, 50).unwrap();

        // The patch must cover exactly [50, 70).
        let mut readback = vec![0.0f32; 100];
        device_buf.copy_to_host(&mut readback).unwrap();
        assert_eq!(readback[49], 0.0);
        assert_eq!(readback[50], 1.0);
        assert_eq!(readback[69], 1.0);
        assert_eq!(readback[70], 0.0);
    }

    #[test]
    fn test_gpu_buffer_copy_to_host_at() {
        let context = cuda_ctx!();
        let payload: Vec<f32> = (0..100).map(|i| i as f32).collect();
        let device_buf = GpuBuffer::from_host(&context, &payload).unwrap();

        // Pull 20 elements starting at device offset 30.
        let mut window = vec![0.0f32; 20];
        device_buf.copy_to_host_at(&mut window, 30).unwrap();

        assert_eq!(window[0], 30.0);
        assert_eq!(window[19], 49.0);
    }

    #[test]
    fn test_gpu_buffer_clone_device() {
        let context = cuda_ctx!();
        let payload: Vec<f32> = (0..64).map(|i| i as f32).collect();
        let original = GpuBuffer::from_host(&context, &payload).unwrap();

        // A device-side clone is a deep copy into a fresh allocation.
        let duplicate = original.clone(&context).unwrap();
        assert_eq!(duplicate.len(), original.len());
        assert_ne!(duplicate.as_ptr(), original.as_ptr()); // Distinct device memory.

        // The duplicate must carry the same contents as the original.
        let mut readback = vec![0.0f32; 64];
        duplicate.copy_to_host(&mut readback).unwrap();
        assert_eq!(payload, readback);
    }

    #[test]
    fn test_gpu_buffer_copy_from_buffer() {
        let context = cuda_ctx!();
        let payload: Vec<f32> = (0..32).map(|i| i as f32).collect();
        let source = GpuBuffer::from_host(&context, &payload).unwrap();

        // Device-to-device copy into a separately allocated destination.
        let mut destination: GpuBuffer<f32> = GpuBuffer::new(&context, 32).unwrap();
        destination.copy_from_buffer(&source).unwrap();

        let mut readback = vec![0.0f32; 32];
        destination.copy_to_host(&mut readback).unwrap();
        assert_eq!(payload, readback);
    }

    #[test]
    fn test_gpu_buffer_copy_from_buffer_at() {
        let context = cuda_ctx!();
        let fives: Vec<f32> = vec![5.0f32; 10];
        let source = GpuBuffer::from_host(&context, &fives).unwrap();

        let mut destination: GpuBuffer<f32> = GpuBuffer::new(&context, 50).unwrap();
        let zeroes = vec![0.0f32; 50];
        destination.copy_from_host(&zeroes).unwrap();

        // Splice all 10 source elements into the destination at offset 20.
        destination.copy_from_buffer_at(&source, 20, 0, 10).unwrap();

        // The splice must cover exactly [20, 30).
        let mut readback = vec![0.0f32; 50];
        destination.copy_to_host(&mut readback).unwrap();
        assert_eq!(readback[19], 0.0);
        assert_eq!(readback[20], 5.0);
        assert_eq!(readback[29], 5.0);
        assert_eq!(readback[30], 0.0);
    }

    #[test]
    fn test_gpu_buffer_view_size_bytes() {
        let context = cuda_ctx!();
        let owner: GpuBuffer<f32> = GpuBuffer::new(&context, 256).unwrap();
        // The view reports the same byte size as the owner: 256 * sizeof(f32).
        let alias = owner.clone_metadata();
        assert_eq!(alias.size_bytes(), 256 * 4);
    }

    #[test]
    fn test_gpu_buffer_as_kernel_arg() {
        let context = cuda_ctx!();
        let device_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 32).unwrap();
        // The kernel-argument pointer must be usable (i.e. non-null).
        let kernel_arg = device_buf.as_kernel_arg();
        assert!(!kernel_arg.is_null());
    }

    #[test]
    fn test_gpu_buffer_async_copy() {
        use crate::driver::CudaStream;
        let context = cuda_ctx!();
        let stream = CudaStream::new(&context).unwrap();

        let payload: Vec<f32> = (0..64).map(|i| i as f32).collect();
        let source = GpuBuffer::from_host(&context, &payload).unwrap();
        let mut destination: GpuBuffer<f32> = GpuBuffer::new(&context, 64).unwrap();

        // Enqueue the copy on `stream`, then synchronize before reading the
        // destination so the transfer has actually completed.
        unsafe {
            destination.copy_from_buffer_async(&source, &stream).unwrap();
        }
        stream.synchronize().unwrap();

        let mut readback = vec![0.0f32; 64];
        destination.copy_to_host(&mut readback).unwrap();
        assert_eq!(payload, readback);
    }

    #[test]
    fn test_gpu_buffer_async_copy_at() {
        use crate::driver::CudaStream;
        let context = cuda_ctx!();
        let stream = CudaStream::new(&context).unwrap();

        let sevens: Vec<f32> = vec![7.0f32; 10];
        let source = GpuBuffer::from_host(&context, &sevens).unwrap();

        let mut destination: GpuBuffer<f32> = GpuBuffer::new(&context, 50).unwrap();
        let zeroes = vec![0.0f32; 50];
        destination.copy_from_host(&zeroes).unwrap();

        // Asynchronously splice 10 elements into [15, 25), then wait.
        unsafe {
            destination
                .copy_from_buffer_at_async(&source, 15, 0, 10, &stream)
                .unwrap();
        }
        stream.synchronize().unwrap();

        let mut readback = vec![0.0f32; 50];
        destination.copy_to_host(&mut readback).unwrap();
        assert_eq!(readback[14], 0.0);
        assert_eq!(readback[15], 7.0);
        assert_eq!(readback[24], 7.0);
        assert_eq!(readback[25], 0.0);
    }

    #[test]
    fn test_gpu_buffer_async_copy_size_mismatch() {
        use crate::driver::CudaStream;
        let context = cuda_ctx!();
        let stream = CudaStream::new(&context).unwrap();

        // Source (100 elems) larger than destination (50): rejected up front.
        let source: GpuBuffer<f32> = GpuBuffer::new(&context, 100).unwrap();
        let mut destination: GpuBuffer<f32> = GpuBuffer::new(&context, 50).unwrap();

        let outcome = unsafe { destination.copy_from_buffer_async(&source, &stream) };
        assert!(outcome.is_err());
    }

    #[test]
    fn test_gpu_buffer_async_copy_empty() {
        use crate::driver::CudaStream;
        let context = cuda_ctx!();
        let stream = CudaStream::new(&context).unwrap();

        // A zero-length async copy is a no-op and must succeed.
        let source: GpuBuffer<f32> = GpuBuffer::new(&context, 0).unwrap();
        let mut destination: GpuBuffer<f32> = GpuBuffer::new(&context, 0).unwrap();

        unsafe {
            destination.copy_from_buffer_async(&source, &stream).unwrap();
        }
    }

    #[test]
    fn test_gpu_buffer_async_copy_at_bounds_check() {
        use crate::driver::CudaStream;
        let context = cuda_ctx!();
        let stream = CudaStream::new(&context).unwrap();

        let source: GpuBuffer<f32> = GpuBuffer::new(&context, 10).unwrap();
        let mut destination: GpuBuffer<f32> = GpuBuffer::new(&context, 20).unwrap();

        // Writing [15, 25) overruns the 20-element destination.
        let outcome = unsafe { destination.copy_from_buffer_at_async(&source, 15, 0, 10, &stream) };
        assert!(outcome.is_err());

        // Reading [5, 15) overruns the 10-element source.
        let outcome = unsafe { destination.copy_from_buffer_at_async(&source, 0, 5, 10, &stream) };
        assert!(outcome.is_err());

        // A zero-element copy is always in bounds.
        unsafe {
            destination
                .copy_from_buffer_at_async(&source, 0, 0, 0, &stream)
                .unwrap();
        }
    }

    /// PMAT-420 / trueno#232: GpuBuffer transfers must keep working when the
    /// buffer crosses a thread boundary after the CUDA context was created
    /// on a different thread.
    ///
    /// Prior to the fix, cuMemcpyHtoD reported CUDA_SUCCESS on the worker
    /// thread yet moved zero bytes, because no CUDA context was current
    /// there. `ensure_context()` now pushes the stored context handle before
    /// every transfer.
    #[test]
    fn test_pmat420_cross_thread_transfer_nonzero() {
        let context = cuda_ctx!();
        let payload: Vec<f32> = (1..=256).map(|i| i as f32).collect();

        // Allocate + upload on this thread, where the context is current.
        let mut device_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 256).unwrap();
        device_buf.copy_from_host(&payload).unwrap();

        // Hand the buffer to a worker thread that never made the context current.
        let reader = std::thread::spawn(move || {
            // No CUDA context is pushed on this thread. Before the PMAT-420
            // fix, copy_to_host here read back nothing but zeros.
            let mut readback = vec![0.0f32; 256];
            device_buf
                .copy_to_host(&mut readback)
                .expect("copy_to_host on foreign thread must succeed");

            // Key regression check: the readback must not be all zeros.
            let nonzero = readback.iter().filter(|&&v| v != 0.0).count();
            assert_eq!(
                nonzero, 256,
                "PMAT-420 regression: GPU readback is zeros on cross-thread transfer \
                 ({} of 256 elements are nonzero)",
                nonzero
            );
            assert_eq!(readback[0], 1.0);
            assert_eq!(readback[255], 256.0);

            device_buf // hand ownership back for cleanup
        });

        let mut device_buf = reader.join().expect("Worker thread must not panic");

        // Mirror case — cross-thread upload: write from another worker thread.
        let writer = std::thread::spawn(move || {
            let replacement: Vec<f32> = (0..256).map(|i| -(i as f32)).collect();
            device_buf
                .copy_from_host(&replacement)
                .expect("copy_from_host on foreign thread must succeed");
            (device_buf, replacement)
        });

        let (device_buf, replacement) = writer.join().expect("Worker thread 2 must not panic");

        // Read back on the original thread to confirm the foreign upload stuck.
        let mut readback = vec![0.0f32; 256];
        device_buf.copy_to_host(&mut readback).unwrap();
        assert_eq!(readback, replacement);
    }

    /// PMAT-420: `copy_from_host_at` must also work from a foreign thread.
    #[test]
    fn test_pmat420_cross_thread_partial_transfer() {
        let context = cuda_ctx!();

        let mut device_buf: GpuBuffer<f32> = GpuBuffer::new(&context, 100).unwrap();
        let zeroes = vec![0.0f32; 100];
        device_buf.copy_from_host(&zeroes).unwrap();

        // Do an offset write and a full readback on a worker thread.
        let worker = std::thread::spawn(move || {
            let patch: Vec<f32> = vec![42.0f32; 10];
            device_buf
                .copy_from_host_at(&patch, 50)
                .expect("copy_from_host_at on foreign thread must succeed");

            let mut readback = vec![0.0f32; 100];
            device_buf
                .copy_to_host(&mut readback)
                .expect("copy_to_host on foreign thread must succeed");

            // The patch must occupy exactly [50, 60).
            assert_eq!(readback[49], 0.0);
            assert_eq!(readback[50], 42.0);
            assert_eq!(readback[59], 42.0);
            assert_eq!(readback[60], 0.0);
        });

        worker.join().expect("Worker thread must not panic");
    }
}