trueno-gpu 0.4.17

Pure Rust PTX generation for NVIDIA CUDA - no LLVM, no nvcc
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
//! Memory Fuzz Tests (PMAT-018)
//!
//! Stress testing for GPU memory management.
//!
//! # Falsification Strategy
//! - **Scarcity**: Force OOM conditions
//! - **Degeneracy**: Zero-sized buffers, unaligned copies
//! - **Concurrency**: Stream overlap (simulated)

#![cfg(all(test, feature = "cuda"))]

use super::context::CudaContext;
use super::memory::GpuBuffer;
use crate::GpuError;
use proptest::prelude::*;

#[test]
fn test_zero_sized_buffer() {
    let ctx = CudaContext::new(0).expect("Context");

    // A zero-element allocation may legitimately succeed (null/sentinel
    // pointer) or fail gracefully — the one hard requirement is that it
    // must never panic or corrupt the CUDA context.
    match GpuBuffer::<f32>::new(&ctx, 0) {
        Ok(mut buf) => {
            assert_eq!(buf.len(), 0);
            // A zero-byte upload must behave as a no-op.
            let src: Vec<f32> = Vec::new();
            buf.copy_from_host(&src)
                .expect("Zero-byte copy should succeed");
        }
        Err(_) => {
            // Graceful failure is equally acceptable for size 0.
        }
    }
}

#[test]
fn test_unaligned_byte_copy() {
    let ctx = CudaContext::new(0).expect("Context");
    const LEN: usize = 1024;

    // Upload a repeating byte pattern and read it straight back.
    let pattern: Vec<u8> = (0..LEN).map(|i| (i % 255) as u8).collect();
    let mut buf = GpuBuffer::<u8>::new(&ctx, LEN).expect("Alloc");
    buf.copy_from_host(&pattern).expect("Copy");

    let mut readback = vec![0u8; LEN];
    buf.copy_to_host(&mut readback).expect("Download");

    // The roundtrip must be byte-exact.
    assert_eq!(pattern, readback);
}

/// Stress the allocator toward OOM, then verify every chunk is returned.
///
/// Falsifies two claims: (1) an OOM condition is surfaced as a typed error
/// rather than a panic/hang, and (2) dropping the buffers restores the
/// free-memory pool (RAII, no leak).
#[test]
fn test_oom_resilience() {
    let ctx = CudaContext::new(0).expect("Context");
    let (free_start, _) = ctx.memory_info().expect("Mem info");

    // Allocate 1GB chunks until failure.
    let chunk_size = 1024 * 1024 * 1024 / 4; // 1GB of f32 (256M elements)
    let mut allocations = Vec::new();

    // RTX 4090 has 24GB, so 30 chunks * 1GB = 30GB should OOM. The cap at
    // 30 iterations also bounds the stress if the driver over-commits.
    let mut hit_oom = false;

    for i in 0..30 {
        match GpuBuffer::<f32>::new(&ctx, chunk_size) {
            Ok(buf) => allocations.push(buf),
            Err(GpuError::OutOfMemory { .. }) => {
                hit_oom = true;
                println!("Hit OOM at chunk {}", i);
                break;
            }
            Err(GpuError::MemoryAllocation(msg)) if msg.contains("OUT_OF_MEMORY") => {
                // Also acceptable - OOM wrapped in MemoryAllocation
                hit_oom = true;
                println!("Hit OOM (MemoryAllocation) at chunk {}", i);
                break;
            }
            Err(e) => panic!("Unexpected error during OOM stress: {:?}", e),
        }
    }

    // Deliberately NOT asserted: a >30GB device (e.g. 80GB A100) can satisfy
    // every chunk, so failing here would be flaky. Log the outcome instead.
    if !hit_oom {
        println!("Never hit OOM within 30 chunks (large-memory device?)");
    }

    // Drop all allocations; RAII must hand every chunk back to the driver.
    drop(allocations);

    // Verify memory is returned, allowing 100MB for driver overhead variance.
    let (free_end, _) = ctx.memory_info().expect("Mem info");
    let diff = free_start.saturating_sub(free_end);
    assert!(
        diff < 100 * 1024 * 1024,
        "Memory leak detected! {} bytes missing",
        diff
    );
}

proptest! {
    /// Fuzz H2D→D2H roundtrips across arbitrary lengths and f32 payloads.
    ///
    /// NOTE: `any::<f32>()` generates NaN by default; `NaN != NaN`, so a
    /// plain equality assert would spuriously fail. We compare bit patterns
    /// (`to_bits`) instead, which is both NaN-safe and strictly exact.
    #[test]
    fn test_buffer_roundtrip_fuzz(
        len in 1usize..100_000usize,
        val in any::<f32>()
    ) {
        // Setup context locally per test (expensive but safe for proptest)
        // Note: In real life, use a lazy_static context or run this test single-threaded
        // For now, we assume single-threaded execution via Makefile
        if let Ok(ctx) = CudaContext::new(0) {
            let data = vec![val; len];
            let mut buf = GpuBuffer::<f32>::new(&ctx, len).unwrap();
            buf.copy_from_host(&data).unwrap();

            let mut out = vec![0.0f32; len];
            buf.copy_to_host(&mut out).unwrap();

            // Spot-check first, middle, last — bitwise, so NaN roundtrips pass.
            for idx in [0, len / 2, len - 1] {
                prop_assert_eq!(data[idx].to_bits(), out[idx].to_bits());
            }
        }
    }
}

// =============================================================================
// ADVERSARIAL TESTS (Dr. Popper's Falsification Protocol)
// =============================================================================
// These tests try to BREAK the driver, not validate happy paths.

/// Falsification Test 1: Oversize Allocation
/// Attempt to allocate 100GB - must return OOM, not panic or hang
#[test]
fn test_alloc_oversize_100gb() {
    let ctx = CudaContext::new(0).expect("Context");

    // 100GB of f32 = 25 billion elements
    let element_count = 25_000_000_000usize;

    match GpuBuffer::<f32>::new(&ctx, element_count) {
        Ok(_) => {
            panic!("CRITICAL: 100GB allocation succeeded - this should be impossible on RTX 4090!");
        }
        // Expected: driver correctly reported OOM.
        Err(GpuError::OutOfMemory { .. }) => {}
        // Also acceptable: allocation failed through the generic path.
        Err(GpuError::MemoryAllocation(_)) => {}
        Err(e) => {
            // Any other error is tolerated as long as nothing panicked.
            println!("Oversize alloc returned: {:?}", e);
        }
    }
}

/// Falsification Test 2: Copy from host with size mismatch (too small host)
#[test]
fn test_copy_from_host_too_small() {
    let ctx = CudaContext::new(0).expect("Context");
    let mut device = GpuBuffer::<f32>::new(&ctx, 1000).expect("Alloc");

    // The host side provides only half of the elements the device expects.
    let undersized = vec![1.0f32; 500];
    let outcome = device.copy_from_host(&undersized);

    assert!(
        outcome.is_err(),
        "copy_from_host should fail when host buffer is smaller"
    );
    if let Err(e) = outcome {
        let rendered = format!("{:?}", e);
        assert!(
            rendered.contains("mismatch") || rendered.contains("Transfer"),
            "Error should mention size mismatch: {:?}",
            e
        );
    }
}

/// Falsification Test 3: Copy to host with size mismatch (too large host)
#[test]
fn test_copy_to_host_too_large() {
    let ctx = CudaContext::new(0).expect("Context");
    let device = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc");

    // Host destination is 5x the device buffer; the download must be rejected.
    let mut oversized = vec![0.0f32; 500];
    assert!(
        device.copy_to_host(&mut oversized).is_err(),
        "copy_to_host should fail when host buffer size doesn't match"
    );
}

/// Falsification Test 4: Partial copy out of bounds (offset too large)
#[test]
fn test_copy_from_host_at_out_of_bounds() {
    let ctx = CudaContext::new(0).expect("Context");
    let mut device = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc");

    // Offset 60 + len 50 = 110, which overruns the 100-element buffer.
    let payload = vec![1.0f32; 50];
    assert!(
        device.copy_from_host_at(&payload, 60).is_err(),
        "copy_from_host_at should fail when offset+len > buffer size"
    );
}

/// Falsification Test 5: Partial copy to host out of bounds
#[test]
fn test_copy_to_host_at_out_of_bounds() {
    let ctx = CudaContext::new(0).expect("Context");
    let seed = vec![1.0f32; 100];
    let device = GpuBuffer::from_host(&ctx, &seed).expect("Alloc");

    // Offset 60 + len 50 = 110, which overruns the 100-element buffer.
    let mut sink = vec![0.0f32; 50];
    assert!(
        device.copy_to_host_at(&mut sink, 60).is_err(),
        "copy_to_host_at should fail when offset+len > buffer size"
    );
}

/// Falsification Test 6: D2D copy size mismatch
#[test]
fn test_d2d_copy_size_mismatch() {
    let ctx = CudaContext::new(0).expect("Context");

    // 100-element source vs 200-element destination: whole-buffer copy
    // must be rejected rather than partially performed.
    let src = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc src");
    let mut dst = GpuBuffer::<f32>::new(&ctx, 200).expect("Alloc dst");

    assert!(
        dst.copy_from_buffer(&src).is_err(),
        "D2D copy should fail when buffer sizes don't match"
    );
}

/// Falsification Test 7: D2D partial copy out of bounds (dst)
#[test]
fn test_d2d_copy_at_dst_out_of_bounds() {
    let ctx = CudaContext::new(0).expect("Context");

    let src = GpuBuffer::<f32>::new(&ctx, 50).expect("Alloc src");
    let mut dst = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc dst");

    // dst_offset 60 + count 50 = 110, exceeding dst's 100 elements.
    assert!(
        dst.copy_from_buffer_at(&src, 60, 0, 50).is_err(),
        "D2D copy_at should fail when dst_offset+count > dst.len"
    );
}

/// Falsification Test 8: D2D partial copy out of bounds (src)
#[test]
fn test_d2d_copy_at_src_out_of_bounds() {
    let ctx = CudaContext::new(0).expect("Context");

    let src = GpuBuffer::<f32>::new(&ctx, 50).expect("Alloc src");
    let mut dst = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc dst");

    // src_offset 30 + count 50 = 80, exceeding src's 50 elements.
    assert!(
        dst.copy_from_buffer_at(&src, 0, 30, 50).is_err(),
        "D2D copy_at should fail when src_offset+count > src.len"
    );
}

/// Falsification Test 9: RAII cleanup verification
/// Allocate, drop, verify memory returns to the pool
#[test]
fn test_raii_cleanup_single_buffer() {
    let ctx = CudaContext::new(0).expect("Context");
    let (free_before, _) = ctx.memory_info().expect("Memory info");

    // Hold a 100MB buffer inside a scope so Drop fires at the closing brace.
    let size = 25_000_000; // 100MB of f32
    {
        let _buf = GpuBuffer::<f32>::new(&ctx, size).expect("Alloc");

        // While the buffer is alive, the free pool must have shrunk.
        let (free_during, _) = ctx.memory_info().expect("Memory info");
        assert!(
            free_during < free_before,
            "Memory should decrease after allocation: before={}, during={}",
            free_before,
            free_during
        );
    } // _buf dropped here

    // After Drop, free memory must recover (10MB tolerance for driver overhead).
    let (free_after, _) = ctx.memory_info().expect("Memory info");
    let tolerance = 10 * 1024 * 1024;
    assert!(
        free_after >= free_before - tolerance,
        "Memory leak detected! before={}, after={}, diff={}",
        free_before,
        free_after,
        free_before.saturating_sub(free_after)
    );
}

/// Falsification Test 10: Async copy size mismatch
#[test]
fn test_async_d2d_copy_size_mismatch() {
    use super::stream::CudaStream;

    let ctx = CudaContext::new(0).expect("Context");
    let stream = CudaStream::new(&ctx).expect("Stream");

    // 100-element source into a 200-element destination: lengths differ,
    // so the async whole-buffer copy must be rejected up front.
    let src = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc src");
    let mut dst = GpuBuffer::<f32>::new(&ctx, 200).expect("Alloc dst");

    let outcome = unsafe { dst.copy_from_buffer_async(&src, &stream) };
    assert!(
        outcome.is_err(),
        "Async D2D copy should fail when buffer sizes don't match"
    );
}

/// Falsification Test 11: Async partial copy out of bounds
#[test]
fn test_async_d2d_copy_at_out_of_bounds() {
    use super::stream::CudaStream;

    let ctx = CudaContext::new(0).expect("Context");
    let stream = CudaStream::new(&ctx).expect("Stream");

    let src = GpuBuffer::<f32>::new(&ctx, 50).expect("Alloc src");
    let mut dst = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc dst");

    // dst_offset 60 + count 50 = 110, past dst's 100 elements.
    assert!(
        unsafe { dst.copy_from_buffer_at_async(&src, 60, 0, 50, &stream) }.is_err(),
        "Async D2D copy_at should fail when dst out of bounds"
    );

    // src_offset 30 + count 50 = 80, past src's 50 elements.
    assert!(
        unsafe { dst.copy_from_buffer_at_async(&src, 0, 30, 50, &stream) }.is_err(),
        "Async D2D copy_at should fail when src out of bounds"
    );
}

/// Falsification Test 12: Async H2D copy size mismatch
#[test]
fn test_async_h2d_copy_size_mismatch() {
    use super::stream::CudaStream;

    let ctx = CudaContext::new(0).expect("Context");
    let stream = CudaStream::new(&ctx).expect("Stream");

    // Device expects 100 elements but the host only supplies 50.
    let mut device = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc");
    let undersized = vec![1.0f32; 50];

    let outcome = unsafe { device.copy_from_host_async(&undersized, &stream) };
    assert!(
        outcome.is_err(),
        "Async H2D copy should fail when host buffer size doesn't match"
    );
}

/// Falsification Test 13: Async D2H copy size mismatch
#[test]
fn test_async_d2h_copy_size_mismatch() {
    use super::stream::CudaStream;

    let ctx = CudaContext::new(0).expect("Context");
    let stream = CudaStream::new(&ctx).expect("Stream");

    // Device holds 100 elements but the host destination expects 200.
    let device = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc");
    let mut oversized = vec![0.0f32; 200];

    let outcome = unsafe { device.copy_to_host_async(&mut oversized, &stream) };
    assert!(
        outcome.is_err(),
        "Async D2H copy should fail when host buffer size doesn't match"
    );
}

/// Falsification Test 14: Empty buffer operations
#[test]
fn test_empty_buffer_operations() {
    let ctx = CudaContext::new(0).expect("Context");

    // A zero-length buffer must report consistent empty metadata.
    let mut zero_buf = GpuBuffer::<f32>::new(&ctx, 0).expect("Alloc empty");
    assert!(zero_buf.is_empty());
    assert_eq!(zero_buf.len(), 0);
    assert_eq!(zero_buf.size_bytes(), 0);

    // Every transfer of zero bytes must succeed as a no-op: H2D...
    let nothing: Vec<f32> = Vec::new();
    zero_buf
        .copy_from_host(&nothing)
        .expect("Empty H2D should succeed");

    // ...D2H...
    let mut sink: Vec<f32> = Vec::new();
    zero_buf
        .copy_to_host(&mut sink)
        .expect("Empty D2H should succeed");

    // ...and D2D between two empty buffers.
    let mut zero_dst = GpuBuffer::<f32>::new(&ctx, 0).expect("Alloc empty dst");
    zero_dst
        .copy_from_buffer(&zero_buf)
        .expect("Empty D2D should succeed");
}

/// Falsification Test 15: Partial copy with zero count
#[test]
fn test_partial_copy_zero_count() {
    let ctx = CudaContext::new(0).expect("Context");

    let src = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc src");
    let mut dst = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc dst");

    // With count == 0 the copy must be a no-op for any in-range offsets.
    dst.copy_from_buffer_at(&src, 0, 0, 0)
        .expect("Zero count D2D should succeed");
    dst.copy_from_buffer_at(&src, 50, 50, 0)
        .expect("Zero count D2D with offsets should succeed");
}

/// Falsification Test 16: Async raw copy bounds check
#[test]
fn test_async_raw_copy_bounds_check() {
    use super::stream::CudaStream;

    let ctx = CudaContext::new(0).expect("Context");
    let stream = CudaStream::new(&ctx).expect("Stream");
    // Exercise the raw-handle entry point rather than the safe wrapper.
    let handle = stream.raw();

    let src = GpuBuffer::<f32>::new(&ctx, 50).expect("Alloc src");
    let mut dst = GpuBuffer::<f32>::new(&ctx, 100).expect("Alloc dst");

    // dst_offset 60 + count 50 = 110, past dst's 100 elements.
    assert!(
        unsafe { dst.copy_from_buffer_at_async_raw(&src, 60, 0, 50, handle) }.is_err(),
        "Async raw D2D should fail when dst out of bounds"
    );

    // src_offset 30 + count 50 = 80, past src's 50 elements.
    assert!(
        unsafe { dst.copy_from_buffer_at_async_raw(&src, 0, 30, 50, handle) }.is_err(),
        "Async raw D2D should fail when src out of bounds"
    );

    // A zero-element copy is a valid no-op regardless of offsets.
    unsafe {
        dst.copy_from_buffer_at_async_raw(&src, 0, 0, 0, handle)
            .expect("Zero count should succeed");
    }
}

/// Falsification Test 17: Buffer view properties
#[test]
fn test_buffer_view_properties() {
    let ctx = CudaContext::new(0).expect("Context");
    let buf = GpuBuffer::<f32>::new(&ctx, 256).expect("Alloc");

    // A metadata clone must mirror every observable property of its source.
    let view = buf.clone_metadata();
    assert_eq!(view.as_ptr(), buf.as_ptr());
    assert_eq!(view.len(), buf.len());
    assert_eq!(view.is_empty(), buf.is_empty());
    assert_eq!(view.size_bytes(), buf.size_bytes());

    // Dropping the non-owning view must not release the device memory...
    drop(view);

    // ...so the original buffer remains intact afterwards.
    assert_eq!(buf.len(), 256);
}

/// Falsification Test 18: Stress multiple allocations and drops
#[test]
fn test_stress_alloc_dealloc_cycle() {
    let ctx = CudaContext::new(0).expect("Context");
    let (free_start, _) = ctx.memory_info().expect("Memory info");

    // Churn 100 buffers of growing size (1K up to 100K elements); each one
    // is dropped at the end of its own iteration.
    for round in 0..100 {
        let elements = (round + 1) * 1000;
        let _scratch = GpuBuffer::<f32>::new(&ctx, elements).expect("Alloc");
    }

    let (free_end, _) = ctx.memory_info().expect("Memory info");

    // The driver caches and fragments internally, so allow 50MB of slack
    // rather than demanding an exact match.
    let tolerance = 50 * 1024 * 1024; // 50MB tolerance for driver overhead
    assert!(
        free_end >= free_start - tolerance,
        "Memory leak after 100 alloc/dealloc cycles! start={}, end={}, leaked={}",
        free_start,
        free_end,
        free_start.saturating_sub(free_end)
    );
}