//! aprender-gpu 0.32.0
//!
//! Pure Rust PTX generation for NVIDIA CUDA - no LLVM, no nvcc.
//!
//! TDD test suite (documentation) covering:
//! - PHASE 3: Memory Pool Integration
//! - PHASE 4: Full Attention Pipeline
//! - Performance Targets (Acceptance Criteria)

// ============================================================================
// PHASE 3: Memory Pool Integration
// ============================================================================

/// Test: Memory pool should reuse allocations
///
/// Requirement: Freeing a GPU allocation should return memory to pool,
/// and subsequent allocations of same size should reuse without cudaMalloc.
#[test]
#[ignore = "TDD: Implementation pending - pool integration not yet implemented"]
fn test_memory_pool_reuse() {
    // When implemented:
    //
    // let ctx = CudaContext::new(0).expect("test");
    // let pool = GpuMemoryPool::new(&ctx, 64 * 1024 * 1024).expect("test"); // 64MB pool
    //
    // // First allocation
    // let a = pool.allocate::<f32>(1000).expect("test");
    // let ptr_a = a.device_ptr();
    //
    // // Track cudaMalloc calls
    // let malloc_before = pool.cuda_malloc_calls();
    //
    // // Free the allocation
    // drop(a);
    //
    // // Allocate same size - should reuse
    // let b = pool.allocate::<f32>(1000).expect("test");
    //
    // // Same pointer (reused)
    // assert_eq!(b.device_ptr(), ptr_a);
    // // No new cudaMalloc
    // assert_eq!(pool.cuda_malloc_calls(), malloc_before);

    // Fail loudly if this placeholder is ever executed (e.g. via
    // `cargo test -- --include-ignored`): an unimplemented feature must
    // never report a passing test.
    todo!("TDD: memory pool reuse not implemented")
}

/// Test: GpuResidentTensor should use memory pool when available
///
/// Requirement: Tensors should allocate from pool to avoid cudaMalloc overhead.
#[test]
#[ignore = "TDD: Implementation pending - pool-backed tensors not yet implemented"]
fn test_tensor_uses_memory_pool() {
    // When implemented:
    //
    // let ctx = CudaContext::new(0).expect("test");
    // let pool = GpuMemoryPool::new(&ctx, 64 * 1024 * 1024).expect("test");
    //
    // // Create tensor backed by pool
    // let data = vec![1.0f32; 10000];
    // let tensor = GpuResidentTensor::from_host_pooled(&ctx, &pool, &data).expect("test");
    //
    // // Should be pool-backed
    // assert!(tensor.is_pool_backed());
    // // Pool should show allocation
    // assert_eq!(pool.active_allocations(), 1);
    //
    // // Drop tensor
    // drop(tensor);
    // // Memory returned to pool (not freed)
    // assert_eq!(pool.active_allocations(), 0);
    // assert!(pool.has_available(10000 * 4));

    // Fail loudly if this placeholder is ever executed (e.g. via
    // `cargo test -- --include-ignored`): an unimplemented feature must
    // never report a passing test.
    todo!("TDD: pool-backed tensors not implemented")
}

// ============================================================================
// PHASE 4: Full Attention Pipeline
// ============================================================================

/// Test: Full encoder layer should have minimal transfers
///
/// Requirement: Processing one encoder layer should have:
/// - 0 host transfers for attention (weights pre-uploaded)
/// - Output stays on GPU for next layer
#[test]
#[ignore = "TDD: Implementation pending - full pipeline not yet implemented"]
fn test_encoder_layer_minimal_transfers() {
    // When implemented:
    //
    // let ctx = CudaContext::new(0).expect("test");
    //
    // // Pre-upload all weights (done ONCE at model load time)
    // let weights = EncoderLayerWeights::upload(&ctx, &model_weights).expect("test");
    //
    // // Process input through encoder layer
    // let input = GpuResidentTensor::from_host(&ctx, &input_data).expect("test");
    //
    // // Track transfers before
    // let h2d_before = ctx.total_h2d_transfers();
    // let d2h_before = ctx.total_d2h_transfers();
    //
    // // Run encoder layer - should have ZERO additional transfers
    // let output = encoder_layer_forward(&ctx, &input, &weights).expect("test");
    //
    // // Verify no transfers during forward pass
    // assert_eq!(ctx.total_h2d_transfers(), h2d_before);
    // assert_eq!(ctx.total_d2h_transfers(), d2h_before);
    //
    // // Output should be on GPU, ready for next layer
    // assert!(output.is_device_resident());

    // Fail loudly if this placeholder is ever executed (e.g. via
    // `cargo test -- --include-ignored`): an unimplemented feature must
    // never report a passing test.
    todo!("TDD: encoder layer pipeline not implemented")
}

/// Test: Full encoder (all layers) should have 2 total transfers
///
/// Requirement: Processing ALL encoder layers should have:
/// - 1 H2D transfer: input audio features
/// - 1 D2H transfer: final encoder output
/// - Weights are pre-uploaded (not counted per-inference)
#[test]
#[ignore = "TDD: Implementation pending - full encoder not yet implemented"]
fn test_full_encoder_two_transfers_total() {
    // When implemented:
    //
    // let ctx = CudaContext::new(0).expect("test");
    //
    // // Model weights pre-uploaded (done ONCE at load time)
    // let model = WhisperEncoderGpu::load(&ctx, model_path).expect("test");
    //
    // // Reset transfer counters for this inference
    // ctx.reset_transfer_counters();
    //
    // // Input: mel spectrogram
    // let mel_features = vec![0.0f32; 1500 * 80]; // [seq_len, n_mels]
    //
    // // Run full encoder
    // let output = model.encode(&mel_features).expect("test");
    //
    // // Verify ONLY 2 transfers:
    // assert_eq!(ctx.total_h2d_transfers(), 1, "Should have 1 upload (mel features)");
    // assert_eq!(ctx.total_d2h_transfers(), 1, "Should have 1 download (encoder output)");

    // Fail loudly if this placeholder is ever executed (e.g. via
    // `cargo test -- --include-ignored`): an unimplemented feature must
    // never report a passing test.
    todo!("TDD: full encoder not implemented")
}

// ============================================================================
// Performance Targets (Acceptance Criteria)
// ============================================================================

/// Test: Encoder should achieve <300ms for 1.5s audio
///
/// Acceptance criteria from WAPR-PERF-004 specification.
#[test]
#[ignore = "TDD: Performance test - run after implementation complete"]
fn test_encoder_performance_target() {
    // When implemented:
    //
    // let ctx = CudaContext::new(0).expect("test");
    // let model = WhisperEncoderGpu::load(&ctx, "models/whisper-tiny.apr").expect("test");
    //
    // // Warmup
    // for _ in 0..3 {
    //     let _ = model.encode(&vec![0.0f32; 1500 * 80]);
    // }
    //
    // // Benchmark
    // let start = std::time::Instant::now();
    // let _ = model.encode(&vec![0.0f32; 1500 * 80]).expect("test");
    // let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0;
    //
    // // Target: <300ms (currently 5150ms on CPU)
    // assert!(elapsed_ms < 300.0, "Encoder took {}ms, target <300ms", elapsed_ms);

    // Fail loudly if this placeholder is ever executed (e.g. via
    // `cargo test -- --include-ignored`): an unimplemented benchmark must
    // never report a passing test.
    todo!("TDD: performance test not implemented")
}