#[cfg(feature = "cuda")]
use trueno_gpu::driver::CudaContext;
#[cfg(feature = "cuda")]
use trueno_gpu::memory::resident::{
batched_multihead_attention, clear_kernel_cache, reset_transfer_counters, total_d2h_transfers,
total_h2d_transfers, GpuResidentTensor, TransferStats,
};
#[test]
#[cfg(feature = "cuda")]
/// Verifies the batched multi-head attention path stays fully device-resident:
/// exactly 3 H2D uploads (Q, K, V) and zero D2H transfers before and after the
/// kernel launch.
fn test_batched_attention_single_kernel() {
    // Skip gracefully on machines without a CUDA device (e.g. CPU-only CI).
    let ctx = match CudaContext::new(0) {
        Ok(ctx) => ctx,
        Err(_) => return,
    };
    reset_transfer_counters();

    let seq_len = 4u32;
    let n_heads = 2u32;
    let head_dim = 8u32;
    let d_model = (n_heads * head_dim) as usize;

    // Uploading Q, K, V must account for exactly three host-to-device copies.
    let q = GpuResidentTensor::from_host(&ctx, &vec![0.1f32; seq_len as usize * d_model])
        .expect("Upload Q");
    let k = GpuResidentTensor::from_host(&ctx, &vec![0.1f32; seq_len as usize * d_model])
        .expect("Upload K");
    let v = GpuResidentTensor::from_host(&ctx, &vec![0.1f32; seq_len as usize * d_model])
        .expect("Upload V");
    assert_eq!(total_h2d_transfers(), 3);
    assert_eq!(total_d2h_transfers(), 0);

    // The attention kernel must run entirely on-device: the output stays
    // resident and the global transfer counters do not move in either direction.
    let output = batched_multihead_attention(&ctx, &q, &k, &v, n_heads, head_dim, seq_len)
        .expect("Batched attention failed");
    assert_eq!(output.len(), seq_len as usize * d_model);
    assert!(output.is_device_resident());
    assert_eq!(output.device_to_host_transfers(), 0);
    assert_eq!(total_h2d_transfers(), 3);
    assert_eq!(total_d2h_transfers(), 0);
}
#[test]
#[cfg(not(feature = "cuda"))]
// No-op stand-in compiled when the "cuda" feature is disabled, so the test
// name still shows up (and passes) in non-GPU builds instead of vanishing.
fn test_batched_attention_single_kernel() {}
#[test]
#[ignore = "TDD: Implementation pending - fused softmax not yet implemented"]
// Test-driven-development placeholder: body is intentionally empty until the
// fused-softmax kernel exists; #[ignore] keeps it out of normal test runs.
fn test_batched_attention_fused_softmax() {
}
#[test]
#[cfg(feature = "cuda")]
/// Checks a hand-computed 2x2 matrix product against the GPU matmul kernel,
/// requiring element-wise agreement within 0.01.
fn test_matmul_2x2_correctness() {
    clear_kernel_cache();
    // Skip when no CUDA device is available.
    let ctx = match CudaContext::new(0) {
        Ok(c) => c,
        Err(_) => return,
    };

    // Row-major 2x2 operands and their known product:
    // [1 2] * [5 6] = [19 22]
    // [3 4]   [7 8]   [43 50]
    let a_data: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
    let b_data: Vec<f32> = vec![5.0, 6.0, 7.0, 8.0];
    let expected: Vec<f32> = vec![19.0, 22.0, 43.0, 50.0];

    let gpu_a = GpuResidentTensor::from_host(&ctx, &a_data).expect("upload A");
    let gpu_b = GpuResidentTensor::from_host(&ctx, &b_data).expect("upload B");
    let mut gpu_c = gpu_a.matmul(&ctx, &gpu_b, 2, 2, 2).expect("matmul failed");
    let result = gpu_c.to_host().expect("download C");

    println!("\n=== Matmul 2x2 Test ===");
    println!("A: {:?}", a_data);
    println!("B: {:?}", b_data);
    println!("Expected: {:?}", expected);
    println!("GPU result: {:?}", result);

    // Largest absolute element-wise deviation from the reference product.
    let mut max_diff = 0.0f32;
    for (got, want) in result.iter().zip(expected.iter()) {
        max_diff = max_diff.max((got - want).abs());
    }
    println!("Max diff: {}", max_diff);
    assert!(
        max_diff < 0.01,
        "Matmul 2x2 failed: max diff {} > 0.01",
        max_diff
    );
    println!("✓ Matmul 2x2 PASSED!");
}
#[test]
#[cfg(feature = "cuda")]
/// Debug harness that runs each attention stage (scores, scaling, softmax,
/// value projection) as a separate GPU call and prints intermediate results
/// alongside hand-computed expectations, flagging all-zero output rows.
fn test_attention_steps_individually() {
    // Skip when no CUDA device is available.
    let ctx = match CudaContext::new(0) {
        Ok(ctx) => ctx,
        Err(_) => return,
    };

    // 2x2 single-head setup with identity Q and K so every intermediate has a
    // simple closed-form expectation.
    let seq_len = 2u32;
    let d_model = 2u32;
    let q_data: Vec<f32> = vec![1.0, 0.0, 0.0, 1.0];
    let k_data: Vec<f32> = vec![1.0, 0.0, 0.0, 1.0];
    let v_data: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
    let q = GpuResidentTensor::from_host(&ctx, &q_data).expect("upload Q");
    let k = GpuResidentTensor::from_host(&ctx, &k_data).expect("upload K");
    // Uploaded for transfer-side-effect parity with the other steps; the value
    // tensor used in step 4 is a fresh upload (v4) below.
    let _v = GpuResidentTensor::from_host(&ctx, &v_data).expect("upload V");
    println!("\n=== Step-by-Step Attention Debug ===");

    // Step 1: raw score matrix Q @ K.
    let mut scores = q
        .matmul(&ctx, &k, seq_len, seq_len, d_model)
        .expect("Q@K failed");
    let scores_host = scores.to_host().expect("download scores");
    println!("Step 1 - Q @ K (should be identity): {:?}", scores_host);

    // Step 2: scores scaled by 1/sqrt(d_model). Fresh Q/K uploads per step —
    // NOTE(review): presumably earlier tensors are not reusable after
    // to_host/matmul; confirm against GpuResidentTensor's ownership rules.
    let scale = 1.0 / (d_model as f32).sqrt();
    let q2 = GpuResidentTensor::from_host(&ctx, &q_data).expect("upload Q2");
    let k2 = GpuResidentTensor::from_host(&ctx, &k_data).expect("upload K2");
    let mut scores2 = q2
        .matmul(&ctx, &k2, seq_len, seq_len, d_model)
        .expect("Q@K");
    let mut scaled = scores2.scale(&ctx, scale).expect("scale failed");
    let scaled_host = scaled.to_host().expect("download scaled");
    println!("Step 2 - Scaled (×{}): {:?}", scale, scaled_host);

    // Step 3: row-wise softmax over the scaled scores.
    let q3 = GpuResidentTensor::from_host(&ctx, &q_data).expect("upload Q3");
    let k3 = GpuResidentTensor::from_host(&ctx, &k_data).expect("upload K3");
    let mut scores3 = q3
        .matmul(&ctx, &k3, seq_len, seq_len, d_model)
        .expect("Q@K");
    let scaled3 = scores3.scale(&ctx, scale).expect("scale");
    let mut softmax_result = scaled3.softmax(&ctx, seq_len).expect("softmax failed");
    let softmax_host = softmax_result.to_host().expect("download softmax");
    println!("Step 3 - Softmax: {:?}", softmax_host);
    println!("Expected softmax: [0.670, 0.330, 0.330, 0.670]");

    // Step 4: full pipeline ending in attn @ V.
    let q4 = GpuResidentTensor::from_host(&ctx, &q_data).expect("Q4");
    let k4 = GpuResidentTensor::from_host(&ctx, &k_data).expect("K4");
    let v4 = GpuResidentTensor::from_host(&ctx, &v_data).expect("V4");
    let mut scores4 = q4
        .matmul(&ctx, &k4, seq_len, seq_len, d_model)
        .expect("Q@K");
    let scaled4 = scores4.scale(&ctx, scale).expect("scale");
    let attn4 = scaled4.softmax(&ctx, seq_len).expect("softmax");
    let mut output4 = attn4
        .matmul(&ctx, &v4, seq_len, d_model, seq_len)
        .expect("attn@V failed");
    let output_host = output4.to_host().expect("download output");
    println!("Step 4 - Output (attn @ V): {:?}", output_host);

    // Diagnostic: a known failure mode produced all-zero rows; call it out.
    let row0_zero = output_host[0].abs() < 0.001 && output_host[1].abs() < 0.001;
    let row1_zero = output_host[2].abs() < 0.001 && output_host[3].abs() < 0.001;
    if row0_zero {
        println!("BUG: Row 0 is all zeros!");
    }
    if row1_zero {
        println!("BUG: Row 1 is all zeros!");
    }
    println!("Expected output: [1.66, 2.66, 2.34, 3.34]");
}
#[test]
#[cfg(feature = "cuda")]
/// Compares the GPU batched attention output against a CPU reference
/// implementation (scores -> scale -> softmax -> weighted values) for a
/// 2x2 single-head case, within a 0.01 tolerance.
fn test_batched_attention_correctness() {
    clear_kernel_cache();
    let ctx = match CudaContext::new(0) {
        Ok(ctx) => ctx,
        Err(_) => {
            eprintln!("CUDA not available, skipping correctness test");
            return;
        }
    };

    let seq_len = 2u32;
    let n_heads = 1u32;
    let head_dim = 2u32;
    let d_model = (n_heads * head_dim) as usize;

    // Identity Q/K and a simple V make the reference easy to verify by hand.
    let q_data: Vec<f32> = vec![1.0, 0.0, 0.0, 1.0];
    let k_data: Vec<f32> = vec![1.0, 0.0, 0.0, 1.0];
    let v_data: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
    let scale = (head_dim as f32).sqrt().recip();

    // CPU reference, step 1: score[i][j] = dot(Q row i, K row j).
    let mut scores = vec![0.0f32; 4];
    for i in 0..2 {
        for j in 0..2 {
            scores[i * 2 + j] =
                q_data[i * 2] * k_data[j * 2] + q_data[i * 2 + 1] * k_data[j * 2 + 1];
        }
    }

    // Step 2: scale by 1/sqrt(head_dim).
    let scaled: Vec<f32> = scores.iter().map(|x| x * scale).collect();

    // Step 3: numerically-stable row-wise softmax (subtract the row max).
    let mut attn_weights = vec![0.0f32; 4];
    for row in 0..2 {
        let base = row * 2;
        let row_max = scaled[base].max(scaled[base + 1]);
        let e0 = (scaled[base] - row_max).exp();
        let e1 = (scaled[base + 1] - row_max).exp();
        let denom = e0 + e1;
        attn_weights[base] = e0 / denom;
        attn_weights[base + 1] = e1 / denom;
    }

    // Step 4: expected output = attn_weights @ V.
    let mut expected = vec![0.0f32; 4];
    for i in 0..2 {
        for j in 0..2 {
            expected[i * 2 + j] =
                attn_weights[i * 2] * v_data[j] + attn_weights[i * 2 + 1] * v_data[2 + j];
        }
    }

    println!("\n=== Correctness Test ===");
    println!("Q: {:?}", q_data);
    println!("K: {:?}", k_data);
    println!("V: {:?}", v_data);
    println!("Scores (Q@K^T): {:?}", scores);
    println!("Scaled (/{:.3}): {:?}", 1.0 / scale, scaled);
    println!("Attn weights: {:?}", attn_weights);
    println!("Expected output: {:?}", expected);

    // Run the same computation on the GPU.
    let q = GpuResidentTensor::from_host(&ctx, &q_data).expect("upload Q");
    let k = GpuResidentTensor::from_host(&ctx, &k_data).expect("upload K");
    let v = GpuResidentTensor::from_host(&ctx, &v_data).expect("upload V");
    let mut output = batched_multihead_attention(&ctx, &q, &k, &v, n_heads, head_dim, seq_len)
        .expect("GPU attention failed");
    let result = output.to_host().expect("download output");
    println!("GPU output: {:?}", result);

    // Largest absolute element-wise deviation from the CPU reference.
    let mut max_diff = 0.0f32;
    for (got, want) in result.iter().zip(expected.iter()) {
        max_diff = max_diff.max((got - want).abs());
    }
    println!("Max diff: {}", max_diff);
    assert!(
        max_diff < 0.01,
        "Max diff: {} exceeds tolerance 0.01",
        max_diff
    );
    println!("✓ Correctness test PASSED!");
}