#[test]
#[serial]
fn test_cov012_batched_rmsnorm_ptr_into() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let hidden_size = 32u32;
let batch_size = 2u32;
let total = (hidden_size * batch_size) as usize;
let input: Vec<f32> = (0..total).map(|i| 1.0 + (i as f32) * 0.01).collect();
let gamma = vec![1.0f32; hidden_size as usize];
let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");
let gamma_gpu = GpuBuffer::from_host(&executor.context, &gamma).expect("gamma buffer");
let output_gpu = GpuBuffer::<f32>::new(&executor.context, total).expect("output buffer");
let result = executor.batched_rmsnorm_ptr_into(
&input_gpu,
gamma_gpu.as_ptr(),
gamma.len(),
&output_gpu,
hidden_size,
batch_size,
1e-5,
);
assert!(
result.is_ok(),
"batched_rmsnorm_ptr_into failed: {:?}",
result.err()
);
}
#[test]
#[serial]
fn test_cov012_residual_add_into_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let n = 64u32;
let input1 = vec![1.0f32; n as usize];
let input2 = vec![2.0f32; n as usize];
let input1_gpu = GpuBuffer::from_host(&executor.context, &input1).expect("input1 buffer");
let input2_gpu = GpuBuffer::from_host(&executor.context, &input2).expect("input2 buffer");
let output_gpu = GpuBuffer::<f32>::new(&executor.context, n as usize).expect("output buffer");
let result = executor.residual_add_into(&input1_gpu, &input2_gpu, &output_gpu, n);
assert!(
result.is_ok(),
"residual_add_into failed: {:?}",
result.err()
);
executor.stream.synchronize().expect("sync");
let mut output = vec![0.0f32; n as usize];
output_gpu.copy_to_host(&mut output).expect("copy to host");
for val in &output {
assert!((*val - 3.0).abs() < 1e-5, "Expected 3.0, got {}", val);
}
}
#[test]
#[serial]
fn test_cov012_fused_residual_rmsnorm_into_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let hidden_size = 32u32;
let residual = vec![1.0f32; hidden_size as usize];
let input = vec![0.5f32; hidden_size as usize];
let gamma = vec![1.0f32; hidden_size as usize];
let residual_gpu = GpuBuffer::from_host(&executor.context, &residual).expect("residual buffer");
let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");
let gamma_gpu = GpuBuffer::from_host(&executor.context, &gamma).expect("gamma buffer");
let output_gpu =
GpuBuffer::<f32>::new(&executor.context, hidden_size as usize).expect("output buffer");
let result = executor.fused_residual_rmsnorm_into(
&residual_gpu,
&input_gpu,
gamma_gpu.as_ptr() as usize,
&output_gpu,
hidden_size,
1e-5,
);
assert!(
result.is_ok(),
"fused_residual_rmsnorm_into failed: {:?}",
result.err()
);
executor.stream.synchronize().expect("sync");
let mut output = vec![0.0f32; hidden_size as usize];
output_gpu.copy_to_host(&mut output).expect("copy to host");
let l2: f32 = output.iter().map(|x| x * x).sum::<f32>().sqrt();
assert!(l2 > 0.0, "Output should have non-zero L2 norm");
}
#[test]
#[serial]
fn test_cov012_batched_residual_add_into() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let hidden_size = 32u32;
let batch_size = 4u32;
let total = (hidden_size * batch_size) as usize;
let input1: Vec<f32> = (0..total).map(|i| i as f32).collect();
let input2: Vec<f32> = (0..total).map(|i| (i as f32) * 0.5).collect();
let input1_gpu = GpuBuffer::from_host(&executor.context, &input1).expect("input1 buffer");
let input2_gpu = GpuBuffer::from_host(&executor.context, &input2).expect("input2 buffer");
let output_gpu = GpuBuffer::<f32>::new(&executor.context, total).expect("output buffer");
let result = executor.batched_residual_add_into(
&input1_gpu,
&input2_gpu,
&output_gpu,
hidden_size,
batch_size,
);
assert!(
result.is_ok(),
"batched_residual_add_into failed: {:?}",
result.err()
);
executor.stream.synchronize().expect("sync");
let mut output = vec![0.0f32; total];
output_gpu.copy_to_host(&mut output).expect("copy to host");
for (i, &val) in output.iter().enumerate() {
let expected = (i as f32) * 1.5;
assert!(
(val - expected).abs() < 1e-4,
"At {}: expected {}, got {}",
i,
expected,
val
);
}
}
#[test]
#[serial]
fn test_cov012_batched_swiglu_into() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let intermediate_dim = 64u32;
let batch_size = 2u32;
let total = (intermediate_dim * batch_size) as usize;
let gate: Vec<f32> = (0..total).map(|i| (i as f32) * 0.01).collect();
let up: Vec<f32> = (0..total).map(|_| 1.0f32).collect();
let gate_gpu = GpuBuffer::from_host(&executor.context, &gate).expect("gate buffer");
let up_gpu = GpuBuffer::from_host(&executor.context, &up).expect("up buffer");
let output_gpu = GpuBuffer::<f32>::new(&executor.context, total).expect("output buffer");
let result = executor.batched_swiglu_into(
&gate_gpu,
&up_gpu,
&output_gpu,
intermediate_dim,
batch_size,
);
assert!(
result.is_ok(),
"batched_swiglu_into failed: {:?}",
result.err()
);
executor.stream.synchronize().expect("sync");
let mut output = vec![0.0f32; total];
output_gpu.copy_to_host(&mut output).expect("copy to host");
for &val in &output {
assert!(val.is_finite(), "Output should be finite");
}
}
#[test]
#[serial]
fn test_cov012_batched_rope_into() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let num_heads = 2u32;
let head_dim = 16u32;
let batch_size = 2u32;
let total = (num_heads * head_dim * batch_size) as usize;
let input: Vec<f32> = (0..total).map(|i| (i as f32) * 0.01).collect();
let positions = vec![0u32, 1u32];
let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");
let output_gpu = GpuBuffer::<f32>::new(&executor.context, total).expect("output buffer");
let positions_gpu =
GpuBuffer::from_host(&executor.context, &positions).expect("positions buffer");
let result = executor.batched_rope_into(
&input_gpu,
&output_gpu,
&positions_gpu,
num_heads,
head_dim,
batch_size,
10000.0, );
assert!(
result.is_ok(),
"batched_rope_into failed: {:?}",
result.err()
);
executor.stream.synchronize().expect("sync");
let mut output = vec![0.0f32; total];
output_gpu.copy_to_host(&mut output).expect("copy to host");
for &val in &output {
assert!(val.is_finite(), "RoPE output should be finite");
}
}
#[test]
#[serial]
fn test_cov014_load_quantized_weights_with_type_q4k() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let weights = vec![0u8; 144];
let result = executor.load_quantized_weights_with_type("test_q4k", &weights, 12);
assert!(
result.is_ok(),
"load_quantized_weights_with_type Q4K failed"
);
assert!(executor.has_quantized_weights("test_q4k"));
assert_eq!(executor.get_quantized_weight_type("test_q4k"), Some(12));
}
#[test]
#[serial]
fn test_cov014_load_quantized_weights_with_type_q5k() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let weights = vec![0u8; 176]; let result = executor.load_quantized_weights_with_type("test_q5k", &weights, 13);
assert!(
result.is_ok(),
"load_quantized_weights_with_type Q5K failed"
);
assert!(executor.has_quantized_weights("test_q5k"));
assert_eq!(executor.get_quantized_weight_type("test_q5k"), Some(13));
}
#[test]
#[serial]
fn test_cov014_load_quantized_weights_with_type_q6k() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let weights = vec![0u8; 210];
let result = executor.load_quantized_weights_with_type("test_q6k", &weights, 14);
assert!(
result.is_ok(),
"load_quantized_weights_with_type Q6K failed"
);
assert!(executor.has_quantized_weights("test_q6k"));
assert_eq!(executor.get_quantized_weight_type("test_q6k"), Some(14));
}
#[test]
#[serial]
fn test_cov014_get_quantized_weight_type_not_found() {
if !CudaExecutor::is_available() {
return;
}
let executor = CudaExecutor::new(0).expect("CUDA executor");
assert_eq!(executor.get_quantized_weight_type("nonexistent"), None);
}
#[test]
#[serial]
fn test_cov014_has_quantized_weights_false() {
if !CudaExecutor::is_available() {
return;
}
let executor = CudaExecutor::new(0).expect("CUDA executor");
assert!(!executor.has_quantized_weights("nonexistent"));
}
#[test]
#[serial]
fn test_cov014_get_quantized_weight_ptr() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let weights = vec![1u8; 256];
executor
.load_quantized_weights("ptr_test", &weights)
.expect("load");
let ptr_result = executor.get_quantized_weight_ptr("ptr_test");
assert!(ptr_result.is_ok(), "get_quantized_weight_ptr failed");
let ptr = ptr_result.unwrap();
assert!(ptr > 0, "Device pointer should be non-zero");
}
#[test]
#[serial]
fn test_cov014_get_quantized_weight_ptr_not_found() {
if !CudaExecutor::is_available() {
return;
}
let executor = CudaExecutor::new(0).expect("CUDA executor");
let ptr_result = executor.get_quantized_weight_ptr("nonexistent");
assert!(ptr_result.is_err(), "Should fail for nonexistent weight");
}
#[test]
#[serial]
fn test_cov014_cached_quantized_weight_count_multiple() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
assert_eq!(executor.cached_quantized_weight_count(), 0);
executor
.load_quantized_weights("w1", &vec![0u8; 144])
.expect("load w1");
assert_eq!(executor.cached_quantized_weight_count(), 1);
executor
.load_quantized_weights("w2", &vec![0u8; 144])
.expect("load w2");
assert_eq!(executor.cached_quantized_weight_count(), 2);
executor
.load_quantized_weights("w3", &vec![0u8; 144])
.expect("load w3");
assert_eq!(executor.cached_quantized_weight_count(), 3);
}
#[test]
#[serial]
fn test_cov014_cached_quantized_weight_bytes_multiple() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
assert_eq!(executor.cached_quantized_weight_bytes(), 0);
executor
.load_quantized_weights("w1", &vec![0u8; 256])
.expect("load w1");
let bytes1 = executor.cached_quantized_weight_bytes();
assert!(bytes1 >= 256, "Should have at least 256 bytes");
executor
.load_quantized_weights("w2", &vec![0u8; 512])
.expect("load w2");
let bytes2 = executor.cached_quantized_weight_bytes();
assert!(bytes2 >= 256 + 512, "Should have at least 768 bytes");
}
#[test]
#[serial]
fn test_cov014_clear_quantized_weights_multiple() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
executor
.load_quantized_weights("w1", &vec![0u8; 144])
.expect("load");
executor
.load_quantized_weights("w2", &vec![0u8; 144])
.expect("load");
executor
.load_quantized_weights("w3", &vec![0u8; 144])
.expect("load");
assert_eq!(executor.cached_quantized_weight_count(), 3);
executor.clear_quantized_weights();
assert_eq!(executor.cached_quantized_weight_count(), 0);
assert_eq!(executor.cached_quantized_weight_bytes(), 0);
}