#[test]
#[serial]
fn test_cov017_residual_add_gpu_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let n = 32u32;
let input1 = vec![1.0f32; n as usize];
let input2 = vec![2.0f32; n as usize];
let input1_gpu = GpuBuffer::from_host(&executor.context, &input1).expect("input1 buffer");
let input2_gpu = GpuBuffer::from_host(&executor.context, &input2).expect("input2 buffer");
let result = executor.residual_add_gpu(&input1_gpu, &input2_gpu, n);
assert!(
result.is_ok(),
"residual_add_gpu failed: {:?}",
result.err()
);
executor.stream.synchronize().expect("sync");
let output_gpu = result.unwrap();
let mut output = vec![0.0f32; n as usize];
output_gpu.copy_to_host(&mut output).expect("copy to host");
for val in &output {
assert!((*val - 3.0).abs() < 1e-5, "Expected 3.0, got {}", val);
}
}
#[test]
#[serial]
fn test_cov017_init_kv_cache_gpu() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let result = executor.init_kv_cache_gpu(
2, 4, 4, 8, 16, );
assert!(
result.is_ok(),
"init_kv_cache_gpu failed: {:?}",
result.err()
);
assert!(
executor.kv_cache_max_len > 0,
"KV cache max len should be set"
);
assert_eq!(executor.kv_num_heads, 4, "num_heads should be 4");
assert_eq!(executor.kv_num_kv_heads, 4, "num_kv_heads should be 4");
assert_eq!(executor.kv_head_dim, 8, "head_dim should be 8");
}
#[test]
#[serial]
fn test_cov017_init_workspace() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let result = executor.init_workspace(64, 128);
assert!(result.is_ok(), "init_workspace failed: {:?}", result.err());
assert!(
executor.workspace.initialized,
"Workspace should be initialized"
);
}
#[test]
#[serial]
fn test_cov017_has_indexed_weights_false() {
if !CudaExecutor::is_available() {
return;
}
let executor = CudaExecutor::new(0).expect("CUDA executor");
assert!(
!executor.has_indexed_weights(),
"Should not have indexed weights initially"
);
}
#[test]
#[serial]
fn test_cov017_has_workspace_false() {
if !CudaExecutor::is_available() {
return;
}
let executor = CudaExecutor::new(0).expect("CUDA executor");
assert!(
!executor.has_workspace(),
"Should not have workspace initially"
);
}
#[test]
#[serial]
fn test_cov017_has_workspace_true() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
executor.init_workspace(64, 128).expect("init workspace");
assert!(executor.has_workspace(), "Should have workspace after init");
}
#[test]
#[serial]
fn test_cov017_set_rope_theta() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
executor.set_rope_theta(500000.0);
assert_eq!(
executor.rope_theta, 500000.0,
"RoPE theta should be updated"
);
}
#[test]
#[serial]
fn test_cov017_set_rope_type() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
executor.set_rope_type(1); assert_eq!(executor.rope_type, 1, "RoPE type should be updated");
}
#[test]
#[serial]
fn test_cov017_reset_kv_cache_gpu() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
executor.init_kv_cache_gpu(1, 4, 4, 8, 16).expect("init kv");
executor.reset_kv_cache_gpu();
}
#[test]
#[serial]
fn test_cov018_q4k_gemv_cached_missing_weight() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let input = vec![1.0f32; 256];
let mut output = vec![0.0f32; 64];
let result = executor.q4k_gemv_cached("nonexistent_weight", &input, &mut output, 64, 256);
assert!(result.is_err(), "Should fail without cached weight");
let err = format!("{:?}", result.err().unwrap());
assert!(
err.contains("not cached"),
"Error should mention not cached: {}",
err
);
}
#[test]
#[serial]
fn test_cov018_q5k_gemv_cached_missing_weight() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let input = vec![1.0f32; 256];
let mut output = vec![0.0f32; 64];
let result = executor.q5k_gemv_cached("missing_q5k", &input, &mut output, 64, 256);
assert!(result.is_err(), "Should fail without cached Q5K weight");
}
#[test]
#[serial]
fn test_cov018_q6k_gemv_cached_missing_weight() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let input = vec![1.0f32; 256];
let mut output = vec![0.0f32; 64];
let result = executor.q6k_gemv_cached("missing_q6k", &input, &mut output, 64, 256);
assert!(result.is_err(), "Should fail without cached Q6K weight");
}
#[test]
#[serial]
fn test_cov018_rmsnorm_host_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let n = 64u32;
let input: Vec<f32> = (0..n).map(|i| (i as f32 + 1.0) * 0.1).collect();
let gamma = vec![1.0f32; n as usize];
let mut output = vec![0.0f32; n as usize];
let result = executor.rmsnorm_host(&input, &gamma, &mut output, 1e-5);
assert!(result.is_ok(), "rmsnorm_host failed: {:?}", result.err());
let l2: f32 = output.iter().map(|x| x * x).sum::<f32>().sqrt();
assert!(l2 > 0.0, "Output should have non-zero L2 norm");
}
#[test]
#[serial]
fn test_cov018_residual_add_host_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let n = 32u32;
let input1 = vec![1.5f32; n as usize];
let input2 = vec![2.5f32; n as usize];
let mut output = vec![0.0f32; n as usize];
let result = executor.residual_add_host(&input1, &input2, &mut output);
assert!(
result.is_ok(),
"residual_add_host failed: {:?}",
result.err()
);
for val in &output {
assert!((*val - 4.0).abs() < 1e-5, "Expected 4.0, got {}", val);
}
}
#[test]
#[serial]
fn test_cov018_fused_residual_rmsnorm_host_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let n = 32u32;
let residual = vec![1.0f32; n as usize];
let input = vec![0.5f32; n as usize];
let gamma = vec![1.0f32; n as usize];
let mut output = vec![0.0f32; n as usize];
let result = executor.fused_residual_rmsnorm_host(&residual, &input, &gamma, &mut output, 1e-5);
assert!(
result.is_ok(),
"fused_residual_rmsnorm_host failed: {:?}",
result.err()
);
let l2: f32 = output.iter().map(|x| x * x).sum::<f32>().sqrt();
assert!(l2 > 0.0, "Output should have non-zero L2 norm");
}
#[test]
#[serial]
fn test_cov018_gelu_gpu_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let n = 64u32;
let input: Vec<f32> = (0..n).map(|i| (i as f32 - 32.0) * 0.1).collect();
let buffer = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");
let result = executor.gelu_gpu(&buffer, n);
assert!(result.is_ok(), "gelu_gpu failed: {:?}", result.err());
executor.stream.synchronize().expect("sync");
let mut output = vec![0.0f32; n as usize];
buffer.copy_to_host(&mut output).expect("copy to host");
let non_zero = output.iter().filter(|&&x| x.abs() > 1e-6).count();
assert!(non_zero > 0, "GELU should produce non-zero outputs");
}
#[test]
#[serial]
fn test_cov018_q4k_gemv_into_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let k = 256u32;
let n = 32u32;
let num_superblocks = (n as usize * k as usize + 255) / 256;
let weight_bytes = num_superblocks * 144;
let weights = vec![0u8; weight_bytes];
let input = vec![0.1f32; k as usize];
let weights_gpu = GpuBuffer::from_host(&executor.context, &weights).expect("weights");
let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input");
let output_gpu = GpuBuffer::<f32>::new(&executor.context, n as usize).expect("output");
let result = executor.q4k_gemv_into(weights_gpu.as_ptr(), &input_gpu, &output_gpu, n, k);
assert!(result.is_ok(), "q4k_gemv_into failed: {:?}", result.err());
}
#[test]
#[serial]
fn test_cov018_q6k_gemv_into_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let k = 256u32;
let n = 32u32;
let num_superblocks = (n as usize * k as usize + 255) / 256;
let weight_bytes = num_superblocks * 210;
let weights = vec![0u8; weight_bytes];
let input = vec![0.1f32; k as usize];
let weights_gpu = GpuBuffer::from_host(&executor.context, &weights).expect("weights");
let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input");
let output_gpu = GpuBuffer::<f32>::new(&executor.context, n as usize).expect("output");
let result = executor.q6k_gemv_into(weights_gpu.as_ptr(), &input_gpu, &output_gpu, n, k);
assert!(result.is_ok(), "q6k_gemv_into failed: {:?}", result.err());
}
#[test]
#[serial]
fn test_cov018_q8_0_gemv_into_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let k = 32u32;
let n = 16u32;
let num_blocks = (n as usize * k as usize + 31) / 32;
let weight_bytes = num_blocks * 34;
let weights = vec![0u8; weight_bytes];
let input = vec![0.1f32; k as usize];
let weights_gpu = GpuBuffer::from_host(&executor.context, &weights).expect("weights");
let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input");
let output_gpu = GpuBuffer::<f32>::new(&executor.context, n as usize).expect("output");
let result = executor.q8_0_gemv_into(weights_gpu.as_ptr(), &input_gpu, &output_gpu, n, k);
assert!(result.is_ok(), "q8_0_gemv_into failed: {:?}", result.err());
}
#[test]
#[serial]
fn test_cov018_fused_residual_rmsnorm_gpu_basic() {
if !CudaExecutor::is_available() {
return;
}
let mut executor = CudaExecutor::new(0).expect("CUDA executor");
let n = 64u32;
let residual = vec![1.0f32; n as usize];
let input = vec![0.5f32; n as usize];
let gamma = vec![1.0f32; n as usize];
let residual_gpu = GpuBuffer::from_host(&executor.context, &residual).expect("residual");
let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input");
let gamma_gpu = GpuBuffer::from_host(&executor.context, &gamma).expect("gamma");
let result =
executor.fused_residual_rmsnorm_gpu(&residual_gpu, &input_gpu, &gamma_gpu, n, 1e-5);
assert!(
result.is_ok(),
"fused_residual_rmsnorm_gpu failed: {:?}",
result.err()
);
executor.stream.synchronize().expect("sync");
let output_gpu = result.unwrap();
let mut output = vec![0.0f32; n as usize];
output_gpu.copy_to_host(&mut output).expect("copy");
let l2: f32 = output.iter().map(|x| x * x).sum::<f32>().sqrt();
assert!(l2 > 0.0, "Output should have non-zero L2 norm");
}