use super::*;
use serial_test::serial;
#[test]
#[serial]
fn test_cov016_silu_gpu_basic() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 128u32;
    // Inputs straddle zero so SiLU yields a mix of near-zero and sizeable values.
    let host_in: Vec<f32> = (0..len).map(|i| (i as f32 - 64.0) * 0.1).collect();
    let dev_in = GpuBuffer::from_host(&exec.context, &host_in).expect("input buffer");
    let result = exec.silu_gpu(&dev_in, len);
    assert!(result.is_ok(), "silu_gpu failed: {:?}", result.err());
    exec.stream.synchronize().expect("sync");
    let dev_out = result.unwrap();
    let mut host_out = vec![0.0f32; len as usize];
    dev_out.copy_to_host(&mut host_out).expect("copy to host");
    // Most inputs are non-zero, so more than half the outputs should be too.
    let nonzero = host_out.iter().filter(|&&v| v.abs() > 1e-6).count();
    assert!(
        nonzero > len as usize / 2,
        "SiLU should produce many non-zero outputs"
    );
}
#[test]
#[serial]
fn test_cov016_gelu_async_basic() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 256u32;
    // Values centered on zero exercise both tails of GELU.
    let host_in: Vec<f32> = (0..len).map(|i| (i as f32 - 128.0) * 0.05).collect();
    let dev_in = GpuBuffer::from_host(&exec.context, &host_in).expect("input buffer");
    let result = exec.gelu_async(&dev_in, len);
    assert!(result.is_ok(), "gelu_async failed: {:?}", result.err());
    exec.stream.synchronize().expect("sync");
    let dev_out = result.unwrap();
    let mut host_out = vec![0.0f32; len as usize];
    dev_out.copy_to_host(&mut host_out).expect("copy to host");
    // A loose lower bound: at least a third of the outputs are non-zero.
    let nonzero = host_out.iter().filter(|&&v| v.abs() > 1e-6).count();
    assert!(
        nonzero > len as usize / 3,
        "GELU should produce non-zero outputs"
    );
}
#[test]
#[serial]
fn test_cov016_elementwise_mul_gpu_basic() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 64u32;
    // Constant operands make the expected product (2 * 3 = 6) trivial to check.
    let lhs = vec![2.0f32; len as usize];
    let rhs = vec![3.0f32; len as usize];
    let lhs_dev = GpuBuffer::from_host(&exec.context, &lhs).expect("input1 buffer");
    let rhs_dev = GpuBuffer::from_host(&exec.context, &rhs).expect("input2 buffer");
    let result = exec.elementwise_mul_gpu(&lhs_dev, &rhs_dev, len);
    assert!(
        result.is_ok(),
        "elementwise_mul_gpu failed: {:?}",
        result.err()
    );
    exec.stream.synchronize().expect("sync");
    let out_dev = result.unwrap();
    let mut out = vec![0.0f32; len as usize];
    out_dev.copy_to_host(&mut out).expect("copy to host");
    // Every lane should hold exactly 6.0 (within float tolerance).
    for v in &out {
        assert!((*v - 6.0).abs() < 1e-5, "Expected 6.0, got {}", v);
    }
}
#[test]
#[serial]
fn test_cov016_fused_swiglu_gpu_basic() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 128u32;
    // Gate sweeps across zero; up projection is all ones so the output is
    // just SiLU(gate).
    let gate_host: Vec<f32> = (0..len).map(|i| (i as f32 - 64.0) * 0.1).collect();
    let up_host = vec![1.0f32; len as usize];
    let gate_dev = GpuBuffer::from_host(&exec.context, &gate_host).expect("gate buffer");
    let up_dev = GpuBuffer::from_host(&exec.context, &up_host).expect("up buffer");
    let result = exec.fused_swiglu_gpu(&gate_dev, &up_dev, len);
    assert!(
        result.is_ok(),
        "fused_swiglu_gpu failed: {:?}",
        result.err()
    );
    exec.stream.synchronize().expect("sync");
    let out_dev = result.unwrap();
    let mut out = vec![0.0f32; len as usize];
    out_dev.copy_to_host(&mut out).expect("copy to host");
    // Sanity: the kernel wrote something other than all zeros.
    let nonzero = out.iter().filter(|&&v| v.abs() > 1e-6).count();
    assert!(nonzero > 0, "SwiGLU should produce non-zero outputs");
}
#[test]
#[serial]
fn test_cov016_fused_swiglu_into_basic() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 64u32;
    // gate = 1.0, up = 2.0 => SiLU(1) * 2 = sigmoid(1) * 1 * 2 ~= 1.46.
    let gate_host = vec![1.0f32; len as usize];
    let up_host = vec![2.0f32; len as usize];
    let gate_dev = GpuBuffer::from_host(&exec.context, &gate_host).expect("gate buffer");
    let up_dev = GpuBuffer::from_host(&exec.context, &up_host).expect("up buffer");
    // Caller-provided destination buffer (the `_into` variant writes in place).
    let out_dev = GpuBuffer::<f32>::new(&exec.context, len as usize).expect("output buffer");
    let result = exec.fused_swiglu_into(&gate_dev, &up_dev, &out_dev, len);
    assert!(
        result.is_ok(),
        "fused_swiglu_into failed: {:?}",
        result.err()
    );
    exec.stream.synchronize().expect("sync");
    let mut out = vec![0.0f32; len as usize];
    out_dev.copy_to_host(&mut out).expect("copy to host");
    // Loose bracket around the expected ~1.46 per element.
    for v in &out {
        assert!(
            v.abs() > 1.0 && v.abs() < 2.0,
            "Expected ~1.46, got {}",
            v
        );
    }
}
#[test]
#[serial]
fn test_cov016_silu_gpu_cached_module() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 32u32;
    let host_in = vec![0.5f32; len as usize];
    let dev_in = GpuBuffer::from_host(&exec.context, &host_in).expect("input buffer");
    // First launch compiles/loads the kernel module.
    let _warmup = exec.silu_gpu(&dev_in, len).expect("first silu_gpu");
    exec.stream.synchronize().expect("sync");
    // Second launch should hit the cached module and still succeed.
    let cached = exec.silu_gpu(&dev_in, len);
    assert!(
        cached.is_ok(),
        "cached silu_gpu failed: {:?}",
        cached.err()
    );
}
#[test]
#[serial]
fn test_cov016_gelu_async_cached_module() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 48u32;
    let host_in = vec![0.5f32; len as usize];
    let dev_in = GpuBuffer::from_host(&exec.context, &host_in).expect("input buffer");
    // First launch compiles/loads the kernel module.
    let _warmup = exec.gelu_async(&dev_in, len).expect("first gelu_async");
    exec.stream.synchronize().expect("sync");
    // Second launch should hit the cached module and still succeed.
    let cached = exec.gelu_async(&dev_in, len);
    assert!(
        cached.is_ok(),
        "cached gelu_async failed: {:?}",
        cached.err()
    );
}
#[test]
#[serial]
fn test_cov016_elementwise_mul_varying_values() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 32u32;
    // Ascending and descending ramps give a distinct product per lane.
    let lhs: Vec<f32> = (0..len).map(|i| i as f32).collect();
    let rhs: Vec<f32> = (0..len).map(|i| (len - i) as f32).collect();
    let lhs_dev = GpuBuffer::from_host(&exec.context, &lhs).expect("input1 buffer");
    let rhs_dev = GpuBuffer::from_host(&exec.context, &rhs).expect("input2 buffer");
    let result = exec.elementwise_mul_gpu(&lhs_dev, &rhs_dev, len);
    assert!(
        result.is_ok(),
        "elementwise_mul varying failed: {:?}",
        result.err()
    );
    exec.stream.synchronize().expect("sync");
    let out_dev = result.unwrap();
    let mut out = vec![0.0f32; len as usize];
    out_dev.copy_to_host(&mut out).expect("copy to host");
    // Verify each lane against the host-side product of its inputs.
    for (i, ((&a, &b), &got)) in lhs.iter().zip(rhs.iter()).zip(out.iter()).enumerate() {
        let expected = a * b;
        assert!(
            (got - expected).abs() < 1e-4,
            "Mismatch at {}: expected {}, got {}",
            i,
            expected,
            got
        );
    }
}
#[test]
#[serial]
fn test_cov016_fused_swiglu_gpu_negative_inputs() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 64u32;
    // Negative gate: SiLU(-2) ~= -0.238, so every output should be small.
    let gate_host = vec![-2.0f32; len as usize];
    let up_host = vec![1.0f32; len as usize];
    let gate_dev = GpuBuffer::from_host(&exec.context, &gate_host).expect("gate buffer");
    let up_dev = GpuBuffer::from_host(&exec.context, &up_host).expect("up buffer");
    let result = exec.fused_swiglu_gpu(&gate_dev, &up_dev, len);
    assert!(result.is_ok(), "swiglu negative failed: {:?}", result.err());
    exec.stream.synchronize().expect("sync");
    let out_dev = result.unwrap();
    let mut out = vec![0.0f32; len as usize];
    out_dev.copy_to_host(&mut out).expect("copy to host");
    for v in &out {
        assert!(
            *v < 0.1,
            "SwiGLU of negative gate should be small/negative: {}",
            v
        );
    }
}
#[test]
#[serial]
fn test_cov017_num_devices() {
    // Device count must agree with the availability probe.
    let count = CudaExecutor::num_devices();
    if !CudaExecutor::is_available() {
        assert_eq!(count, 0, "No devices when CUDA unavailable");
    } else {
        assert!(count >= 1, "Should have at least one CUDA device");
    }
}
#[test]
#[serial]
fn test_cov017_make_current() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let exec = CudaExecutor::new(0).expect("CUDA executor");
    // Binding the executor's context to this thread should succeed.
    let outcome = exec.make_current();
    assert!(outcome.is_ok(), "make_current failed: {:?}", outcome.err());
}
#[test]
#[serial]
fn test_cov017_profiler_enable_disable() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    // Walk the toggle through off -> on -> off and check state at each step.
    assert!(
        !exec.is_profiling_enabled(),
        "Profiling should be disabled initially"
    );
    exec.enable_profiling();
    assert!(exec.is_profiling_enabled(), "Profiling should be enabled");
    exec.disable_profiling();
    assert!(!exec.is_profiling_enabled(), "Profiling should be disabled");
}
#[test]
#[serial]
fn test_cov017_profiler_access() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    // Smoke test: both profiler accessors can be called without panicking.
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let _shared_ref = exec.profiler();
    let _exclusive_ref = exec.profiler_mut();
}
#[test]
#[serial]
fn test_cov017_profiler_reset() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    // Smoke test: resetting a freshly created executor's profiler must not panic.
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    exec.reset_profiler();
}
#[test]
#[serial]
fn test_cov017_profiler_summary() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let executor = CudaExecutor::new(0).expect("CUDA executor");
    // BUG FIX: the previous assertion was
    //   `!summary.is_empty() || summary.is_empty()`
    // which is a tautology and can never fail (clippy would flag it).
    // A fresh executor has recorded nothing, so no particular content can be
    // guaranteed; this remains a smoke test that `profiler_summary()` returns
    // without panicking, with the meaningless assertion removed.
    let summary = executor.profiler_summary();
    // Touch the result so the call is not considered dead code.
    let _ = summary.len();
}
#[test]
#[serial]
fn test_cov017_rmsnorm_gpu_basic() {
    // Skip silently when no CUDA device is present.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let len = 64u32;
    // Strictly positive ramp input with unit gamma (identity scaling).
    let host_in: Vec<f32> = (0..len).map(|i| (i as f32 + 1.0) * 0.1).collect();
    let gamma_host = vec![1.0f32; len as usize];
    let dev_in = GpuBuffer::from_host(&exec.context, &host_in).expect("input buffer");
    let gamma_dev = GpuBuffer::from_host(&exec.context, &gamma_host).expect("gamma buffer");
    let result = exec.rmsnorm_gpu(&dev_in, &gamma_dev, len, 1e-5);
    assert!(result.is_ok(), "rmsnorm_gpu failed: {:?}", result.err());
    exec.stream.synchronize().expect("sync");
    let out_dev = result.unwrap();
    let mut out = vec![0.0f32; len as usize];
    out_dev.copy_to_host(&mut out).expect("copy to host");
    // Non-zero input normalized by its RMS must keep a non-zero L2 norm.
    let l2 = out.iter().map(|x| x * x).sum::<f32>().sqrt();
    assert!(l2 > 0.0, "Output should have non-zero L2 norm");
}
include!("tests_cov017_residual.rs");
include!("tests_cov018_fused.rs");
include!("tests_cov019_gemm.rs");
include!("tests_cov020_gemm.rs");