#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_activations_with_harness_gelu() {
    use crate::cuda::executor::test_fixtures::{setup_executor_harness, HarnessConfig};
    // Skip silently when no CUDA device/driver is available.
    let Some(mut exec) = create_executor() else {
        return;
    };
    let cfg = HarnessConfig::default();
    if setup_executor_harness(&mut exec, &cfg).is_err() {
        return;
    }
    // Push a constant vector of 0.5 through the GELU kernel.
    let host_input = vec![0.5f32; cfg.hidden_dim];
    let dev_input = GpuBuffer::from_host(&exec.context, &host_input).expect("input_buf");
    let dev_output = exec
        .gelu_async(&dev_input, cfg.hidden_dim as u32)
        .expect("expected value");
    exec.stream.synchronize().expect("synchronize");
    let mut host_output = vec![0.0f32; cfg.hidden_dim];
    dev_output
        .copy_to_host(&mut host_output)
        .expect("copy_to_host");
    // GELU(0.5) ≈ 0.345; loose tolerance covers tanh-approximation kernels.
    let got = host_output[0];
    assert!((got - 0.345).abs() < 0.02, "GELU(0.5) = {}", got);
}
#[test]
fn test_activations_with_harness_rope() {
    use crate::cuda::executor::test_fixtures::{setup_executor_harness, HarnessConfig};
    // Skip silently when no CUDA device/driver is available.
    let Some(mut exec) = create_executor() else {
        return;
    };
    let config = HarnessConfig::default();
    if setup_executor_harness(&mut exec, &config).is_err() {
        return;
    }
    let total_dim = config.num_heads * config.head_dim;
    let input = vec![1.0f32; total_dim];
    let buf_input = GpuBuffer::from_host(&exec.context, &input).expect("buf_input");
    let buf_output = GpuBuffer::new(&exec.context, total_dim).expect("buf_output");
    let result = exec.rope_into(
        &buf_input,
        &buf_output,
        0,
        config.num_heads as u32,
        config.head_dim as u32,
        exec.rope_theta,
    );
    assert!(result.is_ok());
    // A successful async launch does not prove the kernel ran cleanly:
    // synchronize so any kernel-runtime error surfaces inside this test,
    // matching the pattern used by the sibling activation tests.
    exec.stream.synchronize().expect("synchronize");
    // Validate the output actually landed and contains no NaN/Inf.
    // (At position 0 the rotation angle should be zero, so the output is
    // expected to match the input — TODO confirm arg 3 of rope_into is the
    // token position before tightening this to an exact identity check.)
    let mut output = vec![0.0f32; total_dim];
    buf_output.copy_to_host(&mut output).expect("copy_to_host");
    for (i, &v) in output.iter().enumerate() {
        assert!(v.is_finite(), "rope output[{}] = {} is not finite", i, v);
    }
}
#[test]
fn test_activations_with_harness_swiglu() {
    use crate::cuda::executor::test_fixtures::{setup_executor_harness, HarnessConfig};
    // Skip silently when no CUDA device/driver is available.
    let Some(mut exec) = create_executor() else {
        return;
    };
    let cfg = HarnessConfig::default();
    if setup_executor_harness(&mut exec, &cfg).is_err() {
        return;
    }
    let dim = cfg.intermediate_dim;
    // gate = 1.0, up = 2.0 everywhere.
    let gate_host = vec![1.0f32; dim];
    let up_host = vec![2.0f32; dim];
    let buf_gate = GpuBuffer::from_host(&exec.context, &gate_host).expect("buf_gate");
    let buf_up = GpuBuffer::from_host(&exec.context, &up_host).expect("buf_up");
    let output_buf = exec
        .fused_swiglu_gpu(&buf_gate, &buf_up, dim as u32)
        .expect("expected value");
    exec.stream.synchronize().expect("synchronize");
    let mut result = vec![0.0f32; dim];
    output_buf.copy_to_host(&mut result).expect("copy_to_host");
    // SwiGLU(1, 2) = silu(1) * 2 = 0.7311 * 2 ≈ 1.462.
    assert!((result[0] - 1.462).abs() < 0.05);
}
#[test]
fn test_activations_with_harness_residual_add() {
    use crate::cuda::executor::test_fixtures::{setup_executor_harness, HarnessConfig};
    // Skip silently when no CUDA device/driver is available.
    let Some(mut exec) = create_executor() else {
        return;
    };
    let cfg = HarnessConfig::default();
    if setup_executor_harness(&mut exec, &cfg).is_err() {
        return;
    }
    let n = cfg.hidden_dim;
    // add_residual_gpu accumulates in place: output += input.
    let acc_host = vec![1.0f32; n];
    let residual_host = vec![10.0f32; n];
    let buf_output = GpuBuffer::from_host(&exec.context, &acc_host).expect("buf_output");
    let buf_input = GpuBuffer::from_host(&exec.context, &residual_host).expect("buf_input");
    exec.add_residual_gpu(&buf_output, &buf_input, n as u32)
        .expect("expected value");
    exec.stream.synchronize().expect("synchronize");
    let mut result = vec![0.0f32; n];
    buf_output.copy_to_host(&mut result).expect("copy_to_host");
    // 1.0 + 10.0 = 11.0 in every slot; spot-check the first.
    assert!((result[0] - 11.0).abs() < 1e-5);
}
#[test]
fn test_activations_with_harness_large_tensor() {
    use crate::cuda::executor::test_fixtures::{setup_executor_harness, HarnessConfig};
    // Skip silently when no CUDA device/driver is available.
    let Some(mut exec) = create_executor() else {
        return;
    };
    // Struct-update syntax instead of default-then-mutate (clippy:
    // field_reassign_with_default); also untangles the statement that was
    // jammed onto one line with the harness setup.
    let config = HarnessConfig {
        hidden_dim: 4096,
        ..HarnessConfig::default()
    };
    if setup_executor_harness(&mut exec, &config).is_err() {
        return;
    }
    // Ramp spanning negative and positive values around zero.
    let input: Vec<f32> = (0..config.hidden_dim)
        .map(|i| (i as f32 - 2048.0) / 1000.0)
        .collect();
    let input_buf = GpuBuffer::from_host(&exec.context, &input).expect("input_buf");
    let output_buf = exec
        .silu_gpu(&input_buf, config.hidden_dim as u32)
        .expect("output_buf");
    exec.stream.synchronize().expect("synchronize");
    let mut output = vec![0.0f32; config.hidden_dim];
    output_buf.copy_to_host(&mut output).expect("copy_to_host");
    // SiLU is smooth and bounded on this range: every element must be finite.
    for (i, &v) in output.iter().enumerate() {
        assert!(v.is_finite(), "output[{}] = {} is not finite", i, v);
    }
}
#[test]
fn test_activations_with_harness_elementwise_mul() {
    use crate::cuda::executor::test_fixtures::{setup_executor_harness, HarnessConfig};
    // Skip silently when no CUDA device/driver is available.
    let Some(mut exec) = create_executor() else {
        return;
    };
    let cfg = HarnessConfig::default();
    if setup_executor_harness(&mut exec, &cfg).is_err() {
        return;
    }
    let n = cfg.hidden_dim;
    let lhs = vec![2.0f32; n];
    let rhs = vec![3.0f32; n];
    let buf_a = GpuBuffer::from_host(&exec.context, &lhs).expect("buf_a");
    let buf_b = GpuBuffer::from_host(&exec.context, &rhs).expect("buf_b");
    let output_buf = exec
        .elementwise_mul_gpu(&buf_a, &buf_b, n as u32)
        .expect("expected value");
    exec.stream.synchronize().expect("synchronize");
    let mut product = vec![0.0f32; n];
    output_buf.copy_to_host(&mut product).expect("copy_to_host");
    // 2.0 * 3.0 = 6.0 elementwise; spot-check the first slot.
    assert!((product[0] - 6.0).abs() < 1e-5);
}
#[test]
fn test_qwen009_kernel_type_generation() {
    use crate::cuda::kernels::{CudaKernels, KernelType};
    let kernels = CudaKernels::new();
    // Qwen-sized dims: hidden = 2048, intermediate = 5632.
    let kernel_type = KernelType::FusedRmsNormGateUpSwigluQ4K {
        k: 2048,
        n: 5632,
        epsilon: 1e-6,
    };
    // PTX generation is pure host-side codegen; no GPU required.
    let ptx = kernels.generate_ptx(&kernel_type);
    assert!(!ptx.is_empty(), "PTX should not be empty");
    let looks_like_ptx = ptx.contains(".version") || ptx.contains(".entry");
    assert!(
        looks_like_ptx,
        "PTX should contain valid PTX assembly directives"
    );
    assert_eq!(
        kernels.kernel_name(&kernel_type),
        "fused_rmsnorm_gate_up_swiglu_q4k"
    );
}
#[test]
fn test_qwen009_fused_ffn_rmsnorm_swiglu_q4k_basic() {
    // Skip when no CUDA device/driver is available.
    let Some(mut exec) = create_executor() else {
        eprintln!("CUDA init failed - check driver");
        return;
    };
    let hidden_size = 256u32;
    let intermediate_size = 512u32;
    let epsilon = 1e-6f32;

    // Host-side fixtures: unit input/gamma and zeroed Q4_K weights.
    // Q4_K packs 256 values per super-block at 144 bytes each.
    let input = vec![1.0f32; hidden_size as usize];
    let gamma = vec![1.0f32; hidden_size as usize];
    let bytes_per_super_block = 144;
    let num_super_blocks_per_row = (hidden_size as usize + 255) / 256;
    let weight_bytes =
        intermediate_size as usize * num_super_blocks_per_row * bytes_per_super_block;
    let w_gate_data = vec![0u8; weight_bytes];
    let w_up_data = vec![0u8; weight_bytes];

    // Upload everything to the device.
    let input_buf = GpuBuffer::from_host(&exec.context, &input).expect("input_buf");
    let gamma_buf = GpuBuffer::from_host(&exec.context, &gamma).expect("gamma_buf");
    let w_gate_buf = GpuBuffer::from_host(&exec.context, &w_gate_data).expect("w_gate_buf");
    let w_up_buf = GpuBuffer::from_host(&exec.context, &w_up_data).expect("w_up_buf");
    let output_buf =
        GpuBuffer::<f32>::new(&exec.context, intermediate_size as usize).expect("output_buf");

    let result = exec.fused_ffn_rmsnorm_swiglu_q4k_into(
        &input_buf,
        &gamma_buf,
        w_gate_buf.as_ptr(),
        w_up_buf.as_ptr(),
        &output_buf,
        hidden_size,
        intermediate_size,
        epsilon,
    );
    assert!(result.is_ok(), "Kernel launch should succeed");
    exec.stream.synchronize().expect("synchronize");

    // Zeroed weights should still yield finite (not NaN/Inf) results.
    let mut output = vec![0.0f32; intermediate_size as usize];
    output_buf.copy_to_host(&mut output).expect("copy_to_host");
    for (i, &v) in output.iter().take(4).enumerate() {
        assert!(v.is_finite(), "output[{}] = {} should be finite", i, v);
    }
}
#[test]
fn test_qwen009_kernel_type_variants() {
    use crate::cuda::kernels::{CudaKernels, KernelType};
    let kernels = CudaKernels::new();
    // (hidden, intermediate, epsilon) triples for several Qwen model sizes.
    let test_cases = [
        (896, 4864, 1e-6),
        (1024, 2816, 1e-5),
        (2048, 5632, 1e-6),
    ];
    for (k, n, epsilon) in test_cases {
        let kernel_type = KernelType::FusedRmsNormGateUpSwigluQ4K { k, n, epsilon };
        // Codegen must succeed for every shape, and the entry-point
        // name is shape-independent.
        let ptx = kernels.generate_ptx(&kernel_type);
        assert!(
            !ptx.is_empty(),
            "PTX for k={}, n={} should not be empty",
            k,
            n
        );
        assert_eq!(
            kernels.kernel_name(&kernel_type),
            "fused_rmsnorm_gate_up_swiglu_q4k"
        );
    }
}
include!("activations_tests.rs");
}