use crate::driver::{CudaContext, CudaStream};
use crate::memory::resident::{reset_transfer_counters, GpuResidentTensor};
macro_rules! cuda_ctx {
() => {
match CudaContext::new(0) {
Ok(ctx) => ctx,
Err(e) => {
eprintln!("Skipping CUDA test: {:?}", e);
return;
}
}
};
}
#[test]
#[ignore = "CUDA kernel issue - investigate after coverage target reached"]
fn test_ops_linear_without_bias() {
let ctx = cuda_ctx!();
let batch_size = 2u32;
let in_features = 4u32;
let out_features = 3u32;
let input_data = vec![1.0f32; (batch_size * in_features) as usize];
let weight_data = vec![1.0f32; (in_features * out_features) as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let output = input.linear(&ctx, &weight, None, batch_size, in_features, out_features).unwrap();
assert_eq!(output.len(), (batch_size * out_features) as usize);
}
#[test]
fn test_ops_linear_with_bias() {
let ctx = cuda_ctx!();
let batch_size = 2u32;
let in_features = 4u32;
let out_features = 3u32;
let input_data = vec![1.0f32; (batch_size * in_features) as usize];
let weight_data = vec![1.0f32; (in_features * out_features) as usize];
let bias_data = vec![0.5f32; out_features as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let bias = GpuResidentTensor::from_host(&ctx, &bias_data).unwrap();
let output =
input.linear(&ctx, &weight, Some(&bias), batch_size, in_features, out_features).unwrap();
assert_eq!(output.len(), (batch_size * out_features) as usize);
}
#[test]
fn test_ops_fused_linear_gelu() {
let ctx = cuda_ctx!();
let batch_size = 4u32;
let in_features = 8u32;
let out_features = 4u32;
let input_data = vec![1.0f32; (batch_size * in_features) as usize];
let weight_data = vec![0.1f32; (in_features * out_features) as usize];
let bias_data = vec![0.0f32; out_features as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let bias = GpuResidentTensor::from_host(&ctx, &bias_data).unwrap();
let mut output = input
.fused_linear_gelu(&ctx, &weight, &bias, batch_size, in_features, out_features)
.unwrap();
assert_eq!(output.len(), (batch_size * out_features) as usize);
let result = output.to_host().unwrap();
assert!(result.iter().all(|v| v.is_finite()));
}
#[test]
fn test_ops_conv1d() {
let ctx = cuda_ctx!();
let seq_len = 100u32;
let in_channels = 1u32;
let out_channels = 32u32;
let kernel_size = 3u32;
let stride = 1u32;
let padding = 1u32;
let input_data = vec![1.0f32; (seq_len * in_channels) as usize];
let weight_data = vec![0.1f32; (out_channels * in_channels * kernel_size) as usize];
let bias_data = vec![0.0f32; out_channels as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let bias = GpuResidentTensor::from_host(&ctx, &bias_data).unwrap();
let output = input
.conv1d(
&ctx,
&weight,
Some(&bias),
in_channels,
out_channels,
kernel_size,
stride,
padding,
seq_len,
)
.unwrap();
let expected_out_len = (seq_len + 2 * padding - kernel_size) / stride + 1;
assert_eq!(output.len(), (expected_out_len * out_channels) as usize);
}
#[test]
fn test_ops_conv1d_dimension_error() {
let ctx = cuda_ctx!();
let seq_len = 10u32;
let in_channels = 2u32;
let out_channels = 4u32;
let kernel_size = 3u32;
let input_data = vec![1.0f32; 15]; let weight_data = vec![0.1f32; (out_channels * in_channels * kernel_size) as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let result =
input.conv1d(&ctx, &weight, None, in_channels, out_channels, kernel_size, 1, 0, seq_len);
assert!(result.is_err());
}
#[test]
fn test_ops_interleaved_to_head_first() {
use crate::memory::resident::clear_kernel_cache;
clear_kernel_cache();
let ctx = cuda_ctx!();
let seq_len = 8u32;
let n_heads = 4u32;
let head_dim = 16u32;
let d_model = n_heads * head_dim;
let input_data: Vec<f32> = (0..(seq_len * d_model)).map(|i| i as f32 * 0.001).collect();
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let stream = CudaStream::new(&ctx).unwrap();
let output =
input.interleaved_to_head_first(&ctx, seq_len, n_heads, head_dim, &stream).unwrap();
stream.synchronize().unwrap();
assert_eq!(output.len(), (seq_len * d_model) as usize);
}
#[test]
fn test_ops_interleaved_dimension_error() {
let ctx = cuda_ctx!();
let input_data = vec![1.0f32; 100];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let stream = CudaStream::new(&ctx).unwrap();
let result = input.interleaved_to_head_first(&ctx, 8, 4, 16, &stream);
assert!(result.is_err());
}
#[test]
#[ignore = "Conv1d kernel does not handle null bias pointer - kernel bug to fix"]
fn test_ops_conv1d_without_bias() {
use crate::memory::resident::clear_kernel_cache;
clear_kernel_cache();
let ctx = cuda_ctx!();
let seq_len = 50u32;
let in_channels = 1u32;
let out_channels = 16u32;
let kernel_size = 3u32;
let stride = 1u32;
let padding = 1u32;
let input_data = vec![1.0f32; (seq_len * in_channels) as usize];
let weight_data = vec![0.1f32; (out_channels * in_channels * kernel_size) as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let output = input
.conv1d(
&ctx,
&weight,
None,
in_channels,
out_channels,
kernel_size,
stride,
padding,
seq_len,
)
.unwrap();
let expected_out_len = (seq_len + 2 * padding - kernel_size) / stride + 1;
assert_eq!(output.len(), (expected_out_len * out_channels) as usize);
}
#[test]
fn test_ops_conv1d_weight_dimension_error() {
use crate::memory::resident::clear_kernel_cache;
clear_kernel_cache();
let ctx = cuda_ctx!();
let seq_len = 20u32;
let in_channels = 2u32;
let out_channels = 4u32;
let kernel_size = 3u32;
let input_data = vec![1.0f32; (seq_len * in_channels) as usize];
let weight_data = vec![0.1f32; 10];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let result =
input.conv1d(&ctx, &weight, None, in_channels, out_channels, kernel_size, 1, 0, seq_len);
assert!(result.is_err());
}
#[test]
fn test_ops_linear_with_debug() {
use crate::memory::resident::clear_kernel_cache;
clear_kernel_cache();
let ctx = cuda_ctx!();
std::env::set_var("WHISPER_DEBUG_LINEAR", "1");
let batch_size = 2u32;
let in_features = 8u32;
let out_features = 4u32;
let input_data = vec![1.0f32; (batch_size * in_features) as usize];
let weight_data = vec![0.1f32; (in_features * out_features) as usize];
let bias_data = vec![0.0f32; out_features as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let bias = GpuResidentTensor::from_host(&ctx, &bias_data).unwrap();
let output =
input.linear(&ctx, &weight, Some(&bias), batch_size, in_features, out_features).unwrap();
assert_eq!(output.len(), (batch_size * out_features) as usize);
std::env::remove_var("WHISPER_DEBUG_LINEAR");
}
#[test]
fn test_ops_linear_without_bias_and_debug() {
use crate::memory::resident::clear_kernel_cache;
clear_kernel_cache();
let ctx = cuda_ctx!();
std::env::set_var("WHISPER_DEBUG_LINEAR", "1");
let batch_size = 2u32;
let in_features = 8u32;
let out_features = 4u32;
let input_data = vec![0.5f32; (batch_size * in_features) as usize];
let weight_data = vec![0.1f32; (in_features * out_features) as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let output = input.linear(&ctx, &weight, None, batch_size, in_features, out_features).unwrap();
assert_eq!(output.len(), (batch_size * out_features) as usize);
std::env::remove_var("WHISPER_DEBUG_LINEAR");
}
#[test]
fn test_ops_fused_linear_gelu_larger() {
use crate::memory::resident::clear_kernel_cache;
clear_kernel_cache();
let ctx = cuda_ctx!();
let batch_size = 16u32;
let in_features = 32u32;
let out_features = 16u32;
let input_data = vec![0.5f32; (batch_size * in_features) as usize];
let weight_data = vec![0.02f32; (in_features * out_features) as usize];
let bias_data = vec![0.0f32; out_features as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let bias = GpuResidentTensor::from_host(&ctx, &bias_data).unwrap();
let mut output = input
.fused_linear_gelu(&ctx, &weight, &bias, batch_size, in_features, out_features)
.unwrap();
assert_eq!(output.len(), (batch_size * out_features) as usize);
let result = output.to_host().unwrap();
assert!(result.iter().all(|v| v.is_finite()));
}
#[test]
fn test_ops_interleaved_to_head_first_larger() {
use crate::memory::resident::clear_kernel_cache;
clear_kernel_cache();
let ctx = cuda_ctx!();
let seq_len = 32u32;
let n_heads = 8u32;
let head_dim = 64u32;
let d_model = n_heads * head_dim;
let input_data: Vec<f32> = (0..(seq_len * d_model)).map(|i| i as f32 * 0.0001).collect();
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let stream = CudaStream::new(&ctx).unwrap();
let output =
input.interleaved_to_head_first(&ctx, seq_len, n_heads, head_dim, &stream).unwrap();
stream.synchronize().unwrap();
assert_eq!(output.len(), (seq_len * d_model) as usize);
}
#[test]
fn test_ops_conv1d_with_stride() {
use crate::memory::resident::clear_kernel_cache;
clear_kernel_cache();
let ctx = cuda_ctx!();
let seq_len = 100u32;
let in_channels = 1u32;
let out_channels = 8u32;
let kernel_size = 5u32;
let stride = 2u32;
let padding = 2u32;
let input_data = vec![1.0f32; (seq_len * in_channels) as usize];
let weight_data = vec![0.1f32; (out_channels * in_channels * kernel_size) as usize];
let bias_data = vec![0.5f32; out_channels as usize];
let input = GpuResidentTensor::from_host(&ctx, &input_data).unwrap();
let weight = GpuResidentTensor::from_host(&ctx, &weight_data).unwrap();
let bias = GpuResidentTensor::from_host(&ctx, &bias_data).unwrap();
let output = input
.conv1d(
&ctx,
&weight,
Some(&bias),
in_channels,
out_channels,
kernel_size,
stride,
padding,
seq_len,
)
.unwrap();
let expected_out_len = (seq_len + 2 * padding - kernel_size) / stride + 1;
assert_eq!(output.len(), (expected_out_len * out_channels) as usize);
}