/// Coverage test: `transformer_layer_batched` must return an error when the
/// positions slice has a different length than the batch argument.
///
/// The single and batched workspaces are initialised first; the call then
/// passes `4` together with a two-element positions slice. The test returns
/// early (and passes) when no CUDA device is available.
#[test]
#[serial]
fn test_cov030_transformer_layer_batched_positions_mismatch() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    exec.init_workspace(512, 256).expect("init workspace");
    exec.init_batched_workspace(512, 256, 4)
        .expect("init batched");

    // Host-side input: 512 * 4 elements, all set to 0.1.
    let host_input = vec![0.1f32; 512 * 4];
    let input = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let layer_weights = test_zeroed_layer_weights();

    // Only two positions are supplied although the batch argument is 4.
    let outcome = exec.transformer_layer_batched(
        &input,
        0,
        &layer_weights,
        4,
        &[0, 1],
        512,
        256,
        1e-5,
    );

    let rejected = outcome.is_err();
    assert!(rejected, "Should error when positions.len() != m");
}
/// Smoke test: `rope_into` succeeds for 8 heads of dimension 64 at position 5.
///
/// The input buffer is filled with ones and the output buffer has the same
/// element count. Only the `Ok`/`Err` status of the call is checked.
/// Returns early when no CUDA device is available.
#[test]
#[serial]
fn test_cov031_rope_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (num_heads, head_dim) = (8u32, 64u32);
    let (theta, position) = (10000.0f32, 5u32);
    let elem_count = (num_heads * head_dim) as usize;

    let ones = vec![1.0f32; elem_count];
    let input = GpuBuffer::from_host(exec.context(), &ones).expect("input");
    let output = GpuBuffer::new(exec.context(), elem_count).expect("output");

    let outcome = exec.rope_into(&input, &output, position, num_heads, head_dim, theta);
    assert!(
        outcome.is_ok(),
        "rope_into should succeed: {:?}",
        outcome.err()
    );
}
/// Coverage test: `rope_into` succeeds across a range of positions.
///
/// The same input/output buffers (8 heads x 64 dims, input filled with ones)
/// are reused for positions 0, 1, 10, 100 and 1000; every call must return
/// `Ok`. Returns early when no CUDA device is available.
#[test]
#[serial]
fn test_cov031_rope_into_positions() {
    // Positions exercised by this test, in call order.
    const POSITIONS: [u32; 5] = [0, 1, 10, 100, 1000];

    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (num_heads, head_dim) = (8u32, 64u32);
    let theta = 10000.0f32;
    let elem_count = (num_heads * head_dim) as usize;

    let ones = vec![1.0f32; elem_count];
    let input = GpuBuffer::from_host(exec.context(), &ones).expect("input");
    let output = GpuBuffer::new(exec.context(), elem_count).expect("output");

    for position in POSITIONS.iter().copied() {
        let outcome = exec.rope_into(&input, &output, position, num_heads, head_dim, theta);
        assert!(
            outcome.is_ok(),
            "rope_into at position {} failed: {:?}",
            position,
            outcome.err()
        );
    }
}
/// Smoke test: `rope_indirect_into` succeeds when the position is supplied
/// through a device buffer instead of a scalar argument.
///
/// Uses 8 heads of dimension 64, an input filled with ones, and a
/// one-element position buffer holding `5`. Only the call status is checked.
/// Returns early when no CUDA device is available.
#[test]
#[serial]
fn test_cov031_rope_indirect_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (num_heads, head_dim) = (8u32, 64u32);
    let theta = 10000.0f32;
    let elem_count = (num_heads * head_dim) as usize;

    let ones = vec![1.0f32; elem_count];
    let input = GpuBuffer::from_host(exec.context(), &ones).expect("input");
    let output = GpuBuffer::new(exec.context(), elem_count).expect("output");

    // The position is uploaded to the device rather than passed by value.
    let host_position = [5u32];
    let position_buf = GpuBuffer::from_host(exec.context(), &host_position).expect("position buf");

    let outcome =
        exec.rope_indirect_into(&input, &output, &position_buf, num_heads, head_dim, theta);
    assert!(
        outcome.is_ok(),
        "rope_indirect_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `rope_neox_into` succeeds for 8 heads of dimension 64 at
/// position 5.
///
/// The input is filled with ones; only the `Ok`/`Err` status of the call is
/// checked. Returns early when no CUDA device is available.
#[test]
#[serial]
fn test_cov031_rope_neox_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (num_heads, head_dim) = (8u32, 64u32);
    let (theta, position) = (10000.0f32, 5u32);
    let elem_count = (num_heads * head_dim) as usize;

    let ones = vec![1.0f32; elem_count];
    let input = GpuBuffer::from_host(exec.context(), &ones).expect("input");
    let output = GpuBuffer::new(exec.context(), elem_count).expect("output");

    let outcome = exec.rope_neox_into(&input, &output, position, num_heads, head_dim, theta);
    assert!(
        outcome.is_ok(),
        "rope_neox_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `rope_neox_indirect_into` succeeds when the position is read
/// from a device buffer.
///
/// Uses 8 heads of dimension 64, an input filled with ones, and a
/// one-element position buffer holding `10`. Only the call status is checked.
/// Returns early when no CUDA device is available.
#[test]
#[serial]
fn test_cov031_rope_neox_indirect_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (num_heads, head_dim) = (8u32, 64u32);
    let theta = 10000.0f32;
    let elem_count = (num_heads * head_dim) as usize;

    let ones = vec![1.0f32; elem_count];
    let input = GpuBuffer::from_host(exec.context(), &ones).expect("input");
    let output = GpuBuffer::new(exec.context(), elem_count).expect("output");

    // The position is uploaded to the device rather than passed by value.
    let host_position = [10u32];
    let position_buf = GpuBuffer::from_host(exec.context(), &host_position).expect("position buf");

    let outcome = exec.rope_neox_indirect_into(
        &input,
        &output,
        &position_buf,
        num_heads,
        head_dim,
        theta,
    );
    assert!(
        outcome.is_ok(),
        "rope_neox_indirect_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `fused_qkv_into` succeeds with `hidden_dim = 64` and
/// `kv_dim = 32`, and the executor can be synchronised afterwards.
///
/// Weight buffers are filled with 0.01 (`hidden_dim * hidden_dim` elements
/// for Q, `hidden_dim * kv_dim` for K and V) and the input with 0.1. Output
/// values are not inspected. Returns early when no CUDA device is available.
#[test]
#[serial]
fn test_cov031_fused_qkv_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (hidden_dim, kv_dim) = (64u32, 32u32);
    let q_weight_len = (hidden_dim * hidden_dim) as usize;
    let kv_weight_len = (hidden_dim * kv_dim) as usize;

    // Host-side weights, uploaded in Q, K, V order.
    let host_w_q = vec![0.01f32; q_weight_len];
    let host_w_k = vec![0.01f32; kv_weight_len];
    let host_w_v = vec![0.01f32; kv_weight_len];
    let w_q = GpuBuffer::from_host(exec.context(), &host_w_q).expect("w_q");
    let w_k = GpuBuffer::from_host(exec.context(), &host_w_k).expect("w_k");
    let w_v = GpuBuffer::from_host(exec.context(), &host_w_v).expect("w_v");

    let host_input = vec![0.1f32; hidden_dim as usize];
    let input = GpuBuffer::from_host(exec.context(), &host_input).expect("input");

    let q_out = GpuBuffer::new(exec.context(), hidden_dim as usize).expect("q_out");
    let k_out = GpuBuffer::new(exec.context(), kv_dim as usize).expect("k_out");
    let v_out = GpuBuffer::new(exec.context(), kv_dim as usize).expect("v_out");

    let outcome = exec.fused_qkv_into(
        &input, &w_q, &w_k, &w_v, &q_out, &k_out, &v_out, hidden_dim, kv_dim,
    );
    assert!(
        outcome.is_ok(),
        "fused_qkv_into should succeed: {:?}",
        outcome.err()
    );

    exec.synchronize().expect("sync");
}
/// Smoke test: `fused_gate_up_into` succeeds with `hidden_dim = 64` and
/// `intermediate_dim = 128`, and the executor can be synchronised afterwards.
///
/// Both weight buffers hold `hidden_dim * intermediate_dim` elements of 0.01
/// and the input holds `hidden_dim` elements of 0.1. Output values are not
/// inspected. Returns early when no CUDA device is available.
#[test]
#[serial]
fn test_cov031_fused_gate_up_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (hidden_dim, intermediate_dim) = (64u32, 128u32);
    let weight_len = (hidden_dim * intermediate_dim) as usize;

    // Gate and up projections share the same element count.
    let host_w_gate = vec![0.01f32; weight_len];
    let host_w_up = vec![0.01f32; weight_len];
    let w_gate = GpuBuffer::from_host(exec.context(), &host_w_gate).expect("w_gate");
    let w_up = GpuBuffer::from_host(exec.context(), &host_w_up).expect("w_up");

    let host_input = vec![0.1f32; hidden_dim as usize];
    let input = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output = GpuBuffer::new(exec.context(), intermediate_dim as usize).expect("output");

    let outcome =
        exec.fused_gate_up_into(&input, &w_gate, &w_up, &output, hidden_dim, intermediate_dim);
    assert!(
        outcome.is_ok(),
        "fused_gate_up_into should succeed: {:?}",
        outcome.err()
    );

    exec.synchronize().expect("sync");
}
/// Smoke test: `incremental_attention_into` succeeds for layer index 0 after
/// the GPU KV cache has been initialised.
///
/// Uses 4 query heads, 4 KV heads and a head dimension of 8; the KV cache is
/// initialised with `init_kv_cache_gpu(1, 4, 4, 8, 16)`. Q/K/V are filled
/// with 0.1 and only the call status is checked. Returns early when no CUDA
/// device is available.
#[test]
#[serial]
fn test_cov031_incremental_attention_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (num_heads, num_kv_heads, head_dim) = (4usize, 4usize, 8usize);
    let q_len = num_heads * head_dim;
    let kv_len = num_kv_heads * head_dim;

    exec.init_kv_cache_gpu(1, num_heads, num_kv_heads, head_dim, 16)
        .expect("init kv");

    let host_q = vec![0.1f32; q_len];
    let host_k = vec![0.1f32; kv_len];
    let host_v = vec![0.1f32; kv_len];
    let q_buf = GpuBuffer::from_host(exec.context(), &host_q).expect("q_buf");
    let k_buf = GpuBuffer::from_host(exec.context(), &host_k).expect("k_buf");
    let v_buf = GpuBuffer::from_host(exec.context(), &host_v).expect("v_buf");
    let out_buf = GpuBuffer::new(exec.context(), q_len).expect("out_buf");

    let outcome = exec.incremental_attention_into(0, &q_buf, &k_buf, &v_buf, &out_buf);
    assert!(
        outcome.is_ok(),
        "incremental_attention_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `batched_incremental_attention_into` succeeds for a batch of
/// two once both the regular and the batched GPU KV caches are initialised.
///
/// Uses 4 query heads, 4 KV heads and a head dimension of 8. Q/K/V buffers
/// hold one row per batch entry filled with 0.1, and every position is 0.
/// Only the call status is checked. Returns early when no CUDA device is
/// available.
#[test]
#[serial]
fn test_cov031_batched_incremental_attention_into() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (num_heads, num_kv_heads, head_dim) = (4usize, 4usize, 8usize);
    let batch_size = 2usize;
    let q_len = num_heads * head_dim;
    let kv_len = num_kv_heads * head_dim;

    // The batched cache is set up after the regular KV cache.
    exec.init_kv_cache_gpu(1, num_heads, num_kv_heads, head_dim, 16)
        .expect("init kv");
    exec.init_batched_kv_cache_gpu(1, batch_size)
        .expect("init batched kv");

    let host_q = vec![0.1f32; q_len * batch_size];
    let host_k = vec![0.1f32; kv_len * batch_size];
    let host_v = vec![0.1f32; kv_len * batch_size];
    let q_buf = GpuBuffer::from_host(exec.context(), &host_q).expect("q_buf");
    let k_buf = GpuBuffer::from_host(exec.context(), &host_k).expect("k_buf");
    let v_buf = GpuBuffer::from_host(exec.context(), &host_v).expect("v_buf");
    let out_buf = GpuBuffer::new(exec.context(), q_len * batch_size).expect("out_buf");

    let positions = vec![0u32; batch_size];
    let outcome = exec.batched_incremental_attention_into(
        0, &q_buf, &k_buf, &v_buf, &out_buf, batch_size, &positions,
    );
    assert!(
        outcome.is_ok(),
        "batched_incremental_attention_into should succeed: {:?}",
        outcome.err()
    );
}
/// Negative test: `flash_decoding_attention_into` must return an error when
/// `init_flash_decoding` has not been called on the executor.
///
/// All buffers hold `16 * 64` elements (Q/K/V filled with 0.1) and a single
/// position of 0 is passed with a batch argument of 1. Returns early when no
/// CUDA device is available.
#[test]
#[serial]
fn test_cov031_flash_decoding_not_init() {
    if !CudaExecutor::is_available() {
        return;
    }

    // No flash-decoding initialisation is performed on this executor.
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (seq_len, head_dim) = (16usize, 64usize);
    let elem_count = seq_len * head_dim;

    let host_q = vec![0.1f32; elem_count];
    let host_k = vec![0.1f32; elem_count];
    let host_v = vec![0.1f32; elem_count];
    let q_buf = GpuBuffer::from_host(exec.context(), &host_q).expect("q_buf");
    let k_buf = GpuBuffer::from_host(exec.context(), &host_k).expect("k_buf");
    let v_buf = GpuBuffer::from_host(exec.context(), &host_v).expect("v_buf");
    let out_buf = GpuBuffer::new(exec.context(), elem_count).expect("out_buf");

    let positions = vec![0u32; 1];
    let outcome =
        exec.flash_decoding_attention_into(0, &q_buf, &k_buf, &v_buf, &out_buf, 1, &positions);

    let rejected = outcome.is_err();
    assert!(rejected, "flash_decoding should error without init");
}
/// Coverage test: `init_flash_decoding` succeeds for 8 heads, head dimension
/// 64, max sequence length 128 and batch size 1, after which
/// `flash_decoding_attention_into` is invoked once.
///
/// Only the initialisation result is asserted. The attention call's result
/// is discarded, so this test passes whether that call returns `Ok` or `Err`.
/// Returns early when no CUDA device is available.
#[test]
#[serial]
fn test_cov031_flash_decoding_with_init() {
    if !CudaExecutor::is_available() {
        return;
    }

    let mut exec = CudaExecutor::new(0).expect("CUDA executor");

    let (num_heads, head_dim) = (8usize, 64usize);
    let (max_seq_len, batch_size) = (128usize, 1usize);

    let init_outcome = exec.init_flash_decoding(num_heads, head_dim, max_seq_len, batch_size);
    assert!(
        init_outcome.is_ok(),
        "init_flash_decoding should succeed: {:?}",
        init_outcome.err()
    );

    // Q, K, V and the output all use num_heads * head_dim elements.
    let q_len = num_heads * head_dim;
    let host_q = vec![0.1f32; q_len];
    let host_k = vec![0.1f32; q_len];
    let host_v = vec![0.1f32; q_len];
    let q_buf = GpuBuffer::from_host(exec.context(), &host_q).expect("q_buf");
    let k_buf = GpuBuffer::from_host(exec.context(), &host_k).expect("k_buf");
    let v_buf = GpuBuffer::from_host(exec.context(), &host_v).expect("v_buf");
    let out_buf = GpuBuffer::new(exec.context(), q_len).expect("out_buf");

    let positions = vec![0u32; batch_size];

    // The result is intentionally ignored; see the doc comment above.
    let _ = exec.flash_decoding_attention_into(
        0, &q_buf, &k_buf, &v_buf, &out_buf, batch_size, &positions,
    );
}