use super::*;
use crate::cuda::types::{IndexedLayerWeights, ValidatedLayerWeights, WeightQuantType};
use serial_test::serial;
/// Builds a `ValidatedLayerWeights` whose pointers and lengths are all zero
/// and whose quantized tensors are uniformly tagged `Q4K`.
///
/// Serves as a minimal structurally-valid placeholder for tests that need a
/// layer-weight descriptor without any real device allocations.
fn test_zeroed_layer_weights() -> ValidatedLayerWeights {
    let zeroed = IndexedLayerWeights {
        // Attention projection weights (ptr / len / quant-type triples).
        attn_q_ptr: 0,
        attn_q_len: 0,
        attn_q_qtype: WeightQuantType::Q4K,
        attn_k_ptr: 0,
        attn_k_len: 0,
        attn_k_qtype: WeightQuantType::Q4K,
        attn_v_ptr: 0,
        attn_v_len: 0,
        attn_v_qtype: WeightQuantType::Q4K,
        attn_output_ptr: 0,
        attn_output_len: 0,
        attn_output_qtype: WeightQuantType::Q4K,
        // Feed-forward projection weights.
        ffn_gate_ptr: 0,
        ffn_gate_len: 0,
        ffn_gate_qtype: WeightQuantType::Q4K,
        ffn_up_ptr: 0,
        ffn_up_len: 0,
        ffn_up_qtype: WeightQuantType::Q4K,
        ffn_down_ptr: 0,
        ffn_down_len: 0,
        ffn_down_qtype: WeightQuantType::Q4K,
        // Layer norms (no quant type — presumably stored unquantized).
        attn_norm_ptr: 0,
        attn_norm_len: 0,
        ffn_norm_ptr: 0,
        ffn_norm_len: 0,
        // Optional QKV biases.
        attn_q_bias_ptr: 0,
        attn_q_bias_len: 0,
        attn_k_bias_ptr: 0,
        attn_k_bias_len: 0,
        attn_v_bias_ptr: 0,
        attn_v_bias_len: 0,
        // Optional per-head Q/K norms.
        attn_q_norm_ptr: 0,
        attn_q_norm_len: 0,
        attn_k_norm_ptr: 0,
        attn_k_norm_len: 0,
    };
    // `new_unchecked` bypasses validation, which is fine for a zeroed fixture.
    ValidatedLayerWeights::new_unchecked(zeroed)
}
/// Smoke test: `coalesced_q4k_gemv_into` accepts a zeroed Q4_K weight matrix
/// and correctly-sized GPU buffers without returning an error.
#[test]
#[serial]
fn test_cov026_coalesced_q4k_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // 144 bytes per Q4_K super-block of 256 values; k == 256 → one block per row.
    let weight_data = vec![0u8; n as usize * 144];
    exec.load_quantized_weights("test_coalesced", &weight_data)
        .expect("load weights");
    let ptr = exec
        .get_quantized_weight_ptr("test_coalesced")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.coalesced_q4k_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "coalesced_q4k_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `vectorized_q4k_gemv_into` runs cleanly on zeroed Q4_K weights.
#[test]
#[serial]
fn test_cov026_vectorized_q4k_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // 144 bytes per Q4_K super-block of 256 values; k == 256 → one block per row.
    let weight_data = vec![0u8; n as usize * 144];
    exec.load_quantized_weights("test_vectorized", &weight_data)
        .expect("load weights");
    let ptr = exec
        .get_quantized_weight_ptr("test_vectorized")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.vectorized_q4k_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "vectorized_q4k_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `dp4a_q4k_gemv_into` runs cleanly on zeroed Q4_K weights.
#[test]
#[serial]
fn test_cov026_dp4a_q4k_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // 144 bytes per Q4_K super-block of 256 values; k == 256 → one block per row.
    let weight_data = vec![0u8; n as usize * 144];
    exec.load_quantized_weights("test_dp4a", &weight_data)
        .expect("load weights");
    let ptr = exec
        .get_quantized_weight_ptr("test_dp4a")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.dp4a_q4k_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "dp4a_q4k_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `coalesced_q6k_gemv_into` runs cleanly on zeroed Q6_K weights.
#[test]
#[serial]
fn test_cov026_coalesced_q6k_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // 210 bytes per Q6_K super-block of 256 values; k == 256 → one block per row.
    let weight_data = vec![0u8; n as usize * 210];
    // Quant-type code 14 — presumably the GGML Q6_K id; matched by the kernel under test.
    exec.load_quantized_weights_with_type("test_coalesced_q6k", &weight_data, 14)
        .expect("load");
    let ptr = exec
        .get_quantized_weight_ptr("test_coalesced_q6k")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.coalesced_q6k_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "coalesced_q6k_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `q4k_gemv_into` runs cleanly on zeroed Q4_K weights.
#[test]
#[serial]
fn test_cov026_q4k_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // 144 bytes per Q4_K super-block of 256 values; k == 256 → one block per row.
    let weight_data = vec![0u8; n as usize * 144];
    exec.load_quantized_weights("test_q4k_into", &weight_data)
        .expect("load");
    let ptr = exec
        .get_quantized_weight_ptr("test_q4k_into")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.q4k_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "q4k_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `q6k_gemv_into` runs cleanly on zeroed Q6_K weights.
#[test]
#[serial]
fn test_cov026_q6k_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // 210 bytes per Q6_K super-block of 256 values; k == 256 → one block per row.
    let weight_data = vec![0u8; n as usize * 210];
    // Quant-type code 14 — presumably the GGML Q6_K id.
    exec.load_quantized_weights_with_type("test_q6k_into", &weight_data, 14)
        .expect("load");
    let ptr = exec
        .get_quantized_weight_ptr("test_q6k_into")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.q6k_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "q6k_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `q5k_gemv_into` runs cleanly on zeroed Q5_K weights.
#[test]
#[serial]
fn test_cov026_q5k_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // 176 bytes per Q5_K super-block of 256 values; k == 256 → one block per row.
    let weight_data = vec![0u8; n as usize * 176];
    // Quant-type code 13 — presumably the GGML Q5_K id.
    exec.load_quantized_weights_with_type("test_q5k_into", &weight_data, 13)
        .expect("load");
    let ptr = exec
        .get_quantized_weight_ptr("test_q5k_into")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.q5k_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "q5k_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `q8_0_gemv_into` runs cleanly on zeroed Q8_0 weights.
#[test]
#[serial]
fn test_cov026_q8_0_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // Q8_0 packs 32 values into 34 bytes, so each row needs (k / 32) blocks.
    let weight_data = vec![0u8; n as usize * (k as usize / 32) * 34];
    // Quant-type code 8 — presumably the GGML Q8_0 id.
    exec.load_quantized_weights_with_type("test_q8_0_into", &weight_data, 8)
        .expect("load");
    let ptr = exec
        .get_quantized_weight_ptr("test_q8_0_into")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.q8_0_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "q8_0_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `q4_0_gemv_into` runs cleanly on zeroed Q4_0 weights.
/// Currently ignored pending a kernel-side PTX fix.
#[test]
#[serial]
#[ignore = "PTX compilation issue CUDA_ERROR_INVALID_PTX - needs kernel fix"]
fn test_cov026_q4_0_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // Q4_0 packs 32 values into 18 bytes, so each row needs (k / 32) blocks.
    let weight_data = vec![0u8; n as usize * (k as usize / 32) * 18];
    // Quant-type code 2 — presumably the GGML Q4_0 id.
    exec.load_quantized_weights_with_type("test_q4_0_into", &weight_data, 2)
        .expect("load");
    let ptr = exec
        .get_quantized_weight_ptr("test_q4_0_into")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.q4_0_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "q4_0_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
/// Smoke test: `q4_1_gemv_into` runs cleanly on zeroed Q4_1 weights.
#[test]
#[serial]
fn test_cov026_q4_1_gemv_into_basic() {
    // Skip gracefully on machines with no CUDA device.
    if !CudaExecutor::is_available() {
        return;
    }
    let mut exec = CudaExecutor::new(0).expect("CUDA executor");
    let (n, k) = (32u32, 256u32);
    // Q4_1 packs 32 values into 20 bytes, so each row needs (k / 32) blocks.
    let weight_data = vec![0u8; n as usize * (k as usize / 32) * 20];
    // Quant-type code 3 — presumably the GGML Q4_1 id.
    exec.load_quantized_weights_with_type("test_q4_1_into", &weight_data, 3)
        .expect("load");
    let ptr = exec
        .get_quantized_weight_ptr("test_q4_1_into")
        .expect("get ptr");
    let host_input = vec![0.1f32; k as usize];
    let input_buf = GpuBuffer::from_host(exec.context(), &host_input).expect("input");
    let output_buf = GpuBuffer::new(exec.context(), n as usize).expect("output");
    let outcome = exec.q4_1_gemv_into(ptr, &input_buf, &output_buf, n, k);
    assert!(
        outcome.is_ok(),
        "q4_1_gemv_into should succeed: {:?}",
        outcome.err()
    );
}
// Additional coverage suites textually included into this module so they share
// the `use super::*` imports and helpers defined above.
include!("tests_cov026.rs");
include!("tests_cov028_fused.rs");
include!("tests_cov029_workspace.rs");
include!("tests_cov030_transformer.rs");