#[cfg(test)]
#[cfg(feature = "cuda")]
mod tests {
use super::*;
use crate::cuda::executor::test_fixtures::{
generate_q4_0_weights, generate_q5_0_weights, generate_q8_0_weights,
};
/// Attempts to construct a [`CudaExecutor`] via `CudaExecutor::new(0)`.
///
/// Returns `None` when construction fails, which lets the tests in this module
/// return early instead of failing. (The argument `0` is presumably the device
/// ordinal — confirm against `CudaExecutor::new`.)
fn create_executor() -> Option<CudaExecutor> {
    let attempt = CudaExecutor::new(0);
    attempt.ok()
}
/// Smoke test for `q8_0_gemv_into` with n = 64, k = 256.
///
/// Only exercises the call: the returned value is held until the end of the
/// test and dropped unchecked, and the output buffer is never read back.
/// Returns early when no executor can be created.
#[test]
fn test_q8_0_gemv_into_basic() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (64usize, 256usize);

    // Host-side data: one 32-element block per 32 columns, for every row.
    let packed = generate_q8_0_weights(rows * (cols / 32));
    let host_input: Vec<f32> = (0..cols).map(|idx| idx as f32 * 0.01).collect();
    let host_output = vec![0.0f32; rows];

    // Device uploads, in the same order as before: weights, input, output.
    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.q8_0_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `q8_0_gemv_into` with n = 128, k = 512 and a constant input of 0.5.
///
/// Only exercises the call: the returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_q8_0_gemv_into_large() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (128usize, 512usize);

    let packed = generate_q8_0_weights(rows * (cols / 32));
    let host_input: Vec<f32> = vec![0.5f32; cols];
    let host_output = vec![0.0f32; rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.q8_0_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `q5_0_gemv_into` with n = 64, k = 256 and a ramp input (`i * 0.01`).
///
/// Only exercises the call: the returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_q5_0_gemv_into_basic() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (64usize, 256usize);

    let packed = generate_q5_0_weights(rows * (cols / 32));
    let host_input: Vec<f32> = (0..cols).map(|idx| idx as f32 * 0.01).collect();
    let host_output = vec![0.0f32; rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.q5_0_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `q5_0_gemv_into` with n = 128, k = 512 and a constant input of 0.5.
///
/// Only exercises the call: the returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_q5_0_gemv_into_large() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (128usize, 512usize);

    let packed = generate_q5_0_weights(rows * (cols / 32));
    let host_input: Vec<f32> = vec![0.5f32; cols];
    let host_output = vec![0.0f32; rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.q5_0_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `q4_0_gemv_into` with n = 64, k = 256 and a ramp input (`i * 0.01`).
///
/// Only exercises the call: the returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_q4_0_gemv_into_basic() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (64usize, 256usize);

    let packed = generate_q4_0_weights(rows * (cols / 32));
    let host_input: Vec<f32> = (0..cols).map(|idx| idx as f32 * 0.01).collect();
    let host_output = vec![0.0f32; rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.q4_0_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `q4_0_gemv_into` with a single output row (n = 1, k = 256)
/// and an all-ones input.
///
/// Only exercises the call: the returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_q4_0_gemv_into_single_row() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (1usize, 256usize);

    // A single row needs only `cols / 32` blocks.
    let packed = generate_q4_0_weights(cols / 32);
    let host_input: Vec<f32> = vec![1.0f32; cols];
    let host_output = vec![0.0f32; rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.q4_0_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `q4_1_gemv_into` with n = 64, k = 256 and a ramp input (`i * 0.01`).
///
/// The weight buffer is sized for the Q4_1 block layout (20 bytes per 32
/// weights: f16 scale, f16 min, 16 packed-nibble bytes). The previous version
/// reused `generate_q4_0_weights`, which presumably emits 18-byte Q4_0 blocks;
/// that left the device buffer too small for a Q4_1 kernel and in the wrong
/// layout. A zero-filled buffer is a valid Q4_1 encoding (scale = min = 0).
///
/// Only exercises the call: the returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_q4_1_gemv_into_basic() {
    /// Bytes per Q4_1 block: 2 (f16 scale) + 2 (f16 min) + 16 (packed nibbles).
    const Q4_1_BLOCK_BYTES: usize = 20;

    let Some(mut exec) = create_executor() else {
        return;
    };
    let k = 256u32;
    let n = 64u32;
    let blocks = (n as usize) * (k as usize / 32);
    let weights = vec![0u8; blocks * Q4_1_BLOCK_BYTES];
    let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
    let input: Vec<f32> = (0..k as usize).map(|i| (i as f32) * 0.01).collect();
    let output = vec![0.0f32; n as usize];
    let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
    let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

    // The outcome is intentionally not asserted; this test only covers the launch path.
    let _launch_status =
        exec.q4_1_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
}
/// Smoke test for `q5k_gemv_into` with n = 64, k = 256 and a ramp input (`i * 0.01`).
///
/// Weights are a zero-filled buffer of 176 bytes per 256-element superblock.
/// Only exercises the call: the returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_q5k_gemv_into_basic() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (64usize, 256usize);

    let superblock_count = rows * (cols / 256);
    let packed = vec![0u8; superblock_count * 176];
    let host_input: Vec<f32> = (0..cols).map(|idx| idx as f32 * 0.01).collect();
    let host_output = vec![0.0f32; rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.q5k_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `q6k_gemv_into` with n = 64, k = 256 and a ramp input (`i * 0.01`).
///
/// Calls `init_workspace(k, k)` first and panics if that fails. Weights are a
/// zero-filled buffer of 210 bytes per 256-element superblock. The kernel's
/// returned value is dropped unchecked and the output buffer is never read back.
/// Returns early when no executor can be created.
#[test]
fn test_q6k_gemv_into_basic() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (64usize, 256usize);

    // Workspace setup happens before any device buffers are created.
    executor
        .init_workspace(cols, cols)
        .expect("init_workspace");

    let superblock_count = rows * (cols / 256);
    let packed = vec![0u8; superblock_count * 210];
    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();

    let host_input: Vec<f32> = (0..cols).map(|idx| idx as f32 * 0.01).collect();
    let host_output = vec![0.0f32; rows];
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.q6k_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `coalesced_q6k_gemv_into` with n = 64, k = 256 and a ramp
/// input (`i * 0.01`).
///
/// Weights are a zero-filled buffer of 210 bytes per 256-element superblock.
/// Only exercises the call: the returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_coalesced_q6k_gemv_into_basic() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (rows, cols) = (64usize, 256usize);

    let superblock_count = rows * (cols / 256);
    let packed = vec![0u8; superblock_count * 210];
    let host_input: Vec<f32> = (0..cols).map(|idx| idx as f32 * 0.01).collect();
    let host_output = vec![0.0f32; rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.coalesced_q6k_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `batched_q6k_gemv_into` with m = 4, n = 64, k = 256.
///
/// The input holds `m * k` ramp values (`i * 0.001`) and the output holds
/// `m * n` zeros. Weights are a zero-filled buffer of 210 bytes per
/// 256-element superblock. The returned value is dropped unchecked and the
/// output buffer is never read back. Returns early when no executor can be created.
#[test]
fn test_batched_q6k_gemv_into_basic() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (batch, rows, cols) = (4usize, 64usize, 256usize);

    let superblock_count = rows * (cols / 256);
    let packed = vec![0u8; superblock_count * 210];
    let host_input: Vec<f32> = (0..batch * cols).map(|idx| idx as f32 * 0.001).collect();
    let host_output = vec![0.0f32; batch * rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.batched_q6k_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        batch as u32,
        rows as u32,
        cols as u32,
    );
}
/// Smoke test for `batched_q6k_gemv_into` with m = 8, n = 32, k = 256 and a
/// constant input of 0.5.
///
/// Weights are a zero-filled buffer of 210 bytes per 256-element superblock.
/// The returned value is dropped unchecked and the output buffer is never read
/// back. Returns early when no executor can be created.
#[test]
fn test_batched_q6k_gemv_into_m8() {
    let Some(mut executor) = create_executor() else {
        return;
    };
    let (batch, rows, cols) = (8usize, 32usize, 256usize);

    let superblock_count = rows * (cols / 256);
    let packed = vec![0u8; superblock_count * 210];
    let host_input: Vec<f32> = vec![0.5f32; batch * cols];
    let host_output = vec![0.0f32; batch * rows];

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &host_input).unwrap();
    let output_buf = GpuBuffer::from_host(&executor.context, &host_output).unwrap();

    // The outcome is intentionally not asserted.
    let _launch_status = executor.batched_q6k_gemv_into(
        weight_buf.as_ptr(),
        &input_buf,
        &output_buf,
        batch as u32,
        rows as u32,
        cols as u32,
    );
}
/// Builds one deterministic row of Q4_0-encoded weights plus its dequantized form.
///
/// The row consists of `k / 32` blocks (any remainder of `k` is dropped by the
/// integer division). Each block is 18 bytes: a little-endian f16 scale of
/// `0.05 * (block + 1)` followed by 16 bytes, each packing one 4-bit value in
/// the low nibble and another in the high nibble. `seed` shifts the nibble pattern.
///
/// Returns `(raw_bytes, dequantized)`, where `dequantized` is whatever
/// `crate::quantize::dequantize_q4_0` yields for the bytes.
///
/// # Panics
/// Panics if `dequantize_q4_0` returns an error.
fn ggml_q4_0_row(k: usize, seed: u32) -> (Vec<u8>, Vec<f32>) {
    use half::f16;

    let block_count = k / 32;
    let mut packed = Vec::with_capacity(block_count * 18);
    for block in 0..block_count {
        let block_u32 = block as u32;
        let scale = 0.05 * (block as f32 + 1.0);
        packed.extend_from_slice(&f16::from_f32(scale).to_le_bytes());
        packed.extend((0..16u32).map(|j| {
            let low = ((block_u32 * 7 + j * 3 + seed) % 16) as u8;
            let high = ((block_u32 * 11 + j * 5 + seed + 1) % 16) as u8;
            low | (high << 4)
        }));
    }
    let dequantized = crate::quantize::dequantize_q4_0(&packed).unwrap();
    (packed, dequantized)
}
/// Builds one deterministic row of Q4_1-encoded weights plus its dequantized form.
///
/// The row consists of `k / 32` blocks (any remainder of `k` is dropped by the
/// integer division). Each block is 20 bytes: a little-endian f16 of
/// `0.05 * (block + 1)`, a second little-endian f16 of `-0.3 * (block + 1)`
/// (presumably the block scale and minimum respectively), then 16 bytes each
/// packing two 4-bit values (low nibble, high nibble). `seed` shifts the
/// nibble pattern.
///
/// Returns `(raw_bytes, dequantized)`, where `dequantized` is whatever
/// `crate::quantize::dequantize_q4_1` yields for the bytes.
///
/// # Panics
/// Panics if `dequantize_q4_1` returns an error.
fn ggml_q4_1_row(k: usize, seed: u32) -> (Vec<u8>, Vec<f32>) {
    use half::f16;

    let block_count = k / 32;
    let mut packed = Vec::with_capacity(block_count * 20);
    for block in 0..block_count {
        let block_u32 = block as u32;
        let scale = 0.05 * (block as f32 + 1.0);
        let offset = -0.3 * (block as f32 + 1.0);
        packed.extend_from_slice(&f16::from_f32(scale).to_le_bytes());
        packed.extend_from_slice(&f16::from_f32(offset).to_le_bytes());
        packed.extend((0..16u32).map(|j| {
            let low = ((block_u32 * 7 + j * 3 + seed) % 16) as u8;
            let high = ((block_u32 * 11 + j * 5 + seed + 1) % 16) as u8;
            low | (high << 4)
        }));
    }
    let dequantized = crate::quantize::dequantize_q4_1(&packed).unwrap();
    (packed, dequantized)
}
/// Builds one deterministic row of Q5_0-encoded weights plus its dequantized form.
///
/// The row consists of `k / 32` blocks (any remainder of `k` is dropped by the
/// integer division). Each block is 22 bytes: a little-endian f16 scale of
/// `0.05 * (block + 1)`, a little-endian `u32` derived from
/// `0x1357_9BDF * (block + 1) + seed` with wrapping arithmetic (presumably the
/// per-weight high bits), then 16 bytes each packing two 4-bit values
/// (low nibble, high nibble).
///
/// Returns `(raw_bytes, dequantized)`, where `dequantized` is whatever
/// `crate::quantize::dequantize_q5_0` yields for the bytes.
///
/// # Panics
/// Panics if `dequantize_q5_0` returns an error.
fn ggml_q5_0_row(k: usize, seed: u32) -> (Vec<u8>, Vec<f32>) {
    use half::f16;

    let block_count = k / 32;
    let mut packed = Vec::with_capacity(block_count * 22);
    for block in 0..block_count {
        let block_u32 = block as u32;
        let scale = 0.05 * (block as f32 + 1.0);
        packed.extend_from_slice(&f16::from_f32(scale).to_le_bytes());
        let high_bits: u32 = 0x1357_9BDF_u32
            .wrapping_mul(block_u32 + 1)
            .wrapping_add(seed);
        packed.extend_from_slice(&high_bits.to_le_bytes());
        packed.extend((0..16u32).map(|j| {
            let low = ((block_u32 * 7 + j * 3 + seed) % 16) as u8;
            let high = ((block_u32 * 11 + j * 5 + seed + 1) % 16) as u8;
            low | (high << 4)
        }));
    }
    let dequantized = crate::quantize::dequantize_q5_0(&packed).unwrap();
    (packed, dequantized)
}
/// Computes the dot product of `row` and `x` on the CPU in `f32`.
///
/// Elements are paired positionally; if the slices differ in length, the
/// extra elements of the longer one are ignored.
fn cpu_dot(row: &[f32], x: &[f32]) -> f32 {
    row.iter()
        .zip(x)
        .map(|(&weight, &activation)| weight * activation)
        .sum()
}
/// Checks `q4_0_gemv_into` against a CPU reference for n = 4, k = 256.
///
/// Each row is built by `ggml_q4_0_row` (seed = row + 1); the expected value is
/// `cpu_dot` of the dequantized row with the input vector. After the launch and
/// a stream synchronize, every GPU output must be within
/// `1e-2 * max(|expected|, 1.0)` of the reference. Returns early when no
/// executor can be created.
#[test]
fn test_q4_0_gemv_matches_cpu_ggml_layout() {
    const ROWS: usize = 4;
    const COLS: usize = 256;

    let Some(mut executor) = create_executor() else {
        return;
    };

    let x: Vec<f32> = (0..COLS).map(|idx| idx as f32 * 0.013 - 1.0).collect();

    // Concatenate the encoded rows and record the CPU dot product for each.
    let mut packed: Vec<u8> = Vec::new();
    let mut expected: Vec<f32> = Vec::with_capacity(ROWS);
    for row_index in 0..ROWS {
        let (bytes, dequantized) = ggml_q4_0_row(COLS, row_index as u32 + 1);
        packed.extend_from_slice(&bytes);
        expected.push(cpu_dot(&dequantized, &x));
    }

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &x).unwrap();
    let zeros = vec![0.0f32; ROWS];
    let output_buf = GpuBuffer::from_host(&executor.context, &zeros).unwrap();

    executor
        .q4_0_gemv_into(
            weight_buf.as_ptr(),
            &input_buf,
            &output_buf,
            ROWS as u32,
            COLS as u32,
        )
        .expect("q4_0 gemv");
    executor.stream.synchronize().unwrap();

    let mut got = vec![0.0f32; ROWS];
    output_buf.copy_to_host(&mut got).unwrap();

    for (r, (&gpu_value, &cpu_value)) in got.iter().zip(expected.iter()).enumerate() {
        let diff = (gpu_value - cpu_value).abs();
        let tol = 1e-2 * cpu_value.abs().max(1.0);
        assert!(
            diff <= tol,
            "Q4_0 row {r}: GPU {} vs CPU {} (diff {diff} > tol {tol})",
            gpu_value,
            cpu_value
        );
    }
}
/// Checks `q4_1_gemv_into` against a CPU reference for n = 4, k = 256.
///
/// Each row is built by `ggml_q4_1_row` (seed = row + 2); the expected value is
/// `cpu_dot` of the dequantized row with the input vector. After the launch and
/// a stream synchronize, every GPU output must be within
/// `1e-2 * max(|expected|, 1.0)` of the reference. Returns early when no
/// executor can be created.
#[test]
fn test_q4_1_gemv_matches_cpu_ggml_layout() {
    const ROWS: usize = 4;
    const COLS: usize = 256;

    let Some(mut executor) = create_executor() else {
        return;
    };

    let x: Vec<f32> = (0..COLS).map(|idx| idx as f32 * 0.009 - 0.5).collect();

    // Concatenate the encoded rows and record the CPU dot product for each.
    let mut packed: Vec<u8> = Vec::new();
    let mut expected: Vec<f32> = Vec::with_capacity(ROWS);
    for row_index in 0..ROWS {
        let (bytes, dequantized) = ggml_q4_1_row(COLS, row_index as u32 + 2);
        packed.extend_from_slice(&bytes);
        expected.push(cpu_dot(&dequantized, &x));
    }

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &x).unwrap();
    let zeros = vec![0.0f32; ROWS];
    let output_buf = GpuBuffer::from_host(&executor.context, &zeros).unwrap();

    executor
        .q4_1_gemv_into(
            weight_buf.as_ptr(),
            &input_buf,
            &output_buf,
            ROWS as u32,
            COLS as u32,
        )
        .expect("q4_1 gemv");
    executor.stream.synchronize().unwrap();

    let mut got = vec![0.0f32; ROWS];
    output_buf.copy_to_host(&mut got).unwrap();

    for (r, (&gpu_value, &cpu_value)) in got.iter().zip(expected.iter()).enumerate() {
        let diff = (gpu_value - cpu_value).abs();
        let tol = 1e-2 * cpu_value.abs().max(1.0);
        assert!(
            diff <= tol,
            "Q4_1 row {r}: GPU {} vs CPU {} (diff {diff} > tol {tol})",
            gpu_value,
            cpu_value
        );
    }
}
/// Checks `q5_0_gemv_into` against a CPU reference for n = 4, k = 256.
///
/// Each row is built by `ggml_q5_0_row` (seed = row + 3); the expected value is
/// `cpu_dot` of the dequantized row with the input vector. After the launch and
/// a stream synchronize, every GPU output must be within
/// `1e-2 * max(|expected|, 1.0)` of the reference. Returns early when no
/// executor can be created.
#[test]
fn test_q5_0_gemv_matches_cpu_ggml_layout() {
    const ROWS: usize = 4;
    const COLS: usize = 256;

    let Some(mut executor) = create_executor() else {
        return;
    };

    let x: Vec<f32> = (0..COLS).map(|idx| idx as f32 * 0.011 - 0.7).collect();

    // Concatenate the encoded rows and record the CPU dot product for each.
    let mut packed: Vec<u8> = Vec::new();
    let mut expected: Vec<f32> = Vec::with_capacity(ROWS);
    for row_index in 0..ROWS {
        let (bytes, dequantized) = ggml_q5_0_row(COLS, row_index as u32 + 3);
        packed.extend_from_slice(&bytes);
        expected.push(cpu_dot(&dequantized, &x));
    }

    let weight_buf = GpuBuffer::from_host(&executor.context, &packed).unwrap();
    let input_buf = GpuBuffer::from_host(&executor.context, &x).unwrap();
    let zeros = vec![0.0f32; ROWS];
    let output_buf = GpuBuffer::from_host(&executor.context, &zeros).unwrap();

    executor
        .q5_0_gemv_into(
            weight_buf.as_ptr(),
            &input_buf,
            &output_buf,
            ROWS as u32,
            COLS as u32,
        )
        .expect("q5_0 gemv");
    executor.stream.synchronize().unwrap();

    let mut got = vec![0.0f32; ROWS];
    output_buf.copy_to_host(&mut got).unwrap();

    for (r, (&gpu_value, &cpu_value)) in got.iter().zip(expected.iter()).enumerate() {
        let diff = (gpu_value - cpu_value).abs();
        let tol = 1e-2 * cpu_value.abs().max(1.0);
        assert!(
            diff <= tol,
            "Q5_0 row {r}: GPU {} vs CPU {} (diff {diff} > tol {tol})",
            gpu_value,
            cpu_value
        );
    }
}
}