#![cfg(feature = "cuda")]
//! Shared helpers for CUDA executor tests: deterministic quantized weight
//! generators, model shape presets, and floating-point tolerance utilities.

use half::f16;

/// Generates deterministic Q4_0 weight data: each 32-element block is an f16
/// scale (2 bytes) followed by 16 bytes of packed 4-bit quants, 18 bytes in
/// total.
pub fn generate_q4_0_weights(num_blocks: usize) -> Vec<u8> {
    let mut data = Vec::with_capacity(num_blocks * 18);
    for block_idx in 0..num_blocks {
        let scale = 0.1 * (block_idx as f32 + 1.0);
        let scale_f16 = f16::from_f32(scale);
        data.extend_from_slice(&scale_f16.to_le_bytes());
        for j in 0..16 {
            let low = ((block_idx + j) % 16) as u8;
            let high = ((block_idx + j + 1) % 16) as u8;
            data.push(low | (high << 4));
        }
    }
    data
}

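// A minimal reference dequantizer for the Q4_0 bytes produced above, handy
// for test assertions. This is an illustrative sketch, not part of the
// executor's API; it follows the GGML Q4_0 convention (value = d * (q - 8),
// low nibbles holding elements 0..16 and high nibbles elements 16..32).
#[allow(dead_code)]
fn dequantize_q4_0_block(block: &[u8]) -> [f32; 32] {
    assert_eq!(block.len(), 18, "Q4_0 block is 18 bytes");
    let d = f16::from_le_bytes([block[0], block[1]]).to_f32();
    let mut out = [0.0f32; 32];
    for (j, &byte) in block[2..18].iter().enumerate() {
        out[j] = d * ((byte & 0x0F) as f32 - 8.0);
        out[j + 16] = d * ((byte >> 4) as f32 - 8.0);
    }
    out
}
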
/// Generates deterministic Q5_0 weight data: each block is an f16 scale
/// (2 bytes), a u32 of packed 5th bits (4 bytes), and 16 bytes of 4-bit
/// quants, 22 bytes in total.
pub fn generate_q5_0_weights(num_blocks: usize) -> Vec<u8> {
    let mut data = Vec::with_capacity(num_blocks * 22);
    for block_idx in 0..num_blocks {
        let scale = 0.1 * (block_idx as f32 + 1.0);
        let scale_f16 = f16::from_f32(scale);
        data.extend_from_slice(&scale_f16.to_le_bytes());
        // Alternating bit pattern for the high (5th) bits.
        let qh: u32 = 0xAAAA_5555;
        data.extend_from_slice(&qh.to_le_bytes());
        for j in 0..16 {
            let low = ((block_idx + j) % 16) as u8;
            let high = ((block_idx + j + 1) % 16) as u8;
            data.push(low | (high << 4));
        }
    }
    data
}

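// Companion sketch for Q5_0 (illustrative only, mirroring GGML's layout):
// the u32 `qh` written above supplies the 5th bit of each quant, bit j for
// element j, giving a 0..32 quant range with a -16 offset.
#[allow(dead_code)]
fn dequantize_q5_0_block(block: &[u8]) -> [f32; 32] {
    assert_eq!(block.len(), 22, "Q5_0 block is 22 bytes");
    let d = f16::from_le_bytes([block[0], block[1]]).to_f32();
    let qh = u32::from_le_bytes([block[2], block[3], block[4], block[5]]);
    let mut out = [0.0f32; 32];
    for (j, &byte) in block[6..22].iter().enumerate() {
        let hi0 = (((qh >> j) & 1) as u8) << 4;
        let hi1 = (((qh >> (j + 16)) & 1) as u8) << 4;
        out[j] = d * ((((byte & 0x0F) | hi0) as f32) - 16.0);
        out[j + 16] = d * ((((byte >> 4) | hi1) as f32) - 16.0);
    }
    out
}
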
/// Generates deterministic Q4_1 weight data: each block is an f16 scale and
/// an f16 min (4 bytes) followed by 16 bytes of packed 4-bit quants, 20
/// bytes in total.
pub fn generate_q4_1_weights(num_blocks: usize) -> Vec<u8> {
    let mut data = Vec::with_capacity(num_blocks * 20);
    for block_idx in 0..num_blocks {
        let scale = 0.1 * (block_idx as f32 + 1.0);
        let scale_f16 = f16::from_f32(scale);
        data.extend_from_slice(&scale_f16.to_le_bytes());
        let min = -0.5 * (block_idx as f32 + 1.0);
        let min_f16 = f16::from_f32(min);
        data.extend_from_slice(&min_f16.to_le_bytes());
        for j in 0..16 {
            let low = ((block_idx + j) % 16) as u8;
            let high = ((block_idx + j + 1) % 16) as u8;
            data.push(low | (high << 4));
        }
    }
    data
}

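// Q4_1 is the affine variant: value = d * q + m, with unsigned quants and
// the f16 min stored right after the scale. Again a sketch for tests, not
// a production kernel.
#[allow(dead_code)]
fn dequantize_q4_1_block(block: &[u8]) -> [f32; 32] {
    assert_eq!(block.len(), 20, "Q4_1 block is 20 bytes");
    let d = f16::from_le_bytes([block[0], block[1]]).to_f32();
    let m = f16::from_le_bytes([block[2], block[3]]).to_f32();
    let mut out = [0.0f32; 32];
    for (j, &byte) in block[4..20].iter().enumerate() {
        out[j] = d * (byte & 0x0F) as f32 + m;
        out[j + 16] = d * (byte >> 4) as f32 + m;
    }
    out
}
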
/// Generates deterministic Q8_0 weight data: each block is an f16 scale
/// (2 bytes) followed by 32 signed 8-bit quants, 34 bytes in total.
pub fn generate_q8_0_weights(num_blocks: usize) -> Vec<u8> {
    let mut data = Vec::with_capacity(num_blocks * 34);
    for block_idx in 0..num_blocks {
        let scale = 0.01 * (block_idx as f32 + 1.0);
        let scale_f16 = f16::from_f32(scale);
        data.extend_from_slice(&scale_f16.to_le_bytes());
        for j in 0..32 {
            // Wraps through the full i8 range, stored as raw bytes.
            let val = (((block_idx + j) % 256) as i8) as u8;
            data.push(val);
        }
    }
    data
}

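// Quick sanity check that each generator emits the expected bytes per block
// (18/22/20/34 for Q4_0/Q5_0/Q4_1/Q8_0); the test module name is ours, not
// an existing convention in this crate.
#[cfg(test)]
mod weight_gen_size_tests {
    use super::*;

    #[test]
    fn generated_buffers_have_expected_sizes() {
        assert_eq!(generate_q4_0_weights(3).len(), 3 * 18);
        assert_eq!(generate_q5_0_weights(3).len(), 3 * 22);
        assert_eq!(generate_q4_1_weights(3).len(), 3 * 20);
        assert_eq!(generate_q8_0_weights(3).len(), 3 * 34);
    }
}
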
/// Attention-shape parameters for grouped-query attention (GQA) test cases.
pub struct GqaConfig {
    pub hidden_dim: usize,
    pub num_heads: usize,
    pub num_kv_heads: usize,
    pub head_dim: usize,
    pub intermediate_dim: usize,
    pub name: &'static str,
}

impl GqaConfig {
    /// Qwen-0.5B-style shapes: 14 query heads sharing 2 KV heads.
    pub const QWEN_0_5B: Self = Self {
        hidden_dim: 896,
        num_heads: 14,
        num_kv_heads: 2,
        head_dim: 64,
        intermediate_dim: 4864,
        name: "Qwen-0.5B",
    };

    /// TinyLlama-style shapes: 32 query heads sharing 4 KV heads.
    pub const TINY_LLAMA: Self = Self {
        hidden_dim: 2048,
        num_heads: 32,
        num_kv_heads: 4,
        head_dim: 64,
        intermediate_dim: 5632,
        name: "TinyLlama",
    };

    /// Qwen-1.5B-style shapes: 12 query heads sharing 2 KV heads.
    pub const QWEN_1_5B: Self = Self {
        hidden_dim: 1536,
        num_heads: 12,
        num_kv_heads: 2,
        head_dim: 128,
        intermediate_dim: 8960,
        name: "Qwen-1.5B",
    };

    /// Llama-7B-style shapes with plain multi-head attention (no KV sharing).
    pub const LLAMA_7B_MHA: Self = Self {
        hidden_dim: 4096,
        num_heads: 32,
        num_kv_heads: 32,
        head_dim: 128,
        intermediate_dim: 11008,
        name: "Llama-7B-MHA",
    };

    /// Total query projection width (`num_heads * head_dim`).
    #[must_use]
    pub const fn q_dim(&self) -> usize {
        self.num_heads * self.head_dim
    }

    /// Total key/value projection width (`num_kv_heads * head_dim`).
    #[must_use]
    pub const fn kv_dim(&self) -> usize {
        self.num_kv_heads * self.head_dim
    }

    /// Number of query heads sharing each KV head.
    #[must_use]
    pub const fn gqa_group_size(&self) -> usize {
        self.num_heads / self.num_kv_heads
    }

    /// True when KV heads are shared across query heads (i.e. not MHA).
    #[must_use]
    pub const fn is_gqa(&self) -> bool {
        self.num_kv_heads < self.num_heads
    }
}

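// Worked example of the derived shapes; the values follow directly from the
// constants above (the test module name is ours):
#[cfg(test)]
mod gqa_config_tests {
    use super::*;

    #[test]
    fn qwen_0_5b_derived_dims() {
        let cfg = GqaConfig::QWEN_0_5B;
        assert_eq!(cfg.q_dim(), 14 * 64); // 896, matches hidden_dim
        assert_eq!(cfg.kv_dim(), 2 * 64); // 128: K/V projections are 7x narrower
        assert_eq!(cfg.gqa_group_size(), 7); // 7 query heads per KV head
        assert!(cfg.is_gqa());
        assert!(!GqaConfig::LLAMA_7B_MHA.is_gqa()); // MHA: num_kv_heads == num_heads
    }
}
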
/// Relative difference between two scalars, guarded against tiny denominators.
#[must_use]
pub fn relative_diff(a: f32, b: f32) -> f32 {
    let diff = (a - b).abs();
    let denom = a.abs().max(b.abs()).max(1e-6);
    diff / denom
}

/// Largest absolute element-wise difference between two equal-length vectors.
#[must_use]
pub fn max_element_diff(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len(), "Vector lengths must match");
    a.iter()
        .zip(b.iter())
        .map(|(x, y)| (x - y).abs())
        .fold(0.0f32, f32::max)
}

/// Relative difference between the element sums of two vectors.
#[must_use]
pub fn sum_relative_diff(a: &[f32], b: &[f32]) -> f32 {
    let sum_a: f32 = a.iter().sum();
    let sum_b: f32 = b.iter().sum();
    relative_diff(sum_a, sum_b)
}

/// True when no element differs by more than `tolerance` (absolute).
#[must_use]
pub fn vectors_match(a: &[f32], b: &[f32], tolerance: f32) -> bool {
    max_element_diff(a, b) <= tolerance
}

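// Example usage of the tolerance helpers with made-up values, showing the
// distinction between the element-wise and aggregate checks:
#[cfg(test)]
mod diff_helper_tests {
    use super::*;

    #[test]
    fn tolerance_helpers_behave_as_expected() {
        let a = [1.0f32, 2.0, 3.0];
        let b = [1.0f32, 2.0005, 3.0];
        // Element-wise: the single 5e-4 deviation decides the outcome.
        assert!(vectors_match(&a, &b, 1e-3));
        assert!(!vectors_match(&a, &b, 1e-4));
        // Aggregate: sums of 6.0 vs ~6.0005 differ by under 1e-4 relatively.
        assert!(sum_relative_diff(&a, &b) < 1e-4);
        // relative_diff normalizes by the larger magnitude.
        assert!(relative_diff(100.0, 101.0) < 0.011);
    }
}
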
/// Model dimensions for the executor test harness.
pub struct HarnessConfig {
    pub hidden_dim: usize,
    pub intermediate_dim: usize,
    pub num_layers: usize,
    pub num_heads: usize,
    pub num_kv_heads: usize,
    pub head_dim: usize,
    pub vocab_size: usize,
    pub max_seq_len: usize,
}

impl Default for HarnessConfig {
    fn default() -> Self {
        Self {
            hidden_dim: 256,
            intermediate_dim: 512,
            num_layers: 2,
            num_heads: 4,
            num_kv_heads: 2,
            head_dim: 64,
            vocab_size: 1024,
            max_seq_len: 128,
        }
    }
}

impl HarnessConfig {
    /// Smallest usable configuration, for fast smoke tests.
    pub fn tiny() -> Self {
        Self {
            hidden_dim: 64,
            intermediate_dim: 128,
            num_layers: 1,
            num_heads: 2,
            num_kv_heads: 1,
            head_dim: 32,
            vocab_size: 256,
            max_seq_len: 32,
        }
    }

    /// Qwen-like configuration with a 4:1 GQA ratio, scaled down for tests.
    pub fn qwen_like() -> Self {
        Self {
            hidden_dim: 256,
            intermediate_dim: 512,
            num_layers: 2,
            num_heads: 8,
            num_kv_heads: 2,
            head_dim: 32,
            vocab_size: 512,
            max_seq_len: 64,
        }
    }
}

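// The presets above all satisfy hidden_dim == num_heads * head_dim and keep
// num_heads divisible by num_kv_heads; a quick check making that assumption
// explicit (the test module name is ours):
#[cfg(test)]
mod harness_config_tests {
    use super::*;

    #[test]
    fn presets_keep_attention_shapes_consistent() {
        for cfg in [
            HarnessConfig::default(),
            HarnessConfig::tiny(),
            HarnessConfig::qwen_like(),
        ] {
            assert_eq!(cfg.hidden_dim, cfg.num_heads * cfg.head_dim);
            assert_eq!(cfg.num_heads % cfg.num_kv_heads, 0);
        }
    }
}
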
/// Size in bytes of a Q4_K weight matrix: 144 bytes per 256-element
/// super-block. Assumes `rows * cols` is a multiple of 256.
#[inline]
fn q4k_weight_size(rows: usize, cols: usize) -> usize {
    rows * cols / 256 * 144
}

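// Worked example of the sizing arithmetic: a 256x256 matrix is
// 256 * 256 / 256 = 256 super-blocks of 144 bytes, i.e. 36_864 bytes.
// (Compile-time check; the constant name is illustrative only.)
const _Q4K_SIZE_EXAMPLE: () = assert!(256 * 256 / 256 * 144 == 36_864);
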
/// Loads an all-zero Q4_K weight tensor of the given shape into the executor.
fn load_zero_weights(
    exec: &mut crate::cuda::executor::CudaExecutor,
    name: &str,
    rows: usize,
    cols: usize,
) -> Result<(), crate::cuda::executor::GpuError> {
    let weights = vec![0u8; q4k_weight_size(rows, cols)];
    exec.load_quantized_weights(name, &weights)?;
    Ok(())
}

/// Loads zeroed Q/K/V/output projection weights for one attention layer.
fn load_layer_attn_weights(
    exec: &mut crate::cuda::executor::CudaExecutor,
    prefix: &str,
    config: &HarnessConfig,
) -> Result<(), crate::cuda::executor::GpuError> {
    let q_dim = config.num_heads * config.head_dim;
    let kv_dim = config.num_kv_heads * config.head_dim;
    load_zero_weights(
        exec,
        &format!("{prefix}.attn_q.weight"),
        q_dim,
        config.hidden_dim,
    )?;
    load_zero_weights(
        exec,
        &format!("{prefix}.attn_k.weight"),
        kv_dim,
        config.hidden_dim,
    )?;
    load_zero_weights(
        exec,
        &format!("{prefix}.attn_v.weight"),
        kv_dim,
        config.hidden_dim,
    )?;
    load_zero_weights(
        exec,
        &format!("{prefix}.attn_output.weight"),
        config.hidden_dim,
        q_dim,
    )
}

/// Loads zeroed gate/up/down projection weights for one FFN layer.
fn load_layer_ffn_weights(
    exec: &mut crate::cuda::executor::CudaExecutor,
    prefix: &str,
    config: &HarnessConfig,
) -> Result<(), crate::cuda::executor::GpuError> {
    load_zero_weights(
        exec,
        &format!("{prefix}.ffn_gate.weight"),
        config.intermediate_dim,
        config.hidden_dim,
    )?;
    load_zero_weights(
        exec,
        &format!("{prefix}.ffn_up.weight"),
        config.intermediate_dim,
        config.hidden_dim,
    )?;
    load_zero_weights(
        exec,
        &format!("{prefix}.ffn_down.weight"),
        config.hidden_dim,
        config.intermediate_dim,
    )
}

include!("executor_harness.rs");