/// Borrowed description of a quantized tensor living inside a larger
/// model data buffer: where its bytes are and how big it is logically.
#[derive(Debug, Clone)]
pub struct QuantizedTensorRef {
/// Byte offset of the tensor payload within the backing buffer.
pub offset: usize,
/// Size of the quantized payload in bytes.
pub byte_size: usize,
/// Number of logical (dequantized) elements in the tensor.
pub num_elements: usize,
/// Quantization type tag (e.g. `GGUF_TYPE_Q4_K`).
pub qtype: u32,
}
/// Borrowed layout of the attention QKV projection weights.
///
/// `Debug` is derived for consistency with `OwnedQKVWeights` and
/// `QuantizedTensorRef`, which already derive it.
#[derive(Debug, Clone)]
pub enum QKVWeights {
    /// One tensor holding the concatenated Q, K and V projections.
    Fused(QuantizedTensorRef),
    /// Three independent projection tensors.
    Separate {
        q: QuantizedTensorRef,
        k: QuantizedTensorRef,
        v: QuantizedTensorRef,
    },
}
impl QKVWeights {
    /// Total output dimension of the QKV projection.
    ///
    /// Derived from element counts: each tensor is assumed to hold
    /// `hidden_dim * out_dim` elements.
    #[must_use]
    pub fn out_dim(&self, hidden_dim: usize) -> usize {
        // Match ergonomics bind by reference automatically when matching
        // on `&self`; explicit `ref` modifiers are redundant.
        match self {
            Self::Fused(weight) => weight.num_elements / hidden_dim,
            Self::Separate { q, k, v } => {
                let q_dim = q.num_elements / hidden_dim;
                let k_dim = k.num_elements / hidden_dim;
                let v_dim = v.num_elements / hidden_dim;
                q_dim + k_dim + v_dim
            },
        }
    }

    /// Output dimension of the Q projection alone.
    ///
    /// For `Fused` weights this assumes an even Q/K/V split (`/ 3`) —
    /// NOTE(review): this would undercount Q for fused GQA layouts where
    /// K/V are narrower than Q; confirm against the model loaders.
    #[must_use]
    pub fn q_dim(&self, hidden_dim: usize) -> usize {
        match self {
            Self::Fused(weight) => weight.num_elements / hidden_dim / 3,
            Self::Separate { q, .. } => q.num_elements / hidden_dim,
        }
    }
}
/// Quantized tensor that owns its byte payload, together with the logical
/// dimensions of the projection it represents.
#[derive(Debug, Clone)]
pub struct OwnedQuantizedTensor {
/// Raw quantized bytes; may be emptied later to reclaim memory.
pub data: Vec<u8>,
/// Input dimension of the projection this tensor implements.
pub in_dim: usize,
/// Output dimension of the projection this tensor implements.
pub out_dim: usize,
/// Quantization type tag (e.g. `GGUF_TYPE_Q4_K`).
pub qtype: u32,
}
impl OwnedQuantizedTensor {
    /// Copies the byte range described by `tensor_ref` out of `data` into
    /// an owned tensor with the given logical dimensions.
    ///
    /// If the referenced range lies (even partially) outside `data`, the
    /// tensor is created with an empty payload rather than panicking.
    #[must_use]
    pub fn from_ref_with_dims(
        tensor_ref: &QuantizedTensorRef,
        data: &[u8],
        in_dim: usize,
        out_dim: usize,
    ) -> Self {
        // `checked_add` guards against `offset + byte_size` overflowing
        // usize: in release builds the old unchecked add would wrap, the
        // `end <= data.len()` test could then pass with `end < start`, and
        // the subsequent slicing would panic. `slice::get` keeps the
        // documented empty-on-out-of-bounds behavior.
        let tensor_data = tensor_ref
            .offset
            .checked_add(tensor_ref.byte_size)
            .and_then(|end| data.get(tensor_ref.offset..end))
            .map_or_else(Vec::new, <[u8]>::to_vec);
        Self {
            data: tensor_data,
            in_dim,
            out_dim,
            qtype: tensor_ref.qtype,
        }
    }
}
/// Owned counterpart of `QKVWeights`: each variant holds copied tensor
/// bytes instead of offsets into a shared buffer.
#[derive(Debug, Clone)]
pub enum OwnedQKVWeights {
/// One tensor holding the concatenated Q, K and V projections.
Fused(OwnedQuantizedTensor),
/// Three independent projection tensors.
Separate {
q: OwnedQuantizedTensor,
k: OwnedQuantizedTensor,
v: OwnedQuantizedTensor,
},
}
impl OwnedQKVWeights {
    /// Builds an owned copy of `qkv` by copying each referenced tensor's
    /// bytes out of `data`.
    ///
    /// `Separate` output dims are inferred from per-tensor element counts.
    /// A `Fused` tensor is assigned `out_dim = 3 * hidden_dim` — an equal
    /// Q/K/V split. NOTE(review): that assumption is wrong for fused GQA
    /// layouts where K/V are narrower; confirm against the model loaders.
    #[must_use]
    pub fn from_borrowed(qkv: &QKVWeights, data: &[u8], hidden_dim: usize) -> Self {
        // `ref` modifiers are redundant when matching on a reference;
        // match ergonomics already bind by reference.
        match qkv {
            QKVWeights::Fused(tensor) => {
                let qkv_dim = 3 * hidden_dim;
                OwnedQKVWeights::Fused(OwnedQuantizedTensor::from_ref_with_dims(
                    tensor, data, hidden_dim, qkv_dim,
                ))
            },
            QKVWeights::Separate { q, k, v } => {
                // Each tensor holds hidden_dim * out_dim elements.
                let q_dim = q.num_elements / hidden_dim;
                let k_dim = k.num_elements / hidden_dim;
                let v_dim = v.num_elements / hidden_dim;
                OwnedQKVWeights::Separate {
                    q: OwnedQuantizedTensor::from_ref_with_dims(q, data, hidden_dim, q_dim),
                    k: OwnedQuantizedTensor::from_ref_with_dims(k, data, hidden_dim, k_dim),
                    v: OwnedQuantizedTensor::from_ref_with_dims(v, data, hidden_dim, v_dim),
                }
            },
        }
    }

    /// Total output dimension (Q + K + V) of the projection.
    #[must_use]
    pub fn out_dim(&self) -> usize {
        match self {
            OwnedQKVWeights::Fused(t) => t.out_dim,
            OwnedQKVWeights::Separate { q, k, v } => q.out_dim + k.out_dim + v.out_dim,
        }
    }

    /// Output dimension of the Q projection alone.
    ///
    /// For `Fused` this assumes an even Q/K/V split; prefer
    /// `q_dim_for_config` when head counts are known.
    #[must_use]
    pub fn q_dim(&self) -> usize {
        match self {
            OwnedQKVWeights::Fused(t) => t.out_dim / 3,
            OwnedQKVWeights::Separate { q, .. } => q.out_dim,
        }
    }

    /// Q output dimension given the model head configuration.
    ///
    /// `Fused` uses `num_heads * head_dim`; `Separate` trusts the stored
    /// tensor dimension.
    #[must_use]
    pub fn q_dim_for_config(
        &self,
        num_heads: usize,
        _num_kv_heads: usize,
        _hidden_dim: usize,
        head_dim: usize,
    ) -> usize {
        match self {
            OwnedQKVWeights::Fused(_) => num_heads * head_dim,
            OwnedQKVWeights::Separate { q, .. } => q.out_dim,
        }
    }

    /// K output dimension given the model head configuration (uses the KV
    /// head count for `Fused`, supporting grouped-query attention).
    #[must_use]
    pub fn k_dim_for_config(
        &self,
        _num_heads: usize,
        num_kv_heads: usize,
        _hidden_dim: usize,
        head_dim: usize,
    ) -> usize {
        match self {
            OwnedQKVWeights::Fused(_) => num_kv_heads * head_dim,
            OwnedQKVWeights::Separate { k, .. } => k.out_dim,
        }
    }

    /// V output dimension given the model head configuration (mirrors
    /// `k_dim_for_config`).
    #[must_use]
    pub fn v_dim_for_config(
        &self,
        _num_heads: usize,
        num_kv_heads: usize,
        _hidden_dim: usize,
        head_dim: usize,
    ) -> usize {
        match self {
            OwnedQKVWeights::Fused(_) => num_kv_heads * head_dim,
            OwnedQKVWeights::Separate { v, .. } => v.out_dim,
        }
    }

    /// Total number of owned payload bytes across all tensors.
    #[must_use]
    pub fn data_bytes(&self) -> usize {
        match self {
            OwnedQKVWeights::Fused(t) => t.data.len(),
            OwnedQKVWeights::Separate { q, k, v } => q.data.len() + k.data.len() + v.data.len(),
        }
    }

    /// Releases the tensor byte buffers (assigning a fresh `Vec` drops the
    /// allocation) while keeping all dimension metadata intact.
    pub fn free_data(&mut self) {
        match self {
            OwnedQKVWeights::Fused(t) => t.data = Vec::new(),
            OwnedQKVWeights::Separate { q, k, v } => {
                q.data = Vec::new();
                k.data = Vec::new();
                v.data = Vec::new();
            },
        }
    }
}
/// Fully-owned quantized weights and normalization parameters for one
/// transformer layer (attention block plus feed-forward block).
#[derive(Debug, Clone)]
pub struct OwnedQuantizedLayer {
// Pre-attention normalization (bias optional).
pub attn_norm_weight: Vec<f32>,
pub attn_norm_bias: Option<Vec<f32>>,
// QKV projection (fused or separate) with optional bias.
pub qkv_weight: OwnedQKVWeights,
pub qkv_bias: Option<Vec<f32>>,
// Attention output projection with optional bias.
pub attn_output_weight: OwnedQuantizedTensor,
pub attn_output_bias: Option<Vec<f32>>,
// Feed-forward up projection; may carry a fused gate+up tensor
// (see `from_borrowed`), in which case out_dim is doubled.
pub ffn_up_weight: OwnedQuantizedTensor,
pub ffn_up_bias: Option<Vec<f32>>,
// Feed-forward down projection.
pub ffn_down_weight: OwnedQuantizedTensor,
pub ffn_down_bias: Option<Vec<f32>>,
// Separate gate projection, present only when not fused into `ffn_up_weight`.
pub ffn_gate_weight: Option<OwnedQuantizedTensor>,
pub ffn_gate_bias: Option<Vec<f32>>,
// Optional pre-FFN normalization parameters.
pub ffn_norm_weight: Option<Vec<f32>>,
pub ffn_norm_bias: Option<Vec<f32>>,
// Optional per-projection Q/K normalization weights.
pub attn_q_norm_weight: Option<Vec<f32>>,
pub attn_k_norm_weight: Option<Vec<f32>>,
}
impl OwnedQuantizedLayer {
    /// Drops the heap payloads of every quantized projection weight in this
    /// layer while keeping all dimension metadata and norm vectors.
    pub fn free_projection_weights(&mut self) {
        self.qkv_weight.free_data();
        // Disjoint field borrows: assigning a fresh Vec releases each buffer.
        for weight in [
            &mut self.attn_output_weight,
            &mut self.ffn_up_weight,
            &mut self.ffn_down_weight,
        ] {
            weight.data = Vec::new();
        }
        if let Some(gate) = self.ffn_gate_weight.as_mut() {
            gate.data = Vec::new();
        }
    }

    /// Deep-copies a borrowed GGUF layer into owned storage, resolving each
    /// tensor reference against `data` and deriving projection dimensions
    /// from `config`.
    #[must_use]
    pub fn from_borrowed(
        layer: &crate::gguf::QuantizedGGUFTransformerLayer,
        data: &[u8],
        config: &crate::gguf::GGUFConfig,
    ) -> Self {
        let hidden_dim = config.hidden_dim;
        let intermediate_dim = config.intermediate_dim;

        // A gated-FFN architecture with no separate gate tensor stores
        // gate and up fused in one tensor with doubled output dimension.
        let fused_gate_up =
            layer.ffn_gate_weight.is_none() && config.constraints.has_gate_ffn();
        let up_out_dim = if fused_gate_up {
            intermediate_dim * 2
        } else {
            intermediate_dim
        };
        let ffn_up_weight = OwnedQuantizedTensor::from_ref_with_dims(
            &layer.ffn_up_weight,
            data,
            hidden_dim,
            up_out_dim,
        );
        let ffn_gate_weight = layer.ffn_gate_weight.as_ref().map(|gate_ref| {
            OwnedQuantizedTensor::from_ref_with_dims(gate_ref, data, hidden_dim, intermediate_dim)
        });

        Self {
            attn_norm_weight: layer.attn_norm_weight.clone(),
            attn_norm_bias: layer.attn_norm_bias.clone(),
            qkv_weight: OwnedQKVWeights::from_borrowed(&layer.qkv_weight, data, hidden_dim),
            qkv_bias: layer.qkv_bias.clone(),
            // Output projection maps the attention result back to hidden_dim.
            attn_output_weight: OwnedQuantizedTensor::from_ref_with_dims(
                &layer.attn_output_weight,
                data,
                config.q_dim(),
                hidden_dim,
            ),
            attn_output_bias: layer.attn_output_bias.clone(),
            ffn_up_weight,
            ffn_up_bias: layer.ffn_up_bias.clone(),
            ffn_down_weight: OwnedQuantizedTensor::from_ref_with_dims(
                &layer.ffn_down_weight,
                data,
                intermediate_dim,
                hidden_dim,
            ),
            ffn_down_bias: layer.ffn_down_bias.clone(),
            ffn_gate_weight,
            ffn_gate_bias: layer.ffn_gate_bias.clone(),
            ffn_norm_weight: layer.ffn_norm_weight.clone(),
            ffn_norm_bias: layer.ffn_norm_bias.clone(),
            attn_q_norm_weight: layer.attn_q_norm_weight.clone(),
            attn_k_norm_weight: layer.attn_k_norm_weight.clone(),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::gguf::types::GGUF_TYPE_Q4_K;

    #[test]
    fn test_quantized_tensor_ref() {
        let tensor = QuantizedTensorRef {
            offset: 1024,
            byte_size: 4096,
            num_elements: 8192,
            qtype: GGUF_TYPE_Q4_K,
        };
        assert_eq!(tensor.offset, 1024);
        assert_eq!(tensor.byte_size, 4096);
        assert_eq!(tensor.num_elements, 8192);
        assert_eq!(tensor.qtype, GGUF_TYPE_Q4_K);
    }

    #[test]
    fn test_qkv_weights_fused() {
        let tensor = QuantizedTensorRef {
            offset: 0,
            byte_size: 1024,
            num_elements: 4096 * 3,
            qtype: GGUF_TYPE_Q4_K,
        };
        let qkv = QKVWeights::Fused(tensor);
        // out_dim = num_elements / hidden_dim; q_dim = out_dim / 3.
        assert_eq!(qkv.out_dim(4096), 3);
        assert_eq!(qkv.q_dim(4096), 1);
    }

    #[test]
    fn test_qkv_weights_separate() {
        // GQA-style shapes: Q is full width, K/V are narrower.
        let q = QuantizedTensorRef {
            offset: 0,
            byte_size: 1024,
            num_elements: 4096 * 4096,
            qtype: GGUF_TYPE_Q4_K,
        };
        let k = QuantizedTensorRef {
            offset: 1024,
            byte_size: 256,
            num_elements: 4096 * 512,
            qtype: GGUF_TYPE_Q4_K,
        };
        let v = QuantizedTensorRef {
            offset: 1280,
            byte_size: 256,
            num_elements: 4096 * 512,
            qtype: GGUF_TYPE_Q4_K,
        };
        let qkv = QKVWeights::Separate { q, k, v };
        assert_eq!(qkv.out_dim(4096), 4096 + 512 + 512);
        assert_eq!(qkv.q_dim(4096), 4096);
    }

    #[test]
    fn test_owned_quantized_tensor() {
        let tensor_ref = QuantizedTensorRef {
            offset: 0,
            byte_size: 8,
            num_elements: 16,
            qtype: GGUF_TYPE_Q4_K,
        };
        let data = vec![1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let owned = OwnedQuantizedTensor::from_ref_with_dims(&tensor_ref, &data, 4, 4);
        // Only the first byte_size bytes are copied.
        assert_eq!(owned.data, &[1, 2, 3, 4, 5, 6, 7, 8]);
        assert_eq!(owned.in_dim, 4);
        assert_eq!(owned.out_dim, 4);
        assert_eq!(owned.qtype, GGUF_TYPE_Q4_K);
    }

    #[test]
    fn test_owned_qkv_weights() {
        let tensor = QuantizedTensorRef {
            offset: 0,
            byte_size: 12,
            num_elements: 12,
            qtype: GGUF_TYPE_Q4_K,
        };
        let qkv_borrowed = QKVWeights::Fused(tensor);
        let data = vec![0u8; 20];
        let owned = OwnedQKVWeights::from_borrowed(&qkv_borrowed, &data, 4);
        // Fused owned conversion assigns out_dim = 3 * hidden_dim.
        assert_eq!(owned.out_dim(), 12);
        assert_eq!(owned.q_dim(), 4);
    }

    #[test]
    fn test_owned_quantized_tensor_bounds() {
        // Range [100, 150) is outside a 50-byte buffer: payload stays empty.
        let tensor_ref = QuantizedTensorRef {
            offset: 100,
            byte_size: 50,
            num_elements: 100,
            qtype: GGUF_TYPE_Q4_K,
        };
        let data = vec![0u8; 50];
        let owned = OwnedQuantizedTensor::from_ref_with_dims(&tensor_ref, &data, 10, 10);
        assert!(owned.data.is_empty());
    }
}