//! Quantized GGUF transformer loader.
//!
//! Projection and FFN weight matrices stay quantized and are referenced by
//! byte offset into the original GGUF buffer via [`QuantizedTensorRef`];
//! norm vectors, biases, and the token embedding are dequantized eagerly to `f32`.

use crate::error::{RealizarError, Result};
use crate::quantize::QK_K;
use super::config::{GGUFConfig, ValidatedModelConfig};
use super::quantized::{QKVWeights, QuantizedTensorRef};
use super::types::{
GGUFModel, GGUF_TYPE_F32, GGUF_TYPE_Q2_K, GGUF_TYPE_Q4_0, GGUF_TYPE_Q4_1, GGUF_TYPE_Q4_K,
GGUF_TYPE_Q5_0, GGUF_TYPE_Q5_K, GGUF_TYPE_Q6_K, GGUF_TYPE_Q8_0,
};
/// Per-layer weights for a quantized GGUF transformer block.
///
/// Optional fields cover architecture differences: gated FFNs, biased
/// projections, and per-layer Q/K normalization are only present in some
/// model families.
pub struct QuantizedGGUFTransformerLayer {
pub attn_norm_weight: Vec<f32>,
pub attn_norm_bias: Option<Vec<f32>>,
    /// QKV projection, either fused into one tensor or stored as separate Q/K/V tensors.
    pub qkv_weight: QKVWeights,
pub qkv_bias: Option<Vec<f32>>,
pub attn_output_weight: QuantizedTensorRef,
pub attn_output_bias: Option<Vec<f32>>,
pub ffn_up_weight: QuantizedTensorRef,
pub ffn_up_bias: Option<Vec<f32>>,
pub ffn_down_weight: QuantizedTensorRef,
pub ffn_down_bias: Option<Vec<f32>>,
    /// Gate projection for gated FFN variants (e.g. SwiGLU); `None` for plain MLPs.
    pub ffn_gate_weight: Option<QuantizedTensorRef>,
pub ffn_gate_bias: Option<Vec<f32>>,
pub ffn_norm_weight: Option<Vec<f32>>,
pub ffn_norm_bias: Option<Vec<f32>>,
    /// Per-layer query-normalization weights (present in some architectures).
    pub attn_q_norm_weight: Option<Vec<f32>>,
    /// Per-layer key-normalization weights, paired with `attn_q_norm_weight`.
    pub attn_k_norm_weight: Option<Vec<f32>>,
}
/// A GGUF transformer whose large weight matrices remain quantized.
///
/// Holds a borrow of the raw GGUF file bytes; [`QuantizedTensorRef`] fields
/// index into that buffer rather than owning dequantized copies.
pub struct QuantizedGGUFTransformer<'a> {
pub config: GGUFConfig,
pub data: &'a [u8],
pub token_embedding: Vec<f32>,
pub position_embedding: Option<Vec<f32>>,
pub layers: Vec<QuantizedGGUFTransformerLayer>,
pub output_norm_weight: Vec<f32>,
pub output_norm_bias: Option<Vec<f32>>,
pub lm_head_weight: QuantizedTensorRef,
pub lm_head_bias: Option<Vec<f32>>,
}
impl<'a> QuantizedGGUFTransformer<'a> {
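    /// Builds the quantized transformer from a parsed [`GGUFModel`] and the
    /// raw file bytes it was parsed from.
    ///
    /// A minimal usage sketch; the `GGUFModel::from_bytes` constructor below is
    /// assumed for illustration and may not match this crate's actual parser API.
    ///
    /// ```ignore
    /// let bytes = std::fs::read("model.gguf")?;
    /// // Hypothetical parse step; substitute the crate's real GGUF parser.
    /// let model = GGUFModel::from_bytes(&bytes)?;
    /// let transformer = QuantizedGGUFTransformer::from_gguf(&model, &bytes)?;
    /// assert_eq!(transformer.layers.len(), transformer.config.num_layers);
    /// ```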
pub fn from_gguf(model: &GGUFModel, data: &'a [u8]) -> Result<Self> {
let config = ValidatedModelConfig::from_gguf(model)?.into_inner();
let token_embedding = model.get_tensor_f32("token_embd.weight", data)?;
let position_embedding = model
.get_tensor_f32("position_embd.weight", data)
.or_else(|_| model.get_tensor_f32("token_pos_embd.weight", data))
.or_else(|_| model.get_tensor_f32("model.position_embedding.weight", data))
.ok();
let mut layers = Vec::with_capacity(config.num_layers);
for layer_idx in 0..config.num_layers {
let layer = Self::load_quantized_layer(model, data, layer_idx)?;
layers.push(layer);
}
let output_norm_weight = model.get_tensor_f32("output_norm.weight", data)?;
let output_norm_bias = model
.get_tensor_f32("output_norm.bias", data)
.or_else(|_| model.get_tensor_f32("model.norm.bias", data))
.ok();
        // Fall back to the token embedding matrix when no separate output head
        // exists (tied input/output embeddings).
        let lm_head_weight = Self::get_tensor_ref(model, data, "output.weight")
            .or_else(|_| Self::get_tensor_ref(model, data, "token_embd.weight"))?;
let lm_head_bias = model.get_tensor_f32("output.bias", data).ok();
Ok(Self {
config,
data,
token_embedding,
position_embedding,
layers,
output_norm_weight,
output_norm_bias,
lm_head_weight,
lm_head_bias,
})
}
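    /// Computes the on-disk byte size of a tensor from its quantization type.
    ///
    /// Block sizes follow the standard GGML layouts, for example:
    ///
    /// ```text
    /// Q4_0: 32 weights  -> 18 bytes  (f16 scale + 16 packed nibble bytes)
    /// Q8_0: 32 weights  -> 34 bytes  (f16 scale + 32 int8 values)
    /// Q4_K: 256 weights -> 144-byte super-block
    /// ```
    ///
    /// So a 4096 x 4096 Q4_0 matrix occupies 4096 * 4096 / 32 * 18 = 9,437,184 bytes.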
fn tensor_byte_size(qtype: u32, num_elements: usize, dims: &[u64]) -> Result<usize> {
        // K-quant types pack QK_K elements per super-block; 2-D tensors count
        // blocks along dims[1] for each of dims[0], other ranks use a flat count.
        fn k_quant_bytes(dims: &[u64], super_block_bytes: usize) -> usize {
if dims.len() == 2 {
let rows = dims[0] as usize;
let cols = dims[1] as usize;
rows * cols.div_ceil(QK_K) * super_block_bytes
} else {
let n: usize = dims.iter().map(|&d| d as usize).product();
n.div_ceil(QK_K) * super_block_bytes
}
}
match qtype {
GGUF_TYPE_F32 => Ok(num_elements * 4),
GGUF_TYPE_Q4_0 => Ok(num_elements.div_ceil(32) * 18),
GGUF_TYPE_Q8_0 => Ok(num_elements.div_ceil(32) * 34),
GGUF_TYPE_Q2_K => Ok(num_elements.div_ceil(QK_K) * 84),
GGUF_TYPE_Q4_1 => Ok(num_elements.div_ceil(32) * 20),
GGUF_TYPE_Q5_0 => Ok(num_elements.div_ceil(32) * 22),
GGUF_TYPE_Q4_K => Ok(k_quant_bytes(dims, 144)),
GGUF_TYPE_Q5_K => Ok(k_quant_bytes(dims, 176)),
GGUF_TYPE_Q6_K => Ok(k_quant_bytes(dims, 210)),
_ => Err(RealizarError::UnsupportedOperation {
operation: "tensor_byte_size".to_string(),
reason: format!("Unsupported quantization type: {qtype}"),
}),
}
}
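    /// Works around GGUF files whose tensor header claims a quantization type
    /// that does not fit in the remaining file bytes (see PAR-058).
    ///
    /// If the claimed size fits, it is kept. Otherwise the size is re-derived
    /// for Q4_0 and then Q8_0, and the first layout that fits wins; as a last
    /// resort the claimed values are returned unchanged so the caller's bounds
    /// check can report the error.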
fn resolve_qtype(
name: &str,
claimed_qtype: u32,
byte_size: usize,
num_elements: usize,
offset: usize,
data_len: usize,
) -> (usize, u32) {
if offset + byte_size <= data_len {
return (byte_size, claimed_qtype);
}
let avail = data_len.saturating_sub(offset);
let q4_0_size = num_elements.div_ceil(32) * 18;
if q4_0_size <= avail && q4_0_size > 0 {
eprintln!(
"[PAR-058-RESOLVED] Tensor '{name}' qtype mismatch: header says {claimed_qtype} but byte size suggests Q4_0. Using Q4_0."
);
return (q4_0_size, GGUF_TYPE_Q4_0);
}
let q8_0_size = num_elements.div_ceil(32) * 34;
if q8_0_size <= avail && q8_0_size > 0 {
eprintln!(
"[PAR-058-RESOLVED] Tensor '{name}' qtype mismatch: header says {claimed_qtype} but byte size suggests Q8_0. Using Q8_0."
);
return (q8_0_size, GGUF_TYPE_Q8_0);
}
(byte_size, claimed_qtype)
}
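    /// Looks up `name` in the tensor table and returns a bounds-checked
    /// [`QuantizedTensorRef`] (byte offset, size, element count, and resolved
    /// quantization type) into `data`, without dequantizing anything.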
fn get_tensor_ref(model: &GGUFModel, data: &[u8], name: &str) -> Result<QuantizedTensorRef> {
let tensor = model
.tensors
.iter()
.find(|t| t.name == name)
.ok_or_else(|| RealizarError::InvalidShape {
reason: format!("Tensor '{}' not found", name),
})?;
let num_elements: usize = tensor.dims.iter().map(|&d| d as usize).product();
let offset = model.tensor_data_start + tensor.offset as usize;
let byte_size = Self::tensor_byte_size(tensor.qtype, num_elements, &tensor.dims)?;
let (byte_size, actual_qtype) = Self::resolve_qtype(
name,
tensor.qtype,
byte_size,
num_elements,
offset,
data.len(),
);
if offset + byte_size > data.len() {
return Err(RealizarError::InvalidShape {
reason: format!(
"Tensor '{}' data range [{}, {}) exceeds file size {}",
name,
offset,
offset + byte_size,
data.len()
),
});
}
Ok(QuantizedTensorRef {
offset,
byte_size,
num_elements,
qtype: actual_qtype,
})
}
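    /// Loads layer `layer_idx` using the standard `blk.{i}.*` GGUF tensor names,
    /// with fallbacks for alternative bias names. QKV weights are taken fused
    /// (`attn_qkv.weight`) when present, otherwise as separate Q/K/V tensors
    /// whose biases, if all present, are concatenated in Q, K, V order.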
fn load_quantized_layer(
model: &GGUFModel,
data: &[u8],
layer_idx: usize,
) -> Result<QuantizedGGUFTransformerLayer> {
let prefix = format!("blk.{}", layer_idx);
let attn_norm_weight =
model.get_tensor_f32(&format!("{}.attn_norm.weight", prefix), data)?;
let attn_norm_bias = model
.get_tensor_f32(&format!("{}.attn_norm.bias", prefix), data)
.or_else(|_| model.get_tensor_f32(&format!("{}.input_layernorm.bias", prefix), data))
.ok();
let (qkv_weight, qkv_bias) = if let Ok(fused) =
Self::get_tensor_ref(model, data, &format!("{}.attn_qkv.weight", prefix))
{
let bias = model
.get_tensor_f32(&format!("{}.attn_qkv.bias", prefix), data)
.ok();
(QKVWeights::Fused(fused), bias)
} else {
let q = Self::get_tensor_ref(model, data, &format!("{}.attn_q.weight", prefix))?;
let k = Self::get_tensor_ref(model, data, &format!("{}.attn_k.weight", prefix))?;
let v = Self::get_tensor_ref(model, data, &format!("{}.attn_v.weight", prefix))?;
let q_bias = model
.get_tensor_f32(&format!("{}.attn_q.bias", prefix), data)
.ok();
let k_bias = model
.get_tensor_f32(&format!("{}.attn_k.bias", prefix), data)
.ok();
let v_bias = model
.get_tensor_f32(&format!("{}.attn_v.bias", prefix), data)
.ok();
let bias = match (q_bias, k_bias, v_bias) {
(Some(qb), Some(kb), Some(vb)) => {
let mut combined = Vec::with_capacity(qb.len() + kb.len() + vb.len());
combined.extend_from_slice(&qb);
combined.extend_from_slice(&kb);
combined.extend_from_slice(&vb);
Some(combined)
},
_ => None,
};
(QKVWeights::Separate { q, k, v }, bias)
};
let attn_output_weight =
Self::get_tensor_ref(model, data, &format!("{}.attn_output.weight", prefix))?;
let attn_output_bias = model
.get_tensor_f32(&format!("{}.attn_output.bias", prefix), data)
.ok();
let ffn_up_weight =
Self::get_tensor_ref(model, data, &format!("{}.ffn_up.weight", prefix))?;
let ffn_up_bias = model
.get_tensor_f32(&format!("{}.ffn_up.bias", prefix), data)
.or_else(|_| model.get_tensor_f32(&format!("{}.mlp.up_proj.bias", prefix), data))
.ok();
let ffn_down_weight =
Self::get_tensor_ref(model, data, &format!("{}.ffn_down.weight", prefix))?;
let ffn_down_bias = model
.get_tensor_f32(&format!("{}.ffn_down.bias", prefix), data)
.or_else(|_| model.get_tensor_f32(&format!("{}.mlp.down_proj.bias", prefix), data))
.ok();
let ffn_gate_weight =
Self::get_tensor_ref(model, data, &format!("{}.ffn_gate.weight", prefix)).ok();
let ffn_gate_bias = model
.get_tensor_f32(&format!("{}.ffn_gate.bias", prefix), data)
.ok();
let ffn_norm_weight = model
.get_tensor_f32(&format!("{}.ffn_norm.weight", prefix), data)
.ok();
let ffn_norm_bias = model
.get_tensor_f32(&format!("{}.ffn_norm.bias", prefix), data)
.or_else(|_| {
model.get_tensor_f32(&format!("{}.post_attention_layernorm.bias", prefix), data)
})
.ok();
let attn_q_norm_weight = model
.get_tensor_f32(&format!("{}.attn_q_norm.weight", prefix), data)
.ok();
let attn_k_norm_weight = model
.get_tensor_f32(&format!("{}.attn_k_norm.weight", prefix), data)
.ok();
Ok(QuantizedGGUFTransformerLayer {
attn_norm_weight,
attn_norm_bias,
qkv_weight,
qkv_bias,
attn_output_weight,
attn_output_bias,
ffn_up_weight,
ffn_up_bias,
ffn_down_weight,
ffn_down_bias,
ffn_gate_weight,
ffn_gate_bias,
ffn_norm_weight,
ffn_norm_bias,
attn_q_norm_weight,
attn_k_norm_weight,
})
}
}
include!("transformer_quantized_layer_field.rs");