use crate::error::{ModelError, ModelResult};
use scirs2_core::ndarray::Array2;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
/// Result of parsing a GGUF header region, in order:
/// - format version (1, 2 or 3),
/// - metadata key/value table,
/// - tensor descriptors (with `data_offset` resolved to absolute positions),
/// - absolute byte offset where the aligned tensor-data section begins.
type GgufBufferParsed = (
    u32,
    HashMap<String, GgufMetaValue>,
    Vec<GgufTensorInfo>,
    u64,
);
/// Read one byte at `*pos`, advancing the cursor on success.
fn read_u8(buf: &[u8], pos: &mut usize) -> ModelResult<u8> {
    match buf.get(*pos) {
        Some(&byte) => {
            *pos += 1;
            Ok(byte)
        }
        None => Err(ModelError::simple_load_error(format!(
            "Buffer underflow reading u8 at position {}",
            pos
        ))),
    }
}
/// Read a little-endian u16 at `*pos`, advancing the cursor on success.
fn read_u16_le(buf: &[u8], pos: &mut usize) -> ModelResult<u16> {
    let start = *pos;
    let Some(bytes) = buf.get(start..start + 2) else {
        return Err(ModelError::simple_load_error(format!(
            "Buffer underflow reading u16 at position {}",
            pos
        )));
    };
    let value = u16::from_le_bytes([bytes[0], bytes[1]]);
    *pos = start + 2;
    Ok(value)
}
/// Read a little-endian u32 at `*pos`, advancing the cursor on success.
fn read_u32_le(buf: &[u8], pos: &mut usize) -> ModelResult<u32> {
    let start = *pos;
    let Some(bytes) = buf.get(start..start + 4) else {
        return Err(ModelError::simple_load_error(format!(
            "Buffer underflow reading u32 at position {}",
            pos
        )));
    };
    let value = u32::from_le_bytes(bytes.try_into().expect("slice length checked above"));
    *pos = start + 4;
    Ok(value)
}
/// Read a little-endian u64 at `*pos`, advancing the cursor on success.
fn read_u64_le(buf: &[u8], pos: &mut usize) -> ModelResult<u64> {
    let start = *pos;
    let Some(bytes) = buf.get(start..start + 8) else {
        return Err(ModelError::simple_load_error(format!(
            "Buffer underflow reading u64 at position {}",
            pos
        )));
    };
    let value = u64::from_le_bytes(bytes.try_into().expect("slice length checked above"));
    *pos = start + 8;
    Ok(value)
}
/// Signed readers: reinterpret the same-width unsigned little-endian value.
fn read_i8(buf: &[u8], pos: &mut usize) -> ModelResult<i8> {
    Ok(read_u8(buf, pos)? as i8)
}
fn read_i16_le(buf: &[u8], pos: &mut usize) -> ModelResult<i16> {
    Ok(read_u16_le(buf, pos)? as i16)
}
fn read_i32_le(buf: &[u8], pos: &mut usize) -> ModelResult<i32> {
    Ok(read_u32_le(buf, pos)? as i32)
}
fn read_i64_le(buf: &[u8], pos: &mut usize) -> ModelResult<i64> {
    Ok(read_u64_le(buf, pos)? as i64)
}
/// Float readers: bit-level reinterpretation of the raw little-endian word.
fn read_f32_le(buf: &[u8], pos: &mut usize) -> ModelResult<f32> {
    Ok(f32::from_bits(read_u32_le(buf, pos)?))
}
fn read_f64_le(buf: &[u8], pos: &mut usize) -> ModelResult<f64> {
    Ok(f64::from_bits(read_u64_le(buf, pos)?))
}
/// GGUF encodes bools as one byte; any non-zero value is `true`.
fn read_bool(buf: &[u8], pos: &mut usize) -> ModelResult<bool> {
    Ok(read_u8(buf, pos)? != 0)
}
/// Read a GGUF v2/v3 string: u64 byte length followed by UTF-8 payload.
///
/// The length comes straight from the (possibly corrupt or hostile) file, so
/// all offset arithmetic is checked: previously `*pos + len` could wrap,
/// slip past the bounds check, and panic on the slice instead of returning a
/// clean error. `usize::try_from` also guards against silent truncation of
/// huge lengths on 32-bit targets.
fn read_string_v2(buf: &[u8], pos: &mut usize) -> ModelResult<String> {
    let len_u64 = read_u64_le(buf, pos)?;
    let len = usize::try_from(len_u64).map_err(|_| {
        ModelError::simple_load_error(format!(
            "GGUF string length {} does not fit in usize",
            len_u64
        ))
    })?;
    // Overflow and out-of-bounds both report as underflow, matching the
    // original error text for the in-range failure case.
    let end = pos
        .checked_add(len)
        .filter(|&e| e <= buf.len())
        .ok_or_else(|| {
            ModelError::simple_load_error(format!(
                "Buffer underflow reading string of length {} at position {}",
                len, pos
            ))
        })?;
    let s = std::str::from_utf8(&buf[*pos..end]).map_err(|e| {
        ModelError::simple_load_error(format!("Invalid UTF-8 in GGUF string: {}", e))
    })?;
    let owned = s.to_owned();
    *pos = end;
    Ok(owned)
}
/// Read a GGUF v1 string: u32 byte length followed by UTF-8 payload.
///
/// Same checked-arithmetic hardening as `read_string_v2`: a corrupt length
/// must yield an error, never a wrapped offset and a slice panic.
fn read_string_v1(buf: &[u8], pos: &mut usize) -> ModelResult<String> {
    let len = read_u32_le(buf, pos)? as usize;
    let end = pos
        .checked_add(len)
        .filter(|&e| e <= buf.len())
        .ok_or_else(|| {
            ModelError::simple_load_error(format!(
                "Buffer underflow reading v1 string of length {} at position {}",
                len, pos
            ))
        })?;
    let s = std::str::from_utf8(&buf[*pos..end]).map_err(|e| {
        ModelError::simple_load_error(format!("Invalid UTF-8 in GGUF v1 string: {}", e))
    })?;
    let owned = s.to_owned();
    *pos = end;
    Ok(owned)
}
/// On-disk type tag for a GGUF metadata value.
///
/// Discriminants match the u32 tags written in the file (decoded by
/// `from_u32`), so this enum must stay in sync with the GGUF encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum GgufValueType {
    Uint8 = 0,
    Int8 = 1,
    Uint16 = 2,
    Int16 = 3,
    Uint32 = 4,
    Int32 = 5,
    Float32 = 6,
    Bool = 7,
    // Length-prefixed UTF-8; prefix width depends on container version.
    String = 8,
    // Homogeneous array: element type tag + count + elements.
    Array = 9,
    Uint64 = 10,
    Int64 = 11,
    Float64 = 12,
}
impl GgufValueType {
fn from_u32(v: u32) -> ModelResult<Self> {
match v {
0 => Ok(Self::Uint8),
1 => Ok(Self::Int8),
2 => Ok(Self::Uint16),
3 => Ok(Self::Int16),
4 => Ok(Self::Uint32),
5 => Ok(Self::Int32),
6 => Ok(Self::Float32),
7 => Ok(Self::Bool),
8 => Ok(Self::String),
9 => Ok(Self::Array),
10 => Ok(Self::Uint64),
11 => Ok(Self::Int64),
12 => Ok(Self::Float64),
other => Err(ModelError::simple_load_error(format!(
"Unknown GGUF value type: {}",
other
))),
}
}
}
/// A decoded GGUF metadata value.
///
/// One variant per `GgufValueType`; `Array` holds recursively decoded
/// elements (all of one element type as written in the file, though this
/// representation does not enforce homogeneity).
#[derive(Debug, Clone)]
pub enum GgufMetaValue {
    Uint8(u8),
    Int8(i8),
    Uint16(u16),
    Int16(i16),
    Uint32(u32),
    Int32(i32),
    Float32(f32),
    Bool(bool),
    String(String),
    Uint64(u64),
    Int64(i64),
    Float64(f64),
    Array(Vec<GgufMetaValue>),
}
/// Tensor-data encoding (ggml quantization type).
///
/// Discriminants match the u32 tags stored in GGUF tensor descriptors.
/// Tags 4 and 5 are deliberately unmapped here (presumably the retired
/// Q4_2/Q4_3 formats — TODO confirm against the ggml type table), as are
/// 24..=29.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum GgufQuantType {
    F32 = 0,
    F16 = 1,
    Q4_0 = 2,
    Q4_1 = 3,
    Q5_0 = 6,
    Q5_1 = 7,
    Q8_0 = 8,
    Q8_1 = 9,
    Q2K = 10,
    Q3K = 11,
    Q4K = 12,
    Q5K = 13,
    Q6K = 14,
    Q8K = 15,
    IQ2XXS = 16,
    IQ2XS = 17,
    IQ3XXS = 18,
    IQ1S = 19,
    IQ4NL = 20,
    IQ3S = 21,
    IQ2S = 22,
    IQ4XS = 23,
    BF16 = 30,
}
impl GgufQuantType {
fn from_u32(v: u32) -> ModelResult<Self> {
match v {
0 => Ok(Self::F32),
1 => Ok(Self::F16),
2 => Ok(Self::Q4_0),
3 => Ok(Self::Q4_1),
6 => Ok(Self::Q5_0),
7 => Ok(Self::Q5_1),
8 => Ok(Self::Q8_0),
9 => Ok(Self::Q8_1),
10 => Ok(Self::Q2K),
11 => Ok(Self::Q3K),
12 => Ok(Self::Q4K),
13 => Ok(Self::Q5K),
14 => Ok(Self::Q6K),
15 => Ok(Self::Q8K),
16 => Ok(Self::IQ2XXS),
17 => Ok(Self::IQ2XS),
18 => Ok(Self::IQ3XXS),
19 => Ok(Self::IQ1S),
20 => Ok(Self::IQ4NL),
21 => Ok(Self::IQ3S),
22 => Ok(Self::IQ2S),
23 => Ok(Self::IQ4XS),
30 => Ok(Self::BF16),
other => Err(ModelError::simple_load_error(format!(
"Unknown GGUF quantization type: {}",
other
))),
}
}
pub fn name(&self) -> &'static str {
match self {
Self::F32 => "F32",
Self::F16 => "F16",
Self::Q4_0 => "Q4_0",
Self::Q4_1 => "Q4_1",
Self::Q5_0 => "Q5_0",
Self::Q5_1 => "Q5_1",
Self::Q8_0 => "Q8_0",
Self::Q8_1 => "Q8_1",
Self::Q2K => "Q2K",
Self::Q3K => "Q3K",
Self::Q4K => "Q4K",
Self::Q5K => "Q5K",
Self::Q6K => "Q6K",
Self::Q8K => "Q8K",
Self::IQ2XXS => "IQ2XXS",
Self::IQ2XS => "IQ2XS",
Self::IQ3XXS => "IQ3XXS",
Self::IQ1S => "IQ1S",
Self::IQ4NL => "IQ4NL",
Self::IQ3S => "IQ3S",
Self::IQ2S => "IQ2S",
Self::IQ4XS => "IQ4XS",
Self::BF16 => "BF16",
}
}
}
/// Descriptor of a single tensor in a GGUF file (no data is held here).
#[derive(Debug, Clone)]
pub struct GgufTensorInfo {
    // Tensor name as stored in the file.
    pub name: String,
    // Dimension sizes as stored in the descriptor.
    pub shape: Vec<u64>,
    // Encoding of the tensor's packed bytes.
    pub quant_type: GgufQuantType,
    // Byte offset relative to the start of the aligned data section.
    pub offset: u64,
    // Absolute byte offset in the file; filled in by `parse_gguf_buffer`
    // as `aligned_offset + offset` (0 until then).
    pub data_offset: u64,
}
impl GgufTensorInfo {
    /// Total number of elements; an empty shape yields 1 (scalar).
    pub fn n_elements(&self) -> u64 {
        self.shape.iter().fold(1u64, |acc, &dim| acc * dim)
    }
}
/// A parsed GGUF container: header/metadata/tensor directory in memory,
/// tensor data left on disk and loaded on demand.
#[derive(Debug)]
pub struct GgufFile {
    // GGUF format version (1, 2 or 3).
    pub version: u32,
    // Decoded metadata key/value pairs.
    pub metadata: HashMap<String, GgufMetaValue>,
    // Tensor descriptors in file order, with absolute data offsets resolved.
    pub tensors: Vec<GgufTensorInfo>,
    // Absolute byte offset of the aligned tensor-data section.
    data_offset: u64,
    // Path the file was opened from; re-opened when tensor data is loaded.
    file_path: PathBuf,
}
/// Lightweight summary of a GGUF file produced by `GgufFile::inspect`
/// (computed from the directory only; no tensor data is read).
#[derive(Debug, Clone)]
pub struct GgufInspection {
    pub version: u32,
    pub tensor_count: usize,
    pub metadata_count: usize,
    // `general.architecture` metadata string, if present.
    pub architecture: Option<String>,
    pub tensor_names: Vec<String>,
    // Sum of element counts over all tensors.
    pub total_param_count: u64,
    // Sorted, deduplicated quant-type names in use.
    pub quant_types_used: Vec<String>,
}
/// Decode a single metadata value of type `vtype` at `*pos`.
///
/// Strings and array counts are encoded with u32 lengths in GGUF v1 and u64
/// lengths in v2+, so the container `version` is threaded through; arrays
/// recurse with the element type.
fn read_meta_value(
    buf: &[u8],
    pos: &mut usize,
    vtype: GgufValueType,
    version: u32,
) -> ModelResult<GgufMetaValue> {
    // The array element count is read from the (possibly corrupt or hostile)
    // file; cap the speculative pre-allocation so a tiny file cannot request
    // a huge up-front reservation. The vector still grows to the true size
    // as elements parse successfully.
    const MAX_ARRAY_PREALLOC: usize = 1 << 16;
    match vtype {
        GgufValueType::Uint8 => Ok(GgufMetaValue::Uint8(read_u8(buf, pos)?)),
        GgufValueType::Int8 => Ok(GgufMetaValue::Int8(read_i8(buf, pos)?)),
        GgufValueType::Uint16 => Ok(GgufMetaValue::Uint16(read_u16_le(buf, pos)?)),
        GgufValueType::Int16 => Ok(GgufMetaValue::Int16(read_i16_le(buf, pos)?)),
        GgufValueType::Uint32 => Ok(GgufMetaValue::Uint32(read_u32_le(buf, pos)?)),
        GgufValueType::Int32 => Ok(GgufMetaValue::Int32(read_i32_le(buf, pos)?)),
        GgufValueType::Float32 => Ok(GgufMetaValue::Float32(read_f32_le(buf, pos)?)),
        GgufValueType::Bool => Ok(GgufMetaValue::Bool(read_bool(buf, pos)?)),
        GgufValueType::String => {
            let s = if version >= 2 {
                read_string_v2(buf, pos)?
            } else {
                read_string_v1(buf, pos)?
            };
            Ok(GgufMetaValue::String(s))
        }
        GgufValueType::Uint64 => Ok(GgufMetaValue::Uint64(read_u64_le(buf, pos)?)),
        GgufValueType::Int64 => Ok(GgufMetaValue::Int64(read_i64_le(buf, pos)?)),
        GgufValueType::Float64 => Ok(GgufMetaValue::Float64(read_f64_le(buf, pos)?)),
        GgufValueType::Array => {
            let elem_type_raw = read_u32_le(buf, pos)?;
            let elem_type = GgufValueType::from_u32(elem_type_raw)?;
            let count = if version >= 2 {
                read_u64_le(buf, pos)? as usize
            } else {
                read_u32_le(buf, pos)? as usize
            };
            let mut elements = Vec::with_capacity(count.min(MAX_ARRAY_PREALLOC));
            for _ in 0..count {
                elements.push(read_meta_value(buf, pos, elem_type, version)?);
            }
            Ok(GgufMetaValue::Array(elements))
        }
    }
}
fn parse_gguf_buffer(buf: &[u8], file_path: &Path) -> ModelResult<GgufBufferParsed> {
if buf.len() < 4 {
return Err(ModelError::simple_load_error("File too small to be GGUF"));
}
if &buf[0..4] != b"GGUF" {
return Err(ModelError::simple_load_error(format!(
"Invalid GGUF magic in {:?}",
file_path
)));
}
let mut pos = 4usize;
let version = read_u32_le(buf, &mut pos)?;
if version == 0 || version > 3 {
return Err(ModelError::simple_load_error(format!(
"Unsupported GGUF version: {}",
version
)));
}
let (tensor_count, kv_count) = if version >= 2 {
let tc = read_u64_le(buf, &mut pos)? as usize;
let kv = read_u64_le(buf, &mut pos)? as usize;
(tc, kv)
} else {
let tc = read_u32_le(buf, &mut pos)? as usize;
let kv = read_u32_le(buf, &mut pos)? as usize;
(tc, kv)
};
let mut metadata = HashMap::with_capacity(kv_count);
for _ in 0..kv_count {
let key = if version >= 2 {
read_string_v2(buf, &mut pos)?
} else {
read_string_v1(buf, &mut pos)?
};
let vtype_raw = read_u32_le(buf, &mut pos)?;
let vtype = GgufValueType::from_u32(vtype_raw)?;
let value = read_meta_value(buf, &mut pos, vtype, version)?;
metadata.insert(key, value);
}
let mut tensors = Vec::with_capacity(tensor_count);
for _ in 0..tensor_count {
let name = if version >= 2 {
read_string_v2(buf, &mut pos)?
} else {
read_string_v1(buf, &mut pos)?
};
let n_dims = read_u32_le(buf, &mut pos)? as usize;
let mut shape = Vec::with_capacity(n_dims);
for _ in 0..n_dims {
if version >= 2 {
shape.push(read_u64_le(buf, &mut pos)?);
} else {
shape.push(read_u32_le(buf, &mut pos)? as u64);
}
}
let quant_raw = read_u32_le(buf, &mut pos)?;
let quant_type = GgufQuantType::from_u32(quant_raw)?;
let offset = read_u64_le(buf, &mut pos)?;
tensors.push(GgufTensorInfo {
name,
shape,
quant_type,
offset,
data_offset: 0, });
}
let aligned_offset = (pos as u64 + 31) & !31u64;
for t in &mut tensors {
t.data_offset = aligned_offset + t.offset;
}
Ok((version, metadata, tensors, aligned_offset))
}
impl GgufFile {
    /// Parse the GGUF container at `path`: header, metadata and tensor
    /// directory. Tensor data is not loaded here — only offsets are recorded.
    pub fn open(path: &Path) -> ModelResult<Self> {
        let buf = std::fs::read(path).map_err(|e| {
            ModelError::simple_load_error(format!("Failed to read GGUF file {:?}: {}", path, e))
        })?;
        let (version, metadata, tensors, data_offset) = parse_gguf_buffer(&buf, path)?;
        Ok(Self {
            version,
            metadata,
            tensors,
            data_offset,
            file_path: path.to_path_buf(),
        })
    }

    /// Look up a tensor descriptor by exact name.
    fn find_tensor(&self, name: &str) -> ModelResult<&GgufTensorInfo> {
        self.tensors.iter().find(|t| t.name == name).ok_or_else(|| {
            ModelError::simple_load_error(format!(
                "Tensor '{}' not found in GGUF file {:?}",
                name, self.file_path
            ))
        })
    }

    /// Reshape dequantized values into a 2-D array; shared by both eager
    /// loaders so the reshape error message stays consistent.
    fn floats_to_array(name: &str, shape: &[u64], floats: Vec<f32>) -> ModelResult<Array2<f32>> {
        let (rows, cols) = shape_to_2d(shape);
        Array2::from_shape_vec((rows, cols), floats).map_err(|e| {
            ModelError::simple_load_error(format!(
                "Failed to reshape tensor '{}' to ({}, {}): {}",
                name, rows, cols, e
            ))
        })
    }

    /// Load a single tensor by name, dequantized to f32.
    ///
    /// Previously this re-read the entire file into memory to extract one
    /// tensor; it now opens the file and seeks directly to the tensor's data
    /// region via `load_tensor_lazy`.
    pub fn load_tensor_f32(&self, name: &str) -> ModelResult<Array2<f32>> {
        let info = self.find_tensor(name)?;
        let mut file = std::fs::File::open(&self.file_path).map_err(|e| {
            ModelError::simple_load_error(format!(
                "Failed to re-read GGUF file {:?}: {}",
                self.file_path, e
            ))
        })?;
        let floats = Self::load_tensor_lazy(&mut file, info)?;
        Self::floats_to_array(&info.name, &info.shape, floats)
    }

    /// Load every tensor in the file, dequantized to f32.
    ///
    /// The file is read into memory once; each tensor's packed region is
    /// sliced out of that single buffer.
    pub fn load_all_tensors_f32(&self) -> ModelResult<HashMap<String, Array2<f32>>> {
        let file_buf = std::fs::read(&self.file_path).map_err(|e| {
            ModelError::simple_load_error(format!(
                "Failed to read GGUF file {:?}: {}",
                self.file_path, e
            ))
        })?;
        let mut result = HashMap::with_capacity(self.tensors.len());
        for info in &self.tensors {
            let n_elements = info.n_elements() as usize;
            let byte_len = tensor_byte_size(info)?;
            // `data_offset` was resolved to an absolute position at parse time.
            let start = info.data_offset as usize;
            // Checked add: a corrupt offset must error, never wrap and slice.
            let end = start
                .checked_add(byte_len)
                .filter(|&e| e <= file_buf.len());
            let Some(end) = end else {
                return Err(ModelError::simple_load_error(format!(
                    "Tensor '{}' data region [{}, {}) exceeds file size {}",
                    info.name,
                    start,
                    start.saturating_add(byte_len),
                    file_buf.len()
                )));
            };
            let floats = dequant::dequantize(&file_buf[start..end], &info.quant_type, n_elements)?;
            result.insert(
                info.name.clone(),
                Self::floats_to_array(&info.name, &info.shape, floats)?,
            );
        }
        Ok(result)
    }

    /// The `general.architecture` metadata string, if present.
    pub fn architecture(&self) -> Option<&str> {
        match self.metadata.get("general.architecture") {
            Some(GgufMetaValue::String(s)) => Some(s.as_str()),
            _ => None,
        }
    }

    /// Summarize the file from the directory alone (no tensor data is read).
    pub fn inspect(&self) -> GgufInspection {
        let tensor_names: Vec<String> = self.tensors.iter().map(|t| t.name.clone()).collect();
        let total_param_count: u64 = self.tensors.iter().map(|t| t.n_elements()).sum();
        let mut quant_set: std::collections::HashSet<&str> = std::collections::HashSet::new();
        for t in &self.tensors {
            quant_set.insert(t.quant_type.name());
        }
        // Sorted for deterministic output across runs.
        let mut quant_types_used: Vec<String> =
            quant_set.into_iter().map(|s| s.to_owned()).collect();
        quant_types_used.sort();
        GgufInspection {
            version: self.version,
            tensor_count: self.tensors.len(),
            metadata_count: self.metadata.len(),
            architecture: self.architecture().map(|s| s.to_owned()),
            tensor_names,
            total_param_count,
            quant_types_used,
        }
    }

    /// Names of all tensors, in file order.
    pub fn tensor_names(&self) -> Vec<&str> {
        self.tensors.iter().map(|t| t.name.as_str()).collect()
    }

    /// Read and dequantize one tensor from an open reader by seeking to the
    /// absolute `data_offset` recorded in `info`, without loading the rest
    /// of the file.
    pub fn load_tensor_lazy<R: std::io::Read + std::io::Seek>(
        reader: &mut R,
        info: &GgufTensorInfo,
    ) -> ModelResult<Vec<f32>> {
        use std::io::SeekFrom;
        let n_elements = info.n_elements() as usize;
        let byte_len = tensor_byte_size(info)?;
        reader
            .seek(SeekFrom::Start(info.data_offset))
            .map_err(|e| {
                ModelError::simple_load_error(format!(
                    "Failed to seek to tensor '{}' at offset {}: {}",
                    info.name, info.data_offset, e
                ))
            })?;
        let mut raw = vec![0u8; byte_len];
        reader.read_exact(&mut raw).map_err(|e| {
            ModelError::simple_load_error(format!(
                "Failed to read {} bytes for tensor '{}': {}",
                byte_len, info.name, e
            ))
        })?;
        dequant::dequantize(&raw, &info.quant_type, n_elements)
    }
}
/// Collapse an arbitrary-rank shape into `(rows, cols)` for an `Array2`:
/// scalars become (1, 1), vectors (1, len), and for rank >= 2 every leading
/// dimension is folded into the row count while the last dimension becomes
/// the column count.
fn shape_to_2d(shape: &[u64]) -> (usize, usize) {
    match shape {
        [] => (1, 1),
        [n] => (1, *n as usize),
        [r, c] => (*r as usize, *c as usize),
        [leading @ .., last] => {
            let rows: u64 = leading.iter().product();
            (rows as usize, *last as usize)
        }
    }
}
/// Packed byte size of a tensor's on-disk data region.
///
/// Unquantized types are a fixed number of bytes per element; block-quantized
/// types store fixed-size blocks of 32 or 256 elements, so the element count
/// must be an exact multiple of the block size.
///
/// Refactored from a 100-line copy-paste match into a (block_elems,
/// block_bytes) table; this also makes the divisibility error message
/// uniform — previously Q6K included the tensor name while Q2K..Q8K did not.
fn tensor_byte_size(info: &GgufTensorInfo) -> ModelResult<usize> {
    let n = info.n_elements() as usize;
    // (elements per block, bytes per block) for the block-quantized formats.
    let (block_elems, block_bytes) = match info.quant_type {
        // Plain formats: fixed bytes per element, no block constraint.
        GgufQuantType::F32 => return Ok(n * 4),
        GgufQuantType::F16 | GgufQuantType::BF16 => return Ok(n * 2),
        // 32-element blocks.
        GgufQuantType::Q4_0 => (32, 18),
        GgufQuantType::Q4_1 => (32, 20),
        GgufQuantType::Q5_0 => (32, 22),
        GgufQuantType::Q5_1 => (32, 24),
        GgufQuantType::Q8_0 => (32, 34),
        GgufQuantType::Q8_1 => (32, 36),
        // 256-element K-quant blocks.
        GgufQuantType::Q2K => (256, 84),
        GgufQuantType::Q3K => (256, 110),
        GgufQuantType::Q4K => (256, 144),
        GgufQuantType::Q5K => (256, 176),
        GgufQuantType::Q6K => (256, 210),
        GgufQuantType::Q8K => (256, 292),
        // IQ* formats are not sized here.
        qt => {
            return Err(ModelError::simple_load_error(format!(
                "Cannot compute byte size for unsupported quant type {:?}",
                qt
            )))
        }
    };
    if !n.is_multiple_of(block_elems) {
        return Err(ModelError::simple_load_error(format!(
            "{} tensor '{}' has {} elements, not divisible by {}",
            info.quant_type.name(),
            info.name,
            n,
            block_elems
        )));
    }
    Ok((n / block_elems) * block_bytes)
}
pub(crate) mod dequant {
use super::GgufQuantType;
use crate::error::{ModelError, ModelResult};
use crate::gguf_dequant as kquant;
/// Dispatch dequantization by quant type, producing `n_elements` f32 values.
///
/// `data` must hold at least the packed byte size for `n_elements`; each
/// worker re-validates its own length. Simple block formats are handled in
/// this module; the K-quants other than Q6K delegate to
/// `crate::gguf_dequant`. Q8_1 and the IQ* formats fall through to the
/// unsupported-type error.
pub fn dequantize(
    data: &[u8],
    quant_type: &GgufQuantType,
    n_elements: usize,
) -> ModelResult<Vec<f32>> {
    match quant_type {
        GgufQuantType::F32 => dequant_f32(data, n_elements),
        GgufQuantType::F16 => dequant_f16(data, n_elements),
        GgufQuantType::BF16 => dequant_bf16(data, n_elements),
        GgufQuantType::Q4_0 => dequant_q4_0(data, n_elements),
        GgufQuantType::Q4_1 => dequant_q4_1(data, n_elements),
        GgufQuantType::Q5_0 => dequant_q5_0(data, n_elements),
        GgufQuantType::Q5_1 => dequant_q5_1(data, n_elements),
        GgufQuantType::Q8_0 => dequant_q8_0(data, n_elements),
        GgufQuantType::Q6K => dequant_q6_k(data, n_elements),
        GgufQuantType::Q2K => kquant::dequant_q2_k(data, n_elements),
        GgufQuantType::Q3K => kquant::dequant_q3_k(data, n_elements),
        GgufQuantType::Q4K => kquant::dequant_q4_k(data, n_elements),
        GgufQuantType::Q5K => kquant::dequant_q5_k(data, n_elements),
        GgufQuantType::Q8K => kquant::dequant_q8_k(data, n_elements),
        qt => Err(ModelError::simple_load_error(format!(
            "Unsupported quant type for dequantization: {:?}",
            qt
        ))),
    }
}
/// Decode `n` little-endian f32 values from the front of `data`.
fn dequant_f32(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    let needed = n * 4;
    if data.len() < needed {
        return Err(ModelError::simple_load_error(format!(
            "F32 tensor needs {} bytes, got {}",
            needed,
            data.len()
        )));
    }
    Ok(data[..needed]
        .chunks_exact(4)
        .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]]))
        .collect())
}
/// Decode `n` little-endian IEEE-754 half-precision values, widened to f32.
pub(super) fn dequant_f16(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    let needed = n * 2;
    if data.len() < needed {
        return Err(ModelError::simple_load_error(format!(
            "F16 tensor needs {} bytes, got {}",
            needed,
            data.len()
        )));
    }
    Ok(data[..needed]
        .chunks_exact(2)
        .map(|c| half::f16::from_bits(u16::from_le_bytes([c[0], c[1]])).to_f32())
        .collect())
}
/// Decode `n` little-endian bfloat16 values, widened to f32.
/// bf16 is the top 16 bits of an f32, so widening is a left shift of the bits.
pub(super) fn dequant_bf16(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    let needed = n * 2;
    if data.len() < needed {
        return Err(ModelError::simple_load_error(format!(
            "BF16 tensor needs {} bytes, got {}",
            needed,
            data.len()
        )));
    }
    Ok(data[..needed]
        .chunks_exact(2)
        .map(|c| f32::from_bits((u16::from_le_bytes([c[0], c[1]]) as u32) << 16))
        .collect())
}
/// Dequantize Q4_0: 32-element blocks of one f16 scale plus 16 nibble-packed
/// bytes, value = (nibble - 8) * scale.
///
/// Element ordering follows ggml's reference `dequantize_row_q4_0`: the 16
/// low nibbles are elements 0..16 and the 16 high nibbles are elements
/// 16..32. The previous code interleaved low/high nibbles, emitting elements
/// in the wrong order for files produced by standard GGUF writers.
pub(super) fn dequant_q4_0(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 32;
    const BLOCK_BYTES: usize = 18; // 2-byte f16 delta + 16 packed bytes
    if !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q4_0: n_elements {} not divisible by {}",
            n, BLOCK_ELEMS
        )));
    }
    let n_blocks = n / BLOCK_ELEMS;
    if data.len() < n_blocks * BLOCK_BYTES {
        return Err(ModelError::simple_load_error("Q4_0 data buffer too small"));
    }
    let mut out = Vec::with_capacity(n);
    for b in 0..n_blocks {
        let base = b * BLOCK_BYTES;
        let delta_bits = u16::from_le_bytes([data[base], data[base + 1]]);
        let delta = half::f16::from_bits(delta_bits).to_f32();
        let qs = &data[base + 2..base + 18];
        // Elements 0..16: low nibbles.
        for &byte in qs {
            out.push(((byte & 0x0F) as i32 - 8) as f32 * delta);
        }
        // Elements 16..32: high nibbles.
        for &byte in qs {
            out.push((((byte >> 4) & 0x0F) as i32 - 8) as f32 * delta);
        }
    }
    Ok(out)
}
/// Dequantize Q4_1: 32-element blocks of f16 scale, f16 minimum, then 16
/// nibble-packed bytes, value = nibble * scale + min.
///
/// Element ordering follows ggml's reference `dequantize_row_q4_1` (all low
/// nibbles first, then all high nibbles); the previous code interleaved them.
pub(super) fn dequant_q4_1(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 32;
    const BLOCK_BYTES: usize = 20; // f16 delta + f16 min + 16 packed bytes
    if !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q4_1: n_elements {} not divisible by {}",
            n, BLOCK_ELEMS
        )));
    }
    let n_blocks = n / BLOCK_ELEMS;
    if data.len() < n_blocks * BLOCK_BYTES {
        return Err(ModelError::simple_load_error("Q4_1 data buffer too small"));
    }
    let mut out = Vec::with_capacity(n);
    for b in 0..n_blocks {
        let base = b * BLOCK_BYTES;
        let delta_bits = u16::from_le_bytes([data[base], data[base + 1]]);
        let delta = half::f16::from_bits(delta_bits).to_f32();
        let min_bits = u16::from_le_bytes([data[base + 2], data[base + 3]]);
        let min = half::f16::from_bits(min_bits).to_f32();
        let qs = &data[base + 4..base + 20];
        // Elements 0..16: low nibbles.
        for &byte in qs {
            out.push((byte & 0x0F) as f32 * delta + min);
        }
        // Elements 16..32: high nibbles.
        for &byte in qs {
            out.push(((byte >> 4) & 0x0F) as f32 * delta + min);
        }
    }
    Ok(out)
}
/// Dequantize Q5_0: 32-element blocks of one f16 scale, a 32-bit word of
/// fifth bits (one per element), then 16 nibble-packed bytes;
/// value = (5-bit quant - 16) * scale.
///
/// Bit/element mapping follows ggml's reference `dequantize_row_q5_0`:
/// element j (0..16) takes the low nibble of qs[j] with fifth bit qh bit j;
/// element j+16 takes the high nibble of qs[j] with fifth bit qh bit j+16.
/// The previous code interleaved low/high nibbles and used adjacent qh bits
/// (2j, 2j+1), producing both the wrong order and the wrong high bits.
pub(super) fn dequant_q5_0(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 32;
    const BLOCK_BYTES: usize = 22; // f16 delta + u32 qh + 16 packed bytes
    if !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q5_0: n_elements {} not divisible by {}",
            n, BLOCK_ELEMS
        )));
    }
    let n_blocks = n / BLOCK_ELEMS;
    if data.len() < n_blocks * BLOCK_BYTES {
        return Err(ModelError::simple_load_error("Q5_0 data buffer too small"));
    }
    let mut out = Vec::with_capacity(n);
    for b in 0..n_blocks {
        let base = b * BLOCK_BYTES;
        let delta_bits = u16::from_le_bytes([data[base], data[base + 1]]);
        let delta = half::f16::from_bits(delta_bits).to_f32();
        let qh = u32::from_le_bytes([
            data[base + 2],
            data[base + 3],
            data[base + 4],
            data[base + 5],
        ]);
        let qs = &data[base + 6..base + 22];
        // Elements 0..16: low nibbles, fifth bit from qh bits 0..16.
        for (j, &byte) in qs.iter().enumerate() {
            let hi = (qh >> j) & 1;
            let q = (((byte & 0x0F) as u32) | (hi << 4)) as i32 - 16;
            out.push(q as f32 * delta);
        }
        // Elements 16..32: high nibbles, fifth bit from qh bits 16..32.
        for (j, &byte) in qs.iter().enumerate() {
            let hi = (qh >> (j + 16)) & 1;
            let q = ((((byte >> 4) & 0x0F) as u32) | (hi << 4)) as i32 - 16;
            out.push(q as f32 * delta);
        }
    }
    Ok(out)
}
/// Dequantize Q5_1: 32-element blocks of f16 scale, f16 minimum, a 32-bit
/// word of fifth bits, then 16 nibble-packed bytes;
/// value = 5-bit quant * scale + min.
///
/// Bit/element mapping follows ggml's reference `dequantize_row_q5_1`
/// (low nibbles are elements 0..16 with qh bits 0..16, high nibbles are
/// elements 16..32 with qh bits 16..32); the previous code interleaved
/// elements and used adjacent qh bits.
pub(super) fn dequant_q5_1(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 32;
    const BLOCK_BYTES: usize = 24; // f16 delta + f16 min + u32 qh + 16 bytes
    if !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q5_1: n_elements {} not divisible by {}",
            n, BLOCK_ELEMS
        )));
    }
    let n_blocks = n / BLOCK_ELEMS;
    if data.len() < n_blocks * BLOCK_BYTES {
        return Err(ModelError::simple_load_error("Q5_1 data buffer too small"));
    }
    let mut out = Vec::with_capacity(n);
    for b in 0..n_blocks {
        let base = b * BLOCK_BYTES;
        let delta_bits = u16::from_le_bytes([data[base], data[base + 1]]);
        let delta = half::f16::from_bits(delta_bits).to_f32();
        let min_bits = u16::from_le_bytes([data[base + 2], data[base + 3]]);
        let min = half::f16::from_bits(min_bits).to_f32();
        let qh = u32::from_le_bytes([
            data[base + 4],
            data[base + 5],
            data[base + 6],
            data[base + 7],
        ]);
        let qs = &data[base + 8..base + 24];
        // Elements 0..16: low nibbles, fifth bit from qh bits 0..16.
        for (j, &byte) in qs.iter().enumerate() {
            let hi = (qh >> j) & 1;
            let q = ((byte & 0x0F) as u32) | (hi << 4);
            out.push(q as f32 * delta + min);
        }
        // Elements 16..32: high nibbles, fifth bit from qh bits 16..32.
        for (j, &byte) in qs.iter().enumerate() {
            let hi = (qh >> (j + 16)) & 1;
            let q = (((byte >> 4) & 0x0F) as u32) | (hi << 4);
            out.push(q as f32 * delta + min);
        }
    }
    Ok(out)
}
/// Dequantize Q8_0: 32-element blocks of one f16 scale followed by 32 signed
/// bytes, value = i8 * scale. Bytes are stored in element order, so no
/// reordering is needed.
pub(super) fn dequant_q8_0(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 32;
    const BLOCK_BYTES: usize = 34; // 2-byte f16 delta + 32 quantized bytes
    if !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q8_0: n_elements {} not divisible by {}",
            n, BLOCK_ELEMS
        )));
    }
    let n_blocks = n / BLOCK_ELEMS;
    if data.len() < n_blocks * BLOCK_BYTES {
        return Err(ModelError::simple_load_error("Q8_0 data buffer too small"));
    }
    let mut out = Vec::with_capacity(n);
    for block in data[..n_blocks * BLOCK_BYTES].chunks_exact(BLOCK_BYTES) {
        let delta = half::f16::from_bits(u16::from_le_bytes([block[0], block[1]])).to_f32();
        out.extend(block[2..BLOCK_BYTES].iter().map(|&q| (q as i8) as f32 * delta));
    }
    Ok(out)
}
/// Dequantize Q6K: 256-element blocks of 128 low-nibble-pair bytes (`ql`),
/// 64 two-bit-pair bytes (`qh`), 16 signed 8-bit sub-block scales, and one
/// f16 super-block scale; value = d * scale * (6-bit quant - 32).
///
/// NOTE(review): the element layout here — interleaved lo/hi output per
/// `ql` byte, qh shifts chosen by `i % 2`, and scale index derived from the
/// output position `(i * 2) / 16` — does not obviously match ggml's
/// reference `dequantize_row_q6_K`, which emits four contiguous 32-element
/// groups per 128-element half with scale strides of 2. Verify against
/// llama.cpp / a known-good tensor before trusting Q6K output.
pub(super) fn dequant_q6_k(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 256;
    const BLOCK_BYTES: usize = 210; // 128 ql + 64 qh + 16 scales + 2-byte f16 d
    if !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q6K: n_elements {} not divisible by {}",
            n, BLOCK_ELEMS
        )));
    }
    let n_blocks = n / BLOCK_ELEMS;
    if data.len() < n_blocks * BLOCK_BYTES {
        return Err(ModelError::simple_load_error("Q6K data buffer too small"));
    }
    let mut out = Vec::with_capacity(n);
    for b in 0..n_blocks {
        let base = b * BLOCK_BYTES;
        // Block sub-regions, in on-disk order.
        let ql = &data[base..base + 128];
        let qh = &data[base + 128..base + 192];
        let scales_raw = &data[base + 192..base + 208];
        let delta_bits = u16::from_le_bytes([data[base + 208], data[base + 209]]);
        let delta = half::f16::from_bits(delta_bits).to_f32();
        // Each iteration consumes one ql byte and emits two elements.
        for i in 0..128usize {
            let qh_byte = qh[i / 2];
            // Even i uses qh bit-pairs at shifts 0/2, odd i uses 4/6.
            let shift_lo = (i % 2) * 4;
            let shift_hi = (i % 2) * 4 + 2;
            let q_lo_low4 = ql[i] & 0x0F;
            let q_hi_low4 = (ql[i] >> 4) & 0x0F;
            let q_lo_high2 = (qh_byte >> shift_lo) & 0x03;
            let q_hi_high2 = (qh_byte >> shift_hi) & 0x03;
            // Recombine into 6-bit quants, centered by -32.
            let q_lo = ((q_lo_high2 << 4) | q_lo_low4) as i32 - 32;
            let q_hi = ((q_hi_high2 << 4) | q_hi_low4) as i32 - 32;
            // Sub-block scale chosen by output position (16 elements each).
            let scale_idx_lo = (i * 2) / 16;
            let scale_idx_hi = (i * 2 + 1) / 16;
            if scale_idx_lo >= 16 || scale_idx_hi >= 16 {
                return Err(ModelError::simple_load_error(
                    "Q6K scale index out of range",
                ));
            }
            let scale_lo = scales_raw[scale_idx_lo] as i8 as f32;
            let scale_hi = scales_raw[scale_idx_hi] as i8 as f32;
            out.push(delta * scale_lo * q_lo as f32);
            out.push(delta * scale_hi * q_hi as f32);
        }
    }
    Ok(out)
}
}
#[cfg(test)]
mod tests {
use super::*;
/// Append a GGUF magic + version + counts header.
/// v1 stores the counts as u32; v2+ as u64.
fn write_gguf_header(buf: &mut Vec<u8>, version: u32, tensor_count: u64, kv_count: u64) {
    buf.extend_from_slice(b"GGUF");
    buf.extend_from_slice(&version.to_le_bytes());
    if version >= 2 {
        buf.extend_from_slice(&tensor_count.to_le_bytes());
        buf.extend_from_slice(&kv_count.to_le_bytes());
    } else {
        buf.extend_from_slice(&(tensor_count as u32).to_le_bytes());
        buf.extend_from_slice(&(kv_count as u32).to_le_bytes());
    }
}
/// Append a v2+-style string: u64 length prefix + UTF-8 bytes.
fn write_str_v2(buf: &mut Vec<u8>, s: &str) {
    let bytes = s.as_bytes();
    buf.extend_from_slice(&(bytes.len() as u64).to_le_bytes());
    buf.extend_from_slice(bytes);
}
/// Append a v1-style string: u32 length prefix + UTF-8 bytes.
#[allow(dead_code)]
fn write_str_v1(buf: &mut Vec<u8>, s: &str) {
    let bytes = s.as_bytes();
    buf.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
    buf.extend_from_slice(bytes);
}
/// Zero-pad to the default 32-byte data-section alignment.
fn pad_to_32(buf: &mut Vec<u8>) {
    let rem = buf.len() % 32;
    if rem != 0 {
        let pad = 32 - rem;
        buf.extend(std::iter::repeat_n(0u8, pad));
    }
}
// A file with a wrong magic must be rejected.
#[test]
fn test_magic_validation() {
    let dir = std::env::temp_dir();
    let path = dir.join("gguf_bad_magic.bin");
    let mut buf = Vec::new();
    buf.extend_from_slice(b"XXXX"); // not "GGUF"
    buf.extend_from_slice(&2u32.to_le_bytes());
    buf.extend_from_slice(&0u64.to_le_bytes());
    buf.extend_from_slice(&0u64.to_le_bytes());
    std::fs::write(&path, &buf).unwrap();
    let result = GgufFile::open(&path);
    assert!(result.is_err(), "Expected error for bad magic");
    let _ = std::fs::remove_file(&path);
}
// Empty v1 container (u32 counts) parses cleanly.
#[test]
fn test_version_1_parse() {
    let dir = std::env::temp_dir();
    let path = dir.join("gguf_v1_empty.bin");
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 1, 0, 0);
    pad_to_32(&mut buf);
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("v1 parse failed");
    assert_eq!(file.version, 1);
    assert!(file.tensors.is_empty());
    assert!(file.metadata.is_empty());
    let _ = std::fs::remove_file(&path);
}
// Empty v2 container (u64 counts) parses cleanly.
#[test]
fn test_version_2_parse() {
    let dir = std::env::temp_dir();
    let path = dir.join("gguf_v2_empty.bin");
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 2, 0, 0);
    pad_to_32(&mut buf);
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("v2 parse failed");
    assert_eq!(file.version, 2);
    assert!(file.tensors.is_empty());
    let _ = std::fs::remove_file(&path);
}
// Empty v3 container parses cleanly (same layout as v2).
#[test]
fn test_version_3_parse() {
    let dir = std::env::temp_dir();
    let path = dir.join("gguf_v3_empty.bin");
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 3, 0, 0);
    pad_to_32(&mut buf);
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("v3 parse failed");
    assert_eq!(file.version, 3);
    assert!(file.tensors.is_empty());
    let _ = std::fs::remove_file(&path);
}
// A single Uint32 metadata entry round-trips.
#[test]
fn test_metadata_uint32() {
    let dir = std::env::temp_dir();
    let path = dir.join("gguf_meta_uint32.bin");
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 2, 0, 1);
    write_str_v2(&mut buf, "my.key");
    buf.extend_from_slice(&4u32.to_le_bytes()); // value type 4 = Uint32
    buf.extend_from_slice(&42u32.to_le_bytes()); // payload
    pad_to_32(&mut buf);
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("parse failed");
    match file.metadata.get("my.key") {
        Some(GgufMetaValue::Uint32(v)) => assert_eq!(*v, 42u32),
        other => panic!("Expected Uint32(42), got {:?}", other),
    }
    let _ = std::fs::remove_file(&path);
}
// A single String metadata entry round-trips.
#[test]
fn test_metadata_string() {
    let dir = std::env::temp_dir();
    let path = dir.join("gguf_meta_string.bin");
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 2, 0, 1);
    write_str_v2(&mut buf, "general.name");
    buf.extend_from_slice(&8u32.to_le_bytes()); // value type 8 = String
    write_str_v2(&mut buf, "test-model");
    pad_to_32(&mut buf);
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("parse failed");
    match file.metadata.get("general.name") {
        Some(GgufMetaValue::String(s)) => assert_eq!(s, "test-model"),
        other => panic!("Expected String, got {:?}", other),
    }
    let _ = std::fs::remove_file(&path);
}
// A single Float32 metadata entry round-trips (approximate compare).
#[test]
fn test_metadata_float32() {
    let dir = std::env::temp_dir();
    let path = dir.join("gguf_meta_float32.bin");
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 2, 0, 1);
    write_str_v2(&mut buf, "param.scale");
    buf.extend_from_slice(&6u32.to_le_bytes()); // value type 6 = Float32
    buf.extend_from_slice(&std::f32::consts::PI.to_le_bytes());
    pad_to_32(&mut buf);
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("parse failed");
    match file.metadata.get("param.scale") {
        Some(GgufMetaValue::Float32(v)) => assert!((v - std::f32::consts::PI).abs() < 1e-5),
        other => panic!("Expected Float32, got {:?}", other),
    }
    let _ = std::fs::remove_file(&path);
}
#[test]
fn test_metadata_array_uint32() {
let dir = std::env::temp_dir();
let path = dir.join("gguf_meta_array_uint32.bin");
let mut buf = Vec::new();
write_gguf_header(&mut buf, 2, 0, 1);
write_str_v2(&mut buf, "layer.sizes");
buf.extend_from_slice(&9u32.to_le_bytes()); buf.extend_from_slice(&4u32.to_le_bytes()); buf.extend_from_slice(&3u64.to_le_bytes()); buf.extend_from_slice(&10u32.to_le_bytes());
buf.extend_from_slice(&20u32.to_le_bytes());
buf.extend_from_slice(&30u32.to_le_bytes());
pad_to_32(&mut buf);
std::fs::write(&path, &buf).unwrap();
let file = GgufFile::open(&path).expect("parse failed");
match file.metadata.get("layer.sizes") {
Some(GgufMetaValue::Array(arr)) => {
assert_eq!(arr.len(), 3);
match (&arr[0], &arr[1], &arr[2]) {
(
GgufMetaValue::Uint32(a),
GgufMetaValue::Uint32(b),
GgufMetaValue::Uint32(c),
) => {
assert_eq!(*a, 10);
assert_eq!(*b, 20);
assert_eq!(*c, 30);
}
_ => panic!("Unexpected array element types"),
}
}
other => panic!("Expected Array, got {:?}", other),
}
let _ = std::fs::remove_file(&path);
}
#[test]
fn test_f32_tensor_load() {
    // Serialize one 3x2 F32 tensor and confirm every value survives loading.
    // Membership (rather than positional equality) is asserted because the
    // element ordering of the returned 2-D array is not pinned down here.
    let path = std::env::temp_dir().join("gguf_f32_tensor.bin");
    let values: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
    let mut bytes = Vec::new();
    write_gguf_header(&mut bytes, 2, 1, 0);
    write_str_v2(&mut bytes, "my_tensor");
    bytes.extend_from_slice(&2u32.to_le_bytes()); // number of dimensions
    bytes.extend_from_slice(&3u64.to_le_bytes()); // dim 0
    bytes.extend_from_slice(&2u64.to_le_bytes()); // dim 1
    bytes.extend_from_slice(&0u32.to_le_bytes()); // tensor type 0 = F32
    bytes.extend_from_slice(&0u64.to_le_bytes()); // data offset
    pad_to_32(&mut bytes);
    bytes.extend(values.iter().flat_map(|v| v.to_le_bytes()));
    std::fs::write(&path, &bytes).unwrap();
    let file = GgufFile::open(&path).expect("parse failed");
    let arr = file.load_tensor_f32("my_tensor").expect("load failed");
    assert_eq!(arr.nrows() * arr.ncols(), 6);
    let flat: Vec<f32> = arr.iter().cloned().collect();
    for v in &values {
        assert!(flat.contains(v), "Value {} not found", v);
    }
    let _ = std::fs::remove_file(&path);
}
#[test]
fn test_f16_tensor_load() {
    // Serialize a 4x4 F16 tensor and verify both the loaded shape and the
    // dequantized contents. Values i * 0.5 for i in 0..16 are exactly
    // representable in f16, so f32 -> f16 -> f32 round-trips losslessly and
    // exact equality checks are safe.
    let dir = std::env::temp_dir();
    let path = dir.join("gguf_f16_tensor.bin");
    let n: usize = 16;
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 2, 1, 0);
    write_str_v2(&mut buf, "f16_tensor");
    // Tensor info: 2 dims (4 x 4), tensor type 1 = F16, data offset 0.
    buf.extend_from_slice(&2u32.to_le_bytes());
    buf.extend_from_slice(&4u64.to_le_bytes());
    buf.extend_from_slice(&4u64.to_le_bytes());
    buf.extend_from_slice(&1u32.to_le_bytes());
    buf.extend_from_slice(&0u64.to_le_bytes());
    pad_to_32(&mut buf);
    for i in 0..n {
        let val = half::f16::from_f32(i as f32 * 0.5);
        buf.extend_from_slice(&val.to_bits().to_le_bytes());
    }
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("parse failed");
    let arr = file.load_tensor_f32("f16_tensor").expect("load failed");
    assert_eq!(arr.nrows(), 4);
    assert_eq!(arr.ncols(), 4);
    // Previously only the shape was asserted; also check every expected value
    // made it through dequantization. Membership is used (not positional
    // equality) because the 2-D element ordering is not pinned down here.
    let flat: Vec<f32> = arr.iter().cloned().collect();
    for i in 0..n {
        let expected = i as f32 * 0.5;
        assert!(flat.contains(&expected), "Value {} not found", expected);
    }
    let _ = std::fs::remove_file(&path);
}
#[test]
fn test_q4_0_tensor_load() {
    // Build a single Q4_0 block: an f16 scale followed by 16 bytes of packed
    // 4-bit values. With every nibble set to 0x8, the test expects every
    // dequantized element to come out as exactly 0.0.
    let path = std::env::temp_dir().join("gguf_q4_0_tensor.bin");
    let n: usize = 32; // one Q4_0 block covers 32 elements
    let mut block = Vec::new();
    block.extend_from_slice(&half::f16::from_f32(1.0).to_bits().to_le_bytes());
    block.extend([0x88u8; 16]);
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 2, 1, 0);
    write_str_v2(&mut buf, "q4_tensor");
    buf.extend_from_slice(&1u32.to_le_bytes()); // number of dimensions
    buf.extend_from_slice(&(n as u64).to_le_bytes()); // dim 0
    buf.extend_from_slice(&2u32.to_le_bytes()); // tensor type 2 = Q4_0
    buf.extend_from_slice(&0u64.to_le_bytes()); // data offset
    pad_to_32(&mut buf);
    buf.extend_from_slice(&block);
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("parse failed");
    let arr = file.load_tensor_f32("q4_tensor").expect("load failed");
    assert_eq!(arr.nrows() * arr.ncols(), n);
    for v in arr.iter() {
        assert_eq!(*v, 0.0f32, "Expected 0.0, got {}", v);
    }
    let _ = std::fs::remove_file(&path);
}
#[test]
fn test_architecture_extraction() {
    // A "general.architecture" string entry should be surfaced by the
    // architecture() accessor.
    let path = std::env::temp_dir().join("gguf_arch.bin");
    let mut bytes = Vec::new();
    write_gguf_header(&mut bytes, 2, 0, 1);
    write_str_v2(&mut bytes, "general.architecture");
    bytes.extend_from_slice(&8u32.to_le_bytes()); // value type 8 = string
    write_str_v2(&mut bytes, "llama");
    pad_to_32(&mut bytes);
    std::fs::write(&path, &bytes).unwrap();
    let parsed = GgufFile::open(&path).expect("parse failed");
    assert_eq!(parsed.architecture(), Some("llama"));
    let _ = std::fs::remove_file(&path);
}
#[test]
fn test_multi_tensor_loading() {
    // Two 1-D F32 tensors stored back to back; the second one lives at a
    // non-zero data offset. load_all_tensors_f32 must return both with the
    // exact values that were written.
    let path = std::env::temp_dir().join("gguf_multi_tensor.bin");
    let vals_a: [f32; 4] = [1.0, 2.0, 3.0, 4.0];
    let vals_b: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
    let bytes_per_tensor: u64 = 4 * 4;
    let mut buf = Vec::new();
    write_gguf_header(&mut buf, 2, 2, 0);
    // Append one tensor-info record: 1-D, 4 elements, type 0 = F32, at `offset`.
    let write_info = |buf: &mut Vec<u8>, name: &str, offset: u64| {
        write_str_v2(buf, name);
        buf.extend_from_slice(&1u32.to_le_bytes());
        buf.extend_from_slice(&4u64.to_le_bytes());
        buf.extend_from_slice(&0u32.to_le_bytes());
        buf.extend_from_slice(&offset.to_le_bytes());
    };
    write_info(&mut buf, "tensor_a", 0);
    write_info(&mut buf, "tensor_b", bytes_per_tensor);
    pad_to_32(&mut buf);
    for v in vals_a.iter().chain(vals_b.iter()) {
        buf.extend_from_slice(&v.to_le_bytes());
    }
    std::fs::write(&path, &buf).unwrap();
    let file = GgufFile::open(&path).expect("parse failed");
    let all = file.load_all_tensors_f32().expect("load all failed");
    assert!(all.contains_key("tensor_a"), "tensor_a missing");
    assert!(all.contains_key("tensor_b"), "tensor_b missing");
    let a = &all["tensor_a"];
    let b = &all["tensor_b"];
    assert_eq!(a.nrows() * a.ncols(), 4);
    assert_eq!(b.nrows() * b.ncols(), 4);
    let flat_a: Vec<f32> = a.iter().cloned().collect();
    let flat_b: Vec<f32> = b.iter().cloned().collect();
    assert_eq!(flat_a, vals_a);
    assert_eq!(flat_b, vals_b);
    let _ = std::fs::remove_file(&path);
}
}