pub mod loader;
pub mod model_init;
pub mod parser;
pub mod quantization;
pub mod tensors;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom};
use std::path::Path;
use crate::backends::ModelArchitecture;
use crate::error::{Result, RuvLLMError};
pub use loader::{
GgufLoader, LoadConfig, LoadProgress, LoadedTensor, LoadedWeights, ProgressCallback,
StreamingLoader, TensorCategory, TensorNameMapper,
};
pub use model_init::{
LayerWeights, ModelInitializer, ModelWeights, ProgressModelBuilder, QuantizedWeight,
WeightTensor,
};
pub use parser::{parse_header, parse_metadata, parse_tensor_infos, GgufHeader, GgufValue};
pub use quantization::{dequantize_block, GgufQuantType, QuantizedTensor};
pub use tensors::TensorInfo;
pub const GGUF_MAGIC: u32 = 0x46554747;
pub const GGUF_VERSION: u32 = 3;
pub const DEFAULT_ALIGNMENT: usize = 32;
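/// A parsed GGUF file: the header, metadata, and tensor table are read
/// eagerly, while tensor data stays on disk (or in an optional memory map)
/// until requested.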
pub struct GgufFile {
pub header: GgufHeader,
pub metadata: HashMap<String, GgufValue>,
pub tensors: Vec<TensorInfo>,
path: std::path::PathBuf,
mmap: Option<MmapData>,
data_offset: u64,
alignment: usize,
}
struct MmapData {
#[cfg(feature = "mmap")]
mmap: memmap2::Mmap,
#[cfg(not(feature = "mmap"))]
data: Vec<u8>,
}
impl GgufFile {
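    /// Opens and parses a GGUF file: validates the magic and version, reads
    /// the metadata and tensor table, and records the alignment-adjusted
    /// offset at which tensor data begins. No tensor data is read here.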
pub fn open(path: &Path) -> Result<Self> {
let file = File::open(path)
.map_err(|e| RuvLLMError::Model(format!("Failed to open GGUF file: {}", e)))?;
let mut reader = BufReader::new(file);
let header = parse_header(&mut reader)?;
if header.magic != GGUF_MAGIC {
return Err(RuvLLMError::Model(format!(
"Invalid GGUF magic: expected 0x{:08X}, got 0x{:08X}",
GGUF_MAGIC, header.magic
)));
}
if header.version != GGUF_VERSION && header.version != 2 {
return Err(RuvLLMError::Model(format!(
"Unsupported GGUF version: {} (supported: 2, 3)",
header.version
)));
}
let metadata = parse_metadata(&mut reader, header.metadata_kv_count)?;
let alignment = metadata
.get("general.alignment")
.and_then(|v| v.as_u64())
.map(|v| v as usize)
.unwrap_or(DEFAULT_ALIGNMENT);
let tensors = parse_tensor_infos(&mut reader, header.tensor_count)?;
let current_pos = reader
.stream_position()
.map_err(|e| RuvLLMError::Model(format!("Failed to get stream position: {}", e)))?;
let data_offset = align_offset(current_pos, alignment as u64);
Ok(Self {
header,
metadata,
tensors,
path: path.to_path_buf(),
mmap: None,
data_offset,
alignment,
})
}
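    /// Opens a GGUF file and memory-maps it, so tensor reads become cheap
    /// slices into the mapping instead of seek-and-read calls.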
#[cfg(feature = "mmap")]
pub fn open_mmap(path: &Path) -> Result<Self> {
let mut gguf = Self::open(path)?;
let file = File::open(path)
.map_err(|e| RuvLLMError::Model(format!("Failed to open file for mmap: {}", e)))?;
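        // SAFETY: the map is read-only and the file is only opened for
        // reading here; as with any file-backed map, truncating or mutating
        // the file externally while the map is alive is undefined behavior.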
let mmap = unsafe {
memmap2::Mmap::map(&file)
.map_err(|e| RuvLLMError::Model(format!("Failed to memory map file: {}", e)))?
};
gguf.mmap = Some(MmapData { mmap });
Ok(gguf)
}
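    /// Fallback when the `mmap` feature is disabled: buffers the entire file
    /// in memory so callers see the same API as the memory-mapped path.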
#[cfg(not(feature = "mmap"))]
pub fn open_mmap(path: &Path) -> Result<Self> {
let mut gguf = Self::open(path)?;
let data = std::fs::read(path)
.map_err(|e| RuvLLMError::Model(format!("Failed to read file: {}", e)))?;
gguf.mmap = Some(MmapData { data });
Ok(gguf)
}
pub fn get_tensor(&self, name: &str) -> Option<&TensorInfo> {
self.tensors.iter().find(|t| t.name == name)
}
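    /// Reads a tensor and dequantizes it to f32, whatever its on-disk type.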
pub fn load_tensor_f32(&self, name: &str) -> Result<Vec<f32>> {
let info = self
.get_tensor(name)
.ok_or_else(|| RuvLLMError::NotFound(format!("Tensor not found: {}", name)))?;
let raw_data = self.read_tensor_bytes(info)?;
let num_elements: usize = info.shape.iter().product();
let output = quantization::dequantize_tensor(&raw_data, info.dtype, num_elements)?;
Ok(output)
}
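    /// Reads a tensor's raw bytes without dequantizing, preserving the
    /// on-disk quantization type and shape.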
pub fn load_tensor_quantized(&self, name: &str) -> Result<QuantizedTensor> {
let info = self
.get_tensor(name)
.ok_or_else(|| RuvLLMError::NotFound(format!("Tensor not found: {}", name)))?;
let data = self.read_tensor_bytes(info)?;
let num_elements: usize = info.shape.iter().product();
Ok(QuantizedTensor {
data,
dtype: info.dtype,
shape: info.shape.clone(),
num_elements,
})
}
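    /// Returns a zero-copy slice of a tensor's bytes from the memory map.
    ///
    /// # Panics
    ///
    /// Panics if the file was opened with [`GgufFile::open`] rather than
    /// [`GgufFile::open_mmap`], or if the tensor's offset and size fall
    /// outside the mapped region.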
pub fn tensor_data(&self, info: &TensorInfo) -> &[u8] {
        let mmap = self
            .mmap
            .as_ref()
            .expect("tensor_data requires a file opened with open_mmap");
let start = (self.data_offset + info.offset) as usize;
let end = start + info.byte_size();
#[cfg(feature = "mmap")]
{
&mmap.mmap[start..end]
}
#[cfg(not(feature = "mmap"))]
{
&mmap.data[start..end]
}
}
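    /// Streams a tensor through `f` in chunks of roughly `chunk_size` f32
    /// elements. F32 and F16 tensors are read incrementally from disk;
    /// quantized tensors currently fall back to a full dequantization.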
pub fn stream_tensor<F>(&self, name: &str, chunk_size: usize, mut f: F) -> Result<()>
where
F: FnMut(&[f32]) -> Result<()>,
{
let info = self
.get_tensor(name)
.ok_or_else(|| RuvLLMError::NotFound(format!("Tensor not found: {}", name)))?;
match info.dtype {
GgufQuantType::F32 => {
self.stream_f32_tensor(info, chunk_size, &mut f)?;
}
GgufQuantType::F16 => {
self.stream_f16_tensor(info, chunk_size, &mut f)?;
}
_ => {
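                // Quantized formats cannot be split mid-block, so round the
                // chunk size up to a whole number of blocks, dequantize the
                // full tensor, and hand it out in block-aligned chunks.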
let block_size = info.dtype.block_size();
let aligned_chunk = ((chunk_size + block_size - 1) / block_size) * block_size;
let full_data = self.load_tensor_f32(name)?;
for chunk in full_data.chunks(aligned_chunk) {
f(chunk)?;
}
}
}
Ok(())
}
pub fn architecture(&self) -> Option<&str> {
self.metadata
.get("general.architecture")
.and_then(|v| v.as_str())
}
pub fn architecture_type(&self) -> Option<ModelArchitecture> {
self.architecture()
.and_then(|arch| match arch.to_lowercase().as_str() {
"llama" => Some(ModelArchitecture::Llama),
"mistral" => Some(ModelArchitecture::Mistral),
"phi" | "phi2" | "phi3" => Some(ModelArchitecture::Phi),
"qwen" | "qwen2" => Some(ModelArchitecture::Qwen),
"gemma" => Some(ModelArchitecture::Gemma),
_ => None,
})
}
pub fn context_length(&self) -> Option<usize> {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.context_length", arch))
.and_then(|v| v.as_u64())
.map(|v| v as usize)
}
pub fn embedding_length(&self) -> Option<usize> {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.embedding_length", arch))
.and_then(|v| v.as_u64())
.map(|v| v as usize)
}
pub fn head_count(&self) -> Option<usize> {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.attention.head_count", arch))
.and_then(|v| v.as_u64())
.map(|v| v as usize)
}
pub fn head_count_kv(&self) -> Option<usize> {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.attention.head_count_kv", arch))
.and_then(|v| v.as_u64())
.map(|v| v as usize)
            .or_else(|| self.head_count())
    }
pub fn layer_count(&self) -> Option<usize> {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.block_count", arch))
.and_then(|v| v.as_u64())
.map(|v| v as usize)
}
pub fn vocab_size(&self) -> Option<usize> {
self.metadata
.get("tokenizer.ggml.tokens")
.and_then(|v| v.as_array())
.map(|arr| arr.len())
.or_else(|| {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.vocab_size", arch))
.and_then(|v| v.as_u64())
.map(|v| v as usize)
})
}
pub fn rope_freq_base(&self) -> Option<f32> {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.rope.freq_base", arch))
.and_then(|v| v.as_f32())
}
pub fn rope_dimension_count(&self) -> Option<usize> {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.rope.dimension_count", arch))
.and_then(|v| v.as_u64())
.map(|v| v as usize)
}
pub fn feed_forward_length(&self) -> Option<usize> {
let arch = self.architecture()?;
self.metadata
.get(&format!("{}.feed_forward_length", arch))
.and_then(|v| v.as_u64())
.map(|v| v as usize)
}
pub fn model_name(&self) -> Option<&str> {
self.metadata.get("general.name").and_then(|v| v.as_str())
}
pub fn author(&self) -> Option<&str> {
self.metadata.get("general.author").and_then(|v| v.as_str())
}
    /// Returns `general.quantization_version`, which GGUF stores as a `u32`.
    pub fn quantization_version(&self) -> Option<u32> {
        self.metadata
            .get("general.quantization_version")
            .and_then(|v| v.as_u64())
            .map(|v| v as u32)
    }
pub fn tensor_names(&self) -> impl Iterator<Item = &str> {
self.tensors.iter().map(|t| t.name.as_str())
}
pub fn total_tensor_size(&self) -> usize {
self.tensors.iter().map(|t| t.byte_size()).sum()
}
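    /// Copies a tensor's bytes out of the memory map when one is active,
    /// otherwise seeks and reads them directly from the file.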
fn read_tensor_bytes(&self, info: &TensorInfo) -> Result<Vec<u8>> {
if let Some(ref mmap) = self.mmap {
let start = (self.data_offset + info.offset) as usize;
let end = start + info.byte_size();
#[cfg(feature = "mmap")]
let data = mmap.mmap[start..end].to_vec();
#[cfg(not(feature = "mmap"))]
let data = mmap.data[start..end].to_vec();
return Ok(data);
}
let mut file = File::open(&self.path)
.map_err(|e| RuvLLMError::Model(format!("Failed to open file: {}", e)))?;
file.seek(SeekFrom::Start(self.data_offset + info.offset))
.map_err(|e| RuvLLMError::Model(format!("Failed to seek: {}", e)))?;
let mut data = vec![0u8; info.byte_size()];
file.read_exact(&mut data)
.map_err(|e| RuvLLMError::Model(format!("Failed to read tensor: {}", e)))?;
Ok(data)
}
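    /// Streams a raw little-endian f32 tensor from disk in chunks of at most
    /// `chunk_size` elements, avoiding a full in-memory copy.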
fn stream_f32_tensor<F>(&self, info: &TensorInfo, chunk_size: usize, f: &mut F) -> Result<()>
where
F: FnMut(&[f32]) -> Result<()>,
{
let num_elements: usize = info.shape.iter().product();
let mut file = File::open(&self.path)
.map_err(|e| RuvLLMError::Model(format!("Failed to open file: {}", e)))?;
file.seek(SeekFrom::Start(self.data_offset + info.offset))
.map_err(|e| RuvLLMError::Model(format!("Failed to seek: {}", e)))?;
let mut processed = 0;
let mut buffer = vec![0u8; chunk_size * 4];
while processed < num_elements {
let remaining = num_elements - processed;
let this_chunk = remaining.min(chunk_size);
let byte_count = this_chunk * 4;
file.read_exact(&mut buffer[..byte_count])
.map_err(|e| RuvLLMError::Model(format!("Failed to read: {}", e)))?;
let floats: Vec<f32> = buffer[..byte_count]
.chunks_exact(4)
.map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
.collect();
f(&floats)?;
processed += this_chunk;
}
Ok(())
}
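    /// Streams a raw little-endian f16 tensor from disk, widening each chunk
    /// to f32 before invoking the callback.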
fn stream_f16_tensor<F>(&self, info: &TensorInfo, chunk_size: usize, f: &mut F) -> Result<()>
where
F: FnMut(&[f32]) -> Result<()>,
{
let num_elements: usize = info.shape.iter().product();
let mut file = File::open(&self.path)
.map_err(|e| RuvLLMError::Model(format!("Failed to open file: {}", e)))?;
file.seek(SeekFrom::Start(self.data_offset + info.offset))
.map_err(|e| RuvLLMError::Model(format!("Failed to seek: {}", e)))?;
let mut processed = 0;
let mut buffer = vec![0u8; chunk_size * 2];
while processed < num_elements {
let remaining = num_elements - processed;
let this_chunk = remaining.min(chunk_size);
let byte_count = this_chunk * 2;
file.read_exact(&mut buffer[..byte_count])
.map_err(|e| RuvLLMError::Model(format!("Failed to read: {}", e)))?;
let floats: Vec<f32> = buffer[..byte_count]
.chunks_exact(2)
.map(|b| {
let bits = u16::from_le_bytes([b[0], b[1]]);
half::f16::from_bits(bits).to_f32()
})
.collect();
f(&floats)?;
processed += this_chunk;
}
Ok(())
}
}
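/// High-level loader that opens a GGUF file (memory-mapped when available)
/// and exposes the model configuration derived from its metadata.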
pub struct GgufModelLoader {
file: GgufFile,
}
impl GgufModelLoader {
pub fn load(path: &Path) -> Result<Self> {
let file = GgufFile::open_mmap(path)?;
Ok(Self { file })
}
pub fn file(&self) -> &GgufFile {
&self.file
}
pub fn architecture(&self) -> Option<ModelArchitecture> {
self.file.architecture_type()
}
pub fn config(&self) -> ModelConfig {
ModelConfig {
architecture: self.file.architecture().map(|s| s.to_string()),
context_length: self.file.context_length(),
embedding_length: self.file.embedding_length(),
head_count: self.file.head_count(),
head_count_kv: self.file.head_count_kv(),
layer_count: self.file.layer_count(),
vocab_size: self.file.vocab_size(),
rope_freq_base: self.file.rope_freq_base(),
feed_forward_length: self.file.feed_forward_length(),
}
}
pub fn find_tensors(&self, pattern: &str) -> Vec<&str> {
self.file
.tensor_names()
.filter(|name| name.contains(pattern))
.collect()
}
pub fn is_quantized(&self) -> bool {
self.file.tensors.iter().any(|t| t.dtype.is_quantized())
}
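    /// Returns the most common dtype among tensors whose name contains
    /// "weight", as a heuristic for the file's overall quantization level.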
pub fn quantization_type(&self) -> Option<GgufQuantType> {
let mut counts: HashMap<GgufQuantType, usize> = HashMap::new();
for tensor in &self.file.tensors {
if tensor.name.contains("weight") {
*counts.entry(tensor.dtype).or_insert(0) += 1;
}
}
counts
.into_iter()
.max_by_key(|(_, count)| *count)
.map(|(dtype, _)| dtype)
}
#[cfg(feature = "candle")]
pub fn to_candle_model(&self, _device: &candle_core::Device) -> Result<()> {
Err(RuvLLMError::Model(
"Candle model conversion not yet implemented".to_string(),
))
}
}
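/// Model hyperparameters extracted from GGUF metadata. Each field is `None`
/// when the corresponding metadata key is absent from the file.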
#[derive(Debug, Clone, Default)]
pub struct ModelConfig {
pub architecture: Option<String>,
pub context_length: Option<usize>,
pub embedding_length: Option<usize>,
pub head_count: Option<usize>,
pub head_count_kv: Option<usize>,
pub layer_count: Option<usize>,
pub vocab_size: Option<usize>,
pub rope_freq_base: Option<f32>,
pub feed_forward_length: Option<usize>,
}
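/// Rounds `offset` up to the next multiple of `alignment`.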
#[inline]
fn align_offset(offset: u64, alignment: u64) -> u64 {
(offset + alignment - 1) / alignment * alignment
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_align_offset() {
assert_eq!(align_offset(0, 32), 0);
assert_eq!(align_offset(1, 32), 32);
assert_eq!(align_offset(31, 32), 32);
assert_eq!(align_offset(32, 32), 32);
assert_eq!(align_offset(33, 32), 64);
}
#[test]
fn test_gguf_magic() {
assert_eq!(GGUF_MAGIC, 0x46554747);
let bytes = GGUF_MAGIC.to_le_bytes();
assert_eq!(&bytes, b"GGUF");
}
#[test]
fn test_model_config_default() {
let config = ModelConfig::default();
assert!(config.architecture.is_none());
assert!(config.context_length.is_none());
}
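    // A minimal end-to-end sketch of the loading API. The model path below is
    // a placeholder, not a file that ships with the crate, so the test is
    // ignored by default; run it manually against a real GGUF file with
    // `cargo test -- --ignored`.
    #[test]
    #[ignore = "requires a local GGUF model file"]
    fn example_inspect_gguf_file() {
        let path = Path::new("models/example.gguf"); // hypothetical path
        let loader = GgufModelLoader::load(path).expect("failed to open GGUF file");
        let config = loader.config();
        assert!(config.architecture.is_some());
        assert!(config.layer_count.is_some());
        // Every tensor listed in the header should be addressable by name.
        for name in loader.file().tensor_names() {
            assert!(loader.file().get_tensor(name).is_some());
        }
    }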
}