//! Memory-mapped GGUF model loading, a dense `f32` transformer
//! representation, and an owned quantized model representation.

use std::fs::File;
use std::path::Path;

use memmap2::Mmap;

use super::config::GGUFConfig;
use super::quantized::{OwnedQuantizedLayer, OwnedQuantizedTensor};
use super::types::GGUFModel;
use crate::error::{RealizarError, Result};
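/// A GGUF model backed by a memory-mapped file.
///
/// The `Mmap` is kept alive alongside the decoded `GGUFModel`, so tensor
/// bytes can be borrowed straight from the mapping without copying.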
pub struct MappedGGUFModel {
pub model: GGUFModel,
pub(crate) mmap: Mmap,
}
impl MappedGGUFModel {
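    /// Open and memory-map a GGUF file, then parse its header and metadata.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened, cannot be mapped, or
    /// does not parse as valid GGUF.
    ///
    /// # Example
    ///
    /// ```ignore
    /// // Sketch only; the import path depends on where this module is mounted.
    /// let mapped = MappedGGUFModel::from_path("model.gguf")?;
    /// println!("mapped {} bytes", mapped.file_size());
    /// ```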
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
let file = File::open(path.as_ref()).map_err(|e| RealizarError::UnsupportedOperation {
operation: "open_model_file".to_string(),
reason: format!("Failed to open {}: {}", path.as_ref().display(), e),
})?;
        // SAFETY: `Mmap::map` requires that no other process truncates or
        // rewrites the file while it is mapped. `.populate()` pre-faults the
        // pages (MAP_POPULATE) so first access does not stall on page faults.
        let mmap = unsafe {
            memmap2::MmapOptions::new()
                .populate()
                .map(&file)
                .map_err(|e| RealizarError::UnsupportedOperation {
                    operation: "mmap_model_file".to_string(),
                    reason: format!("Failed to mmap {}: {}", path.as_ref().display(), e),
                })?
        };
        // Best-effort tuning on Linux: MADV_HUGEPAGE lets the kernel back the
        // mapping with transparent huge pages (fewer TLB misses), MADV_RANDOM
        // disables readahead (weight access is not sequential), and mlock pins
        // the weights in RAM. Return values are deliberately ignored; each
        // call is an optimization, not a requirement.
        #[cfg(target_os = "linux")]
        unsafe {
            libc::madvise(
                mmap.as_ptr().cast_mut().cast::<libc::c_void>(),
                mmap.len(),
                libc::MADV_HUGEPAGE,
            );
            libc::madvise(
                mmap.as_ptr().cast_mut().cast::<libc::c_void>(),
                mmap.len(),
                libc::MADV_RANDOM,
            );
            libc::mlock(mmap.as_ptr().cast::<libc::c_void>(), mmap.len());
        }
let model = GGUFModel::from_bytes(&mmap)?;
Ok(Self { model, mmap })
}
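    /// The raw bytes of the entire mapped file.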
#[must_use]
pub fn data(&self) -> &[u8] {
&self.mmap
}
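    /// Borrow `size` bytes starting at `offset` as a zero-copy slice.
    ///
    /// Returns `None` if `offset + size` overflows or runs past the end of
    /// the mapping.
    ///
    /// ```ignore
    /// // Sketch only: `offset` and `size` would come from a parsed tensor entry.
    /// let bytes = mapped.tensor_slice(offset, size).expect("in bounds");
    /// ```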
#[must_use]
pub fn tensor_slice(&self, offset: usize, size: usize) -> Option<&[u8]> {
let end = offset.checked_add(size)?;
if end <= self.mmap.len() {
Some(&self.mmap[offset..end])
} else {
None
}
}
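    /// Total size of the mapped file in bytes.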
#[must_use]
pub fn file_size(&self) -> usize {
self.mmap.len()
}
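    /// Advise the kernel that the mapping will be read sequentially
    /// (`MADV_SEQUENTIAL`), enabling aggressive readahead.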
#[cfg(unix)]
pub fn advise_sequential(&self) {
unsafe {
libc::madvise(
self.mmap.as_ptr().cast_mut().cast::<libc::c_void>(),
self.mmap.len(),
libc::MADV_SEQUENTIAL,
);
}
}
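    /// Advise the kernel that access will be random (`MADV_RANDOM`),
    /// disabling readahead.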
#[cfg(unix)]
pub fn advise_random(&self) {
unsafe {
libc::madvise(
self.mmap.as_ptr().cast_mut().cast::<libc::c_void>(),
self.mmap.len(),
libc::MADV_RANDOM,
);
}
}
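    /// Ask the kernel to begin prefetching the whole mapping
    /// (`MADV_WILLNEED`).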
#[cfg(unix)]
pub fn advise_willneed(&self) {
unsafe {
libc::madvise(
self.mmap.as_ptr().cast_mut().cast::<libc::c_void>(),
self.mmap.len(),
libc::MADV_WILLNEED,
);
}
}
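    /// Try to pin the mapping in physical memory with `mlock`.
    ///
    /// Returns `true` on success. This commonly fails when the process lacks
    /// `CAP_IPC_LOCK` or would exceed `RLIMIT_MEMLOCK`.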
#[cfg(unix)]
pub fn lock_memory(&self) -> bool {
unsafe { libc::mlock(self.mmap.as_ptr().cast::<libc::c_void>(), self.mmap.len()) == 0 }
}
}
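/// Transformer weights fully materialized as dense `f32` vectors.
///
/// This is the simple, quantization-free representation; contrast with
/// `OwnedQuantizedModel`, which keeps the large matrices quantized.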
pub struct GGUFTransformer {
pub config: GGUFConfig,
pub token_embedding: Vec<f32>,
pub position_embedding: Option<Vec<f32>>,
pub layers: Vec<GGUFTransformerLayer>,
pub output_norm_weight: Vec<f32>,
pub output_norm_bias: Option<Vec<f32>>,
pub lm_head_weight: Vec<f32>,
pub lm_head_bias: Option<Vec<f32>>,
}
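/// Dense `f32` weights for one transformer block.
///
/// Optional fields accommodate architectural variants: gated FFNs
/// (`ffn_gate_*`), separate Q/K norms, and per-tensor biases.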
pub struct GGUFTransformerLayer {
pub attn_norm_weight: Vec<f32>,
pub attn_norm_bias: Option<Vec<f32>>,
pub qkv_weight: Vec<f32>,
pub qkv_bias: Option<Vec<f32>>,
pub attn_output_weight: Vec<f32>,
pub attn_output_bias: Option<Vec<f32>>,
pub ffn_gate_weight: Option<Vec<f32>>,
pub ffn_gate_bias: Option<Vec<f32>>,
pub ffn_up_weight: Vec<f32>,
pub ffn_up_bias: Option<Vec<f32>>,
pub ffn_down_weight: Vec<f32>,
pub ffn_down_bias: Option<Vec<f32>>,
pub ffn_norm_weight: Option<Vec<f32>>,
pub ffn_norm_bias: Option<Vec<f32>>,
pub attn_q_norm_weight: Option<Vec<f32>>,
pub attn_k_norm_weight: Option<Vec<f32>>,
}
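/// A model that keeps its large weight matrices quantized
/// (`OwnedQuantizedTensor`), while small tensors such as norms, biases, and
/// embeddings are held as `f32`. With the `cuda` feature it also carries an
/// optional GPU executor plus bookkeeping for launched kernels and cached
/// weights.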
pub struct OwnedQuantizedModel {
pub(crate) config: GGUFConfig,
pub(crate) token_embedding: Vec<f32>,
pub(crate) position_embedding: Option<Vec<f32>>,
pub(crate) layers: Vec<OwnedQuantizedLayer>,
pub(crate) encoder_layers: Vec<OwnedQuantizedLayer>,
pub(crate) encoder_output_norm_weight: Option<Vec<f32>>,
pub(crate) encoder_output_norm_bias: Option<Vec<f32>>,
pub(crate) output_norm_weight: Vec<f32>,
pub(crate) output_norm_bias: Option<Vec<f32>>,
pub(crate) lm_head_weight: OwnedQuantizedTensor,
pub(crate) lm_head_bias: Option<Vec<f32>>,
#[cfg(feature = "cuda")]
pub(crate) cuda_executor: Option<std::sync::Mutex<crate::cuda::CudaExecutor>>,
#[cfg(feature = "cuda")]
pub(crate) cuda_kernel_count: std::sync::atomic::AtomicU64,
#[cfg(feature = "cuda")]
pub(crate) cached_weight_names: std::sync::Mutex<std::collections::HashSet<String>>,
}
impl std::fmt::Debug for OwnedQuantizedModel {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut s = f.debug_struct("OwnedQuantizedModel");
s.field("config", &self.config)
.field("token_embedding_len", &self.token_embedding.len())
.field("has_position_embedding", &self.position_embedding.is_some())
.field("layers_count", &self.layers.len())
.field("encoder_layers_count", &self.encoder_layers.len())
.field(
"has_encoder_output_norm_weight",
&self.encoder_output_norm_weight.is_some(),
)
.field(
"has_encoder_output_norm_bias",
&self.encoder_output_norm_bias.is_some(),
)
.field("output_norm_weight_len", &self.output_norm_weight.len())
.field("has_output_norm_bias", &self.output_norm_bias.is_some())
.field("lm_head_weight", &self.lm_head_weight)
.field("has_lm_head_bias", &self.lm_head_bias.is_some());
#[cfg(feature = "cuda")]
s.field("cuda_enabled", &self.cuda_executor.is_some())
.field(
"cuda_kernel_count",
&self
.cuda_kernel_count
.load(std::sync::atomic::Ordering::Relaxed),
)
.field(
"cached_weight_count",
&self
.cached_weight_names
.lock()
.map(|g| g.len())
.unwrap_or(0),
);
s.finish()
}
}
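// Cloning deliberately drops per-instance CUDA state: the clone starts with
// no executor, a zeroed kernel counter, and an empty weight-name cache.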
impl Clone for OwnedQuantizedModel {
fn clone(&self) -> Self {
Self {
config: self.config.clone(),
token_embedding: self.token_embedding.clone(),
position_embedding: self.position_embedding.clone(),
layers: self.layers.clone(),
encoder_layers: self.encoder_layers.clone(),
encoder_output_norm_weight: self.encoder_output_norm_weight.clone(),
encoder_output_norm_bias: self.encoder_output_norm_bias.clone(),
output_norm_weight: self.output_norm_weight.clone(),
output_norm_bias: self.output_norm_bias.clone(),
lm_head_weight: self.lm_head_weight.clone(),
lm_head_bias: self.lm_head_bias.clone(),
#[cfg(feature = "cuda")]
cuda_executor: None,
#[cfg(feature = "cuda")]
cuda_kernel_count: std::sync::atomic::AtomicU64::new(0),
#[cfg(feature = "cuda")]
cached_weight_names: std::sync::Mutex::new(std::collections::HashSet::new()),
}
}
}
impl OwnedQuantizedModel {
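    /// Mutable access to the model configuration.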
pub fn config_mut(&mut self) -> &mut GGUFConfig {
&mut self.config
}
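    /// The dense `f32` token-embedding table.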
#[must_use]
pub fn token_embedding(&self) -> &[f32] {
&self.token_embedding
}
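    /// The learned position-embedding table, if the architecture uses one.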
#[must_use]
pub fn position_embedding(&self) -> Option<&[f32]> {
self.position_embedding.as_deref()
}
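    /// The quantized transformer layers.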
#[must_use]
pub fn layers(&self) -> &[OwnedQuantizedLayer] {
&self.layers
}
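    /// Final output-norm weights.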
#[must_use]
pub fn output_norm_weight(&self) -> &[f32] {
&self.output_norm_weight
}
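    /// Final output-norm bias, if present.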
#[must_use]
pub fn output_norm_bias(&self) -> Option<&[f32]> {
self.output_norm_bias.as_deref()
}
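    /// The quantized LM-head (output projection) weight.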
#[must_use]
pub fn lm_head_weight(&self) -> &OwnedQuantizedTensor {
&self.lm_head_weight
}
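    /// LM-head bias, if present.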
#[must_use]
pub fn lm_head_bias(&self) -> Option<&[f32]> {
self.lm_head_bias.as_deref()
}
}
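// Additional `OwnedQuantizedModel` functionality lives in a sibling file that
// is included textually, so its items share this module's scope (including
// the `pub(crate)` fields above).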
include!("model_owned_quantized.rs");