impl OwnedQuantizedModelCuda {
    /// Uploads all model weights to the GPU and prepares the executor for
    /// GPU-resident inference.
    ///
    /// Upload order: projection weights, norm weights, QKV biases, LM head
    /// bias, then optional Q/K norm gammas; finally the indexed-weight table
    /// and workspace are built if absent.
    ///
    /// Returns the total number of bytes uploaded.
    ///
    /// # Errors
    /// Fails if the CUDA context cannot be made current or any upload /
    /// initialization step fails.
    pub fn preload_weights_gpu(&mut self) -> Result<usize> {
        self.make_cuda_current()?;
        let num_layers = self.model.layers.len();
        let mut total_bytes = self.preload_layer_projection_weights()?;
        total_bytes += self.preload_all_norm_weights(num_layers)?;
        total_bytes += self.preload_qkv_bias_weights(num_layers)?;
        total_bytes += self.preload_lm_head_bias_wrapped()?;
        total_bytes += self.preload_qk_norm_weights()?;
        self.ensure_indexed_weights_and_workspace(num_layers)?;
        Ok(total_bytes)
    }

    /// Makes the executor's CUDA context current on the calling thread.
    fn make_cuda_current(&mut self) -> Result<()> {
        self.executor
            .make_current()
            .map_err(|e| Self::cuda_err("cuda_make_current", "Failed to set CUDA context current", e))
    }

    /// Uploads per-layer attention/FFN RMSNorm weights plus the final output
    /// norm. Returns the number of bytes uploaded.
    fn preload_all_norm_weights(&mut self, num_layers: usize) -> Result<usize> {
        let (attn_norms, ffn_norms) = self.collect_layer_norm_slices();
        let mut total_bytes = self
            .executor
            .preload_rmsnorm_weights(num_layers, &attn_norms, &ffn_norms)
            .map_err(|e| Self::cuda_err("preload_weights_gpu", "Failed to upload RMSNorm weights", e))?;
        total_bytes += self
            .executor
            .preload_output_norm(&self.model.output_norm_weight)
            .map_err(|e| {
                Self::cuda_err("preload_weights_gpu", "Failed to upload output norm weights", e)
            })?;
        Ok(total_bytes)
    }

    /// Gathers per-layer attention and FFN norm slices for bulk upload.
    ///
    /// Layers without a dedicated FFN norm fall back to their attention norm
    /// slice, so both vectors always have one entry per layer.
    fn collect_layer_norm_slices(&self) -> (Vec<&[f32]>, Vec<&[f32]>) {
        let attn_norms = self
            .model
            .layers
            .iter()
            .map(|l| l.attn_norm_weight.as_slice())
            .collect();
        let ffn_norms = self
            .model
            .layers
            .iter()
            .map(|l| {
                l.ffn_norm_weight
                    .as_ref()
                    .map_or(l.attn_norm_weight.as_slice(), |w| w.as_slice())
            })
            .collect();
        (attn_norms, ffn_norms)
    }

    /// Uploads the (optional) LM head bias; returns bytes uploaded.
    fn preload_lm_head_bias_wrapped(&mut self) -> Result<usize> {
        self.executor
            .preload_lm_head_bias(self.model.lm_head_bias.as_deref())
            .map_err(|e| Self::cuda_err("preload_weights_gpu", "Failed to upload LM head bias", e))
    }

    /// Builds the indexed-weight table and GPU workspace if either is
    /// missing; both steps are idempotent across repeated preload calls.
    fn ensure_indexed_weights_and_workspace(&mut self, num_layers: usize) -> Result<()> {
        if !self.executor.has_indexed_weights() {
            let arch = &self.model.config.constraints;
            self.executor
                .build_indexed_weights(num_layers, |i| format!("blk.{}", i), arch)
                .map_err(|e| {
                    Self::cuda_err(
                        "preload_weights_gpu",
                        "PAR-043: Failed to build indexed weights",
                        e,
                    )
                })?;
        }
        if !self.executor.has_workspace() {
            self.executor
                .init_workspace(
                    self.model.config.hidden_dim,
                    self.model.config.intermediate_dim,
                )
                .map_err(|e| {
                    Self::cuda_err(
                        "preload_weights_gpu",
                        "PAR-044: Failed to initialize workspace",
                        e,
                    )
                })?;
        }
        Ok(())
    }

    /// Wraps a CUDA-layer error into `RealizarError::UnsupportedOperation`,
    /// formatting the reason as "{reason}: {e}".
    fn cuda_err<E: std::fmt::Display>(op: &str, reason: &str, e: E) -> RealizarError {
        RealizarError::UnsupportedOperation {
            operation: op.to_string(),
            reason: format!("{reason}: {e}"),
        }
    }

    /// Uploads optional per-layer Q/K norm gamma weights; layers without
    /// them are skipped. Returns bytes uploaded.
    fn preload_qk_norm_weights(&mut self) -> Result<usize> {
        let mut total_bytes = 0usize;
        for (layer_idx, layer) in self.model.layers.iter().enumerate() {
            if let Some(ref q_norm) = layer.attn_q_norm_weight {
                let name = format!("blk.{}.attn_q_norm.gamma", layer_idx);
                total_bytes += self
                    .executor
                    .cache_rmsnorm_gamma(&name, q_norm)
                    .map_err(|e| {
                        // Same error shape as the rest of the impl, via the
                        // shared cuda_err helper.
                        Self::cuda_err(
                            "preload_qk_norm_weights",
                            &format!("Failed to upload Q norm weights for layer {layer_idx}"),
                            e,
                        )
                    })?;
            }
            if let Some(ref k_norm) = layer.attn_k_norm_weight {
                let name = format!("blk.{}.attn_k_norm.gamma", layer_idx);
                total_bytes += self
                    .executor
                    .cache_rmsnorm_gamma(&name, k_norm)
                    .map_err(|e| {
                        Self::cuda_err(
                            "preload_qk_norm_weights",
                            &format!("Failed to upload K norm weights for layer {layer_idx}"),
                            e,
                        )
                    })?;
            }
        }
        Ok(total_bytes)
    }

    /// Uploads all per-layer attention/FFN projection weights plus the LM
    /// head ("output.weight"). Returns bytes uploaded; tensors already
    /// resident on the device contribute zero.
    fn preload_layer_projection_weights(&mut self) -> Result<usize> {
        let mut total_bytes = 0usize;
        for (layer_idx, layer) in self.model.layers.iter().enumerate() {
            let prefix = format!("blk.{}", layer_idx);
            total_bytes += upload_layer_qkv(&mut self.executor, &prefix, layer_idx, layer)?;
            total_bytes += upload_layer_ffn(&mut self.executor, &prefix, layer)?;
        }
        total_bytes += upload_if_absent(
            &mut self.executor, "output.weight",
            &self.model.lm_head_weight.data, self.model.lm_head_weight.qtype,
        )?;
        Ok(total_bytes)
    }

    /// Uploads per-layer Q/K/V bias vectors (when present) to the GPU.
    ///
    /// Each layer stores a fused bias laid out as `[q | k | v]`; it is split
    /// using the projection dims reported by the layer's QKV weight. Returns
    /// bytes uploaded.
    fn preload_qkv_bias_weights(&mut self, num_layers: usize) -> Result<usize> {
        let num_heads = self.model.config.num_heads;
        let num_kv_heads = self.model.config.num_kv_heads;
        let hidden_dim = self.model.config.hidden_dim;
        let head_dim = self.model.config.head_dim();
        // Single pass: compute each projection dim once per layer instead of
        // re-deriving q_dim/k_dim in three separate closures.
        let mut q_biases: Vec<Option<&[f32]>> = Vec::with_capacity(num_layers);
        let mut k_biases: Vec<Option<&[f32]>> = Vec::with_capacity(num_layers);
        let mut v_biases: Vec<Option<&[f32]>> = Vec::with_capacity(num_layers);
        for l in &self.model.layers {
            match l.qkv_bias.as_ref() {
                Some(b) => {
                    let q_dim = l
                        .qkv_weight
                        .q_dim_for_config(num_heads, num_kv_heads, hidden_dim, head_dim);
                    let k_dim = l
                        .qkv_weight
                        .k_dim_for_config(num_heads, num_kv_heads, hidden_dim, head_dim);
                    let v_dim = l
                        .qkv_weight
                        .v_dim_for_config(num_heads, num_kv_heads, hidden_dim, head_dim);
                    q_biases.push(Some(&b[..q_dim]));
                    k_biases.push(Some(&b[q_dim..q_dim + k_dim]));
                    v_biases.push(Some(&b[q_dim + k_dim..q_dim + k_dim + v_dim]));
                },
                None => {
                    q_biases.push(None);
                    k_biases.push(None);
                    v_biases.push(None);
                },
            }
        }
        self.executor
            .preload_qkv_bias(num_layers, &q_biases, &k_biases, &v_biases)
            .map_err(|e| Self::cuda_err("preload_qkv_bias_weights", "Failed to upload QKV bias", e))
    }

    /// Drops any captured CUDA decode graph on the executor.
    pub fn clear_decode_graph(&mut self) {
        self.executor.clear_decode_graph();
    }

    /// Returns `true` when the model shape matches the GPU-resident fast
    /// path: separate Q/K/V projections (checked on the first layer), a
    /// gated FFN, and RMSNorm.
    #[must_use]
    pub fn supports_gpu_resident(&self) -> bool {
        let constraints = &self.model.config.constraints;
        let has_gated_ffn = constraints.has_gate_ffn();
        let has_rmsnorm = constraints.uses_rmsnorm();
        let has_separate_qkv = self.model.layers.first()
            .is_some_and(|l| matches!(l.qkv_weight, OwnedQKVWeights::Separate { .. }));
        has_separate_qkv && has_gated_ffn && has_rmsnorm
    }
}
/// Uploads a quantized weight tensor under `name` unless the executor
/// already holds it; returns the number of bytes uploaded (0 when skipped).
fn upload_if_absent(
    executor: &mut crate::cuda::CudaExecutor,
    name: &str,
    data: &[u8],
    qtype: u32,
) -> Result<usize> {
    match executor.has_quantized_weights(name) {
        // Already resident on the device — nothing to transfer.
        true => Ok(0),
        false => executor
            .load_quantized_weights_with_type(name, data, qtype)
            .map_err(|err| RealizarError::UnsupportedOperation {
                operation: "preload_layer_projection_weights".to_string(),
                reason: format!("Failed to upload '{}': {}", name, err),
            }),
    }
}
/// Uploads one layer's Q, K, V, and attention-output projection weights.
///
/// Only models with separate Q/K/V projections are supported on the
/// GPU-resident path; a fused QKV tensor is rejected with an error.
/// Returns the number of bytes uploaded.
fn upload_layer_qkv(
    executor: &mut crate::cuda::CudaExecutor,
    prefix: &str,
    layer_idx: usize,
    layer: &crate::gguf::quantized::OwnedQuantizedLayer,
) -> Result<usize> {
    // Guard: fused QKV (phi-2 style) cannot be served by this path.
    let OwnedQKVWeights::Separate { q, k, v } = &layer.qkv_weight else {
        return Err(RealizarError::UnsupportedOperation {
            operation: "preload_layer_projection_weights".to_string(),
            reason: format!(
                "Layer {} uses fused QKV (phi-2 style), GPU-resident path requires separate Q/K/V",
                layer_idx
            ),
        });
    };
    let mut bytes =
        upload_if_absent(executor, &format!("{prefix}.attn_q.weight"), &q.data, q.qtype)?;
    bytes += upload_if_absent(executor, &format!("{prefix}.attn_k.weight"), &k.data, k.qtype)?;
    bytes += upload_if_absent(executor, &format!("{prefix}.attn_v.weight"), &v.data, v.qtype)?;
    bytes += upload_if_absent(
        executor,
        &format!("{prefix}.attn_output.weight"),
        &layer.attn_output_weight.data,
        layer.attn_output_weight.qtype,
    )?;
    Ok(bytes)
}
/// Uploads one layer's FFN projections (optional gate, then up and down).
/// Returns the number of bytes uploaded.
fn upload_layer_ffn(
    executor: &mut crate::cuda::CudaExecutor,
    prefix: &str,
    layer: &crate::gguf::quantized::OwnedQuantizedLayer,
) -> Result<usize> {
    // Gate is optional (non-gated FFN architectures omit it); up/down are
    // always present.
    let gate = layer.ffn_gate_weight.as_ref().map(|g| ("ffn_gate", g));
    let projections = gate.into_iter().chain([
        ("ffn_up", &layer.ffn_up_weight),
        ("ffn_down", &layer.ffn_down_weight),
    ]);
    let mut bytes = 0usize;
    for (suffix, weight) in projections {
        bytes += upload_if_absent(
            executor,
            &format!("{prefix}.{suffix}.weight"),
            &weight.data,
            weight.qtype,
        )?;
    }
    Ok(bytes)
}
include!("cache.rs");