#![allow(clippy::similar_names)]

use crate::apr_transformer::{AprTransformerConfig, QuantizedAprTransformerQ4};
#[cfg(feature = "cuda")]
use crate::cuda::CudaExecutor;
use crate::error::{RealizarError, Result};
#[cfg(feature = "cuda")]
use trueno_gpu::driver::GpuBuffer;
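
/// Q4_0 quantization type tag (matches GGML's `GGML_TYPE_Q4_0`).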
#[cfg(feature = "cuda")]
const Q4_0_TYPE: u32 = 2;
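
/// Adapter that moves a [`QuantizedAprTransformerQ4`] onto the GPU: quantized
/// weight matrices are uploaded as Q4_0 blobs and RMSNorm gammas are cached as
/// FP32 vectors, while host-side metadata lives in [`GpuModelQ4`].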
pub struct AprQ4ToGpuAdapter;

impl AprQ4ToGpuAdapter {
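    /// Uploads every quantized weight matrix and RMSNorm gamma of `apr` to the
    /// GPU, returning the total number of bytes transferred.
    ///
    /// Weights are registered as `layer_{i}.attn.{qkv,out}`,
    /// `layer_{i}.ffn.{up,down,gate}`, and `lm_head`; norm gammas use an
    /// `apr.`-prefixed namespace, keeping them distinct from the weight names.
    ///
    /// # Errors
    ///
    /// Returns [`RealizarError::GpuError`] if any upload or cache call fails.
    ///
    /// A minimal usage sketch (executor construction elided; not compiled as a
    /// doctest):
    ///
    /// ```ignore
    /// let bytes = AprQ4ToGpuAdapter::upload_weights(&apr, &mut executor)?;
    /// let model = AprQ4ToGpuAdapter::create_model(&apr);
    /// println!("uploaded {bytes} bytes for {} layers", model.num_layers);
    /// ```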
    #[cfg(feature = "cuda")]
    pub fn upload_weights(
        apr: &QuantizedAprTransformerQ4,
        executor: &mut CudaExecutor,
    ) -> Result<usize> {
        let mut total_bytes = 0;

        // Pass 1: upload the Q4_0 weight matrices for every layer.
        for (layer_idx, layer) in apr.layers.iter().enumerate() {
            let qkv_name = format!("layer_{layer_idx}.attn.qkv");
            let qkv_bytes = executor
                .load_quantized_weights_with_type(&qkv_name, &layer.qkv_weight.data, Q4_0_TYPE)
                .map_err(|e| RealizarError::GpuError {
                    reason: format!("Failed to upload {qkv_name}: {e}"),
                })?;
            total_bytes += qkv_bytes;

            let out_name = format!("layer_{layer_idx}.attn.out");
            let out_bytes = executor
                .load_quantized_weights_with_type(
                    &out_name,
                    &layer.attn_output_weight.data,
                    Q4_0_TYPE,
                )
                .map_err(|e| RealizarError::GpuError {
                    reason: format!("Failed to upload {out_name}: {e}"),
                })?;
            total_bytes += out_bytes;

            let up_name = format!("layer_{layer_idx}.ffn.up");
            let up_bytes = executor
                .load_quantized_weights_with_type(&up_name, &layer.ffn_up_weight.data, Q4_0_TYPE)
                .map_err(|e| RealizarError::GpuError {
                    reason: format!("Failed to upload {up_name}: {e}"),
                })?;
            total_bytes += up_bytes;

            let down_name = format!("layer_{layer_idx}.ffn.down");
            let down_bytes = executor
                .load_quantized_weights_with_type(
                    &down_name,
                    &layer.ffn_down_weight.data,
                    Q4_0_TYPE,
                )
                .map_err(|e| RealizarError::GpuError {
                    reason: format!("Failed to upload {down_name}: {e}"),
                })?;
            total_bytes += down_bytes;

            // The gate projection is only present for gated-FFN architectures.
            if let Some(ref gate_weight) = layer.ffn_gate_weight {
                let gate_name = format!("layer_{layer_idx}.ffn.gate");
                let gate_bytes = executor
                    .load_quantized_weights_with_type(&gate_name, &gate_weight.data, Q4_0_TYPE)
                    .map_err(|e| RealizarError::GpuError {
                        reason: format!("Failed to upload {gate_name}: {e}"),
                    })?;
                total_bytes += gate_bytes;
            }
        }

        let lm_head_bytes = executor
            .load_quantized_weights_with_type("lm_head", &apr.lm_head_weight.data, Q4_0_TYPE)
            .map_err(|e| RealizarError::GpuError {
                reason: format!("Failed to upload lm_head: {e}"),
            })?;
        total_bytes += lm_head_bytes;

        // Pass 2: cache RMSNorm gammas as FP32 vectors under the `apr.` prefix.
        for (layer_idx, layer) in apr.layers.iter().enumerate() {
            let attn_norm_name = format!("apr.layer_{layer_idx}.attn_norm");
            let attn_norm_bytes = executor
                .cache_rmsnorm_gamma(&attn_norm_name, &layer.attn_norm_weight)
                .map_err(|e| RealizarError::GpuError {
                    reason: format!("Failed to cache {attn_norm_name}: {e}"),
                })?;
            total_bytes += attn_norm_bytes;

            // A missing FFN norm falls back to an identity (all-ones) gamma.
            let ffn_norm_name = format!("apr.layer_{layer_idx}.ffn_norm");
            let ffn_norm = layer
                .ffn_norm_weight
                .as_ref()
                .map_or_else(|| vec![1.0f32; apr.config.hidden_dim], Clone::clone);
            let ffn_norm_bytes = executor
                .cache_rmsnorm_gamma(&ffn_norm_name, &ffn_norm)
                .map_err(|e| RealizarError::GpuError {
                    reason: format!("Failed to cache {ffn_norm_name}: {e}"),
                })?;
            total_bytes += ffn_norm_bytes;
        }

        let output_norm_bytes = executor
            .cache_rmsnorm_gamma("apr.output_norm", &apr.output_norm_weight)
            .map_err(|e| RealizarError::GpuError {
                reason: format!("Failed to cache apr.output_norm: {e}"),
            })?;
        total_bytes += output_norm_bytes;

        Ok(total_bytes)
    }
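
    /// Builds the host-side [`GpuModelQ4`] companion: embedding table, norm
    /// gammas, and layer metadata, with any missing per-layer FFN norm
    /// replaced by an all-ones identity gamma.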
    #[must_use]
    pub fn create_model(apr: &QuantizedAprTransformerQ4) -> GpuModelQ4 {
        let layer_norms: Vec<LayerNorms> = apr
            .layers
            .iter()
            .map(|layer| LayerNorms {
                attn_norm: layer.attn_norm_weight.clone(),
                ffn_norm: layer
                    .ffn_norm_weight
                    .clone()
                    .unwrap_or_else(|| vec![1.0; apr.config.hidden_dim]),
            })
            .collect();

        GpuModelQ4 {
            config: apr.config.clone(),
            token_embedding: apr.token_embedding.clone(),
            output_norm_weight: apr.output_norm_weight.clone(),
            layer_norms,
            num_layers: apr.layers.len(),
            has_gate: apr
                .layers
                .first()
                .is_some_and(|l| l.ffn_gate_weight.is_some()),
        }
    }
}
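
/// Per-layer RMSNorm gamma vectors kept on the host.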
#[derive(Debug, Clone)]
pub struct LayerNorms {
    /// Gamma for the attention-side RMSNorm.
    pub attn_norm: Vec<f32>,
    /// Gamma for the FFN-side RMSNorm (all-ones identity when the source layer has none).
    pub ffn_norm: Vec<f32>,
}
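
/// Host-side metadata for a GPU-resident Q4 model; the quantized weight
/// matrices themselves stay on the device.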
#[derive(Debug, Clone)]
pub struct GpuModelQ4 {
    pub config: AprTransformerConfig,
    /// FP32 token embedding table (kept on the host).
    pub token_embedding: Vec<f32>,
    /// Gamma for the final RMSNorm before the LM head.
    pub output_norm_weight: Vec<f32>,
    pub layer_norms: Vec<LayerNorms>,
    pub num_layers: usize,
    /// Whether the FFN uses a gate projection (derived from the first layer).
    pub has_gate: bool,
}
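
/// Copies a device buffer back into a freshly allocated host `Vec<f32>`.
///
/// # Errors
///
/// Returns [`RealizarError::GpuError`] if the device-to-host copy fails.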
#[cfg(feature = "cuda")]
fn gpu_to_host(buf: &GpuBuffer<f32>) -> Result<Vec<f32>> {
    let mut host = vec![0.0f32; buf.len()];
    buf.copy_to_host(&mut host)
        .map_err(|e| RealizarError::GpuError {
            reason: format!("GPU->CPU copy failed: {e}"),
        })?;
    Ok(host)
}
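
// Additional GPU paths (RoPE application and activation functions, per the
// file names) live in sibling files and are compiled into this module.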
include!("apr_q4_apply_rope_gpu.rs");
include!("activation.rs");