mod backend;
mod forward;
mod generation;
mod speculative;
mod weights;
pub use backend::CudaBackend;
pub use generation::BatchedDecodeState;
use crate::error::{RealizarError, Result};
use super::model::OwnedQuantizedModel;
use super::quantized::{OwnedQKVWeights, OwnedQuantizedTensor};
use super::runtime::{OwnedQuantizedKVCache, QuantizedGenerateConfig};
use super::utils::verbose;
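/// Error returned when constructing [`OwnedQuantizedModelCuda`] fails.
///
/// It retains ownership of the `OwnedQuantizedModel` so the caller can
/// recover it via `into_model()` and fall back to CPU inference instead of
/// reloading the model from disk.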
pub struct CudaInitError {
pub error: RealizarError,
model: OwnedQuantizedModel,
}
impl CudaInitError {
#[must_use]
pub fn into_model(self) -> OwnedQuantizedModel {
self.model
}
}
impl std::fmt::Display for CudaInitError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.error)
}
}
impl std::fmt::Debug for CudaInitError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "CudaInitError({:?})", self.error)
}
}
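/// A quantized model paired with a CUDA executor for GPU inference.
///
/// Construction initializes the GPU KV cache, preloads kernels and (where
/// supported) GPU-resident weights, and by default runs a CPU/GPU parity
/// check before returning.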
pub struct OwnedQuantizedModelCuda {
pub(crate) model: OwnedQuantizedModel,
pub(crate) executor: crate::cuda::CudaExecutor,
device_name: String,
memory_info: (usize, usize),
embed_buf: Vec<f32>,
#[cfg(feature = "gpu")]
prefix_cache: crate::gguf::batch_scheduler::PrefixCache,
}
impl OwnedQuantizedModelCuda {
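    /// Convenience constructor using a default `max_seq_len` of 2048.
    ///
    /// On failure the model is dropped along with the error; call
    /// `with_max_seq_len` directly when the model must be recovered for a
    /// CPU fallback.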
pub fn new(model: OwnedQuantizedModel, device_ordinal: i32) -> Result<Self> {
Self::with_max_seq_len(model, device_ordinal, 2048).map_err(|e| e.error)
}
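    /// Verifies that every op required by the model's constraints has a GPU
    /// implementation. Returns the model unchanged on success; on a mismatch
    /// the `CudaInitError` names the missing ops and hands the model back.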
fn check_gpu_capability(
model: OwnedQuantizedModel,
) -> std::result::Result<OwnedQuantizedModel, CudaInitError> {
let required = crate::capability::required_ops(&model.config.constraints);
let supported = crate::capability::gpu_supported_ops();
if let Err(missing) = crate::capability::check_capability(&required, &supported) {
let missing_names: Vec<String> = missing.iter().map(ToString::to_string).collect();
return Err(CudaInitError {
error: RealizarError::CapabilityMismatch {
architecture: model.config.architecture.clone(),
missing_ops: missing_names.join(", "),
suggestion: "Model will use CPU inference. To add GPU support, implement the missing kernels in trueno.".to_string(),
},
model,
});
}
Ok(model)
}
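    /// Uploads weights to the GPU (when the GPU-resident path is supported),
    /// warms the FP16/FP8/interleaved weight caches, rebuilds the workspace,
    /// and runs the CPU/GPU parity gate. Warmup failures are logged and
    /// non-fatal; a parity failure returns the model to the caller. Tunable
    /// via the `HGEMM_PREFILL`, `FORCE_FP16_CACHE`, `REALIZR_NO_FP8_CACHE`,
    /// and `SKIP_PARITY_GATE` environment variables.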
fn preload_and_verify(mut self) -> std::result::Result<Self, CudaInitError> {
if !self.supports_gpu_resident() {
return Ok(self);
}
if let Err(e) = self.preload_weights_gpu() {
return Err(CudaInitError {
error: e,
model: self.into_model(),
});
}
        // Shared dimensions for the weight-cache warmups below.
        let num_layers = self.model.config.num_layers;
        let hidden_dim = self.model.config.hidden_dim as u32;
        let intermediate_dim = self.model.config.intermediate_dim as u32;
        let vocab_size = self.model.config.vocab_size as u32;
        // FP16 HGEMM cache: skipped when FP8 prefill is available, and on
        // compute capability >= 12.0 (as encoded in `gpu_profile.cc`) unless
        // FORCE_FP16_CACHE=1 overrides.
        let skip_fp16_unified = self.executor.gpu_profile.cc >= 120
            && std::env::var("FORCE_FP16_CACHE").as_deref() != Ok("1");
        if std::env::var("HGEMM_PREFILL").as_deref() != Ok("0")
            && !self.executor.gpu_profile.fp8_prefill
            && !skip_fp16_unified
        {
            if let Err(e) = self.executor.ensure_cublas() {
                eprintln!("[PMAT-037] cuBLAS init failed (non-fatal): {e}");
            } else if let Err(e) = self.executor.warmup_hgemm_cache(
                num_layers,
                hidden_dim,
                intermediate_dim,
                vocab_size,
            ) {
                eprintln!("[PMAT-037] FP16 cache warmup failed (non-fatal): {e}");
            }
        }
        // FP8 cache: only when the GPU profile supports FP8 prefill and
        // REALIZR_NO_FP8_CACHE=1 is not set.
        let no_fp8 = std::env::var("REALIZR_NO_FP8_CACHE").as_deref() == Ok("1");
        if self.executor.gpu_profile.fp8_prefill && !no_fp8 {
            if let Err(e) =
                self.executor
                    .warmup_fp8_cache(num_layers, hidden_dim, intermediate_dim, vocab_size)
            {
                eprintln!("[PMAT-053] FP8 cache warmup failed (non-fatal): {e}");
            }
        }
        // W4A16 interleaved cache for GPUs that support the interleaved layout.
        if self.executor.gpu_profile.w4a16_interleaved {
            if let Err(e) = self.executor.warmup_interleaved_cache(
                num_layers,
                hidden_dim,
                intermediate_dim,
                vocab_size,
            ) {
                eprintln!("[PMAT-091] Interleaved cache warmup failed (non-fatal): {e}");
            }
        }
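        // GH-181: force a workspace rebuild sized to this model's dimensions.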
{
let hidden_dim = self.model.config.hidden_dim;
let intermediate_dim = self.model.config.intermediate_dim;
self.executor.force_workspace_reinit();
if let Err(e) = self.executor.init_workspace(hidden_dim, intermediate_dim) {
eprintln!("[GH-181] Workspace reinit failed (non-fatal): {e}");
}
}
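        // CPU/GPU parity gate; opt out with SKIP_PARITY_GATE=1.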
let skip_gate = std::env::var("SKIP_PARITY_GATE")
.map(|v| v == "1")
.unwrap_or(false);
if !skip_gate {
if let Err(e) = parity_gate(&mut self) {
return Err(CudaInitError {
error: e,
model: self.into_model(),
});
}
}
Ok(self)
}
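    /// Creates a CUDA-backed wrapper around `model` on device
    /// `device_ordinal`, allocating a GPU KV cache sized for `max_seq_len`
    /// tokens.
    ///
    /// On failure the returned [`CudaInitError`] still owns the model, so the
    /// caller can recover it with `into_model()` and fall back to CPU
    /// inference. A minimal fallback sketch (`load_model` and `Runner` are
    /// hypothetical names used only for illustration):
    ///
    /// ```ignore
    /// let model: OwnedQuantizedModel = load_model()?; // hypothetical loader
    /// let runner = match OwnedQuantizedModelCuda::with_max_seq_len(model, 0, 4096) {
    ///     Ok(cuda) => Runner::Gpu(cuda),
    ///     Err(init_err) => {
    ///         eprintln!("GPU init failed, falling back to CPU: {init_err}");
    ///         // Ownership of the model is recovered; no reload from disk needed.
    ///         Runner::Cpu(init_err.into_model())
    ///     },
    /// };
    /// ```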
pub fn with_max_seq_len(
model: OwnedQuantizedModel,
device_ordinal: i32,
max_seq_len: usize,
) -> std::result::Result<Self, CudaInitError> {
use crate::cuda::CudaExecutor;
if let Err(e) = crate::contract_gate::validate_model_load_basic(
&model.config.architecture,
model.config.num_layers,
model.config.hidden_dim,
model.config.num_heads,
model.config.num_kv_heads,
model.config.intermediate_dim,
model.config.vocab_size,
) {
return Err(CudaInitError {
error: crate::contract_gate::gate_error(e),
model,
});
}
let model = Self::check_gpu_capability(model)?;
let mut executor = match CudaExecutor::new(device_ordinal) {
Ok(e) => e,
Err(e) => {
return Err(CudaInitError {
error: RealizarError::UnsupportedOperation {
operation: "CudaExecutor::new".to_string(),
reason: format!("CUDA initialization failed: {e}"),
},
model,
});
},
};
let device_name = executor
.device_name()
.unwrap_or_else(|_| "Unknown GPU".to_string());
let memory_info = executor.memory_info().unwrap_or((0, 0));
let num_layers = model.layers.len();
let num_heads = model.config.num_heads;
        let num_kv_heads = model.config.num_kv_heads;
        let head_dim = model.config.hidden_dim / num_heads;
if let Err(e) =
executor.init_kv_cache_gpu(num_layers, num_heads, num_kv_heads, head_dim, max_seq_len)
{
return Err(CudaInitError {
error: RealizarError::UnsupportedOperation {
operation: "init_kv_cache_gpu".to_string(),
reason: format!("GPU KV cache initialization failed: {e}"),
},
model,
});
}
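        // PMAT-399: auto-size the decode batch from available GPU memory
        // when CUDA_MAX_BATCH is not set by the user.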
if std::env::var("CUDA_MAX_BATCH").is_err() {
let auto_batch = executor.compute_max_batch_for_memory(
num_layers,
num_kv_heads,
head_dim,
max_seq_len,
);
eprintln!("[PMAT-399] Auto-sized CUDA_MAX_BATCH={auto_batch} (no env var set)");
std::env::set_var("CUDA_MAX_BATCH", auto_batch.to_string());
}
if let Err(e) = executor.init_flash_decoding(num_heads, head_dim, max_seq_len, 1) {
if verbose() {
eprintln!(
"[PAR-118] Flash Decoding init failed: {e}, falling back to sequential attention"
);
}
}
if verbose() {
eprintln!(
"[PAR-060] Setting rope_theta = {} for GPU path",
model.config.rope_theta
);
}
executor.set_rope_theta(model.config.rope_theta);
if verbose() {
eprintln!(
"[CORRECTNESS-011] Setting rope_type = {} for GPU path (0=NORM, 2=NEOX)",
model.config.rope_type
);
}
executor.set_rope_type(model.config.rope_type);
{
let hidden_dim = model.config.hidden_dim as u32;
let intermediate_dim = model.config.intermediate_dim as u32;
let vocab_size = model.config.vocab_size as u32;
match executor.preload_modules_for_capture(
num_layers,
hidden_dim,
intermediate_dim,
vocab_size,
) {
Ok(()) => eprintln!(
"[GH-129] Early kernel preload: {} modules compiled",
executor.module_count()
),
Err(e) => {
eprintln!("[GH-129] Early kernel preload failed: {e}");
},
}
}
let embed_buf = vec![0.0f32; model.config.hidden_dim];
let cuda_model = Self {
model,
executor,
device_name,
memory_info,
embed_buf,
#[cfg(feature = "gpu")]
prefix_cache: crate::gguf::batch_scheduler::PrefixCache::new(16),
};
cuda_model.preload_and_verify()
}
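    /// Frees the CPU-side copies of the projection weights and the LM head
    /// once the GPU-resident path is active, logging the reclaimed bytes.
    /// The freed tensors are then unavailable for CPU-side fallback.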
pub fn free_cpu_weights(&mut self) {
let mut freed = 0usize;
for layer in &mut self.model.layers {
freed += layer.qkv_weight.data_bytes();
freed += layer.attn_output_weight.data.len();
freed += layer.ffn_up_weight.data.len();
freed += layer.ffn_down_weight.data.len();
if let Some(ref gate) = layer.ffn_gate_weight {
freed += gate.data.len();
}
layer.free_projection_weights();
}
freed += self.model.lm_head_weight.data.len();
self.model.lm_head_weight.data = Vec::new();
eprintln!(
"[GH-129] Freed {:.1} MB CPU weight copies (GPU-resident path active)",
freed as f64 / (1024.0 * 1024.0)
);
}
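    /// Returns `true` if CUDA is available on this system.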
#[must_use]
pub fn is_available() -> bool {
crate::cuda::CudaExecutor::is_available()
}
#[must_use]
pub fn num_devices() -> usize {
crate::cuda::CudaExecutor::num_devices()
}
#[must_use]
pub fn device_name(&self) -> &str {
&self.device_name
}
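    /// Device memory info in bytes as captured at construction
    /// (conventionally `(free, total)`; `vram_mb()` treats the second
    /// element as the total).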
#[must_use]
pub fn memory_info(&self) -> (usize, usize) {
self.memory_info
}
#[must_use]
pub fn vram_mb(&self) -> u64 {
(self.memory_info.1 / (1024 * 1024)) as u64
}
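    /// Enables per-kernel profiling in the executor; results are available
    /// through `profiler()` and `profiler_summary()`.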
pub fn enable_profiling(&mut self) {
self.executor.enable_profiling();
}
pub fn disable_profiling(&mut self) {
self.executor.disable_profiling();
}
#[must_use]
pub fn is_profiling_enabled(&self) -> bool {
self.executor.is_profiling_enabled()
}
#[must_use]
pub fn profiler(&self) -> &trueno::BrickProfiler {
self.executor.profiler()
}
pub fn reset_profiler(&mut self) {
self.executor.reset_profiler();
}
#[must_use]
pub fn profiler_summary(&self) -> String {
self.executor.profiler_summary()
}
#[must_use]
pub fn model(&self) -> &OwnedQuantizedModel {
&self.model
}
#[must_use]
pub fn into_model(self) -> OwnedQuantizedModel {
self.model
}
#[must_use]
pub fn executor_mut(&mut self) -> &mut crate::cuda::CudaExecutor {
&mut self.executor
}
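    /// Blocks until all GPU work queued by this executor has completed.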
pub fn synchronize(&self) -> Result<()> {
self.executor
.synchronize()
.map_err(|e| RealizarError::UnsupportedOperation {
operation: "CudaExecutor::synchronize".to_string(),
reason: format!("CUDA sync failed: {e}"),
})
}
}
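/// Minimum cosine similarity required by the CPU/GPU parity gate
/// (implemented in `mod_parity_gate.rs`, textually included below).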
const PARITY_GATE_COSINE_MIN: f32 = 0.98;
include!("mod_parity_gate.rs");