#[cfg(feature = "gpu")]
fn try_wgpu_generate(
model: &crate::gguf::OwnedQuantizedModel,
input_tokens: &[u32],
gen_config: &crate::gguf::QuantizedGenerateConfig,
verbose: bool,
) -> Result<(Vec<u32>, bool)> {
use crate::gpu::adapters::wgpu_adapter;
if !trueno::backends::gpu::GpuDevice::is_available() {
return Err(RealizarError::InferenceError("wgpu not available".into()));
}
let gpu = trueno::backends::gpu::GpuDevice::new()
.map_err(|e| RealizarError::InferenceError(format!("wgpu init: {e}")))?;
if verbose {
eprintln!("Backend: wgpu (Vulkan)");
}
let config = model.config();
let hidden_dim = config.hidden_dim;
let num_layers = config.num_layers;
let num_heads = config.num_heads;
let num_kv_heads = config.num_kv_heads;
let head_dim = hidden_dim / num_heads;
let intermediate_dim = config.intermediate_dim;
let vocab_size = config.vocab_size;
let eps = config.eps;
let kv_dim = num_kv_heads * head_dim;
let mut fwd = trueno::backends::gpu::WgslForwardPass::new(
gpu.device, gpu.queue,
hidden_dim, num_heads, num_kv_heads, head_dim, intermediate_dim,
);
let raw_q4k = wgpu_adapter::raw_q4k_weights(model);
let q4k_names: std::collections::HashSet<String> =
raw_q4k.iter().map(|(n, _, _, _)| n.clone()).collect();
for (name, data, _rows, _cols) in &raw_q4k {
fwd.upload_q4k_weight(name, data);
}
let weights = wgpu_adapter::dequant_model_weights(model)?;
for (name, data, _rows, _cols) in &weights {
if !q4k_names.contains(name) {
fwd.upload_weight(name, data);
}
}
let output_norm = model.output_norm_weight();
    let lm_head_f32: Vec<f32> = weights
        .iter()
        .find(|(n, _, _, _)| n == "lm_head")
        .map(|(_, d, _, _)| d.clone())
        .ok_or_else(|| RealizarError::InferenceError("lm_head weight not found".into()))?;
let max_seq = gen_config.max_tokens + input_tokens.len() + 16;
let mut kv_caches: Vec<(Vec<f32>, Vec<f32>)> = (0..num_layers)
.map(|_| (vec![0.0f32; max_seq * kv_dim], vec![0.0f32; max_seq * kv_dim]))
.collect();
let mut output_tokens = input_tokens.to_vec();
let stop_tokens = &gen_config.stop_tokens;
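    // Prefill sketch (assumption: forward_layer writes K/V at `position` and
    // attends over 0..=position). Without this, KV entries for every prompt
    // position except the last stay zeroed. Run every prompt token except the
    // last through the layers; the decode loop below handles the final prompt
    // token as its first step.
    for (pos, &tok) in input_tokens
        .iter()
        .take(input_tokens.len().saturating_sub(1))
        .enumerate()
    {
        let mut h = model.embed(&[tok]);
        for layer_idx in 0..num_layers {
            let prefix = format!("layer.{layer_idx}");
            let (ref mut kv_k, ref mut kv_v) = kv_caches[layer_idx];
            fwd.forward_layer(&mut h, &prefix, pos, kv_k, kv_v).map_err(|e| {
                RealizarError::InferenceError(format!("wgpu prefill layer {layer_idx}: {e}"))
            })?;
        }
    }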
    for _ in 0..gen_config.max_tokens {
        let token_id = *output_tokens.last().unwrap();
        let position = output_tokens.len() - 1;
let mut hidden = model.embed(&[token_id]);
for layer_idx in 0..num_layers {
let prefix = format!("layer.{layer_idx}");
let (ref mut kv_k, ref mut kv_v) = kv_caches[layer_idx];
fwd.forward_layer(
&mut hidden, &prefix, position, kv_k, kv_v,
).map_err(|e| RealizarError::InferenceError(format!("wgpu layer {layer_idx}: {e}")))?;
}
let sq_sum: f32 = hidden.iter().map(|x| x * x).sum();
let rms = (sq_sum / hidden.len() as f32 + eps).sqrt();
let normed: Vec<f32> = hidden.iter().zip(output_norm.iter())
.map(|(x, g)| (x / rms) * g)
.collect();
let mut best_idx = 0u32;
let mut best_val = f32::NEG_INFINITY;
for i in 0..vocab_size {
let row = &lm_head_f32[i * hidden_dim..(i + 1) * hidden_dim];
let logit: f32 = row.iter().zip(normed.iter()).map(|(w, x)| w * x).sum();
if logit > best_val {
best_val = logit;
best_idx = i as u32;
}
}
output_tokens.push(best_idx);
if stop_tokens.contains(&best_idx) {
break;
}
}
    Ok((output_tokens, true))
}
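/// Attempt CUDA-resident generation for a GGUF model (device 0, max sequence
/// length 2048). Returns `Ok` with the generation result when the GPU path
/// runs; if CUDA init or first-token validation fails, returns `Err` carrying
/// the model back so the caller can fall back without reloading it.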
#[cfg(feature = "cuda")]
fn try_gguf_gpu_generate(
model: crate::gguf::OwnedQuantizedModel,
input_tokens: &[u32],
gen_config: &crate::gguf::QuantizedGenerateConfig,
verbose: bool,
) -> std::result::Result<Result<(Vec<u32>, bool)>, crate::gguf::OwnedQuantizedModel> {
use crate::gguf::OwnedQuantizedModelCuda;
let mut cuda_model = match OwnedQuantizedModelCuda::with_max_seq_len(model, 0, 2048) {
Ok(m) => m,
Err(e) => {
if verbose {
eprintln!("Backend: CPU (GPU unavailable: {})", e);
}
return Err(e.into_model());
},
};
if verbose {
eprintln!(
"Backend: GPU ({}, {} MB VRAM)",
cuda_model.device_name(),
cuda_model.vram_mb()
);
}
if !validate_gpu_first_token(&mut cuda_model, gen_config) {
return Err(cuda_model.into_model());
}
let result = cuda_model
.generate_gpu_resident(input_tokens, gen_config)
.map(|tokens| (tokens, true))
.map_err(|e| RealizarError::InferenceError(format!("GPU generation failed: {}", e)));
Ok(result)
}
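/// Dispatch GGUF generation to the best available backend: CUDA, then wgpu,
/// then CPU. Models with legacy quant formats skip the GPU paths entirely.
/// The bool in the result records whether a GPU backend produced the tokens.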
#[allow(unused_variables)]
fn run_gguf_generate(
model: crate::gguf::OwnedQuantizedModel,
input_tokens: &[u32],
gen_config: &crate::gguf::QuantizedGenerateConfig,
config: &InferenceConfig,
) -> Result<(Vec<u32>, bool)> {
let has_legacy_quant = model_has_legacy_quant(&model);
#[cfg(feature = "cuda")]
let model = if !config.no_gpu && !has_legacy_quant {
match try_gguf_gpu_generate(model, input_tokens, gen_config, config.verbose) {
Ok(result) => return result,
            Err(returned_model) => returned_model,
        }
} else {
model
};
#[cfg(feature = "gpu")]
if !config.no_gpu && !has_legacy_quant {
match try_wgpu_generate(&model, input_tokens, gen_config, config.verbose) {
Ok(result) => return Ok(result),
Err(e) => {
if config.verbose {
eprintln!("Backend: CPU (wgpu unavailable: {})", e);
}
}
}
}
log_cpu_backend(config.verbose, has_legacy_quant);
let tokens = model
.generate_with_cache(input_tokens, gen_config)
.map_err(|e| RealizarError::InferenceError(format!("CPU generation failed: {}", e)))?;
Ok((tokens, false))
}
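/// Run inference on an APR model, trying CUDA, then wgpu, then CPU, honoring
/// `config.no_gpu` for both GPU paths.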
fn run_apr_inference(
config: &InferenceConfig,
prepared: &PreparedTokens,
) -> Result<InferenceResult> {
if config.verbose {
eprintln!("Loading APR model: {}", config.model_path.display());
}
let load_start = Instant::now();
let input_tokens = prepared.tokens();
let input_token_count = prepared.input_count();
#[cfg(feature = "cuda")]
if !config.no_gpu {
if let Some(result) =
try_apr_cuda_inference(config, input_tokens, input_token_count, load_start)
{
return result;
}
}
#[cfg(feature = "gpu")]
if !config.no_gpu {
match try_apr_wgpu_inference(config, input_tokens, input_token_count, load_start) {
Some(Ok(result)) => return Ok(result),
Some(Err(e)) => {
if config.verbose {
eprintln!("Backend: CPU (wgpu failed: {})", e);
}
}
None => {
if config.verbose {
eprintln!("Backend: CPU (wgpu not available)");
}
}
}
}
run_apr_cpu_inference(config, input_tokens, input_token_count, load_start)
}
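/// Greedy (argmax) decode of an APR model on wgpu. Returns `None` when wgpu
/// or the model is unavailable (caller falls back to CPU) and `Some(Err(..))`
/// for hard failures inside the forward pass.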
#[cfg(feature = "gpu")]
fn try_apr_wgpu_inference(
config: &InferenceConfig,
input_tokens: &[u32],
input_token_count: usize,
load_start: Instant,
) -> Option<Result<InferenceResult>> {
use crate::apr::MappedAprModel;
use crate::gpu::adapters::wgpu_adapter;
use trueno::backends::gpu::GpuDevice;
if !GpuDevice::is_available() {
return None;
}
let gpu = match GpuDevice::new() {
Ok(g) => g,
Err(e) => {
if config.verbose {
eprintln!("[GH-559] wgpu init failed: {}", e);
}
return None;
}
};
if config.verbose {
eprintln!("Backend: wgpu (Vulkan)");
}
let mapped = match MappedAprModel::from_path(&config.model_path) {
Ok(m) => m,
Err(_) => return None,
};
let model = match crate::gguf::OwnedQuantizedModel::from_apr(&mapped) {
Ok(m) => m,
Err(_) => return None,
};
let cfg = model.config();
let hidden_dim = cfg.hidden_dim;
let num_layers = cfg.num_layers;
let num_heads = cfg.num_heads;
let num_kv_heads = cfg.num_kv_heads;
let head_dim = hidden_dim / num_heads;
let intermediate_dim = cfg.intermediate_dim;
let vocab_size = cfg.vocab_size;
let eps = cfg.eps;
let kv_dim = num_kv_heads * head_dim;
    // resolve_apr_stop_tokens already seeds its result with the model EOS
    // token and falls back to tokenizer-derived tokens when none is set, so
    // no manual merge is needed here.
    let stop_toks = crate::infer::resolve_apr_stop_tokens(
        cfg.eos_token_id, &[], &config.model_path,
    );
let gen_config = crate::gguf::QuantizedGenerateConfig {
max_tokens: config.max_tokens,
temperature: 0.0,
top_k: 1,
stop_tokens: stop_toks,
trace: false,
..Default::default()
};
let weights = match wgpu_adapter::dequant_model_weights(&model) {
Ok(w) => w,
Err(e) => return Some(Err(e)),
};
let mut fwd = trueno::backends::gpu::WgslForwardPass::new(
gpu.device, gpu.queue,
hidden_dim, num_heads, num_kv_heads, head_dim, intermediate_dim,
);
for (name, data, _rows, _cols) in &weights {
fwd.upload_weight(name, data);
}
let output_norm = model.output_norm_weight();
    let lm_head_f32: Vec<f32> = match weights
        .iter()
        .find(|(n, _, _, _)| n == "lm_head")
    {
        Some((_, d, _, _)) => d.clone(),
        None => {
            return Some(Err(RealizarError::InferenceError(
                "lm_head weight not found".into(),
            )))
        }
    };
let max_seq = gen_config.max_tokens + input_tokens.len() + 16;
let mut kv_caches: Vec<(Vec<f32>, Vec<f32>)> = (0..num_layers)
.map(|_| (vec![0.0f32; max_seq * kv_dim], vec![0.0f32; max_seq * kv_dim]))
.collect();
let model_load_ms = load_start.elapsed().as_millis() as f64;
let infer_start = Instant::now();
let mut output_tokens = input_tokens.to_vec();
let stop_tokens = &gen_config.stop_tokens;
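    // Prefill sketch, mirroring the GGUF wgpu path (same assumption: the
    // forward pass writes K/V at `position` and attends over 0..=position).
    // Cover every prompt token except the last; the decode loop below handles
    // the final prompt token as its first step.
    for (pos, &tok) in input_tokens
        .iter()
        .take(input_tokens.len().saturating_sub(1))
        .enumerate()
    {
        let mut h = model.embed(&[tok]);
        for layer_idx in 0..num_layers {
            let prefix = format!("layer.{layer_idx}");
            let (ref mut kv_k, ref mut kv_v) = kv_caches[layer_idx];
            if let Err(e) = fwd.forward_layer(&mut h, &prefix, pos, kv_k, kv_v) {
                return Some(Err(RealizarError::InferenceError(format!(
                    "wgpu prefill layer {layer_idx}: {e}"
                ))));
            }
        }
    }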
    for _ in 0..gen_config.max_tokens {
let token_id = *output_tokens.last().unwrap();
let position = output_tokens.len() - 1;
let mut hidden = model.embed(&[token_id]);
for layer_idx in 0..num_layers {
let prefix = format!("layer.{layer_idx}");
let (ref mut kv_k, ref mut kv_v) = kv_caches[layer_idx];
if let Err(e) = fwd.forward_layer(&mut hidden, &prefix, position, kv_k, kv_v) {
return Some(Err(RealizarError::InferenceError(format!("wgpu layer {layer_idx}: {e}"))));
}
}
let sq_sum: f32 = hidden.iter().map(|x| x * x).sum();
let rms = (sq_sum / hidden.len() as f32 + eps).sqrt();
let normed: Vec<f32> = hidden.iter().zip(output_norm.iter())
.map(|(x, g)| (x / rms) * g)
.collect();
let mut best_idx = 0u32;
let mut best_val = f32::NEG_INFINITY;
for i in 0..vocab_size {
let row = &lm_head_f32[i * hidden_dim..(i + 1) * hidden_dim];
let logit: f32 = row.iter().zip(normed.iter()).map(|(w, x)| w * x).sum();
if logit > best_val {
best_val = logit;
best_idx = i as u32;
}
}
output_tokens.push(best_idx);
if stop_tokens.contains(&best_idx) { break; }
}
let inference_ms = infer_start.elapsed().as_millis() as f64;
let tokens_generated = output_tokens.len() - input_token_count;
let text = crate::infer::decode_apr_tokens(&config.model_path, &output_tokens[input_token_count..]);
Some(Ok(InferenceResult {
text,
tokens: output_tokens,
input_token_count,
generated_token_count: tokens_generated,
inference_ms,
load_ms: model_load_ms,
        tok_per_sec: tok_per_sec(tokens_generated, inference_ms),
format: "APR".to_string(),
used_gpu: true,
}))
}
fn apr_arch_to_template_hint(apr_arch: &str, _model_name: &str) -> &'static str {
crate::tensor_names::normalize_architecture(apr_arch)
}
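/// Model metadata captured before ownership moves into the CUDA context; used
/// only for verbose logging.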
#[cfg(feature = "cuda")]
struct AprCudaModelInfo {
arch: String,
num_layers: usize,
vocab_size: usize,
hidden_dim: usize,
}
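/// Load an APR model and move it into a CUDA context (device 0, max sequence
/// length 2048). Returns `None` on any failure, including legacy quant
/// formats the GPU path does not support.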
#[cfg(feature = "cuda")]
fn load_apr_cuda_model(
model_path: &std::path::Path,
verbose: bool,
) -> Option<(crate::gguf::OwnedQuantizedModelCuda, AprCudaModelInfo)> {
use crate::apr::MappedAprModel;
use crate::gguf::{OwnedQuantizedModel, OwnedQuantizedModelCuda};
let mapped = MappedAprModel::from_path(model_path).map_err(|e| {
if verbose { eprintln!("[APR-CUDA] MappedAprModel::from_path failed: {}", e); }
}).ok()?;
let model = OwnedQuantizedModel::from_apr(&mapped).map_err(|e| {
if verbose { eprintln!("[APR-CUDA] OwnedQuantizedModel::from_apr failed: {}", e); }
}).ok()?;
if model_has_legacy_quant(&model) {
return None;
}
let info = AprCudaModelInfo {
arch: model.config.architecture.clone(),
num_layers: model.config.num_layers,
vocab_size: model.config.vocab_size,
hidden_dim: model.config.hidden_dim,
};
let cuda_model = OwnedQuantizedModelCuda::with_max_seq_len(model, 0, 2048).map_err(|e| {
if verbose { eprintln!("Backend: CPU (GPU unavailable: {})", e); }
}).ok()?;
Some((cuda_model, info))
}
#[cfg(feature = "cuda")]
fn log_apr_cuda_info(
info: &AprCudaModelInfo,
cuda_model: &crate::gguf::OwnedQuantizedModelCuda,
load_ms: f64,
) {
eprintln!(
"Architecture: {} ({} layers, vocab_size={})",
info.arch, info.num_layers, info.vocab_size
);
eprintln!(
"Config: hidden_size={}, quant=CUDA+KVCache, threads=1 (GPU)",
info.hidden_dim
);
eprintln!("Model loaded in {:.1}ms", load_ms);
eprintln!(
"Backend: GPU ({}, {} MB VRAM)",
cuda_model.device_name(),
cuda_model.vram_mb()
);
}
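/// GPU-resident APR generation via CUDA. Returns `None` to request CPU
/// fallback (load failure, first-token validation failure, or an unsupported
/// architecture) and `Some(Err(..))` for unrecoverable generation errors.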
#[cfg(feature = "cuda")]
fn try_apr_cuda_inference(
config: &InferenceConfig,
input_tokens: &[u32],
input_token_count: usize,
load_start: Instant,
) -> Option<Result<InferenceResult>> {
use crate::gguf::QuantizedGenerateConfig;
let (mut cuda_model, info) = load_apr_cuda_model(&config.model_path, config.verbose)?;
let load_ms = load_start.elapsed().as_secs_f64() * 1000.0;
    if config.verbose {
        log_apr_cuda_info(&info, &cuda_model, load_ms);
        eprintln!("[GH-480-TRACE] try_apr_cuda_inference: model loaded OK, about to resolve stop tokens");
    }
let stop_tokens = resolve_apr_stop_tokens(
cuda_model.model().config.eos_token_id,
&config.stop_tokens,
&config.model_path,
);
let gen_config = QuantizedGenerateConfig {
max_tokens: config.max_tokens,
temperature: 0.0,
top_k: 1,
stop_tokens,
trace: false,
..Default::default()
};
eprintln!("[GH-480] F2 validation starting...");
if !validate_gpu_first_token(&mut cuda_model, &gen_config) {
eprintln!("[GH-480] F2 validation FAILED — falling back to CPU");
return None;
}
eprintln!("[GH-480] F2 validation PASSED — launching GPU generation");
let infer_start = Instant::now();
let tokens = match cuda_model.generate_gpu_resident(input_tokens, &gen_config) {
Ok(t) => t,
Err(e) => {
            let msg = e.to_string();
            if config.verbose {
                eprintln!("[GH-480] generate_gpu_resident FAILED: {msg}");
            }
if msg.contains("not supported") || msg.contains("architecture") {
if config.verbose {
eprintln!("[APR-CUDA] GPU-resident not supported, falling back to CPU: {msg}");
}
return None;
}
return Some(Err(RealizarError::InferenceError(format!(
"GPU generation failed: {}",
e
))));
},
};
let inference_ms = infer_start.elapsed().as_secs_f64() * 1000.0;
let generated_tokens = &tokens[input_token_count..];
let text = decode_apr_tokens(&config.model_path, generated_tokens);
let generated_token_count = generated_tokens.len();
Some(Ok(InferenceResult {
text,
tokens,
input_token_count,
generated_token_count,
inference_ms,
tok_per_sec: tok_per_sec(generated_token_count, inference_ms),
load_ms,
format: "APR".to_string(),
used_gpu: true,
}))
}
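/// Print architecture, config, and backend details for the CPU path when
/// verbose output is enabled.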
fn log_apr_cpu_model_info(
verbose: bool,
validated: &crate::safetensors::validation::ValidatedAprTransformer,
load_ms: f64,
) {
if !verbose {
return;
}
let arch = &validated.config.architecture;
let thread_count = rayon::current_num_threads();
eprintln!(
"Architecture: {} ({} layers, vocab_size={})",
arch, validated.config.num_layers, validated.config.vocab_size
);
eprintln!(
"Config: hidden_size={}, context_length={}, quant=F32 (dequantized), threads={}",
validated.config.hidden_dim, validated.config.context_length, thread_count
);
eprintln!("Model loaded in {:.1}ms", load_ms);
eprintln!("Backend: CPU (SIMD-accelerated)");
}
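/// Try to load the model as a LLaMA-style F32 transformer. Returns `None` if
/// the model would exceed the F32 dequantization memory limit, contains
/// quantized tensors, or reports a GPT-2 architecture, letting the caller
/// take the quantized CPU path instead.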
fn try_load_llama_style(
model_path: &std::path::Path,
) -> Option<crate::safetensors::validation::ValidatedAprTransformer> {
if exceeds_f32_dequant_limit(model_path) {
return None;
}
if has_quantized_tensors_apr(model_path) {
return None;
}
match crate::apr_transformer::AprTransformer::from_apr_file_validated(model_path) {
Ok(t) => {
let arch = t.config.architecture.to_lowercase();
if arch.contains("gpt2") || arch.contains("gpt-2") {
None
} else {
Some(t)
}
}
Err(_) => None,
}
}
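/// True if any tensor in the APR file has a dtype other than f32, f16, or
/// bf16 (i.e., the model carries quantized weights).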
fn has_quantized_tensors_apr(model_path: &std::path::Path) -> bool {
use crate::apr::MappedAprModel;
let mapped = match MappedAprModel::from_path(model_path) {
Ok(m) => m,
Err(_) => return false,
};
mapped.tensors.iter().any(|t| {
let dtype = t.dtype.as_str();
dtype != "f32" && dtype != "f16" && dtype != "bf16"
})
}
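/// Check whether dequantizing the model to F32 would exceed the estimated
/// memory budget (GH-478), logging the estimate when it would.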
fn exceeds_f32_dequant_limit(model_path: &std::path::Path) -> bool {
let file_size = std::fs::metadata(model_path)
.map(|m| m.len())
.unwrap_or(0);
let exceeds = crate::contract_gate::exceeds_f32_dequant_estimate(file_size);
if exceeds {
let mem_total = crate::contract_gate::system_memory_bytes().unwrap_or(0);
eprintln!(
"[GH-478] Model {} GB on disk → ~{} GB F32 dequant (system RAM: {} GB). \
Using quantized CPU inference to avoid OOM.",
file_size / (1 << 30),
file_size.saturating_mul(8) / (1 << 30),
mem_total / (1 << 30),
);
}
exceeds
}
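/// CPU inference for APR models: greedy decode through the validated F32
/// transformer when possible, otherwise the quantized fallback path.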
fn run_apr_cpu_inference(
config: &InferenceConfig,
input_tokens: &[u32],
input_token_count: usize,
load_start: Instant,
) -> Result<InferenceResult> {
let validated = match try_load_llama_style(&config.model_path) {
Some(t) => t,
None => return run_apr_quantized_cpu_inference(config, input_tokens, input_token_count, load_start),
};
let load_ms = load_start.elapsed().as_secs_f64() * 1000.0;
log_apr_cpu_model_info(config.verbose, &validated, load_ms);
let stop_tokens = resolve_apr_stop_tokens(
validated.config.eos_token_id,
&config.stop_tokens,
&config.model_path,
);
if config.verbose && !stop_tokens.is_empty() {
eprintln!("Stop tokens: {:?}", stop_tokens);
}
let infer_start = Instant::now();
let mut all_tokens = input_tokens.to_vec();
let mut cache = crate::apr_transformer::AprKVCache::new(&validated.config);
    // Prefill the KV cache with every prompt token except the last; the
    // decode loop below feeds the final prompt token and consumes its logits.
    // Prefilling it here too would process it twice, at two positions.
    for (pos, &token) in input_tokens
        .iter()
        .take(input_tokens.len().saturating_sub(1))
        .enumerate()
    {
        let _ = validated.forward_with_cache(token, &mut cache, pos)?;
    }
    let mut position = input_tokens.len().saturating_sub(1);
for _ in 0..config.max_tokens {
let last_token = *all_tokens.last().unwrap_or(&1);
let logits = validated.forward_with_cache(last_token, &mut cache, position)?;
let next_token = logits
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.map_or(0, |(i, _)| i as u32);
if next_token == 0 || stop_tokens.contains(&next_token) {
break;
}
all_tokens.push(next_token);
position += 1;
}
let inference_ms = infer_start.elapsed().as_secs_f64() * 1000.0;
let generated_tokens = &all_tokens[input_token_count..];
let text = decode_apr_tokens(&config.model_path, generated_tokens);
let generated_token_count = generated_tokens.len();
Ok(InferenceResult {
text,
tokens: all_tokens,
input_token_count,
generated_token_count,
inference_ms,
tok_per_sec: tok_per_sec(generated_token_count, inference_ms),
load_ms,
format: "APR".to_string(),
used_gpu: false,
})
}
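/// Quantized CPU fallback for APR models whose size or architecture rules out
/// the F32 path.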
fn run_apr_quantized_cpu_inference(
config: &InferenceConfig,
input_tokens: &[u32],
input_token_count: usize,
load_start: Instant,
) -> Result<InferenceResult> {
use crate::apr::MappedAprModel;
use crate::gguf::{OwnedQuantizedModel, QuantizedGenerateConfig};
let mapped = MappedAprModel::from_path(&config.model_path)?;
let model = OwnedQuantizedModel::from_apr(&mapped)?;
let load_ms = load_start.elapsed().as_secs_f64() * 1000.0;
if config.verbose {
eprintln!(
"Architecture: {} ({} layers, vocab_size={})",
model.config.architecture, model.config.num_layers, model.config.vocab_size
);
eprintln!(
"Config: hidden_size={}, quant=Q4_K (OwnedQuantizedModel CPU), threads={}",
model.config.hidden_dim,
rayon::current_num_threads()
);
eprintln!("Model loaded in {:.1}ms", load_ms);
eprintln!("Backend: CPU (OwnedQuantizedModel fallback for non-LLaMA arch)");
}
let stop_tokens = resolve_apr_stop_tokens(
model.config.eos_token_id,
&config.stop_tokens,
&config.model_path,
);
let gen_config = QuantizedGenerateConfig {
max_tokens: config.max_tokens,
temperature: config.temperature,
top_k: config.top_k,
stop_tokens,
trace: config.trace,
..Default::default()
};
let infer_start = Instant::now();
let tokens = model.generate_with_cache(input_tokens, &gen_config)?;
let inference_ms = infer_start.elapsed().as_secs_f64() * 1000.0;
let generated_tokens = &tokens[input_token_count..];
let text = decode_apr_tokens(&config.model_path, generated_tokens);
let generated_token_count = generated_tokens.len();
Ok(InferenceResult {
text,
tokens,
input_token_count,
generated_token_count,
inference_ms,
tok_per_sec: tok_per_sec(generated_token_count, inference_ms),
load_ms,
format: "APR".to_string(),
used_gpu: false,
})
}
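/// Build the stop-token list: model EOS first, then deduplicated caller
/// tokens; when both are absent, fall back to tokenizer-derived tokens.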
fn resolve_apr_stop_tokens(
model_eos: Option<u32>,
caller_stop_tokens: &[u32],
model_path: &std::path::Path,
) -> Vec<u32> {
let mut tokens: Vec<u32> = model_eos.into_iter().collect();
for &t in caller_stop_tokens {
if !tokens.contains(&t) {
tokens.push(t);
}
}
if tokens.is_empty() {
tokens = resolve_stop_tokens_from_tokenizer(model_path);
}
tokens
}
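/// Derive stop tokens from the tokenizer: its EOS id plus well-known
/// end-of-turn markers when present in the vocabulary.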
fn resolve_stop_tokens_from_tokenizer(model_path: &std::path::Path) -> Vec<u32> {
let tokenizer = match crate::apr::AprV2Model::load_tokenizer(model_path) {
Some(t) => t,
None => return Vec::new(),
};
let mut tokens: Vec<u32> = tokenizer.eos_id.into_iter().collect();
for marker in &["<|im_end|>", "<|endoftext|>"] {
let id = tokenizer
.special_tokens
.get(*marker)
.or_else(|| tokenizer.token_to_id.get(*marker));
if let Some(&id) = id {
if !tokens.contains(&id) {
tokens.push(id);
}
}
}
tokens
}
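/// Decode generated tokens using the model's tokenizer (or a fallback found
/// near the model), then clean the raw text; emits a placeholder string when
/// no tokenizer is available.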
fn decode_apr_tokens(model_path: &std::path::Path, tokens: &[u32]) -> String {
use crate::apr::AprV2Model;
let text = if let Some(tokenizer) = AprV2Model::load_tokenizer(model_path) {
tokenizer.decode(tokens)
} else if let Some(tokenizer) = find_fallback_tokenizer(model_path) {
tokenizer.decode(tokens)
} else {
format!("[{} tokens generated, tokenizer not found]", tokens.len())
};
clean_model_output(&text)
}
fn tok_per_sec(count: usize, ms: f64) -> f64 {
if ms > 0.0 {
count as f64 / (ms / 1000.0)
} else {
0.0
}
}
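/// Run inference on a SafeTensors model: CUDA when compiled in and permitted,
/// otherwise CPU.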
fn run_safetensors_inference(
config: &InferenceConfig,
prepared: &PreparedTokens,
) -> Result<InferenceResult> {
if config.verbose {
eprintln!("Loading SafeTensors model: {}", config.model_path.display());
}
let input_tokens = prepared.tokens().to_vec();
let input_token_count = prepared.input_count();
#[cfg(feature = "cuda")]
if !config.no_gpu {
if let Some(result) =
try_safetensors_cuda_inference(config, &input_tokens, input_token_count)
{
return result;
}
}
run_safetensors_cpu_inference(config, &input_tokens, input_token_count)
}
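/// CUDA generation for SafeTensors models. Returns `None` when GPU init fails
/// so the caller can fall back to CPU.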
#[cfg(feature = "cuda")]
fn try_safetensors_cuda_inference(
config: &InferenceConfig,
input_tokens: &[u32],
input_token_count: usize,
) -> Option<Result<InferenceResult>> {
use crate::safetensors_cuda::SafeTensorsCudaModel;
let load_start = Instant::now();
let mut cuda_model = match SafeTensorsCudaModel::load(&config.model_path, 0) {
Ok(m) => m,
Err(e) => {
if config.verbose {
eprintln!("Backend: CPU (GPU init failed: {})", e);
}
return None;
},
};
let load_ms = load_start.elapsed().as_secs_f64() * 1000.0;
if config.verbose {
eprintln!(
"Architecture: SafeTensors ({} layers, vocab_size={})",
cuda_model.config().num_layers,
cuda_model.config().vocab_size
);
eprintln!(
"Config: hidden_size={}, context_length={}, quant=F16/BF16, threads=1 (GPU)",
cuda_model.config().hidden_dim,
cuda_model.config().context_length
);
eprintln!("Model loaded in {:.1}ms", load_ms);
eprintln!(
"Backend: GPU ({}, {} MB VRAM)",
cuda_model.device_name(),
cuda_model.vram_mb()
);
}
let infer_start = Instant::now();
let eos_id = cuda_model.config().eos_token_id.unwrap_or(0);
let tokens = match cuda_model.generate(input_tokens, config.max_tokens, eos_id) {
Ok(t) => t,
Err(e) => {
return Some(Err(RealizarError::InferenceError(format!(
"GPU generation failed: {}",
e
))))
},
};
let inference_ms = infer_start.elapsed().as_secs_f64() * 1000.0;
let generated_tokens = &tokens[input_token_count..];
let text = decode_apr_tokens(&config.model_path, generated_tokens);
let generated_token_count = generated_tokens.len();
Some(Ok(InferenceResult {
text,
tokens,
input_token_count,
generated_token_count,
inference_ms,
tok_per_sec: tok_per_sec(generated_token_count, inference_ms),
load_ms,
format: "SafeTensors".to_string(),
used_gpu: true,
}))
}