/// Outcome of a single inference run: decoded text, tokens, counts, and timing.
#[derive(Debug, Clone)]
pub struct InferenceResult {
pub text: String,
pub tokens: Vec<u32>,
pub input_token_count: usize,
pub generated_token_count: usize,
pub inference_ms: f64,
pub tok_per_sec: f64,
pub load_ms: f64,
pub format: String,
pub used_gpu: bool,
}
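/// File extensions accepted by `validate_model_path`.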
const VALID_MODEL_EXTENSIONS: &[&str] = &["gguf", "safetensors", "apr", "bin", "json"];
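/// Reject paths containing `..`, unrecognized extensions, missing files, and
/// non-regular files before any model data is read.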
pub(crate) fn validate_model_path(path: &std::path::Path) -> Result<()> {
let path_str = path.to_string_lossy();
if path_str.contains("..") {
return Err(RealizarError::SecurityError {
reason: format!(
"Path traversal detected: '{}'. Use absolute paths or paths without '..'",
path_str
),
});
}
let extension = path
.extension()
.and_then(|e| e.to_str())
.map(str::to_lowercase)
.unwrap_or_default();
if !VALID_MODEL_EXTENSIONS.contains(&extension.as_str()) {
return Err(RealizarError::SecurityError {
reason: format!(
"Invalid model file extension: '.{}'. Expected one of: {}",
extension,
VALID_MODEL_EXTENSIONS.join(", ")
),
});
}
if !path.exists() {
return Err(RealizarError::IoError {
message: format!("File not found: {}", path.display()),
});
}
if !path.is_file() {
return Err(RealizarError::SecurityError {
reason: format!("Path is not a regular file: {}", path.display()),
});
}
Ok(())
}
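/// Run inference with the given config, dispatching on the detected model format
/// (GGUF, APR, or SafeTensors). Mock and sharded-SafeTensors paths are handled first.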
pub fn run_inference(config: &InferenceConfig) -> Result<InferenceResult> {
if config.use_mock_backend {
return run_mock_inference(config);
}
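// Sharded SafeTensors checkpoints are named `*.safetensors.index.json`, so they are
// routed by file name rather than by magic-byte detection.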
let path_str = config.model_path.to_string_lossy();
if path_str.ends_with(".safetensors.index.json") {
validate_model_path(&config.model_path)?;
let format = ModelFormat::SafeTensors;
let prepared = prepare_tokens(config, &format)?;
return run_sharded_safetensors_inference(config, &prepared);
}
validate_model_path(&config.model_path)?;
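// Sniff the first 8 bytes of the file to detect the container format.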
let magic = {
use std::io::Read;
let mut file = std::fs::File::open(&config.model_path).map_err(|e| RealizarError::IoError {
message: format!("Failed to read model: {}", e),
})?;
let mut buf = [0u8; 8];
file.read_exact(&mut buf).map_err(|e| {
if e.kind() == std::io::ErrorKind::UnexpectedEof {
RealizarError::FormatError {
reason: "File too small for format detection".to_string(),
}
} else {
RealizarError::IoError {
message: format!("Failed to read model header: {}", e),
}
}
})?;
buf
};
let format = detect_format(&magic).map_err(|e| RealizarError::FormatError {
reason: format!("Format detection failed: {}", e),
})?;
let prepared = prepare_tokens(config, &format)?;
match format {
ModelFormat::Gguf => run_gguf_inference(config, &prepared),
ModelFormat::Apr => run_apr_inference(config, &prepared),
ModelFormat::SafeTensors => run_safetensors_inference(config, &prepared),
}
}
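/// Memory-map a GGUF model, generate tokens from the prepared prompt, and decode the output.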
fn run_gguf_inference(
config: &InferenceConfig,
prepared: &PreparedTokens,
) -> Result<InferenceResult> {
use crate::gguf::{MappedGGUFModel, OwnedQuantizedModel, QuantizedGenerateConfig};
if config.verbose {
eprintln!("Loading model: {}", config.model_path.display());
}
let load_start = Instant::now();
let mapped = MappedGGUFModel::from_path(&config.model_path)?;
prefault_mmap(mapped.data());
let model = OwnedQuantizedModel::from_mapped(&mapped)?;
let load_ms = load_start.elapsed().as_secs_f64() * 1000.0;
let gguf_arch = mapped.model.architecture().unwrap_or("transformer");
if config.verbose {
print_gguf_verbose_info(gguf_arch, &model, load_ms);
}
let input_tokens = prepared.tokens().to_vec();
let input_token_count = prepared.input_count();
let model_config = model.config.clone();
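// Merge the model's EOS token(s) with any user-supplied stop tokens, skipping duplicates.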
let mut stop_tokens: Vec<u32> = model_config.eos_token_id.into_iter().collect();
for &t in &config.stop_tokens {
if !stop_tokens.contains(&t) {
stop_tokens.push(t);
}
}
let gen_config = QuantizedGenerateConfig {
max_tokens: config.max_tokens,
temperature: config.temperature,
top_k: config.top_k,
stop_tokens,
trace: config.trace,
..Default::default()
};
let infer_start = Instant::now();
let (tokens, used_gpu) = run_gguf_generate(model, &input_tokens, &gen_config, config)?;
let inference_ms = infer_start.elapsed().as_secs_f64() * 1000.0;
let generated_tokens = &tokens[input_token_count..];
let raw_text = mapped.model.decode(generated_tokens);
if config.verbose {
eprintln!("[DEBUG] input_count={}, total_tokens={}, generated_count={}", input_token_count, tokens.len(), generated_tokens.len());
eprintln!("[DEBUG] generated token ids: {:?}", &generated_tokens[..generated_tokens.len().min(20)]);
eprintln!("[DEBUG] raw decoded: {:?}", &raw_text[..raw_text.len().min(200)]);
}
let text = clean_model_output(&raw_text);
let generated_token_count = generated_tokens.len();
let tps = tok_per_sec(generated_token_count, inference_ms);
write_gguf_trace(
config,
&model_config,
input_token_count,
generated_token_count,
load_ms,
inference_ms,
tps,
used_gpu,
);
Ok(InferenceResult {
text,
tokens,
input_token_count,
generated_token_count,
inference_ms,
tok_per_sec: tps,
load_ms,
format: "GGUF".to_string(),
used_gpu,
})
}
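/// Print architecture, config, and load-time details for a GGUF model in verbose mode.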
fn print_gguf_verbose_info(
gguf_arch: &str,
model: &crate::gguf::OwnedQuantizedModel,
load_ms: f64,
) {
let arch = match gguf_arch.to_lowercase().as_str() {
"qwen2" | "qwen" => "Qwen2",
"llama" => "LLaMA",
"mistral" => "Mistral",
"phi" | "phi3" => "Phi",
_ => "Transformer",
};
let quant_type = qtype_to_dtype_str(model.lm_head_weight.qtype);
let thread_count = rayon::current_num_threads();
eprintln!(
"Architecture: {} [GGUF: {}] ({} layers, vocab_size={})",
arch, gguf_arch, model.config.num_layers, model.config.vocab_size
);
eprintln!(
"Config: hidden_size={}, context_length={}, quant={}, threads={}",
model.config.hidden_dim, model.config.context_length, quant_type, thread_count
);
eprintln!("Model loaded in {:.1}ms", load_ms);
}
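/// Write a JSON trace summary to `config.trace_output`, if set. Write failures are
/// reported as warnings rather than propagated as errors.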
fn write_gguf_trace(
config: &InferenceConfig,
model_config: &crate::gguf::GGUFConfig,
input_token_count: usize,
generated_token_count: usize,
load_ms: f64,
inference_ms: f64,
tps: f64,
used_gpu: bool,
) {
let trace_path = match config.trace_output {
Some(ref p) => p,
None => return,
};
let trace_json = format!(
r#"{{
"version": "1.0",
"timestamp": "{}",
"model": {{
"path": "{}",
"format": "GGUF",
"num_layers": {},
"hidden_dim": {},
"vocab_size": {},
"num_heads": {}
}},
"inference": {{
"input_tokens": {},
"generated_tokens": {},
"load_ms": {:.2},
"inference_ms": {:.2},
"tok_per_sec": {:.2},
"used_gpu": {}
}},
"events": []
}}
"#,
chrono::Utc::now().to_rfc3339(),
config.model_path.display(),
model_config.num_layers,
model_config.hidden_dim,
model_config.vocab_size,
model_config.num_heads,
input_token_count,
generated_token_count,
load_ms,
inference_ms,
tps,
used_gpu
);
if let Err(e) = std::fs::write(trace_path, trace_json) {
eprintln!(
"Warning: Failed to write trace output to {}: {}",
trace_path.display(),
e
);
}
}
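/// GGUF quantization type IDs 2, 3, 6, and 7 are the legacy Q4_0/Q4_1/Q5_0/Q5_1 formats,
/// which the GPU Q4_K kernels cannot execute.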
#[inline]
fn is_legacy_gguf_quant(qtype: u32) -> bool {
matches!(qtype, 2 | 3 | 6 | 7)
}
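/// Returns true if the LM head or any layer's FFN/attention-output weights use a legacy quant type.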
fn model_has_legacy_quant(model: &crate::gguf::OwnedQuantizedModel) -> bool {
is_legacy_gguf_quant(model.lm_head_weight.qtype)
|| model.layers.iter().any(|l| {
is_legacy_gguf_quant(l.ffn_down_weight.qtype)
|| is_legacy_gguf_quant(l.ffn_up_weight.qtype)
|| is_legacy_gguf_quant(l.attn_output_weight.qtype)
})
}
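/// In verbose mode, report why the CPU backend was selected.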
#[inline]
fn log_cpu_backend(verbose: bool, is_legacy: bool) {
if !verbose {
return;
}
if is_legacy {
eprintln!("Backend: CPU (Q4_0 format - GPU Q4_K kernels incompatible)");
} else {
eprintln!("Backend: CPU (SIMD-accelerated)");
}
}
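/// Parity gate for the CUDA path: greedily generate one token from a BOS probe on both the
/// CPU and GPU backends and fall back to CPU if they disagree. Set `SKIP_PARITY_GATE=1` to bypass.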
#[cfg(feature = "cuda")]
fn validate_gpu_first_token(
cuda_model: &mut crate::gguf::OwnedQuantizedModelCuda,
gen_config: &crate::gguf::QuantizedGenerateConfig,
) -> bool {
use crate::gguf::OwnedQuantizedKVCache;
if std::env::var("SKIP_PARITY_GATE")
.map(|v| v == "1")
.unwrap_or(false)
{
return true;
}
let model = cuda_model.model();
let bos_id = match model.config.bos_token_id {
Some(id) => id,
None => {
eprintln!("[F2-VALIDATION] BOS token unknown — skipping GPU validation");
return true;
},
};
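// CPU reference: run the single BOS probe through the CPU forward pass and take its argmax.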
let probe_token: &[u32] = &[bos_id];
let kv_dim = model.config.num_kv_heads * (model.config.hidden_dim / model.config.num_heads);
let num_layers = model.config.num_layers;
let mut cpu_cache = OwnedQuantizedKVCache::new(num_layers, kv_dim, 2);
let cpu_logits = match model.forward_single_with_cache(probe_token[0], &mut cpu_cache, 0) {
Ok(logits) => logits,
Err(_) => return true,
};
let cpu_argmax = cpu_logits
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.map_or(0, |(i, _)| i as u32);
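// Greedy-decode exactly one token on the GPU so the comparison is deterministic.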
let gpu_first_config = crate::gguf::QuantizedGenerateConfig {
max_tokens: 1,
temperature: 0.0,
top_k: 1,
..gen_config.clone()
};
match cuda_model.generate_gpu_resident(probe_token, &gpu_first_config) {
Ok(gpu_tokens) if gpu_tokens.len() > 1 => {
let gpu_first = gpu_tokens[1];
if gpu_first == cpu_argmax {
true
} else {
eprintln!(
"[F2-VALIDATION] GPU token {} != CPU token {} for BOS probe — falling back to CPU",
gpu_first, cpu_argmax
);
false
}
},
Ok(_) => true,
Err(_) => false,
}
}