/// Outcome of one inference run: the produced text, the token ids, and timing metrics.
#[derive(Debug, Clone)]
pub struct InferenceResult {
/// Generated text. The GGUF path stores the decoded output after `clean_model_output`.
pub text: String,
/// Token ids returned by the generator. The GGUF path treats the first
/// `input_token_count` entries as the prompt and decodes only what follows them.
pub tokens: Vec<u32>,
/// Number of prompt tokens fed to the model.
pub input_token_count: usize,
/// Number of tokens produced after the prompt.
pub generated_token_count: usize,
/// Time spent generating, in milliseconds.
pub inference_ms: f64,
/// Generation throughput in tokens per second.
pub tok_per_sec: f64,
/// Time spent loading the model, in milliseconds.
pub load_ms: f64,
/// Label of the model format that was run (the GGUF path sets `"GGUF"`).
pub format: String,
/// Whether generation ran on the GPU.
pub used_gpu: bool,
}
/// File extensions (lower-case, without the leading dot) accepted by `validate_model_path`.
const VALID_MODEL_EXTENSIONS: &[&str] = &["gguf", "safetensors", "apr", "bin", "json"];
/// Checks that `path` is safe and plausible to open as a model file.
///
/// The checks run in this order:
/// 1. the path contains no parent-directory (`..`) component,
/// 2. the extension (compared case-insensitively) is listed in `VALID_MODEL_EXTENSIONS`,
/// 3. the path exists,
/// 4. the path refers to a regular file.
///
/// # Errors
/// Returns `RealizarError::SecurityError` for a traversal component, an unexpected
/// extension, or a path that is not a regular file, and `RealizarError::IoError` when the
/// path does not exist.
pub(crate) fn validate_model_path(path: &std::path::Path) -> Result<()> {
    // Inspect real path components rather than searching the string for "..": a substring
    // test also rejects harmless file names such as `weights..v2.gguf`.
    let has_parent_component = path
        .components()
        .any(|component| matches!(component, std::path::Component::ParentDir));
    if has_parent_component {
        return Err(RealizarError::SecurityError {
            reason: format!(
                "Path traversal detected: '{}'. Use absolute paths or paths without '..'",
                path.to_string_lossy()
            ),
        });
    }

    // A missing or non-UTF-8 extension becomes "" and is rejected by the allow-list below.
    let extension = path
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();
    if !VALID_MODEL_EXTENSIONS.contains(&extension.as_str()) {
        return Err(RealizarError::SecurityError {
            reason: format!(
                "Invalid model file extension: '.{}'. Expected one of: {}",
                extension,
                VALID_MODEL_EXTENSIONS.join(", ")
            ),
        });
    }

    if !path.exists() {
        return Err(RealizarError::IoError {
            message: format!("File not found: {}", path.display()),
        });
    }
    // Directories and other non-regular entries are refused.
    if !path.is_file() {
        return Err(RealizarError::SecurityError {
            reason: format!("Path is not a regular file: {}", path.display()),
        });
    }
    Ok(())
}
/// Entry point for a single inference request.
///
/// Dispatch order:
/// * `config.use_mock_backend` short-circuits to `run_mock_inference`;
/// * a path ending in `.safetensors.index.json` is handled as a sharded SafeTensors model;
/// * otherwise the first 8 bytes of the file are passed to `detect_format` and the
///   matching backend is invoked.
///
/// # Errors
/// Returns the error from `validate_model_path`, an `IoError` when the file cannot be
/// opened or read, a `FormatError` when the file is shorter than 8 bytes or its format is
/// not recognised, and whatever the selected backend reports.
pub fn run_inference(config: &InferenceConfig) -> Result<InferenceResult> {
    if config.use_mock_backend {
        return run_mock_inference(config);
    }

    // Every real backend validates the path before anything else touches the file.
    validate_model_path(&config.model_path)?;

    let is_sharded_index = config
        .model_path
        .to_string_lossy()
        .ends_with(".safetensors.index.json");
    if is_sharded_index {
        let prepared = prepare_tokens(config, &ModelFormat::SafeTensors)?;
        return run_sharded_safetensors_inference(config, &prepared);
    }

    // Read just the header bytes needed for format detection; the file handle is dropped
    // at the end of this scope.
    let mut magic = [0u8; 8];
    {
        use std::io::Read;
        let mut model_file =
            std::fs::File::open(&config.model_path).map_err(|e| RealizarError::IoError {
                message: format!("Failed to read model: {}", e),
            })?;
        model_file
            .read_exact(&mut magic)
            .map_err(|e| match e.kind() {
                std::io::ErrorKind::UnexpectedEof => RealizarError::FormatError {
                    reason: "File too small for format detection".to_string(),
                },
                _ => RealizarError::IoError {
                    message: format!("Failed to read model header: {}", e),
                },
            })?;
    }

    let format = detect_format(&magic).map_err(|e| RealizarError::FormatError {
        reason: format!("Format detection failed: {}", e),
    })?;
    let prepared = prepare_tokens(config, &format)?;
    match format {
        ModelFormat::Gguf => run_gguf_inference(config, &prepared),
        ModelFormat::Apr => run_apr_inference(config, &prepared),
        ModelFormat::SafeTensors => run_safetensors_inference(config, &prepared),
    }
}
/// Runs inference on a GGUF model and packages the outcome as an `InferenceResult`.
///
/// Steps:
/// 1. map the file, prefault the mapping and build an `OwnedQuantizedModel` (timed as
///    `load_ms`);
/// 2. merge the model's EOS token id with `config.stop_tokens` (no duplicates);
/// 3. generate: the `qwen3_moe` architecture uses its dedicated generator and reports
///    `used_gpu = false`, everything else goes through `run_gguf_generate`;
/// 4. decode the tokens that follow the prompt, clean the text, and write the optional
///    trace file via `write_gguf_trace`.
///
/// # Errors
/// Propagates errors from mapping/loading the model and from generation.
fn run_gguf_inference(
    config: &InferenceConfig,
    prepared: &PreparedTokens,
) -> Result<InferenceResult> {
    use crate::gguf::{MappedGGUFModel, OwnedQuantizedModel, QuantizedGenerateConfig};

    if config.verbose {
        eprintln!("Loading model: {}", config.model_path.display());
    }

    let load_start = Instant::now();
    let mapped = MappedGGUFModel::from_path(&config.model_path)?;
    prefault_mmap(mapped.data());
    let model = OwnedQuantizedModel::from_mapped(&mapped)?;
    let load_ms = load_start.elapsed().as_secs_f64() * 1000.0;

    let gguf_arch = mapped.model.architecture().unwrap_or("transformer");
    if config.verbose {
        print_gguf_verbose_info(gguf_arch, &model, load_ms);
    }

    let input_tokens = prepared.tokens().to_vec();
    let input_token_count = prepared.input_count();
    // `model` may be moved into the generator below, so keep a copy of its config for the
    // trace output.
    let model_config = model.config.clone();

    // Model EOS (if any) first, then caller-supplied stop tokens without duplicates.
    let mut stop_tokens: Vec<u32> = model_config.eos_token_id.into_iter().collect();
    for &t in &config.stop_tokens {
        if !stop_tokens.contains(&t) {
            stop_tokens.push(t);
        }
    }

    let mut gen_config = QuantizedGenerateConfig {
        max_tokens: config.max_tokens,
        stop_tokens,
        trace: config.trace,
        ..Default::default()
    };
    config.apply_sampling_to(&mut gen_config);

    let infer_start = Instant::now();
    let canonical_arch = crate::tensor_names::normalize_architecture(&model.config.architecture);
    let (tokens, used_gpu) = if canonical_arch == "qwen3_moe" {
        let tokens = crate::infer::qwen3_moe_generate::run_qwen3_moe_generate(
            &mapped,
            &model,
            &input_tokens,
            &gen_config,
        )?;
        (tokens, false)
    } else {
        run_gguf_generate(model, &input_tokens, &gen_config, config)?
    };
    let inference_ms = infer_start.elapsed().as_secs_f64() * 1000.0;

    // Checked slicing: if a generator ever returns fewer tokens than the prompt length,
    // report zero generated tokens instead of panicking on an out-of-range slice.
    let generated_tokens: &[u32] = tokens.get(input_token_count..).unwrap_or(&[]);
    let raw_text = mapped.model.decode(generated_tokens);

    if config.verbose {
        eprintln!("[DEBUG] input_count={}, total_tokens={}, generated_count={}", input_token_count, tokens.len(), generated_tokens.len());
        eprintln!("[DEBUG] generated token ids: {:?}", &generated_tokens[..generated_tokens.len().min(20)]);
        // Cut the preview at a char boundary at or below 200 bytes; slicing at a raw byte
        // offset panics when it lands inside a multi-byte UTF-8 character.
        let mut preview_end = raw_text.len().min(200);
        while !raw_text.is_char_boundary(preview_end) {
            preview_end -= 1;
        }
        eprintln!("[DEBUG] raw decoded: {:?}", &raw_text[..preview_end]);
    }

    let text = clean_model_output(&raw_text);
    let generated_token_count = generated_tokens.len();
    let tps = tok_per_sec(generated_token_count, inference_ms);

    write_gguf_trace(
        config,
        &model_config,
        input_token_count,
        generated_token_count,
        load_ms,
        inference_ms,
        tps,
        used_gpu,
    );

    Ok(InferenceResult {
        text,
        tokens,
        input_token_count,
        generated_token_count,
        inference_ms,
        tok_per_sec: tps,
        load_ms,
        format: "GGUF".to_string(),
        used_gpu,
    })
}
/// Prints a three-line model summary to stderr for verbose runs.
///
/// `gguf_arch` is the architecture string read from the GGUF metadata; it is mapped to a
/// display name (falling back to "Transformer") and also echoed verbatim. The remaining
/// values come from `model.config`, the quantisation type of the LM head weight, and the
/// current rayon thread count.
fn print_gguf_verbose_info(
    gguf_arch: &str,
    model: &crate::gguf::OwnedQuantizedModel,
    load_ms: f64,
) {
    let normalized = gguf_arch.to_lowercase();
    let display_name = match normalized.as_str() {
        "qwen2" | "qwen" => "Qwen2",
        "llama" => "LLaMA",
        "mistral" => "Mistral",
        "phi" | "phi3" => "Phi",
        _ => "Transformer",
    };
    let quant_label = qtype_to_dtype_str(model.lm_head_weight.qtype);
    let threads = rayon::current_num_threads();
    let cfg = &model.config;

    eprintln!(
        "Architecture: {} [GGUF: {}] ({} layers, vocab_size={})",
        display_name, gguf_arch, cfg.num_layers, cfg.vocab_size
    );
    eprintln!(
        "Config: hidden_size={}, context_length={}, quant={}, threads={}",
        cfg.hidden_dim, cfg.context_length, quant_label, threads
    );
    eprintln!("Model loaded in {:.1}ms", load_ms);
}
fn write_gguf_trace(
config: &InferenceConfig,
model_config: &crate::gguf::GGUFConfig,
input_token_count: usize,
generated_token_count: usize,
load_ms: f64,
inference_ms: f64,
tps: f64,
used_gpu: bool,
) {
let trace_path = match config.trace_output {
Some(ref p) => p,
None => return,
};
let trace_json = format!(
r#"{{
"version": "1.0",
"timestamp": "{}",
"model": {{
"path": "{}",
"format": "GGUF",
"num_layers": {},
"hidden_dim": {},
"vocab_size": {},
"num_heads": {}
}},
"inference": {{
"input_tokens": {},
"generated_tokens": {},
"load_ms": {:.2},
"inference_ms": {:.2},
"tok_per_sec": {:.2},
"used_gpu": {}
}},
"events": []
}}
"#,
chrono::Utc::now().to_rfc3339(),
config.model_path.display(),
model_config.num_layers,
model_config.hidden_dim,
model_config.vocab_size,
model_config.num_heads,
input_token_count,
generated_token_count,
load_ms,
inference_ms,
tps,
used_gpu
);
if let Err(e) = std::fs::write(trace_path, trace_json) {
eprintln!(
"Warning: Failed to write trace output to {}: {}",
trace_path.display(),
e
);
}
}
/// Returns whether `qtype` is a quantisation type the GPU path cannot handle.
///
/// Thin wrapper that delegates to `crate::gguf::gpu_unsupported_quant_qtype`.
#[inline]
fn is_legacy_gguf_quant(qtype: u32) -> bool {
crate::gguf::gpu_unsupported_quant_qtype(qtype)
}
/// Returns whether `model` reports a GPU-unsupported quantisation.
///
/// Thin wrapper that delegates to `OwnedQuantizedModel::has_gpu_unsupported_quant`.
fn model_has_legacy_quant(model: &crate::gguf::OwnedQuantizedModel) -> bool {
model.has_gpu_unsupported_quant()
}
/// Announces on stderr that the CPU backend is in use; silent unless `verbose` is set.
///
/// `is_legacy` selects the message explaining that the CPU was chosen because of the
/// model's quantisation format rather than the generic SIMD message.
#[inline]
fn log_cpu_backend(verbose: bool, is_legacy: bool) {
    if verbose {
        let backend_line = if is_legacy {
            "Backend: CPU (Q4_0 format - GPU Q4_K kernels incompatible)"
        } else {
            "Backend: CPU (SIMD-accelerated)"
        };
        eprintln!("{}", backend_line);
    }
}
/// Parity gate: checks that the GPU path's first generated token agrees with the CPU path.
///
/// The CPU model is run over a short probe context and the argmax of its final logits is
/// compared with the first token the GPU generates for the same probe. A mismatch is still
/// accepted when `gpu_probe_token_acceptable` classifies it as a near-tie.
///
/// Returns `true` when the GPU path may be used (match, near-tie, or validation skipped)
/// and `false` when the GPU diverges from the CPU or GPU generation fails.
#[cfg(feature = "cuda")]
fn validate_gpu_first_token(
cuda_model: &mut crate::gguf::OwnedQuantizedModelCuda,
gen_config: &crate::gguf::QuantizedGenerateConfig,
probe_context: &[u32],
) -> bool {
use crate::gguf::OwnedQuantizedKVCache;
// Escape hatch: setting SKIP_PARITY_GATE=1 disables the check entirely.
if std::env::var("SKIP_PARITY_GATE")
.map(|v| v == "1")
.unwrap_or(false)
{
return true;
}
let model = cuda_model.model();
// Upper bound on how many trailing prompt tokens are replayed through the CPU model.
const PROBE_MAX_CTX: usize = 64;
// Probe = last PROBE_MAX_CTX tokens of the prompt, or just BOS when there is no prompt.
let probe: Vec<u32> = if probe_context.is_empty() {
match model.config.bos_token_id {
Some(id) => vec![id],
None => {
eprintln!("[F2-VALIDATION] no prompt context and BOS unknown — skipping GPU validation");
return true;
},
}
} else {
let start = probe_context.len().saturating_sub(PROBE_MAX_CTX);
probe_context[start..].to_vec()
};
// kv_dim = number of KV heads * per-head dimension (hidden_dim / num_heads).
let kv_dim = model.config.num_kv_heads * (model.config.hidden_dim / model.config.num_heads);
let num_layers = model.config.num_layers;
// NOTE(review): the third argument is presumably the cache's maximum sequence length,
// kept at 2 or more — confirm against `OwnedQuantizedKVCache::new`.
let mut cpu_cache = OwnedQuantizedKVCache::new(num_layers, kv_dim, probe.len().max(2));
let mut cpu_logits = None;
// Feed the probe token by token; only the logits of the last position are kept.
for (pos, &tok) in probe.iter().enumerate() {
match model.forward_single_with_cache(tok, &mut cpu_cache, pos) {
Ok(logits) => cpu_logits = Some(logits),
// A CPU reference failure means there is nothing to compare against: accept the GPU path.
Err(_) => return true, }
}
let cpu_logits = match cpu_logits {
Some(l) => l,
None => return true,
};
// CPU argmax; incomparable values (NaN) are treated as equal, and 0 is used if there are no logits.
let cpu_argmax = cpu_logits
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
.map_or(0, |(i, _)| i as u32);
// Ask the GPU for exactly one token with temperature 0 and top_k 1 so that the result
// is comparable with the CPU argmax (presumably greedy decoding — confirm in the generator).
let gpu_first_config = crate::gguf::QuantizedGenerateConfig {
max_tokens: 1,
temperature: 0.0,
top_k: 1,
..gen_config.clone()
};
match cuda_model.generate_gpu_resident(&probe, &gpu_first_config) {
// The token at index `probe.len()` is taken as the first generated one; this assumes
// the returned sequence starts with the probe tokens.
Ok(gpu_tokens) if gpu_tokens.len() > probe.len() => {
let gpu_first = gpu_tokens[probe.len()];
// rel_gap is only used for the log messages; the decision itself is made by
// `gpu_probe_token_acceptable`.
let rel_gap = cpu_logit_rel_gap(&cpu_logits, cpu_argmax, gpu_first);
if gpu_probe_token_acceptable(&cpu_logits, cpu_argmax, gpu_first) {
if gpu_first != cpu_argmax {
eprintln!(
"[F2-VALIDATION] GPU token {gpu_first} != CPU argmax {cpu_argmax} but near-tie (rel_gap={rel_gap:.4} <= {GPU_PROBE_NEAR_TIE_REL_GAP}) on {}-token probe — accepting GPU path",
probe.len()
);
}
true
} else {
eprintln!(
"[F2-VALIDATION] GPU token {gpu_first} != CPU token {cpu_argmax}; rel_gap={rel_gap:.4} > {GPU_PROBE_NEAR_TIE_REL_GAP} on {}-token probe — real divergence, falling back to CPU",
probe.len()
);
false
}
},
// The GPU produced no new token: nothing to compare, so the GPU path is accepted.
Ok(_) => true,
// GPU generation failed: report the GPU path as unusable.
Err(_) => false,
}
}
/// Largest relative logit gap (as computed by `cpu_logit_rel_gap`) at which
/// `gpu_probe_token_acceptable` still accepts a GPU token that differs from the CPU argmax.
pub(crate) const GPU_PROBE_NEAR_TIE_REL_GAP: f32 = 0.15;
/// Relative distance between the CPU's top logit and the CPU logit of `token`.
///
/// Computed as `(logit[cpu_argmax] - logit[token]) / (logit[cpu_argmax] - min_logit)`, so
/// `0.0` means `token` scores as high as the CPU argmax and `1.0` means it scores as low as
/// the lowest finite logit.
///
/// Edge cases:
/// * `token` outside `cpu_logits` is treated as a `-inf` logit, giving an infinite gap;
/// * `cpu_argmax` outside `cpu_logits` (including empty logits) returns `f32::INFINITY`
///   instead of panicking, because there is no reference logit to compare against;
/// * non-finite logits are ignored when computing the minimum, so a single `-inf`
///   (e.g. a masked token) cannot stretch the range to infinity and collapse every gap to 0;
/// * a zero range is clamped to `f32::MIN_POSITIVE` to avoid dividing by zero.
pub(crate) fn cpu_logit_rel_gap(cpu_logits: &[f32], cpu_argmax: u32, token: u32) -> f32 {
    let cpu_max = match cpu_logits.get(cpu_argmax as usize) {
        Some(&value) => value,
        None => return f32::INFINITY,
    };
    let cpu_min = cpu_logits
        .iter()
        .copied()
        .filter(|logit| logit.is_finite())
        .fold(f32::INFINITY, f32::min);
    let at = cpu_logits
        .get(token as usize)
        .copied()
        .unwrap_or(f32::NEG_INFINITY);
    let range = (cpu_max - cpu_min).max(f32::MIN_POSITIVE);
    (cpu_max - at) / range
}
/// Decides whether the GPU's first token is compatible with the CPU reference.
///
/// An exact match with `cpu_argmax` is always accepted. Otherwise the token is accepted
/// only when its relative logit gap to the CPU argmax is at most
/// `GPU_PROBE_NEAR_TIE_REL_GAP`, i.e. the CPU itself considered the two tokens a near-tie.
pub(crate) fn gpu_probe_token_acceptable(cpu_logits: &[f32], cpu_argmax: u32, gpu_first: u32) -> bool {
    if gpu_first == cpu_argmax {
        return true;
    }
    let gap = cpu_logit_rel_gap(cpu_logits, cpu_argmax, gpu_first);
    gap <= GPU_PROBE_NEAR_TIE_REL_GAP
}
/// Unit tests for the CPU/GPU parity gate helpers (`cpu_logit_rel_gap`,
/// `gpu_probe_token_acceptable`).
#[cfg(test)]
mod pmat742_parity_gate_tests {
    use super::{cpu_logit_rel_gap, gpu_probe_token_acceptable, GPU_PROBE_NEAR_TIE_REL_GAP};

    /// Four almost-equal top logits followed by two clearly lower ones.
    const NEAR_FLAT: [f32; 6] = [10.00, 9.99, 9.98, 9.97, 1.00, 0.50];

    #[test]
    fn near_tie_argmax_flip_is_accepted() {
        let rel_gap = cpu_logit_rel_gap(&NEAR_FLAT, 0, 1);
        assert!(rel_gap <= GPU_PROBE_NEAR_TIE_REL_GAP, "rel_gap {rel_gap} should be a near-tie");
        // Any of the tightly clustered top tokens counts as a near-tie.
        for gpu_token in [1, 3] {
            assert!(gpu_probe_token_acceptable(&NEAR_FLAT, 0, gpu_token));
        }
    }

    #[test]
    fn exact_argmax_match_is_accepted() {
        let peaked = [20.0_f32, 1.0, 0.5, 0.1];
        assert!(gpu_probe_token_acceptable(&NEAR_FLAT, 0, 0));
        assert!(gpu_probe_token_acceptable(&peaked, 0, 0));
    }

    #[test]
    fn real_divergence_is_rejected() {
        let peaked = [20.0_f32, 5.0, 0.5, 0.0];
        let rel_gap = cpu_logit_rel_gap(&peaked, 0, 3);
        assert!(rel_gap > GPU_PROBE_NEAR_TIE_REL_GAP, "tail token rel_gap {rel_gap} must exceed tolerance");
        assert!(!gpu_probe_token_acceptable(&peaked, 0, 3));
    }

    #[test]
    fn rel_gap_endpoints_are_zero_and_one() {
        let logits = [3.0_f32, 2.0, 1.0, 0.0];
        let gap_at_argmax = cpu_logit_rel_gap(&logits, 0, 0);
        let gap_at_minimum = cpu_logit_rel_gap(&logits, 0, 3);
        assert!((gap_at_argmax - 0.0).abs() < 1e-6);
        assert!((gap_at_minimum - 1.0).abs() < 1e-6);
    }

    #[test]
    fn out_of_range_gpu_token_is_rejected() {
        let accepted = gpu_probe_token_acceptable(&NEAR_FLAT, 0, 9999);
        assert!(!accepted);
    }
}