#[cfg(feature = "inference")]
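/// Reads the first 8 magic bytes of `path` and asks `realizar` to identify
/// the container format. Returns `None` if the file cannot be opened, is
/// shorter than 8 bytes, or the magic is not recognized.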
fn detect_model_format(path: &Path) -> Option<realizar::format::ModelFormat> {
let magic = std::fs::File::open(path).ok().and_then(|mut f| {
use std::io::Read;
let mut buf = [0u8; 8];
f.read_exact(&mut buf).ok()?;
Some(buf.to_vec())
})?;
realizar::format::detect_format(&magic).ok()
}
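/// QA gate: runs PTX kernel parity validation against the model's kernel
/// dimensions. Skipped for non-GGUF models and for builds without the
/// `inference` feature.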
fn run_ptx_parity_gate(path: &Path, config: &QaConfig) -> Result<GateResult> {
let start = Instant::now();
if !config.json && config.verbose {
println!("{}", "Running PTX parity validation...".yellow());
}
#[cfg(feature = "inference")]
{
use realizar::format::ModelFormat;
if detect_model_format(path) != Some(ModelFormat::Gguf) {
return Ok(GateResult::skipped(
"ptx_parity",
"Non-GGUF format (PTX kernels only apply to quantized inference)",
));
}
let report = run_ptx_validation(path)?;
let duration = start.elapsed();
if !report.all_passed() {
print_ptx_violations(config, &report);
}
Ok(ptx_gate_result(&report, duration))
}
#[cfg(not(feature = "inference"))]
{
let _ = (path, config, start);
Ok(GateResult::skipped(
"ptx_parity",
"Requires inference feature",
))
}
}
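/// Loads the GGUF model, derives `KernelDimensions` from its config (head
/// dim is computed as `hidden_dim / num_heads`), and validates every kernel
/// pair at those dimensions.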
#[cfg(feature = "inference")]
fn run_ptx_validation(path: &Path) -> Result<realizar::ptx_parity::PtxParityReport> {
use realizar::ptx_parity::{validate_all_kernel_pairs, KernelDimensions};
let mapped = realizar::gguf::MappedGGUFModel::from_path(path.to_str().unwrap_or_default())
.map_err(|e| CliError::ValidationFailed(format!("Failed to load GGUF: {e}")))?;
let model_config = realizar::gguf::GGUFConfig::from_gguf(&mapped.model)
.map_err(|e| CliError::ValidationFailed(format!("Failed to read config: {e}")))?;
let dims = KernelDimensions {
hidden_dim: model_config.hidden_dim as u32,
intermediate_dim: model_config.intermediate_dim as u32,
num_heads: model_config.num_heads as u32,
head_dim: (model_config.hidden_dim / model_config.num_heads) as u32,
rope_theta: model_config.rope_theta,
epsilon: model_config.eps,
};
Ok(validate_all_kernel_pairs(&dims))
}
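/// Prints one `FAIL` line per failing kernel pair, with its dispatch
/// strategy and violation list. Suppressed in JSON mode and non-verbose
/// runs.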
#[cfg(feature = "inference")]
fn print_ptx_violations(config: &QaConfig, report: &realizar::ptx_parity::PtxParityReport) {
if config.json || !config.verbose {
return;
}
for result in &report.results {
if !result.passed {
println!(
" {} {} ({}): {}",
"FAIL".red(),
result.name,
result.dispatch_strategy,
result.violations.join("; ")
);
}
}
}
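/// Folds the parity report into a `GateResult`, reporting passed/total
/// kernel counts as the gate's value and threshold.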
#[cfg(feature = "inference")]
fn ptx_gate_result(
report: &realizar::ptx_parity::PtxParityReport,
duration: Duration,
) -> GateResult {
if report.all_passed() {
GateResult::passed(
"ptx_parity",
&report.summary(),
Some(report.passed as f64),
Some(report.total as f64),
duration,
)
} else {
GateResult::failed(
"ptx_parity",
&report.summary(),
Some(report.passed as f64),
Some(report.total as f64),
duration,
)
}
}
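/// QA gate: checks that GPU generations are isolated from one another (no
/// device state carried between calls). Skipped without the `inference` and
/// `cuda` features, without a usable CUDA device, or for non-GGUF models.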
fn run_gpu_state_isolation_gate(path: &Path, config: &QaConfig) -> Result<GateResult> {
let start = Instant::now();
#[cfg(all(feature = "inference", feature = "cuda"))]
{
use realizar::cuda::CudaExecutor;
use realizar::format::ModelFormat;
let _ = config;
if !CudaExecutor::is_available() || CudaExecutor::num_devices() == 0 {
return Ok(GateResult::skipped(
"gpu_state_isolation",
"CUDA not available",
));
}
if detect_model_format(path) != Some(ModelFormat::Gguf) {
return Ok(GateResult::skipped(
"gpu_state_isolation",
"Only GGUF format supported for GPU state isolation",
));
}
let result = run_gpu_isolation_test(path)?;
let duration = start.elapsed();
Ok(gpu_isolation_gate_result(result, duration))
}
#[cfg(not(all(feature = "inference", feature = "cuda")))]
{
let _ = (path, config, start);
Ok(GateResult::skipped(
"gpu_state_isolation",
"Requires inference+cuda features",
))
}
}
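/// Outcome of the A/B/A generation test: `Passed` (replaying prompt A is
/// bit-identical and prompts A and B differ), `StateLeak` (the replay of A
/// diverged; carries truncated first/retry outputs), or `ModelStuck` (both
/// prompts produced identical output).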
#[cfg(all(feature = "inference", feature = "cuda"))]
enum GpuIsolationResult {
Passed,
StateLeak(String, String),
ModelStuck,
}
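/// Runs the A/B/A test on a single GPU-resident model: generate for prompt
/// A, then prompt B, then prompt A again. A divergent replay of A means
/// state leaked across calls; identical output for A and B means the model
/// is stuck.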
#[cfg(all(feature = "inference", feature = "cuda"))]
fn run_gpu_isolation_test(path: &Path) -> Result<GpuIsolationResult> {
use realizar::gguf::{
GGUFModel, MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda,
QuantizedGenerateConfig,
};
let model_bytes = std::fs::read(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to read model: {e}")))?;
let gguf = GGUFModel::from_bytes(&model_bytes)
.map_err(|e| CliError::ValidationFailed(format!("Failed to parse GGUF: {e}")))?;
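    // Two distinct chat-template prompts so a stuck model is detectable;
    // fall back to fixed Qwen2 token IDs if the tokenizer cannot encode them.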
let bos = aprender::demo::SpecialTokens::qwen2().bos_id;
let tokens_a = gguf
.encode("<|im_start|>user\nWhat is 2+2?<|im_end|>\n<|im_start|>assistant\n")
.unwrap_or_else(|| vec![bos, 9707]);
let tokens_b = gguf
.encode("<|im_start|>user\nWrite hello world in Python<|im_end|>\n<|im_start|>assistant\n")
.unwrap_or_else(|| vec![bos, 1234]);
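    // Greedy decoding (temperature 0.0, top_k 1) makes generation
    // deterministic, so a divergent replay implicates leaked GPU state
    // rather than sampling noise.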
let gen_config = QuantizedGenerateConfig {
max_tokens: 16,
temperature: 0.0,
top_k: 1,
..Default::default()
};
    let mapped = MappedGGUFModel::from_path(path.to_str().unwrap_or_default())
        .map_err(|e| CliError::ValidationFailed(format!("Map failed: {e}")))?;
    let model = OwnedQuantizedModel::from_mapped(&mapped)
        .map_err(|e| CliError::ValidationFailed(format!("Model load failed: {e}")))?;
let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)
.map_err(|e| CliError::ValidationFailed(format!("CUDA init failed: {e}")))?;
let output_a = cuda_model
.generate_gpu_resident(&tokens_a, &gen_config)
.map_err(|e| CliError::ValidationFailed(format!("Gen 1 failed: {e}")))?;
let output_b = cuda_model
.generate_gpu_resident(&tokens_b, &gen_config)
.map_err(|e| CliError::ValidationFailed(format!("Gen 2 failed: {e}")))?;
let output_a2 = cuda_model
.generate_gpu_resident(&tokens_a, &gen_config)
.map_err(|e| CliError::ValidationFailed(format!("Gen 3 failed: {e}")))?;
if output_a != output_a2 {
let text_a = gguf.decode(&output_a);
let text_a2 = gguf.decode(&output_a2);
return Ok(GpuIsolationResult::StateLeak(
text_a.chars().take(50).collect(),
text_a2.chars().take(50).collect(),
));
}
if output_a == output_b {
return Ok(GpuIsolationResult::ModelStuck);
}
Ok(GpuIsolationResult::Passed)
}
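/// Maps the isolation test outcome onto a pass/fail `GateResult` with a
/// human-readable explanation.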
#[cfg(all(feature = "inference", feature = "cuda"))]
fn gpu_isolation_gate_result(result: GpuIsolationResult, duration: Duration) -> GateResult {
match result {
GpuIsolationResult::Passed => GateResult::passed(
"gpu_state_isolation",
"GPU state properly isolated: 3 generations, deterministic replay confirmed",
Some(3.0),
Some(3.0),
duration,
),
GpuIsolationResult::StateLeak(first, retry) => GateResult::failed(
"gpu_state_isolation",
&format!(
"State leak: prompt A produced different output on retry. \
First: '{first}', Retry: '{retry}'"
),
None,
None,
duration,
),
GpuIsolationResult::ModelStuck => GateResult::failed(
"gpu_state_isolation",
"Model stuck: same output for different prompts (GPU state not functional)",
None,
None,
duration,
),
}
}
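/// QA gate: compares the current run's performance gates against a previous
/// JSON report (`config.previous_report`) and fails on any regression larger
/// than `config.regression_threshold`. Skipped when no previous report is
/// supplied.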
fn run_performance_regression_gate(
current_gates: &[GateResult],
config: &QaConfig,
) -> Result<GateResult> {
let start = Instant::now();
let Some(prev_path) = &config.previous_report else {
return Ok(GateResult::skipped(
"performance_regression",
"No previous report provided",
));
};
let prev_json = std::fs::read_to_string(prev_path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to read previous report: {e}")))?;
let prev_report: QaReport = serde_json::from_str(&prev_json)
.map_err(|e| CliError::ValidationFailed(format!("Failed to parse previous report: {e}")))?;
let threshold = config.regression_threshold;
let mut regressions = Vec::new();
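    // Only gates whose value is a throughput-style, higher-is-better metric
    // are compared across runs.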
let comparable_gates = ["throughput", "ollama_parity", "gpu_speedup"];
for gate_name in &comparable_gates {
let prev_gate = prev_report.gates.iter().find(|g| g.name == *gate_name);
let curr_gate = current_gates.iter().find(|g| g.name == *gate_name);
if let Some(msg) = detect_regression(prev_gate, curr_gate, gate_name, threshold) {
regressions.push(msg);
}
}
let duration = start.elapsed();
if regressions.is_empty() {
Ok(GateResult::passed(
"performance_regression",
&format!(
"No regressions >{:.0}% vs {}",
threshold * 100.0,
prev_path.display()
),
Some(0.0),
Some(threshold),
duration,
))
} else {
Ok(GateResult::failed(
"performance_regression",
&format!("Regressions detected: {}", regressions.join("; ")),
Some(regressions.len() as f64),
Some(0.0),
duration,
))
}
}
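/// Returns a regression message if `curr` dropped more than `threshold`
/// (a fraction, e.g. 0.05 for 5%) below `prev`; `None` when either gate is
/// absent, skipped, valueless, or within tolerance.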
fn detect_regression(
prev: Option<&GateResult>,
curr: Option<&GateResult>,
name: &str,
threshold: f64,
) -> Option<String> {
let (prev, curr) = (prev?, curr?);
let (prev_val, curr_val) = (prev.value?, curr.value?);
if prev_val <= 0.0 || prev.skipped || curr.skipped {
return None;
}
let regression = (prev_val - curr_val) / prev_val;
if regression <= threshold {
return None;
}
Some(format!(
"{name}: {prev_val:.1} -> {curr_val:.1} ({:.0}% regression)",
regression * 100.0
))
}